diff options
Diffstat (limited to 'unsupported/Eigen')
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorContractionHip.h | 1521 -
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionHip.h | 1119 -
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorDeviceHip.h      |  352 -
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorReductionHip.h   |  815 -
4 files changed, 0 insertions(+), 3807 deletions(-)
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionHip.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionHip.h
deleted file mode 100644
index 7561846a3..000000000
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionHip.h
+++ /dev/null
@@ -1,1521 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014-2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
-// Copyright (C) 2015 Navdeep Jaitly <ndjaitly@google.com>
-// Copyright (C) 2014 Eric Martin <eric@ericmart.in>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_HIP_H
-#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_HIP_H
-
-#if defined(EIGEN_USE_GPU) && defined(EIGEN_HIPCC)
-
-namespace Eigen {
-
-template<typename Scalar, typename Index, typename LhsMapper,
-         typename RhsMapper, typename OutputMapper, bool needs_edge_check>
-__device__ EIGEN_STRONG_INLINE void
-EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
-                               const OutputMapper output, Scalar* lhs_shmem, Scalar* rhs_shmem,
-                               const Index m_size, const Index n_size, const Index k_size) {
-
-  const Index m_block_idx = hipBlockIdx_x;
-  const Index n_block_idx = hipBlockIdx_y;
-
-  const Index base_m = 64 * m_block_idx;
-  const Index base_n = 64 * n_block_idx;
-
-  // declare and initialize 64 registers for output 8x8 block
-
-  // prefetch registers
-  Scalar lhs_pf0;
-  Scalar lhs_pf1;
-  Scalar lhs_pf2;
-  Scalar lhs_pf3;
-  Scalar lhs_pf4;
-  Scalar lhs_pf5;
-  Scalar lhs_pf6;
-  Scalar lhs_pf7;
-
-  Scalar rhs_pf0;
-  Scalar rhs_pf1;
-  Scalar rhs_pf2;
-  Scalar rhs_pf3;
-  Scalar rhs_pf4;
-  Scalar rhs_pf5;
-  Scalar rhs_pf6;
-  Scalar rhs_pf7;
-
-  // shared memory is formatted
-  // (contract idx in block, nocontract idx in block, block idx)
-  // where block idx is column major. This transposition limits the number of
-  // bank conflicts when reading the LHS. The core idea is that since the contracting
-  // index is shared by both sides, then the contracting index should be in hipThreadIdx_x.
-
-  // On the LHS, we pad each row inside of each block with an extra element. This makes
-  // each block 8 rows of 9 elements, which is 72 elements. This gives no bank conflicts
-  // on writes and very few 2-way conflicts on reads. There is an 8x8 grid of these blocks.
-
-  // On the RHS we just add 8 padding elements to the end of each block. This gives no bank
-  // conflicts on writes and also none on reads.
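
The row padding described in the comment above is the standard shared-memory bank-conflict workaround. As a minimal standalone sketch (illustrative only, not part of the deleted file), the same idea applied to a 32x32 transpose tile:

    #include <hip/hip_runtime.h>

    // Illustrative sketch of the padding idea described above. With 32
    // four-byte banks, a tile[32][32] of floats maps every element of a
    // column to the same bank; declaring each row one element wider
    // staggers the columns across banks, so the column reads below are
    // conflict-free. Launch with 32x32 thread blocks over an n x n matrix.
    __global__ void transposeTile(const float* in, float* out, int n) {
      __shared__ float tile[32][33];  // 33, not 32: the extra element is padding
      int x = hipBlockIdx_x * 32 + hipThreadIdx_x;
      int y = hipBlockIdx_y * 32 + hipThreadIdx_y;
      if (x < n && y < n) tile[hipThreadIdx_y][hipThreadIdx_x] = in[y * n + x];
      __syncthreads();
      x = hipBlockIdx_y * 32 + hipThreadIdx_x;  // swap block origins for the write
      y = hipBlockIdx_x * 32 + hipThreadIdx_y;
      if (x < n && y < n) out[y * n + x] = tile[hipThreadIdx_x][hipThreadIdx_y];
    }

The deleted kernel uses the same trick with 8x9 blocks (72 elements) instead of 32x33 rows, but the mechanism is identical: the pad shifts successive rows by one bank.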
-
-  // storage indices
-  const Index lhs_store_idx_base = hipThreadIdx_y * 72 + hipThreadIdx_x * 9 + hipThreadIdx_z;
-  const Index rhs_store_idx_base = hipThreadIdx_y * 72 + hipThreadIdx_z * 8 + hipThreadIdx_x;
-
-  const Index lhs_store_idx_0 = lhs_store_idx_base + 576 * 0;
-  const Index lhs_store_idx_1 = lhs_store_idx_base + 576 * 1;
-  const Index lhs_store_idx_2 = lhs_store_idx_base + 576 * 2;
-  const Index lhs_store_idx_3 = lhs_store_idx_base + 576 * 3;
-  const Index lhs_store_idx_4 = lhs_store_idx_base + 576 * 4;
-  const Index lhs_store_idx_5 = lhs_store_idx_base + 576 * 5;
-  const Index lhs_store_idx_6 = lhs_store_idx_base + 576 * 6;
-  const Index lhs_store_idx_7 = lhs_store_idx_base + 576 * 7;
-
-  const Index rhs_store_idx_0 = rhs_store_idx_base + 576 * 0;
-  const Index rhs_store_idx_1 = rhs_store_idx_base + 576 * 1;
-  const Index rhs_store_idx_2 = rhs_store_idx_base + 576 * 2;
-  const Index rhs_store_idx_3 = rhs_store_idx_base + 576 * 3;
-  const Index rhs_store_idx_4 = rhs_store_idx_base + 576 * 4;
-  const Index rhs_store_idx_5 = rhs_store_idx_base + 576 * 5;
-  const Index rhs_store_idx_6 = rhs_store_idx_base + 576 * 6;
-  const Index rhs_store_idx_7 = rhs_store_idx_base + 576 * 7;
-
-  // in the loading code, the following variables are important:
-  //   hipThreadIdx_x: the vertical position in an 8x8 block
-  //   hipThreadIdx_y: the vertical index of the 8x8 block in the grid
-  //   hipThreadIdx_z: the horizontal position in an 8x8 block
-  //   k: the horizontal index of the 8x8 block in the grid
-  //
-  // The k parameter is implicit (it was the loop counter for a loop that went
-  // from 0 to <8, but now that loop is unrolled in the below code.)
-
-  const Index load_idx_vert = hipThreadIdx_x + 8 * hipThreadIdx_y;
-  const Index lhs_vert = base_m + load_idx_vert;
-
-#define prefetchIntoRegisters(base_k) \
-  { \
-    lhs_pf0 = conv(0); \
-    lhs_pf1 = conv(0); \
-    lhs_pf2 = conv(0); \
-    lhs_pf3 = conv(0); \
-    lhs_pf4 = conv(0); \
-    lhs_pf5 = conv(0); \
-    lhs_pf6 = conv(0); \
-    lhs_pf7 = conv(0); \
-    \
-    rhs_pf0 = conv(0); \
-    rhs_pf1 = conv(0); \
-    rhs_pf2 = conv(0); \
-    rhs_pf3 = conv(0); \
-    rhs_pf4 = conv(0); \
-    rhs_pf5 = conv(0); \
-    rhs_pf6 = conv(0); \
-    rhs_pf7 = conv(0); \
-    \
-    if (!needs_edge_check || lhs_vert < m_size) { \
-      const Index lhs_horiz_0 = base_k + hipThreadIdx_z + 0 * 8; \
-      const Index lhs_horiz_1 = base_k + hipThreadIdx_z + 1 * 8; \
-      const Index lhs_horiz_2 = base_k + hipThreadIdx_z + 2 * 8; \
-      const Index lhs_horiz_3 = base_k + hipThreadIdx_z + 3 * 8; \
-      const Index lhs_horiz_4 = base_k + hipThreadIdx_z + 4 * 8; \
-      const Index lhs_horiz_5 = base_k + hipThreadIdx_z + 5 * 8; \
-      const Index lhs_horiz_6 = base_k + hipThreadIdx_z + 6 * 8; \
-      const Index lhs_horiz_7 = base_k + hipThreadIdx_z + 7 * 8; \
-      \
-      if (!needs_edge_check || lhs_horiz_7 < k_size) { \
-        lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
-        lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \
-        lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \
-        lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \
-        lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \
-        lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \
-        lhs_pf6 = lhs(lhs_vert, lhs_horiz_6); \
-        lhs_pf7 = lhs(lhs_vert, lhs_horiz_7); \
-      } else if (lhs_horiz_6 < k_size) { \
-        lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
-        lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \
-        lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \
-        lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \
-        lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \
-        lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \
-        lhs_pf6 = lhs(lhs_vert, lhs_horiz_6); \
-      } else if (lhs_horiz_5 < k_size) { \
-        lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
-        lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \
-        lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \
-        lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \
-        lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \
-        lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \
-      } else if (lhs_horiz_4 < k_size) { \
-        lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
-        lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \
-        lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \
-        lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \
-        lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \
-      } else if (lhs_horiz_3 < k_size) { \
-        lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
-        lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \
-        lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \
-        lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \
-      } else if (lhs_horiz_2 < k_size) { \
-        lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
-        lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \
-        lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \
-      } else if (lhs_horiz_1 < k_size) { \
-        lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
-        lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \
-      } else if (lhs_horiz_0 < k_size) { \
-        lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
-      } \
-    } \
-    \
-    const Index rhs_vert = base_k + load_idx_vert; \
-    if (!needs_edge_check || rhs_vert < k_size) { \
-      const Index rhs_horiz_0 = base_n + hipThreadIdx_z + 0 * 8; \
-      const Index rhs_horiz_1 = base_n + hipThreadIdx_z + 1 * 8; \
-      const Index rhs_horiz_2 = base_n + hipThreadIdx_z + 2 * 8; \
-      const Index rhs_horiz_3 = base_n + hipThreadIdx_z + 3 * 8; \
-      const Index rhs_horiz_4 = base_n + hipThreadIdx_z + 4 * 8; \
-      const Index rhs_horiz_5 = base_n + hipThreadIdx_z + 5 * 8; \
-      const Index rhs_horiz_6 = base_n + hipThreadIdx_z + 6 * 8; \
-      const Index rhs_horiz_7 = base_n + hipThreadIdx_z + 7 * 8; \
-      \
-      if (rhs_horiz_7 < n_size) { \
-        rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
-        rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \
-        rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \
-        rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \
-        rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \
-        rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \
-        rhs_pf6 = rhs(rhs_vert, rhs_horiz_6); \
-        rhs_pf7 = rhs(rhs_vert, rhs_horiz_7); \
-      } else if (rhs_horiz_6 < n_size) { \
-        rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
-        rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \
-        rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \
-        rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \
-        rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \
-        rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \
-        rhs_pf6 = rhs(rhs_vert, rhs_horiz_6); \
-      } else if (rhs_horiz_5 < n_size) { \
-        rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
-        rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \
-        rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \
-        rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \
-        rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \
-        rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \
-      } else if (rhs_horiz_4 < n_size) { \
-        rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
-        rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \
-        rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \
-        rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \
-        rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \
-      } else if (rhs_horiz_3 < n_size) { \
-        rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
-        rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \
-        rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \
-        rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \
-      } else if (rhs_horiz_2 < n_size) { \
-        rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
-        rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \
-        rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \
-      } else if (rhs_horiz_1 < n_size) { \
-        rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
-        rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \
-      } else if (rhs_horiz_0 < n_size) { \
-        rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
-      } \
-    } \
-  } \
-
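
The prefetchIntoRegisters cascade above trades code size for branch uniformity: a single descending chain of comparisons picks how many of the eight strided columns are in range, and each branch then issues only unconditional loads. A compact per-element-predicated equivalent, as an illustrative sketch (the helper name and signature are invented here, not from the deleted file):

    // Illustrative equivalent of the guarded-load cascade above: load up to
    // eight columns spaced 8 apart, zero-filling past the edge. The deleted
    // macro fully unrolls this and hoists the bounds test, so the in-range
    // branches contain no per-element predication at all.
    template <typename Scalar, typename Index, typename Mapper>
    __device__ void guardedRowLoad(const Mapper& lhs, Index vert, Index base_k,
                                   Index k_size, Scalar (&pf)[8]) {
      for (int i = 0; i < 8; ++i) {
        const Index horiz = base_k + hipThreadIdx_z + i * 8;
        pf[i] = (horiz < k_size) ? lhs(vert, horiz) : Scalar(0);
      }
    }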
-#define writeRegToShmem(_) \
-  lhs_shmem[lhs_store_idx_0] = lhs_pf0; \
-  rhs_shmem[rhs_store_idx_0] = rhs_pf0; \
-  \
-  lhs_shmem[lhs_store_idx_1] = lhs_pf1; \
-  rhs_shmem[rhs_store_idx_1] = rhs_pf1; \
-  \
-  lhs_shmem[lhs_store_idx_2] = lhs_pf2; \
-  rhs_shmem[rhs_store_idx_2] = rhs_pf2; \
-  \
-  lhs_shmem[lhs_store_idx_3] = lhs_pf3; \
-  rhs_shmem[rhs_store_idx_3] = rhs_pf3; \
-  \
-  lhs_shmem[lhs_store_idx_4] = lhs_pf4; \
-  rhs_shmem[rhs_store_idx_4] = rhs_pf4; \
-  \
-  lhs_shmem[lhs_store_idx_5] = lhs_pf5; \
-  rhs_shmem[rhs_store_idx_5] = rhs_pf5; \
-  \
-  lhs_shmem[lhs_store_idx_6] = lhs_pf6; \
-  rhs_shmem[rhs_store_idx_6] = rhs_pf6; \
-  \
-  lhs_shmem[lhs_store_idx_7] = lhs_pf7; \
-  rhs_shmem[rhs_store_idx_7] = rhs_pf7; \
-
-  // declare and initialize result array
-#define res(i, j) _res_##i##j
-#define initResultRow(i) \
-  Scalar res(i, 0) = conv(0); \
-  Scalar res(i, 1) = conv(0); \
-  Scalar res(i, 2) = conv(0); \
-  Scalar res(i, 3) = conv(0); \
-  Scalar res(i, 4) = conv(0); \
-  Scalar res(i, 5) = conv(0); \
-  Scalar res(i, 6) = conv(0); \
-  Scalar res(i, 7) = conv(0); \
-
-  internal::scalar_cast_op<int, Scalar> conv;
-  initResultRow(0);
-  initResultRow(1);
-  initResultRow(2);
-  initResultRow(3);
-  initResultRow(4);
-  initResultRow(5);
-  initResultRow(6);
-  initResultRow(7);
-#undef initResultRow
-
-  for (Index base_k = 0; base_k < k_size; base_k += 64) {
-    // wait for previous iteration to finish with shmem. Despite common sense,
-    // the code is a bit faster with this here than at bottom of loop
-    __syncthreads();
-
-    prefetchIntoRegisters(base_k);
-    writeRegToShmem();
-
-    #undef prefetchIntoRegisters
-    #undef writeRegToShmem
-
-    // wait for shared mem packing to be done before starting computation
-    __syncthreads();
-
-    // compute 8x8 matrix product by outer product. This involves packing one column
-    // of LHS and one row of RHS into registers (takes 16 registers).
-
-#define lcol(i) _lcol##i
-    Scalar lcol(0);
-    Scalar lcol(1);
-    Scalar lcol(2);
-    Scalar lcol(3);
-    Scalar lcol(4);
-    Scalar lcol(5);
-    Scalar lcol(6);
-    Scalar lcol(7);
-
-#define rrow(j) _rrow##j
-    Scalar rrow(0);
-    Scalar rrow(1);
-    Scalar rrow(2);
-    Scalar rrow(3);
-    Scalar rrow(4);
-    Scalar rrow(5);
-    Scalar rrow(6);
-    Scalar rrow(7);
-
-    // Now x corresponds to k, y to m, and z to n
-    const Scalar* lhs_block = &lhs_shmem[hipThreadIdx_x + 9 * hipThreadIdx_y];
-    const Scalar* rhs_block = &rhs_shmem[hipThreadIdx_x + 8 * hipThreadIdx_z];
-
-#define lhs_element(i, j) lhs_block[72 * ((i) + 8 * (j))]
-#define rhs_element(i, j) rhs_block[72 * ((i) + 8 * (j))]
-
-#define loadData(i, j) \
-    lcol(0) = lhs_element(0, j); \
-    rrow(0) = rhs_element(i, 0); \
-    lcol(1) = lhs_element(1, j); \
-    rrow(1) = rhs_element(i, 1); \
-    lcol(2) = lhs_element(2, j); \
-    rrow(2) = rhs_element(i, 2); \
-    lcol(3) = lhs_element(3, j); \
-    rrow(3) = rhs_element(i, 3); \
-    lcol(4) = lhs_element(4, j); \
-    rrow(4) = rhs_element(i, 4); \
-    lcol(5) = lhs_element(5, j); \
-    rrow(5) = rhs_element(i, 5); \
-    lcol(6) = lhs_element(6, j); \
-    rrow(6) = rhs_element(i, 6); \
-    lcol(7) = lhs_element(7, j); \
-    rrow(7) = rhs_element(i, 7); \
-
-#define computeCol(j) \
-    res(0, j) += lcol(0) * rrow(j); \
-    res(1, j) += lcol(1) * rrow(j); \
-    res(2, j) += lcol(2) * rrow(j); \
-    res(3, j) += lcol(3) * rrow(j); \
-    res(4, j) += lcol(4) * rrow(j); \
-    res(5, j) += lcol(5) * rrow(j); \
-    res(6, j) += lcol(6) * rrow(j); \
-    res(7, j) += lcol(7) * rrow(j); \
-
-#define computePass(i) \
-    loadData(i, i); \
-    \
-    computeCol(0); \
-    computeCol(1); \
-    computeCol(2); \
-    computeCol(3); \
-    computeCol(4); \
-    computeCol(5); \
-    computeCol(6); \
-    computeCol(7); \
-
-    computePass(0);
-    computePass(1);
-    computePass(2);
-    computePass(3);
-    computePass(4);
-    computePass(5);
-    computePass(6);
-    computePass(7);
-
-#undef lcol
-#undef rrow
-#undef lhs_element
-#undef rhs_element
-#undef loadData
-#undef computeCol
-#undef computePass
-  } // end loop over k
-
-  // we've now iterated over all of the large (ie width 64) k blocks and
-  // accumulated results in registers. At this point thread (x, y, z) contains
-  // the sum across all big k blocks of the product of little k block of index (x, y)
-  // with block of index (y, z). To compute the final output, we need to reduce
-  // the 8 threads over y by summation.
-#define shuffleInc(i, j, mask) res(i, j) += __shfl_xor(res(i, j), mask)
-
-#define reduceRow(i, mask) \
-  shuffleInc(i, 0, mask); \
-  shuffleInc(i, 1, mask); \
-  shuffleInc(i, 2, mask); \
-  shuffleInc(i, 3, mask); \
-  shuffleInc(i, 4, mask); \
-  shuffleInc(i, 5, mask); \
-  shuffleInc(i, 6, mask); \
-  shuffleInc(i, 7, mask); \
-
-#define reduceMatrix(mask) \
-  reduceRow(0, mask); \
-  reduceRow(1, mask); \
-  reduceRow(2, mask); \
-  reduceRow(3, mask); \
-  reduceRow(4, mask); \
-  reduceRow(5, mask); \
-  reduceRow(6, mask); \
-  reduceRow(7, mask); \
-
-  // actually perform the reduction, now each thread of index (_, y, z)
-  // contains the correct values in its registers that belong in the output
-  // block
-  reduceMatrix(1);
-  reduceMatrix(2);
-  reduceMatrix(4);
-
-#undef shuffleInc
-#undef reduceRow
-#undef reduceMatrix
-
-  // now we need to copy the 64 values into main memory. We can't split work
-  // among threads because all variables are in registers. There are two ways
-  // to do this:
-  // (1) have 1 thread do 64 writes from registers into global memory
-  // (2) have 1 thread do 64 writes into shared memory, and then 8 threads
-  //     each do 8 writes into global memory. We can just overwrite the shared
-  //     memory from the problem we just solved.
-  // (2) is slightly faster than (1) due to less branching and more ILP
-
-  // TODO: won't yield much gain, but could just use currently unused shared mem
-  //       and then we won't have to sync
-  // wait for shared mem to be out of use
-  __syncthreads();
-
-#define writeResultShmem(i, j) \
-  lhs_shmem[i + 8 * hipThreadIdx_y + 64 * hipThreadIdx_z + 512 * j] = res(i, j); \
-
-#define writeRow(i) \
-  writeResultShmem(i, 0); \
-  writeResultShmem(i, 1); \
-  writeResultShmem(i, 2); \
-  writeResultShmem(i, 3); \
-  writeResultShmem(i, 4); \
-  writeResultShmem(i, 5); \
-  writeResultShmem(i, 6); \
-  writeResultShmem(i, 7); \
-
-  if (hipThreadIdx_x == 0) {
-    writeRow(0);
-    writeRow(1);
-    writeRow(2);
-    writeRow(3);
-    writeRow(4);
-    writeRow(5);
-    writeRow(6);
-    writeRow(7);
-  }
-#undef writeResultShmem
-#undef writeRow
-
-  const int max_i_write = numext::mini((int)((m_size - base_m - hipThreadIdx_y + 7) / 8), 8);
-  const int max_j_write = numext::mini((int)((n_size - base_n - hipThreadIdx_z + 7) / 8), 8);
-
-  if (hipThreadIdx_x < max_i_write) {
-    if (max_j_write == 8) {
-      // TODO: can I trade bank conflicts for coalesced writes?
-      Scalar val0 = lhs_shmem[hipThreadIdx_x + 8 * hipThreadIdx_y + 64 * hipThreadIdx_z + 512 * 0];
-      Scalar val1 = lhs_shmem[hipThreadIdx_x + 8 * hipThreadIdx_y + 64 * hipThreadIdx_z + 512 * 1];
-      Scalar val2 = lhs_shmem[hipThreadIdx_x + 8 * hipThreadIdx_y + 64 * hipThreadIdx_z + 512 * 2];
-      Scalar val3 = lhs_shmem[hipThreadIdx_x + 8 * hipThreadIdx_y + 64 * hipThreadIdx_z + 512 * 3];
-      Scalar val4 = lhs_shmem[hipThreadIdx_x + 8 * hipThreadIdx_y + 64 * hipThreadIdx_z + 512 * 4];
-      Scalar val5 = lhs_shmem[hipThreadIdx_x + 8 * hipThreadIdx_y + 64 * hipThreadIdx_z + 512 * 5];
-      Scalar val6 = lhs_shmem[hipThreadIdx_x + 8 * hipThreadIdx_y + 64 * hipThreadIdx_z + 512 * 6];
-      Scalar val7 = lhs_shmem[hipThreadIdx_x + 8 * hipThreadIdx_y + 64 * hipThreadIdx_z + 512 * 7];
-
-      output(base_m + hipThreadIdx_y + 8 * hipThreadIdx_x, base_n + hipThreadIdx_z + 8 * 0) = val0;
-      output(base_m + hipThreadIdx_y + 8 * hipThreadIdx_x, base_n + hipThreadIdx_z + 8 * 1) = val1;
-      output(base_m + hipThreadIdx_y + 8 * hipThreadIdx_x, base_n + hipThreadIdx_z + 8 * 2) = val2;
-      output(base_m + hipThreadIdx_y + 8 * hipThreadIdx_x, base_n + hipThreadIdx_z + 8 * 3) = val3;
-      output(base_m + hipThreadIdx_y + 8 * hipThreadIdx_x, base_n + hipThreadIdx_z + 8 * 4) = val4;
-      output(base_m + hipThreadIdx_y + 8 * hipThreadIdx_x, base_n + hipThreadIdx_z + 8 * 5) = val5;
-      output(base_m + hipThreadIdx_y + 8 * hipThreadIdx_x, base_n + hipThreadIdx_z + 8 * 6) = val6;
-      output(base_m + hipThreadIdx_y + 8 * hipThreadIdx_x, base_n + hipThreadIdx_z + 8 * 7) = val7;
-    } else {
-#pragma unroll 7
-      for (int j = 0; j < max_j_write; j++) {
-        Scalar val = lhs_shmem[hipThreadIdx_x + 8 * hipThreadIdx_y + 64 * hipThreadIdx_z + 512 * j];
-        output(base_m + hipThreadIdx_y + 8 * hipThreadIdx_x, base_n + hipThreadIdx_z + 8 * j) = val;
-      }
-    }
-  }
-#undef res
-}
-
-
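
The reduceMatrix(1); reduceMatrix(2); reduceMatrix(4); sequence in the kernel above is a butterfly reduction over the eight hipThreadIdx_y lanes. A standalone sketch of the same pattern for a single value (illustrative only, not from the deleted file):

    // Butterfly (XOR-mask) reduction across 8 lanes, the pattern behind
    // reduceMatrix(1/2/4): after the three exchanges, every lane in each
    // group of eight holds the full sum of the group's values.
    __device__ float sumAcross8Lanes(float v) {
      v += __shfl_xor(v, 1);  // fold pairs
      v += __shfl_xor(v, 2);  // fold groups of four
      v += __shfl_xor(v, 4);  // fold groups of eight
      return v;
    }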
-template<typename Scalar, typename Index, typename LhsMapper,
-         typename RhsMapper, typename OutputMapper>
-__global__ void
-__launch_bounds__(512, 1)
-EigenContractionKernel(const LhsMapper lhs, const RhsMapper rhs,
-                       const OutputMapper output,
-                       const Index m_size, const Index n_size, const Index k_size) {
-  __shared__ Scalar lhs_shmem[72 * 64];
-  __shared__ Scalar rhs_shmem[72 * 64];
-
-  const Index m_block_idx = hipBlockIdx_x;
-  const Index n_block_idx = hipBlockIdx_y;
-
-  const Index base_m = 64 * m_block_idx;
-  const Index base_n = 64 * n_block_idx;
-
-  if (base_m + 63 < m_size && base_n + 63 < n_size) {
-    EigenContractionKernelInternal<Scalar, Index, LhsMapper, RhsMapper, OutputMapper, false>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size);
-  } else {
-    EigenContractionKernelInternal<Scalar, Index, LhsMapper, RhsMapper, OutputMapper, true>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size);
-  }
-}
-
-
-template<typename Index, typename LhsMapper,
-         typename RhsMapper, typename OutputMapper, bool CHECK_LHS_BOUNDARY,
-         bool CHECK_RHS_BOUNDARY>
-__device__ EIGEN_STRONG_INLINE void
-EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rhs,
-                       const OutputMapper output, float2 lhs_shmem2[][16],
-                       float2 rhs_shmem2[][8], const Index m_size,
-                       const Index n_size, const Index k_size,
-                       const Index base_m, const Index base_n) {
-
-  // prefetch registers
-  float4 lhs_pf0, rhs_pf0;
-
-  float4 results[4];
-  for (int i=0; i < 4; i++) {
-    results[i].x = results[i].y = results[i].z = results[i].w = 0;
-  }
-
-
-#define prefetch_lhs(reg, row, col) \
-  if (!CHECK_LHS_BOUNDARY) { \
-    if (col < k_size) { \
-      /*reg = lhs.template loadPacket<Unaligned>(row, col);*/ \
-      reg.x =lhs(row + 0, col); \
-      reg.y =lhs(row + 1, col); \
-      reg.z =lhs(row + 2, col); \
-      reg.w =lhs(row + 3, col); \
-    } \
-  } else { \
-    if (col < k_size) { \
-      if (row + 3 < m_size) { \
-        /*reg =lhs.template loadPacket<Unaligned>(row, col);*/ \
-        reg.x =lhs(row + 0, col); \
-        reg.y =lhs(row + 1, col); \
-        reg.z =lhs(row + 2, col); \
-        reg.w =lhs(row + 3, col); \
-      } else if (row + 2 < m_size) { \
-        reg.x =lhs(row + 0, col); \
-        reg.y =lhs(row + 1, col); \
-        reg.z =lhs(row + 2, col); \
-      } else if (row + 1 < m_size) { \
-        reg.x =lhs(row + 0, col); \
-        reg.y =lhs(row + 1, col); \
-      } else if (row < m_size) { \
-        reg.x =lhs(row + 0, col); \
-      } \
-    } \
-  } \
-
-
-  Index lhs_vert = base_m+hipThreadIdx_x*4;
-
-  for (Index k = 0; k < k_size; k += 16) {
-    //lhs_pf0 = internal::pset1<float4>(0);
-    //rhs_pf0 = internal::pset1<float4>(0);
-    lhs_pf0 = make_float4(0, 0, 0, 0);
-    rhs_pf0 = make_float4(0, 0, 0, 0);
-
-    Index lhs_horiz = hipThreadIdx_y+k;
-    prefetch_lhs(lhs_pf0, lhs_vert, lhs_horiz)
-
-    Index rhs_vert = k+(hipThreadIdx_x%4)*4;
-    Index rhs_horiz0 = (hipThreadIdx_x>>2)+hipThreadIdx_y*4+base_n;
-
-    if (!CHECK_RHS_BOUNDARY) {
-      if ((rhs_vert + 3) < k_size) {
-        // just CHECK_RHS_BOUNDARY
-        //rhs_pf0 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
-        rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
-        rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
-        rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
-        rhs_pf0.w = rhs(rhs_vert + 3, rhs_horiz0);
-      } else if (rhs_vert + 2 < k_size) {
-        // just CHECK_RHS_BOUNDARY
-        rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
-        rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
-        rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
-      } else if (rhs_vert + 1 < k_size) {
-        rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
-        rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
-      } else if (rhs_vert < k_size) {
-        rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
-      }
-    } else {
-      if (rhs_horiz0 < n_size) {
-        if ((rhs_vert + 3) < k_size) {
-          //rhs_pf0 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
-          rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
-          rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
-          rhs_pf0.z =
rhs(rhs_vert + 2, rhs_horiz0); - rhs_pf0.w = rhs(rhs_vert + 3, rhs_horiz0); - } else if ((rhs_vert + 2) < k_size) { - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); - rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); - } else if ((rhs_vert + 1) < k_size) { - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); - } else if (rhs_vert < k_size) { - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - } - } - } - float x1, x2 ; - // the following can be a bitwise operation..... some day. - if((hipThreadIdx_x%8) < 4) { - x1 = rhs_pf0.y; - x2 = rhs_pf0.w; - } else { - x1 = rhs_pf0.x; - x2 = rhs_pf0.z; - } - x1 = __shfl_xor(x1, 4); - x2 = __shfl_xor(x2, 4); - if((hipThreadIdx_x%8) < 4) { - rhs_pf0.y = x1; - rhs_pf0.w = x2; - } else { - rhs_pf0.x = x1; - rhs_pf0.z = x2; - } - - // We have 64 features. - // Row 0 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 0, 1. - // Row 1 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 2, 3. - // ... - // Row 31 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 62, 63 - // Row 32 -> times (2, 6, 10, 14, 3, 7, 11, 15) for features 0, 1 - // ... - rhs_shmem2[(hipThreadIdx_x>>3)+ hipThreadIdx_y*2][hipThreadIdx_x%8] = make_float2(rhs_pf0.x, rhs_pf0.y); - rhs_shmem2[(hipThreadIdx_x>>3)+ hipThreadIdx_y*2+32][hipThreadIdx_x%8] = make_float2(rhs_pf0.z, rhs_pf0.w); - - // Row 0 (time 0) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) - // Row 1 (time 1) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) - // ... - // Row 15 (time 15) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) - // Row 16 (time 0) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63) - // ... - - lhs_shmem2[hipThreadIdx_y][hipThreadIdx_x] = make_float2(lhs_pf0.x, lhs_pf0.y); - lhs_shmem2[hipThreadIdx_y+16][hipThreadIdx_x] = make_float2(lhs_pf0.z, lhs_pf0.w); - - -#define add_vals(fl1, fl2, fr1, fr2)\ - results[0].x += fl1.x * fr1.x;\ - results[0].y += fl1.y * fr1.x;\ - results[0].z += fl2.x * fr1.x;\ - results[0].w += fl2.y * fr1.x;\ -\ - results[1].x += fl1.x * fr1.y;\ - results[1].y += fl1.y * fr1.y;\ - results[1].z += fl2.x * fr1.y;\ - results[1].w += fl2.y * fr1.y;\ -\ - results[2].x += fl1.x * fr2.x;\ - results[2].y += fl1.y * fr2.x;\ - results[2].z += fl2.x * fr2.x;\ - results[2].w += fl2.y * fr2.x;\ -\ - results[3].x += fl1.x * fr2.y;\ - results[3].y += fl1.y * fr2.y;\ - results[3].z += fl2.x * fr2.y;\ - results[3].w += fl2.y * fr2.y;\ - - __syncthreads(); - - // Do the multiplies. - #pragma unroll - for (int koff = 0; koff < 16; koff ++) { - // 32 x threads. 
- float2 fl1 = lhs_shmem2[koff][hipThreadIdx_x]; - float2 fl2 = lhs_shmem2[koff + 16][hipThreadIdx_x]; - - int start_feature = hipThreadIdx_y * 4; - float2 fr1 = rhs_shmem2[(start_feature>>1) + 32*((koff%4)/2)][koff/4 + (koff%2)*4]; - float2 fr2 = rhs_shmem2[(start_feature>>1) + 1 + 32*((koff%4)/2)][koff/4 + (koff%2)*4]; - - add_vals(fl1, fl2, fr1, fr2) - } - __syncthreads(); - } - -#undef prefetch_lhs -#undef add_vals - - Index horiz_base = hipThreadIdx_y*4+base_n; - if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) { - for (int i = 0; i < 4; i++) { - output(lhs_vert, horiz_base + i) = results[i].x; - output(lhs_vert + 1, horiz_base + i) = results[i].y; - output(lhs_vert + 2, horiz_base + i) = results[i].z; - output(lhs_vert + 3, horiz_base + i) = results[i].w; - } - } else if (!CHECK_RHS_BOUNDARY) { - // CHECK LHS - if (lhs_vert + 3 < m_size) { - for (int i = 0; i < 4; i++) { - output(lhs_vert, horiz_base + i) = results[i].x; - output(lhs_vert + 1, horiz_base + i) = results[i].y; - output(lhs_vert + 2, horiz_base + i) = results[i].z; - output(lhs_vert + 3, horiz_base + i) = results[i].w; - } - } else if (lhs_vert + 2 < m_size) { - for (int i = 0; i < 4; i++) { - output(lhs_vert, horiz_base + i) = results[i].x; - output(lhs_vert + 1, horiz_base + i) = results[i].y; - output(lhs_vert + 2, horiz_base + i) = results[i].z; - } - } else if (lhs_vert + 1 < m_size) { - for (int i = 0; i < 4; i++) { - output(lhs_vert, horiz_base + i) = results[i].x; - output(lhs_vert + 1, horiz_base + i) = results[i].y; - } - } else if (lhs_vert < m_size) { - for (int i = 0; i < 4; i++) { - output(lhs_vert, horiz_base + i) = results[i].x; - } - } - } else if (!CHECK_LHS_BOUNDARY) { - // CHECK RHS - /* - int ncols_rem = fminf(n_size- horiz_base, 4); - for (int i = 0; i < ncols_rem; i++) { - output(lhs_vert, horiz_base + i) = results[i].x; - output(lhs_vert + 1, horiz_base + i) = results[i].y; - output(lhs_vert + 2, horiz_base + i) = results[i].z; - output(lhs_vert + 3, horiz_base + i) = results[i].w; - }*/ - for (int i = 0; i < 4; i++) { - if (horiz_base+i < n_size) { - output(lhs_vert, horiz_base + i) = results[i].x; - output(lhs_vert + 1, horiz_base + i) = results[i].y; - output(lhs_vert + 2, horiz_base + i) = results[i].z; - output(lhs_vert + 3, horiz_base + i) = results[i].w; - } - } - } else { - // CHECK both boundaries. 
- for (int i = 0; i < 4; i++) { - if (horiz_base+i < n_size) { - if (lhs_vert < m_size) - output(lhs_vert, horiz_base + i) = results[i].x; - if (lhs_vert + 1 < m_size) - output(lhs_vert + 1, horiz_base + i) = results[i].y; - if (lhs_vert + 2 < m_size) - output(lhs_vert + 2, horiz_base + i) = results[i].z; - if (lhs_vert + 3 < m_size) - output(lhs_vert + 3, horiz_base + i) = results[i].w; - } - } - } -} - - -template<typename Index, typename LhsMapper, - typename RhsMapper, typename OutputMapper, bool CHECK_LHS_BOUNDARY, - bool CHECK_RHS_BOUNDARY> -__device__ EIGEN_STRONG_INLINE void -EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, - const OutputMapper output, float2 lhs_shmem2[][32], - float2 rhs_shmem2[][8], const Index m_size, - const Index n_size, const Index k_size, - const Index base_m, const Index base_n) { - - // prefetch registers - float4 lhs_pf0, lhs_pf1, lhs_pf2, lhs_pf3; - float4 rhs_pf0, rhs_pf1; - - float4 results[8]; - for (int i=0; i < 8; i++) { - results[i].x = results[i].y = results[i].z = results[i].w = 0; - } - - - Index lhs_vert = base_m+hipThreadIdx_x*4+(hipThreadIdx_y%4)*32; - for (Index k = 0; k < k_size; k += 32) { - /*lhs_pf0 = internal::pset1<float4>(0); - lhs_pf1 = internal::pset1<float4>(0); - lhs_pf2 = internal::pset1<float4>(0); - lhs_pf3 = internal::pset1<float4>(0); - - rhs_pf0 = internal::pset1<float4>(0); - rhs_pf1 = internal::pset1<float4>(0);*/ - - - lhs_pf0 = make_float4(0, 0, 0, 0); - lhs_pf1 = make_float4(0, 0, 0, 0); - lhs_pf2 = make_float4(0, 0, 0, 0); - lhs_pf3 = make_float4(0, 0, 0, 0); - - rhs_pf0 = make_float4(0, 0, 0, 0); - rhs_pf1 = make_float4(0, 0, 0, 0); - - if (!CHECK_LHS_BOUNDARY) { - if ((hipThreadIdx_y/4+k+24) < k_size) { - //lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (hipThreadIdx_y/4+k)); - //lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (hipThreadIdx_y/4+k+8)); - //lhs_pf2 =lhs.template loadPacket<Unaligned>(lhs_vert, (hipThreadIdx_y/4+k+16)); - //lhs_pf3 =lhs.template loadPacket<Unaligned>(lhs_vert, (hipThreadIdx_y/4+k+24)); - lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k)); - lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k)); - lhs_pf0.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k)); - lhs_pf0.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k)); - lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8)); - lhs_pf1.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+8)); - lhs_pf1.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8)); - lhs_pf1.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8)); - lhs_pf2.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+16)); - lhs_pf2.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+16)); - lhs_pf2.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+16)); - lhs_pf2.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+16)); - lhs_pf3.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+24)); - lhs_pf3.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+24)); - lhs_pf3.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+24)); - lhs_pf3.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+24)); - } else if ((hipThreadIdx_y/4+k+16) < k_size) { - //lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (hipThreadIdx_y/4+k)); - //lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (hipThreadIdx_y/4+k+8)); - //lhs_pf2 =lhs.template loadPacket<Unaligned>(lhs_vert, (hipThreadIdx_y/4+k+16)); - lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k)); - lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k)); - lhs_pf0.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k)); - lhs_pf0.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k)); - lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8)); - lhs_pf1.y 
=lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+8)); - lhs_pf1.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8)); - lhs_pf1.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8)); - lhs_pf2.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+16)); - lhs_pf2.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+16)); - lhs_pf2.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+16)); - lhs_pf2.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+16)); - } else if ((hipThreadIdx_y/4+k+8) < k_size) { - //lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (hipThreadIdx_y/4+k)); - //lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (hipThreadIdx_y/4+k+8)); - lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k)); - lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k)); - lhs_pf0.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k)); - lhs_pf0.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k)); - lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8)); - lhs_pf1.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+8)); - lhs_pf1.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8)); - lhs_pf1.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8)); - } else if ((hipThreadIdx_y/4+k) < k_size) { - //lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (hipThreadIdx_y/4+k)); - lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k)); - lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k)); - lhs_pf0.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k)); - lhs_pf0.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k)); - } - } else { - // just CHECK_LHS_BOUNDARY - if (lhs_vert + 3 < m_size) { - if ((hipThreadIdx_y/4+k+24) < k_size) { - //lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (hipThreadIdx_y/4+k)); - //lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (hipThreadIdx_y/4+k+8)); - //lhs_pf2 =lhs.template loadPacket<Unaligned>(lhs_vert, (hipThreadIdx_y/4+k+16)); - //lhs_pf3 =lhs.template loadPacket<Unaligned>(lhs_vert, (hipThreadIdx_y/4+k+24)); - lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k)); - lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k)); - lhs_pf0.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k)); - lhs_pf0.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k)); - lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8)); - lhs_pf1.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+8)); - lhs_pf1.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8)); - lhs_pf1.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8)); - lhs_pf2.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+16)); - lhs_pf2.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+16)); - lhs_pf2.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+16)); - lhs_pf2.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+16)); - lhs_pf3.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+24)); - lhs_pf3.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+24)); - lhs_pf3.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+24)); - lhs_pf3.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+24)); - } else if ((hipThreadIdx_y/4+k+16) < k_size) { - //lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (hipThreadIdx_y/4+k)); - //lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (hipThreadIdx_y/4+k+8)); - //lhs_pf2 =lhs.template loadPacket<Unaligned>(lhs_vert, (hipThreadIdx_y/4+k+16)); - lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k)); - lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k)); - lhs_pf0.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k)); - lhs_pf0.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k)); - lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8)); - lhs_pf1.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+8)); - lhs_pf1.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8)); - lhs_pf1.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8)); - lhs_pf2.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+16)); - lhs_pf2.y =lhs(lhs_vert + 1, 
(hipThreadIdx_y/4+k+16)); - lhs_pf2.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+16)); - lhs_pf2.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+16)); - } else if ((hipThreadIdx_y/4+k+8) < k_size) { - //lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (hipThreadIdx_y/4+k)); - //lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (hipThreadIdx_y/4+k+8)); - lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k)); - lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k)); - lhs_pf0.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k)); - lhs_pf0.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k)); - lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8)); - lhs_pf1.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+8)); - lhs_pf1.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8)); - lhs_pf1.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8)); - } else if ((hipThreadIdx_y/4+k) < k_size) { - //lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (hipThreadIdx_y/4+k)); - lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k)); - lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k)); - lhs_pf0.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k)); - lhs_pf0.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k)); - } - } else if (lhs_vert + 2 < m_size) { - if ((hipThreadIdx_y/4+k+24) < k_size) { - lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k)); - lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k)); - lhs_pf0.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k)); - lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8)); - lhs_pf1.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+8)); - lhs_pf1.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8)); - lhs_pf2.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+16)); - lhs_pf2.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+16)); - lhs_pf2.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+16)); - lhs_pf3.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+24)); - lhs_pf3.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+24)); - lhs_pf3.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+24)); - } else if ((hipThreadIdx_y/4+k+16) < k_size) { - lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k)); - lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k)); - lhs_pf0.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k)); - lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8)); - lhs_pf1.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+8)); - lhs_pf1.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8)); - lhs_pf2.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+16)); - lhs_pf2.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+16)); - lhs_pf2.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+16)); - } else if ((hipThreadIdx_y/4+k+8) < k_size) { - lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k)); - lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k)); - lhs_pf0.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k)); - lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8)); - lhs_pf1.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+8)); - lhs_pf1.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8)); - } else if ((hipThreadIdx_y/4+k) < k_size) { - lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k)); - lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k)); - lhs_pf0.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k)); - } - } else if (lhs_vert + 1 < m_size) { - if ((hipThreadIdx_y/4+k+24) < k_size) { - lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k)); - lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k)); - lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8)); - lhs_pf1.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+8)); - lhs_pf2.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+16)); - lhs_pf2.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+16)); - lhs_pf3.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+24)); - lhs_pf3.y =lhs(lhs_vert + 1, 
(hipThreadIdx_y/4+k+24)); - } else if ((hipThreadIdx_y/4+k+16) < k_size) { - lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k)); - lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k)); - lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8)); - lhs_pf1.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+8)); - lhs_pf2.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+16)); - lhs_pf2.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+16)); - } else if ((hipThreadIdx_y/4+k+8) < k_size) { - lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k)); - lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k)); - lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8)); - lhs_pf1.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+8)); - } else if ((hipThreadIdx_y/4+k) < k_size) { - lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k)); - lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k)); - } - } else if (lhs_vert < m_size) { - if ((hipThreadIdx_y/4+k+24) < k_size) { - lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k)); - lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8)); - lhs_pf2.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+16)); - lhs_pf3.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+24)); - } else if ((hipThreadIdx_y/4+k+16) < k_size) { - lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k)); - lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8)); - lhs_pf2.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+16)); - } else if ((hipThreadIdx_y/4+k+8) < k_size) { - lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k)); - lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8)); - } else if ((hipThreadIdx_y/4+k) < k_size) { - lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k)); - } - } - } - __syncthreads(); - Index rhs_vert = k+hipThreadIdx_x*4; - Index rhs_horiz0 = hipThreadIdx_y*2+base_n; - Index rhs_horiz1 = hipThreadIdx_y*2+1+base_n; - if (!CHECK_RHS_BOUNDARY) { - if ((rhs_vert + 3) < k_size) { - // just CHECK_RHS_BOUNDARY - //rhs_pf0 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz0); - //rhs_pf1 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz1); - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); - rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); - rhs_pf0.w = rhs(rhs_vert + 3, rhs_horiz0); - rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); - rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); - rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1); - rhs_pf1.w = rhs(rhs_vert + 3, rhs_horiz1); - } else if (rhs_vert + 2 < k_size) { - // just CHECK_RHS_BOUNDARY - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); - rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); - rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); - rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); - rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1); - } else if (rhs_vert + 1 < k_size) { - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); - rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); - rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); - } else if (rhs_vert < k_size) { - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); - } - } else { - if (rhs_horiz1 < n_size) { - if ((rhs_vert + 3) < k_size) { - // just CHECK_RHS_BOUNDARY - //rhs_pf0 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz0); - //rhs_pf1 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz1); - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); - rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); - rhs_pf0.w = rhs(rhs_vert + 3, rhs_horiz0); - rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); - rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); - rhs_pf1.z = rhs(rhs_vert 
+ 2, rhs_horiz1); - rhs_pf1.w = rhs(rhs_vert + 3, rhs_horiz1); - } else if (rhs_vert + 2 < k_size) { - // just CHECK_RHS_BOUNDARY - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); - rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); - rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); - rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); - rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1); - } else if (k+hipThreadIdx_x*4 + 1 < k_size) { - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); - rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); - rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); - } else if (k+hipThreadIdx_x*4 < k_size) { - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); - } - } else if (rhs_horiz0 < n_size) { - if ((rhs_vert + 3) < k_size) { - // just CHECK_RHS_BOUNDARY - //rhs_pf0 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz0); - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); - rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); - rhs_pf0.w = rhs(rhs_vert + 3, rhs_horiz0); - } else if ((rhs_vert + 2) < k_size) { - // just CHECK_RHS_BOUNDARY - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); - rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); - } else if ((rhs_vert + 1) < k_size) { - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); - } else if (rhs_vert < k_size) { - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - } - } - } - __syncthreads(); - // Loaded. Do computation - // Row 0 -> times (0, 4, 8, .. 28) for features 0, 1. - // Row 1 -> times (0, 4, 8, .. 28) for features 2, 3. - // .. - // Row 31 -> times (0, 4, 8, .. 28) for features 62, 63 - rhs_shmem2[hipThreadIdx_y][hipThreadIdx_x] = make_float2(rhs_pf0.x, rhs_pf1.x); - // Row 32 -> times (1, 5, 9, .. 29) for features 0, 1. - // Row 33 -> times (1, 5, 9, .. 29) for features 2, 3. - // .. - rhs_shmem2[hipThreadIdx_y+32][hipThreadIdx_x] = make_float2(rhs_pf0.y, rhs_pf1.y); - // Row 64 -> times (2, 6, 10, .. 30) for features 0, 1. - // Row 65 -> times (2, 6, 10, .. 30) for features 2, 3. - rhs_shmem2[hipThreadIdx_y+64][hipThreadIdx_x] = make_float2(rhs_pf0.z, rhs_pf1.z); - // Row 96 -> times (3, 7, 11, .. 31) for features 0, 1. - // Row 97 -> times (3, 7, 11, .. 31) for features 2, 3. - rhs_shmem2[hipThreadIdx_y+96][hipThreadIdx_x] = make_float2(rhs_pf0.w, rhs_pf1.w); - - // LHS. - // Row 0 (time 0) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) .. (124, 125) - // Row 1 (time 1) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) .. (124, 125) - // ... - // Row 8 (time 0) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63) .. (126, 127) - // Row 15 (time 7) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63) .. 
(126, 127) - - -#define add_vals(a_feat1, a_feat2, f1, f2, f3, f4)\ - results[0].x += a_feat1.x * f1.x;\ - results[1].x += a_feat1.x * f1.y;\ - results[2].x += a_feat1.x * f2.x;\ - results[3].x += a_feat1.x * f2.y;\ - results[4].x += a_feat1.x * f3.x;\ - results[5].x += a_feat1.x * f3.y;\ - results[6].x += a_feat1.x * f4.x;\ - results[7].x += a_feat1.x * f4.y;\ -\ - results[0].y += a_feat1.y * f1.x;\ - results[1].y += a_feat1.y * f1.y;\ - results[2].y += a_feat1.y * f2.x;\ - results[3].y += a_feat1.y * f2.y;\ - results[4].y += a_feat1.y * f3.x;\ - results[5].y += a_feat1.y * f3.y;\ - results[6].y += a_feat1.y * f4.x;\ - results[7].y += a_feat1.y * f4.y;\ -\ - results[0].z += a_feat2.x * f1.x;\ - results[1].z += a_feat2.x * f1.y;\ - results[2].z += a_feat2.x * f2.x;\ - results[3].z += a_feat2.x * f2.y;\ - results[4].z += a_feat2.x * f3.x;\ - results[5].z += a_feat2.x * f3.y;\ - results[6].z += a_feat2.x * f4.x;\ - results[7].z += a_feat2.x * f4.y;\ -\ - results[0].w += a_feat2.y * f1.x;\ - results[1].w += a_feat2.y * f1.y;\ - results[2].w += a_feat2.y * f2.x;\ - results[3].w += a_feat2.y * f2.y;\ - results[4].w += a_feat2.y * f3.x;\ - results[5].w += a_feat2.y * f3.y;\ - results[6].w += a_feat2.y * f4.x;\ - results[7].w += a_feat2.y * f4.y;\ - - lhs_shmem2[hipThreadIdx_y/4][hipThreadIdx_x+(hipThreadIdx_y%4)*8] = make_float2(lhs_pf0.x, lhs_pf0.y); - lhs_shmem2[hipThreadIdx_y/4+8][hipThreadIdx_x+(hipThreadIdx_y%4)*8] = make_float2(lhs_pf1.x, lhs_pf1.y); - lhs_shmem2[hipThreadIdx_y/4+16][hipThreadIdx_x+(hipThreadIdx_y%4)*8] = make_float2(lhs_pf2.x, lhs_pf2.y); - lhs_shmem2[hipThreadIdx_y/4+24][hipThreadIdx_x+(hipThreadIdx_y%4)*8] = make_float2(lhs_pf3.x, lhs_pf3.y); - - lhs_shmem2[hipThreadIdx_y/4 + 32][hipThreadIdx_x+(hipThreadIdx_y%4)*8] = make_float2(lhs_pf0.z, lhs_pf0.w); - lhs_shmem2[hipThreadIdx_y/4 + 40][hipThreadIdx_x+(hipThreadIdx_y%4)*8] = make_float2(lhs_pf1.z, lhs_pf1.w); - lhs_shmem2[hipThreadIdx_y/4 + 48][hipThreadIdx_x+(hipThreadIdx_y%4)*8] = make_float2(lhs_pf2.z, lhs_pf2.w); - lhs_shmem2[hipThreadIdx_y/4 + 56][hipThreadIdx_x+(hipThreadIdx_y%4)*8] = make_float2(lhs_pf3.z, lhs_pf3.w); - - __syncthreads(); - - // Do the multiplies. - #pragma unroll - for (int koff = 0; koff < 32; koff ++) { - float2 a3 = lhs_shmem2[koff][hipThreadIdx_x + (hipThreadIdx_y % 4) * 8]; - float2 a4 = lhs_shmem2[koff + 32][hipThreadIdx_x + (hipThreadIdx_y % 4) * 8]; - - // first feature is at (hipThreadIdx_y/4) * 8 last is at start + 8. 
- int start_feature = (hipThreadIdx_y / 4) * 8; - - float2 br1 = rhs_shmem2[start_feature/2 + (koff % 4) * 32][koff/4]; - float2 br2 = rhs_shmem2[start_feature/2 + 1 + (koff % 4) * 32][koff/4]; - float2 br3 = rhs_shmem2[start_feature/2 + 2 + (koff % 4) * 32][koff/4]; - float2 br4 = rhs_shmem2[start_feature/2 + 3 + (koff % 4) * 32][koff/4]; - - add_vals(a3, a4, br1, br2, br3, br4) - } - __syncthreads(); - } // end loop over k - - - __syncthreads(); - Index horiz_base = (hipThreadIdx_y/4)*8+base_n; - if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) { - for (int i = 0; i < 8; i++) { - output(lhs_vert, horiz_base + i) = results[i].x; - output(lhs_vert + 1, horiz_base + i) = results[i].y; - output(lhs_vert + 2, horiz_base + i) = results[i].z; - output(lhs_vert + 3, horiz_base + i) = results[i].w; - } - } else if (!CHECK_RHS_BOUNDARY) { - if (lhs_vert + 3 < m_size) { - for (int i = 0; i < 8; i++) { - output(lhs_vert, horiz_base + i) = results[i].x; - output(lhs_vert + 1, horiz_base + i) = results[i].y; - output(lhs_vert + 2, horiz_base + i) = results[i].z; - output(lhs_vert + 3, horiz_base + i) = results[i].w; - } - } else if (lhs_vert + 2 < m_size) { - for (int i = 0; i < 8; i++) { - output(lhs_vert, horiz_base + i) = results[i].x; - output(lhs_vert + 1, horiz_base + i) = results[i].y; - output(lhs_vert + 2, horiz_base + i) = results[i].z; - } - } else if (lhs_vert + 1 < m_size) { - for (int i = 0; i < 8; i++) { - output(lhs_vert, horiz_base + i) = results[i].x; - output(lhs_vert + 1, horiz_base + i) = results[i].y; - } - } else if (lhs_vert < m_size) { - for (int i = 0; i < 8; i++) { - output(lhs_vert, horiz_base + i) = results[i].x; - } - } - } else if (!CHECK_LHS_BOUNDARY) { - // CHECK BOUNDARY_B - for (int i = 0; i < 8; i++) { - if (horiz_base + i < n_size) { - output(lhs_vert, horiz_base + i) = results[i].x; - output(lhs_vert + 1, horiz_base + i) = results[i].y; - output(lhs_vert + 2, horiz_base + i) = results[i].z; - output(lhs_vert + 3, horiz_base + i) = results[i].w; - } - } - } else { - // CHECK both boundaries. 
- for (int i = 0; i < 8; i++) { - if (horiz_base + i < n_size) { - if (lhs_vert < m_size) - output(lhs_vert, horiz_base + i) = results[i].x; - if (lhs_vert + 1 < m_size) - output(lhs_vert + 1, horiz_base + i) = results[i].y; - if (lhs_vert + 2 < m_size) - output(lhs_vert + 2, horiz_base + i) = results[i].z; - if (lhs_vert + 3 < m_size) - output(lhs_vert + 3, horiz_base + i) = results[i].w; - } - } - } -} - - -template<typename Index, typename LhsMapper, - typename RhsMapper, typename OutputMapper> -__global__ void -__launch_bounds__(256, 1) -EigenFloatContractionKernel(const LhsMapper lhs, const RhsMapper rhs, - const OutputMapper output, - const Index m_size, const Index n_size, const Index k_size) { - __shared__ float2 lhs_shmem[64*32]; - __shared__ float2 rhs_shmem[128*8]; - - typedef float2 LHS_MEM[64][32]; - typedef float2 RHS_MEM[128][8]; - - typedef float2 LHS_MEM16x16[32][16]; - typedef float2 RHS_MEM16x16[64][8]; - - const Index m_block_idx = hipBlockIdx_x; - const Index n_block_idx = hipBlockIdx_y; - - const Index base_m = 128 * m_block_idx; - const Index base_n = 64 * n_block_idx; - - bool check_rhs = (base_n + 63) >= n_size; - bool check_lhs128 = (base_m + 127) >= m_size; - - if (!check_rhs) { - if (!check_lhs128) { - // >= 128 rows left - EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, false, false>( - lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n); - } else { - EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, true, false>( - lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n); - } - } else { - if (!check_lhs128) { - // >= 128 rows left - EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, false, true>( - lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n); - } else { - EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, true, true>( - lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n); - } - } -} - -template<typename Index, typename LhsMapper, - typename RhsMapper, typename OutputMapper> -__global__ void -__launch_bounds__(256, 1) -EigenFloatContractionKernel16x16(const LhsMapper lhs, const RhsMapper rhs, - const OutputMapper output, - const Index m_size, const Index n_size, const Index k_size) { - __shared__ float2 lhs_shmem[32][16]; - __shared__ float2 rhs_shmem[64][8]; - - const Index m_block_idx = hipBlockIdx_x; - const Index n_block_idx = hipBlockIdx_y; - - const Index base_m = 64 * m_block_idx; - const Index base_n = 64 * n_block_idx; - - if (base_m + 63 < m_size) { - if (base_n + 63 < n_size) { - EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, false, false>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n); - } else { - EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, false, true>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n); - } - } else { - if (base_n + 63 < n_size) { - EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, true, false>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n); - } else { - EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, true, true>(lhs, rhs, output, lhs_shmem, 
rhs_shmem, m_size, n_size, k_size, base_m, base_n); - } - } -} - - -template<typename Indices, typename LeftArgType, typename RightArgType> -struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, GpuDevice> : - public TensorContractionEvaluatorBase<TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, GpuDevice> > { - - typedef GpuDevice Device; - - typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, Device> Self; - typedef TensorContractionEvaluatorBase<Self> Base; - - typedef TensorContractionOp<Indices, LeftArgType, RightArgType> XprType; - typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar; - typedef typename XprType::Index Index; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType<CoeffReturnType, GpuDevice>::type PacketReturnType; - - enum { - Layout = TensorEvaluator<LeftArgType, Device>::Layout, - }; - - // Most of the code is assuming that both input tensors are ColMajor. If the - // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS: - // If we want to compute A * B = C, where A is LHS and B is RHS, the code - // will pretend B is LHS and A is RHS. - typedef typename internal::conditional< - static_cast<int>(Layout) == static_cast<int>(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType; - typedef typename internal::conditional< - static_cast<int>(Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType; - - static const int LDims = - internal::array_size<typename TensorEvaluator<EvalLeftArgType, Device>::Dimensions>::value; - static const int RDims = - internal::array_size<typename TensorEvaluator<EvalRightArgType, Device>::Dimensions>::value; - static const int ContractDims = internal::array_size<Indices>::value; - - typedef array<Index, LDims> left_dim_mapper_t; - typedef array<Index, RDims> right_dim_mapper_t; - - typedef array<Index, ContractDims> contract_t; - typedef array<Index, LDims - ContractDims> left_nocontract_t; - typedef array<Index, RDims - ContractDims> right_nocontract_t; - - static const int NumDims = LDims + RDims - 2 * ContractDims; - - typedef DSizes<Index, NumDims> Dimensions; - - // typedefs needed in evalTo - typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar; - typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar; - - typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator; - typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator; - - typedef typename LeftEvaluator::Dimensions LeftDimensions; - typedef typename RightEvaluator::Dimensions RightDimensions; - - EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) : - Base(op, device) {} - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ~TensorEvaluator() {} - - // We need to redefine this method to make hipcc happy - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { - this->m_leftImpl.evalSubExprsIfNeeded(NULL); - this->m_rightImpl.evalSubExprsIfNeeded(NULL); - if (data) { - evalTo(data); - return false; - } else { - this->m_result = static_cast<Scalar *>(this->m_device.allocate(this->dimensions().TotalSize() * sizeof(Scalar))); - evalTo(this->m_result); - return true; - } - } - - void evalTo(Scalar* buffer) const { - if (this->m_lhs_inner_dim_contiguous) { - if (this->m_rhs_inner_dim_contiguous) { - if (this->m_rhs_inner_dim_reordered) { - evalTyped<true, true, 
true, Unaligned>(buffer); - } - else { - evalTyped<true, true, false, Unaligned>(buffer); - } - } - else { - if (this->m_rhs_inner_dim_reordered) { - evalTyped<true, false, true, Unaligned>(buffer); - } - else { - evalTyped<true, false, false, Unaligned>(buffer); - } - } - } - else { - if (this->m_rhs_inner_dim_contiguous) { - if (this->m_rhs_inner_dim_reordered) { - evalTyped<false, true, true, Unaligned>(buffer); - } - else { - evalTyped<false, true, false, Unaligned>(buffer); - } - } - else { - if (this->m_rhs_inner_dim_reordered) { - evalTyped<false, false, true, Unaligned>(buffer); - } - else { - evalTyped<false, false, false, Unaligned>(buffer); - } - } - } - } - - template <typename LhsScalar, typename RhsScalar, typename Index, typename LhsMapper, typename RhsMapper, typename OutputMapper> struct LaunchKernels { - static void Run(const LhsMapper& lhs, const RhsMapper& rhs, const OutputMapper& output, Index m, Index n, Index k, const GpuDevice& device) { - const Index m_blocks = (m + 63) / 64; - const Index n_blocks = (n + 63) / 64; - const dim3 num_blocks(m_blocks, n_blocks, 1); - const dim3 block_size(8, 8, 8); - hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenContractionKernel<Scalar, Index, LhsMapper, RhsMapper, OutputMapper>), - dim3(num_blocks), dim3(block_size), 0, device.stream(), lhs, rhs, output, m, n, k); - } - }; - - template <typename Index, typename LhsMapper, typename RhsMapper, typename OutputMapper> struct LaunchKernels<float, float, Index, LhsMapper, RhsMapper, OutputMapper> { - static void Run(const LhsMapper& lhs, const RhsMapper& rhs, const OutputMapper& output, Index m, Index n, Index k, const GpuDevice& device) { - if (m < 768 || n < 768) { - const Index m_blocks = (m + 63) / 64; - const Index n_blocks = (n + 63) / 64; - const dim3 num_blocks(m_blocks, n_blocks, 1); - const dim3 block_size(16, 16, 1); - hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenFloatContractionKernel16x16<Index, LhsMapper, RhsMapper, OutputMapper>), - dim3(num_blocks), dim3(block_size), 0, device.stream(), lhs, rhs, output, m, n, k); - } else { - const Index m_blocks = (m + 127) / 128; - const Index n_blocks = (n + 63) / 64; - const dim3 num_blocks(m_blocks, n_blocks, 1); - const dim3 block_size(8, 32, 1); - hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenFloatContractionKernel<Index, LhsMapper, RhsMapper, OutputMapper>), - dim3(num_blocks), dim3(block_size), 0, device.stream(), lhs, rhs, output, m, n, k); - } - } - }; - - template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment> - void evalTyped(Scalar* buffer) const { - // columns in left side, rows in right side - const Index k = this->m_k_size; - EIGEN_UNUSED_VARIABLE(k) - - // rows in left side - const Index m = this->m_i_size; - - // columns in right side - const Index n = this->m_j_size; - - // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar) - this->m_device.memset(buffer, 0, m * n * sizeof(Scalar)); - - typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs, - LeftEvaluator, left_nocontract_t, - contract_t, 4, - lhs_inner_dim_contiguous, - false, Unaligned> LhsMapper; - - typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs, - RightEvaluator, right_nocontract_t, - contract_t, 4, - rhs_inner_dim_contiguous, - rhs_inner_dim_reordered, Unaligned> RhsMapper; - - typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper; - - - // initialize data mappers - LhsMapper lhs(this->m_leftImpl, 
this->m_left_nocontract_strides, this->m_i_strides, - this->m_left_contracting_strides, this->m_k_strides); - - RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides, - this->m_right_contracting_strides, this->m_k_strides); - - OutputMapper output(buffer, m); - - setHipSharedMemConfig(hipSharedMemBankSizeEightByte); - LaunchKernels<LhsScalar, RhsScalar, Index, LhsMapper, RhsMapper, OutputMapper>::Run(lhs, rhs, output, m, n, k, this->m_device); - } -}; - -} // end namespace Eigen - -#endif // EIGEN_USE_GPU and EIGEN_HIPCC -#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_HIP_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionHip.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionHip.h deleted file mode 100644 index ba9971050..000000000 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionHip.h +++ /dev/null @@ -1,1119 +0,0 @@ -//#include "hip/hip_runtime.h" -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H -#define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H - -namespace Eigen { - -/** \class TensorConvolution - * \ingroup CXX11_Tensor_Module - * - * \brief Tensor convolution class. - * - * - */ -namespace internal { - -template <typename Index, typename InputDims, int NumKernelDims, int Layout> -class IndexMapper { - public: - IndexMapper(const InputDims& input_dims, const array<Index, NumKernelDims>& kernel_dims, - const array<Index, NumKernelDims>& indices) { - - array<Index, NumDims> dimensions = input_dims; - for (int i = 0; i < NumKernelDims; ++i) { - const Index index = indices[i]; - const Index input_dim = input_dims[index]; - const Index kernel_dim = kernel_dims[i]; - const Index result_dim = input_dim - kernel_dim + 1; - dimensions[index] = result_dim; - } - - array<Index, NumDims> inputStrides; - array<Index, NumDims> outputStrides; - if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { - inputStrides[0] = 1; - outputStrides[0] = 1; - for (int i = 1; i < NumDims; ++i) { - inputStrides[i] = inputStrides[i-1] * input_dims[i-1]; - outputStrides[i] = outputStrides[i-1] * dimensions[i-1]; - } - } else { - inputStrides[NumDims - 1] = 1; - outputStrides[NumDims - 1] = 1; - for (int i = static_cast<int>(NumDims) - 2; i >= 0; --i) { - inputStrides[i] = inputStrides[i + 1] * input_dims[i + 1]; - outputStrides[i] = outputStrides[i + 1] * dimensions[i + 1]; - } - } - - array<Index, NumDims> hipInputDimensions; - array<Index, NumDims> hipOutputDimensions; - array<Index, NumDims> tmp = dimensions; - array<Index, NumDims> ordering; - const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) - ? 0 - : NumDims - NumKernelDims; - for (int i = 0; i < NumKernelDims; ++i) { - const Index index = i + offset; - ordering[index] = indices[i]; - tmp[indices[i]] = -1; - hipInputDimensions[index] = input_dims[indices[i]]; - hipOutputDimensions[index] = dimensions[indices[i]]; - } - - int written = static_cast<int>(Layout) == static_cast<int>(ColMajor) - ? 
NumKernelDims - : 0; - for (int i = 0; i < NumDims; ++i) { - if (tmp[i] >= 0) { - ordering[written] = i; - hipInputDimensions[written] = input_dims[i]; - hipOutputDimensions[written] = dimensions[i]; - ++written; - } - } - - for (int i = 0; i < NumDims; ++i) { - m_inputStrides[i] = inputStrides[ordering[i]]; - m_outputStrides[i] = outputStrides[ordering[i]]; - } - - if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { - for (int i = 0; i < NumDims; ++i) { - if (i > NumKernelDims) { - m_hipInputStrides[i] = - m_hipInputStrides[i - 1] * hipInputDimensions[i - 1]; - m_hipOutputStrides[i] = - m_hipOutputStrides[i - 1] * hipOutputDimensions[i - 1]; - } else { - m_hipInputStrides[i] = 1; - m_hipOutputStrides[i] = 1; - } - } - } else { - for (int i = NumDims - 1; i >= 0; --i) { - if (i + 1 < offset) { - m_hipInputStrides[i] = - m_hipInputStrides[i + 1] * hipInputDimensions[i + 1]; - m_hipOutputStrides[i] = - m_hipOutputStrides[i + 1] * hipOutputDimensions[i + 1]; - } else { - m_hipInputStrides[i] = 1; - m_hipOutputStrides[i] = 1; - } - } - } - } - - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapHipInputPlaneToTensorInputOffset(Index p) const { - Index inputIndex = 0; - if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { - for (int d = NumDims - 1; d > NumKernelDims; --d) { - const Index idx = p / m_hipInputStrides[d]; - inputIndex += idx * m_inputStrides[d]; - p -= idx * m_hipInputStrides[d]; - } - inputIndex += p * m_inputStrides[NumKernelDims]; - } else { - std::ptrdiff_t limit = 0; - if (NumKernelDims < NumDims) { - limit = NumDims - NumKernelDims - 1; - } - for (int d = 0; d < limit; ++d) { - const Index idx = p / m_hipInputStrides[d]; - inputIndex += idx * m_inputStrides[d]; - p -= idx * m_hipInputStrides[d]; - } - inputIndex += p * m_inputStrides[limit]; - } - return inputIndex; - } - - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapHipOutputPlaneToTensorOutputOffset(Index p) const { - Index outputIndex = 0; - if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { - for (int d = NumDims - 1; d > NumKernelDims; --d) { - const Index idx = p / m_hipOutputStrides[d]; - outputIndex += idx * m_outputStrides[d]; - p -= idx * m_hipOutputStrides[d]; - } - outputIndex += p * m_outputStrides[NumKernelDims]; - } else { - std::ptrdiff_t limit = 0; - if (NumKernelDims < NumDims) { - limit = NumDims - NumKernelDims - 1; - } - for (int d = 0; d < limit; ++d) { - const Index idx = p / m_hipOutputStrides[d]; - outputIndex += idx * m_outputStrides[d]; - p -= idx * m_hipOutputStrides[d]; - } - outputIndex += p * m_outputStrides[limit]; - } - return outputIndex; - } - - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapHipInputKernelToTensorInputOffset(Index i) const { - const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) - ? 0 - : NumDims - NumKernelDims; - return i * m_inputStrides[offset]; - } - - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapHipOutputKernelToTensorOutputOffset(Index i) const { - const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) - ? 0 - : NumDims - NumKernelDims; - return i * m_outputStrides[offset]; - } - - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapHipInputKernelToTensorInputOffset(Index i, Index j) const { - const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) - ? 
0 - : NumDims - NumKernelDims; - return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1]; - } - - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapHipOutputKernelToTensorOutputOffset(Index i, Index j) const { - const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) - ? 0 - : NumDims - NumKernelDims; - return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1]; - } - - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapHipInputKernelToTensorInputOffset(Index i, Index j, Index k) const { - const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) - ? 0 - : NumDims - NumKernelDims; - return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1] + - k * m_inputStrides[offset + 2]; - } - - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapHipOutputKernelToTensorOutputOffset(Index i, Index j, Index k) const { - const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) - ? 0 - : NumDims - NumKernelDims; - return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1] + - k * m_outputStrides[offset + 2]; - } - - private: - static const int NumDims = internal::array_size<InputDims>::value; - array<Index, NumDims> m_inputStrides; - array<Index, NumDims> m_outputStrides; - array<Index, NumDims> m_hipInputStrides; - array<Index, NumDims> m_hipOutputStrides; -}; - - - -template<typename Dimensions, typename InputXprType, typename KernelXprType> -struct traits<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> > -{ - // Type promotion to handle the case where the types of the lhs and the rhs are different. - typedef typename promote_storage_type<typename InputXprType::Scalar, - typename KernelXprType::Scalar>::ret Scalar; - typedef typename promote_storage_type<typename traits<InputXprType>::StorageKind, - typename traits<KernelXprType>::StorageKind>::ret StorageKind; - typedef typename promote_index_type<typename traits<InputXprType>::Index, - typename traits<KernelXprType>::Index>::type Index; - typedef typename InputXprType::Nested LhsNested; - typedef typename KernelXprType::Nested RhsNested; - typedef typename remove_reference<LhsNested>::type _LhsNested; - typedef typename remove_reference<RhsNested>::type _RhsNested; - static const int NumDimensions = traits<InputXprType>::NumDimensions; - static const int Layout = traits<InputXprType>::Layout; - - enum { - Flags = 0 - }; -}; - -template<typename Dimensions, typename InputXprType, typename KernelXprType> -struct eval<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, Eigen::Dense> -{ - typedef const TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>& type; -}; - -template<typename Dimensions, typename InputXprType, typename KernelXprType> -struct nested<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, 1, typename eval<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> >::type> -{ - typedef TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> type; -}; - -} // end namespace internal - - - -template<typename Indices, typename InputXprType, typename KernelXprType> -class TensorConvolutionOp : public TensorBase<TensorConvolutionOp<Indices, InputXprType, KernelXprType>, ReadOnlyAccessors> -{ - public: - typedef typename Eigen::internal::traits<TensorConvolutionOp>::Scalar Scalar; - typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; - typedef typename internal::promote_storage_type<typename InputXprType::CoeffReturnType, - typename KernelXprType::CoeffReturnType>::ret CoeffReturnType; - typedef 
typename Eigen::internal::nested<TensorConvolutionOp>::type Nested; - typedef typename Eigen::internal::traits<TensorConvolutionOp>::StorageKind StorageKind; - typedef typename Eigen::internal::traits<TensorConvolutionOp>::Index Index; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConvolutionOp(const InputXprType& input, const KernelXprType& kernel, const Indices& dims) - : m_input_xpr(input), m_kernel_xpr(kernel), m_indices(dims) {} - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const Indices& indices() const { return m_indices; } - - /** \returns the nested expressions */ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const typename internal::remove_all<typename InputXprType::Nested>::type& - inputExpression() const { return m_input_xpr; } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const typename internal::remove_all<typename KernelXprType::Nested>::type& - kernelExpression() const { return m_kernel_xpr; } - - protected: - typename InputXprType::Nested m_input_xpr; - typename KernelXprType::Nested m_kernel_xpr; - const Indices m_indices; -}; - - -template<typename Indices, typename InputArgType, typename KernelArgType, typename Device> -struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, Device> -{ - typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType; - - static const int NumDims = internal::array_size<typename TensorEvaluator<InputArgType, Device>::Dimensions>::value; - static const int NumKernelDims = internal::array_size<Indices>::value; - typedef typename XprType::Index Index; - typedef DSizes<Index, NumDims> Dimensions; - - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; - - enum { - IsAligned = TensorEvaluator<InputArgType, Device>::IsAligned & TensorEvaluator<KernelArgType, Device>::IsAligned, - PacketAccess = TensorEvaluator<InputArgType, Device>::PacketAccess & TensorEvaluator<KernelArgType, Device>::PacketAccess, - Layout = TensorEvaluator<InputArgType, Device>::Layout, - CoordAccess = false, // to be implemented - RawAccess = false - }; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_kernelArg(op.kernelExpression()), m_kernel(NULL), m_local_kernel(false), m_device(device) - { - EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); - - const typename TensorEvaluator<InputArgType, Device>::Dimensions& input_dims = m_inputImpl.dimensions(); - const typename TensorEvaluator<KernelArgType, Device>::Dimensions& kernel_dims = m_kernelImpl.dimensions(); - - if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { - m_inputStride[0] = 1; - for (int i = 1; i < NumDims; ++i) { - m_inputStride[i] = m_inputStride[i - 1] * input_dims[i - 1]; - } - } else { - m_inputStride[NumDims - 1] = 1; - for (int i = NumDims - 2; i >= 0; --i) { - m_inputStride[i] = m_inputStride[i + 1] * input_dims[i + 1]; - } - } - - m_dimensions = m_inputImpl.dimensions(); - if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { - for (int i = 0; i < NumKernelDims; ++i) { - const Index index = op.indices()[i]; - const Index input_dim = input_dims[index]; - const Index 
kernel_dim = kernel_dims[i]; - const Index result_dim = input_dim - kernel_dim + 1; - m_dimensions[index] = result_dim; - if (i > 0) { - m_kernelStride[i] = m_kernelStride[i - 1] * kernel_dims[i - 1]; - } else { - m_kernelStride[0] = 1; - } - m_indexStride[i] = m_inputStride[index]; - } - - m_outputStride[0] = 1; - for (int i = 1; i < NumDims; ++i) { - m_outputStride[i] = m_outputStride[i - 1] * m_dimensions[i - 1]; - } - } else { - for (int i = NumKernelDims - 1; i >= 0; --i) { - const Index index = op.indices()[i]; - const Index input_dim = input_dims[index]; - const Index kernel_dim = kernel_dims[i]; - const Index result_dim = input_dim - kernel_dim + 1; - m_dimensions[index] = result_dim; - if (i < NumKernelDims - 1) { - m_kernelStride[i] = m_kernelStride[i + 1] * kernel_dims[i + 1]; - } else { - m_kernelStride[NumKernelDims - 1] = 1; - } - m_indexStride[i] = m_inputStride[index]; - } - - m_outputStride[NumDims - 1] = 1; - for (int i = NumDims - 2; i >= 0; --i) { - m_outputStride[i] = m_outputStride[i + 1] * m_dimensions[i + 1]; - } - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ~TensorEvaluator() {} - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { - m_inputImpl.evalSubExprsIfNeeded(NULL); - preloadKernel(); - return true; - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { - m_inputImpl.cleanup(); - if (m_local_kernel) { - m_device.deallocate((void*)m_kernel); - m_local_kernel = false; - } - m_kernel = NULL; - } - - void evalTo(typename XprType::Scalar* buffer) { - evalSubExprsIfNeeded(NULL); - for (int i = 0; i < dimensions().TotalSize(); ++i) { - buffer[i] += coeff(i); - } - cleanup(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const - { - CoeffReturnType result = CoeffReturnType(0); - convolve(firstInput(index), 0, NumKernelDims-1, result); - return result; - } - - template<int LoadMode> - EIGEN_DEVICE_FUNC PacketReturnType packet(const Index index) const - { - Index indices[2] = {index, index+PacketSize-1}; - Index startInputs[2] = {0, 0}; - if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { - for (int i = NumDims - 1; i > 0; --i) { - const Index idx0 = indices[0] / m_outputStride[i]; - const Index idx1 = indices[1] / m_outputStride[i]; - startInputs[0] += idx0 * m_inputStride[i]; - startInputs[1] += idx1 * m_inputStride[i]; - indices[0] -= idx0 * m_outputStride[i]; - indices[1] -= idx1 * m_outputStride[i]; - } - } else { - for (int i = 0; i < NumDims - 1; ++i) { - const Index idx0 = indices[0] / m_outputStride[i]; - const Index idx1 = indices[1] / m_outputStride[i]; - startInputs[0] += idx0 * m_inputStride[i]; - startInputs[1] += idx1 * m_inputStride[i]; - indices[0] -= idx0 * m_outputStride[i]; - indices[1] -= idx1 * m_outputStride[i]; - } - } - startInputs[0] += indices[0]; - startInputs[1] += indices[1]; - - if (startInputs[1]-startInputs[0] == PacketSize-1) { - PacketReturnType result = internal::pset1<PacketReturnType>(0); - convolvePacket(startInputs[0], 0, NumKernelDims-1, result); - return result; - } else { - EIGEN_ALIGN_MAX Scalar data[PacketSize]; - data[0] = Scalar(0); - convolve(startInputs[0], 0, NumKernelDims-1, data[0]); - for (int i = 1; i < PacketSize-1; ++i) { - data[i] = Scalar(0); - convolve(firstInput(index+i), 0, NumKernelDims-1, data[i]); - } - data[PacketSize-1] = Scalar(0); - convolve(startInputs[1], 0, NumKernelDims-1, data[PacketSize-1]); - return 
internal::pload<PacketReturnType>(data); - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost - costPerCoeff(bool vectorized) const { - const double kernel_size = m_kernelImpl.dimensions().TotalSize(); - // We ignore the use of fused multiply-add. - const double convolve_compute_cost = - TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>(); - const double firstIndex_compute_cost = - NumDims * - (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() + - TensorOpCost::DivCost<Index>()); - return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) + - kernel_size * (m_inputImpl.costPerCoeff(vectorized) + - m_kernelImpl.costPerCoeff(vectorized) + - TensorOpCost(0, 0, convolve_compute_cost, vectorized, - PacketSize)); - } - - EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } - - private: - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const { - Index startInput = 0; - if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { - for (int i = NumDims - 1; i > 0; --i) { - const Index idx = index / m_outputStride[i]; - startInput += idx * m_inputStride[i]; - index -= idx * m_outputStride[i]; - } - } else { - for (int i = 0; i < NumDims - 1; ++i) { - const Index idx = index / m_outputStride[i]; - startInput += idx * m_inputStride[i]; - index -= idx * m_outputStride[i]; - } - } - startInput += index; - return startInput; - } - - EIGEN_DEVICE_FUNC void convolve(Index firstIndex, Index firstKernel, int DimIndex, CoeffReturnType& accum) const { - for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) { - const Index input = firstIndex + j * m_indexStride[DimIndex]; - const Index kernel = firstKernel + j * m_kernelStride[DimIndex]; - if (DimIndex > 0) { - convolve(input, kernel, DimIndex-1, accum); - } else { - accum += m_inputImpl.coeff(input) * m_kernel[kernel]; - } - } - } - - template <typename Packet> - EIGEN_DEVICE_FUNC void convolvePacket(Index firstIndex, Index firstKernel, int DimIndex, Packet& accum) const { - for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) { - const Index input = firstIndex + j * m_indexStride[DimIndex]; - const Index kernel = firstKernel + j * m_kernelStride[DimIndex]; - if (DimIndex > 0) { - convolvePacket(input, kernel, DimIndex-1, accum); - } else { - accum = internal::pmadd<Packet>(m_inputImpl.template packet<Unaligned>(input), internal::pset1<Packet>(m_kernel[kernel]), accum); - } - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() { - // Don't make a local copy of the kernel unless we have to (i.e. 
it's an - // expression that needs to be evaluated) - const Scalar* in_place = m_kernelImpl.data(); - if (in_place) { - m_kernel = in_place; - m_local_kernel = false; - } else { - size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar); - Scalar* local = (Scalar*)m_device.allocate(kernel_sz); - typedef TensorEvalToOp<const KernelArgType> EvalTo; - EvalTo evalToTmp(local, m_kernelArg); - const bool PacketAccess = internal::IsVectorizable<Device, KernelArgType>::value; - internal::TensorExecutor<const EvalTo, Device, PacketAccess>::run(evalToTmp, m_device); - - m_kernel = local; - m_local_kernel = true; - } - } - - array<Index, NumDims> m_inputStride; - array<Index, NumDims> m_outputStride; - - array<Index, NumKernelDims> m_indexStride; - array<Index, NumKernelDims> m_kernelStride; - TensorEvaluator<InputArgType, Device> m_inputImpl; - TensorEvaluator<KernelArgType, Device> m_kernelImpl; - Dimensions m_dimensions; - - KernelArgType m_kernelArg; - const Scalar* m_kernel; - bool m_local_kernel; - const Device& m_device; -}; - - - - -// Use an optimized implementation of the evaluation code for GPUs whenever possible. -#if defined(EIGEN_USE_GPU) && defined(EIGEN_HIPCC) - -template <int StaticKernelSize> -struct GetKernelSize { - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator() (const int /*kernelSize*/) const { - return StaticKernelSize; - } -}; -template <> -struct GetKernelSize<Dynamic> { - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator() (const int kernelSize) const { - return kernelSize; - } -}; - -template <typename InputEvaluator, typename Index, typename InputDims, - int StaticKernelSize> -__global__ void EigenConvolutionKernel1D( - InputEvaluator eval, - const internal::IndexMapper<Index, InputDims, 1, InputEvaluator::Layout> - indexMapper, - const float* __restrict kernel, const int numPlanes, const int numX, - const int maxX, const int kernelSize, float* buffer) { - HIP_DYNAMIC_SHARED( float, s) - - const int first_x = hipBlockIdx_x * maxX; - const int last_x = (first_x + maxX < numX ? 
first_x + maxX : numX) - 1; - const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSize>()(kernelSize); - const int num_x_output = last_x - first_x + 1; - - const int first_plane = hipBlockIdx_y * hipBlockDim_y; - const int plane_stride = hipBlockDim_y * hipGridDim_y; - - for (int p = first_plane + hipThreadIdx_y; p < numPlanes; p += plane_stride) { - // Load inputs to shared memory - const int plane_input_offset = indexMapper.mapHipInputPlaneToTensorInputOffset(p); - const int plane_kernel_offset = hipThreadIdx_y * num_x_input; - #pragma unroll - for (int i = hipThreadIdx_x; i < num_x_input; i += hipBlockDim_x) { - const int tensor_index = plane_input_offset + indexMapper.mapHipInputKernelToTensorInputOffset(i+first_x); - s[i + plane_kernel_offset] = eval.coeff(tensor_index); - } - - __syncthreads(); - - // Compute the convolution - const int plane_output_offset = indexMapper.mapHipOutputPlaneToTensorOutputOffset(p); - - #pragma unroll - for (int i = hipThreadIdx_x; i < num_x_output; i += hipBlockDim_x) { - const int kernel_offset = plane_kernel_offset + i; - float result = 0.0f; - #pragma unroll - for (int k = 0; k < GetKernelSize<StaticKernelSize>()(kernelSize); ++k) { - result += s[k + kernel_offset] * kernel[k]; - } - const int tensor_index = plane_output_offset + indexMapper.mapHipOutputKernelToTensorOutputOffset(i+first_x); - buffer[tensor_index] = result; - } - __syncthreads(); - } -}; - -template <typename InputEvaluator, typename Index, typename InputDims, - int StaticKernelSizeX, int StaticKernelSizeY> -__global__ void EigenConvolutionKernel2D( - InputEvaluator eval, - const internal::IndexMapper<Index, InputDims, 2, InputEvaluator::Layout> - indexMapper, - const float* __restrict kernel, const int numPlanes, const int numX, - const int maxX, const int numY, const int maxY, const int kernelSizeX, - const int kernelSizeY, float* buffer) { - HIP_DYNAMIC_SHARED( float, s) - - const int first_x = hipBlockIdx_x * maxX; - const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1; - const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSizeX>()(kernelSizeX); - const int num_x_output = last_x - first_x + 1; - - const int first_y = hipBlockIdx_y * maxY; - const int last_y = (first_y + maxY < numY ? 
first_y + maxY : numY) - 1; - const int num_y_input = last_y - first_y + GetKernelSize<StaticKernelSizeY>()(kernelSizeY); - const int num_y_output = last_y - first_y + 1; - - const int first_plane = hipBlockIdx_z * hipBlockDim_z; - const int plane_stride = hipBlockDim_z * hipGridDim_z; - - for (int p = first_plane + hipThreadIdx_z; p < numPlanes; p += plane_stride) { - - const int plane_input_offset = indexMapper.mapHipInputPlaneToTensorInputOffset(p); - const int plane_kernel_offset = hipThreadIdx_z * num_y_input; - - // Load inputs to shared memory - #pragma unroll - for (int j = hipThreadIdx_y; j < num_y_input; j += hipBlockDim_y) { - const int input_offset = num_x_input * (j + plane_kernel_offset); - #pragma unroll - for (int i = hipThreadIdx_x; i < num_x_input; i += hipBlockDim_x) { - const int tensor_index = plane_input_offset + indexMapper.mapHipInputKernelToTensorInputOffset(i+first_x, j+first_y); - s[i + input_offset] = eval.coeff(tensor_index); - } - } - - __syncthreads(); - - // Convolution - const int plane_output_offset = indexMapper.mapHipOutputPlaneToTensorOutputOffset(p); - - #pragma unroll - for (int j = hipThreadIdx_y; j < num_y_output; j += hipBlockDim_y) { - #pragma unroll - for (int i = hipThreadIdx_x; i < num_x_output; i += hipBlockDim_x) { - float result = 0.0f; - #pragma unroll - for (int l = 0; l < GetKernelSize<StaticKernelSizeY>()(kernelSizeY); ++l) { - const int kernel_offset = kernelSizeX * l; - const int input_offset = i + num_x_input * (j + l + plane_kernel_offset); - #pragma unroll - for (int k = 0; k < GetKernelSize<StaticKernelSizeX>()(kernelSizeX); ++k) { - result += s[k + input_offset] * kernel[k + kernel_offset]; - } - } - const int tensor_index = plane_output_offset + indexMapper.mapHipOutputKernelToTensorOutputOffset(i+first_x, j+first_y); - buffer[tensor_index] = result; - } - } - - __syncthreads(); - } -}; - -template <typename InputEvaluator, typename Index, typename InputDims> -__global__ void EigenConvolutionKernel3D( - InputEvaluator eval, - const internal::IndexMapper<Index, InputDims, 3, InputEvaluator::Layout> - indexMapper, - const float* __restrict kernel, const size_t numPlanes, const size_t numX, - const size_t maxX, const size_t numY, const size_t maxY, const size_t numZ, - const size_t maxZ, const size_t kernelSizeX, const size_t kernelSizeY, - const size_t kernelSizeZ, float* buffer) { - HIP_DYNAMIC_SHARED( float, s) - - // Load inputs to shared memory - const int first_x = hipBlockIdx_x * maxX; - const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1; - const int num_x_input = last_x - first_x + kernelSizeX; - - const int first_y = hipBlockIdx_y * maxY; - const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1; - const int num_y_input = last_y - first_y + kernelSizeY; - - const int first_z = hipBlockIdx_z * maxZ; - const int last_z = (first_z + maxZ < numZ ? 
first_z + maxZ : numZ) - 1; - const int num_z_input = last_z - first_z + kernelSizeZ; - - for (int p = 0; p < numPlanes; ++p) { - - const int plane_input_offset = indexMapper.mapHipInputPlaneToTensorInputOffset(p); - const int plane_kernel_offset = 0; - - for (int k = hipThreadIdx_z; k < num_z_input; k += hipBlockDim_z) { - for (int j = hipThreadIdx_y; j < num_y_input; j += hipBlockDim_y) { - for (int i = hipThreadIdx_x; i < num_x_input; i += hipBlockDim_x) { - const int tensor_index = plane_input_offset + indexMapper.mapHipInputKernelToTensorInputOffset(i+first_x, j+first_y, k+first_z); - s[i + num_x_input * (j + num_y_input * (k + plane_kernel_offset))] = eval.coeff(tensor_index); - } - } - } - - __syncthreads(); - - // Convolution - const int num_z_output = last_z - first_z + 1; - const int num_y_output = last_y - first_y + 1; - const int num_x_output = last_x - first_x + 1; - const int plane_output_offset = indexMapper.mapHipOutputPlaneToTensorOutputOffset(p); - - for (int k = hipThreadIdx_z; k < num_z_output; k += hipBlockDim_z) { - for (int j = hipThreadIdx_y; j < num_y_output; j += hipBlockDim_y) { - for (int i = hipThreadIdx_x; i < num_x_output; i += hipBlockDim_x) { - float result = 0.0f; - for (int n = 0; n < kernelSizeZ; ++n) { - for (int m = 0; m < kernelSizeY; ++m) { - for (int l = 0; l < kernelSizeX; ++l) { - result += s[i + l + num_x_input * (j + m + num_y_input * (k + n + plane_kernel_offset))] * kernel[l + kernelSizeX * (m + kernelSizeY * n)]; - } - } - } - const int tensor_index = plane_output_offset + indexMapper.mapHipOutputKernelToTensorOutputOffset(i+first_x, j+first_y, k+first_z); - buffer[tensor_index] = result; - } - } - } - __syncthreads(); - } -}; - - - -template<typename Indices, typename InputArgType, typename KernelArgType> -struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, GpuDevice> -{ - typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType; - - static const int NumDims = internal::array_size<typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions>::value; - static const int NumKernelDims = internal::array_size<Indices>::value; - typedef typename XprType::Index Index; - typedef DSizes<Index, NumDims> Dimensions; - typedef typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions KernelDimensions; - - enum { - IsAligned = TensorEvaluator<InputArgType, GpuDevice>::IsAligned & TensorEvaluator<KernelArgType, GpuDevice>::IsAligned, - PacketAccess = false, - Layout = TensorEvaluator<InputArgType, GpuDevice>::Layout, - CoordAccess = false, // to be implemented - RawAccess = false - }; - - EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const GpuDevice& device) - : m_inputImpl(op.inputExpression(), device), m_kernelArg(op.kernelExpression()), m_kernelImpl(op.kernelExpression(), device), m_indices(op.indices()), m_buf(NULL), m_kernel(NULL), m_local_kernel(false), m_device(device) - { - EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, GpuDevice>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, GpuDevice>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); - - const typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions& input_dims = m_inputImpl.dimensions(); - const typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions& kernel_dims = m_kernelImpl.dimensions(); - - m_dimensions = m_inputImpl.dimensions(); - for (int i = 0; i < NumKernelDims; ++i) { - const Index index = op.indices()[i]; - const Index input_dim = input_dims[index]; - const Index kernel_dim = 
kernel_dims[i]; - const Index result_dim = input_dim - kernel_dim + 1; - m_dimensions[index] = result_dim; - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ~TensorEvaluator() {} - - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType<CoeffReturnType, GpuDevice>::type PacketReturnType; - typedef typename InputArgType::Scalar Scalar; - static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; - - EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_dimensions; } - - EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { - preloadKernel(); - m_inputImpl.evalSubExprsIfNeeded(NULL); - if (data) { - executeEval(data); - return false; - } else { - m_buf = (Scalar*)m_device.allocate(dimensions().TotalSize() * sizeof(Scalar)); - executeEval(m_buf); - return true; - } - } - - EIGEN_STRONG_INLINE void cleanup() { - m_inputImpl.cleanup(); - if (m_buf) { - m_device.deallocate(m_buf); - m_buf = NULL; - } - if (m_local_kernel) { - m_device.deallocate((void*)m_kernel); - m_local_kernel = false; - } - m_kernel = NULL; - } - - EIGEN_STRONG_INLINE void preloadKernel() { - // Don't make a local copy of the kernel unless we have to (i.e. it's an - // expression that needs to be evaluated) - const Scalar* in_place = m_kernelImpl.data(); - if (in_place) { - m_kernel = in_place; - m_local_kernel = false; - } else { - size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar); - Scalar* local = (Scalar*)m_device.allocate(kernel_sz); - typedef TensorEvalToOp<const KernelArgType> EvalTo; - EvalTo evalToTmp(local, m_kernelArg); - const bool PacketAccess = internal::IsVectorizable<GpuDevice, KernelArgType>::value; - internal::TensorExecutor<const EvalTo, GpuDevice, PacketAccess>::run(evalToTmp, m_device); - - m_kernel = local; - m_local_kernel = true; - } - } - - static unsigned int ceil(unsigned int num, unsigned int denom) { - const unsigned int rounded_toward_zero = num / denom; - if (num > rounded_toward_zero * denom) { - return rounded_toward_zero + 1; - } - return rounded_toward_zero; - } - - void executeEval(Scalar* data) const { - typedef typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions InputDims; - - const int maxSharedMem = m_device.sharedMemPerBlock(); - const int maxThreadsPerBlock = m_device.maxHipThreadsPerBlock(); - const int maxBlocksPerProcessor = m_device.maxHipThreadsPerMultiProcessor() / maxThreadsPerBlock; - const int numMultiProcessors = m_device.getNumHipMultiProcessors(); - const int hipWarpSize = 32; - - switch (NumKernelDims) { - case 1: { - const int kernel_size = m_kernelImpl.dimensions().TotalSize(); - - const int numX = dimensions()[m_indices[0]]; - const int numP = dimensions().TotalSize() / numX; - int maxX; - dim3 block_size; - - const int single_stride_dim = - static_cast<int>(Layout) == static_cast<int>(ColMajor) - ? 
0 - : m_inputImpl.dimensions().rank() - 1; - if (m_indices[0] == single_stride_dim) { - // Maximize the reuse - const int inner_dim = ((maxSharedMem / (sizeof(Scalar)) - kernel_size + 1 + 31) / 32) * 32; - maxX = numext::mini<int>(inner_dim, numX); - const int maxP = numext::mini<int>(maxSharedMem / ((kernel_size - 1 + maxX) * sizeof(Scalar)), numP); - block_size.x = numext::mini(maxThreadsPerBlock, maxX); - block_size.y = numext::mini<int>(maxThreadsPerBlock / block_size.x, maxP); - } - else { - // Read as much as possible alongside the innermost dimension, that is, the plane - const int inner_dim = maxSharedMem / ((hipWarpSize + kernel_size) * sizeof(Scalar)); - const int maxP = numext::mini<int>(inner_dim, numP); - maxX = numext::mini<int>(maxSharedMem / (inner_dim * sizeof(Scalar)) - kernel_size + 1, numX); - - block_size.x = numext::mini(hipWarpSize, maxX); - block_size.y = numext::mini<int>(maxThreadsPerBlock/block_size.x, maxP); - } - - const int shared_mem = block_size.y * (maxX + kernel_size - 1) * sizeof(Scalar); - assert(shared_mem <= maxSharedMem); - - const int num_x_blocks = ceil(numX, maxX); - const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem); - const int num_y_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks); - - dim3 num_blocks(num_x_blocks, numext::mini<int>(num_y_blocks, ceil(numP, block_size.y))); - - - //cout << "launching 1D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " maxX: " << maxX << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl; - - const array<Index, 1> indices(m_indices[0]); - const array<Index, 1> kernel_dims(m_kernelImpl.dimensions()[0]); - internal::IndexMapper<Index, InputDims, 1, Layout> indexMapper( - m_inputImpl.dimensions(), kernel_dims, indices); - switch(kernel_size) { - case 4: { - hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4>), - dim3(num_blocks), dim3(block_size), shared_mem, m_device.stream(), m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 4, data); - break; - } - case 7: { - hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7>), - dim3(num_blocks), dim3(block_size), shared_mem, m_device.stream(), m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 7, data); - break; - } - default: { - hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic>), - dim3(num_blocks), dim3(block_size), shared_mem, m_device.stream(), m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, kernel_size, data); - } - } - break; - } - - case 2: { - const int idxX = - static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 1; - const int idxY = - static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 
1 : 0; - const int kernel_size_x = m_kernelImpl.dimensions()[idxX]; - const int kernel_size_y = m_kernelImpl.dimensions()[idxY]; - - const int numX = dimensions()[m_indices[idxX]]; - const int numY = dimensions()[m_indices[idxY]]; - const int numP = dimensions().TotalSize() / (numX*numY); - - const float scaling_factor = sqrtf(static_cast<float>(maxSharedMem) / (sizeof(Scalar) * kernel_size_y * kernel_size_x)); - - // Snap maxX to warp size - int inner_dim = ((static_cast<int>(scaling_factor * kernel_size_x) - kernel_size_x + 1 + 32) / 32) * 32; - const int maxX = numext::mini<int>(inner_dim, numX); - const int maxY = numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1)) - kernel_size_y + 1, numY); - const int maxP = numext::mini<int>(maxSharedMem / ((kernel_size_x - 1 + maxX) * (kernel_size_y - 1 + maxY) * sizeof(Scalar)), numP); - - dim3 block_size; - block_size.x = numext::mini(1024, maxX); - block_size.y = numext::mini<int>(1024/block_size.x, maxY); - block_size.z = numext::mini<int>(1024/(block_size.x*block_size.y), maxP); - - const int shared_mem = block_size.z * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * sizeof(Scalar); - assert(shared_mem <= maxSharedMem); - - const int num_x_blocks = ceil(numX, maxX); - const int num_y_blocks = ceil(numY, maxY); - const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem); - const int num_z_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks * num_y_blocks); - - dim3 num_blocks(num_x_blocks, num_y_blocks, numext::mini<int>(num_z_blocks, ceil(numP, block_size.z))); - - - //cout << "launching 2D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z << " maxX: " << maxX << " maxY: " << maxY << " maxP: " << maxP << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl; - - const array<Index, 2> indices(m_indices[idxX], m_indices[idxY]); - const array<Index, 2> kernel_dims(m_kernelImpl.dimensions()[idxX], - m_kernelImpl.dimensions()[idxY]); - internal::IndexMapper<Index, InputDims, 2, Layout> indexMapper( - m_inputImpl.dimensions(), kernel_dims, indices); - switch (kernel_size_x) { - case 4: { - switch (kernel_size_y) { - case 7: { - hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, 7>), - dim3(num_blocks), dim3(block_size), shared_mem, m_device.stream(), m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, 7, data); - break; - } - default: { - hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, Dynamic>), - dim3(num_blocks), dim3(block_size), shared_mem, m_device.stream(), m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, kernel_size_y, data); - break; - } - } - break; - } - case 7: { - switch (kernel_size_y) { - case 4: { - hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, 4>), - dim3(num_blocks), dim3(block_size), shared_mem, m_device.stream(), m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, 4, data); - break; - } - default: { - hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, Dynamic>), - dim3(num_blocks), dim3(block_size), 
shared_mem, m_device.stream(), m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, kernel_size_y, data); - break; - } - } - break; - } - default: { - hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic, Dynamic>), - dim3(num_blocks), dim3(block_size), shared_mem, m_device.stream(), m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, kernel_size_x, kernel_size_y, data); - break; - } - } - break; - } - - case 3: { - const int idxX = - static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 2; - const int idxY = - static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 1; - const int idxZ = - static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 2 : 0; - - const int kernel_size_x = m_kernelImpl.dimensions()[idxX]; - const int kernel_size_y = m_kernelImpl.dimensions()[idxY]; - const int kernel_size_z = m_kernelImpl.dimensions()[idxZ]; - - const int numX = dimensions()[m_indices[idxX]]; - const int numY = dimensions()[m_indices[idxY]]; - const int numZ = dimensions()[m_indices[idxZ]]; - const int numP = dimensions().TotalSize() / (numX*numY*numZ); - - const int maxX = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * kernel_size_y * kernel_size_z) - kernel_size_x + 1, numX)); - const int maxY = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * kernel_size_z) - kernel_size_y + 1, numY)); - const int maxZ = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1)) - kernel_size_z + 1, numZ)); - - dim3 block_size; - block_size.x = numext::mini(32, maxX); - block_size.y = numext::mini(32, maxY); - block_size.z = numext::mini<int>(1024/(block_size.x*block_size.y), maxZ); - dim3 num_blocks(ceil(numX, maxX), ceil(numY, maxY), ceil(numZ, maxZ)); - - const int shared_mem = (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * (maxZ + kernel_size_z - 1) * sizeof(Scalar); - assert(shared_mem <= maxSharedMem); - - //cout << "launching 3D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl; - const array<Index, 3> indices(m_indices[idxX], m_indices[idxY], - m_indices[idxZ]); - const array<Index, 3> kernel_dims(m_kernelImpl.dimensions()[idxX], - m_kernelImpl.dimensions()[idxY], - m_kernelImpl.dimensions()[idxZ]); - internal::IndexMapper<Index, InputDims, 3, Layout> indexMapper( - m_inputImpl.dimensions(), kernel_dims, indices); - - hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenConvolutionKernel3D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims>), - dim3(num_blocks), dim3(block_size), shared_mem, m_device.stream(), m_inputImpl, indexMapper, m_kernel, - numP, numX, maxX, numY, maxY, numZ, maxZ, kernel_size_x, kernel_size_y, kernel_size_z, data); - break; - } - - default: { - EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3), THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE); - } - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const - { - eigen_assert(m_buf); - eigen_assert(index < m_dimensions.TotalSize()); - return m_buf[index]; - } - - template<int LoadMode> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index 
index) const - { - eigen_assert(m_buf); - eigen_assert(index < m_dimensions.TotalSize()); - return internal::ploadt<PacketReturnType, LoadMode>(m_buf+index); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost - costPerCoeff(bool vectorized) const { - // TODO(rmlarsen): FIXME: For now, this is just a copy of the CPU cost - // model. - const double kernel_size = m_kernelImpl.dimensions().TotalSize(); - // We ignore the use of fused multiply-add. - const double convolve_compute_cost = - TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>(); - const double firstIndex_compute_cost = - NumDims * - (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() + - TensorOpCost::DivCost<Index>()); - return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) + - kernel_size * (m_inputImpl.costPerCoeff(vectorized) + - m_kernelImpl.costPerCoeff(vectorized) + - TensorOpCost(0, 0, convolve_compute_cost, vectorized, - PacketSize)); - } - - private: - // No assignment (copies are needed by the kernels) - TensorEvaluator& operator = (const TensorEvaluator&); - - TensorEvaluator<InputArgType, GpuDevice> m_inputImpl; - TensorEvaluator<KernelArgType, GpuDevice> m_kernelImpl; - KernelArgType m_kernelArg; - Indices m_indices; - Dimensions m_dimensions; - Scalar* m_buf; - const Scalar* m_kernel; - bool m_local_kernel; - - const GpuDevice& m_device; -}; -#endif - - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceHip.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceHip.h deleted file mode 100644 index c0e110987..000000000 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceHip.h +++ /dev/null @@ -1,352 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_HIP_H) -#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_HIP_H - -#if defined(EIGEN_HIPCC) -#include "hip/hip_runtime.h" -#include "hip/hip_runtime_api.h" -#endif -#include <unistd.h> //for sleep function - -namespace Eigen { - -static const int kHipScratchSize = 1024; - -// This defines an interface that GPUDevice can take to use -// HIP streams underneath. -class StreamInterface { - public: - virtual ~StreamInterface() {} - - virtual const hipStream_t& stream() const = 0; - virtual const hipDeviceProp_t& deviceProperties() const = 0; - - // Allocate memory on the actual device where the computation will run - virtual void* allocate(size_t num_bytes) const = 0; - virtual void deallocate(void* buffer) const = 0; - - // Return a scratchpad buffer of size 1k - virtual void* scratchpad() const = 0; - - // Return a semaphore. The semaphore is initially initialized to 0, and - // each kernel using it is responsible for resetting to 0 upon completion - // to maintain the invariant that the semaphore is always equal to 0 upon - // each kernel start. 
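- // (For illustration: FullReductionKernel in TensorReductionHip.h below follows - // this contract; its last block wraps the counter back to 0 with - // atomicInc(semaphore, hipGridDim_x + 1) once the reduction has completed.)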
- virtual unsigned int* semaphore() const = 0; -}; - -static hipDeviceProp_t* m_deviceProperties; -static bool m_devicePropInitialized = false; - -static void initializeDeviceProp() { - if (!m_devicePropInitialized) { - // Attempts to ensure proper behavior in the case of multiple threads - // calling this function simultaneously. This would be trivial to - // implement if we could use std::mutex, but unfortunately mutexes don't - // compile with nvcc, so we resort to atomics and thread fences instead. - // Note that if the caller uses a compiler that doesn't support c++11 we - // can't ensure that the initialization is thread safe. -#if 0 && __cplusplus >= 201103L - static std::atomic<bool> first(true); - if (first.exchange(false)) { -#else - static bool first = true; - if (first) { - first = false; -#endif - // We're the first thread to reach this point. - int num_devices; - hipError_t status = hipGetDeviceCount(&num_devices); - if (status != hipSuccess) { - std::cerr << "Failed to get the number of HIP devices: " - << hipGetErrorString(status) - << std::endl; - assert(status == hipSuccess); - } - m_deviceProperties = new hipDeviceProp_t[num_devices]; - for (int i = 0; i < num_devices; ++i) { - status = hipGetDeviceProperties(&m_deviceProperties[i], i); - if (status != hipSuccess) { - std::cerr << "Failed to initialize HIP device #" - << i - << ": " - << hipGetErrorString(status) - << std::endl; - assert(status == hipSuccess); - } - } - -#if 0 && __cplusplus >= 201103L - std::atomic_thread_fence(std::memory_order_release); -#endif - m_devicePropInitialized = true; - } else { - // Wait for the other thread to initialize the properties. - while (!m_devicePropInitialized) { -#if 0 && __cplusplus >= 201103L - std::atomic_thread_fence(std::memory_order_acquire); -#endif - sleep(1); - } - } - } -} - -static const hipStream_t default_stream = 0x00;//TODO: Use hipStreamDefault instead of 0x00; - -class HipStreamDevice : public StreamInterface { - public: - // Use the default stream on the current device - HipStreamDevice() : stream_(&default_stream), scratch_(NULL), semaphore_(NULL) { - hipGetDevice(&device_); - initializeDeviceProp(); - } - // Use the default stream on the specified device - HipStreamDevice(int device) : stream_(&default_stream), device_(device), scratch_(NULL), semaphore_(NULL) { - initializeDeviceProp(); - } - // Use the specified stream. Note that it's the - // caller's responsibility to ensure that the stream can run on - // the specified device. If no device is specified, the code - // assumes that the stream is associated with the current gpu device.
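- // - // Usage sketch (illustrative only, not from the original file; assumes the - // caller created the stream and keeps it alive for the lifetime of this - // object): - //   hipStream_t s; - //   hipStreamCreate(&s); - //   Eigen::HipStreamDevice stream_device(&s /*, optional device id */); - //   Eigen::GpuDevice gpu_device(&stream_device);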
- HipStreamDevice(const hipStream_t* stream, int device = -1) - : stream_(stream), device_(device), scratch_(NULL), semaphore_(NULL) { - if (device < 0) { - hipGetDevice(&device_); - } else { - int num_devices; - hipError_t err = hipGetDeviceCount(&num_devices); - EIGEN_UNUSED_VARIABLE(err) - assert(err == hipSuccess); - assert(device < num_devices); - device_ = device; - } - initializeDeviceProp(); - } - - virtual ~HipStreamDevice() { - if (scratch_) { - deallocate(scratch_); - } - } - - const hipStream_t& stream() const { return *stream_; } - const hipDeviceProp_t& deviceProperties() const { - return m_deviceProperties[device_]; - } - virtual void* allocate(size_t num_bytes) const { - hipError_t err = hipSetDevice(device_); - EIGEN_UNUSED_VARIABLE(err) - assert(err == hipSuccess); - void* result; - err = hipMalloc(&result, num_bytes); - assert(err == hipSuccess); - assert(result != NULL); - return result; - } - virtual void deallocate(void* buffer) const { - hipError_t err = hipSetDevice(device_); - EIGEN_UNUSED_VARIABLE(err) - assert(err == hipSuccess); - assert(buffer != NULL); - err = hipFree(buffer); - assert(err == hipSuccess); - } - - virtual void* scratchpad() const { - if (scratch_ == NULL) { - scratch_ = allocate(kHipScratchSize + sizeof(unsigned int)); - } - return scratch_; - } - - virtual unsigned int* semaphore() const { - if (semaphore_ == NULL) { - char* scratch = static_cast<char*>(scratchpad()) + kHipScratchSize; - semaphore_ = reinterpret_cast<unsigned int*>(scratch); - //hipError_t err = hipMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_); - hipError_t err = hipMemset(semaphore_, 0, sizeof(unsigned int)); - EIGEN_UNUSED_VARIABLE(err) - assert(err == hipSuccess); - } - return semaphore_; - } - - private: - const hipStream_t* stream_; - int device_; - mutable void* scratch_; - mutable unsigned int* semaphore_; -}; - -struct GpuDevice { - // The StreamInterface is not owned: the caller is - // responsible for its initialization and eventual destruction. - explicit GpuDevice(const StreamInterface* stream) : stream_(stream), max_blocks_(INT_MAX) { - eigen_assert(stream); - } - explicit GpuDevice(const StreamInterface* stream, int num_blocks) : stream_(stream), max_blocks_(num_blocks) { - eigen_assert(stream); - } - // TODO(bsteiner): This is an internal API, we should not expose it. 
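- // (Client code normally reaches the stream indirectly, by evaluating a - // tensor expression on the device. Illustrative sketch, assuming device - // buffers d_in/d_out of matching sizes: - //   Eigen::TensorMap<Eigen::Tensor<float, 2> > in(d_in, rows, cols); - //   Eigen::TensorMap<Eigen::Tensor<float, 2> > out(d_out, rows, cols); - //   out.device(gpu_device) = in * in.constant(2.0f); - // )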
- EIGEN_STRONG_INLINE const hipStream_t& stream() const { - return stream_->stream(); - } - - EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { - return stream_->allocate(num_bytes); - } - - EIGEN_STRONG_INLINE void deallocate(void* buffer) const { - stream_->deallocate(buffer); - } - - EIGEN_STRONG_INLINE void* scratchpad() const { - return stream_->scratchpad(); - } - - EIGEN_STRONG_INLINE unsigned int* semaphore() const { - return stream_->semaphore(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const { -#if !defined(EIGEN_HIP_DEVICE_COMPILE) - hipError_t err = hipMemcpyAsync(dst, src, n, hipMemcpyDeviceToDevice, - stream_->stream()); - EIGEN_UNUSED_VARIABLE(err) - assert(err == hipSuccess); -#else - eigen_assert(false && "The default device should be used instead to generate kernel code"); -#endif - } - - EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const { - hipError_t err = - hipMemcpyAsync(dst, src, n, hipMemcpyHostToDevice, stream_->stream()); - EIGEN_UNUSED_VARIABLE(err) - assert(err == hipSuccess); - } - - EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const { - hipError_t err = - hipMemcpyAsync(dst, src, n, hipMemcpyDeviceToHost, stream_->stream()); - EIGEN_UNUSED_VARIABLE(err) - assert(err == hipSuccess); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const { -#if !defined(EIGEN_HIP_DEVICE_COMPILE) - //TODO:hipError_t err = hipMemsetAsync(buffer, c, n, stream_->stream()); - hipError_t err = hipMemset(buffer, c, n); - EIGEN_UNUSED_VARIABLE(err) - assert(err == hipSuccess); -#else - eigen_assert(false && "The default device should be used instead to generate kernel code"); -#endif - } - - EIGEN_STRONG_INLINE size_t numThreads() const { - // FIXME - return 32; - } - - EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { - // FIXME - return 48*1024; - } - - EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const { - // We won't try to take advantage of the l2 cache for the time being, and - // there is no l3 cache on hip devices. 
- return firstLevelCacheSize(); - } - -// FIXME - this will move into HIP -#if defined(EIGEN_HIP_DEVICE_COMPILE) -#undef assert -#define assert(COND) -#endif - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const { -#if defined(EIGEN_HIPCC) && \ - !defined(EIGEN_HIP_DEVICE_COMPILE) - hipError_t err = hipStreamSynchronize(stream_->stream()); - if (err != hipSuccess) { - std::cerr << "Error detected in HIP stream: " - << hipGetErrorString(err) - << std::endl; - assert(err == hipSuccess); - } -#else - assert(false && "The default device should be used instead to generate kernel code"); -#endif - } - - EIGEN_STRONG_INLINE int getNumHipMultiProcessors() const { - return stream_->deviceProperties().multiProcessorCount; - } - EIGEN_STRONG_INLINE int maxHipThreadsPerBlock() const { - return stream_->deviceProperties().maxThreadsPerBlock; - } - EIGEN_STRONG_INLINE int maxHipThreadsPerMultiProcessor() const { - return stream_->deviceProperties().maxThreadsPerMultiProcessor; - } - EIGEN_STRONG_INLINE int sharedMemPerBlock() const { - return stream_->deviceProperties().sharedMemPerBlock; - } - EIGEN_STRONG_INLINE int majorDeviceVersion() const { - return stream_->deviceProperties().major; - } - EIGEN_STRONG_INLINE int minorDeviceVersion() const { - return stream_->deviceProperties().minor; - } - - EIGEN_STRONG_INLINE int maxBlocks() const { - return max_blocks_; - } - - // This function checks if the HIP runtime recorded an error for the - // underlying stream device. - inline bool ok() const { -#if defined(EIGEN_HIPCC) - hipError_t error = hipStreamQuery(stream_->stream()); - return (error == hipSuccess) || (error == hipErrorNotReady); -#else - return false; -#endif - } - - private: - const StreamInterface* stream_; - int max_blocks_; -}; - -#define LAUNCH_HIP_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \ - hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel), dim3(gridsize), dim3(blocksize), (sharedmem), (device).stream(), (__VA_ARGS__)); \ - assert(hipGetLastError() == hipSuccess); - - -// FIXME: Should be device and kernel specific. -#if defined(EIGEN_HIPCC) -static EIGEN_DEVICE_FUNC inline void setHipSharedMemConfig(hipSharedMemConfig config) { -#if !defined(EIGEN_HIP_DEVICE_COMPILE) - hipError_t status = hipDeviceSetSharedMemConfig(config); - EIGEN_UNUSED_VARIABLE(status) - assert(status == hipSuccess); -#else - EIGEN_UNUSED_VARIABLE(config) -#endif -} -#endif - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_HIP_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionHip.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionHip.h deleted file mode 100644 index 5304a22c5..000000000 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionHip.h +++ /dev/null @@ -1,815 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
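-// The kernels below implement GPU reductions for tensor expressions. -// Illustrative client-side sketch (not from the original file; assumes a -// GpuDevice gpu_device and device buffers d_in/d_out): -//   Eigen::TensorMap<Eigen::Tensor<float, 2> > in(d_in, rows, cols); -//   Eigen::TensorMap<Eigen::Tensor<float, 1> > out(d_out, cols); -//   Eigen::array<int, 1> red_axis; red_axis[0] = 0; -//   out.device(gpu_device) = in.sum(red_axis);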
-
-#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_HIP_H
-#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_HIP_H
-
-#if defined(EIGEN_HIP_DEVICE_COMPILE)
-#include "Eigen/src/Core/arch/HIP/hcc/math_constants.h"
-#endif
-
-#if defined(EIGEN_HIPCC)
-#define HIP_WARP_SIZE 64
-#endif
-
-namespace Eigen {
-namespace internal {
-
-
-#if defined(EIGEN_USE_GPU) && defined(EIGEN_HIPCC)
-// Full reducers for GPU, don't vectorize for now
-
-// Reducer function that enables multiple HIP threads to safely accumulate at the same
-// output address. It reads the current value of the output variable and
-// attempts to update it with the new value. If in the meantime another HIP thread
-// updated the content of the output address, it tries again.
-template <typename T, typename R>
-__device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) {
-#if defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)
-  if (sizeof(T) == 4)
-  {
-    unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
-    unsigned int newval = oldval;
-    reducer.reduce(accum, reinterpret_cast<T*>(&newval));
-    if (newval == oldval) {
-      return;
-    }
-    unsigned int readback;
-    while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
-      oldval = readback;
-      newval = oldval;
-      reducer.reduce(accum, reinterpret_cast<T*>(&newval));
-      if (newval == oldval) {
-        return;
-      }
-    }
-  }
-  else if (sizeof(T) == 8) {
-    unsigned long long oldval = *reinterpret_cast<unsigned long long*>(output);
-    unsigned long long newval = oldval;
-    reducer.reduce(accum, reinterpret_cast<T*>(&newval));
-    if (newval == oldval) {
-      return;
-    }
-    unsigned long long readback;
-    while ((readback = atomicCAS((unsigned long long*)output, oldval, newval)) != oldval) {
-      oldval = readback;
-      newval = oldval;
-      reducer.reduce(accum, reinterpret_cast<T*>(&newval));
-      if (newval == oldval) {
-        return;
-      }
-    }
-  }
-  else {
-    assert(0 && "Wordsize not supported");
-  }
-#else
-  assert(0 && "Shouldn't be called on unsupported device");
-#endif
-}
-
-// We extend atomicExch to support extra data types
-template <typename Type>
-__device__ inline Type atomicExchCustom(Type* address, Type val) {
-  return atomicExch(address, val);
-}
-
-template <>
-__device__ inline double atomicExchCustom(double* address, double val) {
-  unsigned long long int* address_as_ull = reinterpret_cast<unsigned long long int*>(address);
-  return __longlong_as_double(atomicExch(address_as_ull, __double_as_longlong(val)));
-}
-
-#if defined(EIGEN_HAS_HIP_FP16)
-template <template <typename T> class R>
-__device__ inline void atomicReduce(half2* output, half2 accum, R<half>& reducer) {
-  unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
-  unsigned int newval = oldval;
-  reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
-  if (newval == oldval) {
-    return;
-  }
-  unsigned int readback;
-  while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
-    oldval = readback;
-    newval = oldval;
-    reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
-    if (newval == oldval) {
-      return;
-    }
-  }
-}
-#endif
-
-template <>
-__device__ inline void atomicReduce(float* output, float accum, SumReducer<float>&) {
-#if defined(EIGEN_HIP_DEVICE_COMPILE) && (__HIP_DEVICE_COMPILE__ == 1) &&\
-    defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)
-  atomicAdd(output, accum);
-#else
-  assert(0 && "Shouldn't be called on unsupported device");
-#endif
-}
-
-
-template <typename CoeffType, typename Index>
-__global__ void ReductionInitKernel(const CoeffType val, Index num_preserved_coeffs, CoeffType* output) {
-  const Index thread_id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
-  const Index num_threads = hipBlockDim_x * hipGridDim_x;
-  for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
-    output[i] = val;
-  }
-}
-
-
-template <int BlockSize, int NumPerThread, typename Self,
-          typename Reducer, typename Index>
-__global__ void FullReductionKernel(const Self input, Index num_coeffs,
-                                    typename Self::CoeffReturnType* output, unsigned int* semaphore, Reducer reducer) {
-#if defined(EIGEN_HIP_DEVICE_COMPILE) && (__HIP_DEVICE_COMPILE__ == 1) &&\
-    defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)
-  // Initialize the output value
-  const Index first_index = hipBlockIdx_x * BlockSize * NumPerThread + hipThreadIdx_x;
-  if (hipGridDim_x == 1) {
-    if (first_index == 0) {
-      *output = reducer.initialize();
-    }
-  }
-  else {
-    if (hipThreadIdx_x == 0) {
-      unsigned int block = atomicCAS(semaphore, 0u, 1u);
-      if (block == 0) {
-        // We're the first block to run, initialize the output value
-        atomicExchCustom(output, reducer.initialize());
-        __threadfence();
-        atomicExch(semaphore, 2u);
-      }
-      else {
-        // Wait for the first block to initialize the output value.
-        // Use atomicCAS here to ensure that the reads aren't cached
-        unsigned int val;
-        do {
-          val = atomicCAS(semaphore, 2u, 2u);
-        }
-        while (val < 2u);
-      }
-    }
-  }
-
-  __syncthreads();
-
-  eigen_assert(hipGridDim_x == 1 || *semaphore >= 2u);
-
-  typename Self::CoeffReturnType accum = reducer.initialize();
-  Index max_iter = numext::mini<Index>(num_coeffs - first_index, NumPerThread*BlockSize);
-  for (Index i = 0; i < max_iter; i += BlockSize) {
-    const Index index = first_index + i;
-    eigen_assert(index < num_coeffs);
-    typename Self::CoeffReturnType val = input.m_impl.coeff(index);
-    reducer.reduce(val, &accum);
-  }
-
-#pragma unroll
-  for (int offset = HIP_WARP_SIZE/2; offset > 0; offset /= 2) {
-    // XXX use std::is_floating_point to determine the type of accum
-    if (std::is_floating_point<typename Self::CoeffReturnType>::value) {
-      reducer.reduce(__shfl_down(static_cast<float>(accum), offset, HIP_WARP_SIZE), &accum);
-    } else {
-      reducer.reduce(__shfl_down(static_cast<int>(accum), offset, HIP_WARP_SIZE), &accum);
-    }
-  }
-
-  if ((hipThreadIdx_x & (HIP_WARP_SIZE - 1)) == 0) {
-    atomicReduce(output, accum, reducer);
-  }
-
-  if (hipGridDim_x > 1 && hipThreadIdx_x == 0) {
-    // Let the last block reset the semaphore
-    atomicInc(semaphore, hipGridDim_x + 1);
-    __threadfence_system();
-  }
-
-#else
-  assert(0 && "Shouldn't be called on unsupported device");
-#endif
-}
-
-
-#if defined(EIGEN_HAS_HIP_FP16)
-template <typename Self,
-          typename Reducer, typename Index>
-__global__ void ReductionInitFullReduxKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half2* scratch) {
-  eigen_assert(hipBlockDim_x == 1);
-  eigen_assert(hipGridDim_x == 1);
-  if (num_coeffs % 2 != 0) {
-    half last = input.m_impl.coeff(num_coeffs-1);
-    *scratch = __halves2half2(last, reducer.initialize());
-  } else {
-    *scratch = reducer.template initializePacket<half2>();
-  }
-}
-
-template <typename Self,
-          typename Reducer, typename Index>
-__global__ void ReductionInitKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half* output) {
-  const Index thread_id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
-  const Index num_threads = hipBlockDim_x * hipGridDim_x;
-  const Index num_packets = num_coeffs / 2;
-  for (Index i = thread_id; i < num_packets; i += num_threads) {
-    ((half2*)output)[i] = reducer.template initializePacket<half2>();
-  }
-
-  if (thread_id == 0 && num_coeffs % 2 != 0) {
-    output[num_coeffs-1] = reducer.initialize();
-  }
-}
-
-template <int BlockSize, int NumPerThread, typename Self,
-          typename Reducer, typename Index>
-__global__ void FullReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs,
-                                             half* output, half2* scratch) {
-  eigen_assert(NumPerThread % 2 == 0);
-
-  const Index first_index = hipBlockIdx_x * BlockSize * NumPerThread + 2*hipThreadIdx_x;
-
-  // Initialize the output value if it wasn't initialized by the ReductionInitKernel
-  if (hipGridDim_x == 1 && first_index == 0) {
-    if (num_coeffs % 2 != 0) {
-      half last = input.m_impl.coeff(num_coeffs-1);
-      *scratch = __halves2half2(last, reducer.initialize());
-    } else {
-      *scratch = reducer.template initializePacket<half2>();
-    }
-    __syncthreads();
-  }
-
-  half2 accum = reducer.template initializePacket<half2>();
-  const Index max_iter = numext::mini<Index>((num_coeffs - first_index) / 2, NumPerThread*BlockSize / 2);
-  for (Index i = 0; i < max_iter; i += BlockSize) {
-    const Index index = first_index + 2*i;
-    eigen_assert(index + 1 < num_coeffs);
-    half2 val = input.m_impl.template packet<Unaligned>(index);
-    reducer.reducePacket(val, &accum);
-  }
-
-#pragma unroll
-  for (int offset = HIP_WARP_SIZE/2; offset > 0; offset /= 2) {
-    // FIXME : remove this workaround once we have native half/half2 support for __shfl_down
-    union { int i; half2 h; } wka_in, wka_out;
-    wka_in.h = accum;
-    wka_out.i = __shfl_down(wka_in.i, offset, HIP_WARP_SIZE);
-    reducer.reducePacket(wka_out.h, &accum);
-  }
-
-  if ((hipThreadIdx_x & (HIP_WARP_SIZE - 1)) == 0) {
-    atomicReduce(scratch, accum, reducer);
-  }
-
-  __syncthreads();
-
-  if (hipGridDim_x == 1 && first_index == 0) {
-    half tmp = __low2half(*scratch);
-    reducer.reduce(__high2half(*scratch), &tmp);
-    *output = tmp;
-  }
-}
-
-template <typename Op>
-__global__ void ReductionCleanupKernelHalfFloat(Op& reducer, half* output, half2* scratch) {
-  // This kernel is launched with a single thread, so hipThreadIdx_x is always 0.
-  eigen_assert(hipThreadIdx_x == 0);
-  half tmp = __low2half(*scratch);
-  reducer.reduce(__high2half(*scratch), &tmp);
-  *output = tmp;
-}
-
-#endif
-
-template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
-struct FullReductionLauncher {
-  static void run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index) {
-    assert(false && "Should only be called on doubles, floats and half floats");
-  }
-};
-
-namespace {
-  std::mutex eigen_reduction_hip_mutex;
-}
-
-// Specialization for float and double
-template <typename Self, typename Op, typename OutputType, bool PacketAccess>
-struct FullReductionLauncher<
-    Self, Op, OutputType, PacketAccess,
-    typename internal::enable_if<
-      internal::is_same<float, OutputType>::value ||
-      internal::is_same<double, OutputType>::value,
-    void>::type> {
-  static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs) {
-    // Guard FullReductionLauncher with a mutex so that only one FullReductionKernel
-    // is dispatched at a time.
-    std::lock_guard<std::mutex> lock(eigen_reduction_hip_mutex);
-
-    typedef typename Self::Index Index;
-    typedef typename Self::CoeffReturnType Scalar;
-    const int block_size = 256;
-    const int num_per_thread = 128;
-    const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
-
-    unsigned int* semaphore = NULL;
-    if (num_blocks > 1) {
-      semaphore = device.semaphore();
-
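-      // The scratch semaphore is expected to be 0 between launches; the check
-      // below reads it back, synchronizes once if it looks stale, and finally
-      // forces it to 0 so FullReductionKernel's 0 -> 1 -> 2 initialization
-      // handshake starts from a clean state.
-      // Sizing example (illustrative): for num_coeffs = 1 << 20, num_blocks =
-      // divup(1048576, 256 * 128) = 32, so this multi-block path is taken.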
-      unsigned int semaphore_host = 0xFF;
-      hipMemcpy(&semaphore_host, semaphore, sizeof(unsigned int), hipMemcpyDeviceToHost);
-      if (semaphore_host != 0) {
-        std::cerr << "[WARN][EIGEN][FullReductionLauncher] incorrect semaphore value: "
-                  << semaphore_host << "\n";
-        // wait for all commands on the device to complete so the semaphore value
-        // is reset to 0
-        hipDeviceSynchronize();
-
-        // read again
-        hipMemcpy(&semaphore_host, semaphore, sizeof(unsigned int), hipMemcpyDeviceToHost);
-        if (semaphore_host != 0) {
-          std::cerr << "[ERROR][EIGEN][FullReductionLauncher] CRITICAL incorrect semaphore value: "
-                    << semaphore_host << ", applying manual override to 0\n";
-
-          // force the semaphore value back to 0
-          semaphore_host = 0;
-          hipMemcpy(semaphore, &semaphore_host, sizeof(unsigned int), hipMemcpyHostToDevice);
-        }
-      }
-    }
-
-    hipLaunchKernelGGL(HIP_KERNEL_NAME(FullReductionKernel<block_size, num_per_thread, Self, Op, Index>),
-                       dim3(num_blocks), dim3(block_size), 0, device.stream(), self, num_coeffs, output, semaphore, reducer);
-  }
-};
-
-#if defined(EIGEN_HAS_HIP_FP16)
-template <typename Self, typename Op>
-struct FullReductionLauncher<Self, Op, Eigen::half, false> {
-  static void run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index) {
-    assert(false && "Should not be called since there is no packet accessor");
-  }
-};
-
-template <typename Self, typename Op>
-struct FullReductionLauncher<Self, Op, Eigen::half, true> {
-  static void run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs) {
-    typedef typename Self::Index Index;
-
-    const int block_size = 256;
-    const int num_per_thread = 128;
-    const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
-    half2* scratch = static_cast<half2*>(device.scratchpad());
-
-    if (num_blocks > 1) {
-      // We initialize the output and the scratchpad outside the reduction kernel when we can't be sure that there
-      // won't be race conditions between multiple thread blocks.
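-      // The multi-block half-float path is a three-kernel pipeline:
-      //   1. ReductionInitFullReduxKernelHalfFloat seeds the half2 scratch slot
-      //      (folding in the trailing coefficient when num_coeffs is odd),
-      //   2. FullReductionKernelHalfFloat accumulates packets into the scratch
-      //      slot atomically,
-      //   3. ReductionCleanupKernelHalfFloat folds the two half lanes of the
-      //      scratch slot into the final scalar output.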
-      hipLaunchKernelGGL(HIP_KERNEL_NAME(ReductionInitFullReduxKernelHalfFloat<Self, Op, Index>),
-                         dim3(1), dim3(1), 0, device.stream(), reducer, self, num_coeffs, scratch);
-    }
-
-    hipLaunchKernelGGL(HIP_KERNEL_NAME(FullReductionKernelHalfFloat<block_size, num_per_thread, Self, Op, Index>),
-                       dim3(num_blocks), dim3(block_size), 0, device.stream(), reducer, self, num_coeffs, output, scratch);
-
-    if (num_blocks > 1) {
-      hipLaunchKernelGGL(HIP_KERNEL_NAME(ReductionCleanupKernelHalfFloat<Op>),
-                         dim3(1), dim3(1), 0, device.stream(), reducer, output, scratch);
-    }
-  }
-};
-#endif
-
-
-template <typename Self, typename Op, bool Vectorizable>
-struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
-  // Unfortunately exotic types such as std::complex are not supported well on
-  // GPUs, so we restrict the optimized version of the code to the simple cases
-  // of doubles, floats and half floats.
-#if defined(EIGEN_HAS_HIP_FP16)
-  static const bool HasOptimizedImplementation = !Op::IsStateful &&
-      (internal::is_same<typename Self::CoeffReturnType, float>::value ||
-       internal::is_same<typename Self::CoeffReturnType, double>::value ||
-       (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
-#else
-  static const bool HasOptimizedImplementation = !Op::IsStateful &&
-      (internal::is_same<typename Self::CoeffReturnType, float>::value ||
-       internal::is_same<typename Self::CoeffReturnType, double>::value);
-#endif
-
-  template <typename OutputType>
-  static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) {
-    assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
-    const Index num_coeffs = array_prod(self.m_impl.dimensions());
-    // Don't crash when we're called with an input tensor of size 0.
-    if (num_coeffs == 0) {
-      return;
-    }
-
-    FullReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs);
-  }
-};
-
-
-template <int NumPerThread, typename Self,
-          typename Reducer, typename Index>
-__global__ void InnerReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
-                                     typename Self::CoeffReturnType* output) {
-#if defined(EIGEN_HIP_DEVICE_COMPILE) && (__HIP_DEVICE_COMPILE__ == 1) &&\
-    defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)
-  typedef typename Self::CoeffReturnType Type;
-  eigen_assert(hipBlockDim_y == 1);
-  eigen_assert(hipBlockDim_z == 1);
-  eigen_assert(hipGridDim_y == 1);
-  eigen_assert(hipGridDim_z == 1);
-
-  const int unroll_times = 16;
-  eigen_assert(NumPerThread % unroll_times == 0);
-
-  const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, hipBlockDim_x * NumPerThread);
-  const Index num_input_blocks = input_col_blocks * num_preserved_coeffs;
-
-  const Index num_threads = hipBlockDim_x * hipGridDim_x;
-  const Index thread_id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
-
-  // Initialize the output values if they weren't initialized by the ReductionInitKernel
-  if (hipGridDim_x == 1) {
-    for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
-      output[i] = reducer.initialize();
-    }
-    __syncthreads();
-  }
-
-  for (Index i = hipBlockIdx_x; i < num_input_blocks; i += hipGridDim_x) {
-    const Index row = i / input_col_blocks;
-
-    if (row < num_preserved_coeffs) {
-      const Index col_block = i % input_col_blocks;
-      const Index col_begin = col_block * hipBlockDim_x * NumPerThread + hipThreadIdx_x;
-
-      Type reduced_val = reducer.initialize();
-
-      for (Index j = 0; j < NumPerThread; j += unroll_times) {
-        const Index last_col = col_begin + hipBlockDim_x * (j + unroll_times - 1);
-        if (last_col >= num_coeffs_to_reduce) {
-          for (Index col = col_begin + hipBlockDim_x * j; col < num_coeffs_to_reduce; col += hipBlockDim_x) {
-            const Type val = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
-            reducer.reduce(val, &reduced_val);
-          }
-          break;
-        } else {
-          // Faster version of the loop with no branches after unrolling.
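-          // Worked example (illustrative): with hipBlockDim_x == 256 and
-          // NumPerThread == 128, each thread visits 128 columns in 8 fully
-          // unrolled chunks of 16; in chunk j it reads columns
-          // col_begin + 256 * (j + k) for k = 0..15, so adjacent lanes of a
-          // warp load adjacent coefficients of the same row at every step.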
-#pragma unroll
-          for (int k = 0; k < unroll_times; ++k) {
-            const Index col = col_begin + hipBlockDim_x * (j + k);
-            reducer.reduce(input.m_impl.coeff(row * num_coeffs_to_reduce + col), &reduced_val);
-          }
-        }
-      }
-
-#pragma unroll
-      for (int offset = HIP_WARP_SIZE/2; offset > 0; offset /= 2) {
-        // XXX use std::is_floating_point to determine the type of reduced_val
-        if (std::is_floating_point<Type>::value) {
-          reducer.reduce(__shfl_down(static_cast<float>(reduced_val), offset), &reduced_val);
-        } else {
-          reducer.reduce(__shfl_down(static_cast<int>(reduced_val), offset), &reduced_val);
-        }
-      }
-
-      if ((hipThreadIdx_x & (HIP_WARP_SIZE - 1)) == 0) {
-        atomicReduce(&(output[row]), reduced_val, reducer);
-      }
-    }
-  }
-#else
-  assert(0 && "Shouldn't be called on unsupported device");
-#endif
-}
-
-#if defined(EIGEN_HAS_HIP_FP16)
-
-template <int NumPerThread, typename Self,
-          typename Reducer, typename Index>
-__global__ void InnerReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
-                                              half* output) {
-  eigen_assert(hipBlockDim_y == 1);
-  eigen_assert(hipBlockDim_z == 1);
-  eigen_assert(hipGridDim_y == 1);
-  eigen_assert(hipGridDim_z == 1);
-
-  const int unroll_times = 16;
-  eigen_assert(NumPerThread % unroll_times == 0);
-  eigen_assert(unroll_times % 2 == 0);
-
-  const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, hipBlockDim_x * NumPerThread * 2);
-  const Index num_input_blocks = divup<Index>(input_col_blocks * num_preserved_coeffs, 2);
-
-  const Index num_threads = hipBlockDim_x * hipGridDim_x;
-  const Index thread_id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
-
-  // Initialize the output values if they weren't initialized by the ReductionInitKernel
-  if (hipGridDim_x == 1) {
-    Index i = 2*thread_id;
-    for (; i + 1 < num_preserved_coeffs; i += 2*num_threads) {
-      half* loc = output + i;
-      *((half2*)loc) = reducer.template initializePacket<half2>();
-    }
-    if (i < num_preserved_coeffs) {
-      output[i] = reducer.initialize();
-    }
-    __syncthreads();
-  }
-
-  for (Index i = hipBlockIdx_x; i < num_input_blocks; i += hipGridDim_x) {
-    const Index row = 2 * (i / input_col_blocks);
-
-    if (row + 1 < num_preserved_coeffs) {
-      const Index col_block = i % input_col_blocks;
-      const Index col_begin = 2 * (col_block * hipBlockDim_x * NumPerThread + hipThreadIdx_x);
-
-      half2 reduced_val1 = reducer.template initializePacket<half2>();
-      half2 reduced_val2 = reducer.template initializePacket<half2>();
-
-      for (Index j = 0; j < NumPerThread; j += unroll_times) {
-        const Index last_col = col_begin + hipBlockDim_x * (j + unroll_times - 1) * 2;
-        if (last_col >= num_coeffs_to_reduce) {
-          Index col = col_begin + hipBlockDim_x * j;
-          for (; col + 1 < num_coeffs_to_reduce; col += hipBlockDim_x) {
-            const half2 val1 = input.m_impl.template packet<Unaligned>(row * num_coeffs_to_reduce + col);
-            reducer.reducePacket(val1, &reduced_val1);
-            const half2 val2 = input.m_impl.template packet<Unaligned>((row+1) * num_coeffs_to_reduce + col);
-            reducer.reducePacket(val2, &reduced_val2);
-          }
-          if (col < num_coeffs_to_reduce) {
-            // Peel off the last, odd column.
-            const half last1 = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
-            const half2 val1 = __halves2half2(last1, reducer.initialize());
-            reducer.reducePacket(val1, &reduced_val1);
-            const half last2 = input.m_impl.coeff((row+1) * num_coeffs_to_reduce + col);
-            const half2 val2 = __halves2half2(last2, reducer.initialize());
-            reducer.reducePacket(val2, &reduced_val2);
-          }
-          break;
-        } else {
-          // Faster version of the loop with no branches after unrolling.
-#pragma unroll
-          for (int k = 0; k < unroll_times; ++k) {
-            const Index col = col_begin + hipBlockDim_x * (j + k) * 2;
-            reducer.reducePacket(input.m_impl.template packet<Unaligned>(row * num_coeffs_to_reduce + col), &reduced_val1);
-            reducer.reducePacket(input.m_impl.template packet<Unaligned>((row + 1) * num_coeffs_to_reduce + col), &reduced_val2);
-          }
-        }
-      }
-
-#pragma unroll
-      for (int offset = HIP_WARP_SIZE/2; offset > 0; offset /= 2) {
-        // FIXME : remove this workaround once we have native half/half2 support for __shfl_down
-        union { int i; half2 h; } wka_in, wka_out;
-
-        wka_in.h = reduced_val1;
-        wka_out.i = __shfl_down(wka_in.i, offset, HIP_WARP_SIZE);
-        reducer.reducePacket(wka_out.h, &reduced_val1);
-
-        wka_in.h = reduced_val2;
-        wka_out.i = __shfl_down(wka_in.i, offset, HIP_WARP_SIZE);
-        reducer.reducePacket(wka_out.h, &reduced_val2);
-      }
-
-      half val1 = __low2half(reduced_val1);
-      reducer.reduce(__high2half(reduced_val1), &val1);
-      half val2 = __low2half(reduced_val2);
-      reducer.reduce(__high2half(reduced_val2), &val2);
-      half2 val = __halves2half2(val1, val2);
-
-      if ((hipThreadIdx_x & (HIP_WARP_SIZE - 1)) == 0) {
-        half* loc = output + row;
-        atomicReduce((half2*)loc, val, reducer);
-      }
-    }
-  }
-}
-
-#endif
-
-template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
-struct InnerReductionLauncher {
-  static bool run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index, typename Self::Index) {
-    assert(false && "Should only be called to reduce doubles, floats and half floats on a gpu device");
-    return true;
-  }
-};
-
-// Specialization for float and double
-template <typename Self, typename Op, typename OutputType, bool PacketAccess>
-struct InnerReductionLauncher<
-    Self, Op, OutputType, PacketAccess,
-    typename internal::enable_if<
-      internal::is_same<float, OutputType>::value ||
-      internal::is_same<double, OutputType>::value,
-    void>::type> {
-  static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
-    typedef typename Self::Index Index;
-
-    const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
-    const int block_size = 256;
-    const int num_per_thread = 128;
-    const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
-    const int max_blocks = device.getNumHipMultiProcessors() *
-                           device.maxHipThreadsPerMultiProcessor() / block_size;
-    const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
-
-    if (num_blocks > 1) {
-      // We initialize the outputs outside the reduction kernel when we can't be sure that there
-      // won't be race conditions between multiple thread blocks.
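-      // Sizing example (illustrative, device numbers hypothetical): on a GPU
-      // with 64 compute units and 2560 threads per CU, max_blocks for this
-      // init launch is 64 * 2560 / 1024 = 160; with num_preserved_vals = 4096
-      // it uses min(160, divup(4096, 1024)) = 4 blocks of 1024 threads.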
-      const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
-      const int max_blocks = device.getNumHipMultiProcessors() *
-                             device.maxHipThreadsPerMultiProcessor() / 1024;
-      const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
-      hipLaunchKernelGGL(HIP_KERNEL_NAME(ReductionInitKernel<OutputType, Index>),
-                         dim3(num_blocks), dim3(1024), 0, device.stream(),
-                         reducer.initialize(), num_preserved_vals, output);
-    }
-
-    hipLaunchKernelGGL(HIP_KERNEL_NAME(InnerReductionKernel<num_per_thread, Self, Op, Index>),
-                       dim3(num_blocks), dim3(block_size), 0, device.stream(), reducer, self,
-                       num_coeffs_to_reduce, num_preserved_vals, output);
-
-    return false;
-  }
-};
-
-#if defined(EIGEN_HAS_HIP_FP16)
-template <typename Self, typename Op>
-struct InnerReductionLauncher<Self, Op, Eigen::half, false> {
-  static bool run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index, typename Self::Index) {
-    assert(false && "Should not be called since there is no packet accessor");
-    return true;
-  }
-};
-
-template <typename Self, typename Op>
-struct InnerReductionLauncher<Self, Op, Eigen::half, true> {
-  static bool run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
-    typedef typename Self::Index Index;
-
-    if (num_preserved_vals % 2 != 0) {
-      // Not supported yet, revert to the slower code path
-      return true;
-    }
-
-    const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
-    const int block_size = /*256*/128;
-    const int num_per_thread = /*128*/64;
-    const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
-    const int max_blocks = device.getNumHipMultiProcessors() *
-                           device.maxHipThreadsPerMultiProcessor() / block_size;
-    const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
-
-    if (num_blocks > 1) {
-      // We initialize the outputs outside the reduction kernel when we can't be sure that there
-      // won't be race conditions between multiple thread blocks.
-      const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
-      const int max_blocks = device.getNumHipMultiProcessors() *
-                             device.maxHipThreadsPerMultiProcessor() / 1024;
-      const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
-      hipLaunchKernelGGL(HIP_KERNEL_NAME(ReductionInitKernelHalfFloat<Self, Op, Index>),
-                         dim3(1), dim3(1), 0, device.stream(), reducer, self, num_preserved_vals, output);
-    }
-
-    hipLaunchKernelGGL(HIP_KERNEL_NAME(InnerReductionKernelHalfFloat<num_per_thread, Self, Op, Index>),
-                       dim3(num_blocks), dim3(block_size), 0, device.stream(), reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
-
-    return false;
-  }
-};
-#endif
-
-
-template <typename Self, typename Op>
-struct InnerReducer<Self, Op, GpuDevice> {
-  // Unfortunately exotic types such as std::complex are not supported well on
-  // GPUs, so we restrict the optimized version of the code to the simple cases
-  // of floats, doubles and half floats.
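-  // Terminology (illustrative shapes): an "inner" reduction collapses the
-  // dimension that varies fastest in memory. For a problem laid out as
-  // row * num_coeffs_to_reduce + col, adjacent threads read adjacent columns
-  // of the same row, so each warp can combine its partial values with
-  // shuffles before touching global memory. OuterReducer below handles the
-  // slow-varying case instead.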
-#if defined(EIGEN_HAS_HIP_FP16)
-  static const bool HasOptimizedImplementation = !Op::IsStateful &&
-      (internal::is_same<typename Self::CoeffReturnType, float>::value ||
-       internal::is_same<typename Self::CoeffReturnType, double>::value ||
-       (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
-#else
-  static const bool HasOptimizedImplementation = !Op::IsStateful &&
-      (internal::is_same<typename Self::CoeffReturnType, float>::value ||
-       internal::is_same<typename Self::CoeffReturnType, double>::value);
-#endif
-
-  template <typename OutputType>
-  static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
-    assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
-    const Index num_coeffs = array_prod(self.m_impl.dimensions());
-    // Don't crash when we're called with an input tensor of size 0.
-    if (num_coeffs == 0) {
-      return true;
-    }
-    // It's faster to use the generic code path for small reductions.
-    if (num_coeffs_to_reduce <= 128) {
-      return true;
-    }
-
-    return InnerReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs_to_reduce, num_preserved_vals);
-  }
-};
-
-template <int NumPerThread, typename Self,
-          typename Reducer, typename Index>
-__global__ void OuterReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
-                                     typename Self::CoeffReturnType* output) {
-  const Index num_threads = hipBlockDim_x * hipGridDim_x;
-  const Index thread_id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
-  // Initialize the output values if they weren't initialized by the ReductionInitKernel
-  if (hipGridDim_x == 1) {
-    for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
-      output[i] = reducer.initialize();
-    }
-    __syncthreads();
-  }
-
-  // Do the reduction.
-  const Index max_iter = num_preserved_coeffs * divup<Index>(num_coeffs_to_reduce, NumPerThread);
-  for (Index i = thread_id; i < max_iter; i += num_threads) {
-    const Index input_col = i % num_preserved_coeffs;
-    const Index input_row = (i / num_preserved_coeffs) * NumPerThread;
-    typename Self::CoeffReturnType reduced_val = reducer.initialize();
-    const Index max_row = numext::mini(input_row + NumPerThread, num_coeffs_to_reduce);
-    for (Index j = input_row; j < max_row; j++) {
-      typename Self::CoeffReturnType val = input.m_impl.coeff(j * num_preserved_coeffs + input_col);
-      reducer.reduce(val, &reduced_val);
-    }
-    atomicReduce(&(output[input_col]), reduced_val, reducer);
-  }
-}
-
-
-template <typename Self, typename Op>
-struct OuterReducer<Self, Op, GpuDevice> {
-  // Unfortunately exotic types such as std::complex are not supported well on
-  // GPUs, so we restrict the optimized version of the code to the simple case
-  // of floats and doubles.
-  static const bool HasOptimizedImplementation = !Op::IsStateful &&
-      (internal::is_same<typename Self::CoeffReturnType, float>::value ||
-       internal::is_same<typename Self::CoeffReturnType, double>::value);
-  template <typename Device, typename OutputType>
-  static bool run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) {
-    assert(false && "Should only be called to reduce doubles or floats on a gpu device");
-    return true;
-  }
-
-  static bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
-    typedef typename Self::Index Index;
-
-    // It's faster to use the generic code path for small reductions.
-    if (num_coeffs_to_reduce <= 32) {
-      return true;
-    }
-
-    const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
-    const int block_size = 256;
-    const int num_per_thread = 16;
-    const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
-    const int max_blocks = device.getNumHipMultiProcessors() *
-                           device.maxHipThreadsPerMultiProcessor() / block_size;
-    const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
-
-    if (num_blocks > 1) {
-      // We initialize the outputs outside the reduction kernel when we can't be sure that there
-      // won't be race conditions between multiple thread blocks.
-      const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
-      const int max_blocks = device.getNumHipMultiProcessors() *
-                             device.maxHipThreadsPerMultiProcessor() / 1024;
-      const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
-      hipLaunchKernelGGL(HIP_KERNEL_NAME(ReductionInitKernel<float, Index>),
-                         dim3(num_blocks), dim3(1024), 0, device.stream(),
-                         reducer.initialize(), num_preserved_vals, output);
-    }
-
-    hipLaunchKernelGGL(HIP_KERNEL_NAME(OuterReductionKernel<num_per_thread, Self, Op, Index>),
-                       dim3(num_blocks), dim3(block_size), 0, device.stream(), reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
-
-    return false;
-  }
-};
-
-#endif
-
-
-} // end namespace internal
-} // end namespace Eigen
-
-#endif // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_HIP_H
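
For context, the reducers deleted above were reached through the public Tensor API rather than called directly. The following host-side sketch is illustrative only: it assumes a working HIP build of the unsupported Tensor module and the HipStreamDevice helper from the deleted TensorDeviceHip.h.

  #define EIGEN_USE_GPU
  #include <unsupported/Eigen/CXX11/Tensor>

  int main() {
    Eigen::HipStreamDevice stream;     // assumed wrapper around the default HIP stream
    Eigen::GpuDevice device(&stream);

    // A 4096x128 input reduced along its inner (first) dimension.
    Eigen::Tensor<float, 2> in(4096, 128);
    in.setRandom();
    Eigen::Tensor<float, 1> out(128);

    const std::size_t in_bytes = in.size() * sizeof(float);
    const std::size_t out_bytes = out.size() * sizeof(float);
    float* d_in = static_cast<float*>(device.allocate(in_bytes));
    float* d_out = static_cast<float*>(device.allocate(out_bytes));
    device.memcpyHostToDevice(d_in, in.data(), in_bytes);

    Eigen::TensorMap<Eigen::Tensor<float, 2>> gpu_in(d_in, 4096, 128);
    Eigen::TensorMap<Eigen::Tensor<float, 1>> gpu_out(d_out, 128);

    // Reducing dimension 0 (the fastest-varying one for Eigen's default
    // column-major tensors) should dispatch to InnerReducer<..., GpuDevice>,
    // since num_coeffs_to_reduce = 4096 exceeds the 128-coefficient cutoff.
    Eigen::array<int, 1> dims{{0}};
    gpu_out.device(device) = gpu_in.sum(dims);

    device.memcpyDeviceToHost(out.data(), d_out, out_bytes);
    device.synchronize();              // drain the stream before reading `out`
    device.deallocate(d_in);
    device.deallocate(d_out);
    return 0;
  }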