aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorContractionHip.h1521
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionHip.h1119
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorDeviceHip.h352
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorReductionHip.h815
-rw-r--r--unsupported/test/cxx11_tensor_argmax_hip.cu251
-rw-r--r--unsupported/test/cxx11_tensor_cast_float16_hip.cu79
-rw-r--r--unsupported/test/cxx11_tensor_contract_hip.cu215
-rw-r--r--unsupported/test/cxx11_tensor_device_hip.cu389
-rw-r--r--unsupported/test/cxx11_tensor_hip.cu1295
-rw-r--r--unsupported/test/cxx11_tensor_of_float16_hip.cu498
-rw-r--r--unsupported/test/cxx11_tensor_random_hip.cu85
-rw-r--r--unsupported/test/cxx11_tensor_reduction_hip.cu154
-rw-r--r--unsupported/test/cxx11_tensor_scan_hip.cu76
13 files changed, 0 insertions, 6849 deletions
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionHip.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionHip.h
deleted file mode 100644
index 7561846a3..000000000
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionHip.h
+++ /dev/null
@@ -1,1521 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014-2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
-// Copyright (C) 2015 Navdeep Jaitly <ndjaitly@google.com>
-// Copyright (C) 2014 Eric Martin <eric@ericmart.in>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_HIP_H
-#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_HIP_H
-
-#if defined(EIGEN_USE_GPU) && defined(EIGEN_HIPCC)
-
-namespace Eigen {
-
-template<typename Scalar, typename Index, typename LhsMapper,
- typename RhsMapper, typename OutputMapper, bool needs_edge_check>
-__device__ EIGEN_STRONG_INLINE void
-EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
- const OutputMapper output, Scalar* lhs_shmem, Scalar* rhs_shmem,
- const Index m_size, const Index n_size, const Index k_size) {
-
- const Index m_block_idx = hipBlockIdx_x;
- const Index n_block_idx = hipBlockIdx_y;
-
- const Index base_m = 64 * m_block_idx;
- const Index base_n = 64 * n_block_idx;
-
- // declare and initialize 64 registers for output 8x8 block
-
- // prefetch registers
- Scalar lhs_pf0;
- Scalar lhs_pf1;
- Scalar lhs_pf2;
- Scalar lhs_pf3;
- Scalar lhs_pf4;
- Scalar lhs_pf5;
- Scalar lhs_pf6;
- Scalar lhs_pf7;
-
- Scalar rhs_pf0;
- Scalar rhs_pf1;
- Scalar rhs_pf2;
- Scalar rhs_pf3;
- Scalar rhs_pf4;
- Scalar rhs_pf5;
- Scalar rhs_pf6;
- Scalar rhs_pf7;
-
- // shared memory is formatted
- // (contract idx in block, nocontract idx in block, block idx)
- // where block idx is column major. This transposition limits the number of
- // bank conflicts when reading the LHS. The core idea is that since the contracting
- // index is shared by both sides, then the contracting index should be in hipThreadIdx_x.
-
- // On the LHS, we pad each row inside of each block with an extra element. This makes
- // each block 8 rows of 9 elements, which is 72 elements. This gives no bank conflicts
- // on writes and very few 2-way conflicts on reads. There is an 8x8 grid of these blocks.
-
- // On the RHS we just add 8 padding elements to the end of each block. This gives no bank
- // conflicts on writes and also none on reads.
-
- // storage indices
- const Index lhs_store_idx_base = hipThreadIdx_y * 72 + hipThreadIdx_x * 9 + hipThreadIdx_z;
- const Index rhs_store_idx_base = hipThreadIdx_y * 72 + hipThreadIdx_z * 8 + hipThreadIdx_x;
-
- const Index lhs_store_idx_0 = lhs_store_idx_base + 576 * 0;
- const Index lhs_store_idx_1 = lhs_store_idx_base + 576 * 1;
- const Index lhs_store_idx_2 = lhs_store_idx_base + 576 * 2;
- const Index lhs_store_idx_3 = lhs_store_idx_base + 576 * 3;
- const Index lhs_store_idx_4 = lhs_store_idx_base + 576 * 4;
- const Index lhs_store_idx_5 = lhs_store_idx_base + 576 * 5;
- const Index lhs_store_idx_6 = lhs_store_idx_base + 576 * 6;
- const Index lhs_store_idx_7 = lhs_store_idx_base + 576 * 7;
-
- const Index rhs_store_idx_0 = rhs_store_idx_base + 576 * 0;
- const Index rhs_store_idx_1 = rhs_store_idx_base + 576 * 1;
- const Index rhs_store_idx_2 = rhs_store_idx_base + 576 * 2;
- const Index rhs_store_idx_3 = rhs_store_idx_base + 576 * 3;
- const Index rhs_store_idx_4 = rhs_store_idx_base + 576 * 4;
- const Index rhs_store_idx_5 = rhs_store_idx_base + 576 * 5;
- const Index rhs_store_idx_6 = rhs_store_idx_base + 576 * 6;
- const Index rhs_store_idx_7 = rhs_store_idx_base + 576 * 7;
-
- // in the loading code, the following variables are important:
- // hipThreadIdx_x: the vertical position in an 8x8 block
- // hipThreadIdx_y: the vertical index of the 8x8 block in the grid
- // hipThreadIdx_z: the horizontal position in an 8x8 block
- // k: the horizontal index of the 8x8 block in the grid
- //
- // The k parameter is implicit (it was the loop counter for a loop that went
- // from 0 to <8, but now that loop is unrolled in the below code.
-
- const Index load_idx_vert = hipThreadIdx_x + 8 * hipThreadIdx_y;
- const Index lhs_vert = base_m + load_idx_vert;
-
-#define prefetchIntoRegisters(base_k) \
- { \
- lhs_pf0 = conv(0); \
- lhs_pf1 = conv(0); \
- lhs_pf2 = conv(0); \
- lhs_pf3 = conv(0); \
- lhs_pf4 = conv(0); \
- lhs_pf5 = conv(0); \
- lhs_pf6 = conv(0); \
- lhs_pf7 = conv(0); \
- \
- rhs_pf0 = conv(0); \
- rhs_pf1 = conv(0); \
- rhs_pf2 = conv(0); \
- rhs_pf3 = conv(0); \
- rhs_pf4 = conv(0); \
- rhs_pf5 = conv(0); \
- rhs_pf6 = conv(0); \
- rhs_pf7 = conv(0); \
- \
- if (!needs_edge_check || lhs_vert < m_size) { \
- const Index lhs_horiz_0 = base_k + hipThreadIdx_z + 0 * 8; \
- const Index lhs_horiz_1 = base_k + hipThreadIdx_z + 1 * 8; \
- const Index lhs_horiz_2 = base_k + hipThreadIdx_z + 2 * 8; \
- const Index lhs_horiz_3 = base_k + hipThreadIdx_z + 3 * 8; \
- const Index lhs_horiz_4 = base_k + hipThreadIdx_z + 4 * 8; \
- const Index lhs_horiz_5 = base_k + hipThreadIdx_z + 5 * 8; \
- const Index lhs_horiz_6 = base_k + hipThreadIdx_z + 6 * 8; \
- const Index lhs_horiz_7 = base_k + hipThreadIdx_z + 7 * 8; \
- \
- if (!needs_edge_check || lhs_horiz_7 < k_size) { \
- lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
- lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \
- lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \
- lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \
- lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \
- lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \
- lhs_pf6 = lhs(lhs_vert, lhs_horiz_6); \
- lhs_pf7 = lhs(lhs_vert, lhs_horiz_7); \
- } else if (lhs_horiz_6 < k_size) { \
- lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
- lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \
- lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \
- lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \
- lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \
- lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \
- lhs_pf6 = lhs(lhs_vert, lhs_horiz_6); \
- } else if (lhs_horiz_5 < k_size) { \
- lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
- lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \
- lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \
- lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \
- lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \
- lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \
- } else if (lhs_horiz_4 < k_size) { \
- lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
- lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \
- lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \
- lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \
- lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \
- } else if (lhs_horiz_3 < k_size) { \
- lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
- lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \
- lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \
- lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \
- } else if (lhs_horiz_2 < k_size) { \
- lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
- lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \
- lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \
- } else if (lhs_horiz_1 < k_size) { \
- lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
- lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \
- } else if (lhs_horiz_0 < k_size) { \
- lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
- } \
- } \
- \
- const Index rhs_vert = base_k + load_idx_vert; \
- if (!needs_edge_check || rhs_vert < k_size) { \
- const Index rhs_horiz_0 = base_n + hipThreadIdx_z + 0 * 8; \
- const Index rhs_horiz_1 = base_n + hipThreadIdx_z + 1 * 8; \
- const Index rhs_horiz_2 = base_n + hipThreadIdx_z + 2 * 8; \
- const Index rhs_horiz_3 = base_n + hipThreadIdx_z + 3 * 8; \
- const Index rhs_horiz_4 = base_n + hipThreadIdx_z + 4 * 8; \
- const Index rhs_horiz_5 = base_n + hipThreadIdx_z + 5 * 8; \
- const Index rhs_horiz_6 = base_n + hipThreadIdx_z + 6 * 8; \
- const Index rhs_horiz_7 = base_n + hipThreadIdx_z + 7 * 8; \
- \
- if (rhs_horiz_7 < n_size) { \
- rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
- rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \
- rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \
- rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \
- rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \
- rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \
- rhs_pf6 = rhs(rhs_vert, rhs_horiz_6); \
- rhs_pf7 = rhs(rhs_vert, rhs_horiz_7); \
- } else if (rhs_horiz_6 < n_size) { \
- rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
- rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \
- rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \
- rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \
- rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \
- rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \
- rhs_pf6 = rhs(rhs_vert, rhs_horiz_6); \
- } else if (rhs_horiz_5 < n_size) { \
- rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
- rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \
- rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \
- rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \
- rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \
- rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \
- } else if (rhs_horiz_4 < n_size) { \
- rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
- rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \
- rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \
- rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \
- rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \
- } else if (rhs_horiz_3 < n_size) { \
- rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
- rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \
- rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \
- rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \
- } else if (rhs_horiz_2 < n_size) { \
- rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
- rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \
- rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \
- } else if (rhs_horiz_1 < n_size) { \
- rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
- rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \
- } else if (rhs_horiz_0 < n_size) { \
- rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
- } \
- } \
- } \
-
-#define writeRegToShmem(_) \
- lhs_shmem[lhs_store_idx_0] = lhs_pf0; \
- rhs_shmem[rhs_store_idx_0] = rhs_pf0; \
- \
- lhs_shmem[lhs_store_idx_1] = lhs_pf1; \
- rhs_shmem[rhs_store_idx_1] = rhs_pf1; \
- \
- lhs_shmem[lhs_store_idx_2] = lhs_pf2; \
- rhs_shmem[rhs_store_idx_2] = rhs_pf2; \
- \
- lhs_shmem[lhs_store_idx_3] = lhs_pf3; \
- rhs_shmem[rhs_store_idx_3] = rhs_pf3; \
- \
- lhs_shmem[lhs_store_idx_4] = lhs_pf4; \
- rhs_shmem[rhs_store_idx_4] = rhs_pf4; \
- \
- lhs_shmem[lhs_store_idx_5] = lhs_pf5; \
- rhs_shmem[rhs_store_idx_5] = rhs_pf5; \
- \
- lhs_shmem[lhs_store_idx_6] = lhs_pf6; \
- rhs_shmem[rhs_store_idx_6] = rhs_pf6; \
- \
- lhs_shmem[lhs_store_idx_7] = lhs_pf7; \
- rhs_shmem[rhs_store_idx_7] = rhs_pf7; \
-
- // declare and initialize result array
-#define res(i, j) _res_##i##j
-#define initResultRow(i) \
- Scalar res(i, 0) = conv(0); \
- Scalar res(i, 1) = conv(0); \
- Scalar res(i, 2) = conv(0); \
- Scalar res(i, 3) = conv(0); \
- Scalar res(i, 4) = conv(0); \
- Scalar res(i, 5) = conv(0); \
- Scalar res(i, 6) = conv(0); \
- Scalar res(i, 7) = conv(0); \
-
- internal::scalar_cast_op<int, Scalar> conv;
- initResultRow(0);
- initResultRow(1);
- initResultRow(2);
- initResultRow(3);
- initResultRow(4);
- initResultRow(5);
- initResultRow(6);
- initResultRow(7);
-#undef initResultRow
-
- for (Index base_k = 0; base_k < k_size; base_k += 64) {
- // wait for previous iteration to finish with shmem. Despite common sense,
- // the code is a bit faster with this here then at bottom of loop
- __syncthreads();
-
- prefetchIntoRegisters(base_k);
- writeRegToShmem();
-
- #undef prefetchIntoRegisters
- #undef writeRegToShmem
-
- // wait for shared mem packing to be done before starting computation
- __syncthreads();
-
- // compute 8x8 matrix product by outer product. This involves packing one column
- // of LHS and one row of RHS into registers (takes 16 registers).
-
-#define lcol(i) _lcol##i
- Scalar lcol(0);
- Scalar lcol(1);
- Scalar lcol(2);
- Scalar lcol(3);
- Scalar lcol(4);
- Scalar lcol(5);
- Scalar lcol(6);
- Scalar lcol(7);
-
-#define rrow(j) _rrow##j
- Scalar rrow(0);
- Scalar rrow(1);
- Scalar rrow(2);
- Scalar rrow(3);
- Scalar rrow(4);
- Scalar rrow(5);
- Scalar rrow(6);
- Scalar rrow(7);
-
- // Now x corresponds to k, y to m, and z to n
- const Scalar* lhs_block = &lhs_shmem[hipThreadIdx_x + 9 * hipThreadIdx_y];
- const Scalar* rhs_block = &rhs_shmem[hipThreadIdx_x + 8 * hipThreadIdx_z];
-
-#define lhs_element(i, j) lhs_block[72 * ((i) + 8 * (j))]
-#define rhs_element(i, j) rhs_block[72 * ((i) + 8 * (j))]
-
-#define loadData(i, j) \
- lcol(0) = lhs_element(0, j); \
- rrow(0) = rhs_element(i, 0); \
- lcol(1) = lhs_element(1, j); \
- rrow(1) = rhs_element(i, 1); \
- lcol(2) = lhs_element(2, j); \
- rrow(2) = rhs_element(i, 2); \
- lcol(3) = lhs_element(3, j); \
- rrow(3) = rhs_element(i, 3); \
- lcol(4) = lhs_element(4, j); \
- rrow(4) = rhs_element(i, 4); \
- lcol(5) = lhs_element(5, j); \
- rrow(5) = rhs_element(i, 5); \
- lcol(6) = lhs_element(6, j); \
- rrow(6) = rhs_element(i, 6); \
- lcol(7) = lhs_element(7, j); \
- rrow(7) = rhs_element(i, 7); \
-
-#define computeCol(j) \
- res(0, j) += lcol(0) * rrow(j); \
- res(1, j) += lcol(1) * rrow(j); \
- res(2, j) += lcol(2) * rrow(j); \
- res(3, j) += lcol(3) * rrow(j); \
- res(4, j) += lcol(4) * rrow(j); \
- res(5, j) += lcol(5) * rrow(j); \
- res(6, j) += lcol(6) * rrow(j); \
- res(7, j) += lcol(7) * rrow(j); \
-
-#define computePass(i) \
- loadData(i, i); \
- \
- computeCol(0); \
- computeCol(1); \
- computeCol(2); \
- computeCol(3); \
- computeCol(4); \
- computeCol(5); \
- computeCol(6); \
- computeCol(7); \
-
- computePass(0);
- computePass(1);
- computePass(2);
- computePass(3);
- computePass(4);
- computePass(5);
- computePass(6);
- computePass(7);
-
-#undef lcol
-#undef rrow
-#undef lhs_element
-#undef rhs_element
-#undef loadData
-#undef computeCol
-#undef computePass
- } // end loop over k
-
- // we've now iterated over all of the large (ie width 64) k blocks and
- // accumulated results in registers. At this point thread (x, y, z) contains
- // the sum across all big k blocks of the product of little k block of index (x, y)
- // with block of index (y, z). To compute the final output, we need to reduce
- // the 8 threads over y by summation.
-#define shuffleInc(i, j, mask) res(i, j) += __shfl_xor(res(i, j), mask)
-
-#define reduceRow(i, mask) \
- shuffleInc(i, 0, mask); \
- shuffleInc(i, 1, mask); \
- shuffleInc(i, 2, mask); \
- shuffleInc(i, 3, mask); \
- shuffleInc(i, 4, mask); \
- shuffleInc(i, 5, mask); \
- shuffleInc(i, 6, mask); \
- shuffleInc(i, 7, mask); \
-
-#define reduceMatrix(mask) \
- reduceRow(0, mask); \
- reduceRow(1, mask); \
- reduceRow(2, mask); \
- reduceRow(3, mask); \
- reduceRow(4, mask); \
- reduceRow(5, mask); \
- reduceRow(6, mask); \
- reduceRow(7, mask); \
-
- // actually perform the reduction, now each thread of index (_, y, z)
- // contains the correct values in its registers that belong in the output
- // block
- reduceMatrix(1);
- reduceMatrix(2);
- reduceMatrix(4);
-
-#undef shuffleInc
-#undef reduceRow
-#undef reduceMatrix
-
- // now we need to copy the 64 values into main memory. We can't split work
- // among threads because all variables are in registers. There's 2 ways
- // to do this:
- // (1) have 1 thread do 64 writes from registers into global memory
- // (2) have 1 thread do 64 writes into shared memory, and then 8 threads
- // each do 8 writes into global memory. We can just overwrite the shared
- // memory from the problem we just solved.
- // (2) is slightly faster than (1) due to less branching and more ILP
-
- // TODO: won't yield much gain, but could just use currently unused shared mem
- // and then we won't have to sync
- // wait for shared mem to be out of use
- __syncthreads();
-
-#define writeResultShmem(i, j) \
- lhs_shmem[i + 8 * hipThreadIdx_y + 64 * hipThreadIdx_z + 512 * j] = res(i, j); \
-
-#define writeRow(i) \
- writeResultShmem(i, 0); \
- writeResultShmem(i, 1); \
- writeResultShmem(i, 2); \
- writeResultShmem(i, 3); \
- writeResultShmem(i, 4); \
- writeResultShmem(i, 5); \
- writeResultShmem(i, 6); \
- writeResultShmem(i, 7); \
-
- if (hipThreadIdx_x == 0) {
- writeRow(0);
- writeRow(1);
- writeRow(2);
- writeRow(3);
- writeRow(4);
- writeRow(5);
- writeRow(6);
- writeRow(7);
- }
-#undef writeResultShmem
-#undef writeRow
-
- const int max_i_write = numext::mini((int)((m_size - base_m - hipThreadIdx_y + 7) / 8), 8);
- const int max_j_write = numext::mini((int)((n_size - base_n - hipThreadIdx_z + 7) / 8), 8);
-
- if (hipThreadIdx_x < max_i_write) {
- if (max_j_write == 8) {
- // TODO: can i trade bank conflicts for coalesced writes?
- Scalar val0 = lhs_shmem[hipThreadIdx_x + 8 * hipThreadIdx_y + 64 * hipThreadIdx_z + 512 * 0];
- Scalar val1 = lhs_shmem[hipThreadIdx_x + 8 * hipThreadIdx_y + 64 * hipThreadIdx_z + 512 * 1];
- Scalar val2 = lhs_shmem[hipThreadIdx_x + 8 * hipThreadIdx_y + 64 * hipThreadIdx_z + 512 * 2];
- Scalar val3 = lhs_shmem[hipThreadIdx_x + 8 * hipThreadIdx_y + 64 * hipThreadIdx_z + 512 * 3];
- Scalar val4 = lhs_shmem[hipThreadIdx_x + 8 * hipThreadIdx_y + 64 * hipThreadIdx_z + 512 * 4];
- Scalar val5 = lhs_shmem[hipThreadIdx_x + 8 * hipThreadIdx_y + 64 * hipThreadIdx_z + 512 * 5];
- Scalar val6 = lhs_shmem[hipThreadIdx_x + 8 * hipThreadIdx_y + 64 * hipThreadIdx_z + 512 * 6];
- Scalar val7 = lhs_shmem[hipThreadIdx_x + 8 * hipThreadIdx_y + 64 * hipThreadIdx_z + 512 * 7];
-
- output(base_m + hipThreadIdx_y + 8 * hipThreadIdx_x, base_n + hipThreadIdx_z + 8 * 0) = val0;
- output(base_m + hipThreadIdx_y + 8 * hipThreadIdx_x, base_n + hipThreadIdx_z + 8 * 1) = val1;
- output(base_m + hipThreadIdx_y + 8 * hipThreadIdx_x, base_n + hipThreadIdx_z + 8 * 2) = val2;
- output(base_m + hipThreadIdx_y + 8 * hipThreadIdx_x, base_n + hipThreadIdx_z + 8 * 3) = val3;
- output(base_m + hipThreadIdx_y + 8 * hipThreadIdx_x, base_n + hipThreadIdx_z + 8 * 4) = val4;
- output(base_m + hipThreadIdx_y + 8 * hipThreadIdx_x, base_n + hipThreadIdx_z + 8 * 5) = val5;
- output(base_m + hipThreadIdx_y + 8 * hipThreadIdx_x, base_n + hipThreadIdx_z + 8 * 6) = val6;
- output(base_m + hipThreadIdx_y + 8 * hipThreadIdx_x, base_n + hipThreadIdx_z + 8 * 7) = val7;
- } else {
-#pragma unroll 7
- for (int j = 0; j < max_j_write; j++) {
- Scalar val = lhs_shmem[hipThreadIdx_x + 8 * hipThreadIdx_y + 64 * hipThreadIdx_z + 512 * j];
- output(base_m + hipThreadIdx_y + 8 * hipThreadIdx_x, base_n + hipThreadIdx_z + 8 * j) = val;
- }
- }
- }
-#undef res
-}
-
-
-template<typename Scalar, typename Index, typename LhsMapper,
- typename RhsMapper, typename OutputMapper>
-__global__ void
-__launch_bounds__(512, 1)
-EigenContractionKernel(const LhsMapper lhs, const RhsMapper rhs,
- const OutputMapper output,
- const Index m_size, const Index n_size, const Index k_size) {
- __shared__ Scalar lhs_shmem[72 * 64];
- __shared__ Scalar rhs_shmem[72 * 64];
-
- const Index m_block_idx = hipBlockIdx_x;
- const Index n_block_idx = hipBlockIdx_y;
-
- const Index base_m = 64 * m_block_idx;
- const Index base_n = 64 * n_block_idx;
-
- if (base_m + 63 < m_size && base_n + 63 < n_size) {
- EigenContractionKernelInternal<Scalar, Index, LhsMapper, RhsMapper, OutputMapper, false>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size);
- } else {
- EigenContractionKernelInternal<Scalar, Index, LhsMapper, RhsMapper, OutputMapper, true>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size);
- }
-}
-
-
-template<typename Index, typename LhsMapper,
- typename RhsMapper, typename OutputMapper, bool CHECK_LHS_BOUNDARY,
- bool CHECK_RHS_BOUNDARY>
-__device__ EIGEN_STRONG_INLINE void
-EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rhs,
- const OutputMapper output, float2 lhs_shmem2[][16],
- float2 rhs_shmem2[][8], const Index m_size,
- const Index n_size, const Index k_size,
- const Index base_m, const Index base_n) {
-
- // prefetch registers
- float4 lhs_pf0, rhs_pf0;
-
- float4 results[4];
- for (int i=0; i < 4; i++) {
- results[i].x = results[i].y = results[i].z = results[i].w = 0;
- }
-
-
-#define prefetch_lhs(reg, row, col) \
- if (!CHECK_LHS_BOUNDARY) { \
- if (col < k_size) { \
- /*reg = lhs.template loadPacket<Unaligned>(row, col);*/ \
- reg.x =lhs(row + 0, col); \
- reg.y =lhs(row + 1, col); \
- reg.z =lhs(row + 2, col); \
- reg.w =lhs(row + 3, col); \
- } \
- } else { \
- if (col < k_size) { \
- if (row + 3 < m_size) { \
- /*reg =lhs.template loadPacket<Unaligned>(row, col);*/ \
- reg.x =lhs(row + 0, col); \
- reg.y =lhs(row + 1, col); \
- reg.z =lhs(row + 2, col); \
- reg.w =lhs(row + 3, col); \
- } else if (row + 2 < m_size) { \
- reg.x =lhs(row + 0, col); \
- reg.y =lhs(row + 1, col); \
- reg.z =lhs(row + 2, col); \
- } else if (row + 1 < m_size) { \
- reg.x =lhs(row + 0, col); \
- reg.y =lhs(row + 1, col); \
- } else if (row < m_size) { \
- reg.x =lhs(row + 0, col); \
- } \
- } \
- } \
-
-
- Index lhs_vert = base_m+hipThreadIdx_x*4;
-
- for (Index k = 0; k < k_size; k += 16) {
- //lhs_pf0 = internal::pset1<float4>(0);
- //rhs_pf0 = internal::pset1<float4>(0);
- lhs_pf0 = make_float4(0, 0, 0, 0);
- rhs_pf0 = make_float4(0, 0, 0, 0);
-
- Index lhs_horiz = hipThreadIdx_y+k;
- prefetch_lhs(lhs_pf0, lhs_vert, lhs_horiz)
-
- Index rhs_vert = k+(hipThreadIdx_x%4)*4;
- Index rhs_horiz0 = (hipThreadIdx_x>>2)+hipThreadIdx_y*4+base_n;
-
- if (!CHECK_RHS_BOUNDARY) {
- if ((rhs_vert + 3) < k_size) {
- // just CHECK_RHS_BOUNDARY
- //rhs_pf0 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
- rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
- rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
- rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
- rhs_pf0.w = rhs(rhs_vert + 3, rhs_horiz0);
- } else if (rhs_vert + 2 < k_size) {
- // just CHECK_RHS_BOUNDARY
- rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
- rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
- rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
- } else if (rhs_vert + 1 < k_size) {
- rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
- rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
- } else if (rhs_vert < k_size) {
- rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
- }
- } else {
- if (rhs_horiz0 < n_size) {
- if ((rhs_vert + 3) < k_size) {
- //rhs_pf0 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
- rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
- rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
- rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
- rhs_pf0.w = rhs(rhs_vert + 3, rhs_horiz0);
- } else if ((rhs_vert + 2) < k_size) {
- rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
- rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
- rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
- } else if ((rhs_vert + 1) < k_size) {
- rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
- rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
- } else if (rhs_vert < k_size) {
- rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
- }
- }
- }
- float x1, x2 ;
- // the following can be a bitwise operation..... some day.
- if((hipThreadIdx_x%8) < 4) {
- x1 = rhs_pf0.y;
- x2 = rhs_pf0.w;
- } else {
- x1 = rhs_pf0.x;
- x2 = rhs_pf0.z;
- }
- x1 = __shfl_xor(x1, 4);
- x2 = __shfl_xor(x2, 4);
- if((hipThreadIdx_x%8) < 4) {
- rhs_pf0.y = x1;
- rhs_pf0.w = x2;
- } else {
- rhs_pf0.x = x1;
- rhs_pf0.z = x2;
- }
-
- // We have 64 features.
- // Row 0 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 0, 1.
- // Row 1 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 2, 3.
- // ...
- // Row 31 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 62, 63
- // Row 32 -> times (2, 6, 10, 14, 3, 7, 11, 15) for features 0, 1
- // ...
- rhs_shmem2[(hipThreadIdx_x>>3)+ hipThreadIdx_y*2][hipThreadIdx_x%8] = make_float2(rhs_pf0.x, rhs_pf0.y);
- rhs_shmem2[(hipThreadIdx_x>>3)+ hipThreadIdx_y*2+32][hipThreadIdx_x%8] = make_float2(rhs_pf0.z, rhs_pf0.w);
-
- // Row 0 (time 0) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61)
- // Row 1 (time 1) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61)
- // ...
- // Row 15 (time 15) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61)
- // Row 16 (time 0) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63)
- // ...
-
- lhs_shmem2[hipThreadIdx_y][hipThreadIdx_x] = make_float2(lhs_pf0.x, lhs_pf0.y);
- lhs_shmem2[hipThreadIdx_y+16][hipThreadIdx_x] = make_float2(lhs_pf0.z, lhs_pf0.w);
-
-
-#define add_vals(fl1, fl2, fr1, fr2)\
- results[0].x += fl1.x * fr1.x;\
- results[0].y += fl1.y * fr1.x;\
- results[0].z += fl2.x * fr1.x;\
- results[0].w += fl2.y * fr1.x;\
-\
- results[1].x += fl1.x * fr1.y;\
- results[1].y += fl1.y * fr1.y;\
- results[1].z += fl2.x * fr1.y;\
- results[1].w += fl2.y * fr1.y;\
-\
- results[2].x += fl1.x * fr2.x;\
- results[2].y += fl1.y * fr2.x;\
- results[2].z += fl2.x * fr2.x;\
- results[2].w += fl2.y * fr2.x;\
-\
- results[3].x += fl1.x * fr2.y;\
- results[3].y += fl1.y * fr2.y;\
- results[3].z += fl2.x * fr2.y;\
- results[3].w += fl2.y * fr2.y;\
-
- __syncthreads();
-
- // Do the multiplies.
- #pragma unroll
- for (int koff = 0; koff < 16; koff ++) {
- // 32 x threads.
- float2 fl1 = lhs_shmem2[koff][hipThreadIdx_x];
- float2 fl2 = lhs_shmem2[koff + 16][hipThreadIdx_x];
-
- int start_feature = hipThreadIdx_y * 4;
- float2 fr1 = rhs_shmem2[(start_feature>>1) + 32*((koff%4)/2)][koff/4 + (koff%2)*4];
- float2 fr2 = rhs_shmem2[(start_feature>>1) + 1 + 32*((koff%4)/2)][koff/4 + (koff%2)*4];
-
- add_vals(fl1, fl2, fr1, fr2)
- }
- __syncthreads();
- }
-
-#undef prefetch_lhs
-#undef add_vals
-
- Index horiz_base = hipThreadIdx_y*4+base_n;
- if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) {
- for (int i = 0; i < 4; i++) {
- output(lhs_vert, horiz_base + i) = results[i].x;
- output(lhs_vert + 1, horiz_base + i) = results[i].y;
- output(lhs_vert + 2, horiz_base + i) = results[i].z;
- output(lhs_vert + 3, horiz_base + i) = results[i].w;
- }
- } else if (!CHECK_RHS_BOUNDARY) {
- // CHECK LHS
- if (lhs_vert + 3 < m_size) {
- for (int i = 0; i < 4; i++) {
- output(lhs_vert, horiz_base + i) = results[i].x;
- output(lhs_vert + 1, horiz_base + i) = results[i].y;
- output(lhs_vert + 2, horiz_base + i) = results[i].z;
- output(lhs_vert + 3, horiz_base + i) = results[i].w;
- }
- } else if (lhs_vert + 2 < m_size) {
- for (int i = 0; i < 4; i++) {
- output(lhs_vert, horiz_base + i) = results[i].x;
- output(lhs_vert + 1, horiz_base + i) = results[i].y;
- output(lhs_vert + 2, horiz_base + i) = results[i].z;
- }
- } else if (lhs_vert + 1 < m_size) {
- for (int i = 0; i < 4; i++) {
- output(lhs_vert, horiz_base + i) = results[i].x;
- output(lhs_vert + 1, horiz_base + i) = results[i].y;
- }
- } else if (lhs_vert < m_size) {
- for (int i = 0; i < 4; i++) {
- output(lhs_vert, horiz_base + i) = results[i].x;
- }
- }
- } else if (!CHECK_LHS_BOUNDARY) {
- // CHECK RHS
- /*
- int ncols_rem = fminf(n_size- horiz_base, 4);
- for (int i = 0; i < ncols_rem; i++) {
- output(lhs_vert, horiz_base + i) = results[i].x;
- output(lhs_vert + 1, horiz_base + i) = results[i].y;
- output(lhs_vert + 2, horiz_base + i) = results[i].z;
- output(lhs_vert + 3, horiz_base + i) = results[i].w;
- }*/
- for (int i = 0; i < 4; i++) {
- if (horiz_base+i < n_size) {
- output(lhs_vert, horiz_base + i) = results[i].x;
- output(lhs_vert + 1, horiz_base + i) = results[i].y;
- output(lhs_vert + 2, horiz_base + i) = results[i].z;
- output(lhs_vert + 3, horiz_base + i) = results[i].w;
- }
- }
- } else {
- // CHECK both boundaries.
- for (int i = 0; i < 4; i++) {
- if (horiz_base+i < n_size) {
- if (lhs_vert < m_size)
- output(lhs_vert, horiz_base + i) = results[i].x;
- if (lhs_vert + 1 < m_size)
- output(lhs_vert + 1, horiz_base + i) = results[i].y;
- if (lhs_vert + 2 < m_size)
- output(lhs_vert + 2, horiz_base + i) = results[i].z;
- if (lhs_vert + 3 < m_size)
- output(lhs_vert + 3, horiz_base + i) = results[i].w;
- }
- }
- }
-}
-
-
-template<typename Index, typename LhsMapper,
- typename RhsMapper, typename OutputMapper, bool CHECK_LHS_BOUNDARY,
- bool CHECK_RHS_BOUNDARY>
-__device__ EIGEN_STRONG_INLINE void
-EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
- const OutputMapper output, float2 lhs_shmem2[][32],
- float2 rhs_shmem2[][8], const Index m_size,
- const Index n_size, const Index k_size,
- const Index base_m, const Index base_n) {
-
- // prefetch registers
- float4 lhs_pf0, lhs_pf1, lhs_pf2, lhs_pf3;
- float4 rhs_pf0, rhs_pf1;
-
- float4 results[8];
- for (int i=0; i < 8; i++) {
- results[i].x = results[i].y = results[i].z = results[i].w = 0;
- }
-
-
- Index lhs_vert = base_m+hipThreadIdx_x*4+(hipThreadIdx_y%4)*32;
- for (Index k = 0; k < k_size; k += 32) {
- /*lhs_pf0 = internal::pset1<float4>(0);
- lhs_pf1 = internal::pset1<float4>(0);
- lhs_pf2 = internal::pset1<float4>(0);
- lhs_pf3 = internal::pset1<float4>(0);
-
- rhs_pf0 = internal::pset1<float4>(0);
- rhs_pf1 = internal::pset1<float4>(0);*/
-
-
- lhs_pf0 = make_float4(0, 0, 0, 0);
- lhs_pf1 = make_float4(0, 0, 0, 0);
- lhs_pf2 = make_float4(0, 0, 0, 0);
- lhs_pf3 = make_float4(0, 0, 0, 0);
-
- rhs_pf0 = make_float4(0, 0, 0, 0);
- rhs_pf1 = make_float4(0, 0, 0, 0);
-
- if (!CHECK_LHS_BOUNDARY) {
- if ((hipThreadIdx_y/4+k+24) < k_size) {
- //lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (hipThreadIdx_y/4+k));
- //lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (hipThreadIdx_y/4+k+8));
- //lhs_pf2 =lhs.template loadPacket<Unaligned>(lhs_vert, (hipThreadIdx_y/4+k+16));
- //lhs_pf3 =lhs.template loadPacket<Unaligned>(lhs_vert, (hipThreadIdx_y/4+k+24));
- lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k));
- lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k));
- lhs_pf0.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k));
- lhs_pf0.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k));
- lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8));
- lhs_pf1.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+8));
- lhs_pf1.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8));
- lhs_pf1.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8));
- lhs_pf2.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+16));
- lhs_pf2.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+16));
- lhs_pf2.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+16));
- lhs_pf2.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+16));
- lhs_pf3.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+24));
- lhs_pf3.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+24));
- lhs_pf3.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+24));
- lhs_pf3.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+24));
- } else if ((hipThreadIdx_y/4+k+16) < k_size) {
- //lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (hipThreadIdx_y/4+k));
- //lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (hipThreadIdx_y/4+k+8));
- //lhs_pf2 =lhs.template loadPacket<Unaligned>(lhs_vert, (hipThreadIdx_y/4+k+16));
- lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k));
- lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k));
- lhs_pf0.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k));
- lhs_pf0.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k));
- lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8));
- lhs_pf1.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+8));
- lhs_pf1.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8));
- lhs_pf1.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8));
- lhs_pf2.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+16));
- lhs_pf2.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+16));
- lhs_pf2.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+16));
- lhs_pf2.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+16));
- } else if ((hipThreadIdx_y/4+k+8) < k_size) {
- //lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (hipThreadIdx_y/4+k));
- //lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (hipThreadIdx_y/4+k+8));
- lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k));
- lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k));
- lhs_pf0.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k));
- lhs_pf0.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k));
- lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8));
- lhs_pf1.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+8));
- lhs_pf1.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8));
- lhs_pf1.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8));
- } else if ((hipThreadIdx_y/4+k) < k_size) {
- //lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (hipThreadIdx_y/4+k));
- lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k));
- lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k));
- lhs_pf0.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k));
- lhs_pf0.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k));
- }
- } else {
- // just CHECK_LHS_BOUNDARY
- if (lhs_vert + 3 < m_size) {
- if ((hipThreadIdx_y/4+k+24) < k_size) {
- //lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (hipThreadIdx_y/4+k));
- //lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (hipThreadIdx_y/4+k+8));
- //lhs_pf2 =lhs.template loadPacket<Unaligned>(lhs_vert, (hipThreadIdx_y/4+k+16));
- //lhs_pf3 =lhs.template loadPacket<Unaligned>(lhs_vert, (hipThreadIdx_y/4+k+24));
- lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k));
- lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k));
- lhs_pf0.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k));
- lhs_pf0.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k));
- lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8));
- lhs_pf1.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+8));
- lhs_pf1.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8));
- lhs_pf1.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8));
- lhs_pf2.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+16));
- lhs_pf2.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+16));
- lhs_pf2.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+16));
- lhs_pf2.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+16));
- lhs_pf3.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+24));
- lhs_pf3.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+24));
- lhs_pf3.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+24));
- lhs_pf3.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+24));
- } else if ((hipThreadIdx_y/4+k+16) < k_size) {
- //lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (hipThreadIdx_y/4+k));
- //lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (hipThreadIdx_y/4+k+8));
- //lhs_pf2 =lhs.template loadPacket<Unaligned>(lhs_vert, (hipThreadIdx_y/4+k+16));
- lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k));
- lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k));
- lhs_pf0.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k));
- lhs_pf0.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k));
- lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8));
- lhs_pf1.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+8));
- lhs_pf1.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8));
- lhs_pf1.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8));
- lhs_pf2.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+16));
- lhs_pf2.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+16));
- lhs_pf2.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+16));
- lhs_pf2.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+16));
- } else if ((hipThreadIdx_y/4+k+8) < k_size) {
- //lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (hipThreadIdx_y/4+k));
- //lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (hipThreadIdx_y/4+k+8));
- lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k));
- lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k));
- lhs_pf0.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k));
- lhs_pf0.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k));
- lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8));
- lhs_pf1.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+8));
- lhs_pf1.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8));
- lhs_pf1.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8));
- } else if ((hipThreadIdx_y/4+k) < k_size) {
- //lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (hipThreadIdx_y/4+k));
- lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k));
- lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k));
- lhs_pf0.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k));
- lhs_pf0.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k));
- }
- } else if (lhs_vert + 2 < m_size) {
- if ((hipThreadIdx_y/4+k+24) < k_size) {
- lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k));
- lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k));
- lhs_pf0.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k));
- lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8));
- lhs_pf1.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+8));
- lhs_pf1.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8));
- lhs_pf2.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+16));
- lhs_pf2.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+16));
- lhs_pf2.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+16));
- lhs_pf3.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+24));
- lhs_pf3.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+24));
- lhs_pf3.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+24));
- } else if ((hipThreadIdx_y/4+k+16) < k_size) {
- lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k));
- lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k));
- lhs_pf0.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k));
- lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8));
- lhs_pf1.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+8));
- lhs_pf1.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8));
- lhs_pf2.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+16));
- lhs_pf2.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+16));
- lhs_pf2.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+16));
- } else if ((hipThreadIdx_y/4+k+8) < k_size) {
- lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k));
- lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k));
- lhs_pf0.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k));
- lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8));
- lhs_pf1.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+8));
- lhs_pf1.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8));
- } else if ((hipThreadIdx_y/4+k) < k_size) {
- lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k));
- lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k));
- lhs_pf0.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k));
- }
- } else if (lhs_vert + 1 < m_size) {
- if ((hipThreadIdx_y/4+k+24) < k_size) {
- lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k));
- lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k));
- lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8));
- lhs_pf1.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+8));
- lhs_pf2.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+16));
- lhs_pf2.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+16));
- lhs_pf3.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+24));
- lhs_pf3.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+24));
- } else if ((hipThreadIdx_y/4+k+16) < k_size) {
- lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k));
- lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k));
- lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8));
- lhs_pf1.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+8));
- lhs_pf2.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+16));
- lhs_pf2.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+16));
- } else if ((hipThreadIdx_y/4+k+8) < k_size) {
- lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k));
- lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k));
- lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8));
- lhs_pf1.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+8));
- } else if ((hipThreadIdx_y/4+k) < k_size) {
- lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k));
- lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k));
- }
- } else if (lhs_vert < m_size) {
- if ((hipThreadIdx_y/4+k+24) < k_size) {
- lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k));
- lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8));
- lhs_pf2.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+16));
- lhs_pf3.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+24));
- } else if ((hipThreadIdx_y/4+k+16) < k_size) {
- lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k));
- lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8));
- lhs_pf2.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+16));
- } else if ((hipThreadIdx_y/4+k+8) < k_size) {
- lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k));
- lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8));
- } else if ((hipThreadIdx_y/4+k) < k_size) {
- lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k));
- }
- }
- }
- __syncthreads();
- Index rhs_vert = k+hipThreadIdx_x*4;
- Index rhs_horiz0 = hipThreadIdx_y*2+base_n;
- Index rhs_horiz1 = hipThreadIdx_y*2+1+base_n;
- if (!CHECK_RHS_BOUNDARY) {
- if ((rhs_vert + 3) < k_size) {
- // just CHECK_RHS_BOUNDARY
- //rhs_pf0 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
- //rhs_pf1 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz1);
- rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
- rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
- rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
- rhs_pf0.w = rhs(rhs_vert + 3, rhs_horiz0);
- rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
- rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1);
- rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1);
- rhs_pf1.w = rhs(rhs_vert + 3, rhs_horiz1);
- } else if (rhs_vert + 2 < k_size) {
- // just CHECK_RHS_BOUNDARY
- rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
- rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
- rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
- rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
- rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1);
- rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1);
- } else if (rhs_vert + 1 < k_size) {
- rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
- rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
- rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
- rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1);
- } else if (rhs_vert < k_size) {
- rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
- rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
- }
- } else {
- if (rhs_horiz1 < n_size) {
- if ((rhs_vert + 3) < k_size) {
- // just CHECK_RHS_BOUNDARY
- //rhs_pf0 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
- //rhs_pf1 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz1);
- rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
- rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
- rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
- rhs_pf0.w = rhs(rhs_vert + 3, rhs_horiz0);
- rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
- rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1);
- rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1);
- rhs_pf1.w = rhs(rhs_vert + 3, rhs_horiz1);
- } else if (rhs_vert + 2 < k_size) {
- // just CHECK_RHS_BOUNDARY
- rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
- rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
- rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
- rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
- rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1);
- rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1);
- } else if (k+hipThreadIdx_x*4 + 1 < k_size) {
- rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
- rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
- rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
- rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1);
- } else if (k+hipThreadIdx_x*4 < k_size) {
- rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
- rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
- }
- } else if (rhs_horiz0 < n_size) {
- if ((rhs_vert + 3) < k_size) {
- // just CHECK_RHS_BOUNDARY
- //rhs_pf0 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
- rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
- rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
- rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
- rhs_pf0.w = rhs(rhs_vert + 3, rhs_horiz0);
- } else if ((rhs_vert + 2) < k_size) {
- // just CHECK_RHS_BOUNDARY
- rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
- rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
- rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
- } else if ((rhs_vert + 1) < k_size) {
- rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
- rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
- } else if (rhs_vert < k_size) {
- rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
- }
- }
- }
- __syncthreads();
- // Loaded. Do computation
- // Row 0 -> times (0, 4, 8, .. 28) for features 0, 1.
- // Row 1 -> times (0, 4, 8, .. 28) for features 2, 3.
- // ..
- // Row 31 -> times (0, 4, 8, .. 28) for features 62, 63
- rhs_shmem2[hipThreadIdx_y][hipThreadIdx_x] = make_float2(rhs_pf0.x, rhs_pf1.x);
- // Row 32 -> times (1, 5, 9, .. 29) for features 0, 1.
- // Row 33 -> times (1, 5, 9, .. 29) for features 2, 3.
- // ..
- rhs_shmem2[hipThreadIdx_y+32][hipThreadIdx_x] = make_float2(rhs_pf0.y, rhs_pf1.y);
- // Row 64 -> times (2, 6, 10, .. 30) for features 0, 1.
- // Row 65 -> times (2, 6, 10, .. 30) for features 2, 3.
- rhs_shmem2[hipThreadIdx_y+64][hipThreadIdx_x] = make_float2(rhs_pf0.z, rhs_pf1.z);
- // Row 96 -> times (3, 7, 11, .. 31) for features 0, 1.
- // Row 97 -> times (3, 7, 11, .. 31) for features 2, 3.
- rhs_shmem2[hipThreadIdx_y+96][hipThreadIdx_x] = make_float2(rhs_pf0.w, rhs_pf1.w);
-
- // LHS.
- // Row 0 (time 0) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) .. (124, 125)
- // Row 1 (time 1) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) .. (124, 125)
- // ...
- // Row 8 (time 0) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63) .. (126, 127)
- // Row 15 (time 7) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63) .. (126, 127)
-
-
-#define add_vals(a_feat1, a_feat2, f1, f2, f3, f4)\
- results[0].x += a_feat1.x * f1.x;\
- results[1].x += a_feat1.x * f1.y;\
- results[2].x += a_feat1.x * f2.x;\
- results[3].x += a_feat1.x * f2.y;\
- results[4].x += a_feat1.x * f3.x;\
- results[5].x += a_feat1.x * f3.y;\
- results[6].x += a_feat1.x * f4.x;\
- results[7].x += a_feat1.x * f4.y;\
-\
- results[0].y += a_feat1.y * f1.x;\
- results[1].y += a_feat1.y * f1.y;\
- results[2].y += a_feat1.y * f2.x;\
- results[3].y += a_feat1.y * f2.y;\
- results[4].y += a_feat1.y * f3.x;\
- results[5].y += a_feat1.y * f3.y;\
- results[6].y += a_feat1.y * f4.x;\
- results[7].y += a_feat1.y * f4.y;\
-\
- results[0].z += a_feat2.x * f1.x;\
- results[1].z += a_feat2.x * f1.y;\
- results[2].z += a_feat2.x * f2.x;\
- results[3].z += a_feat2.x * f2.y;\
- results[4].z += a_feat2.x * f3.x;\
- results[5].z += a_feat2.x * f3.y;\
- results[6].z += a_feat2.x * f4.x;\
- results[7].z += a_feat2.x * f4.y;\
-\
- results[0].w += a_feat2.y * f1.x;\
- results[1].w += a_feat2.y * f1.y;\
- results[2].w += a_feat2.y * f2.x;\
- results[3].w += a_feat2.y * f2.y;\
- results[4].w += a_feat2.y * f3.x;\
- results[5].w += a_feat2.y * f3.y;\
- results[6].w += a_feat2.y * f4.x;\
- results[7].w += a_feat2.y * f4.y;\
-
- lhs_shmem2[hipThreadIdx_y/4][hipThreadIdx_x+(hipThreadIdx_y%4)*8] = make_float2(lhs_pf0.x, lhs_pf0.y);
- lhs_shmem2[hipThreadIdx_y/4+8][hipThreadIdx_x+(hipThreadIdx_y%4)*8] = make_float2(lhs_pf1.x, lhs_pf1.y);
- lhs_shmem2[hipThreadIdx_y/4+16][hipThreadIdx_x+(hipThreadIdx_y%4)*8] = make_float2(lhs_pf2.x, lhs_pf2.y);
- lhs_shmem2[hipThreadIdx_y/4+24][hipThreadIdx_x+(hipThreadIdx_y%4)*8] = make_float2(lhs_pf3.x, lhs_pf3.y);
-
- lhs_shmem2[hipThreadIdx_y/4 + 32][hipThreadIdx_x+(hipThreadIdx_y%4)*8] = make_float2(lhs_pf0.z, lhs_pf0.w);
- lhs_shmem2[hipThreadIdx_y/4 + 40][hipThreadIdx_x+(hipThreadIdx_y%4)*8] = make_float2(lhs_pf1.z, lhs_pf1.w);
- lhs_shmem2[hipThreadIdx_y/4 + 48][hipThreadIdx_x+(hipThreadIdx_y%4)*8] = make_float2(lhs_pf2.z, lhs_pf2.w);
- lhs_shmem2[hipThreadIdx_y/4 + 56][hipThreadIdx_x+(hipThreadIdx_y%4)*8] = make_float2(lhs_pf3.z, lhs_pf3.w);
-
- __syncthreads();
-
- // Do the multiplies.
- #pragma unroll
- for (int koff = 0; koff < 32; koff ++) {
- float2 a3 = lhs_shmem2[koff][hipThreadIdx_x + (hipThreadIdx_y % 4) * 8];
- float2 a4 = lhs_shmem2[koff + 32][hipThreadIdx_x + (hipThreadIdx_y % 4) * 8];
-
- // first feature is at (hipThreadIdx_y/4) * 8 last is at start + 8.
- int start_feature = (hipThreadIdx_y / 4) * 8;
-
- float2 br1 = rhs_shmem2[start_feature/2 + (koff % 4) * 32][koff/4];
- float2 br2 = rhs_shmem2[start_feature/2 + 1 + (koff % 4) * 32][koff/4];
- float2 br3 = rhs_shmem2[start_feature/2 + 2 + (koff % 4) * 32][koff/4];
- float2 br4 = rhs_shmem2[start_feature/2 + 3 + (koff % 4) * 32][koff/4];
-
- add_vals(a3, a4, br1, br2, br3, br4)
- }
- __syncthreads();
- } // end loop over k
-
-
- __syncthreads();
- Index horiz_base = (hipThreadIdx_y/4)*8+base_n;
- if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) {
- for (int i = 0; i < 8; i++) {
- output(lhs_vert, horiz_base + i) = results[i].x;
- output(lhs_vert + 1, horiz_base + i) = results[i].y;
- output(lhs_vert + 2, horiz_base + i) = results[i].z;
- output(lhs_vert + 3, horiz_base + i) = results[i].w;
- }
- } else if (!CHECK_RHS_BOUNDARY) {
- if (lhs_vert + 3 < m_size) {
- for (int i = 0; i < 8; i++) {
- output(lhs_vert, horiz_base + i) = results[i].x;
- output(lhs_vert + 1, horiz_base + i) = results[i].y;
- output(lhs_vert + 2, horiz_base + i) = results[i].z;
- output(lhs_vert + 3, horiz_base + i) = results[i].w;
- }
- } else if (lhs_vert + 2 < m_size) {
- for (int i = 0; i < 8; i++) {
- output(lhs_vert, horiz_base + i) = results[i].x;
- output(lhs_vert + 1, horiz_base + i) = results[i].y;
- output(lhs_vert + 2, horiz_base + i) = results[i].z;
- }
- } else if (lhs_vert + 1 < m_size) {
- for (int i = 0; i < 8; i++) {
- output(lhs_vert, horiz_base + i) = results[i].x;
- output(lhs_vert + 1, horiz_base + i) = results[i].y;
- }
- } else if (lhs_vert < m_size) {
- for (int i = 0; i < 8; i++) {
- output(lhs_vert, horiz_base + i) = results[i].x;
- }
- }
- } else if (!CHECK_LHS_BOUNDARY) {
- // CHECK BOUNDARY_B
- for (int i = 0; i < 8; i++) {
- if (horiz_base + i < n_size) {
- output(lhs_vert, horiz_base + i) = results[i].x;
- output(lhs_vert + 1, horiz_base + i) = results[i].y;
- output(lhs_vert + 2, horiz_base + i) = results[i].z;
- output(lhs_vert + 3, horiz_base + i) = results[i].w;
- }
- }
- } else {
- // CHECK both boundaries.
- for (int i = 0; i < 8; i++) {
- if (horiz_base + i < n_size) {
- if (lhs_vert < m_size)
- output(lhs_vert, horiz_base + i) = results[i].x;
- if (lhs_vert + 1 < m_size)
- output(lhs_vert + 1, horiz_base + i) = results[i].y;
- if (lhs_vert + 2 < m_size)
- output(lhs_vert + 2, horiz_base + i) = results[i].z;
- if (lhs_vert + 3 < m_size)
- output(lhs_vert + 3, horiz_base + i) = results[i].w;
- }
- }
- }
-}
-
-
-template<typename Index, typename LhsMapper,
- typename RhsMapper, typename OutputMapper>
-__global__ void
-__launch_bounds__(256, 1)
-EigenFloatContractionKernel(const LhsMapper lhs, const RhsMapper rhs,
- const OutputMapper output,
- const Index m_size, const Index n_size, const Index k_size) {
- __shared__ float2 lhs_shmem[64*32];
- __shared__ float2 rhs_shmem[128*8];
-
- typedef float2 LHS_MEM[64][32];
- typedef float2 RHS_MEM[128][8];
-
- typedef float2 LHS_MEM16x16[32][16];
- typedef float2 RHS_MEM16x16[64][8];
-
- const Index m_block_idx = hipBlockIdx_x;
- const Index n_block_idx = hipBlockIdx_y;
-
- const Index base_m = 128 * m_block_idx;
- const Index base_n = 64 * n_block_idx;
-
- bool check_rhs = (base_n + 63) >= n_size;
- bool check_lhs128 = (base_m + 127) >= m_size;
-
- if (!check_rhs) {
- if (!check_lhs128) {
- // >= 128 rows left
- EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, false, false>(
- lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n);
- } else {
- EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, true, false>(
- lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n);
- }
- } else {
- if (!check_lhs128) {
- // >= 128 rows left
- EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, false, true>(
- lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n);
- } else {
- EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, true, true>(
- lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n);
- }
- }
-}
-
-template<typename Index, typename LhsMapper,
- typename RhsMapper, typename OutputMapper>
-__global__ void
-__launch_bounds__(256, 1)
-EigenFloatContractionKernel16x16(const LhsMapper lhs, const RhsMapper rhs,
- const OutputMapper output,
- const Index m_size, const Index n_size, const Index k_size) {
- __shared__ float2 lhs_shmem[32][16];
- __shared__ float2 rhs_shmem[64][8];
-
- const Index m_block_idx = hipBlockIdx_x;
- const Index n_block_idx = hipBlockIdx_y;
-
- const Index base_m = 64 * m_block_idx;
- const Index base_n = 64 * n_block_idx;
-
- if (base_m + 63 < m_size) {
- if (base_n + 63 < n_size) {
- EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, false, false>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n);
- } else {
- EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, false, true>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n);
- }
- } else {
- if (base_n + 63 < n_size) {
- EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, true, false>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n);
- } else {
- EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, true, true>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n);
- }
- }
-}
-
-
-template<typename Indices, typename LeftArgType, typename RightArgType>
-struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, GpuDevice> :
- public TensorContractionEvaluatorBase<TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, GpuDevice> > {
-
- typedef GpuDevice Device;
-
- typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, Device> Self;
- typedef TensorContractionEvaluatorBase<Self> Base;
-
- typedef TensorContractionOp<Indices, LeftArgType, RightArgType> XprType;
- typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
- typedef typename XprType::Index Index;
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, GpuDevice>::type PacketReturnType;
-
- enum {
- Layout = TensorEvaluator<LeftArgType, Device>::Layout,
- };
-
- // Most of the code is assuming that both input tensors are ColMajor. If the
- // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS:
- // If we want to compute A * B = C, where A is LHS and B is RHS, the code
- // will pretend B is LHS and A is RHS.
- typedef typename internal::conditional<
- static_cast<int>(Layout) == static_cast<int>(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType;
- typedef typename internal::conditional<
- static_cast<int>(Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType;
-
- static const int LDims =
- internal::array_size<typename TensorEvaluator<EvalLeftArgType, Device>::Dimensions>::value;
- static const int RDims =
- internal::array_size<typename TensorEvaluator<EvalRightArgType, Device>::Dimensions>::value;
- static const int ContractDims = internal::array_size<Indices>::value;
-
- typedef array<Index, LDims> left_dim_mapper_t;
- typedef array<Index, RDims> right_dim_mapper_t;
-
- typedef array<Index, ContractDims> contract_t;
- typedef array<Index, LDims - ContractDims> left_nocontract_t;
- typedef array<Index, RDims - ContractDims> right_nocontract_t;
-
- static const int NumDims = LDims + RDims - 2 * ContractDims;
-
- typedef DSizes<Index, NumDims> Dimensions;
-
- // typedefs needed in evalTo
- typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar;
- typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar;
-
- typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator;
- typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator;
-
- typedef typename LeftEvaluator::Dimensions LeftDimensions;
- typedef typename RightEvaluator::Dimensions RightDimensions;
-
- EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) :
- Base(op, device) {}
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ~TensorEvaluator() {}
-
- // We need to redefine this method to make hipcc happy
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
- this->m_leftImpl.evalSubExprsIfNeeded(NULL);
- this->m_rightImpl.evalSubExprsIfNeeded(NULL);
- if (data) {
- evalTo(data);
- return false;
- } else {
- this->m_result = static_cast<Scalar *>(this->m_device.allocate(this->dimensions().TotalSize() * sizeof(Scalar)));
- evalTo(this->m_result);
- return true;
- }
- }
-
- void evalTo(Scalar* buffer) const {
- if (this->m_lhs_inner_dim_contiguous) {
- if (this->m_rhs_inner_dim_contiguous) {
- if (this->m_rhs_inner_dim_reordered) {
- evalTyped<true, true, true, Unaligned>(buffer);
- }
- else {
- evalTyped<true, true, false, Unaligned>(buffer);
- }
- }
- else {
- if (this->m_rhs_inner_dim_reordered) {
- evalTyped<true, false, true, Unaligned>(buffer);
- }
- else {
- evalTyped<true, false, false, Unaligned>(buffer);
- }
- }
- }
- else {
- if (this->m_rhs_inner_dim_contiguous) {
- if (this->m_rhs_inner_dim_reordered) {
- evalTyped<false, true, true, Unaligned>(buffer);
- }
- else {
- evalTyped<false, true, false, Unaligned>(buffer);
- }
- }
- else {
- if (this->m_rhs_inner_dim_reordered) {
- evalTyped<false, false, true, Unaligned>(buffer);
- }
- else {
- evalTyped<false, false, false, Unaligned>(buffer);
- }
- }
- }
- }
-
- template <typename LhsScalar, typename RhsScalar, typename Index, typename LhsMapper, typename RhsMapper, typename OutputMapper> struct LaunchKernels {
- static void Run(const LhsMapper& lhs, const RhsMapper& rhs, const OutputMapper& output, Index m, Index n, Index k, const GpuDevice& device) {
- const Index m_blocks = (m + 63) / 64;
- const Index n_blocks = (n + 63) / 64;
- const dim3 num_blocks(m_blocks, n_blocks, 1);
- const dim3 block_size(8, 8, 8);
- hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenContractionKernel<Scalar, Index, LhsMapper, RhsMapper, OutputMapper>),
- dim3(num_blocks), dim3(block_size), 0, device.stream(), lhs, rhs, output, m, n, k);
- }
- };
-
- template <typename Index, typename LhsMapper, typename RhsMapper, typename OutputMapper> struct LaunchKernels<float, float, Index, LhsMapper, RhsMapper, OutputMapper> {
- static void Run(const LhsMapper& lhs, const RhsMapper& rhs, const OutputMapper& output, Index m, Index n, Index k, const GpuDevice& device) {
- if (m < 768 || n < 768) {
- const Index m_blocks = (m + 63) / 64;
- const Index n_blocks = (n + 63) / 64;
- const dim3 num_blocks(m_blocks, n_blocks, 1);
- const dim3 block_size(16, 16, 1);
- hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenFloatContractionKernel16x16<Index, LhsMapper, RhsMapper, OutputMapper>),
- dim3(num_blocks), dim3(block_size), 0, device.stream(), lhs, rhs, output, m, n, k);
- } else {
- const Index m_blocks = (m + 127) / 128;
- const Index n_blocks = (n + 63) / 64;
- const dim3 num_blocks(m_blocks, n_blocks, 1);
- const dim3 block_size(8, 32, 1);
- hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenFloatContractionKernel<Index, LhsMapper, RhsMapper, OutputMapper>),
- dim3(num_blocks), dim3(block_size), 0, device.stream(), lhs, rhs, output, m, n, k);
- }
- }
- };
-
- template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
- void evalTyped(Scalar* buffer) const {
- // columns in left side, rows in right side
- const Index k = this->m_k_size;
- EIGEN_UNUSED_VARIABLE(k)
-
- // rows in left side
- const Index m = this->m_i_size;
-
- // columns in right side
- const Index n = this->m_j_size;
-
- // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar)
- this->m_device.memset(buffer, 0, m * n * sizeof(Scalar));
-
- typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs,
- LeftEvaluator, left_nocontract_t,
- contract_t, 4,
- lhs_inner_dim_contiguous,
- false, Unaligned> LhsMapper;
-
- typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs,
- RightEvaluator, right_nocontract_t,
- contract_t, 4,
- rhs_inner_dim_contiguous,
- rhs_inner_dim_reordered, Unaligned> RhsMapper;
-
- typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
-
-
- // initialize data mappers
- LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides,
- this->m_left_contracting_strides, this->m_k_strides);
-
- RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides,
- this->m_right_contracting_strides, this->m_k_strides);
-
- OutputMapper output(buffer, m);
-
- setHipSharedMemConfig(hipSharedMemBankSizeEightByte);
- LaunchKernels<LhsScalar, RhsScalar, Index, LhsMapper, RhsMapper, OutputMapper>::Run(lhs, rhs, output, m, n, k, this->m_device);
- }
-};
-
-} // end namespace Eigen
-
-#endif // EIGEN_USE_GPU and EIGEN_HIPCC
-#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_HIP_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionHip.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionHip.h
deleted file mode 100644
index ba9971050..000000000
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionHip.h
+++ /dev/null
@@ -1,1119 +0,0 @@
-//#include "hip/hip_runtime.h"
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
-#define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
-
-namespace Eigen {
-
-/** \class TensorConvolution
- * \ingroup CXX11_Tensor_Module
- *
- * \brief Tensor convolution class.
- *
- *
- */
-namespace internal {
-
-template <typename Index, typename InputDims, int NumKernelDims, int Layout>
-class IndexMapper {
- public:
- IndexMapper(const InputDims& input_dims, const array<Index, NumKernelDims>& kernel_dims,
- const array<Index, NumKernelDims>& indices) {
-
- array<Index, NumDims> dimensions = input_dims;
- for (int i = 0; i < NumKernelDims; ++i) {
- const Index index = indices[i];
- const Index input_dim = input_dims[index];
- const Index kernel_dim = kernel_dims[i];
- const Index result_dim = input_dim - kernel_dim + 1;
- dimensions[index] = result_dim;
- }
-
- array<Index, NumDims> inputStrides;
- array<Index, NumDims> outputStrides;
- if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
- inputStrides[0] = 1;
- outputStrides[0] = 1;
- for (int i = 1; i < NumDims; ++i) {
- inputStrides[i] = inputStrides[i-1] * input_dims[i-1];
- outputStrides[i] = outputStrides[i-1] * dimensions[i-1];
- }
- } else {
- inputStrides[NumDims - 1] = 1;
- outputStrides[NumDims - 1] = 1;
- for (int i = static_cast<int>(NumDims) - 2; i >= 0; --i) {
- inputStrides[i] = inputStrides[i + 1] * input_dims[i + 1];
- outputStrides[i] = outputStrides[i + 1] * dimensions[i + 1];
- }
- }
-
- array<Index, NumDims> hipInputDimensions;
- array<Index, NumDims> hipOutputDimensions;
- array<Index, NumDims> tmp = dimensions;
- array<Index, NumDims> ordering;
- const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
- ? 0
- : NumDims - NumKernelDims;
- for (int i = 0; i < NumKernelDims; ++i) {
- const Index index = i + offset;
- ordering[index] = indices[i];
- tmp[indices[i]] = -1;
- hipInputDimensions[index] = input_dims[indices[i]];
- hipOutputDimensions[index] = dimensions[indices[i]];
- }
-
- int written = static_cast<int>(Layout) == static_cast<int>(ColMajor)
- ? NumKernelDims
- : 0;
- for (int i = 0; i < NumDims; ++i) {
- if (tmp[i] >= 0) {
- ordering[written] = i;
- hipInputDimensions[written] = input_dims[i];
- hipOutputDimensions[written] = dimensions[i];
- ++written;
- }
- }
-
- for (int i = 0; i < NumDims; ++i) {
- m_inputStrides[i] = inputStrides[ordering[i]];
- m_outputStrides[i] = outputStrides[ordering[i]];
- }
-
- if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
- for (int i = 0; i < NumDims; ++i) {
- if (i > NumKernelDims) {
- m_hipInputStrides[i] =
- m_hipInputStrides[i - 1] * hipInputDimensions[i - 1];
- m_hipOutputStrides[i] =
- m_hipOutputStrides[i - 1] * hipOutputDimensions[i - 1];
- } else {
- m_hipInputStrides[i] = 1;
- m_hipOutputStrides[i] = 1;
- }
- }
- } else {
- for (int i = NumDims - 1; i >= 0; --i) {
- if (i + 1 < offset) {
- m_hipInputStrides[i] =
- m_hipInputStrides[i + 1] * hipInputDimensions[i + 1];
- m_hipOutputStrides[i] =
- m_hipOutputStrides[i + 1] * hipOutputDimensions[i + 1];
- } else {
- m_hipInputStrides[i] = 1;
- m_hipOutputStrides[i] = 1;
- }
- }
- }
- }
-
- EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapHipInputPlaneToTensorInputOffset(Index p) const {
- Index inputIndex = 0;
- if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
- for (int d = NumDims - 1; d > NumKernelDims; --d) {
- const Index idx = p / m_hipInputStrides[d];
- inputIndex += idx * m_inputStrides[d];
- p -= idx * m_hipInputStrides[d];
- }
- inputIndex += p * m_inputStrides[NumKernelDims];
- } else {
- std::ptrdiff_t limit = 0;
- if (NumKernelDims < NumDims) {
- limit = NumDims - NumKernelDims - 1;
- }
- for (int d = 0; d < limit; ++d) {
- const Index idx = p / m_hipInputStrides[d];
- inputIndex += idx * m_inputStrides[d];
- p -= idx * m_hipInputStrides[d];
- }
- inputIndex += p * m_inputStrides[limit];
- }
- return inputIndex;
- }
-
- EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapHipOutputPlaneToTensorOutputOffset(Index p) const {
- Index outputIndex = 0;
- if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
- for (int d = NumDims - 1; d > NumKernelDims; --d) {
- const Index idx = p / m_hipOutputStrides[d];
- outputIndex += idx * m_outputStrides[d];
- p -= idx * m_hipOutputStrides[d];
- }
- outputIndex += p * m_outputStrides[NumKernelDims];
- } else {
- std::ptrdiff_t limit = 0;
- if (NumKernelDims < NumDims) {
- limit = NumDims - NumKernelDims - 1;
- }
- for (int d = 0; d < limit; ++d) {
- const Index idx = p / m_hipOutputStrides[d];
- outputIndex += idx * m_outputStrides[d];
- p -= idx * m_hipOutputStrides[d];
- }
- outputIndex += p * m_outputStrides[limit];
- }
- return outputIndex;
- }
-
- EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapHipInputKernelToTensorInputOffset(Index i) const {
- const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
- ? 0
- : NumDims - NumKernelDims;
- return i * m_inputStrides[offset];
- }
-
- EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapHipOutputKernelToTensorOutputOffset(Index i) const {
- const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
- ? 0
- : NumDims - NumKernelDims;
- return i * m_outputStrides[offset];
- }
-
- EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapHipInputKernelToTensorInputOffset(Index i, Index j) const {
- const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
- ? 0
- : NumDims - NumKernelDims;
- return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1];
- }
-
- EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapHipOutputKernelToTensorOutputOffset(Index i, Index j) const {
- const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
- ? 0
- : NumDims - NumKernelDims;
- return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1];
- }
-
- EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapHipInputKernelToTensorInputOffset(Index i, Index j, Index k) const {
- const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
- ? 0
- : NumDims - NumKernelDims;
- return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1] +
- k * m_inputStrides[offset + 2];
- }
-
- EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapHipOutputKernelToTensorOutputOffset(Index i, Index j, Index k) const {
- const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
- ? 0
- : NumDims - NumKernelDims;
- return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1] +
- k * m_outputStrides[offset + 2];
- }
-
- private:
- static const int NumDims = internal::array_size<InputDims>::value;
- array<Index, NumDims> m_inputStrides;
- array<Index, NumDims> m_outputStrides;
- array<Index, NumDims> m_hipInputStrides;
- array<Index, NumDims> m_hipOutputStrides;
-};
-
-
-
-template<typename Dimensions, typename InputXprType, typename KernelXprType>
-struct traits<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> >
-{
- // Type promotion to handle the case where the types of the lhs and the rhs are different.
- typedef typename promote_storage_type<typename InputXprType::Scalar,
- typename KernelXprType::Scalar>::ret Scalar;
- typedef typename promote_storage_type<typename traits<InputXprType>::StorageKind,
- typename traits<KernelXprType>::StorageKind>::ret StorageKind;
- typedef typename promote_index_type<typename traits<InputXprType>::Index,
- typename traits<KernelXprType>::Index>::type Index;
- typedef typename InputXprType::Nested LhsNested;
- typedef typename KernelXprType::Nested RhsNested;
- typedef typename remove_reference<LhsNested>::type _LhsNested;
- typedef typename remove_reference<RhsNested>::type _RhsNested;
- static const int NumDimensions = traits<InputXprType>::NumDimensions;
- static const int Layout = traits<InputXprType>::Layout;
-
- enum {
- Flags = 0
- };
-};
-
-template<typename Dimensions, typename InputXprType, typename KernelXprType>
-struct eval<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, Eigen::Dense>
-{
- typedef const TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>& type;
-};
-
-template<typename Dimensions, typename InputXprType, typename KernelXprType>
-struct nested<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, 1, typename eval<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> >::type>
-{
- typedef TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> type;
-};
-
-} // end namespace internal
-
-
-
-template<typename Indices, typename InputXprType, typename KernelXprType>
-class TensorConvolutionOp : public TensorBase<TensorConvolutionOp<Indices, InputXprType, KernelXprType>, ReadOnlyAccessors>
-{
- public:
- typedef typename Eigen::internal::traits<TensorConvolutionOp>::Scalar Scalar;
- typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
- typedef typename internal::promote_storage_type<typename InputXprType::CoeffReturnType,
- typename KernelXprType::CoeffReturnType>::ret CoeffReturnType;
- typedef typename Eigen::internal::nested<TensorConvolutionOp>::type Nested;
- typedef typename Eigen::internal::traits<TensorConvolutionOp>::StorageKind StorageKind;
- typedef typename Eigen::internal::traits<TensorConvolutionOp>::Index Index;
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConvolutionOp(const InputXprType& input, const KernelXprType& kernel, const Indices& dims)
- : m_input_xpr(input), m_kernel_xpr(kernel), m_indices(dims) {}
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const Indices& indices() const { return m_indices; }
-
- /** \returns the nested expressions */
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const typename internal::remove_all<typename InputXprType::Nested>::type&
- inputExpression() const { return m_input_xpr; }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const typename internal::remove_all<typename KernelXprType::Nested>::type&
- kernelExpression() const { return m_kernel_xpr; }
-
- protected:
- typename InputXprType::Nested m_input_xpr;
- typename KernelXprType::Nested m_kernel_xpr;
- const Indices m_indices;
-};
-
-
-template<typename Indices, typename InputArgType, typename KernelArgType, typename Device>
-struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, Device>
-{
- typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;
-
- static const int NumDims = internal::array_size<typename TensorEvaluator<InputArgType, Device>::Dimensions>::value;
- static const int NumKernelDims = internal::array_size<Indices>::value;
- typedef typename XprType::Index Index;
- typedef DSizes<Index, NumDims> Dimensions;
-
- typedef typename XprType::Scalar Scalar;
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
- static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
-
- enum {
- IsAligned = TensorEvaluator<InputArgType, Device>::IsAligned & TensorEvaluator<KernelArgType, Device>::IsAligned,
- PacketAccess = TensorEvaluator<InputArgType, Device>::PacketAccess & TensorEvaluator<KernelArgType, Device>::PacketAccess,
- Layout = TensorEvaluator<InputArgType, Device>::Layout,
- CoordAccess = false, // to be implemented
- RawAccess = false
- };
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
- : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_kernelArg(op.kernelExpression()), m_kernel(NULL), m_local_kernel(false), m_device(device)
- {
- EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
-
- const typename TensorEvaluator<InputArgType, Device>::Dimensions& input_dims = m_inputImpl.dimensions();
- const typename TensorEvaluator<KernelArgType, Device>::Dimensions& kernel_dims = m_kernelImpl.dimensions();
-
- if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
- m_inputStride[0] = 1;
- for (int i = 1; i < NumDims; ++i) {
- m_inputStride[i] = m_inputStride[i - 1] * input_dims[i - 1];
- }
- } else {
- m_inputStride[NumDims - 1] = 1;
- for (int i = NumDims - 2; i >= 0; --i) {
- m_inputStride[i] = m_inputStride[i + 1] * input_dims[i + 1];
- }
- }
-
- m_dimensions = m_inputImpl.dimensions();
- if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
- for (int i = 0; i < NumKernelDims; ++i) {
- const Index index = op.indices()[i];
- const Index input_dim = input_dims[index];
- const Index kernel_dim = kernel_dims[i];
- const Index result_dim = input_dim - kernel_dim + 1;
- m_dimensions[index] = result_dim;
- if (i > 0) {
- m_kernelStride[i] = m_kernelStride[i - 1] * kernel_dims[i - 1];
- } else {
- m_kernelStride[0] = 1;
- }
- m_indexStride[i] = m_inputStride[index];
- }
-
- m_outputStride[0] = 1;
- for (int i = 1; i < NumDims; ++i) {
- m_outputStride[i] = m_outputStride[i - 1] * m_dimensions[i - 1];
- }
- } else {
- for (int i = NumKernelDims - 1; i >= 0; --i) {
- const Index index = op.indices()[i];
- const Index input_dim = input_dims[index];
- const Index kernel_dim = kernel_dims[i];
- const Index result_dim = input_dim - kernel_dim + 1;
- m_dimensions[index] = result_dim;
- if (i < NumKernelDims - 1) {
- m_kernelStride[i] = m_kernelStride[i + 1] * kernel_dims[i + 1];
- } else {
- m_kernelStride[NumKernelDims - 1] = 1;
- }
- m_indexStride[i] = m_inputStride[index];
- }
-
- m_outputStride[NumDims - 1] = 1;
- for (int i = NumDims - 2; i >= 0; --i) {
- m_outputStride[i] = m_outputStride[i + 1] * m_dimensions[i + 1];
- }
- }
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ~TensorEvaluator() {}
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
- m_inputImpl.evalSubExprsIfNeeded(NULL);
- preloadKernel();
- return true;
- }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
- m_inputImpl.cleanup();
- if (m_local_kernel) {
- m_device.deallocate((void*)m_kernel);
- m_local_kernel = false;
- }
- m_kernel = NULL;
- }
-
- void evalTo(typename XprType::Scalar* buffer) {
- evalSubExprsIfNeeded(NULL);
- for (int i = 0; i < dimensions().TotalSize(); ++i) {
- buffer[i] += coeff(i);
- }
- cleanup();
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
- {
- CoeffReturnType result = CoeffReturnType(0);
- convolve(firstInput(index), 0, NumKernelDims-1, result);
- return result;
- }
-
- template<int LoadMode>
- EIGEN_DEVICE_FUNC PacketReturnType packet(const Index index) const
- {
- Index indices[2] = {index, index+PacketSize-1};
- Index startInputs[2] = {0, 0};
- if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
- for (int i = NumDims - 1; i > 0; --i) {
- const Index idx0 = indices[0] / m_outputStride[i];
- const Index idx1 = indices[1] / m_outputStride[i];
- startInputs[0] += idx0 * m_inputStride[i];
- startInputs[1] += idx1 * m_inputStride[i];
- indices[0] -= idx0 * m_outputStride[i];
- indices[1] -= idx1 * m_outputStride[i];
- }
- } else {
- for (int i = 0; i < NumDims - 1; ++i) {
- const Index idx0 = indices[0] / m_outputStride[i];
- const Index idx1 = indices[1] / m_outputStride[i];
- startInputs[0] += idx0 * m_inputStride[i];
- startInputs[1] += idx1 * m_inputStride[i];
- indices[0] -= idx0 * m_outputStride[i];
- indices[1] -= idx1 * m_outputStride[i];
- }
- }
- startInputs[0] += indices[0];
- startInputs[1] += indices[1];
-
- if (startInputs[1]-startInputs[0] == PacketSize-1) {
- PacketReturnType result = internal::pset1<PacketReturnType>(0);
- convolvePacket(startInputs[0], 0, NumKernelDims-1, result);
- return result;
- } else {
- EIGEN_ALIGN_MAX Scalar data[PacketSize];
- data[0] = Scalar(0);
- convolve(startInputs[0], 0, NumKernelDims-1, data[0]);
- for (int i = 1; i < PacketSize-1; ++i) {
- data[i] = Scalar(0);
- convolve(firstInput(index+i), 0, NumKernelDims-1, data[i]);
- }
- data[PacketSize-1] = Scalar(0);
- convolve(startInputs[1], 0, NumKernelDims-1, data[PacketSize-1]);
- return internal::pload<PacketReturnType>(data);
- }
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
- costPerCoeff(bool vectorized) const {
- const double kernel_size = m_kernelImpl.dimensions().TotalSize();
- // We ignore the use of fused multiply-add.
- const double convolve_compute_cost =
- TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
- const double firstIndex_compute_cost =
- NumDims *
- (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() +
- TensorOpCost::DivCost<Index>());
- return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
- kernel_size * (m_inputImpl.costPerCoeff(vectorized) +
- m_kernelImpl.costPerCoeff(vectorized) +
- TensorOpCost(0, 0, convolve_compute_cost, vectorized,
- PacketSize));
- }
-
- EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
-
- private:
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
- Index startInput = 0;
- if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
- for (int i = NumDims - 1; i > 0; --i) {
- const Index idx = index / m_outputStride[i];
- startInput += idx * m_inputStride[i];
- index -= idx * m_outputStride[i];
- }
- } else {
- for (int i = 0; i < NumDims - 1; ++i) {
- const Index idx = index / m_outputStride[i];
- startInput += idx * m_inputStride[i];
- index -= idx * m_outputStride[i];
- }
- }
- startInput += index;
- return startInput;
- }
-
- EIGEN_DEVICE_FUNC void convolve(Index firstIndex, Index firstKernel, int DimIndex, CoeffReturnType& accum) const {
- for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) {
- const Index input = firstIndex + j * m_indexStride[DimIndex];
- const Index kernel = firstKernel + j * m_kernelStride[DimIndex];
- if (DimIndex > 0) {
- convolve(input, kernel, DimIndex-1, accum);
- } else {
- accum += m_inputImpl.coeff(input) * m_kernel[kernel];
- }
- }
- }
-
- template <typename Packet>
- EIGEN_DEVICE_FUNC void convolvePacket(Index firstIndex, Index firstKernel, int DimIndex, Packet& accum) const {
- for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) {
- const Index input = firstIndex + j * m_indexStride[DimIndex];
- const Index kernel = firstKernel + j * m_kernelStride[DimIndex];
- if (DimIndex > 0) {
- convolvePacket(input, kernel, DimIndex-1, accum);
- } else {
- accum = internal::pmadd<Packet>(m_inputImpl.template packet<Unaligned>(input), internal::pset1<Packet>(m_kernel[kernel]), accum);
- }
- }
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() {
- // Don't make a local copy of the kernel unless we have to (i.e. it's an
- // expression that needs to be evaluated)
- const Scalar* in_place = m_kernelImpl.data();
- if (in_place) {
- m_kernel = in_place;
- m_local_kernel = false;
- } else {
- size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
- Scalar* local = (Scalar*)m_device.allocate(kernel_sz);
- typedef TensorEvalToOp<const KernelArgType> EvalTo;
- EvalTo evalToTmp(local, m_kernelArg);
- const bool PacketAccess = internal::IsVectorizable<Device, KernelArgType>::value;
- internal::TensorExecutor<const EvalTo, Device, PacketAccess>::run(evalToTmp, m_device);
-
- m_kernel = local;
- m_local_kernel = true;
- }
- }
-
- array<Index, NumDims> m_inputStride;
- array<Index, NumDims> m_outputStride;
-
- array<Index, NumKernelDims> m_indexStride;
- array<Index, NumKernelDims> m_kernelStride;
- TensorEvaluator<InputArgType, Device> m_inputImpl;
- TensorEvaluator<KernelArgType, Device> m_kernelImpl;
- Dimensions m_dimensions;
-
- KernelArgType m_kernelArg;
- const Scalar* m_kernel;
- bool m_local_kernel;
- const Device& m_device;
-};
-
-
-
-
-// Use an optimized implementation of the evaluation code for GPUs whenever possible.
-#if defined(EIGEN_USE_GPU) && defined(EIGEN_HIPCC)
-
-template <int StaticKernelSize>
-struct GetKernelSize {
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator() (const int /*kernelSize*/) const {
- return StaticKernelSize;
- }
-};
-template <>
-struct GetKernelSize<Dynamic> {
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator() (const int kernelSize) const {
- return kernelSize;
- }
-};
-
-template <typename InputEvaluator, typename Index, typename InputDims,
- int StaticKernelSize>
-__global__ void EigenConvolutionKernel1D(
- InputEvaluator eval,
- const internal::IndexMapper<Index, InputDims, 1, InputEvaluator::Layout>
- indexMapper,
- const float* __restrict kernel, const int numPlanes, const int numX,
- const int maxX, const int kernelSize, float* buffer) {
- HIP_DYNAMIC_SHARED( float, s)
-
- const int first_x = hipBlockIdx_x * maxX;
- const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
- const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSize>()(kernelSize);
- const int num_x_output = last_x - first_x + 1;
-
- const int first_plane = hipBlockIdx_y * hipBlockDim_y;
- const int plane_stride = hipBlockDim_y * hipGridDim_y;
-
- for (int p = first_plane + hipThreadIdx_y; p < numPlanes; p += plane_stride) {
- // Load inputs to shared memory
- const int plane_input_offset = indexMapper.mapHipInputPlaneToTensorInputOffset(p);
- const int plane_kernel_offset = hipThreadIdx_y * num_x_input;
- #pragma unroll
- for (int i = hipThreadIdx_x; i < num_x_input; i += hipBlockDim_x) {
- const int tensor_index = plane_input_offset + indexMapper.mapHipInputKernelToTensorInputOffset(i+first_x);
- s[i + plane_kernel_offset] = eval.coeff(tensor_index);
- }
-
- __syncthreads();
-
- // Compute the convolution
- const int plane_output_offset = indexMapper.mapHipOutputPlaneToTensorOutputOffset(p);
-
- #pragma unroll
- for (int i = hipThreadIdx_x; i < num_x_output; i += hipBlockDim_x) {
- const int kernel_offset = plane_kernel_offset + i;
- float result = 0.0f;
- #pragma unroll
- for (int k = 0; k < GetKernelSize<StaticKernelSize>()(kernelSize); ++k) {
- result += s[k + kernel_offset] * kernel[k];
- }
- const int tensor_index = plane_output_offset + indexMapper.mapHipOutputKernelToTensorOutputOffset(i+first_x);
- buffer[tensor_index] = result;
- }
- __syncthreads();
- }
-};
-
-template <typename InputEvaluator, typename Index, typename InputDims,
- int StaticKernelSizeX, int StaticKernelSizeY>
-__global__ void EigenConvolutionKernel2D(
- InputEvaluator eval,
- const internal::IndexMapper<Index, InputDims, 2, InputEvaluator::Layout>
- indexMapper,
- const float* __restrict kernel, const int numPlanes, const int numX,
- const int maxX, const int numY, const int maxY, const int kernelSizeX,
- const int kernelSizeY, float* buffer) {
- HIP_DYNAMIC_SHARED( float, s)
-
- const int first_x = hipBlockIdx_x * maxX;
- const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
- const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSizeX>()(kernelSizeX);
- const int num_x_output = last_x - first_x + 1;
-
- const int first_y = hipBlockIdx_y * maxY;
- const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;
- const int num_y_input = last_y - first_y + GetKernelSize<StaticKernelSizeY>()(kernelSizeY);
- const int num_y_output = last_y - first_y + 1;
-
- const int first_plane = hipBlockIdx_z * hipBlockDim_z;
- const int plane_stride = hipBlockDim_z * hipGridDim_z;
-
- for (int p = first_plane + hipThreadIdx_z; p < numPlanes; p += plane_stride) {
-
- const int plane_input_offset = indexMapper.mapHipInputPlaneToTensorInputOffset(p);
- const int plane_kernel_offset = hipThreadIdx_z * num_y_input;
-
- // Load inputs to shared memory
- #pragma unroll
- for (int j = hipThreadIdx_y; j < num_y_input; j += hipBlockDim_y) {
- const int input_offset = num_x_input * (j + plane_kernel_offset);
- #pragma unroll
- for (int i = hipThreadIdx_x; i < num_x_input; i += hipBlockDim_x) {
- const int tensor_index = plane_input_offset + indexMapper.mapHipInputKernelToTensorInputOffset(i+first_x, j+first_y);
- s[i + input_offset] = eval.coeff(tensor_index);
- }
- }
-
- __syncthreads();
-
- // Convolution
- const int plane_output_offset = indexMapper.mapHipOutputPlaneToTensorOutputOffset(p);
-
- #pragma unroll
- for (int j = hipThreadIdx_y; j < num_y_output; j += hipBlockDim_y) {
- #pragma unroll
- for (int i = hipThreadIdx_x; i < num_x_output; i += hipBlockDim_x) {
- float result = 0.0f;
- #pragma unroll
- for (int l = 0; l < GetKernelSize<StaticKernelSizeY>()(kernelSizeY); ++l) {
- const int kernel_offset = kernelSizeX * l;
- const int input_offset = i + num_x_input * (j + l + plane_kernel_offset);
- #pragma unroll
- for (int k = 0; k < GetKernelSize<StaticKernelSizeX>()(kernelSizeX); ++k) {
- result += s[k + input_offset] * kernel[k + kernel_offset];
- }
- }
- const int tensor_index = plane_output_offset + indexMapper.mapHipOutputKernelToTensorOutputOffset(i+first_x, j+first_y);
- buffer[tensor_index] = result;
- }
- }
-
- __syncthreads();
- }
-};
-
-template <typename InputEvaluator, typename Index, typename InputDims>
-__global__ void EigenConvolutionKernel3D(
- InputEvaluator eval,
- const internal::IndexMapper<Index, InputDims, 3, InputEvaluator::Layout>
- indexMapper,
- const float* __restrict kernel, const size_t numPlanes, const size_t numX,
- const size_t maxX, const size_t numY, const size_t maxY, const size_t numZ,
- const size_t maxZ, const size_t kernelSizeX, const size_t kernelSizeY,
- const size_t kernelSizeZ, float* buffer) {
- HIP_DYNAMIC_SHARED( float, s)
-
- // Load inputs to shared memory
- const int first_x = hipBlockIdx_x * maxX;
- const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
- const int num_x_input = last_x - first_x + kernelSizeX;
-
- const int first_y = hipBlockIdx_y * maxY;
- const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;
- const int num_y_input = last_y - first_y + kernelSizeY;
-
- const int first_z = hipBlockIdx_z * maxZ;
- const int last_z = (first_z + maxZ < numZ ? first_z + maxZ : numZ) - 1;
- const int num_z_input = last_z - first_z + kernelSizeZ;
-
- for (int p = 0; p < numPlanes; ++p) {
-
- const int plane_input_offset = indexMapper.mapHipInputPlaneToTensorInputOffset(p);
- const int plane_kernel_offset = 0;
-
- for (int k = hipThreadIdx_z; k < num_z_input; k += hipBlockDim_z) {
- for (int j = hipThreadIdx_y; j < num_y_input; j += hipBlockDim_y) {
- for (int i = hipThreadIdx_x; i < num_x_input; i += hipBlockDim_x) {
- const int tensor_index = plane_input_offset + indexMapper.mapHipInputKernelToTensorInputOffset(i+first_x, j+first_y, k+first_z);
- s[i + num_x_input * (j + num_y_input * (k + plane_kernel_offset))] = eval.coeff(tensor_index);
- }
- }
- }
-
- __syncthreads();
-
- // Convolution
- const int num_z_output = last_z - first_z + 1;
- const int num_y_output = last_y - first_y + 1;
- const int num_x_output = last_x - first_x + 1;
- const int plane_output_offset = indexMapper.mapHipOutputPlaneToTensorOutputOffset(p);
-
- for (int k = hipThreadIdx_z; k < num_z_output; k += hipBlockDim_z) {
- for (int j = hipThreadIdx_y; j < num_y_output; j += hipBlockDim_y) {
- for (int i = hipThreadIdx_x; i < num_x_output; i += hipBlockDim_x) {
- float result = 0.0f;
- for (int n = 0; n < kernelSizeZ; ++n) {
- for (int m = 0; m < kernelSizeY; ++m) {
- for (int l = 0; l < kernelSizeX; ++l) {
- result += s[i + l + num_x_input * (j + m + num_y_input * (k + n + plane_kernel_offset))] * kernel[l + kernelSizeX * (m + kernelSizeY * n)];
- }
- }
- }
- const int tensor_index = plane_output_offset + indexMapper.mapHipOutputKernelToTensorOutputOffset(i+first_x, j+first_y, k+first_z);
- buffer[tensor_index] = result;
- }
- }
- }
- __syncthreads();
- }
-};
-
-
-
-template<typename Indices, typename InputArgType, typename KernelArgType>
-struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, GpuDevice>
-{
- typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;
-
- static const int NumDims = internal::array_size<typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions>::value;
- static const int NumKernelDims = internal::array_size<Indices>::value;
- typedef typename XprType::Index Index;
- typedef DSizes<Index, NumDims> Dimensions;
- typedef typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions KernelDimensions;
-
- enum {
- IsAligned = TensorEvaluator<InputArgType, GpuDevice>::IsAligned & TensorEvaluator<KernelArgType, GpuDevice>::IsAligned,
- PacketAccess = false,
- Layout = TensorEvaluator<InputArgType, GpuDevice>::Layout,
- CoordAccess = false, // to be implemented
- RawAccess = false
- };
-
- EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const GpuDevice& device)
- : m_inputImpl(op.inputExpression(), device), m_kernelArg(op.kernelExpression()), m_kernelImpl(op.kernelExpression(), device), m_indices(op.indices()), m_buf(NULL), m_kernel(NULL), m_local_kernel(false), m_device(device)
- {
- EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, GpuDevice>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, GpuDevice>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
-
- const typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions& input_dims = m_inputImpl.dimensions();
- const typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions& kernel_dims = m_kernelImpl.dimensions();
-
- m_dimensions = m_inputImpl.dimensions();
- for (int i = 0; i < NumKernelDims; ++i) {
- const Index index = op.indices()[i];
- const Index input_dim = input_dims[index];
- const Index kernel_dim = kernel_dims[i];
- const Index result_dim = input_dim - kernel_dim + 1;
- m_dimensions[index] = result_dim;
- }
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ~TensorEvaluator() {}
-
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, GpuDevice>::type PacketReturnType;
- typedef typename InputArgType::Scalar Scalar;
- static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
-
- EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_dimensions; }
-
- EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
- preloadKernel();
- m_inputImpl.evalSubExprsIfNeeded(NULL);
- if (data) {
- executeEval(data);
- return false;
- } else {
- m_buf = (Scalar*)m_device.allocate(dimensions().TotalSize() * sizeof(Scalar));
- executeEval(m_buf);
- return true;
- }
- }
-
- EIGEN_STRONG_INLINE void cleanup() {
- m_inputImpl.cleanup();
- if (m_buf) {
- m_device.deallocate(m_buf);
- m_buf = NULL;
- }
- if (m_local_kernel) {
- m_device.deallocate((void*)m_kernel);
- m_local_kernel = false;
- }
- m_kernel = NULL;
- }
-
- EIGEN_STRONG_INLINE void preloadKernel() {
- // Don't make a local copy of the kernel unless we have to (i.e. it's an
- // expression that needs to be evaluated)
- const Scalar* in_place = m_kernelImpl.data();
- if (in_place) {
- m_kernel = in_place;
- m_local_kernel = false;
- } else {
- size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
- Scalar* local = (Scalar*)m_device.allocate(kernel_sz);
- typedef TensorEvalToOp<const KernelArgType> EvalTo;
- EvalTo evalToTmp(local, m_kernelArg);
- const bool PacketAccess = internal::IsVectorizable<GpuDevice, KernelArgType>::value;
- internal::TensorExecutor<const EvalTo, GpuDevice, PacketAccess>::run(evalToTmp, m_device);
-
- m_kernel = local;
- m_local_kernel = true;
- }
- }
-
- static unsigned int ceil(unsigned int num, unsigned int denom) {
- const unsigned int rounded_toward_zero = num / denom;
- if (num > rounded_toward_zero * denom) {
- return rounded_toward_zero + 1;
- }
- return rounded_toward_zero;
- }
-
- void executeEval(Scalar* data) const {
- typedef typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions InputDims;
-
- const int maxSharedMem = m_device.sharedMemPerBlock();
- const int maxThreadsPerBlock = m_device.maxHipThreadsPerBlock();
- const int maxBlocksPerProcessor = m_device.maxHipThreadsPerMultiProcessor() / maxThreadsPerBlock;
- const int numMultiProcessors = m_device.getNumHipMultiProcessors();
- const int hipWarpSize = 32;
-
- switch (NumKernelDims) {
- case 1: {
- const int kernel_size = m_kernelImpl.dimensions().TotalSize();
-
- const int numX = dimensions()[m_indices[0]];
- const int numP = dimensions().TotalSize() / numX;
- int maxX;
- dim3 block_size;
-
- const int single_stride_dim =
- static_cast<int>(Layout) == static_cast<int>(ColMajor)
- ? 0
- : m_inputImpl.dimensions().rank() - 1;
- if (m_indices[0] == single_stride_dim) {
- // Maximum the reuse
- const int inner_dim = ((maxSharedMem / (sizeof(Scalar)) - kernel_size + 1 + 31) / 32) * 32;
- maxX = numext::mini<int>(inner_dim, numX);
- const int maxP = numext::mini<int>(maxSharedMem / ((kernel_size - 1 + maxX) * sizeof(Scalar)), numP);
- block_size.x = numext::mini(maxThreadsPerBlock, maxX);
- block_size.y = numext::mini<int>(maxThreadsPerBlock / block_size.x, maxP);
- }
- else {
- // Read as much as possible alongside the inner most dimension, that is the plane
- const int inner_dim = maxSharedMem / ((hipWarpSize + kernel_size) * sizeof(Scalar));
- const int maxP = numext::mini<int>(inner_dim, numP);
- maxX = numext::mini<int>(maxSharedMem / (inner_dim * sizeof(Scalar)) - kernel_size + 1, numX);
-
- block_size.x = numext::mini(hipWarpSize, maxX);
- block_size.y = numext::mini<int>(maxThreadsPerBlock/block_size.x, maxP);
- }
-
- const int shared_mem = block_size.y * (maxX + kernel_size - 1) * sizeof(Scalar);
- assert(shared_mem <= maxSharedMem);
-
- const int num_x_blocks = ceil(numX, maxX);
- const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem);
- const int num_y_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks);
-
- dim3 num_blocks(num_x_blocks, numext::mini<int>(num_y_blocks, ceil(numP, block_size.y)));
-
-
- //cout << "launching 1D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " maxX: " << maxX << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;
-
- const array<Index, 1> indices(m_indices[0]);
- const array<Index, 1> kernel_dims(m_kernelImpl.dimensions()[0]);
- internal::IndexMapper<Index, InputDims, 1, Layout> indexMapper(
- m_inputImpl.dimensions(), kernel_dims, indices);
- switch(kernel_size) {
- case 4: {
- hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4>),
- dim3(num_blocks), dim3(block_size), shared_mem, m_device.stream(), m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 4, data);
- break;
- }
- case 7: {
- hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7>),
- dim3(num_blocks), dim3(block_size), shared_mem, m_device.stream(), m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 7, data);
- break;
- }
- default: {
- hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic>),
- dim3(num_blocks), dim3(block_size), shared_mem, m_device.stream(), m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, kernel_size, data);
- }
- }
- break;
- }
-
- case 2: {
- const int idxX =
- static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 1;
- const int idxY =
- static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 0;
- const int kernel_size_x = m_kernelImpl.dimensions()[idxX];
- const int kernel_size_y = m_kernelImpl.dimensions()[idxY];
-
- const int numX = dimensions()[m_indices[idxX]];
- const int numY = dimensions()[m_indices[idxY]];
- const int numP = dimensions().TotalSize() / (numX*numY);
-
- const float scaling_factor = sqrtf(static_cast<float>(maxSharedMem) / (sizeof(Scalar) * kernel_size_y * kernel_size_x));
-
- // Snap maxX to warp size
- int inner_dim = ((static_cast<int>(scaling_factor * kernel_size_x) - kernel_size_x + 1 + 32) / 32) * 32;
- const int maxX = numext::mini<int>(inner_dim, numX);
- const int maxY = numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1)) - kernel_size_y + 1, numY);
- const int maxP = numext::mini<int>(maxSharedMem / ((kernel_size_x - 1 + maxX) * (kernel_size_y - 1 + maxY) * sizeof(Scalar)), numP);
-
- dim3 block_size;
- block_size.x = numext::mini(1024, maxX);
- block_size.y = numext::mini<int>(1024/block_size.x, maxY);
- block_size.z = numext::mini<int>(1024/(block_size.x*block_size.y), maxP);
-
- const int shared_mem = block_size.z * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * sizeof(Scalar);
- assert(shared_mem <= maxSharedMem);
-
- const int num_x_blocks = ceil(numX, maxX);
- const int num_y_blocks = ceil(numY, maxY);
- const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem);
- const int num_z_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks * num_y_blocks);
-
- dim3 num_blocks(num_x_blocks, num_y_blocks, numext::mini<int>(num_z_blocks, ceil(numP, block_size.z)));
-
-
- //cout << "launching 2D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z << " maxX: " << maxX << " maxY: " << maxY << " maxP: " << maxP << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;
-
- const array<Index, 2> indices(m_indices[idxX], m_indices[idxY]);
- const array<Index, 2> kernel_dims(m_kernelImpl.dimensions()[idxX],
- m_kernelImpl.dimensions()[idxY]);
- internal::IndexMapper<Index, InputDims, 2, Layout> indexMapper(
- m_inputImpl.dimensions(), kernel_dims, indices);
- switch (kernel_size_x) {
- case 4: {
- switch (kernel_size_y) {
- case 7: {
- hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, 7>),
- dim3(num_blocks), dim3(block_size), shared_mem, m_device.stream(), m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, 7, data);
- break;
- }
- default: {
- hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, Dynamic>),
- dim3(num_blocks), dim3(block_size), shared_mem, m_device.stream(), m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, kernel_size_y, data);
- break;
- }
- }
- break;
- }
- case 7: {
- switch (kernel_size_y) {
- case 4: {
- hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, 4>),
- dim3(num_blocks), dim3(block_size), shared_mem, m_device.stream(), m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, 4, data);
- break;
- }
- default: {
- hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, Dynamic>),
- dim3(num_blocks), dim3(block_size), shared_mem, m_device.stream(), m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, kernel_size_y, data);
- break;
- }
- }
- break;
- }
- default: {
- hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic, Dynamic>),
- dim3(num_blocks), dim3(block_size), shared_mem, m_device.stream(), m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, kernel_size_x, kernel_size_y, data);
- break;
- }
- }
- break;
- }
-
- case 3: {
- const int idxX =
- static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 2;
- const int idxY =
- static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 1;
- const int idxZ =
- static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 2 : 0;
-
- const int kernel_size_x = m_kernelImpl.dimensions()[idxX];
- const int kernel_size_y = m_kernelImpl.dimensions()[idxY];
- const int kernel_size_z = m_kernelImpl.dimensions()[idxZ];
-
- const int numX = dimensions()[m_indices[idxX]];
- const int numY = dimensions()[m_indices[idxY]];
- const int numZ = dimensions()[m_indices[idxZ]];
- const int numP = dimensions().TotalSize() / (numX*numY*numZ);
-
- const int maxX = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * kernel_size_y * kernel_size_z) - kernel_size_x + 1, numX));
- const int maxY = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * kernel_size_z) - kernel_size_y + 1, numY));
- const int maxZ = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1)) - kernel_size_z + 1, numZ));
-
- dim3 block_size;
- block_size.x = numext::mini(32, maxX);
- block_size.y = numext::mini(32, maxY);
- block_size.z = numext::mini<int>(1024/(block_size.x*block_size.y), maxZ);
- dim3 num_blocks(ceil(numX, maxX), ceil(numY, maxY), ceil(numZ, maxZ));
-
- const int shared_mem = (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * (maxZ + kernel_size_z - 1) * sizeof(Scalar);
- assert(shared_mem <= maxSharedMem);
-
- //cout << "launching 3D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;
- const array<Index, 3> indices(m_indices[idxX], m_indices[idxY],
- m_indices[idxZ]);
- const array<Index, 3> kernel_dims(m_kernelImpl.dimensions()[idxX],
- m_kernelImpl.dimensions()[idxY],
- m_kernelImpl.dimensions()[idxZ]);
- internal::IndexMapper<Index, InputDims, 3, Layout> indexMapper(
- m_inputImpl.dimensions(), kernel_dims, indices);
-
- hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenConvolutionKernel3D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims>),
- dim3(num_blocks), dim3(block_size), shared_mem, m_device.stream(), m_inputImpl, indexMapper, m_kernel,
- numP, numX, maxX, numY, maxY, numZ, maxZ, kernel_size_x, kernel_size_y, kernel_size_z, data);
- break;
- }
-
- default: {
- EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3), THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE);
- }
- }
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
- {
- eigen_assert(m_buf);
- eigen_assert(index < m_dimensions.TotalSize());
- return m_buf[index];
- }
-
- template<int LoadMode>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const
- {
- eigen_assert(m_buf);
- eigen_assert(index < m_dimensions.TotalSize());
- return internal::ploadt<PacketReturnType, LoadMode>(m_buf+index);
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
- costPerCoeff(bool vectorized) const {
- // TODO(rmlarsen): FIXME: For now, this is just a copy of the CPU cost
- // model.
- const double kernel_size = m_kernelImpl.dimensions().TotalSize();
- // We ignore the use of fused multiply-add.
- const double convolve_compute_cost =
- TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
- const double firstIndex_compute_cost =
- NumDims *
- (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() +
- TensorOpCost::DivCost<Index>());
- return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
- kernel_size * (m_inputImpl.costPerCoeff(vectorized) +
- m_kernelImpl.costPerCoeff(vectorized) +
- TensorOpCost(0, 0, convolve_compute_cost, vectorized,
- PacketSize));
- }
-
- private:
- // No assignment (copies are needed by the kernels)
- TensorEvaluator& operator = (const TensorEvaluator&);
-
- TensorEvaluator<InputArgType, GpuDevice> m_inputImpl;
- TensorEvaluator<KernelArgType, GpuDevice> m_kernelImpl;
- KernelArgType m_kernelArg;
- Indices m_indices;
- Dimensions m_dimensions;
- Scalar* m_buf;
- const Scalar* m_kernel;
- bool m_local_kernel;
-
- const GpuDevice& m_device;
-};
-#endif
-
-
-} // end namespace Eigen
-
-#endif // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceHip.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceHip.h
deleted file mode 100644
index c0e110987..000000000
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceHip.h
+++ /dev/null
@@ -1,352 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_HIP_H)
-#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_HIP_H
-
-#if defined(EIGEN_HIPCC)
-#include "hip/hip_runtime.h"
-#include "hip/hip_runtime_api.h"
-#endif
-#include <unistd.h> //for sleep function
-
-namespace Eigen {
-
-static const int kHipScratchSize = 1024;
-
-// This defines an interface that GPUDevice can take to use
-// HIP streams underneath.
-class StreamInterface {
- public:
- virtual ~StreamInterface() {}
-
- virtual const hipStream_t& stream() const = 0;
- virtual const hipDeviceProp_t& deviceProperties() const = 0;
-
- // Allocate memory on the actual device where the computation will run
- virtual void* allocate(size_t num_bytes) const = 0;
- virtual void deallocate(void* buffer) const = 0;
-
- // Return a scratchpad buffer of size 1k
- virtual void* scratchpad() const = 0;
-
- // Return a semaphore. The semaphore is initially initialized to 0, and
- // each kernel using it is responsible for resetting to 0 upon completion
- // to maintain the invariant that the semaphore is always equal to 0 upon
- // each kernel start.
- virtual unsigned int* semaphore() const = 0;
-};
-
-static hipDeviceProp_t* m_deviceProperties;
-static bool m_devicePropInitialized = false;
-
-static void initializeDeviceProp() {
- if (!m_devicePropInitialized) {
- // Attempts to ensure proper behavior in the case of multiple threads
- // calling this function simultaneously. This would be trivial to
- // implement if we could use std::mutex, but unfortunately mutex don't
- // compile with nvcc, so we resort to atomics and thread fences instead.
- // Note that if the caller uses a compiler that doesn't support c++11 we
- // can't ensure that the initialization is thread safe.
-#if 0 && __cplusplus >= 201103L
- static std::atomic<bool> first(true);
- if (first.exchange(false)) {
-#else
- static bool first = true;
- if (first) {
- first = false;
-#endif
- // We're the first thread to reach this point.
- int num_devices;
- hipError_t status = hipGetDeviceCount(&num_devices);
- if (status != hipSuccess) {
- std::cerr << "Failed to get the number of HIP devices: "
- << hipGetErrorString(status)
- << std::endl;
- assert(status == hipSuccess);
- }
- m_deviceProperties = new hipDeviceProp_t[num_devices];
- for (int i = 0; i < num_devices; ++i) {
- status = hipGetDeviceProperties(&m_deviceProperties[i], i);
- if (status != hipSuccess) {
- std::cerr << "Failed to initialize HIP device #"
- << i
- << ": "
- << hipGetErrorString(status)
- << std::endl;
- assert(status == hipSuccess);
- }
- }
-
-#if 0 && __cplusplus >= 201103L
- std::atomic_thread_fence(std::memory_order_release);
-#endif
- m_devicePropInitialized = true;
- } else {
- // Wait for the other thread to inititialize the properties.
- while (!m_devicePropInitialized) {
-#if 0 && __cplusplus >= 201103L
- std::atomic_thread_fence(std::memory_order_acquire);
-#endif
- sleep(1);
- }
- }
- }
-}
-
-static const hipStream_t default_stream = 0x00;//TODO: Use hipStreamDefault instead of 0x00;
-
-class HipStreamDevice : public StreamInterface {
- public:
- // Use the default stream on the current device
- HipStreamDevice() : stream_(&default_stream), scratch_(NULL), semaphore_(NULL) {
- hipGetDevice(&device_);
- initializeDeviceProp();
- }
- // Use the default stream on the specified device
- HipStreamDevice(int device) : stream_(&default_stream), device_(device), scratch_(NULL), semaphore_(NULL) {
- initializeDeviceProp();
- }
- // Use the specified stream. Note that it's the
- // caller responsibility to ensure that the stream can run on
- // the specified device. If no device is specified the code
- // assumes that the stream is associated to the current gpu device.
- HipStreamDevice(const hipStream_t* stream, int device = -1)
- : stream_(stream), device_(device), scratch_(NULL), semaphore_(NULL) {
- if (device < 0) {
- hipGetDevice(&device_);
- } else {
- int num_devices;
- hipError_t err = hipGetDeviceCount(&num_devices);
- EIGEN_UNUSED_VARIABLE(err)
- assert(err == hipSuccess);
- assert(device < num_devices);
- device_ = device;
- }
- initializeDeviceProp();
- }
-
- virtual ~HipStreamDevice() {
- if (scratch_) {
- deallocate(scratch_);
- }
- }
-
- const hipStream_t& stream() const { return *stream_; }
- const hipDeviceProp_t& deviceProperties() const {
- return m_deviceProperties[device_];
- }
- virtual void* allocate(size_t num_bytes) const {
- hipError_t err = hipSetDevice(device_);
- EIGEN_UNUSED_VARIABLE(err)
- assert(err == hipSuccess);
- void* result;
- err = hipMalloc(&result, num_bytes);
- assert(err == hipSuccess);
- assert(result != NULL);
- return result;
- }
- virtual void deallocate(void* buffer) const {
- hipError_t err = hipSetDevice(device_);
- EIGEN_UNUSED_VARIABLE(err)
- assert(err == hipSuccess);
- assert(buffer != NULL);
- err = hipFree(buffer);
- assert(err == hipSuccess);
- }
-
- virtual void* scratchpad() const {
- if (scratch_ == NULL) {
- scratch_ = allocate(kHipScratchSize + sizeof(unsigned int));
- }
- return scratch_;
- }
-
- virtual unsigned int* semaphore() const {
- if (semaphore_ == NULL) {
- char* scratch = static_cast<char*>(scratchpad()) + kHipScratchSize;
- semaphore_ = reinterpret_cast<unsigned int*>(scratch);
- //hipError_t err = hipMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_);
- hipError_t err = hipMemset(semaphore_, 0, sizeof(unsigned int));
- EIGEN_UNUSED_VARIABLE(err)
- assert(err == hipSuccess);
- }
- return semaphore_;
- }
-
- private:
- const hipStream_t* stream_;
- int device_;
- mutable void* scratch_;
- mutable unsigned int* semaphore_;
-};
-
-struct GpuDevice {
- // The StreamInterface is not owned: the caller is
- // responsible for its initialization and eventual destruction.
- explicit GpuDevice(const StreamInterface* stream) : stream_(stream), max_blocks_(INT_MAX) {
- eigen_assert(stream);
- }
- explicit GpuDevice(const StreamInterface* stream, int num_blocks) : stream_(stream), max_blocks_(num_blocks) {
- eigen_assert(stream);
- }
- // TODO(bsteiner): This is an internal API, we should not expose it.
- EIGEN_STRONG_INLINE const hipStream_t& stream() const {
- return stream_->stream();
- }
-
- EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
- return stream_->allocate(num_bytes);
- }
-
- EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
- stream_->deallocate(buffer);
- }
-
- EIGEN_STRONG_INLINE void* scratchpad() const {
- return stream_->scratchpad();
- }
-
- EIGEN_STRONG_INLINE unsigned int* semaphore() const {
- return stream_->semaphore();
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
-#if !defined(EIGEN_HIP_DEVICE_COMPILE)
- hipError_t err = hipMemcpyAsync(dst, src, n, hipMemcpyDeviceToDevice,
- stream_->stream());
- EIGEN_UNUSED_VARIABLE(err)
- assert(err == hipSuccess);
-#else
- eigen_assert(false && "The default device should be used instead to generate kernel code");
-#endif
- }
-
- EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
- hipError_t err =
- hipMemcpyAsync(dst, src, n, hipMemcpyHostToDevice, stream_->stream());
- EIGEN_UNUSED_VARIABLE(err)
- assert(err == hipSuccess);
- }
-
- EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const {
- hipError_t err =
- hipMemcpyAsync(dst, src, n, hipMemcpyDeviceToHost, stream_->stream());
- EIGEN_UNUSED_VARIABLE(err)
- assert(err == hipSuccess);
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
-#if !defined(EIGEN_HIP_DEVICE_COMPILE)
- //TODO:hipError_t err = hipMemsetAsync(buffer, c, n, stream_->stream());
- hipError_t err = hipMemset(buffer, c, n);
- EIGEN_UNUSED_VARIABLE(err)
- assert(err == hipSuccess);
-#else
- eigen_assert(false && "The default device should be used instead to generate kernel code");
-#endif
- }
-
- EIGEN_STRONG_INLINE size_t numThreads() const {
- // FIXME
- return 32;
- }
-
- EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
- // FIXME
- return 48*1024;
- }
-
- EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
- // We won't try to take advantage of the l2 cache for the time being, and
- // there is no l3 cache on hip devices.
- return firstLevelCacheSize();
- }
-
-// FIXME - this will move into HIP
-#if defined(EIGEN_HIP_DEVICE_COMPILE)
-#undef assert
-#define assert(COND)
-#endif
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const {
-#if defined(EIGEN_HIPCC) && \
- !defined(EIGEN_HIP_DEVICE_COMPILE)
- hipError_t err = hipStreamSynchronize(stream_->stream());
- if (err != hipSuccess) {
- std::cerr << "Error detected in HIP stream: "
- << hipGetErrorString(err)
- << std::endl;
- assert(err == hipSuccess);
- }
-#else
- assert(false && "The default device should be used instead to generate kernel code");
-#endif
- }
-
- EIGEN_STRONG_INLINE int getNumHipMultiProcessors() const {
- return stream_->deviceProperties().multiProcessorCount;
- }
- EIGEN_STRONG_INLINE int maxHipThreadsPerBlock() const {
- return stream_->deviceProperties().maxThreadsPerBlock;
- }
- EIGEN_STRONG_INLINE int maxHipThreadsPerMultiProcessor() const {
- return stream_->deviceProperties().maxThreadsPerMultiProcessor;
- }
- EIGEN_STRONG_INLINE int sharedMemPerBlock() const {
- return stream_->deviceProperties().sharedMemPerBlock;
- }
- EIGEN_STRONG_INLINE int majorDeviceVersion() const {
- return stream_->deviceProperties().major;
- }
- EIGEN_STRONG_INLINE int minorDeviceVersion() const {
- return stream_->deviceProperties().minor;
- }
-
- EIGEN_STRONG_INLINE int maxBlocks() const {
- return max_blocks_;
- }
-
- // This function checks if the HIP runtime recorded an error for the
- // underlying stream device.
- inline bool ok() const {
-#if defined(EIGEN_HIPCC)
- hipError_t error = hipStreamQuery(stream_->stream());
- return (error == hipSuccess) || (error == hipErrorNotReady);
-#else
- return false;
-#endif
- }
-
- private:
- const StreamInterface* stream_;
- int max_blocks_;
-};
-
-#define LAUNCH_HIP_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \
- hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel), dim3(gridsize), dim3(blocksize), (sharedmem), (device).stream(), (__VA_ARGS__)); \
- assert(hipGetLastError() == hipSuccess);
-
-
-// FIXME: Should be device and kernel specific.
-#if defined(EIGEN_HIPCC)
-static EIGEN_DEVICE_FUNC inline void setHipSharedMemConfig(hipSharedMemConfig config) {
-#if !defined(EIGEN_HIP_DEVICE_COMPILE)
- hipError_t status = hipDeviceSetSharedMemConfig(config);
- EIGEN_UNUSED_VARIABLE(status)
- assert(status == hipSuccess);
-#else
- EIGEN_UNUSED_VARIABLE(config)
-#endif
-}
-#endif
-
-} // end namespace Eigen
-
-#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_HIP_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionHip.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionHip.h
deleted file mode 100644
index 5304a22c5..000000000
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionHip.h
+++ /dev/null
@@ -1,815 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_HIP_H
-#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_HIP_H
-
-#if defined(EIGEN_HIP_DEVICE_COMPILE)
-#include "Eigen/src/Core/arch/HIP/hcc/math_constants.h"
-#endif
-
-#if defined(EIGEN_HIPCC)
-#define HIP_WARP_SIZE 64
-#endif
-
-namespace Eigen {
-namespace internal {
-
-
-#if defined(EIGEN_USE_GPU) && defined(EIGEN_HIPCC)
-// Full reducers for GPU, don't vectorize for now
-
-// Reducer function that enables multiple hip thread to safely accumulate at the same
-// output address. It basically reads the current value of the output variable, and
-// attempts to update it with the new value. If in the meantime another hip thread
-// updated the content of the output address it will try again.
-template <typename T, typename R>
-__device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) {
-#if defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)
- if (sizeof(T) == 4)
- {
- unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
- unsigned int newval = oldval;
- reducer.reduce(accum, reinterpret_cast<T*>(&newval));
- if (newval == oldval) {
- return;
- }
- unsigned int readback;
- while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
- oldval = readback;
- newval = oldval;
- reducer.reduce(accum, reinterpret_cast<T*>(&newval));
- if (newval == oldval) {
- return;
- }
- }
- }
- else if (sizeof(T) == 8) {
- unsigned long long oldval = *reinterpret_cast<unsigned long long*>(output);
- unsigned long long newval = oldval;
- reducer.reduce(accum, reinterpret_cast<T*>(&newval));
- if (newval == oldval) {
- return;
- }
- unsigned long long readback;
- while ((readback = atomicCAS((unsigned long long*)output, oldval, newval)) != oldval) {
- oldval = readback;
- newval = oldval;
- reducer.reduce(accum, reinterpret_cast<T*>(&newval));
- if (newval == oldval) {
- return;
- }
- }
- }
- else {
- assert(0 && "Wordsize not supported");
- }
-#else
- assert(0 && "Shouldn't be called on unsupported device");
-#endif
-}
-
-// We extend atomicExch to support extra data types
-template <typename Type>
-__device__ inline Type atomicExchCustom(Type* address, Type val) {
- return atomicExch(address, val);
-}
-
-template <>
-__device__ inline double atomicExchCustom(double* address, double val) {
- unsigned long long int* address_as_ull = reinterpret_cast<unsigned long long int*>(address);
- return __longlong_as_double(atomicExch(address_as_ull, __double_as_longlong(val)));
-}
-
-#if defined(EIGEN_HAS_HIP_FP16)
-template <template <typename T> class R>
-__device__ inline void atomicReduce(half2* output, half2 accum, R<half>& reducer) {
- unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
- unsigned int newval = oldval;
- reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
- if (newval == oldval) {
- return;
- }
- unsigned int readback;
- while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
- oldval = readback;
- newval = oldval;
- reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
- if (newval == oldval) {
- return;
- }
- }
-}
-#endif
-
-template <>
-__device__ inline void atomicReduce(float* output, float accum, SumReducer<float>&) {
-#if defined(EIGEN_HIP_DEVICE_COMPILE) && (__HIP_DEVICE_COMPILE__ == 1) &&\
- defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)
- atomicAdd(output, accum);
-#else
- assert(0 && "Shouldn't be called on unsupported device");
-#endif
-}
-
-
-template <typename CoeffType, typename Index>
-__global__ void ReductionInitKernel(const CoeffType val, Index num_preserved_coeffs, CoeffType* output) {
- const Index thread_id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
- const Index num_threads = hipBlockDim_x * hipGridDim_x;
- for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
- output[i] = val;
- }
-}
-
-
-template <int BlockSize, int NumPerThread, typename Self,
- typename Reducer, typename Index>
-__global__ void FullReductionKernel(const Self input, Index num_coeffs,
- typename Self::CoeffReturnType* output, unsigned int* semaphore, Reducer reducer) {
-#if defined(EIGEN_HIP_DEVICE_COMPILE) && (__HIP_DEVICE_COMPILE__ == 1) &&\
- defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)
- // Initialize the output value
- const Index first_index = hipBlockIdx_x * BlockSize * NumPerThread + hipThreadIdx_x;
- if (hipGridDim_x == 1) {
- if (first_index == 0) {
- *output = reducer.initialize();
- }
- }
- else {
- if (hipThreadIdx_x == 0) {
- unsigned int block = atomicCAS(semaphore, 0u, 1u);
- if (block == 0) {
- // We're the first block to run, initialize the output value
- atomicExchCustom(output, reducer.initialize());
- __threadfence();
- atomicExch(semaphore, 2u);
- }
- else {
- // Wait for the first block to initialize the output value.
- // Use atomicCAS here to ensure that the reads aren't cached
- unsigned int val;
- do {
- val = atomicCAS(semaphore, 2u, 2u);
- }
- while (val < 2u);
- }
- }
- }
-
- __syncthreads();
-
- eigen_assert(hipGridDim_x == 1 || *semaphore >= 2u);
-
- typename Self::CoeffReturnType accum = reducer.initialize();
- Index max_iter = numext::mini<Index>(num_coeffs - first_index, NumPerThread*BlockSize);
- for (Index i = 0; i < max_iter; i+=BlockSize) {
- const Index index = first_index + i;
- eigen_assert(index < num_coeffs);
- typename Self::CoeffReturnType val = input.m_impl.coeff(index);
- reducer.reduce(val, &accum);
- }
-
-#pragma unroll
- for (int offset = HIP_WARP_SIZE/2; offset > 0; offset /= 2) {
- // XXX use std::is_floating_point to determine the type of accum
- if (std::is_floating_point<typename Self::CoeffReturnType>::value) {
- reducer.reduce(__shfl_down(static_cast<float>(accum), offset, HIP_WARP_SIZE), &accum);
- } else {
- reducer.reduce(__shfl_down(static_cast<int>(accum), offset, HIP_WARP_SIZE), &accum);
- }
- }
-
- if ((hipThreadIdx_x & (HIP_WARP_SIZE - 1)) == 0) {
- atomicReduce(output, accum, reducer);
- }
-
- if (hipGridDim_x > 1 && hipThreadIdx_x == 0) {
- // Let the last block reset the semaphore
- atomicInc(semaphore, hipGridDim_x + 1);
- __threadfence_system();
- }
-
-#else
- assert(0 && "Shouldn't be called on unsupported device");
-#endif
-}
-
-
-#if defined(EIGEN_HAS_HIP_FP16)
-template <typename Self,
- typename Reducer, typename Index>
-__global__ void ReductionInitFullReduxKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half2* scratch) {
- eigen_assert(hipBlockDim_x == 1);
- eigen_assert(hipGridDim_x == 1);
- if (num_coeffs % 2 != 0) {
- half last = input.m_impl.coeff(num_coeffs-1);
- *scratch = __halves2half2(last, reducer.initialize());
- } else {
- *scratch = reducer.template initializePacket<half2>();
- }
-}
-
-template <typename Self,
- typename Reducer, typename Index>
-__global__ void ReductionInitKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half* output) {
- const Index thread_id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
- const Index num_threads = hipBlockDim_x * hipGridDim_x;
- const Index num_packets = num_coeffs / 2;
- for (Index i = thread_id; i < num_packets; i += num_threads) {
- ((half2*)output)[i] = reducer.template initializePacket<half2>();
- }
-
- if (thread_id == 0 && num_coeffs % 2 != 0) {
- output[num_coeffs-1] = reducer.initialize();
- }
-}
-
-template <int BlockSize, int NumPerThread, typename Self,
- typename Reducer, typename Index>
-__global__ void FullReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs,
- half* output, half2* scratch) {
- eigen_assert(NumPerThread % 2 == 0);
-
- const Index first_index = hipBlockIdx_x * BlockSize * NumPerThread + 2*hipThreadIdx_x;
-
- // Initialize the output value if it wasn't initialized by the ReductionInitKernel
- if (hipGridDim_x == 1 && first_index == 0) {
- if (num_coeffs % 2 != 0) {
- half last = input.m_impl.coeff(num_coeffs-1);
- *scratch = __halves2half2(last, reducer.initialize());
- } else {
- *scratch = reducer.template initializePacket<half2>();
- }
- __syncthreads();
- }
-
- half2 accum = reducer.template initializePacket<half2>();
- const Index max_iter = numext::mini<Index>((num_coeffs - first_index) / 2, NumPerThread*BlockSize / 2);
- for (Index i = 0; i < max_iter; i += BlockSize) {
- const Index index = first_index + 2*i;
- eigen_assert(index + 1 < num_coeffs);
- half2 val = input.m_impl.template packet<Unaligned>(index);
- reducer.reducePacket(val, &accum);
- }
-
-#pragma unroll
- for (int offset = HIP_WARP_SIZE/2; offset > 0; offset /= 2) {
- // FIXME : remove this workaround once we have native half/half2 support for __shfl_down
- union { int i; half2 h; } wka_in, wka_out;
- wka_in.h = accum;
- wka_out.i = __shfl_down(wka_in.i, offset, HIP_WARP_SIZE);
- reducer.reducePacket(wka_out.h, &accum);
- }
-
- if ((hipThreadIdx_x & (HIP_WARP_SIZE - 1)) == 0) {
- atomicReduce(scratch, accum, reducer);
- }
-
- __syncthreads();
-
- if (hipGridDim_x == 1 && first_index == 0) {
- half tmp = __low2half(*scratch);
- reducer.reduce(__high2half(*scratch), &tmp);
- *output = tmp;
- }
-}
-
-template <typename Op>
-__global__ void ReductionCleanupKernelHalfFloat(Op& reducer, half* output, half2* scratch) {
- eigen_assert(hipThreadIdx_x == 1);
- half tmp = __low2half(*scratch);
- reducer.reduce(__high2half(*scratch), &tmp);
- *output = tmp;
-}
-
-#endif
-
-template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
-struct FullReductionLauncher {
- static void run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index) {
- assert(false && "Should only be called on doubles, floats and half floats");
- }
-};
-
-namespace {
- std::mutex __eigen_reduction_hip_mutex;
-}
-
-// Specialization for float and double
-template <typename Self, typename Op, typename OutputType, bool PacketAccess>
-struct FullReductionLauncher<
- Self, Op, OutputType, PacketAccess,
- typename internal::enable_if<
- internal::is_same<float, OutputType>::value ||
- internal::is_same<double, OutputType>::value,
- void>::type> {
- static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs) {
- // guard FullReductionLauncher with a mutex so only 1 FullReductionKernel
- // is dispatched at a time
- std::lock_guard<std::mutex> lock(__eigen_reduction_hip_mutex);
-
- typedef typename Self::Index Index;
- typedef typename Self::CoeffReturnType Scalar;
- const int block_size = 256;
- const int num_per_thread = 128;
- const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
-
- unsigned int* semaphore = NULL;
- if (num_blocks > 1) {
- semaphore = device.semaphore();
-
- unsigned int semaphore_host = 0xFF;
- hipMemcpy(&semaphore_host, semaphore, sizeof(unsigned int), hipMemcpyDeviceToHost);
- if (semaphore_host != 0) {
- std::cerr << "[WARN][EIGEN][FullReductionLauncher] incorrect semaphore value: "
- << semaphore_host << "\n";
- // wait for all commands on the device to complete so semaphore value
- // is reset to 0
- hipDeviceSynchronize();
-
- // read again
- hipMemcpy(&semaphore_host, semaphore, sizeof(unsigned int), hipMemcpyDeviceToHost);
- if (semaphore_host != 0) {
- std::cerr << "[ERROR][EIGEN][FullReductionLauncher] CRITICAL incorrect semaphore value: "
- << semaphore_host << ", apply manual override to 0\n";
-
- // force set semaphore value to be 0
- semaphore_host = 0;
- hipMemcpy(semaphore, &semaphore_host, sizeof(unsigned int), hipMemcpyHostToDevice);
- }
- }
- }
-
- hipLaunchKernelGGL(HIP_KERNEL_NAME(FullReductionKernel<block_size, num_per_thread, Self, Op, Index>),
- dim3(num_blocks), dim3(block_size), 0, device.stream(), self, num_coeffs, output, semaphore, reducer);
- }
-};
-
-#if defined(EIGEN_HAS_HIP_FP16)
-template <typename Self, typename Op>
-struct FullReductionLauncher<Self, Op, Eigen::half, false> {
- static void run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index) {
- assert(false && "Should not be called since there is no packet accessor");
- }
-};
-
-template <typename Self, typename Op>
-struct FullReductionLauncher<Self, Op, Eigen::half, true> {
- static void run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs) {
- typedef typename Self::Index Index;
-
- const int block_size = 256;
- const int num_per_thread = 128;
- const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
- half2* scratch = static_cast<half2*>(device.scratchpad());
-
- if (num_blocks > 1) {
- // We initialize the output and the scrathpad outside the reduction kernel when we can't be sure that there
- // won't be a race conditions between multiple thread blocks.
- hipLaunchKernelGGL(HIP_KERNEL_NAME(ReductionInitFullReduxKernelHalfFloat<Self, Op, Index>),
- dim3(1), dim3(1), 0, device.stream(), reducer, self, num_coeffs, scratch);
- }
-
- hipLaunchKernelGGL(HIP_KERNEL_NAME(FullReductionKernelHalfFloat<block_size, num_per_thread, Self, Op, Index>),
- dim3(num_blocks), dim3(block_size), 0, device.stream(), reducer, self, num_coeffs, output, scratch);
-
- if (num_blocks > 1) {
- hipLaunchKernelGGL(HIP_KERNEL_NAME(ReductionCleanupKernelHalfFloat<Op>),
- dim3(1), dim3(1), 0, device.stream(), reducer, output, scratch);
- }
- }
-};
-#endif
-
-
-template <typename Self, typename Op, bool Vectorizable>
-struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
- // Unfortunately nvidia doesn't support well exotic types such as complex,
- // so reduce the scope of the optimized version of the code to the simple cases
- // of doubles, floats and half floats
-#if defined(EIGEN_HAS_HIP_FP16)
- static const bool HasOptimizedImplementation = !Op::IsStateful &&
- (internal::is_same<typename Self::CoeffReturnType, float>::value ||
- internal::is_same<typename Self::CoeffReturnType, double>::value ||
- (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
-#else
- static const bool HasOptimizedImplementation = !Op::IsStateful &&
- (internal::is_same<typename Self::CoeffReturnType, float>::value ||
- internal::is_same<typename Self::CoeffReturnType, double>::value);
-#endif
-
- template <typename OutputType>
- static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) {
- assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
- const Index num_coeffs = array_prod(self.m_impl.dimensions());
- // Don't crash when we're called with an input tensor of size 0.
- if (num_coeffs == 0) {
- return;
- }
-
- FullReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs);
- }
-};
-
-
-template <int NumPerThread, typename Self,
- typename Reducer, typename Index>
-__global__ void InnerReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
- typename Self::CoeffReturnType* output) {
-#if defined(EIGEN_HIP_DEVICE_COMPILE) && (__HIP_DEVICE_COMPILE__ == 1) &&\
- defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)
- typedef typename Self::CoeffReturnType Type;
- eigen_assert(hipBlockDim_y == 1);
- eigen_assert(hipBlockDim_z == 1);
- eigen_assert(hipGridDim_y == 1);
- eigen_assert(hipGridDim_z == 1);
-
- const int unroll_times = 16;
- eigen_assert(NumPerThread % unroll_times == 0);
-
- const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, hipBlockDim_x * NumPerThread);
- const Index num_input_blocks = input_col_blocks * num_preserved_coeffs;
-
- const Index num_threads = hipBlockDim_x * hipGridDim_x;
- const Index thread_id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
-
- // Initialize the output values if they weren't initialized by the ReductionInitKernel
- if (hipGridDim_x == 1) {
- for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
- output[i] = reducer.initialize();
- }
- __syncthreads();
- }
-
- for (Index i = hipBlockIdx_x; i < num_input_blocks; i += hipGridDim_x) {
- const Index row = i / input_col_blocks;
-
- if (row < num_preserved_coeffs) {
- const Index col_block = i % input_col_blocks;
- const Index col_begin = col_block * hipBlockDim_x * NumPerThread + hipThreadIdx_x;
-
- Type reduced_val = reducer.initialize();
-
- for (Index j = 0; j < NumPerThread; j += unroll_times) {
- const Index last_col = col_begin + hipBlockDim_x * (j + unroll_times - 1);
- if (last_col >= num_coeffs_to_reduce) {
- for (Index col = col_begin + hipBlockDim_x * j; col < num_coeffs_to_reduce; col += hipBlockDim_x) {
- const Type val = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
- reducer.reduce(val, &reduced_val);
- }
- break;
- } else {
- // Faster version of the loop with no branches after unrolling.
-#pragma unroll
- for (int k = 0; k < unroll_times; ++k) {
- const Index col = col_begin + hipBlockDim_x * (j + k);
- reducer.reduce(input.m_impl.coeff(row * num_coeffs_to_reduce + col), &reduced_val);
- }
- }
- }
-
-#pragma unroll
- for (int offset = HIP_WARP_SIZE/2; offset > 0; offset /= 2) {
- // XXX use std::is_floating_point to determine the type of reduced_val
- if (std::is_floating_point<Type>::value) {
- reducer.reduce(__shfl_down(static_cast<float>(reduced_val), offset), &reduced_val);
- } else {
- reducer.reduce(__shfl_down(static_cast<int>(reduced_val), offset), &reduced_val);
- }
- }
-
- if ((hipThreadIdx_x & (HIP_WARP_SIZE - 1)) == 0) {
- atomicReduce(&(output[row]), reduced_val, reducer);
- }
- }
- }
-#else
- assert(0 && "Shouldn't be called on unsupported device");
-#endif
-}
-
-#if defined(EIGEN_HAS_HIP_FP16)
-
-template <int NumPerThread, typename Self,
- typename Reducer, typename Index>
-__global__ void InnerReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
- half* output) {
- eigen_assert(hipBlockDim_y == 1);
- eigen_assert(hipBlockDim_z == 1);
- eigen_assert(hipGridDim_y == 1);
- eigen_assert(hipGridDim_z == 1);
-
- const int unroll_times = 16;
- eigen_assert(NumPerThread % unroll_times == 0);
- eigen_assert(unroll_times % 2 == 0);
-
- const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, hipBlockDim_x * NumPerThread * 2);
- const Index num_input_blocks = divup<Index>(input_col_blocks * num_preserved_coeffs, 2);
-
- const Index num_threads = hipBlockDim_x * hipGridDim_x;
- const Index thread_id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
-
- // Initialize the output values if they weren't initialized by the ReductionInitKernel
- if (hipGridDim_x == 1) {
- Index i = 2*thread_id;
- for (; i + 1 < num_preserved_coeffs; i += 2*num_threads) {
- half* loc = output + i;
- *((half2*)loc) = reducer.template initializePacket<half2>();
- }
- if (i < num_preserved_coeffs) {
- output[i] = reducer.initialize();
- }
- __syncthreads();
- }
-
- for (Index i = hipBlockIdx_x; i < num_input_blocks; i += hipGridDim_x) {
- const Index row = 2 * (i / input_col_blocks);
-
- if (row + 1 < num_preserved_coeffs) {
- const Index col_block = i % input_col_blocks;
- const Index col_begin = 2 * (col_block * hipBlockDim_x * NumPerThread + hipThreadIdx_x);
-
- half2 reduced_val1 = reducer.template initializePacket<half2>();
- half2 reduced_val2 = reducer.template initializePacket<half2>();
-
- for (Index j = 0; j < NumPerThread; j += unroll_times) {
- const Index last_col = col_begin + hipBlockDim_x * (j + unroll_times - 1) * 2;
- if (last_col >= num_coeffs_to_reduce) {
- Index col = col_begin + hipBlockDim_x * j;
- for (; col + 1 < num_coeffs_to_reduce; col += hipBlockDim_x) {
- const half2 val1 = input.m_impl.template packet<Unaligned>(row * num_coeffs_to_reduce + col);
- reducer.reducePacket(val1, &reduced_val1);
- const half2 val2 = input.m_impl.template packet<Unaligned>((row+1) * num_coeffs_to_reduce + col);
- reducer.reducePacket(val2, &reduced_val2);
- }
- if (col < num_coeffs_to_reduce) {
- // Peel;
- const half last1 = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
- const half2 val1 = __halves2half2(last1, reducer.initialize());
- reducer.reducePacket(val1, &reduced_val1);
- const half last2 = input.m_impl.coeff((row+1) * num_coeffs_to_reduce + col);
- const half2 val2 = __halves2half2(last2, reducer.initialize());
- reducer.reducePacket(val2, &reduced_val2);
- }
- break;
- } else {
- // Faster version of the loop with no branches after unrolling.
-#pragma unroll
- for (int k = 0; k < unroll_times; ++k) {
- const Index col = col_begin + hipBlockDim_x * (j + k) * 2;
- reducer.reducePacket(input.m_impl.template packet<Unaligned>(row * num_coeffs_to_reduce + col), &reduced_val1);
- reducer.reducePacket(input.m_impl.template packet<Unaligned>((row + 1)* num_coeffs_to_reduce + col), &reduced_val2);
- }
- }
- }
-
-#pragma unroll
- for (int offset = HIP_WARP_SIZE/2; offset > 0; offset /= 2) {
- // FIXME : remove this workaround once we have native half/half2 support for __shfl_down
- union { int i; half2 h; } wka_in, wka_out;
-
- wka_in.h = reduced_val1;
- wka_out.i = __shfl_down(wka_in.i, offset, HIP_WARP_SIZE);
- reducer.reducePacket(wka_out.h, &reduced_val1);
-
- wka_in.h = reduced_val2;
- wka_out.i = __shfl_down(wka_in.i, offset, HIP_WARP_SIZE);
- reducer.reducePacket(wka_out.h, &reduced_val2);
- }
-
- half val1 = __low2half(reduced_val1);
- reducer.reduce(__high2half(reduced_val1), &val1);
- half val2 = __low2half(reduced_val2);
- reducer.reduce(__high2half(reduced_val2), &val2);
- half2 val = __halves2half2(val1, val2);
-
- if ((hipThreadIdx_x & (HIP_WARP_SIZE - 1)) == 0) {
- half* loc = output + row;
- atomicReduce((half2*)loc, val, reducer);
- }
- }
- }
-}
-
-#endif
-
-template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
-struct InnerReductionLauncher {
- static bool run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index, typename Self::Index) {
- assert(false && "Should only be called to reduce doubles, floats and half floats on a gpu device");
- return true;
- }
-};
-
-// Specialization for float and double
-template <typename Self, typename Op, typename OutputType, bool PacketAccess>
-struct InnerReductionLauncher<
- Self, Op, OutputType, PacketAccess,
- typename internal::enable_if<
- internal::is_same<float, OutputType>::value ||
- internal::is_same<double, OutputType>::value,
- void>::type> {
- static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
- typedef typename Self::Index Index;
-
- const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
- const int block_size = 256;
- const int num_per_thread = 128;
- const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
- const int max_blocks = device.getNumHipMultiProcessors() *
- device.maxHipThreadsPerMultiProcessor() / block_size;
- const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
-
- if (num_blocks > 1) {
- // We initialize the outputs outside the reduction kernel when we can't be sure that there
- // won't be a race conditions between multiple thread blocks.
- const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
- const int max_blocks = device.getNumHipMultiProcessors() *
- device.maxHipThreadsPerMultiProcessor() / 1024;
- const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
- hipLaunchKernelGGL(HIP_KERNEL_NAME(ReductionInitKernel<OutputType, Index>),
- dim3(num_blocks), dim3(1024), 0, device.stream(),
- reducer.initialize(), num_preserved_vals, output);
- }
-
- hipLaunchKernelGGL(HIP_KERNEL_NAME(InnerReductionKernel<num_per_thread, Self, Op, Index>),
- dim3(num_blocks), dim3(block_size), 0, device.stream(), reducer, self,
- num_coeffs_to_reduce, num_preserved_vals, output);
-
- return false;
- }
-};
-
-#if defined(EIGEN_HAS_HIP_FP16)
-template <typename Self, typename Op>
-struct InnerReductionLauncher<Self, Op, Eigen::half, false> {
- static bool run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index, typename Self::Index) {
- assert(false && "Should not be called since there is no packet accessor");
- return true;
- }
-};
-
-template <typename Self, typename Op>
-struct InnerReductionLauncher<Self, Op, Eigen::half, true> {
- static bool run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
- typedef typename Self::Index Index;
-
- if (num_preserved_vals % 2 != 0) {
- // Not supported yet, revert to the slower code path
- return true;
- }
-
- const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
- const int block_size = /*256*/128;
- const int num_per_thread = /*128*/64;
- const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
- const int max_blocks = device.getNumHipMultiProcessors() *
- device.maxHipThreadsPerMultiProcessor() / block_size;
- const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
-
- if (num_blocks > 1) {
- // We initialize the outputs outside the reduction kernel when we can't be sure that there
- // won't be a race conditions between multiple thread blocks.
- const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
- const int max_blocks = device.getNumHipMultiProcessors() *
- device.maxHipThreadsPerMultiProcessor() / 1024;
- const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
- hipLaunchKernelGGL(HIP_KERNEL_NAME(ReductionInitKernelHalfFloat<Self, Op, Index>),
- dim3(1), dim3(1), 0, device.stream(), reducer, self, num_preserved_vals, output);
- }
-
- hipLaunchKernelGGL(HIP_KERNEL_NAME(InnerReductionKernelHalfFloat<num_per_thread, Self, Op, Index>),
- dim3(num_blocks), dim3(block_size), 0, device.stream(), reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
-
- return false;
- }
-};
-#endif
-
-
-template <typename Self, typename Op>
-struct InnerReducer<Self, Op, GpuDevice> {
- // Unfortunately nvidia doesn't support well exotic types such as complex,
- // so reduce the scope of the optimized version of the code to the simple case
- // of floats and half floats.
-#if defined(EIGEN_HAS_HIP_FP16)
- static const bool HasOptimizedImplementation = !Op::IsStateful &&
- (internal::is_same<typename Self::CoeffReturnType, float>::value ||
- internal::is_same<typename Self::CoeffReturnType, double>::value ||
- (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
-#else
- static const bool HasOptimizedImplementation = !Op::IsStateful &&
- (internal::is_same<typename Self::CoeffReturnType, float>::value ||
- internal::is_same<typename Self::CoeffReturnType, double>::value);
-#endif
-
- template <typename OutputType>
- static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
- assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
- const Index num_coeffs = array_prod(self.m_impl.dimensions());
- // Don't crash when we're called with an input tensor of size 0.
- if (num_coeffs == 0) {
- return true;
- }
- // It's faster to use the usual code.
- if (num_coeffs_to_reduce <= 128) {
- return true;
- }
-
- return InnerReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs_to_reduce, num_preserved_vals);
- }
-};
-
-template <int NumPerThread, typename Self,
- typename Reducer, typename Index>
-__global__ void OuterReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
- typename Self::CoeffReturnType* output) {
- const Index num_threads = hipBlockDim_x * hipGridDim_x;
- const Index thread_id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
- // Initialize the output values if they weren't initialized by the ReductionInitKernel
- if (hipGridDim_x == 1) {
- for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
- output[i] = reducer.initialize();
- }
- __syncthreads();
- }
-
- // Do the reduction.
- const Index max_iter = num_preserved_coeffs * divup<Index>(num_coeffs_to_reduce, NumPerThread);
- for (Index i = thread_id; i < max_iter; i += num_threads) {
- const Index input_col = i % num_preserved_coeffs;
- const Index input_row = (i / num_preserved_coeffs) * NumPerThread;
- typename Self::CoeffReturnType reduced_val = reducer.initialize();
- const Index max_row = numext::mini(input_row + NumPerThread, num_coeffs_to_reduce);
- for (Index j = input_row; j < max_row; j++) {
- typename Self::CoeffReturnType val = input.m_impl.coeff(j * num_preserved_coeffs + input_col);
- reducer.reduce(val, &reduced_val);
- }
- atomicReduce(&(output[input_col]), reduced_val, reducer);
- }
-}
-
-
-template <typename Self, typename Op>
-struct OuterReducer<Self, Op, GpuDevice> {
- // Unfortunately nvidia doesn't support well exotic types such as complex,
- // so reduce the scope of the optimized version of the code to the simple case
- // of floats.
- static const bool HasOptimizedImplementation = !Op::IsStateful &&
- (internal::is_same<typename Self::CoeffReturnType, float>::value ||
- internal::is_same<typename Self::CoeffReturnType, double>::value);
- template <typename Device, typename OutputType>
- static bool run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) {
- assert(false && "Should only be called to reduce doubles or floats on a gpu device");
- return true;
- }
-
- static bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
- typedef typename Self::Index Index;
-
- // It's faster to use the usual code.
- if (num_coeffs_to_reduce <= 32) {
- return true;
- }
-
- const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
- const int block_size = 256;
- const int num_per_thread = 16;
- const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
- const int max_blocks = device.getNumHipMultiProcessors() *
- device.maxHipThreadsPerMultiProcessor() / block_size;
- const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
-
- if (num_blocks > 1) {
- // We initialize the outputs in the reduction kernel itself when we don't have to worry
- // about race conditions between multiple thread blocks.
- const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
- const int max_blocks = device.getNumHipMultiProcessors() *
- device.maxHipThreadsPerMultiProcessor() / 1024;
- const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
- hipLaunchKernelGGL(HIP_KERNEL_NAME(ReductionInitKernel<float, Index>),
- dim3(num_blocks), dim3(1024), 0, device.stream(),
- reducer.initialize(), num_preserved_vals, output);
- }
-
- hipLaunchKernelGGL(HIP_KERNEL_NAME(OuterReductionKernel<num_per_thread, Self, Op, Index>),
- dim3(num_blocks), dim3(block_size), 0, device.stream(), reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
-
- return false;
- }
-};
-
-#endif
-
-
-} // end namespace internal
-} // end namespace Eigen
-
-#endif // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_HIP_H
diff --git a/unsupported/test/cxx11_tensor_argmax_hip.cu b/unsupported/test/cxx11_tensor_argmax_hip.cu
deleted file mode 100644
index 57d6ca000..000000000
--- a/unsupported/test/cxx11_tensor_argmax_hip.cu
+++ /dev/null
@@ -1,251 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-
-#define EIGEN_TEST_NO_LONGDOUBLE
-#define EIGEN_TEST_FUNC cxx11_tensor_hip
-#define EIGEN_USE_GPU
-
-#include "main.h"
-#include <unsupported/Eigen/CXX11/Tensor>
-
-using Eigen::Tensor;
-
-template <int Layout>
-void test_hip_simple_argmax()
-{
- Tensor<double, 3, Layout> in(Eigen::array<DenseIndex, 3>(72,53,97));
- Tensor<DenseIndex, 1, Layout> out_max(Eigen::array<DenseIndex, 1>(1));
- Tensor<DenseIndex, 1, Layout> out_min(Eigen::array<DenseIndex, 1>(1));
- in.setRandom();
- in *= in.constant(100.0);
- in(0, 0, 0) = -1000.0;
- in(71, 52, 96) = 1000.0;
-
- std::size_t in_bytes = in.size() * sizeof(double);
- std::size_t out_bytes = out_max.size() * sizeof(DenseIndex);
-
- double* d_in;
- DenseIndex* d_out_max;
- DenseIndex* d_out_min;
- hipMalloc((void**)(&d_in), in_bytes);
- hipMalloc((void**)(&d_out_max), out_bytes);
- hipMalloc((void**)(&d_out_min), out_bytes);
-
- hipMemcpy(d_in, in.data(), in_bytes, hipMemcpyHostToDevice);
-
- Eigen::HipStreamDevice stream;
- Eigen::GpuDevice gpu_device(&stream);
-
- Eigen::TensorMap<Eigen::Tensor<double, 3, Layout>, Aligned > gpu_in(d_in, Eigen::array<DenseIndex, 3>(72,53,97));
- Eigen::TensorMap<Eigen::Tensor<DenseIndex, 1, Layout>, Aligned > gpu_out_max(d_out_max, Eigen::array<DenseIndex, 1>(1));
- Eigen::TensorMap<Eigen::Tensor<DenseIndex, 1, Layout>, Aligned > gpu_out_min(d_out_min, Eigen::array<DenseIndex, 1>(1));
-
- gpu_out_max.device(gpu_device) = gpu_in.argmax();
- gpu_out_min.device(gpu_device) = gpu_in.argmin();
-
- assert(hipMemcpyAsync(out_max.data(), d_out_max, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
- assert(hipMemcpyAsync(out_min.data(), d_out_min, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
- assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
-
- VERIFY_IS_EQUAL(out_max(Eigen::array<DenseIndex, 1>(0)), 72*53*97 - 1);
- VERIFY_IS_EQUAL(out_min(Eigen::array<DenseIndex, 1>(0)), 0);
-
- hipFree(d_in);
- hipFree(d_out_max);
- hipFree(d_out_min);
-}
-
-template <int DataLayout>
-void test_hip_argmax_dim()
-{
- Tensor<float, 4, DataLayout> tensor(2,3,5,7);
- std::vector<int> dims;
- dims.push_back(2); dims.push_back(3); dims.push_back(5); dims.push_back(7);
-
- for (int dim = 0; dim < 4; ++dim) {
- tensor.setRandom();
- tensor = (tensor + tensor.constant(0.5)).log();
-
- array<DenseIndex, 3> out_shape;
- for (int d = 0; d < 3; ++d) out_shape[d] = (d < dim) ? dims[d] : dims[d+1];
-
- Tensor<DenseIndex, 3, DataLayout> tensor_arg(out_shape);
-
- array<DenseIndex, 4> ix;
- for (int i = 0; i < 2; ++i) {
- for (int j = 0; j < 3; ++j) {
- for (int k = 0; k < 5; ++k) {
- for (int l = 0; l < 7; ++l) {
- ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l;
- if (ix[dim] != 0) continue;
- // suppose dim == 1, then for all i, k, l, set tensor(i, 0, k, l) = 10.0
- tensor(ix) = 10.0;
- }
- }
- }
- }
-
- std::size_t in_bytes = tensor.size() * sizeof(float);
- std::size_t out_bytes = tensor_arg.size() * sizeof(DenseIndex);
-
- float* d_in;
- DenseIndex* d_out;
- hipMalloc((void**)(&d_in), in_bytes);
- hipMalloc((void**)(&d_out), out_bytes);
-
- hipMemcpy(d_in, tensor.data(), in_bytes, hipMemcpyHostToDevice);
-
- Eigen::HipStreamDevice stream;
- Eigen::GpuDevice gpu_device(&stream);
-
- Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout>, Aligned > gpu_in(d_in, Eigen::array<DenseIndex, 4>(2, 3, 5, 7));
- Eigen::TensorMap<Eigen::Tensor<DenseIndex, 3, DataLayout>, Aligned > gpu_out(d_out, out_shape);
-
- gpu_out.device(gpu_device) = gpu_in.argmax(dim);
-
- assert(hipMemcpyAsync(tensor_arg.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
- assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
-
- VERIFY_IS_EQUAL(tensor_arg.size(),
- size_t(2*3*5*7 / tensor.dimension(dim)));
-
- for (DenseIndex n = 0; n < tensor_arg.size(); ++n) {
- // Expect max to be in the first index of the reduced dimension
- VERIFY_IS_EQUAL(tensor_arg.data()[n], 0);
- }
-
- for (int i = 0; i < 2; ++i) {
- for (int j = 0; j < 3; ++j) {
- for (int k = 0; k < 5; ++k) {
- for (int l = 0; l < 7; ++l) {
- ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l;
- if (ix[dim] != tensor.dimension(dim) - 1) continue;
- // suppose dim == 1, then for all i, k, l, set tensor(i, 2, k, l) = 20.0
- tensor(ix) = 20.0;
- }
- }
- }
- }
-
- hipMemcpy(d_in, tensor.data(), in_bytes, hipMemcpyHostToDevice);
-
- gpu_out.device(gpu_device) = gpu_in.argmax(dim);
-
- assert(hipMemcpyAsync(tensor_arg.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
- assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
-
- for (DenseIndex n = 0; n < tensor_arg.size(); ++n) {
- // Expect max to be in the last index of the reduced dimension
- VERIFY_IS_EQUAL(tensor_arg.data()[n], tensor.dimension(dim) - 1);
- }
-
- hipFree(d_in);
- hipFree(d_out);
- }
-}
-
-template <int DataLayout>
-void test_hip_argmin_dim()
-{
- Tensor<float, 4, DataLayout> tensor(2,3,5,7);
- std::vector<int> dims;
- dims.push_back(2); dims.push_back(3); dims.push_back(5); dims.push_back(7);
-
- for (int dim = 0; dim < 4; ++dim) {
- tensor.setRandom();
- tensor = (tensor + tensor.constant(0.5)).log();
-
- array<DenseIndex, 3> out_shape;
- for (int d = 0; d < 3; ++d) out_shape[d] = (d < dim) ? dims[d] : dims[d+1];
-
- Tensor<DenseIndex, 3, DataLayout> tensor_arg(out_shape);
-
- array<DenseIndex, 4> ix;
- for (int i = 0; i < 2; ++i) {
- for (int j = 0; j < 3; ++j) {
- for (int k = 0; k < 5; ++k) {
- for (int l = 0; l < 7; ++l) {
- ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l;
- if (ix[dim] != 0) continue;
- // suppose dim == 1, then for all i, k, l, set tensor(i, 0, k, l) = 10.0
- tensor(ix) = -10.0;
- }
- }
- }
- }
-
- std::size_t in_bytes = tensor.size() * sizeof(float);
- std::size_t out_bytes = tensor_arg.size() * sizeof(DenseIndex);
-
- float* d_in;
- DenseIndex* d_out;
- hipMalloc((void**)(&d_in), in_bytes);
- hipMalloc((void**)(&d_out), out_bytes);
-
- hipMemcpy(d_in, tensor.data(), in_bytes, hipMemcpyHostToDevice);
-
- Eigen::HipStreamDevice stream;
- Eigen::GpuDevice gpu_device(&stream);
-
- Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout>, Aligned > gpu_in(d_in, Eigen::array<DenseIndex, 4>(2, 3, 5, 7));
- Eigen::TensorMap<Eigen::Tensor<DenseIndex, 3, DataLayout>, Aligned > gpu_out(d_out, out_shape);
-
- gpu_out.device(gpu_device) = gpu_in.argmin(dim);
-
- assert(hipMemcpyAsync(tensor_arg.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
- assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
-
- VERIFY_IS_EQUAL(tensor_arg.size(),
- 2*3*5*7 / tensor.dimension(dim));
-
- for (DenseIndex n = 0; n < tensor_arg.size(); ++n) {
- // Expect min to be in the first index of the reduced dimension
- VERIFY_IS_EQUAL(tensor_arg.data()[n], 0);
- }
-
- for (int i = 0; i < 2; ++i) {
- for (int j = 0; j < 3; ++j) {
- for (int k = 0; k < 5; ++k) {
- for (int l = 0; l < 7; ++l) {
- ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l;
- if (ix[dim] != tensor.dimension(dim) - 1) continue;
- // suppose dim == 1, then for all i, k, l, set tensor(i, 2, k, l) = 20.0
- tensor(ix) = -20.0;
- }
- }
- }
- }
-
- hipMemcpy(d_in, tensor.data(), in_bytes, hipMemcpyHostToDevice);
-
- gpu_out.device(gpu_device) = gpu_in.argmin(dim);
-
- assert(hipMemcpyAsync(tensor_arg.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
- assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
-
- for (DenseIndex n = 0; n < tensor_arg.size(); ++n) {
- // Expect max to be in the last index of the reduced dimension
- VERIFY_IS_EQUAL(tensor_arg.data()[n], tensor.dimension(dim) - 1);
- }
-
- hipFree(d_in);
- hipFree(d_out);
- }
-}
-
-void test_cxx11_tensor_hip()
-{
- CALL_SUBTEST(test_hip_simple_argmax<RowMajor>());
- CALL_SUBTEST(test_hip_simple_argmax<ColMajor>());
- CALL_SUBTEST(test_hip_argmax_dim<RowMajor>());
- CALL_SUBTEST(test_hip_argmax_dim<ColMajor>());
- CALL_SUBTEST(test_hip_argmin_dim<RowMajor>());
- CALL_SUBTEST(test_hip_argmin_dim<ColMajor>());
-}
diff --git a/unsupported/test/cxx11_tensor_cast_float16_hip.cu b/unsupported/test/cxx11_tensor_cast_float16_hip.cu
deleted file mode 100644
index bf6a49df4..000000000
--- a/unsupported/test/cxx11_tensor_cast_float16_hip.cu
+++ /dev/null
@@ -1,79 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#define EIGEN_TEST_NO_LONGDOUBLE
-#define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_cast_float16_hip
-#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
-#define EIGEN_USE_GPU
-
-#include "main.h"
-#include <unsupported/Eigen/CXX11/Tensor>
-
-using Eigen::Tensor;
-
-void test_hip_conversion() {
- Eigen::HipStreamDevice stream;
- Eigen::GpuDevice gpu_device(&stream);
- int num_elem = 101;
-
- Tensor<float, 1> floats(num_elem);
- floats.setRandom();
-
- float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
- Eigen::half* d_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
- float* d_conv = (float*)gpu_device.allocate(num_elem * sizeof(float));
-
- Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
- d_float, num_elem);
- Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_half(
- d_half, num_elem);
- Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_conv(
- d_conv, num_elem);
-
- gpu_device.memcpyHostToDevice(d_float, floats.data(), num_elem*sizeof(float));
-
- gpu_half.device(gpu_device) = gpu_float.cast<Eigen::half>();
- gpu_conv.device(gpu_device) = gpu_half.cast<float>();
-
- Tensor<float, 1> initial(num_elem);
- Tensor<float, 1> final(num_elem);
- gpu_device.memcpyDeviceToHost(initial.data(), d_float, num_elem*sizeof(float));
- gpu_device.memcpyDeviceToHost(final.data(), d_conv, num_elem*sizeof(float));
- gpu_device.synchronize();
-
- for (int i = 0; i < num_elem; ++i) {
- VERIFY_IS_APPROX(initial(i), final(i));
- }
-
- gpu_device.deallocate(d_float);
- gpu_device.deallocate(d_half);
- gpu_device.deallocate(d_conv);
-}
-
-
-void test_fallback_conversion() {
- int num_elem = 101;
- Tensor<float, 1> floats(num_elem);
- floats.setRandom();
-
- Eigen::Tensor<Eigen::half, 1> halfs = floats.cast<Eigen::half>();
- Eigen::Tensor<float, 1> conv = halfs.cast<float>();
-
- for (int i = 0; i < num_elem; ++i) {
- VERIFY_IS_APPROX(floats(i), conv(i));
- }
-}
-
-
-void test_cxx11_tensor_cast_float16_hip()
-{
- CALL_SUBTEST(test_hip_conversion());
- CALL_SUBTEST(test_fallback_conversion());
-}
diff --git a/unsupported/test/cxx11_tensor_contract_hip.cu b/unsupported/test/cxx11_tensor_contract_hip.cu
deleted file mode 100644
index 652af0ab0..000000000
--- a/unsupported/test/cxx11_tensor_contract_hip.cu
+++ /dev/null
@@ -1,215 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
-// Copyright (C) 2014 Navdeep Jaitly <ndjaitly@google.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#define EIGEN_TEST_NO_LONGDOUBLE
-#define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_hip
-#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
-#define EIGEN_USE_GPU
-
-#include "main.h"
-#include <unsupported/Eigen/CXX11/Tensor>
-
-
-using Eigen::Tensor;
-typedef Tensor<float, 1>::DimensionPair DimPair;
-
-template<int DataLayout>
-void test_hip_contraction(int m_size, int k_size, int n_size)
-{
- std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl;
- // with these dimensions, the output has 300 * 140 elements, which is
- // more than 30 * 1024, which is the number of threads in blocks on
- // a 15 SM GK110 GPU
- Tensor<float, 2, DataLayout> t_left(m_size, k_size);
- Tensor<float, 2, DataLayout> t_right(k_size, n_size);
- Tensor<float, 2, DataLayout> t_result(m_size, n_size);
- Tensor<float, 2, DataLayout> t_result_gpu(m_size, n_size);
- Eigen::array<DimPair, 1> dims(DimPair(1, 0));
-
- t_left.setRandom();
- t_right.setRandom();
-
- std::size_t t_left_bytes = t_left.size() * sizeof(float);
- std::size_t t_right_bytes = t_right.size() * sizeof(float);
- std::size_t t_result_bytes = t_result.size() * sizeof(float);
-
- float* d_t_left;
- float* d_t_right;
- float* d_t_result;
-
- hipMalloc((void**)(&d_t_left), t_left_bytes);
- hipMalloc((void**)(&d_t_right), t_right_bytes);
- hipMalloc((void**)(&d_t_result), t_result_bytes);
-
- hipMemcpy(d_t_left, t_left.data(), t_left_bytes, hipMemcpyHostToDevice);
- hipMemcpy(d_t_right, t_right.data(), t_right_bytes, hipMemcpyHostToDevice);
-
- Eigen::HipStreamDevice stream;
- Eigen::GpuDevice gpu_device(&stream);
-
- Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
- gpu_t_left(d_t_left, Eigen::array<int, 2>(m_size, k_size));
- Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
- gpu_t_right(d_t_right, Eigen::array<int, 2>(k_size, n_size));
- Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
- gpu_t_result(d_t_result, Eigen::array<int, 2>(m_size, n_size));
-
-
- gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims);
- t_result = t_left.contract(t_right, dims);
-
- hipMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, hipMemcpyDeviceToHost);
- for (DenseIndex i = 0; i < t_result.size(); i++) {
- if (fabs(t_result(i) - t_result_gpu(i)) < 1e-4f) {
- continue;
- }
- if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), 1e-4f)) {
- continue;
- }
- std::cout << "mismatch detected at index " << i << ": " << t_result(i)
- << " vs " << t_result_gpu(i) << std::endl;
- assert(false);
- }
-
- hipFree((void*)d_t_left);
- hipFree((void*)d_t_right);
- hipFree((void*)d_t_result);
-}
-
-
-template<int DataLayout>
-void test_scalar(int m_size, int k_size, int n_size)
-{
- std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl;
- // with these dimensions, the output has 300 * 140 elements, which is
- // more than 30 * 1024, which is the number of threads in blocks on
- // a 15 SM GK110 GPU
- Tensor<float, 2, DataLayout> t_left(m_size, k_size);
- Tensor<float, 2, DataLayout> t_right(k_size, n_size);
- Tensor<float, 0, DataLayout> t_result;
- Tensor<float, 0, DataLayout> t_result_gpu;
- Eigen::array<DimPair, 2> dims(DimPair(0, 0), DimPair(1, 1));
-
- t_left.setRandom();
- t_right.setRandom();
-
- std::size_t t_left_bytes = t_left.size() * sizeof(float);
- std::size_t t_right_bytes = t_right.size() * sizeof(float);
- std::size_t t_result_bytes = sizeof(float);
-
- float* d_t_left;
- float* d_t_right;
- float* d_t_result;
-
- hipMalloc((void**)(&d_t_left), t_left_bytes);
- hipMalloc((void**)(&d_t_right), t_right_bytes);
- hipMalloc((void**)(&d_t_result), t_result_bytes);
-
- hipMemcpy(d_t_left, t_left.data(), t_left_bytes, hipMemcpyHostToDevice);
- hipMemcpy(d_t_right, t_right.data(), t_right_bytes, hipMemcpyHostToDevice);
-
- Eigen::HipStreamDevice stream;
- Eigen::GpuDevice gpu_device(&stream);
-
- Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
- gpu_t_left(d_t_left, m_size, k_size);
- Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
- gpu_t_right(d_t_right, k_size, n_size);
- Eigen::TensorMap<Eigen::Tensor<float, 0, DataLayout> >
- gpu_t_result(d_t_result);
-
- gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims);
- t_result = t_left.contract(t_right, dims);
-
- hipMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, hipMemcpyDeviceToHost);
- if (fabs(t_result() - t_result_gpu()) > 1e-4f &&
- !Eigen::internal::isApprox(t_result(), t_result_gpu(), 1e-4f)) {
- std::cout << "mismatch detected: " << t_result()
- << " vs " << t_result_gpu() << std::endl;
- assert(false);
- }
-
- hipFree((void*)d_t_left);
- hipFree((void*)d_t_right);
- hipFree((void*)d_t_result);
-}
-
-
-template<int DataLayout>
-void test_hip_contraction_m() {
- for (int k = 32; k < 256; k++) {
- test_hip_contraction<ColMajor>(k, 128, 128);
- test_hip_contraction<RowMajor>(k, 128, 128);
- }
-}
-
-template<int DataLayout>
-void test_hip_contraction_k() {
- for (int k = 32; k < 256; k++) {
- test_hip_contraction<ColMajor>(128, k, 128);
- test_hip_contraction<RowMajor>(128, k, 128);
- }
-}
-
-template<int DataLayout>
-void test_hip_contraction_n() {
- for (int k = 32; k < 256; k++) {
- test_hip_contraction<ColMajor>(128, 128, k);
- test_hip_contraction<RowMajor>(128, 128, k);
- }
-}
-
-
-template<int DataLayout>
-void test_hip_contraction_sizes() {
- int m_sizes[] = { 31, 39, 63, 64, 65,
- 127, 129, 255, 257 , 511,
- 512, 513, 1023, 1024, 1025};
-
- int n_sizes[] = { 31, 39, 63, 64, 65,
- 127, 129, 255, 257, 511,
- 512, 513, 1023, 1024, 1025};
-
- int k_sizes[] = { 31, 39, 63, 64, 65,
- 95, 96, 127, 129, 255,
- 257, 511, 512, 513, 1023,
- 1024, 1025};
-
- for (int i = 0; i < 15; i++) {
- for (int j = 0; j < 15; j++) {
- for (int k = 0; k < 17; k++) {
- test_hip_contraction<DataLayout>(m_sizes[i], n_sizes[j], k_sizes[k]);
- }
- }
- }
-}
-
-void test_cxx11_tensor_hip()
-{
- CALL_SUBTEST(test_hip_contraction<ColMajor>(128, 128, 128));
- CALL_SUBTEST(test_hip_contraction<RowMajor>(128, 128, 128));
-
- CALL_SUBTEST(test_scalar<ColMajor>(128, 128, 128));
- CALL_SUBTEST(test_scalar<RowMajor>(128, 128, 128));
-
- CALL_SUBTEST(test_hip_contraction_m<ColMajor>());
- CALL_SUBTEST(test_hip_contraction_m<RowMajor>());
-
- CALL_SUBTEST(test_hip_contraction_k<ColMajor>());
- CALL_SUBTEST(test_hip_contraction_k<RowMajor>());
-
- CALL_SUBTEST(test_hip_contraction_n<ColMajor>());
- CALL_SUBTEST(test_hip_contraction_n<RowMajor>());
-
- // Commenting out these tests due to long runtimes
- // CALL_SUBTEST(test_hip_contraction_sizes<ColMajor>());
- // CALL_SUBTEST(test_hip_contraction_sizes<RowMajor>());
-}
diff --git a/unsupported/test/cxx11_tensor_device_hip.cu b/unsupported/test/cxx11_tensor_device_hip.cu
deleted file mode 100644
index b98c481ff..000000000
--- a/unsupported/test/cxx11_tensor_device_hip.cu
+++ /dev/null
@@ -1,389 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#define EIGEN_TEST_NO_LONGDOUBLE
-#define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_device
-#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
-#define EIGEN_USE_GPU
-
-#include "main.h"
-#include <unsupported/Eigen/CXX11/Tensor>
-
-using Eigen::Tensor;
-using Eigen::RowMajor;
-
-// Context for evaluation on cpu
-struct CPUContext {
- CPUContext(const Eigen::Tensor<float, 3>& in1, Eigen::Tensor<float, 3>& in2, Eigen::Tensor<float, 3>& out) : in1_(in1), in2_(in2), out_(out), kernel_1d_(2), kernel_2d_(2,2), kernel_3d_(2,2,2) {
- kernel_1d_(0) = 3.14f;
- kernel_1d_(1) = 2.7f;
-
- kernel_2d_(0,0) = 3.14f;
- kernel_2d_(1,0) = 2.7f;
- kernel_2d_(0,1) = 0.2f;
- kernel_2d_(1,1) = 7.0f;
-
- kernel_3d_(0,0,0) = 3.14f;
- kernel_3d_(0,1,0) = 2.7f;
- kernel_3d_(0,0,1) = 0.2f;
- kernel_3d_(0,1,1) = 7.0f;
- kernel_3d_(1,0,0) = -1.0f;
- kernel_3d_(1,1,0) = -0.3f;
- kernel_3d_(1,0,1) = -0.7f;
- kernel_3d_(1,1,1) = -0.5f;
- }
-
- const Eigen::DefaultDevice& device() const { return cpu_device_; }
-
- const Eigen::Tensor<float, 3>& in1() const { return in1_; }
- const Eigen::Tensor<float, 3>& in2() const { return in2_; }
- Eigen::Tensor<float, 3>& out() { return out_; }
- const Eigen::Tensor<float, 1>& kernel1d() const { return kernel_1d_; }
- const Eigen::Tensor<float, 2>& kernel2d() const { return kernel_2d_; }
- const Eigen::Tensor<float, 3>& kernel3d() const { return kernel_3d_; }
-
- private:
- const Eigen::Tensor<float, 3>& in1_;
- const Eigen::Tensor<float, 3>& in2_;
- Eigen::Tensor<float, 3>& out_;
-
- Eigen::Tensor<float, 1> kernel_1d_;
- Eigen::Tensor<float, 2> kernel_2d_;
- Eigen::Tensor<float, 3> kernel_3d_;
-
- Eigen::DefaultDevice cpu_device_;
-};
-
-
-// Context for evaluation on GPU
-struct GPUContext {
- GPUContext(const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in1, Eigen::TensorMap<Eigen::Tensor<float, 3> >& in2, Eigen::TensorMap<Eigen::Tensor<float, 3> >& out) : in1_(in1), in2_(in2), out_(out), gpu_device_(&stream_) {
- assert(hipMalloc((void**)(&kernel_1d_), 2*sizeof(float)) == hipSuccess);
- float kernel_1d_val[] = {3.14f, 2.7f};
- assert(hipMemcpy(kernel_1d_, kernel_1d_val, 2*sizeof(float), hipMemcpyHostToDevice) == hipSuccess);
-
- assert(hipMalloc((void**)(&kernel_2d_), 4*sizeof(float)) == hipSuccess);
- float kernel_2d_val[] = {3.14f, 2.7f, 0.2f, 7.0f};
- assert(hipMemcpy(kernel_2d_, kernel_2d_val, 4*sizeof(float), hipMemcpyHostToDevice) == hipSuccess);
-
- assert(hipMalloc((void**)(&kernel_3d_), 8*sizeof(float)) == hipSuccess);
- float kernel_3d_val[] = {3.14f, -1.0f, 2.7f, -0.3f, 0.2f, -0.7f, 7.0f, -0.5f};
- assert(hipMemcpy(kernel_3d_, kernel_3d_val, 8*sizeof(float), hipMemcpyHostToDevice) == hipSuccess);
- }
- ~GPUContext() {
- assert(hipFree(kernel_1d_) == hipSuccess);
- assert(hipFree(kernel_2d_) == hipSuccess);
- assert(hipFree(kernel_3d_) == hipSuccess);
- }
-
- const Eigen::GpuDevice& device() const { return gpu_device_; }
-
- const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in1() const { return in1_; }
- const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in2() const { return in2_; }
- Eigen::TensorMap<Eigen::Tensor<float, 3> >& out() { return out_; }
- Eigen::TensorMap<Eigen::Tensor<float, 1> > kernel1d() const { return Eigen::TensorMap<Eigen::Tensor<float, 1> >(kernel_1d_, 2); }
- Eigen::TensorMap<Eigen::Tensor<float, 2> > kernel2d() const { return Eigen::TensorMap<Eigen::Tensor<float, 2> >(kernel_2d_, 2, 2); }
- Eigen::TensorMap<Eigen::Tensor<float, 3> > kernel3d() const { return Eigen::TensorMap<Eigen::Tensor<float, 3> >(kernel_3d_, 2, 2, 2); }
-
- private:
- const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in1_;
- const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in2_;
- Eigen::TensorMap<Eigen::Tensor<float, 3> >& out_;
-
- float* kernel_1d_;
- float* kernel_2d_;
- float* kernel_3d_;
-
- Eigen::HipStreamDevice stream_;
- Eigen::GpuDevice gpu_device_;
-};
-
-
-// The actual expression to evaluate
-template <typename Context>
-void test_contextual_eval(Context* context)
-{
- context->out().device(context->device()) = context->in1() + context->in2() * 3.14f + context->in1().constant(2.718f);
-}
-
-template <typename Context>
-void test_forced_contextual_eval(Context* context)
-{
- context->out().device(context->device()) = (context->in1() + context->in2()).eval() * 3.14f + context->in1().constant(2.718f);
-}
-
-template <typename Context>
-void test_compound_assignment(Context* context)
-{
- context->out().device(context->device()) = context->in1().constant(2.718f);
- context->out().device(context->device()) += context->in1() + context->in2() * 3.14f;
-}
-
-
-template <typename Context>
-void test_contraction(Context* context)
-{
- Eigen::array<std::pair<int, int>, 2> dims;
- dims[0] = std::make_pair(1, 1);
- dims[1] = std::make_pair(2, 2);
-
- Eigen::array<int, 2> shape(40, 50*70);
-
- Eigen::DSizes<int, 2> indices(0,0);
- Eigen::DSizes<int, 2> sizes(40,40);
-
- context->out().reshape(shape).slice(indices, sizes).device(context->device()) = context->in1().contract(context->in2(), dims);
-}
-
-
-template <typename Context>
-void test_1d_convolution(Context* context)
-{
- Eigen::DSizes<int, 3> indices(0,0,0);
- Eigen::DSizes<int, 3> sizes(40,49,70);
-
- Eigen::array<int, 1> dims(1);
- context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel1d(), dims);
-}
-
-template <typename Context>
-void test_2d_convolution(Context* context)
-{
- Eigen::DSizes<int, 3> indices(0,0,0);
- Eigen::DSizes<int, 3> sizes(40,49,69);
-
- Eigen::array<int, 2> dims(1,2);
- context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel2d(), dims);
-}
-
-template <typename Context>
-void test_3d_convolution(Context* context)
-{
- Eigen::DSizes<int, 3> indices(0,0,0);
- Eigen::DSizes<int, 3> sizes(39,49,69);
-
- Eigen::array<int, 3> dims(0,1,2);
- context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel3d(), dims);
-}
-
-
-void test_cpu() {
- Eigen::Tensor<float, 3> in1(40,50,70);
- Eigen::Tensor<float, 3> in2(40,50,70);
- Eigen::Tensor<float, 3> out(40,50,70);
-
- in1 = in1.random() + in1.constant(10.0f);
- in2 = in2.random() + in2.constant(10.0f);
-
- CPUContext context(in1, in2, out);
- test_contextual_eval(&context);
- for (int i = 0; i < 40; ++i) {
- for (int j = 0; j < 50; ++j) {
- for (int k = 0; k < 70; ++k) {
- VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f);
- }
- }
- }
-
- test_forced_contextual_eval(&context);
- for (int i = 0; i < 40; ++i) {
- for (int j = 0; j < 50; ++j) {
- for (int k = 0; k < 70; ++k) {
- VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) + in2(i,j,k)) * 3.14f + 2.718f);
- }
- }
- }
-
- test_compound_assignment(&context);
- for (int i = 0; i < 40; ++i) {
- for (int j = 0; j < 50; ++j) {
- for (int k = 0; k < 70; ++k) {
- VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f);
- }
- }
- }
-
- test_contraction(&context);
- for (int i = 0; i < 40; ++i) {
- for (int j = 0; j < 40; ++j) {
- const float result = out(i,j,0);
- float expected = 0;
- for (int k = 0; k < 50; ++k) {
- for (int l = 0; l < 70; ++l) {
- expected += in1(i, k, l) * in2(j, k, l);
- }
- }
- VERIFY_IS_APPROX(expected, result);
- }
- }
-
- test_1d_convolution(&context);
- for (int i = 0; i < 40; ++i) {
- for (int j = 0; j < 49; ++j) {
- for (int k = 0; k < 70; ++k) {
- VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f));
- }
- }
- }
-
- test_2d_convolution(&context);
- for (int i = 0; i < 40; ++i) {
- for (int j = 0; j < 49; ++j) {
- for (int k = 0; k < 69; ++k) {
- const float result = out(i,j,k);
- const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f) +
- (in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f);
- if (fabs(expected) < 1e-4f && fabs(result) < 1e-4f) {
- continue;
- }
- VERIFY_IS_APPROX(expected, result);
- }
- }
- }
-
- test_3d_convolution(&context);
- for (int i = 0; i < 39; ++i) {
- for (int j = 0; j < 49; ++j) {
- for (int k = 0; k < 69; ++k) {
- const float result = out(i,j,k);
- const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f +
- in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f) +
- (in1(i+1,j,k) * -1.0f + in1(i+1,j+1,k) * -0.3f +
- in1(i+1,j,k+1) * -0.7f + in1(i+1,j+1,k+1) * -0.5f);
- if (fabs(expected) < 1e-4f && fabs(result) < 1e-4f) {
- continue;
- }
- VERIFY_IS_APPROX(expected, result);
- }
- }
- }
-}
-
-void test_gpu() {
- Eigen::Tensor<float, 3> in1(40,50,70);
- Eigen::Tensor<float, 3> in2(40,50,70);
- Eigen::Tensor<float, 3> out(40,50,70);
- in1 = in1.random() + in1.constant(10.0f);
- in2 = in2.random() + in2.constant(10.0f);
-
- std::size_t in1_bytes = in1.size() * sizeof(float);
- std::size_t in2_bytes = in2.size() * sizeof(float);
- std::size_t out_bytes = out.size() * sizeof(float);
-
- float* d_in1;
- float* d_in2;
- float* d_out;
- hipMalloc((void**)(&d_in1), in1_bytes);
- hipMalloc((void**)(&d_in2), in2_bytes);
- hipMalloc((void**)(&d_out), out_bytes);
-
- hipMemcpy(d_in1, in1.data(), in1_bytes, hipMemcpyHostToDevice);
- hipMemcpy(d_in2, in2.data(), in2_bytes, hipMemcpyHostToDevice);
-
- Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in1(d_in1, 40,50,70);
- Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in2(d_in2, 40,50,70);
- Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_out(d_out, 40,50,70);
-
- GPUContext context(gpu_in1, gpu_in2, gpu_out);
- test_contextual_eval(&context);
- assert(hipMemcpy(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost) == hipSuccess);
- for (int i = 0; i < 40; ++i) {
- for (int j = 0; j < 50; ++j) {
- for (int k = 0; k < 70; ++k) {
- VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f);
- }
- }
- }
-
- test_forced_contextual_eval(&context);
- assert(hipMemcpy(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost) == hipSuccess);
- for (int i = 0; i < 40; ++i) {
- for (int j = 0; j < 50; ++j) {
- for (int k = 0; k < 70; ++k) {
- VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) + in2(i,j,k)) * 3.14f + 2.718f);
- }
- }
- }
-
- test_compound_assignment(&context);
- assert(hipMemcpy(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost) == hipSuccess);
- for (int i = 0; i < 40; ++i) {
- for (int j = 0; j < 50; ++j) {
- for (int k = 0; k < 70; ++k) {
- VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f);
- }
- }
- }
-
- test_contraction(&context);
- assert(hipMemcpy(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost) == hipSuccess);
- for (int i = 0; i < 40; ++i) {
- for (int j = 0; j < 40; ++j) {
- const float result = out(i,j,0);
- float expected = 0;
- for (int k = 0; k < 50; ++k) {
- for (int l = 0; l < 70; ++l) {
- expected += in1(i, k, l) * in2(j, k, l);
- }
- }
- VERIFY_IS_APPROX(expected, result);
- }
- }
-
- test_1d_convolution(&context);
- assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, context.device().stream()) == hipSuccess);
- assert(hipStreamSynchronize(context.device().stream()) == hipSuccess);
- for (int i = 0; i < 40; ++i) {
- for (int j = 0; j < 49; ++j) {
- for (int k = 0; k < 70; ++k) {
- VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f));
- }
- }
- }
-
- test_2d_convolution(&context);
- assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, context.device().stream()) == hipSuccess);
- assert(hipStreamSynchronize(context.device().stream()) == hipSuccess);
- for (int i = 0; i < 40; ++i) {
- for (int j = 0; j < 49; ++j) {
- for (int k = 0; k < 69; ++k) {
- const float result = out(i,j,k);
- const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f +
- in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f);
- VERIFY_IS_APPROX(expected, result);
- }
- }
- }
-
- /*
- test_3d_convolution(&context);
- assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, context.device().stream()) == hipSuccess);
- assert(hipStreamSynchronize(context.device().stream()) == hipSuccess);
- for (int i = 0; i < 39; ++i) {
- for (int j = 0; j < 49; ++j) {
- for (int k = 0; k < 69; ++k) {
- const float result = out(i,j,k);
- const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f +
- in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f +
- in1(i+1,j,k) * -1.0f + in1(i+1,j+1,k) * -0.3f +
- in1(i+1,j,k+1) * -0.7f + in1(i+1,j+1,k+1) * -0.5f);
- VERIFY_IS_APPROX(expected, result);
- }
- }
- }
- */
-}
-
-
-void test_cxx11_tensor_device()
-{
- CALL_SUBTEST(test_cpu());
- CALL_SUBTEST(test_gpu());
-}
diff --git a/unsupported/test/cxx11_tensor_hip.cu b/unsupported/test/cxx11_tensor_hip.cu
deleted file mode 100644
index b28840267..000000000
--- a/unsupported/test/cxx11_tensor_hip.cu
+++ /dev/null
@@ -1,1295 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#define EIGEN_TEST_NO_LONGDOUBLE
-#define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_hip
-#define EIGEN_USE_GPU
-
-#ifdef __NVCC__
-#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
-#include <cuda_fp16.h>
-#endif
-#endif
-#include "main.h"
-#include <unsupported/Eigen/CXX11/Tensor>
-
-using Eigen::Tensor;
-
-void test_hip_nullary() {
- Tensor<float, 1, 0, int> in1(2);
- Tensor<float, 1, 0, int> in2(2);
- in1.setRandom();
- in2.setRandom();
-
- std::size_t tensor_bytes = in1.size() * sizeof(float);
-
- float* d_in1;
- float* d_in2;
- hipMalloc((void**)(&d_in1), tensor_bytes);
- hipMalloc((void**)(&d_in2), tensor_bytes);
- hipMemcpy(d_in1, in1.data(), tensor_bytes, hipMemcpyHostToDevice);
- hipMemcpy(d_in2, in2.data(), tensor_bytes, hipMemcpyHostToDevice);
-
- Eigen::HipStreamDevice stream;
- Eigen::GpuDevice gpu_device(&stream);
-
- Eigen::TensorMap<Eigen::Tensor<float, 1, 0, int>, Eigen::Aligned> gpu_in1(
- d_in1, 2);
- Eigen::TensorMap<Eigen::Tensor<float, 1, 0, int>, Eigen::Aligned> gpu_in2(
- d_in2, 2);
-
- gpu_in1.device(gpu_device) = gpu_in1.constant(3.14f);
- gpu_in2.device(gpu_device) = gpu_in2.random();
-
- Tensor<float, 1, 0, int> new1(2);
- Tensor<float, 1, 0, int> new2(2);
-
- assert(hipMemcpyAsync(new1.data(), d_in1, tensor_bytes, hipMemcpyDeviceToHost,
- gpu_device.stream()) == hipSuccess);
- assert(hipMemcpyAsync(new2.data(), d_in2, tensor_bytes, hipMemcpyDeviceToHost,
- gpu_device.stream()) == hipSuccess);
-
- assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
-
- for (int i = 0; i < 2; ++i) {
- VERIFY_IS_APPROX(new1(i), 3.14f);
- VERIFY_IS_NOT_EQUAL(new2(i), in2(i));
- }
-
- hipFree(d_in1);
- hipFree(d_in2);
-}
-
-void test_hip_elementwise_small() {
- Tensor<float, 1> in1(Eigen::array<Eigen::DenseIndex, 1>{2});
- Tensor<float, 1> in2(Eigen::array<Eigen::DenseIndex, 1>{2});
- Tensor<float, 1> out(Eigen::array<Eigen::DenseIndex, 1>{2});
- in1.setRandom();
- in2.setRandom();
-
- std::size_t in1_bytes = in1.size() * sizeof(float);
- std::size_t in2_bytes = in2.size() * sizeof(float);
- std::size_t out_bytes = out.size() * sizeof(float);
-
- float* d_in1;
- float* d_in2;
- float* d_out;
- hipMalloc((void**)(&d_in1), in1_bytes);
- hipMalloc((void**)(&d_in2), in2_bytes);
- hipMalloc((void**)(&d_out), out_bytes);
-
- hipMemcpy(d_in1, in1.data(), in1_bytes, hipMemcpyHostToDevice);
- hipMemcpy(d_in2, in2.data(), in2_bytes, hipMemcpyHostToDevice);
-
- Eigen::HipStreamDevice stream;
- Eigen::GpuDevice gpu_device(&stream);
-
- Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in1(
- d_in1, Eigen::array<Eigen::DenseIndex, 1>{2});
- Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in2(
- d_in2, Eigen::array<Eigen::DenseIndex, 1>{2});
- Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_out(
- d_out, Eigen::array<Eigen::DenseIndex, 1>{2});
-
- gpu_out.device(gpu_device) = gpu_in1 + gpu_in2;
-
- assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost,
- gpu_device.stream()) == hipSuccess);
- assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
-
- for (int i = 0; i < 2; ++i) {
- VERIFY_IS_APPROX(
- out(Eigen::array<Eigen::DenseIndex, 1>{i}),
- in1(Eigen::array<Eigen::DenseIndex, 1>{i}) + in2(Eigen::array<Eigen::DenseIndex, 1>{i}));
- }
-
- hipFree(d_in1);
- hipFree(d_in2);
- hipFree(d_out);
-}
-
-void test_hip_elementwise()
-{
- Tensor<float, 3> in1(Eigen::array<Eigen::DenseIndex, 3>{72,53,97});
- Tensor<float, 3> in2(Eigen::array<Eigen::DenseIndex, 3>{72,53,97});
- Tensor<float, 3> in3(Eigen::array<Eigen::DenseIndex, 3>{72,53,97});
- Tensor<float, 3> out(Eigen::array<Eigen::DenseIndex, 3>{72,53,97});
- in1.setRandom();
- in2.setRandom();
- in3.setRandom();
-
- std::size_t in1_bytes = in1.size() * sizeof(float);
- std::size_t in2_bytes = in2.size() * sizeof(float);
- std::size_t in3_bytes = in3.size() * sizeof(float);
- std::size_t out_bytes = out.size() * sizeof(float);
-
- float* d_in1;
- float* d_in2;
- float* d_in3;
- float* d_out;
- hipMalloc((void**)(&d_in1), in1_bytes);
- hipMalloc((void**)(&d_in2), in2_bytes);
- hipMalloc((void**)(&d_in3), in3_bytes);
- hipMalloc((void**)(&d_out), out_bytes);
-
- hipMemcpy(d_in1, in1.data(), in1_bytes, hipMemcpyHostToDevice);
- hipMemcpy(d_in2, in2.data(), in2_bytes, hipMemcpyHostToDevice);
- hipMemcpy(d_in3, in3.data(), in3_bytes, hipMemcpyHostToDevice);
-
- Eigen::HipStreamDevice stream;
- Eigen::GpuDevice gpu_device(&stream);
-
- Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in1(d_in1, Eigen::array<Eigen::DenseIndex, 3>{72,53,97});
- Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in2(d_in2, Eigen::array<Eigen::DenseIndex, 3>{72,53,97});
- Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in3(d_in3, Eigen::array<Eigen::DenseIndex, 3>{72,53,97});
- Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_out(d_out, Eigen::array<Eigen::DenseIndex, 3>{72,53,97});
-
- gpu_out.device(gpu_device) = gpu_in1 + gpu_in2 * gpu_in3;
-
- assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
- assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
-
- for (int i = 0; i < 72; ++i) {
- for (int j = 0; j < 53; ++j) {
- for (int k = 0; k < 97; ++k) {
- VERIFY_IS_APPROX(out(Eigen::array<Eigen::DenseIndex, 3>{i,j,k}), in1(Eigen::array<Eigen::DenseIndex, 3>{i,j,k}) + in2(Eigen::array<Eigen::DenseIndex, 3>{i,j,k}) * in3(Eigen::array<Eigen::DenseIndex, 3>{i,j,k}));
- }
- }
- }
-
- hipFree(d_in1);
- hipFree(d_in2);
- hipFree(d_in3);
- hipFree(d_out);
-}
-
-void test_hip_props() {
- Tensor<float, 1> in1(200);
- Tensor<bool, 1> out(200);
- in1.setRandom();
-
- std::size_t in1_bytes = in1.size() * sizeof(float);
- std::size_t out_bytes = out.size() * sizeof(bool);
-
- float* d_in1;
- bool* d_out;
- hipMalloc((void**)(&d_in1), in1_bytes);
- hipMalloc((void**)(&d_out), out_bytes);
-
- hipMemcpy(d_in1, in1.data(), in1_bytes, hipMemcpyHostToDevice);
-
- Eigen::HipStreamDevice stream;
- Eigen::GpuDevice gpu_device(&stream);
-
- Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in1(
- d_in1, 200);
- Eigen::TensorMap<Eigen::Tensor<bool, 1>, Eigen::Aligned> gpu_out(
- d_out, 200);
-
- gpu_out.device(gpu_device) = (gpu_in1.isnan)();
-
- assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost,
- gpu_device.stream()) == hipSuccess);
- assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
-
- for (int i = 0; i < 200; ++i) {
- VERIFY_IS_EQUAL(out(i), (std::isnan)(in1(i)));
- }
-
- hipFree(d_in1);
- hipFree(d_out);
-}
-
-void test_hip_reduction()
-{
- Tensor<float, 4> in1(72,53,97,113);
- Tensor<float, 2> out(72,97);
- in1.setRandom();
-
- std::size_t in1_bytes = in1.size() * sizeof(float);
- std::size_t out_bytes = out.size() * sizeof(float);
-
- float* d_in1;
- float* d_out;
- hipMalloc((void**)(&d_in1), in1_bytes);
- hipMalloc((void**)(&d_out), out_bytes);
-
- hipMemcpy(d_in1, in1.data(), in1_bytes, hipMemcpyHostToDevice);
-
- Eigen::HipStreamDevice stream;
- Eigen::GpuDevice gpu_device(&stream);
-
- Eigen::TensorMap<Eigen::Tensor<float, 4> > gpu_in1(d_in1, 72,53,97,113);
- Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_out(d_out, 72,97);
-
- array<Eigen::DenseIndex, 2> reduction_axis;
- reduction_axis[0] = 1;
- reduction_axis[1] = 3;
-
- gpu_out.device(gpu_device) = gpu_in1.maximum(reduction_axis);
-
- assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
- assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
-
- for (int i = 0; i < 72; ++i) {
- for (int j = 0; j < 97; ++j) {
- float expected = 0;
- for (int k = 0; k < 53; ++k) {
- for (int l = 0; l < 113; ++l) {
- expected =
- std::max<float>(expected, in1(i, k, j, l));
- }
- }
- VERIFY_IS_APPROX(out(i,j), expected);
- }
- }
-
- hipFree(d_in1);
- hipFree(d_out);
-}
-
-template<int DataLayout>
-void test_hip_contraction()
-{
- // with these dimensions, the output has 300 * 140 elements, which is
- // more than 30 * 1024, which is the number of threads in blocks on
- // a 15 SM GK110 GPU
- Tensor<float, 4, DataLayout> t_left(6, 50, 3, 31);
- Tensor<float, 5, DataLayout> t_right(Eigen::array<Eigen::DenseIndex, 5>{3, 31, 7, 20, 1});
- Tensor<float, 5, DataLayout> t_result(Eigen::array<Eigen::DenseIndex, 5>{6, 50, 7, 20, 1});
-
- t_left.setRandom();
- t_right.setRandom();
-
- std::size_t t_left_bytes = t_left.size() * sizeof(float);
- std::size_t t_right_bytes = t_right.size() * sizeof(float);
- std::size_t t_result_bytes = t_result.size() * sizeof(float);
-
- float* d_t_left;
- float* d_t_right;
- float* d_t_result;
-
- hipMalloc((void**)(&d_t_left), t_left_bytes);
- hipMalloc((void**)(&d_t_right), t_right_bytes);
- hipMalloc((void**)(&d_t_result), t_result_bytes);
-
- hipMemcpy(d_t_left, t_left.data(), t_left_bytes, hipMemcpyHostToDevice);
- hipMemcpy(d_t_right, t_right.data(), t_right_bytes, hipMemcpyHostToDevice);
-
- Eigen::HipStreamDevice stream;
- Eigen::GpuDevice gpu_device(&stream);
-
- Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_t_left(d_t_left, 6, 50, 3, 31);
- Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_t_right(d_t_right, 3, 31, 7, 20, 1);
- Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_t_result(d_t_result, 6, 50, 7, 20, 1);
-
- typedef Eigen::Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> > MapXf;
- MapXf m_left(t_left.data(), 300, 93);
- MapXf m_right(t_right.data(), 93, 140);
- Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(300, 140);
-
- typedef Tensor<float, 1>::DimensionPair DimPair;
- Eigen::array<DimPair, 2> dims;
- dims[0] = DimPair(2, 0);
- dims[1] = DimPair(3, 1);
-
- m_result = m_left * m_right;
- gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims);
-
- hipMemcpy(t_result.data(), d_t_result, t_result_bytes, hipMemcpyDeviceToHost);
-
- for (DenseIndex i = 0; i < t_result.size(); i++) {
- if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4f) {
- std::cout << "mismatch detected at index " << i << ": " << t_result.data()[i] << " vs " << m_result.data()[i] << std::endl;
- assert(false);
- }
- }
-
- hipFree(d_t_left);
- hipFree(d_t_right);
- hipFree(d_t_result);
-}
-
-
-template<int DataLayout>
-void test_hip_convolution_1d()
-{
- Tensor<float, 4, DataLayout> input(74,37,11,137);
- Tensor<float, 1, DataLayout> kernel(4);
- Tensor<float, 4, DataLayout> out(74,34,11,137);
- input = input.constant(10.0f) + input.random();
- kernel = kernel.constant(7.0f) + kernel.random();
-
- std::size_t input_bytes = input.size() * sizeof(float);
- std::size_t kernel_bytes = kernel.size() * sizeof(float);
- std::size_t out_bytes = out.size() * sizeof(float);
-
- float* d_input;
- float* d_kernel;
- float* d_out;
- hipMalloc((void**)(&d_input), input_bytes);
- hipMalloc((void**)(&d_kernel), kernel_bytes);
- hipMalloc((void**)(&d_out), out_bytes);
-
- hipMemcpy(d_input, input.data(), input_bytes, hipMemcpyHostToDevice);
- hipMemcpy(d_kernel, kernel.data(), kernel_bytes, hipMemcpyHostToDevice);
-
- Eigen::HipStreamDevice stream;
- Eigen::GpuDevice gpu_device(&stream);
-
- Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_input(d_input, 74,37,11,137);
- Eigen::TensorMap<Eigen::Tensor<float, 1, DataLayout> > gpu_kernel(d_kernel, 4);
- Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_out(d_out, 74,34,11,137);
-
- Eigen::array<Eigen::DenseIndex, 1> dims{1};
- gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
-
- assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
- assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
-
- for (int i = 0; i < 74; ++i) {
- for (int j = 0; j < 34; ++j) {
- for (int k = 0; k < 11; ++k) {
- for (int l = 0; l < 137; ++l) {
- const float result = out(i,j,k,l);
- const float expected = input(i,j+0,k,l) * kernel(0) + input(i,j+1,k,l) * kernel(1) +
- input(i,j+2,k,l) * kernel(2) + input(i,j+3,k,l) * kernel(3);
- VERIFY_IS_APPROX(result, expected);
- }
- }
- }
- }
-
- hipFree(d_input);
- hipFree(d_kernel);
- hipFree(d_out);
-}
-
-void test_hip_convolution_inner_dim_col_major_1d()
-{
- Tensor<float, 4, ColMajor> input(74,9,11,7);
- Tensor<float, 1, ColMajor> kernel(4);
- Tensor<float, 4, ColMajor> out(71,9,11,7);
- input = input.constant(10.0f) + input.random();
- kernel = kernel.constant(7.0f) + kernel.random();
-
- std::size_t input_bytes = input.size() * sizeof(float);
- std::size_t kernel_bytes = kernel.size() * sizeof(float);
- std::size_t out_bytes = out.size() * sizeof(float);
-
- float* d_input;
- float* d_kernel;
- float* d_out;
- hipMalloc((void**)(&d_input), input_bytes);
- hipMalloc((void**)(&d_kernel), kernel_bytes);
- hipMalloc((void**)(&d_out), out_bytes);
-
- hipMemcpy(d_input, input.data(), input_bytes, hipMemcpyHostToDevice);
- hipMemcpy(d_kernel, kernel.data(), kernel_bytes, hipMemcpyHostToDevice);
-
- Eigen::HipStreamDevice stream;
- Eigen::GpuDevice gpu_device(&stream);
-
- Eigen::TensorMap<Eigen::Tensor<float, 4, ColMajor> > gpu_input(d_input,74,9,11,7);
- Eigen::TensorMap<Eigen::Tensor<float, 1, ColMajor> > gpu_kernel(d_kernel,4);
- Eigen::TensorMap<Eigen::Tensor<float, 4, ColMajor> > gpu_out(d_out,71,9,11,7);
-
- Eigen::array<Eigen::DenseIndex, 1> dims{0};
- gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
-
- assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
- assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
-
- for (int i = 0; i < 71; ++i) {
- for (int j = 0; j < 9; ++j) {
- for (int k = 0; k < 11; ++k) {
- for (int l = 0; l < 7; ++l) {
- const float result = out(i,j,k,l);
- const float expected = input(i+0,j,k,l) * kernel(0) + input(i+1,j,k,l) * kernel(1) +
- input(i+2,j,k,l) * kernel(2) + input(i+3,j,k,l) * kernel(3);
- VERIFY_IS_APPROX(result, expected);
- }
- }
- }
- }
-
- hipFree(d_input);
- hipFree(d_kernel);
- hipFree(d_out);
-}
-
-void test_hip_convolution_inner_dim_row_major_1d()
-{
- Tensor<float, 4, RowMajor> input(7,9,11,74);
- Tensor<float, 1, RowMajor> kernel(4);
- Tensor<float, 4, RowMajor> out(7,9,11,71);
- input = input.constant(10.0f) + input.random();
- kernel = kernel.constant(7.0f) + kernel.random();
-
- std::size_t input_bytes = input.size() * sizeof(float);
- std::size_t kernel_bytes = kernel.size() * sizeof(float);
- std::size_t out_bytes = out.size() * sizeof(float);
-
- float* d_input;
- float* d_kernel;
- float* d_out;
- hipMalloc((void**)(&d_input), input_bytes);
- hipMalloc((void**)(&d_kernel), kernel_bytes);
- hipMalloc((void**)(&d_out), out_bytes);
-
- hipMemcpy(d_input, input.data(), input_bytes, hipMemcpyHostToDevice);
- hipMemcpy(d_kernel, kernel.data(), kernel_bytes, hipMemcpyHostToDevice);
-
- Eigen::HipStreamDevice stream;
- Eigen::GpuDevice gpu_device(&stream);
-
- Eigen::TensorMap<Eigen::Tensor<float, 4, RowMajor> > gpu_input(d_input, 7,9,11,74);
- Eigen::TensorMap<Eigen::Tensor<float, 1, RowMajor> > gpu_kernel(d_kernel, 4);
- Eigen::TensorMap<Eigen::Tensor<float, 4, RowMajor> > gpu_out(d_out, 7,9,11,71);
-
- Eigen::array<Eigen::DenseIndex, 1> dims{3};
- gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
-
- assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
- assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
-
- for (int i = 0; i < 7; ++i) {
- for (int j = 0; j < 9; ++j) {
- for (int k = 0; k < 11; ++k) {
- for (int l = 0; l < 71; ++l) {
- const float result = out(i,j,k,l);
- const float expected = input(i,j,k,l+0) * kernel(0) + input(i,j,k,l+1) * kernel(1) +
- input(i,j,k,l+2) * kernel(2) + input(i,j,k,l+3) * kernel(3);
- VERIFY_IS_APPROX(result, expected);
- }
- }
- }
- }
-
- hipFree(d_input);
- hipFree(d_kernel);
- hipFree(d_out);
-}
-
-template<int DataLayout>
-void test_hip_convolution_2d()
-{
- Tensor<float, 4, DataLayout> input(74,37,11,137);
- Tensor<float, 2, DataLayout> kernel(3,4);
- Tensor<float, 4, DataLayout> out(74,35,8,137);
- input = input.constant(10.0f) + input.random();
- kernel = kernel.constant(7.0f) + kernel.random();
-
- std::size_t input_bytes = input.size() * sizeof(float);
- std::size_t kernel_bytes = kernel.size() * sizeof(float);
- std::size_t out_bytes = out.size() * sizeof(float);
-
- float* d_input;
- float* d_kernel;
- float* d_out;
- hipMalloc((void**)(&d_input), input_bytes);
- hipMalloc((void**)(&d_kernel), kernel_bytes);
- hipMalloc((void**)(&d_out), out_bytes);
-
- hipMemcpy(d_input, input.data(), input_bytes, hipMemcpyHostToDevice);
- hipMemcpy(d_kernel, kernel.data(), kernel_bytes, hipMemcpyHostToDevice);
-
- Eigen::HipStreamDevice stream;
- Eigen::GpuDevice gpu_device(&stream);
-
- Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_input(d_input,74,37,11,137);
- Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > gpu_kernel(d_kernel,3,4);
- Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_out(d_out,74,35,8,137);
-
- Eigen::array<Eigen::DenseIndex, 2> dims{1,2};
- gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
-
- assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
- assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
-
- for (int i = 0; i < 74; ++i) {
- for (int j = 0; j < 35; ++j) {
- for (int k = 0; k < 8; ++k) {
- for (int l = 0; l < 137; ++l) {
- const float result = out(i,j,k,l);
- const float expected = input(i,j+0,k+0,l) * kernel(0,0) +
- input(i,j+1,k+0,l) * kernel(1,0) +
- input(i,j+2,k+0,l) * kernel(2,0) +
- input(i,j+0,k+1,l) * kernel(0,1) +
- input(i,j+1,k+1,l) * kernel(1,1) +
- input(i,j+2,k+1,l) * kernel(2,1) +
- input(i,j+0,k+2,l) * kernel(0,2) +
- input(i,j+1,k+2,l) * kernel(1,2) +
- input(i,j+2,k+2,l) * kernel(2,2) +
- input(i,j+0,k+3,l) * kernel(0,3) +
- input(i,j+1,k+3,l) * kernel(1,3) +
- input(i,j+2,k+3,l) * kernel(2,3);
- VERIFY_IS_APPROX(result, expected);
- }
- }
- }
- }
-
- hipFree(d_input);
- hipFree(d_kernel);
- hipFree(d_out);
-}
-
-template<int DataLayout>
-void test_hip_convolution_3d()
-{
- Tensor<float, 5, DataLayout> input(Eigen::array<Eigen::DenseIndex, 5>{74,37,11,137,17});
- Tensor<float, 3, DataLayout> kernel(3,4,2);
- Tensor<float, 5, DataLayout> out(Eigen::array<Eigen::DenseIndex, 5>{74,35,8,136,17});
- input = input.constant(10.0f) + input.random();
- kernel = kernel.constant(7.0f) + kernel.random();
-
- std::size_t input_bytes = input.size() * sizeof(float);
- std::size_t kernel_bytes = kernel.size() * sizeof(float);
- std::size_t out_bytes = out.size() * sizeof(float);
-
- float* d_input;
- float* d_kernel;
- float* d_out;
- hipMalloc((void**)(&d_input), input_bytes);
- hipMalloc((void**)(&d_kernel), kernel_bytes);
- hipMalloc((void**)(&d_out), out_bytes);
-
- hipMemcpy(d_input, input.data(), input_bytes, hipMemcpyHostToDevice);
- hipMemcpy(d_kernel, kernel.data(), kernel_bytes, hipMemcpyHostToDevice);
-
- Eigen::HipStreamDevice stream;
- Eigen::GpuDevice gpu_device(&stream);
-
- Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_input(d_input,74,37,11,137,17);
- Eigen::TensorMap<Eigen::Tensor<float, 3, DataLayout> > gpu_kernel(d_kernel,3,4,2);
- Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_out(d_out,74,35,8,136,17);
-
- Eigen::array<Eigen::DenseIndex, 3> dims{1,2,3};
- gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
-
- assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
- assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
-
- for (int i = 0; i < 74; ++i) {
- for (int j = 0; j < 35; ++j) {
- for (int k = 0; k < 8; ++k) {
- for (int l = 0; l < 136; ++l) {
- for (int m = 0; m < 17; ++m) {
- const float result = out(i,j,k,l,m);
- const float expected = input(i,j+0,k+0,l+0,m) * kernel(0,0,0) +
- input(i,j+1,k+0,l+0,m) * kernel(1,0,0) +
- input(i,j+2,k+0,l+0,m) * kernel(2,0,0) +
- input(i,j+0,k+1,l+0,m) * kernel(0,1,0) +
- input(i,j+1,k+1,l+0,m) * kernel(1,1,0) +
- input(i,j+2,k+1,l+0,m) * kernel(2,1,0) +
- input(i,j+0,k+2,l+0,m) * kernel(0,2,0) +
- input(i,j+1,k+2,l+0,m) * kernel(1,2,0) +
- input(i,j+2,k+2,l+0,m) * kernel(2,2,0) +
- input(i,j+0,k+3,l+0,m) * kernel(0,3,0) +
- input(i,j+1,k+3,l+0,m) * kernel(1,3,0) +
- input(i,j+2,k+3,l+0,m) * kernel(2,3,0) +
- input(i,j+0,k+0,l+1,m) * kernel(0,0,1) +
- input(i,j+1,k+0,l+1,m) * kernel(1,0,1) +
- input(i,j+2,k+0,l+1,m) * kernel(2,0,1) +
- input(i,j+0,k+1,l+1,m) * kernel(0,1,1) +
- input(i,j+1,k+1,l+1,m) * kernel(1,1,1) +
- input(i,j+2,k+1,l+1,m) * kernel(2,1,1) +
- input(i,j+0,k+2,l+1,m) * kernel(0,2,1) +
- input(i,j+1,k+2,l+1,m) * kernel(1,2,1) +
- input(i,j+2,k+2,l+1,m) * kernel(2,2,1) +
- input(i,j+0,k+3,l+1,m) * kernel(0,3,1) +
- input(i,j+1,k+3,l+1,m) * kernel(1,3,1) +
- input(i,j+2,k+3,l+1,m) * kernel(2,3,1);
- VERIFY_IS_APPROX(result, expected);
- }
- }
- }
- }
- }
-
- hipFree(d_input);
- hipFree(d_kernel);
- hipFree(d_out);
-}
-
-
-template <typename Scalar>
-void test_hip_lgamma(const Scalar stddev)
-{
- Tensor<Scalar, 2> in(72,97);
- in.setRandom();
- in *= in.constant(stddev);
- Tensor<Scalar, 2> out(72,97);
- out.setZero();
-
- std::size_t bytes = in.size() * sizeof(Scalar);
-
- Scalar* d_in;
- Scalar* d_out;
- hipMalloc((void**)(&d_in), bytes);
- hipMalloc((void**)(&d_out), bytes);
-
- hipMemcpy(d_in, in.data(), bytes, hipMemcpyHostToDevice);
-
- Eigen::HipStreamDevice stream;
- Eigen::GpuDevice gpu_device(&stream);
-
- Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_in(d_in, 72, 97);
- Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 72, 97);
-
- gpu_out.device(gpu_device) = gpu_in.lgamma();
-
- assert(hipMemcpyAsync(out.data(), d_out, bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
- assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
-
- for (int i = 0; i < 72; ++i) {
- for (int j = 0; j < 97; ++j) {
- VERIFY_IS_APPROX(out(i,j), (std::lgamma)(in(i,j)));
- }
- }
-
- hipFree(d_in);
- hipFree(d_out);
-}
-
-template <typename Scalar>
-void test_hip_digamma()
-{
- Tensor<Scalar, 1> in(7);
- Tensor<Scalar, 1> out(7);
- Tensor<Scalar, 1> expected_out(7);
- out.setZero();
-
- in(0) = Scalar(1);
- in(1) = Scalar(1.5);
- in(2) = Scalar(4);
- in(3) = Scalar(-10.5);
- in(4) = Scalar(10000.5);
- in(5) = Scalar(0);
- in(6) = Scalar(-1);
-
- expected_out(0) = Scalar(-0.5772156649015329);
- expected_out(1) = Scalar(0.03648997397857645);
- expected_out(2) = Scalar(1.2561176684318);
- expected_out(3) = Scalar(2.398239129535781);
- expected_out(4) = Scalar(9.210340372392849);
- expected_out(5) = std::numeric_limits<Scalar>::infinity();
- expected_out(6) = std::numeric_limits<Scalar>::infinity();
-
- std::size_t bytes = in.size() * sizeof(Scalar);
-
- Scalar* d_in;
- Scalar* d_out;
- hipMalloc((void**)(&d_in), bytes);
- hipMalloc((void**)(&d_out), bytes);
-
- hipMemcpy(d_in, in.data(), bytes, hipMemcpyHostToDevice);
-
- Eigen::HipStreamDevice stream;
- Eigen::GpuDevice gpu_device(&stream);
-
- Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in(d_in, 7);
- Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 7);
-
- gpu_out.device(gpu_device) = gpu_in.digamma();
-
- assert(hipMemcpyAsync(out.data(), d_out, bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
- assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
-
- for (int i = 0; i < 5; ++i) {
- VERIFY_IS_APPROX(out(i), expected_out(i));
- }
- for (int i = 5; i < 7; ++i) {
- VERIFY_IS_EQUAL(out(i), expected_out(i));
- }
-
- hipFree(d_in);
- hipFree(d_out);
-}
-
-template <typename Scalar>
-void test_hip_zeta()
-{
- Tensor<Scalar, 1> in_x(6);
- Tensor<Scalar, 1> in_q(6);
- Tensor<Scalar, 1> out(6);
- Tensor<Scalar, 1> expected_out(6);
- out.setZero();
-
- in_x(0) = Scalar(1);
- in_x(1) = Scalar(1.5);
- in_x(2) = Scalar(4);
- in_x(3) = Scalar(-10.5);
- in_x(4) = Scalar(10000.5);
- in_x(5) = Scalar(3);
-
- in_q(0) = Scalar(1.2345);
- in_q(1) = Scalar(2);
- in_q(2) = Scalar(1.5);
- in_q(3) = Scalar(3);
- in_q(4) = Scalar(1.0001);
- in_q(5) = Scalar(-2.5);
-
- expected_out(0) = std::numeric_limits<Scalar>::infinity();
- expected_out(1) = Scalar(1.61237534869);
- expected_out(2) = Scalar(0.234848505667);
- expected_out(3) = Scalar(1.03086757337e-5);
- expected_out(4) = Scalar(0.367879440865);
- expected_out(5) = Scalar(0.054102025820864097);
-
- std::size_t bytes = in_x.size() * sizeof(Scalar);
-
- Scalar* d_in_x;
- Scalar* d_in_q;
- Scalar* d_out;
- hipMalloc((void**)(&d_in_x), bytes);
- hipMalloc((void**)(&d_in_q), bytes);
- hipMalloc((void**)(&d_out), bytes);
-
- hipMemcpy(d_in_x, in_x.data(), bytes, hipMemcpyHostToDevice);
- hipMemcpy(d_in_q, in_q.data(), bytes, hipMemcpyHostToDevice);
-
- Eigen::HipStreamDevice stream;
- Eigen::GpuDevice gpu_device(&stream);
-
- Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 6);
- Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_q(d_in_q, 6);
- Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 6);
-
- gpu_out.device(gpu_device) = gpu_in_x.zeta(gpu_in_q);
-
- assert(hipMemcpyAsync(out.data(), d_out, bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
- assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
-
- VERIFY_IS_EQUAL(out(0), expected_out(0));
- VERIFY((std::isnan)(out(3)));
-
- for (int i = 1; i < 6; ++i) {
- if (i != 3) {
- VERIFY_IS_APPROX(out(i), expected_out(i));
- }
- }
-
- hipFree(d_in_x);
- hipFree(d_in_q);
- hipFree(d_out);
-}
-
-template <typename Scalar>
-void test_hip_polygamma()
-{
- Tensor<Scalar, 1> in_x(7);
- Tensor<Scalar, 1> in_n(7);
- Tensor<Scalar, 1> out(7);
- Tensor<Scalar, 1> expected_out(7);
- out.setZero();
-
- in_n(0) = Scalar(1);
- in_n(1) = Scalar(1);
- in_n(2) = Scalar(1);
- in_n(3) = Scalar(17);
- in_n(4) = Scalar(31);
- in_n(5) = Scalar(28);
- in_n(6) = Scalar(8);
-
- in_x(0) = Scalar(2);
- in_x(1) = Scalar(3);
- in_x(2) = Scalar(25.5);
- in_x(3) = Scalar(4.7);
- in_x(4) = Scalar(11.8);
- in_x(5) = Scalar(17.7);
- in_x(6) = Scalar(30.2);
-
- expected_out(0) = Scalar(0.644934066848);
- expected_out(1) = Scalar(0.394934066848);
- expected_out(2) = Scalar(0.0399946696496);
- expected_out(3) = Scalar(293.334565435);
- expected_out(4) = Scalar(0.445487887616);
- expected_out(5) = Scalar(-2.47810300902e-07);
- expected_out(6) = Scalar(-8.29668781082e-09);
-
- std::size_t bytes = in_x.size() * sizeof(Scalar);
-
- Scalar* d_in_x;
- Scalar* d_in_n;
- Scalar* d_out;
- hipMalloc((void**)(&d_in_x), bytes);
- hipMalloc((void**)(&d_in_n), bytes);
- hipMalloc((void**)(&d_out), bytes);
-
- hipMemcpy(d_in_x, in_x.data(), bytes, hipMemcpyHostToDevice);
- hipMemcpy(d_in_n, in_n.data(), bytes, hipMemcpyHostToDevice);
-
- Eigen::HipStreamDevice stream;
- Eigen::GpuDevice gpu_device(&stream);
-
- Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 7);
- Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_n(d_in_n, 7);
- Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 7);
-
- gpu_out.device(gpu_device) = gpu_in_n.polygamma(gpu_in_x);
-
- assert(hipMemcpyAsync(out.data(), d_out, bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
- assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
-
- for (int i = 0; i < 7; ++i) {
- VERIFY_IS_APPROX(out(i), expected_out(i));
- }
-
- hipFree(d_in_x);
- hipFree(d_in_n);
- hipFree(d_out);
-}
-
-template <typename Scalar>
-void test_hip_igamma()
-{
- Tensor<Scalar, 2> a(6, 6);
- Tensor<Scalar, 2> x(6, 6);
- Tensor<Scalar, 2> out(6, 6);
- out.setZero();
-
- Scalar a_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
- Scalar x_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
-
- for (int i = 0; i < 6; ++i) {
- for (int j = 0; j < 6; ++j) {
- a(i, j) = a_s[i];
- x(i, j) = x_s[j];
- }
- }
-
- Scalar nan = std::numeric_limits<Scalar>::quiet_NaN();
- Scalar igamma_s[][6] = {{0.0, nan, nan, nan, nan, nan},
- {0.0, 0.6321205588285578, 0.7768698398515702,
- 0.9816843611112658, 9.999500016666262e-05, 1.0},
- {0.0, 0.4275932955291202, 0.608374823728911,
- 0.9539882943107686, 7.522076445089201e-07, 1.0},
- {0.0, 0.01898815687615381, 0.06564245437845008,
- 0.5665298796332909, 4.166333347221828e-18, 1.0},
- {0.0, 0.9999780593618628, 0.9999899967080838,
- 0.9999996219837988, 0.9991370418689945, 1.0},
- {0.0, 0.0, 0.0, 0.0, 0.0, 0.5042041932513908}};
-
-
-
- std::size_t bytes = a.size() * sizeof(Scalar);
-
- Scalar* d_a;
- Scalar* d_x;
- Scalar* d_out;
- assert(hipMalloc((void**)(&d_a), bytes) == hipSuccess);
- assert(hipMalloc((void**)(&d_x), bytes) == hipSuccess);
- assert(hipMalloc((void**)(&d_out), bytes) == hipSuccess);
-
- hipMemcpy(d_a, a.data(), bytes, hipMemcpyHostToDevice);
- hipMemcpy(d_x, x.data(), bytes, hipMemcpyHostToDevice);
-
- Eigen::HipStreamDevice stream;
- Eigen::GpuDevice gpu_device(&stream);
-
- Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_a(d_a, 6, 6);
- Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_x(d_x, 6, 6);
- Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 6, 6);
-
- gpu_out.device(gpu_device) = gpu_a.igamma(gpu_x);
-
- assert(hipMemcpyAsync(out.data(), d_out, bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
- assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
-
- for (int i = 0; i < 6; ++i) {
- for (int j = 0; j < 6; ++j) {
- if ((std::isnan)(igamma_s[i][j])) {
- VERIFY((std::isnan)(out(i, j)));
- } else {
- VERIFY_IS_APPROX(out(i, j), igamma_s[i][j]);
- }
- }
- }
-
- hipFree(d_a);
- hipFree(d_x);
- hipFree(d_out);
-}
-
-template <typename Scalar>
-void test_hip_igammac()
-{
- Tensor<Scalar, 2> a(6, 6);
- Tensor<Scalar, 2> x(6, 6);
- Tensor<Scalar, 2> out(6, 6);
- out.setZero();
-
- Scalar a_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
- Scalar x_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
-
- for (int i = 0; i < 6; ++i) {
- for (int j = 0; j < 6; ++j) {
- a(i, j) = a_s[i];
- x(i, j) = x_s[j];
- }
- }
-
- Scalar nan = std::numeric_limits<Scalar>::quiet_NaN();
- Scalar igammac_s[][6] = {{nan, nan, nan, nan, nan, nan},
- {1.0, 0.36787944117144233, 0.22313016014842982,
- 0.018315638888734182, 0.9999000049998333, 0.0},
- {1.0, 0.5724067044708798, 0.3916251762710878,
- 0.04601170568923136, 0.9999992477923555, 0.0},
- {1.0, 0.9810118431238462, 0.9343575456215499,
- 0.4334701203667089, 1.0, 0.0},
- {1.0, 2.1940638138146658e-05, 1.0003291916285e-05,
- 3.7801620118431334e-07, 0.0008629581310054535,
- 0.0},
- {1.0, 1.0, 1.0, 1.0, 1.0, 0.49579580674813944}};
-
- std::size_t bytes = a.size() * sizeof(Scalar);
-
- Scalar* d_a;
- Scalar* d_x;
- Scalar* d_out;
- hipMalloc((void**)(&d_a), bytes);
- hipMalloc((void**)(&d_x), bytes);
- hipMalloc((void**)(&d_out), bytes);
-
- hipMemcpy(d_a, a.data(), bytes, hipMemcpyHostToDevice);
- hipMemcpy(d_x, x.data(), bytes, hipMemcpyHostToDevice);
-
- Eigen::HipStreamDevice stream;
- Eigen::GpuDevice gpu_device(&stream);
-
- Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_a(d_a, 6, 6);
- Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_x(d_x, 6, 6);
- Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 6, 6);
-
- gpu_out.device(gpu_device) = gpu_a.igammac(gpu_x);
-
- assert(hipMemcpyAsync(out.data(), d_out, bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
- assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
-
- for (int i = 0; i < 6; ++i) {
- for (int j = 0; j < 6; ++j) {
- if ((std::isnan)(igammac_s[i][j])) {
- VERIFY((std::isnan)(out(i, j)));
- } else {
- VERIFY_IS_APPROX(out(i, j), igammac_s[i][j]);
- }
- }
- }
-
- hipFree(d_a);
- hipFree(d_x);
- hipFree(d_out);
-}
-
-template <typename Scalar>
-void test_hip_erf(const Scalar stddev)
-{
- Tensor<Scalar, 2> in(72,97);
- in.setRandom();
- in *= in.constant(stddev);
- Tensor<Scalar, 2> out(72,97);
- out.setZero();
-
- std::size_t bytes = in.size() * sizeof(Scalar);
-
- Scalar* d_in;
- Scalar* d_out;
- assert(hipMalloc((void**)(&d_in), bytes) == hipSuccess);
- assert(hipMalloc((void**)(&d_out), bytes) == hipSuccess);
-
- hipMemcpy(d_in, in.data(), bytes, hipMemcpyHostToDevice);
-
- Eigen::HipStreamDevice stream;
- Eigen::GpuDevice gpu_device(&stream);
-
- Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_in(d_in, 72, 97);
- Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 72, 97);
-
- gpu_out.device(gpu_device) = gpu_in.erf();
-
- assert(hipMemcpyAsync(out.data(), d_out, bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
- assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
-
- for (int i = 0; i < 72; ++i) {
- for (int j = 0; j < 97; ++j) {
- VERIFY_IS_APPROX(out(i,j), (std::erf)(in(i,j)));
- }
- }
-
- hipFree(d_in);
- hipFree(d_out);
-}
-
-template <typename Scalar>
-void test_hip_erfc(const Scalar stddev)
-{
- Tensor<Scalar, 2> in(72,97);
- in.setRandom();
- in *= in.constant(stddev);
- Tensor<Scalar, 2> out(72,97);
- out.setZero();
-
- std::size_t bytes = in.size() * sizeof(Scalar);
-
- Scalar* d_in;
- Scalar* d_out;
- hipMalloc((void**)(&d_in), bytes);
- hipMalloc((void**)(&d_out), bytes);
-
- hipMemcpy(d_in, in.data(), bytes, hipMemcpyHostToDevice);
-
- Eigen::HipStreamDevice stream;
- Eigen::GpuDevice gpu_device(&stream);
-
- Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_in(d_in, 72, 97);
- Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 72, 97);
-
- gpu_out.device(gpu_device) = gpu_in.erfc();
-
- assert(hipMemcpyAsync(out.data(), d_out, bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
- assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
-
- for (int i = 0; i < 72; ++i) {
- for (int j = 0; j < 97; ++j) {
- VERIFY_IS_APPROX(out(i,j), (std::erfc)(in(i,j)));
- }
- }
-
- hipFree(d_in);
- hipFree(d_out);
-}
-
-template <typename Scalar>
-void test_hip_betainc()
-{
- Tensor<Scalar, 1> in_x(125);
- Tensor<Scalar, 1> in_a(125);
- Tensor<Scalar, 1> in_b(125);
- Tensor<Scalar, 1> out(125);
- Tensor<Scalar, 1> expected_out(125);
- out.setZero();
-
- Scalar nan = std::numeric_limits<Scalar>::quiet_NaN();
-
- Array<Scalar, 1, Dynamic> x(125);
- Array<Scalar, 1, Dynamic> a(125);
- Array<Scalar, 1, Dynamic> b(125);
- Array<Scalar, 1, Dynamic> v(125);
-
- a << 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
- 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
- 0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
- 0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
- 0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
- 0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
- 0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
- 0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
- 0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
- 0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
- 0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999,
- 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999,
- 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 31.62177660168379,
- 31.62177660168379, 31.62177660168379, 31.62177660168379,
- 31.62177660168379, 31.62177660168379, 31.62177660168379,
- 31.62177660168379, 31.62177660168379, 31.62177660168379,
- 31.62177660168379, 31.62177660168379, 31.62177660168379,
- 31.62177660168379, 31.62177660168379, 31.62177660168379,
- 31.62177660168379, 31.62177660168379, 31.62177660168379,
- 31.62177660168379, 31.62177660168379, 31.62177660168379,
- 31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999,
- 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999,
- 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999,
- 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999;
-
- b << 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379, 0.03062277660168379,
- 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.999,
- 0.999, 0.999, 0.999, 0.999, 31.62177660168379, 31.62177660168379,
- 31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999,
- 999.999, 999.999, 999.999, 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379,
- 0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
- 0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999, 31.62177660168379,
- 31.62177660168379, 31.62177660168379, 31.62177660168379,
- 31.62177660168379, 999.999, 999.999, 999.999, 999.999, 999.999, 0.0, 0.0,
- 0.0, 0.0, 0.0, 0.03062277660168379, 0.03062277660168379,
- 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.999,
- 0.999, 0.999, 0.999, 0.999, 31.62177660168379, 31.62177660168379,
- 31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999,
- 999.999, 999.999, 999.999, 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379,
- 0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
- 0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999, 31.62177660168379,
- 31.62177660168379, 31.62177660168379, 31.62177660168379,
- 31.62177660168379, 999.999, 999.999, 999.999, 999.999, 999.999, 0.0, 0.0,
- 0.0, 0.0, 0.0, 0.03062277660168379, 0.03062277660168379,
- 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.999,
- 0.999, 0.999, 0.999, 0.999, 31.62177660168379, 31.62177660168379,
- 31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999,
- 999.999, 999.999, 999.999;
-
- x << -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8,
- 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5,
- 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2,
- 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1,
- 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1,
- -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8,
- 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5,
- 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2,
- 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1;
-
- v << nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
- nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
- nan, nan, 0.47972119876364683, 0.5, 0.5202788012363533, nan, nan,
- 0.9518683957740043, 0.9789663010413743, 0.9931729188073435, nan, nan,
- 0.999995949033062, 0.9999999999993698, 0.9999999999999999, nan, nan,
- 0.9999999999999999, 0.9999999999999999, 0.9999999999999999, nan, nan, nan,
- nan, nan, nan, nan, 0.006827081192655869, 0.0210336989586256,
- 0.04813160422599567, nan, nan, 0.20014344256217678, 0.5000000000000001,
- 0.7998565574378232, nan, nan, 0.9991401428435834, 0.999999999698403,
- 0.9999999999999999, nan, nan, 0.9999999999999999, 0.9999999999999999,
- 0.9999999999999999, nan, nan, nan, nan, nan, nan, nan,
- 1.0646600232370887e-25, 6.301722877826246e-13, 4.050966937974938e-06, nan,
- nan, 7.864342668429763e-23, 3.015969667594166e-10, 0.0008598571564165444,
- nan, nan, 6.031987710123844e-08, 0.5000000000000007, 0.9999999396801229,
- nan, nan, 0.9999999999999999, 0.9999999999999999, 0.9999999999999999, nan,
- nan, nan, nan, nan, nan, nan, 0.0, 7.029920380986636e-306,
- 2.2450728208591345e-101, nan, nan, 0.0, 9.275871147869727e-302,
- 1.2232913026152827e-97, nan, nan, 0.0, 3.0891393081932924e-252,
- 2.9303043666183996e-60, nan, nan, 2.248913486879199e-196,
- 0.5000000000004947, 0.9999999999999999, nan;
-
- for (int i = 0; i < 125; ++i) {
- in_x(i) = x(i);
- in_a(i) = a(i);
- in_b(i) = b(i);
- expected_out(i) = v(i);
- }
-
- std::size_t bytes = in_x.size() * sizeof(Scalar);
-
- Scalar* d_in_x;
- Scalar* d_in_a;
- Scalar* d_in_b;
- Scalar* d_out;
- hipMalloc((void**)(&d_in_x), bytes);
- hipMalloc((void**)(&d_in_a), bytes);
- hipMalloc((void**)(&d_in_b), bytes);
- hipMalloc((void**)(&d_out), bytes);
-
- hipMemcpy(d_in_x, in_x.data(), bytes, hipMemcpyHostToDevice);
- hipMemcpy(d_in_a, in_a.data(), bytes, hipMemcpyHostToDevice);
- hipMemcpy(d_in_b, in_b.data(), bytes, hipMemcpyHostToDevice);
-
- Eigen::HipStreamDevice stream;
- Eigen::GpuDevice gpu_device(&stream);
-
- Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 125);
- Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_a(d_in_a, 125);
- Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_b(d_in_b, 125);
- Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 125);
-
- gpu_out.device(gpu_device) = betainc(gpu_in_a, gpu_in_b, gpu_in_x);
-
- assert(hipMemcpyAsync(out.data(), d_out, bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
- assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
-
- for (int i = 1; i < 125; ++i) {
- if ((std::isnan)(expected_out(i))) {
- VERIFY((std::isnan)(out(i)));
- } else {
- VERIFY_IS_APPROX(out(i), expected_out(i));
- }
- }
-
- hipFree(d_in_x);
- hipFree(d_in_a);
- hipFree(d_in_b);
- hipFree(d_out);
-}
-
-
-void test_cxx11_tensor_hip()
-{
- CALL_SUBTEST(test_hip_nullary());
- CALL_SUBTEST(test_hip_elementwise_small());
- CALL_SUBTEST(test_hip_elementwise());
- CALL_SUBTEST(test_hip_props());
- CALL_SUBTEST(test_hip_reduction());
- CALL_SUBTEST(test_hip_contraction<ColMajor>());
- CALL_SUBTEST(test_hip_contraction<RowMajor>());
- CALL_SUBTEST(test_hip_convolution_1d<ColMajor>());
- CALL_SUBTEST(test_hip_convolution_1d<RowMajor>());
- CALL_SUBTEST(test_hip_convolution_inner_dim_col_major_1d());
- CALL_SUBTEST(test_hip_convolution_inner_dim_row_major_1d());
- CALL_SUBTEST(test_hip_convolution_2d<ColMajor>());
- CALL_SUBTEST(test_hip_convolution_2d<RowMajor>());
- // The following two tests commented out due to long runtime
- // CALL_SUBTEST(test_hip_convolution_3d<ColMajor>());
- // CALL_SUBTEST(test_hip_convolution_3d<RowMajor>());
-
-#if __cplusplus > 199711L
- // std::erf, std::erfc, and so on where only added in c++11. We use them
- // as a golden reference to validate the results produced by Eigen. Therefore
- // we can only run these tests if we use a c++11 compiler.
-
- CALL_SUBTEST(test_hip_lgamma<float>(1.0f));
- CALL_SUBTEST(test_hip_lgamma<float>(100.0f));
- CALL_SUBTEST(test_hip_lgamma<float>(0.01f));
- CALL_SUBTEST(test_hip_lgamma<float>(0.001f));
-
- CALL_SUBTEST(test_hip_lgamma<double>(1.0));
- CALL_SUBTEST(test_hip_lgamma<double>(100.0));
- CALL_SUBTEST(test_hip_lgamma<double>(0.01));
- CALL_SUBTEST(test_hip_lgamma<double>(0.001));
-
- CALL_SUBTEST(test_hip_erf<float>(1.0f));
- CALL_SUBTEST(test_hip_erf<float>(100.0f));
- CALL_SUBTEST(test_hip_erf<float>(0.01f));
- CALL_SUBTEST(test_hip_erf<float>(0.001f));
-
- CALL_SUBTEST(test_hip_erfc<float>(1.0f));
- // CALL_SUBTEST(test_hip_erfc<float>(100.0f)); // HIP erfc lacks precision for large inputs
- CALL_SUBTEST(test_hip_erfc<float>(5.0f));
- CALL_SUBTEST(test_hip_erfc<float>(0.01f));
- CALL_SUBTEST(test_hip_erfc<float>(0.001f));
-
- CALL_SUBTEST(test_hip_erf<double>(1.0));
- CALL_SUBTEST(test_hip_erf<double>(100.0));
- CALL_SUBTEST(test_hip_erf<double>(0.01));
- CALL_SUBTEST(test_hip_erf<double>(0.001));
-
- CALL_SUBTEST(test_hip_erfc<double>(1.0));
- // CALL_SUBTEST(test_hip_erfc<double>(100.0)); // HIP erfc lacks precision for large inputs
- CALL_SUBTEST(test_hip_erfc<double>(5.0));
- CALL_SUBTEST(test_hip_erfc<double>(0.01));
- CALL_SUBTEST(test_hip_erfc<double>(0.001));
-
- // Following tests have functional failures on some seeds
- // CALL_SUBTEST(test_hip_digamma<float>());
- // CALL_SUBTEST(test_hip_digamma<double>());
-
- // Following tests have functional failures on some seeds
- // CALL_SUBTEST(test_hip_polygamma<float>());
- // CALL_SUBTEST(test_hip_polygamma<double>());
-
- // Following tests have functional failures on some seeds
- // CALL_SUBTEST(test_hip_zeta<float>());
- // CALL_SUBTEST(test_hip_zeta<double>());
-
- CALL_SUBTEST(test_hip_igamma<float>());
- CALL_SUBTEST(test_hip_igammac<float>());
-
- CALL_SUBTEST(test_hip_igamma<double>());
- CALL_SUBTEST(test_hip_igammac<double>());
-
- CALL_SUBTEST(test_hip_betainc<float>());
- CALL_SUBTEST(test_hip_betainc<double>());
-#endif
-}
diff --git a/unsupported/test/cxx11_tensor_of_float16_hip.cu b/unsupported/test/cxx11_tensor_of_float16_hip.cu
deleted file mode 100644
index 6c1a401bf..000000000
--- a/unsupported/test/cxx11_tensor_of_float16_hip.cu
+++ /dev/null
@@ -1,498 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#define EIGEN_TEST_NO_LONGDOUBLE
-#define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_of_float16_hip
-#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
-#define EIGEN_USE_GPU
-
-#include "main.h"
-#include <unsupported/Eigen/CXX11/Tensor>
-
-
-using Eigen::Tensor;
-
-template<typename>
-void test_hip_numext() {
- Eigen::HipStreamDevice stream;
- Eigen::GpuDevice gpu_device(&stream);
- int num_elem = 101;
-
- float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
- bool* d_res_half = (bool*)gpu_device.allocate(num_elem * sizeof(bool));
- bool* d_res_float = (bool*)gpu_device.allocate(num_elem * sizeof(bool));
-
- Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
- d_float, num_elem);
- Eigen::TensorMap<Eigen::Tensor<bool, 1>, Eigen::Aligned> gpu_res_half(
- d_res_half, num_elem);
- Eigen::TensorMap<Eigen::Tensor<bool, 1>, Eigen::Aligned> gpu_res_float(
- d_res_float, num_elem);
-
- gpu_float.device(gpu_device) = gpu_float.random() - gpu_float.constant(0.5f);
- gpu_res_float.device(gpu_device) = gpu_float.unaryExpr(Eigen::internal::scalar_isnan_op<float>());
- gpu_res_half.device(gpu_device) = gpu_float.cast<Eigen::half>().unaryExpr(Eigen::internal::scalar_isnan_op<Eigen::half>());
-
- Tensor<bool, 1> half_prec(num_elem);
- Tensor<bool, 1> full_prec(num_elem);
- gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(bool));
- gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(bool));
- gpu_device.synchronize();
-
- for (int i = 0; i < num_elem; ++i) {
- std::cout << "Checking numext " << i << std::endl;
- VERIFY_IS_EQUAL(full_prec(i), half_prec(i));
- }
-
- gpu_device.deallocate(d_float);
- gpu_device.deallocate(d_res_half);
- gpu_device.deallocate(d_res_float);
-}
-
-
-#ifdef EIGEN_HAS_HIP_FP16
-
-template<typename>
-void test_hip_conversion() {
- Eigen::HipStreamDevice stream;
- Eigen::GpuDevice gpu_device(&stream);
- int num_elem = 101;
-
- float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
- Eigen::half* d_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
- float* d_conv = (float*)gpu_device.allocate(num_elem * sizeof(float));
-
- Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
- d_float, num_elem);
- Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_half(
- d_half, num_elem);
- Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_conv(
- d_conv, num_elem);
-
- gpu_float.device(gpu_device) = gpu_float.random();
- gpu_half.device(gpu_device) = gpu_float.cast<Eigen::half>();
- gpu_conv.device(gpu_device) = gpu_half.cast<float>();
-
- Tensor<float, 1> initial(num_elem);
- Tensor<float, 1> final(num_elem);
- gpu_device.memcpyDeviceToHost(initial.data(), d_float, num_elem*sizeof(float));
- gpu_device.memcpyDeviceToHost(final.data(), d_conv, num_elem*sizeof(float));
-
- for (int i = 0; i < num_elem; ++i) {
- VERIFY_IS_APPROX(initial(i), final(i));
- }
-
- gpu_device.deallocate(d_float);
- gpu_device.deallocate(d_half);
- gpu_device.deallocate(d_conv);
-}
-
-template<typename>
-void test_hip_unary() {
- Eigen::HipStreamDevice stream;
- Eigen::GpuDevice gpu_device(&stream);
- int num_elem = 101;
-
- float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
- float* d_res_half = (float*)gpu_device.allocate(num_elem * sizeof(float));
- float* d_res_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
-
- Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
- d_float, num_elem);
- Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_half(
- d_res_half, num_elem);
- Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_float(
- d_res_float, num_elem);
-
- gpu_float.device(gpu_device) = gpu_float.random() - gpu_float.constant(0.5f);
- gpu_res_float.device(gpu_device) = gpu_float.abs();
- gpu_res_half.device(gpu_device) = gpu_float.cast<Eigen::half>().abs().cast<float>();
-
- Tensor<float, 1> half_prec(num_elem);
- Tensor<float, 1> full_prec(num_elem);
- gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(float));
- gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float));
- gpu_device.synchronize();
-
- for (int i = 0; i < num_elem; ++i) {
- std::cout << "Checking unary " << i << std::endl;
- VERIFY_IS_APPROX(full_prec(i), half_prec(i));
- }
-
- gpu_device.deallocate(d_float);
- gpu_device.deallocate(d_res_half);
- gpu_device.deallocate(d_res_float);
-}
-
-template<typename>
-void test_hip_elementwise() {
- Eigen::HipStreamDevice stream;
- Eigen::GpuDevice gpu_device(&stream);
- int num_elem = 101;
-
- float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
- float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
- float* d_res_half = (float*)gpu_device.allocate(num_elem * sizeof(float));
- float* d_res_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
-
- Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float1(
- d_float1, num_elem);
- Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float2(
- d_float2, num_elem);
- Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_half(
- d_res_half, num_elem);
- Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_float(
- d_res_float, num_elem);
-
- gpu_float1.device(gpu_device) = gpu_float1.random();
- gpu_float2.device(gpu_device) = gpu_float2.random();
- gpu_res_float.device(gpu_device) = (gpu_float1 + gpu_float2) * gpu_float1;
- gpu_res_half.device(gpu_device) = ((gpu_float1.cast<Eigen::half>() + gpu_float2.cast<Eigen::half>()) * gpu_float1.cast<Eigen::half>()).cast<float>();
-
- Tensor<float, 1> half_prec(num_elem);
- Tensor<float, 1> full_prec(num_elem);
- gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(float));
- gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float));
- gpu_device.synchronize();
-
- for (int i = 0; i < num_elem; ++i) {
- std::cout << "Checking elemwise " << i << ": full prec = " << full_prec(i) << " vs half prec = " << half_prec(i) << std::endl;
- VERIFY_IS_APPROX(static_cast<Eigen::half>(full_prec(i)), static_cast<Eigen::half>(half_prec(i)));
- }
-
- gpu_device.deallocate(d_float1);
- gpu_device.deallocate(d_float2);
- gpu_device.deallocate(d_res_half);
- gpu_device.deallocate(d_res_float);
-}
-
-template<typename>
-void test_hip_trancendental() {
- Eigen::HipStreamDevice stream;
- Eigen::GpuDevice gpu_device(&stream);
- int num_elem = 101;
-
- float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
- float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
- float* d_float3 = (float*)gpu_device.allocate(num_elem * sizeof(float));
- Eigen::half* d_res1_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
- Eigen::half* d_res1_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
- Eigen::half* d_res2_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
- Eigen::half* d_res2_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
- Eigen::half* d_res3_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
- Eigen::half* d_res3_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
-
- Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float1(d_float1, num_elem);
- Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float2(d_float2, num_elem);
- Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float3(d_float3, num_elem);
- Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res1_half(d_res1_half, num_elem);
- Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res1_float(d_res1_float, num_elem);
- Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res2_half(d_res2_half, num_elem);
- Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res2_float(d_res2_float, num_elem);
- Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res3_half(d_res3_half, num_elem);
- Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res3_float(d_res3_float, num_elem);
- Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res4_half(d_res3_half, num_elem);
- Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res4_float(d_res3_float, num_elem);
-
- gpu_float1.device(gpu_device) = gpu_float1.random() - gpu_float1.constant(0.5f);
- gpu_float2.device(gpu_device) = gpu_float2.random() + gpu_float1.constant(0.5f);
- gpu_float3.device(gpu_device) = gpu_float3.random();
- gpu_res1_float.device(gpu_device) = gpu_float1.exp().cast<Eigen::half>();
- gpu_res2_float.device(gpu_device) = gpu_float2.log().cast<Eigen::half>();
- gpu_res3_float.device(gpu_device) = gpu_float3.log1p().cast<Eigen::half>();
- gpu_res4_float.device(gpu_device) = gpu_float3.expm1().cast<Eigen::half>();
-
- gpu_res1_half.device(gpu_device) = gpu_float1.cast<Eigen::half>();
- gpu_res1_half.device(gpu_device) = gpu_res1_half.exp();
-
- gpu_res2_half.device(gpu_device) = gpu_float2.cast<Eigen::half>();
- gpu_res2_half.device(gpu_device) = gpu_res2_half.log();
-
- gpu_res3_half.device(gpu_device) = gpu_float3.cast<Eigen::half>();
- gpu_res3_half.device(gpu_device) = gpu_res3_half.log1p();
-
- gpu_res3_half.device(gpu_device) = gpu_float3.cast<Eigen::half>();
- gpu_res3_half.device(gpu_device) = gpu_res3_half.expm1();
-
- Tensor<float, 1> input1(num_elem);
- Tensor<Eigen::half, 1> half_prec1(num_elem);
- Tensor<Eigen::half, 1> full_prec1(num_elem);
- Tensor<float, 1> input2(num_elem);
- Tensor<Eigen::half, 1> half_prec2(num_elem);
- Tensor<Eigen::half, 1> full_prec2(num_elem);
- Tensor<float, 1> input3(num_elem);
- Tensor<Eigen::half, 1> half_prec3(num_elem);
- Tensor<Eigen::half, 1> full_prec3(num_elem);
- gpu_device.memcpyDeviceToHost(input1.data(), d_float1, num_elem*sizeof(float));
- gpu_device.memcpyDeviceToHost(input2.data(), d_float2, num_elem*sizeof(float));
- gpu_device.memcpyDeviceToHost(input3.data(), d_float3, num_elem*sizeof(float));
- gpu_device.memcpyDeviceToHost(half_prec1.data(), d_res1_half, num_elem*sizeof(Eigen::half));
- gpu_device.memcpyDeviceToHost(full_prec1.data(), d_res1_float, num_elem*sizeof(Eigen::half));
- gpu_device.memcpyDeviceToHost(half_prec2.data(), d_res2_half, num_elem*sizeof(Eigen::half));
- gpu_device.memcpyDeviceToHost(full_prec2.data(), d_res2_float, num_elem*sizeof(Eigen::half));
- gpu_device.memcpyDeviceToHost(half_prec3.data(), d_res3_half, num_elem*sizeof(Eigen::half));
- gpu_device.memcpyDeviceToHost(full_prec3.data(), d_res3_float, num_elem*sizeof(Eigen::half));
- gpu_device.synchronize();
-
- for (int i = 0; i < num_elem; ++i) {
- std::cout << "Checking elemwise exp " << i << " input = " << input1(i) << " full = " << full_prec1(i) << " half = " << half_prec1(i) << std::endl;
- VERIFY_IS_APPROX(full_prec1(i), half_prec1(i));
- }
- for (int i = 0; i < num_elem; ++i) {
- std::cout << "Checking elemwise log " << i << " input = " << input2(i) << " full = " << full_prec2(i) << " half = " << half_prec2(i) << std::endl;
- if(std::abs(input2(i)-1.f)<0.05f) // log lacks accurary nearby 1
- VERIFY_IS_APPROX(full_prec2(i)+Eigen::half(0.1f), half_prec2(i)+Eigen::half(0.1f));
- else
- VERIFY_IS_APPROX(full_prec2(i), half_prec2(i));
- }
- for (int i = 0; i < num_elem; ++i) {
- std::cout << "Checking elemwise plog1 " << i << " input = " << input3(i) << " full = " << full_prec3(i) << " half = " << half_prec3(i) << std::endl;
- VERIFY_IS_APPROX(full_prec3(i), half_prec3(i));
- }
- gpu_device.deallocate(d_float1);
- gpu_device.deallocate(d_float2);
- gpu_device.deallocate(d_float3);
- gpu_device.deallocate(d_res1_half);
- gpu_device.deallocate(d_res1_float);
- gpu_device.deallocate(d_res2_half);
- gpu_device.deallocate(d_res2_float);
- gpu_device.deallocate(d_res3_float);
- gpu_device.deallocate(d_res3_half);
-}
-
-template<typename>
-void test_hip_contractions() {
- Eigen::HipStreamDevice stream;
- Eigen::GpuDevice gpu_device(&stream);
- int rows = 23;
- int cols = 23;
- int num_elem = rows*cols;
-
- float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
- float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
- Eigen::half* d_res_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
- Eigen::half* d_res_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
-
- Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float1(
- d_float1, rows, cols);
- Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float2(
- d_float2, rows, cols);
- Eigen::TensorMap<Eigen::Tensor<Eigen::half, 2>, Eigen::Aligned> gpu_res_half(
- d_res_half, rows, cols);
- Eigen::TensorMap<Eigen::Tensor<Eigen::half, 2>, Eigen::Aligned> gpu_res_float(
- d_res_float, rows, cols);
-
- gpu_float1.device(gpu_device) = gpu_float1.random() - gpu_float1.constant(0.5f);
- gpu_float2.device(gpu_device) = gpu_float2.random() - gpu_float2.constant(0.5f);
-
- typedef Tensor<float, 2>::DimensionPair DimPair;
- Eigen::array<DimPair, 1> dims(DimPair(1, 0));
- gpu_res_float.device(gpu_device) = gpu_float1.contract(gpu_float2, dims).cast<Eigen::half>();
- gpu_res_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().contract(gpu_float2.cast<Eigen::half>(), dims);
-
- Tensor<Eigen::half, 2> half_prec(rows, cols);
- Tensor<Eigen::half, 2> full_prec(rows, cols);
- gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(Eigen::half));
- gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(Eigen::half));
- gpu_device.synchronize();
-
- for (int i = 0; i < rows; ++i) {
- for (int j = 0; j < cols; ++j) {
- std::cout << "Checking contract " << i << " " << j << full_prec(i, j) << " " << half_prec(i, j) << std::endl;
- if (numext::abs(full_prec(i, j) - half_prec(i, j)) > Eigen::half(1e-2f)) {
- VERIFY_IS_APPROX(full_prec(i, j), half_prec(i, j));
- }
- }
- }
-
- gpu_device.deallocate(d_float1);
- gpu_device.deallocate(d_float2);
- gpu_device.deallocate(d_res_half);
- gpu_device.deallocate(d_res_float);
-}
-
-template<typename>
-void test_hip_reductions(int size1, int size2, int redux) {
-
- std::cout << "Reducing " << size1 << " by " << size2
- << " tensor along dim " << redux << std::endl;
-
- Eigen::HipStreamDevice stream;
- Eigen::GpuDevice gpu_device(&stream);
- int num_elem = size1*size2;
- int result_size = (redux == 1 ? size1 : size2);
-
- float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
- float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
- Eigen::half* d_res_half = (Eigen::half*)gpu_device.allocate(result_size * sizeof(Eigen::half));
- Eigen::half* d_res_float = (Eigen::half*)gpu_device.allocate(result_size * sizeof(Eigen::half));
-
- Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float1(
- d_float1, size1, size2);
- Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float2(
- d_float2, size1, size2);
- Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res_half(
- d_res_half, result_size);
- Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res_float(
- d_res_float, result_size);
-
- gpu_float1.device(gpu_device) = gpu_float1.random() * 2.0f;
- gpu_float2.device(gpu_device) = gpu_float2.random() * 2.0f;
-
- Eigen::array<int, 1> redux_dim(redux);
- gpu_res_float.device(gpu_device) = gpu_float1.sum(redux_dim).cast<Eigen::half>();
- gpu_res_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().sum(redux_dim);
-
- Tensor<Eigen::half, 1> half_prec(result_size);
- Tensor<Eigen::half, 1> full_prec(result_size);
- gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, result_size*sizeof(Eigen::half));
- gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, result_size*sizeof(Eigen::half));
- gpu_device.synchronize();
-
- for (int i = 0; i < result_size; ++i) {
- std::cout << "EXPECTED " << full_prec(i) << " GOT " << half_prec(i) << std::endl;
- VERIFY_IS_APPROX(full_prec(i), half_prec(i));
- }
-
- gpu_device.deallocate(d_float1);
- gpu_device.deallocate(d_float2);
- gpu_device.deallocate(d_res_half);
- gpu_device.deallocate(d_res_float);
-}
-
-template<typename>
-void test_hip_reductions() {
- test_hip_reductions<void>(13, 13, 0);
- test_hip_reductions<void>(13, 13, 1);
-
- test_hip_reductions<void>(35, 36, 0);
- test_hip_reductions<void>(35, 36, 1);
-
- test_hip_reductions<void>(36, 35, 0);
- test_hip_reductions<void>(36, 35, 1);
-}
-
-template<typename>
-void test_hip_full_reductions() {
- Eigen::HipStreamDevice stream;
- Eigen::GpuDevice gpu_device(&stream);
- int size = 13;
- int num_elem = size*size;
-
- float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
- float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
- Eigen::half* d_res_half = (Eigen::half*)gpu_device.allocate(1 * sizeof(Eigen::half));
- Eigen::half* d_res_float = (Eigen::half*)gpu_device.allocate(1 * sizeof(Eigen::half));
-
- Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float1(
- d_float1, size, size);
- Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float2(
- d_float2, size, size);
- Eigen::TensorMap<Eigen::Tensor<Eigen::half, 0>, Eigen::Aligned> gpu_res_half(
- d_res_half);
- Eigen::TensorMap<Eigen::Tensor<Eigen::half, 0>, Eigen::Aligned> gpu_res_float(
- d_res_float);
-
- gpu_float1.device(gpu_device) = gpu_float1.random();
- gpu_float2.device(gpu_device) = gpu_float2.random();
-
- gpu_res_float.device(gpu_device) = gpu_float1.sum().cast<Eigen::half>();
- gpu_res_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().sum();
-
- Tensor<Eigen::half, 0> half_prec;
- Tensor<Eigen::half, 0> full_prec;
- gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, sizeof(Eigen::half));
- gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, sizeof(Eigen::half));
- gpu_device.synchronize();
-
- VERIFY_IS_APPROX(full_prec(), half_prec());
-
- gpu_res_float.device(gpu_device) = gpu_float1.maximum().cast<Eigen::half>();
- gpu_res_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().maximum();
- gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, sizeof(Eigen::half));
- gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, sizeof(Eigen::half));
- gpu_device.synchronize();
-
- VERIFY_IS_APPROX(full_prec(), half_prec());
-
- gpu_device.deallocate(d_float1);
- gpu_device.deallocate(d_float2);
- gpu_device.deallocate(d_res_half);
- gpu_device.deallocate(d_res_float);
-}
-
-template<typename>
-void test_hip_forced_evals() {
-
- Eigen::HipStreamDevice stream;
- Eigen::GpuDevice gpu_device(&stream);
- int num_elem = 101;
-
- float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
- float* d_res_half1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
- float* d_res_half2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
- float* d_res_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
-
- Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
- d_float, num_elem);
- Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_half1(
- d_res_half1, num_elem);
- Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Unaligned> gpu_res_half2(
- d_res_half2, num_elem);
- Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_float(
- d_res_float, num_elem);
-
- Eigen::array<int, 1> no_bcast;
- no_bcast[0] = 1;
-
- gpu_float.device(gpu_device) = gpu_float.random() - gpu_float.constant(0.5f);
- gpu_res_float.device(gpu_device) = gpu_float.abs();
- gpu_res_half1.device(gpu_device) = gpu_float.cast<Eigen::half>().abs().eval().cast<float>();
- gpu_res_half2.device(gpu_device) = gpu_float.cast<Eigen::half>().abs().broadcast(no_bcast).eval().cast<float>();
-
- Tensor<float, 1> half_prec1(num_elem);
- Tensor<float, 1> half_prec2(num_elem);
- Tensor<float, 1> full_prec(num_elem);
- gpu_device.memcpyDeviceToHost(half_prec1.data(), d_res_half1, num_elem*sizeof(float));
- gpu_device.memcpyDeviceToHost(half_prec2.data(), d_res_half1, num_elem*sizeof(float));
- gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float));
- gpu_device.synchronize();
-
- for (int i = 0; i < num_elem; ++i) {
- std::cout << "Checking forced eval " << i << full_prec(i) << " vs " << half_prec1(i) << " vs " << half_prec2(i) << std::endl;
- VERIFY_IS_APPROX(full_prec(i), half_prec1(i));
- VERIFY_IS_APPROX(full_prec(i), half_prec2(i));
- }
-
- gpu_device.deallocate(d_float);
- gpu_device.deallocate(d_res_half1);
- gpu_device.deallocate(d_res_half2);
- gpu_device.deallocate(d_res_float);
-}
-#endif
-
-
-void test_cxx11_tensor_of_float16_hip()
-{
- CALL_SUBTEST(test_hip_numext<void>());
-
-#ifdef EIGEN_HAS_HIP_FP16
- CALL_SUBTEST(test_hip_conversion<void>());
- CALL_SUBTEST(test_hip_unary<void>());
- CALL_SUBTEST(test_hip_elementwise<void>());
- CALL_SUBTEST(test_hip_trancendental<void>());
- CALL_SUBTEST(test_hip_contractions<void>());
- CALL_SUBTEST(test_hip_reductions<void>());
- CALL_SUBTEST(test_hip_full_reductions<void>());
- CALL_SUBTEST(test_hip_forced_evals<void>());
-#else
- std::cout << "Half floats are not supported by this version of hip: skipping the test" << std::endl;
-#endif
-}
diff --git a/unsupported/test/cxx11_tensor_random_hip.cu b/unsupported/test/cxx11_tensor_random_hip.cu
deleted file mode 100644
index 7d7e72ca2..000000000
--- a/unsupported/test/cxx11_tensor_random_hip.cu
+++ /dev/null
@@ -1,85 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#define EIGEN_TEST_NO_LONGDOUBLE
-#define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_random_hip
-#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
-#define EIGEN_USE_GPU
-
-#include "main.h"
-#include <Eigen/CXX11/Tensor>
-
-
-void test_hip_random_uniform()
-{
- Tensor<float, 2> out(72,97);
- out.setZero();
-
- std::size_t out_bytes = out.size() * sizeof(float);
-
- float* d_out;
- hipMalloc((void**)(&d_out), out_bytes);
-
- Eigen::HipStreamDevice stream;
- Eigen::GpuDevice gpu_device(&stream);
-
- Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_out(d_out, 72,97);
-
- gpu_out.device(gpu_device) = gpu_out.random();
-
- assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
- assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
-
- // For now we just check thes code doesn't crash.
- // TODO: come up with a valid test of randomness
-}
-
-
-void test_hip_random_normal()
-{
- Tensor<float, 2> out(72,97);
- out.setZero();
-
- std::size_t out_bytes = out.size() * sizeof(float);
-
- float* d_out;
- hipMalloc((void**)(&d_out), out_bytes);
-
- Eigen::HipStreamDevice stream;
- Eigen::GpuDevice gpu_device(&stream);
-
- Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_out(d_out, 72,97);
-
- Eigen::internal::NormalRandomGenerator<float> gen(true);
- gpu_out.device(gpu_device) = gpu_out.random(gen);
-
- assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
- assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
-}
-
-static void test_complex()
-{
- Tensor<std::complex<float>, 1> vec(6);
- vec.setRandom();
-
- // Fixme: we should check that the generated numbers follow a uniform
- // distribution instead.
- for (int i = 1; i < 6; ++i) {
- VERIFY_IS_NOT_EQUAL(vec(i), vec(i-1));
- }
-}
-
-
-void test_cxx11_tensor_random_hip()
-{
- CALL_SUBTEST(test_hip_random_uniform());
- CALL_SUBTEST(test_hip_random_normal());
- CALL_SUBTEST(test_complex());
-}
diff --git a/unsupported/test/cxx11_tensor_reduction_hip.cu b/unsupported/test/cxx11_tensor_reduction_hip.cu
deleted file mode 100644
index c5aad05be..000000000
--- a/unsupported/test/cxx11_tensor_reduction_hip.cu
+++ /dev/null
@@ -1,154 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#define EIGEN_TEST_NO_LONGDOUBLE
-#define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_reduction_hip
-#define EIGEN_USE_GPU
-
-#include "main.h"
-#include <unsupported/Eigen/CXX11/Tensor>
-
-
-template<typename Type, int DataLayout>
-static void test_full_reductions() {
-
- Eigen::HipStreamDevice stream;
- Eigen::GpuDevice gpu_device(&stream);
-
- const int num_rows = internal::random<int>(1024, 5*1024);
- const int num_cols = internal::random<int>(1024, 5*1024);
-
- Tensor<Type, 2, DataLayout> in(num_rows, num_cols);
- in.setRandom();
-
- Tensor<Type, 0, DataLayout> full_redux;
- full_redux = in.sum();
-
- std::size_t in_bytes = in.size() * sizeof(Type);
- std::size_t out_bytes = full_redux.size() * sizeof(Type);
- Type* gpu_in_ptr = static_cast<Type*>(gpu_device.allocate(in_bytes));
- Type* gpu_out_ptr = static_cast<Type*>(gpu_device.allocate(out_bytes));
- gpu_device.memcpyHostToDevice(gpu_in_ptr, in.data(), in_bytes);
-
- TensorMap<Tensor<Type, 2, DataLayout> > in_gpu(gpu_in_ptr, num_rows, num_cols);
- TensorMap<Tensor<Type, 0, DataLayout> > out_gpu(gpu_out_ptr);
-
- out_gpu.device(gpu_device) = in_gpu.sum();
-
- Tensor<Type, 0, DataLayout> full_redux_gpu;
- gpu_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_ptr, out_bytes);
- gpu_device.synchronize();
-
- // Check that the CPU and GPU reductions return the same result.
- VERIFY_IS_APPROX(full_redux(), full_redux_gpu());
-
- gpu_device.deallocate(gpu_in_ptr);
- gpu_device.deallocate(gpu_out_ptr);
-}
-
-template<typename Type, int DataLayout>
-static void test_first_dim_reductions() {
- int dim_x = 33;
- int dim_y = 1;
- int dim_z = 128;
-
- Tensor<Type, 3, DataLayout> in(dim_x, dim_y, dim_z);
- in.setRandom();
-
- Eigen::array<int, 1> red_axis;
- red_axis[0] = 0;
- Tensor<Type, 2, DataLayout> redux = in.sum(red_axis);
-
- // Create device
- Eigen::HipStreamDevice stream;
- Eigen::GpuDevice dev(&stream);
-
- // Create data(T)
- Type* in_data = (Type*)dev.allocate(dim_x*dim_y*dim_z*sizeof(Type));
- Type* out_data = (Type*)dev.allocate(dim_z*dim_y*sizeof(Type));
- Eigen::TensorMap<Eigen::Tensor<Type, 3, DataLayout> > gpu_in(in_data, dim_x, dim_y, dim_z);
- Eigen::TensorMap<Eigen::Tensor<Type, 2, DataLayout> > gpu_out(out_data, dim_y, dim_z);
-
- // Perform operation
- dev.memcpyHostToDevice(in_data, in.data(), in.size()*sizeof(Type));
- gpu_out.device(dev) = gpu_in.sum(red_axis);
- gpu_out.device(dev) += gpu_in.sum(red_axis);
- Tensor<Type, 2, DataLayout> redux_gpu(dim_y, dim_z);
- dev.memcpyDeviceToHost(redux_gpu.data(), out_data, gpu_out.size()*sizeof(Type));
- dev.synchronize();
-
- // Check that the CPU and GPU reductions return the same result.
- for (int i = 0; i < gpu_out.size(); ++i) {
- VERIFY_IS_APPROX(2*redux(i), redux_gpu(i));
- }
-
- dev.deallocate(in_data);
- dev.deallocate(out_data);
-}
-
-template<typename Type, int DataLayout>
-static void test_last_dim_reductions() {
- int dim_x = 128;
- int dim_y = 1;
- int dim_z = 33;
-
- Tensor<Type, 3, DataLayout> in(dim_x, dim_y, dim_z);
- in.setRandom();
-
- Eigen::array<int, 1> red_axis;
- red_axis[0] = 2;
- Tensor<Type, 2, DataLayout> redux = in.sum(red_axis);
-
- // Create device
- Eigen::HipStreamDevice stream;
- Eigen::GpuDevice dev(&stream);
-
- // Create data
- Type* in_data = (Type*)dev.allocate(dim_x*dim_y*dim_z*sizeof(Type));
- Type* out_data = (Type*)dev.allocate(dim_x*dim_y*sizeof(Type));
- Eigen::TensorMap<Eigen::Tensor<Type, 3, DataLayout> > gpu_in(in_data, dim_x, dim_y, dim_z);
- Eigen::TensorMap<Eigen::Tensor<Type, 2, DataLayout> > gpu_out(out_data, dim_x, dim_y);
-
- // Perform operation
- dev.memcpyHostToDevice(in_data, in.data(), in.size()*sizeof(Type));
- gpu_out.device(dev) = gpu_in.sum(red_axis);
- gpu_out.device(dev) += gpu_in.sum(red_axis);
- Tensor<Type, 2, DataLayout> redux_gpu(dim_x, dim_y);
- dev.memcpyDeviceToHost(redux_gpu.data(), out_data, gpu_out.size()*sizeof(Type));
- dev.synchronize();
-
- // Check that the CPU and GPU reductions return the same result.
- for (int i = 0; i < gpu_out.size(); ++i) {
- VERIFY_IS_APPROX(2*redux(i), redux_gpu(i));
- }
-
- dev.deallocate(in_data);
- dev.deallocate(out_data);
-}
-
-
-void test_cxx11_tensor_reduction_hip() {
- CALL_SUBTEST((test_full_reductions<float, ColMajor>()));
- CALL_SUBTEST((test_full_reductions<double, ColMajor>()));
- CALL_SUBTEST((test_full_reductions<float, RowMajor>()));
- CALL_SUBTEST((test_full_reductions<double, RowMajor>()));
-
- CALL_SUBTEST((test_first_dim_reductions<float, ColMajor>()));
- CALL_SUBTEST((test_first_dim_reductions<double, ColMajor>()));
- CALL_SUBTEST((test_first_dim_reductions<float, RowMajor>()));
-// Outer reductions of doubles aren't supported just yet.
-// CALL_SUBTEST((test_first_dim_reductions<double, RowMajor>()))
-
- CALL_SUBTEST((test_last_dim_reductions<float, ColMajor>()));
-// Outer reductions of doubles aren't supported just yet.
-// CALL_SUBTEST((test_last_dim_reductions<double, ColMajor>()));
- CALL_SUBTEST((test_last_dim_reductions<float, RowMajor>()));
- CALL_SUBTEST((test_last_dim_reductions<double, RowMajor>()));
-}
diff --git a/unsupported/test/cxx11_tensor_scan_hip.cu b/unsupported/test/cxx11_tensor_scan_hip.cu
deleted file mode 100644
index f4c4f59b9..000000000
--- a/unsupported/test/cxx11_tensor_scan_hip.cu
+++ /dev/null
@@ -1,76 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#define EIGEN_TEST_NO_LONGDOUBLE
-#define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_scan_hip
-#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
-#define EIGEN_USE_GPU
-
-#include "main.h"
-#include <unsupported/Eigen/CXX11/Tensor>
-
-using Eigen::Tensor;
-typedef Tensor<float, 1>::DimensionPair DimPair;
-
-template<int DataLayout>
-void test_hip_cumsum(int m_size, int k_size, int n_size)
-{
- std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl;
- Tensor<float, 3, DataLayout> t_input(m_size, k_size, n_size);
- Tensor<float, 3, DataLayout> t_result(m_size, k_size, n_size);
- Tensor<float, 3, DataLayout> t_result_gpu(m_size, k_size, n_size);
-
- t_input.setRandom();
-
- std::size_t t_input_bytes = t_input.size() * sizeof(float);
- std::size_t t_result_bytes = t_result.size() * sizeof(float);
-
- float* d_t_input;
- float* d_t_result;
-
- hipMalloc((void**)(&d_t_input), t_input_bytes);
- hipMalloc((void**)(&d_t_result), t_result_bytes);
-
- hipMemcpy(d_t_input, t_input.data(), t_input_bytes, hipMemcpyHostToDevice);
-
- Eigen::HipStreamDevice stream;
- Eigen::GpuDevice gpu_device(&stream);
-
- Eigen::TensorMap<Eigen::Tensor<float, 3, DataLayout> >
- gpu_t_input(d_t_input, Eigen::array<int, 3>(m_size, k_size, n_size));
- Eigen::TensorMap<Eigen::Tensor<float, 3, DataLayout> >
- gpu_t_result(d_t_result, Eigen::array<int, 3>(m_size, k_size, n_size));
-
- gpu_t_result.device(gpu_device) = gpu_t_input.cumsum(1);
- t_result = t_input.cumsum(1);
-
- hipMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, hipMemcpyDeviceToHost);
- for (DenseIndex i = 0; i < t_result.size(); i++) {
- if (fabs(t_result(i) - t_result_gpu(i)) < 1e-4f) {
- continue;
- }
- if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), 1e-4f)) {
- continue;
- }
- std::cout << "mismatch detected at index " << i << ": " << t_result(i)
- << " vs " << t_result_gpu(i) << std::endl;
- assert(false);
- }
-
- hipFree((void*)d_t_input);
- hipFree((void*)d_t_result);
-}
-
-
-void test_cxx11_tensor_scan_hip()
-{
- CALL_SUBTEST(test_hip_cumsum<ColMajor>(128, 128, 128));
- CALL_SUBTEST(test_hip_cumsum<RowMajor>(128, 128, 128));
-}