aboutsummaryrefslogtreecommitdiffhomepage
path: root/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
diff options
context:
space:
mode:
authorGravatar Benoit Steiner <benoit.steiner.goog@gmail.com>2017-03-02 10:47:29 -0800
committerGravatar Benoit Steiner <benoit.steiner.goog@gmail.com>2017-03-02 10:47:29 -0800
commita71943b9a432c8962f025b56313584f33111ace4 (patch)
treefe9ac180904471d768fc4beae596a5ced8c3e2e4 /unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
parent09ae0e6586b978ce1ea9960984e1228dfc8971b8 (diff)
Made the Tensor code compile with clang 3.9
Diffstat (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h')
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h101
1 files changed, 48 insertions, 53 deletions
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
index d65dbb40f..c04b784a4 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
@@ -529,7 +529,6 @@ EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rh
float2 rhs_shmem2[][8], const Index m_size,
const Index n_size, const Index k_size,
const Index base_m, const Index base_n) {
- typedef float Scalar;
// prefetch registers
float4 lhs_pf0, rhs_pf0;
@@ -540,27 +539,27 @@ EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rh
}
-#define prefetch_lhs(reg, row, col) \
- if (!CHECK_LHS_BOUNDARY) { \
- if (col < k_size) { \
- reg =lhs.loadPacket<Unaligned>(row, col); \
- } \
- } else { \
- if (col < k_size) { \
- if (row + 3 < m_size) { \
- reg =lhs.loadPacket<Unaligned>(row, col); \
- } else if (row + 2 < m_size) { \
- reg.x =lhs(row + 0, col); \
- reg.y =lhs(row + 1, col); \
- reg.z =lhs(row + 2, col); \
- } else if (row + 1 < m_size) { \
- reg.x =lhs(row + 0, col); \
- reg.y =lhs(row + 1, col); \
- } else if (row < m_size) { \
- reg.x =lhs(row + 0, col); \
- } \
- } \
- } \
+#define prefetch_lhs(reg, row, col) \
+ if (!CHECK_LHS_BOUNDARY) { \
+ if (col < k_size) { \
+ reg =lhs.template loadPacket<Unaligned>(row, col); \
+ } \
+ } else { \
+ if (col < k_size) { \
+ if (row + 3 < m_size) { \
+ reg =lhs.template loadPacket<Unaligned>(row, col); \
+ } else if (row + 2 < m_size) { \
+ reg.x =lhs(row + 0, col); \
+ reg.y =lhs(row + 1, col); \
+ reg.z =lhs(row + 2, col); \
+ } else if (row + 1 < m_size) { \
+ reg.x =lhs(row + 0, col); \
+ reg.y =lhs(row + 1, col); \
+ } else if (row < m_size) { \
+ reg.x =lhs(row + 0, col); \
+ } \
+ } \
+ } \
Index lhs_vert = base_m+threadIdx.x*4;
@@ -578,7 +577,7 @@ EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rh
if (!CHECK_RHS_BOUNDARY) {
if ((rhs_vert + 3) < k_size) {
// just CHECK_RHS_BOUNDARY
- rhs_pf0 = rhs.loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
+ rhs_pf0 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
} else if (rhs_vert + 2 < k_size) {
// just CHECK_RHS_BOUNDARY
rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
@@ -593,7 +592,7 @@ EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rh
} else {
if (rhs_horiz0 < n_size) {
if ((rhs_vert + 3) < k_size) {
- rhs_pf0 = rhs.loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
+ rhs_pf0 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
} else if ((rhs_vert + 2) < k_size) {
rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
@@ -766,7 +765,6 @@ EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
float2 rhs_shmem2[][8], const Index m_size,
const Index n_size, const Index k_size,
const Index base_m, const Index base_n) {
- typedef float Scalar;
// prefetch registers
float4 lhs_pf0, lhs_pf1, lhs_pf2, lhs_pf3;
@@ -790,37 +788,37 @@ EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
if (!CHECK_LHS_BOUNDARY) {
if ((threadIdx.y/4+k+24) < k_size) {
- lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
- lhs_pf1 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
- lhs_pf2 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
- lhs_pf3 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+24));
+ lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
+ lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
+ lhs_pf2 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
+ lhs_pf3 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+24));
} else if ((threadIdx.y/4+k+16) < k_size) {
- lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
- lhs_pf1 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
- lhs_pf2 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
+ lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
+ lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
+ lhs_pf2 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
} else if ((threadIdx.y/4+k+8) < k_size) {
- lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
- lhs_pf1 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
+ lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
+ lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
} else if ((threadIdx.y/4+k) < k_size) {
- lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
+ lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
}
} else {
// just CHECK_LHS_BOUNDARY
if (lhs_vert + 3 < m_size) {
if ((threadIdx.y/4+k+24) < k_size) {
- lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
- lhs_pf1 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
- lhs_pf2 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
- lhs_pf3 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+24));
+ lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
+ lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
+ lhs_pf2 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
+ lhs_pf3 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+24));
} else if ((threadIdx.y/4+k+16) < k_size) {
- lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
- lhs_pf1 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
- lhs_pf2 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
+ lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
+ lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
+ lhs_pf2 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
} else if ((threadIdx.y/4+k+8) < k_size) {
- lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
- lhs_pf1 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
+ lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
+ lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
} else if ((threadIdx.y/4+k) < k_size) {
- lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
+ lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
}
} else if (lhs_vert + 2 < m_size) {
if ((threadIdx.y/4+k+24) < k_size) {
@@ -909,8 +907,8 @@ EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
if (!CHECK_RHS_BOUNDARY) {
if ((rhs_vert + 3) < k_size) {
// just CHECK_RHS_BOUNDARY
- rhs_pf0 = rhs.loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
- rhs_pf1 = rhs.loadPacket<Unaligned>(rhs_vert, rhs_horiz1);
+ rhs_pf0 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
+ rhs_pf1 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz1);
} else if (rhs_vert + 2 < k_size) {
// just CHECK_RHS_BOUNDARY
rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
@@ -932,8 +930,8 @@ EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
if (rhs_horiz1 < n_size) {
if ((rhs_vert + 3) < k_size) {
// just CHECK_RHS_BOUNDARY
- rhs_pf0 = rhs.loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
- rhs_pf1 = rhs.loadPacket<Unaligned>(rhs_vert, rhs_horiz1);
+ rhs_pf0 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
+ rhs_pf1 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz1);
} else if (rhs_vert + 2 < k_size) {
// just CHECK_RHS_BOUNDARY
rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
@@ -954,7 +952,7 @@ EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
} else if (rhs_horiz0 < n_size) {
if ((rhs_vert + 3) < k_size) {
// just CHECK_RHS_BOUNDARY
- rhs_pf0 = rhs.loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
+ rhs_pf0 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
} else if ((rhs_vert + 2) < k_size) {
// just CHECK_RHS_BOUNDARY
rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
@@ -1137,9 +1135,6 @@ EigenFloatContractionKernel(const LhsMapper lhs, const RhsMapper rhs,
typedef float2 LHS_MEM[64][32];
typedef float2 RHS_MEM[128][8];
- typedef float2 LHS_MEM16x16[32][16];
- typedef float2 RHS_MEM16x16[64][8];
-
const Index m_block_idx = blockIdx.x;
const Index n_block_idx = blockIdx.y;