author    Rasmus Munk Larsen <rmlarsen@google.com>  2018-09-13 16:18:52 -0700
committer Rasmus Munk Larsen <rmlarsen@google.com>  2018-09-13 16:18:52 -0700
commit    b3f4c067d96ccac919a2789113c3ac87eda43a00 (patch)
tree      52930fdad5535663731bd2247f771c8f0adf7f18
parent    2b070181401193f562f38179c1a3bb81496485cf (diff)
parent    53568e3549e94269df6f6d71ca089161cfa097da (diff)
Merge
-rw-r--r--  Eigen/src/Core/arch/GPU/PacketMathHalf.h | 74
-rwxr-xr-x  Eigen/src/Core/util/DisableStupidWarnings.h | 4
-rw-r--r--  Eigen/src/Core/util/Macros.h | 5
-rw-r--r--  Eigen/src/IterativeLinearSolvers/ConjugateGradient.h | 5
-rw-r--r--  Eigen/src/UmfPackSupport/UmfPackSupport.h | 3
-rw-r--r--  test/gpu_common.h | 6
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h | 2
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h | 1
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h | 18
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h | 145
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h | 339
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h | 157
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h | 2
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h | 1
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h | 1
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h | 2
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h | 1
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h | 2
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h | 1
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h | 76
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h | 6
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h | 1
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h | 1
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h | 1
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h | 6
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h | 1
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h | 222
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h | 1
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h | 2
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h | 350
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h | 1
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h | 1
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h | 424
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorRef.h | 3
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h | 2
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorScan.h | 1
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h | 253
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h | 2
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h | 1
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h | 1
-rw-r--r--  unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h | 7
-rw-r--r--  unsupported/Eigen/CXX11/src/ThreadPool/ThreadLocal.h | 3
-rwxr-xr-x  unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h | 3
-rw-r--r--  unsupported/test/cxx11_tensor_block_access.cpp | 111
-rw-r--r--  unsupported/test/cxx11_tensor_executor.cpp | 509
-rw-r--r--  unsupported/test/cxx11_tensor_shuffling.cpp | 10
46 files changed, 2527 insertions(+), 241 deletions(-)
diff --git a/Eigen/src/Core/arch/GPU/PacketMathHalf.h b/Eigen/src/Core/arch/GPU/PacketMathHalf.h
index b0a72e1f9..c4feda87d 100644
--- a/Eigen/src/Core/arch/GPU/PacketMathHalf.h
+++ b/Eigen/src/Core/arch/GPU/PacketMathHalf.h
@@ -43,7 +43,7 @@ template<> struct packet_traits<Eigen::half> : default_packet_traits
template<> struct unpacket_traits<half2> { typedef Eigen::half type; enum {size=2, alignment=Aligned16}; typedef half2 half; };
-template<> __device__ EIGEN_STRONG_INLINE half2 pset1<half2>(const Eigen::half& from) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pset1<half2>(const Eigen::half& from) {
#if defined(EIGEN_HIP_DEVICE_COMPILE)
@@ -58,29 +58,29 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pset1<half2>(const Eigen::half&
#endif
}
-template<> __device__ EIGEN_STRONG_INLINE half2 pload<half2>(const Eigen::half* from) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pload<half2>(const Eigen::half* from) {
return *reinterpret_cast<const half2*>(from);
}
-template<> __device__ EIGEN_STRONG_INLINE half2 ploadu<half2>(const Eigen::half* from) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploadu<half2>(const Eigen::half* from) {
return __halves2half2(from[0], from[1]);
}
-template<> __device__ EIGEN_STRONG_INLINE half2 ploaddup<half2>(const Eigen::half* from) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploaddup<half2>(const Eigen::half* from) {
return __halves2half2(from[0], from[0]);
}
-template<> __device__ EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const half2& from) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const half2& from) {
*reinterpret_cast<half2*>(to) = from;
}
-template<> __device__ EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const half2& from) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const half2& from) {
to[0] = __low2half(from);
to[1] = __high2half(from);
}
template<>
- __device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Aligned>(const Eigen::half* from) {
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Aligned>(const Eigen::half* from) {
#if defined(EIGEN_HIP_DEVICE_COMPILE)
@@ -102,7 +102,7 @@ template<>
}
template<>
-__device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Unaligned>(const Eigen::half* from) {
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Unaligned>(const Eigen::half* from) {
#if defined(EIGEN_HIP_DEVICE_COMPILE)
@@ -123,20 +123,20 @@ __device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Unaligned>(const Eigen::ha
#endif
}
-template<> __device__ EIGEN_STRONG_INLINE half2 pgather<Eigen::half, half2>(const Eigen::half* from, Index stride) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pgather<Eigen::half, half2>(const Eigen::half* from, Index stride) {
return __halves2half2(from[0*stride], from[1*stride]);
}
-template<> __device__ EIGEN_STRONG_INLINE void pscatter<Eigen::half, half2>(Eigen::half* to, const half2& from, Index stride) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<Eigen::half, half2>(Eigen::half* to, const half2& from, Index stride) {
to[stride*0] = __low2half(from);
to[stride*1] = __high2half(from);
}
-template<> __device__ EIGEN_STRONG_INLINE Eigen::half pfirst<half2>(const half2& a) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half pfirst<half2>(const half2& a) {
return __low2half(a);
}
-template<> __device__ EIGEN_STRONG_INLINE half2 pabs<half2>(const half2& a) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pabs<half2>(const half2& a) {
half2 result;
unsigned temp = *(reinterpret_cast<const unsigned*>(&(a)));
*(reinterpret_cast<unsigned*>(&(result))) = temp & 0x7FFF7FFF;
@@ -144,7 +144,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pabs<half2>(const half2& a) {
}
-__device__ EIGEN_STRONG_INLINE void
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void
ptranspose(PacketBlock<half2,2>& kernel) {
__half a1 = __low2half(kernel.packet[0]);
__half a2 = __high2half(kernel.packet[0]);
@@ -154,7 +154,7 @@ ptranspose(PacketBlock<half2,2>& kernel) {
kernel.packet[1] = __halves2half2(a2, b2);
}
-template<> __device__ EIGEN_STRONG_INLINE half2 plset<half2>(const Eigen::half& a) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset<half2>(const Eigen::half& a) {
#if defined(EIGEN_HIP_DEVICE_COMPILE)
return __halves2half2(a, __hadd(a, __float2half(1.0f)));
@@ -171,7 +171,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 plset<half2>(const Eigen::half&
#endif
}
-template<> __device__ EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, const half2& b) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, const half2& b) {
#if defined(EIGEN_HIP_DEVICE_COMPILE)
return __hadd2(a, b);
@@ -193,7 +193,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, cons
#endif
}
-template<> __device__ EIGEN_STRONG_INLINE half2 psub<half2>(const half2& a, const half2& b) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psub<half2>(const half2& a, const half2& b) {
#if defined(EIGEN_HIP_DEVICE_COMPILE)
return __hsub2(a, b);
@@ -215,7 +215,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 psub<half2>(const half2& a, cons
#endif
}
-template<> __device__ EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
#if defined(EIGEN_HIP_DEVICE_COMPILE)
return __hneg2(a);
@@ -233,9 +233,9 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
#endif
}
-template<> __device__ EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; }
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; }
-template<> __device__ EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, const half2& b) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, const half2& b) {
#if defined(EIGEN_HIP_DEVICE_COMPILE)
return __hmul2(a, b);
@@ -257,7 +257,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, cons
#endif
}
-template<> __device__ EIGEN_STRONG_INLINE half2 pmadd<half2>(const half2& a, const half2& b, const half2& c) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmadd<half2>(const half2& a, const half2& b, const half2& c) {
#if defined(EIGEN_HIP_DEVICE_COMPILE)
return __hfma2(a, b, c);
@@ -281,7 +281,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pmadd<half2>(const half2& a, con
#endif
}
-template<> __device__ EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a, const half2& b) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a, const half2& b) {
#if defined(EIGEN_HIP_DEVICE_COMPILE)
#if defined(EIGEN_HAS_OLD_HIP_FP16)
@@ -303,7 +303,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a, cons
#endif
}
-template<> __device__ EIGEN_STRONG_INLINE half2 pmin<half2>(const half2& a, const half2& b) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin<half2>(const half2& a, const half2& b) {
float a1 = __low2float(a);
float a2 = __high2float(a);
float b1 = __low2float(b);
@@ -313,7 +313,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pmin<half2>(const half2& a, cons
return __halves2half2(r1, r2);
}
-template<> __device__ EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a, const half2& b) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a, const half2& b) {
float a1 = __low2float(a);
float a2 = __high2float(a);
float b1 = __low2float(b);
@@ -323,7 +323,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a, cons
return __halves2half2(r1, r2);
}
-template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux<half2>(const half2& a) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux<half2>(const half2& a) {
#if defined(EIGEN_HIP_DEVICE_COMPILE)
return __hadd(__low2half(a), __high2half(a));
@@ -341,7 +341,7 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux<half2>(const half2&
#endif
}
-template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_max<half2>(const half2& a) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max<half2>(const half2& a) {
#if defined(EIGEN_HIP_DEVICE_COMPILE)
__half first = __low2half(a);
@@ -363,7 +363,7 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_max<half2>(const ha
#endif
}
-template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_min<half2>(const half2& a) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min<half2>(const half2& a) {
#if defined(EIGEN_HIP_DEVICE_COMPILE)
__half first = __low2half(a);
@@ -385,7 +385,7 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_min<half2>(const ha
#endif
}
-template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_mul<half2>(const half2& a) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_mul<half2>(const half2& a) {
#if defined(EIGEN_HIP_DEVICE_COMPILE)
return __hmul(__low2half(a), __high2half(a));
@@ -403,7 +403,7 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_mul<half2>(const ha
#endif
}
-template<> __device__ EIGEN_STRONG_INLINE half2 plog1p<half2>(const half2& a) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog1p<half2>(const half2& a) {
float a1 = __low2float(a);
float a2 = __high2float(a);
float r1 = log1pf(a1);
@@ -411,7 +411,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 plog1p<half2>(const half2& a) {
return __floats2half2_rn(r1, r2);
}
-template<> __device__ EIGEN_STRONG_INLINE half2 pexpm1<half2>(const half2& a) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexpm1<half2>(const half2& a) {
float a1 = __low2float(a);
float a2 = __high2float(a);
float r1 = expm1f(a1);
@@ -422,29 +422,29 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pexpm1<half2>(const half2& a) {
#if (EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \
defined(EIGEN_HIP_DEVICE_COMPILE)
-template<> __device__ EIGEN_STRONG_INLINE
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
half2 plog<half2>(const half2& a) {
return h2log(a);
}
-template<> __device__ EIGEN_STRONG_INLINE
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
half2 pexp<half2>(const half2& a) {
return h2exp(a);
}
-template<> __device__ EIGEN_STRONG_INLINE
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
half2 psqrt<half2>(const half2& a) {
return h2sqrt(a);
}
-template<> __device__ EIGEN_STRONG_INLINE
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
half2 prsqrt<half2>(const half2& a) {
return h2rsqrt(a);
}
#else
-template<> __device__ EIGEN_STRONG_INLINE half2 plog<half2>(const half2& a) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog<half2>(const half2& a) {
float a1 = __low2float(a);
float a2 = __high2float(a);
float r1 = logf(a1);
@@ -452,7 +452,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 plog<half2>(const half2& a) {
return __floats2half2_rn(r1, r2);
}
-template<> __device__ EIGEN_STRONG_INLINE half2 pexp<half2>(const half2& a) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexp<half2>(const half2& a) {
float a1 = __low2float(a);
float a2 = __high2float(a);
float r1 = expf(a1);
@@ -460,7 +460,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pexp<half2>(const half2& a) {
return __floats2half2_rn(r1, r2);
}
-template<> __device__ EIGEN_STRONG_INLINE half2 psqrt<half2>(const half2& a) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psqrt<half2>(const half2& a) {
float a1 = __low2float(a);
float a2 = __high2float(a);
float r1 = sqrtf(a1);
@@ -468,7 +468,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 psqrt<half2>(const half2& a) {
return __floats2half2_rn(r1, r2);
}
-template<> __device__ EIGEN_STRONG_INLINE half2 prsqrt<half2>(const half2& a) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 prsqrt<half2>(const half2& a) {
float a1 = __low2float(a);
float a2 = __high2float(a);
float r1 = rsqrtf(a1);
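
The hunk above mechanically replaces the CUDA-only __device__ qualifier with EIGEN_DEVICE_FUNC so the same half2 packet specializations build under both CUDA and HIP. A minimal sketch of that annotation pattern, assuming the GPU compilers define __CUDACC__ / __HIPCC__ (illustrative only, not Eigen's exact definition in Macros.h):

    // Sketch: one annotation macro that adds host+device qualifiers only
    // when a GPU compiler is active, and vanishes for plain host builds.
    #if defined(__CUDACC__) || defined(__HIPCC__)
      #define EIGEN_DEVICE_FUNC __host__ __device__
    #else
      #define EIGEN_DEVICE_FUNC
    #endif

    // The annotated function is callable from host and device code alike:
    EIGEN_DEVICE_FUNC inline float twice(float x) { return 2.0f * x; }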
diff --git a/Eigen/src/Core/util/DisableStupidWarnings.h b/Eigen/src/Core/util/DisableStupidWarnings.h
index d04b52649..6e93bbc0f 100755
--- a/Eigen/src/Core/util/DisableStupidWarnings.h
+++ b/Eigen/src/Core/util/DisableStupidWarnings.h
@@ -52,6 +52,10 @@
#endif
// g++ warns about local variables shadowing member functions, which is too strict
#pragma GCC diagnostic ignored "-Wshadow"
+ #if __GNUC__ == 4 && __GNUC_MINOR__ < 8
+ // Up to g++-4.7 there are warnings when comparing an unsigned int against 0, even in templated functions:
+ #pragma GCC diagnostic ignored "-Wtype-limits"
+ #endif
#if __GNUC__>=6
#pragma GCC diagnostic ignored "-Wignored-attributes"
#endif
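
For context, a minimal reproduction of the warning class the new pragma silences on old g++ (hypothetical function, not from Eigen):

    // With g++ <= 4.7, the unsigned instantiation triggers -Wtype-limits:
    // "comparison of unsigned expression >= 0 is always true", even though
    // the check does real work for signed instantiations.
    template <typename Index>
    bool is_valid(Index i) { return i >= 0; }

    bool a = is_valid<int>(-1);       // false: the comparison is meaningful
    bool b = is_valid<unsigned>(3u);  // always true: vacuous -> warning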
diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h
index bcdede61e..3af6c4e37 100644
--- a/Eigen/src/Core/util/Macros.h
+++ b/Eigen/src/Core/util/Macros.h
@@ -533,8 +533,11 @@
#endif
// Does the compiler support result_of?
+// It's likely that MSVC 2013 supports result_of, but I couldn't find a good source for that,
+// so let's be conservative.
#ifndef EIGEN_HAS_STD_RESULT_OF
-#if EIGEN_MAX_CPP_VER>=11 && ((__has_feature(cxx_lambdas) || (defined(__cplusplus) && __cplusplus >= 201103L)))
+#if EIGEN_MAX_CPP_VER>=11 && \
+ (__has_feature(cxx_lambdas) || (defined(__cplusplus) && __cplusplus >= 201103L) || EIGEN_COMP_MSVC >= 1900)
#define EIGEN_HAS_STD_RESULT_OF 1
#else
#define EIGEN_HAS_STD_RESULT_OF 0
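
Downstream, the macro gates uses of std::result_of; a hedged sketch of the consuming side (simplified stand-in, not Eigen's actual call site):

    #include <type_traits>

    #if EIGEN_HAS_STD_RESULT_OF
    // The standard trait deduces the type of invoking F with an X argument,
    // e.g. result_of_call<float(*)(float), float>::type is float.
    template <typename F, typename X>
    struct result_of_call {
      typedef typename std::result_of<F(X)>::type type;
    };
    #endif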
diff --git a/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h b/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h
index 395daa8e4..f7ce47134 100644
--- a/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h
+++ b/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h
@@ -50,7 +50,8 @@ void conjugate_gradient(const MatrixType& mat, const Rhs& rhs, Dest& x,
tol_error = 0;
return;
}
- RealScalar threshold = tol*tol*rhsNorm2;
+ const RealScalar considerAsZero = (std::numeric_limits<RealScalar>::min)();
+ RealScalar threshold = numext::maxi(tol*tol*rhsNorm2,considerAsZero);
RealScalar residualNorm2 = residual.squaredNorm();
if (residualNorm2 < threshold)
{
@@ -58,7 +59,7 @@ void conjugate_gradient(const MatrixType& mat, const Rhs& rhs, Dest& x,
tol_error = sqrt(residualNorm2 / rhsNorm2);
return;
}
-
+
VectorType p(n);
p = precond.solve(residual); // initial search direction
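
The rationale for the clamp: with a sufficiently small right-hand side, tol*tol*rhsNorm2 underflows to zero, and a zero threshold makes the stopping test residualNorm2 < threshold unsatisfiable, so the solver could only exit at the iteration cap. A standalone toy demonstration in double precision (not Eigen code):

    #include <algorithm>
    #include <cstdio>
    #include <limits>

    int main() {
      const double tol = 1e-10;
      const double rhsNorm2 = 1e-308;             // tiny but nonzero ||rhs||^2
      const double naive = tol * tol * rhsNorm2;  // 1e-328 underflows to 0.0
      const double considerAsZero = std::numeric_limits<double>::min();
      const double threshold = std::max(naive, considerAsZero);
      // No representable positive residual satisfies residualNorm2 < 0.0,
      // so without the clamp the loop could only terminate at maxIters.
      std::printf("naive=%g threshold=%g\n", naive, threshold);
    }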
diff --git a/Eigen/src/UmfPackSupport/UmfPackSupport.h b/Eigen/src/UmfPackSupport/UmfPackSupport.h
index ba10a9318..e3a333f80 100644
--- a/Eigen/src/UmfPackSupport/UmfPackSupport.h
+++ b/Eigen/src/UmfPackSupport/UmfPackSupport.h
@@ -613,7 +613,6 @@ bool UmfPackLU<MatrixType>::_solve_impl(const MatrixBase<BDerived> &b, MatrixBas
eigen_assert((XDerived::Flags&RowMajorBit)==0 && "UmfPackLU backend does not support non col-major result yet");
eigen_assert(b.derived().data() != x.derived().data() && " Umfpack does not support inplace solve");
- StorageIndex errorCode;
Scalar* x_ptr = 0;
Matrix<Scalar,Dynamic,1> x_tmp;
if(x.innerStride()!=1)
@@ -625,7 +624,7 @@ bool UmfPackLU<MatrixType>::_solve_impl(const MatrixBase<BDerived> &b, MatrixBas
{
if(x.innerStride()==1)
x_ptr = &x.col(j).coeffRef(0);
- errorCode = umfpack_solve(UMFPACK_A,
+ StorageIndex errorCode = umfpack_solve(UMFPACK_A,
mp_matrix.outerIndexPtr(), mp_matrix.innerIndexPtr(), mp_matrix.valuePtr(),
x_ptr, &b.const_cast_derived().col(j).coeffRef(0),
m_numeric, m_control.data(), m_umfpackInfo.data());
diff --git a/test/gpu_common.h b/test/gpu_common.h
index 3aac49e96..79d4ea694 100644
--- a/test/gpu_common.h
+++ b/test/gpu_common.h
@@ -61,9 +61,9 @@ void run_on_gpu(const Kernel& ker, int n, const Input& in, Output& out)
gpuDeviceSynchronize();
#ifdef EIGEN_USE_HIP
- hipLaunchKernelGGL(run_on_gpu_meta_kernel<Kernel,
- typename std::decay<decltype(*d_in)>::type,
- typename std::decay<decltype(*d_out)>::type>,
+ hipLaunchKernelGGL(HIP_KERNEL_NAME(run_on_gpu_meta_kernel<Kernel,
+ typename std::decay<decltype(*d_in)>::type,
+ typename std::decay<decltype(*d_out)>::type>),
dim3(Grids), dim3(Blocks), 0, 0, ker, n, d_in, d_out);
#else
run_on_gpu_meta_kernel<<<Grids,Blocks>>>(ker, n, d_in, d_out);
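
HIP_KERNEL_NAME is needed because hipLaunchKernelGGL is a macro, and the preprocessor splits macro arguments at top-level commas; the commas inside the kernel's template argument list are not protected by angle brackets. A toy reproduction with stand-in macros (HIP's real HIP_KERNEL_NAME is, to my knowledge, an identity wrapper of this shape):

    #define LAUNCH(kernel, arg) kernel(arg)  // stand-in for hipLaunchKernelGGL
    #define KERNEL_NAME(...) __VA_ARGS__     // identity wrapper, as in HIP

    template <typename A, typename B> void run(int) {}

    void demo() {
      // LAUNCH(run<int, float>, 0);         // error: the comma inside <...>
      //                                     // splits this into three arguments
      LAUNCH(KERNEL_NAME(run<int, float>), 0);  // OK: parentheses group commas
    }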
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h
index c0f33ba2d..ea3ab329d 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h
@@ -87,6 +87,7 @@ struct TensorEvaluator<const TensorIndexTupleOp<ArgType>, Device>
IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/ false,
PacketAccess = /*TensorEvaluator<ArgType, Device>::PacketAccess*/ false,
BlockAccess = false,
+ PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
@@ -220,6 +221,7 @@ struct TensorEvaluator<const TensorTupleReducerOp<ReduceOp, Dims, ArgType>, Devi
IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/ false,
PacketAccess = /*TensorEvaluator<ArgType, Device>::PacketAccess*/ false,
BlockAccess = false,
+ PreferBlockAccess = false,
Layout = TensorEvaluator<const TensorReductionOp<ReduceOp, Dims, const TensorIndexTupleOp<ArgType> >, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h
index 442639868..5110e99ee 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h
@@ -109,6 +109,7 @@ struct TensorEvaluator<const TensorTupleReducerDeviceOp<StrideDims, ArgType>, Sy
IsAligned = false,
PacketAccess = false,
BlockAccess = false,
+ PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, SyclKernelDevice>::Layout,
CoordAccess = false,
RawAccess = false
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h
index bcaf5c97f..06bf422c5 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h
@@ -102,14 +102,16 @@ struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device>
static const int NumDims = XprType::NumDims;
enum {
- IsAligned = TensorEvaluator<LeftArgType, Device>::IsAligned &
- TensorEvaluator<RightArgType, Device>::IsAligned,
- PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess &
- TensorEvaluator<RightArgType, Device>::PacketAccess,
- BlockAccess = TensorEvaluator<LeftArgType, Device>::BlockAccess &
- TensorEvaluator<RightArgType, Device>::BlockAccess,
- Layout = TensorEvaluator<LeftArgType, Device>::Layout,
- RawAccess = TensorEvaluator<LeftArgType, Device>::RawAccess
+ IsAligned = TensorEvaluator<LeftArgType, Device>::IsAligned &
+ TensorEvaluator<RightArgType, Device>::IsAligned,
+ PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess &
+ TensorEvaluator<RightArgType, Device>::PacketAccess,
+ BlockAccess = TensorEvaluator<LeftArgType, Device>::BlockAccess &
+ TensorEvaluator<RightArgType, Device>::BlockAccess,
+ PreferBlockAccess = TensorEvaluator<LeftArgType, Device>::PreferBlockAccess |
+ TensorEvaluator<RightArgType, Device>::PreferBlockAccess,
+ Layout = TensorEvaluator<LeftArgType, Device>::Layout,
+ RawAccess = TensorEvaluator<LeftArgType, Device>::RawAccess
};
typedef typename internal::TensorBlock<
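
Note the combination rule for the new trait: PreferBlockAccess propagates with OR while PacketAccess requires AND. A simplified sketch with stand-in evaluators (not Eigen's real types):

    // Stand-ins for the two operand evaluators of an assignment expression.
    struct LhsEval { enum { PacketAccess = 1, PreferBlockAccess = 0 }; };
    struct RhsEval { enum { PacketAccess = 1, PreferBlockAccess = 1 }; };

    struct AssignEval {
      enum {
        // Vectorized access needs support on BOTH sides: bitwise AND.
        PacketAccess = LhsEval::PacketAccess & RhsEval::PacketAccess,
        // Block evaluation pays off if EITHER side prefers it: bitwise OR.
        PreferBlockAccess =
            LhsEval::PreferBlockAccess | RhsEval::PreferBlockAccess
      };
    };

    static_assert(AssignEval::PacketAccess == 1, "both sides vectorizable");
    static_assert(AssignEval::PreferBlockAccess == 1, "one side prefers blocks");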
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h
index 9b92b6179..ee70d1d76 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h
@@ -376,6 +376,147 @@ class TensorBlockWriter : public TensorBlockIO<Scalar, StorageIndex, NumDims,
};
/**
+ * \class TensorBlockCwiseUnaryOp
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Carries out a cwise unary op on a number of coefficients.
+ *
+ * This class reads strided input from the argument, and writes the
+ * result of the cwise unary op to the strided output array.
+ *
+ */
+struct TensorBlockCwiseUnaryOp {
+ template <typename StorageIndex, typename UnaryFunctor,
+ typename OutputScalar, typename InputScalar>
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
+ const UnaryFunctor& functor, const StorageIndex num_coeff,
+ const StorageIndex output_index, const StorageIndex output_stride,
+ OutputScalar* output_data, const StorageIndex input_index,
+ const StorageIndex input_stride, const InputScalar* input_data) {
+ typedef const Eigen::Array<InputScalar, Dynamic, 1> Input;
+ typedef Eigen::Array<OutputScalar, Dynamic, 1> Output;
+
+ typedef Eigen::Map<Input, 0, InnerStride<> > InputMap;
+ typedef Eigen::Map<Output, 0, InnerStride<> > OutputMap;
+
+ const InputScalar* input_base = &input_data[input_index];
+ OutputScalar* output_base = &output_data[output_index];
+
+ const InputMap input(input_base, num_coeff, InnerStride<>(input_stride));
+ OutputMap output(output_base, num_coeff, InnerStride<>(output_stride));
+
+ output = Eigen::CwiseUnaryOp<UnaryFunctor, InputMap>(input, functor);
+ }
+};
+
+/**
+ * \class TensorBlockCwiseUnaryIO
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Tensor block IO class for carrying out cwise unary ops.
+ *
+ * This class carries out the unary op on given blocks.
+ */
+template <typename UnaryFunctor, typename StorageIndex, typename OutputScalar,
+ int NumDims, int Layout>
+struct TensorBlockCwiseUnaryIO {
+ typedef typename internal::TensorBlock<OutputScalar, StorageIndex, NumDims,
+ Layout>::Dimensions Dimensions;
+
+ struct BlockIteratorState {
+ StorageIndex output_stride, output_span;
+ StorageIndex input_stride, input_span;
+ StorageIndex size, count;
+ };
+
+ template <typename InputScalar>
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
+ const UnaryFunctor& functor, const Dimensions& block_sizes,
+ const Dimensions& block_strides, OutputScalar* output_data,
+ const array<StorageIndex, NumDims>& input_strides,
+ const InputScalar* input_data) {
+ // Find the innermost dimension whose size is not 1. This is the effective
+ // inner dim. If all dimensions are of size 1, fallback to using the actual
+ // innermost dim to avoid out-of-bound access.
+ int num_size_one_inner_dims = 0;
+ for (int i = 0; i < NumDims; ++i) {
+ const int dim = cond<Layout>()(i, NumDims - i - 1);
+ if (block_sizes[dim] != 1) {
+ num_size_one_inner_dims = i;
+ break;
+ }
+ }
+ // Calculate strides and dimensions.
+ const int inner_dim =
+ NumDims == 0 ? 1
+ : cond<Layout>()(num_size_one_inner_dims,
+ NumDims - num_size_one_inner_dims - 1);
+ StorageIndex inner_dim_size = NumDims == 0 ? 1 : block_sizes[inner_dim];
+ for (int i = num_size_one_inner_dims + 1; i < NumDims; ++i) {
+ const int dim = cond<Layout>()(i, NumDims - i - 1);
+ // Merge multiple inner dims into one for larger inner dim size (i.e.
+ // fewer calls to TensorBlockCwiseUnaryOp::Run()).
+ if (inner_dim_size == block_strides[dim] &&
+ block_strides[dim] == input_strides[dim]) {
+ inner_dim_size *= block_sizes[dim];
+ ++num_size_one_inner_dims;
+ } else {
+ break;
+ }
+ }
+
+ StorageIndex output_index = 0, input_index = 0;
+
+ const StorageIndex output_stride =
+ NumDims == 0 ? 1 : block_strides[inner_dim];
+ const StorageIndex input_stride =
+ NumDims == 0 ? 1 : input_strides[inner_dim];
+
+ const int at_least_1_dim = NumDims <= 1 ? 1 : NumDims - 1;
+ array<BlockIteratorState, at_least_1_dim> block_iter_state;
+
+ // Initialize block iterator state. Squeeze away any dimension of size 1.
+ int num_squeezed_dims = 0;
+ for (int i = num_size_one_inner_dims; i < NumDims - 1; ++i) {
+ const int dim = cond<Layout>()(i + 1, NumDims - i - 2);
+ const StorageIndex size = block_sizes[dim];
+ if (size == 1) {
+ continue;
+ }
+ BlockIteratorState& state = block_iter_state[num_squeezed_dims];
+ state.output_stride = block_strides[dim];
+ state.input_stride = input_strides[dim];
+ state.size = size;
+ state.output_span = state.output_stride * (size - 1);
+ state.input_span = state.input_stride * (size - 1);
+ state.count = 0;
+ ++num_squeezed_dims;
+ }
+
+ // Compute cwise unary op.
+ const StorageIndex block_total_size =
+ NumDims == 0 ? 1 : block_sizes.TotalSize();
+ for (StorageIndex i = 0; i < block_total_size; i += inner_dim_size) {
+ TensorBlockCwiseUnaryOp::Run(functor, inner_dim_size, output_index,
+ output_stride, output_data, input_index,
+ input_stride, input_data);
+ // Update index.
+ for (int j = 0; j < num_squeezed_dims; ++j) {
+ BlockIteratorState& state = block_iter_state[j];
+ if (++state.count < state.size) {
+ output_index += state.output_stride;
+ input_index += state.input_stride;
+ break;
+ }
+ state.count = 0;
+ output_index -= state.output_span;
+ input_index -= state.input_span;
+ }
+ }
+ }
+};
+
+/**
* \class TensorBlockCwiseBinaryOp
* \ingroup CXX11_Tensor_Module
*
@@ -736,8 +877,8 @@ class TensorBlockMapper {
// Tensor will not fit within 'min_target_size' budget: calculate tensor
// block dimension sizes based on "square" dimension size target.
const StorageIndex dim_size_target = internal::convert_index<StorageIndex>(
- std::pow(static_cast<float>(min_target_size),
- 1.0f / static_cast<float>(block_dim_sizes.rank())));
+ std::pow(static_cast<float>(min_target_size),
+ 1.0f / static_cast<float>(block_dim_sizes.rank())));
for (Index i = 0; i < block_dim_sizes.rank(); ++i) {
// TODO(andydavis) Adjust the inner most 'block_dim_size' to make it
// a multiple of the packet size. Note that reducing
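
At its core, TensorBlockCwiseUnaryOp::Run added above is an Eigen::Map with an InnerStride on each side of a coefficient-wise expression; the same idea in a standalone snippet (real Eigen Core API, toy data):

    #include <Eigen/Core>

    int main() {
      float in[8]  = {1, 0, -2, 0, 3, 0, -4, 0};  // read every 2nd element
      float out[4] = {0, 0, 0, 0};

      Eigen::Map<const Eigen::ArrayXf, 0, Eigen::InnerStride<> >
          input(in, 4, Eigen::InnerStride<>(2));
      Eigen::Map<Eigen::ArrayXf, 0, Eigen::InnerStride<> >
          output(out, 4, Eigen::InnerStride<>(1));

      output = input.abs();  // cwise unary op straight into the strided output
    }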
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
index 560e3ec22..e5cf93ab0 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
@@ -108,16 +108,36 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
bool isCopy, nByOne, oneByN;
enum {
- IsAligned = true,
- PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
- BlockAccess = false,
- Layout = TensorEvaluator<ArgType, Device>::Layout,
- RawAccess = false
+ IsAligned = true,
+ PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+ BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
+ PreferBlockAccess = true,
+ Layout = TensorEvaluator<ArgType, Device>::Layout,
+ RawAccess = false
};
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
- : isCopy(false), nByOne(false), oneByN(false), m_broadcast(op.broadcast()),m_impl(op.expression(), device)
+ typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
+
+ // Block based access to the XprType (input) tensor.
+ typedef internal::TensorBlock<ScalarNoConst, Index, NumDims, Layout>
+ TensorBlock;
+ typedef internal::TensorBlockReader<ScalarNoConst, Index, NumDims, Layout>
+ TensorBlockReader;
+
+ // We do block based broadcasting using a trick with 2x tensor rank and 0
+ // strides. See block method implementation for details.
+ typedef DSizes<Index, 2 * NumDims> BroadcastDimensions;
+ typedef internal::TensorBlock<ScalarNoConst, Index, 2 * NumDims, Layout>
+ BroadcastTensorBlock;
+ typedef internal::TensorBlockReader<ScalarNoConst, Index, 2 * NumDims, Layout>
+ BroadcastTensorBlockReader;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op,
+ const Device& device)
+ : isCopy(false), nByOne(false), oneByN(false),
+ m_device(device), m_broadcast(op.broadcast()), m_impl(op.expression(), device)
{
+
// The broadcasting op doesn't change the rank of the tensor. One can't broadcast a scalar
// and store the result in a scalar. Instead one should reshape the scalar into an N-D
// tensor with N >= 1 of 1 element first and then broadcast.
@@ -216,8 +236,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
}
// TODO: attempt to speed this up. The integer divisions and modulo are slow
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffColMajor(Index index) const
- {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index indexColMajor(Index index) const {
Index inputIndex = 0;
for (int i = NumDims - 1; i > 0; --i) {
const Index idx = index / m_outputStrides[i];
@@ -243,11 +262,15 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
inputIndex += (index % m_impl.dimensions()[0]);
}
}
- return m_impl.coeff(inputIndex);
+ return inputIndex;
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffRowMajor(Index index) const
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffColMajor(Index index) const
{
+ return m_impl.coeff(indexColMajor(index));
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index indexRowMajor(Index index) const {
Index inputIndex = 0;
for (int i = 0; i < NumDims - 1; ++i) {
const Index idx = index / m_outputStrides[i];
@@ -263,17 +286,22 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
}
index -= idx * m_outputStrides[i];
}
- if (internal::index_statically_eq<Broadcast>(NumDims-1, 1)) {
- eigen_assert(index < m_impl.dimensions()[NumDims-1]);
+ if (internal::index_statically_eq<Broadcast>(NumDims - 1, 1)) {
+ eigen_assert(index < m_impl.dimensions()[NumDims - 1]);
inputIndex += index;
} else {
- if (internal::index_statically_eq<InputDimensions>(NumDims-1, 1)) {
- eigen_assert(index % m_impl.dimensions()[NumDims-1] == 0);
+ if (internal::index_statically_eq<InputDimensions>(NumDims - 1, 1)) {
+ eigen_assert(index % m_impl.dimensions()[NumDims - 1] == 0);
} else {
- inputIndex += (index % m_impl.dimensions()[NumDims-1]);
+ inputIndex += (index % m_impl.dimensions()[NumDims - 1]);
}
}
- return m_impl.coeff(inputIndex);
+ return inputIndex;
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffRowMajor(Index index) const
+ {
+ return m_impl.coeff(indexRowMajor(index));
}
template<int LoadMode>
@@ -564,13 +592,290 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements(
+ std::vector<internal::TensorOpResourceRequirements>* resources) const {
+ // TODO(wuke): Targeting L1 size is 30% faster than targeting L{-1} on large
+ // tensors. But this might need further tuning.
+ Eigen::Index block_total_size_max = numext::maxi<Eigen::Index>(
+ 1, m_device.firstLevelCacheSize() / sizeof(Scalar));
+
+ resources->push_back(internal::TensorOpResourceRequirements(
+ internal::kSkewedInnerDims, block_total_size_max));
+
+ m_impl.getResourceRequirements(resources);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(
+ TensorBlock* output_block) const {
+ if (NumDims <= 0) {
+ output_block->data()[0] = m_impl.coeff(0);
+ return;
+ }
+
+ // Because we only support kSkewedInnerDims blocking, block size should be
+ // equal to m_dimensions for inner dims, a smaller than m_dimensions[i] size
+ // for the first outer dim, and 1 for other outer dims. This is guaranteed
+ // by MergeResourceRequirements() in TensorBlock.h.
+ const Dimensions& output_block_sizes = output_block->block_sizes();
+ const Dimensions& output_block_strides = output_block->block_strides();
+
+ // Find where outer dims start.
+ int outer_dim_start = 0;
+ Index outer_dim_size = 1, inner_dim_size = 1;
+ for (int i = 0; i < NumDims; ++i) {
+ const int dim = static_cast<int>(Layout) == static_cast<int>(ColMajor)
+ ? i
+ : NumDims - i - 1;
+ if (i > outer_dim_start) {
+ eigen_assert(output_block_sizes[dim] == 1);
+ } else if (output_block_sizes[dim] != m_dimensions[dim]) {
+ eigen_assert(output_block_sizes[dim] < m_dimensions[dim]);
+ outer_dim_size = output_block_sizes[dim];
+ } else {
+ inner_dim_size *= output_block_sizes[dim];
+ ++outer_dim_start;
+ }
+ }
+
+ if (inner_dim_size == 0 || outer_dim_size == 0) {
+ return;
+ }
+
+ const Dimensions& input_dims = m_impl.dimensions();
+
+ // Pre-fill input_block_sizes, broadcast_block_sizes,
+ // broadcast_block_strides, and broadcast_tensor_strides. Later on we will
+ // only modify the outer_dim_start-th dimension on these arrays.
+
+ // Calculate the input block size for looking into the input.
+ Dimensions input_block_sizes;
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ for (int i = 0; i < outer_dim_start; ++i) {
+ input_block_sizes[i] = input_dims[i];
+ }
+ for (int i = outer_dim_start; i < NumDims; ++i) {
+ input_block_sizes[i] = 1;
+ }
+ } else {
+ for (int i = 0; i < outer_dim_start; ++i) {
+ input_block_sizes[NumDims - i - 1] = input_dims[NumDims - i - 1];
+ }
+ for (int i = outer_dim_start; i < NumDims; ++i) {
+ input_block_sizes[NumDims - i - 1] = 1;
+ }
+ }
+
+ // Broadcast with the 0-stride trick: Create 1 extra dim for each
+ // broadcast, set the input stride to 0.
+ //
+ // When ColMajor:
+ // - broadcast_block_sizes is [d_0, b_0, d_1, b_1, ...].
+ //
+ // - broadcast_block_strides is [output_block_strides[0],
+ // output_block_strides[0] * d_0,
+ // output_block_strides[1],
+ // output_block_strides[1] * d_1,
+ // ...].
+ //
+ // - broadcast_tensor_strides is [output_block_strides[0],
+ // 0,
+ // output_block_strides[1],
+ // 0,
+ // ...].
+ BroadcastDimensions broadcast_block_sizes, broadcast_block_strides,
+ broadcast_tensor_strides;
+
+ for (int i = 0; i < outer_dim_start; ++i) {
+ const int dim = static_cast<int>(Layout) == static_cast<int>(ColMajor)
+ ? i
+ : NumDims - i - 1;
+ const int copy_dim =
+ static_cast<int>(Layout) == static_cast<int>(ColMajor)
+ ? 2 * i
+ : 2 * NumDims - 2 * i - 1;
+ const int broadcast_dim =
+ static_cast<int>(Layout) == static_cast<int>(ColMajor) ? copy_dim + 1
+ : copy_dim - 1;
+ broadcast_block_sizes[copy_dim] = input_dims[dim];
+ broadcast_block_sizes[broadcast_dim] = m_broadcast[dim];
+ broadcast_block_strides[copy_dim] = output_block_strides[dim];
+ broadcast_block_strides[broadcast_dim] =
+ output_block_strides[dim] * input_dims[dim];
+ broadcast_tensor_strides[copy_dim] = m_inputStrides[dim];
+ broadcast_tensor_strides[broadcast_dim] = 0;
+ }
+ for (int i = 2 * outer_dim_start; i < 2 * NumDims; ++i) {
+ const int dim = static_cast<int>(Layout) == static_cast<int>(ColMajor)
+ ? i
+ : 2 * NumDims - i - 1;
+ broadcast_block_sizes[dim] = 1;
+ broadcast_block_strides[dim] = 0;
+ broadcast_tensor_strides[dim] = 0;
+ }
+
+ const int outer_dim = static_cast<int>(Layout) == static_cast<int>(ColMajor)
+ ? outer_dim_start
+ : NumDims - outer_dim_start - 1;
+
+ if (outer_dim_size == 1) {
+ // We just need one block read using the ready-set values above.
+ BroadcastBlock(input_block_sizes, broadcast_block_sizes,
+ broadcast_block_strides, broadcast_tensor_strides, 0,
+ output_block);
+ } else if (input_dims[outer_dim] == 1) {
+ // Broadcast outer_dim_start-th dimension (< NumDims) by outer_dim_size.
+ const int broadcast_outer_dim =
+ static_cast<int>(Layout) == static_cast<int>(ColMajor)
+ ? 2 * outer_dim_start + 1
+ : 2 * NumDims - 2 * outer_dim_start - 2;
+ broadcast_block_sizes[broadcast_outer_dim] = outer_dim_size;
+ broadcast_tensor_strides[broadcast_outer_dim] = 0;
+ broadcast_block_strides[broadcast_outer_dim] =
+ output_block_strides[outer_dim];
+ BroadcastBlock(input_block_sizes, broadcast_block_sizes,
+ broadcast_block_strides, broadcast_tensor_strides, 0,
+ output_block);
+ } else {
+ // The general case. Let's denote the output block as x[...,
+ // a:a+outer_dim_size, :, ..., :], where a:a+outer_dim_size is a slice on
+ // the outer_dim_start-th dimension (< NumDims). We need to split the
+ // a:a+outer_dim_size into possibly 3 sub-blocks:
+ //
+ // (1) a:b, where b is the smallest multiple of
+ // input_dims[outer_dim_start] in [a, a+outer_dim_size].
+ //
+ // (2) b:c, where c is the largest multiple of input_dims[outer_dim_start]
+ // in [a, a+outer_dim_size].
+ //
+ // (3) c:a+outer_dim_size .
+ //
+ // Or, when b and c do not exist, we just need to process the whole block
+ // together.
+
+ // Find a.
+ const Index outer_dim_left_index =
+ output_block->first_coeff_index() / m_outputStrides[outer_dim];
+
+ // Find b and c.
+ const Index input_outer_dim_size = input_dims[outer_dim];
+
+ // First multiple after a. This is b when <= outer_dim_left_index +
+ // outer_dim_size.
+ const Index first_multiple =
+ divup<Index>(outer_dim_left_index, input_outer_dim_size) *
+ input_outer_dim_size;
+
+ if (first_multiple <= outer_dim_left_index + outer_dim_size) {
+ // b exists, so does c. Find it.
+ const Index last_multiple = (outer_dim_left_index + outer_dim_size) /
+ input_outer_dim_size * input_outer_dim_size;
+ const int copy_outer_dim =
+ static_cast<int>(Layout) == static_cast<int>(ColMajor)
+ ? 2 * outer_dim_start
+ : 2 * NumDims - 2 * outer_dim_start - 1;
+ const int broadcast_outer_dim =
+ static_cast<int>(Layout) == static_cast<int>(ColMajor)
+ ? 2 * outer_dim_start + 1
+ : 2 * NumDims - 2 * outer_dim_start - 2;
+ if (first_multiple > outer_dim_left_index) {
+ const Index head_size = first_multiple - outer_dim_left_index;
+ input_block_sizes[outer_dim] = head_size;
+ broadcast_block_sizes[copy_outer_dim] = head_size;
+ broadcast_tensor_strides[copy_outer_dim] = m_inputStrides[outer_dim];
+ broadcast_block_strides[copy_outer_dim] =
+ output_block_strides[outer_dim];
+ broadcast_block_sizes[broadcast_outer_dim] = 1;
+ broadcast_tensor_strides[broadcast_outer_dim] = 0;
+ broadcast_block_strides[broadcast_outer_dim] =
+ output_block_strides[outer_dim] * input_dims[outer_dim];
+ BroadcastBlock(input_block_sizes, broadcast_block_sizes,
+ broadcast_block_strides, broadcast_tensor_strides, 0,
+ output_block);
+ }
+ if (first_multiple < last_multiple) {
+ input_block_sizes[outer_dim] = input_outer_dim_size;
+ broadcast_block_sizes[copy_outer_dim] = input_outer_dim_size;
+ broadcast_tensor_strides[copy_outer_dim] = m_inputStrides[outer_dim];
+ broadcast_block_strides[copy_outer_dim] =
+ output_block_strides[outer_dim];
+ broadcast_block_sizes[broadcast_outer_dim] =
+ (last_multiple - first_multiple) / input_outer_dim_size;
+ broadcast_tensor_strides[broadcast_outer_dim] = 0;
+ broadcast_block_strides[broadcast_outer_dim] =
+ output_block_strides[outer_dim] * input_dims[outer_dim];
+ const Index offset = (first_multiple - outer_dim_left_index) *
+ m_outputStrides[outer_dim];
+ BroadcastBlock(input_block_sizes, broadcast_block_sizes,
+ broadcast_block_strides, broadcast_tensor_strides,
+ offset, output_block);
+ }
+ if (last_multiple < outer_dim_left_index + outer_dim_size) {
+ const Index tail_size =
+ outer_dim_left_index + outer_dim_size - last_multiple;
+ input_block_sizes[outer_dim] = tail_size;
+ broadcast_block_sizes[copy_outer_dim] = tail_size;
+ broadcast_tensor_strides[copy_outer_dim] = m_inputStrides[outer_dim];
+ broadcast_block_strides[copy_outer_dim] =
+ output_block_strides[outer_dim];
+ broadcast_block_sizes[broadcast_outer_dim] = 1;
+ broadcast_tensor_strides[broadcast_outer_dim] = 0;
+ broadcast_block_strides[broadcast_outer_dim] =
+ output_block_strides[outer_dim] * input_dims[outer_dim];
+ const Index offset = (last_multiple - outer_dim_left_index) *
+ m_outputStrides[outer_dim];
+ BroadcastBlock(input_block_sizes, broadcast_block_sizes,
+ broadcast_block_strides, broadcast_tensor_strides,
+ offset, output_block);
+ }
+ } else {
+ // b and c do not exist.
+ const int copy_outer_dim =
+ static_cast<int>(Layout) == static_cast<int>(ColMajor)
+ ? 2 * outer_dim_start
+ : 2 * NumDims - 2 * outer_dim_start - 1;
+ input_block_sizes[outer_dim] = outer_dim_size;
+ broadcast_block_sizes[copy_outer_dim] = outer_dim_size;
+ broadcast_tensor_strides[copy_outer_dim] = m_inputStrides[outer_dim];
+ broadcast_block_strides[copy_outer_dim] =
+ output_block_strides[outer_dim];
+ BroadcastBlock(input_block_sizes, broadcast_block_sizes,
+ broadcast_block_strides, broadcast_tensor_strides, 0,
+ output_block);
+ }
+ }
+ }
+
EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; }
const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
Broadcast functor() const { return m_broadcast; }
+ private:
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void BroadcastBlock(
+ const Dimensions& input_block_sizes,
+ const BroadcastDimensions& broadcast_block_sizes,
+ const BroadcastDimensions& broadcast_block_strides,
+ const BroadcastDimensions& broadcast_tensor_strides, Index offset,
+ TensorBlock* output_block) const {
+ TensorBlock input_view_block(
+ static_cast<int>(Layout) == static_cast<int>(ColMajor)
+ ? indexColMajor(output_block->first_coeff_index() + offset)
+ : indexRowMajor(output_block->first_coeff_index() + offset),
+ input_block_sizes, Dimensions(m_inputStrides),
+ Dimensions(m_inputStrides), NULL);
+
+ internal::TensorBlockView<ArgType, Device> input_block(m_device, m_impl,
+ input_view_block);
+ BroadcastTensorBlock broadcast_block(
+ 0, broadcast_block_sizes, broadcast_block_strides,
+ broadcast_tensor_strides, output_block->data() + offset);
+
+ BroadcastTensorBlockReader::Run(&broadcast_block, input_block.data());
+ }
+
protected:
+ const Device& m_device;
const Broadcast m_broadcast;
Dimensions m_dimensions;
array<Index, NumDims> m_outputStrides;
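
The 2x-rank, zero-stride scheme described in the comments above reduces to one idea: a broadcast dimension is read through stride 0, so every output coordinate along it maps back to the same input element. In miniature (plain C++, no Tensor machinery):

    #include <cstdio>

    int main() {
      const float input[3] = {1, 2, 3};  // shape (3,), broadcast to (3, 4)
      const int strides[2] = {1, 0};     // dim 1 is the broadcast dim: stride 0
      for (int i = 0; i < 3; ++i) {
        for (int j = 0; j < 4; ++j)      // output(i, j) == input[i]
          std::printf("%g ", input[i * strides[0] + j * strides[1]]);
        std::printf("\n");
      }
    }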
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
index 3ab0a0f49..b47fa9e8e 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
@@ -144,14 +144,22 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
enum {
// Alignment can't be guaranteed at compile time since it depends on the
// slice offsets.
- IsAligned = false,
- PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
- BlockAccess = false,
- Layout = TensorEvaluator<ArgType, Device>::Layout,
- CoordAccess = false, // to be implemented
- RawAccess = false
+ IsAligned = false,
+ PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+ BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
+ PreferBlockAccess = true,
+ Layout = TensorEvaluator<ArgType, Device>::Layout,
+ CoordAccess = false, // to be implemented
+ RawAccess = false
};
+ typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
+
+ typedef internal::TensorBlock<ScalarNoConst, Index, NumInputDims, Layout>
+ InputTensorBlock;
+ typedef internal::TensorBlock<ScalarNoConst, Index, NumDims, Layout>
+ OutputTensorBlock;
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_impl(op.expression(), device), m_dim(op.dim()), m_device(device), m_offset(op.offset())
{
@@ -184,6 +192,20 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
}
m_inputStride *= input_dims[m_dim.actualDim()];
m_inputOffset = m_stride * op.offset();
+
+ if (BlockAccess) {
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ m_inputStrides[0] = 1;
+ for (int i = 1; i < NumInputDims; ++i) {
+ m_inputStrides[i] = m_inputStrides[i - 1] * input_dims[i - 1];
+ }
+ } else {
+ m_inputStrides[NumInputDims - 1] = 1;
+ for (int i = NumInputDims - 2; i >= 0; --i) {
+ m_inputStrides[i] = m_inputStrides[i + 1] * input_dims[i + 1];
+ }
+ }
+ }
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
@@ -266,6 +288,61 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
TensorOpCost(0, 0, cost, vectorized, PacketSize);
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements(
+ std::vector<internal::TensorOpResourceRequirements>* resources) const {
+ Eigen::Index block_total_size_max = numext::maxi<Eigen::Index>(
+ 1, m_device.lastLevelCacheSize() / sizeof(Scalar));
+ resources->push_back(internal::TensorOpResourceRequirements(
+ internal::kSkewedInnerDims, block_total_size_max));
+ m_impl.getResourceRequirements(resources);
+ }
+
+ // TODO(andydavis) Reduce the overhead of this function (experiment with
+ // using a fixed block size).
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(
+ OutputTensorBlock* output_block) const {
+ // Calculate input block sizes.
+ const DSizes<Index, NumDims>& output_block_sizes =
+ output_block->block_sizes();
+ const DSizes<Index, NumDims>& output_block_strides =
+ output_block->block_strides();
+ const Index chip_dim = m_dim.actualDim();
+ DSizes<Index, NumInputDims> input_block_sizes;
+ DSizes<Index, NumInputDims> input_block_strides;
+ for (Index i = 0; i < NumInputDims; ++i) {
+ if (i < chip_dim) {
+ input_block_sizes[i] = output_block_sizes[i];
+ input_block_strides[i] = output_block_strides[i];
+ } else if (i > chip_dim) {
+ input_block_sizes[i] = output_block_sizes[i - 1];
+ input_block_strides[i] = output_block_strides[i - 1];
+ } else {
+ input_block_sizes[i] = 1;
+ }
+ }
+ // Fix up input_block_stride for chip dimension.
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ if (chip_dim == 0) {
+ input_block_strides[chip_dim] = 1;
+ } else {
+ input_block_strides[chip_dim] =
+ input_block_strides[chip_dim - 1] * input_block_sizes[chip_dim - 1];
+ }
+ } else {
+ if (chip_dim == NumInputDims - 1) {
+ input_block_strides[chip_dim] = 1;
+ } else {
+ input_block_strides[chip_dim] =
+ input_block_strides[chip_dim + 1] * input_block_sizes[chip_dim + 1];
+ }
+ }
+ // Instantiate and read input block from input tensor.
+ InputTensorBlock input_block(srcCoeff(output_block->first_coeff_index()),
+ input_block_sizes, input_block_strides,
+ m_inputStrides, output_block->data());
+ m_impl.block(&input_block);
+ }
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Eigen::internal::traits<XprType>::PointerType data() const {
CoeffReturnType* result = const_cast<CoeffReturnType*>(m_impl.data());
if (((static_cast<int>(Layout) == static_cast<int>(ColMajor) && m_dim.actualDim() == NumDims) ||
@@ -294,13 +371,14 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
{
Index inputIndex;
if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) && m_dim.actualDim() == 0) ||
- (static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == NumInputDims-1)) {
+ (static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == NumInputDims - 1)) {
// m_stride is equal to 1, so let's avoid the integer division.
eigen_assert(m_stride == 1);
inputIndex = index * m_inputStride + m_inputOffset;
- } else if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) && m_dim.actualDim() == NumInputDims-1) ||
- (static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == 0)) {
- // m_stride is aways greater than index, so let's avoid the integer division.
+ } else if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) && m_dim.actualDim() == NumInputDims - 1) ||
+ (static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == 0)) {
+ // m_stride is always greater than index, so let's avoid the integer
+ // division.
eigen_assert(m_stride > index);
inputIndex = index + m_inputOffset;
} else {
@@ -316,6 +394,7 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
Index m_stride;
Index m_inputOffset;
Index m_inputStride;
+ DSizes<Index, NumInputDims> m_inputStrides;
TensorEvaluator<ArgType, Device> m_impl;
const internal::DimensionId<DimId> m_dim;
const Device& m_device;
@@ -342,12 +421,20 @@ struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device>
static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
enum {
- IsAligned = false,
+ IsAligned = false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
- BlockAccess = false,
- RawAccess = false
+ BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
+ Layout = TensorEvaluator<ArgType, Device>::Layout,
+ RawAccess = false
};
+ typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
+
+ typedef internal::TensorBlock<ScalarNoConst, Index, NumInputDims, Layout>
+ InputTensorBlock;
+ typedef internal::TensorBlock<ScalarNoConst, Index, NumDims, Layout>
+ OutputTensorBlock;
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: Base(op, device)
{ }
@@ -395,6 +482,50 @@ struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device>
}
}
}
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock(
+ const OutputTensorBlock& output_block) {
+ // Calculate input block sizes.
+ const DSizes<Index, NumDims>& output_block_sizes =
+ output_block.block_sizes();
+ const DSizes<Index, NumDims>& output_block_strides =
+ output_block.block_strides();
+ const Index chip_dim = this->m_dim.actualDim();
+ DSizes<Index, NumInputDims> input_block_sizes;
+ DSizes<Index, NumInputDims> input_block_strides;
+ for (Index i = 0; i < NumInputDims; ++i) {
+ if (i < chip_dim) {
+ input_block_sizes[i] = output_block_sizes[i];
+ input_block_strides[i] = output_block_strides[i];
+ } else if (i > chip_dim) {
+ input_block_sizes[i] = output_block_sizes[i - 1];
+ input_block_strides[i] = output_block_strides[i - 1];
+ } else {
+ input_block_sizes[i] = 1;
+ }
+ }
+ // Fix up input_block_stride for chip dimension.
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ if (chip_dim == 0) {
+ input_block_strides[chip_dim] = 1;
+ } else {
+ input_block_strides[chip_dim] =
+ input_block_strides[chip_dim - 1] * input_block_sizes[chip_dim - 1];
+ }
+ } else {
+ if (chip_dim == NumInputDims - 1) {
+ input_block_strides[chip_dim] = 1;
+ } else {
+ input_block_strides[chip_dim] =
+ input_block_strides[chip_dim + 1] * input_block_sizes[chip_dim + 1];
+ }
+ }
+ // Write input block.
+ this->m_impl.writeBlock(InputTensorBlock(
+ this->srcCoeff(output_block.first_coeff_index()), input_block_sizes,
+ input_block_strides, this->m_inputStrides,
+ const_cast<ScalarNoConst*>(output_block.data())));
+ }
};
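
For reference, what chipping computes at the user level: chip(offset, dim) fixes one index and drops that dimension, which is why the block methods above re-insert a size-1 entry at chip_dim when mapping output blocks onto input blocks. A small usage example (real Tensor module API, toy shapes):

    #include <unsupported/Eigen/CXX11/Tensor>

    int main() {
      Eigen::Tensor<float, 3> t(4, 5, 6);
      t.setRandom();
      // Fix dimension 1 at offset 2: the result has rank 2 and shape (4, 6).
      Eigen::Tensor<float, 2> slice = t.chip(2, 1);
    }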
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h
index 27c92d8f6..3863ee8c3 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h
@@ -123,6 +123,7 @@ struct TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgTy
IsAligned = false,
PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess & TensorEvaluator<RightArgType, Device>::PacketAccess,
BlockAccess = false,
+ PreferBlockAccess = false,
Layout = TensorEvaluator<LeftArgType, Device>::Layout,
RawAccess = false
};
@@ -308,6 +309,7 @@ template<typename Axis, typename LeftArgType, typename RightArgType, typename De
IsAligned = false,
PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess & TensorEvaluator<RightArgType, Device>::PacketAccess,
BlockAccess = false,
+ PreferBlockAccess = false,
Layout = TensorEvaluator<LeftArgType, Device>::Layout,
RawAccess = false
};
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
index 4b24e5fc1..f0f61fade 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
@@ -235,6 +235,7 @@ struct TensorContractionEvaluatorBase
IsAligned = true,
PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
BlockAccess = false,
+ PreferBlockAccess = false,
Layout = TensorEvaluator<LeftArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = true
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h
index a7751eee1..1f613d3c7 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h
@@ -196,6 +196,7 @@ struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device>
IsAligned = false,
PacketAccess = true,
BlockAccess = false,
+ PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
RawAccess = false
};
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
index a07e32db0..2d0e6599f 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
@@ -308,6 +308,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
IsAligned = TensorEvaluator<InputArgType, Device>::IsAligned & TensorEvaluator<KernelArgType, Device>::IsAligned,
PacketAccess = TensorEvaluator<InputArgType, Device>::PacketAccess & TensorEvaluator<KernelArgType, Device>::PacketAccess,
BlockAccess = false,
+ PreferBlockAccess = false,
Layout = TensorEvaluator<InputArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
@@ -780,6 +781,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
IsAligned = TensorEvaluator<InputArgType, GpuDevice>::IsAligned & TensorEvaluator<KernelArgType, GpuDevice>::IsAligned,
PacketAccess = false,
BlockAccess = false,
+ PreferBlockAccess = false,
Layout = TensorEvaluator<InputArgType, GpuDevice>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h
index d301d0c01..e79958fc9 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h
@@ -243,6 +243,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
IsAligned = TensorEvaluator<InputArgType, const Eigen::SyclDevice>::IsAligned & TensorEvaluator<KernelArgType, const Eigen::SyclDevice>::IsAligned,
PacketAccess = false,
BlockAccess = false,
+ PreferBlockAccess = false,
Layout = TensorEvaluator<InputArgType, const Eigen::SyclDevice>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h b/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h
index cbec5e9b4..6ee3827f3 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h
@@ -94,6 +94,7 @@ struct TensorEvaluator<const TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Devi
IsAligned = false,
PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
BlockAccess = false,
+ PreferBlockAccess = false,
Layout = TensorEvaluator<XprType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
@@ -256,6 +257,7 @@ struct TensorEvaluator<const TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType,
IsAligned = false,
PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
BlockAccess = false,
+ PreferBlockAccess = false,
Layout = TensorEvaluator<LhsXprType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
index 256d499f2..554ee5f59 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
@@ -108,6 +108,7 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType, MakePointer_>, Device>
IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = false,
+ PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = true
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
index 028902fea..4ca6b3d8c 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
@@ -43,6 +43,7 @@ struct TensorEvaluator
IsAligned = Derived::IsAligned,
PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
BlockAccess = internal::is_arithmetic<typename internal::remove_const<Scalar>::type>::value,
+ PreferBlockAccess = false,
Layout = Derived::Layout,
CoordAccess = NumCoords > 0,
RawAccess = true
@@ -195,6 +196,7 @@ struct TensorEvaluator<const Derived, Device>
IsAligned = Derived::IsAligned,
PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
BlockAccess = internal::is_arithmetic<typename internal::remove_const<Scalar>::type>::value,
+ PreferBlockAccess = false,
Layout = Derived::Layout,
CoordAccess = NumCoords > 0,
RawAccess = true
@@ -288,6 +290,7 @@ struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device>
IsAligned = true,
PacketAccess = internal::functor_traits<NullaryOp>::PacketAccess,
BlockAccess = false,
+ PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
@@ -351,27 +354,34 @@ struct TensorEvaluator<const TensorCwiseUnaryOp<UnaryOp, ArgType>, Device>
typedef TensorCwiseUnaryOp<UnaryOp, ArgType> XprType;
enum {
- IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
- PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess &
- internal::functor_traits<UnaryOp>::PacketAccess,
- BlockAccess = false,
- Layout = TensorEvaluator<ArgType, Device>::Layout,
- CoordAccess = false, // to be implemented
- RawAccess = false
+ IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
+ PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess &
+ internal::functor_traits<UnaryOp>::PacketAccess,
+ BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
+ PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
+ Layout = TensorEvaluator<ArgType, Device>::Layout,
+ CoordAccess = false, // to be implemented
+ RawAccess = false
};
EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device)
- : m_functor(op.functor()),
+ : m_device(device),
+ m_functor(op.functor()),
m_argImpl(op.nestedExpression(), device)
{ }
typedef typename XprType::Index Index;
typedef typename XprType::Scalar Scalar;
+ typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
typedef typename internal::traits<XprType>::Scalar CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
+ static const int NumDims = internal::array_size<Dimensions>::value;
+ typedef internal::TensorBlock<ScalarNoConst, Index, NumDims, Layout>
+ TensorBlock;
+
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
@@ -399,6 +409,29 @@ struct TensorEvaluator<const TensorCwiseUnaryOp<UnaryOp, ArgType>, Device>
TensorOpCost(0, 0, functor_cost, vectorized, PacketSize);
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements(
+ std::vector<internal::TensorOpResourceRequirements>* resources) const {
+ m_argImpl.getResourceRequirements(resources);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(
+ TensorBlock* output_block) const {
+ if (NumDims <= 0) {
+ output_block->data()[0] = coeff(Index(0));
+ return;
+ }
+ internal::TensorBlockView<ArgType, Device> arg_block(m_device, m_argImpl,
+ *output_block);
+ internal::TensorBlockCwiseUnaryIO<UnaryOp, Index, ScalarNoConst, NumDims,
+ Layout>::Run(m_functor,
+ output_block->block_sizes(),
+ output_block->block_strides(),
+ output_block->data(),
+ arg_block.block_strides(),
+ arg_block.data());
+ }
+
EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; }
/// required by sycl in order to extract the accessor
@@ -408,6 +441,7 @@ struct TensorEvaluator<const TensorCwiseUnaryOp<UnaryOp, ArgType>, Device>
private:
+ const Device& m_device;
const UnaryOp m_functor;
TensorEvaluator<ArgType, Device> m_argImpl;
};
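With the change above, a coefficient-wise unary op forwards its argument's
BlockAccess/PreferBlockAccess traits and implements block(), so chains of unary
ops can be evaluated tile-by-tile. A usage sketch in terms of the public Tensor
API only (nothing here touches the internal TensorBlock types):

#include <unsupported/Eigen/CXX11/Tensor>

// Evaluating a unary cwise chain; when the argument prefers block access,
// the executor may now tile this expression instead of evaluating it
// coefficient-by-coefficient.
int main() {
  Eigen::Tensor<float, 2> in(64, 64);
  in.setRandom();
  Eigen::Tensor<float, 2> out = in.abs().sqrt();  // TensorCwiseUnaryOp chain
  return out(0, 0) >= 0.f ? 0 : 1;
}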
@@ -421,16 +455,18 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArg
typedef TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArgType> XprType;
enum {
- IsAligned = TensorEvaluator<LeftArgType, Device>::IsAligned &
- TensorEvaluator<RightArgType, Device>::IsAligned,
- PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess &
- TensorEvaluator<RightArgType, Device>::PacketAccess &
- internal::functor_traits<BinaryOp>::PacketAccess,
- BlockAccess = TensorEvaluator<LeftArgType, Device>::BlockAccess &
- TensorEvaluator<RightArgType, Device>::BlockAccess,
- Layout = TensorEvaluator<LeftArgType, Device>::Layout,
- CoordAccess = false, // to be implemented
- RawAccess = false
+ IsAligned = TensorEvaluator<LeftArgType, Device>::IsAligned &
+ TensorEvaluator<RightArgType, Device>::IsAligned,
+ PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess &
+ TensorEvaluator<RightArgType, Device>::PacketAccess &
+ internal::functor_traits<BinaryOp>::PacketAccess,
+ BlockAccess = TensorEvaluator<LeftArgType, Device>::BlockAccess &
+ TensorEvaluator<RightArgType, Device>::BlockAccess,
+ PreferBlockAccess = TensorEvaluator<LeftArgType, Device>::PreferBlockAccess |
+ TensorEvaluator<RightArgType, Device>::PreferBlockAccess,
+ Layout = TensorEvaluator<LeftArgType, Device>::Layout,
+ CoordAccess = false, // to be implemented
+ RawAccess = false
};
EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device)
@@ -501,7 +537,7 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArg
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(
TensorBlock* output_block) const {
if (NumDims <= 0) {
- output_block->data()[0] = coeff(0);
+ output_block->data()[0] = coeff(Index(0));
return;
}
internal::TensorBlockView<LeftArgType, Device> left_block(
@@ -543,6 +579,7 @@ struct TensorEvaluator<const TensorCwiseTernaryOp<TernaryOp, Arg1Type, Arg2Type,
PacketAccess = TensorEvaluator<Arg1Type, Device>::PacketAccess & TensorEvaluator<Arg2Type, Device>::PacketAccess & TensorEvaluator<Arg3Type, Device>::PacketAccess &
internal::functor_traits<TernaryOp>::PacketAccess,
BlockAccess = false,
+ PreferBlockAccess = false,
Layout = TensorEvaluator<Arg1Type, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
@@ -648,6 +685,7 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>
PacketAccess = TensorEvaluator<ThenArgType, Device>::PacketAccess & TensorEvaluator<ElseArgType, Device>::PacketAccess &
PacketType<Scalar, Device>::HasBlend,
BlockAccess = false,
+ PreferBlockAccess = false,
Layout = TensorEvaluator<IfArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
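The binary evaluator combines the children's traits asymmetrically: BlockAccess
with AND (both sides must support blocks), PreferBlockAccess with OR (one eager
side is enough). A compile-time sketch of that rule, using stand-in traits
rather than Eigen's:

// Stand-in for the trait combination used by the binary cwise evaluator.
template <bool LhsBlock, bool LhsPrefer, bool RhsBlock, bool RhsPrefer>
struct BinaryBlockTraits {
  static const bool BlockAccess       = LhsBlock && RhsBlock;    // both needed
  static const bool PreferBlockAccess = LhsPrefer || RhsPrefer;  // one suffices
};
static_assert(BinaryBlockTraits<true, false, true, true>::PreferBlockAccess,
              "a single block-preferring child makes the whole op prefer tiling");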
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index b756be3b3..ba5ab1396 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -115,6 +115,7 @@ class TensorExecutor<Expression, DefaultDevice, Vectorizable,
const DefaultDevice& device = DefaultDevice()) {
typedef TensorBlock<ScalarNoConst, StorageIndex, NumDims, Evaluator::Layout> TensorBlock;
typedef TensorBlockMapper<ScalarNoConst, StorageIndex, NumDims, Evaluator::Layout> TensorBlockMapper;
+ typedef typename TensorBlock::Dimensions TensorBlockDimensions;
Evaluator evaluator(expr, device);
Index total_size = array_prod(evaluator.dimensions());
@@ -138,8 +139,9 @@ class TensorExecutor<Expression, DefaultDevice, Vectorizable,
evaluator.getResourceRequirements(&resources);
MergeResourceRequirements(resources, &block_shape, &block_total_size);
- TensorBlockMapper block_mapper(evaluator.dimensions(), block_shape,
- block_total_size);
+ TensorBlockMapper block_mapper(
+ TensorBlockDimensions(evaluator.dimensions()), block_shape,
+ block_total_size);
block_total_size = block_mapper.block_dims_total_size();
Scalar* data = static_cast<Scalar*>(
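For context, the executor above partitions the flat output into blocks via the
TensorBlockMapper and evaluates them one at a time. A much-simplified stand-in
for that loop (SketchBlockMapper is hypothetical; the real mapper is
multi-dimensional and lives in TensorBlock.h):

// Illustrative only: how a block mapper turns a flat output of total_size
// coefficients into block-sized chunks that the evaluator fills in turn.
struct SketchBlockMapper {
  long total_size;
  long block_size;
  long total_block_count() const {
    return (total_size + block_size - 1) / block_size;
  }
  long first_coeff_index(long block_idx) const {
    return block_idx * block_size;
  }
};

void run_blockwise(const SketchBlockMapper& mapper) {
  for (long b = 0; b < mapper.total_block_count(); ++b) {
    const long first = mapper.first_coeff_index(b);
    (void)first;  // here the evaluator would materialize the block's coeffs
  }
}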
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h
index d6ab4d997..480cf1f39 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h
@@ -136,6 +136,7 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D
IsAligned = false,
PacketAccess = true,
BlockAccess = false,
+ PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false,
RawAccess = false
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h
index 1342e47a6..71ba56773 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h
@@ -42,6 +42,7 @@ class TensorFixedSize : public TensorBase<TensorFixedSize<Scalar_, Dimensions_,
IsAligned = bool(EIGEN_MAX_ALIGN_BYTES>0),
PacketAccess = (internal::packet_traits<Scalar>::size > 1),
BlockAccess = false,
+ PreferBlockAccess = false,
Layout = Options_ & RowMajor ? RowMajor : ColMajor,
CoordAccess = true,
RawAccess = true
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
index 2778bf5ec..edf9f85d8 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
@@ -99,6 +99,7 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device>
IsAligned = true,
PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
BlockAccess = false,
+ PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
RawAccess = true
};
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h
index 93a3b0e14..04a8b953d 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h
@@ -140,7 +140,11 @@ struct IsVectorizable<GpuDevice, Expression> {
template <typename Device, typename Expression>
struct IsTileable {
- static const bool value = TensorEvaluator<Expression, Device>::BlockAccess;
+ // Check that block evaluation is supported and that it is the preferred
+ // option, i.e. at least one sub-expression has a much faster block-based
+ // evaluation path (e.g. broadcasting).
+ static const bool value = TensorEvaluator<Expression, Device>::BlockAccess &&
+ TensorEvaluator<Expression, Device>::PreferBlockAccess;
};
template <typename Expression, typename Device,
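The redefinition of IsTileable is the policy switch of this merge: block
support alone no longer triggers tiling. A compile-time sketch of the new gate,
with hypothetical stand-in evaluator traits:

template <typename Evaluator>
struct IsTileableSketch {
  static const bool value =
      Evaluator::BlockAccess && Evaluator::PreferBlockAccess;
};

// An evaluator that can produce blocks but does not prefer them is no
// longer tiled under the new rule.
struct CanButWontTile {
  static const bool BlockAccess = true, PreferBlockAccess = false;
};
static_assert(!IsTileableSketch<CanButWontTile>::value,
              "block support alone no longer selects the tiling executor");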
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h
index 97c8d4a02..95c9e6aee 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h
@@ -92,6 +92,7 @@ struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device>
IsAligned = false,
PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
BlockAccess = false,
+ PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h
index 00e1186e5..965bd8f1e 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h
@@ -54,6 +54,66 @@ struct nested<TensorImagePatchOp<Rows, Cols, XprType>, 1, typename eval<TensorIm
typedef TensorImagePatchOp<Rows, Cols, XprType> type;
};
+template <typename Self, bool Vectorizable>
+struct ImagePatchCopyOp {
+ typedef typename Self::Index Index;
+ typedef typename Self::Scalar Scalar;
+ typedef typename Self::Impl Impl;
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
+ const Self& self, const Index num_coeff_to_copy, const Index dst_index,
+ Scalar* dst_data, const Index src_index) {
+ const Impl& impl = self.impl();
+ for (Index i = 0; i < num_coeff_to_copy; ++i) {
+ dst_data[dst_index + i] = impl.coeff(src_index + i);
+ }
+ }
+};
+
+template <typename Self>
+struct ImagePatchCopyOp<Self, true> {
+ typedef typename Self::Index Index;
+ typedef typename Self::Scalar Scalar;
+ typedef typename Self::Impl Impl;
+ typedef typename packet_traits<Scalar>::type Packet;
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
+ const Self& self, const Index num_coeff_to_copy, const Index dst_index,
+ Scalar* dst_data, const Index src_index) {
+ const Impl& impl = self.impl();
+ const Index packet_size = internal::unpacket_traits<Packet>::size;
+ const Index vectorized_size =
+ (num_coeff_to_copy / packet_size) * packet_size;
+ for (Index i = 0; i < vectorized_size; i += packet_size) {
+ Packet p = impl.template packet<Unaligned>(src_index + i);
+ internal::pstoret<Scalar, Packet, Unaligned>(dst_data + dst_index + i, p);
+ }
+ for (Index i = vectorized_size; i < num_coeff_to_copy; ++i) {
+ dst_data[dst_index + i] = impl.coeff(src_index + i);
+ }
+ }
+};
+
+template <typename Self>
+struct ImagePatchPaddingOp {
+ typedef typename Self::Index Index;
+ typedef typename Self::Scalar Scalar;
+ typedef typename packet_traits<Scalar>::type Packet;
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
+ const Index num_coeff_to_pad, const Scalar padding_value,
+ const Index dst_index, Scalar* dst_data) {
+ const Index packet_size = internal::unpacket_traits<Packet>::size;
+ const Packet padded_packet = internal::pset1<Packet>(padding_value);
+ const Index vectorized_size =
+ (num_coeff_to_pad / packet_size) * packet_size;
+ for (Index i = 0; i < vectorized_size; i += packet_size) {
+ internal::pstoret<Scalar, Packet, Unaligned>(dst_data + dst_index + i,
+ padded_packet);
+ }
+ for (Index i = vectorized_size; i < num_coeff_to_pad; ++i) {
+ dst_data[dst_index + i] = padding_value;
+ }
+ }
+};
+
} // end namespace internal
template<DenseIndex Rows, DenseIndex Cols, typename XprType>
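Both helpers above use the usual packetized main loop followed by a scalar tail
for the remainder. A self-contained sketch of the same pattern with plain
stores instead of pstoret (kPacket stands in for the packet size):

#include <cstddef>

void fill_with_tail(float* dst, std::size_t n, float value) {
  const std::size_t kPacket = 4;  // stand-in for unpacket_traits<Packet>::size
  const std::size_t vectorized = (n / kPacket) * kPacket;
  for (std::size_t i = 0; i < vectorized; i += kPacket) {
    for (std::size_t j = 0; j < kPacket; ++j) {  // one pstoret in the original
      dst[i + j] = value;
    }
  }
  for (std::size_t i = vectorized; i < n; ++i) {  // scalar remainder
    dst[i] = value;
  }
}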
@@ -184,20 +244,24 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
enum {
- IsAligned = false,
- PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
- BlockAccess = false,
- Layout = TensorEvaluator<ArgType, Device>::Layout,
- CoordAccess = false,
- RawAccess = false
+ IsAligned = false,
+ PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+ BlockAccess = true,
+ PreferBlockAccess = true,
+ Layout = TensorEvaluator<ArgType, Device>::Layout,
+ CoordAccess = false,
+ RawAccess = false
};
- #ifdef __SYCL_DEVICE_ONLY__
+ typedef internal::TensorBlock<Scalar, Index, NumDims, Layout>
+ OutputTensorBlock;
+
+#ifdef __SYCL_DEVICE_ONLY__
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator( const XprType op, const Device& device)
#else
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator( const XprType& op, const Device& device)
#endif
- : m_impl(op.expression(), device)
+ : m_device(device), m_impl(op.expression(), device)
#ifdef EIGEN_USE_SYCL
, m_op(op)
#endif
@@ -484,6 +548,147 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements(
+ std::vector<internal::TensorOpResourceRequirements>* resources) const {
+ Eigen::Index block_total_size_max = numext::maxi<Eigen::Index>(
+ 1, m_device.lastLevelCacheSize() / sizeof(Scalar));
+ resources->push_back(internal::TensorOpResourceRequirements(
+ internal::kSkewedInnerDims, block_total_size_max));
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(
+ OutputTensorBlock* output_block) const {
+ typedef internal::ImagePatchCopyOp<Self, PacketAccess> ImagePatchCopyOp;
+ typedef internal::ImagePatchPaddingOp<Self> ImagePatchPaddingOp;
+
+ // Calculate loop limits and various input/output dim sizes.
+ const DSizes<Index, NumDims>& block_sizes = output_block->block_sizes();
+ const bool col_major =
+ static_cast<int>(Layout) == static_cast<int>(ColMajor);
+ const Index depth_dim_size = block_sizes[col_major ? 0 : NumDims - 1];
+ const Index output_depth_dim_size =
+ m_dimensions[col_major ? 0 : NumDims - 1];
+ const Index row_dim_size = block_sizes[col_major ? 1 : NumDims - 2];
+ const Index output_row_dim_size = m_dimensions[col_major ? 1 : NumDims - 2];
+ const Index col_dim_size = block_sizes[col_major ? 2 : NumDims - 3];
+ const Index block_col_stride = row_dim_size * depth_dim_size;
+ const Index patch_index_dim_size = block_sizes[col_major ? 3 : NumDims - 4];
+ const Index outer_dim_size =
+ block_sizes.TotalSize() /
+ (depth_dim_size * row_dim_size * col_dim_size * patch_index_dim_size);
+
+ const Index patch_size = row_dim_size * col_dim_size * depth_dim_size;
+ const Index batch_size = patch_size * patch_index_dim_size;
+
+ Index output_index = output_block->first_coeff_index();
+
+ // Loop through outer dimensions.
+ for (Index outer_dim_index = 0; outer_dim_index < outer_dim_size;
+ ++outer_dim_index) {
+ const Index outer_output_base_index = outer_dim_index * batch_size;
+ // Find the offset of the element with respect to the location of the first element.
+ const Index patchIndexStart = output_index / m_fastPatchStride;
+ const Index patchOffset =
+ (output_index - patchIndexStart * m_patchStride) / m_fastOutputDepth;
+ const Index colOffsetStart = patchOffset / m_fastColStride;
+ // Other ways to index this element.
+ const Index otherIndex =
+ (NumDims == 4) ? 0 : output_index / m_fastOtherStride;
+ const Index patch2DIndexStart =
+ (NumDims == 4)
+ ? 0
+ : (output_index - otherIndex * m_otherStride) / m_fastPatchStride;
+ // Calculate starting depth index.
+ const Index depth = output_index - (output_index / m_fastOutputDepth) *
+ output_depth_dim_size;
+ const Index patch_input_base_index =
+ depth + otherIndex * m_patchInputStride;
+
+ // Loop through patches.
+ for (Index patch_index_dim_index = 0;
+ patch_index_dim_index < patch_index_dim_size;
+ ++patch_index_dim_index) {
+ const Index patch_output_base_index =
+ outer_output_base_index + patch_index_dim_index * patch_size;
+ // Patch index corresponding to the passed in index.
+ const Index patchIndex = patchIndexStart + patch_index_dim_index;
+ const Index patch2DIndex =
+ (NumDims == 4) ? patchIndex
+ : patch2DIndexStart + patch_index_dim_index;
+ const Index colIndex = patch2DIndex / m_fastOutputRows;
+ const Index input_col_base = colIndex * m_col_strides;
+ const Index row_offset_base =
+ (patch2DIndex - colIndex * m_outputRows) * m_row_strides -
+ m_rowPaddingTop;
+
+ // Loop through columns.
+ for (Index col_dim_index = 0; col_dim_index < col_dim_size;
+ ++col_dim_index) {
+ const Index col_output_base_index =
+ patch_output_base_index + col_dim_index * block_col_stride;
+
+ // Calculate col index in the input original tensor.
+ Index colOffset = colOffsetStart + col_dim_index;
+ Index inputCol =
+ input_col_base + colOffset * m_in_col_strides - m_colPaddingLeft;
+ Index origInputCol =
+ (m_col_inflate_strides == 1)
+ ? inputCol
+ : ((inputCol >= 0) ? (inputCol / m_fastInflateColStride) : 0);
+
+ bool pad_column = false;
+ if (inputCol < 0 || inputCol >= m_input_cols_eff ||
+ ((m_col_inflate_strides != 1) &&
+ (inputCol != origInputCol * m_col_inflate_strides))) {
+ pad_column = true;
+ }
+
+ const Index col_input_base_index =
+ patch_input_base_index + origInputCol * m_colInputStride;
+ const Index input_row_base =
+ row_offset_base +
+ ((patchOffset + col_dim_index * output_row_dim_size) -
+ colOffset * m_colStride) *
+ m_in_row_strides;
+ // Loop through rows.
+ for (Index row_dim_index = 0; row_dim_index < row_dim_size;
+ ++row_dim_index) {
+ const Index output_base_index =
+ col_output_base_index + row_dim_index * depth_dim_size;
+ bool pad_row = false;
+ Index inputIndex;
+ if (!pad_column) {
+ Index inputRow =
+ input_row_base + row_dim_index * m_in_row_strides;
+ Index origInputRow =
+ (m_row_inflate_strides == 1)
+ ? inputRow
+ : ((inputRow >= 0) ? (inputRow / m_fastInflateRowStride)
+ : 0);
+ if (inputRow < 0 || inputRow >= m_input_rows_eff ||
+ ((m_row_inflate_strides != 1) &&
+ (inputRow != origInputRow * m_row_inflate_strides))) {
+ pad_row = true;
+ } else {
+ inputIndex =
+ col_input_base_index + origInputRow * m_rowInputStride;
+ }
+ }
+ // Copy (or pad) along depth dimension.
+ if (pad_column || pad_row) {
+ ImagePatchPaddingOp::Run(depth_dim_size, Scalar(m_paddingValue),
+ output_base_index, output_block->data());
+ } else {
+ ImagePatchCopyOp::Run(*this, depth_dim_size, output_base_index,
+ output_block->data(), inputIndex);
+ }
+ }
+ }
+ }
+ output_index += m_otherStride;
+ }
+ }
+
protected:
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const
{
@@ -539,6 +744,7 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
Scalar m_paddingValue;
+ const Device& m_device;
TensorEvaluator<ArgType, Device> m_impl;
#ifdef EIGEN_USE_SYCL
// Required for SYCL in order to construct the expression tree on the device
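For reference, the public-API entry point to the evaluator above:
extract_image_patches() builds the TensorImagePatchOp whose evaluator now
advertises BlockAccess and PreferBlockAccess. A usage sketch, assuming the
ColMajor convention used in this file, where the input layout is
(depth, rows, cols, batch):

#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Tensor<float, 4> input(3, 32, 32, 8);  // depth, rows, cols, batch
  input.setRandom();
  // 5x5 patches; result is (depth, patch_rows, patch_cols, patches, batch).
  Eigen::Tensor<float, 5> patches = input.extract_image_patches(5, 5);
  return patches.dimension(0) == 3 ? 0 : 1;
}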
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h
index 64f2ad81f..e28565009 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h
@@ -91,6 +91,7 @@ struct TensorEvaluator<const TensorInflationOp<Strides, ArgType>, Device>
IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/ false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = false,
+ PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h
index e3165fa10..998757d14 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h
@@ -120,6 +120,7 @@ struct TensorEvaluator<const TensorLayoutSwapOp<ArgType>, Device>
IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = false,
+ PreferBlockAccess = false,
Layout = (static_cast<int>(TensorEvaluator<ArgType, Device>::Layout) == static_cast<int>(ColMajor)) ? RowMajor : ColMajor,
CoordAccess = false, // to be implemented
RawAccess = TensorEvaluator<ArgType, Device>::RawAccess
@@ -183,6 +184,7 @@ template<typename ArgType, typename Device>
IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = false,
+ PreferBlockAccess = false,
Layout = (static_cast<int>(TensorEvaluator<ArgType, Device>::Layout) == static_cast<int>(ColMajor)) ? RowMajor : ColMajor,
CoordAccess = false // to be implemented
};
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
index 4dd2e7c86..16dc74afe 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
@@ -102,27 +102,69 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>
typedef TensorReshapingOp<NewDimensions, ArgType> XprType;
typedef NewDimensions Dimensions;
+ typedef typename XprType::Index Index;
+ typedef typename XprType::Scalar Scalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+
+ static const int NumOutputDims = internal::array_size<Dimensions>::value;
+ static const int NumInputDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
+
enum {
- IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
- PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
- BlockAccess = false,
- Layout = TensorEvaluator<ArgType, Device>::Layout,
- CoordAccess = false, // to be implemented
- RawAccess = TensorEvaluator<ArgType, Device>::RawAccess
+ IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
+ PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+ // TODO(andydavis, wuke) Enable BlockAccess for the general case when the
+ // performance issue with block-based reshape is resolved.
+ BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess &&
+ TensorEvaluator<ArgType, Device>::RawAccess &&
+ NumInputDims > 0 && NumOutputDims > 0,
+ PreferBlockAccess = true,
+ Layout = TensorEvaluator<ArgType, Device>::Layout,
+ CoordAccess = false, // to be implemented
+ RawAccess = TensorEvaluator<ArgType, Device>::RawAccess
};
+ typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
+
+ typedef internal::TensorBlock<ScalarNoConst, Index, NumInputDims, Layout>
+ InputTensorBlock;
+ typedef internal::TensorBlock<ScalarNoConst, Index, NumOutputDims, Layout>
+ OutputTensorBlock;
+ typedef internal::TensorBlockReader<ScalarNoConst, Index, NumOutputDims,
+ Layout>
+ OutputTensorBlockReader;
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_impl(op.expression(), device), m_dimensions(op.dimensions())
{
// The total size of the reshaped tensor must be equal to the total size
// of the input tensor.
eigen_assert(internal::array_prod(m_impl.dimensions()) == internal::array_prod(op.dimensions()));
- }
- typedef typename XprType::Index Index;
- typedef typename XprType::Scalar Scalar;
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ if (BlockAccess) {
+ const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims =
+ m_impl.dimensions();
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ m_outputStrides[0] = 1;
+ for (int i = 1; i < NumOutputDims; ++i) {
+ m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1];
+ }
+ m_inputStrides[0] = 1;
+ for (int i = 1; i < NumInputDims; ++i) {
+ m_inputStrides[i] = m_inputStrides[i - 1] * input_dims[i - 1];
+ }
+ } else {
+ m_outputStrides[NumOutputDims - 1] = 1;
+ for (int i = NumOutputDims - 2; i >= 0; --i) {
+ m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1];
+ }
+ m_inputStrides[NumInputDims - 1] = 1;
+ for (int i = NumInputDims - 2; i >= 0; --i) {
+ m_inputStrides[i] = m_inputStrides[i + 1] * input_dims[i + 1];
+ }
+ }
+ }
+ }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
@@ -148,6 +190,140 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>
return m_impl.costPerCoeff(vectorized);
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements(
+ std::vector<internal::TensorOpResourceRequirements>* resources) const {
+ m_impl.getResourceRequirements(resources);
+ }
+
+ // TODO(andydavis) Reduce the overhead of this function.
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(
+ OutputTensorBlock* output_block) const {
+ if (m_impl.data() != NULL) {
+ OutputTensorBlockReader::Run(output_block, m_impl.data());
+ return;
+ }
+
+ // Calculate output block unit-stride inner dimension length.
+ const DSizes<Index, NumOutputDims>& output_block_sizes =
+ output_block->block_sizes();
+ Index output_inner_dim_size = 1;
+ Index output_outer_dim_start = NumOutputDims;
+ for (Index i = 0; i < NumOutputDims; ++i) {
+ const Index dim = static_cast<int>(Layout) == static_cast<int>(ColMajor)
+ ? i : NumOutputDims - i - 1;
+ output_inner_dim_size *= output_block_sizes[dim];
+ if (output_block_sizes[dim] < m_dimensions[dim]) {
+ output_outer_dim_start = i + 1;
+ break;
+ }
+ }
+
+ // Initialize output block iterator state.
+ struct BlockIteratorState {
+ Index stride;
+ Index span;
+ Index size;
+ Index count;
+ };
+ array<BlockIteratorState, NumOutputDims> block_iter_state;
+
+ for (Index i = 0; i < NumOutputDims; ++i) {
+ const Index dim = static_cast<int>(Layout) == static_cast<int>(ColMajor)
+ ? i : NumOutputDims - i - 1;
+ block_iter_state[i].size = output_block_sizes[dim];
+ block_iter_state[i].stride = m_outputStrides[dim];
+ block_iter_state[i].span =
+ block_iter_state[i].stride * (block_iter_state[i].size - 1);
+ block_iter_state[i].count = 0;
+ }
+
+ const Index output_outer_dim_size = output_block_sizes.TotalSize() /
+ output_inner_dim_size;
+ const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims =
+ m_impl.dimensions();
+
+ Index index = output_block->first_coeff_index();
+ for (Index outer_idx = 0; outer_idx < output_outer_dim_size; ++outer_idx) {
+ Index inner_idx = 0;
+ while (inner_idx < output_inner_dim_size) {
+ // Calculate input coords based on 'index'.
+ array<Index, NumInputDims> input_coords;
+ Index idx = index;
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ for (int i = NumInputDims - 1; i > 0; --i) {
+ input_coords[i] = idx / m_inputStrides[i];
+ idx -= input_coords[i] * m_inputStrides[i];
+ }
+ input_coords[0] = idx;
+ } else {
+ for (int i = 0; i < NumInputDims - 1; ++i) {
+ input_coords[i] = idx / m_inputStrides[i];
+ idx -= input_coords[i] * m_inputStrides[i];
+ }
+ input_coords[NumInputDims - 1] = idx;
+ }
+
+ // Calculate target input block shape, using at most
+ // 'output_inner_dim_size' coefficients along the input block's inner
+ // dimensions.
+ DSizes<Index, NumInputDims> input_block_sizes;
+ Index num_to_allocate = output_inner_dim_size - inner_idx;
+ for (Index i = 0; i < NumInputDims; ++i) {
+ const Index dim =
+ static_cast<int>(Layout) == static_cast<int>(ColMajor)
+ ? i : NumInputDims - i - 1;
+ input_block_sizes[dim] = numext::mini(
+ num_to_allocate, (static_cast<Index>(input_dims[dim]) -
+ input_coords[dim]));
+ if (input_coords[dim] == 0) {
+ num_to_allocate /= input_block_sizes[dim];
+ } else {
+ num_to_allocate = 1;
+ }
+ }
+
+ // Calculate input block strides.
+ DSizes<Index, NumInputDims> input_block_strides;
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ input_block_strides[0] = 1;
+ for (int i = 1; i < NumInputDims; ++i) {
+ input_block_strides[i] = input_block_strides[i - 1] *
+ input_block_sizes[i - 1];
+ }
+ } else {
+ input_block_strides[NumInputDims - 1] = 1;
+ for (int i = NumInputDims - 2; i >= 0; --i) {
+ input_block_strides[i] = input_block_strides[i + 1] *
+ input_block_sizes[i + 1];
+ }
+ }
+
+ // Instantiate and read input block from input tensor.
+ InputTensorBlock input_block(index, input_block_sizes,
+ input_block_strides, m_inputStrides,
+ output_block->data() + outer_idx *
+ output_inner_dim_size + inner_idx);
+
+ m_impl.block(&input_block);
+
+ const Index input_block_total_size = input_block_sizes.TotalSize();
+ index += input_block_total_size;
+ inner_idx += input_block_total_size;
+ }
+ eigen_assert(inner_idx == output_inner_dim_size);
+ index -= output_inner_dim_size;
+ // Update index.
+ for (Index i = output_outer_dim_start; i < NumOutputDims; ++i) {
+ if (++block_iter_state[i].count < block_iter_state[i].size) {
+ index += block_iter_state[i].stride;
+ break;
+ }
+ block_iter_state[i].count = 0;
+ index -= block_iter_state[i].span;
+ }
+ }
+ }
+
EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return const_cast<Scalar*>(m_impl.data()); }
EIGEN_DEVICE_FUNC const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
@@ -155,6 +331,8 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>
protected:
TensorEvaluator<ArgType, Device> m_impl;
NewDimensions m_dimensions;
+ DSizes<Index, NumOutputDims> m_outputStrides;
+ DSizes<Index, NumInputDims> m_inputStrides;
};
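Usage sketch for the reshape path above: reshape() now reports
PreferBlockAccess = true and, when the argument exposes raw or block access,
serves block reads itself (via TensorBlockReader or the coordinate remapping in
block()). Plain public API:

#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Tensor<float, 2> in(8, 16);
  in.setRandom();
  Eigen::array<Eigen::Index, 3> dims{{8, 4, 4}};
  Eigen::Tensor<float, 3> out = in.reshape(dims);  // same data, new shape
  return out.size() == in.size() ? 0 : 1;
}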
@@ -172,6 +350,7 @@ template<typename NewDimensions, typename ArgType, typename Device>
IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = false,
+ PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = TensorEvaluator<ArgType, Device>::RawAccess
@@ -322,17 +501,29 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
typedef TensorSlicingOp<StartIndices, Sizes, ArgType> XprType;
static const int NumDims = internal::array_size<Sizes>::value;
+ typedef typename XprType::Index Index;
+ typedef typename XprType::Scalar Scalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ typedef Sizes Dimensions;
+
enum {
// Alignment can't be guaranteed at compile time since it depends on the
// slice offsets and sizes.
- IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false,
- PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
- BlockAccess = false,
- Layout = TensorEvaluator<ArgType, Device>::Layout,
- CoordAccess = false,
- RawAccess = false
+ IsAligned = false,
+ PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+ BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
+ PreferBlockAccess = true,
+ Layout = TensorEvaluator<ArgType, Device>::Layout,
+ CoordAccess = false,
+ RawAccess = false
};
+ typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
+
+ typedef internal::TensorBlock<ScalarNoConst, Index, NumDims, Layout> TensorBlock;
+ typedef typename TensorBlock::Dimensions TensorBlockDimensions;
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_impl(op.expression(), device), m_device(device), m_dimensions(op.sizes()), m_offsets(op.startIndices())
{
@@ -340,6 +531,16 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
eigen_assert(m_impl.dimensions()[i] >= op.sizes()[i] + op.startIndices()[i]);
}
+ m_is_identity = true;
+ for (int i = 0; i < internal::array_size<Dimensions>::value; ++i) {
+ eigen_assert(m_impl.dimensions()[i] >=
+ op.sizes()[i] + op.startIndices()[i]);
+ if (m_impl.dimensions()[i] != op.sizes()[i] ||
+ op.startIndices()[i] != 0) {
+ m_is_identity = false;
+ }
+ }
+
const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
const Sizes& output_dims = op.sizes();
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
@@ -369,12 +570,6 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
}
}
- typedef typename XprType::Index Index;
- typedef typename XprType::Scalar Scalar;
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
- typedef Sizes Dimensions;
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
@@ -417,7 +612,11 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
{
- return m_impl.coeff(srcCoeff(index));
+ if (m_is_identity) {
+ return m_impl.coeff(index);
+ } else {
+ return m_impl.coeff(srcCoeff(index));
+ }
}
template<int LoadMode>
@@ -427,6 +626,10 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
eigen_assert(index+packetSize-1 < internal::array_prod(dimensions()));
+ if (m_is_identity) {
+ return m_impl.template packet<LoadMode>(index);
+ }
+
Index inputIndices[] = {0, 0};
Index indices[] = {index, index + packetSize - 1};
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
@@ -469,9 +672,27 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
- return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, NumDims);
+ return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, m_is_identity ? 1 : NumDims);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements(
+ std::vector<internal::TensorOpResourceRequirements>* resources) const {
+ Eigen::Index block_total_size_max = numext::maxi<Eigen::Index>(
+ 1, m_device.lastLevelCacheSize() / sizeof(Scalar));
+ resources->push_back(internal::TensorOpResourceRequirements(
+ internal::kSkewedInnerDims, block_total_size_max));
+ m_impl.getResourceRequirements(resources);
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(
+ TensorBlock* output_block) const {
+ TensorBlock input_block(srcCoeff(output_block->first_coeff_index()),
+ output_block->block_sizes(),
+ output_block->block_strides(),
+ TensorBlockDimensions(m_inputStrides),
+ output_block->data());
+ m_impl.block(&input_block);
+ }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Eigen::internal::traits<XprType>::PointerType data() const {
Scalar* result = m_impl.data();
@@ -544,6 +765,7 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
TensorEvaluator<ArgType, Device> m_impl;
const Device& m_device;
Dimensions m_dimensions;
+ bool m_is_identity;
const StartIndices m_offsets;
};
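The m_is_identity flag added above lets a degenerate slice (zero offsets, full
extents) forward coeff/packet reads straight to its argument, which the updated
costPerCoeff also reflects. A public-API sketch of such an identity slice:

#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Tensor<float, 2> in(4, 4);
  in.setRandom();
  Eigen::array<Eigen::Index, 2> offsets{{0, 0}};
  Eigen::array<Eigen::Index, 2> extents{{4, 4}};  // covers the whole tensor
  Eigen::Tensor<float, 2> out = in.slice(offsets, extents);  // identity slice
  return out(3, 3) == in(3, 3) ? 0 : 1;
}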
@@ -557,33 +779,48 @@ struct TensorEvaluator<TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
typedef TensorSlicingOp<StartIndices, Sizes, ArgType> XprType;
static const int NumDims = internal::array_size<Sizes>::value;
+ typedef typename XprType::Index Index;
+ typedef typename XprType::Scalar Scalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ typedef Sizes Dimensions;
+
enum {
- IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false,
- PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
- BlockAccess = false,
- Layout = TensorEvaluator<ArgType, Device>::Layout,
- CoordAccess = false,
- RawAccess = (NumDims == 1) & TensorEvaluator<ArgType, Device>::RawAccess
+ IsAligned = false,
+ PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+ BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
+ PreferBlockAccess = true,
+ Layout = TensorEvaluator<ArgType, Device>::Layout,
+ CoordAccess = false,
+ RawAccess = (NumDims == 1) & TensorEvaluator<ArgType, Device>::RawAccess
};
+ typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
+
+ typedef internal::TensorBlock<ScalarNoConst, Index, NumDims, Layout> TensorBlock;
+ typedef typename TensorBlock::Dimensions TensorBlockDimensions;
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: Base(op, device)
{ }
- typedef typename XprType::Index Index;
- typedef typename XprType::Scalar Scalar;
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
- typedef Sizes Dimensions;
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
{
- return this->m_impl.coeffRef(this->srcCoeff(index));
+ if (this->m_is_identity) {
+ return this->m_impl.coeffRef(index);
+ } else {
+ return this->m_impl.coeffRef(this->srcCoeff(index));
+ }
}
template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void writePacket(Index index, const PacketReturnType& x)
{
+ if (this->m_is_identity) {
+ this->m_impl.template writePacket<StoreMode>(index, x);
+ return;
+ }
+
const int packetSize = PacketType<CoeffReturnType, Device>::size;
Index inputIndices[] = {0, 0};
Index indices[] = {index, index + packetSize - 1};
@@ -623,9 +860,15 @@ struct TensorEvaluator<TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
}
}
}
-};
-
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock(
+ const TensorBlock& block) {
+ this->m_impl.writeBlock(TensorBlock(
+ this->srcCoeff(block.first_coeff_index()), block.block_sizes(),
+ block.block_strides(), TensorBlockDimensions(this->m_inputStrides),
+ const_cast<ScalarNoConst*>(block.data())));
+ }
+};
namespace internal {
template<typename StartIndices, typename StopIndices, typename Strides, typename XprType>
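The writable slice evaluator above gains writeBlock(), so assignments *into* a
slice can also run tile-by-tile. The public-API operation this serves looks
like the following sketch:

#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Tensor<float, 2> dst(8, 8);
  dst.setZero();
  Eigen::Tensor<float, 2> src(4, 4);
  src.setConstant(1.f);
  Eigen::array<Eigen::Index, 2> offsets{{2, 2}};
  Eigen::array<Eigen::Index, 2> extents{{4, 4}};
  dst.slice(offsets, extents) = src;  // lvalue slice: the write path above
  return dst(2, 2) == 1.f ? 0 : 1;
}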
@@ -730,6 +973,7 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices,
IsAligned = false,
PacketAccess = false,
BlockAccess = false,
+ PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
RawAccess = false
};
@@ -739,7 +983,13 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices,
{
// Handle degenerate intervals by gracefully clamping and allowing m_dimensions to be zero
DSizes<Index,NumDims> startIndicesClamped, stopIndicesClamped;
+ m_is_identity = true;
for (Index i = 0; i < internal::array_size<Dimensions>::value; ++i) {
+ if (m_strides[i] != 1 || op.startIndices()[i] != 0 ||
+ op.stopIndices()[i] != (m_impl.dimensions()[i] - 1)) {
+ m_is_identity = false;
+ }
+
eigen_assert(m_strides[i] != 0 && "0 stride is invalid");
if(m_strides[i]>0){
startIndicesClamped[i] = clamp(op.startIndices()[i], 0, m_impl.dimensions()[i]);
@@ -803,9 +1053,6 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices,
m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(degenerate ? 1 : m_outputStrides[i]);
}
}
- m_block_total_size_max = numext::maxi(static_cast<std::size_t>(1),
- device.lastLevelCacheSize() /
- sizeof(Scalar));
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
@@ -822,11 +1069,15 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices,
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
{
- return m_impl.coeff(srcCoeff(index));
+ if (m_is_identity) {
+ return m_impl.coeff(index);
+ } else {
+ return m_impl.coeff(srcCoeff(index));
+ }
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
- return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, NumDims);
+ return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, m_is_identity ? 1 : NumDims);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Eigen::internal::traits<XprType>::PointerType data() const {
@@ -873,13 +1124,13 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices,
array<Index, NumDims> m_outputStrides;
array<internal::TensorIntDivisor<Index>, NumDims> m_fastOutputStrides;
array<Index, NumDims> m_inputStrides;
+ bool m_is_identity;
TensorEvaluator<ArgType, Device> m_impl;
const Device& m_device;
DSizes<Index, NumDims> m_startIndices; // clamped startIndices
DSizes<Index, NumDims> m_dimensions;
DSizes<Index, NumDims> m_offsets; // offset in a flattened shape
const Strides m_strides;
- std::size_t m_block_total_size_max;
// used by SYCL
const StartIndices m_exprStartIndices;
// used by SYCL
@@ -899,6 +1150,7 @@ struct TensorEvaluator<TensorStridingSlicingOp<StartIndices, StopIndices, Stride
IsAligned = false,
PacketAccess = false,
BlockAccess = false,
+ PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = TensorEvaluator<ArgType, Device>::CoordAccess,
RawAccess = false
@@ -916,7 +1168,11 @@ struct TensorEvaluator<TensorStridingSlicingOp<StartIndices, StopIndices, Stride
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
{
- return this->m_impl.coeffRef(this->srcCoeff(index));
+ if (this->m_is_identity) {
+ return this->m_impl.coeffRef(index);
+ } else {
+ return this->m_impl.coeffRef(this->srcCoeff(index));
+ }
}
};
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
index aa1db3c73..59c1704ed 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
@@ -97,6 +97,7 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
IsAligned = true,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = false,
+ PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = true,
RawAccess = false
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h
index a0a1ad8f4..4292fe0c2 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h
@@ -95,6 +95,7 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>
IsAligned = false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = false,
+ PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false,
RawAccess = false
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
index f5b969ce5..8d93aacee 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -14,7 +14,7 @@
// clang is incompatible with the CUDA syntax wrt making a kernel a class friend,
// so we'll use a macro to make clang happy.
#ifndef KERNEL_FRIEND
-#if defined(__clang__) && defined(__CUDA__)
+#if defined(__clang__) && (defined(__CUDA__) || defined(__HIP__))
#define KERNEL_FRIEND friend __global__
#else
#define KERNEL_FRIEND friend
@@ -421,6 +421,70 @@ template <int NPT, typename S, typename R, typename I>
__global__ void OuterReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);
#endif
+template <typename Self, typename Op,
+ bool Vectorizable =
+ (Self::InputPacketAccess & Self::ReducerTraits::PacketAccess)>
+class BlockReducer {
+ public:
+ typedef typename Self::Index Index;
+ typedef typename Self::Scalar Scalar;
+ typedef typename Self::CoeffReturnType CoeffReturnType;
+ typedef typename Self::PacketReturnType PacketReturnType;
+ explicit BlockReducer(const Op& reducer) : op_(reducer) {
+ accum_ = op_.initialize();
+ }
+ void Reduce(Index index, Index num_values_to_reduce, Scalar* data) {
+ for (Index i = 0; i < num_values_to_reduce; ++i) {
+ op_.reduce(data[index + i], &accum_);
+ }
+ }
+ CoeffReturnType Finalize() { return op_.finalize(accum_); }
+ PacketReturnType FinalizePacket() {
+ // TODO(andydavis) This function should not be called for Scalar
+ // reductions: clean this up or add an assert here.
+ return PacketReturnType();
+ }
+
+ private:
+ CoeffReturnType accum_;
+ Op op_;
+};
+
+template <typename Self, typename Op>
+class BlockReducer<Self, Op, true> {
+ public:
+ typedef typename Self::Index Index;
+ typedef typename Self::Scalar Scalar;
+ typedef typename Self::CoeffReturnType CoeffReturnType;
+ typedef typename Self::PacketReturnType PacketReturnType;
+ static const Index PacketSize =
+ internal::unpacket_traits<PacketReturnType>::size;
+
+ explicit BlockReducer(const Op& reducer) : op_(reducer) {
+ vaccum_ = op_.template initializePacket<PacketReturnType>();
+ accum_ = op_.initialize();
+ }
+ void Reduce(Index index, Index num_values_to_reduce, Scalar* data) {
+ const Index vectorized_size =
+ (num_values_to_reduce / PacketSize) * PacketSize;
+ for (Index i = 0; i < vectorized_size; i += PacketSize) {
+ op_.reducePacket(
+ internal::ploadt<PacketReturnType, Unaligned>(&data[index + i]),
+ &vaccum_);
+ }
+ for (Index i = vectorized_size; i < num_values_to_reduce; ++i) {
+ op_.reduce(data[index + i], &accum_);
+ }
+ }
+ CoeffReturnType Finalize() { return op_.finalizeBoth(accum_, vaccum_); }
+ PacketReturnType FinalizePacket() { return op_.finalizePacket(vaccum_); }
+
+ private:
+ PacketReturnType vaccum_;
+ CoeffReturnType accum_;
+ Op op_;
+};
+
} // end namespace internal
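The scalar BlockReducer above accumulates a contiguous run of already
materialized block data; the vectorized specialization adds a packet
accumulator on top. A stripped-down stand-alone sketch of the scalar case
(SumReducerSketch is hypothetical; the real class is templated on the
evaluator Self type and an Eigen reducer functor):

struct SumReducerSketch {
  float accum = 0.f;  // op_.initialize() in the original

  // Reduce n consecutive coefficients starting at data[index].
  void Reduce(const float* data, long index, long n) {
    for (long i = 0; i < n; ++i) accum += data[index + i];
  }

  float Finalize() const { return accum; }  // op_.finalize(accum_)
};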
@@ -479,11 +543,19 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
IsAligned = false,
PacketAccess = Self::InputPacketAccess && ReducerTraits::PacketAccess,
BlockAccess = false,
+ PreferBlockAccess = true,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
};
+ typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
+
+ typedef internal::TensorBlock<ScalarNoConst, Index, NumOutputDims, Layout>
+ OutputTensorBlock;
+ typedef internal::TensorBlock<ScalarNoConst, Index, NumInputDims, Layout>
+ InputTensorBlock;
+
static const bool ReducingInnerMostDims = internal::are_inner_most_dims<Dims, NumInputDims, Layout>::value;
static const bool PreservingInnerMostDims = internal::preserve_inner_most_dims<Dims, NumInputDims, Layout>::value;
static const bool RunningFullReduction = (NumOutputDims==0);
@@ -517,11 +589,13 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
m_outputStrides[0] = 1;
for (int i = 1; i < NumOutputDims; ++i) {
m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1];
+ m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i]);
}
} else {
- m_outputStrides.back() = 1;
+ m_outputStrides[NumOutputDims - 1] = 1;
for (int i = NumOutputDims - 2; i >= 0; --i) {
m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1];
+ m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i]);
}
}
}
@@ -549,6 +623,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
++reduceIndex;
} else {
m_preservedStrides[outputIndex] = input_strides[i];
+ m_output_to_input_dim_map[outputIndex] = i;
++outputIndex;
}
}
@@ -558,6 +633,13 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
if (NumOutputDims == 0) {
m_preservedStrides[0] = internal::array_prod(input_dims);
}
+
+ m_numValuesToReduce =
+ NumOutputDims == 0
+ ? internal::array_prod(input_dims)
+ : (static_cast<int>(Layout) == static_cast<int>(ColMajor))
+ ? m_preservedStrides[0]
+ : m_preservedStrides[NumOutputDims - 1];
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
@@ -752,6 +834,266 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
}
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements(
+ std::vector<internal::TensorOpResourceRequirements>* resources) const {
+ Eigen::Index block_total_size_max = numext::maxi<Eigen::Index>(
+ 1, m_device.lastLevelCacheSize() / sizeof(Scalar));
+ resources->push_back(internal::TensorOpResourceRequirements(
+ internal::kSkewedInnerDims, block_total_size_max));
+ m_impl.getResourceRequirements(resources);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void block(
+ OutputTensorBlock* output_block) const {
+ // Special case full reductions to avoid input block copy below.
+ if (NumInputDims == NumReducedDims) {
+ eigen_assert(output_block->first_coeff_index() == 0);
+ eigen_assert(output_block->block_sizes().TotalSize() == 1);
+ Op reducer(m_reducer);
+ output_block->data()[0] = internal::InnerMostDimReducer<Self, Op>::reduce(
+ *this, 0, m_numValuesToReduce, reducer);
+ return;
+ }
+
+ // Calculate input tensor 'slice' required to reduce output block coeffs.
+ DSizes<Index, NumInputDims> input_slice_sizes(m_impl.dimensions());
+ for (int i = 0; i < NumOutputDims; ++i) {
+ // Clip preserved input dimensions by output block size.
+ input_slice_sizes[m_output_to_input_dim_map[i]] =
+ output_block->block_sizes()[i];
+ }
+
+ // Shard input tensor slice into blocks (because it could be large if we
+ // need to reduce along several dimensions to calculate required output
+ // coefficients).
+ const Index max_coeff_count =
+ numext::mini<Index>(((m_device.firstLevelCacheSize()) / sizeof(Scalar)),
+ input_slice_sizes.TotalSize());
+
+ // Calculate max output shard size needed to keep working set of reducers
+ // in L1, while leaving enough space for reducer overhead and 'PacketSize'
+ // reductions.
+ DSizes<Index, NumInputDims> target_input_block_sizes;
+ CalculateTargetInputBlockShape(max_coeff_count, input_slice_sizes,
+ &target_input_block_sizes);
+ // Calculate indices for first preserved dimension.
+ const Index first_preserved_dim_output_index =
+ static_cast<int>(Layout) == static_cast<int>(ColMajor)
+ ? 0
+ : NumOutputDims - 1;
+ const Index first_preserved_dim_input_index =
+ m_output_to_input_dim_map[first_preserved_dim_output_index];
+ const bool inner_most_dim_preserved =
+ (first_preserved_dim_input_index ==
+ (static_cast<int>(Layout) == static_cast<int>(ColMajor)
+ ? 0
+ : NumInputDims - 1)) ||
+ PreservingInnerMostDims;
+
+ // Calculate output block inner/outer dimension sizes.
+ const Index output_block_inner_dim_size =
+ output_block->block_sizes()[first_preserved_dim_output_index];
+ const Index output_block_outer_dim_size =
+ output_block->block_sizes().TotalSize() / output_block_inner_dim_size;
+ // Calculate shard size for first preserved dimension.
+ const Index output_shard_size =
+ target_input_block_sizes[first_preserved_dim_input_index];
+ const Index num_output_shards =
+ (output_block_inner_dim_size + output_shard_size - 1) /
+ output_shard_size;
+
+ // Initialize 'tensor_slice_offsets' from input coords of output index.
+ DSizes<Index, NumInputDims> tensor_slice_offsets;
+ GetInputCoordsForOutputIndex(output_block->first_coeff_index(),
+ &tensor_slice_offsets);
+
+ // Store tensor slice offset in first preserved dimension to be used
+ // to update tensor slice extents in loop below.
+ const Index first_preserved_dim_offset_start =
+ tensor_slice_offsets[first_preserved_dim_input_index];
+
+ array<BlockIteratorState, NumOutputDims> block_iter_state;
+
+ // Initialize state used to iterate through output coefficients
+ // and update 'tensor_slice_offsets' in outer preserved dims.
+ for (int i = 0; i < NumOutputDims - 1; ++i) {
+ const int dim = static_cast<int>(Layout) == static_cast<int>(ColMajor)
+ ? i + 1
+ : NumOutputDims - i - 2;
+ block_iter_state[i].input_dim = m_output_to_input_dim_map[dim];
+ block_iter_state[i].output_size = output_block->block_sizes()[dim];
+ block_iter_state[i].output_count = 0;
+ }
+
+ // Allocate input block memory.
+ ScalarNoConst* input_block_data = static_cast<ScalarNoConst*>(
+ m_device.allocate(max_coeff_count * sizeof(Scalar)));
+ // Allocate reducer memory.
+ const bool packet_reductions_enabled =
+ (Self::InputPacketAccess & Self::ReducerTraits::PacketAccess);
+ const Index num_reducers =
+ (inner_most_dim_preserved && packet_reductions_enabled)
+ ? (output_shard_size / PacketSize + output_shard_size % PacketSize +
+ PacketSize)
+ : output_shard_size;
+ typedef internal::BlockReducer<Self, Op> BlockReducer;
+ BlockReducer* reducers = static_cast<BlockReducer*>(
+ m_device.allocate(num_reducers * sizeof(BlockReducer)));
+
+ InputDimensions input_tensor_dims(m_impl.dimensions());
+ for (Index output_outer_index = 0;
+ output_outer_index < output_block_outer_dim_size;
+ ++output_outer_index) {
+ for (Index output_shard_index = 0; output_shard_index < num_output_shards;
+ ++output_shard_index) {
+ // Initialize 'tensor_slice_extents' for this output shard.
+ DSizes<Index, NumInputDims> tensor_slice_extents(input_slice_sizes);
+ for (int i = 0; i < NumInputDims; ++i) {
+ if (i == first_preserved_dim_input_index) {
+ // Clip first preserved dim size to output shard size.
+ tensor_slice_extents[i] = numext::mini(
+ output_shard_size,
+ input_slice_sizes[i] - (tensor_slice_offsets[i] -
+ first_preserved_dim_offset_start));
+
+ } else if (!m_reduced[i]) {
+ // Clip outer preserved dims to size 1, so that we reduce a
+ // contiguous set of output coefficients.
+ tensor_slice_extents[i] = 1;
+ }
+ }
+
+        // Initialize output coefficient reducers.
+        for (Index i = 0; i < num_reducers; ++i) {
+ new (&reducers[i]) BlockReducer(m_reducer);
+ }
+
+ typedef internal::TensorSliceBlockMapper<ScalarNoConst, Index,
+ NumInputDims, Layout>
+ TensorSliceBlockMapper;
+
+ // TODO(andydavis) Consider removing 'input_block_stride_order' if we
+ // find that scattered reads are not worth supporting in
+ // TensorSliceBlockMapper.
+ TensorSliceBlockMapper block_mapper(
+ input_tensor_dims, tensor_slice_offsets, tensor_slice_extents,
+ target_input_block_sizes, DimensionList<Index, NumInputDims>());
+
+ const Index num_outputs_to_update =
+ tensor_slice_extents[first_preserved_dim_input_index];
+ const Index preserved_dim_vector_reducer_count =
+ (inner_most_dim_preserved && packet_reductions_enabled)
+ ? num_outputs_to_update / PacketSize
+ : 0;
+ const Index preserved_dim_vector_coeff_count =
+ inner_most_dim_preserved
+ ? preserved_dim_vector_reducer_count * PacketSize
+ : 0;
+ const Index preserved_dim_reducer_limit =
+ (inner_most_dim_preserved && packet_reductions_enabled)
+ ? (preserved_dim_vector_reducer_count +
+ num_outputs_to_update % PacketSize)
+ : num_outputs_to_update;
+
+ const Index total_block_count = block_mapper.total_block_count();
+ for (Index b = 0; b < total_block_count; ++b) {
+ InputTensorBlock input_block =
+ block_mapper.GetBlockForIndex(b, input_block_data);
+ // Read.
+ m_impl.block(&input_block);
+
+ Index num_values_to_reduce = 1;
+ for (Index i = 0; i < NumInputDims; ++i) {
+ if (m_reduced[i]) {
+ num_values_to_reduce *= input_block.block_sizes()[i];
+ }
+ }
+ // Reduce.
+ if (inner_most_dim_preserved) {
+ const Index input_outer_dim_size =
+ input_block.block_sizes().TotalSize() / num_outputs_to_update;
+ for (Index input_outer_dim_index = 0;
+ input_outer_dim_index < input_outer_dim_size;
+ ++input_outer_dim_index) {
+ const Index input_outer_dim_base =
+ input_outer_dim_index * num_outputs_to_update;
+ for (Index i = 0; i < preserved_dim_vector_reducer_count; ++i) {
+ reducers[i].Reduce(input_outer_dim_base + i * PacketSize,
+ PacketSize, input_block.data());
+ }
+ const Index scalar_reducer_base =
+ input_outer_dim_base + preserved_dim_vector_coeff_count;
+ for (Index i = preserved_dim_vector_reducer_count;
+ i < preserved_dim_reducer_limit; ++i) {
+ reducers[i].Reduce(scalar_reducer_base + i -
+ preserved_dim_vector_reducer_count,
+ 1, input_block.data());
+ }
+ }
+ } else {
+ for (Index i = 0; i < num_outputs_to_update; ++i) {
+ reducers[i].Reduce(i * num_values_to_reduce, num_values_to_reduce,
+ input_block.data());
+ }
+ }
+ }
+
+ // Finalize all reducers for this output shard.
+ const Index output_base_index =
+ output_outer_index * output_block_inner_dim_size +
+ output_shard_index * output_shard_size;
+ if (inner_most_dim_preserved) {
+ EIGEN_ALIGN_MAX
+ typename internal::remove_const<CoeffReturnType>::type
+ values[PacketSize];
+ for (Index i = 0; i < preserved_dim_vector_reducer_count; ++i) {
+ const Index reducer_base = output_base_index + i * PacketSize;
+ internal::pstore<CoeffReturnType, PacketReturnType>(
+ values, reducers[i].FinalizePacket());
+ for (Index j = 0; j < PacketSize; ++j) {
+ output_block->data()[reducer_base + j] = values[j];
+ }
+ }
+ const Index scalar_reducer_base =
+ output_base_index + preserved_dim_vector_coeff_count;
+
+ for (Index i = preserved_dim_vector_reducer_count;
+ i < preserved_dim_reducer_limit; ++i) {
+ output_block->data()[scalar_reducer_base + i -
+ preserved_dim_vector_reducer_count] =
+ reducers[i].Finalize();
+ }
+ } else {
+          for (Index i = 0; i < num_outputs_to_update; ++i) {
+ output_block->data()[output_base_index + i] =
+ reducers[i].Finalize();
+ }
+ }
+
+        // Advance 'tensor_slice_offsets' by the number of outputs updated
+        // for this output shard.
+ tensor_slice_offsets[first_preserved_dim_input_index] +=
+ num_outputs_to_update;
+ }
+ // Update slice offset for inner preserved dim.
+ tensor_slice_offsets[first_preserved_dim_input_index] -=
+ output_block_inner_dim_size;
+ // Update slice offsets for remaining output dims.
+ for (int i = 0; i < NumOutputDims - 1; ++i) {
+ BlockIteratorState& b = block_iter_state[i];
+ if (++b.output_count < b.output_size) {
+ ++tensor_slice_offsets[b.input_dim];
+ break;
+ }
+ b.output_count = 0;
+ tensor_slice_offsets[b.input_dim] -= b.output_size - 1;
+ }
+ }
+
+ // Free memory.
+ m_device.deallocate(input_block_data);
+ m_device.deallocate(reducers);
+ }
+
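
The 'num_output_shards' computation above is the usual ceiling division. A minimal
standalone sketch of the identity (illustrative C++, not part of the patch):

#include <cassert>

// Ceiling division: number of shards of size 'shard' needed to cover
// 'total' outputs.
inline long num_shards(long total, long shard) {
  assert(shard > 0);
  return (total + shard - 1) / shard;
}

int main() {
  assert(num_shards(10, 4) == 3);  // shards of sizes 4, 4, 2
  assert(num_shards(8, 4) == 2);   // exact fit
  assert(num_shards(1, 4) == 1);   // a partial shard still counts
}
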
EIGEN_DEVICE_FUNC typename MakePointer_<CoeffReturnType>::Type data() const { return m_result; }
#if defined(EIGEN_USE_SYCL)
@@ -788,6 +1130,12 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
template <typename S, typename O, typename D> friend struct internal::InnerReducer;
+ struct BlockIteratorState {
+ Index input_dim;
+ Index output_size;
+ Index output_count;
+ };
+
// Returns the Index in the input tensor of the first value that needs to be
// used to compute the reduction at output index "index".
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
@@ -830,16 +1178,88 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
return startInput;
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void GetInputCoordsForOutputIndex(
+ Index index,
+ DSizes<Index, NumInputDims>* coords) const {
+ for (int i = 0; i < NumInputDims; ++i) {
+ (*coords)[i] = 0;
+ }
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ for (int i = NumOutputDims - 1; i > 0; --i) {
+ const Index idx = index / m_fastOutputStrides[i];
+ (*coords)[m_output_to_input_dim_map[i]] = idx;
+ index -= idx * m_outputStrides[i];
+ }
+ (*coords)[m_output_to_input_dim_map[0]] = index;
+ } else {
+ for (int i = 0; i < NumOutputDims - 1; ++i) {
+ const Index idx = index / m_fastOutputStrides[i];
+ (*coords)[m_output_to_input_dim_map[i]] = idx;
+ index -= idx * m_outputStrides[i];
+ }
+ (*coords)[m_output_to_input_dim_map[NumOutputDims-1]] = index;
+ }
+ }
+
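
GetInputCoordsForOutputIndex above is a stride-based delinearization of a flat
output index. A minimal column-major sketch with a hypothetical coords_of helper
(plain long indices instead of Eigen's Index and fast divisors):

#include <array>
#include <cassert>

// Convert a flat column-major index into per-dimension coordinates, given
// strides where stride[0] == 1 and stride[i] = stride[i-1] * dim[i-1].
template <int N>
std::array<long, N> coords_of(long index, const std::array<long, N>& stride) {
  std::array<long, N> c{};
  for (int i = N - 1; i > 0; --i) {
    c[i] = index / stride[i];
    index -= c[i] * stride[i];
  }
  c[0] = index;
  return c;
}

int main() {
  // 3x4 column-major tensor: strides {1, 3}.
  std::array<long, 2> stride = {1, 3};
  auto c = coords_of<2>(7, stride);  // 7 = 1 + 2*3 -> coords (1, 2)
  assert(c[0] == 1 && c[1] == 2);
}
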
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void CalculateTargetInputBlockShape(
+ const Index max_coeff_count,
+ const DSizes<Index, NumInputDims>& input_slice_sizes,
+ DSizes<Index, NumInputDims>* target_input_block_sizes) const {
+ typedef internal::BlockReducer<Self, Op> BlockReducer;
+ // TODO(andydavis) Compute reducer overhead correctly for the case where
+    // we are preserving the innermost dimension, and a single reducer
+ // reduces a packet's worth of output coefficients.
+ const Index reducer_overhead = sizeof(BlockReducer) / sizeof(Scalar);
+
+ Index coeff_to_allocate = max_coeff_count;
+ bool first_preserved_dim_allocated = false;
+ bool first_reduced_dim_allocated = false;
+ for (int i = 0; i < NumInputDims; ++i) {
+ const int dim = static_cast<int>(Layout) == static_cast<int>(ColMajor)
+ ? i
+ : NumInputDims - i - 1;
+ (*target_input_block_sizes)[dim] = 1;
+ if (m_reduced[dim]) {
+ // TODO(andydavis) Consider allocating to multiple reduced dimensions.
+ // Watch out for cases where reduced dimensions are not contiguous,
+ // which induces scattered reads.
+ if (!first_reduced_dim_allocated) {
+ (*target_input_block_sizes)[dim] =
+ numext::mini(input_slice_sizes[dim], coeff_to_allocate);
+ coeff_to_allocate /= (*target_input_block_sizes)[dim];
+ first_reduced_dim_allocated = true;
+ }
+ } else if (!first_preserved_dim_allocated) {
+ // TODO(andydavis) Include output block size in this L1 working set
+ // calculation.
+ const Index alloc_size = numext::maxi(
+ static_cast<Index>(1), coeff_to_allocate / reducer_overhead);
+ (*target_input_block_sizes)[dim] =
+ numext::mini(input_slice_sizes[dim], alloc_size);
+ coeff_to_allocate = numext::maxi(
+ static_cast<Index>(1),
+ coeff_to_allocate /
+ ((*target_input_block_sizes)[dim] * reducer_overhead));
+ first_preserved_dim_allocated = true;
+ }
+ }
+ }
+
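
A worked trace of CalculateTargetInputBlockShape above, with hypothetical numbers
(ColMajor, NumInputDims = 3, dim 0 reduced, dims 1 and 2 preserved,
max_coeff_count = 1024, input_slice_sizes = {100, 50, 8}, reducer_overhead = 4):

  dim 0 (first reduced dim):   target[0] = min(100, 1024) = 100;  coeff_to_allocate = 1024 / 100 = 10
  dim 1 (first preserved dim): alloc_size = max(1, 10 / 4) = 2;   target[1] = min(50, 2) = 2;
                               coeff_to_allocate = max(1, 10 / (2 * 4)) = 1
  dim 2 (later preserved dim): target[2] = 1

The resulting 100 x 2 x 1 target block covers 200 input coefficients plus two
reducers' worth of overhead, comfortably within the original budget.
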
// Bitmap indicating if an input dimension is reduced or not.
array<bool, NumInputDims> m_reduced;
// Dimensions of the output of the operation.
Dimensions m_dimensions;
// Precomputed strides for the output tensor.
array<Index, NumOutputDims> m_outputStrides;
+ array<internal::TensorIntDivisor<Index>, NumOutputDims> m_fastOutputStrides;
// Subset of strides of the input tensor for the non-reduced dimensions.
// Indexed by output dimensions.
static const int NumPreservedStrides = max_n_1<NumOutputDims>::size;
array<Index, NumPreservedStrides> m_preservedStrides;
+ // Map from output to input dimension index.
+ array<Index, NumOutputDims> m_output_to_input_dim_map;
+  // How many values go into each reduction.
+ Index m_numValuesToReduce;
// Subset of strides of the input tensor for the reduced dimensions.
// Indexed by reduced dimensions.
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h b/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h
index a6cade50f..6e15e75f9 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h
@@ -137,6 +137,7 @@ template<typename PlainObjectType> class TensorRef : public TensorBase<TensorRef
IsAligned = false,
PacketAccess = false,
BlockAccess = false,
+ PreferBlockAccess = false,
Layout = PlainObjectType::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
@@ -366,6 +367,7 @@ struct TensorEvaluator<const TensorRef<Derived>, Device>
IsAligned = false,
PacketAccess = false,
BlockAccess = false,
+ PreferBlockAccess = false,
Layout = TensorRef<Derived>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
@@ -414,6 +416,7 @@ struct TensorEvaluator<TensorRef<Derived>, Device> : public TensorEvaluator<cons
IsAligned = false,
PacketAccess = false,
BlockAccess = false,
+ PreferBlockAccess = false,
RawAccess = false
};
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
index 9193bdd8e..b7fb969f3 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
@@ -114,6 +114,7 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
IsAligned = false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = false,
+ PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
@@ -255,6 +256,7 @@ struct TensorEvaluator<TensorReverseOp<ReverseDimensions, ArgType>, Device>
IsAligned = false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = false,
+ PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
index b1135f297..641366d9d 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
@@ -97,6 +97,7 @@ struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> {
IsAligned = false,
PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
BlockAccess = false,
+ PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false,
RawAccess = true
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
index e25dd9cf8..e018d0ab2 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
@@ -100,6 +100,7 @@ class TensorShufflingOp : public TensorBase<TensorShufflingOp<Shuffle, XprType>
template<typename Shuffle, typename ArgType, typename Device>
struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
{
+ typedef TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device> Self;
typedef TensorShufflingOp<Shuffle, ArgType> XprType;
typedef typename XprType::Index Index;
static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
@@ -110,43 +111,62 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
enum {
- IsAligned = false,
- PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
- BlockAccess = false,
- Layout = TensorEvaluator<ArgType, Device>::Layout,
- CoordAccess = false, // to be implemented
- RawAccess = false
+ IsAligned = false,
+ PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
+ BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
+ PreferBlockAccess = true,
+ Layout = TensorEvaluator<ArgType, Device>::Layout,
+ CoordAccess = false, // to be implemented
+ RawAccess = false
};
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
- : m_impl(op.expression(), device), m_shuffle(op.shufflePermutation())
+ typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
+
+ typedef internal::TensorBlock<ScalarNoConst, Index, NumDims, Layout>
+ TensorBlock;
+ typedef internal::TensorBlockReader<ScalarNoConst, Index, NumDims, Layout>
+ TensorBlockReader;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op,
+ const Device& device)
+ : m_device(device),
+ m_impl(op.expression(), device),
+ m_shuffle(op.shufflePermutation())
{
const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
const Shuffle& shuffle = op.shufflePermutation();
+ m_is_identity = true;
for (int i = 0; i < NumDims; ++i) {
m_dimensions[i] = input_dims[shuffle[i]];
+ m_inverseShuffle[shuffle[i]] = i;
+ if (m_is_identity && shuffle[i] != i) {
+ m_is_identity = false;
+ }
}
- array<Index, NumDims> inputStrides;
-
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
- inputStrides[0] = 1;
+ m_unshuffledInputStrides[0] = 1;
m_outputStrides[0] = 1;
+
for (int i = 1; i < NumDims; ++i) {
- inputStrides[i] = inputStrides[i - 1] * input_dims[i - 1];
+ m_unshuffledInputStrides[i] =
+ m_unshuffledInputStrides[i - 1] * input_dims[i - 1];
m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1];
+ m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i]);
}
} else {
- inputStrides[NumDims - 1] = 1;
+ m_unshuffledInputStrides[NumDims - 1] = 1;
m_outputStrides[NumDims - 1] = 1;
for (int i = NumDims - 2; i >= 0; --i) {
- inputStrides[i] = inputStrides[i + 1] * input_dims[i + 1];
+ m_unshuffledInputStrides[i] =
+ m_unshuffledInputStrides[i + 1] * input_dims[i + 1];
m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1];
+ m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i]);
}
}
for (int i = 0; i < NumDims; ++i) {
- m_inputStrides[i] = inputStrides[shuffle[i]];
+ m_inputStrides[i] = m_unshuffledInputStrides[shuffle[i]];
}
}
@@ -162,29 +182,152 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
{
- return m_impl.coeff(srcCoeff(index));
+ if (m_is_identity) {
+ return m_impl.coeff(index);
+ } else {
+ return m_impl.coeff(srcCoeff(index));
+ }
}
+ template <int LoadMode, typename Self, bool ImplPacketAccess>
+ struct PacketLoader {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ static PacketReturnType Run(const Self& self, Index index) {
+ EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+ for (int i = 0; i < PacketSize; ++i) {
+ values[i] = self.coeff(index + i);
+ }
+ PacketReturnType rslt = internal::pload<PacketReturnType>(values);
+ return rslt;
+ }
+ };
+
+ template<int LoadMode, typename Self>
+ struct PacketLoader<LoadMode, Self, true> {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ static PacketReturnType Run(const Self& self, Index index) {
+ if (self.m_is_identity) {
+ return self.m_impl.template packet<LoadMode>(index);
+ } else {
+ EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+ for (int i = 0; i < PacketSize; ++i) {
+ values[i] = self.coeff(index + i);
+ }
+ PacketReturnType rslt = internal::pload<PacketReturnType>(values);
+ return rslt;
+ }
+ }
+ };
+
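
PacketLoader selects between its two Run implementations at compile time via
partial specialization on the ImplPacketAccess flag. A minimal standalone sketch
of that dispatch pattern (hypothetical Loader type, not Eigen code):

#include <iostream>

// The primary template is the generic (scalar) fallback.
template <int Mode, bool UseFast>
struct Loader {
  static const char* Run() { return "scalar fallback"; }
};

// Partial specialization selected at compile time when UseFast is true.
template <int Mode>
struct Loader<Mode, true> {
  static const char* Run() { return "fast path"; }
};

int main() {
  std::cout << Loader<0, false>::Run() << "\n";  // prints: scalar fallback
  std::cout << Loader<0, true>::Run() << "\n";   // prints: fast path
}
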
template<int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
{
- EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
- eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
+ EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ eigen_assert(index + PacketSize - 1 < dimensions().TotalSize());
+ return PacketLoader<LoadMode, Self, TensorEvaluator<ArgType, Device>::PacketAccess>::Run(*this, index);
+ }
- EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
- for (int i = 0; i < PacketSize; ++i) {
- values[i] = coeff(index+i);
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements(
+ std::vector<internal::TensorOpResourceRequirements>* resources) const {
+ Eigen::Index block_total_size_max = numext::maxi<Eigen::Index>(
+ 1, m_device.firstLevelCacheSize() / sizeof(Scalar));
+ resources->push_back(internal::TensorOpResourceRequirements(
+ internal::kUniformAllDims, block_total_size_max));
+ m_impl.getResourceRequirements(resources);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(
+ TensorBlock* output_block) const {
+ if (m_impl.data() != NULL) {
+ // Fast path: we have direct access to the data, so shuffle as we read.
+ TensorBlockReader::Run(output_block,
+ srcCoeff(output_block->first_coeff_index()),
+ m_inverseShuffle,
+ m_unshuffledInputStrides,
+ m_impl.data());
+ return;
+ }
+
+ // Slow path: read unshuffled block from the input and shuffle in-place.
+ // Initialize input block sizes using input-to-output shuffle map.
+ DSizes<Index, NumDims> input_block_sizes;
+ for (Index i = 0; i < NumDims; ++i) {
+ input_block_sizes[i] = output_block->block_sizes()[m_inverseShuffle[i]];
+ }
+
+ // Calculate input block strides.
+ DSizes<Index, NumDims> input_block_strides;
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ input_block_strides[0] = 1;
+ for (int i = 1; i < NumDims; ++i) {
+ input_block_strides[i] =
+ input_block_strides[i - 1] * input_block_sizes[i - 1];
+ }
+ } else {
+ input_block_strides[NumDims - 1] = 1;
+ for (int i = NumDims - 2; i >= 0; --i) {
+ input_block_strides[i] =
+ input_block_strides[i + 1] * input_block_sizes[i + 1];
+ }
+ }
+
+ // Read input block.
+ TensorBlock input_block(srcCoeff(output_block->first_coeff_index()),
+ input_block_sizes,
+ input_block_strides,
+ Dimensions(m_unshuffledInputStrides),
+ output_block->data());
+
+ m_impl.block(&input_block);
+
+    // Naive in-place shuffle: random I/O, but the block size is O(L1 cache size).
+ // TODO(andydavis) Improve the performance of this in-place shuffle.
+ const Index total_size = input_block_sizes.TotalSize();
+ std::vector<bool> bitmap(total_size, false);
+ ScalarNoConst* data = const_cast<ScalarNoConst*>(output_block->data());
+ const DSizes<Index, NumDims>& output_block_strides =
+ output_block->block_strides();
+ for (Index input_index = 0; input_index < total_size; ++input_index) {
+ if (bitmap[input_index]) {
+ // Coefficient at this index has already been shuffled.
+ continue;
+ }
+
+ Index output_index = GetBlockOutputIndex(input_index, input_block_strides,
+ output_block_strides);
+ if (output_index == input_index) {
+ // Coefficient already in place.
+ bitmap[output_index] = true;
+ continue;
+ }
+
+      // The following loop starts at 'input_index' and shuffles
+      // coefficients into their shuffled locations at 'output_index'.
+      // It walks the array by following the shuffle cycle that starts
+      // and ends at 'input_index'.
+ ScalarNoConst evicted_value;
+ ScalarNoConst shuffled_value = data[input_index];
+ do {
+ evicted_value = data[output_index];
+ data[output_index] = shuffled_value;
+ shuffled_value = evicted_value;
+ bitmap[output_index] = true;
+ output_index = GetBlockOutputIndex(output_index, input_block_strides,
+ output_block_strides);
+ } while (output_index != input_index);
+
+ data[output_index] = shuffled_value;
+ bitmap[output_index] = true;
}
- PacketReturnType rslt = internal::pload<PacketReturnType>(values);
- return rslt;
}
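
The do/while above applies one permutation cycle at a time, carrying a single
evicted value and marking visited slots in the bitmap. A standalone sketch of
the same cycle-following idea on a flat array (hypothetical permute_in_place
helper, not the patch's code):

#include <cassert>
#include <cstddef>
#include <utility>
#include <vector>

// Apply a permutation in place: the element at index i moves to index dst[i].
// Each cycle is followed once, carrying a single evicted value, with O(n)
// bits of bookkeeping -- the same idea as the bitmap above.
void permute_in_place(std::vector<int>& data, const std::vector<int>& dst) {
  std::vector<bool> done(data.size(), false);
  for (std::size_t start = 0; start < data.size(); ++start) {
    if (done[start] || dst[start] == static_cast<int>(start)) continue;
    int carried = data[start];
    std::size_t out = dst[start];
    while (out != start) {
      std::swap(carried, data[out]);  // evict and place in one step
      done[out] = true;
      out = dst[out];
    }
    data[start] = carried;  // the cycle closes back at its starting index
    done[start] = true;
  }
}

int main() {
  std::vector<int> v = {10, 20, 30};
  permute_in_place(v, {1, 2, 0});  // one cycle: 0 -> 1 -> 2 -> 0
  assert(v[0] == 30 && v[1] == 10 && v[2] == 20);
}
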
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
- const double compute_cost = NumDims * (2 * TensorOpCost::AddCost<Index>() +
+ const double compute_cost = m_is_identity ? TensorOpCost::AddCost<Index>() :
+ NumDims * (2 * TensorOpCost::AddCost<Index>() +
2 * TensorOpCost::MulCost<Index>() +
TensorOpCost::DivCost<Index>());
return m_impl.costPerCoeff(vectorized) +
- TensorOpCost(0, 0, compute_cost, false /* vectorized */, PacketSize);
+ TensorOpCost(0, 0, compute_cost, m_is_identity /* vectorized */, PacketSize);
}
EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; }
@@ -195,27 +338,58 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const {return m_impl;}
protected:
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index GetBlockOutputIndex(
+ Index input_index,
+ const DSizes<Index, NumDims>& input_block_strides,
+ const DSizes<Index, NumDims>& output_block_strides) const {
+ Index output_index = 0;
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ for (int i = NumDims - 1; i > 0; --i) {
+ const Index idx = input_index / input_block_strides[i];
+ output_index += idx * output_block_strides[m_inverseShuffle[i]];
+ input_index -= idx * input_block_strides[i];
+ }
+ return output_index + input_index *
+ output_block_strides[m_inverseShuffle[0]];
+ } else {
+ for (int i = 0; i < NumDims - 1; ++i) {
+ const Index idx = input_index / input_block_strides[i];
+ output_index += idx * output_block_strides[m_inverseShuffle[i]];
+ input_index -= idx * input_block_strides[i];
+ }
+ return output_index + input_index *
+ output_block_strides[m_inverseShuffle[NumDims - 1]];
+ }
+ }
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const {
Index inputIndex = 0;
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
for (int i = NumDims - 1; i > 0; --i) {
- const Index idx = index / m_outputStrides[i];
+ const Index idx = index / m_fastOutputStrides[i];
inputIndex += idx * m_inputStrides[i];
index -= idx * m_outputStrides[i];
}
return inputIndex + index * m_inputStrides[0];
} else {
for (int i = 0; i < NumDims - 1; ++i) {
- const Index idx = index / m_outputStrides[i];
+ const Index idx = index / m_fastOutputStrides[i];
inputIndex += idx * m_inputStrides[i];
index -= idx * m_outputStrides[i];
}
return inputIndex + index * m_inputStrides[NumDims - 1];
}
}
+
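
srcCoeff now divides by m_fastOutputStrides, i.e. TensorIntDivisor objects that
precompute per-stride constants so the hot loop avoids hardware division. A toy
sketch of the precomputed-divisor idea, restricted to power-of-two divisors for
brevity (Eigen's version handles arbitrary divisors with a multiply-and-shift):

#include <cassert>
#include <cstdint>

// Toy precomputed divisor: for power-of-two divisors the division becomes
// a shift computed once at construction time.
struct Pow2Divisor {
  explicit Pow2Divisor(uint64_t d) : shift(0) {
    while ((uint64_t(1) << shift) < d) ++shift;
    assert((uint64_t(1) << shift) == d && "power-of-two only in this sketch");
  }
  int shift;
};

inline uint64_t operator/(uint64_t n, Pow2Divisor d) { return n >> d.shift; }

int main() {
  Pow2Divisor by8(8);
  assert(100 / by8 == 12);  // same result as 100 / 8, without a divide
}
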
Dimensions m_dimensions;
+ bool m_is_identity;
+ array<Index, NumDims> m_inverseShuffle;
array<Index, NumDims> m_outputStrides;
+ array<internal::TensorIntDivisor<Index>, NumDims> m_fastOutputStrides;
array<Index, NumDims> m_inputStrides;
+ array<Index, NumDims> m_unshuffledInputStrides;
+
+ const Device& m_device;
TensorEvaluator<ArgType, Device> m_impl;
/// required by sycl
Shuffle m_shuffle;
@@ -239,12 +413,21 @@ struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device>
static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
enum {
- IsAligned = false,
- PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
- BlockAccess = false,
- RawAccess = false
+ IsAligned = false,
+ PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
+ BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
+ PreferBlockAccess = true,
+ Layout = TensorEvaluator<ArgType, Device>::Layout,
+ RawAccess = false
};
+ typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
+
+ typedef internal::TensorBlock<ScalarNoConst, Index, NumDims, Layout>
+ TensorBlock;
+ typedef internal::TensorBlockWriter<ScalarNoConst, Index, NumDims, Layout>
+ TensorBlockWriter;
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: Base(op, device)
{ }
@@ -265,6 +448,14 @@ struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device>
this->coeffRef(index+i) = values[i];
}
}
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock(
+ const TensorBlock& block) {
+ eigen_assert(this->m_impl.data() != NULL);
+ TensorBlockWriter::Run(block, this->srcCoeff(block.first_coeff_index()),
+ this->m_inverseShuffle,
+ this->m_unshuffledInputStrides, this->m_impl.data());
+ }
};
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
index 4b69072f2..221dc96c9 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
@@ -113,6 +113,7 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = false,
+ PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
@@ -275,6 +276,7 @@ struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device>
IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = false,
+ PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h b/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h
index ea53bb04b..9fc54a4c0 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h
@@ -96,6 +96,7 @@ struct TensorEvaluator<const TensorTraceOp<Dims, ArgType>, Device>
IsAligned = false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = false,
+ PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false,
RawAccess = false
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h
index 3c7d8bbc0..c1b7a58ca 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h
@@ -200,6 +200,7 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D
IsAligned = false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = false,
+ PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false,
RawAccess = false
diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h b/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h
index 22c952ae1..2cac2d0f1 100644
--- a/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h
+++ b/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h
@@ -169,9 +169,7 @@ class EventCount {
class Waiter {
friend class EventCount;
- // Align to 128 byte boundary to prevent false sharing with other Waiter
- // objects in the same vector.
- EIGEN_ALIGN_TO_BOUNDARY(128) std::atomic<Waiter*> next;
+ std::atomic<Waiter*> next;
std::mutex mu;
std::condition_variable cv;
uint64_t epoch;
@@ -181,6 +179,9 @@ class EventCount {
kWaiting,
kSignaled,
};
+  // Pad past a 128-byte boundary to prevent false sharing with other Waiter
+ // objects in the same vector.
+ char pad_[128];
};
private:
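
The change above trades 128-byte alignment of the first member for trailing
padding; either way, each Waiter occupies its own cache-line region. A minimal
sketch of padding against false sharing (assuming 64-byte cache lines fetched
in adjacent pairs):

#include <atomic>

// Each slot occupies its own padded region, so writes by one thread do not
// invalidate the cache line holding a neighbouring slot.
struct PaddedCounter {
  std::atomic<long> value{0};
  // Pad past a 128-byte boundary (two typical 64-byte lines, matching
  // adjacent-line prefetchers).
  char pad[128 - sizeof(std::atomic<long>)];
};

int main() {
  PaddedCounter counters[2];
  counters[0].value.fetch_add(1);
  counters[1].value.fetch_add(1);
  return (counters[0].value == 1 && counters[1].value == 1) ? 0 : 1;
}
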
diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/ThreadLocal.h b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadLocal.h
index a41731c34..7229839ac 100644
--- a/unsupported/Eigen/CXX11/src/ThreadPool/ThreadLocal.h
+++ b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadLocal.h
@@ -12,7 +12,8 @@
#if EIGEN_MAX_CPP_VER >= 11 && \
((EIGEN_COMP_GNUC && EIGEN_GNUC_AT_LEAST(4, 8)) || \
- __has_feature(cxx_thread_local))
+ __has_feature(cxx_thread_local) || \
+ (EIGEN_COMP_MSVC >= 1900) )
#define EIGEN_THREAD_LOCAL static thread_local
#endif
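
The added clause extends EIGEN_THREAD_LOCAL to MSVC 2015 and later (compiler
version 1900), which support C++11 thread_local. A plain C++11 sketch of the
per-thread semantics the macro provides (standalone, without Eigen):

#include <thread>

// Each thread sees its own copy of 'counter'; this is what the macro
// expands to once it is defined ('static thread_local').
int& per_thread_counter() {
  static thread_local int counter = 0;
  return counter;
}

int main() {
  per_thread_counter()++;                              // main thread's copy
  std::thread([] { per_thread_counter()++; }).join();  // worker's own copy
  return per_thread_counter() == 1 ? 0 : 1;            // main still sees 1
}
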
diff --git a/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h b/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h
index 13d959df4..2db914765 100755
--- a/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h
+++ b/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h
@@ -534,7 +534,8 @@ struct ScalarBinaryOpTraits<typename DerType::Scalar,AutoDiffScalar<DerType>, Bi
EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(typename Eigen::internal::remove_all<DerType>::type, typename Eigen::internal::traits<typename Eigen::internal::remove_all<DerType>::type>::Scalar, product) > \
FUNC(const Eigen::AutoDiffScalar<DerType>& x) { \
using namespace Eigen; \
- EIGEN_UNUSED typedef typename Eigen::internal::traits<typename Eigen::internal::remove_all<DerType>::type>::Scalar Scalar; \
+ typedef typename Eigen::internal::traits<typename Eigen::internal::remove_all<DerType>::type>::Scalar Scalar; \
+ EIGEN_UNUSED_VARIABLE(sizeof(Scalar)); \
CODE; \
}
diff --git a/unsupported/test/cxx11_tensor_block_access.cpp b/unsupported/test/cxx11_tensor_block_access.cpp
index 40e6227e5..ad12ae557 100644
--- a/unsupported/test/cxx11_tensor_block_access.cpp
+++ b/unsupported/test/cxx11_tensor_block_access.cpp
@@ -526,6 +526,114 @@ static void test_block_io_squeeze_ones() {
}
template <typename T, int NumDims, int Layout>
+static void test_block_cwise_unary_io_basic() {
+ typedef internal::scalar_square_op<T> UnaryFunctor;
+ typedef internal::TensorBlockCwiseUnaryIO<UnaryFunctor, Index, T, NumDims,
+ Layout>
+ TensorBlockCwiseUnaryIO;
+
+ DSizes<Index, NumDims> block_sizes = RandomDims<NumDims>();
+ DSizes<Index, NumDims> strides(ComputeStrides<Layout, NumDims>(block_sizes));
+
+ const Index total_size = block_sizes.TotalSize();
+
+  // Create a random input tensor.
+ T* input_data = GenerateRandomData<T>(total_size);
+
+ T* output_data = new T[total_size];
+ UnaryFunctor functor;
+ TensorBlockCwiseUnaryIO::Run(functor, block_sizes, strides, output_data,
+ strides, input_data);
+ for (int i = 0; i < total_size; ++i) {
+ VERIFY_IS_EQUAL(output_data[i], functor(input_data[i]));
+ }
+
+ delete[] input_data;
+ delete[] output_data;
+}
+
+template <int Layout>
+static void test_block_cwise_unary_io_squeeze_ones() {
+ typedef internal::scalar_square_op<float> UnaryFunctor;
+ typedef internal::TensorBlockCwiseUnaryIO<UnaryFunctor, Index, float, 5,
+ Layout>
+ TensorBlockCwiseUnaryIO;
+
+ DSizes<Index, 5> block_sizes(1, 2, 1, 3, 1);
+ DSizes<Index, 5> strides(ComputeStrides<Layout, 5>(block_sizes));
+
+ const Index total_size = block_sizes.TotalSize();
+
+  // Create a random input tensor.
+ float* input_data = GenerateRandomData<float>(total_size);
+
+ float* output_data = new float[total_size];
+ UnaryFunctor functor;
+ TensorBlockCwiseUnaryIO::Run(functor, block_sizes, strides, output_data,
+ strides, input_data);
+ for (int i = 0; i < total_size; ++i) {
+ VERIFY_IS_EQUAL(output_data[i], functor(input_data[i]));
+ }
+
+ delete[] input_data;
+ delete[] output_data;
+}
+
+template <int Layout>
+static void test_block_cwise_unary_io_zero_strides() {
+ typedef internal::scalar_square_op<float> UnaryFunctor;
+ typedef internal::TensorBlockCwiseUnaryIO<UnaryFunctor, Index, float, 5,
+ Layout>
+ TensorBlockCwiseUnaryIO;
+
+ DSizes<Index, 5> rnd_dims = RandomDims<5>();
+
+ DSizes<Index, 5> input_sizes = rnd_dims;
+ input_sizes[0] = 1;
+ input_sizes[2] = 1;
+ input_sizes[4] = 1;
+
+ DSizes<Index, 5> input_strides(ComputeStrides<Layout, 5>(input_sizes));
+ input_strides[0] = 0;
+ input_strides[2] = 0;
+ input_strides[4] = 0;
+
+ // Generate random data.
+ float* input_data = GenerateRandomData<float>(input_sizes.TotalSize());
+
+ DSizes<Index, 5> output_sizes = rnd_dims;
+ DSizes<Index, 5> output_strides(ComputeStrides<Layout, 5>(output_sizes));
+
+ const Index output_total_size = output_sizes.TotalSize();
+ float* output_data = new float[output_total_size];
+
+ UnaryFunctor functor;
+ TensorBlockCwiseUnaryIO::Run(functor, output_sizes, output_strides,
+ output_data, input_strides, input_data);
+ for (int i = 0; i < rnd_dims[0]; ++i) {
+ for (int j = 0; j < rnd_dims[1]; ++j) {
+ for (int k = 0; k < rnd_dims[2]; ++k) {
+ for (int l = 0; l < rnd_dims[3]; ++l) {
+ for (int m = 0; m < rnd_dims[4]; ++m) {
+ Index output_index = i * output_strides[0] + j * output_strides[1] +
+ k * output_strides[2] + l * output_strides[3] +
+ m * output_strides[4];
+ Index input_index = i * input_strides[0] + j * input_strides[1] +
+ k * input_strides[2] + l * input_strides[3] +
+ m * input_strides[4];
+ VERIFY_IS_EQUAL(output_data[output_index],
+ functor(input_data[input_index]));
+ }
+ }
+ }
+ }
+ }
+
+ delete[] input_data;
+ delete[] output_data;
+}
+
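
The zero input strides above make the size-1 input dimensions behave like
broadcasts: the same coefficient is re-read for every output coordinate along
those dimensions. A minimal sketch of the idea:

#include <cassert>

int main() {
  // A stride of 0 re-reads the same element: broadcasting a 1x3 row
  // across 2 output rows without materializing it.
  const float input[3] = {1.f, 2.f, 3.f};
  const long in_stride_row = 0, in_stride_col = 1;  // row dim broadcast
  float output[2][3];
  for (long r = 0; r < 2; ++r)
    for (long c = 0; c < 3; ++c)
      output[r][c] = input[r * in_stride_row + c * in_stride_col];
  assert(output[0][1] == 2.f && output[1][1] == 2.f);
}
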
+template <typename T, int NumDims, int Layout>
static void test_block_cwise_binary_io_basic() {
typedef internal::scalar_sum_op<T> BinaryFunctor;
typedef internal::TensorBlockCwiseBinaryIO<BinaryFunctor, Index, T, NumDims,
@@ -986,6 +1094,9 @@ EIGEN_DECLARE_TEST(cxx11_tensor_block_access) {
TEST_LAYOUTS_AND_DIMS(Data, test_block_io_copy_using_reordered_dimensions);
TEST_LAYOUTS(test_block_io_zero_stride);
TEST_LAYOUTS(test_block_io_squeeze_ones);
+ TEST_LAYOUTS_AND_DIMS(float, test_block_cwise_unary_io_basic);
+ TEST_LAYOUTS(test_block_cwise_unary_io_squeeze_ones);
+ TEST_LAYOUTS(test_block_cwise_unary_io_zero_strides);
TEST_LAYOUTS_AND_DIMS(float, test_block_cwise_binary_io_basic);
TEST_LAYOUTS(test_block_cwise_binary_io_squeeze_ones);
TEST_LAYOUTS(test_block_cwise_binary_io_zero_strides);
diff --git a/unsupported/test/cxx11_tensor_executor.cpp b/unsupported/test/cxx11_tensor_executor.cpp
index 274f901ce..1bb99854c 100644
--- a/unsupported/test/cxx11_tensor_executor.cpp
+++ b/unsupported/test/cxx11_tensor_executor.cpp
@@ -18,22 +18,57 @@ using Eigen::RowMajor;
using Eigen::ColMajor;
// A set of tests to verify that different TensorExecutor strategies yield the
-// same results for all the ops, supporting tiled execution.
+// same results for all the ops, supporting tiled evaluation.
+
+template <int NumDims>
+static array<Index, NumDims> RandomDims(int min_dim = 1, int max_dim = 20) {
+ array<Index, NumDims> dims;
+ for (int i = 0; i < NumDims; ++i) {
+ dims[i] = internal::random<int>(min_dim, max_dim);
+ }
+ return dims;
+}
+
+template <typename T, int NumDims, typename Device, bool Vectorizable,
+ bool Tileable, int Layout>
+static void test_execute_unary_expr(Device d) {
+ static constexpr int Options = 0 | Layout;
-template <typename Device, bool Vectorizable, bool Tileable, int Layout>
-static void test_execute_binary_expr(Device d) {
// Pick a large enough tensor size to bypass small tensor block evaluation
// optimization.
- int d0 = internal::random<int>(100, 200);
- int d1 = internal::random<int>(100, 200);
- int d2 = internal::random<int>(100, 200);
+ auto dims = RandomDims<NumDims>(50 / NumDims, 100 / NumDims);
+
+ Tensor<T, NumDims, Options, Index> src(dims);
+ Tensor<T, NumDims, Options, Index> dst(dims);
- static constexpr int Options = 0;
- using IndexType = int;
+ src.setRandom();
+ const auto expr = src.square();
- Tensor<float, 3, Options, IndexType> lhs(d0, d1, d2);
- Tensor<float, 3, Options, IndexType> rhs(d0, d1, d2);
- Tensor<float, 3, Options, IndexType> dst(d0, d1, d2);
+ using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
+ using Executor =
+ internal::TensorExecutor<const Assign, Device, Vectorizable, Tileable>;
+
+ Executor::run(Assign(dst, expr), d);
+
+ for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
+ T square = src.coeff(i) * src.coeff(i);
+ VERIFY_IS_EQUAL(square, dst.coeff(i));
+ }
+}
+
+template <typename T, int NumDims, typename Device, bool Vectorizable,
+ bool Tileable, int Layout>
+static void test_execute_binary_expr(Device d)
+{
+ static constexpr int Options = 0 | Layout;
+
+ // Pick a large enough tensor size to bypass small tensor block evaluation
+ // optimization.
+ auto dims = RandomDims<NumDims>(50 / NumDims, 100 / NumDims);
+
+ Tensor<T, NumDims, Options, Index> lhs(dims);
+ Tensor<T, NumDims, Options, Index> rhs(dims);
+ Tensor<T, NumDims, Options, Index> dst(dims);
lhs.setRandom();
rhs.setRandom();
@@ -46,33 +81,389 @@ static void test_execute_binary_expr(Device d) {
Executor::run(Assign(dst, expr), d);
- for (int i = 0; i < d0; ++i) {
- for (int j = 0; j < d1; ++j) {
- for (int k = 0; k < d2; ++k) {
- float sum = lhs(i, j, k) + rhs(i, j, k);
- VERIFY_IS_EQUAL(sum, dst(i, j, k));
- }
- }
+ for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
+ T sum = lhs.coeff(i) + rhs.coeff(i);
+ VERIFY_IS_EQUAL(sum, dst.coeff(i));
+ }
+}
+
+template <typename T, int NumDims, typename Device, bool Vectorizable,
+ bool Tileable, int Layout>
+static void test_execute_broadcasting(Device d)
+{
+ static constexpr int Options = 0 | Layout;
+
+ auto dims = RandomDims<NumDims>(1, 10);
+ Tensor<T, NumDims, Options, Index> src(dims);
+ src.setRandom();
+
+ const auto broadcasts = RandomDims<NumDims>(1, 7);
+ const auto expr = src.broadcast(broadcasts);
+
+ // We assume that broadcasting on a default device is tested and correct, so
+ // we can rely on it to verify correctness of tensor executor and tiling.
+ Tensor<T, NumDims, Options, Index> golden;
+ golden = expr;
+
+  // Now do the broadcasting using the configured tensor executor.
+ Tensor<T, NumDims, Options, Index> dst(golden.dimensions());
+
+ using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
+ using Executor =
+ internal::TensorExecutor<const Assign, Device, Vectorizable, Tileable>;
+
+ Executor::run(Assign(dst, expr), d);
+
+ for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
+ VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
+ }
+}
+
+template <typename T, int NumDims, typename Device, bool Vectorizable,
+ bool Tileable, int Layout>
+static void test_execute_chipping_rvalue(Device d) {
+ auto dims = RandomDims<NumDims>(1, 10);
+ Tensor<T, NumDims, Layout, Index> src(dims);
+ src.setRandom();
+
+#define TEST_CHIPPING(CHIP_DIM) \
+ if (NumDims > (CHIP_DIM)) { \
+ const auto offset = internal::random<Index>(0, dims[(CHIP_DIM)] - 1); \
+ const auto expr = src.template chip<(CHIP_DIM)>(offset); \
+ \
+ Tensor<T, NumDims - 1, Layout, Index> golden; \
+ golden = expr; \
+ \
+ Tensor<T, NumDims - 1, Layout, Index> dst(golden.dimensions()); \
+ \
+ using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>; \
+ using Executor = internal::TensorExecutor<const Assign, Device, \
+ Vectorizable, Tileable>; \
+ \
+ Executor::run(Assign(dst, expr), d); \
+ \
+ for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { \
+ VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i)); \
+ } \
+ }
+
+ TEST_CHIPPING(0)
+ TEST_CHIPPING(1)
+ TEST_CHIPPING(2)
+ TEST_CHIPPING(3)
+ TEST_CHIPPING(4)
+ TEST_CHIPPING(5)
+
+#undef TEST_CHIPPING
+}
+
+template <typename T, int NumDims, typename Device, bool Vectorizable,
+ bool Tileable, int Layout>
+static void test_execute_chipping_lvalue(Device d) {
+ auto dims = RandomDims<NumDims>(1, 10);
+
+#define TEST_CHIPPING(CHIP_DIM) \
+ if (NumDims > (CHIP_DIM)) { \
+ /* Generate random data that we'll assign to the chipped tensor dim. */ \
+ array<Index, NumDims - 1> src_dims; \
+ for (int i = 0; i < NumDims - 1; ++i) { \
+ int dim = i < (CHIP_DIM) ? i : i + 1; \
+ src_dims[i] = dims[dim]; \
+ } \
+ \
+ Tensor<T, NumDims - 1, Layout, Index> src(src_dims); \
+ src.setRandom(); \
+ \
+ const auto offset = internal::random<Index>(0, dims[(CHIP_DIM)] - 1); \
+ \
+    /* Generate random data to fill non-chipped dimensions. */ \
+ Tensor<T, NumDims, Layout, Index> random(dims); \
+ random.setRandom(); \
+ \
+ Tensor<T, NumDims, Layout, Index> golden(dims); \
+ golden = random; \
+ golden.template chip<(CHIP_DIM)>(offset) = src; \
+ \
+ Tensor<T, NumDims, Layout, Index> dst(dims); \
+ dst = random; \
+ auto expr = dst.template chip<(CHIP_DIM)>(offset); \
+ \
+ using Assign = TensorAssignOp<decltype(expr), const decltype(src)>; \
+ using Executor = internal::TensorExecutor<const Assign, Device, \
+ Vectorizable, Tileable>; \
+ \
+ Executor::run(Assign(expr, src), d); \
+ \
+ for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { \
+ VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i)); \
+ } \
+ }
+
+ TEST_CHIPPING(0)
+ TEST_CHIPPING(1)
+ TEST_CHIPPING(2)
+ TEST_CHIPPING(3)
+ TEST_CHIPPING(4)
+ TEST_CHIPPING(5)
+
+#undef TEST_CHIPPING
+}
+
+template <typename T, int NumDims, typename Device, bool Vectorizable,
+ bool Tileable, int Layout>
+static void test_execute_shuffle_rvalue(Device d) {
+ static constexpr int Options = 0 | Layout;
+
+ auto dims = RandomDims<NumDims>(1, 10);
+ Tensor<T, NumDims, Options, Index> src(dims);
+ src.setRandom();
+
+ // Create a random dimension re-ordering/shuffle.
+ std::vector<Index> shuffle;
+ for (int i = 0; i < NumDims; ++i) shuffle.push_back(i);
+ std::shuffle(shuffle.begin(), shuffle.end(), std::mt19937());
+
+ const auto expr = src.shuffle(shuffle);
+
+ // We assume that shuffling on a default device is tested and correct, so
+ // we can rely on it to verify correctness of tensor executor and tiling.
+ Tensor<T, NumDims, Options, Index> golden;
+ golden = expr;
+
+  // Now do the shuffling using the configured tensor executor.
+ Tensor<T, NumDims, Options, Index> dst(golden.dimensions());
+
+ using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
+ using Executor =
+ internal::TensorExecutor<const Assign, Device, Vectorizable, Tileable>;
+
+ Executor::run(Assign(dst, expr), d);
+
+ for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
+ VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
+ }
+}
+
+template <typename T, int NumDims, typename Device, bool Vectorizable,
+ bool Tileable, int Layout>
+static void test_execute_shuffle_lvalue(Device d) {
+ static constexpr int Options = 0 | Layout;
+
+ auto dims = RandomDims<NumDims>(5, 10);
+ Tensor<T, NumDims, Options, Index> src(dims);
+ src.setRandom();
+
+ // Create a random dimension re-ordering/shuffle.
+ std::vector<Index> shuffle;
+ for (int i = 0; i < NumDims; ++i) shuffle.push_back(i);
+ std::shuffle(shuffle.begin(), shuffle.end(), std::mt19937());
+
+ array<Index, NumDims> shuffled_dims;
+ for (int i = 0; i < NumDims; ++i) shuffled_dims[shuffle[i]] = dims[i];
+
+ // We assume that shuffling on a default device is tested and correct, so
+ // we can rely on it to verify correctness of tensor executor and tiling.
+ Tensor<T, NumDims, Options, Index> golden(shuffled_dims);
+ golden.shuffle(shuffle) = src;
+
+  // Now do the shuffling using the configured tensor executor.
+ Tensor<T, NumDims, Options, Index> dst(shuffled_dims);
+
+ auto expr = dst.shuffle(shuffle);
+
+ using Assign = TensorAssignOp<decltype(expr), const decltype(src)>;
+ using Executor =
+ internal::TensorExecutor<const Assign, Device, Vectorizable, Tileable>;
+
+ Executor::run(Assign(expr, src), d);
+
+ for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
+ VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
+ }
+}
+
+template <typename T, int NumDims, typename Device, bool Vectorizable,
+ bool Tileable, int Layout>
+static void test_execute_reduction(Device d)
+{
+  static_assert(NumDims >= 2, "NumDims must be greater than or equal to 2");
+
+ static constexpr int ReducedDims = NumDims - 2;
+ static constexpr int Options = 0 | Layout;
+
+ auto dims = RandomDims<NumDims>(5, 10);
+ Tensor<T, NumDims, Options, Index> src(dims);
+ src.setRandom();
+
+ // Pick two random and unique reduction dimensions.
+ int reduction0 = internal::random<int>(0, NumDims - 1);
+ int reduction1 = internal::random<int>(0, NumDims - 1);
+ while (reduction0 == reduction1) {
+ reduction1 = internal::random<int>(0, NumDims - 1);
+ }
+
+ DSizes<Index, 2> reduction_axis;
+ reduction_axis[0] = reduction0;
+ reduction_axis[1] = reduction1;
+
+ Tensor<T, ReducedDims, Options, Index> golden = src.sum(reduction_axis);
+
+  // Now do the reduction using the configured tensor executor.
+ Tensor<T, ReducedDims, Options, Index> dst(golden.dimensions());
+
+ auto expr = src.sum(reduction_axis);
+
+ using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
+ using Executor =
+ internal::TensorExecutor<const Assign, Device, Vectorizable, Tileable>;
+
+ Executor::run(Assign(dst, expr), d);
+
+ for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
+ VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
+ }
+}
+
+template <typename T, int NumDims, typename Device, bool Vectorizable,
+ bool Tileable, int Layout>
+static void test_execute_reshape(Device d)
+{
+  static_assert(NumDims >= 2, "NumDims must be greater than or equal to 2");
+
+ static constexpr int ReshapedDims = NumDims - 1;
+ static constexpr int Options = 0 | Layout;
+
+ auto dims = RandomDims<NumDims>(5, 10);
+ Tensor<T, NumDims, Options, Index> src(dims);
+ src.setRandom();
+
+  // Merge the 0th and 1st dimensions, then shuffle the reshaped dimensions.
+ std::vector<Index> shuffle;
+ for (int i = 0; i < ReshapedDims; ++i) shuffle.push_back(i);
+ std::shuffle(shuffle.begin(), shuffle.end(), std::mt19937());
+
+ DSizes<Index, ReshapedDims> reshaped_dims;
+ reshaped_dims[shuffle[0]] = dims[0] * dims[1];
+ for (int i = 1; i < ReshapedDims; ++i) reshaped_dims[shuffle[i]] = dims[i + 1];
+
+ Tensor<T, ReshapedDims, Options, Index> golden = src.reshape(reshaped_dims);
+
+  // Now reshape using the configured tensor executor.
+ Tensor<T, ReshapedDims, Options, Index> dst(golden.dimensions());
+
+ auto expr = src.reshape(reshaped_dims);
+
+ using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
+ using Executor =
+ internal::TensorExecutor<const Assign, Device, Vectorizable, Tileable>;
+
+ Executor::run(Assign(dst, expr), d);
+
+ for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
+ VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
}
}
-#define CALL_SUBTEST_COMBINATIONS(NAME) \
- CALL_SUBTEST((NAME<DefaultDevice, false, false, ColMajor>(default_device))); \
- CALL_SUBTEST((NAME<DefaultDevice, false, true, ColMajor>(default_device))); \
- CALL_SUBTEST((NAME<DefaultDevice, true, false, ColMajor>(default_device))); \
- CALL_SUBTEST((NAME<DefaultDevice, true, true, ColMajor>(default_device))); \
- CALL_SUBTEST((NAME<DefaultDevice, false, false, RowMajor>(default_device))); \
- CALL_SUBTEST((NAME<DefaultDevice, false, true, RowMajor>(default_device))); \
- CALL_SUBTEST((NAME<DefaultDevice, true, false, RowMajor>(default_device))); \
- CALL_SUBTEST((NAME<DefaultDevice, true, true, RowMajor>(default_device))); \
- CALL_SUBTEST((NAME<ThreadPoolDevice, false, false, ColMajor>(tp_device))); \
- CALL_SUBTEST((NAME<ThreadPoolDevice, false, true, ColMajor>(tp_device))); \
- CALL_SUBTEST((NAME<ThreadPoolDevice, true, false, ColMajor>(tp_device))); \
- CALL_SUBTEST((NAME<ThreadPoolDevice, true, true, ColMajor>(tp_device))); \
- CALL_SUBTEST((NAME<ThreadPoolDevice, false, false, RowMajor>(tp_device))); \
- CALL_SUBTEST((NAME<ThreadPoolDevice, false, true, RowMajor>(tp_device))); \
- CALL_SUBTEST((NAME<ThreadPoolDevice, true, false, RowMajor>(tp_device))); \
- CALL_SUBTEST((NAME<ThreadPoolDevice, true, true, RowMajor>(tp_device)))
+template <typename T, int NumDims, typename Device, bool Vectorizable,
+ bool Tileable, int Layout>
+static void test_execute_slice_rvalue(Device d)
+{
+  static_assert(NumDims >= 2, "NumDims must be greater than or equal to 2");
+ static constexpr int Options = 0 | Layout;
+
+ auto dims = RandomDims<NumDims>(5, 10);
+ Tensor<T, NumDims, Options, Index> src(dims);
+ src.setRandom();
+
+ // Pick a random slice of src tensor.
+ auto slice_start = DSizes<Index, NumDims>(RandomDims<NumDims>());
+ auto slice_size = DSizes<Index, NumDims>(RandomDims<NumDims>());
+
+ // Make sure that slice start + size do not overflow tensor dims.
+ for (int i = 0; i < NumDims; ++i) {
+ slice_start[i] = numext::mini(dims[i] - 1, slice_start[i]);
+ slice_size[i] = numext::mini(slice_size[i], dims[i] - slice_start[i]);
+ }
+
+ Tensor<T, NumDims, Options, Index> golden =
+ src.slice(slice_start, slice_size);
+
+  // Now take the slice using the configured tensor executor.
+ Tensor<T, NumDims, Options, Index> dst(golden.dimensions());
+
+ auto expr = src.slice(slice_start, slice_size);
+
+ using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
+ using Executor =
+ internal::TensorExecutor<const Assign, Device, Vectorizable, Tileable>;
+
+ Executor::run(Assign(dst, expr), d);
+
+ for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
+ VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
+ }
+}
+
+template <typename T, int NumDims, typename Device, bool Vectorizable,
+ bool Tileable, int Layout>
+static void test_execute_slice_lvalue(Device d)
+{
+  static_assert(NumDims >= 2, "NumDims must be greater than or equal to 2");
+ static constexpr int Options = 0 | Layout;
+
+ auto dims = RandomDims<NumDims>(5, 10);
+ Tensor<T, NumDims, Options, Index> src(dims);
+ src.setRandom();
+
+ // Pick a random slice of src tensor.
+ auto slice_start = DSizes<Index, NumDims>(RandomDims<NumDims>(1, 10));
+ auto slice_size = DSizes<Index, NumDims>(RandomDims<NumDims>(1, 10));
+
+ // Make sure that slice start + size do not overflow tensor dims.
+ for (int i = 0; i < NumDims; ++i) {
+ slice_start[i] = numext::mini(dims[i] - 1, slice_start[i]);
+ slice_size[i] = numext::mini(slice_size[i], dims[i] - slice_start[i]);
+ }
+
+ Tensor<T, NumDims, Options, Index> slice(slice_size);
+ slice.setRandom();
+
+  // Assign the slice using the default executor.
+ Tensor<T, NumDims, Options, Index> golden = src;
+ golden.slice(slice_start, slice_size) = slice;
+
+  // And then using the configured execution strategy.
+ Tensor<T, NumDims, Options, Index> dst = src;
+ auto expr = dst.slice(slice_start, slice_size);
+
+ using Assign = TensorAssignOp<decltype(expr), const decltype(slice)>;
+ using Executor =
+ internal::TensorExecutor<const Assign, Device, Vectorizable, Tileable>;
+
+ Executor::run(Assign(expr, slice), d);
+
+ for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
+ VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
+ }
+}
+
+#define CALL_SUBTEST_COMBINATIONS(NAME, T, NUM_DIMS) \
+ CALL_SUBTEST((NAME<T, NUM_DIMS, DefaultDevice, false, false, ColMajor>(default_device))); \
+ CALL_SUBTEST((NAME<T, NUM_DIMS, DefaultDevice, false, true, ColMajor>(default_device))); \
+ CALL_SUBTEST((NAME<T, NUM_DIMS, DefaultDevice, true, false, ColMajor>(default_device))); \
+ CALL_SUBTEST((NAME<T, NUM_DIMS, DefaultDevice, true, true, ColMajor>(default_device))); \
+ CALL_SUBTEST((NAME<T, NUM_DIMS, DefaultDevice, false, false, RowMajor>(default_device))); \
+ CALL_SUBTEST((NAME<T, NUM_DIMS, DefaultDevice, false, true, RowMajor>(default_device))); \
+ CALL_SUBTEST((NAME<T, NUM_DIMS, DefaultDevice, true, false, RowMajor>(default_device))); \
+ CALL_SUBTEST((NAME<T, NUM_DIMS, DefaultDevice, true, true, RowMajor>(default_device))); \
+ CALL_SUBTEST((NAME<T, NUM_DIMS, ThreadPoolDevice, false, false, ColMajor>(tp_device))); \
+ CALL_SUBTEST((NAME<T, NUM_DIMS, ThreadPoolDevice, false, true, ColMajor>(tp_device))); \
+ CALL_SUBTEST((NAME<T, NUM_DIMS, ThreadPoolDevice, true, false, ColMajor>(tp_device))); \
+ CALL_SUBTEST((NAME<T, NUM_DIMS, ThreadPoolDevice, true, true, ColMajor>(tp_device))); \
+ CALL_SUBTEST((NAME<T, NUM_DIMS, ThreadPoolDevice, false, false, RowMajor>(tp_device))); \
+ CALL_SUBTEST((NAME<T, NUM_DIMS, ThreadPoolDevice, false, true, RowMajor>(tp_device))); \
+ CALL_SUBTEST((NAME<T, NUM_DIMS, ThreadPoolDevice, true, false, RowMajor>(tp_device))); \
+ CALL_SUBTEST((NAME<T, NUM_DIMS, ThreadPoolDevice, true, true, RowMajor>(tp_device)))
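
Each CALL_SUBTEST_COMBINATIONS invocation below expands into sixteen subtests,
one per (device, Vectorizable, Tileable, layout) combination. For illustration,
the last expansion of CALL_SUBTEST_COMBINATIONS(test_execute_binary_expr, float, 3) is:

CALL_SUBTEST((test_execute_binary_expr<float, 3, ThreadPoolDevice,
                                       true, true, RowMajor>(tp_device)));
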
EIGEN_DECLARE_TEST(cxx11_tensor_executor) {
Eigen::DefaultDevice default_device;
@@ -81,7 +472,53 @@ EIGEN_DECLARE_TEST(cxx11_tensor_executor) {
Eigen::ThreadPool tp(num_threads);
Eigen::ThreadPoolDevice tp_device(&tp, num_threads);
- CALL_SUBTEST_COMBINATIONS(test_execute_binary_expr);
+ CALL_SUBTEST_COMBINATIONS(test_execute_unary_expr, float, 3);
+ CALL_SUBTEST_COMBINATIONS(test_execute_unary_expr, float, 4);
+ CALL_SUBTEST_COMBINATIONS(test_execute_unary_expr, float, 5);
+
+ CALL_SUBTEST_COMBINATIONS(test_execute_binary_expr, float, 3);
+ CALL_SUBTEST_COMBINATIONS(test_execute_binary_expr, float, 4);
+ CALL_SUBTEST_COMBINATIONS(test_execute_binary_expr, float, 5);
+
+ CALL_SUBTEST_COMBINATIONS(test_execute_broadcasting, float, 3);
+ CALL_SUBTEST_COMBINATIONS(test_execute_broadcasting, float, 4);
+ CALL_SUBTEST_COMBINATIONS(test_execute_broadcasting, float, 5);
+
+ CALL_SUBTEST_COMBINATIONS(test_execute_chipping_rvalue, float, 3);
+ CALL_SUBTEST_COMBINATIONS(test_execute_chipping_rvalue, float, 4);
+ CALL_SUBTEST_COMBINATIONS(test_execute_chipping_rvalue, float, 5);
+
+ CALL_SUBTEST_COMBINATIONS(test_execute_chipping_lvalue, float, 3);
+ CALL_SUBTEST_COMBINATIONS(test_execute_chipping_lvalue, float, 4);
+ CALL_SUBTEST_COMBINATIONS(test_execute_chipping_lvalue, float, 5);
+
+ CALL_SUBTEST_COMBINATIONS(test_execute_shuffle_rvalue, float, 3);
+ CALL_SUBTEST_COMBINATIONS(test_execute_shuffle_rvalue, float, 4);
+ CALL_SUBTEST_COMBINATIONS(test_execute_shuffle_rvalue, float, 5);
+
+ CALL_SUBTEST_COMBINATIONS(test_execute_shuffle_lvalue, float, 3);
+ CALL_SUBTEST_COMBINATIONS(test_execute_shuffle_lvalue, float, 4);
+ CALL_SUBTEST_COMBINATIONS(test_execute_shuffle_lvalue, float, 5);
+
+ CALL_SUBTEST_COMBINATIONS(test_execute_reduction, float, 2);
+ CALL_SUBTEST_COMBINATIONS(test_execute_reduction, float, 3);
+ CALL_SUBTEST_COMBINATIONS(test_execute_reduction, float, 4);
+ CALL_SUBTEST_COMBINATIONS(test_execute_reduction, float, 5);
+
+ CALL_SUBTEST_COMBINATIONS(test_execute_reshape, float, 2);
+ CALL_SUBTEST_COMBINATIONS(test_execute_reshape, float, 3);
+ CALL_SUBTEST_COMBINATIONS(test_execute_reshape, float, 4);
+ CALL_SUBTEST_COMBINATIONS(test_execute_reshape, float, 5);
+
+ CALL_SUBTEST_COMBINATIONS(test_execute_slice_rvalue, float, 2);
+ CALL_SUBTEST_COMBINATIONS(test_execute_slice_rvalue, float, 3);
+ CALL_SUBTEST_COMBINATIONS(test_execute_slice_rvalue, float, 4);
+ CALL_SUBTEST_COMBINATIONS(test_execute_slice_rvalue, float, 5);
+
+ CALL_SUBTEST_COMBINATIONS(test_execute_slice_lvalue, float, 2);
+ CALL_SUBTEST_COMBINATIONS(test_execute_slice_lvalue, float, 3);
+ CALL_SUBTEST_COMBINATIONS(test_execute_slice_lvalue, float, 4);
+ CALL_SUBTEST_COMBINATIONS(test_execute_slice_lvalue, float, 5);
}
#undef CALL_SUBTEST_COMBINATIONS
diff --git a/unsupported/test/cxx11_tensor_shuffling.cpp b/unsupported/test/cxx11_tensor_shuffling.cpp
index ab19b6e6b..062dd1c0f 100644
--- a/unsupported/test/cxx11_tensor_shuffling.cpp
+++ b/unsupported/test/cxx11_tensor_shuffling.cpp
@@ -81,12 +81,12 @@ static void test_expr_shuffling()
Tensor<float, 4, DataLayout> expected;
expected = tensor.shuffle(shuffles);
- Tensor<float, 4, DataLayout> result(5,7,3,2);
+ Tensor<float, 4, DataLayout> result(5, 7, 3, 2);
- array<int, 4> src_slice_dim{{2,3,1,7}};
- array<int, 4> src_slice_start{{0,0,0,0}};
- array<int, 4> dst_slice_dim{{1,7,3,2}};
- array<int, 4> dst_slice_start{{0,0,0,0}};
+ array<ptrdiff_t, 4> src_slice_dim({2, 3, 1, 7});
+ array<ptrdiff_t, 4> src_slice_start({0, 0, 0, 0});
+ array<ptrdiff_t, 4> dst_slice_dim({1, 7, 3, 2});
+ array<ptrdiff_t, 4> dst_slice_start({0, 0, 0, 0});
for (int i = 0; i < 5; ++i) {
result.slice(dst_slice_start, dst_slice_dim) =