author Mehdi Goli <mehdi.goli@codeplay.com> 2019-06-28 10:08:23 +0100
committer Mehdi Goli <mehdi.goli@codeplay.com> 2019-06-28 10:08:23 +0100
commit 7d08fa805a38f9ebb9e0e487c4e2d23d32a0fcde (patch)
tree fbff4d80b6b373dcd53632de4c1fab5c393bdd64
parent 16a56b2dddbfaf2d4b81d62be5e3139f12783ac8 (diff)
[SYCL] This PR adds the minimum modifications to the Eigen unsupported module required to run it on devices supporting SYCL.

* Abstracting the pointer type so that both SYCL memory and raw pointers can be captured.
* Converting the SYCL virtual pointer to SYCL device memory in the Eigen evaluator class.
* Binding the SYCL placeholder accessor to the command group handler via a bind method on each Eigen evaluator node.
* Adding a SYCL macro for controlling loop unrolling.
* Modifying TensorDeviceSycl.h and the SYCL executor method to adopt the above changes.
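For readers skimming the diff below, here is a minimal sketch of the placeholder-accessor bind pattern this patch threads through every evaluator node. This is illustrative code, not Eigen's: it assumes a SYCL 1.2.1 implementation (e.g. ComputeCpp), and `MyEvaluator` and the `fill` kernel name are hypothetical.

```cpp
#include <CL/sycl.hpp>
#include <cstddef>

// A placeholder accessor can be stored in an object outside any command
// group and bound to a handler later -- this is what lets an evaluator
// capture "device memory" as its pointer type.
using PlaceholderAcc =
    cl::sycl::accessor<float, 1, cl::sycl::access::mode::read_write,
                       cl::sycl::access::target::global_buffer,
                       cl::sycl::access::placeholder::true_t>;

struct MyEvaluator {
  PlaceholderAcc m_data;  // on SYCL the EvaluatorPointerType is an accessor

  // Analogous to the bind() method this patch adds to each evaluator node:
  // register the placeholder accessor with the command group handler.
  void bind(cl::sycl::handler &cgh) const { cgh.require(m_data); }

  float coeff(std::size_t i) const { return m_data[i]; }
};

int main() {
  cl::sycl::queue q{cl::sycl::default_selector{}};
  cl::sycl::buffer<float, 1> buf{cl::sycl::range<1>(16)};

  MyEvaluator eval{PlaceholderAcc{buf}};
  q.submit([&](cl::sycl::handler &cgh) {
    eval.bind(cgh);  // the executor does this for the whole expression tree
    cgh.parallel_for<class fill>(
        cl::sycl::range<1>(16),
        [=](cl::sycl::id<1> i) { eval.m_data[i] = 1.0f; });
  });
  q.wait_and_throw();
}
```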
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h | 49
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h | 19
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h | 2
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h | 33
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h | 38
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h | 25
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h | 15
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h | 49
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h | 32
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h | 8
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h | 54
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h | 8
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h | 4
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h | 1155
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h | 5
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h | 34
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h | 228
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h | 133
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h | 19
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h | 23
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h | 37
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h | 58
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h | 2
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h | 22
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h | 45
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h | 1
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h | 15
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h | 16
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h | 17
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h | 35
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorMap.h | 8
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h | 108
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h | 94
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h | 26
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h | 23
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h | 74
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h | 153
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorRef.h | 11
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h | 23
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorScan.h | 31
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h | 27
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h | 36
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h | 13
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h | 42
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h | 51
-rw-r--r-- unsupported/test/cxx11_tensor_executor.cpp | 22
-rw-r--r-- unsupported/test/cxx11_tensor_morphing.cpp | 3
47 files changed, 1979 insertions, 947 deletions
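One of the smaller pieces of this patch is the `EIGEN_UNROLL_LOOP` annotation that appears throughout the hunks below; its definition lands in TensorMacros.h, listed above. As a rough sketch of the technique (hedged: `MY_UNROLL_LOOP` is a hypothetical name and Eigen's actual definition may differ), such a macro maps to the compiler's unroll pragma where one exists:

```cpp
// Sketch of a loop-unrolling hint macro in the spirit of EIGEN_UNROLL_LOOP.
#if defined(__clang__)
#define MY_UNROLL_LOOP _Pragma("unroll")
#elif defined(__GNUC__) && (__GNUC__ >= 8)
#define MY_UNROLL_LOOP _Pragma("GCC unroll 8")
#else
#define MY_UNROLL_LOOP  // no-op on compilers without an unroll pragma
#endif

void scale(float* v, int n) {
  MY_UNROLL_LOOP
  for (int i = 0; i < n; ++i) {
    v[i] *= 2.0f;
  }
}
```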
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h
index 6f7c6d86d..05e7963f0 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h
@@ -37,7 +37,7 @@ struct traits<TensorIndexTupleOp<XprType> > : public traits<XprType>
template<typename XprType>
struct eval<TensorIndexTupleOp<XprType>, Eigen::Dense>
{
- typedef const TensorIndexTupleOp<XprType>& type;
+ typedef const TensorIndexTupleOp<XprType> EIGEN_DEVICE_REF type;
};
template<typename XprType>
@@ -82,6 +82,8 @@ struct TensorEvaluator<const TensorIndexTupleOp<ArgType>, Device>
typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
static const int NumDims = internal::array_size<Dimensions>::value;
+ typedef StorageMemory<CoeffReturnType, Device> Storage;
+ typedef typename Storage::Type EvaluatorPointerType;
enum {
IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/ false,
@@ -100,7 +102,7 @@ struct TensorEvaluator<const TensorIndexTupleOp<ArgType>, Device>
return m_impl.dimensions();
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
m_impl.evalSubExprsIfNeeded(NULL);
return true;
}
@@ -118,11 +120,11 @@ struct TensorEvaluator<const TensorIndexTupleOp<ArgType>, Device>
return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, 1);
}
- EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+ EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
#ifdef EIGEN_USE_SYCL
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const {
- return m_impl;
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+ m_impl.bind(cgh);
}
#endif
@@ -154,7 +156,7 @@ struct traits<TensorTupleReducerOp<ReduceOp, Dims, XprType> > : public traits<Xp
template<typename ReduceOp, typename Dims, typename XprType>
struct eval<TensorTupleReducerOp<ReduceOp, Dims, XprType>, Eigen::Dense>
{
- typedef const TensorTupleReducerOp<ReduceOp, Dims, XprType>& type;
+ typedef const TensorTupleReducerOp<ReduceOp, Dims, XprType> EIGEN_DEVICE_REF type;
};
template<typename ReduceOp, typename Dims, typename XprType>
@@ -216,6 +218,9 @@ struct TensorEvaluator<const TensorTupleReducerOp<ReduceOp, Dims, ArgType>, Devi
typedef typename TensorEvaluator<const TensorIndexTupleOp<ArgType> , Device>::Dimensions InputDimensions;
static const int NumDims = internal::array_size<InputDimensions>::value;
typedef array<Index, NumDims> StrideDims;
+ typedef StorageMemory<CoeffReturnType, Device> Storage;
+ typedef typename Storage::Type EvaluatorPointerType;
+ typedef StorageMemory<TupleType, Device> TupleStorageMem;
enum {
IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/ false,
@@ -231,9 +236,6 @@ struct TensorEvaluator<const TensorTupleReducerOp<ReduceOp, Dims, ArgType>, Devi
: m_orig_impl(op.expression(), device),
m_impl(op.expression().index_tuples().reduce(op.reduce_dims(), op.reduce_op()), device),
m_return_dim(op.return_dim())
-#ifdef EIGEN_USE_SYCL
- ,m_device(device)
-#endif
{
gen_strides(m_orig_impl.dimensions(), m_strides);
if (Layout == static_cast<int>(ColMajor)) {
@@ -242,15 +244,18 @@ struct TensorEvaluator<const TensorTupleReducerOp<ReduceOp, Dims, ArgType>, Devi
} else {
const Index total_size = internal::array_prod(m_orig_impl.dimensions());
m_stride_mod = (m_return_dim > 0) ? m_strides[m_return_dim - 1] : total_size;
- }
- m_stride_div = (m_return_dim >= 0) ? m_strides[m_return_dim] : 1;
+ }
+ // If m_return_dim is not a valid index, fall back to 1; otherwise this can crash on Windows.
+ m_stride_div = ((m_return_dim >= 0) &&
+ (m_return_dim < static_cast<Index>(m_strides.size())))
+ ? m_strides[m_return_dim] : 1;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const {
return m_impl.dimensions();
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
m_impl.evalSubExprsIfNeeded(NULL);
return true;
}
@@ -263,16 +268,13 @@ struct TensorEvaluator<const TensorTupleReducerOp<ReduceOp, Dims, ArgType>, Devi
return (m_return_dim < 0) ? v.first : (v.first % m_stride_mod) / m_stride_div;
}
- #ifndef EIGEN_USE_SYCL
- EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
- #else // following functions are required by sycl
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TupleType* data() const { return m_impl.data(); }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index return_dim() const {return m_return_dim;}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const StrideDims& strides() const {return m_strides;}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& stride_mod() const {return m_stride_mod;}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& stride_div() const {return m_stride_div;}
- const Device& device() const{return m_device;}
- #endif
+ EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
+#ifdef EIGEN_USE_SYCL
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+ m_impl.bind(cgh);
+ m_orig_impl.bind(cgh);
+ }
+#endif
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
costPerCoeff(bool vectorized) const {
@@ -312,9 +314,6 @@ struct TensorEvaluator<const TensorTupleReducerOp<ReduceOp, Dims, ArgType>, Devi
StrideDims m_strides;
Index m_stride_mod;
Index m_stride_div;
-#ifdef EIGEN_USE_SYCL
- const Device& m_device;
-#endif
};
} // end namespace Eigen
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h
index 06bf422c5..d6e51bc6c 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h
@@ -97,6 +97,8 @@ struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device>
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
typedef typename TensorEvaluator<RightArgType, Device>::Dimensions Dimensions;
+ typedef StorageMemory<CoeffReturnType, Device> Storage;
+ typedef typename Storage::Type EvaluatorPointerType;
static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
static const int NumDims = XprType::NumDims;
@@ -136,7 +138,7 @@ struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device>
return m_rightImpl.dimensions();
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
eigen_assert(dimensions_match(m_leftImpl.dimensions(), m_rightImpl.dimensions()));
m_leftImpl.evalSubExprsIfNeeded(NULL);
// If the lhs provides raw access to its storage area (i.e. if m_leftImpl.data() returns a non
@@ -154,6 +156,7 @@ struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device>
m_leftImpl.coeffRef(i) = m_rightImpl.coeff(i);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) {
+
const int LhsStoreMode = TensorEvaluator<LeftArgType, Device>::IsAligned ? Aligned : Unaligned;
const int RhsLoadMode = TensorEvaluator<RightArgType, Device>::IsAligned ? Aligned : Unaligned;
m_leftImpl.template writePacket<LhsStoreMode>(i, m_rightImpl.template packet<RhsLoadMode>(i));
@@ -199,13 +202,15 @@ struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device>
m_leftImpl.writeBlock(*block);
}
}
+#ifdef EIGEN_USE_SYCL
+ // binding placeholder accessors to a command group handler for SYCL
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+ m_leftImpl.bind(cgh);
+ m_rightImpl.bind(cgh);
+ }
+#endif
- /// required by sycl in order to extract the accessor
- const TensorEvaluator<LeftArgType, Device>& left_impl() const { return m_leftImpl; }
- /// required by sycl in order to extract the accessor
- const TensorEvaluator<RightArgType, Device>& right_impl() const { return m_rightImpl; }
-
- EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return m_leftImpl.data(); }
+ EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_leftImpl.data(); }
private:
TensorEvaluator<LeftArgType, Device> m_leftImpl;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h
index 0db637405..38c06aba2 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h
@@ -841,7 +841,7 @@ struct TensorBlockView {
const Scalar* data() const { return m_data; }
private:
- const Device& m_device;
+ const Device EIGEN_DEVICE_REF m_device;
Dimensions m_block_sizes, m_block_strides;
const Scalar* m_data; // Not owned.
Scalar* m_allocated_data; // Owned.
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
index c102a43fb..10bdbc6a0 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
@@ -37,7 +37,7 @@ struct traits<TensorBroadcastingOp<Broadcast, XprType> > : public traits<XprType
template<typename Broadcast, typename XprType>
struct eval<TensorBroadcastingOp<Broadcast, XprType>, Eigen::Dense>
{
- typedef const TensorBroadcastingOp<Broadcast, XprType>& type;
+ typedef const TensorBroadcastingOp<Broadcast, XprType> EIGEN_DEVICE_REF type;
};
template<typename Broadcast, typename XprType>
@@ -105,7 +105,11 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+ protected: // All non-static fields must have the same access control; otherwise the TensorEvaluator won't be standard layout.
bool isCopy, nByOne, oneByN;
+ public:
+ typedef StorageMemory<CoeffReturnType, Device> Storage;
+ typedef typename Storage::Type EvaluatorPointerType;
enum {
IsAligned = true,
@@ -205,7 +209,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
m_impl.evalSubExprsIfNeeded(NULL);
return true;
}
@@ -238,6 +242,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
// TODO: attempt to speed this up. The integer divisions and modulo are slow
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index indexColMajor(Index index) const {
Index inputIndex = 0;
+ EIGEN_UNROLL_LOOP
for (int i = NumDims - 1; i > 0; --i) {
const Index idx = index / m_outputStrides[i];
if (internal::index_statically_eq<Broadcast>(i, 1)) {
@@ -272,6 +277,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index indexRowMajor(Index index) const {
Index inputIndex = 0;
+ EIGEN_UNROLL_LOOP
for (int i = 0; i < NumDims - 1; ++i) {
const Index idx = index / m_outputStrides[i];
if (internal::index_statically_eq<Broadcast>(i, 1)) {
@@ -376,6 +382,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
values[0] = m_impl.coeff(inputIndex);
return internal::pload1<PacketReturnType>(values);
} else {
+ EIGEN_UNROLL_LOOP
for (int i = 0, cur = 0; i < PacketSize; ++i, ++cur) {
if (outputOffset + cur < m_outputStrides[endDim]) {
values[i] = m_impl.coeff(inputIndex);
@@ -410,6 +417,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
return m_impl.template packet<Unaligned>(inputIndex);
} else {
EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+ EIGEN_UNROLL_LOOP
for (int i = 0; i < PacketSize; ++i) {
if (inputIndex > m_inputStrides[dim]-1) {
inputIndex = 0;
@@ -441,6 +449,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
values[0] = m_impl.coeff(inputIndex);
return internal::pload1<PacketReturnType>(values);
} else {
+ EIGEN_UNROLL_LOOP
for (int i = 0, cur = 0; i < PacketSize; ++i, ++cur) {
if (outputOffset + cur < m_outputStrides[dim]) {
values[i] = m_impl.coeff(inputIndex);
@@ -465,6 +474,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
const Index originalIndex = index;
Index inputIndex = 0;
+ EIGEN_UNROLL_LOOP
for (int i = NumDims - 1; i > 0; --i) {
const Index idx = index / m_outputStrides[i];
if (internal::index_statically_eq<Broadcast>(i, 1)) {
@@ -500,6 +510,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
} else {
EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
values[0] = m_impl.coeff(inputIndex);
+ EIGEN_UNROLL_LOOP
for (int i = 1; i < PacketSize; ++i) {
if (innermostLoc + i < m_impl.dimensions()[0]) {
values[i] = m_impl.coeff(inputIndex+i);
@@ -521,6 +532,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
const Index originalIndex = index;
Index inputIndex = 0;
+ EIGEN_UNROLL_LOOP
for (int i = 0; i < NumDims - 1; ++i) {
const Index idx = index / m_outputStrides[i];
if (internal::index_statically_eq<Broadcast>(i, 1)) {
@@ -556,6 +568,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
} else {
EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
values[0] = m_impl.coeff(inputIndex);
+ EIGEN_UNROLL_LOOP
for (int i = 1; i < PacketSize; ++i) {
if (innermostLoc + i < m_impl.dimensions()[NumDims-1]) {
values[i] = m_impl.coeff(inputIndex+i);
@@ -572,6 +585,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
costPerCoeff(bool vectorized) const {
double compute_cost = TensorOpCost::AddCost<Index>();
if (!isCopy && NumDims > 0) {
+ EIGEN_UNROLL_LOOP
for (int i = NumDims - 1; i > 0; --i) {
compute_cost += TensorOpCost::DivCost<Index>();
if (internal::index_statically_eq<Broadcast>(i, 1)) {
@@ -845,12 +859,17 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
}
}
- EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; }
+ EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
Broadcast functor() const { return m_broadcast; }
-
+ #ifdef EIGEN_USE_SYCL
+ // binding placeholder accessors to a command group handler for SYCL
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+ m_impl.bind(cgh);
+ }
+ #endif
private:
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void BroadcastBlock(
const Dimensions& input_block_sizes,
@@ -874,9 +893,9 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
BroadcastTensorBlockReader::Run(&broadcast_block, input_block.data());
}
- protected:
- const Device& m_device;
- const Broadcast m_broadcast;
+protected:
+ const Device EIGEN_DEVICE_REF m_device;
+ const typename internal::remove_reference<Broadcast>::type m_broadcast;
Dimensions m_dimensions;
array<Index, NumDims> m_outputStrides;
array<Index, NumDims> m_inputStrides;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
index 0b5d4127b..7afaf0f33 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
@@ -38,7 +38,7 @@ struct traits<TensorChippingOp<DimId, XprType> > : public traits<XprType>
template<DenseIndex DimId, typename XprType>
struct eval<TensorChippingOp<DimId, XprType>, Eigen::Dense>
{
- typedef const TensorChippingOp<DimId, XprType>& type;
+ typedef const TensorChippingOp<DimId, XprType> EIGEN_DEVICE_REF type;
};
template<DenseIndex DimId, typename XprType>
@@ -139,7 +139,8 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
-
+ typedef StorageMemory<CoeffReturnType, Device> Storage;
+ typedef typename Storage::Type EvaluatorPointerType;
enum {
// Alignment can't be guaranteed at compile time since it depends on the
@@ -169,7 +170,7 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
OutputTensorBlock;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
- : m_impl(op.expression(), device), m_dim(op.dim()), m_device(device), m_offset(op.offset())
+ : m_impl(op.expression(), device), m_dim(op.dim()), m_device(device)
{
EIGEN_STATIC_ASSERT((NumInputDims >= 1), YOU_MADE_A_PROGRAMMING_MISTAKE);
eigen_assert(NumInputDims > m_dim.actualDim());
@@ -218,7 +219,7 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
m_impl.evalSubExprsIfNeeded(NULL);
return true;
}
@@ -243,6 +244,7 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
eigen_assert(m_stride == 1);
Index inputIndex = index * m_inputStride + m_inputOffset;
EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+ EIGEN_UNROLL_LOOP
for (int i = 0; i < PacketSize; ++i) {
values[i] = m_impl.coeff(inputIndex);
inputIndex += m_inputStride;
@@ -262,6 +264,7 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
} else {
// Cross the stride boundary. Fallback to slow path.
EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+ EIGEN_UNROLL_LOOP
for (int i = 0; i < PacketSize; ++i) {
values[i] = coeff(index);
++index;
@@ -349,26 +352,20 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
m_impl.block(&input_block);
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Eigen::internal::traits<XprType>::PointerType data() const {
- CoeffReturnType* result = const_cast<CoeffReturnType*>(m_impl.data());
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Storage::Type data() const {
+ typename Storage::Type result = constCast(m_impl.data());
if (IsOuterChipping && result) {
return result + m_inputOffset;
} else {
return NULL;
}
}
-
- /// used by sycl
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex dimId() const {
- return m_dim.actualDim();
+#ifdef EIGEN_USE_SYCL
+ // binding placeholder accessors to a command group handler for SYCL
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+ m_impl.bind(cgh);
}
-
- /// used by sycl
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const DenseIndex& offset() const {
- return m_offset;
- }
- /// required by sycl in order to extract the accessor
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
+#endif
protected:
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const
@@ -399,10 +396,7 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
DSizes<Index, NumInputDims> m_inputStrides;
TensorEvaluator<ArgType, Device> m_impl;
const internal::DimensionId<DimId> m_dim;
- const Device& m_device;
-// required by sycl
- const DenseIndex m_offset;
-
+ const Device EIGEN_DEVICE_REF m_device;
};
@@ -466,6 +460,7 @@ struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device>
EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
Index inputIndex = index * this->m_inputStride + this->m_inputOffset;
+ EIGEN_UNROLL_LOOP
for (int i = 0; i < PacketSize; ++i) {
this->m_impl.coeffRef(inputIndex) = values[i];
inputIndex += this->m_inputStride;
@@ -484,6 +479,7 @@ struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device>
// Cross stride boundary. Fallback to slow path.
EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
+ EIGEN_UNROLL_LOOP
for (int i = 0; i < PacketSize; ++i) {
this->coeffRef(index) = values[i];
++index;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h
index 3863ee8c3..292a1bae1 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h
@@ -119,6 +119,8 @@ struct TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgTy
typedef typename XprType::Scalar Scalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ typedef StorageMemory<CoeffReturnType, Device> Storage;
+ typedef typename Storage::Type EvaluatorPointerType;
enum {
IsAligned = false,
PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess & TensorEvaluator<RightArgType, Device>::PacketAccess,
@@ -181,7 +183,7 @@ struct TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgTy
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
// TODO(phli): Add short-circuit memcpy evaluation if underlying data are linear?
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType)
{
m_leftImpl.evalSubExprsIfNeeded(NULL);
m_rightImpl.evalSubExprsIfNeeded(NULL);
@@ -219,11 +221,13 @@ struct TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgTy
Index left_index;
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
left_index = subs[0];
+ EIGEN_UNROLL_LOOP
for (int i = 1; i < NumDims; ++i) {
left_index += (subs[i] % left_dims[i]) * m_leftStrides[i];
}
} else {
left_index = subs[NumDims - 1];
+ EIGEN_UNROLL_LOOP
for (int i = NumDims - 2; i >= 0; --i) {
left_index += (subs[i] % left_dims[i]) * m_leftStrides[i];
}
@@ -235,11 +239,13 @@ struct TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgTy
Index right_index;
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
right_index = subs[0];
+ EIGEN_UNROLL_LOOP
for (int i = 1; i < NumDims; ++i) {
right_index += (subs[i] % right_dims[i]) * m_rightStrides[i];
}
} else {
right_index = subs[NumDims - 1];
+ EIGEN_UNROLL_LOOP
for (int i = NumDims - 2; i >= 0; --i) {
right_index += (subs[i] % right_dims[i]) * m_rightStrides[i];
}
@@ -257,6 +263,7 @@ struct TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgTy
eigen_assert(index + packetSize - 1 < dimensions().TotalSize());
EIGEN_ALIGN_MAX CoeffReturnType values[packetSize];
+ EIGEN_UNROLL_LOOP
for (int i = 0; i < packetSize; ++i) {
values[i] = coeff(index+i);
}
@@ -279,13 +286,15 @@ struct TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgTy
TensorOpCost(0, 0, compute_cost);
}
- EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; }
- /// required by sycl in order to extract the accessor
- const TensorEvaluator<LeftArgType, Device>& left_impl() const { return m_leftImpl; }
- /// required by sycl in order to extract the accessor
- const TensorEvaluator<RightArgType, Device>& right_impl() const { return m_rightImpl; }
- /// required by sycl in order to extract the accessor
- const Axis& axis() const { return m_axis; }
+ EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
+
+ #ifdef EIGEN_USE_SYCL
+ // binding placeholder accessors to a command group handler for SYCL
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+ m_leftImpl.bind(cgh);
+ m_rightImpl.bind(cgh);
+ }
+ #endif
protected:
Dimensions m_dimensions;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
index 4ddcd982e..de7c2248a 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
@@ -433,6 +433,8 @@ struct TensorContractionEvaluatorBase
typedef typename XprType::Index Index;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ typedef StorageMemory<Scalar, Device> Storage;
+ typedef typename Storage::Type EvaluatorPointerType;
enum {
IsAligned = true,
@@ -452,6 +454,9 @@ struct TensorContractionEvaluatorBase
static_cast<int>(Layout) == static_cast<int>(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType;
typedef typename internal::conditional<
static_cast<int>(Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType;
+
+ typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluatorType;
+ typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluatorType;
static const int LDims =
internal::array_size<typename TensorEvaluator<EvalLeftArgType, Device>::Dimensions>::value;
@@ -653,14 +658,14 @@ struct TensorContractionEvaluatorBase
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar * data) {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
m_leftImpl.evalSubExprsIfNeeded(NULL);
m_rightImpl.evalSubExprsIfNeeded(NULL);
if (data) {
evalTo(data);
return false;
} else {
- m_result = static_cast<Scalar *>(m_device.allocate(dimensions().TotalSize() * sizeof(Scalar)));
+ m_result = static_cast<EvaluatorPointerType>(m_device.allocate(dimensions().TotalSize() * sizeof(Scalar)));
evalTo(m_result);
return true;
}
@@ -934,7 +939,7 @@ struct TensorContractionEvaluatorBase
return internal::ploadt<PacketReturnType, LoadMode>(m_result + index);
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Eigen::internal::traits<XprType>::PointerType data() const { return m_result; }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return m_result; }
protected:
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void EnableXSMMIfPossible(const array<IndexPair<Index>, ContractDims>& eval_op_indices) {
@@ -1169,9 +1174,9 @@ protected:
TensorEvaluator<EvalLeftArgType, Device> m_leftImpl;
TensorEvaluator<EvalRightArgType, Device> m_rightImpl;
- const Device& m_device;
+ const Device EIGEN_DEVICE_REF m_device;
OutputKernelType m_output_kernel;
- Scalar* m_result;
+ EvaluatorPointerType m_result;
bool m_can_use_xsmm;
};
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h
index 1be823fd1..50865d404 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h
@@ -59,6 +59,13 @@ struct CoeffLoader {
return m_tensor.template packet<LoadMode>(index);
}
+ #ifdef EIGEN_USE_SYCL
+ // The placeholder accessors require to be bound to a command group handler for SYCL
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+ m_tensor.bind(cgh);
+ }
+ #endif
+
private:
const Tensor m_tensor;
};
@@ -87,6 +94,13 @@ struct CoeffLoader<Tensor, true, MakePointer_> {
{
return internal::ploadt_ro<typename Tensor::PacketReturnType, LoadMode>(m_data + index);
}
+
+ #ifdef EIGEN_USE_SYCL
+ // The placeholder accessors require to be bound to a command group handler for SYCL
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+ m_data.bind(cgh);
+ }
+ #endif
private:
typedef typename Tensor::Scalar Scalar;
@@ -139,6 +153,7 @@ class SimpleTensorContractionMapper {
EIGEN_UNUSED_VARIABLE(left); // annoying bug in g++8.1: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85963
Index nocontract_val = left ? row : col;
Index linidx = 0;
+ EIGEN_UNROLL_LOOP
for (int i = static_cast<int>(array_size<nocontract_t>::value) - 1; i > 0; i--) {
const Index idx = nocontract_val / m_ij_strides[i];
linidx += idx * m_nocontract_strides[i];
@@ -155,6 +170,7 @@ class SimpleTensorContractionMapper {
Index contract_val = left ? col : row;
if(array_size<contract_t>::value > 0) {
+ EIGEN_UNROLL_LOOP
for (int i = static_cast<int>(array_size<contract_t>::value) - 1; i > 0; i--) {
const Index idx = contract_val / m_k_strides[i];
linidx += idx * m_contract_strides[i];
@@ -179,6 +195,7 @@ class SimpleTensorContractionMapper {
Index nocontract_val[2] = {left ? row : col, left ? row + distance : col};
Index linidx[2] = {0, 0};
if (array_size<typename Tensor::Dimensions>::value > array_size<contract_t>::value) {
+ EIGEN_UNROLL_LOOP
for (int i = static_cast<int>(array_size<nocontract_t>::value) - 1; i > 0; i--) {
const Index idx0 = nocontract_val[0] / m_ij_strides[i];
const Index idx1 = nocontract_val[1] / m_ij_strides[i];
@@ -199,6 +216,7 @@ class SimpleTensorContractionMapper {
Index contract_val[2] = {left ? col : row, left ? col : row + distance};
if (array_size<contract_t>::value> 0) {
+ EIGEN_UNROLL_LOOP
for (int i = static_cast<int>(array_size<contract_t>::value) - 1; i > 0; i--) {
const Index idx0 = contract_val[0] / m_k_strides[i];
const Index idx1 = contract_val[1] / m_k_strides[i];
@@ -230,6 +248,13 @@ class SimpleTensorContractionMapper {
return ((side == Lhs) && inner_dim_contiguous && array_size<contract_t>::value > 0) ? m_contract_strides[0] : 1;
}
+ #ifdef EIGEN_USE_SYCL
+ // The placeholder accessors require to be bound to a command group handler for SYCL
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+ m_tensor.bind(cgh);
+ }
+ #endif
+
const CoeffLoader<Tensor, Tensor::RawAccess, MakePointer_>& tensor() const {
return m_tensor;
}
@@ -302,6 +327,7 @@ class BaseTensorContractionMapper : public SimpleTensorContractionMapper<Scalar,
EIGEN_ALIGN_MAX Scalar data[packet_size];
data[0] = this->m_tensor.coeff(first);
+ EIGEN_UNROLL_LOOP
for (Index k = 1; k < packet_size - 1; k += 2) {
const IndexPair<Index> internal_pair = this->computeIndexPair(i + k, j, 1);
data[k] = this->m_tensor.coeff(internal_pair.first);
@@ -472,6 +498,13 @@ class TensorContractionSubMapper {
return false;
}
+ #ifdef EIGEN_USE_SYCL
+ // The placeholder accessors require to be bound to a command group handler for SYCL
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+ m_base_mapper.bind(cgh);
+ }
+ #endif
+
const ParentMapper& base_mapper() const { return m_base_mapper; }
Index vert_offset() const { return m_vert_offset; }
Index horiz_offset() const { return m_horiz_offset; }
@@ -515,6 +548,22 @@ class TensorContractionInputMapper
};
+template <typename T> struct TensorContractionInputMapperTrait;
+
+template<typename Scalar_, typename Index_, int side_,
+ typename Tensor_,
+ typename nocontract_t_, typename contract_t_,
+ int packet_size_,
+ bool inner_dim_contiguous_, bool inner_dim_reordered_, int Alignment_, template <class> class MakePointer_>
+struct TensorContractionInputMapperTrait<TensorContractionInputMapper<Scalar_, Index_, side_, Tensor_,
+ nocontract_t_, contract_t_, packet_size_, inner_dim_contiguous_,
+ inner_dim_reordered_, Alignment_, MakePointer_> > {
+
+ typedef Tensor_ XprType;
+ static const bool inner_dim_contiguous = inner_dim_contiguous_;
+ static const bool inner_dim_reordered = inner_dim_reordered_;
+ };
+
} // end namespace internal
} // end namespace Eigen
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h
index 938fd0f34..e96f31537 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h
@@ -129,6 +129,7 @@ struct PacketConverter<TensorEvaluator, SrcPacket, TgtPacket, 1, 2> {
typedef typename internal::unpacket_traits<TgtPacket>::type TgtType;
internal::scalar_cast_op<SrcType, TgtType> converter;
EIGEN_ALIGN_MAX typename internal::unpacket_traits<TgtPacket>::type values[TgtPacketSize];
+ EIGEN_UNROLL_LOOP
for (int i = 0; i < TgtPacketSize; ++i) {
values[i] = converter(m_impl.coeff(index+i));
}
@@ -164,15 +165,15 @@ class TensorConversionOp : public TensorBase<TensorConversionOp<TargetType, XprT
typename XprType::Nested m_xpr;
};
-template <bool SameType, typename Eval, typename Scalar> struct ConversionSubExprEval {
- static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Eval& impl, Scalar*) {
+template <bool SameType, typename Eval, typename EvalPointerType> struct ConversionSubExprEval {
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Eval& impl, EvalPointerType) {
impl.evalSubExprsIfNeeded(NULL);
return true;
}
};
-template <typename Eval, typename Scalar> struct ConversionSubExprEval<true, Eval, Scalar> {
- static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Eval& impl, Scalar* data) {
+template <typename Eval, typename EvalPointerType> struct ConversionSubExprEval<true, Eval, EvalPointerType> {
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Eval& impl, EvalPointerType data) {
return impl.evalSubExprsIfNeeded(data);
}
};
@@ -207,6 +208,7 @@ struct PacketConv {
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetPacket run(const TensorEvaluator<ArgType, Device>& impl, Index index) {
internal::scalar_cast_op<SrcType, TargetType> converter;
EIGEN_ALIGN_MAX typename internal::remove_const<TargetType>::type values[PacketSize];
+ EIGEN_UNROLL_LOOP
for (int i = 0; i < PacketSize; ++i) {
values[i] = converter(impl.coeff(index+i));
}
@@ -267,10 +269,18 @@ struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device>
typedef typename PacketType<SrcType, Device>::type PacketSourceType;
static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
static const bool IsSameType = internal::is_same<TargetType, SrcType>::value;
+ typedef StorageMemory<CoeffReturnType, Device> Storage;
+ typedef typename Storage::Type EvaluatorPointerType;
enum {
IsAligned = false,
- PacketAccess = true,
+ PacketAccess =
+ #ifndef EIGEN_USE_SYCL
+ true,
+ #else
+ TensorEvaluator<ArgType, Device>::PacketAccess &
+ internal::type_casting_traits<SrcType, TargetType>::VectorizedCast,
+ #endif
BlockAccess = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
@@ -284,9 +294,9 @@ struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_impl.dimensions(); }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data)
{
- return ConversionSubExprEval<IsSameType, TensorEvaluator<ArgType, Device>, Scalar>::run(m_impl, data);
+ return ConversionSubExprEval<IsSameType, TensorEvaluator<ArgType, Device>, EvaluatorPointerType>::run(m_impl, data);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup()
@@ -330,10 +340,16 @@ struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device>
}
}
- EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; }
+ EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
/// required by sycl in order to extract the sycl accessor
const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
+#ifdef EIGEN_USE_SYCL
+ // binding placeholder accessors to a command group handler for SYCL
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+ m_impl.bind(cgh);
+ }
+#endif
protected:
TensorEvaluator<ArgType, Device> m_impl;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
index 2d0e6599f..25e1e5896 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
@@ -303,6 +303,8 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+ typedef StorageMemory<Scalar, Device> Storage;
+ typedef typename Storage::Type EvaluatorPointerType;
enum {
IsAligned = TensorEvaluator<InputArgType, Device>::IsAligned & TensorEvaluator<KernelArgType, Device>::IsAligned,
@@ -469,7 +471,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
PacketSize));
}
- EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; }
+ EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
private:
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
@@ -525,7 +527,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
m_local_kernel = false;
} else {
size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
- Scalar* local = (Scalar*)m_device.allocate(kernel_sz);
+ Scalar* local = (Scalar*)m_device.allocate_temp(kernel_sz);
typedef TensorEvalToOp<const KernelArgType> EvalTo;
EvalTo evalToTmp(local, m_kernelArg);
const bool Vectorize = internal::IsVectorizable<Device, KernelArgType>::value;
@@ -548,7 +550,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
KernelArgType m_kernelArg;
const Scalar* m_kernel;
bool m_local_kernel;
- const Device& m_device;
+ const Device EIGEN_DEVICE_REF m_device;
};
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h b/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h
index 6ee3827f3..723d2b082 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h
@@ -36,7 +36,7 @@ struct traits<TensorCustomUnaryOp<CustomUnaryFunc, XprType> >
template<typename CustomUnaryFunc, typename XprType>
struct eval<TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Eigen::Dense>
{
- typedef const TensorCustomUnaryOp<CustomUnaryFunc, XprType>& type;
+ typedef const TensorCustomUnaryOp<CustomUnaryFunc, XprType> EIGEN_DEVICE_REF type;
};
template<typename CustomUnaryFunc, typename XprType>
@@ -88,7 +88,9 @@ struct TensorEvaluator<const TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Devi
typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
- typedef typename PointerType<CoeffReturnType, Device>::Type PointerT;
+ typedef typename Eigen::internal::traits<XprType>::PointerType TensorPointerType;
+ typedef StorageMemory<CoeffReturnType, Device> Storage;
+ typedef typename Storage::Type EvaluatorPointerType;
enum {
IsAligned = false,
@@ -108,20 +110,20 @@ struct TensorEvaluator<const TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Devi
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(PointerT data) {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
if (data) {
evalTo(data);
return false;
} else {
- m_result = static_cast<PointerT>(
- m_device.allocate_temp(dimensions().TotalSize() * sizeof(Scalar)));
+ m_result = static_cast<EvaluatorPointerType>(m_device.get( (CoeffReturnType*)
+ m_device.allocate_temp(dimensions().TotalSize() * sizeof(Scalar))));
evalTo(m_result);
return true;
}
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
- if (m_result != NULL) {
+ if (m_result) {
m_device.deallocate_temp(m_result);
m_result = NULL;
}
@@ -141,22 +143,25 @@ struct TensorEvaluator<const TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Devi
return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
}
- EIGEN_DEVICE_FUNC PointerT data() const { return m_result; }
+ EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_result; }
#ifdef EIGEN_USE_SYCL
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const { return m_device; }
+ // binding placeholder accessors to a command group handler for SYCL
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+ m_result.bind(cgh);
+ }
#endif
protected:
- EIGEN_DEVICE_FUNC void evalTo(PointerT data) {
- TensorMap<Tensor<CoeffReturnType, NumDims, Layout, Index> > result(data, m_dimensions);
+ EIGEN_DEVICE_FUNC void evalTo(EvaluatorPointerType data) {
+ TensorMap<Tensor<CoeffReturnType, NumDims, Layout, Index> > result(m_device.get(data), m_dimensions);
m_op.func().eval(m_op.expression(), result, m_device);
}
Dimensions m_dimensions;
const ArgType m_op;
- const Device& m_device;
- PointerT m_result;
+ const Device EIGEN_DEVICE_REF m_device;
+ EvaluatorPointerType m_result;
};
@@ -251,7 +256,10 @@ struct TensorEvaluator<const TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType,
typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
- typedef typename PointerType<CoeffReturnType, Device>::Type PointerT;
+
+ typedef typename Eigen::internal::traits<XprType>::PointerType TensorPointerType;
+ typedef StorageMemory<CoeffReturnType, Device> Storage;
+ typedef typename Storage::Type EvaluatorPointerType;
enum {
IsAligned = false,
@@ -271,12 +279,13 @@ struct TensorEvaluator<const TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType,
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(PointerT data) {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
if (data) {
evalTo(data);
return false;
} else {
- m_result = static_cast<PointerT>(m_device.allocate_temp(dimensions().TotalSize() * sizeof(CoeffReturnType)));
+ m_result = static_cast<EvaluatorPointerType>(m_device.get( (CoeffReturnType*)
+ m_device.allocate_temp(dimensions().TotalSize() * sizeof(CoeffReturnType))));
evalTo(m_result);
return true;
}
@@ -303,22 +312,25 @@ struct TensorEvaluator<const TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType,
return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
}
- EIGEN_DEVICE_FUNC PointerT data() const { return m_result; }
+ EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_result; }
#ifdef EIGEN_USE_SYCL
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const { return m_device; }
+ // binding placeholder accessors to a command group handler for SYCL
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+ m_result.bind(cgh);
+ }
#endif
protected:
- EIGEN_DEVICE_FUNC void evalTo(PointerT data) {
- TensorMap<Tensor<CoeffReturnType, NumDims, Layout> > result(data, m_dimensions);
+ EIGEN_DEVICE_FUNC void evalTo(EvaluatorPointerType data) {
+ TensorMap<Tensor<CoeffReturnType, NumDims, Layout> > result(m_device.get(data), m_dimensions);
m_op.func().eval(m_op.lhsExpression(), m_op.rhsExpression(), result, m_device);
}
Dimensions m_dimensions;
const XprType m_op;
- const Device& m_device;
- PointerT m_result;
+ const Device EIGEN_DEVICE_REF m_device;
+ EvaluatorPointerType m_result;
};
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h
index 8cb95f731..46b9d3ab2 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h
@@ -39,6 +39,10 @@ struct DefaultDevice {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
::memset(buffer, c, n);
}
+ template<typename Type>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Type get(Type data) const {
+ return data;
+ }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const {
#if !defined(EIGEN_GPU_COMPILE_PHASE)
@@ -54,7 +58,7 @@ struct DefaultDevice {
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
-#if !defined(EIGEN_GPU_COMPILE_PHASE) && !defined(__SYCL_DEVICE_ONLY__)
+#if !defined(EIGEN_GPU_COMPILE_PHASE) && !defined(SYCL_DEVICE_ONLY)
// Running on the host CPU
return l1CacheSize();
#elif defined(EIGEN_HIP_DEVICE_COMPILE)
@@ -67,7 +71,7 @@ struct DefaultDevice {
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
-#if !defined(EIGEN_GPU_COMPILE_PHASE) && !defined(__SYCL_DEVICE_ONLY__)
+#if !defined(EIGEN_GPU_COMPILE_PHASE) && !defined(SYCL_DEVICE_ONLY)
// Running single threaded on the host CPU
return l3CacheSize();
#elif defined(EIGEN_HIP_DEVICE_COMPILE)
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h
index 83cde6afb..ebf85c072 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h
@@ -215,6 +215,10 @@ struct GpuDevice {
stream_->deallocate(buffer);
}
+ template<typename Type>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Type get(Type data) const {
+ return data;
+ }
EIGEN_STRONG_INLINE void* scratchpad() const {
return stream_->scratchpad();
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
index e7beb2c82..93efe2f82 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
@@ -14,498 +14,879 @@
#if defined(EIGEN_USE_SYCL) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H)
#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H
-template<size_t Align> struct CheckAlignStatically {
- static const bool Val= (((Align&(Align-1))==0) && (Align >= sizeof(void *)));
-};
-template <bool IsAligned, size_t Align>
-struct Conditional_Allocate {
+#include <unordered_set>
- EIGEN_ALWAYS_INLINE static void* conditional_allocate(std::size_t elements) {
- return aligned_alloc(Align, elements);
- }
-};
-template <size_t Align>
-struct Conditional_Allocate<false, Align> {
+namespace Eigen {
- EIGEN_ALWAYS_INLINE static void* conditional_allocate(std::size_t elements){
- return malloc(elements);
- }
+namespace TensorSycl {
+namespace internal {
+
+/// Cache all the device information needed
+struct SyclDeviceInfo {
+ SyclDeviceInfo(cl::sycl::queue queue)
+ : local_mem_type(
+ queue.get_device()
+ .template get_info<cl::sycl::info::device::local_mem_type>()),
+ max_work_item_sizes(
+ queue.get_device()
+ .template get_info<
+ cl::sycl::info::device::max_work_item_sizes>()),
+ max_mem_alloc_size(
+ queue.get_device()
+ .template get_info<
+ cl::sycl::info::device::max_mem_alloc_size>()),
+ max_compute_units(queue.get_device()
+ .template get_info<
+ cl::sycl::info::device::max_compute_units>()),
+ max_work_group_size(
+ queue.get_device()
+ .template get_info<
+ cl::sycl::info::device::max_work_group_size>()),
+ local_mem_size(
+ queue.get_device()
+ .template get_info<cl::sycl::info::device::local_mem_size>()),
+ platform_name(queue.get_device()
+ .get_platform()
+ .template get_info<cl::sycl::info::platform::name>()),
+ device_name(queue.get_device()
+ .template get_info<cl::sycl::info::device::name>()),
+ device_vendor(
+ queue.get_device()
+ .template get_info<cl::sycl::info::device::vendor>()) {}
+
+ cl::sycl::info::local_mem_type local_mem_type;
+ cl::sycl::id<3> max_work_item_sizes;
+ unsigned long max_mem_alloc_size;
+ unsigned long max_compute_units;
+ unsigned long max_work_group_size;
+ size_t local_mem_size;
+ std::string platform_name;
+ std::string device_name;
+ std::string device_vendor;
};
-template <typename Scalar, size_t Align = EIGEN_MAX_ALIGN_BYTES, class Allocator = std::allocator<Scalar>>
-struct SyclAllocator {
- typedef Scalar value_type;
- typedef typename std::allocator_traits<Allocator>::pointer pointer;
- typedef typename std::allocator_traits<Allocator>::size_type size_type;
- SyclAllocator( ){};
- Scalar* allocate(std::size_t elements) {
- return static_cast<Scalar*>(Conditional_Allocate<CheckAlignStatically<Align>::Val, Align>::conditional_allocate(elements));
+} // end namespace internal
+} // end namespace TensorSycl
+
+typedef TensorSycl::internal::buffer_data_type_t buffer_scalar_t;
+// All devices (even AMD CPU with intel OpenCL runtime) that support OpenCL and
+// can consume SPIR or SPIRV can use the Eigen SYCL backend and consequently
+// TensorFlow via the Eigen SYCL Backend.
+EIGEN_STRONG_INLINE auto get_sycl_supported_devices()
+ -> decltype(cl::sycl::device::get_devices()) {
+#ifdef EIGEN_SYCL_USE_DEFAULT_SELECTOR
+ return {cl::sycl::device(cl::sycl::default_selector())};
+#else
+ std::vector<cl::sycl::device> supported_devices;
+ auto platform_list = cl::sycl::platform::get_platforms();
+ for (const auto &platform : platform_list) {
+ auto device_list = platform.get_devices();
+ auto platform_name =
+ platform.template get_info<cl::sycl::info::platform::name>();
+ std::transform(platform_name.begin(), platform_name.end(),
+ platform_name.begin(), ::tolower);
+ for (const auto &device : device_list) {
+ auto vendor = device.template get_info<cl::sycl::info::device::vendor>();
+ std::transform(vendor.begin(), vendor.end(), vendor.begin(), ::tolower);
+ bool unsupported_condition =
+ (device.is_cpu() && platform_name.find("amd") != std::string::npos &&
+ vendor.find("apu") == std::string::npos) ||
+ (platform_name.find("experimental") != std::string::npos) ||
+ device.is_host();
+ if (!unsupported_condition) {
+ supported_devices.push_back(device);
+ }
+ }
}
- void deallocate(Scalar * p, std::size_t size) { EIGEN_UNUSED_VARIABLE(size); free(p); }
-};
-
-namespace Eigen {
+ return supported_devices;
+#endif
+}
-#define ConvertToActualTypeSycl(Scalar, buf_acc) static_cast<Scalar*>(static_cast<void*>(((buf_acc.get_pointer().get()))))
-#define ConvertToActualSyclOffset(Scalar, offset) offset/sizeof(Scalar)
+class QueueInterface {
+ public:
+ /// Creating device by using cl::sycl::selector or cl::sycl::device.
+ template <typename DeviceOrSelector>
+ explicit QueueInterface(
+ const DeviceOrSelector &dev_or_sel, cl::sycl::async_handler handler,
+ unsigned num_threads = std::thread::hardware_concurrency())
+ : m_queue(dev_or_sel, handler),
+#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
+ m_prog(m_queue.get_context(), get_sycl_supported_devices()),
+#endif
+ m_thread_pool(num_threads),
+ m_device_info(m_queue) {
+#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
+ m_prog.build_with_kernel_type<DeviceOrSelector>();
+ auto f = [&](cl::sycl::handler &cgh) {
+ cgh.single_task<DeviceOrSelector>(m_prog.get_kernel<DeviceOrSelector>(),
+ [=]() {});
+ };
+ EIGEN_SYCL_TRY_CATCH(m_queue.submit(f));
+#endif
+ }
+ template <typename DeviceOrSelector>
+ explicit QueueInterface(
+ const DeviceOrSelector &dev_or_sel,
+ unsigned num_threads = std::thread::hardware_concurrency())
+ : QueueInterface(dev_or_sel,
+ [this](cl::sycl::exception_list l) {
+ this->exception_caught_ = this->sycl_async_handler(l);
+ },
+ num_threads) {}
- template <typename Scalar, typename read_accessor, typename write_accessor> class MemCopyFunctor {
- public:
- MemCopyFunctor(read_accessor src_acc, write_accessor dst_acc, size_t rng, size_t i, size_t offset) : m_src_acc(src_acc), m_dst_acc(dst_acc), m_rng(rng), m_i(i), m_offset(offset) {}
+#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
+ EIGEN_STRONG_INLINE cl::sycl::program &program() const { return m_prog; }
+#endif
- void operator()(cl::sycl::nd_item<1> itemID) {
- auto src_ptr = ConvertToActualTypeSycl(Scalar, m_src_acc);
- auto dst_ptr = ConvertToActualTypeSycl(Scalar, m_dst_acc);
- auto globalid = itemID.get_global_linear_id();
- if (globalid < m_rng) {
- dst_ptr[globalid + m_i] = src_ptr[globalid + m_offset];
- }
- }
+ /// Attach an existing buffer to the pointer map, Eigen will not reuse it
+ EIGEN_STRONG_INLINE void *attach_buffer(
+ cl::sycl::buffer<buffer_scalar_t, 1> &buf) const {
+ std::lock_guard<std::mutex> lock(pmapper_mutex_);
+ return static_cast<void *>(pMapper.add_pointer(buf));
+ }
- private:
- read_accessor m_src_acc;
- write_accessor m_dst_acc;
- size_t m_rng;
- size_t m_i;
- size_t m_offset;
- };
-
-template<typename AccType>
- struct memsetkernelFunctor{
- AccType m_acc;
- const ptrdiff_t buff_offset;
- const size_t m_rng, m_c;
- memsetkernelFunctor(AccType acc, const ptrdiff_t buff_offset_, const size_t rng, const size_t c):m_acc(acc), buff_offset(buff_offset_), m_rng(rng), m_c(c){}
- void operator()(cl::sycl::nd_item<1> itemID) {
- auto globalid=itemID.get_global_linear_id();
- if (globalid< m_rng) m_acc[globalid + buff_offset] = m_c;
- }
-
- };
-
-struct memsetCghFunctor{
- cl::sycl::buffer<uint8_t, 1, SyclAllocator<uint8_t, EIGEN_MAX_ALIGN_BYTES> >& m_buf;
- const ptrdiff_t& buff_offset;
- const size_t& rng , GRange, tileSize;
- const int &c;
- memsetCghFunctor(cl::sycl::buffer<uint8_t, 1, SyclAllocator<uint8_t, EIGEN_MAX_ALIGN_BYTES> >& buff, const ptrdiff_t& buff_offset_, const size_t& rng_, const size_t& GRange_, const size_t& tileSize_, const int& c_)
- :m_buf(buff), buff_offset(buff_offset_), rng(rng_), GRange(GRange_), tileSize(tileSize_), c(c_){}
-
- void operator()(cl::sycl::handler &cgh) const {
- auto buf_acc = m_buf.template get_access<cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer>(cgh);
- typedef decltype(buf_acc) AccType;
- cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), memsetkernelFunctor<AccType>(buf_acc, buff_offset, rng, c));
+ /// Detach previously attached buffer
+ EIGEN_STRONG_INLINE void detach_buffer(void *p) const {
+ std::lock_guard<std::mutex> lock(pmapper_mutex_);
+ TensorSycl::internal::SYCLfree<false>(p, pMapper);
}
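For illustration, a hedged sketch of the attach/detach round trip, reusing queue_interface from the constructor sketch; the buffer size is arbitrary and the Eigen:: qualification of buffer_scalar_t is assumed:

    cl::sycl::buffer<Eigen::buffer_scalar_t, 1> external(cl::sycl::range<1>(256));
    void *key = queue_interface.attach_buffer(external);
    // `key` can now be used like any pointer returned by allocate().
    queue_interface.detach_buffer(key);  // drops the map entry only;
                                         // `external` stays alive with the caller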
-};
-//get_devices returns all the available opencl devices. Either use device_selector or exclude devices that computecpp does not support (AMD OpenCL for CPU and intel GPU)
-EIGEN_STRONG_INLINE auto get_sycl_supported_devices()->decltype(cl::sycl::device::get_devices()){
-std::vector<cl::sycl::device> supported_devices;
-auto plafrom_list =cl::sycl::platform::get_platforms();
-for(const auto& platform : plafrom_list){
- auto device_list = platform.get_devices();
- auto platform_name =platform.template get_info<cl::sycl::info::platform::name>();
- std::transform(platform_name.begin(), platform_name.end(), platform_name.begin(), ::tolower);
- for(const auto& device : device_list){
- auto vendor = device.template get_info<cl::sycl::info::device::vendor>();
- std::transform(vendor.begin(), vendor.end(), vendor.begin(), ::tolower);
- bool unsuported_condition = (device.is_cpu() && platform_name.find("amd")!=std::string::npos && vendor.find("apu") == std::string::npos) ||
- (device.is_gpu() && platform_name.find("intel")!=std::string::npos);
- if(!unsuported_condition){
- std::cout << "Platform name "<< platform_name << std::endl;
- supported_devices.push_back(device);
+ /// Allocate a device pointer. This pointer is actually an 8-byte host
+ /// pointer used as a key to access the sycl device buffer, because a device
+ /// buffer cannot serve as the m_data pointer in an Eigen leaf-node
+ /// expression. We therefore create a key pointer to be used in Eigen
+ /// expression construction. When the Eigen expression is converted into the
+ /// sycl one, this pointer is used as a key in our buffer map, and we make
+ /// sure that exactly one buffer is dedicated to it. The device pointer is
+ /// released by calling the deallocate function.
+ EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const {
+#if EIGEN_MAX_ALIGN_BYTES > 0
+ size_t align = num_bytes % EIGEN_MAX_ALIGN_BYTES;
+ if (align > 0) {
+ num_bytes += EIGEN_MAX_ALIGN_BYTES - align;
}
+#endif
+ std::lock_guard<std::mutex> lock(pmapper_mutex_);
+ return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper);
}
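A minimal sketch of the intended call pattern (size hypothetical); the returned value is only a key into the pointer map and must never be dereferenced on the host:

    void *key = queue_interface.allocate(1024 * sizeof(float));
    // ... hand `key` to Eigen expression construction as the data pointer ...
    queue_interface.deallocate(key);  // releases the dedicated SYCL buffer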
-}
-return supported_devices;
-}
-class QueueInterface {
-public:
- /// creating device by using cl::sycl::selector or cl::sycl::device both are the same and can be captured through dev_Selector typename
- /// SyclStreamDevice is not owned. it is the caller's responsibility to destroy it.
- template<typename dev_Selector> explicit QueueInterface(const dev_Selector& s):
-#ifdef EIGEN_EXCEPTIONS
- m_queue(cl::sycl::queue(s, [&](cl::sycl::exception_list l) {
- for (const auto& e : l) {
- try {
- if (e) {
- exception_caught_ = true;
- std::rethrow_exception(e);
+ EIGEN_STRONG_INLINE void *allocate_temp(size_t num_bytes) const {
+#if EIGEN_MAX_ALIGN_BYTES > 0
+ size_t align = num_bytes % EIGEN_MAX_ALIGN_BYTES;
+ if (align > 0) {
+ num_bytes += EIGEN_MAX_ALIGN_BYTES - align;
+ }
+#endif
+ std::lock_guard<std::mutex> lock(pmapper_mutex_);
+#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
+ if (scratch_buffers.empty()) {
+ return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper);
+ } else {
+ for (auto it = scratch_buffers.begin(); it != scratch_buffers.end();) {
+ auto buff = pMapper.get_buffer(*it);
+ if (buff.get_size() >= num_bytes) {
+ auto ptr = *it;
+ scratch_buffers.erase(it);
+ return ptr;
+ } else {
+ ++it;
}
- } catch (cl::sycl::exception e) {
- std::cerr << e.what() << std::endl;
}
+ return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper);
}
- }))
#else
-m_queue(cl::sycl::queue(s, [&](cl::sycl::exception_list l) {
- for (const auto& e : l) {
- if (e) {
- exception_caught_ = true;
- std::cerr << "Error detected Inside Sycl Device."<< std::endl;
-
- }
+ return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper);
+#endif
+ }
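To illustrate the reuse path (EIGEN_SYCL_NO_REUSE_BUFFERS not defined), a sketch with hypothetical sizes; deallocate_temp, defined below, parks the pointer rather than freeing it:

    void *t0 = queue_interface.allocate_temp(512);
    queue_interface.deallocate_temp(t0);  // parks t0 in scratch_buffers
    void *t1 = queue_interface.allocate_temp(256);
    // t1 may equal t0: the first parked buffer of at least 256 bytes is
    // recycled instead of allocating a new one.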
+ template <typename data_t>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<
+ cl::sycl::access::mode::read_write, data_t>
+ get(data_t *data) const {
+ return get_range_accessor<cl::sycl::access::mode::read_write, data_t>(data);
}
-}))
+ template <typename data_t>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE data_t *get(
+ TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write,
+ data_t>
+ data) const {
+ return static_cast<data_t *>(data.get_virtual_pointer());
+ }
+
+ EIGEN_STRONG_INLINE void deallocate_temp(void *p) const {
+ std::lock_guard<std::mutex> lock(pmapper_mutex_);
+#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
+ scratch_buffers.insert(p);
+#else
+ TensorSycl::internal::SYCLfree(p, pMapper);
#endif
- {}
-
- /// Allocating device pointer. This pointer is actually an 8 bytes host pointer used as key to access the sycl device buffer.
- /// The reason is that we cannot use device buffer as a pointer as a m_data in Eigen leafNode expressions. So we create a key
- /// pointer to be used in Eigen expression construction. When we convert the Eigen construction into the sycl construction we
- /// use this pointer as a key in our buffer_map and we make sure that we dedicate only one buffer only for this pointer.
- /// The device pointer would be deleted by calling deallocate function.
- EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
- std::lock_guard<std::mutex> lock(mutex_);
- auto buf = cl::sycl::buffer<uint8_t,1, SyclAllocator<uint8_t, EIGEN_MAX_ALIGN_BYTES> >(cl::sycl::range<1>(num_bytes));
- auto ptr =buf.get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::host_buffer>().get_pointer();
- buf.set_final_data(nullptr);
- buffer_map.insert(std::pair<const uint8_t *, cl::sycl::buffer<uint8_t, 1, SyclAllocator<uint8_t, EIGEN_MAX_ALIGN_BYTES> > >(static_cast<const uint8_t*>(ptr),buf));
- return static_cast<void*>(ptr);
+ }
+ template <cl::sycl::access::mode AcMd, typename T>
+ EIGEN_STRONG_INLINE void deallocate_temp(
+ const TensorSycl::internal::RangeAccess<AcMd, T> &p) const {
+ deallocate_temp(p.get_virtual_pointer());
}
/// This is used to deallocate the device pointer. p is used as a key inside
/// the map to find the device buffer and delete it.
EIGEN_STRONG_INLINE void deallocate(void *p) const {
- std::lock_guard<std::mutex> lock(mutex_);
- auto it = buffer_map.find(static_cast<const uint8_t*>(p));
- if (it != buffer_map.end()) {
- buffer_map.erase(it);
- }
+ std::lock_guard<std::mutex> lock(pmapper_mutex_);
+ TensorSycl::internal::SYCLfree(p, pMapper);
}
EIGEN_STRONG_INLINE void deallocate_all() const {
- std::lock_guard<std::mutex> lock(mutex_);
- buffer_map.clear();
- }
- /// The memcpyHostToDevice is used to copy the device only pointer to a host pointer. Using the device
- /// pointer created as a key we find the sycl buffer and get the host accessor with write mode
- /// on it. Then we use the memcpy to copy the data to the host accessor. The first time that
- /// this buffer is accessed, the data will be copied to the device.
- /// In this case we can separate the kernel actual execution from data transfer which is required for benchmark
- /// Also, this is faster as it uses the map_allocator instead of memcpy
- template<typename Index> EIGEN_STRONG_INLINE void memcpyHostToDevice(Index *dst, const Index *src, size_t n) const {
- auto it =find_buffer(dst);
- auto offset =static_cast<const uint8_t*>(static_cast<const void*>(dst))- it->first;
- offset/=sizeof(Index);
- size_t rng, GRange, tileSize;
- parallel_for_setup(n/sizeof(Index), tileSize, rng, GRange);
- auto src_buf = cl::sycl::buffer<uint8_t, 1, cl::sycl::map_allocator<uint8_t> >(static_cast<uint8_t*>(static_cast<void*>(const_cast<Index*>(src))), cl::sycl::range<1>(n));
- m_queue.submit([&](cl::sycl::handler &cgh) {
- auto dst_acc= it->second.template get_access<cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer>(cgh);
- auto src_acc =src_buf.template get_access<cl::sycl::access::mode::read, cl::sycl::access::target::global_buffer>(cgh);
- typedef decltype(src_acc) read_accessor;
- typedef decltype(dst_acc) write_accessor;
- cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor<Index, read_accessor, write_accessor>(src_acc, dst_acc, rng, offset, 0));
- });
- synchronize();
- }
- /// The memcpyDeviceToHost is used to copy the data from host to device. Here, in order to avoid double copying the data. We create a sycl
- /// buffer with map_allocator for the destination pointer with a discard_write accessor on it. The lifespan of the buffer is bound to the
- /// lifespan of the memcpyDeviceToHost function. We create a kernel to copy the data, from the device- only source buffer to the destination
- /// buffer with map_allocator on the gpu in parallel. At the end of the function call the destination buffer would be destroyed and the data
- /// would be available on the dst pointer using fast copy technique (map_allocator). In this case we can make sure that we copy the data back
- /// to the cpu only once per function call.
- template<typename Index> EIGEN_STRONG_INLINE void memcpyDeviceToHost(void *dst, const Index *src, size_t n) const {
- auto it =find_buffer(src);
- auto offset =static_cast<const uint8_t*>(static_cast<const void*>(src))- it->first;
- offset/=sizeof(Index);
- size_t rng, GRange, tileSize;
- parallel_for_setup(n/sizeof(Index), tileSize, rng, GRange);
- auto dest_buf = cl::sycl::buffer<uint8_t, 1, cl::sycl::map_allocator<uint8_t> >(static_cast<uint8_t*>(dst), cl::sycl::range<1>(n));
- m_queue.submit([&](cl::sycl::handler &cgh) {
- auto src_acc= it->second.template get_access<cl::sycl::access::mode::read, cl::sycl::access::target::global_buffer>(cgh);
- auto dst_acc =dest_buf.template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer>(cgh);
- typedef decltype(src_acc) read_accessor;
- typedef decltype(dst_acc) write_accessor;
- cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor<Index, read_accessor, write_accessor>(src_acc, dst_acc, rng, 0, offset));
- });
- synchronize();
- }
-
- /// the memcpy function
- template<typename Index> EIGEN_STRONG_INLINE void memcpy(void *dst, const Index *src, size_t n) const {
- auto it1 = find_buffer(static_cast<const void*>(src));
- auto it2 = find_buffer(dst);
- auto offset= (static_cast<const uint8_t*>(static_cast<const void*>(src))) - it1->first;
- auto i= (static_cast<const uint8_t*>(dst)) - it2->first;
- offset/=sizeof(Index);
- i/=sizeof(Index);
- size_t rng, GRange, tileSize;
- parallel_for_setup(n/sizeof(Index), tileSize, rng, GRange);
- m_queue.submit([&](cl::sycl::handler &cgh) {
- auto src_acc =it1->second.template get_access<cl::sycl::access::mode::read, cl::sycl::access::target::global_buffer>(cgh);
- auto dst_acc =it2->second.template get_access<cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer>(cgh);
- typedef decltype(src_acc) read_accessor;
- typedef decltype(dst_acc) write_accessor;
- cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor<Index, read_accessor, write_accessor>(src_acc, dst_acc, rng, i, offset));
- });
- synchronize();
+ std::lock_guard<std::mutex> lock(pmapper_mutex_);
+ TensorSycl::internal::SYCLfreeAll(pMapper);
+#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
+ scratch_buffers.clear();
+#endif
}
+ /// The memcpyHostToDevice is used to copy data from host to device.
+ /// The destination pointer could be deleted before the copy happens, which
+ /// is why a callback function is needed. If none is provided, the function
+ /// is blocking.
+ EIGEN_STRONG_INLINE void memcpyHostToDevice(
+ void *dst, const void *src, size_t n,
+ std::function<void()> callback) const {
+ static const auto write_mode = cl::sycl::access::mode::discard_write;
+ static const auto global_access = cl::sycl::access::target::global_buffer;
+ typedef cl::sycl::accessor<buffer_scalar_t, 1, write_mode, global_access>
+ write_accessor;
+ if (n == 0) {
+ if (callback) callback();
+ return;
+ }
+ n /= sizeof(buffer_scalar_t);
+ auto f = [&](cl::sycl::handler &cgh) {
+ write_accessor dst_acc = get_range_accessor<write_mode>(cgh, dst, n);
+ buffer_scalar_t const *ptr = static_cast<buffer_scalar_t const *>(src);
+ auto non_deleter = [](buffer_scalar_t const *) {};
+ std::shared_ptr<const buffer_scalar_t> s_ptr(ptr, non_deleter);
+ cgh.copy(s_ptr, dst_acc);
+ };
+ cl::sycl::event e;
+ EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
+ synchronize_and_callback(e, callback);
+ }
+
+ /// The memcpyDeviceToHost is used to copy data from device to host.
+ /// The source pointer could be deleted before the copy happens, which is
+ /// why a callback function is needed. If none is provided, the function
+ /// is blocking.
+ EIGEN_STRONG_INLINE void memcpyDeviceToHost(
+ void *dst, const void *src, size_t n,
+ std::function<void()> callback) const {
+ static const auto read_mode = cl::sycl::access::mode::read;
+ static const auto global_access = cl::sycl::access::target::global_buffer;
+ typedef cl::sycl::accessor<buffer_scalar_t, 1, read_mode, global_access>
+ read_accessor;
+ if (n == 0) {
+ if (callback) callback();
+ return;
+ }
+ n /= sizeof(buffer_scalar_t);
+ auto f = [&](cl::sycl::handler &cgh) {
+ read_accessor src_acc = get_range_accessor<read_mode>(cgh, src, n);
+ buffer_scalar_t *ptr = static_cast<buffer_scalar_t *>(dst);
+ auto non_deleter = [](buffer_scalar_t *) {};
+ std::shared_ptr<buffer_scalar_t> s_ptr(ptr, non_deleter);
+ cgh.copy(src_acc, s_ptr);
+ };
+ cl::sycl::event e;
+ EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
+ synchronize_and_callback(e, callback);
+ }
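A hedged usage sketch for the two copies, continuing the sketches above; sizes and the callback are hypothetical, and passing an empty std::function selects the blocking path:

    std::vector<float> host_in(256, 1.0f), host_out(256);
    const size_t bytes = host_in.size() * sizeof(float);
    void *dev = queue_interface.allocate(bytes);
    // Asynchronous: host_in must stay alive until the callback has run.
    queue_interface.memcpyHostToDevice(dev, host_in.data(), bytes,
                                       [] { std::printf("H2D finished\n"); });
    queue_interface.memcpyDeviceToHost(host_out.data(), dev, bytes, {});  // blocks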
+
+ /// The memcpy function.
+ /// No callback is required here as both arguments are on the device
+ /// and SYCL can handle the dependency.
+ EIGEN_STRONG_INLINE void memcpy(void *dst, const void *src, size_t n) const {
+ static const auto read_mode = cl::sycl::access::mode::read;
+ static const auto write_mode = cl::sycl::access::mode::discard_write;
+ if (n == 0) {
+ return;
+ }
+ n /= sizeof(buffer_scalar_t);
+ auto f = [&](cl::sycl::handler &cgh) {
+ auto src_acc = get_range_accessor<read_mode>(cgh, src, n);
+ auto dst_acc = get_range_accessor<write_mode>(cgh, dst, n);
+ cgh.copy(src_acc, dst_acc);
+ };
+ cl::sycl::event e;
+ EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
+ async_synchronize(e);
+ }
+
+ /// The memset function.
+ /// No callback is required here as the destination is on the device
+ /// and SYCL can handle the dependency.
EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const {
- size_t rng, GRange, tileSize;
- parallel_for_setup(n, tileSize, rng, GRange);
- auto it1 = find_buffer(static_cast<const void*>(data));
- ptrdiff_t buff_offset= (static_cast<const uint8_t*>(data)) - it1->first;
- m_queue.submit(memsetCghFunctor(it1->second, buff_offset, rng, GRange, tileSize, c ));
- synchronize();
+ static const auto write_mode = cl::sycl::access::mode::discard_write;
+ if (n == 0) {
+ return;
+ }
+ n /= sizeof(buffer_scalar_t);
+ auto f = [&](cl::sycl::handler &cgh) {
+ auto dst_acc = get_range_accessor<write_mode>(cgh, data, n);
+ // The cast to uint8_t is here to match the behaviour of the standard
+ // memset. The cast to buffer_scalar_t is needed to match the type of the
+ // accessor (in case buffer_scalar_t is not uint8_t)
+ cgh.fill(dst_acc, static_cast<buffer_scalar_t>(static_cast<uint8_t>(c)));
+ };
+ cl::sycl::event e;
+ EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
+ async_synchronize(e);
+ }
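For instance, zero-filling the allocation from the previous sketch (the count is in bytes, matching std::memset):

    queue_interface.memset(dev, 0, bytes);  // fills with the low byte of `c`
    queue_interface.synchronize();  // only needed when async execution is enabled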
+
+ /// Get a range accessor to the virtual pointer's device memory. This range
+ /// accessor will allow access to the memory from the pointer to the end of
+ /// the buffer.
+ ///
+ /// NOTE: Inside a kernel the range accessor will always be indexed from the
+ /// start of the buffer, so the offset in the accessor is only used by
+ /// methods like handler::copy and will not be available inside a kernel.
+ template <cl::sycl::access::mode AcMd, typename T>
+ EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<AcMd, T>
+ get_range_accessor(const void *ptr) const {
+ static const auto global_access = cl::sycl::access::target::global_buffer;
+ static const auto is_place_holder = cl::sycl::access::placeholder::true_t;
+ typedef TensorSycl::internal::RangeAccess<AcMd, T> ret_type;
+ typedef const TensorSycl::internal::buffer_data_type_t *internal_ptr_t;
+
+ std::lock_guard<std::mutex> lock(pmapper_mutex_);
+
+ auto original_buffer = pMapper.get_buffer(ptr);
+ const ptrdiff_t offset = pMapper.get_offset(ptr);
+ const ptrdiff_t typed_offset = offset / sizeof(T);
+ eigen_assert(typed_offset >= 0);
+ const auto typed_size = original_buffer.get_size() / sizeof(T);
+ auto buffer = original_buffer.template reinterpret<
+ typename Eigen::internal::remove_const<T>::type>(
+ cl::sycl::range<1>(typed_size));
+ const ptrdiff_t size = buffer.get_count() - typed_offset;
+ eigen_assert(size >= 0);
+ typedef cl::sycl::accessor<typename Eigen::internal::remove_const<T>::type,
+ 1, AcMd, global_access, is_place_holder>
+ placeholder_accessor_t;
+ const auto start_ptr = static_cast<internal_ptr_t>(ptr) - offset;
+ return ret_type(placeholder_accessor_t(buffer, cl::sycl::range<1>(size),
+ cl::sycl::id<1>(typed_offset)),
+ static_cast<size_t>(typed_offset),
+ reinterpret_cast<std::intptr_t>(start_ptr));
+ }
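A sketch of turning a virtual pointer into a typed placeholder accessor (the float type and dev pointer are hypothetical); the evaluator nodes later attach such accessors to a command group via RangeAccess::bind(cgh):

    auto acc =
        queue_interface
            .get_range_accessor<cl::sycl::access::mode::read_write, float>(dev);
    // After acc.bind(cgh) inside a submitted command group, acc behaves like a
    // pointer-style handle positioned at the virtual pointer's offset.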
+
+ /// Get a range accessor to the virtual pointer's device memory with a
+ /// specified size.
+ template <cl::sycl::access::mode AcMd, typename Index>
+ EIGEN_STRONG_INLINE cl::sycl::accessor<
+ buffer_scalar_t, 1, AcMd, cl::sycl::access::target::global_buffer>
+ get_range_accessor(cl::sycl::handler &cgh, const void *ptr,
+ const Index n_bytes) const {
+ static const auto global_access = cl::sycl::access::target::global_buffer;
+ eigen_assert(n_bytes >= 0);
+ std::lock_guard<std::mutex> lock(pmapper_mutex_);
+ auto buffer = pMapper.get_buffer(ptr);
+ const ptrdiff_t offset = pMapper.get_offset(ptr);
+ eigen_assert(offset >= 0);
+ eigen_assert(offset + n_bytes <= buffer.get_size());
+ return buffer.template get_access<AcMd, global_access>(
+ cgh, cl::sycl::range<1>(n_bytes), cl::sycl::id<1>(offset));
}
/// Creation of sycl accessor for a buffer. This function first tries to find
- /// the buffer in the buffer_map. If found it gets the accessor from it, if not,
- /// the function then adds an entry by creating a sycl buffer for that particular pointer.
- template <cl::sycl::access::mode AcMd> EIGEN_STRONG_INLINE cl::sycl::accessor<uint8_t, 1, AcMd, cl::sycl::access::target::global_buffer>
- get_sycl_accessor(cl::sycl::handler &cgh, const void* ptr) const {
- return (find_buffer(ptr)->second.template get_access<AcMd, cl::sycl::access::target::global_buffer>(cgh));
+ /// the buffer in the buffer_map. If found, it gets the accessor from it; if
+ /// not, the function adds an entry by creating a sycl buffer for that
+ /// particular pointer.
+ template <cl::sycl::access::mode AcMd>
+ EIGEN_STRONG_INLINE cl::sycl::accessor<
+ buffer_scalar_t, 1, AcMd, cl::sycl::access::target::global_buffer>
+ get_sycl_accessor(cl::sycl::handler &cgh, const void *ptr) const {
+ std::lock_guard<std::mutex> lock(pmapper_mutex_);
+ return pMapper.get_buffer(ptr)
+ .template get_access<AcMd, cl::sycl::access::target::global_buffer>(
+ cgh);
}
- /// Accessing the created sycl device buffer for the device pointer
- EIGEN_STRONG_INLINE cl::sycl::buffer<uint8_t, 1, SyclAllocator<uint8_t, EIGEN_MAX_ALIGN_BYTES> >& get_sycl_buffer(const void * ptr) const {
- return find_buffer(ptr)->second;
+ EIGEN_STRONG_INLINE cl::sycl::buffer<buffer_scalar_t, 1> get_sycl_buffer(
+ const void *ptr) const {
+ std::lock_guard<std::mutex> lock(pmapper_mutex_);
+ return pMapper.get_buffer(ptr);
}
EIGEN_STRONG_INLINE ptrdiff_t get_offset(const void *ptr) const {
- return (static_cast<const uint8_t*>(ptr))-(find_buffer(ptr)->first);
+ std::lock_guard<std::mutex> lock(pmapper_mutex_);
+ return pMapper.get_offset(ptr);
}
EIGEN_STRONG_INLINE void synchronize() const {
- m_queue.wait_and_throw(); //pass
+#ifdef EIGEN_EXCEPTIONS
+ m_queue.wait_and_throw();
+#else
+ m_queue.wait();
+#endif
}
- EIGEN_STRONG_INLINE void asynchronousExec() const {
- ///FIXEDME:: currently there is a race condition regarding the asynch scheduler.
- //sycl_queue().throw_asynchronous();// FIXME::does not pass. Temporarily disabled
- m_queue.wait_and_throw(); //pass
+ EIGEN_STRONG_INLINE void async_synchronize(cl::sycl::event e) const {
+ set_latest_event(e);
+#ifndef EIGEN_SYCL_ASYNC_EXECUTION
+ synchronize();
+#endif
}
- template<typename Index>
- EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize, Index &rng, Index &GRange) const {
- tileSize =static_cast<Index>(m_queue.get_device(). template get_info<cl::sycl::info::device::max_work_group_size>());
- auto s= m_queue.get_device().template get_info<cl::sycl::info::device::vendor>();
- std::transform(s.begin(), s.end(), s.begin(), ::tolower);
- if(m_queue.get_device().is_cpu()){ // intel doesn't allow to use max workgroup size
- tileSize=std::min(static_cast<Index>(256), static_cast<Index>(tileSize));
- }
+ template <typename Index>
+ EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize,
+ Index &rng, Index &GRange) const {
+ tileSize = static_cast<Index>(getNearestPowerOfTwoWorkGroupSize());
+ tileSize = std::min(static_cast<Index>(EIGEN_SYCL_LOCAL_THREAD_DIM0 *
+ EIGEN_SYCL_LOCAL_THREAD_DIM1),
+ static_cast<Index>(tileSize));
rng = n;
- if (rng==0) rng=static_cast<Index>(1);
- GRange=rng;
- if (tileSize>GRange) tileSize=GRange;
- else if(GRange>tileSize){
- Index xMode = static_cast<Index>(GRange % tileSize);
+ if (rng == 0) rng = static_cast<Index>(1);
+ GRange = rng;
+ if (tileSize > GRange)
+ tileSize = GRange;
+ else if (GRange > tileSize) {
+ Index xMode = static_cast<Index>(GRange % tileSize);
if (xMode != 0) GRange += static_cast<Index>(tileSize - xMode);
}
}
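A worked example may help; assuming a device whose nearest power-of-two work-group size is 256 and the default 16x16 local-thread macros (so no clamping):

    size_t tileSize = 0, rng = 0, GRange = 0;
    queue_interface.parallel_for_setup(size_t(1000), tileSize, rng, GRange);
    // tileSize == 256, rng == 1000, GRange == 1024: the global range is rounded
    // up to the next multiple of tileSize, and kernels mask off the 24 excess
    // work-items with a `global_id < rng` guard.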
- /// This is used to prepare the number of threads and also the number of threads per block for sycl kernels
- template<typename Index>
- EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1, Index &tileSize0, Index &tileSize1, Index &rng0, Index &rng1, Index &GRange0, Index &GRange1) const {
- Index max_workgroup_Size = static_cast<Index>(maxSyclThreadsPerBlock());
- if(m_queue.get_device().is_cpu()){ // intel doesn't allow to use max workgroup size
- max_workgroup_Size=std::min(static_cast<Index>(256), static_cast<Index>(max_workgroup_Size));
- }
+ /// This is used to prepare the number of threads and also the number of
+ /// threads per block for sycl kernels
+ template <typename Index>
+ EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1,
+ Index &tileSize0,
+ Index &tileSize1, Index &rng0,
+ Index &rng1, Index &GRange0,
+ Index &GRange1) const {
+ Index max_workgroup_Size =
+ static_cast<Index>(getNearestPowerOfTwoWorkGroupSize());
+ max_workgroup_Size =
+ std::min(static_cast<Index>(EIGEN_SYCL_LOCAL_THREAD_DIM0 *
+ EIGEN_SYCL_LOCAL_THREAD_DIM1),
+ static_cast<Index>(max_workgroup_Size));
Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
- tileSize1 =static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2/2)));
- rng1=dim1;
- if (rng1==0 ) rng1=static_cast<Index>(1);
- GRange1=rng1;
- if (tileSize1>GRange1) tileSize1=GRange1;
- else if(GRange1>tileSize1){
- Index xMode = static_cast<Index>(GRange1 % tileSize1);
+ tileSize1 =
+ static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 2)));
+ rng1 = dim1;
+ if (rng1 == 0) rng1 = static_cast<Index>(1);
+ GRange1 = rng1;
+ if (tileSize1 > GRange1)
+ tileSize1 = GRange1;
+ else if (GRange1 > tileSize1) {
+ Index xMode = static_cast<Index>(GRange1 % tileSize1);
if (xMode != 0) GRange1 += static_cast<Index>(tileSize1 - xMode);
}
- tileSize0 = static_cast<Index>(max_workgroup_Size/tileSize1);
+ tileSize0 = static_cast<Index>(max_workgroup_Size / tileSize1);
rng0 = dim0;
- if (rng0==0 ) rng0=static_cast<Index>(1);
- GRange0=rng0;
- if (tileSize0>GRange0) tileSize0=GRange0;
- else if(GRange0>tileSize0){
- Index xMode = static_cast<Index>(GRange0 % tileSize0);
+ if (rng0 == 0) rng0 = static_cast<Index>(1);
+ GRange0 = rng0;
+ if (tileSize0 > GRange0)
+ tileSize0 = GRange0;
+ else if (GRange0 > tileSize0) {
+ Index xMode = static_cast<Index>(GRange0 % tileSize0);
if (xMode != 0) GRange0 += static_cast<Index>(tileSize0 - xMode);
}
}
- /// This is used to prepare the number of threads and also the number of threads per block for sycl kernels
- template<typename Index>
- EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1,Index dim2, Index &tileSize0, Index &tileSize1, Index &tileSize2, Index &rng0, Index &rng1, Index &rng2, Index &GRange0, Index &GRange1, Index &GRange2) const {
- Index max_workgroup_Size = static_cast<Index>(maxSyclThreadsPerBlock());
- if(m_queue.get_device().is_cpu()){ // intel doesn't allow to use max workgroup size
- max_workgroup_Size=std::min(static_cast<Index>(256), static_cast<Index>(max_workgroup_Size));
- }
+ /// This is used to prepare the number of threads and also the number of
+ /// threads per block for sycl kernels
+ template <typename Index>
+ EIGEN_STRONG_INLINE void parallel_for_setup(
+ Index dim0, Index dim1, Index dim2, Index &tileSize0, Index &tileSize1,
+ Index &tileSize2, Index &rng0, Index &rng1, Index &rng2, Index &GRange0,
+ Index &GRange1, Index &GRange2) const {
+ Index max_workgroup_Size =
+ static_cast<Index>(getNearestPowerOfTwoWorkGroupSize());
+ max_workgroup_Size =
+ std::min(static_cast<Index>(EIGEN_SYCL_LOCAL_THREAD_DIM0 *
+ EIGEN_SYCL_LOCAL_THREAD_DIM1),
+ static_cast<Index>(max_workgroup_Size));
Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
- tileSize2 =static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2/3)));
- rng2=dim2;
- if (rng2==0 ) rng1=static_cast<Index>(1);
- GRange2=rng2;
- if (tileSize2>GRange2) tileSize2=GRange2;
- else if(GRange2>tileSize2){
- Index xMode = static_cast<Index>(GRange2 % tileSize2);
+ tileSize2 =
+ static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 3)));
+ rng2 = dim2;
+ if (rng2 == 0) rng2 = static_cast<Index>(1);
+ GRange2 = rng2;
+ if (tileSize2 > GRange2)
+ tileSize2 = GRange2;
+ else if (GRange2 > tileSize2) {
+ Index xMode = static_cast<Index>(GRange2 % tileSize2);
if (xMode != 0) GRange2 += static_cast<Index>(tileSize2 - xMode);
}
- pow_of_2 = static_cast<Index>(std::log2(static_cast<Index>(max_workgroup_Size/tileSize2)));
- tileSize1 =static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2/2)));
- rng1=dim1;
- if (rng1==0 ) rng1=static_cast<Index>(1);
- GRange1=rng1;
- if (tileSize1>GRange1) tileSize1=GRange1;
- else if(GRange1>tileSize1){
- Index xMode = static_cast<Index>(GRange1 % tileSize1);
+ pow_of_2 = static_cast<Index>(
+ std::log2(static_cast<Index>(max_workgroup_Size / tileSize2)));
+ tileSize1 =
+ static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 2)));
+ rng1 = dim1;
+ if (rng1 == 0) rng1 = static_cast<Index>(1);
+ GRange1 = rng1;
+ if (tileSize1 > GRange1)
+ tileSize1 = GRange1;
+ else if (GRange1 > tileSize1) {
+ Index xMode = static_cast<Index>(GRange1 % tileSize1);
if (xMode != 0) GRange1 += static_cast<Index>(tileSize1 - xMode);
}
- tileSize0 = static_cast<Index>(max_workgroup_Size/(tileSize1*tileSize2));
+ tileSize0 =
+ static_cast<Index>(max_workgroup_Size / (tileSize1 * tileSize2));
rng0 = dim0;
- if (rng0==0 ) rng0=static_cast<Index>(1);
- GRange0=rng0;
- if (tileSize0>GRange0) tileSize0=GRange0;
- else if(GRange0>tileSize0){
- Index xMode = static_cast<Index>(GRange0 % tileSize0);
+ if (rng0 == 0) rng0 = static_cast<Index>(1);
+ GRange0 = rng0;
+ if (tileSize0 > GRange0)
+ tileSize0 = GRange0;
+ else if (GRange0 > tileSize0) {
+ Index xMode = static_cast<Index>(GRange0 % tileSize0);
if (xMode != 0) GRange0 += static_cast<Index>(tileSize0 - xMode);
}
}
+ EIGEN_STRONG_INLINE bool has_local_memory() const {
+#if !defined(EIGEN_SYCL_LOCAL_MEM) && defined(EIGEN_SYCL_NO_LOCAL_MEM)
+ return false;
+#elif defined(EIGEN_SYCL_LOCAL_MEM) && !defined(EIGEN_SYCL_NO_LOCAL_MEM)
+ return true;
+#else
+ return m_device_info.local_mem_type ==
+ cl::sycl::info::local_mem_type::local;
+#endif
+ }
+
+ EIGEN_STRONG_INLINE unsigned long max_buffer_size() const {
+ return m_device_info.max_mem_alloc_size;
+ }
+
EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const {
- return m_queue.get_device(). template get_info<cl::sycl::info::device::max_compute_units>();
+ return m_device_info.max_compute_units;
}
EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const {
- return m_queue.get_device(). template get_info<cl::sycl::info::device::max_work_group_size>();
+ return m_device_info.max_work_group_size;
+ }
+
+ EIGEN_STRONG_INLINE cl::sycl::id<3> maxWorkItemSizes() const {
+ return m_device_info.max_work_item_sizes;
}
/// No need for sycl it should act the same as CPU version
EIGEN_STRONG_INLINE int majorDeviceVersion() const { return 1; }
EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const {
- // OpenCL doesn't have such concept
+ // OpenCL does not have such a concept
return 2;
}
EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const {
- return m_queue.get_device(). template get_info<cl::sycl::info::device::local_mem_size>();
+ return m_device_info.local_mem_size;
}
- EIGEN_STRONG_INLINE cl::sycl::queue& sycl_queue() const { return m_queue;}
+ // This function returns the nearest power-of-two work-group size that is
+ // <= the maximum device work-group size.
+ EIGEN_STRONG_INLINE size_t getNearestPowerOfTwoWorkGroupSize() const {
+ return getPowerOfTwo(m_device_info.max_work_group_size, false);
+ }
+
+ EIGEN_STRONG_INLINE std::string getPlatformName() const {
+ return m_device_info.platform_name;
+ }
+
+ EIGEN_STRONG_INLINE std::string getDeviceName() const {
+ return m_device_info.device_name;
+ }
+
+ EIGEN_STRONG_INLINE std::string getDeviceVendor() const {
+ return m_device_info.device_vendor;
+ }
+
+ // This function returns the nearest power of 2:
+ // if roundUp is true the result is >= wGSize,
+ // otherwise the result is <= wGSize.
+ EIGEN_STRONG_INLINE size_t getPowerOfTwo(size_t wGSize, bool roundUp) const {
+ if (roundUp) --wGSize;
+ wGSize |= (wGSize >> 1);
+ wGSize |= (wGSize >> 2);
+ wGSize |= (wGSize >> 4);
+ wGSize |= (wGSize >> 8);
+ wGSize |= (wGSize >> 16);
+#if EIGEN_ARCH_x86_64 || EIGEN_ARCH_ARM64 || EIGEN_OS_WIN64
+ wGSize |= (wGSize >> 32);
+#endif
+ return ((!roundUp) ? (wGSize - (wGSize >> 1)) : ++wGSize);
+ }
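The shift-or cascade smears the highest set bit into every lower position, leaving a value of the form 2^k - 1 before the final step. For example:

    size_t down = queue_interface.getPowerOfTwo(1000, /*roundUp=*/false);
    // 1000 -> cascade -> 1023; 1023 - (1023 >> 1) == 512 (largest 2^k <= 1000)
    size_t up = queue_interface.getPowerOfTwo(1000, /*roundUp=*/true);
    // --1000 -> 999 -> cascade -> 1023; ++ -> 1024      (smallest 2^k >= 1000)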
+
+ EIGEN_STRONG_INLINE cl::sycl::queue &sycl_queue() const { return m_queue; }
// This function checks if the runtime recorded an error for the
// underlying stream device.
EIGEN_STRONG_INLINE bool ok() const {
if (!exception_caught_) {
- m_queue.wait_and_throw();
+ synchronize();
}
return !exception_caught_;
}
+ EIGEN_STRONG_INLINE cl::sycl::event get_latest_event() const {
+#ifdef EIGEN_SYCL_STORE_LATEST_EVENT
+ std::lock_guard<std::mutex> lock(event_mutex_);
+ return latest_events_[std::this_thread::get_id()];
+#else
+ eigen_assert(false);
+ return cl::sycl::event();
+#endif
+ }
+
// destructor
- ~QueueInterface() { buffer_map.clear(); }
+ ~QueueInterface() {
+ pMapper.clear();
+#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
+ scratch_buffers.clear();
+#endif
+ }
+
+ protected:
+ EIGEN_STRONG_INLINE void set_latest_event(cl::sycl::event e) const {
+#ifdef EIGEN_SYCL_STORE_LATEST_EVENT
+ std::lock_guard<std::mutex> lock(event_mutex_);
+ latest_events_[std::this_thread::get_id()] = e;
+#else
+ EIGEN_UNUSED_VARIABLE(e);
+#endif
+ }
+
+ void synchronize_and_callback(cl::sycl::event e,
+ const std::function<void()> &callback) const {
+ set_latest_event(e);
+ if (callback) {
+ auto callback_ = [=]() {
+#ifdef EIGEN_EXCEPTIONS
+ cl::sycl::event(e).wait_and_throw();
+#else
+ cl::sycl::event(e).wait();
+#endif
+ callback();
+ };
+ m_thread_pool.Schedule(std::move(callback_));
+ } else {
+#ifdef EIGEN_EXCEPTIONS
+ m_queue.wait_and_throw();
+#else
+ m_queue.wait();
+#endif
+ }
+ }
+
+ bool sycl_async_handler(cl::sycl::exception_list l) const {
+ bool exception_caught = false;
+ for (const auto &e : l) {
+ if (e) {
+ exception_caught = true;
+#ifdef EIGEN_EXCEPTIONS
+ try {
+ std::rethrow_exception(e);
+ } catch (const cl::sycl::exception &e) {
+ std::cerr << e.what() << std::endl;
+ }
+#else
+ std::cerr << "Error detected inside Sycl device." << std::endl;
+ abort();
+#endif
+ }
+ }
+ return exception_caught;
+ }
-private:
/// class members:
bool exception_caught_ = false;
- mutable std::mutex mutex_;
+ mutable std::mutex pmapper_mutex_;
+
+#ifdef EIGEN_SYCL_STORE_LATEST_EVENT
+ mutable std::mutex event_mutex_;
+ mutable std::unordered_map<std::thread::id, cl::sycl::event> latest_events_;
+#endif
/// std::map is the container used to make sure that we create only one buffer
- /// per pointer. The lifespan of the buffer now depends on the lifespan of SyclDevice.
- /// If a non-read-only pointer is needed to be accessed on the host we should manually deallocate it.
- mutable std::map<const uint8_t *, cl::sycl::buffer<uint8_t, 1, SyclAllocator<uint8_t, EIGEN_MAX_ALIGN_BYTES> > > buffer_map;
+ /// per pointer. The lifespan of the buffer now depends on the lifespan of
+ /// SyclDevice. If a non-read-only pointer needs to be accessed on the
+ /// host, we should deallocate it manually.
+ mutable TensorSycl::internal::PointerMapper pMapper;
+#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
+ mutable std::unordered_set<void *> scratch_buffers;
+#endif
/// sycl queue
mutable cl::sycl::queue m_queue;
+#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
+ mutable cl::sycl::program m_prog;
+#endif
- EIGEN_STRONG_INLINE std::map<const uint8_t *, cl::sycl::buffer<uint8_t,1, SyclAllocator<uint8_t, EIGEN_MAX_ALIGN_BYTES> > >::iterator find_buffer(const void* ptr) const {
- std::lock_guard<std::mutex> lock(mutex_);
- auto it1 = buffer_map.find(static_cast<const uint8_t*>(ptr));
- if (it1 != buffer_map.end()){
- return it1;
- }
- else{
- for(std::map<const uint8_t *, cl::sycl::buffer<uint8_t,1, SyclAllocator<uint8_t, EIGEN_MAX_ALIGN_BYTES> > >::iterator it=buffer_map.begin(); it!=buffer_map.end(); ++it){
- auto size = it->second.get_size();
- if((it->first < (static_cast<const uint8_t*>(ptr))) && ((static_cast<const uint8_t*>(ptr)) < (it->first + size)) ) return it;
- }
- }
- std::cerr << "No sycl buffer found. Make sure that you have allocated memory for your buffer by calling malloc-ed function."<< std::endl;
- abort();
+ /// The thread pool is used to wait on events and call callbacks
+ /// asynchronously
+ mutable Eigen::ThreadPool m_thread_pool;
+
+ const TensorSycl::internal::SyclDeviceInfo m_device_info;
+};
+
+struct SyclDeviceBase {
+ /// QueueInterface is not owned; it is the caller's responsibility to
+ /// destroy it.
+ const QueueInterface *m_queue_stream;
+ explicit SyclDeviceBase(const QueueInterface *queue_stream)
+ : m_queue_stream(queue_stream) {}
+ EIGEN_STRONG_INLINE const QueueInterface *queue_stream() const {
+ return m_queue_stream;
}
};
-// Here is a sycl deviuce struct which accept the sycl queue interface
+// Here is a sycl device struct which accepts the sycl queue interface
// as an input
-struct SyclDevice {
- // class member.
- QueueInterface* m_queue_stream;
- /// QueueInterface is not owned. it is the caller's responsibility to destroy it.
- explicit SyclDevice(QueueInterface* queue_stream) : m_queue_stream(queue_stream){}
+struct SyclDevice : public SyclDeviceBase {
+ explicit SyclDevice(const QueueInterface *queue_stream)
+ : SyclDeviceBase(queue_stream) {}
+
+ // this is the accessor used to construct the evaluator
+ template <cl::sycl::access::mode AcMd, typename T>
+ EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<AcMd, T>
+ get_range_accessor(const void *ptr) const {
+ return queue_stream()->template get_range_accessor<AcMd, T>(ptr);
+ }
// get sycl accessor
- template <cl::sycl::access::mode AcMd> EIGEN_STRONG_INLINE cl::sycl::accessor<uint8_t, 1, AcMd, cl::sycl::access::target::global_buffer>
- get_sycl_accessor(cl::sycl::handler &cgh, const void* ptr) const {
- return m_queue_stream->template get_sycl_accessor<AcMd>(cgh, ptr);
+ template <cl::sycl::access::mode AcMd>
+ EIGEN_STRONG_INLINE cl::sycl::accessor<
+ buffer_scalar_t, 1, AcMd, cl::sycl::access::target::global_buffer>
+ get_sycl_accessor(cl::sycl::handler &cgh, const void *ptr) const {
+ return queue_stream()->template get_sycl_accessor<AcMd>(cgh, ptr);
}
/// Accessing the created sycl device buffer for the device pointer
- EIGEN_STRONG_INLINE cl::sycl::buffer<uint8_t, 1, SyclAllocator<uint8_t, EIGEN_MAX_ALIGN_BYTES> >& get_sycl_buffer(const void * ptr) const {
- return m_queue_stream->get_sycl_buffer(ptr);
+ EIGEN_STRONG_INLINE cl::sycl::buffer<buffer_scalar_t, 1> get_sycl_buffer(
+ const void *ptr) const {
+ return queue_stream()->get_sycl_buffer(ptr);
+ }
+
+ /// This is used to prepare the number of threads and also the number of
+ /// threads per block for sycl kernels
+ template <typename Index>
+ EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize,
+ Index &rng, Index &GRange) const {
+ queue_stream()->parallel_for_setup(n, tileSize, rng, GRange);
+ }
+
+ /// This is used to prepare the number of threads and also the number of
+ /// threads per block for sycl kernels
+ template <typename Index>
+ EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1,
+ Index &tileSize0,
+ Index &tileSize1, Index &rng0,
+ Index &rng1, Index &GRange0,
+ Index &GRange1) const {
+ queue_stream()->parallel_for_setup(dim0, dim1, tileSize0, tileSize1, rng0,
+ rng1, GRange0, GRange1);
+ }
+
+ /// This is used to prepare the number of threads and also the number of
+ /// threads per block for sycl kernels
+ template <typename Index>
+ EIGEN_STRONG_INLINE void parallel_for_setup(
+ Index dim0, Index dim1, Index dim2, Index &tileSize0, Index &tileSize1,
+ Index &tileSize2, Index &rng0, Index &rng1, Index &rng2, Index &GRange0,
+ Index &GRange1, Index &GRange2) const {
+ queue_stream()->parallel_for_setup(dim0, dim1, dim2, tileSize0, tileSize1,
+ tileSize2, rng0, rng1, rng2, GRange0,
+ GRange1, GRange2);
}
- /// This is used to prepare the number of threads and also the number of threads per block for sycl kernels
- template<typename Index>
- EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize, Index &rng, Index &GRange) const {
- m_queue_stream->parallel_for_setup(n, tileSize, rng, GRange);
+ /// allocate device memory
+ EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const {
+ return queue_stream()->allocate(num_bytes);
}
- /// This is used to prepare the number of threads and also the number of threads per block for sycl kernels
- template<typename Index>
- EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1, Index &tileSize0, Index &tileSize1, Index &rng0, Index &rng1, Index &GRange0, Index &GRange1) const {
- m_queue_stream->parallel_for_setup(dim0, dim1, tileSize0, tileSize1, rng0, rng1, GRange0, GRange1);
+ EIGEN_STRONG_INLINE void *allocate_temp(size_t num_bytes) const {
+ return queue_stream()->allocate_temp(num_bytes);
}
- /// This is used to prepare the number of threads and also the number of threads per block for sycl kernels
- template<typename Index>
- EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1,Index dim2, Index &tileSize0, Index &tileSize1, Index &tileSize2, Index &rng0, Index &rng1, Index &rng2, Index &GRange0, Index &GRange1, Index &GRange2) const {
- m_queue_stream->parallel_for_setup(dim0, dim1, dim2, tileSize0, tileSize1, tileSize2, rng0, rng1, rng2, GRange0, GRange1, GRange2);
-
- }
- /// allocate device memory
- EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const {
- return m_queue_stream->allocate(num_bytes);
- }
/// deallocate device memory
EIGEN_STRONG_INLINE void deallocate(void *p) const {
- m_queue_stream->deallocate(p);
- }
+ queue_stream()->deallocate(p);
+ }
- // some runtime conditions that can be applied here
- EIGEN_STRONG_INLINE bool isDeviceSuitable() const { return true; }
+ EIGEN_STRONG_INLINE void deallocate_temp(void *buffer) const {
+ queue_stream()->deallocate_temp(buffer);
+ }
+ template <cl::sycl::access::mode AcMd, typename T>
+ EIGEN_STRONG_INLINE void deallocate_temp(
+ const TensorSycl::internal::RangeAccess<AcMd, T> &buffer) const {
+ queue_stream()->deallocate_temp(buffer);
+ }
+ EIGEN_STRONG_INLINE void deallocate_all() const {
+ queue_stream()->deallocate_all();
+ }
- /// the memcpy function
- template<typename Index> EIGEN_STRONG_INLINE void memcpy(void *dst, const Index *src, size_t n) const {
- m_queue_stream->memcpy(dst,src,n);
+ template <typename data_t>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<
+ cl::sycl::access::mode::read_write, data_t>
+ get(data_t *data) const {
+ return queue_stream()->get(data);
+ }
+ template <typename data_t>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE data_t *get(
+ TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write,
+ data_t>
+ data) const {
+ return queue_stream()->get(data);
}
+ /// attach existing buffer
+ EIGEN_STRONG_INLINE void *attach_buffer(
+ cl::sycl::buffer<buffer_scalar_t, 1> &buf) const {
+ return queue_stream()->attach_buffer(buf);
+ }
+ /// detach buffer
+ EIGEN_STRONG_INLINE void detach_buffer(void *p) const {
+ queue_stream()->detach_buffer(p);
+ }
EIGEN_STRONG_INLINE ptrdiff_t get_offset(const void *ptr) const {
- return m_queue_stream->get_offset(ptr);
+ return queue_stream()->get_offset(ptr);
+ }
+
+ // Runtime conditions for device suitability could be checked here
+ EIGEN_STRONG_INLINE bool isDeviceSuitable() const { return true; }
+ /// memcpyHostToDevice
+ template <typename Index>
+ EIGEN_STRONG_INLINE void memcpyHostToDevice(
+ Index *dst, const Index *src, size_t n,
+ std::function<void()> callback = {}) const {
+ queue_stream()->memcpyHostToDevice(dst, src, n, callback);
}
-// memcpyHostToDevice
- template<typename Index> EIGEN_STRONG_INLINE void memcpyHostToDevice(Index *dst, const Index *src, size_t n) const {
- m_queue_stream->memcpyHostToDevice(dst,src,n);
+ /// memcpyDeviceToHost
+ template <typename Index>
+ EIGEN_STRONG_INLINE void memcpyDeviceToHost(
+ void *dst, const Index *src, size_t n,
+ std::function<void()> callback = {}) const {
+ queue_stream()->memcpyDeviceToHost(dst, src, n, callback);
}
-/// here is the memcpyDeviceToHost
- template<typename Index> EIGEN_STRONG_INLINE void memcpyDeviceToHost(void *dst, const Index *src, size_t n) const {
- m_queue_stream->memcpyDeviceToHost(dst,src,n);
+ /// the memcpy function
+ template <typename Index>
+ EIGEN_STRONG_INLINE void memcpy(void *dst, const Index *src, size_t n) const {
+ queue_stream()->memcpy(dst, src, n);
}
- /// Here is the implementation of memset function on sycl.
+ /// the memset function
EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const {
- m_queue_stream->memset(data,c,n);
+ queue_stream()->memset(data, c, n);
}
/// returning the sycl queue
- EIGEN_STRONG_INLINE cl::sycl::queue& sycl_queue() const { return m_queue_stream->sycl_queue();}
-
- EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
- // FIXME
- return 48*1024;
+ EIGEN_STRONG_INLINE cl::sycl::queue &sycl_queue() const {
+ return queue_stream()->sycl_queue();
}
+#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
+ EIGEN_STRONG_INLINE cl::sycl::program &program() const {
+ return queue_stream()->program();
+ }
+#endif
+
+ EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { return 48 * 1024; }
EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
// We won't try to take advantage of the l2 cache for the time being, and
@@ -513,40 +894,64 @@ struct SyclDevice {
return firstLevelCacheSize();
}
EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const {
- return m_queue_stream->getNumSyclMultiProcessors();
+ return queue_stream()->getNumSyclMultiProcessors();
}
EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const {
- return m_queue_stream->maxSyclThreadsPerBlock();
+ return queue_stream()->maxSyclThreadsPerBlock();
+ }
+ EIGEN_STRONG_INLINE cl::sycl::id<3> maxWorkItemSizes() const {
+ return queue_stream()->maxWorkItemSizes();
}
EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const {
- // OpenCL doesn't have such concept
- return m_queue_stream->maxSyclThreadsPerMultiProcessor();
- // return stream_->deviceProperties().maxThreadsPerMultiProcessor;
+ // OpenCL does not have such a concept
+ return queue_stream()->maxSyclThreadsPerMultiProcessor();
}
EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const {
- return m_queue_stream->sharedMemPerBlock();
+ return queue_stream()->sharedMemPerBlock();
+ }
+ EIGEN_STRONG_INLINE size_t getNearestPowerOfTwoWorkGroupSize() const {
+ return queue_stream()->getNearestPowerOfTwoWorkGroupSize();
+ }
+
+ EIGEN_STRONG_INLINE size_t getPowerOfTwo(size_t val, bool roundUp) const {
+ return queue_stream()->getPowerOfTwo(val, roundUp);
}
/// No need for sycl it should act the same as CPU version
- EIGEN_STRONG_INLINE int majorDeviceVersion() const { return m_queue_stream->majorDeviceVersion(); }
+ EIGEN_STRONG_INLINE int majorDeviceVersion() const {
+ return queue_stream()->majorDeviceVersion();
+ }
EIGEN_STRONG_INLINE void synchronize() const {
- m_queue_stream->synchronize(); //pass
+ queue_stream()->synchronize();
}
-
- EIGEN_STRONG_INLINE void asynchronousExec() const {
- m_queue_stream->asynchronousExec();
+ EIGEN_STRONG_INLINE void async_synchronize(
+ cl::sycl::event e = cl::sycl::event()) const {
+ queue_stream()->async_synchronize(e);
+ }
+ EIGEN_STRONG_INLINE cl::sycl::event get_latest_event() const {
+ return queue_stream()->get_latest_event();
}
+
// This function checks if the runtime recorded an error for the
// underlying stream device.
- EIGEN_STRONG_INLINE bool ok() const {
- return m_queue_stream->ok();
+ EIGEN_STRONG_INLINE bool ok() const { return queue_stream()->ok(); }
+
+ EIGEN_STRONG_INLINE bool has_local_memory() const {
+ return queue_stream()->has_local_memory();
+ }
+ EIGEN_STRONG_INLINE long max_buffer_size() const {
+ return queue_stream()->max_buffer_size();
+ }
+ EIGEN_STRONG_INLINE std::string getPlatformName() const {
+ return queue_stream()->getPlatformName();
+ }
+ EIGEN_STRONG_INLINE std::string getDeviceName() const {
+ return queue_stream()->getDeviceName();
+ }
+ EIGEN_STRONG_INLINE std::string getDeviceVendor() const {
+ return queue_stream()->getDeviceVendor();
}
};
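Putting the pieces together, a hedged end-to-end sketch of how this device is meant to be driven (tensor sizes arbitrary; assumes EIGEN_USE_SYCL is defined and a SYCL implementation is available):

    #include <unsupported/Eigen/CXX11/Tensor>

    void scale_on_device() {
      Eigen::QueueInterface queue_interface(cl::sycl::default_selector());
      Eigen::SyclDevice device(&queue_interface);

      Eigen::Tensor<float, 1> in(64), out(64);
      in.setRandom();
      const size_t bytes = 64 * sizeof(float);
      float *gpu_in = static_cast<float *>(device.allocate(bytes));
      float *gpu_out = static_cast<float *>(device.allocate(bytes));
      Eigen::TensorMap<Eigen::Tensor<float, 1>> in_map(gpu_in, 64);
      Eigen::TensorMap<Eigen::Tensor<float, 1>> out_map(gpu_out, 64);

      device.memcpyHostToDevice(gpu_in, in.data(), bytes);
      out_map.device(device) = in_map * 2.0f;  // executes as a SYCL kernel
      device.memcpyDeviceToHost(out.data(), gpu_out, bytes);
      device.synchronize();
      device.deallocate(gpu_in);
      device.deallocate(gpu_out);
    }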
-// This is used as a distingushable device inside the kernel as the sycl device class is not Standard layout.
-// This is internal and must not be used by user. This dummy device allow us to specialise the tensor evaluator
-// inside the kernel. So we can have two types of eval for host and device. This is required for TensorArgMax operation
-struct SyclKernelDevice:DefaultDevice{};
-
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
index b43db40c8..ef22a268a 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
@@ -75,6 +75,11 @@ struct ThreadPoolDevice {
EIGEN_STRONG_INLINE void deallocate_temp(void* buffer) const {
deallocate(buffer);
}
+
+ template<typename Type>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Type get(Type data) const {
+ return data;
+ }
EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
#ifdef __ANDROID__
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
index 554ee5f59..576a4f3ec 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
@@ -42,7 +42,6 @@ struct traits<TensorEvalToOp<XprType, MakePointer_> >
// Intermediate typedef to workaround MSVC issue.
typedef MakePointer_<T> MakePointerT;
typedef typename MakePointerT::Type Type;
- typedef typename MakePointerT::RefType RefType;
};
@@ -103,7 +102,9 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType, MakePointer_>, Device>
typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
-
+ typedef typename Eigen::internal::traits<XprType>::PointerType TensorPointerType;
+ typedef StorageMemory<CoeffReturnType, Device> Storage;
+ typedef typename Storage::Type EvaluatorPointerType;
enum {
IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
@@ -115,22 +116,16 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType, MakePointer_>, Device>
};
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
- : m_impl(op.expression(), device), m_device(device),
- m_buffer(op.buffer()), m_op(op), m_expression(op.expression())
- { }
+ : m_impl(op.expression(), device), m_buffer(device.get(op.buffer())), m_expression(op.expression()){}
- // Used for accessor extraction in SYCL Managed TensorMap:
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const XprType& op() const {
- return m_op;
- }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ~TensorEvaluator() {
}
- typedef typename internal::traits<const TensorEvalToOp<ArgType, MakePointer_> >::template MakePointer<CoeffReturnType>::Type DevicePointer;
+
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(DevicePointer scalar) {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType scalar) {
EIGEN_UNUSED_VARIABLE(scalar);
eigen_assert(scalar == NULL);
return m_impl.evalSubExprsIfNeeded(m_buffer);
@@ -165,19 +160,20 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType, MakePointer_>, Device>
TensorOpCost(0, sizeof(CoeffReturnType), 0, vectorized, PacketSize);
}
- EIGEN_DEVICE_FUNC DevicePointer data() const { return m_buffer; }
+ EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_buffer; }
ArgType expression() const { return m_expression; }
+ #ifdef EIGEN_USE_SYCL
+ // binding placeholder accessors to a command group handler for SYCL
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+ m_impl.bind(cgh);
+ m_buffer.bind(cgh);
+ }
+ #endif
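To show how this hook is meant to be driven, a simplified executor-side sketch (evaluator, GRange, tileSize, and kernel_functor are placeholder names, not this patch's exact executor code):

    device.sycl_queue().submit([&](cl::sycl::handler &cgh) {
      evaluator.bind(cgh);  // registers every placeholder accessor in the tree
      cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange),
                                             cl::sycl::range<1>(tileSize)),
                       kernel_functor);
    });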
- /// required by sycl in order to extract the accessor
- const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
- /// added for sycl in order to construct the buffer from the sycl device
- const Device& device() const{return m_device;}
private:
TensorEvaluator<ArgType, Device> m_impl;
- const Device& m_device;
- DevicePointer m_buffer;
- const XprType& m_op;
+ EvaluatorPointerType m_buffer;
const ArgType m_expression;
};
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
index 7f0f4acbc..1d48b5eed 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
@@ -34,6 +34,9 @@ struct TensorEvaluator
typedef typename Derived::Dimensions Dimensions;
typedef Derived XprType;
static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+ typedef typename internal::traits<Derived>::template MakePointer<Scalar>::Type TensorPointerType;
+ typedef StorageMemory<Scalar, Device> Storage;
+ typedef typename Storage::Type EvaluatorPointerType;
// NumDimensions is -1 for variable dim tensors
static const int NumCoords = internal::traits<Derived>::NumDimensions > 0 ?
@@ -60,16 +63,17 @@ struct TensorEvaluator
TensorBlockWriter;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device)
- : m_data(const_cast<typename internal::traits<Derived>::template MakePointer<Scalar>::Type>(m.data())), m_dims(m.dimensions()), m_device(device), m_impl(m)
+ : m_data(device.get((const_cast<TensorPointerType>(m.data())))),
+ m_dims(m.dimensions()),
+ m_device(device)
{ }
- // Used for accessor extraction in SYCL Managed TensorMap:
- const Derived& derived() const { return m_impl; }
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dims; }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* dest) {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType dest) {
if (!NumTraits<typename internal::remove_const<Scalar>::type>::RequireInitialization && dest) {
- m_device.memcpy((void*)dest, m_data, sizeof(Scalar) * m_dims.TotalSize());
+ m_device.memcpy((void*)(m_device.get(dest)), m_device.get(m_data), m_dims.TotalSize() * sizeof(Scalar));
return false;
}
return true;
@@ -78,14 +82,12 @@ struct TensorEvaluator
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
- eigen_assert(m_data);
+ eigen_assert(m_data != NULL);
return m_data[index];
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- typename internal::traits<Derived>::template MakePointer<Scalar>::RefType
- coeffRef(Index index) {
- eigen_assert(m_data);
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) {
+ eigen_assert(m_data != NULL);
return m_data[index];
}
@@ -114,7 +116,7 @@ struct TensorEvaluator
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array<DenseIndex, NumCoords>& coords) const {
- eigen_assert(m_data);
+ eigen_assert(m_data != NULL);
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
return m_data[m_dims.IndexOfColMajor(coords)];
} else {
@@ -122,10 +124,9 @@ struct TensorEvaluator
}
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- typename internal::traits<Derived>::template MakePointer<Scalar>::RefType
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType&
coeffRef(const array<DenseIndex, NumCoords>& coords) {
- eigen_assert(m_data);
+ eigen_assert(m_data != NULL);
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
return m_data[m_dims.IndexOfColMajor(coords)];
} else {
@@ -152,16 +153,18 @@ struct TensorEvaluator
TensorBlockWriter::Run(block, m_data);
}
- EIGEN_DEVICE_FUNC typename internal::traits<Derived>::template MakePointer<Scalar>::Type data() const { return m_data; }
-
- /// required by sycl in order to construct sycl buffer from raw pointer
- const Device& device() const{return m_device;}
+ EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_data; }
+#ifdef EIGEN_USE_SYCL
+ // binding placeholder accessors to a command group handler for SYCL
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+ m_data.bind(cgh);
+ }
+#endif
protected:
- typename internal::traits<Derived>::template MakePointer<Scalar>::Type m_data;
+ EvaluatorPointerType m_data;
Dimensions m_dims;
- const Device& m_device;
- const Derived& m_impl;
+ const Device m_device;
};
namespace {
@@ -184,6 +187,13 @@ Eigen::half loadConstant(const Eigen::half* address) {
return Eigen::half(half_impl::raw_uint16_to_half(__ldg(&address->x)));
}
#endif
+#ifdef EIGEN_USE_SYCL
+// Overload of loadConstant for SYCL, implemented in terms of range accessors
+template <cl::sycl::access::mode AcMd, typename T>
+T &loadConstant(const Eigen::TensorSycl::internal::RangeAccess<AcMd, T> &address) {
+ return *address;
+}
+#endif
}
@@ -197,7 +207,9 @@ struct TensorEvaluator<const Derived, Device>
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
typedef typename Derived::Dimensions Dimensions;
typedef const Derived XprType;
-
+ typedef typename internal::traits<Derived>::template MakePointer<const Scalar>::Type TensorPointerType;
+ typedef StorageMemory<const Scalar, Device> Storage;
+ typedef typename Storage::Type EvaluatorPointerType;
// NumDimensions is -1 for variable dim tensors
static const int NumCoords = internal::traits<Derived>::NumDimensions > 0 ?
@@ -221,18 +233,15 @@ struct TensorEvaluator<const Derived, Device>
typename internal::remove_const<Scalar>::type, Index, NumCoords, Layout>
TensorBlockReader;
- // Used for accessor extraction in SYCL Managed TensorMap:
- const Derived& derived() const { return m_impl; }
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device)
- : m_data(m.data()), m_dims(m.dimensions()), m_device(device), m_impl(m)
+ : m_data(device.get(m.data())), m_dims(m.dimensions()), m_device(device)
{ }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dims; }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
if (!NumTraits<typename internal::remove_const<Scalar>::type>::RequireInitialization && data) {
- m_device.memcpy((void*)data, m_data, m_dims.TotalSize() * sizeof(Scalar));
+ m_device.memcpy((void*)(m_device.get(data)), m_device.get(m_data), m_dims.TotalSize() * sizeof(Scalar));
return false;
}
return true;
@@ -241,13 +250,8 @@ struct TensorEvaluator<const Derived, Device>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
- eigen_assert(m_data);
-#ifndef __SYCL_DEVICE_ONLY__
+ eigen_assert(m_data != NULL);
return loadConstant(m_data+index);
-#else
- CoeffReturnType tmp = m_data[index];
- return tmp;
-#endif
}
template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
@@ -269,7 +273,7 @@ struct TensorEvaluator<const Derived, Device>
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array<DenseIndex, NumCoords>& coords) const {
- eigen_assert(m_data);
+ eigen_assert(m_data != NULL);
const Index index = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? m_dims.IndexOfColMajor(coords)
: m_dims.IndexOfRowMajor(coords);
return loadConstant(m_data+index);
@@ -288,16 +292,17 @@ struct TensorEvaluator<const Derived, Device>
TensorBlockReader::Run(block, m_data);
}
- EIGEN_DEVICE_FUNC typename internal::traits<Derived>::template MakePointer<const Scalar>::Type data() const { return m_data; }
-
- /// added for sycl in order to construct the buffer from the sycl device
- const Device& device() const{return m_device;}
-
+ EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_data; }
+#ifdef EIGEN_USE_SYCL
+ // binding placeholder accessors to a command group handler for SYCL
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+ m_data.bind(cgh);
+ }
+#endif
protected:
- typename internal::traits<Derived>::template MakePointer<const Scalar>::Type m_data;
+ EvaluatorPointerType m_data;
Dimensions m_dims;
- const Device& m_device;
- const Derived& m_impl;
+ const Device m_device;
};
@@ -310,16 +315,6 @@ struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device>
{
typedef TensorCwiseNullaryOp<NullaryOp, ArgType> XprType;
- enum {
- IsAligned = true,
- PacketAccess = internal::functor_traits<NullaryOp>::PacketAccess,
- BlockAccess = false,
- PreferBlockAccess = false,
- Layout = TensorEvaluator<ArgType, Device>::Layout,
- CoordAccess = false, // to be implemented
- RawAccess = false
- };
-
EIGEN_DEVICE_FUNC
TensorEvaluator(const XprType& op, const Device& device)
: m_functor(op.functor()), m_argImpl(op.nestedExpression(), device), m_wrapper()
@@ -331,10 +326,26 @@ struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device>
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
+ typedef StorageMemory<CoeffReturnType, Device> Storage;
+ typedef typename Storage::Type EvaluatorPointerType;
+
+ enum {
+ IsAligned = true,
+ PacketAccess = internal::functor_traits<NullaryOp>::PacketAccess
+ #ifdef EIGEN_USE_SYCL
+ && (PacketType<CoeffReturnType, Device>::size > 1)
+ #endif
+ ,
+ BlockAccess = false,
+ PreferBlockAccess = false,
+ Layout = TensorEvaluator<ArgType, Device>::Layout,
+ CoordAccess = false, // to be implemented
+ RawAccess = false
+ };
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { return true; }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { return true; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { }
EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
@@ -354,13 +365,14 @@ struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device>
PacketType<CoeffReturnType, Device>::size);
}
- EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; }
-
- /// required by sycl in order to extract the accessor
- const TensorEvaluator<ArgType, Device>& impl() const { return m_argImpl; }
- /// required by sycl in order to extract the accessor
- NullaryOp functor() const { return m_functor; }
+ EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
+#ifdef EIGEN_USE_SYCL
+ // binding placeholder accessors to a command group handler for SYCL
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+ m_argImpl.bind(cgh);
+ }
+#endif
private:
const NullaryOp m_functor;
@@ -401,14 +413,15 @@ struct TensorEvaluator<const TensorCwiseUnaryOp<UnaryOp, ArgType>, Device>
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
-
+ typedef StorageMemory<CoeffReturnType, Device> Storage;
+ typedef typename Storage::Type EvaluatorPointerType;
static const int NumDims = internal::array_size<Dimensions>::value;
typedef internal::TensorBlock<ScalarNoConst, Index, NumDims, Layout>
TensorBlock;
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
m_argImpl.evalSubExprsIfNeeded(NULL);
return true;
}
@@ -456,16 +469,18 @@ struct TensorEvaluator<const TensorCwiseUnaryOp<UnaryOp, ArgType>, Device>
arg_block.data());
}
- EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; }
+ EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
- /// required by sycl in order to extract the accessor
- const TensorEvaluator<ArgType, Device> & impl() const { return m_argImpl; }
- /// added for sycl in order to construct the buffer from sycl device
- UnaryOp functor() const { return m_functor; }
+#ifdef EIGEN_USE_SYCL
+ // binding placeholder accessors to a command group handler for SYCL
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+ m_argImpl.bind(cgh);
+ }
+#endif
private:
- const Device& m_device;
+ const Device m_device;
const UnaryOp m_functor;
TensorEvaluator<ArgType, Device> m_argImpl;
};
@@ -509,6 +524,8 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArg
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
typedef typename TensorEvaluator<LeftArgType, Device>::Dimensions Dimensions;
+ typedef StorageMemory<CoeffReturnType, Device> Storage;
+ typedef typename Storage::Type EvaluatorPointerType;
static const int NumDims = internal::array_size<
typename TensorEvaluator<LeftArgType, Device>::Dimensions>::value;
@@ -524,7 +541,7 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArg
return m_leftImpl.dimensions();
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
m_leftImpl.evalSubExprsIfNeeded(NULL);
m_rightImpl.evalSubExprsIfNeeded(NULL);
return true;
@@ -576,16 +593,17 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArg
right_block.block_strides(), right_block.data());
}
- EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; }
- /// required by sycl in order to extract the accessor
- const TensorEvaluator<LeftArgType, Device>& left_impl() const { return m_leftImpl; }
- /// required by sycl in order to extract the accessor
- const TensorEvaluator<RightArgType, Device>& right_impl() const { return m_rightImpl; }
- /// required by sycl in order to extract the accessor
- BinaryOp functor() const { return m_functor; }
+ EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
+ #ifdef EIGEN_USE_SYCL
+ // binding placeholder accessors to a command group handler for SYCL
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+ m_leftImpl.bind(cgh);
+ m_rightImpl.bind(cgh);
+ }
+ #endif
private:
- const Device& m_device;
+ const Device m_device;
const BinaryOp m_functor;
TensorEvaluator<LeftArgType, Device> m_leftImpl;
TensorEvaluator<RightArgType, Device> m_rightImpl;
@@ -639,6 +657,8 @@ struct TensorEvaluator<const TensorCwiseTernaryOp<TernaryOp, Arg1Type, Arg2Type,
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
typedef typename TensorEvaluator<Arg1Type, Device>::Dimensions Dimensions;
+ typedef StorageMemory<CoeffReturnType, Device> Storage;
+ typedef typename Storage::Type EvaluatorPointerType;
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const
{
@@ -646,7 +666,7 @@ struct TensorEvaluator<const TensorCwiseTernaryOp<TernaryOp, Arg1Type, Arg2Type,
return m_arg1Impl.dimensions();
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
m_arg1Impl.evalSubExprsIfNeeded(NULL);
m_arg2Impl.evalSubExprsIfNeeded(NULL);
m_arg3Impl.evalSubExprsIfNeeded(NULL);
@@ -679,14 +699,16 @@ struct TensorEvaluator<const TensorCwiseTernaryOp<TernaryOp, Arg1Type, Arg2Type,
TensorOpCost(0, 0, functor_cost, vectorized, PacketSize);
}
- EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; }
+ EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
- /// required by sycl in order to extract the accessor
- const TensorEvaluator<Arg1Type, Device> & arg1Impl() const { return m_arg1Impl; }
- /// required by sycl in order to extract the accessor
- const TensorEvaluator<Arg2Type, Device>& arg2Impl() const { return m_arg2Impl; }
- /// required by sycl in order to extract the accessor
- const TensorEvaluator<Arg3Type, Device>& arg3Impl() const { return m_arg3Impl; }
+#ifdef EIGEN_USE_SYCL
+ // binding placeholder accessors to a command group handler for SYCL
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+ m_arg1Impl.bind(cgh);
+ m_arg2Impl.bind(cgh);
+ m_arg3Impl.bind(cgh);
+ }
+#endif
private:
const TernaryOp m_functor;
@@ -731,6 +753,8 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
typedef typename TensorEvaluator<IfArgType, Device>::Dimensions Dimensions;
+ typedef StorageMemory<CoeffReturnType, Device> Storage;
+ typedef typename Storage::Type EvaluatorPointerType;
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const
{
@@ -738,7 +762,7 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>
return m_condImpl.dimensions();
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
m_condImpl.evalSubExprsIfNeeded(NULL);
m_thenImpl.evalSubExprsIfNeeded(NULL);
m_elseImpl.evalSubExprsIfNeeded(NULL);
@@ -757,13 +781,15 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>
template<int LoadMode>
EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const
{
- internal::Selector<PacketSize> select;
- for (Index i = 0; i < PacketSize; ++i) {
- select.select[i] = m_condImpl.coeff(index+i);
- }
- return internal::pblend(select,
- m_thenImpl.template packet<LoadMode>(index),
- m_elseImpl.template packet<LoadMode>(index));
+ internal::Selector<PacketSize> select;
+ EIGEN_UNROLL_LOOP
+ for (Index i = 0; i < PacketSize; ++i) {
+ select.select[i] = m_condImpl.coeff(index+i);
+ }
+ return internal::pblend(select,
+ m_thenImpl.template packet<LoadMode>(index),
+ m_elseImpl.template packet<LoadMode>(index));
+
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
@@ -773,14 +799,16 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>
.cwiseMax(m_elseImpl.costPerCoeff(vectorized));
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; }
- /// required by sycl in order to extract the accessor
- const TensorEvaluator<IfArgType, Device> & cond_impl() const { return m_condImpl; }
- /// required by sycl in order to extract the accessor
- const TensorEvaluator<ThenArgType, Device>& then_impl() const { return m_thenImpl; }
- /// required by sycl in order to extract the accessor
- const TensorEvaluator<ElseArgType, Device>& else_impl() const { return m_elseImpl; }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return NULL; }
+#ifdef EIGEN_USE_SYCL
+ // binding placeholder accessors to a command group handler for SYCL
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+ m_condImpl.bind(cgh);
+ m_thenImpl.bind(cgh);
+ m_elseImpl.bind(cgh);
+ }
+#endif
private:
TensorEvaluator<IfArgType, Device> m_condImpl;
TensorEvaluator<ThenArgType, Device> m_thenImpl;
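
The recurring bind() pattern above replaces the old per-node accessor getters (impl(), left_impl(), functor(), ...): instead of the executor pulling pieces out of each evaluator, every node forwards the command group handler down the expression tree. A minimal standalone sketch of that recursion, using Handler, LeafEvaluator and BinaryEvaluator as illustrative stand-ins for cl::sycl::handler and Eigen's evaluator nodes:

#include <iostream>

struct Handler {};  // stands in for cl::sycl::handler

struct LeafEvaluator {
  void bind(Handler& cgh) const {
    // a real leaf would call m_data.bind(cgh) on its placeholder accessor
    (void)cgh;
    std::cout << "leaf bound\n";
  }
};

template <typename Lhs, typename Rhs>
struct BinaryEvaluator {
  Lhs m_leftImpl;
  Rhs m_rightImpl;
  void bind(Handler& cgh) const {  // mirrors the #ifdef EIGEN_USE_SYCL bind above
    m_leftImpl.bind(cgh);
    m_rightImpl.bind(cgh);
  }
};

int main() {
  BinaryEvaluator<LeafEvaluator,
                  BinaryEvaluator<LeafEvaluator, LeafEvaluator> > root;
  Handler cgh;
  root.bind(cgh);  // one call at the root binds all three leaves
}
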
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index 7b5842571..47e9b24ec 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -442,12 +442,133 @@ EIGEN_STRONG_INLINE void TensorExecutor<Expression, GpuDevice, Vectorizable, Til
// SYCL Executor policy
#ifdef EIGEN_USE_SYCL
-template <typename Expression, bool Vectorizable>
-class TensorExecutor<Expression, SyclDevice, Vectorizable> {
-public:
- static EIGEN_STRONG_INLINE void run(const Expression &expr, const SyclDevice &device) {
- // call TensorSYCL module
- TensorSycl::run(expr, device);
+template <bool Vectorizable, typename Evaluator>
+struct ExecExprFunctorKernel_impl {
+ typedef typename Evaluator::Index Index;
+ const Index range;
+ const Index vectorizable_threads;
+ Evaluator evaluator;
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE ExecExprFunctorKernel_impl(
+ const Index range_, const Index vectorizable_threads_,
+ Evaluator evaluator_)
+ : range(range_), vectorizable_threads(vectorizable_threads_),
+ evaluator(evaluator_) {}
+
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void
+ operator()(cl::sycl::nd_item<1> itemID) {
+ Index gId = static_cast<Index>(itemID.get_global_linear_id());
+ Index total_threads = itemID.get_global_range(0);
+ EIGEN_UNROLL_LOOP
+ for (Index i = gId; i < range; i += total_threads) {
+ evaluator.evalScalar(i);
+ }
+ }
+};
+
+template <typename Evaluator>
+struct ExecExprFunctorKernel_impl<true, Evaluator> {
+ typedef typename Evaluator::Index Index;
+ const Index range;
+ const Index vectorizable_threads;
+ Evaluator evaluator;
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE ExecExprFunctorKernel_impl(
+ const Index range_, const Index vectorizable_threads_,
+ Evaluator evaluator_)
+ : range(range_), vectorizable_threads(vectorizable_threads_),
+ evaluator(evaluator_) {}
+
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void
+ operator()(cl::sycl::nd_item<1> itemID) {
+ Index gId = static_cast<Index>(itemID.get_global_linear_id());
+ if (gId < vectorizable_threads) {
+ const Index PacketSize = Eigen::internal::unpacket_traits<
+ typename Evaluator::PacketReturnType>::size;
+ evaluator.evalPacket(gId * PacketSize);
+ gId += (vectorizable_threads * PacketSize);
+ EIGEN_UNROLL_LOOP
+ for (Index i = gId; i < range; i += vectorizable_threads) {
+ evaluator.evalScalar(i);
+ }
+ }
+ }
+};
+
+template <typename Expr, bool NonZeroVectoriseSize, typename Evaluator>
+struct ExecExprFunctorKernel
+ : ExecExprFunctorKernel_impl<
+ ::Eigen::internal::IsVectorizable<Eigen::SyclDevice, Expr>::value,
+ Evaluator> {
+ ExecExprFunctorKernel(const Index range_, const Index vectorizable_threads_,
+ const Evaluator &evaluator)
+ : ExecExprFunctorKernel_impl<
+ ::Eigen::internal::IsVectorizable<Eigen::SyclDevice, Expr>::value,
+ Evaluator>(range_, vectorizable_threads_, evaluator) {}
+};
+
+template <typename Expr, typename Evaluator>
+struct ExecExprFunctorKernel<Expr, false, Evaluator>
+ : ExecExprFunctorKernel_impl<false, Evaluator> {
+ ExecExprFunctorKernel(const Index range_, const Index vectorizable_threads_,
+ const Evaluator &evaluator)
+ : ExecExprFunctorKernel_impl<false, Evaluator>(
+ range_, vectorizable_threads_, evaluator) {}
+};
+
+template <typename Expression, bool Vectorizable, bool Tileable>
+class TensorExecutor<Expression, Eigen::SyclDevice, Vectorizable, Tileable> {
+ public:
+ typedef typename Expression::Index Index;
+ static EIGEN_STRONG_INLINE void run(const Expression &expr, const Eigen::SyclDevice &dev) {
+ Eigen::TensorEvaluator<Expression, Eigen::SyclDevice> evaluator(expr, dev);
+ const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
+ if (needs_assign) {
+ Index range, GRange, tileSize;
+ Index total_size = ::Eigen::internal::array_prod(evaluator.dimensions());
+ total_size = (total_size == 0) ? 1 : total_size;
+ const int PacketSize = Eigen::PacketType<
+ typename Eigen::TensorEvaluator<Expression, Eigen::SyclDevice>::CoeffReturnType,
+ Eigen::SyclDevice>::size;
+ Index vectorizable_threads =
+ static_cast<Index>(total_size / PacketSize);
+ dev.parallel_for_setup(vectorizable_threads, tileSize, range, GRange);
+ range = total_size;
+ auto f = [&](cl::sycl::handler &cgh) {
+ evaluator.bind(cgh);
+ typedef ExecExprFunctorKernel<Expression, true,
+ Eigen::TensorEvaluator<Expression, Eigen::SyclDevice>>
+ conditional_vectorized_kernel;
+
+ typedef ExecExprFunctorKernel<Expression, false,
+ Eigen::TensorEvaluator<Expression, Eigen::SyclDevice>>
+ non_vectorized_kernel;
+// This is to make sure that an expression with a size smaller than the
+// vectorized size will not call the vectorized kernel.
+// The reason for having both kernels is that the Vectorizable parameter is
+// a compile-time constant, whereas the size of a tensor is only known at
+// run time.
+ (vectorizable_threads)
+ ? cgh.parallel_for(
+#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
+ dev.program().template get_kernel<conditional_vectorized_kernel>(),
+#endif
+ cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange),
+ cl::sycl::range<1>(tileSize)),
+ conditional_vectorized_kernel(range, vectorizable_threads,
+ evaluator))
+ : cgh.parallel_for(
+#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
+ dev.program().template get_kernel<non_vectorized_kernel>(),
+#endif
+ cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange),
+ cl::sycl::range<1>(tileSize)),
+ non_vectorized_kernel(range, vectorizable_threads,
+ evaluator));
+ };
+ cl::sycl::event e;
+ EIGEN_SYCL_TRY_CATCH(e = dev.sycl_queue().submit(f));
+ dev.async_synchronize(e);
+ }
+ evaluator.cleanup();
}
};
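
A simplified host-side sketch of the dispatch the executor performs above, assuming a packet size of 4: when at least one full packet fits, the vectorized path evaluates whole packets and mops up the remainder with scalar evaluations; otherwise everything goes through the scalar loop. evalPacket/evalScalar are hypothetical stand-ins for the evaluator calls issued inside the SYCL kernels, and the serial loops stand in for the nd_range threads:

#include <cstdio>

const int kPacketSize = 4;

void evalScalar(int i) { std::printf("scalar %d\n", i); }
void evalPacket(int i) { std::printf("packet %d..%d\n", i, i + kPacketSize - 1); }

void run(int total_size) {
  const int vectorizable_threads = total_size / kPacketSize;
  if (vectorizable_threads) {  // same runtime test as the ternary above
    for (int t = 0; t < vectorizable_threads; ++t)
      evalPacket(t * kPacketSize);
    // scalar tail, as in the second loop of the vectorized kernel
    for (int i = vectorizable_threads * kPacketSize; i < total_size; ++i)
      evalScalar(i);
  } else {
    for (int i = 0; i < total_size; ++i) evalScalar(i);  // non-vectorized kernel
  }
}

int main() {
  run(10);  // two packets plus two scalar coefficients
  run(3);   // too small for a packet: scalar kernel only
}
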
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h
index 4b6540c07..c9bccfc66 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h
@@ -89,7 +89,10 @@ struct traits<TensorCwiseUnaryOp<UnaryOp, XprType> >
typedef typename remove_reference<XprTypeNested>::type _XprTypeNested;
static const int NumDimensions = XprTraits::NumDimensions;
static const int Layout = XprTraits::Layout;
- typedef typename TypeConversion<Scalar, typename XprTraits::PointerType>::type PointerType;
+ typedef typename TypeConversion<Scalar,
+ typename XprTraits::PointerType
+ >::type
+ PointerType;
};
template<typename UnaryOp, typename XprType>
@@ -164,9 +167,10 @@ struct traits<TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType> >
static const int Layout = XprTraits::Layout;
typedef typename TypeConversion<Scalar,
typename conditional<Pointer_type_promotion<typename LhsXprType::Scalar, Scalar>::val,
- typename traits<LhsXprType>::PointerType,
- typename traits<RhsXprType>::PointerType>::type
- >::type PointerType;
+ typename traits<LhsXprType>::PointerType,
+ typename traits<RhsXprType>::PointerType>::type
+ >::type
+ PointerType;
enum {
Flags = 0
};
@@ -245,9 +249,10 @@ struct traits<TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprT
static const int Layout = XprTraits::Layout;
typedef typename TypeConversion<Scalar,
typename conditional<Pointer_type_promotion<typename Arg2XprType::Scalar, Scalar>::val,
- typename traits<Arg2XprType>::PointerType,
- typename traits<Arg3XprType>::PointerType>::type
- >::type PointerType;
+ typename traits<Arg2XprType>::PointerType,
+ typename traits<Arg3XprType>::PointerType>::type
+ >::type
+ PointerType;
enum {
Flags = 0
};
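
The reindented typedefs above encode a small trait computation: a binary (or ternary) expression's PointerType follows whichever operand's scalar type matches the result scalar. A compact sketch of the same selection with simplified stand-ins for Eigen's traits:

#include <type_traits>

template <typename A, typename B>
struct Pointer_type_promotion { static const bool val = false; };
template <typename A>
struct Pointer_type_promotion<A, A> { static const bool val = true; };

template <typename LhsScalar, typename RhsScalar, typename ResultScalar>
struct BinaryPointerType {
  // pick the left-hand pointer type when its scalar matches the result
  typedef typename std::conditional<
      Pointer_type_promotion<LhsScalar, ResultScalar>::val,
      LhsScalar*, RhsScalar*>::type type;
};

static_assert(std::is_same<BinaryPointerType<float, double, float>::type,
                           float*>::value, "lhs scalar matches the result");
static_assert(std::is_same<BinaryPointerType<int, double, double>::type,
                           double*>::value, "falls back to the rhs side");

int main() {}
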
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h
index 480cf1f39..8d1a6d9cc 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h
@@ -131,6 +131,8 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D
typedef OutputScalar CoeffReturnType;
typedef typename PacketType<OutputScalar, Device>::type PacketReturnType;
static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+ typedef StorageMemory<CoeffReturnType, Device> Storage;
+ typedef typename Storage::Type EvaluatorPointerType;
enum {
IsAligned = false,
@@ -167,13 +169,13 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D
return m_dimensions;
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(OutputScalar* data) {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
m_impl.evalSubExprsIfNeeded(NULL);
if (data) {
evalToBuf(data);
return false;
} else {
- m_data = (CoeffReturnType*)m_device.allocate(sizeof(CoeffReturnType) * m_size);
+ m_data = (EvaluatorPointerType)m_device.get((CoeffReturnType*)(m_device.allocate_temp(sizeof(CoeffReturnType) * m_size)));
evalToBuf(m_data);
return true;
}
@@ -202,11 +204,16 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D
return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
}
- EIGEN_DEVICE_FUNC Scalar* data() const { return m_data; }
-
+ EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_data; }
+#ifdef EIGEN_USE_SYCL
+ // binding placeholder accessors to a command group handler for SYCL
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+ m_data.bind(cgh);
+ }
+#endif
private:
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalToBuf(OutputScalar* data) {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalToBuf(EvaluatorPointerType data) {
const bool write_to_out = internal::is_same<OutputScalar, ComplexScalar>::value;
ComplexScalar* buf = write_to_out ? (ComplexScalar*)data : (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * m_size);
@@ -576,12 +583,12 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D
protected:
Index m_size;
- const FFT& m_fft;
+ const FFT EIGEN_DEVICE_REF m_fft;
Dimensions m_dimensions;
array<Index, NumDims> m_strides;
TensorEvaluator<ArgType, Device> m_impl;
- CoeffReturnType* m_data;
- const Device& m_device;
+ EvaluatorPointerType m_data;
+ const Device EIGEN_DEVICE_REF m_device;
// This will support a maximum FFT size of 2^32 for each dimension
// m_sin_PI_div_n_LUT[i] = (-2) * std::sin(M_PI / std::pow(2,i)) ^ 2;
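
The FFT hunks above follow the evalSubExprsIfNeeded() contract: write into the caller's buffer and return false when one is supplied, otherwise allocate temporary storage, return true, and release it in cleanup(). A minimal standalone sketch of that contract, with Device and a float payload as illustrative stand-ins:

#include <cassert>
#include <cstdlib>

struct Device {
  void* allocate_temp(std::size_t n) const { return std::malloc(n); }
  void deallocate_temp(void* p) const { std::free(p); }
};

struct Evaluator {
  const Device& m_device;
  float* m_data;
  static const int m_size = 8;

  explicit Evaluator(const Device& d) : m_device(d), m_data(NULL) {}

  bool evalSubExprsIfNeeded(float* dest) {
    if (dest) {
      evalToBuf(dest);
      return false;  // result lives in the caller's buffer
    }
    m_data = static_cast<float*>(m_device.allocate_temp(m_size * sizeof(float)));
    evalToBuf(m_data);
    return true;     // caller reads through coeff() and must call cleanup()
  }
  void evalToBuf(float* buf) const {
    for (int i = 0; i < m_size; ++i) buf[i] = static_cast<float>(i);
  }
  float coeff(int i) const { return m_data[i]; }
  void cleanup() {
    if (m_data) { m_device.deallocate_temp(m_data); m_data = NULL; }
  }
};

int main() {
  Device dev;
  Evaluator ev(dev);
  if (ev.evalSubExprsIfNeeded(NULL)) {  // no destination: evaluator owns storage
    assert(ev.coeff(3) == 3.0f);
    ev.cleanup();
  }
}
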
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
index 74b905329..e7b7c1e6b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
@@ -78,9 +78,10 @@ class TensorForcedEvalOp : public TensorBase<TensorForcedEvalOp<XprType>, ReadOn
};
-template<typename ArgType, typename Device>
-struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device>
+template<typename ArgType_, typename Device>
+struct TensorEvaluator<const TensorForcedEvalOp<ArgType_>, Device>
{
+ typedef const typename internal::remove_all<ArgType_>::type ArgType;
typedef TensorForcedEvalOp<ArgType> XprType;
typedef typename ArgType::Scalar Scalar;
typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
@@ -88,6 +89,9 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device>
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+ typedef typename Eigen::internal::traits<XprType>::PointerType TensorPointerType;
+ typedef StorageMemory<CoeffReturnType, Device> Storage;
+ typedef typename Storage::Type EvaluatorPointerType;
enum {
IsAligned = true,
@@ -106,8 +110,8 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device>
TensorBlockReader;
EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device)
- /// op_ is used for sycl
- : m_impl(op.expression(), device), m_op(op.expression()), m_device(device), m_buffer(NULL)
+ : m_impl(op.expression(), device), m_op(op.expression()),
+ m_device(device), m_buffer(NULL)
{ }
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); }
@@ -115,17 +119,19 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device>
#if !defined(EIGEN_HIPCC)
EIGEN_DEVICE_FUNC
#endif
- EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) {
+ EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
const Index numValues = internal::array_prod(m_impl.dimensions());
- m_buffer = (CoeffReturnType*)m_device.allocate_temp(numValues * sizeof(CoeffReturnType));
+ m_buffer = m_device.get((CoeffReturnType*)m_device.allocate_temp(numValues * sizeof(CoeffReturnType)));
+ #ifndef EIGEN_USE_SYCL
// Should initialize the memory in case we're dealing with non POD types.
if (NumTraits<CoeffReturnType>::RequireInitialization) {
for (Index i = 0; i < numValues; ++i) {
new(m_buffer+i) CoeffReturnType();
}
}
+ #endif
typedef TensorEvalToOp< const typename internal::remove_const<ArgType>::type > EvalTo;
- EvalTo evalToTmp(m_buffer, m_op);
+ EvalTo evalToTmp(m_device.get(m_buffer), m_op);
const bool Vectorize = internal::IsVectorizable<Device, const ArgType>::value;
internal::TensorExecutor<const EvalTo, typename internal::remove_const<Device>::type, Vectorize>::run(evalToTmp, m_device);
return true;
@@ -159,17 +165,20 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device>
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- typename Eigen::internal::traits<XprType>::PointerType data() const { return m_buffer; }
+ EvaluatorPointerType data() const { return m_buffer; }
- /// required by sycl in order to extract the sycl accessor
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() { return m_impl; }
- /// used by sycl in order to build the sycl buffer
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const{return m_device;}
+#ifdef EIGEN_USE_SYCL
+ // binding placeholder accessors to a command group handler for SYCL
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+ m_buffer.bind(cgh);
+ m_impl.bind(cgh);
+ }
+#endif
private:
TensorEvaluator<ArgType, Device> m_impl;
const ArgType m_op;
- const Device& m_device;
- CoeffReturnType* m_buffer;
+ const Device m_device;
+ EvaluatorPointerType m_buffer;
};
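
Forced evaluation, as modified above, materializes the wrapped expression into a temporary buffer, placement-initializing elements only for types that require it (and skipping that host-side loop under SYCL, where the buffer is device memory). A minimal sketch of the flow, with illustrative names rather than Eigen's:

#include <cstdlib>
#include <iostream>
#include <new>
#include <type_traits>

template <typename T, typename Expr>
T* force_eval(const Expr& expr, int n) {
  T* buf = static_cast<T*>(std::malloc(n * sizeof(T)));
  if (!std::is_trivially_default_constructible<T>::value) {
    // host-only initialization loop for non-POD types
    for (int i = 0; i < n; ++i) new (buf + i) T();
  }
  for (int i = 0; i < n; ++i) buf[i] = expr(i);  // stands in for EvalTo + executor
  return buf;
}

int main() {
  int* data = force_eval<int>([](int i) { return i * i; }, 5);
  std::cout << data[4] << "\n";  // 16
  std::free(data);
}
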
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h
index 09b7c994b..ff8a19f87 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h
@@ -20,17 +20,20 @@ namespace Eigen {
// map_allocator.
template<typename T> struct MakePointer {
typedef T* Type;
- typedef T& RefType;
- typedef T ScalarType;
};
-// The PointerType class is a container of the device specefic pointer
-// used for referring to a Pointer on TensorEvaluator class. While the TensorExpression
+template <typename T>
+EIGEN_STRONG_INLINE T* constCast(const T* data) {
+ return const_cast<T*>(data);
+}
+
+// The StorageMemory class is a container of the device-specific pointer
+// used for referring to a pointer on the TensorEvaluator class. While the TensorExpression
// is a device-agnostic type and needs the MakePointer class for type conversion,
-// the TensorEvaluator calls can be specialized for a device, hence it is possible
+// the TensorEvaluator class can be specialized for a device, hence it is possible
// to construct different types of temporary storage memory in TensorEvaluator
-// for different devices by specializing the following PointerType class.
+// for different devices by specializing the following StorageMemory class.
+// for different devices by specializing the following StorageMemory class.
+template<typename T, typename device> struct StorageMemory : MakePointer<T> {};
namespace internal{
template<typename A, typename B> struct Pointer_type_promotion {
@@ -39,24 +42,10 @@ template<typename A, typename B> struct Pointer_type_promotion {
template<typename A> struct Pointer_type_promotion<A, A> {
static const bool val = true;
};
-template<typename A, typename B> struct TypeConversion;
-#ifndef __SYCL_DEVICE_ONLY__
-template<typename A, typename B> struct TypeConversion{
+template<typename A, typename B> struct TypeConversion {
typedef A* type;
};
-#endif
-}
-
-#if defined(EIGEN_USE_SYCL)
-namespace TensorSycl {
-namespace internal{
-template <typename HostExpr, typename FunctorExpr, typename Tuple_of_Acc, typename Dims, typename Op, typename Index> class ReductionFunctor;
-template<typename CoeffReturnType ,typename OutAccessor, typename HostExpr, typename FunctorExpr, typename Op, typename Dims, typename Index, typename TupleType>
-class FullReductionKernelFunctor;
-}
}
-#endif
-
template<typename PlainObjectType, int Options_ = Unaligned, template <class> class MakePointer_ = MakePointer> class TensorMap;
@@ -113,6 +102,31 @@ struct ThreadPoolDevice;
struct GpuDevice;
struct SyclDevice;
+#ifdef EIGEN_USE_SYCL
+
+template <typename T> struct MakeSYCLPointer {
+ typedef Eigen::TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write, T> Type;
+};
+
+template <typename T>
+EIGEN_STRONG_INLINE const Eigen::TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write, T>&
+constCast(const Eigen::TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write, T>& data) {
+ return data;
+}
+
+template <typename T>
+struct StorageMemory<T, SyclDevice> : MakeSYCLPointer<T> {};
+template <typename T>
+struct StorageMemory<T, const SyclDevice> : StorageMemory<T, SyclDevice> {};
+
+namespace TensorSycl {
+namespace internal{
+template <typename Evaluator, typename Op> class ReductionFunctor;
+}
+}
+#endif
+
+
enum FFTResultType {
RealPart = 0,
ImagPart = 1,
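
The StorageMemory trait introduced above is the core of the pointer abstraction this patch describes: host devices keep a raw T*, while the SYCL specialization substitutes an accessor-like wrapper. A standalone sketch of that specialization pattern, where RangeAccess is a two-line stand-in for Eigen::TensorSycl::internal::RangeAccess:

#include <type_traits>

struct DefaultDevice {};
struct SyclDevice {};

template <typename T>
struct RangeAccess { T* raw; };  // stand-in for the SYCL virtual-pointer wrapper

template <typename T, typename Device>
struct StorageMemory { typedef T* Type; };  // host devices: raw pointer

template <typename T>
struct StorageMemory<T, SyclDevice> {       // SYCL: accessor wrapper
  typedef RangeAccess<T> Type;
};

static_assert(std::is_same<StorageMemory<float, DefaultDevice>::Type,
                           float*>::value, "host evaluators keep raw pointers");
static_assert(std::is_same<StorageMemory<float, SyclDevice>::Type,
                           RangeAccess<float> >::value,
              "SYCL evaluators carry accessor wrappers");

int main() {}
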
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
index 51572f9e7..2edc45f1a 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
@@ -421,6 +421,7 @@ class GaussianGenerator {
const array<T, NumDims>& std_devs)
: m_means(means)
{
+ EIGEN_UNROLL_LOOP
for (size_t i = 0; i < NumDims; ++i) {
m_two_sigmas[i] = std_devs[i] * std_devs[i] * 2;
}
@@ -428,6 +429,7 @@ class GaussianGenerator {
EIGEN_DEVICE_FUNC T operator()(const array<Index, NumDims>& coordinates) const {
T tmp = T(0);
+ EIGEN_UNROLL_LOOP
for (size_t i = 0; i < NumDims; ++i) {
T offset = coordinates[i] - m_means[i];
tmp += offset * offset / m_two_sigmas[i];
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h
index 204a6fd33..b7ad33626 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h
@@ -88,6 +88,8 @@ struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device>
typedef typename XprType::Scalar Scalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ typedef StorageMemory<CoeffReturnType, Device> Storage;
+ typedef typename Storage::Type EvaluatorPointerType;
enum {
IsAligned = false,
PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
@@ -104,22 +106,21 @@ struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device>
TensorBlock;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
- : m_device(device), m_generator(op.generator())
-#ifdef EIGEN_USE_SYCL
- , m_argImpl(op.expression(), device)
-#endif
+ : m_device(device), m_generator(op.generator())
{
TensorEvaluator<ArgType, Device> argImpl(op.expression(), device);
m_dimensions = argImpl.dimensions();
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
m_strides[0] = 1;
+ EIGEN_UNROLL_LOOP
for (int i = 1; i < NumDims; ++i) {
m_strides[i] = m_strides[i - 1] * m_dimensions[i - 1];
if (m_strides[i] != 0) m_fast_strides[i] = IndexDivisor(m_strides[i]);
}
} else {
m_strides[NumDims - 1] = 1;
+ EIGEN_UNROLL_LOOP
for (int i = NumDims - 2; i >= 0; --i) {
m_strides[i] = m_strides[i + 1] * m_dimensions[i + 1];
if (m_strides[i] != 0) m_fast_strides[i] = IndexDivisor(m_strides[i]);
@@ -129,7 +130,7 @@ struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
return true;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
@@ -234,11 +235,11 @@ struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device>
TensorOpCost::MulCost<Scalar>());
}
- EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; }
+ EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
#ifdef EIGEN_USE_SYCL
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const { return m_argImpl; }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Generator& functor() const { return m_generator; }
+ // binding placeholder accessors to a command group handler for SYCL
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler&) const {}
#endif
protected:
@@ -261,14 +262,11 @@ struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device>
}
}
- const Device& m_device;
+ const Device EIGEN_DEVICE_REF m_device;
Dimensions m_dimensions;
array<Index, NumDims> m_strides;
array<IndexDivisor, NumDims> m_fast_strides;
Generator m_generator;
-#ifdef EIGEN_USE_SYCL
- TensorEvaluator<ArgType, Device> m_argImpl;
-#endif
};
} // end namespace Eigen
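
The generator evaluator above computes its strides once in the constructor so coordinates can be flattened into linear indices in either layout. A short standalone sketch of the column-major case (row-major simply runs the recurrence from the last dimension):

#include <array>
#include <cassert>

template <int NumDims>
std::array<int, NumDims> col_major_strides(const std::array<int, NumDims>& dims) {
  std::array<int, NumDims> s = {};
  s[0] = 1;
  for (int i = 1; i < NumDims; ++i) s[i] = s[i - 1] * dims[i - 1];
  return s;
}

int main() {
  const std::array<int, 3> dims = {4, 3, 2};                // a 4 x 3 x 2 tensor
  const std::array<int, 3> s = col_major_strides<3>(dims);  // {1, 4, 12}
  // linear index of coordinate (1, 2, 1): 1*1 + 2*4 + 1*12 = 21
  assert(1 * s[0] + 2 * s[1] + 1 * s[2] == 21);
}
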
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h
index 965bd8f1e..5ff67bdae 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h
@@ -154,23 +154,6 @@ class TensorImagePatchOp : public TensorBase<TensorImagePatchOp<Rows, Cols, XprT
m_padding_left(padding_left), m_padding_right(padding_right),
m_padding_type(PADDING_VALID), m_padding_value(padding_value) {}
-#ifdef EIGEN_USE_SYCL // this is work around for sycl as Eigen could not use c++11 deligate constructor feature
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorImagePatchOp(const XprType& expr, DenseIndex patch_rows, DenseIndex patch_cols,
- DenseIndex row_strides, DenseIndex col_strides,
- DenseIndex in_row_strides, DenseIndex in_col_strides,
- DenseIndex row_inflate_strides, DenseIndex col_inflate_strides,
- bool padding_explicit, DenseIndex padding_top, DenseIndex padding_bottom,
- DenseIndex padding_left, DenseIndex padding_right, PaddingType padding_type,
- Scalar padding_value)
- : m_xpr(expr), m_patch_rows(patch_rows), m_patch_cols(patch_cols),
- m_row_strides(row_strides), m_col_strides(col_strides),
- m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides),
- m_row_inflate_strides(row_inflate_strides), m_col_inflate_strides(col_inflate_strides),
- m_padding_explicit(padding_explicit), m_padding_top(padding_top), m_padding_bottom(padding_bottom),
- m_padding_left(padding_left), m_padding_right(padding_right),
- m_padding_type(padding_type), m_padding_value(padding_value) {}
-
-#endif
EIGEN_DEVICE_FUNC
DenseIndex patch_rows() const { return m_patch_rows; }
@@ -242,6 +225,8 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+ typedef StorageMemory<CoeffReturnType, Device> Storage;
+ typedef typename Storage::Type EvaluatorPointerType;
enum {
IsAligned = false,
@@ -256,15 +241,8 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
typedef internal::TensorBlock<Scalar, Index, NumDims, Layout>
OutputTensorBlock;
-#ifdef __SYCL_DEVICE_ONLY__
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator( const XprType op, const Device& device)
- #else
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator( const XprType& op, const Device& device)
- #endif
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator( const XprType& op, const Device& device)
: m_device(device), m_impl(op.expression(), device)
-#ifdef EIGEN_USE_SYCL
- , m_op(op)
-#endif
{
EIGEN_STATIC_ASSERT((NumDims >= 4), YOU_MADE_A_PROGRAMMING_MISTAKE);
@@ -410,7 +388,7 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
m_impl.evalSubExprsIfNeeded(NULL);
return true;
}
@@ -516,13 +494,15 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
return packetWithPossibleZero(index);
}
- EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; }
+ EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
#ifdef EIGEN_USE_SYCL
- // Required by SYCL in order to construct the expression tree on the device
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const XprType& xpr() const { return m_op; }
+ // binding placeholder accessors to a command group handler for SYCL
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+ m_impl.bind(cgh);
+ }
#endif
Index rowPaddingTop() const { return m_rowPaddingTop; }
@@ -693,6 +673,7 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const
{
EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+ EIGEN_UNROLL_LOOP
for (int i = 0; i < PacketSize; ++i) {
values[i] = coeff(index+i);
}
@@ -744,12 +725,8 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
Scalar m_paddingValue;
- const Device& m_device;
+ const Device EIGEN_DEVICE_REF m_device;
TensorEvaluator<ArgType, Device> m_impl;
- #ifdef EIGEN_USE_SYCL
- // Required for SYCL in order to construct the expression tree on the device
- XprType m_op;
- #endif
};
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h
index 32caccf87..be8f3a734 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h
@@ -353,6 +353,7 @@ namespace internal {
template<typename FirstType, typename... OtherTypes>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index array_prod(const IndexList<FirstType, OtherTypes...>& sizes) {
Index result = 1;
+ EIGEN_UNROLL_LOOP
for (size_t i = 0; i < array_size<IndexList<FirstType, OtherTypes...> >::value; ++i) {
result *= sizes[i];
}
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h
index e28565009..f8cda6574 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h
@@ -86,6 +86,8 @@ struct TensorEvaluator<const TensorInflationOp<Strides, ArgType>, Device>
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+ typedef StorageMemory<CoeffReturnType, Device> Storage;
+ typedef typename Storage::Type EvaluatorPointerType;
enum {
IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/ false,
@@ -131,7 +133,7 @@ struct TensorEvaluator<const TensorInflationOp<Strides, ArgType>, Device>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
m_impl.evalSubExprsIfNeeded(NULL);
return true;
}
@@ -146,6 +148,7 @@ struct TensorEvaluator<const TensorInflationOp<Strides, ArgType>, Device>
eigen_assert(index < dimensions().TotalSize());
*inputIndex = 0;
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ EIGEN_UNROLL_LOOP
for (int i = NumDims - 1; i > 0; --i) {
const Index idx = index / m_outputStrides[i];
if (idx != idx / m_fastStrides[i] * m_strides[i]) {
@@ -160,6 +163,7 @@ struct TensorEvaluator<const TensorInflationOp<Strides, ArgType>, Device>
*inputIndex += index / m_strides[0];
return true;
} else {
+ EIGEN_UNROLL_LOOP
for (int i = 0; i < NumDims - 1; ++i) {
const Index idx = index / m_outputStrides[i];
if (idx != idx / m_fastStrides[i] * m_strides[i]) {
@@ -195,6 +199,7 @@ struct TensorEvaluator<const TensorInflationOp<Strides, ArgType>, Device>
eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+ EIGEN_UNROLL_LOOP
for (int i = 0; i < PacketSize; ++i) {
values[i] = coeff(index+i);
}
@@ -215,11 +220,13 @@ struct TensorEvaluator<const TensorInflationOp<Strides, ArgType>, Device>
compute_cost, vectorized, PacketSize);
}
- EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; }
+ EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
#ifdef EIGEN_USE_SYCL
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Strides& functor() const { return m_strides; }
+ // binding placeholder accessors to a command group handler for SYCL
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+ m_impl.bind(cgh);
+ }
#endif
protected:
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
index b6d445c50..e7fec5d3a 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
@@ -37,7 +37,7 @@ namespace {
{
#ifdef EIGEN_GPU_COMPILE_PHASE
return __clz(val);
-#elif defined(__SYCL_DEVICE_ONLY__)
+#elif defined(SYCL_DEVICE_ONLY)
return cl::sycl::clz(val);
#elif EIGEN_COMP_MSVC
unsigned long index;
@@ -55,8 +55,8 @@ namespace {
{
#ifdef EIGEN_GPU_COMPILE_PHASE
return __clzll(val);
-#elif defined(__SYCL_DEVICE_ONLY__)
- return cl::sycl::clz(val);
+#elif defined(SYCL_DEVICE_ONLY)
+ return static_cast<int>(cl::sycl::clz(val));
#elif EIGEN_COMP_MSVC && EIGEN_ARCH_x86_64
unsigned long index;
_BitScanReverse64(&index, val);
@@ -92,7 +92,7 @@ namespace {
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint32_t muluh(const uint32_t a, const T b) {
#if defined(EIGEN_GPU_COMPILE_PHASE)
return __umulhi(a, b);
-#elif defined(__SYCL_DEVICE_ONLY__)
+#elif defined(SYCL_DEVICE_ONLY)
return cl::sycl::mul_hi(a, static_cast<uint32_t>(b));
#else
return (static_cast<uint64_t>(a) * b) >> 32;
@@ -103,7 +103,7 @@ namespace {
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t muluh(const uint64_t a, const T b) {
#if defined(EIGEN_GPU_COMPILE_PHASE)
return __umul64hi(a, b);
-#elif defined(__SYCL_DEVICE_ONLY__)
+#elif defined(SYCL_DEVICE_ONLY)
return cl::sycl::mul_hi(a, static_cast<uint64_t>(b));
#elif defined(__SIZEOF_INT128__)
__uint128_t v = static_cast<__uint128_t>(a) * static_cast<__uint128_t>(b);
@@ -124,7 +124,7 @@ namespace {
template <typename T>
struct DividerHelper<64, T> {
static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t computeMultiplier(const int log_div, const T divider) {
-#if defined(__SIZEOF_INT128__) && !defined(EIGEN_GPU_COMPILE_PHASE) && !defined(__SYCL_DEVICE_ONLY__)
+#if defined(__SIZEOF_INT128__) && !defined(EIGEN_GPU_COMPILE_PHASE) && !defined(SYCL_DEVICE_ONLY)
return static_cast<uint64_t>((static_cast<__uint128_t>(1) << (64+log_div)) / static_cast<__uint128_t>(divider) - (static_cast<__uint128_t>(1) << 64) + 1);
#else
const uint64_t shift = 1ULL << log_div;
@@ -205,8 +205,8 @@ class TensorIntDivisor<int32_t, true> {
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int divide(const int32_t n) const {
#ifdef EIGEN_GPU_COMPILE_PHASE
return (__umulhi(magic, n) >> shift);
-#elif defined(__SYCL_DEVICE_ONLY__)
- return (cl::sycl::mul_hi(static_cast<uint64_t>(magic), static_cast<uint64_t>(n)) >> shift);
+#elif defined(SYCL_DEVICE_ONLY)
+ return (cl::sycl::mul_hi(magic, static_cast<uint32_t>(n)) >> shift);
#else
uint64_t v = static_cast<uint64_t>(magic) * static_cast<uint64_t>(n);
return (static_cast<uint32_t>(v >> 32) >> shift);
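
The mul_hi calls patched above implement division by a runtime constant as one multiply-high plus a shift. A small self-contained illustration of the same trick for the int32_t-style case (divisor d >= 2, dividend below 2^31); the magic-number derivation follows the classic round-up formula and is a demonstration, not Eigen's full TensorIntDivisor:

#include <cassert>
#include <cstdint>

struct Divisor {
  uint32_t magic;
  int shift;
  explicit Divisor(uint32_t d) {
    shift = 1;
    while ((uint32_t(1) << shift) < d) ++shift;  // ceil(log2(d))
    magic = uint32_t(((uint64_t(1) << (31 + shift)) + d - 1) / d);
  }
  uint32_t divide(uint32_t n) const {
    const uint64_t v = uint64_t(magic) * uint64_t(n);  // muluh(magic, n)
    return uint32_t(v >> 32) >> (shift - 1);
  }
};

int main() {
  Divisor by7(7);
  for (uint32_t n = 0; n < 1000000; ++n) assert(by7.divide(n) == n / 7);
}
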
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h
index 998757d14..755170a34 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h
@@ -134,13 +134,22 @@ struct TensorEvaluator<const TensorLayoutSwapOp<ArgType>, Device>
}
}
+#ifdef EIGEN_USE_SYCL
+ // binding placeholder accessors to a command group handler for SYCL
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+ m_impl.bind(cgh);
+ }
+#endif
+
typedef typename XprType::Scalar Scalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ typedef StorageMemory<CoeffReturnType, Device> Storage;
+ typedef typename Storage::Type EvaluatorPointerType;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
return m_impl.evalSubExprsIfNeeded(data);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
@@ -162,7 +171,9 @@ struct TensorEvaluator<const TensorLayoutSwapOp<ArgType>, Device>
return m_impl.costPerCoeff(vectorized);
}
- EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return m_impl.data(); }
+ EIGEN_DEVICE_FUNC typename Storage::Type data() const {
+ return constCast(m_impl.data());
+ }
const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
@@ -192,7 +203,7 @@ template<typename ArgType, typename Device>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: Base(op, device)
{ }
-
+
typedef typename XprType::Index Index;
typedef typename XprType::Scalar Scalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h
index c6ca396a3..3d859f42d 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h
@@ -59,4 +59,39 @@
#define EIGEN_SLEEP(n) sleep(std::max<unsigned>(1, n/1000))
#endif
+// Define a macro to use a reference on the host but a value on the device
+#if defined(SYCL_DEVICE_ONLY)
+ #define EIGEN_DEVICE_REF
+#else
+ #define EIGEN_DEVICE_REF &
+#endif
+
+// Define a macro for catching SYCL exceptions if exceptions are enabled
+#if defined(EIGEN_EXCEPTIONS)
+ #define EIGEN_SYCL_TRY_CATCH(X) \
+ do { \
+ try { X; } \
+ catch(const cl::sycl::exception& e) { \
+ std::cerr << "SYCL exception at " \
+ << __FILE__ << ":" << __LINE__ << std::endl \
+ << e.what() << std::endl; \
+ std::rethrow_exception(std::current_exception()); \
+ } \
+ } while (false)
+#else
+ #define EIGEN_SYCL_TRY_CATCH(X) X
+#endif
+
+// Define macros depending on which of the local memory flags is set, if any.
+// Setting both flags is the same as unsetting them.
+#if (!defined(EIGEN_SYCL_LOCAL_MEM) && !defined(EIGEN_SYCL_NO_LOCAL_MEM)) || \
+ (defined(EIGEN_SYCL_LOCAL_MEM) && defined(EIGEN_SYCL_NO_LOCAL_MEM))
+ #define EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON 1
+ #define EIGEN_SYCL_LOCAL_MEM_UNSET_OR_OFF 1
+#elif defined(EIGEN_SYCL_LOCAL_MEM) && !defined(EIGEN_SYCL_NO_LOCAL_MEM)
+ #define EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON 1
+#elif !defined(EIGEN_SYCL_LOCAL_MEM) && defined(EIGEN_SYCL_NO_LOCAL_MEM)
+ #define EIGEN_SYCL_LOCAL_MEM_UNSET_OR_OFF 1
+#endif
+
#endif
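
A quick standalone demonstration of the EIGEN_DEVICE_REF macro defined above: the same member declaration is a reference on the host but a by-value copy when SYCL_DEVICE_ONLY is defined, since SYCL kernels cannot capture host references. Compile with -DSYCL_DEVICE_ONLY to get the by-value variant:

#include <iostream>

#if defined(SYCL_DEVICE_ONLY)
  #define EIGEN_DEVICE_REF
#else
  #define EIGEN_DEVICE_REF &
#endif

struct Device { int id; };

struct Evaluator {
  const Device EIGEN_DEVICE_REF m_device;  // `const Device&` or `const Device`
  explicit Evaluator(const Device& d) : m_device(d) {}
};

int main() {
  Device dev = {42};
  Evaluator ev(dev);
  std::cout << ev.m_device.id << "\n";  // 42 either way
}
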
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h
index 28f629080..395cdf9c8 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h
@@ -31,8 +31,12 @@ template<typename PlainObjectType, int Options_, template <class> class MakePoin
public:
typedef TensorMap<PlainObjectType, Options_, MakePointer_> Self;
typedef typename PlainObjectType::Base Base;
- typedef typename Eigen::internal::nested<Self>::type Nested;
- typedef typename internal::traits<PlainObjectType>::StorageKind StorageKind;
+ #ifdef EIGEN_USE_SYCL
+ typedef typename Eigen::internal::remove_reference<typename Eigen::internal::nested<Self>::type>::type Nested;
+ #else
+ typedef typename Eigen::internal::nested<Self>::type Nested;
+ #endif
+ typedef typename internal::traits<PlainObjectType>::StorageKind StorageKind;
typedef typename internal::traits<PlainObjectType>::Index Index;
typedef typename internal::traits<PlainObjectType>::Scalar Scalar;
typedef typename NumTraits<Scalar>::Real RealScalar;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
index 87be090f9..6afc98877 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
@@ -85,9 +85,57 @@ struct PacketType<half, GpuDevice> {
#endif
#if defined(EIGEN_USE_SYCL)
-template <typename T>
- struct PacketType<T, SyclDevice> {
- typedef T type;
+
+namespace TensorSycl {
+namespace internal {
+
+template <typename Index, Index A, Index B> struct PlusOp {
+ static constexpr Index Value = A + B;
+};
+
+template <typename Index, Index A, Index B> struct DivOp {
+ static constexpr Index Value = A / B;
+};
+
+template <typename Index, Index start, Index end, Index step,
+ template <class Indx, Indx...> class StepOp>
+struct static_for {
+ template <typename UnaryOperator>
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void loop(UnaryOperator op) {
+ op(start);
+ static_for<Index, StepOp<Index, start, step>::Value, end, step,
+ StepOp>::loop(op);
+ }
+};
+template <typename Index, Index end, Index step,
+ template <class Indx, Indx...> class StepOp>
+struct static_for<Index, end, end, step, StepOp> {
+ template <typename UnaryOperator>
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void loop(UnaryOperator) {}
+};
+
+template <typename OutScalar, typename Device, bool Vectorizable>
+struct Vectorise {
+ static const int PacketSize = 1;
+ typedef OutScalar PacketReturnType;
+};
+
+template <typename OutScalar, typename Device>
+struct Vectorise<OutScalar, Device, true> {
+ static const int PacketSize = Eigen::PacketType<OutScalar, Device>::size;
+ typedef typename Eigen::PacketType<OutScalar, Device>::type PacketReturnType;
+};
+
+static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index roundUp(Index x, Index y) {
+ return ((x + y - 1) / y) * y;
+}
+
+} // namespace internal
+} // namespace TensorSycl
+
+template <>
+ struct PacketType<half, SyclDevice> {
+ typedef half type;
static const int size = 1;
enum {
HasAdd = 0,
@@ -104,9 +152,59 @@ template <typename T>
HasBlend = 0
};
};
-#endif
+template <typename Scalar>
+struct PacketType<Scalar, SyclDevice> : internal::default_packet_traits {
+ typedef Scalar type;
+ typedef Scalar half;
+ enum {
+ Vectorizable = 0,
+ size = 1,
+ AlignedOnScalar = 0,
+ HasHalfPacket = 0
+ };
+ enum {
+ HasAdd = 0,
+ HasSub = 0,
+ HasMul = 0,
+ HasNegate = 0,
+ HasAbs = 0,
+ HasAbs2 = 0,
+ HasMin = 0,
+ HasMax = 0,
+ HasConj = 0,
+ HasSetLinear = 0
+ };
+
+};
+
+template <typename Scalar>
+struct PacketType<Scalar, const SyclDevice> : PacketType<Scalar, SyclDevice>{};
+
+#ifndef EIGEN_DONT_VECTORIZE_SYCL
+#define PACKET_TYPE(CVQual, Type, val, lengths, DEV)\
+template<> struct PacketType<CVQual Type, DEV> : internal::sycl_packet_traits<val, lengths> \
+{\
+ typedef typename internal::packet_traits<Type>::type type;\
+ typedef typename internal::packet_traits<Type>::half half;\
+};
+PACKET_TYPE(const, float, 1, 4, SyclDevice)
+PACKET_TYPE(, float, 1, 4, SyclDevice)
+PACKET_TYPE(const, float, 1, 4, const SyclDevice)
+PACKET_TYPE(, float, 1, 4, const SyclDevice)
+
+PACKET_TYPE(const, double, 0, 2, SyclDevice)
+PACKET_TYPE(, double, 0, 2, SyclDevice)
+PACKET_TYPE(const, double, 0, 2, const SyclDevice)
+PACKET_TYPE(, double, 0, 2, const SyclDevice)
+#undef PACKET_TYPE
+
+template<> struct PacketType<half, const SyclDevice>: PacketType<half, SyclDevice>{};
+template<> struct PacketType<const half, const SyclDevice>: PacketType<half, SyclDevice>{};
+#endif
+#endif
+
// Tuple mimics std::pair but works on e.g. nvcc.
template <typename U, typename V> struct Tuple {
public:
@@ -124,7 +222,7 @@ template <typename U, typename V> struct Tuple {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
Tuple& operator= (const Tuple& rhs) {
- #ifndef __SYCL_DEVICE_ONLY__
+ #ifndef SYCL_DEVICE_ONLY
if (&rhs == this) return *this;
#endif
first = rhs.first;
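
The static_for template added above unrolls a loop at compile time: the bounds and step are template parameters, so each iteration is a separate instantiation and the recursion bottoms out at the end specialization. A standalone version of the same machinery, minus the Eigen macros:

#include <iostream>

template <typename Index, Index A, Index B>
struct PlusOp { static constexpr Index Value = A + B; };

template <typename Index, Index start, Index end, Index step,
          template <class Indx, Indx...> class StepOp>
struct static_for {
  template <typename UnaryOperator>
  static void loop(UnaryOperator op) {
    op(start);
    static_for<Index, StepOp<Index, start, step>::Value, end, step,
               StepOp>::loop(op);
  }
};

template <typename Index, Index end, Index step,
          template <class Indx, Indx...> class StepOp>
struct static_for<Index, end, end, step, StepOp> {
  template <typename UnaryOperator>
  static void loop(UnaryOperator) {}  // recursion ends when start == end
};

int main() {
  // prints 0 2 4 6; every call is resolved at compile time
  static_for<int, 0, 8, 2, PlusOp>::loop([](int i) { std::cout << i << " "; });
  std::cout << "\n";
}
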
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
index 5352c8f7b..8f6e987b3 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
@@ -37,7 +37,7 @@ struct traits<TensorReshapingOp<NewDimensions, XprType> > : public traits<XprTyp
template<typename NewDimensions, typename XprType>
struct eval<TensorReshapingOp<NewDimensions, XprType>, Eigen::Dense>
{
- typedef const TensorReshapingOp<NewDimensions, XprType>& type;
+ typedef const TensorReshapingOp<NewDimensions, XprType>EIGEN_DEVICE_REF type;
};
template<typename NewDimensions, typename XprType>
@@ -106,6 +106,9 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>
typedef typename XprType::Scalar Scalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ typedef StorageMemory<CoeffReturnType, Device> Storage;
+ typedef typename Storage::Type EvaluatorPointerType;
+ typedef StorageMemory<typename internal::remove_const<CoeffReturnType>::type, Device> ConstCastStorage;
static const int NumOutputDims = internal::array_size<Dimensions>::value;
static const int NumInputDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
@@ -168,7 +171,7 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
return m_impl.evalSubExprsIfNeeded(data);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
@@ -326,10 +329,18 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>
}
}
- EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return const_cast<Scalar*>(m_impl.data()); }
+ EIGEN_DEVICE_FUNC typename Storage::Type data() const {
+ return constCast(m_impl.data());
+ }
EIGEN_DEVICE_FUNC const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
+ #ifdef EIGEN_USE_SYCL
+ // binding placeholder accessors to a command group handler for SYCL
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+ m_impl.bind(cgh);
+ }
+ #endif
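[Editor's note — illustrative, not part of the patch] A minimal sketch of how these bind() methods are intended to be used; queue, evaluator, range and kernel are placeholder names, since the real submission logic lives in TensorDeviceSycl.h / TensorExecutor.h:

    queue.submit([&](cl::sycl::handler &cgh) {
      evaluator.bind(cgh);              // recursively attaches every placeholder
                                        // accessor in the evaluator tree
      cgh.parallel_for(range, kernel);  // the kernel can now use the accessors
    });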
protected:
TensorEvaluator<ArgType, Device> m_impl;
NewDimensions m_dimensions;
@@ -404,7 +415,7 @@ struct traits<TensorSlicingOp<StartIndices, Sizes, XprType> > : public traits<Xp
template<typename StartIndices, typename Sizes, typename XprType>
struct eval<TensorSlicingOp<StartIndices, Sizes, XprType>, Eigen::Dense>
{
- typedef const TensorSlicingOp<StartIndices, Sizes, XprType>& type;
+ typedef const TensorSlicingOp<StartIndices, Sizes, XprType>EIGEN_DEVICE_REF type;
};
template<typename StartIndices, typename Sizes, typename XprType>
@@ -488,7 +499,7 @@ template <typename Index> struct MemcpyTriggerForSlicing<Index, GpuDevice> {
// It is very expensive to start the memcpy kernel on GPU: we therefore only
// use it for large copies.
#ifdef EIGEN_USE_SYCL
-template <typename Index> struct MemcpyTriggerForSlicing<Index, const Eigen::SyclDevice> {
+template <typename Index> struct MemcpyTriggerForSlicing<Index, Eigen::SyclDevice> {
EIGEN_DEVICE_FUNC MemcpyTriggerForSlicing(const SyclDevice&) { }
EIGEN_DEVICE_FUNC bool operator ()(Index val) const { return val > 4*1024*1024; }
};
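[Editor's note — illustrative, not part of the patch] The specialization above mirrors the GPU trigger: the slicing evaluator falls back to a bulk memcpy only when a slice holds more than 4*1024*1024 contiguous coefficients, since starting a copy on an accelerator only pays off for large transfers:

    MemcpyTriggerForSlicing<Index, Eigen::SyclDevice> trigger(device); // 'device' assumed in scope
    bool use_memcpy = trigger(contiguous_values); // true iff contiguous_values > 4*1024*1024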
@@ -508,6 +519,9 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
typedef Sizes Dimensions;
+ typedef StorageMemory<CoeffReturnType, Device> Storage;
+ typedef StorageMemory<typename internal::remove_const<CoeffReturnType>::type, Device> ConstCastStorage;
+ typedef typename Storage::Type EvaluatorPointerType;
enum {
// Alignment can't be guaranteed at compile time since it depends on the
@@ -575,7 +589,7 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
m_impl.evalSubExprsIfNeeded(NULL);
if (!NumTraits<typename internal::remove_const<Scalar>::type>::RequireInitialization
&& data && m_impl.data()
@@ -599,10 +613,10 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
// Use memcpy if it's going to be faster than using the regular evaluation.
const MemcpyTriggerForSlicing<Index, Device> trigger(m_device);
if (trigger(contiguous_values)) {
- Scalar* src = (Scalar*)m_impl.data();
+ EvaluatorPointerType src = (EvaluatorPointerType)m_impl.data();
for (Index i = 0; i < internal::array_prod(dimensions()); i += contiguous_values) {
Index offset = srcCoeff(i);
- m_device.memcpy((void*)(data+i), src+offset, contiguous_values * sizeof(Scalar));
+ m_device.memcpy((void*)(m_device.get(data + i)), m_device.get(src+offset), contiguous_values * sizeof(Scalar));
}
return false;
}
@@ -637,6 +651,7 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
Index inputIndices[] = {0, 0};
Index indices[] = {index, index + packetSize - 1};
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ EIGEN_UNROLL_LOOP
for (int i = NumDims - 1; i > 0; --i) {
const Index idx0 = indices[0] / m_fastOutputStrides[i];
const Index idx1 = indices[1] / m_fastOutputStrides[i];
@@ -648,6 +663,7 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
inputIndices[0] += (indices[0] + m_offsets[0]);
inputIndices[1] += (indices[1] + m_offsets[0]);
} else {
+ EIGEN_UNROLL_LOOP
for (int i = 0; i < NumDims - 1; ++i) {
const Index idx0 = indices[0] / m_fastOutputStrides[i];
const Index idx1 = indices[1] / m_fastOutputStrides[i];
@@ -667,6 +683,7 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize];
values[0] = m_impl.coeff(inputIndices[0]);
values[packetSize-1] = m_impl.coeff(inputIndices[1]);
+ EIGEN_UNROLL_LOOP
for (int i = 1; i < packetSize-1; ++i) {
values[i] = coeff(index+i);
}
@@ -698,8 +715,8 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
m_impl.block(&input_block);
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Eigen::internal::traits<XprType>::PointerType data() const {
- Scalar* result = const_cast<Scalar*>(m_impl.data());
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Storage::Type data() const {
+ typename Storage::Type result = constCast(m_impl.data());
if (result) {
Index offset = 0;
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
@@ -733,19 +750,19 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
}
return NULL;
}
- /// used by sycl
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const{
- return m_impl;
- }
- /// used by sycl
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const StartIndices& startIndices() const{
- return m_offsets;
+#ifdef EIGEN_USE_SYCL
+ // binding placeholder accessors to a command group handler for SYCL
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+ m_impl.bind(cgh);
}
+#endif
+
protected:
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const
{
Index inputIndex = 0;
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ EIGEN_UNROLL_LOOP
for (int i = NumDims - 1; i > 0; --i) {
const Index idx = index / m_fastOutputStrides[i];
inputIndex += (idx + m_offsets[i]) * m_inputStrides[i];
@@ -753,6 +770,7 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
}
inputIndex += (index + m_offsets[0]);
} else {
+ EIGEN_UNROLL_LOOP
for (int i = 0; i < NumDims - 1; ++i) {
const Index idx = index / m_fastOutputStrides[i];
inputIndex += (idx + m_offsets[i]) * m_inputStrides[i];
@@ -767,7 +785,7 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
array<internal::TensorIntDivisor<Index>, NumDims> m_fastOutputStrides;
array<Index, NumDims> m_inputStrides;
TensorEvaluator<ArgType, Device> m_impl;
- const Device& m_device;
+ const Device EIGEN_DEVICE_REF m_device;
Dimensions m_dimensions;
bool m_is_identity;
const StartIndices m_offsets;
@@ -829,6 +847,7 @@ struct TensorEvaluator<TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
Index inputIndices[] = {0, 0};
Index indices[] = {index, index + packetSize - 1};
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ EIGEN_UNROLL_LOOP
for (int i = NumDims - 1; i > 0; --i) {
const Index idx0 = indices[0] / this->m_fastOutputStrides[i];
const Index idx1 = indices[1] / this->m_fastOutputStrides[i];
@@ -840,6 +859,7 @@ struct TensorEvaluator<TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
inputIndices[0] += (indices[0] + this->m_offsets[0]);
inputIndices[1] += (indices[1] + this->m_offsets[0]);
} else {
+ EIGEN_UNROLL_LOOP
for (int i = 0; i < NumDims - 1; ++i) {
const Index idx0 = indices[0] / this->m_fastOutputStrides[i];
const Index idx1 = indices[1] / this->m_fastOutputStrides[i];
@@ -859,6 +879,7 @@ struct TensorEvaluator<TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
this->m_impl.coeffRef(inputIndices[0]) = values[0];
this->m_impl.coeffRef(inputIndices[1]) = values[packetSize-1];
+ EIGEN_UNROLL_LOOP
for (int i = 1; i < packetSize-1; ++i) {
this->coeffRef(index+i) = values[i];
}
@@ -892,7 +913,7 @@ struct traits<TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprTyp
template<typename StartIndices, typename StopIndices, typename Strides, typename XprType>
struct eval<TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>, Eigen::Dense>
{
- typedef const TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>& type;
+ typedef const TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>EIGEN_DEVICE_REF type;
};
template<typename StartIndices, typename StopIndices, typename Strides, typename XprType>
@@ -969,6 +990,8 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices,
typedef typename XprType::Scalar Scalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ typedef StorageMemory<CoeffReturnType, Device> Storage;
+ typedef typename Storage::Type EvaluatorPointerType;
typedef Strides Dimensions;
enum {
@@ -985,8 +1008,7 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices,
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_impl(op.expression(), device),
m_device(device),
- m_strides(op.strides()), m_exprStartIndices(op.startIndices()),
- m_exprStopIndices(op.stopIndices())
+ m_strides(op.strides())
{
// Handle degenerate intervals by gracefully clamping and allowing m_dimensions to be zero
DSizes<Index, NumDims> startIndicesClamped, stopIndicesClamped;
@@ -1069,7 +1091,7 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices,
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
m_impl.evalSubExprsIfNeeded(NULL);
return true;
}
@@ -1091,30 +1113,28 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices,
return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, m_is_identity ? 1 : NumDims);
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Eigen::internal::traits<XprType>::PointerType data() const {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Storage::Type data() const {
return NULL;
}
-
- //use by sycl
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const StartIndices& exprStartIndices() const { return m_exprStartIndices; }
- //use by sycl
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const StartIndices& exprStopIndices() const { return m_exprStopIndices; }
- //use by sycl
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const StartIndices& strides() const { return m_strides; }
- /// used by sycl
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const{return m_impl;}
-
+#ifdef EIGEN_USE_SYCL
+ // binding placeholder accessors to a command group handler for SYCL
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+ m_impl.bind(cgh);
+ }
+#endif
protected:
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const
{
Index inputIndex = 0;
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ EIGEN_UNROLL_LOOP
for (int i = NumDims - 1; i >= 0; --i) {
const Index idx = index / m_fastOutputStrides[i];
inputIndex += idx * m_inputStrides[i] + m_offsets[i];
index -= idx * m_outputStrides[i];
}
} else {
+ EIGEN_UNROLL_LOOP
for (int i = 0; i < NumDims; ++i) {
const Index idx = index / m_fastOutputStrides[i];
inputIndex += idx * m_inputStrides[i] + m_offsets[i];
@@ -1125,7 +1145,7 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices,
}
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index clamp(Index value, Index min, Index max) {
-#ifndef __SYCL_DEVICE_ONLY__
+#ifndef SYCL_DEVICE_ONLY
return numext::maxi(min, numext::mini(max,value));
#else
return cl::sycl::clamp(value, min, max);
@@ -1137,15 +1157,11 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices,
array<Index, NumDims> m_inputStrides;
bool m_is_identity;
TensorEvaluator<ArgType, Device> m_impl;
- const Device& m_device;
+ const Device EIGEN_DEVICE_REF m_device;
DSizes<Index, NumDims> m_startIndices; // clamped startIndices
DSizes<Index, NumDims> m_dimensions;
DSizes<Index, NumDims> m_offsets; // offset in a flattened shape
const Strides m_strides;
- //use by sycl
- const StartIndices m_exprStartIndices;
- //use by sycl
- const StopIndices m_exprStopIndices;
};
// Eval as lvalue
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
index 4837f2200..e98382cc1 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
@@ -92,6 +92,8 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+ typedef StorageMemory<CoeffReturnType, Device> Storage;
+ typedef typename Storage::Type EvaluatorPointerType;
enum {
IsAligned = true,
@@ -138,7 +140,7 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
m_impl.evalSubExprsIfNeeded(NULL);
return true;
}
@@ -151,6 +153,7 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
eigen_assert(index < dimensions().TotalSize());
Index inputIndex = 0;
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ EIGEN_UNROLL_LOOP
for (int i = NumDims - 1; i > 0; --i) {
const Index idx = index / m_outputStrides[i];
if (isPaddingAtIndexForDim(idx, i)) {
@@ -164,6 +167,7 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
}
inputIndex += (index - m_padding[0].first);
} else {
+ EIGEN_UNROLL_LOOP
for (int i = 0; i < NumDims - 1; ++i) {
const Index idx = index / m_outputStrides[i+1];
if (isPaddingAtIndexForDim(idx, i)) {
@@ -192,23 +196,25 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
TensorOpCost cost = m_impl.costPerCoeff(vectorized);
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ EIGEN_UNROLL_LOOP
for (int i = 0; i < NumDims; ++i)
updateCostPerDimension(cost, i, i == 0);
} else {
+ EIGEN_UNROLL_LOOP
for (int i = NumDims - 1; i >= 0; --i)
updateCostPerDimension(cost, i, i == NumDims - 1);
}
return cost;
}
- EIGEN_DEVICE_FUNC EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return NULL; }
- /// used by sycl
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const PaddingDimensions& padding() const { return m_padding; }
- /// used by sycl
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& padding_value() const { return m_paddingValue; }
- /// used by sycl
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const{return m_impl;}
+#ifdef EIGEN_USE_SYCL
+ // binding placeholder accessors to a command group handler for SYCL
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+ m_impl.bind(cgh);
+ }
+#endif
private:
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool isPaddingAtIndexForDim(
@@ -272,6 +278,7 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
const Index initialIndex = index;
Index inputIndex = 0;
+ EIGEN_UNROLL_LOOP
for (int i = NumDims - 1; i > 0; --i) {
const Index firstIdx = index;
const Index lastIdx = index + PacketSize - 1;
@@ -329,7 +336,7 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
const Index initialIndex = index;
Index inputIndex = 0;
-
+ EIGEN_UNROLL_LOOP
for (int i = 0; i < NumDims - 1; ++i) {
const Index firstIdx = index;
const Index lastIdx = index + PacketSize - 1;
@@ -383,6 +390,7 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const
{
EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+ EIGEN_UNROLL_LOOP
for (int i = 0; i < PacketSize; ++i) {
values[i] = coeff(index+i);
}
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h
index 4292fe0c2..47db839db 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h
@@ -89,6 +89,8 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+ typedef StorageMemory<CoeffReturnType, Device> Storage;
+ typedef typename Storage::Type EvaluatorPointerType;
enum {
@@ -103,9 +105,6 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_impl(op.expression(), device)
-#ifdef EIGEN_USE_SYCL
- , m_patch_dims(op.patch_dims())
-#endif
{
Index num_patches = 1;
const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
@@ -149,7 +148,7 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
m_impl.evalSubExprsIfNeeded(NULL);
return true;
}
@@ -167,6 +166,7 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>
Index patchOffset = index - patchIndex * m_outputStrides[output_stride_index];
Index inputIndex = 0;
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ EIGEN_UNROLL_LOOP
for (int i = NumDims - 2; i > 0; --i) {
const Index patchIdx = patchIndex / m_patchStrides[i];
patchIndex -= patchIdx * m_patchStrides[i];
@@ -175,6 +175,7 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>
inputIndex += (patchIdx + offsetIdx) * m_inputStrides[i];
}
} else {
+ EIGEN_UNROLL_LOOP
for (int i = 0; i < NumDims - 2; ++i) {
const Index patchIdx = patchIndex / m_patchStrides[i];
patchIndex -= patchIdx * m_patchStrides[i];
@@ -202,6 +203,7 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>
Index inputIndices[2] = {0, 0};
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ EIGEN_UNROLL_LOOP
for (int i = NumDims - 2; i > 0; --i) {
const Index patchIdx[2] = {patchIndices[0] / m_patchStrides[i],
patchIndices[1] / m_patchStrides[i]};
@@ -217,6 +219,7 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>
inputIndices[1] += (patchIdx[1] + offsetIdx[1]) * m_inputStrides[i];
}
} else {
+ EIGEN_UNROLL_LOOP
for (int i = 0; i < NumDims - 2; ++i) {
const Index patchIdx[2] = {patchIndices[0] / m_patchStrides[i],
patchIndices[1] / m_patchStrides[i]};
@@ -243,6 +246,7 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>
EIGEN_ALIGN_MAX CoeffReturnType values[PacketSize];
values[0] = m_impl.coeff(inputIndices[0]);
values[PacketSize-1] = m_impl.coeff(inputIndices[1]);
+ EIGEN_UNROLL_LOOP
for (int i = 1; i < PacketSize-1; ++i) {
values[i] = coeff(index+i);
}
@@ -259,11 +263,13 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>
TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
}
- EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; }
+ EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
#ifdef EIGEN_USE_SYCL
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const PatchDim& functor() const { return m_patch_dims; }
+ // binding placeholder accessors to a command group handler for SYCL
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+ m_impl.bind(cgh);
+ }
#endif
protected:
@@ -274,9 +280,6 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>
TensorEvaluator<ArgType, Device> m_impl;
-#ifdef EIGEN_USE_SYCL
- const PatchDim m_patch_dims;
-#endif
};
} // end namespace Eigen
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h b/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h
index 787cbd031..2be4f9cc5 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h
@@ -2,6 +2,7 @@
// for linear algebra.
//
// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+// Copyright (C) 2018 Mehdi Goli <eigen@codeplay.com> Codeplay Software Ltd.
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
@@ -44,6 +45,7 @@ EIGEN_DEVICE_FUNC uint64_t get_random_seed() {
uint64_t rnd = ::random() ^ mach_absolute_time();
return rnd;
+
#else
// Augment the current time with pseudo random number generation
// to ensure that we get different seeds if we try to generate seeds
@@ -147,14 +149,41 @@ template <typename T> class UniformRandomGenerator {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator(
uint64_t seed = 0) {
m_state = PCG_XSH_RS_state(seed);
+ #ifdef EIGEN_USE_SYCL
+ // In SYCL it is not possible to build the PCG_XSH_RS_state in one step,
+ // so m_state has to be initialized in two steps.
+ // In SYCL, the constructor of the functor is called on the CPU, so the
+ // clock seed obtained here is the same for every thread: unlike CUDA,
+ // the thread ID, block ID, etc. are not globally accessible and are
+ // only available inside operator() (which is called on the GPU).
+ // Thus for CUDA, (((CLOCK + global_thread_id) * 6364136223846793005ULL) + 0xda3e39cb94b95bdbULL) is passed to each thread,
+ // whereas for SYCL, ((CLOCK * 6364136223846793005ULL) + 0xda3e39cb94b95bdbULL) is passed to each thread, and each thread
+ // adds (global_thread_id * 6364136223846793005ULL) for itself, only once, to complete the construction
+ // in the same way as CUDA. The thread ID injection therefore happens on
+ // first use: inside operator() we add the thread ID, block ID, ... (which
+ // is equivalent to i) to the seed and so construct a unique m_state per
+ // thread, just as CUDA does.
+ m_exec_once = false;
+ #endif
}
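[Editor's note — illustrative, not part of the patch] Working through the formulas in the comment above (the same scheme is used by NormalRandomGenerator below) shows that the two SYCL stages reach the same state as the CUDA one-step form:

    //  CUDA:  state = (CLOCK + i) * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL
    //               =  CLOCK * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL  // stage 1: host constructor
    //                +     i * 6364136223846793005ULL                          // stage 2: first device call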
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator(
const UniformRandomGenerator& other) {
m_state = other.m_state;
+ #ifdef EIGEN_USE_SYCL
+ m_exec_once = other.m_exec_once;
+ #endif
}
template<typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
T operator()(Index i) const {
+ #ifdef EIGEN_USE_SYCL
+ if (!m_exec_once) {
+ // Second stage: add the thread ID to the CPU clock seed to build a unique seed per thread.
+ // The (i * 6364136223846793005ULL) term is the part of the PCG_XSH_RS_state construction deferred to the GPU side.
+ m_state += (i * 6364136223846793005ULL);
+ m_exec_once = true;
+ }
+ #endif
T result = RandomToTypeUniform<T>(&m_state, i);
return result;
}
@@ -163,6 +192,14 @@ template <typename T> class UniformRandomGenerator {
Packet packetOp(Index i) const {
const int packetSize = internal::unpacket_traits<Packet>::size;
EIGEN_ALIGN_MAX T values[packetSize];
+ #ifdef EIGEN_USE_SYCL
+ if (!m_exec_once) {
+ // Second stage: add the thread ID to the CPU clock seed to build a unique seed per thread.
+ m_state += (i * 6364136223846793005ULL);
+ m_exec_once = true;
+ }
+ #endif
+ EIGEN_UNROLL_LOOP
for (int j = 0; j < packetSize; ++j) {
values[j] = RandomToTypeUniform<T>(&m_state, i);
}
@@ -171,6 +208,9 @@ template <typename T> class UniformRandomGenerator {
private:
mutable uint64_t m_state;
+ #ifdef EIGEN_USE_SYCL
+ mutable bool m_exec_once;
+ #endif
};
template <typename Scalar>
@@ -222,14 +262,37 @@ template <typename T> class NormalRandomGenerator {
// Uses the given "seed" if non-zero, otherwise uses a random seed.
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator(uint64_t seed = 0) {
m_state = PCG_XSH_RS_state(seed);
+ #ifdef EIGEN_USE_SYCL
+ // In SYCL it is not possible to build the PCG_XSH_RS_state in one step,
+ // so m_state has to be initialized in two steps, exactly as for
+ // UniformRandomGenerator above: the constructor runs on the CPU, where
+ // the clock seed is the same for every thread, and unlike CUDA the
+ // thread ID, block ID, etc. are only available inside operator()
+ // (which is called on the GPU).
+ // The thread ID injection therefore happens on first use: inside
+ // operator() we add the thread ID, block ID, ... (which is equivalent
+ // to i) to the seed and so construct a unique m_state per thread,
+ // just as CUDA does.
+ m_exec_once = false;
+ #endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator(
const NormalRandomGenerator& other) {
m_state = other.m_state;
+#ifdef EIGEN_USE_SYCL
+ m_exec_once = other.m_exec_once;
+#endif
}
template<typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
T operator()(Index i) const {
+ #ifdef EIGEN_USE_SYCL
+ if (!m_exec_once) {
+ // Second stage: add the thread ID to the CPU clock seed to build a unique seed per thread.
+ m_state += (i * 6364136223846793005ULL);
+ m_exec_once = true;
+ }
+ #endif
T result = RandomToTypeNormal<T>(&m_state, i);
return result;
}
@@ -238,6 +301,14 @@ template <typename T> class NormalRandomGenerator {
Packet packetOp(Index i) const {
const int packetSize = internal::unpacket_traits<Packet>::size;
EIGEN_ALIGN_MAX T values[packetSize];
+ #ifdef EIGEN_USE_SYCL
+ if (!m_exec_once) {
+ // Second stage: add the thread ID to the CPU clock seed to build a unique seed per thread.
+ m_state += (i * 6364136223846793005ULL);
+ m_exec_once = true;
+ }
+ #endif
+ EIGEN_UNROLL_LOOP
for (int j = 0; j < packetSize; ++j) {
values[j] = RandomToTypeNormal<T>(&m_state, i);
}
@@ -246,6 +317,9 @@ template <typename T> class NormalRandomGenerator {
private:
mutable uint64_t m_state;
+ #ifdef EIGEN_USE_SYCL
+ mutable bool m_exec_once;
+ #endif
};
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
index bb63433fe..5dddfcf85 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -299,7 +299,7 @@ template <typename Self, typename Op, typename Device, bool Vectorizable = (Self
struct FullReducer {
static const bool HasOptimizedImplementation = false;
- static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const Device&, typename Self::CoeffReturnType* output) {
+ static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const Device&, typename Self::EvaluatorPointerType output) {
const typename Self::Index num_coeffs = array_prod(self.m_impl.dimensions());
*output = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, 0, num_coeffs, reducer);
}
@@ -400,6 +400,18 @@ struct OuterReducer {
}
};
+#ifdef EIGEN_USE_SYCL
+// Default Generic reducer
+template <typename Self, typename Op, typename Device>
+struct GenericReducer {
+ static const bool HasOptimizedImplementation = false;
+
+ EIGEN_DEVICE_FUNC static bool run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) {
+ eigen_assert(false && "Not implemented");
+ return true;
+ }
+};
+#endif
#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
template <int B, int N, typename S, typename R, typename I_>
@@ -423,6 +435,23 @@ template <int NPT, typename S, typename R, typename I_>
__global__ void OuterReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);
#endif
+/**
+ * For SYCL, the return type of the reduction is deduced from the initialize method of the given Op.
+ * This allows the reduction to use a different type for the accumulator than for the input data.
+ * In that case, the functor needs to provide two reduce methods: one for reducing an element of the
+ * input into the accumulator and another for reducing two accumulators.
+ * Such a reducer is useful, for instance, when the accumulator is a boolean or a bitset that tracks
+ * some property of the input. A sketch of such a reducer follows the declaration below.
+ */
+template <typename Op, typename CoeffReturnType>
+struct ReductionReturnType {
+#if EIGEN_HAS_CXX11 && defined(EIGEN_USE_SYCL)
+ typedef typename remove_const<decltype(std::declval<Op>().initialize())>::type type;
+#else
+ typedef typename remove_const<CoeffReturnType>::type type;
+#endif
+};
+
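[Editor's note — illustrative, not part of the patch] A hypothetical reducer of the kind described above, whose accumulator type (bool) is deduced from initialize(); the initialize/reduce/finalize shape follows Eigen's existing reducer functors and is an assumption here:

    struct AnyPositiveReducer {  // hypothetical: "is any input coefficient positive?"
      EIGEN_DEVICE_FUNC bool initialize() const { return false; }
      // reduce one input element into the accumulator
      EIGEN_DEVICE_FUNC void reduce(const float t, bool* accum) const { *accum = *accum || (t > 0.f); }
      // reduce two accumulators
      EIGEN_DEVICE_FUNC void reduce(const bool t, bool* accum) const { *accum = *accum || t; }
      EIGEN_DEVICE_FUNC bool finalize(const bool accum) const { return accum; }
    };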
template <typename Self, typename Op,
bool Vectorizable =
(Self::InputPacketAccess & Self::ReducerTraits::PacketAccess)>
@@ -520,12 +549,15 @@ class TensorReductionOp : public TensorBase<TensorReductionOp<Op, Dims, XprType,
const Op m_reducer;
};
+template<typename ArgType, typename Device>
+struct TensorReductionEvaluatorBase;
// Eval as rvalue
template<typename Op, typename Dims, typename ArgType, template <class> class MakePointer_, typename Device>
-struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>
+struct TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>
{
typedef internal::reducer_traits<Op, Device> ReducerTraits;
+ typedef Dims ReducedDims;
typedef TensorReductionOp<Op, Dims, ArgType, MakePointer_> XprType;
typedef typename XprType::Index Index;
typedef ArgType ChildType;
@@ -535,12 +567,20 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
static const int NumOutputDims = NumInputDims - NumReducedDims;
typedef typename internal::conditional<NumOutputDims==0, Sizes<>, DSizes<Index, NumOutputDims> >::type Dimensions;
typedef typename XprType::Scalar Scalar;
- typedef TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> Self;
+ typedef TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> Self;
static const bool InputPacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess;
- typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
+ typedef typename internal::ReductionReturnType<Op, typename XprType::CoeffReturnType>::type CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
static const Index PacketSize = PacketType<CoeffReturnType, Device>::size;
+ typedef typename Eigen::internal::traits<XprType>::PointerType TensorPointerType;
+ typedef StorageMemory<CoeffReturnType, Device> Storage;
+ typedef typename Storage::Type EvaluatorPointerType;
+
+ // Subset of strides of the input tensor for the non-reduced dimensions.
+ // Indexed by output dimensions.
+ static const int NumPreservedStrides = max_n_1<NumOutputDims>::size;
+
enum {
IsAligned = false,
PacketAccess = Self::InputPacketAccess && ReducerTraits::PacketAccess,
@@ -562,11 +602,8 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
static const bool PreservingInnerMostDims = internal::preserve_inner_most_dims<Dims, NumInputDims, Layout>::value;
static const bool RunningFullReduction = (NumOutputDims==0);
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorReductionEvaluatorBase(const XprType& op, const Device& device)
: m_impl(op.expression(), device), m_reducer(op.reducer()), m_result(NULL), m_device(device)
-#if defined(EIGEN_USE_SYCL)
- , m_xpr_dims(op.dims())
-#endif
{
EIGEN_STATIC_ASSERT((NumInputDims >= NumReducedDims), YOU_MADE_A_PROGRAMMING_MISTAKE);
EIGEN_STATIC_ASSERT((!ReducingInnerMostDims | !PreservingInnerMostDims | (NumReducedDims == NumInputDims)),
@@ -653,7 +690,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
// of which will eventually result in an NVCC error
EIGEN_DEVICE_FUNC
#endif
- bool evalSubExprsIfNeeded(typename MakePointer_<CoeffReturnType>::Type data) {
+ bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
m_impl.evalSubExprsIfNeeded(NULL);
// Use the FullReducer if possible.
@@ -663,7 +700,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
!RunningOnGPU))) {
bool need_assign = false;
if (!data) {
- m_result = static_cast<CoeffReturnType*>(m_device.allocate_temp(sizeof(CoeffReturnType)));
+ m_result = static_cast<EvaluatorPointerType>(m_device.get((CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType))));
data = m_result;
need_assign = true;
}
@@ -671,20 +708,9 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
internal::FullReducer<Self, Op, Device>::run(*this, reducer, m_device, data);
return need_assign;
}
- else if(RunningOnSycl){
- const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
- const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
- if (!data) {
- data = static_cast<CoeffReturnType*>(m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve));
- m_result = data;
- }
- Op reducer(m_reducer);
- internal::InnerReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve);
- return (m_result != NULL);
- }
// Attempt to use an optimized reduction.
- else if (RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) {
+ else if ((RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) || (RunningOnSycl)) {
bool reducing_inner_dims = true;
for (int i = 0; i < NumReducedDims; ++i) {
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
@@ -698,8 +724,8 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
if (!data) {
- if (num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 128) {
- data = static_cast<CoeffReturnType*>(m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve));
+ if ((num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 128) || (RunningOnSycl)) {
+ data = static_cast<EvaluatorPointerType>(m_device.get((CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve)));
m_result = data;
}
else {
@@ -707,6 +733,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
}
}
Op reducer(m_reducer);
+ // For SYCL this call always returns false
if (internal::InnerReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve)) {
if (m_result) {
m_device.deallocate_temp(m_result);
@@ -731,8 +758,8 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
if (!data) {
- if (num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 32) {
- data = static_cast<CoeffReturnType*>(m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve));
+ if ((num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 32) || (RunningOnSycl)) {
+ data = static_cast<EvaluatorPointerType>(m_device.get((CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve)));
m_result = data;
}
else {
@@ -740,6 +767,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
}
}
Op reducer(m_reducer);
+ // For SYCL this call always returns false
if (internal::OuterReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve)) {
if (m_result) {
m_device.deallocate_temp(m_result);
@@ -750,6 +778,21 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
return (m_result != NULL);
}
}
+ #if defined(EIGEN_USE_SYCL)
+ // If there is no optimised SYCL implementation, the reduction expression
+ // must be broken into two subexpressions and the generic SYCL reducer used on the device.
+ if (RunningOnSycl) {
+ const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
+ const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
+ if (!data) {
+ data = static_cast<EvaluatorPointerType>(m_device.get((CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve)));
+ m_result = data;
+ }
+ Op reducer(m_reducer);
+ internal::GenericReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve);
+ return (m_result != NULL);
+ }
+ #endif
}
return true;
}
@@ -764,7 +807,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
{
- if ((RunningOnSycl || RunningFullReduction || RunningOnGPU) && m_result) {
+ if ((RunningFullReduction || RunningOnGPU) && m_result) {
return *(m_result + index);
}
Op reducer(m_reducer);
@@ -1097,12 +1140,15 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
m_device.deallocate(reducers);
}
- EIGEN_DEVICE_FUNC typename MakePointer_<CoeffReturnType>::Type data() const { return m_result; }
-
-#if defined(EIGEN_USE_SYCL)
- const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
- const Device& device() const { return m_device; }
- const Dims& xprDims() const { return m_xpr_dims; }
+ EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_result; }
+ EIGEN_DEVICE_FUNC const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
+ EIGEN_DEVICE_FUNC const Device& device() const { return m_device; }
+#ifdef EIGEN_USE_SYCL
+ // binding placeholder accessors to a command group handler for SYCL
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+ m_impl.bind(cgh);
+ m_result.bind(cgh);
+ }
#endif
private:
@@ -1126,8 +1172,9 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
#endif
#if defined(EIGEN_USE_SYCL)
- template < typename HostExpr_, typename FunctorExpr_, typename Tuple_of_Acc_, typename Dims_, typename Op_, typename Index_> friend class TensorSycl::internal::ReductionFunctor;
- template<typename CoeffReturnType_ ,typename OutAccessor_, typename HostExpr_, typename FunctorExpr_, typename Op_, typename Dims_, typename Index_, typename TupleType_> friend class TensorSycl::internal::FullReductionKernelFunctor;
+ template < typename Evaluator_, typename Op__> friend class TensorSycl::internal::ReductionFunctor;
+ // SYCL needs the generic reducer for the case where the reduction algorithm is neither an inner, an outer, nor a full reduction
+ template <typename, typename, typename> friend struct internal::GenericReducer;
#endif
@@ -1255,9 +1302,6 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
// Precomputed strides for the output tensor.
array<Index, NumOutputDims> m_outputStrides;
array<internal::TensorIntDivisor<Index>, NumOutputDims> m_fastOutputStrides;
- // Subset of strides of the input tensor for the non-reduced dimensions.
- // Indexed by output dimensions.
- static const int NumPreservedStrides = max_n_1<NumOutputDims>::size;
array<Index, NumPreservedStrides> m_preservedStrides;
// Map from output to input dimension index.
array<Index, NumOutputDims> m_output_to_input_dim_map;
@@ -1288,13 +1332,36 @@ static const bool RunningOnGPU = false;
static const bool RunningOnGPU = false;
static const bool RunningOnSycl = false;
#endif
- typename MakePointer_<CoeffReturnType>::Type m_result;
+ EvaluatorPointerType m_result;
- const Device& m_device;
+ const Device EIGEN_DEVICE_REF m_device;
+};
-#if defined(EIGEN_USE_SYCL)
- const Dims m_xpr_dims;
-#endif
+template<typename Op, typename Dims, typename ArgType, template <class> class MakePointer_, typename Device>
+struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>
+: public TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> {
+ typedef TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> Base;
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const typename Base::XprType& op, const Device& device) : Base(op, device){}
+};
+
+
+template<typename Op, typename Dims, typename ArgType, template <class> class MakePointer_>
+struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Eigen::SyclDevice>
+: public TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Eigen::SyclDevice> {
+
+ typedef TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Eigen::SyclDevice> Base;
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const typename Base::XprType& op, const Eigen::SyclDevice& device) : Base(op, device){}
+ // The coeff function in the base class uses a recursive method, which is not standard layout and cannot be used in a SYCL kernel.
+ // Therefore the coeff function is overridden for the SYCL kernel to read directly from the evaluated result.
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Base::CoeffReturnType coeff(typename Base::Index index) const {
+ return *(this->data() + index);
+ }
+ // The packet function in the base class uses a recursive method, which is not standard layout and cannot be used in a SYCL kernel.
+ // Therefore the packet function is overridden for the SYCL kernel to load directly from the evaluated result.
+ template<int LoadMode>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Base::PacketReturnType packet(typename Base::Index index) const {
+ return internal::pload<typename Base::PacketReturnType>(this->data() + index);
+ }
};
} // end namespace Eigen
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h b/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h
index 6e15e75f9..b92c9ffaf 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h
@@ -44,6 +44,9 @@ class TensorLazyEvaluatorReadOnly : public TensorLazyBaseEvaluator<Dimensions, t
public:
// typedef typename TensorEvaluator<Expr, Device>::Dimensions Dimensions;
typedef typename TensorEvaluator<Expr, Device>::Scalar Scalar;
+ typedef StorageMemory<Scalar, Device> Storage;
+ typedef typename Storage::Type EvaluatorPointerType;
+ typedef TensorEvaluator<Expr, Device> EvalType;
TensorLazyEvaluatorReadOnly(const Expr& expr, const Device& device) : m_impl(expr, device), m_dummy(Scalar(0)) {
m_dims = m_impl.dimensions();
@@ -79,6 +82,8 @@ class TensorLazyEvaluatorWritable : public TensorLazyEvaluatorReadOnly<Dimension
public:
typedef TensorLazyEvaluatorReadOnly<Dimensions, Expr, Device> Base;
typedef typename Base::Scalar Scalar;
+ typedef StorageMemory<Scalar, Device> Storage;
+ typedef typename Storage::Type EvaluatorPointerType;
TensorLazyEvaluatorWritable(const Expr& expr, const Device& device) : Base(expr, device) {
}
@@ -362,6 +367,8 @@ struct TensorEvaluator<const TensorRef<Derived>, Device>
typedef typename Derived::Scalar CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
typedef typename Derived::Dimensions Dimensions;
+ typedef StorageMemory<CoeffReturnType, Device> Storage;
+ typedef typename Storage::Type EvaluatorPointerType;
enum {
IsAligned = false,
@@ -379,7 +386,7 @@ struct TensorEvaluator<const TensorRef<Derived>, Device>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_ref.dimensions(); }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
return true;
}
@@ -394,7 +401,7 @@ struct TensorEvaluator<const TensorRef<Derived>, Device>
}
EIGEN_DEVICE_FUNC Scalar* data() const { return m_ref.data(); }
-
+
protected:
TensorRef<Derived> m_ref;
};
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
index b7fb969f3..3a699095c 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
@@ -109,6 +109,8 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+ typedef StorageMemory<CoeffReturnType, Device> Storage;
+ typedef typename Storage::Type EvaluatorPointerType;
enum {
IsAligned = false,
@@ -145,7 +147,7 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const Dimensions& dimensions() const { return m_dimensions; }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
m_impl.evalSubExprsIfNeeded(NULL);
return true;
}
@@ -158,6 +160,7 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
eigen_assert(index < dimensions().TotalSize());
Index inputIndex = 0;
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ EIGEN_UNROLL_LOOP
for (int i = NumDims - 1; i > 0; --i) {
Index idx = index / m_strides[i];
index -= idx * m_strides[i];
@@ -172,6 +175,7 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
inputIndex += index;
}
} else {
+ EIGEN_UNROLL_LOOP
for (int i = 0; i < NumDims - 1; ++i) {
Index idx = index / m_strides[i];
index -= idx * m_strides[i];
@@ -205,6 +209,7 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
// local structure.
EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type
values[PacketSize];
+ EIGEN_UNROLL_LOOP
for (int i = 0; i < PacketSize; ++i) {
values[i] = coeff(index+i);
}
@@ -225,12 +230,14 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
TensorOpCost(0, 0, compute_cost, false /* vectorized */, PacketSize);
}
- EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; }
+ EIGEN_DEVICE_FUNC typename Storage::Type data() const { return NULL; }
- /// required by sycl in order to extract the accessor
- const TensorEvaluator<ArgType, Device> & impl() const { return m_impl; }
- /// added for sycl in order to construct the buffer from sycl device
- ReverseDimensions functor() const { return m_reverse; }
+#ifdef EIGEN_USE_SYCL
+ // binding placeholder accessors to a command group handler for SYCL
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+ m_impl.bind(cgh);
+ }
+#endif
protected:
Dimensions m_dimensions;
@@ -269,7 +276,7 @@ struct TensorEvaluator<TensorReverseOp<ReverseDimensions, ArgType>, Device>
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
-
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const Dimensions& dimensions() const { return this->m_dimensions; }
@@ -285,11 +292,11 @@ struct TensorEvaluator<TensorReverseOp<ReverseDimensions, ArgType>, Device>
// This code is pilfered from TensorMorphing.h
EIGEN_ALIGN_MAX CoeffReturnType values[PacketSize];
internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
+ EIGEN_UNROLL_LOOP
for (int i = 0; i < PacketSize; ++i) {
this->coeffRef(index+i) = values[i];
}
}
-
};
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
index 64f10d0a4..44156126d 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
@@ -86,12 +86,15 @@ struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> {
typedef TensorScanOp<Op, ArgType> XprType;
typedef typename XprType::Index Index;
+ typedef const ArgType ChildType;
static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
typedef DSizes<Index, NumDims> Dimensions;
typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
typedef TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> Self;
+ typedef StorageMemory<Scalar, Device> Storage;
+ typedef typename Storage::Type EvaluatorPointerType;
enum {
IsAligned = false,
@@ -110,7 +113,7 @@ struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> {
m_exclusive(op.exclusive()),
m_accumulator(op.accumulator()),
m_size(m_impl.dimensions()[op.axis()]),
- m_stride(1),
+ m_stride(1), m_consume_dim(op.axis()),
m_output(NULL) {
// Accumulating a scalar isn't supported.
@@ -142,6 +145,10 @@ struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> {
return m_stride;
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& consume_dim() const {
+ return m_consume_dim;
+ }
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& size() const {
return m_size;
}
@@ -162,7 +169,7 @@ struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> {
return m_device;
}
- EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
+ EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
m_impl.evalSubExprsIfNeeded(NULL);
ScanLauncher<Self, Op, Device> launcher;
if (data) {
@@ -171,7 +178,7 @@ struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> {
}
const Index total_size = internal::array_prod(dimensions());
- m_output = static_cast<CoeffReturnType*>(m_device.allocate(total_size * sizeof(Scalar)));
+ m_output = static_cast<EvaluatorPointerType>(m_device.get((Scalar*) m_device.allocate_temp(total_size * sizeof(Scalar))));
launcher(*this, m_output);
return true;
}
@@ -181,7 +188,7 @@ struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> {
return internal::ploadt<PacketReturnType, LoadMode>(m_output + index);
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Eigen::internal::traits<XprType>::PointerType data() const
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const
{
return m_output;
}
@@ -196,21 +203,29 @@ struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> {
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
- if (m_output != NULL) {
- m_device.deallocate(m_output);
+ if (m_output) {
+ m_device.deallocate_temp(m_output);
m_output = NULL;
}
m_impl.cleanup();
}
+#ifdef EIGEN_USE_SYCL
+ // binding placeholder accessors to a command group handler for SYCL
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+ m_impl.bind(cgh);
+ m_output.bind(cgh);
+ }
+#endif
protected:
TensorEvaluator<ArgType, Device> m_impl;
- const Device& m_device;
+ const Device EIGEN_DEVICE_REF m_device;
const bool m_exclusive;
Op m_accumulator;
const Index m_size;
Index m_stride;
- CoeffReturnType* m_output;
+ Index m_consume_dim;
+ EvaluatorPointerType m_output;
};
// CPU implementation of scan
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
index 416948765..ae04785ce 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
@@ -109,6 +109,8 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+ typedef StorageMemory<CoeffReturnType, Device> Storage;
+ typedef typename Storage::Type EvaluatorPointerType;
enum {
IsAligned = false,
@@ -130,8 +132,7 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op,
const Device& device)
: m_device(device),
- m_impl(op.expression(), device),
- m_shuffle(op.shufflePermutation())
+ m_impl(op.expression(), device)
{
const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
const Shuffle& shuffle = op.shufflePermutation();
@@ -172,7 +173,7 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
m_impl.evalSubExprsIfNeeded(NULL);
return true;
}
@@ -194,6 +195,7 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
static PacketReturnType Run(const Self& self, Index index) {
EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+ EIGEN_UNROLL_LOOP
for (int i = 0; i < PacketSize; ++i) {
values[i] = self.coeff(index + i);
}
@@ -210,6 +212,7 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
return self.m_impl.template packet<LoadMode>(index);
} else {
EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+ EIGEN_UNROLL_LOOP
for (int i = 0; i < PacketSize; ++i) {
values[i] = self.coeff(index + i);
}
@@ -330,13 +333,14 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
TensorOpCost(0, 0, compute_cost, m_is_identity /* vectorized */, PacketSize);
}
- EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; }
-
- // required by sycl
- EIGEN_STRONG_INLINE const Shuffle& shufflePermutation() const {return m_shuffle;}
- // required by sycl
- EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const {return m_impl;}
+ EIGEN_DEVICE_FUNC typename Storage::Type data() const { return NULL; }
+#ifdef EIGEN_USE_SYCL
+ // binding placeholder accessors to a command group handler for SYCL
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+ m_impl.bind(cgh);
+ }
+#endif
protected:
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index GetBlockOutputIndex(
Index input_index,
@@ -389,10 +393,8 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
array<Index, NumDims> m_inputStrides;
array<Index, NumDims> m_unshuffledInputStrides;
- const Device& m_device;
+ const Device EIGEN_DEVICE_REF m_device;
TensorEvaluator<ArgType, Device> m_impl;
- /// required by sycl
- Shuffle m_shuffle;
};
@@ -444,6 +446,7 @@ struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device>
EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
+ EIGEN_UNROLL_LOOP
for (int i = 0; i < PacketSize; ++i) {
this->coeffRef(index+i) = values[i];
}
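
The Storage/EvaluatorPointerType pair threaded through the evaluator above is the pointer abstraction this patch is built on: on CPU devices the evaluator pointer stays a plain T*, while on SYCL it can be a wrapper carrying a virtual pointer that the evaluator later converts to device memory. A minimal sketch of the idea, with hypothetical names (the real trait is StorageMemory, declared in TensorForwardDeclarations.h, and is more involved):

// Illustrative sketch only, not the Eigen implementation: a
// device-dependent pointer trait in the spirit of StorageMemory<T, Device>.
#include <cstddef>

struct HostDeviceSketch {};   // stand-in for Eigen::DefaultDevice
struct SyclDeviceSketch {};   // stand-in for Eigen::SyclDevice

// Generic case: evaluator pointers are ordinary raw pointers.
template <typename T, typename Device>
struct StorageMemorySketch {
  typedef T* Type;
};

// SYCL case: wrap the pointer so it can carry a virtual-pointer offset
// instead of a dereferenceable host address.
template <typename T>
struct VirtualPointerSketch {
  T* ptr;                 // virtual pointer value, valid only as a key
  std::size_t offset;     // offset into the underlying SYCL buffer
};

template <typename T>
struct StorageMemorySketch<T, SyclDeviceSketch> {
  typedef VirtualPointerSketch<T> Type;
};

int main() {
  StorageMemorySketch<float, HostDeviceSketch>::Type p1 = nullptr;  // float*
  StorageMemorySketch<float, SyclDeviceSketch>::Type p2{nullptr, 0};
  (void)p1; (void)p2;
  return 0;
}

This is why evalSubExprsIfNeeded() and data() in the hunks above now traffic in EvaluatorPointerType rather than Scalar*: the same evaluator code compiles whether the pointer is raw or wrapped.
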
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
index 221dc96c9..3b1cbaabc 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
@@ -37,7 +37,7 @@ struct traits<TensorStridingOp<Strides, XprType> > : public traits<XprType>
template<typename Strides, typename XprType>
struct eval<TensorStridingOp<Strides, XprType>, Eigen::Dense>
{
- typedef const TensorStridingOp<Strides, XprType>& type;
+ typedef const TensorStridingOp<Strides, XprType>EIGEN_DEVICE_REF type;
};
template<typename Strides, typename XprType>
@@ -108,6 +108,8 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+ typedef StorageMemory<CoeffReturnType, Device> Storage;
+ typedef typename Storage::Type EvaluatorPointerType;
enum {
IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false,
@@ -120,7 +122,7 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
};
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
- : m_impl(op.expression(), device), m_strides(op.strides())
+ : m_impl(op.expression(), device)
{
m_dimensions = m_impl.dimensions();
for (int i = 0; i < NumDims; ++i) {
@@ -149,9 +151,10 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
}
}
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
m_impl.evalSubExprsIfNeeded(NULL);
return true;
}
@@ -173,6 +176,7 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
Index inputIndices[] = {0, 0};
Index indices[] = {index, index + PacketSize - 1};
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ EIGEN_UNROLL_LOOP
for (int i = NumDims - 1; i > 0; --i) {
const Index idx0 = indices[0] / m_outputStrides[i];
const Index idx1 = indices[1] / m_outputStrides[i];
@@ -184,6 +188,7 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
inputIndices[0] += indices[0] * m_inputStrides[0];
inputIndices[1] += indices[1] * m_inputStrides[0];
} else { // RowMajor
+ EIGEN_UNROLL_LOOP
for (int i = 0; i < NumDims - 1; ++i) {
const Index idx0 = indices[0] / m_outputStrides[i];
const Index idx1 = indices[1] / m_outputStrides[i];
@@ -203,6 +208,7 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
values[0] = m_impl.coeff(inputIndices[0]);
values[PacketSize-1] = m_impl.coeff(inputIndices[1]);
+ EIGEN_UNROLL_LOOP
for (int i = 1; i < PacketSize-1; ++i) {
values[i] = coeff(index+i);
}
@@ -225,18 +231,20 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
}
- EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; }
-
- /// required by sycl in order to extract the accessor
- const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
- /// required by sycl in order to extract the accessor
- Strides functor() const { return m_strides; }
+ EIGEN_DEVICE_FUNC typename Storage::Type data() const { return NULL; }
+#ifdef EIGEN_USE_SYCL
+ // binding placeholder accessors to a command group handler for SYCL
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+ m_impl.bind(cgh);
+ }
+#endif
protected:
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const
{
Index inputIndex = 0;
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ EIGEN_UNROLL_LOOP
for (int i = NumDims - 1; i > 0; --i) {
const Index idx = index / m_outputStrides[i];
inputIndex += idx * m_inputStrides[i];
@@ -244,6 +252,7 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
}
inputIndex += index * m_inputStrides[0];
} else { // RowMajor
+ EIGEN_UNROLL_LOOP
for (int i = 0; i < NumDims - 1; ++i) {
const Index idx = index / m_outputStrides[i];
inputIndex += idx * m_inputStrides[i];
@@ -258,7 +267,6 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
array<Index, NumDims> m_outputStrides;
array<Index, NumDims> m_inputStrides;
TensorEvaluator<ArgType, Device> m_impl;
- const Strides m_strides;
};
// Eval as lvalue
@@ -296,11 +304,6 @@ struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device>
return this->m_impl.coeffRef(this->srcCoeff(index));
}
- /// required by sycl in order to extract the accessor
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const { return this->m_impl; }
- /// required by sycl in order to extract the accessor
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Strides functor() const { return this->m_strides; }
-
template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void writePacket(Index index, const PacketReturnType& x)
{
@@ -310,6 +313,7 @@ struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device>
Index inputIndices[] = {0, 0};
Index indices[] = {index, index + PacketSize - 1};
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ EIGEN_UNROLL_LOOP
for (int i = NumDims - 1; i > 0; --i) {
const Index idx0 = indices[0] / this->m_outputStrides[i];
const Index idx1 = indices[1] / this->m_outputStrides[i];
@@ -321,6 +325,7 @@ struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device>
inputIndices[0] += indices[0] * this->m_inputStrides[0];
inputIndices[1] += indices[1] * this->m_inputStrides[0];
} else { // RowMajor
+ EIGEN_UNROLL_LOOP
for (int i = 0; i < NumDims - 1; ++i) {
const Index idx0 = indices[0] / this->m_outputStrides[i];
const Index idx1 = indices[1] / this->m_outputStrides[i];
@@ -340,6 +345,7 @@ struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device>
internal::pstore<Scalar, PacketReturnType>(values, x);
this->m_impl.coeffRef(inputIndices[0]) = values[0];
this->m_impl.coeffRef(inputIndices[1]) = values[PacketSize-1];
+ EIGEN_UNROLL_LOOP
for (int i = 1; i < PacketSize-1; ++i) {
this->coeffRef(index+i) = values[i];
}
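
EIGEN_UNROLL_LOOP, inserted before every fixed-trip-count packet loop above, is the loop-unrolling control mentioned in the commit message. Its actual definition lives in TensorMacros.h; the following is only a plausible sketch of such a compiler-dispatching hint, and the compiler conditions here are assumptions:

// Sketch of an unroll hint in the spirit of EIGEN_UNROLL_LOOP; the exact
// compiler checks Eigen uses may differ.
#if defined(__clang__) || defined(__SYCL_DEVICE_ONLY__)
#define UNROLL_LOOP_SKETCH _Pragma("unroll")
#elif defined(__GNUC__) && (__GNUC__ >= 8)
#define UNROLL_LOOP_SKETCH _Pragma("GCC unroll 16")
#else
#define UNROLL_LOOP_SKETCH  // no-op where no pragma is available
#endif

float sum4(const float* values) {
  float acc = 0.f;
  UNROLL_LOOP_SKETCH
  for (int i = 0; i < 4; ++i) {   // trip count known at compile time,
    acc += values[i];             // so the hint can fully unroll the loop
  }
  return acc;
}

Because PacketSize is a compile-time constant, the hint lets the compiler, and in particular the SYCL device compiler, fully unroll the element-wise gather and scatter loops in the packet paths above.
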
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h b/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h
index 9fc54a4c0..d04b1bea7 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h
@@ -91,6 +91,8 @@ struct TensorEvaluator<const TensorTraceOp<Dims, ArgType>, Device>
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+ typedef StorageMemory<CoeffReturnType, Device> Storage;
+ typedef typename Storage::Type EvaluatorPointerType;
enum {
IsAligned = false,
@@ -205,7 +207,7 @@ struct TensorEvaluator<const TensorTraceOp<Dims, ArgType>, Device>
return m_dimensions;
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
m_impl.evalSubExprsIfNeeded(NULL);
return true;
}
@@ -249,6 +251,13 @@ struct TensorEvaluator<const TensorTraceOp<Dims, ArgType>, Device>
return result;
}
+#ifdef EIGEN_USE_SYCL
+ // binding placeholder accessors to a command group handler for SYCL
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+ m_impl.bind(cgh);
+ }
+#endif
+
protected:
// Given the output index, finds the first index in the input tensor used to compute the trace
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
@@ -276,7 +285,7 @@ struct TensorEvaluator<const TensorTraceOp<Dims, ArgType>, Device>
TensorEvaluator<ArgType, Device> m_impl;
// Initialize the size of the trace dimension
Index m_traceDim;
- const Device& m_device;
+ const Device EIGEN_DEVICE_REF m_device;
array<bool, NumInputDims> m_reduced;
array<Index, NumReducedDims> m_reducedDims;
array<Index, NumOutputDims> m_outputStrides;
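
The bind(cl::sycl::handler&) member that each evaluator gains in this patch forwards recursively through the expression tree, so every leaf holding a SYCL placeholder accessor is registered with the command group before the kernel is submitted; interior nodes such as the trace evaluator above simply delegate to their child. A simplified sketch of the two roles, using SYCL 1.2.1-style accessors as this patch does (names are illustrative):

// Sketch of the recursive bind pattern: interior evaluator nodes forward
// bind() to their children; only leaf nodes touch the handler. Guarded so
// it compiles only where a SYCL implementation is present.
#ifdef EIGEN_USE_SYCL
#include <CL/sycl.hpp>

template <typename Scalar>
struct LeafEvaluatorSketch {
  // Placeholder accessor created when the buffer was registered with the
  // device; it is not yet attached to any command group.
  cl::sycl::accessor<Scalar, 1, cl::sycl::access::mode::read_write,
                     cl::sycl::access::target::global_buffer,
                     cl::sycl::access::placeholder::true_t> m_acc;

  void bind(cl::sycl::handler& cgh) const {
    cgh.require(m_acc);  // attach the placeholder to this command group
  }
};

template <typename ChildEvaluator>
struct InteriorEvaluatorSketch {
  ChildEvaluator m_impl;
  void bind(cl::sycl::handler& cgh) const {
    m_impl.bind(cgh);    // pure forwarding, exactly as in the hunks above
  }
};
#endif  // EIGEN_USE_SYCL
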
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h b/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h
index 0a394c88d..4aec83cff 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h
@@ -58,9 +58,6 @@ struct traits<Tensor<Scalar_, NumIndices_, Options_, IndexType_> >
};
template <typename T> struct MakePointer {
typedef T* Type;
- typedef T& RefType;
- typedef T ScalarType;
-
};
typedef typename MakePointer<Scalar>::Type PointerType;
};
@@ -80,9 +77,6 @@ struct traits<TensorFixedSize<Scalar_, Dimensions, Options_, IndexType_> >
};
template <typename T> struct MakePointer {
typedef T* Type;
- typedef T& RefType;
- typedef T ScalarType;
-
};
typedef typename MakePointer<Scalar>::Type PointerType;
};
@@ -106,10 +100,6 @@ struct traits<TensorMap<PlainObjectType, Options_, MakePointer_> >
// Intermediate typedef to workaround MSVC issue.
typedef MakePointer_<T> MakePointerT;
typedef typename MakePointerT::Type Type;
- typedef typename MakePointerT::RefType RefType;
- typedef typename MakePointerT::ScalarType ScalarType;
-
-
};
typedef typename MakePointer<Scalar>::Type PointerType;
};
@@ -135,49 +125,49 @@ struct traits<TensorRef<PlainObjectType> >
template<typename _Scalar, int NumIndices_, int Options, typename IndexType_>
struct eval<Tensor<_Scalar, NumIndices_, Options, IndexType_>, Eigen::Dense>
{
- typedef const Tensor<_Scalar, NumIndices_, Options, IndexType_>& type;
+ typedef const Tensor<_Scalar, NumIndices_, Options, IndexType_>EIGEN_DEVICE_REF type;
};
template<typename _Scalar, int NumIndices_, int Options, typename IndexType_>
struct eval<const Tensor<_Scalar, NumIndices_, Options, IndexType_>, Eigen::Dense>
{
- typedef const Tensor<_Scalar, NumIndices_, Options, IndexType_>& type;
+ typedef const Tensor<_Scalar, NumIndices_, Options, IndexType_>EIGEN_DEVICE_REF type;
};
template<typename Scalar_, typename Dimensions, int Options, typename IndexType_>
struct eval<TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>, Eigen::Dense>
{
- typedef const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>& type;
+ typedef const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>EIGEN_DEVICE_REF type;
};
template<typename Scalar_, typename Dimensions, int Options, typename IndexType_>
struct eval<const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>, Eigen::Dense>
{
- typedef const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>& type;
+ typedef const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>EIGEN_DEVICE_REF type;
};
template<typename PlainObjectType, int Options, template <class> class MakePointer>
struct eval<TensorMap<PlainObjectType, Options, MakePointer>, Eigen::Dense>
{
- typedef const TensorMap<PlainObjectType, Options, MakePointer>& type;
+ typedef const TensorMap<PlainObjectType, Options, MakePointer>EIGEN_DEVICE_REF type;
};
template<typename PlainObjectType, int Options, template <class> class MakePointer>
struct eval<const TensorMap<PlainObjectType, Options, MakePointer>, Eigen::Dense>
{
- typedef const TensorMap<PlainObjectType, Options, MakePointer>& type;
+ typedef const TensorMap<PlainObjectType, Options, MakePointer>EIGEN_DEVICE_REF type;
};
template<typename PlainObjectType>
struct eval<TensorRef<PlainObjectType>, Eigen::Dense>
{
- typedef const TensorRef<PlainObjectType>& type;
+ typedef const TensorRef<PlainObjectType>EIGEN_DEVICE_REF type;
};
template<typename PlainObjectType>
struct eval<const TensorRef<PlainObjectType>, Eigen::Dense>
{
- typedef const TensorRef<PlainObjectType>& type;
+ typedef const TensorRef<PlainObjectType>EIGEN_DEVICE_REF type;
};
// TODO nested<> does not exist anymore in Eigen/Core, and it thus has to be removed in favor of ref_selector.
@@ -189,50 +179,50 @@ template<typename T, int n=1, typename PlainObject = void> struct nested
template <typename Scalar_, int NumIndices_, int Options_, typename IndexType_>
struct nested<Tensor<Scalar_, NumIndices_, Options_, IndexType_> >
{
- typedef const Tensor<Scalar_, NumIndices_, Options_, IndexType_>& type;
+ typedef const Tensor<Scalar_, NumIndices_, Options_, IndexType_>EIGEN_DEVICE_REF type;
};
template <typename Scalar_, int NumIndices_, int Options_, typename IndexType_>
struct nested<const Tensor<Scalar_, NumIndices_, Options_, IndexType_> >
{
- typedef const Tensor<Scalar_, NumIndices_, Options_, IndexType_>& type;
+ typedef const Tensor<Scalar_, NumIndices_, Options_, IndexType_>EIGEN_DEVICE_REF type;
};
template <typename Scalar_, typename Dimensions, int Options, typename IndexType_>
struct nested<TensorFixedSize<Scalar_, Dimensions, Options, IndexType_> >
{
- typedef const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>& type;
+ typedef const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>EIGEN_DEVICE_REF type;
};
template <typename Scalar_, typename Dimensions, int Options, typename IndexType_>
struct nested<const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_> >
{
- typedef const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>& type;
+ typedef const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>EIGEN_DEVICE_REF type;
};
template <typename PlainObjectType, int Options, template <class> class MakePointer>
struct nested<TensorMap<PlainObjectType, Options, MakePointer> >
{
- typedef const TensorMap<PlainObjectType, Options, MakePointer>& type;
+ typedef const TensorMap<PlainObjectType, Options, MakePointer>EIGEN_DEVICE_REF type;
};
template <typename PlainObjectType, int Options, template <class> class MakePointer>
struct nested<const TensorMap<PlainObjectType, Options, MakePointer> >
{
- typedef const TensorMap<PlainObjectType, Options, MakePointer>& type;
+ typedef const TensorMap<PlainObjectType, Options, MakePointer>EIGEN_DEVICE_REF type;
};
template <typename PlainObjectType>
struct nested<TensorRef<PlainObjectType> >
{
- typedef const TensorRef<PlainObjectType>& type;
+ typedef const TensorRef<PlainObjectType>EIGEN_DEVICE_REF type;
};
template <typename PlainObjectType>
struct nested<const TensorRef<PlainObjectType> >
{
- typedef const TensorRef<PlainObjectType>& type;
+ typedef const TensorRef<PlainObjectType>EIGEN_DEVICE_REF type;
};
} // end namespace internal
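
EIGEN_DEVICE_REF, which replaces the plain & in the eval<> and nested<> typedefs above, exists because an expression captured into a SYCL kernel must be stored by value: device code cannot hold references to host-side objects. A plausible definition, where the device-compilation guard is an assumption (the real macro is added to TensorMacros.h by this patch):

// Sketch of a host/device reference switch in the spirit of
// EIGEN_DEVICE_REF; the __SYCL_DEVICE_ONLY__ guard is an assumption.
#ifdef __SYCL_DEVICE_ONLY__
#define DEVICE_REF_SKETCH        // store by value inside the kernel
#else
#define DEVICE_REF_SKETCH &      // keep reference semantics on the host
#endif

struct ExprSketch { int v; };

// Expands to "const ExprSketch&" on the host and "const ExprSketch"
// during SYCL device compilation, making nested expressions copyable.
typedef const ExprSketch DEVICE_REF_SKETCH nested_type_sketch;

The same macro is what turns the "const Device& m_device" members in the earlier hunks into by-value copies when the evaluator is compiled for the device.
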
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h
index c1b7a58ca..29a2d5538 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h
@@ -91,24 +91,6 @@ class TensorVolumePatchOp : public TensorBase<TensorVolumePatchOp<Planes, Rows,
m_padding_left(padding_left), m_padding_right(padding_right),
m_padding_type(PADDING_VALID), m_padding_value(padding_value) {}
-#ifdef EIGEN_USE_SYCL // this is work around for sycl as Eigen could not use c++11 deligate constructor feature
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorVolumePatchOp(const XprType& expr, DenseIndex patch_planes, DenseIndex patch_rows, DenseIndex patch_cols,
- DenseIndex plane_strides, DenseIndex row_strides, DenseIndex col_strides,
- DenseIndex in_plane_strides, DenseIndex in_row_strides, DenseIndex in_col_strides,
- DenseIndex plane_inflate_strides, DenseIndex row_inflate_strides, DenseIndex col_inflate_strides,
- bool padding_explicit, DenseIndex padding_top_z, DenseIndex padding_bottom_z,
- DenseIndex padding_top, DenseIndex padding_bottom, DenseIndex padding_left,
- DenseIndex padding_right, PaddingType padding_type, Scalar padding_value)
- : m_xpr(expr), m_patch_planes(patch_planes), m_patch_rows(patch_rows), m_patch_cols(patch_cols),
- m_plane_strides(plane_strides), m_row_strides(row_strides), m_col_strides(col_strides),
- m_in_plane_strides(in_plane_strides), m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides),
- m_plane_inflate_strides(plane_inflate_strides), m_row_inflate_strides(row_inflate_strides),
- m_col_inflate_strides(col_inflate_strides), m_padding_explicit(padding_explicit), m_padding_top_z(padding_top_z),
- m_padding_bottom_z(padding_bottom_z), m_padding_top(padding_top), m_padding_bottom(padding_bottom), m_padding_left(padding_left),
- m_padding_right(padding_right), m_padding_type(padding_type), m_padding_value(padding_value) {}
-
-#endif
-
EIGEN_DEVICE_FUNC
DenseIndex patch_planes() const { return m_patch_planes; }
EIGEN_DEVICE_FUNC
@@ -195,6 +177,8 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+ typedef StorageMemory<CoeffReturnType, Device> Storage;
+ typedef typename Storage::Type EvaluatorPointerType;
enum {
IsAligned = false,
@@ -205,16 +189,9 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D
CoordAccess = false,
RawAccess = false
};
-#ifdef __SYCL_DEVICE_ONLY__
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator( const XprType op, const Device& device)
-#else
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator( const XprType& op, const Device& device)
-#endif
- : m_impl(op.expression(), device)
-#ifdef EIGEN_USE_SYCL
- , m_op(op)
-#endif
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) :
+ m_impl(op.expression(), device)
{
EIGEN_STATIC_ASSERT((NumDims >= 5), YOU_MADE_A_PROGRAMMING_MISTAKE);
@@ -368,9 +345,10 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D
m_fastOutputDepth = internal::TensorIntDivisor<Index>(m_dimensions[NumDims-1]);
}
}
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
m_impl.evalSubExprsIfNeeded(NULL);
return true;
}
@@ -531,14 +509,10 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D
return TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
}
- EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; }
+ EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
-#ifdef EIGEN_USE_SYCL
- // Required by SYCL in order to construct the expression on the device
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const XprType& xpr() const { return m_op; }
-#endif
Index planePaddingTop() const { return m_planePaddingTop; }
Index rowPaddingTop() const { return m_rowPaddingTop; }
@@ -556,10 +530,17 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D
Index rowInflateStride() const { return m_row_inflate_strides; }
Index colInflateStride() const { return m_col_inflate_strides; }
+#ifdef EIGEN_USE_SYCL
+ // binding placeholder accessors to a command group handler for SYCL
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+ m_impl.bind(cgh);
+ }
+#endif
protected:
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const
{
EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+ EIGEN_UNROLL_LOOP
for (int i = 0; i < PacketSize; ++i) {
values[i] = coeff(index+i);
}
@@ -635,10 +616,6 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D
TensorEvaluator<ArgType, Device> m_impl;
-#ifdef EIGEN_USE_SYCL
-// Required by SYCL in order to construct the expression on the device
- XprType m_op;
-#endif
};
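
The deleted blocks in TensorVolumePatch.h show the previous SYCL strategy being retired: the evaluator used to carry a copy of the whole expression (m_op), plus a delegating-constructor workaround, so the expression could be rebuilt on the device. With the new pointer abstraction the evaluator itself is trivially copyable into the kernel, so the duplicate state and the xpr() accessor disappear and bind() is the only SYCL-specific hook left. Schematically (illustrative names, not actual Eigen code):

// Before/after sketch of the evaluator layout change.
struct XprSketch { int patch_params; };

struct ChildImplSketch {
  void bind(/* cl::sycl::handler& */) const {}
};

// Old approach: duplicate the expression inside the evaluator so the
// device side could reconstruct it from the captured copy.
struct OldEvaluatorSketch {
  ChildImplSketch m_impl;
  XprSketch m_op;  // SYCL-only duplicate state, now removed
  const XprSketch& xpr() const { return m_op; }
};

// New approach: the evaluator is self-contained and trivially copyable;
// its only SYCL-specific hook is bind(), forwarded to the child.
struct NewEvaluatorSketch {
  ChildImplSketch m_impl;
  void bind(/* cl::sycl::handler& cgh */) const { m_impl.bind(); }
};
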
diff --git a/unsupported/test/cxx11_tensor_executor.cpp b/unsupported/test/cxx11_tensor_executor.cpp
index 162dab7b8..8ea03382d 100644
--- a/unsupported/test/cxx11_tensor_executor.cpp
+++ b/unsupported/test/cxx11_tensor_executor.cpp
@@ -527,26 +527,32 @@ static void test_execute_generator_op(Device d)
}
}
+#ifdef EIGEN_DONT_VECTORIZE
+#define VECTORIZABLE(VAL) !EIGEN_DONT_VECTORIZE && VAL
+#else
+#define VECTORIZABLE(VAL) VAL
+#endif
+
#define CALL_SUBTEST_PART(PART) \
CALL_SUBTEST_##PART
#define CALL_SUBTEST_COMBINATIONS(PART, NAME, T, NUM_DIMS) \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, false, ColMajor>(default_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, true, ColMajor>(default_device))); \
- CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, true, false, ColMajor>(default_device))); \
- CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, true, true, ColMajor>(default_device))); \
+ CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), false, ColMajor>(default_device))); \
+ CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), true, ColMajor>(default_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, false, RowMajor>(default_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, true, RowMajor>(default_device))); \
- CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, true, false, RowMajor>(default_device))); \
- CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, true, true, RowMajor>(default_device))); \
+ CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), false, RowMajor>(default_device))); \
+ CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), true, RowMajor>(default_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, false, ColMajor>(tp_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, true, ColMajor>(tp_device))); \
- CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, true, false, ColMajor>(tp_device))); \
- CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, true, true, ColMajor>(tp_device))); \
+ CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), false, ColMajor>(tp_device))); \
+ CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), true, ColMajor>(tp_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, false, RowMajor>(tp_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, true, RowMajor>(tp_device))); \
- CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, true, false, RowMajor>(tp_device))); \
- CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, true, true, RowMajor>(tp_device)))
+ CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), false, RowMajor>(tp_device))); \
+ CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), true, RowMajor>(tp_device)))
EIGEN_DECLARE_TEST(cxx11_tensor_executor) {
Eigen::DefaultDevice default_device;
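
The VECTORIZABLE helper lets the same subtest table run in builds where vectorization is disabled: when EIGEN_DONT_VECTORIZE is defined, conventionally to 1 via -DEIGEN_DONT_VECTORIZE, every "true" in the Vectorizable template slot collapses to false instead of requesting packet paths that do not exist. A standalone sketch of the expansion (note it assumes the macro is defined to a numeric value, as -D gives by default):

// With -DEIGEN_DONT_VECTORIZE (defined as 1):
//   VECTORIZABLE_SKETCH(true)  ->  !1 && true  ->  false
// Without the flag:
//   VECTORIZABLE_SKETCH(true)  ->  true
#include <cstdio>

#ifdef EIGEN_DONT_VECTORIZE
#define VECTORIZABLE_SKETCH(VAL) !EIGEN_DONT_VECTORIZE && VAL
#else
#define VECTORIZABLE_SKETCH(VAL) VAL
#endif

int main() {
  std::printf("vectorized subtests enabled: %d\n",
              (int)(VECTORIZABLE_SKETCH(true)));
  return 0;
}
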
diff --git a/unsupported/test/cxx11_tensor_morphing.cpp b/unsupported/test/cxx11_tensor_morphing.cpp
index 4cbe15b63..eb708737d 100644
--- a/unsupported/test/cxx11_tensor_morphing.cpp
+++ b/unsupported/test/cxx11_tensor_morphing.cpp
@@ -51,7 +51,8 @@ static void test_static_reshape() {
// New dimensions: [2, 3, 7]
Eigen::IndexList<type2index<2>, type2index<3>, type2index<7>> dim;
- Tensor<float, 3> reshaped = tensor.reshape(dim);
+ Tensor<float, 3> reshaped = tensor.reshape(static_cast<Eigen::DSizes<long,3>>(dim));
+
for (int i = 0; i < 2; ++i) {
for (int j = 0; j < 3; ++j) {
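
The final hunk swaps a reshape on a compile-time Eigen::IndexList for one on runtime Eigen::DSizes via static_cast, presumably to keep the static-reshape test buildable under the SYCL toolchain. A standalone sketch of the same call pattern (assuming the unsupported Tensor module is on the include path):

// Standalone sketch of the reshape pattern from the test.
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Tensor<float, 2> tensor(6, 7);
  tensor.setRandom();

  // Compile-time dimension list [2, 3, 7], as in the test.
  Eigen::IndexList<Eigen::type2index<2>, Eigen::type2index<3>,
                   Eigen::type2index<7>> dim;

  // Convert the static index list to runtime sizes before reshaping,
  // mirroring the change in cxx11_tensor_morphing.cpp.
  Eigen::Tensor<float, 3> reshaped =
      tensor.reshape(static_cast<Eigen::DSizes<long, 3>>(dim));

  return reshaped.size() == tensor.size() ? 0 : 1;
}
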