Diffstat (limited to 'unsupported')
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h |   5
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h | 242
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h |   3
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h |   2
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h | 476
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h | 152
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h |  41
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h |   2
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h |   2
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h |  19
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h |  43
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h |   5
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h |  16
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h |   5
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h |  14
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h |  52
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h |  84
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h |  35
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h | 120
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h |  15
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h |  14
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h |  27
-rw-r--r--  unsupported/test/CMakeLists.txt |   3
-rw-r--r--  unsupported/test/cxx11_tensor_broadcast_sycl.cpp |   5
-rw-r--r--  unsupported/test/cxx11_tensor_builtins_sycl.cpp | 169
-rw-r--r--  unsupported/test/cxx11_tensor_concatenation_sycl.cpp | 110
-rw-r--r--  unsupported/test/cxx11_tensor_contract_sycl.cpp | 228
-rw-r--r--  unsupported/test/cxx11_tensor_convolution_sycl.cpp | 469
-rw-r--r--  unsupported/test/cxx11_tensor_device_sycl.cpp |  28
-rw-r--r--  unsupported/test/cxx11_tensor_forced_eval_sycl.cpp |  34
-rw-r--r--  unsupported/test/cxx11_tensor_morphing_sycl.cpp | 119
-rw-r--r--  unsupported/test/cxx11_tensor_padding_sycl.cpp |  22
-rw-r--r--  unsupported/test/cxx11_tensor_reduction_sycl.cpp | 136
-rw-r--r--  unsupported/test/cxx11_tensor_reverse_sycl.cpp | 221
-rw-r--r--  unsupported/test/cxx11_tensor_shuffling_sycl.cpp |  49
-rw-r--r--  unsupported/test/cxx11_tensor_striding_sycl.cpp | 203
-rw-r--r--  unsupported/test/cxx11_tensor_sycl.cpp | 151
37 files changed, 2577 insertions(+), 744 deletions(-)
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
index d32c20b5e..bf4a476d9 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
@@ -220,7 +220,7 @@ struct TensorContractionEvaluatorBase
m_rightImpl(choose(Cond<static_cast<int>(Layout) == static_cast<int>(ColMajor)>(),
op.rhsExpression(), op.lhsExpression()), device),
m_device(device),
- m_result(NULL), m_expr_indices(op.indices()) {
+ m_result(NULL) {
EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) ==
static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout)),
YOU_MADE_A_PROGRAMMING_MISTAKE);
@@ -844,9 +844,6 @@ protected:
TensorEvaluator<EvalRightArgType, Device> m_rightImpl;
const Device& m_device;
Scalar* m_result;
- /// required for sycl
- const Indices m_expr_indices;
-
bool m_can_use_xsmm;
};
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h
index b170a1a5c..e87de0c57 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h
@@ -22,7 +22,7 @@
#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H
namespace Eigen {
-template <typename LhsScalar, typename RhsScalar,bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered> struct LaunchSyclKernels;
+template <typename Index, typename LhsScalar, typename RhsScalar,bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered> struct LaunchSyclKernels;
template<typename Indices, typename LeftArgType, typename RightArgType>
struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, const Eigen::SyclDevice> :
public TensorContractionEvaluatorBase<TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, const Eigen::SyclDevice> > {
@@ -146,9 +146,9 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
      // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar))
this->m_device.memset(buffer, 0, m * n * sizeof(Scalar));
- LaunchSyclKernels<LhsScalar, RhsScalar,lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered>::Run(*this, buffer, m, n, k,
- this->m_k_strides, this->m_left_contracting_strides, this->m_right_contracting_strides,
- this->m_i_strides, this->m_j_strides, this->m_left_nocontract_strides, this->m_right_nocontract_strides);
+ LaunchSyclKernels<Index, LhsScalar, RhsScalar,lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered>::Run(*this, buffer, m, n, k,
+ this->m_k_strides, this->m_left_contracting_strides, this->m_right_contracting_strides,
+ this->m_i_strides, this->m_j_strides, this->m_left_nocontract_strides, this->m_right_nocontract_strides);
}
// required by sycl to construct the expr on the device. Returns original left_impl
const TensorEvaluator<LeftArgType, Device>& left_impl() const {
@@ -158,47 +158,18 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
const TensorEvaluator<RightArgType, Device>& right_impl() const {
return choose(Cond<static_cast<int>(Layout) == static_cast<int>(ColMajor)>(), this->m_rightImpl, this->m_leftImpl);
}
- // required by sycl to construct the expr on the device
- const Indices& indices() const {return this->m_expr_indices;}
};
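For readers unfamiliar with the choose/Cond idiom behind left_impl()/right_impl() above: for RowMajor layouts the contraction swaps lhs and rhs so the kernels can always assume ColMajor order. A minimal sketch of the idiom, with simplified stand-ins for Eigen's internal definitions:

    template <bool B> struct Cond {};
    template <typename T1, typename T2>
    const T1& choose(Cond<true>, const T1& first, const T2&) { return first; }   // ColMajor: keep order
    template <typename T1, typename T2>
    const T2& choose(Cond<false>, const T1&, const T2& second) { return second; } // RowMajor: swap

So choose(Cond<Layout == ColMajor>(), m_leftImpl, m_rightImpl) yields the evaluator the kernel should treat as the left operand.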
-/// Dummy container on the device. This is used to avoid calling the constructor of TensorEvaluator for TensorContractionOp. This makes the code much faster.
-template<typename Expr> struct TensorEvaluatorContainer;
-template<typename Indices, typename LeftArgType, typename RightArgType>
-struct TensorEvaluatorContainer<TensorContractionOp<Indices, LeftArgType, RightArgType>>{
- typedef Eigen::DefaultDevice Device;
- typedef TensorContractionOp<Indices, LeftArgType, RightArgType> XprType;
- typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
- typedef typename XprType::Index Index;
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, Eigen::DefaultDevice>::type PacketReturnType;
- enum {
- Layout = TensorEvaluator<LeftArgType, Device>::Layout,
- };
-
- typedef typename internal::conditional<static_cast<int>(Layout) == static_cast<int>(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType;
- typedef typename internal::conditional<static_cast<int>(Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType;
- typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator;
- typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator;
-
- TensorEvaluatorContainer(const XprType& op, const Eigen::DefaultDevice& device)
- : m_leftImpl(choose(Cond<static_cast<int>(Layout) == static_cast<int>(ColMajor)>(),
- op.lhsExpression(), op.rhsExpression()), device),
- m_rightImpl(choose(Cond<static_cast<int>(Layout) == static_cast<int>(ColMajor)>(),
- op.rhsExpression(), op.lhsExpression()), device){}
-LeftEvaluator m_leftImpl;
-RightEvaluator m_rightImpl;
-};
-
-
-template <typename HostExpr, typename OutScalar, typename LhsScalar, typename RhsScalar, typename FunctorExpr, typename LhsLocalAcc, typename RhsLocalAcc, typename OutAccessor, typename Index, typename ContractT, typename LeftNocontractT,
+template <typename HostExpr, typename OutScalar, typename LhsScalar, typename RhsScalar, typename LHSFunctorExpr, typename RHSFunctorExpr, typename LhsLocalAcc, typename RhsLocalAcc, typename OutAccessor, typename Index, typename ContractT, typename LeftNocontractT,
typename RightNocontractT, bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered,
-int TileSizeDimM, int TileSizeDimN,int TileSizeDimK, int WorkLoadPerThreadM,int WorkLoadPerThreadN,
-int LocalThreadSizeM, int LocalThreadSizeN, int LoadPerThreadLhs, int LoadPerThreadRhs, typename TupleType> struct KernelConstructor{
-
- typedef typename Eigen::TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
-
- FunctorExpr functors;
+typename HostExpr::Index TileSizeDimM, typename HostExpr::Index TileSizeDimN,typename HostExpr::Index TileSizeDimK, typename HostExpr::Index WorkLoadPerThreadM,typename HostExpr::Index WorkLoadPerThreadN,
+typename HostExpr::Index LocalThreadSizeM, typename HostExpr::Index LocalThreadSizeN, typename HostExpr::Index LoadPerThreadLhs, typename HostExpr::Index LoadPerThreadRhs, typename LHSTupleType, typename RHSTupleType, typename Device> struct KernelConstructor{
+ typedef typename Eigen::internal::traits<HostExpr>::_LhsNested LHSHostExpr;
+ typedef typename Eigen::internal::traits<HostExpr>::_RhsNested RHSHostExpr;
+ typedef typename Eigen::TensorSycl::internal::createPlaceHolderExpression<LHSHostExpr>::Type LHSPlaceHolderExpr;
+ typedef typename Eigen::TensorSycl::internal::createPlaceHolderExpression<RHSHostExpr>::Type RHSPlaceHolderExpr;
+ LHSFunctorExpr lhs_functors;
+ RHSFunctorExpr rhs_functors;
LhsLocalAcc localLhs;
RhsLocalAcc localRhs;
OutAccessor out_res;
@@ -206,119 +177,130 @@ int LocalThreadSizeM, int LocalThreadSizeN, int LoadPerThreadLhs, int LoadPerThr
ContractT m_k_strides, m_left_contracting_strides, m_right_contracting_strides;
LeftNocontractT m_i_strides, m_left_nocontract_strides;
RightNocontractT m_j_strides, m_right_nocontract_strides;
- TupleType tuple_of_accessors;
+ LHSTupleType left_tuple_of_accessors;
+ RHSTupleType right_tuple_of_accessors;
+ Device dev;
+
- KernelConstructor(FunctorExpr functors_, LhsLocalAcc localLhs_, RhsLocalAcc localRhs_, OutAccessor out_res_,
+ KernelConstructor(LHSFunctorExpr lhs_functors_, RHSFunctorExpr rhs_functors_, LhsLocalAcc localLhs_, RhsLocalAcc localRhs_, OutAccessor out_res_,
Index roundUpK_, Index M_, Index N_, Index K_, ContractT m_k_strides_, ContractT m_left_contracting_strides_,
ContractT m_right_contracting_strides_, LeftNocontractT m_i_strides_, RightNocontractT m_j_strides_,
- LeftNocontractT m_left_nocontract_strides_, RightNocontractT m_right_nocontract_strides_, TupleType tuple_of_accessors_)
- :functors(functors_), localLhs(localLhs_), localRhs(localRhs_), out_res(out_res_), roundUpK(roundUpK_), M(M_), N(N_), K(K_),
+ LeftNocontractT m_left_nocontract_strides_, RightNocontractT m_right_nocontract_strides_, LHSTupleType left_tuple_of_accessors_, RHSTupleType right_tuple_of_accessors_, Device dev_)
+ :lhs_functors(lhs_functors_), rhs_functors(rhs_functors_), localLhs(localLhs_), localRhs(localRhs_), out_res(out_res_), roundUpK(roundUpK_), M(M_), N(N_), K(K_),
m_k_strides(m_k_strides_), m_left_contracting_strides(m_left_contracting_strides_),
m_right_contracting_strides(m_right_contracting_strides_),
m_i_strides(m_i_strides_), m_left_nocontract_strides(m_left_nocontract_strides_),
m_j_strides(m_j_strides_), m_right_nocontract_strides(m_right_nocontract_strides_),
- tuple_of_accessors(tuple_of_accessors_){}
+ left_tuple_of_accessors(left_tuple_of_accessors_), right_tuple_of_accessors(right_tuple_of_accessors_), dev(dev_){}
void operator()(cl::sycl::nd_item<1> itemID) {
- typedef typename Eigen::TensorSycl::internal::ConvertToDeviceExpression<HostExpr>::Type DevExpr;
- auto device_expr =Eigen::TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
- auto device_evaluator = TensorEvaluatorContainer<DevExpr>(device_expr.expr, Eigen::DefaultDevice());
- typedef TensorEvaluatorContainer<DevExpr> DevEvaluator;
+ typedef typename Eigen::TensorSycl::internal::ConvertToDeviceExpression<HostExpr>::Type DevExpr;
+ typedef typename Eigen::TensorSycl::internal::ConvertToDeviceExpression<LHSHostExpr>::Type LHSDevExpr;
+ typedef typename Eigen::TensorSycl::internal::ConvertToDeviceExpression<RHSHostExpr>::Type RHSDevExpr;
+ auto lhs_dev_expr = Eigen::TensorSycl::internal::createDeviceExpression<LHSDevExpr, LHSPlaceHolderExpr>(lhs_functors, left_tuple_of_accessors);
+ auto rhs_dev_expr = Eigen::TensorSycl::internal::createDeviceExpression<RHSDevExpr, RHSPlaceHolderExpr>(rhs_functors, right_tuple_of_accessors);
+ typedef decltype(lhs_dev_expr.expr) LeftArgType;
+ typedef decltype(rhs_dev_expr.expr) RightArgType;
+ typedef typename internal::conditional<static_cast<int>(Eigen::internal::traits<DevExpr>::Layout) == static_cast<int>(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType;
+ typedef typename internal::conditional<static_cast<int>(Eigen::internal::traits<DevExpr>::Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType;
+ typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator;
+ typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator;
typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs,
- typename DevEvaluator::LeftEvaluator, LeftNocontractT,
+ LeftEvaluator, LeftNocontractT,
ContractT, 1,
lhs_inner_dim_contiguous,
false, Unaligned, MakeGlobalPointer> LhsMapper;
typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs,
- typename DevEvaluator::RightEvaluator, RightNocontractT,
+ RightEvaluator, RightNocontractT,
ContractT, 1,
rhs_inner_dim_contiguous,
rhs_inner_dim_reordered, Unaligned, MakeGlobalPointer> RhsMapper;
// initialize data mappers must happen inside the kernel for device eval
- LhsMapper lhs(device_evaluator.m_leftImpl, m_left_nocontract_strides, m_i_strides, m_left_contracting_strides, m_k_strides);
- RhsMapper rhs(device_evaluator.m_rightImpl, m_right_nocontract_strides, m_j_strides, m_right_contracting_strides, m_k_strides);
+ LhsMapper lhs(LeftEvaluator(choose(Cond<static_cast<int>(Eigen::internal::traits<DevExpr>::Layout) == static_cast<int>(ColMajor)>(),
+ lhs_dev_expr.expr, rhs_dev_expr.expr), dev), m_left_nocontract_strides, m_i_strides, m_left_contracting_strides, m_k_strides);
+ RhsMapper rhs(RightEvaluator(choose(Cond<static_cast<int>(Eigen::internal::traits<DevExpr>::Layout) == static_cast<int>(ColMajor)>(),
+ rhs_dev_expr.expr, lhs_dev_expr.expr),dev), m_right_nocontract_strides, m_j_strides, m_right_contracting_strides, m_k_strides);
auto out_ptr = ConvertToActualTypeSycl(OutScalar, out_res);
// Matmul Kernel
// Thread identifiers
- const int mLocalThreadId = itemID.get_local(0); // Local ID row
- const int nLocalThreadId = itemID.get_local(1); // Local ID col
- const int mGroupId = itemID.get_group(0); // Work-group ID row
- const int nGroupId = itemID.get_group(1); // Work-group ID localCol
- const int linearLocalThreadId = nLocalThreadId*LocalThreadSizeM + mLocalThreadId; // linear local thread ID
+ const Index mLocalThreadId = itemID.get_local(0); // Local ID row
+ const Index nLocalThreadId = itemID.get_local(1); // Local ID col
+ const Index mGroupId = itemID.get_group(0); // Work-group ID row
+ const Index nGroupId = itemID.get_group(1); // Work-group ID localCol
+ const Index linearLocalThreadId = nLocalThreadId*LocalThreadSizeM + mLocalThreadId; // linear local thread ID
// Allocate register space
float privateLhs;
float privateRhs[WorkLoadPerThreadN];
float privateRes[WorkLoadPerThreadM][WorkLoadPerThreadN];
    // Initialise the privateRes accumulation registers
- for (int wLPTM=0; wLPTM<WorkLoadPerThreadM; wLPTM++) {
- for (int wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) {
+ for (Index wLPTM=0; wLPTM<WorkLoadPerThreadM; wLPTM++) {
+ for (Index wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) {
privateRes[wLPTM][wLPTN] = 0.0f;
}
}
// Tile Lhs
- for (int lPTL=0; lPTL<LoadPerThreadLhs; lPTL++) {
- int
- localLhsLinearId = lPTL*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId;
- int localLhsRow = localLhsLinearId% TileSizeDimM;
- int localLhsCol = localLhsLinearId/TileSizeDimM;
+ for (Index lPTL=0; lPTL<LoadPerThreadLhs; lPTL++) {
+ Index localLhsLinearId = lPTL*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId;
+ Index localLhsRow = localLhsLinearId% TileSizeDimM;
+ Index localLhsCol = localLhsLinearId/TileSizeDimM;
// Load the value (wide vector load)
- int GlobalLhsColId = TileSizeDimK*0 + localLhsCol;
+ Index GlobalLhsColId = TileSizeDimK*0 + localLhsCol;
localLhs[0 + ((localLhsCol*TileSizeDimM + localLhsRow)*2)] =((GlobalLhsColId < K)&& (mGroupId*(TileSizeDimM)+ localLhsRow <M))? lhs(mGroupId*(TileSizeDimM) + localLhsRow, GlobalLhsColId):static_cast<OutScalar>(0);
}
// Tile Rhs
- for (int lPTR=0; lPTR<LoadPerThreadRhs; lPTR++) {
- int localRhsLinearId = lPTR*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId;
- int localRhsRow = localRhsLinearId% TileSizeDimN;
- int localRhsCol = localRhsLinearId/TileSizeDimN;
+ for (Index lPTR=0; lPTR<LoadPerThreadRhs; lPTR++) {
+ Index localRhsLinearId = lPTR*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId;
+ Index localRhsRow = localRhsLinearId% TileSizeDimN;
+ Index localRhsCol = localRhsLinearId/TileSizeDimN;
// Load the value (wide vector load)
- int GlobalRhsRowId = TileSizeDimK*0 + localRhsCol;
+ Index GlobalRhsRowId = TileSizeDimK*0 + localRhsCol;
localRhs[0 + ((localRhsCol*TileSizeDimN + localRhsRow) *2)] = ((GlobalRhsRowId < K)&& ((nGroupId*(TileSizeDimN) + localRhsRow)< N))? rhs(GlobalRhsRowId, nGroupId*(TileSizeDimN) + localRhsRow): static_cast<OutScalar>(0);
}
// Loop over all tiles
- const int numTiles = roundUpK/TileSizeDimK;
- int firstHalf=0;
+ const Index numTiles = roundUpK/TileSizeDimK;
+ Index firstHalf=0;
do {
// Synchronise
itemID.barrier(cl::sycl::access::fence_space::local_space);
// Load the next tile of Lhs and Rhs into local memory
- int nextHalf = firstHalf + 1;
+ Index nextHalf = firstHalf + 1;
if (nextHalf < numTiles) {
// Tile A
- for (int lPTL=0; lPTL<LoadPerThreadLhs; lPTL++) {
- int localLhsLinearId = lPTL*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId;
- int localLhsRow = localLhsLinearId% TileSizeDimM;
- int localLhsCol = localLhsLinearId/TileSizeDimM;
+ for (Index lPTL=0; lPTL<LoadPerThreadLhs; lPTL++) {
+ Index localLhsLinearId = lPTL*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId;
+ Index localLhsRow = localLhsLinearId% TileSizeDimM;
+ Index localLhsCol = localLhsLinearId/TileSizeDimM;
// global K id
- int GlobalLhsColId = TileSizeDimK*nextHalf + localLhsCol;
+ Index GlobalLhsColId = TileSizeDimK*nextHalf + localLhsCol;
// Store the loaded value into local memory
localLhs[(nextHalf%2) + ((localLhsCol*TileSizeDimM + localLhsRow) *2)] = ((GlobalLhsColId < K)&& (mGroupId*(TileSizeDimM)+ localLhsRow <M))? lhs(mGroupId*(TileSizeDimM) + localLhsRow, GlobalLhsColId): static_cast<OutScalar>(0);
}
// Tile B
- for (int lPTR=0; lPTR<LoadPerThreadRhs; lPTR++) {
- int localRhsLinearId = lPTR*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId;
- int localRhsRow = localRhsLinearId% TileSizeDimN;
- int localRhsCol = localRhsLinearId/TileSizeDimN;
+ for (Index lPTR=0; lPTR<LoadPerThreadRhs; lPTR++) {
+ Index localRhsLinearId = lPTR*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId;
+ Index localRhsRow = localRhsLinearId% TileSizeDimN;
+ Index localRhsCol = localRhsLinearId/TileSizeDimN;
// Load the value (wide vector load)
- int GlobalRhsRowId = TileSizeDimK*nextHalf + localRhsCol;
+ Index GlobalRhsRowId = TileSizeDimK*nextHalf + localRhsCol;
// Store the loaded vector into local memory
localRhs[(nextHalf%2) +((localRhsCol*TileSizeDimN + localRhsRow)*2)] = ((GlobalRhsRowId < K)&& ((nGroupId*(TileSizeDimN) + localRhsRow)< N))? rhs(GlobalRhsRowId, nGroupId*(TileSizeDimN) + localRhsRow):static_cast<OutScalar>(0);
}
}
// Loop over the values of a single tile
- for (int k=0; k<TileSizeDimK; k++) {
+ for (Index k=0; k<TileSizeDimK; k++) {
// Cache the values of localRhs in registers
- for (int wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) {
- int localRhsCol = nLocalThreadId + wLPTN*LocalThreadSizeN;
+ for (Index wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) {
+ Index localRhsCol = nLocalThreadId + wLPTN*LocalThreadSizeN;
privateRhs[wLPTN] = localRhs[(firstHalf%2) +((k*TileSizeDimN + localRhsCol)*2)];
}
// Perform the computation
- for (int wLPTM=0; wLPTM<WorkLoadPerThreadM; wLPTM++) {
- int localLhsRow = mLocalThreadId + wLPTM*LocalThreadSizeM;
+ for (Index wLPTM=0; wLPTM<WorkLoadPerThreadM; wLPTM++) {
+ Index localLhsRow = mLocalThreadId + wLPTM*LocalThreadSizeM;
privateLhs = localLhs[(firstHalf%2)+ ((k*TileSizeDimM + localLhsRow)*2)];
- for (int wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) {
+ for (Index wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) {
privateRes[wLPTM][wLPTN] += privateLhs * privateRhs[wLPTN];
}
}
@@ -327,13 +309,12 @@ int LocalThreadSizeM, int LocalThreadSizeN, int LoadPerThreadLhs, int LoadPerThr
firstHalf++;
} while (firstHalf<numTiles);
-
// Store the final results in C
- for (int wLPTM=0; wLPTM<WorkLoadPerThreadM; wLPTM++) {
- int globalRow = mGroupId*TileSizeDimM + mLocalThreadId + wLPTM*LocalThreadSizeM;
+ for (Index wLPTM=0; wLPTM<WorkLoadPerThreadM; wLPTM++) {
+ Index globalRow = mGroupId*TileSizeDimM + mLocalThreadId + wLPTM*LocalThreadSizeM;
if (globalRow< M){
- for (int wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) {
- int globalCol = nGroupId*TileSizeDimN + nLocalThreadId + wLPTN*LocalThreadSizeN;
+ for (Index wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) {
+ Index globalCol = nGroupId*TileSizeDimN + nLocalThreadId + wLPTN*LocalThreadSizeN;
if(globalCol<N)
out_ptr[globalCol*M + globalRow] = privateRes[wLPTM][wLPTN];
}
@@ -343,56 +324,73 @@ int LocalThreadSizeM, int LocalThreadSizeN, int LoadPerThreadLhs, int LoadPerThr
}
};
-template <typename LhsScalar, typename RhsScalar, bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered> struct LaunchSyclKernels {
-
-static const int TileSizeDimM = 32; // Tile size for dimension M
-static const int TileSizeDimN = 32; // Tile size for dimension N
-static const int TileSizeDimK = 16; // Tile size for dimension K
-static const int WorkLoadPerThreadM = 4; // Work load per thread in dimension M
-static const int WorkLoadPerThreadN = 4; // work load per thread in dimension N
-static const int LocalThreadSizeM = (TileSizeDimM/WorkLoadPerThreadM); // Local thread size for the first dimension (M here)
-static const int LocalThreadSizeN = (TileSizeDimN/WorkLoadPerThreadN); // Local thread size for the second dimension (N here)
-static const int LoadPerThreadLhs = ((TileSizeDimK*WorkLoadPerThreadM*WorkLoadPerThreadN)/(TileSizeDimN)); // workload per thread for Lhs expression
-static const int LoadPerThreadRhs = ((TileSizeDimK*WorkLoadPerThreadM*WorkLoadPerThreadN)/(TileSizeDimM)); // workload per thread for Rhs expression
+template <typename Index, typename LhsScalar, typename RhsScalar, bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered> struct LaunchSyclKernels {
+
+static const Index TileSizeDimM = 32ul; // Tile size for dimension M
+static const Index TileSizeDimN = 32ul; // Tile size for dimension N
+static const Index TileSizeDimK = 16ul; // Tile size for dimension K
+static const Index WorkLoadPerThreadM = 4ul; // Work load per thread in dimension M
+static const Index WorkLoadPerThreadN = 4ul; // work load per thread in dimension N
+static const Index LocalThreadSizeM = (TileSizeDimM/WorkLoadPerThreadM); // Local thread size for the first dimension (M here)
+static const Index LocalThreadSizeN = (TileSizeDimN/WorkLoadPerThreadN); // Local thread size for the second dimension (N here)
+static const Index LoadPerThreadLhs = ((TileSizeDimK*WorkLoadPerThreadM*WorkLoadPerThreadN)/(TileSizeDimN)); // workload per thread for Lhs expression
+static const Index LoadPerThreadRhs = ((TileSizeDimK*WorkLoadPerThreadM*WorkLoadPerThreadN)/(TileSizeDimM)); // workload per thread for Rhs expression
// RoundUp function to make sure that the global range is divisible by the local (work-group) range
-static int RoundUp(int x, int y) {
+static Index RoundUp(Index x, Index y) {
return ((((x) + (y) - 1) / (y))*(y));
}
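RoundUp pads each problem dimension to the next multiple of its tile size, so the nd_range divides evenly into work-groups; threads that fall outside M, N or K are masked by the bounds checks inside the kernel above. A standalone sanity check of the rounding rule, assuming a plain signed index type:

    #include <cassert>
    // Same rounding rule as RoundUp above, instantiated with long for a quick check.
    static long round_up(long x, long y) { return ((x + y - 1) / y) * y; }
    int main() {
      assert(round_up(100, 16) == 112); // K = 100 padded up to the 16-wide K tile
      assert(round_up(64, 32) == 64);   // already a multiple: unchanged
      return 0;
    }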
-template< typename Self, typename OutScalar, typename Index, typename ContractT, typename LeftNocontractT, typename RightNocontractT>
+template< typename Self, typename OutScalar, typename ContractT, typename LeftNocontractT, typename RightNocontractT>
static void Run(const Self& self, OutScalar* buffer, Index M, Index N, Index K,
ContractT m_k_strides, ContractT m_left_contracting_strides, ContractT m_right_contracting_strides,
LeftNocontractT m_i_strides, RightNocontractT m_j_strides, LeftNocontractT m_left_nocontract_strides, RightNocontractT m_right_nocontract_strides){
- // create a tuple of accessors from Evaluator
+
typedef typename Self::XprType HostExpr;
- // typedef typename Eigen::TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
- // typedef KernelNameConstructor<PlaceHolderExpr, lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered> KernelName;
- auto functors = Eigen::TensorSycl::internal::extractFunctors(self);
- typedef decltype(functors) FunctorExpr;
+ typedef typename Eigen::internal::traits<HostExpr>::_LhsNested LHSHostExpr;
+ typedef typename Eigen::internal::traits<HostExpr>::_RhsNested RHSHostExpr;
+ typedef TensorEvaluator<LHSHostExpr, const Eigen::SyclDevice> OrigLHSExpr;
+ typedef TensorEvaluator<RHSHostExpr, const Eigen::SyclDevice> OrigRHSExpr;
+ typedef Eigen::TensorSycl::internal::FunctorExtractor<OrigLHSExpr> LHSFunctorExpr;
+ typedef Eigen::TensorSycl::internal::FunctorExtractor<OrigRHSExpr> RHSFunctorExpr;
+ // extract lhs functor list
+ LHSFunctorExpr lhs_functors = Eigen::TensorSycl::internal::extractFunctors(self.left_impl());
+ // extract rhs functor list
+ RHSFunctorExpr rhs_functors = Eigen::TensorSycl::internal::extractFunctors(self.right_impl());
+
Index roundUpK = RoundUp(K, TileSizeDimK);
Index roundUpM = RoundUp(M, TileSizeDimM);
Index roundUpN = RoundUp(N, TileSizeDimN);
+
self.device().sycl_queue().submit([&](cl::sycl::handler &cgh) {
- auto tuple_of_accessors = Eigen::TensorSycl::internal::createTupleOfAccessors<Self>(cgh, self);
- typedef decltype(tuple_of_accessors) TupleType;
+ /// work-around for the gcc 4.8 auto-deduction bug (same issue as in TensorConvolutionSycl.h)
+ typedef decltype(Eigen::TensorSycl::internal::createTupleOfAccessors<OrigLHSExpr>(cgh, self.left_impl())) LHSTupleType;
+ /// work-around for the gcc 4.8 auto-deduction bug (same issue as in TensorConvolutionSycl.h)
+ typedef decltype(Eigen::TensorSycl::internal::createTupleOfAccessors<OrigRHSExpr>(cgh, self.right_impl())) RHSTupleType;
+ // create lhs tuple of accessors
+ LHSTupleType left_tuple_of_accessors = Eigen::TensorSycl::internal::createTupleOfAccessors<OrigLHSExpr>(cgh, self.left_impl());
+ // create rhs tuple of accessors
+ RHSTupleType right_tuple_of_accessors = Eigen::TensorSycl::internal::createTupleOfAccessors<OrigRHSExpr>(cgh, self.right_impl());
+
// Local memory for elements of Lhs
typedef cl::sycl::accessor<LhsScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> LhsLocalAcc;
LhsLocalAcc localLhs(cl::sycl::range<1>(2* TileSizeDimM * TileSizeDimK), cgh);
// Local memory for elements of Rhs
typedef cl::sycl::accessor<RhsScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> RhsLocalAcc;
RhsLocalAcc localRhs(cl::sycl::range<1>(2* TileSizeDimK * TileSizeDimN), cgh);
+
+ typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer> OutAccessor;
//OutScalar memory
- auto out_res= self.device(). template get_sycl_accessor<cl::sycl::access::mode::write>(cgh, buffer);
- typedef decltype(out_res) OutAccessor;
+ OutAccessor out_res= self.device(). template get_sycl_accessor<cl::sycl::access::mode::write>(cgh, buffer);
+
// sycl parallel for
cgh.parallel_for(cl::sycl::nd_range<2>(cl::sycl::range<2>(roundUpM/WorkLoadPerThreadM, roundUpN/WorkLoadPerThreadN),
cl::sycl::range<2>(LocalThreadSizeM, LocalThreadSizeN)),
- KernelConstructor<HostExpr, OutScalar, LhsScalar, RhsScalar, FunctorExpr, LhsLocalAcc, RhsLocalAcc, OutAccessor, Index, ContractT, LeftNocontractT,
+ KernelConstructor<HostExpr, OutScalar, LhsScalar, RhsScalar, LHSFunctorExpr, RHSFunctorExpr, LhsLocalAcc, RhsLocalAcc, OutAccessor, Index, ContractT, LeftNocontractT,
RightNocontractT, lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, TileSizeDimM, TileSizeDimN, TileSizeDimK,
- WorkLoadPerThreadM, WorkLoadPerThreadN, LocalThreadSizeM, LocalThreadSizeN, LoadPerThreadLhs, LoadPerThreadRhs, TupleType>(functors,
+ WorkLoadPerThreadM, WorkLoadPerThreadN, LocalThreadSizeM, LocalThreadSizeN, LoadPerThreadLhs, LoadPerThreadRhs, LHSTupleType, RHSTupleType, Eigen::DefaultDevice>(lhs_functors, rhs_functors,
localLhs, localRhs, out_res, roundUpK, M, N, K, m_k_strides, m_left_contracting_strides, m_right_contracting_strides,m_i_strides, m_j_strides,
- m_left_nocontract_strides,m_right_nocontract_strides, tuple_of_accessors));
+ m_left_nocontract_strides,m_right_nocontract_strides, left_tuple_of_accessors, right_tuple_of_accessors, Eigen::DefaultDevice()));
});
self.device().asynchronousExec();
}
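For context, a host-side sketch of the path that reaches Run() above, modelled on cxx11_tensor_contract_sycl.cpp; the selector-based SyclDevice construction and the tensor sizes are illustrative assumptions, not part of this patch:

    #define EIGEN_USE_SYCL
    #include <unsupported/Eigen/CXX11/Tensor>

    int main() {
      typedef Eigen::Tensor<float, 2>::DimensionPair DimPair;
      cl::sycl::gpu_selector selector;
      Eigen::SyclDevice sycl_device(selector);
      Eigen::Tensor<float, 2> t_left(128, 64), t_right(64, 32), t_result(128, 32);
      t_left.setRandom(); t_right.setRandom();
      Eigen::array<DimPair, 1> dims = {{DimPair(1, 0)}}; // contract lhs dim 1 with rhs dim 0
      float* d_left  = static_cast<float*>(sycl_device.allocate(t_left.size()   * sizeof(float)));
      float* d_right = static_cast<float*>(sycl_device.allocate(t_right.size()  * sizeof(float)));
      float* d_res   = static_cast<float*>(sycl_device.allocate(t_result.size() * sizeof(float)));
      Eigen::TensorMap<Eigen::Tensor<float, 2>> gpu_left(d_left, 128, 64);
      Eigen::TensorMap<Eigen::Tensor<float, 2>> gpu_right(d_right, 64, 32);
      Eigen::TensorMap<Eigen::Tensor<float, 2>> gpu_res(d_res, 128, 32);
      sycl_device.memcpyHostToDevice(d_left,  t_left.data(),  t_left.size()  * sizeof(float));
      sycl_device.memcpyHostToDevice(d_right, t_right.data(), t_right.size() * sizeof(float));
      gpu_res.device(sycl_device) = gpu_left.contract(gpu_right, dims); // dispatches to Run() above
      sycl_device.memcpyDeviceToHost(t_result.data(), d_res, t_result.size() * sizeof(float));
      return 0;
    }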
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h
index 860a6949a..b29968b63 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h
@@ -246,6 +246,9 @@ struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device>
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+ /// required by sycl in order to extract the sycl accessor
+ const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
+
protected:
template <int LoadMode, bool ActuallyVectorize>
struct PacketConv {
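The conversion evaluator is what expressions like .cast<NewType>() produce; exposing impl() lets the SYCL accessor-extraction pass reach the buffer of the wrapped evaluator. A hedged usage sketch (device setup as in the contraction example earlier; sizes illustrative):

    Eigen::Tensor<float, 1>  in(64);  in.setRandom();
    Eigen::Tensor<double, 1> out(64);
    float*  d_in  = static_cast<float*> (sycl_device.allocate(64 * sizeof(float)));
    double* d_out = static_cast<double*>(sycl_device.allocate(64 * sizeof(double)));
    Eigen::TensorMap<Eigen::Tensor<float, 1>>  gpu_in(d_in, 64);
    Eigen::TensorMap<Eigen::Tensor<double, 1>> gpu_out(d_out, 64);
    sycl_device.memcpyHostToDevice(d_in, in.data(), 64 * sizeof(float));
    gpu_out.device(sycl_device) = gpu_in.cast<double>(); // a TensorConversionOp whose evaluator exposes impl()
    sycl_device.memcpyDeviceToHost(out.data(), d_out, 64 * sizeof(double));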
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
index abdf742c6..378f5cccb 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
@@ -100,7 +100,7 @@ class IndexMapper {
}
} else {
for (int i = NumDims - 1; i >= 0; --i) {
- if (i + 1 < offset) {
+ if (static_cast<size_t>(i + 1) < offset) {
m_cudaInputStrides[i] =
m_cudaInputStrides[i + 1] * cudaInputDimensions[i + 1];
m_cudaOutputStrides[i] =
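The static_cast addresses the signed/unsigned comparison between the int loop counter and the size_t offset; since i >= 0 inside the loop, the cast is value-preserving. A minimal illustration of the pattern:

    #include <cstddef>
    int main() {
      int i = 2;
      std::size_t offset = 4;
      // Without the cast, i + 1 is implicitly converted for the comparison and
      // compilers warn (-Wsign-compare); the cast makes the intent explicit.
      bool below = static_cast<std::size_t>(i + 1) < offset; // true
      return below ? 0 : 1;
    }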
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h
new file mode 100644
index 000000000..4247c1c4a
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h
@@ -0,0 +1,476 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Mehdi Goli Codeplay Software Ltd.
+// Ralph Potter Codeplay Software Ltd.
+// Luke Iwanski Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H
+#define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H
+
+namespace Eigen {
+
+/** \class TensorConvolution
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Tensor convolution class.
+ *
+ *
+ */
+template <typename CoeffReturnType, typename KernelType, typename HostExpr, typename FunctorExpr, typename Index,
+typename InputDims, typename Kernel_accessor, typename Buffer_accessor, typename Local_accessor, typename TupleType>
+struct EigenConvolutionKernel1D{
+typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
+internal::IndexMapper<Index, InputDims, 1, Eigen::internal::traits<HostExpr>::Layout> indexMapper;
+Kernel_accessor kernel_filter;
+const size_t kernelSize, range_x, range_y;
+Buffer_accessor buffer_acc;
+Local_accessor local_acc;
+FunctorExpr functors;
+TupleType tuple_of_accessors;
+EigenConvolutionKernel1D(internal::IndexMapper<Index, InputDims, 1, Eigen::internal::traits<HostExpr>::Layout> indexMapper_,
+ Kernel_accessor kernel_filter_, const size_t kernelSize_, const size_t range_x_, const size_t range_y_,
+ Buffer_accessor buffer_acc_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_)
+ :indexMapper(indexMapper_), kernel_filter(kernel_filter_), kernelSize(kernelSize_), range_x(range_x_), range_y(range_y_),
+ buffer_acc(buffer_acc_), local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {}
+
+ void operator()(cl::sycl::nd_item<2> itemID) {
+ typedef typename TensorSycl::internal::ConvertToDeviceExpression<HostExpr>::Type DevExpr;
+ auto device_expr =TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
+ auto device_evaluator = Eigen::TensorEvaluator<DevExpr, Eigen::DefaultDevice>(device_expr.expr, Eigen::DefaultDevice());
+
+ auto buffer_ptr = ConvertToActualTypeSycl(CoeffReturnType, buffer_acc);
+ auto kernel_ptr = ConvertToActualTypeSycl(KernelType, kernel_filter);
+
+ const size_t num_x_input = (itemID.get_local_range()[0] + kernelSize - 1); // inputs needed along x for each plane in shared memory
+ const size_t plane_kernel_offset = itemID.get_local(1) * num_x_input;
+ const size_t first_input_start = itemID.get_group(0)*itemID.get_local_range()[0];
+ const size_t plane_tensor_offset =indexMapper.mapCudaInputPlaneToTensorInputOffset(itemID.get_global(1));
+ /// fill the shared memory
+ for (size_t i = itemID.get_local(0); i < num_x_input ; i += itemID.get_local_range()[0]) {
+ const size_t local_index = i + plane_kernel_offset ;
+ const size_t tensor_index = plane_tensor_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i + first_input_start);
+ if(((i + first_input_start) < (range_x +kernelSize-1)) && itemID.get_global(1)< range_y){
+ local_acc[local_index] = device_evaluator.coeff(tensor_index);
+ }
+ else local_acc[local_index]=0.0f;
+ }
+
+ itemID.barrier(cl::sycl::access::fence_space::local_space);
+
+ // calculate the convolution
+ const size_t first_output_start =itemID.get_group(0)*(itemID.get_local_range()[0]); // output start x
+ if(itemID.get_global(0)< range_x && itemID.get_global(1)< range_y){
+ CoeffReturnType result = static_cast<CoeffReturnType>(0);
+ const size_t index = plane_kernel_offset+ itemID.get_local(0);
+ for (size_t k = 0; k < kernelSize; ++k) {
+ result += (local_acc[k + index] * kernel_ptr[k]);
+ }
+ const size_t tensor_index = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(itemID.get_global(1))
+ +indexMapper.mapCudaOutputKernelToTensorOutputOffset(itemID.get_local(0) + first_output_start);
+ buffer_ptr[tensor_index] = result;
+ }
+ }
+};
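The local-memory sizing logic above: a work-group computing tileSize_x outputs per plane must stage tileSize_x + kernelSize - 1 inputs, since each output reads a kernelSize-wide window. A standalone check of the arithmetic, matching the 1D shared_mem formula in executeEval below:

    #include <cassert>
    #include <cstddef>
    int main() {
      const std::size_t tileSize_x = 64, kernel_size = 7, tileSize_y = 4;
      const std::size_t num_x_input = tileSize_x + kernel_size - 1; // inputs staged per plane
      const std::size_t shared_mem  = num_x_input * tileSize_y;     // one x-strip per plane in the tile
      assert(shared_mem == 70 * 4);
      return 0;
    }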
+
+
+template <typename CoeffReturnType, typename KernelType, typename HostExpr, typename FunctorExpr, typename Index,
+typename InputDims, typename Kernel_accessor, typename Buffer_accessor, typename Local_accessor, typename TupleType>
+struct EigenConvolutionKernel2D{
+typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
+internal::IndexMapper<Index, InputDims, 2, Eigen::internal::traits<HostExpr>::Layout> indexMapper;
+Kernel_accessor kernel_filter;
+const size_t kernelSize_x, kernelSize_y, range_x, range_y , range_z;
+Buffer_accessor buffer_acc;
+Local_accessor local_acc;
+FunctorExpr functors;
+TupleType tuple_of_accessors;
+EigenConvolutionKernel2D(internal::IndexMapper<Index, InputDims, 2, Eigen::internal::traits<HostExpr>::Layout> indexMapper_,
+ Kernel_accessor kernel_filter_, const size_t kernelSize_x_, const size_t kernelSize_y_ ,const size_t range_x_, const size_t range_y_, const size_t range_z_,
+ Buffer_accessor buffer_acc_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_)
+ :indexMapper(indexMapper_), kernel_filter(kernel_filter_), kernelSize_x(kernelSize_x_), kernelSize_y(kernelSize_y_), range_x(range_x_), range_y(range_y_), range_z(range_z_),
+ buffer_acc(buffer_acc_), local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {}
+
+ void operator()(cl::sycl::nd_item<3> itemID) {
+ typedef typename TensorSycl::internal::ConvertToDeviceExpression<HostExpr>::Type DevExpr;
+ auto device_expr =TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
+ auto device_evaluator = Eigen::TensorEvaluator<DevExpr, Eigen::DefaultDevice>(device_expr.expr, Eigen::DefaultDevice());
+
+ auto buffer_ptr = ConvertToActualTypeSycl(CoeffReturnType, buffer_acc);
+ auto kernel_ptr = ConvertToActualTypeSycl(KernelType, kernel_filter);
+ const size_t num_x_input = (itemID.get_local_range()[0] + kernelSize_x - 1); // inputs needed along x for each plane in shared memory
+ const size_t num_y_input = (itemID.get_local_range()[1] + kernelSize_y - 1); // inputs needed along y for each plane in shared memory
+ const size_t plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(itemID.get_global(2));
+ const size_t plane_kernel_offset = itemID.get_local(2) * num_y_input;
+
+ /// fill the shared memory
+ const size_t first_x_input_start = itemID.get_group(0)*itemID.get_local_range()[0];
+ const size_t first_y_input_start = itemID.get_group(1)*itemID.get_local_range()[1];
+ for (size_t j = itemID.get_local(1); j < num_y_input; j += itemID.get_local_range()[1]) {
+ const size_t local_input_offset = num_x_input * (j + plane_kernel_offset);
+ for (size_t i = itemID.get_local(0); i < num_x_input ; i += itemID.get_local_range()[0]) {
+ const size_t local_index = i + local_input_offset;
+ const size_t tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i + first_x_input_start, j+ first_y_input_start );
+ if(((i + first_x_input_start) < (range_x +kernelSize_x-1)) &&((j + first_y_input_start) < (range_y +kernelSize_y-1)) && itemID.get_global(2)< range_z){
+ local_acc[local_index] = device_evaluator.coeff(tensor_index);
+ }
+ else local_acc[local_index]=0.0f;
+ }
+ }
+
+ itemID.barrier(cl::sycl::access::fence_space::local_space);
+
+ // calculate the convolution
+ const size_t first_x_output_start =itemID.get_group(0)*(itemID.get_local_range()[0]); // output start x
+ const size_t first_y_output_start =itemID.get_group(1)*(itemID.get_local_range()[1]); // output start y
+ if(itemID.get_global(0)< range_x && itemID.get_global(1)< range_y && itemID.get_global(2)< range_z){
+ CoeffReturnType result = static_cast<CoeffReturnType>(0);
+ for (size_t j = 0; j < kernelSize_y; j++) {
+ size_t kernel_offset =kernelSize_x * j;
+ const size_t index = (num_x_input*(plane_kernel_offset + j+ itemID.get_local(1))) + itemID.get_local(0);
+ for (size_t i = 0; i < kernelSize_x; i++) {
+ result += (local_acc[i + index] * kernel_ptr[i+kernel_offset]);
+ }
+ }
+ const size_t tensor_index = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(itemID.get_global(2))
+ +indexMapper.mapCudaOutputKernelToTensorOutputOffset(itemID.get_local(0) + first_x_output_start, itemID.get_local(1) + first_y_output_start);
+ buffer_ptr[tensor_index] = result;
+ }
+ }
+};
+
+
+
+template <typename CoeffReturnType, typename KernelType, typename HostExpr, typename FunctorExpr, typename Index,
+typename InputDims, typename Kernel_accessor, typename Buffer_accessor, typename Local_accessor, typename TupleType>
+struct EigenConvolutionKernel3D{
+typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
+internal::IndexMapper<Index, InputDims, 3, Eigen::internal::traits<HostExpr>::Layout> indexMapper;
+Kernel_accessor kernel_filter;
+const size_t kernelSize_x, kernelSize_y, kernelSize_z, range_x, range_y , range_z, numP;
+Buffer_accessor buffer_acc;
+Local_accessor local_acc;
+FunctorExpr functors;
+TupleType tuple_of_accessors;
+EigenConvolutionKernel3D(internal::IndexMapper<Index, InputDims, 3, Eigen::internal::traits<HostExpr>::Layout> indexMapper_,
+ Kernel_accessor kernel_filter_, const size_t kernelSize_x_, const size_t kernelSize_y_ , const size_t kernelSize_z_ ,
+ const size_t range_x_, const size_t range_y_, const size_t range_z_, const size_t numP_,
+ Buffer_accessor buffer_acc_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_)
+ :indexMapper(indexMapper_), kernel_filter(kernel_filter_), kernelSize_x(kernelSize_x_), kernelSize_y(kernelSize_y_),
+ kernelSize_z(kernelSize_z_), range_x(range_x_), range_y(range_y_), range_z(range_z_), numP(numP_),
+ buffer_acc(buffer_acc_), local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {}
+
+ void operator()(cl::sycl::nd_item<3> itemID) {
+ typedef typename TensorSycl::internal::ConvertToDeviceExpression<HostExpr>::Type DevExpr;
+ auto device_expr =TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
+ auto device_evaluator = Eigen::TensorEvaluator<DevExpr, Eigen::DefaultDevice>(device_expr.expr, Eigen::DefaultDevice());
+
+ auto buffer_ptr = ConvertToActualTypeSycl(CoeffReturnType, buffer_acc);
+ auto kernel_ptr = ConvertToActualTypeSycl(KernelType, kernel_filter);
+ const size_t num_x_input = (itemID.get_local_range()[0] + kernelSize_x - 1); // inputs needed along x for each plane in shared memory
+ const size_t num_y_input = (itemID.get_local_range()[1] + kernelSize_y - 1); // inputs needed along y for each plane in shared memory
+ const size_t num_z_input = (itemID.get_local_range()[2] + kernelSize_z - 1); // inputs needed along z for each plane in shared memory
+ const size_t first_x_input_start = itemID.get_group(0)*itemID.get_local_range()[0];
+ const size_t first_y_input_start = itemID.get_group(1)*itemID.get_local_range()[1];
+ const size_t first_z_input_start = itemID.get_group(2)*itemID.get_local_range()[2];
+ for(size_t p=0; p<numP; p++){
+ /// fill the shared memory
+ const size_t plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p);
+ for (size_t k = itemID.get_local(2); k < num_z_input; k += itemID.get_local_range()[2]) {
+ for (size_t j = itemID.get_local(1); j < num_y_input; j += itemID.get_local_range()[1]) {
+ for (size_t i = itemID.get_local(0); i < num_x_input ; i += itemID.get_local_range()[0]) {
+ const size_t local_index = i + (num_x_input * (j + (num_y_input * k)));
+ const size_t tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i + first_x_input_start, j+ first_y_input_start , k+ first_z_input_start );
+ if(((i + first_x_input_start) < (range_x +kernelSize_x-1)) && ((j + first_y_input_start) < (range_y +kernelSize_y-1)) && ((k + first_z_input_start) < (range_z +kernelSize_z-1)) ){
+ local_acc[local_index] = device_evaluator.coeff(tensor_index);
+ }
+ else local_acc[local_index]=0.0f;
+ }
+ }
+ }
+ itemID.barrier(cl::sycl::access::fence_space::local_space);
+
+ // calculate the convolution
+ const size_t first_x_output_start =itemID.get_group(0)*(itemID.get_local_range()[0]); // x
+ const size_t first_y_output_start =itemID.get_group(1)*(itemID.get_local_range()[1]); // y
+ const size_t first_z_output_start =itemID.get_group(2)*(itemID.get_local_range()[2]); // z
+
+ if(itemID.get_global(0)< range_x && itemID.get_global(1)< range_y && itemID.get_global(2)< range_z){
+ CoeffReturnType result = static_cast<CoeffReturnType>(0);
+ for (size_t k = 0; k < kernelSize_z; k++) {
+ for (size_t j = 0; j < kernelSize_y; j++) {
+ for (size_t i = 0; i < kernelSize_x; i++) {
+ const size_t kernel_index =i + kernelSize_x * (j + kernelSize_y * k);
+ const size_t local_index = ((i+ itemID.get_local(0))+ num_x_input*((j+ itemID.get_local(1)) + num_y_input * (k+ itemID.get_local(2))));
+ result += (local_acc[local_index] * kernel_ptr[kernel_index]);
+ }
+ }
+ }
+ const size_t tensor_index = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p)
+ +indexMapper.mapCudaOutputKernelToTensorOutputOffset(itemID.get_local(0) + first_x_output_start, itemID.get_local(1) + first_y_output_start, itemID.get_local(2) + first_z_output_start );
+ buffer_ptr[tensor_index] = result;
+ }
+
+ itemID.barrier(cl::sycl::access::fence_space::local_space);
+ }
+ }
+};
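The 3D kernel linearizes the (i, j, k) position into the flat local accessor in x-fastest order, and the compute loop reads it back with the same formula shifted by the thread's local id. A quick consistency check of that linearization:

    #include <cassert>
    #include <cstddef>
    int main() {
      const std::size_t num_x_input = 10, num_y_input = 8;
      // same mapping as local_index above: i + num_x * (j + num_y * k)
      auto lin = [&](std::size_t i, std::size_t j, std::size_t k) {
        return i + num_x_input * (j + num_y_input * k);
      };
      assert(lin(0, 0, 1) == num_x_input * num_y_input); // consecutive xy-planes are one plane apart
      assert(lin(3, 2, 0) == 3 + num_x_input * 2);
      return 0;
    }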
+
+
+template<typename Indices, typename InputArgType, typename KernelArgType>
+struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, const Eigen::SyclDevice>
+{
+ typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;
+
+ static const int NumDims = internal::array_size<typename TensorEvaluator<InputArgType, const Eigen::SyclDevice>::Dimensions>::value;
+ static const int NumKernelDims = internal::array_size<Indices>::value;
+ typedef typename XprType::Index Index;
+ typedef DSizes<Index, NumDims> Dimensions;
+ typedef typename TensorEvaluator<KernelArgType, const Eigen::SyclDevice>::Dimensions KernelDimensions;
+ typedef const Eigen::SyclDevice Device;
+
+ enum {
+ IsAligned = TensorEvaluator<InputArgType, const Eigen::SyclDevice>::IsAligned & TensorEvaluator<KernelArgType, const Eigen::SyclDevice>::IsAligned,
+ PacketAccess = false,
+ Layout = TensorEvaluator<InputArgType, const Eigen::SyclDevice>::Layout,
+ CoordAccess = false, // to be implemented
+ RawAccess = false
+ };
+
+ EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Eigen::SyclDevice& device)
+ : m_inputImpl(op.inputExpression(), device), m_kernelArg(op.kernelExpression()), m_kernelImpl(op.kernelExpression(), device), m_indices(op.indices()), m_buf(NULL), m_kernel(NULL), m_local_kernel(false), m_device(device)
+ {
+ EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, const Eigen::SyclDevice>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, const Eigen::SyclDevice>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+ const typename TensorEvaluator<InputArgType, const Eigen::SyclDevice>::Dimensions& input_dims = m_inputImpl.dimensions();
+ const typename TensorEvaluator<KernelArgType, const Eigen::SyclDevice>::Dimensions& kernel_dims = m_kernelImpl.dimensions();
+
+ m_dimensions = m_inputImpl.dimensions();
+ for (int i = 0; i < NumKernelDims; ++i) {
+ const Index index = op.indices()[i];
+ const Index input_dim = input_dims[index];
+ const Index kernel_dim = kernel_dims[i];
+ const Index result_dim = input_dim - kernel_dim + 1;
+ m_dimensions[index] = result_dim;
+ }
+ }
+
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, const Eigen::SyclDevice>::type PacketReturnType;
+ typedef typename InputArgType::Scalar Scalar;
+ static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+
+ EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_dimensions; }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
+ preloadKernel();
+ m_inputImpl.evalSubExprsIfNeeded(NULL);
+ if (data) {
+ executeEval(data);
+ return false;
+ } else {
+ m_buf = (Scalar*)m_device.allocate(dimensions().TotalSize() * sizeof(Scalar));
+ executeEval(m_buf);
+ return true;
+ }
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+ m_inputImpl.cleanup();
+ if (m_buf) {
+ m_device.deallocate(m_buf);
+ m_buf = NULL;
+ }
+ if (m_local_kernel) {
+ m_device.deallocate((void*)m_kernel);
+ m_local_kernel = false;
+ }
+ m_kernel = NULL;
+ }
+ /// used by sycl in order to build the sycl buffer
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const{return m_device;}
+ /// used by sycl in order to build the sycl buffer
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const { return m_buf; }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() {
+ // Don't make a local copy of the kernel unless we have to (i.e. it's an
+ // expression that needs to be evaluated)
+ const Scalar* in_place = m_kernelImpl.data();
+ if (in_place) {
+ m_kernel = in_place;
+ m_local_kernel = false;
+ } else {
+ size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
+ Scalar* local = (Scalar*)m_device.allocate(kernel_sz);
+ typedef TensorEvalToOp<const KernelArgType> EvalTo;
+ EvalTo evalToTmp(local, m_kernelArg);
+ const bool PacketAccess = internal::IsVectorizable<const Eigen::SyclDevice, KernelArgType>::value;
+ internal::TensorExecutor<const EvalTo, const Eigen::SyclDevice, PacketAccess>::run(evalToTmp, m_device);
+ m_kernel = local;
+ m_local_kernel = true;
+ }
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void executeEval(Scalar* data) const {
+ typedef TensorEvaluator<InputArgType, const Eigen::SyclDevice> InputEvaluator;
+ typedef typename InputEvaluator::Dimensions InputDims;
+
+ typedef Eigen::TensorSycl::internal::FunctorExtractor<InputEvaluator> InputFunctorExpr;
+ // extract input functor list
+ InputFunctorExpr input_functors = Eigen::TensorSycl::internal::extractFunctors(m_inputImpl);
+
+
+ m_device.sycl_queue().submit([&](cl::sycl::handler &cgh) {
+
+ typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> InputLocalAcc;
+ /// work-around for gcc 4.8 auto bug
+ typedef decltype(Eigen::TensorSycl::internal::createTupleOfAccessors<InputEvaluator>(cgh, m_inputImpl)) InputTupleType;
+ // create input tuple of accessors
+ InputTupleType tuple_of_accessors = Eigen::TensorSycl::internal::createTupleOfAccessors<InputEvaluator>(cgh, m_inputImpl);
+
+ typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer> OutputAccessorType;
+ OutputAccessorType out_res= m_device. template get_sycl_accessor<cl::sycl::access::mode::discard_write>(cgh, data);
+ typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::read, cl::sycl::access::target::global_buffer> KernelAccessorType;
+ KernelAccessorType kernel_acc= m_device. template get_sycl_accessor<cl::sycl::access::mode::read>(cgh, m_kernel);
+
+ switch (NumKernelDims) {
+ case 1: {
+ const size_t numX = dimensions()[m_indices[0]];
+ const size_t numP = dimensions().TotalSize() / numX;
+ const size_t kernel_size = m_kernelImpl.dimensions().TotalSize();
+ size_t range_x, GRange_x, tileSize_x, range_y, GRange_y, tileSize_y;
+ m_device.parallel_for_setup(numX, numP, tileSize_x,tileSize_y,range_x,range_y, GRange_x, GRange_y );
+ const size_t shared_mem =(tileSize_x +kernel_size -1)*(tileSize_y);
+ assert(static_cast<unsigned long>(shared_mem) <= m_device.sharedMemPerBlock());
+ auto global_range=cl::sycl::range<2>(GRange_x, GRange_y); // global range
+ auto local_range=cl::sycl::range<2>(tileSize_x, tileSize_y); // local range
+ InputLocalAcc local_acc(cl::sycl::range<1>(shared_mem), cgh);
+ const array<Index, 1> indices{{m_indices[0]}};
+ const array<Index, 1> kernel_dims{{m_kernelImpl.dimensions()[0]}};
+ internal::IndexMapper<Index, InputDims, 1, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
+ cgh.parallel_for(cl::sycl::nd_range<2>(global_range, local_range),
+ EigenConvolutionKernel1D<CoeffReturnType, Scalar, InputArgType, InputFunctorExpr, Index,
+ InputDims, KernelAccessorType, OutputAccessorType, InputLocalAcc, InputTupleType>(
+ indexMapper,kernel_acc, kernel_size, numX, numP, out_res, local_acc, input_functors, tuple_of_accessors));
+ break;
+ }
+
+ case 2: {
+ const size_t idxX =static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 1;
+ const size_t idxY =static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 0;
+ const size_t kernel_size_x = m_kernelImpl.dimensions()[idxX];
+ const size_t kernel_size_y = m_kernelImpl.dimensions()[idxY];
+ const size_t numX = dimensions()[m_indices[idxX]];
+ const size_t numY = dimensions()[m_indices[idxY]];
+ const size_t numP = dimensions().TotalSize() / (numX*numY);
+ size_t range_x, GRange_x, tileSize_x, range_y, GRange_y, tileSize_y, range_z, GRange_z, tileSize_z;
+ m_device.parallel_for_setup(numX, numY, numP, tileSize_x, tileSize_y, tileSize_z, range_x, range_y, range_z, GRange_x, GRange_y, GRange_z );
+ const size_t shared_mem =(tileSize_x +kernel_size_x -1)*(tileSize_y +kernel_size_y -1) * tileSize_z;
+ assert(static_cast<unsigned long>(shared_mem) <= m_device.sharedMemPerBlock());
+ auto global_range=cl::sycl::range<3>(GRange_x, GRange_y, GRange_z); // global range
+ auto local_range=cl::sycl::range<3>(tileSize_x, tileSize_y, tileSize_z); // local range
+ InputLocalAcc local_acc(cl::sycl::range<1>(shared_mem), cgh);
+ const array<Index, 2> indices {{m_indices[idxX], m_indices[idxY]}};
+ const array<Index, 2> kernel_dims{{m_kernelImpl.dimensions()[idxX], m_kernelImpl.dimensions()[idxY]}};
+ internal::IndexMapper<Index, InputDims, 2, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
+ cgh.parallel_for(cl::sycl::nd_range<3>(global_range, local_range),
+ EigenConvolutionKernel2D<CoeffReturnType, Scalar, InputArgType, InputFunctorExpr, Index,
+ InputDims, KernelAccessorType, OutputAccessorType, InputLocalAcc, InputTupleType>(
+ indexMapper,kernel_acc, kernel_size_x, kernel_size_y, numX, numY, numP, out_res, local_acc, input_functors, tuple_of_accessors));
+ break;
+ }
+
+ case 3: {
+ const size_t idxX =static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 2;
+ const size_t idxY =static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 1;
+ const size_t idxZ =static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 2 : 0;
+ const size_t kernel_size_x = m_kernelImpl.dimensions()[idxX];
+ const size_t kernel_size_y = m_kernelImpl.dimensions()[idxY];
+ const size_t kernel_size_z = m_kernelImpl.dimensions()[idxZ];
+ const size_t numX = dimensions()[m_indices[idxX]];
+ const size_t numY = dimensions()[m_indices[idxY]];
+ const size_t numZ = dimensions()[m_indices[idxZ]];
+ const size_t numP = dimensions().TotalSize() / (numX*numY*numZ);
+ const array<Index, 3> indices{{m_indices[idxX], m_indices[idxY], m_indices[idxZ]}};
+ const array<Index, 3> kernel_dims{{m_kernelImpl.dimensions()[idxX],m_kernelImpl.dimensions()[idxY], m_kernelImpl.dimensions()[idxZ]}};
+ internal::IndexMapper<Index, InputDims, 3, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
+ size_t range_x, GRange_x, tileSize_x, range_y, GRange_y, tileSize_y, range_z, GRange_z, tileSize_z;
+ m_device.parallel_for_setup(numX, numY, numZ, tileSize_x, tileSize_y, tileSize_z, range_x, range_y, range_z, GRange_x, GRange_y, GRange_z );
+ const size_t shared_mem =(tileSize_x +kernel_size_x -1)*(tileSize_y +kernel_size_y -1) * (tileSize_z +kernel_size_z -1);
+ assert(static_cast<unsigned long>(shared_mem) <= m_device.sharedMemPerBlock());
+ auto global_range=cl::sycl::range<3>(GRange_x, GRange_y, GRange_z); // global range
+ auto local_range=cl::sycl::range<3>(tileSize_x, tileSize_y, tileSize_z); // local range
+ InputLocalAcc local_acc(cl::sycl::range<1>(shared_mem), cgh);
+ cgh.parallel_for(cl::sycl::nd_range<3>(global_range, local_range),
+ EigenConvolutionKernel3D<CoeffReturnType, Scalar, InputArgType, InputFunctorExpr, Index,
+ InputDims, KernelAccessorType, OutputAccessorType, InputLocalAcc, InputTupleType>(
+ indexMapper,kernel_acc, kernel_size_x, kernel_size_y, kernel_size_z, numX, numY,
+ numZ, numP, out_res, local_acc, input_functors, tuple_of_accessors));
+ break;
+ }
+
+ default: {
+ EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3), THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE);
+ }
+ }
+ });
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
+ {
+ eigen_assert(m_buf);
+ eigen_assert(index < m_dimensions.TotalSize());
+ return m_buf[index];
+ }
+
+ template<int LoadMode>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const
+ {
+ eigen_assert(m_buf);
+ eigen_assert(index < m_dimensions.TotalSize());
+ return internal::ploadt<PacketReturnType, LoadMode>(m_buf+index);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
+ costPerCoeff(bool vectorized) const {
+ // TODO(rmlarsen): FIXME: For now, this is just a copy of the CPU cost
+ // model.
+ const double kernel_size = m_kernelImpl.dimensions().TotalSize();
+ // We ignore the use of fused multiply-add.
+ const double convolve_compute_cost =
+ TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
+ const double firstIndex_compute_cost =
+ NumDims *
+ (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() +
+ TensorOpCost::DivCost<Index>());
+ return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
+ kernel_size * (m_inputImpl.costPerCoeff(vectorized) +
+ m_kernelImpl.costPerCoeff(vectorized) +
+ TensorOpCost(0, 0, convolve_compute_cost, vectorized,
+ PacketSize));
+ }
+
+ private:
+ // No assignment (copies are needed by the kernels)
+ TensorEvaluator& operator = (const TensorEvaluator&);
+ TensorEvaluator<InputArgType, const Eigen::SyclDevice> m_inputImpl;
+ KernelArgType m_kernelArg;
+ TensorEvaluator<KernelArgType, const Eigen::SyclDevice> m_kernelImpl;
+ Indices m_indices;
+ Dimensions m_dimensions;
+ Scalar* m_buf;
+ const Scalar* m_kernel;
+ bool m_local_kernel;
+ const Eigen::SyclDevice& m_device;
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
index 16bbbf894..e209799bb 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
@@ -19,12 +19,9 @@ namespace Eigen {
#define ConvertToActualTypeSycl(Scalar, buf_acc) reinterpret_cast<typename cl::sycl::global_ptr<Scalar>::pointer_t>((&(*buf_acc.get_pointer())))
- template <typename Scalar> class MemCopyFunctor {
+ template <typename Scalar, typename read_accessor, typename write_accessor> class MemCopyFunctor {
public:
- typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::read, cl::sycl::access::target::global_buffer> read_accessor;
- typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer> write_accessor;
-
- MemCopyFunctor(read_accessor src_acc, write_accessor dst_acc, size_t rng, size_t i, size_t offset): m_src_acc(src_acc), m_dst_acc(dst_acc), m_rng(rng), m_i(i), m_offset(offset) {}
+ MemCopyFunctor(read_accessor src_acc, write_accessor dst_acc, size_t rng, size_t i, size_t offset) : m_src_acc(src_acc), m_dst_acc(dst_acc), m_rng(rng), m_i(i), m_offset(offset) {}
void operator()(cl::sycl::nd_item<1> itemID) {
auto src_ptr = ConvertToActualTypeSycl(Scalar, m_src_acc);
@@ -62,7 +59,7 @@ EIGEN_STRONG_INLINE auto get_sycl_supported_devices()->decltype(cl::sycl::device
/// get_devices returns all the available opencl devices. Either use device_selector or exclude devices that computecpp does not support (AMD OpenCL for CPU )
auto s= (*it).template get_info<cl::sycl::info::device::vendor>();
std::transform(s.begin(), s.end(), s.begin(), ::tolower);
- if((*it).is_cpu() && s.find("amd")!=std::string::npos){ // remove amd cpu as it is not supported by computecpp
+    if((*it).is_cpu() && s.find("amd")!=std::string::npos && s.find("apu") == std::string::npos){ // exclude AMD CPUs, which ComputeCpp does not support, but keep APUs
it=devices.erase(it);
}
else{
@@ -133,11 +130,7 @@ m_queue(cl::sycl::queue(s, [&](cl::sycl::exception_list l) {
std::lock_guard<std::mutex> lock(mutex_);
auto it = buffer_map.find(static_cast<const uint8_t*>(p));
if (it != buffer_map.end()) {
- auto num_bytes =it->second.get_size();
buffer_map.erase(it);
- // Temporary solution for memory leak in computecpp. It will be fixed in the next computecpp version
- std::allocator<uint8_t> a1; // Default allocator for buffer<uint8_t,1>
- a1.deallocate(static_cast<uint8_t*>(p), num_bytes);
}
}
@@ -158,7 +151,7 @@ m_queue(cl::sycl::queue(s, [&](cl::sycl::exception_list l) {
if((it->first < (static_cast<const uint8_t*>(ptr))) && ((static_cast<const uint8_t*>(ptr)) < (it->first + size)) ) return it;
}
}
- std::cerr << "No sycl buffer found. Make sure that you have allocated memory for your buffer by calling allocate function in SyclDevice"<< std::endl;
+    std::cerr << "No sycl buffer found. Make sure that you have allocated memory for your buffer by calling the allocate function on SyclDevice."<< std::endl;
abort();
}
@@ -197,7 +190,12 @@ struct SyclDevice {
/// This is used to prepare the number of threads and also the number of threads per block for sycl kernels
template<typename Index>
EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize, Index &rng, Index &GRange) const {
- tileSize =static_cast<Index>(sycl_queue().get_device(). template get_info<cl::sycl::info::device::max_work_group_size>()/2);
+ tileSize =static_cast<Index>(sycl_queue().get_device(). template get_info<cl::sycl::info::device::max_work_group_size>());
+ auto s= sycl_queue().get_device().template get_info<cl::sycl::info::device::vendor>();
+ std::transform(s.begin(), s.end(), s.begin(), ::tolower);
+    if(sycl_queue().get_device().is_cpu()){ // the Intel CPU runtime does not allow using the maximum work-group size
+ tileSize=std::min(static_cast<Index>(256), static_cast<Index>(tileSize));
+ }
rng = n;
if (rng==0) rng=static_cast<Index>(1);
GRange=rng;
@@ -207,6 +205,74 @@ struct SyclDevice {
if (xMode != 0) GRange += static_cast<Index>(tileSize - xMode);
}
}
+
+ /// This is used to prepare the number of threads and also the number of threads per block for sycl kernels
+ template<typename Index>
+ EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1, Index &tileSize0, Index &tileSize1, Index &rng0, Index &rng1, Index &GRange0, Index &GRange1) const {
+ Index max_workgroup_Size = static_cast<Index>(maxSyclThreadsPerBlock());
+    if(sycl_queue().get_device().is_cpu()){ // the Intel CPU runtime does not allow using the maximum work-group size
+ max_workgroup_Size=std::min(static_cast<Index>(256), static_cast<Index>(max_workgroup_Size));
+ }
+ Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
+ tileSize1 =static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2/2)));
+ rng1=dim1;
+ if (rng1==0 ) rng1=static_cast<Index>(1);
+ GRange1=rng1;
+ if (tileSize1>GRange1) tileSize1=GRange1;
+ else if(GRange1>tileSize1){
+ Index xMode = static_cast<Index>(GRange1 % tileSize1);
+ if (xMode != 0) GRange1 += static_cast<Index>(tileSize1 - xMode);
+ }
+ tileSize0 = static_cast<Index>(max_workgroup_Size/tileSize1);
+ rng0 = dim0;
+ if (rng0==0 ) rng0=static_cast<Index>(1);
+ GRange0=rng0;
+ if (tileSize0>GRange0) tileSize0=GRange0;
+ else if(GRange0>tileSize0){
+ Index xMode = static_cast<Index>(GRange0 % tileSize0);
+ if (xMode != 0) GRange0 += static_cast<Index>(tileSize0 - xMode);
+ }
+ }
+
+
+
+ /// This is used to prepare the number of threads and also the number of threads per block for sycl kernels
+ template<typename Index>
+ EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1,Index dim2, Index &tileSize0, Index &tileSize1, Index &tileSize2, Index &rng0, Index &rng1, Index &rng2, Index &GRange0, Index &GRange1, Index &GRange2) const {
+ Index max_workgroup_Size = static_cast<Index>(maxSyclThreadsPerBlock());
+    if(sycl_queue().get_device().is_cpu()){ // the Intel CPU runtime does not allow using the maximum work-group size
+ max_workgroup_Size=std::min(static_cast<Index>(256), static_cast<Index>(max_workgroup_Size));
+ }
+ Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
+ tileSize2 =static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2/3)));
+ rng2=dim2;
+    if (rng2==0 ) rng2=static_cast<Index>(1);
+ GRange2=rng2;
+ if (tileSize2>GRange2) tileSize2=GRange2;
+ else if(GRange2>tileSize2){
+ Index xMode = static_cast<Index>(GRange2 % tileSize2);
+ if (xMode != 0) GRange2 += static_cast<Index>(tileSize2 - xMode);
+ }
+ pow_of_2 = static_cast<Index>(std::log2(static_cast<Index>(max_workgroup_Size/tileSize2)));
+ tileSize1 =static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2/2)));
+ rng1=dim1;
+ if (rng1==0 ) rng1=static_cast<Index>(1);
+ GRange1=rng1;
+ if (tileSize1>GRange1) tileSize1=GRange1;
+ else if(GRange1>tileSize1){
+ Index xMode = static_cast<Index>(GRange1 % tileSize1);
+ if (xMode != 0) GRange1 += static_cast<Index>(tileSize1 - xMode);
+ }
+ tileSize0 = static_cast<Index>(max_workgroup_Size/(tileSize1*tileSize2));
+ rng0 = dim0;
+ if (rng0==0 ) rng0=static_cast<Index>(1);
+ GRange0=rng0;
+ if (tileSize0>GRange0) tileSize0=GRange0;
+ else if(GRange0>tileSize0){
+ Index xMode = static_cast<Index>(GRange0 % tileSize0);
+ if (xMode != 0) GRange0 += static_cast<Index>(tileSize0 - xMode);
+ }
+ }
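
To make the tile-size factorisation above concrete: the 3D setup takes the largest power of two not exceeding the work-group limit, gives roughly a third of its exponent to dim2, half of the remainder to dim1, and whatever is left to dim0. A host-side sketch of that arithmetic (illustrative only; the helper name is hypothetical):

    #include <cmath>
    #include <cstddef>

    // Split a maximum work-group size into three power-of-two tile sizes,
    // mirroring the exponent arithmetic in parallel_for_setup above.
    void split_workgroup(std::size_t max_wg, std::size_t& t0, std::size_t& t1, std::size_t& t2) {
      std::size_t p = static_cast<std::size_t>(std::log2(max_wg));
      t2 = std::size_t(1) << (p / 3);                              // dim2 gets 2^(p/3)
      std::size_t q = static_cast<std::size_t>(std::log2(max_wg / t2));
      t1 = std::size_t(1) << (q / 2);                              // dim1 gets 2^(q/2)
      t0 = max_wg / (t1 * t2);                                     // dim0 gets the rest
    }

For max_wg = 256 this yields tiles of 8 x 8 x 4 (dim0 x dim1 x dim2) before any clamping against the actual ranges.
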
/// allocate device memory
EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const {
return m_queue_stream->allocate(num_bytes);
@@ -220,21 +286,23 @@ struct SyclDevice {
EIGEN_STRONG_INLINE bool isDeviceSuitable() const { return true; }
/// the memcpy function
- template<typename T> EIGEN_STRONG_INLINE void memcpy(void *dst, const T *src, size_t n) const {
- auto it1 = m_queue_stream->find_buffer((void*)src);
+ template<typename Index> EIGEN_STRONG_INLINE void memcpy(void *dst, const Index *src, size_t n) const {
+ auto it1 = m_queue_stream->find_buffer(static_cast<const void*>(src));
auto it2 = m_queue_stream->find_buffer(dst);
auto offset= (static_cast<const uint8_t*>(static_cast<const void*>(src))) - it1->first;
auto i= (static_cast<const uint8_t*>(dst)) - it2->first;
- offset/=sizeof(T);
- i/=sizeof(T);
+ offset/=sizeof(Index);
+ i/=sizeof(Index);
size_t rng, GRange, tileSize;
- parallel_for_setup(n/sizeof(T), tileSize, rng, GRange);
+ parallel_for_setup(n/sizeof(Index), tileSize, rng, GRange);
sycl_queue().submit([&](cl::sycl::handler &cgh) {
auto src_acc =it1->second.template get_access<cl::sycl::access::mode::read, cl::sycl::access::target::global_buffer>(cgh);
- auto dst_acc =it2->second.template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer>(cgh);
- cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor<T>(src_acc, dst_acc, rng, i, offset));
+ auto dst_acc =it2->second.template get_access<cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer>(cgh);
+ typedef decltype(src_acc) read_accessor;
+ typedef decltype(dst_acc) write_accessor;
+ cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor<Index, read_accessor, write_accessor>(src_acc, dst_acc, rng, i, offset));
});
- asynchronousExec();
+ synchronize();
}
/// The memcpyHostToDevice is used to copy the device only pointer to a host pointer. Using the device
@@ -242,7 +310,7 @@ struct SyclDevice {
/// on it. Using a discard_write accessor guarantees that we do not bring back the current value of the
/// buffer to host. Then we use the memcpy to copy the data to the host accessor. The first time that
/// this buffer is accessed, the data will be copied to the device.
- template<typename T> EIGEN_STRONG_INLINE void memcpyHostToDevice(T *dst, const T *src, size_t n) const {
+ template<typename Index> EIGEN_STRONG_INLINE void memcpyHostToDevice(Index *dst, const Index *src, size_t n) const {
auto host_acc= get_sycl_buffer(dst). template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::host_buffer>();
::memcpy(host_acc.get_pointer(), src, n);
}
@@ -252,20 +320,22 @@ struct SyclDevice {
/// buffer with map_allocator on the gpu in parallel. At the end of the function call the destination buffer would be destroyed and the data
/// would be available on the dst pointer using fast copy technique (map_allocator). In this case we can make sure that we copy the data back
/// to the cpu only once per function call.
- template<typename T> EIGEN_STRONG_INLINE void memcpyDeviceToHost(void *dst, const T *src, size_t n) const {
+ template<typename Index> EIGEN_STRONG_INLINE void memcpyDeviceToHost(void *dst, const Index *src, size_t n) const {
auto it = m_queue_stream->find_buffer(src);
auto offset =static_cast<const uint8_t*>(static_cast<const void*>(src))- it->first;
- offset/=sizeof(T);
+ offset/=sizeof(Index);
size_t rng, GRange, tileSize;
- parallel_for_setup(n/sizeof(T), tileSize, rng, GRange);
+ parallel_for_setup(n/sizeof(Index), tileSize, rng, GRange);
    // Assuming that dst points to the start of the destination buffer
auto dest_buf = cl::sycl::buffer<uint8_t, 1, cl::sycl::map_allocator<uint8_t> >(static_cast<uint8_t*>(dst), cl::sycl::range<1>(n));
sycl_queue().submit([&](cl::sycl::handler &cgh) {
auto src_acc= it->second.template get_access<cl::sycl::access::mode::read, cl::sycl::access::target::global_buffer>(cgh);
auto dst_acc =dest_buf.template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer>(cgh);
- cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor<T>(src_acc, dst_acc, rng, 0, offset));
+ typedef decltype(src_acc) read_accessor;
+ typedef decltype(dst_acc) write_accessor;
+ cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor<Index, read_accessor, write_accessor>(src_acc, dst_acc, rng, 0, offset));
});
- asynchronousExec();
+ synchronize();
}
/// returning the sycl queue
EIGEN_STRONG_INLINE cl::sycl::queue& sycl_queue() const { return m_queue_stream->m_queue;}
@@ -274,7 +344,7 @@ struct SyclDevice {
size_t rng, GRange, tileSize;
parallel_for_setup(n, tileSize, rng, GRange);
sycl_queue().submit(memsetCghFunctor(get_sycl_buffer(static_cast<uint8_t*>(static_cast<void*>(data))),rng, GRange, tileSize, c ));
- asynchronousExec();
+ synchronize();
}
struct memsetCghFunctor{
@@ -300,6 +370,24 @@ struct SyclDevice {
// there is no l3 cache on cuda devices.
return firstLevelCacheSize();
}
+ EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const {
+ return sycl_queue().get_device(). template get_info<cl::sycl::info::device::max_compute_units>();
+ // return stream_->deviceProperties().multiProcessorCount;
+ }
+ EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const {
+ return sycl_queue().get_device(). template get_info<cl::sycl::info::device::max_work_group_size>();
+
+ // return stream_->deviceProperties().maxThreadsPerBlock;
+ }
+ EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const {
+    // OpenCL does not have such a concept
+ return 2;//sycl_queue().get_device(). template get_info<cl::sycl::info::device::max_work_group_size>();
+ // return stream_->deviceProperties().maxThreadsPerMultiProcessor;
+ }
+ EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const {
+ return sycl_queue().get_device(). template get_info<cl::sycl::info::device::local_mem_size>();
+ // return stream_->deviceProperties().sharedMemPerBlock;
+ }
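
These four queries mirror the CUDA device-property helpers (hence the commented-out stream_->deviceProperties() lines) on top of cl::sycl::info, which is what lets device-generic launch code size itself without knowing the backend. A sketch of how a caller might consult them (illustrative, not code from this patch; pick_tile is a hypothetical helper assuming the SyclDevice API above):

    #include <cstddef>

    // Shrink a work-group until its local-memory footprint fits the device.
    std::size_t pick_tile(const Eigen::SyclDevice& dev, std::size_t bytes_per_item) {
      std::size_t wg   = dev.maxSyclThreadsPerBlock();   // max_work_group_size
      std::size_t lmem = dev.sharedMemPerBlock();        // local_mem_size, in bytes
      while (wg > 1 && wg * bytes_per_item > lmem) wg /= 2;
      return wg;
    }
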
/// No need for sycl it should act the same as CPU version
EIGEN_STRONG_INLINE int majorDeviceVersion() const { return 1; }
@@ -307,9 +395,12 @@ struct SyclDevice {
sycl_queue().wait_and_throw(); //pass
}
- EIGEN_STRONG_INLINE void asynchronousExec() const {
- sycl_queue().throw_asynchronous();//pass
- }
+  EIGEN_STRONG_INLINE void asynchronousExec() const {
+    /// FIXME: currently there is a race condition in the asynchronous scheduler.
+    //sycl_queue().throw_asynchronous(); // does not pass; temporarily disabled
+    sycl_queue().wait_and_throw(); //pass
+  }
// This function checks if the runtime recorded an error for the
// underlying stream device.
EIGEN_STRONG_INLINE bool ok() const {
@@ -318,6 +409,7 @@ struct SyclDevice {
};
+
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
index 930837021..abe85c860 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
@@ -26,8 +26,8 @@ namespace Eigen {
/// Therefore, by adding the default value, we managed to convert the type and it does not break any
/// existing code as its default value is T*.
namespace internal {
-template<typename XprType, template <class> class MakePointer_>
-struct traits<TensorForcedEvalOp<XprType, MakePointer_> >
+template<typename XprType>
+struct traits<TensorForcedEvalOp<XprType> >
{
// Type promotion to handle the case where the types of the lhs and the rhs are different.
typedef typename XprType::Scalar Scalar;
@@ -42,33 +42,26 @@ struct traits<TensorForcedEvalOp<XprType, MakePointer_> >
enum {
Flags = 0
};
- template <class T> struct MakePointer {
- // Intermediate typedef to workaround MSVC issue.
- typedef MakePointer_<T> MakePointerT;
- typedef typename MakePointerT::Type Type;
- typedef typename MakePointerT::RefType RefType;
-
- };
};
-template<typename XprType, template <class> class MakePointer_>
-struct eval<TensorForcedEvalOp<XprType, MakePointer_>, Eigen::Dense>
+template<typename XprType>
+struct eval<TensorForcedEvalOp<XprType>, Eigen::Dense>
{
- typedef const TensorForcedEvalOp<XprType, MakePointer_>& type;
+ typedef const TensorForcedEvalOp<XprType>& type;
};
-template<typename XprType, template <class> class MakePointer_>
-struct nested<TensorForcedEvalOp<XprType, MakePointer_>, 1, typename eval<TensorForcedEvalOp<XprType, MakePointer_> >::type>
+template<typename XprType>
+struct nested<TensorForcedEvalOp<XprType>, 1, typename eval<TensorForcedEvalOp<XprType> >::type>
{
- typedef TensorForcedEvalOp<XprType, MakePointer_> type;
+ typedef TensorForcedEvalOp<XprType> type;
};
} // end namespace internal
-template<typename XprType, template <class> class MakePointer_>
-class TensorForcedEvalOp : public TensorBase<TensorForcedEvalOp<XprType, MakePointer_>, ReadOnlyAccessors>
+template<typename XprType>
+class TensorForcedEvalOp : public TensorBase<TensorForcedEvalOp<XprType>, ReadOnlyAccessors>
{
public:
typedef typename Eigen::internal::traits<TensorForcedEvalOp>::Scalar Scalar;
@@ -90,10 +83,10 @@ class TensorForcedEvalOp : public TensorBase<TensorForcedEvalOp<XprType, MakePoi
};
-template<typename ArgType, typename Device, template <class> class MakePointer_>
-struct TensorEvaluator<const TensorForcedEvalOp<ArgType, MakePointer_>, Device>
+template<typename ArgType, typename Device>
+struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device>
{
- typedef TensorForcedEvalOp<ArgType, MakePointer_> XprType;
+ typedef TensorForcedEvalOp<ArgType> XprType;
typedef typename ArgType::Scalar Scalar;
typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
typedef typename XprType::Index Index;
@@ -150,17 +143,17 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType, MakePointer_>, Device>
return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
}
- EIGEN_DEVICE_FUNC typename MakePointer<Scalar>::Type data() const { return m_buffer; }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const { return m_buffer; }
/// required by sycl in order to extract the sycl accessor
- const TensorEvaluator<ArgType, Device>& impl() { return m_impl; }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() { return m_impl; }
/// used by sycl in order to build the sycl buffer
- const Device& device() const{return m_device;}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const{return m_device;}
private:
TensorEvaluator<ArgType, Device> m_impl;
const ArgType m_op;
const Device& m_device;
- typename MakePointer<CoeffReturnType>::Type m_buffer;
+ CoeffReturnType* m_buffer;
};
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h
index 9a012c176..2e638992a 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h
@@ -75,7 +75,7 @@ template<typename CustomUnaryFunc, typename XprType> class TensorCustomUnaryOp;
template<typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType> class TensorCustomBinaryOp;
template<typename XprType, template <class> class MakePointer_ = MakePointer> class TensorEvalToOp;
-template<typename XprType, template <class> class MakePointer_ = MakePointer> class TensorForcedEvalOp;
+template<typename XprType> class TensorForcedEvalOp;
template<typename ExpressionType, typename DeviceType> class TensorDevice;
template<typename Derived, typename Device> struct TensorEvaluator;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
index 485a082e2..ef1c9c42c 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
@@ -205,6 +205,8 @@ class TensorIntDivisor<int32_t, true> {
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int divide(const int32_t n) const {
#ifdef __CUDA_ARCH__
return (__umulhi(magic, n) >> shift);
+#elif defined(__SYCL_DEVICE_ONLY__)
+  return (cl::sycl::mul_hi(magic, static_cast<uint32_t>(n)) >> shift); // high 32 bits of the 32x32-bit product, like __umulhi
#else
uint64_t v = static_cast<uint64_t>(magic) * static_cast<uint64_t>(n);
return (static_cast<uint32_t>(v >> 32) >> shift);
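
For context, all branches compute the same magic-number division: n / d == high32(magic * n) >> shift for a (magic, shift) pair precomputed from the divisor d. A host-side model of the #else branch above:

    #include <cstdint>

    // Divide by a constant via multiply-high and shift; magic and shift are
    // precomputed by TensorIntDivisor from the divisor.
    inline uint32_t magic_divide(uint32_t n, uint32_t magic, int shift) {
      uint64_t v = static_cast<uint64_t>(magic) * n;     // full 64-bit product
      return static_cast<uint32_t>(v >> 32) >> shift;    // high half, then shift
    }

cl::sycl::mul_hi on 32-bit operands returns exactly this high half, matching CUDA's __umulhi.
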
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
index d582ccbe1..6ddd2ca18 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
@@ -711,6 +711,12 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices,
{
typedef TensorStridingSlicingOp<StartIndices, StopIndices, Strides, ArgType> XprType;
static const int NumDims = internal::array_size<Strides>::value;
+ typedef typename XprType::Index Index;
+ typedef typename XprType::Scalar Scalar;
+ typedef typename internal::remove_const<Scalar>::type ScalarNonConst;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ typedef Strides Dimensions;
enum {
// Alignment can't be guaranteed at compile time since it depends on the
@@ -733,7 +739,7 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices,
startIndicesClamped[i] = clamp(op.startIndices()[i], 0, m_impl.dimensions()[i]);
stopIndicesClamped[i] = clamp(op.stopIndices()[i], 0, m_impl.dimensions()[i]);
}else{
- /* implies m_strides[i]<0 by assert */
+ /* implies m_strides[i]<0 by assert */
startIndicesClamped[i] = clamp(op.startIndices()[i], -1, m_impl.dimensions()[i] - 1);
stopIndicesClamped[i] = clamp(op.stopIndices()[i], -1, m_impl.dimensions()[i] - 1);
}
@@ -796,13 +802,6 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices,
sizeof(Scalar));
}
- typedef typename XprType::Index Index;
- typedef typename XprType::Scalar Scalar;
- typedef typename internal::remove_const<Scalar>::type ScalarNonConst;
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
- typedef Strides Dimensions;
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
@@ -858,7 +857,11 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices,
}
static EIGEN_STRONG_INLINE Index clamp(Index value, Index min, Index max) {
+#ifndef __SYCL_DEVICE_ONLY__
return numext::maxi(min, numext::mini(max,value));
+#else
+ return cl::sycl::clamp(value, min, max);
+#endif
}
array<Index, NumDims> m_outputStrides;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
index c9912d9d4..c3ca129e2 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
@@ -25,11 +25,11 @@
namespace Eigen {
namespace internal {
-template<typename CoeffReturnType> struct syclGenericBufferReducer{
+template<typename OP, typename CoeffReturnType> struct syclGenericBufferReducer{
template<typename BufferTOut, typename BufferTIn>
-static void run(BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){
+static void run(OP op, BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){
do {
- auto f = [length, local, bufOut, &bufI](cl::sycl::handler& h) mutable {
+ auto f = [length, local, op, &bufOut, &bufI](cl::sycl::handler& h) mutable {
cl::sycl::nd_range<1> r{cl::sycl::range<1>{std::max(length, local)},
cl::sycl::range<1>{std::min(length, local)}};
/* Two accessors are used: one to the buffer that is being reduced,
@@ -43,7 +43,7 @@ static void run(BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDevice& de
/* The parallel_for invocation chosen is the variant with an nd_item
* parameter, since the code requires barriers for correctness. */
- h.parallel_for(r, TensorSycl::internal::GenericKernelReducer< CoeffReturnType, OutputAccessor, InputAccessor, LocalAccessor>(aOut, aI, scratch, length, local));
+ h.parallel_for(r, TensorSycl::internal::GenericKernelReducer<CoeffReturnType, OP, OutputAccessor, InputAccessor, LocalAccessor>(op, aOut, aI, scratch, length, local));
};
dev.sycl_queue().submit(f);
dev.asynchronousExec();
@@ -54,11 +54,16 @@ static void run(BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDevice& de
length = length / local;
} while (length > 1);
+}
+};
-
+template<typename CoeffReturnType> struct syclGenericBufferReducer<Eigen::internal::MeanReducer<CoeffReturnType>, CoeffReturnType>{
+template<typename BufferTOut, typename BufferTIn>
+static void run(Eigen::internal::MeanReducer<CoeffReturnType>, BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){
+ syclGenericBufferReducer<Eigen::internal::SumReducer<CoeffReturnType>, CoeffReturnType>::run(Eigen::internal::SumReducer<CoeffReturnType>(),
+ bufOut, bufI, dev, length, local);
}
-
};
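
The run() loop above performs a multi-pass tree reduction: each pass launches one kernel that folds groups of `local` values into one, and the surviving partial results are reduced again until a single value remains. A host-side model of the passes (illustrative; the launch setup arranges the lengths, and Op follows Eigen's reducer protocol):

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    template <typename T, typename Op>
    T buffer_reduce_model(std::vector<T> buf, std::size_t local, Op op) {
      std::size_t length = buf.size();
      do {
        std::size_t groups = (length + local - 1) / local;   // one kernel pass
        for (std::size_t g = 0; g < groups; ++g) {
          T acc = op.initialize();
          std::size_t end = std::min<std::size_t>(length, (g + 1) * local);
          for (std::size_t i = g * local; i < end; ++i) op.reduce(buf[i], &acc);
          buf[g] = op.finalize(acc);
        }
        length = groups;
      } while (length > 1);
      return buf[0];
    }

The MeanReducer specialisation immediately above simply re-runs this with a SumReducer, because dividing by the element count only makes sense once, after the final pass.
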
/// Self is useless here because in expression construction we are going to treat reduction as a leafnode.
@@ -74,8 +79,8 @@ struct FullReducer<Self, Op, const Eigen::SyclDevice, Vectorizable> {
static void run(const Self& self, Op& reducer, const Eigen::SyclDevice& dev, CoeffReturnType* output) {
typedef const typename Self::ChildType HostExpr; /// this is the child of reduction
- auto functors = TensorSycl::internal::extractFunctors(self.impl());
- typedef decltype(functors) FunctorExpr;
+ typedef Eigen::TensorSycl::internal::FunctorExtractor<TensorEvaluator<HostExpr, const Eigen::SyclDevice> > FunctorExpr;
+ FunctorExpr functors = TensorSycl::internal::extractFunctors(self.impl());
int red_factor =256; /// initial reduction. If the size is less than red_factor we only create one thread.
size_t inputSize =self.impl().dimensions().TotalSize();
size_t rng = inputSize/red_factor; // initially, the total number of threads is the input size divided by red_factor
@@ -108,9 +113,10 @@ struct FullReducer<Self, Op, const Eigen::SyclDevice, Vectorizable> {
// Dims dims= self.xprDims();
//Op functor = reducer;
dev.sycl_queue().submit([&](cl::sycl::handler &cgh) {
+      // this is a workaround for a gcc 4.8 bug
+ typedef decltype(TensorSycl::internal::createTupleOfAccessors(cgh, self.impl())) TupleType;
// create a tuple of accessors from Evaluator
- auto tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl());
- typedef decltype(tuple_of_accessors) TupleType;
+ TupleType tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl());
auto tmp_global_accessor = temp_global_buffer. template get_access<cl::sycl::access::mode::read_write, cl::sycl::access::target::global_buffer>(cgh);
typedef decltype(tmp_global_accessor) OutAccessor;
cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(outTileSize)),
@@ -122,7 +128,7 @@ struct FullReducer<Self, Op, const Eigen::SyclDevice, Vectorizable> {
// getting final out buffer at the moment the created buffer is true because there is no need for assign
auto out_buffer =dev.get_sycl_buffer(output);
/// This is used to recursively reduce the tmp value to an element of 1;
- syclGenericBufferReducer<CoeffReturnType>::run(out_buffer, temp_global_buffer,dev, GRange, outTileSize);
+ syclGenericBufferReducer<Op, CoeffReturnType>::run(reducer, out_buffer, temp_global_buffer,dev, GRange, outTileSize);
}
};
@@ -134,10 +140,10 @@ struct InnerReducer<Self, Op, const Eigen::SyclDevice> {
typedef typename Self::CoeffReturnType CoeffReturnType;
static const bool HasOptimizedImplementation = false;
- static bool run(const Self& self, Op& reducer, const Eigen::SyclDevice& dev, CoeffReturnType* output, typename Self::Index , typename Self::Index num_coeffs_to_preserve) {
+ static bool run(const Self& self, Op& reducer, const Eigen::SyclDevice& dev, CoeffReturnType* output, typename Self::Index num_values_to_reduce, typename Self::Index num_coeffs_to_preserve) {
typedef const typename Self::ChildType HostExpr; /// this is the child of reduction
- auto functors = TensorSycl::internal::extractFunctors(self.impl());
- typedef decltype(functors) FunctorExpr;
+ typedef Eigen::TensorSycl::internal::FunctorExtractor<TensorEvaluator<HostExpr, const Eigen::SyclDevice> > FunctorExpr;
+ FunctorExpr functors = TensorSycl::internal::extractFunctors(self.impl());
typename Self::Index range, GRange, tileSize;
typedef typename Eigen::internal::remove_all<decltype(self.xprDims())>::type Dims;
@@ -147,14 +153,15 @@ struct InnerReducer<Self, Op, const Eigen::SyclDevice> {
/// recursively apply reduction on it in order to reduce the whole.
dev.parallel_for_setup(num_coeffs_to_preserve, tileSize, range, GRange);
dev.sycl_queue().submit([&](cl::sycl::handler &cgh) {
+      // this is a workaround for a gcc 4.8 bug
+ typedef decltype(TensorSycl::internal::createTupleOfAccessors(cgh, self.impl())) Tuple_of_Acc;
// create a tuple of accessors from Evaluator
- auto tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl());
- typedef typename Eigen::internal::remove_all<decltype(tuple_of_accessors)>::type Tuple_of_Acc;
+ Tuple_of_Acc tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl());
auto output_accessor = dev.template get_sycl_accessor<cl::sycl::access::mode::discard_write>(cgh, output);
-
+ Index red_size = (num_values_to_reduce!=0)? num_values_to_reduce : static_cast<Index>(1);
cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)),
TensorSycl::internal::ReductionFunctor<HostExpr, FunctorExpr, Tuple_of_Acc, Dims, Op, typename Self::Index>
- (output_accessor, functors, tuple_of_accessors, self.xprDims(), reducer, range));
+ (output_accessor, functors, tuple_of_accessors, self.xprDims(), reducer, range, red_size));
});
dev.asynchronousExec();
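
The red_size plumbing above exists because a mean cannot be reduced tree-wise with MeanReducer's own finalize (each partial block would divide again). The SYCL path instead accumulates with SumReducer and divides once by the number of reduced values; conceptually (hypothetical helper, not code from this patch):

    #include <cstddef>

    // Mean over n values, the way the SYCL reducers compute it: sum first,
    // divide once at the end (red_size falls back to 1 when n == 0).
    template <typename T>
    T mean_via_sum(const T* vals, std::size_t n) {
      T sum = T(0);
      for (std::size_t i = 0; i < n; ++i) sum += vals[i];  // SumReducer stage
      return sum / static_cast<T>(n == 0 ? 1 : n);
    }
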
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
index 14e392e36..e430b0826 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
@@ -224,6 +224,11 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+ /// required by sycl in order to extract the accessor
+ const TensorEvaluator<ArgType, Device> & impl() const { return m_impl; }
+ /// added for sycl in order to construct the buffer from sycl device
+ ReverseDimensions functor() const { return m_reverse; }
+
protected:
Dimensions m_dimensions;
array<Index, NumDims> m_strides;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
index 6c35bfdb6..2237140e7 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
@@ -117,11 +117,11 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
};
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
- : m_impl(op.expression(), device)
+ : m_impl(op.expression(), device), m_strides(op.strides())
{
m_dimensions = m_impl.dimensions();
for (int i = 0; i < NumDims; ++i) {
- m_dimensions[i] = ceilf(static_cast<float>(m_dimensions[i]) / op.strides()[i]);
+ m_dimensions[i] =Eigen::numext::ceil(static_cast<float>(m_dimensions[i]) / op.strides()[i]);
}
const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
@@ -224,6 +224,11 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+ /// required by sycl in order to extract the accessor
+ const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
+ /// required by sycl in order to extract the accessor
+ Strides functor() const { return m_strides; }
+
protected:
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const
{
@@ -250,9 +255,9 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
array<Index, NumDims> m_outputStrides;
array<Index, NumDims> m_inputStrides;
TensorEvaluator<ArgType, Device> m_impl;
+ const Strides m_strides;
};
-
// Eval as lvalue
template<typename Strides, typename ArgType, typename Device>
struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device>
@@ -286,6 +291,11 @@ struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device>
return this->m_impl.coeffRef(this->srcCoeff(index));
}
+ /// required by sycl in order to extract the accessor
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const { return this->m_impl; }
+ /// required by sycl in order to extract the accessor
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Strides functor() const { return this->m_strides; }
+
template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void writePacket(Index index, const PacketReturnType& x)
{
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h
index 2e61ee049..9d5a6d4c1 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h
@@ -35,7 +35,7 @@ namespace Eigen {
namespace TensorSycl {
namespace internal {
- template<typename CoeffReturnType, typename OutputAccessor, typename InputAccessor, typename LocalAccessor> struct GenericKernelReducer;
+ template<typename CoeffReturnType, typename OP, typename OutputAccessor, typename InputAccessor, typename LocalAccessor> struct GenericKernelReducer;
/// This struct is used for special expression nodes with no operations (for example assign and selectOP).
@@ -80,6 +80,9 @@ template<typename T> struct GetType<false, T>{
/// this is used for extracting tensor reduction
#include "TensorReductionSycl.h"
+/// this is used for extracting tensor convolution
+#include "TensorConvolutionSycl.h"
+
// kernel execution using fusion
#include "TensorSyclRun.h"
//sycl functors
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h
index 113dd2557..c0bcf26cd 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h
@@ -97,8 +97,18 @@ template <typename Expr>\
struct ConvertToDeviceExpression<CVQual ExprNode<Expr> > \
: DeviceConvertor<ExprNode, Res, Expr>{};
-KERNELBROKERCONVERT(const, true, TensorForcedEvalOp)
-KERNELBROKERCONVERT(, false, TensorForcedEvalOp)
+/// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorForcedEvalOp
+#define KERNELBROKERCONVERTFORCEDEVAL(CVQual)\
+template <typename Expr>\
+struct ConvertToDeviceExpression<CVQual TensorForcedEvalOp<Expr> > {\
+ typedef CVQual TensorForcedEvalOp< typename ConvertToDeviceExpression<Expr>::Type> Type;\
+};
+KERNELBROKERCONVERTFORCEDEVAL(const)
+KERNELBROKERCONVERTFORCEDEVAL()
+#undef KERNELBROKERCONVERTFORCEDEVAL
+
+
+
KERNELBROKERCONVERT(const, true, TensorEvalToOp)
KERNELBROKERCONVERT(, false, TensorEvalToOp)
#undef KERNELBROKERCONVERT
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h
index df1a732e7..56ba82805 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h
@@ -188,6 +188,28 @@ struct ExprConstructor<CVQual TensorAssignOp<OrigLHSExpr, OrigRHSExpr>, CVQual
ASSIGN(const)
ASSIGN()
#undef ASSIGN
+
+
+
+
+ /// specialisation of the \ref ExprConstructor struct when the node type is
+ /// const TensorConversionOp
+ #define CONVERSIONEXPRCONST(CVQual)\
+ template <typename OrigNestedExpr, typename ConvertType, typename NestedExpr, typename... Params>\
+ struct ExprConstructor<CVQual TensorConversionOp<ConvertType, OrigNestedExpr>, CVQual TensorConversionOp<ConvertType, NestedExpr>, Params...> {\
+ typedef ExprConstructor<OrigNestedExpr, NestedExpr, Params...> my_nested_type;\
+ typedef CVQual TensorConversionOp<ConvertType, typename my_nested_type::Type> Type;\
+ my_nested_type nestedExpr;\
+ Type expr;\
+ template <typename FuncDetector>\
+ ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\
+ : nestedExpr(funcD.subExpr, t), expr(nestedExpr.expr) {}\
+ };
+
+ CONVERSIONEXPRCONST(const)
+ CONVERSIONEXPRCONST()
+ #undef CONVERSIONEXPRCONST
+
/// specialisation of the \ref ExprConstructor struct when the node type is
/// TensorEvalToOp /// 0 here is the output number in the buffer
#define EVALTO(CVQual)\
@@ -212,10 +234,10 @@ EVALTO()
/// TensorForcedEvalOp
#define FORCEDEVAL(CVQual)\
template <typename OrigExpr, typename DevExpr, size_t N, typename... Params>\
-struct ExprConstructor<CVQual TensorForcedEvalOp<OrigExpr, MakeGlobalPointer>,\
+struct ExprConstructor<CVQual TensorForcedEvalOp<OrigExpr>,\
CVQual PlaceHolder<CVQual TensorForcedEvalOp<DevExpr>, N>, Params...> {\
- typedef CVQual TensorMap<Tensor<typename TensorForcedEvalOp<DevExpr, MakeGlobalPointer>::Scalar,\
- TensorForcedEvalOp<DevExpr, MakeGlobalPointer>::NumDimensions, Eigen::internal::traits<TensorForcedEvalOp<DevExpr, MakeGlobalPointer>>::Layout, typename TensorForcedEvalOp<DevExpr>::Index>, Eigen::internal::traits<TensorForcedEvalOp<DevExpr, MakeGlobalPointer>>::Layout, MakeGlobalPointer> Type;\
+ typedef CVQual TensorMap<Tensor<typename TensorForcedEvalOp<DevExpr>::Scalar,\
+ TensorForcedEvalOp<DevExpr>::NumDimensions, Eigen::internal::traits<TensorForcedEvalOp<DevExpr>>::Layout, typename TensorForcedEvalOp<DevExpr>::Index>, Eigen::internal::traits<TensorForcedEvalOp<DevExpr>>::Layout, MakeGlobalPointer> Type;\
Type expr;\
template <typename FuncDetector>\
ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple<Params...> &t)\
@@ -252,6 +274,30 @@ SYCLREDUCTIONEXPR()
#undef SYCLREDUCTIONEXPR
+/// specialisation of the \ref ExprConstructor struct when the node type is
+/// TensorContractionOp or TensorConvolutionOp
+#define SYCLCONTRACTIONCONVOLUTION(CVQual, ExprNode)\
+template <typename Indices, typename OrigLhsXprType, typename OrigRhsXprType, typename LhsXprType, typename RhsXprType, size_t N, typename... Params>\
+struct ExprConstructor<CVQual ExprNode<Indices, OrigLhsXprType, OrigRhsXprType>,\
+CVQual PlaceHolder<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, N>, Params...> {\
+ static const size_t NumIndices= Eigen::internal::traits<ExprNode<Indices, OrigLhsXprType, OrigRhsXprType> >::NumDimensions;\
+ typedef CVQual TensorMap<Tensor<typename ExprNode<Indices, OrigLhsXprType, OrigRhsXprType>::Scalar,\
+  NumIndices, Eigen::internal::traits<ExprNode<Indices, OrigLhsXprType, OrigRhsXprType> >::Layout,\
+  typename ExprNode<Indices, OrigLhsXprType, OrigRhsXprType>::Index>,\
+  Eigen::internal::traits<ExprNode<Indices, OrigLhsXprType, OrigRhsXprType> >::Layout, MakeGlobalPointer> Type;\
+ Type expr;\
+ template <typename FuncDetector>\
+ ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple<Params...> &t)\
+ :expr(Type(ConvertToActualTypeSycl(typename Type::Scalar, utility::tuple::get<N>(t)), fd.dimensions())) {}\
+};
+
+SYCLCONTRACTIONCONVOLUTION(const, TensorContractionOp)
+SYCLCONTRACTIONCONVOLUTION(, TensorContractionOp)
+SYCLCONTRACTIONCONVOLUTION(const, TensorConvolutionOp)
+SYCLCONTRACTIONCONVOLUTION(, TensorConvolutionOp)
+#undef SYCLCONTRACTIONCONVOLUTION
+
+
#define SYCLSLICEOPEXPR(CVQual)\
template<typename StartIndices, typename Sizes, typename OrigXprType, typename XprType, typename... Params>\
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h
index 876fcd45e..3fd607941 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h
@@ -35,6 +35,8 @@
namespace Eigen {
namespace TensorSycl {
namespace internal {
+#define RETURN_CPP11(expr) ->decltype(expr) {return expr;}
+
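
RETURN_CPP11 abbreviates the C++11 idiom where a deduced return type has to repeat the returned expression inside a decltype. For example, with the macro,

    static inline auto getTuple(cl::sycl::handler& cgh, const Arg& eval)
    RETURN_CPP11(ExtractAccessor<Arg>::getTuple(cgh, eval))

expands to exactly

    static inline auto getTuple(cl::sycl::handler& cgh, const Arg& eval)
    -> decltype(ExtractAccessor<Arg>::getTuple(cgh, eval))
    { return ExtractAccessor<Arg>::getTuple(cgh, eval); }
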
/// \struct ExtractAccessor: Extract Accessor Class is used to extract the
/// accessor from a buffer.
/// Depending on the type of the leaf node we can get a read accessor or a
@@ -44,22 +46,16 @@ struct ExtractAccessor;
struct AccessorConstructor{
template<typename Arg> static inline auto getTuple(cl::sycl::handler& cgh, const Arg& eval)
- -> decltype(ExtractAccessor<Arg>::getTuple(cgh, eval)) {
- return ExtractAccessor<Arg>::getTuple(cgh, eval);
- }
+ RETURN_CPP11(ExtractAccessor<Arg>::getTuple(cgh, eval))
template<typename Arg1, typename Arg2> static inline auto getTuple(cl::sycl::handler& cgh, const Arg1& eval1, const Arg2& eval2)
- -> decltype(utility::tuple::append(ExtractAccessor<Arg1>::getTuple(cgh, eval1), ExtractAccessor<Arg2>::getTuple(cgh, eval2))) {
- return utility::tuple::append(ExtractAccessor<Arg1>::getTuple(cgh, eval1), ExtractAccessor<Arg2>::getTuple(cgh, eval2));
- }
+ RETURN_CPP11(utility::tuple::append(ExtractAccessor<Arg1>::getTuple(cgh, eval1), ExtractAccessor<Arg2>::getTuple(cgh, eval2)))
+
template<typename Arg1, typename Arg2, typename Arg3> static inline auto getTuple(cl::sycl::handler& cgh, const Arg1& eval1 , const Arg2& eval2 , const Arg3& eval3)
- -> decltype(utility::tuple::append(ExtractAccessor<Arg1>::getTuple(cgh, eval1),utility::tuple::append(ExtractAccessor<Arg2>::getTuple(cgh, eval2), ExtractAccessor<Arg3>::getTuple(cgh, eval3)))) {
- return utility::tuple::append(ExtractAccessor<Arg1>::getTuple(cgh, eval1),utility::tuple::append(ExtractAccessor<Arg2>::getTuple(cgh, eval2), ExtractAccessor<Arg3>::getTuple(cgh, eval3)));
- }
+ RETURN_CPP11(utility::tuple::append(ExtractAccessor<Arg1>::getTuple(cgh, eval1),utility::tuple::append(ExtractAccessor<Arg2>::getTuple(cgh, eval2), ExtractAccessor<Arg3>::getTuple(cgh, eval3))))
+
template< cl::sycl::access::mode AcM, typename Arg> static inline auto getAccessor(cl::sycl::handler& cgh, const Arg& eval)
- -> decltype(utility::tuple::make_tuple( eval.device().template get_sycl_accessor<AcM>(cgh,eval.data()))){
- return utility::tuple::make_tuple(eval.device().template get_sycl_accessor<AcM>(cgh,eval.data()));
- }
+ RETURN_CPP11(utility::tuple::make_tuple(eval.device().template get_sycl_accessor<AcM>(cgh,eval.data())))
};
/// specialisation of the \ref ExtractAccessor struct when the node type is
@@ -68,9 +64,7 @@ struct AccessorConstructor{
template <template<class, class> class UnaryCategory, typename OP, typename RHSExpr, typename Dev>\
struct ExtractAccessor<TensorEvaluator<CVQual UnaryCategory<OP, RHSExpr>, Dev> > {\
static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual UnaryCategory<OP, RHSExpr>, Dev>& eval)\
- -> decltype(AccessorConstructor::getTuple(cgh, eval.impl())){\
- return AccessorConstructor::getTuple(cgh, eval.impl());\
- }\
+RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.impl()))\
};
SYCLUNARYCATEGORYEXTACC(const)
@@ -83,9 +77,7 @@ SYCLUNARYCATEGORYEXTACC()
template <template<class, class, class> class BinaryCategory, typename OP, typename LHSExpr, typename RHSExpr, typename Dev>\
struct ExtractAccessor<TensorEvaluator<CVQual BinaryCategory<OP, LHSExpr, RHSExpr>, Dev> > {\
static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual BinaryCategory<OP, LHSExpr, RHSExpr>, Dev>& eval)\
- -> decltype(AccessorConstructor::getTuple(cgh, eval.left_impl(), eval.right_impl())){\
- return AccessorConstructor::getTuple(cgh, eval.left_impl(), eval.right_impl());\
- }\
+ RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.left_impl(), eval.right_impl()))\
};
SYCLBINARYCATEGORYEXTACC(const)
@@ -98,9 +90,7 @@ SYCLBINARYCATEGORYEXTACC()
template <template<class, class, class, class> class TernaryCategory, typename OP, typename Arg1Expr, typename Arg2Expr, typename Arg3Expr, typename Dev>\
struct ExtractAccessor<TensorEvaluator<CVQual TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev> > {\
static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev>& eval)\
- -> decltype(AccessorConstructor::getTuple(cgh, eval.arg1Impl(), eval.arg2Impl(), eval.arg3Impl())){\
- return AccessorConstructor::getTuple(cgh, eval.arg1Impl(), eval.arg2Impl(), eval.arg3Impl());\
- }\
+ RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.arg1Impl(), eval.arg2Impl(), eval.arg3Impl()))\
};
SYCLTERNARYCATEGORYEXTACC(const)
@@ -114,9 +104,7 @@ SYCLTERNARYCATEGORYEXTACC()
template <typename IfExpr, typename ThenExpr, typename ElseExpr, typename Dev>\
struct ExtractAccessor<TensorEvaluator<CVQual TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev> > {\
static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev>& eval)\
- -> decltype(AccessorConstructor::getTuple(cgh, eval.cond_impl(), eval.then_impl(), eval.else_impl())){\
- return AccessorConstructor::getTuple(cgh, eval.cond_impl(), eval.then_impl(), eval.else_impl());\
- }\
+ RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.cond_impl(), eval.then_impl(), eval.else_impl()))\
};
SYCLSELECTOPEXTACC(const)
@@ -128,9 +116,7 @@ SYCLSELECTOPEXTACC()
template <typename LHSExpr, typename RHSExpr, typename Dev>\
struct ExtractAccessor<TensorEvaluator<CVQual TensorAssignOp<LHSExpr, RHSExpr>, Dev> > {\
static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorAssignOp<LHSExpr, RHSExpr>, Dev>& eval)\
- -> decltype(AccessorConstructor::getTuple(cgh, eval.left_impl(), eval.right_impl())){\
- return AccessorConstructor::getTuple(cgh, eval.left_impl(), eval.right_impl());\
- }\
+ RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.left_impl(), eval.right_impl()))\
};
SYCLTENSORASSIGNOPEXTACC(const)
@@ -142,9 +128,7 @@ struct ExtractAccessor<TensorEvaluator<CVQual TensorAssignOp<LHSExpr, RHSExpr>,
template <typename PlainObjectType, int Options_, typename Dev>\
struct ExtractAccessor<TensorEvaluator<CVQual TensorMap<PlainObjectType, Options_>, Dev> > {\
static inline auto getTuple(cl::sycl::handler& cgh,const TensorEvaluator<CVQual TensorMap<PlainObjectType, Options_>, Dev>& eval)\
- -> decltype(AccessorConstructor::template getAccessor<ACCType>(cgh, eval)){\
- return AccessorConstructor::template getAccessor<ACCType>(cgh, eval);\
- }\
+ RETURN_CPP11(AccessorConstructor::template getAccessor<ACCType>(cgh, eval))\
};
TENSORMAPEXPR(const, cl::sycl::access::mode::read)
@@ -156,9 +140,7 @@ TENSORMAPEXPR(, cl::sycl::access::mode::read_write)
template <typename Expr, typename Dev>\
struct ExtractAccessor<TensorEvaluator<CVQual TensorForcedEvalOp<Expr>, Dev> > {\
static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorForcedEvalOp<Expr>, Dev>& eval)\
- -> decltype(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval)){\
- return AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval);\
- }\
+ RETURN_CPP11(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval))\
};
SYCLFORCEDEVALEXTACC(const)
@@ -171,9 +153,7 @@ SYCLFORCEDEVALEXTACC()
template <typename Expr, typename Dev>\
struct ExtractAccessor<TensorEvaluator<CVQual TensorEvalToOp<Expr>, Dev> > {\
static inline auto getTuple(cl::sycl::handler& cgh,const TensorEvaluator<CVQual TensorEvalToOp<Expr>, Dev>& eval)\
- -> decltype(utility::tuple::append(AccessorConstructor::template getAccessor<cl::sycl::access::mode::write>(cgh, eval), AccessorConstructor::getTuple(cgh, eval.impl()))){\
- return utility::tuple::append(AccessorConstructor::template getAccessor<cl::sycl::access::mode::write>(cgh, eval), AccessorConstructor::getTuple(cgh, eval.impl()));\
- }\
+ RETURN_CPP11(utility::tuple::append(AccessorConstructor::template getAccessor<cl::sycl::access::mode::write>(cgh, eval), AccessorConstructor::getTuple(cgh, eval.impl())))\
};
SYCLEVALTOEXTACC(const)
@@ -185,37 +165,47 @@ SYCLEVALTOEXTACC()
template <typename OP, typename Dim, typename Expr, typename Dev>\
struct ExtractAccessor<TensorEvaluator<CVQual TensorReductionOp<OP, Dim, Expr>, Dev> > {\
static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorReductionOp<OP, Dim, Expr>, Dev>& eval)\
- -> decltype(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval)){\
- return AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval);\
- }\
+ RETURN_CPP11(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval))\
};
SYCLREDUCTIONEXTACC(const)
SYCLREDUCTIONEXTACC()
#undef SYCLREDUCTIONEXTACC
+/// specialisation of the \ref ExtractAccessor struct when the node type is TensorContractionOp and TensorConvolutionOp
+#define SYCLCONTRACTIONCONVOLUTIONEXTACC(CVQual, ExprNode)\
+template<typename Indices, typename LhsXprType, typename RhsXprType, typename Dev>\
+ struct ExtractAccessor<TensorEvaluator<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, Dev> > {\
+ static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, Dev>& eval)\
+ RETURN_CPP11(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval))\
+};
+
+SYCLCONTRACTIONCONVOLUTIONEXTACC(const,TensorContractionOp)
+SYCLCONTRACTIONCONVOLUTIONEXTACC(,TensorContractionOp)
+SYCLCONTRACTIONCONVOLUTIONEXTACC(const,TensorConvolutionOp)
+SYCLCONTRACTIONCONVOLUTIONEXTACC(,TensorConvolutionOp)
+#undef SYCLCONTRACTIONCONVOLUTIONEXTACC
+
+
/// specialisation of the \ref ExtractAccessor struct when the node type is
-/// const TensorSlicingOp. This is a special case where there is no OP
+/// const TensorSlicingOp.
#define SYCLSLICEOPEXTACC(CVQual)\
template <typename StartIndices, typename Sizes, typename XprType, typename Dev>\
struct ExtractAccessor<TensorEvaluator<CVQual TensorSlicingOp<StartIndices, Sizes, XprType>, Dev> > {\
static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorSlicingOp<StartIndices, Sizes, XprType>, Dev>& eval)\
- -> decltype(AccessorConstructor::getTuple(cgh, eval.impl())){\
- return AccessorConstructor::getTuple(cgh, eval.impl());\
- }\
+ RETURN_CPP11( AccessorConstructor::getTuple(cgh, eval.impl()))\
};
SYCLSLICEOPEXTACC(const)
SYCLSLICEOPEXTACC()
#undef SYCLSLICEOPEXTACC
-
+/// specialisation of the \ref ExtractAccessor struct when the node type is
+/// const TensorStridingSlicingOp.
#define SYCLSLICESTRIDEOPEXTACC(CVQual)\
template<typename StartIndices, typename StopIndices, typename Strides, typename XprType, typename Dev>\
struct ExtractAccessor<TensorEvaluator<CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>, Dev> >{\
static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>, Dev>& eval)\
- -> decltype(AccessorConstructor::getTuple(cgh, eval.impl())){\
- return AccessorConstructor::getTuple(cgh, eval.impl());\
- }\
+ RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.impl()))\
};
SYCLSLICESTRIDEOPEXTACC(const)
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h
index 6f9ab57af..e26cbdf6d 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h
@@ -42,6 +42,20 @@ template <typename Evaluator> struct FunctorExtractor{
};
+/// specialisation of the \ref FunctorExtractor struct when the node type is
+/// TensorConversionOp
+#define SYCLEXTRFUNCCONVERSION(ExprNode, CVQual)\
+template <typename ArgType1, typename ArgType2, typename Dev>\
+struct FunctorExtractor<TensorEvaluator<CVQual ExprNode<ArgType1, ArgType2>, Dev> > {\
+ FunctorExtractor<TensorEvaluator<ArgType2, Dev> > subExpr;\
+ FunctorExtractor(const TensorEvaluator<CVQual ExprNode<ArgType1, ArgType2>, Dev>& expr)\
+ : subExpr(expr.impl()) {}\
+};
+
+SYCLEXTRFUNCCONVERSION(TensorConversionOp, const)
+SYCLEXTRFUNCCONVERSION(TensorConversionOp, )
+#undef SYCLEXTRFUNCCONVERSION
+
#define SYCLEXTRTENSORMAPFIXEDSIZE(CVQual)\
template <typename Scalar_, typename Dimensions_, int Options_2, typename IndexType, int Options_, template <class> class MakePointer_, typename Dev>\
struct FunctorExtractor< TensorEvaluator <CVQual TensorMap<TensorFixedSize<Scalar_, Dimensions_, Options_2, IndexType>, Options_, MakePointer_> , Dev> >{\
@@ -169,6 +183,24 @@ SYCLEXTRFUNCREDUCTIONOP(const)
SYCLEXTRFUNCREDUCTIONOP()
#undef SYCLEXTRFUNCREDUCTIONOP
+#define SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(CVQual, ExprNode)\
+template<typename Indices, typename LhsXprType, typename RhsXprType, typename Device>\
+struct FunctorExtractor<TensorEvaluator<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, Device>>{\
+ typedef TensorEvaluator<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, Device> Evaluator;\
+ typedef typename Evaluator::Dimensions Dimensions;\
+ const Dimensions m_dimensions;\
+ EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }\
+ FunctorExtractor(const TensorEvaluator<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, Device>& expr)\
+ : m_dimensions(expr.dimensions()) {}\
+};
+
+
+SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(const,TensorContractionOp)
+SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(,TensorContractionOp)
+SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(const,TensorConvolutionOp)
+SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(,TensorConvolutionOp)
+#undef SYCLEXTRFUNCCONTRACTCONVOLUTIONOP
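
Contraction and convolution results are produced by dedicated kernels, so on the device their subtree is replaced by a TensorMap over the pre-computed output buffer; the only state the extractor must carry is the result's dimensions, not the nested functors. Expanded by hand for const TensorContractionOp, the macro above generates:

    template<typename Indices, typename LhsXprType, typename RhsXprType, typename Device>
    struct FunctorExtractor<TensorEvaluator<const TensorContractionOp<Indices, LhsXprType, RhsXprType>, Device>>{
      typedef TensorEvaluator<const TensorContractionOp<Indices, LhsXprType, RhsXprType>, Device> Evaluator;
      typedef typename Evaluator::Dimensions Dimensions;
      const Dimensions m_dimensions;   // the result shape is all the kernel needs
      EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
      FunctorExtractor(const Evaluator& expr) : m_dimensions(expr.dimensions()) {}
    };
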
+
/// specialisation of the \ref FunctorExtractor struct when the node type is
/// const TensorSlicingOp. This is a specialisation without an OP, so it has to be separated.
#define SYCLEXTRFUNCTSLICEOP(CVQual)\
@@ -253,9 +285,6 @@ struct FunctorExtractor<TensorEvaluator<CVQual OPEXPR<Param, LHSExpr, RHSExpr>,
: lhsExpr(expr.left_impl()),rhsExpr(expr.right_impl()),func(expr.FUNCCALL) {}\
};
-// TensorContractionOp
-SYCLEXTRFUNCCONTRACTCONCAT(TensorContractionOp, indices(), const)
-SYCLEXTRFUNCCONTRACTCONCAT(TensorContractionOp, indices(),)
// TensorConcatenationOp
SYCLEXTRFUNCCONTRACTCONCAT(TensorConcatenationOp, axis(), const)
SYCLEXTRFUNCCONTRACTCONCAT(TensorConcatenationOp, axis(),)
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h
index 710e22474..2f7779036 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h
@@ -18,13 +18,14 @@ namespace Eigen {
namespace TensorSycl {
namespace internal {
- template<typename CoeffReturnType, typename OutputAccessor, typename InputAccessor, typename LocalAccessor> struct GenericKernelReducer{
+ template<typename CoeffReturnType, typename OP, typename OutputAccessor, typename InputAccessor, typename LocalAccessor> struct GenericKernelReducer{
+ OP op;
OutputAccessor aOut;
InputAccessor aI;
LocalAccessor scratch;
size_t length, local;
- GenericKernelReducer(OutputAccessor aOut_, InputAccessor aI_, LocalAccessor scratch_, size_t length_, size_t local_)
- : aOut(aOut_), aI(aI_), scratch(scratch_), length(length_), local(local_){}
+ GenericKernelReducer(OP op_, OutputAccessor aOut_, InputAccessor aI_, LocalAccessor scratch_, size_t length_, size_t local_)
+ : op(op_), aOut(aOut_), aI(aI_), scratch(scratch_), length(length_), local(local_){}
void operator()(cl::sycl::nd_item<1> itemID) {
size_t globalid = itemID.get_global(0);
size_t localid = itemID.get_local(0);
@@ -44,7 +45,12 @@ namespace internal {
auto min = (length < local) ? length : local;
for (size_t offset = min / 2; offset > 0; offset /= 2) {
if (localid < offset) {
- scratch[localid] += scratch[localid + offset];
+ auto accum = op.initialize();
+ op.reduce(scratch[localid], &accum);
+ op.reduce(scratch[localid + offset], &accum);
+          scratch[localid] = op.finalize(accum);
}
itemID.barrier(cl::sycl::access::fence_space::local_space);
}
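
The change in this hunk routes every pairwise combine through the reducer's initialize/reduce/finalize protocol instead of a hard-wired +=, which is what lets the same kernel serve sum, max, min and the other reducers. One combine step, modelled on the host (Op is any Eigen reducer):

    template <typename T, typename Op>
    T combine(Op op, T a, T b) {
      T accum = op.initialize();   // the reduction's identity element
      op.reduce(a, &accum);
      op.reduce(b, &accum);
      return op.finalize(accum);   // identity for SumReducer
    }
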
@@ -66,7 +72,7 @@ template < typename HostExpr, typename FunctorExpr, typename Tuple_of_Acc, typen
public:
typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer> write_accessor;
- ReductionFunctor(write_accessor output_accessor_, FunctorExpr functors_, Tuple_of_Acc tuple_of_accessors_,Dims dims_, Op functor_, Index range_)
+ ReductionFunctor(write_accessor output_accessor_, FunctorExpr functors_, Tuple_of_Acc tuple_of_accessors_,Dims dims_, Op functor_, Index range_, Index)
:output_accessor(output_accessor_), functors(functors_), tuple_of_accessors(tuple_of_accessors_), dims(dims_), functor(functor_), range(range_) {}
void operator()(cl::sycl::nd_item<1> itemID) {
@@ -99,6 +105,46 @@ template < typename HostExpr, typename FunctorExpr, typename Tuple_of_Acc, typen
Index range;
};
+template < typename HostExpr, typename FunctorExpr, typename Tuple_of_Acc, typename Dims, typename Index>
+class ReductionFunctor<HostExpr, FunctorExpr, Tuple_of_Acc, Dims, Eigen::internal::MeanReducer<typename HostExpr::CoeffReturnType>, Index> {
+ public:
+ typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
+ typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer> write_accessor;
+ typedef Eigen::internal::SumReducer<typename HostExpr::CoeffReturnType> Op;
+ ReductionFunctor(write_accessor output_accessor_, FunctorExpr functors_, Tuple_of_Acc tuple_of_accessors_,Dims dims_,
+ Eigen::internal::MeanReducer<typename HostExpr::CoeffReturnType>, Index range_, Index num_values_to_reduce_)
+ :output_accessor(output_accessor_), functors(functors_), tuple_of_accessors(tuple_of_accessors_), dims(dims_), functor(Op()), range(range_), num_values_to_reduce(num_values_to_reduce_) {}
+ void operator()(cl::sycl::nd_item<1> itemID) {
+
+ typedef typename ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
+ auto device_expr = createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
+    /// Reduction cannot be captured automatically through our device-conversion recursion, because a reduction has two behaviours:
+    /// it can be used as a root that launches the sub-kernel, or it can be treated as a leaf node that passes its
+    /// calculated result to its parent kernel. The latter is detected automatically by our device expression generator; the former is created here.
+ const auto device_self_expr= Eigen::TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, functor);
+    /// This is the evaluator for device_self_expr. It is identical to the self expression that was passed to the run function, except that
+    /// the device evaluator is detectable and recognisable on the device.
+ typedef Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice> DeviceSelf;
+ auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice());
+ auto output_accessor_ptr =ConvertToActualTypeSycl(typename DeviceSelf::CoeffReturnType, output_accessor);
+    /// const_cast (below) is a naive workaround for the dropped-qualifier error
+ auto globalid=static_cast<Index>(itemID.get_global_linear_id());
+ if (globalid< range) {
+ typename DeviceSelf::CoeffReturnType accum = functor.initialize();
+ Eigen::internal::GenericDimReducer<DeviceSelf::NumReducedDims-1, DeviceSelf, Op>::reduce(device_self_evaluator, device_self_evaluator.firstInput(static_cast<typename DevExpr::Index>(globalid)),const_cast<Op&>(functor), &accum);
+ functor.finalize(accum);
+ output_accessor_ptr[globalid]= accum/num_values_to_reduce;
+ }
+ }
+ private:
+ write_accessor output_accessor;
+ FunctorExpr functors;
+ Tuple_of_Acc tuple_of_accessors;
+ Dims dims;
+ Op functor;
+ Index range;
+ Index num_values_to_reduce;
+};
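This specialisation exists because Eigen's MeanReducer keeps a running element count, which does not fit this kernel structure; the mean is instead computed as a SumReducer pass whose finalized result is divided by num_values_to_reduce. That is also why the generic constructor above gained a trailing, unnamed Index parameter: it keeps the two functors call-compatible. A host-side sketch of the same idea:

    // Sketch: mean as a finalized sum divided by the element count,
    // mirroring what the device specialisation above does.
    #include <unsupported/Eigen/CXX11/Tensor>
    template <typename T, typename Index>
    T mean_via_sum(const T* data, Index n) {
      Eigen::internal::SumReducer<T> op;
      T accum = op.initialize();
      for (Index i = 0; i < n; ++i) op.reduce(data[i], &accum);
      return op.finalize(accum) / static_cast<T>(n);
    }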
template<typename CoeffReturnType ,typename OutAccessor, typename HostExpr, typename FunctorExpr, typename Op, typename Dims, typename Index, typename TupleType>
class FullReductionKernelFunctor{
@@ -128,18 +174,70 @@ public:
   /// const_cast is used as a naive workaround for the dropped-qualifier error
auto globalid=itemID.get_global_linear_id();
- if(globalid<rng)
- tmp_global_accessor.get_pointer()[globalid]=Eigen::internal::InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, static_cast<typename DevExpr::Index>(red_factor*globalid), red_factor, const_cast<Op&>(op));
- else
- tmp_global_accessor.get_pointer()[globalid]=static_cast<CoeffReturnType>(0);
+ tmp_global_accessor.get_pointer()[globalid]=(globalid<rng) ? Eigen::internal::InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, static_cast<typename DevExpr::Index>(red_factor*globalid), red_factor, const_cast<Op&>(op))
+ : static_cast<CoeffReturnType>(op.initialize());
- if(remaining!=0 && globalid==0 )
+ if(remaining!=0 && globalid==0 ){
       // this adds the rest of the input buffer when the input size is not divisible by red_factor.
- tmp_global_accessor.get_pointer()[0]+=Eigen::internal::InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, static_cast<typename DevExpr::Index>(red_factor*(rng)), static_cast<typename DevExpr::Index>(remaining), const_cast<Op&>(op));
+ auto remaining_reduce =Eigen::internal::InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::
+ reduce(device_self_evaluator, static_cast<typename DevExpr::Index>(red_factor*(rng)), static_cast<typename DevExpr::Index>(remaining), const_cast<Op&>(op));
+ auto accum = op.initialize();
+ op.reduce(tmp_global_accessor.get_pointer()[0], &accum);
+ op.reduce(remaining_reduce, &accum);
+ op.finalize(accum);
+ tmp_global_accessor.get_pointer()[0]=accum;
+
+ }
}
};
+template<typename CoeffReturnType ,typename OutAccessor, typename HostExpr, typename FunctorExpr, typename Dims, typename Index, typename TupleType>
+class FullReductionKernelFunctor<CoeffReturnType, OutAccessor, HostExpr, FunctorExpr, Eigen::internal::MeanReducer<CoeffReturnType>, Dims, Index, TupleType>{
+public:
+ typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
+ typedef Eigen::internal::SumReducer<CoeffReturnType> Op;
+
+ OutAccessor tmp_global_accessor;
+ Index rng , remaining, red_factor;
+ Op op;
+ Dims dims;
+ FunctorExpr functors;
+ TupleType tuple_of_accessors;
+
+ FullReductionKernelFunctor(OutAccessor acc, Index rng_, Index remaining_, Index red_factor_, Eigen::internal::MeanReducer<CoeffReturnType>, Dims dims_, FunctorExpr functors_, TupleType t_acc)
+ :tmp_global_accessor(acc), rng(rng_), remaining(remaining_), red_factor(red_factor_),op(Op()), dims(dims_), functors(functors_), tuple_of_accessors(t_acc){}
+
+ void operator()(cl::sycl::nd_item<1> itemID) {
+
+ typedef typename TensorSycl::internal::ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
+ auto device_expr = TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
+  /// Reduction cannot be captured automatically through our device-conversion recursion, because a reduction has two behaviours:
+  /// it can be used as a root that launches the sub-kernel, or it can be treated as a leaf node that passes its
+  /// calculated result to its parent kernel. The latter is detected automatically by our device expression generator; the former is created here.
+ const auto device_self_expr= Eigen::TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, op);
+  /// This is the evaluator for device_self_expr. It is identical to the self expression that was passed to the run function, except that
+  /// the device evaluator is detectable and recognisable on the device.
+ auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice());
+  /// const_cast (below) is a naive workaround for the dropped-qualifier error
+ auto globalid=itemID.get_global_linear_id();
+ auto scale = (rng*red_factor) + remaining;
+
+ tmp_global_accessor.get_pointer()[globalid]= (globalid<rng)? ((Eigen::internal::InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, static_cast<typename DevExpr::Index>(red_factor*globalid), red_factor, const_cast<Op&>(op)))/scale)
+ :static_cast<CoeffReturnType>(op.initialize())/scale;
+
+ if(remaining!=0 && globalid==0 ){
+      // this adds the rest of the input buffer when the input size is not divisible by red_factor.
+ auto remaining_reduce =Eigen::internal::InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, static_cast<typename DevExpr::Index>(red_factor*(rng)), static_cast<typename DevExpr::Index>(remaining), const_cast<Op&>(op));
+ auto accum = op.initialize();
+ tmp_global_accessor.get_pointer()[0]= tmp_global_accessor.get_pointer()[0]*scale;
+ op.reduce(tmp_global_accessor.get_pointer()[0], &accum);
+ op.reduce(remaining_reduce, &accum);
+ op.finalize(accum);
+ tmp_global_accessor.get_pointer()[0]=accum/scale;
+ }
+ }
+};
}
}
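A note on the scale handling in the mean specialisation above: each partial result is stored already divided by scale = rng * red_factor + remaining, so the remainder branch first multiplies slot 0 back by scale before folding in the tail sum:

    stored0  = sum0 / scale
    combined = (stored0 * scale + remainder_sum) / scale
             = (sum0 + remainder_sum) / scale

i.e. every intermediate stays in "mean units" while the tail elements are still accounted for exactly once.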
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h
index 37fe196ea..0ac51e7bf 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h
@@ -115,6 +115,21 @@ REDUCTIONLEAFCOUNT(const)
REDUCTIONLEAFCOUNT()
#undef REDUCTIONLEAFCOUNT
+/// specialisation of the \ref LeafCount struct when the node type is TensorContractionOp or TensorConvolutionOp (const and non-const)
+#define CONTRACTIONCONVOLUTIONLEAFCOUNT(CVQual, ExprNode)\
+template <typename Indices, typename LhsXprType, typename RhsXprType>\
+struct LeafCount<CVQual ExprNode<Indices, LhsXprType, RhsXprType> > {\
+ static const size_t Count =1;\
+};
+
+CONTRACTIONCONVOLUTIONLEAFCOUNT(const,TensorContractionOp)
+CONTRACTIONCONVOLUTIONLEAFCOUNT(,TensorContractionOp)
+CONTRACTIONCONVOLUTIONLEAFCOUNT(const,TensorConvolutionOp)
+CONTRACTIONCONVOLUTIONLEAFCOUNT(,TensorConvolutionOp)
+#undef CONTRACTIONCONVOLUTIONLEAFCOUNT
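Contraction and convolution count as a single leaf because the whole sub-expression is treated as one pre-evaluated input on the device, consistent with the FunctorExtractor and PlaceHolderExpression specialisations elsewhere in this patch. For example, CONTRACTIONCONVOLUTIONLEAFCOUNT(const, TensorContractionOp) expands to:

    template <typename Indices, typename LhsXprType, typename RhsXprType>
    struct LeafCount<const TensorContractionOp<Indices, LhsXprType, RhsXprType> > {
      static const size_t Count = 1;  // the entire contraction is one device-side leaf
    };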
+
+
+
/// specialisation of the \ref LeafCount struct when the node type is TensorSlicingOp
#define SLICEOPLEAFCOUNT(CVQual)\
template <typename StartIndices, typename Sizes, typename XprType>\
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h
index 4419a1780..f6e3b4766 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h
@@ -169,6 +169,20 @@ SYCLREDUCTION()
/// specialisation of the \ref PlaceHolderExpression when the node is
+/// TensorContractionOp or TensorConvolutionOp
+#define SYCLCONTRACTIONCONVOLUTIONPLH(CVQual, ExprNode)\
+template <typename Indices, typename LhsXprType, typename RhsXprType, size_t N>\
+struct PlaceHolderExpression<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, N>{\
+ typedef CVQual PlaceHolder<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, N> Type;\
+};
+SYCLCONTRACTIONCONVOLUTIONPLH(const, TensorContractionOp)
+SYCLCONTRACTIONCONVOLUTIONPLH(,TensorContractionOp)
+SYCLCONTRACTIONCONVOLUTIONPLH(const, TensorConvolutionOp)
+SYCLCONTRACTIONCONVOLUTIONPLH(,TensorConvolutionOp)
+#undef SYCLCONTRACTIONCONVOLUTIONPLH
+
+
+/// specialisation of the \ref PlaceHolderExpression when the node is
/// TensorCwiseSelectOp
#define SLICEOPEXPR(CVQual)\
template <typename StartIndices, typename Sizes, typename XprType, size_t N>\
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h
index 32930be26..cac785540 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h
@@ -49,19 +49,38 @@ template<typename Expr, typename FunctorExpr, typename TupleType > struct ExecEx
/// based expression tree;
 /// creates the expression tree for the device with accessors to buffers;
 /// constructs the kernel and submits it to the sycl queue.
+/// std::array does not provide TotalSize(), so the size is obtained through template specialisation.
+template<typename , typename Dimensions> struct DimensionSize{
+ static auto getDimSize(const Dimensions& dim)->decltype(dim.TotalSize()){
+ return dim.TotalSize();
+ }
+};
+#define DIMSIZEMACRO(CVQual)\
+template<typename Index, size_t NumDims> struct DimensionSize<Index, CVQual std::array<Index, NumDims>>{\
+ static inline Index getDimSize(const std::array<Index, NumDims>& dim){\
+ return (NumDims == 0) ? 1 : ::Eigen::internal::array_prod(dim);\
+ }\
+};
+
+DIMSIZEMACRO(const)
+DIMSIZEMACRO()
+#undef DIMSIZEMACRO
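The dispatch above works around std::array (used for fixed-size tensor dimensions) having no TotalSize() member. A standalone sketch of the same technique, with hypothetical names:

    #include <array>
    #include <cstddef>

    // Primary template: any dimensions type exposing TotalSize().
    template <typename Dimensions>
    struct SizeOf {
      static auto get(const Dimensions& dim) -> decltype(dim.TotalSize()) {
        return dim.TotalSize();
      }
    };

    // Specialisation: std::array has no TotalSize(), so multiply the extents.
    template <typename Index, std::size_t NumDims>
    struct SizeOf<std::array<Index, NumDims> > {
      static Index get(const std::array<Index, NumDims>& dim) {
        Index size = 1;
        for (std::size_t i = 0; i < NumDims; ++i) size *= dim[i];
        return size;  // NumDims == 0 naturally yields 1
      }
    };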
+
+
template <typename Expr, typename Dev>
void run(Expr &expr, Dev &dev) {
Eigen::TensorEvaluator<Expr, Dev> evaluator(expr, dev);
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
if (needs_assign) {
- typedef decltype(internal::extractFunctors(evaluator)) FunctorExpr;
+ typedef Eigen::TensorSycl::internal::FunctorExtractor<Eigen::TensorEvaluator<Expr, Dev> > FunctorExpr;
FunctorExpr functors = internal::extractFunctors(evaluator);
dev.sycl_queue().submit([&](cl::sycl::handler &cgh) {
// create a tuple of accessors from Evaluator
- typedef decltype(internal::createTupleOfAccessors<decltype(evaluator)>(cgh, evaluator)) TupleType;
- TupleType tuple_of_accessors = internal::createTupleOfAccessors<decltype(evaluator)>(cgh, evaluator);
+ typedef decltype(internal::createTupleOfAccessors<Eigen::TensorEvaluator<Expr, Dev> >(cgh, evaluator)) TupleType;
+ TupleType tuple_of_accessors = internal::createTupleOfAccessors<Eigen::TensorEvaluator<Expr, Dev> >(cgh, evaluator);
typename Expr::Index range, GRange, tileSize;
- dev.parallel_for_setup(static_cast<typename Expr::Index>(evaluator.dimensions().TotalSize()), tileSize, range, GRange);
+ typename Expr::Index total_size = static_cast<typename Expr::Index>(DimensionSize<typename Expr::Index, typename Eigen::TensorEvaluator<Expr, Dev>::Dimensions>::getDimSize(evaluator.dimensions()));
+ dev.parallel_for_setup(total_size, tileSize, range, GRange);
cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)),
ExecExprFunctorKernel<Expr,FunctorExpr,TupleType>(range
diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt
index a087f4759..9fa479f52 100644
--- a/unsupported/test/CMakeLists.txt
+++ b/unsupported/test/CMakeLists.txt
@@ -163,6 +163,9 @@ if(EIGEN_TEST_CXX11)
ei_add_test_sycl(cxx11_tensor_builtins_sycl "-std=c++11")
ei_add_test_sycl(cxx11_tensor_contract_sycl "-std=c++11")
ei_add_test_sycl(cxx11_tensor_concatenation_sycl "-std=c++11")
+ ei_add_test_sycl(cxx11_tensor_reverse_sycl "-std=c++11")
+ ei_add_test_sycl(cxx11_tensor_convolution_sycl "-std=c++11")
+ ei_add_test_sycl(cxx11_tensor_striding_sycl "-std=c++11")
endif(EIGEN_TEST_SYCL)
# It should be safe to always run these tests as there is some fallback code for
 # older compilers that don't support cxx11.
diff --git a/unsupported/test/cxx11_tensor_broadcast_sycl.cpp b/unsupported/test/cxx11_tensor_broadcast_sycl.cpp
index c426549f1..21fdfca22 100644
--- a/unsupported/test/cxx11_tensor_broadcast_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_broadcast_sycl.cpp
@@ -131,11 +131,6 @@ template<typename DataType> void sycl_broadcast_test_per_device(const cl::sycl::
std::cout << "Running on " << d.template get_info<cl::sycl::info::device::name>() << std::endl;
QueueInterface queueInterface(d);
auto sycl_device = Eigen::SyclDevice(&queueInterface);
-
- test_broadcast_sycl_fixed<DataType, RowMajor, int>(sycl_device);
- test_broadcast_sycl<DataType, RowMajor, int>(sycl_device);
- test_broadcast_sycl_fixed<DataType, ColMajor, int>(sycl_device);
- test_broadcast_sycl<DataType, ColMajor, int>(sycl_device);
test_broadcast_sycl<DataType, RowMajor, int64_t>(sycl_device);
test_broadcast_sycl<DataType, ColMajor, int64_t>(sycl_device);
test_broadcast_sycl_fixed<DataType, RowMajor, int64_t>(sycl_device);
diff --git a/unsupported/test/cxx11_tensor_builtins_sycl.cpp b/unsupported/test/cxx11_tensor_builtins_sycl.cpp
index d5193d1ea..400a31d09 100644
--- a/unsupported/test/cxx11_tensor_builtins_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_builtins_sycl.cpp
@@ -14,7 +14,7 @@
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
#define EIGEN_TEST_FUNC cxx11_tensor_builtins_sycl
-#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
#define EIGEN_USE_SYCL
#include "main.h"
@@ -32,20 +32,20 @@ template <typename T> T cube(T x) { return x * x * x; }
template <typename T> T inverse(T x) { return 1 / x; }
}
-#define TEST_UNARY_BUILTINS_FOR_SCALAR(FUNC, SCALAR, OPERATOR) \
+#define TEST_UNARY_BUILTINS_FOR_SCALAR(FUNC, SCALAR, OPERATOR, Layout) \
{ \
/* out OPERATOR in.FUNC() */ \
- Tensor<SCALAR, 3> in(tensorRange); \
- Tensor<SCALAR, 3> out(tensorRange); \
+ Tensor<SCALAR, 3, Layout, int64_t> in(tensorRange); \
+ Tensor<SCALAR, 3, Layout, int64_t> out(tensorRange); \
in = in.random() + static_cast<SCALAR>(0.01); \
out = out.random() + static_cast<SCALAR>(0.01); \
- Tensor<SCALAR, 3> reference(out); \
+ Tensor<SCALAR, 3, Layout, int64_t> reference(out); \
SCALAR *gpu_data = static_cast<SCALAR *>( \
sycl_device.allocate(in.size() * sizeof(SCALAR))); \
SCALAR *gpu_data_out = static_cast<SCALAR *>( \
sycl_device.allocate(out.size() * sizeof(SCALAR))); \
- TensorMap<Tensor<SCALAR, 3>> gpu(gpu_data, tensorRange); \
- TensorMap<Tensor<SCALAR, 3>> gpu_out(gpu_data_out, tensorRange); \
+ TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu(gpu_data, tensorRange); \
+ TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_out(gpu_data_out, tensorRange); \
sycl_device.memcpyHostToDevice(gpu_data, in.data(), \
(in.size()) * sizeof(SCALAR)); \
sycl_device.memcpyHostToDevice(gpu_data_out, out.data(), \
@@ -53,7 +53,7 @@ template <typename T> T inverse(T x) { return 1 / x; }
gpu_out.device(sycl_device) OPERATOR gpu.FUNC(); \
sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, \
(out.size()) * sizeof(SCALAR)); \
- for (int i = 0; i < out.size(); ++i) { \
+ for (int64_t i = 0; i < out.size(); ++i) { \
SCALAR ver = reference(i); \
ver OPERATOR std::FUNC(in(i)); \
VERIFY_IS_APPROX(out(i), ver); \
@@ -63,18 +63,18 @@ template <typename T> T inverse(T x) { return 1 / x; }
} \
{ \
/* out OPERATOR out.FUNC() */ \
- Tensor<SCALAR, 3> out(tensorRange); \
+ Tensor<SCALAR, 3, Layout, int64_t> out(tensorRange); \
out = out.random() + static_cast<SCALAR>(0.01); \
- Tensor<SCALAR, 3> reference(out); \
+ Tensor<SCALAR, 3, Layout, int64_t> reference(out); \
SCALAR *gpu_data_out = static_cast<SCALAR *>( \
sycl_device.allocate(out.size() * sizeof(SCALAR))); \
- TensorMap<Tensor<SCALAR, 3>> gpu_out(gpu_data_out, tensorRange); \
+ TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_out(gpu_data_out, tensorRange); \
sycl_device.memcpyHostToDevice(gpu_data_out, out.data(), \
(out.size()) * sizeof(SCALAR)); \
gpu_out.device(sycl_device) OPERATOR gpu_out.FUNC(); \
sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, \
(out.size()) * sizeof(SCALAR)); \
- for (int i = 0; i < out.size(); ++i) { \
+ for (int64_t i = 0; i < out.size(); ++i) { \
SCALAR ver = reference(i); \
ver OPERATOR std::FUNC(reference(i)); \
VERIFY_IS_APPROX(out(i), ver); \
@@ -82,61 +82,62 @@ template <typename T> T inverse(T x) { return 1 / x; }
sycl_device.deallocate(gpu_data_out); \
}
-#define TEST_UNARY_BUILTINS_OPERATOR(SCALAR, OPERATOR) \
- TEST_UNARY_BUILTINS_FOR_SCALAR(abs, SCALAR, OPERATOR) \
- TEST_UNARY_BUILTINS_FOR_SCALAR(sqrt, SCALAR, OPERATOR) \
- TEST_UNARY_BUILTINS_FOR_SCALAR(rsqrt, SCALAR, OPERATOR) \
- TEST_UNARY_BUILTINS_FOR_SCALAR(square, SCALAR, OPERATOR) \
- TEST_UNARY_BUILTINS_FOR_SCALAR(cube, SCALAR, OPERATOR) \
- TEST_UNARY_BUILTINS_FOR_SCALAR(inverse, SCALAR, OPERATOR) \
- TEST_UNARY_BUILTINS_FOR_SCALAR(tanh, SCALAR, OPERATOR) \
- TEST_UNARY_BUILTINS_FOR_SCALAR(exp, SCALAR, OPERATOR) \
- TEST_UNARY_BUILTINS_FOR_SCALAR(expm1, SCALAR, OPERATOR) \
- TEST_UNARY_BUILTINS_FOR_SCALAR(log, SCALAR, OPERATOR) \
- TEST_UNARY_BUILTINS_FOR_SCALAR(abs, SCALAR, OPERATOR) \
- TEST_UNARY_BUILTINS_FOR_SCALAR(ceil, SCALAR, OPERATOR) \
- TEST_UNARY_BUILTINS_FOR_SCALAR(floor, SCALAR, OPERATOR) \
- TEST_UNARY_BUILTINS_FOR_SCALAR(round, SCALAR, OPERATOR) \
- TEST_UNARY_BUILTINS_FOR_SCALAR(log1p, SCALAR, OPERATOR)
+#define TEST_UNARY_BUILTINS_OPERATOR(SCALAR, OPERATOR , Layout) \
+ TEST_UNARY_BUILTINS_FOR_SCALAR(abs, SCALAR, OPERATOR , Layout) \
+ TEST_UNARY_BUILTINS_FOR_SCALAR(sqrt, SCALAR, OPERATOR , Layout) \
+ TEST_UNARY_BUILTINS_FOR_SCALAR(rsqrt, SCALAR, OPERATOR , Layout) \
+ TEST_UNARY_BUILTINS_FOR_SCALAR(square, SCALAR, OPERATOR , Layout) \
+ TEST_UNARY_BUILTINS_FOR_SCALAR(cube, SCALAR, OPERATOR , Layout) \
+ TEST_UNARY_BUILTINS_FOR_SCALAR(inverse, SCALAR, OPERATOR , Layout) \
+ TEST_UNARY_BUILTINS_FOR_SCALAR(tanh, SCALAR, OPERATOR , Layout) \
+ TEST_UNARY_BUILTINS_FOR_SCALAR(exp, SCALAR, OPERATOR , Layout) \
+ TEST_UNARY_BUILTINS_FOR_SCALAR(expm1, SCALAR, OPERATOR , Layout) \
+ TEST_UNARY_BUILTINS_FOR_SCALAR(log, SCALAR, OPERATOR , Layout) \
+ TEST_UNARY_BUILTINS_FOR_SCALAR(abs, SCALAR, OPERATOR , Layout) \
+ TEST_UNARY_BUILTINS_FOR_SCALAR(ceil, SCALAR, OPERATOR , Layout) \
+ TEST_UNARY_BUILTINS_FOR_SCALAR(floor, SCALAR, OPERATOR , Layout) \
+ TEST_UNARY_BUILTINS_FOR_SCALAR(round, SCALAR, OPERATOR , Layout) \
+ TEST_UNARY_BUILTINS_FOR_SCALAR(log1p, SCALAR, OPERATOR , Layout)
-#define TEST_IS_THAT_RETURNS_BOOL(SCALAR, FUNC) \
+#define TEST_IS_THAT_RETURNS_BOOL(SCALAR, FUNC, Layout) \
{ \
/* out = in.FUNC() */ \
- Tensor<SCALAR, 3> in(tensorRange); \
- Tensor<bool, 3> out(tensorRange); \
+ Tensor<SCALAR, 3, Layout, int64_t> in(tensorRange); \
+ Tensor<bool, 3, Layout, int64_t> out(tensorRange); \
in = in.random() + static_cast<SCALAR>(0.01); \
SCALAR *gpu_data = static_cast<SCALAR *>( \
sycl_device.allocate(in.size() * sizeof(SCALAR))); \
bool *gpu_data_out = \
static_cast<bool *>(sycl_device.allocate(out.size() * sizeof(bool))); \
- TensorMap<Tensor<SCALAR, 3>> gpu(gpu_data, tensorRange); \
- TensorMap<Tensor<bool, 3>> gpu_out(gpu_data_out, tensorRange); \
+ TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu(gpu_data, tensorRange); \
+ TensorMap<Tensor<bool, 3, Layout, int64_t>> gpu_out(gpu_data_out, tensorRange); \
sycl_device.memcpyHostToDevice(gpu_data, in.data(), \
(in.size()) * sizeof(SCALAR)); \
gpu_out.device(sycl_device) = gpu.FUNC(); \
sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, \
(out.size()) * sizeof(bool)); \
- for (int i = 0; i < out.size(); ++i) { \
+ for (int64_t i = 0; i < out.size(); ++i) { \
VERIFY_IS_EQUAL(out(i), std::FUNC(in(i))); \
} \
sycl_device.deallocate(gpu_data); \
sycl_device.deallocate(gpu_data_out); \
}
-#define TEST_UNARY_BUILTINS(SCALAR) \
- TEST_UNARY_BUILTINS_OPERATOR(SCALAR, +=) \
- TEST_UNARY_BUILTINS_OPERATOR(SCALAR, =) \
- TEST_IS_THAT_RETURNS_BOOL(SCALAR, isnan) \
- TEST_IS_THAT_RETURNS_BOOL(SCALAR, isfinite) \
- TEST_IS_THAT_RETURNS_BOOL(SCALAR, isinf)
+#define TEST_UNARY_BUILTINS(SCALAR, Layout) \
+ TEST_UNARY_BUILTINS_OPERATOR(SCALAR, +=, Layout) \
+ TEST_UNARY_BUILTINS_OPERATOR(SCALAR, =, Layout) \
+ TEST_IS_THAT_RETURNS_BOOL(SCALAR, isnan, Layout) \
+ TEST_IS_THAT_RETURNS_BOOL(SCALAR, isfinite, Layout) \
+ TEST_IS_THAT_RETURNS_BOOL(SCALAR, isinf, Layout)
static void test_builtin_unary_sycl(const Eigen::SyclDevice &sycl_device) {
- int sizeDim1 = 10;
- int sizeDim2 = 10;
- int sizeDim3 = 10;
- array<int, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}};
+ int64_t sizeDim1 = 10;
+ int64_t sizeDim2 = 10;
+ int64_t sizeDim3 = 10;
+ array<int64_t, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}};
- TEST_UNARY_BUILTINS(float)
+ TEST_UNARY_BUILTINS(float, RowMajor)
+ TEST_UNARY_BUILTINS(float, ColMajor)
}
namespace std {
@@ -144,24 +145,24 @@ template <typename T> T cwiseMax(T x, T y) { return std::max(x, y); }
template <typename T> T cwiseMin(T x, T y) { return std::min(x, y); }
}
-#define TEST_BINARY_BUILTINS_FUNC(SCALAR, FUNC) \
+#define TEST_BINARY_BUILTINS_FUNC(SCALAR, FUNC, Layout) \
{ \
/* out = in_1.FUNC(in_2) */ \
- Tensor<SCALAR, 3> in_1(tensorRange); \
- Tensor<SCALAR, 3> in_2(tensorRange); \
- Tensor<SCALAR, 3> out(tensorRange); \
+ Tensor<SCALAR, 3, Layout, int64_t> in_1(tensorRange); \
+ Tensor<SCALAR, 3, Layout, int64_t> in_2(tensorRange); \
+ Tensor<SCALAR, 3, Layout, int64_t> out(tensorRange); \
in_1 = in_1.random() + static_cast<SCALAR>(0.01); \
in_2 = in_2.random() + static_cast<SCALAR>(0.01); \
- Tensor<SCALAR, 3> reference(out); \
+ Tensor<SCALAR, 3, Layout, int64_t> reference(out); \
SCALAR *gpu_data_1 = static_cast<SCALAR *>( \
sycl_device.allocate(in_1.size() * sizeof(SCALAR))); \
SCALAR *gpu_data_2 = static_cast<SCALAR *>( \
sycl_device.allocate(in_2.size() * sizeof(SCALAR))); \
SCALAR *gpu_data_out = static_cast<SCALAR *>( \
sycl_device.allocate(out.size() * sizeof(SCALAR))); \
- TensorMap<Tensor<SCALAR, 3>> gpu_1(gpu_data_1, tensorRange); \
- TensorMap<Tensor<SCALAR, 3>> gpu_2(gpu_data_2, tensorRange); \
- TensorMap<Tensor<SCALAR, 3>> gpu_out(gpu_data_out, tensorRange); \
+ TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_1(gpu_data_1, tensorRange); \
+ TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_2(gpu_data_2, tensorRange); \
+ TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_out(gpu_data_out, tensorRange); \
sycl_device.memcpyHostToDevice(gpu_data_1, in_1.data(), \
(in_1.size()) * sizeof(SCALAR)); \
sycl_device.memcpyHostToDevice(gpu_data_2, in_2.data(), \
@@ -169,7 +170,7 @@ template <typename T> T cwiseMin(T x, T y) { return std::min(x, y); }
gpu_out.device(sycl_device) = gpu_1.FUNC(gpu_2); \
sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, \
(out.size()) * sizeof(SCALAR)); \
- for (int i = 0; i < out.size(); ++i) { \
+ for (int64_t i = 0; i < out.size(); ++i) { \
SCALAR ver = reference(i); \
ver = std::FUNC(in_1(i), in_2(i)); \
VERIFY_IS_APPROX(out(i), ver); \
@@ -179,24 +180,24 @@ template <typename T> T cwiseMin(T x, T y) { return std::min(x, y); }
sycl_device.deallocate(gpu_data_out); \
}
-#define TEST_BINARY_BUILTINS_OPERATORS(SCALAR, OPERATOR) \
+#define TEST_BINARY_BUILTINS_OPERATORS(SCALAR, OPERATOR, Layout) \
{ \
/* out = in_1 OPERATOR in_2 */ \
- Tensor<SCALAR, 3> in_1(tensorRange); \
- Tensor<SCALAR, 3> in_2(tensorRange); \
- Tensor<SCALAR, 3> out(tensorRange); \
+ Tensor<SCALAR, 3, Layout, int64_t> in_1(tensorRange); \
+ Tensor<SCALAR, 3, Layout, int64_t> in_2(tensorRange); \
+ Tensor<SCALAR, 3, Layout, int64_t> out(tensorRange); \
in_1 = in_1.random() + static_cast<SCALAR>(0.01); \
in_2 = in_2.random() + static_cast<SCALAR>(0.01); \
- Tensor<SCALAR, 3> reference(out); \
+ Tensor<SCALAR, 3, Layout, int64_t> reference(out); \
SCALAR *gpu_data_1 = static_cast<SCALAR *>( \
sycl_device.allocate(in_1.size() * sizeof(SCALAR))); \
SCALAR *gpu_data_2 = static_cast<SCALAR *>( \
sycl_device.allocate(in_2.size() * sizeof(SCALAR))); \
SCALAR *gpu_data_out = static_cast<SCALAR *>( \
sycl_device.allocate(out.size() * sizeof(SCALAR))); \
- TensorMap<Tensor<SCALAR, 3>> gpu_1(gpu_data_1, tensorRange); \
- TensorMap<Tensor<SCALAR, 3>> gpu_2(gpu_data_2, tensorRange); \
- TensorMap<Tensor<SCALAR, 3>> gpu_out(gpu_data_out, tensorRange); \
+ TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_1(gpu_data_1, tensorRange); \
+ TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_2(gpu_data_2, tensorRange); \
+ TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_out(gpu_data_out, tensorRange); \
sycl_device.memcpyHostToDevice(gpu_data_1, in_1.data(), \
(in_1.size()) * sizeof(SCALAR)); \
sycl_device.memcpyHostToDevice(gpu_data_2, in_2.data(), \
@@ -204,7 +205,7 @@ template <typename T> T cwiseMin(T x, T y) { return std::min(x, y); }
gpu_out.device(sycl_device) = gpu_1 OPERATOR gpu_2; \
sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, \
(out.size()) * sizeof(SCALAR)); \
- for (int i = 0; i < out.size(); ++i) { \
+ for (int64_t i = 0; i < out.size(); ++i) { \
VERIFY_IS_APPROX(out(i), in_1(i) OPERATOR in_2(i)); \
} \
sycl_device.deallocate(gpu_data_1); \
@@ -212,46 +213,48 @@ template <typename T> T cwiseMin(T x, T y) { return std::min(x, y); }
sycl_device.deallocate(gpu_data_out); \
}
-#define TEST_BINARY_BUILTINS_OPERATORS_THAT_TAKES_SCALAR(SCALAR, OPERATOR) \
+#define TEST_BINARY_BUILTINS_OPERATORS_THAT_TAKES_SCALAR(SCALAR, OPERATOR, Layout) \
{ \
/* out = in_1 OPERATOR 2 */ \
- Tensor<SCALAR, 3> in_1(tensorRange); \
- Tensor<SCALAR, 3> out(tensorRange); \
+ Tensor<SCALAR, 3, Layout, int64_t> in_1(tensorRange); \
+ Tensor<SCALAR, 3, Layout, int64_t> out(tensorRange); \
in_1 = in_1.random() + static_cast<SCALAR>(0.01); \
- Tensor<SCALAR, 3> reference(out); \
+ Tensor<SCALAR, 3, Layout, int64_t> reference(out); \
SCALAR *gpu_data_1 = static_cast<SCALAR *>( \
sycl_device.allocate(in_1.size() * sizeof(SCALAR))); \
SCALAR *gpu_data_out = static_cast<SCALAR *>( \
sycl_device.allocate(out.size() * sizeof(SCALAR))); \
- TensorMap<Tensor<SCALAR, 3>> gpu_1(gpu_data_1, tensorRange); \
- TensorMap<Tensor<SCALAR, 3>> gpu_out(gpu_data_out, tensorRange); \
+ TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_1(gpu_data_1, tensorRange); \
+ TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_out(gpu_data_out, tensorRange); \
sycl_device.memcpyHostToDevice(gpu_data_1, in_1.data(), \
(in_1.size()) * sizeof(SCALAR)); \
gpu_out.device(sycl_device) = gpu_1 OPERATOR 2; \
sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, \
(out.size()) * sizeof(SCALAR)); \
- for (int i = 0; i < out.size(); ++i) { \
+ for (int64_t i = 0; i < out.size(); ++i) { \
VERIFY_IS_APPROX(out(i), in_1(i) OPERATOR 2); \
} \
sycl_device.deallocate(gpu_data_1); \
sycl_device.deallocate(gpu_data_out); \
}
-#define TEST_BINARY_BUILTINS(SCALAR) \
- TEST_BINARY_BUILTINS_FUNC(SCALAR, cwiseMax) \
- TEST_BINARY_BUILTINS_FUNC(SCALAR, cwiseMin) \
- TEST_BINARY_BUILTINS_OPERATORS(SCALAR, +) \
- TEST_BINARY_BUILTINS_OPERATORS(SCALAR, -) \
- TEST_BINARY_BUILTINS_OPERATORS(SCALAR, *) \
- TEST_BINARY_BUILTINS_OPERATORS(SCALAR, /)
+#define TEST_BINARY_BUILTINS(SCALAR, Layout) \
+ TEST_BINARY_BUILTINS_FUNC(SCALAR, cwiseMax , Layout) \
+ TEST_BINARY_BUILTINS_FUNC(SCALAR, cwiseMin , Layout) \
+ TEST_BINARY_BUILTINS_OPERATORS(SCALAR, + , Layout) \
+ TEST_BINARY_BUILTINS_OPERATORS(SCALAR, - , Layout) \
+ TEST_BINARY_BUILTINS_OPERATORS(SCALAR, * , Layout) \
+ TEST_BINARY_BUILTINS_OPERATORS(SCALAR, / , Layout)
static void test_builtin_binary_sycl(const Eigen::SyclDevice &sycl_device) {
- int sizeDim1 = 10;
- int sizeDim2 = 10;
- int sizeDim3 = 10;
- array<int, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}};
- TEST_BINARY_BUILTINS(float)
- TEST_BINARY_BUILTINS_OPERATORS_THAT_TAKES_SCALAR(int, %)
+ int64_t sizeDim1 = 10;
+ int64_t sizeDim2 = 10;
+ int64_t sizeDim3 = 10;
+ array<int64_t, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}};
+ TEST_BINARY_BUILTINS(float, RowMajor)
+ TEST_BINARY_BUILTINS_OPERATORS_THAT_TAKES_SCALAR(int, %, RowMajor)
+ TEST_BINARY_BUILTINS(float, ColMajor)
+ TEST_BINARY_BUILTINS_OPERATORS_THAT_TAKES_SCALAR(int, %, ColMajor)
}
void test_cxx11_tensor_builtins_sycl() {
diff --git a/unsupported/test/cxx11_tensor_concatenation_sycl.cpp b/unsupported/test/cxx11_tensor_concatenation_sycl.cpp
index 5a324b44c..e3023a368 100644
--- a/unsupported/test/cxx11_tensor_concatenation_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_concatenation_sycl.cpp
@@ -14,7 +14,7 @@
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
#define EIGEN_TEST_FUNC cxx11_tensor_concatenation_sycl
-#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
#define EIGEN_USE_SYCL
#include "main.h"
@@ -22,39 +22,39 @@
using Eigen::Tensor;
-template<typename DataType, int DataLayout, typename Index>
+template<typename DataType, int DataLayout, typename IndexType>
static void test_simple_concatenation(const Eigen::SyclDevice& sycl_device)
{
- Index leftDim1 = 2;
- Index leftDim2 = 3;
- Index leftDim3 = 1;
- Eigen::array<Index, 3> leftRange = {{leftDim1, leftDim2, leftDim3}};
- Index rightDim1 = 2;
- Index rightDim2 = 3;
- Index rightDim3 = 1;
- Eigen::array<Index, 3> rightRange = {{rightDim1, rightDim2, rightDim3}};
-
- //Index concatDim1 = 3;
-// Index concatDim2 = 3;
-// Index concatDim3 = 1;
- //Eigen::array<Index, 3> concatRange = {{concatDim1, concatDim2, concatDim3}};
-
- Tensor<DataType, 3, DataLayout, Index> left(leftRange);
- Tensor<DataType, 3, DataLayout, Index> right(rightRange);
+ IndexType leftDim1 = 2;
+ IndexType leftDim2 = 3;
+ IndexType leftDim3 = 1;
+ Eigen::array<IndexType, 3> leftRange = {{leftDim1, leftDim2, leftDim3}};
+ IndexType rightDim1 = 2;
+ IndexType rightDim2 = 3;
+ IndexType rightDim3 = 1;
+ Eigen::array<IndexType, 3> rightRange = {{rightDim1, rightDim2, rightDim3}};
+
+ //IndexType concatDim1 = 3;
+// IndexType concatDim2 = 3;
+// IndexType concatDim3 = 1;
+ //Eigen::array<IndexType, 3> concatRange = {{concatDim1, concatDim2, concatDim3}};
+
+ Tensor<DataType, 3, DataLayout, IndexType> left(leftRange);
+ Tensor<DataType, 3, DataLayout, IndexType> right(rightRange);
left.setRandom();
right.setRandom();
DataType * gpu_in1_data = static_cast<DataType*>(sycl_device.allocate(left.dimensions().TotalSize()*sizeof(DataType)));
DataType * gpu_in2_data = static_cast<DataType*>(sycl_device.allocate(right.dimensions().TotalSize()*sizeof(DataType)));
- Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, Index>> gpu_in1(gpu_in1_data, leftRange);
- Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, Index>> gpu_in2(gpu_in2_data, rightRange);
+ Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_in1(gpu_in1_data, leftRange);
+ Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_in2(gpu_in2_data, rightRange);
sycl_device.memcpyHostToDevice(gpu_in1_data, left.data(),(left.dimensions().TotalSize())*sizeof(DataType));
sycl_device.memcpyHostToDevice(gpu_in2_data, right.data(),(right.dimensions().TotalSize())*sizeof(DataType));
///
- Tensor<DataType, 3, DataLayout, Index> concatenation1(leftDim1+rightDim1, leftDim2, leftDim3);
+ Tensor<DataType, 3, DataLayout, IndexType> concatenation1(leftDim1+rightDim1, leftDim2, leftDim3);
DataType * gpu_out_data1 = static_cast<DataType*>(sycl_device.allocate(concatenation1.dimensions().TotalSize()*sizeof(DataType)));
- Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, Index>> gpu_out1(gpu_out_data1, concatenation1.dimensions());
+ Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_out1(gpu_out_data1, concatenation1.dimensions());
//concatenation = left.concatenate(right, 0);
gpu_out1.device(sycl_device) =gpu_in1.concatenate(gpu_in2, 0);
@@ -63,19 +63,19 @@ static void test_simple_concatenation(const Eigen::SyclDevice& sycl_device)
VERIFY_IS_EQUAL(concatenation1.dimension(0), 4);
VERIFY_IS_EQUAL(concatenation1.dimension(1), 3);
VERIFY_IS_EQUAL(concatenation1.dimension(2), 1);
- for (int j = 0; j < 3; ++j) {
- for (int i = 0; i < 2; ++i) {
+ for (IndexType j = 0; j < 3; ++j) {
+ for (IndexType i = 0; i < 2; ++i) {
VERIFY_IS_EQUAL(concatenation1(i, j, 0), left(i, j, 0));
}
- for (int i = 2; i < 4; ++i) {
+ for (IndexType i = 2; i < 4; ++i) {
VERIFY_IS_EQUAL(concatenation1(i, j, 0), right(i - 2, j, 0));
}
}
sycl_device.deallocate(gpu_out_data1);
- Tensor<DataType, 3, DataLayout, Index> concatenation2(leftDim1, leftDim2 +rightDim2, leftDim3);
+ Tensor<DataType, 3, DataLayout, IndexType> concatenation2(leftDim1, leftDim2 +rightDim2, leftDim3);
DataType * gpu_out_data2 = static_cast<DataType*>(sycl_device.allocate(concatenation2.dimensions().TotalSize()*sizeof(DataType)));
- Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, Index>> gpu_out2(gpu_out_data2, concatenation2.dimensions());
+ Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_out2(gpu_out_data2, concatenation2.dimensions());
gpu_out2.device(sycl_device) =gpu_in1.concatenate(gpu_in2, 1);
sycl_device.memcpyDeviceToHost(concatenation2.data(), gpu_out_data2,(concatenation2.dimensions().TotalSize())*sizeof(DataType));
@@ -83,18 +83,18 @@ static void test_simple_concatenation(const Eigen::SyclDevice& sycl_device)
VERIFY_IS_EQUAL(concatenation2.dimension(0), 2);
VERIFY_IS_EQUAL(concatenation2.dimension(1), 6);
VERIFY_IS_EQUAL(concatenation2.dimension(2), 1);
- for (int i = 0; i < 2; ++i) {
- for (int j = 0; j < 3; ++j) {
+ for (IndexType i = 0; i < 2; ++i) {
+ for (IndexType j = 0; j < 3; ++j) {
VERIFY_IS_EQUAL(concatenation2(i, j, 0), left(i, j, 0));
}
- for (int j = 3; j < 6; ++j) {
+ for (IndexType j = 3; j < 6; ++j) {
VERIFY_IS_EQUAL(concatenation2(i, j, 0), right(i, j - 3, 0));
}
}
sycl_device.deallocate(gpu_out_data2);
- Tensor<DataType, 3, DataLayout, Index> concatenation3(leftDim1, leftDim2, leftDim3+rightDim3);
+ Tensor<DataType, 3, DataLayout, IndexType> concatenation3(leftDim1, leftDim2, leftDim3+rightDim3);
DataType * gpu_out_data3 = static_cast<DataType*>(sycl_device.allocate(concatenation3.dimensions().TotalSize()*sizeof(DataType)));
- Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, Index>> gpu_out3(gpu_out_data3, concatenation3.dimensions());
+ Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_out3(gpu_out_data3, concatenation3.dimensions());
gpu_out3.device(sycl_device) =gpu_in1.concatenate(gpu_in2, 2);
sycl_device.memcpyDeviceToHost(concatenation3.data(), gpu_out_data3,(concatenation3.dimensions().TotalSize())*sizeof(DataType));
@@ -102,8 +102,8 @@ static void test_simple_concatenation(const Eigen::SyclDevice& sycl_device)
VERIFY_IS_EQUAL(concatenation3.dimension(0), 2);
VERIFY_IS_EQUAL(concatenation3.dimension(1), 3);
VERIFY_IS_EQUAL(concatenation3.dimension(2), 2);
- for (int i = 0; i < 2; ++i) {
- for (int j = 0; j < 3; ++j) {
+ for (IndexType i = 0; i < 2; ++i) {
+ for (IndexType j = 0; j < 3; ++j) {
VERIFY_IS_EQUAL(concatenation3(i, j, 0), left(i, j, 0));
VERIFY_IS_EQUAL(concatenation3(i, j, 1), right(i, j, 0));
}
@@ -112,25 +112,25 @@ static void test_simple_concatenation(const Eigen::SyclDevice& sycl_device)
sycl_device.deallocate(gpu_in1_data);
sycl_device.deallocate(gpu_in2_data);
}
-template<typename DataType, int DataLayout, typename Index>
+template<typename DataType, int DataLayout, typename IndexType>
static void test_concatenation_as_lvalue(const Eigen::SyclDevice& sycl_device)
{
- Index leftDim1 = 2;
- Index leftDim2 = 3;
- Eigen::array<Index, 2> leftRange = {{leftDim1, leftDim2}};
+ IndexType leftDim1 = 2;
+ IndexType leftDim2 = 3;
+ Eigen::array<IndexType, 2> leftRange = {{leftDim1, leftDim2}};
- Index rightDim1 = 2;
- Index rightDim2 = 3;
- Eigen::array<Index, 2> rightRange = {{rightDim1, rightDim2}};
+ IndexType rightDim1 = 2;
+ IndexType rightDim2 = 3;
+ Eigen::array<IndexType, 2> rightRange = {{rightDim1, rightDim2}};
- Index concatDim1 = 4;
- Index concatDim2 = 3;
- Eigen::array<Index, 2> resRange = {{concatDim1, concatDim2}};
+ IndexType concatDim1 = 4;
+ IndexType concatDim2 = 3;
+ Eigen::array<IndexType, 2> resRange = {{concatDim1, concatDim2}};
- Tensor<DataType, 2, DataLayout, Index> left(leftRange);
- Tensor<DataType, 2, DataLayout, Index> right(rightRange);
- Tensor<DataType, 2, DataLayout, Index> result(resRange);
+ Tensor<DataType, 2, DataLayout, IndexType> left(leftRange);
+ Tensor<DataType, 2, DataLayout, IndexType> right(rightRange);
+ Tensor<DataType, 2, DataLayout, IndexType> result(resRange);
left.setRandom();
right.setRandom();
@@ -141,9 +141,9 @@ static void test_concatenation_as_lvalue(const Eigen::SyclDevice& sycl_device)
DataType * gpu_out_data = static_cast<DataType*>(sycl_device.allocate(result.dimensions().TotalSize()*sizeof(DataType)));
- Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, Index>> gpu_in1(gpu_in1_data, leftRange);
- Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, Index>> gpu_in2(gpu_in2_data, rightRange);
- Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, Index>> gpu_out(gpu_out_data, resRange);
+ Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> gpu_in1(gpu_in1_data, leftRange);
+ Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> gpu_in2(gpu_in2_data, rightRange);
+ Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> gpu_out(gpu_out_data, resRange);
sycl_device.memcpyHostToDevice(gpu_in1_data, left.data(),(left.dimensions().TotalSize())*sizeof(DataType));
sycl_device.memcpyHostToDevice(gpu_in2_data, right.data(),(right.dimensions().TotalSize())*sizeof(DataType));
@@ -154,8 +154,8 @@ static void test_concatenation_as_lvalue(const Eigen::SyclDevice& sycl_device)
sycl_device.memcpyDeviceToHost(left.data(), gpu_in1_data,(left.dimensions().TotalSize())*sizeof(DataType));
sycl_device.memcpyDeviceToHost(right.data(), gpu_in2_data,(right.dimensions().TotalSize())*sizeof(DataType));
- for (int i = 0; i < 2; ++i) {
- for (int j = 0; j < 3; ++j) {
+ for (IndexType i = 0; i < 2; ++i) {
+ for (IndexType j = 0; j < 3; ++j) {
VERIFY_IS_EQUAL(left(i, j), result(i, j));
VERIFY_IS_EQUAL(right(i, j), result(i+2, j));
}
@@ -169,9 +169,9 @@ static void test_concatenation_as_lvalue(const Eigen::SyclDevice& sycl_device)
template <typename DataType, typename Dev_selector> void tensorConcat_perDevice(Dev_selector s){
QueueInterface queueInterface(s);
auto sycl_device = Eigen::SyclDevice(&queueInterface);
- test_simple_concatenation<DataType, RowMajor, int>(sycl_device);
- test_simple_concatenation<DataType, ColMajor, int>(sycl_device);
- test_concatenation_as_lvalue<DataType, ColMajor, int>(sycl_device);
+ test_simple_concatenation<DataType, RowMajor, int64_t>(sycl_device);
+ test_simple_concatenation<DataType, ColMajor, int64_t>(sycl_device);
+ test_concatenation_as_lvalue<DataType, ColMajor, int64_t>(sycl_device);
}
void test_cxx11_tensor_concatenation_sycl() {
for (const auto& device :Eigen::get_sycl_supported_devices()) {
diff --git a/unsupported/test/cxx11_tensor_contract_sycl.cpp b/unsupported/test/cxx11_tensor_contract_sycl.cpp
index 0221da110..5bace66c5 100644
--- a/unsupported/test/cxx11_tensor_contract_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_contract_sycl.cpp
@@ -14,7 +14,7 @@
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
#define EIGEN_TEST_FUNC cxx11_tensor_contract_sycl
-#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
#define EIGEN_USE_SYCL
#include <iostream>
@@ -28,104 +28,172 @@ using Eigen::array;
using Eigen::SyclDevice;
using Eigen::Tensor;
using Eigen::TensorMap;
-typedef Tensor<float, 1>::DimensionPair DimPair;
-template<int DataLayout, typename Device>
-void test_sycl_contraction(const Device& sycl_device, int m_size, int k_size, int n_size)
+template<int DataLayout, typename DataType, typename IndexType, typename Device>
+static void test_sycl_contraction(const Device& sycl_device, IndexType m_size, IndexType k_size, IndexType n_size)
{
+ typedef typename Tensor<DataType, 1, DataLayout, IndexType>::DimensionPair DimPair;
+ static const DataType error_threshold =1e-4f;
// std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl;
// with these dimensions, the output has 300 * 140 elements, which is
// more than 30 * 1024, which is the number of threads in blocks on
// a 15 SM GK110 GPU
- Tensor<float, 2, DataLayout> t_left(m_size, k_size);
- Tensor<float, 2, DataLayout> t_right(k_size, n_size);
- Tensor<float, 2, DataLayout> t_result(m_size, n_size);
- Tensor<float, 2, DataLayout> t_result_gpu(m_size, n_size);
+ Tensor<DataType, 2, DataLayout, IndexType> t_left(m_size, k_size);
+ Tensor<DataType, 2, DataLayout, IndexType> t_right(k_size, n_size);
+ Tensor<DataType, 2, DataLayout, IndexType> t_result(m_size, n_size);
+ Tensor<DataType, 2, DataLayout, IndexType> t_result_gpu(m_size, n_size);
// Eigen::array<DimPair, 1> dims(DimPair(1, 0));
Eigen::array<DimPair, 1> dims = {{DimPair(1, 0)}};
- Eigen::array<int, 2> left_dims = {{m_size, k_size}};
- Eigen::array<int, 2> right_dims = {{k_size, n_size}};
- Eigen::array<int, 2> result_dims = {{m_size, n_size}};
+ Eigen::array<IndexType, 2> left_dims = {{m_size, k_size}};
+ Eigen::array<IndexType, 2> right_dims = {{k_size, n_size}};
+ Eigen::array<IndexType, 2> result_dims = {{m_size, n_size}};
t_left.setRandom();
t_right.setRandom();
- std::size_t t_left_bytes = t_left.size() * sizeof(float);
- std::size_t t_right_bytes = t_right.size() * sizeof(float);
- std::size_t t_result_bytes = t_result.size() * sizeof(float);
+ std::size_t t_left_bytes = t_left.size() * sizeof(DataType);
+ std::size_t t_right_bytes = t_right.size() * sizeof(DataType);
+ std::size_t t_result_bytes = t_result.size() * sizeof(DataType);
- float * d_t_left = static_cast<float*>(sycl_device.allocate(t_left_bytes));
- float * d_t_right = static_cast<float*>(sycl_device.allocate(t_right_bytes));
- float * d_t_result = static_cast<float*>(sycl_device.allocate(t_result_bytes));
+ DataType * d_t_left = static_cast<DataType*>(sycl_device.allocate(t_left_bytes));
+ DataType * d_t_right = static_cast<DataType*>(sycl_device.allocate(t_right_bytes));
+ DataType * d_t_result = static_cast<DataType*>(sycl_device.allocate(t_result_bytes));
- Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > gpu_t_left(d_t_left, left_dims);
- Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > gpu_t_right(d_t_right, right_dims);
- Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > gpu_t_result(d_t_result, result_dims);
+ Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_t_left(d_t_left, left_dims);
+ Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_t_right(d_t_right, right_dims);
+ Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_t_result(d_t_result, result_dims);
sycl_device.memcpyHostToDevice(d_t_left, t_left.data(),t_left_bytes);
sycl_device.memcpyHostToDevice(d_t_right, t_right.data(),t_right_bytes);
gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims);
+ sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result, t_result_bytes);
+
t_result = t_left.contract(t_right, dims);
+ for (IndexType i = 0; i < t_result.size(); i++) {
+ if (static_cast<DataType>(fabs(t_result(i) - t_result_gpu(i))) < error_threshold) {
+ continue;
+ }
+ if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), error_threshold)) {
+ continue;
+ }
+      std::cout << "mismatch detected at index " << i << ": " << t_result(i)
+ << " vs " << t_result_gpu(i) << std::endl;
+ assert(false);
+ }
+ sycl_device.deallocate(d_t_left);
+ sycl_device.deallocate(d_t_right);
+ sycl_device.deallocate(d_t_result);
+}
+
+template<int DataLayout, typename DataType, typename IndexType, typename Device>
+void test_TF(const Device& sycl_device)
+{
+ typedef typename Tensor<DataType, 1, DataLayout, IndexType>::DimensionPair DimPair;
+ static const DataType error_threshold =1e-4f;
+ Eigen::array<IndexType, 2> left_dims = {{2, 3}};
+ Eigen::array<IndexType, 2> right_dims = {{3, 1}};
+ Eigen::array<IndexType, 2> res_dims = {{2, 1}};
+ Eigen::array<DimPair, 1> dims = {{DimPair(1, 0)}};
+
+
+ Tensor<DataType, 2, DataLayout, IndexType> t_left(left_dims);
+ Tensor<DataType, 2, DataLayout, IndexType> t_right(right_dims);
+ Tensor<DataType, 2, DataLayout, IndexType> t_result_gpu(res_dims);
+ Tensor<DataType, 2, DataLayout, IndexType> t_result(res_dims);
+
+ t_left.data()[0] = 1.0f;
+ t_left.data()[1] = 2.0f;
+ t_left.data()[2] = 3.0f;
+ t_left.data()[3] = 4.0f;
+ t_left.data()[4] = 5.0f;
+ t_left.data()[5] = 6.0f;
+
+ t_right.data()[0] = -1.0f;
+ t_right.data()[1] = 0.5f;
+ t_right.data()[2] = 2.0f;
+
+ std::size_t t_left_bytes = t_left.size() * sizeof(DataType);
+ std::size_t t_right_bytes = t_right.size() * sizeof(DataType);
+ std::size_t t_result_bytes = t_result.size()*sizeof(DataType);
+
+
+ DataType * d_t_left = static_cast<DataType*>(sycl_device.allocate(t_left_bytes));
+ DataType * d_t_right = static_cast<DataType*>(sycl_device.allocate(t_right_bytes));
+ DataType * d_t_result = static_cast<DataType*>(sycl_device.allocate(t_result_bytes));
+
+ Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_t_left(d_t_left, left_dims);
+ Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_t_right(d_t_right, right_dims);
+ Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_t_result(d_t_result, res_dims);
+
+ sycl_device.memcpyHostToDevice(d_t_left, t_left.data(),t_left_bytes);
+ sycl_device.memcpyHostToDevice(d_t_right, t_right.data(),t_right_bytes);
+
+ gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims);
sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result, t_result_bytes);
+ t_result = t_left.contract(t_right, dims);
- for (DenseIndex i = 0; i < t_result.size(); i++) {
- if (static_cast<float>(fabs(t_result(i) - t_result_gpu(i))) < 1e-4f) {
+ for (IndexType i = 0; i < t_result.size(); i++) {
+ if (static_cast<DataType>(fabs(t_result(i) - t_result_gpu(i))) < error_threshold) {
continue;
}
- if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), 1e-4f)) {
+ if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), error_threshold)) {
continue;
}
- std::cout << "mismatch detected at index " << i << ": " << t_result(i)
+    std::cout << "mismatch detected at index " << i << ": " << t_result(i)
<< " vs " << t_result_gpu(i) << std::endl;
assert(false);
}
sycl_device.deallocate(d_t_left);
sycl_device.deallocate(d_t_right);
sycl_device.deallocate(d_t_result);
-}
-template<int DataLayout, typename Device>
-void test_scalar(const Device& sycl_device, int m_size, int k_size, int n_size)
+}
+
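As a sanity check on the hard-coded values in test_TF above: with ColMajor storage the left operand is the 2x3 matrix ((1, 3, 5), (2, 4, 6)) and the right operand is (-1, 0.5, 2)^T, so the expected result is (1*(-1) + 3*0.5 + 5*2, 2*(-1) + 4*0.5 + 6*2) = (10.5, 12); with RowMajor storage the left operand reads ((1, 2, 3), (4, 5, 6)) and the expected result is (6, 10.5).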
+template<int DataLayout, typename DataType, typename IndexType, typename Device>
+void test_scalar(const Device& sycl_device, IndexType m_size, IndexType k_size, IndexType n_size)
{
//std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl;
// with these dimensions, the output has 300 * 140 elements, which is
// more than 30 * 1024, which is the number of threads in blocks on
// a 15 SM GK110 GPU
- Tensor<float, 2, DataLayout> t_left(m_size, k_size);
- Tensor<float, 2, DataLayout> t_right(k_size, n_size);
- Tensor<float, 0, DataLayout> t_result;
- Tensor<float, 0, DataLayout> t_result_gpu;
+ typedef typename Tensor<DataType, 1, DataLayout, IndexType>::DimensionPair DimPair;
+ static const DataType error_threshold =1e-4f;
+ Tensor<DataType, 2, DataLayout, IndexType> t_left(m_size, k_size);
+ Tensor<DataType, 2, DataLayout, IndexType> t_right(k_size, n_size);
+ Tensor<DataType, 0, DataLayout, IndexType> t_result;
+ Tensor<DataType, 0, DataLayout, IndexType> t_result_gpu;
Eigen::array<DimPair, 2> dims = {{DimPair(0, 0), DimPair(1, 1)}};
- Eigen::array<int, 2> left_dims = {{m_size, k_size}};
- Eigen::array<int, 2> right_dims = {{k_size, n_size}};
+ Eigen::array<IndexType, 2> left_dims = {{m_size, k_size}};
+ Eigen::array<IndexType, 2> right_dims = {{k_size, n_size}};
t_left.setRandom();
t_right.setRandom();
- std::size_t t_left_bytes = t_left.size() * sizeof(float);
- std::size_t t_right_bytes = t_right.size() * sizeof(float);
- std::size_t t_result_bytes = sizeof(float);
+ std::size_t t_left_bytes = t_left.size() * sizeof(DataType);
+ std::size_t t_right_bytes = t_right.size() * sizeof(DataType);
+ std::size_t t_result_bytes = sizeof(DataType);
- float * d_t_left = static_cast<float*>(sycl_device.allocate(t_left_bytes));
- float * d_t_right = static_cast<float*>(sycl_device.allocate(t_right_bytes));
- float * d_t_result = static_cast<float*>(sycl_device.allocate(t_result_bytes));
+ DataType * d_t_left = static_cast<DataType*>(sycl_device.allocate(t_left_bytes));
+ DataType * d_t_right = static_cast<DataType*>(sycl_device.allocate(t_right_bytes));
+ DataType * d_t_result = static_cast<DataType*>(sycl_device.allocate(t_result_bytes));
- Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > gpu_t_left(d_t_left, left_dims);
- Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > gpu_t_right(d_t_right, right_dims);
- Eigen::TensorMap<Eigen::Tensor<float, 0, DataLayout> > gpu_t_result(d_t_result);
+ Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_t_left(d_t_left, left_dims);
+ Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_t_right(d_t_right, right_dims);
+ Eigen::TensorMap<Eigen::Tensor<DataType, 0, DataLayout, IndexType> > gpu_t_result(d_t_result);
sycl_device.memcpyHostToDevice(d_t_left, t_left.data(),t_left_bytes);
sycl_device.memcpyHostToDevice(d_t_right, t_right.data(),t_right_bytes);
gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims);
+ sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result, t_result_bytes);
+
t_result = t_left.contract(t_right, dims);
- sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result, t_result_bytes);
- if (static_cast<float>(fabs(t_result() - t_result_gpu())) > 1e-4f &&
- !Eigen::internal::isApprox(t_result(), t_result_gpu(), 1e-4f)) {
+ if (static_cast<DataType>(fabs(t_result() - t_result_gpu())) > error_threshold &&
+ !Eigen::internal::isApprox(t_result(), t_result_gpu(), error_threshold)) {
std::cout << "mismatch detected: " << t_result()
<< " vs " << t_result_gpu() << std::endl;
assert(false);
@@ -137,47 +205,47 @@ void test_scalar(const Device& sycl_device, int m_size, int k_size, int n_size)
}
-template<int DataLayout, typename Device>
+template<int DataLayout, typename DataType, typename IndexType, typename Device>
void test_sycl_contraction_m(const Device& sycl_device) {
- for (int k = 32; k < 256; k++) {
- test_sycl_contraction<DataLayout>(sycl_device, k, 128, 128);
+ for (IndexType k = 32; k < 256; k++) {
+ test_sycl_contraction<DataLayout, DataType, IndexType>(sycl_device, k, 128, 128);
}
}
-template<int DataLayout, typename Device>
+template<int DataLayout, typename DataType, typename IndexType, typename Device>
void test_sycl_contraction_k(const Device& sycl_device) {
- for (int k = 32; k < 256; k++) {
- test_sycl_contraction<DataLayout>(sycl_device, 128, k, 128);
+ for (IndexType k = 32; k < 256; k++) {
+ test_sycl_contraction<DataLayout, DataType, IndexType>(sycl_device, 128, k, 128);
}
}
-template<int DataLayout, typename Device>
+template<int DataLayout, typename DataType, typename IndexType, typename Device>
void test_sycl_contraction_n(const Device& sycl_device) {
- for (int k = 32; k < 256; k++) {
- test_sycl_contraction<DataLayout>(sycl_device, 128, 128, k);
+ for (IndexType k = 32; k < 256; k++) {
+ test_sycl_contraction<DataLayout, DataType, IndexType>(sycl_device, 128, 128, k);
}
}
-template<int DataLayout, typename Device>
+template<int DataLayout, typename DataType, typename IndexType, typename Device>
void test_sycl_contraction_sizes(const Device& sycl_device) {
- int m_sizes[] = { 31, 39, 63, 64, 65,
+ IndexType m_sizes[] = { 31, 39, 63, 64, 65,
127, 129, 255, 257 , 511,
512, 513, 1023, 1024, 1025};
- int n_sizes[] = { 31, 39, 63, 64, 65,
+ IndexType n_sizes[] = { 31, 39, 63, 64, 65,
127, 129, 255, 257, 511,
512, 513, 1023, 1024, 1025};
- int k_sizes[] = { 31, 39, 63, 64, 65,
+ IndexType k_sizes[] = { 31, 39, 63, 64, 65,
95, 96, 127, 129, 255,
257, 511, 512, 513, 1023,
1024, 1025};
- for (int i = 0; i < 15; i++) {
- for (int j = 0; j < 15; j++) {
- for (int k = 0; k < 17; k++) {
- test_sycl_contraction<DataLayout>(sycl_device, m_sizes[i], n_sizes[j], k_sizes[k]);
+ for (IndexType i = 0; i < 15; i++) {
+ for (IndexType j = 0; j < 15; j++) {
+ for (IndexType k = 0; k < 17; k++) {
+ test_sycl_contraction<DataLayout, DataType,IndexType>(sycl_device, m_sizes[i], n_sizes[j], k_sizes[k]);
}
}
}
@@ -186,24 +254,27 @@ void test_sycl_contraction_sizes(const Device& sycl_device) {
template <typename Dev_selector> void tensorContractionPerDevice(Dev_selector& s){
QueueInterface queueInterface(s);
auto sycl_device=Eigen::SyclDevice(&queueInterface);
- test_sycl_contraction<ColMajor>(sycl_device, 32, 32, 32);
- test_sycl_contraction<RowMajor>(sycl_device, 32, 32, 32);
- test_scalar<ColMajor>(sycl_device, 32, 32, 32);
- test_scalar<RowMajor>(sycl_device, 32, 32, 32);
+ test_sycl_contraction<ColMajor, float,int64_t>(sycl_device, 32, 32, 32);
+ test_sycl_contraction<RowMajor,float,int64_t>(sycl_device, 32, 32, 32);
+ test_scalar<ColMajor,float,int64_t>(sycl_device, 32, 32, 32);
+ test_scalar<RowMajor,float,int64_t>(sycl_device, 32, 32, 32);
std::chrono::time_point<std::chrono::system_clock> start, end;
start = std::chrono::system_clock::now();
- test_sycl_contraction<ColMajor>(sycl_device, 128, 128, 128);
- test_sycl_contraction<RowMajor>(sycl_device, 128, 128, 128);
- test_scalar<ColMajor>(sycl_device, 128, 128, 128);
- test_scalar<RowMajor>(sycl_device, 128, 128, 128);
- test_sycl_contraction_m<ColMajor>(sycl_device);
- test_sycl_contraction_m<RowMajor>(sycl_device);
- test_sycl_contraction_n<ColMajor>(sycl_device);
- test_sycl_contraction_n<RowMajor>(sycl_device);
- test_sycl_contraction_k<ColMajor>(sycl_device);
- test_sycl_contraction_k<RowMajor>(sycl_device);
- test_sycl_contraction_sizes<ColMajor>(sycl_device);
- test_sycl_contraction_sizes<RowMajor>(sycl_device);
+ test_sycl_contraction<ColMajor, float, int64_t>(sycl_device, 128, 128, 128);
+ test_sycl_contraction<RowMajor, float, int64_t>(sycl_device, 128, 128, 128);
+ test_scalar<ColMajor, float, int64_t>(sycl_device, 128, 128, 128);
+ test_scalar<RowMajor, float, int64_t>(sycl_device, 128, 128, 128);
+ test_sycl_contraction_m<ColMajor, float, int64_t>(sycl_device);
+ test_sycl_contraction_m<RowMajor, float, int64_t>(sycl_device);
+ test_sycl_contraction_n<ColMajor, float, int64_t>(sycl_device);
+ test_sycl_contraction_n<RowMajor, float, int64_t>(sycl_device);
+ test_sycl_contraction_k<ColMajor, float, int64_t>(sycl_device);
+ test_sycl_contraction_k<RowMajor, float, int64_t>(sycl_device);
+ test_sycl_contraction_sizes<ColMajor, float, int64_t>(sycl_device);
+ test_sycl_contraction_sizes<RowMajor, float, int64_t>(sycl_device);
+ test_TF<RowMajor, float, int64_t>(sycl_device);
+ test_TF<ColMajor, float, int64_t>(sycl_device);
+
end = std::chrono::system_clock::now();
std::chrono::duration<double> elapsed_seconds = end-start;
std::time_t end_time = std::chrono::system_clock::to_time_t(end);
@@ -211,6 +282,7 @@ template <typename Dev_selector> void tensorContractionPerDevice(Dev_selector& s
<< "elapsed time: " << elapsed_seconds.count() << "s\n";
}
+
void test_cxx11_tensor_contract_sycl() {
for (const auto& device :Eigen::get_sycl_supported_devices()) {
CALL_SUBTEST(tensorContractionPerDevice(device));
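Every contraction test above follows the same round trip: build host tensors, evaluate the expression on the host for reference, allocate raw device buffers, wrap them in TensorMap, run the same expression through .device(sycl_device), copy back, and compare. A minimal sketch of that skeleton, assuming only the Eigen SYCL API already exercised in this patch (the function name and the m/k/n arguments are illustrative, not part of the patch):

template <int DataLayout, typename DataType, typename IndexType>
void contraction_roundtrip_sketch(const Eigen::SyclDevice& dev,
                                  IndexType m, IndexType k, IndexType n) {
  typedef Eigen::Tensor<DataType, 2, DataLayout, IndexType> Tensor2;
  Tensor2 lhs(m, k), rhs(k, n), out(m, n), expected(m, n);
  lhs.setRandom(); rhs.setRandom();
  // Contract the inner dimension: expected(i,j) = sum_p lhs(i,p) * rhs(p,j).
  Eigen::array<Eigen::IndexPair<IndexType>, 1> dims = {{Eigen::IndexPair<IndexType>(1, 0)}};
  expected = lhs.contract(rhs, dims);  // host reference
  DataType* d_lhs = static_cast<DataType*>(dev.allocate(lhs.size() * sizeof(DataType)));
  DataType* d_rhs = static_cast<DataType*>(dev.allocate(rhs.size() * sizeof(DataType)));
  DataType* d_out = static_cast<DataType*>(dev.allocate(out.size() * sizeof(DataType)));
  Eigen::TensorMap<Tensor2> g_lhs(d_lhs, m, k), g_rhs(d_rhs, k, n), g_out(d_out, m, n);
  dev.memcpyHostToDevice(d_lhs, lhs.data(), lhs.size() * sizeof(DataType));
  dev.memcpyHostToDevice(d_rhs, rhs.data(), rhs.size() * sizeof(DataType));
  g_out.device(dev) = g_lhs.contract(g_rhs, dims);  // same expression on the device
  dev.memcpyDeviceToHost(out.data(), d_out, out.size() * sizeof(DataType));
  for (IndexType i = 0; i < m * n; i++)
    VERIFY_IS_APPROX(out.data()[i], expected.data()[i]);
  dev.deallocate(d_lhs); dev.deallocate(d_rhs); dev.deallocate(d_out);
}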
diff --git a/unsupported/test/cxx11_tensor_convolution_sycl.cpp b/unsupported/test/cxx11_tensor_convolution_sycl.cpp
new file mode 100644
index 000000000..a4226a63a
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_convolution_sycl.cpp
@@ -0,0 +1,469 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli Codeplay Software Ltd.
+// Ralph Potter Codeplay Software Ltd.
+// Luke Iwanski Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+#define EIGEN_TEST_FUNC cxx11_tensor_convolution_sycl
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+#include <iostream>
+#include <chrono>
+#include <ctime>
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+#include <iomanip>
+
+using Eigen::array;
+using Eigen::SyclDevice;
+using Eigen::Tensor;
+using Eigen::TensorMap;
+static const float error_threshold = 1e-4f;
+
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_larg_expr1D(const Eigen::SyclDevice& sycl_device)
+{
+ IndexType indim0 = 53;
+ IndexType indim1 = 55;
+ IndexType indim2 = 51;
+ IndexType outdim0 = 50;
+ IndexType outdim1 = 55;
+ IndexType outdim2 = 51;
+ Eigen::array<IndexType, 3> input_dims = {{indim0, indim1, indim2}};
+ Eigen::array<IndexType, 1> kernel_dims = {{4}};
+ Eigen::array<IndexType, 3> result_dims = {{outdim0, outdim1, outdim2}};
+
+ Tensor<DataType, 3, DataLayout, IndexType> input(input_dims);
+ Tensor<DataType, 1, DataLayout,IndexType> kernel(kernel_dims);
+ Tensor<DataType, 3, DataLayout,IndexType> result(result_dims);
+ Tensor<DataType, 3, DataLayout,IndexType> result_host(result_dims);
+
+ Eigen::array<IndexType, 1> dims3{{0}};
+
+ input.setRandom();
+ kernel.setRandom();
+ result.setZero();
+ result_host.setZero();
+
+ std::size_t input_bytes = input.size() * sizeof(DataType);
+ std::size_t kernel_bytes = kernel.size() * sizeof(DataType);
+ std::size_t result_bytes = result.size() * sizeof(DataType);
+
+ DataType * d_input = static_cast<DataType*>(sycl_device.allocate(input_bytes));
+ DataType * d_kernel = static_cast<DataType*>(sycl_device.allocate(kernel_bytes));
+ DataType * d_result = static_cast<DataType*>(sycl_device.allocate(result_bytes));
+
+ Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > gpu_input(d_input, input_dims);
+ Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout, IndexType> > gpu_kernel(d_kernel, kernel_dims);
+ Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > gpu_result(d_result, result_dims);
+ sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes);
+ sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes);
+
+ gpu_result.device(sycl_device)=gpu_input.convolve(gpu_kernel, dims3);
+ sycl_device.memcpyDeviceToHost(result.data(), d_result, result_bytes);
+
+ result_host=input.convolve(kernel, dims3);
+
+ for (IndexType i = 0; i < outdim0; i++) {
+   for (IndexType j = 0; j < outdim1; j++) {
+     for (IndexType k = 0; k < outdim2; k++) {
+       if (!(Eigen::internal::isApprox(result(i,j,k), result_host(i,j,k), error_threshold))) {
+         std::cout << std::setprecision(16) << "mismatch detected at index ( " << i << " , " << j << ", " << k << " ) " << " \t " << result(i,j,k) << " vs " << result_host(i,j,k) << std::endl;
+         assert(false);
+       }
+     }
+   }
+ }
+ sycl_device.deallocate(d_input);
+ sycl_device.deallocate(d_kernel);
+ sycl_device.deallocate(d_result);
+
+}
+
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_larg_expr2D(const Eigen::SyclDevice& sycl_device)
+{
+ IndexType indim0 = 53;
+ IndexType indim1 = 55;
+ IndexType indim2 = 51;
+ IndexType outdim0 = 50;
+ IndexType outdim1 = 51;
+ IndexType outdim2 = 51;
+ Eigen::array<IndexType, 3> input_dims = {{indim0, indim1, indim2}};
+ Eigen::array<IndexType, 2> kernel_dims = {{4,5}};
+ Eigen::array<IndexType, 3> result_dims = {{outdim0, outdim1, outdim2}};
+
+ Tensor<DataType, 3, DataLayout, IndexType> input(input_dims);
+ Tensor<DataType, 2, DataLayout,IndexType> kernel(kernel_dims);
+ Tensor<DataType, 3, DataLayout,IndexType> result(result_dims);
+ Tensor<DataType, 3, DataLayout,IndexType> result_host(result_dims);
+
+ Eigen::array<IndexType, 2> dims3{{0,1}};
+
+ input.setRandom();
+ kernel.setRandom();
+ result.setZero();
+ result_host.setZero();
+
+ std::size_t input_bytes = input.size() * sizeof(DataType);
+ std::size_t kernel_bytes = kernel.size() * sizeof(DataType);
+ std::size_t result_bytes = result.size() * sizeof(DataType);
+
+ DataType * d_input = static_cast<DataType*>(sycl_device.allocate(input_bytes));
+ DataType * d_kernel = static_cast<DataType*>(sycl_device.allocate(kernel_bytes));
+ DataType * d_result = static_cast<DataType*>(sycl_device.allocate(result_bytes));
+
+ Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > gpu_input(d_input, input_dims);
+ Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_kernel(d_kernel, kernel_dims);
+ Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > gpu_result(d_result, result_dims);
+ sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes);
+ sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes);
+
+ gpu_result.device(sycl_device)=gpu_input.convolve(gpu_kernel, dims3);
+ sycl_device.memcpyDeviceToHost(result.data(), d_result, result_bytes);
+
+ result_host=input.convolve(kernel, dims3);
+
+ for (IndexType i = 0; i < outdim0; i++) {
+   for (IndexType j = 0; j < outdim1; j++) {
+     for (IndexType k = 0; k < outdim2; k++) {
+       if (!(Eigen::internal::isApprox(result(i,j,k), result_host(i,j,k), error_threshold))) {
+         std::cout << std::setprecision(16) << "mismatch detected at index ( " << i << " , " << j << ", " << k << " ) " << " \t " << result(i,j,k) << " vs " << result_host(i,j,k) << std::endl;
+         assert(false);
+       }
+     }
+   }
+ }
+ sycl_device.deallocate(d_input);
+ sycl_device.deallocate(d_kernel);
+ sycl_device.deallocate(d_result);
+
+}
+
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_larg_expr3D(const Eigen::SyclDevice& sycl_device)
+{
+ IndexType indim0 = 53;
+ IndexType indim1 = 55;
+ IndexType indim2 = 51;
+ IndexType outdim0 = 50;
+ IndexType outdim1 = 51;
+ IndexType outdim2 = 49;
+ Eigen::array<IndexType, 3> input_dims = {{indim0, indim1, indim2}};
+ Eigen::array<IndexType, 3> kernel_dims = {{4,5,3}};
+ Eigen::array<IndexType, 3> result_dims = {{outdim0, outdim1, outdim2}};
+
+ Tensor<DataType, 3, DataLayout, IndexType> input(input_dims);
+ Tensor<DataType, 3, DataLayout,IndexType> kernel(kernel_dims);
+ Tensor<DataType, 3, DataLayout,IndexType> result(result_dims);
+ Tensor<DataType, 3, DataLayout,IndexType> result_host(result_dims);
+
+ Eigen::array<IndexType, 3> dims3{{0,1,2}};
+
+ input.setRandom();
+ kernel.setRandom();
+ result.setZero();
+ result_host.setZero();
+
+ std::size_t input_bytes = input.size() * sizeof(DataType);
+ std::size_t kernel_bytes = kernel.size() * sizeof(DataType);
+ std::size_t result_bytes = result.size() * sizeof(DataType);
+
+ DataType * d_input = static_cast<DataType*>(sycl_device.allocate(input_bytes));
+ DataType * d_kernel = static_cast<DataType*>(sycl_device.allocate(kernel_bytes));
+ DataType * d_result = static_cast<DataType*>(sycl_device.allocate(result_bytes));
+
+ Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > gpu_input(d_input, input_dims);
+ Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > gpu_kernel(d_kernel, kernel_dims);
+ Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > gpu_result(d_result, result_dims);
+ sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes);
+ sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes);
+
+ gpu_result.device(sycl_device)=gpu_input.convolve(gpu_kernel, dims3);
+ sycl_device.memcpyDeviceToHost(result.data(), d_result, result_bytes);
+
+ result_host=input.convolve(kernel, dims3);
+
+ for (IndexType i = 0; i < outdim0; i++) {
+   for (IndexType j = 0; j < outdim1; j++) {
+     for (IndexType k = 0; k < outdim2; k++) {
+       if (!(Eigen::internal::isApprox(result(i,j,k), result_host(i,j,k), error_threshold))) {
+         std::cout << std::setprecision(16) << "mismatch detected at index ( " << i << " , " << j << ", " << k << " ) " << " \t " << result(i,j,k) << " vs " << result_host(i,j,k) << std::endl;
+         assert(false);
+       }
+     }
+   }
+ }
+ sycl_device.deallocate(d_input);
+ sycl_device.deallocate(d_kernel);
+ sycl_device.deallocate(d_result);
+
+}
+
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_evals(const Eigen::SyclDevice& sycl_device)
+{
+ Eigen::array<IndexType, 2> input_dims = {{3, 3}};
+ Eigen::array<IndexType, 1> kernel_dims = {{2}};
+ Eigen::array<IndexType, 2> result_dims = {{2, 3}};
+
+ Tensor<DataType, 2, DataLayout, IndexType> input(input_dims);
+ Tensor<DataType, 1, DataLayout,IndexType> kernel(kernel_dims);
+ Tensor<DataType, 2, DataLayout,IndexType> result(result_dims);
+
+ Eigen::array<IndexType, 1> dims3{{0}};
+
+ input.setRandom();
+ kernel.setRandom();
+ result.setZero();
+
+ std::size_t input_bytes = input.size() * sizeof(DataType);
+ std::size_t kernel_bytes = kernel.size() * sizeof(DataType);
+ std::size_t result_bytes = result.size() * sizeof(DataType);
+
+ DataType * d_input = static_cast<DataType*>(sycl_device.allocate(input_bytes));
+ DataType * d_kernel = static_cast<DataType*>(sycl_device.allocate(kernel_bytes));
+ DataType * d_result = static_cast<DataType*>(sycl_device.allocate(result_bytes));
+
+ Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_input(d_input, input_dims);
+ Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout, IndexType> > gpu_kernel(d_kernel, kernel_dims);
+ Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_result(d_result, result_dims);
+ sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes);
+ sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes);
+
+ gpu_result.device(sycl_device)=gpu_input.convolve(gpu_kernel, dims3);
+ sycl_device.memcpyDeviceToHost(result.data(), d_result, result_bytes);
+
+ VERIFY_IS_APPROX(result(0,0), input(0,0)*kernel(0) + input(1,0)*kernel(1)); // index 0
+ VERIFY_IS_APPROX(result(0,1), input(0,1)*kernel(0) + input(1,1)*kernel(1)); // index 2
+ VERIFY_IS_APPROX(result(0,2), input(0,2)*kernel(0) + input(1,2)*kernel(1)); // index 4
+ VERIFY_IS_APPROX(result(1,0), input(1,0)*kernel(0) + input(2,0)*kernel(1)); // index 1
+ VERIFY_IS_APPROX(result(1,1), input(1,1)*kernel(0) + input(2,1)*kernel(1)); // index 3
+ VERIFY_IS_APPROX(result(1,2), input(1,2)*kernel(0) + input(2,2)*kernel(1)); // index 5
+
+ sycl_device.deallocate(d_input);
+ sycl_device.deallocate(d_kernel);
+ sycl_device.deallocate(d_result);
+}
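The six checks above spell out a VALID 1D convolution along dimension 0: result(i,j) = input(i,j)*kernel(0) + input(i+1,j)*kernel(1), so a 3x3 input and a length-2 kernel give a 2x3 result. The // index N comments record each element's column-major linear index i + 2*j, i.e. the order in which the evaluator visits the output.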
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_expr(const Eigen::SyclDevice& sycl_device)
+{
+ Eigen::array<IndexType, 2> input_dims = {{3, 3}};
+ Eigen::array<IndexType, 2> kernel_dims = {{2, 2}};
+ Eigen::array<IndexType, 2> result_dims = {{2, 2}};
+
+ Tensor<DataType, 2, DataLayout, IndexType> input(input_dims);
+ Tensor<DataType, 2, DataLayout, IndexType> kernel(kernel_dims);
+ Tensor<DataType, 2, DataLayout, IndexType> result(result_dims);
+
+ input.setRandom();
+ kernel.setRandom();
+ Eigen::array<IndexType, 2> dims;
+ dims[0] = 0;
+ dims[1] = 1;
+
+ std::size_t input_bytes = input.size() * sizeof(DataType);
+ std::size_t kernel_bytes = kernel.size() * sizeof(DataType);
+ std::size_t result_bytes = result.size() * sizeof(DataType);
+
+ DataType * d_input = static_cast<DataType*>(sycl_device.allocate(input_bytes));
+ DataType * d_kernel = static_cast<DataType*>(sycl_device.allocate(kernel_bytes));
+ DataType * d_result = static_cast<DataType*>(sycl_device.allocate(result_bytes));
+
+ Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout,IndexType> > gpu_input(d_input, input_dims);
+ Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout,IndexType> > gpu_kernel(d_kernel, kernel_dims);
+ Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout,IndexType> > gpu_result(d_result, result_dims);
+ sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes);
+ sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes);
+
+ gpu_result.device(sycl_device)=gpu_input.convolve(gpu_kernel, dims);
+ sycl_device.memcpyDeviceToHost(result.data(), d_result, result_bytes);
+
+ VERIFY_IS_APPROX(result(0,0), input(0,0)*kernel(0,0) + input(0,1)*kernel(0,1) +
+ input(1,0)*kernel(1,0) + input(1,1)*kernel(1,1));
+ VERIFY_IS_APPROX(result(0,1), input(0,1)*kernel(0,0) + input(0,2)*kernel(0,1) +
+ input(1,1)*kernel(1,0) + input(1,2)*kernel(1,1));
+ VERIFY_IS_APPROX(result(1,0), input(1,0)*kernel(0,0) + input(1,1)*kernel(0,1) +
+ input(2,0)*kernel(1,0) + input(2,1)*kernel(1,1));
+ VERIFY_IS_APPROX(result(1,1), input(1,1)*kernel(0,0) + input(1,2)*kernel(0,1) +
+ input(2,1)*kernel(1,0) + input(2,2)*kernel(1,1));
+
+ sycl_device.deallocate(d_input);
+ sycl_device.deallocate(d_kernel);
+ sycl_device.deallocate(d_result);
+}
+
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_modes(const Eigen::SyclDevice& sycl_device){
+
+ Eigen::array<IndexType, 1> input_dims = {{3}};
+ Eigen::array<IndexType, 1> kernel_dims = {{3}};
+
+ Tensor<DataType, 1, DataLayout, IndexType> input(input_dims);
+ Tensor<DataType, 1, DataLayout, IndexType> kernel(kernel_dims);
+
+ input.setRandom();
+ kernel.setRandom();
+ Eigen::array<IndexType, 1> dims;
+ dims[0] = 0;
+
+ input(0) = 1.0f;
+ input(1) = 2.0f;
+ input(2) = 3.0f;
+ kernel(0) = 0.5f;
+ kernel(1) = 1.0f;
+ kernel(2) = 0.0f;
+
+ Eigen::array<std::pair<IndexType, IndexType>, 1> padding;
+
+ // Emulate VALID mode (as defined in
+ // http://docs.scipy.org/doc/numpy/reference/generated/numpy.convolve.html).
+ padding[0] = std::make_pair(0, 0);
+ Tensor<DataType, 1, DataLayout, IndexType> valid(1);
+
+ std::size_t input_bytes = input.size() * sizeof(DataType);
+ std::size_t kernel_bytes = kernel.size() * sizeof(DataType);
+ std::size_t valid_bytes = valid.size() * sizeof(DataType);
+
+ DataType * d_input = static_cast<DataType*>(sycl_device.allocate(input_bytes));
+ DataType * d_kernel = static_cast<DataType*>(sycl_device.allocate(kernel_bytes));
+ DataType * d_valid = static_cast<DataType*>(sycl_device.allocate(valid_bytes));
+
+ Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_input(d_input, input_dims);
+ Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_kernel(d_kernel, kernel_dims);
+ Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_valid(d_valid, valid.dimensions());
+ sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes);
+ sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes);
+
+ gpu_valid.device(sycl_device)=gpu_input.pad(padding).convolve(gpu_kernel, dims);
+ sycl_device.memcpyDeviceToHost(valid.data(), d_valid, valid_bytes);
+
+ VERIFY_IS_EQUAL(valid.dimension(0), 1);
+ VERIFY_IS_APPROX(valid(0), 2.5f);
+
+ // Emulate SAME mode (as defined in
+ // http://docs.scipy.org/doc/numpy/reference/generated/numpy.convolve.html).
+ padding[0] = std::make_pair(1, 1);
+ Tensor<DataType, 1, DataLayout, IndexType> same(3);
+ std::size_t same_bytes = same.size() * sizeof(DataType);
+ DataType * d_same = static_cast<DataType*>(sycl_device.allocate(same_bytes));
+ Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_same(d_same, same.dimensions());
+ gpu_same.device(sycl_device)=gpu_input.pad(padding).convolve(gpu_kernel, dims);
+ sycl_device.memcpyDeviceToHost(same.data(), d_same, same_bytes);
+
+ VERIFY_IS_EQUAL(same.dimension(0), 3);
+ VERIFY_IS_APPROX(same(0), 1.0f);
+ VERIFY_IS_APPROX(same(1), 2.5f);
+ VERIFY_IS_APPROX(same(2), 4.0f);
+
+ // Emulate FULL mode (as defined in
+ // http://docs.scipy.org/doc/numpy/reference/generated/numpy.convolve.html).
+ padding[0] = std::make_pair(2, 2);
+
+ Tensor<DataType, 1, DataLayout, IndexType> full(5);
+ std::size_t full_bytes = full.size() * sizeof(DataType);
+ DataType * d_full = static_cast<DataType*>(sycl_device.allocate(full_bytes));
+ Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_full(d_full, full.dimensions());
+ gpu_full.device(sycl_device)=gpu_input.pad(padding).convolve(gpu_kernel, dims);
+ sycl_device.memcpyDeviceToHost(full.data(), d_full, full_bytes);
+
+ VERIFY_IS_EQUAL(full.dimension(0), 5);
+ VERIFY_IS_APPROX(full(0), 0.0f);
+ VERIFY_IS_APPROX(full(1), 1.0f);
+ VERIFY_IS_APPROX(full(2), 2.5f);
+ VERIFY_IS_APPROX(full(3), 4.0f);
+ VERIFY_IS_APPROX(full(4), 1.5f);
+
+ sycl_device.deallocate(d_input);
+ sycl_device.deallocate(d_kernel);
+ sycl_device.deallocate(d_valid);
+ sycl_device.deallocate(d_same);
+ sycl_device.deallocate(d_full);
+
+}
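Eigen's convolve is a correlation, result(i) = sum_k input(i+k)*kernel(k), applied here to the padded input. With input (1, 2, 3) and kernel (0.5, 1, 0): VALID uses no padding and yields the single value 1*0.5 + 2*1 + 3*0 = 2.5; SAME pads to (0, 1, 2, 3, 0) and yields (1, 2.5, 4); FULL pads to (0, 0, 1, 2, 3, 0, 0) and yields (0, 1, 2.5, 4, 1.5), matching the VERIFY values above.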
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_strides(const Eigen::SyclDevice& sycl_device){
+
+ Eigen::array<IndexType, 1> input_dims = {{13}};
+ Eigen::array<IndexType, 1> kernel_dims = {{3}};
+
+ Tensor<DataType, 1, DataLayout, IndexType> input(input_dims);
+ Tensor<DataType, 1, DataLayout, IndexType> kernel(kernel_dims);
+ Tensor<DataType, 1, DataLayout, IndexType> result(2);
+
+ input.setRandom();
+ kernel.setRandom();
+ Eigen::array<IndexType, 1> dims;
+ dims[0] = 0;
+
+ Eigen::array<IndexType, 1> stride_of_3;
+ stride_of_3[0] = 3;
+ Eigen::array<IndexType, 1> stride_of_2;
+ stride_of_2[0] = 2;
+
+ std::size_t input_bytes = input.size() * sizeof(DataType);
+ std::size_t kernel_bytes = kernel.size() * sizeof(DataType);
+ std::size_t result_bytes = result.size() * sizeof(DataType);
+
+ DataType * d_input = static_cast<DataType*>(sycl_device.allocate(input_bytes));
+ DataType * d_kernel = static_cast<DataType*>(sycl_device.allocate(kernel_bytes));
+ DataType * d_result = static_cast<DataType*>(sycl_device.allocate(result_bytes));
+
+ Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_input(d_input, input_dims);
+ Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_kernel(d_kernel, kernel_dims);
+ Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_result(d_result, result.dimensions());
+ sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes);
+ sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes);
+
+ gpu_result.device(sycl_device)=gpu_input.stride(stride_of_3).convolve(gpu_kernel, dims).stride(stride_of_2);
+ sycl_device.memcpyDeviceToHost(result.data(), d_result, result_bytes);
+
+ VERIFY_IS_EQUAL(result.dimension(0), 2);
+ VERIFY_IS_APPROX(result(0), (input(0)*kernel(0) + input(3)*kernel(1) +
+ input(6)*kernel(2)));
+ VERIFY_IS_APPROX(result(1), (input(6)*kernel(0) + input(9)*kernel(1) +
+ input(12)*kernel(2)));
+}
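The input stride of 3 keeps input(0), input(3), input(6), input(9), input(12); the length-3 kernel then gives three VALID outputs over those five samples, and the output stride of 2 keeps the first and third of them, which is exactly what the two VERIFY lines compute.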
+
+template <typename Dev_selector> void tensorConvolutionPerDevice(Dev_selector& s){
+ QueueInterface queueInterface(s);
+ auto sycl_device=Eigen::SyclDevice(&queueInterface);
+ test_larg_expr1D<float, RowMajor, int64_t>(sycl_device);
+ test_larg_expr1D<float, ColMajor, int64_t>(sycl_device);
+ test_larg_expr2D<float, RowMajor, int64_t>(sycl_device);
+ test_larg_expr2D<float, ColMajor, int64_t>(sycl_device);
+ test_larg_expr3D<float, RowMajor, int64_t>(sycl_device);
+ test_larg_expr3D<float, ColMajor, int64_t>(sycl_device);
+ test_evals<float, ColMajor, int64_t>(sycl_device);
+ test_evals<float, RowMajor, int64_t>(sycl_device);
+ test_expr<float, ColMajor, int64_t>(sycl_device);
+ test_expr<float, RowMajor, int64_t>(sycl_device);
+ test_modes<float, ColMajor, int64_t>(sycl_device);
+ test_modes<float, RowMajor, int64_t>(sycl_device);
+ test_strides<float, ColMajor, int64_t>(sycl_device);
+ test_strides<float, RowMajor, int64_t>(sycl_device);
+}
+
+void test_cxx11_tensor_convolution_sycl() {
+ for (const auto& device :Eigen::get_sycl_supported_devices()) {
+ CALL_SUBTEST(tensorConvolutionPerDevice(device));
+ }
+}
diff --git a/unsupported/test/cxx11_tensor_device_sycl.cpp b/unsupported/test/cxx11_tensor_device_sycl.cpp
index 190dba862..3ecc68df0 100644
--- a/unsupported/test/cxx11_tensor_device_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_device_sycl.cpp
@@ -14,7 +14,7 @@
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
#define EIGEN_TEST_FUNC cxx11_tensor_device_sycl
-#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
#define EIGEN_USE_SYCL
#include "main.h"
@@ -22,35 +22,35 @@
#include <stdint.h>
#include <iostream>
-template <typename DataType, int DataLayout>
+template <typename DataType, int DataLayout, typename IndexType>
void test_device_memory(const Eigen::SyclDevice &sycl_device) {
std::cout << "Running on : "
<< sycl_device.sycl_queue().get_device(). template get_info<cl::sycl::info::device::name>()
<<std::endl;
- int sizeDim1 = 100;
- array<int, 1> tensorRange = {{sizeDim1}};
- Tensor<DataType, 1, DataLayout> in(tensorRange);
- Tensor<DataType, 1, DataLayout> in1(tensorRange);
+ IndexType sizeDim1 = 100;
+ array<IndexType, 1> tensorRange = {{sizeDim1}};
+ Tensor<DataType, 1, DataLayout,IndexType> in(tensorRange);
+ Tensor<DataType, 1, DataLayout,IndexType> in1(tensorRange);
memset(in1.data(), 1, in1.size() * sizeof(DataType));
DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(in.size()*sizeof(DataType)));
sycl_device.memset(gpu_in_data, 1, in.size()*sizeof(DataType));
sycl_device.memcpyDeviceToHost(in.data(), gpu_in_data, in.size()*sizeof(DataType));
- for (int i=0; i<in.size(); i++) {
+ for (IndexType i=0; i<in.size(); i++) {
VERIFY_IS_EQUAL(in(i), in1(i));
}
sycl_device.deallocate(gpu_in_data);
}
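Note that memset writes the byte value 1 into every byte, on both the host (memset on in1) and the device (sycl_device.memset), so each float element ends up holding the bit pattern 0x01010101 (about 2.36943e-38), not the value 1. The element-wise VERIFY_IS_EQUAL passes because both sides carry the same bytes.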
-template <typename DataType, int DataLayout>
+template <typename DataType, int DataLayout, typename IndexType>
void test_device_exceptions(const Eigen::SyclDevice &sycl_device) {
VERIFY(sycl_device.ok());
- int sizeDim1 = 100;
- array<int, 1> tensorDims = {{sizeDim1}};
+ IndexType sizeDim1 = 100;
+ array<IndexType, 1> tensorDims = {{sizeDim1}};
DataType* gpu_data = static_cast<DataType*>(sycl_device.allocate(sizeDim1*sizeof(DataType)));
sycl_device.memset(gpu_data, 1, sizeDim1*sizeof(DataType));
- TensorMap<Tensor<DataType, 1, DataLayout>> in(gpu_data, tensorDims);
- TensorMap<Tensor<DataType, 1, DataLayout>> out(gpu_data, tensorDims);
+ TensorMap<Tensor<DataType, 1, DataLayout,IndexType>> in(gpu_data, tensorDims);
+ TensorMap<Tensor<DataType, 1, DataLayout,IndexType>> out(gpu_data, tensorDims);
out.device(sycl_device) = in / in.constant(0);
sycl_device.synchronize();
@@ -62,8 +62,8 @@ template<typename DataType> void sycl_device_test_per_device(const cl::sycl::dev
std::cout << "Running on " << d.template get_info<cl::sycl::info::device::name>() << std::endl;
QueueInterface queueInterface(d);
auto sycl_device = Eigen::SyclDevice(&queueInterface);
- test_device_memory<DataType, RowMajor>(sycl_device);
- test_device_memory<DataType, ColMajor>(sycl_device);
+ test_device_memory<DataType, RowMajor, int64_t>(sycl_device);
+ test_device_memory<DataType, ColMajor, int64_t>(sycl_device);
 /// This test throws an exception; enable it if you want to see the exception.
 //test_device_exceptions<DataType, RowMajor>(sycl_device);
 /// This test throws an exception; enable it if you want to see the exception.
diff --git a/unsupported/test/cxx11_tensor_forced_eval_sycl.cpp b/unsupported/test/cxx11_tensor_forced_eval_sycl.cpp
index 4d19a3b2a..aca036cde 100644
--- a/unsupported/test/cxx11_tensor_forced_eval_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_forced_eval_sycl.cpp
@@ -14,23 +14,23 @@
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
#define EIGEN_TEST_FUNC cxx11_tensor_forced_eval_sycl
-#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
#define EIGEN_USE_SYCL
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>
using Eigen::Tensor;
-template <typename DataType, int DataLayout>
+template <typename DataType, int DataLayout, typename IndexType>
void test_forced_eval_sycl(const Eigen::SyclDevice &sycl_device) {
- int sizeDim1 = 100;
- int sizeDim2 = 20;
- int sizeDim3 = 20;
- Eigen::array<int, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}};
- Eigen::Tensor<DataType, 3, DataLayout> in1(tensorRange);
- Eigen::Tensor<DataType, 3, DataLayout> in2(tensorRange);
- Eigen::Tensor<DataType, 3, DataLayout> out(tensorRange);
+ IndexType sizeDim1 = 100;
+ IndexType sizeDim2 = 20;
+ IndexType sizeDim3 = 20;
+ Eigen::array<IndexType, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}};
+ Eigen::Tensor<DataType, 3, DataLayout, IndexType> in1(tensorRange);
+ Eigen::Tensor<DataType, 3, DataLayout, IndexType> in2(tensorRange);
+ Eigen::Tensor<DataType, 3, DataLayout, IndexType> out(tensorRange);
DataType * gpu_in1_data = static_cast<DataType*>(sycl_device.allocate(in1.dimensions().TotalSize()*sizeof(DataType)));
DataType * gpu_in2_data = static_cast<DataType*>(sycl_device.allocate(in2.dimensions().TotalSize()*sizeof(DataType)));
@@ -40,17 +40,17 @@ void test_forced_eval_sycl(const Eigen::SyclDevice &sycl_device) {
in2 = in2.random() + in2.constant(10.0f);
// creating TensorMap from tensor
- Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout>> gpu_in1(gpu_in1_data, tensorRange);
- Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout>> gpu_in2(gpu_in2_data, tensorRange);
- Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout>> gpu_out(gpu_out_data, tensorRange);
+ Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_in1(gpu_in1_data, tensorRange);
+ Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_in2(gpu_in2_data, tensorRange);
+ Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_out(gpu_out_data, tensorRange);
sycl_device.memcpyHostToDevice(gpu_in1_data, in1.data(),(in1.dimensions().TotalSize())*sizeof(DataType));
sycl_device.memcpyHostToDevice(gpu_in2_data, in2.data(),(in1.dimensions().TotalSize())*sizeof(DataType));
/// c=(a+b)*b
gpu_out.device(sycl_device) =(gpu_in1 + gpu_in2).eval() * gpu_in2;
sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(DataType));
- for (int i = 0; i < sizeDim1; ++i) {
- for (int j = 0; j < sizeDim2; ++j) {
- for (int k = 0; k < sizeDim3; ++k) {
+ for (IndexType i = 0; i < sizeDim1; ++i) {
+ for (IndexType j = 0; j < sizeDim2; ++j) {
+ for (IndexType k = 0; k < sizeDim3; ++k) {
VERIFY_IS_APPROX(out(i, j, k),
(in1(i, j, k) + in2(i, j, k)) * in2(i, j, k));
}
@@ -66,8 +66,8 @@ void test_forced_eval_sycl(const Eigen::SyclDevice &sycl_device) {
template <typename DataType, typename Dev_selector> void tensorForced_evalperDevice(Dev_selector s){
QueueInterface queueInterface(s);
auto sycl_device = Eigen::SyclDevice(&queueInterface);
- test_forced_eval_sycl<DataType, RowMajor>(sycl_device);
- test_forced_eval_sycl<DataType, ColMajor>(sycl_device);
+ test_forced_eval_sycl<DataType, RowMajor, int64_t>(sycl_device);
+ test_forced_eval_sycl<DataType, ColMajor, int64_t>(sycl_device);
}
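The .eval() in the expression above is the point of this test: it forces (a + b) to be materialized into a temporary before the multiply instead of letting the whole expression fuse into one kernel. The same pattern on the host, as a minimal sketch (sizes illustrative; requires the unsupported Tensor header):

// #include <unsupported/Eigen/CXX11/Tensor>
void forced_eval_host_sketch() {
  Eigen::Tensor<float, 3> a(4, 4, 4), b(4, 4, 4);
  a.setRandom(); b.setRandom();
  // .eval() materializes (a + b) before the product, mirroring the device test.
  Eigen::Tensor<float, 3> c = (a + b).eval() * b;
}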
void test_cxx11_tensor_forced_eval_sycl() {
for (const auto& device :Eigen::get_sycl_supported_devices()) {
diff --git a/unsupported/test/cxx11_tensor_morphing_sycl.cpp b/unsupported/test/cxx11_tensor_morphing_sycl.cpp
index 91353b81a..9b521bc6b 100644
--- a/unsupported/test/cxx11_tensor_morphing_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_morphing_sycl.cpp
@@ -16,7 +16,7 @@
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
#define EIGEN_TEST_FUNC cxx11_tensor_morphing_sycl
-#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
#define EIGEN_USE_SYCL
@@ -28,18 +28,18 @@ using Eigen::SyclDevice;
using Eigen::Tensor;
using Eigen::TensorMap;
-template <typename DataType, int DataLayout>
+template <typename DataType, int DataLayout, typename IndexType>
static void test_simple_reshape(const Eigen::SyclDevice& sycl_device)
{
- typename Tensor<DataType, 5 ,DataLayout>::Dimensions dim1(2,3,1,7,1);
- typename Tensor<DataType, 3 ,DataLayout>::Dimensions dim2(2,3,7);
- typename Tensor<DataType, 2 ,DataLayout>::Dimensions dim3(6,7);
- typename Tensor<DataType, 2 ,DataLayout>::Dimensions dim4(2,21);
+ typename Tensor<DataType, 5 ,DataLayout, IndexType>::Dimensions dim1(2,3,1,7,1);
+ typename Tensor<DataType, 3 ,DataLayout, IndexType>::Dimensions dim2(2,3,7);
+ typename Tensor<DataType, 2 ,DataLayout, IndexType>::Dimensions dim3(6,7);
+ typename Tensor<DataType, 2 ,DataLayout, IndexType>::Dimensions dim4(2,21);
- Tensor<DataType, 5, DataLayout> tensor1(dim1);
- Tensor<DataType, 3, DataLayout> tensor2(dim2);
- Tensor<DataType, 2, DataLayout> tensor3(dim3);
- Tensor<DataType, 2, DataLayout> tensor4(dim4);
+ Tensor<DataType, 5, DataLayout, IndexType> tensor1(dim1);
+ Tensor<DataType, 3, DataLayout, IndexType> tensor2(dim2);
+ Tensor<DataType, 2, DataLayout, IndexType> tensor3(dim3);
+ Tensor<DataType, 2, DataLayout, IndexType> tensor4(dim4);
tensor1.setRandom();
@@ -48,10 +48,10 @@ static void test_simple_reshape(const Eigen::SyclDevice& sycl_device)
DataType* gpu_data3 = static_cast<DataType*>(sycl_device.allocate(tensor3.size()*sizeof(DataType)));
DataType* gpu_data4 = static_cast<DataType*>(sycl_device.allocate(tensor4.size()*sizeof(DataType)));
- TensorMap<Tensor<DataType, 5,DataLayout>> gpu1(gpu_data1, dim1);
- TensorMap<Tensor<DataType, 3,DataLayout>> gpu2(gpu_data2, dim2);
- TensorMap<Tensor<DataType, 2,DataLayout>> gpu3(gpu_data3, dim3);
- TensorMap<Tensor<DataType, 2,DataLayout>> gpu4(gpu_data4, dim4);
+ TensorMap<Tensor<DataType, 5,DataLayout, IndexType>> gpu1(gpu_data1, dim1);
+ TensorMap<Tensor<DataType, 3,DataLayout, IndexType>> gpu2(gpu_data2, dim2);
+ TensorMap<Tensor<DataType, 2,DataLayout, IndexType>> gpu3(gpu_data3, dim3);
+ TensorMap<Tensor<DataType, 2,DataLayout, IndexType>> gpu4(gpu_data4, dim4);
sycl_device.memcpyHostToDevice(gpu_data1, tensor1.data(),(tensor1.size())*sizeof(DataType));
@@ -63,9 +63,9 @@ static void test_simple_reshape(const Eigen::SyclDevice& sycl_device)
gpu4.device(sycl_device)=gpu1.reshape(dim2).reshape(dim4);
sycl_device.memcpyDeviceToHost(tensor4.data(), gpu_data4,(tensor4.size())*sizeof(DataType));
- for (int i = 0; i < 2; ++i){
- for (int j = 0; j < 3; ++j){
- for (int k = 0; k < 7; ++k){
+ for (IndexType i = 0; i < 2; ++i){
+ for (IndexType j = 0; j < 3; ++j){
+ for (IndexType k = 0; k < 7; ++k){
VERIFY_IS_EQUAL(tensor1(i,j,0,k,0), tensor2(i,j,k)); ///ColMajor
if (static_cast<int>(DataLayout) == static_cast<int>(ColMajor)) {
VERIFY_IS_EQUAL(tensor1(i,j,0,k,0), tensor3(i+2*j,k)); ///ColMajor
@@ -86,15 +86,15 @@ static void test_simple_reshape(const Eigen::SyclDevice& sycl_device)
}
-template<typename DataType, int DataLayout>
+template<typename DataType, int DataLayout, typename IndexType>
static void test_reshape_as_lvalue(const Eigen::SyclDevice& sycl_device)
{
- typename Tensor<DataType, 3, DataLayout>::Dimensions dim1(2,3,7);
- typename Tensor<DataType, 2, DataLayout>::Dimensions dim2(6,7);
- typename Tensor<DataType, 5, DataLayout>::Dimensions dim3(2,3,1,7,1);
- Tensor<DataType, 3, DataLayout> tensor(dim1);
- Tensor<DataType, 2, DataLayout> tensor2d(dim2);
- Tensor<DataType, 5, DataLayout> tensor5d(dim3);
+ typename Tensor<DataType, 3, DataLayout, IndexType>::Dimensions dim1(2,3,7);
+ typename Tensor<DataType, 2, DataLayout, IndexType>::Dimensions dim2(6,7);
+ typename Tensor<DataType, 5, DataLayout, IndexType>::Dimensions dim3(2,3,1,7,1);
+ Tensor<DataType, 3, DataLayout, IndexType> tensor(dim1);
+ Tensor<DataType, 2, DataLayout, IndexType> tensor2d(dim2);
+ Tensor<DataType, 5, DataLayout, IndexType> tensor5d(dim3);
tensor.setRandom();
@@ -102,9 +102,9 @@ static void test_reshape_as_lvalue(const Eigen::SyclDevice& sycl_device)
DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(tensor2d.size()*sizeof(DataType)));
DataType* gpu_data3 = static_cast<DataType*>(sycl_device.allocate(tensor5d.size()*sizeof(DataType)));
- TensorMap< Tensor<DataType, 3, DataLayout> > gpu1(gpu_data1, dim1);
- TensorMap< Tensor<DataType, 2, DataLayout> > gpu2(gpu_data2, dim2);
- TensorMap< Tensor<DataType, 5, DataLayout> > gpu3(gpu_data3, dim3);
+ TensorMap< Tensor<DataType, 3, DataLayout, IndexType> > gpu1(gpu_data1, dim1);
+ TensorMap< Tensor<DataType, 2, DataLayout, IndexType> > gpu2(gpu_data2, dim2);
+ TensorMap< Tensor<DataType, 5, DataLayout, IndexType> > gpu3(gpu_data3, dim3);
sycl_device.memcpyHostToDevice(gpu_data1, tensor.data(),(tensor.size())*sizeof(DataType));
@@ -115,9 +115,9 @@ static void test_reshape_as_lvalue(const Eigen::SyclDevice& sycl_device)
sycl_device.memcpyDeviceToHost(tensor5d.data(), gpu_data3,(tensor5d.size())*sizeof(DataType));
- for (int i = 0; i < 2; ++i){
- for (int j = 0; j < 3; ++j){
- for (int k = 0; k < 7; ++k){
+ for (IndexType i = 0; i < 2; ++i){
+ for (IndexType j = 0; j < 3; ++j){
+ for (IndexType k = 0; k < 7; ++k){
VERIFY_IS_EQUAL(tensor5d(i,j,0,k,0), tensor(i,j,k));
if (static_cast<int>(DataLayout) == static_cast<int>(ColMajor)) {
VERIFY_IS_EQUAL(tensor2d(i+2*j,k), tensor(i,j,k)); ///ColMajor
@@ -134,43 +134,43 @@ static void test_reshape_as_lvalue(const Eigen::SyclDevice& sycl_device)
}
-template <typename DataType, int DataLayout>
+template <typename DataType, int DataLayout, typename IndexType>
static void test_simple_slice(const Eigen::SyclDevice &sycl_device)
{
- int sizeDim1 = 2;
- int sizeDim2 = 3;
- int sizeDim3 = 5;
- int sizeDim4 = 7;
- int sizeDim5 = 11;
- array<int, 5> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4, sizeDim5}};
- Tensor<DataType, 5,DataLayout> tensor(tensorRange);
+ IndexType sizeDim1 = 2;
+ IndexType sizeDim2 = 3;
+ IndexType sizeDim3 = 5;
+ IndexType sizeDim4 = 7;
+ IndexType sizeDim5 = 11;
+ array<IndexType, 5> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4, sizeDim5}};
+ Tensor<DataType, 5,DataLayout, IndexType> tensor(tensorRange);
tensor.setRandom();
- array<int, 5> slice1_range ={{1, 1, 1, 1, 1}};
- Tensor<DataType, 5,DataLayout> slice1(slice1_range);
+ array<IndexType, 5> slice1_range ={{1, 1, 1, 1, 1}};
+ Tensor<DataType, 5,DataLayout, IndexType> slice1(slice1_range);
DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(tensor.size()*sizeof(DataType)));
DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(slice1.size()*sizeof(DataType)));
- TensorMap<Tensor<DataType, 5,DataLayout>> gpu1(gpu_data1, tensorRange);
- TensorMap<Tensor<DataType, 5,DataLayout>> gpu2(gpu_data2, slice1_range);
- Eigen::DSizes<ptrdiff_t, 5> indices(1,2,3,4,5);
- Eigen::DSizes<ptrdiff_t, 5> sizes(1,1,1,1,1);
+ TensorMap<Tensor<DataType, 5,DataLayout, IndexType>> gpu1(gpu_data1, tensorRange);
+ TensorMap<Tensor<DataType, 5,DataLayout, IndexType>> gpu2(gpu_data2, slice1_range);
+ Eigen::DSizes<IndexType, 5> indices(1,2,3,4,5);
+ Eigen::DSizes<IndexType, 5> sizes(1,1,1,1,1);
sycl_device.memcpyHostToDevice(gpu_data1, tensor.data(),(tensor.size())*sizeof(DataType));
gpu2.device(sycl_device)=gpu1.slice(indices, sizes);
sycl_device.memcpyDeviceToHost(slice1.data(), gpu_data2,(slice1.size())*sizeof(DataType));
VERIFY_IS_EQUAL(slice1(0,0,0,0,0), tensor(1,2,3,4,5));
- array<int, 5> slice2_range ={{1,1,2,2,3}};
- Tensor<DataType, 5,DataLayout> slice2(slice2_range);
+ array<IndexType, 5> slice2_range ={{1,1,2,2,3}};
+ Tensor<DataType, 5,DataLayout, IndexType> slice2(slice2_range);
DataType* gpu_data3 = static_cast<DataType*>(sycl_device.allocate(slice2.size()*sizeof(DataType)));
- TensorMap<Tensor<DataType, 5,DataLayout>> gpu3(gpu_data3, slice2_range);
- Eigen::DSizes<ptrdiff_t, 5> indices2(1,1,3,4,5);
- Eigen::DSizes<ptrdiff_t, 5> sizes2(1,1,2,2,3);
+ TensorMap<Tensor<DataType, 5,DataLayout, IndexType>> gpu3(gpu_data3, slice2_range);
+ Eigen::DSizes<IndexType, 5> indices2(1,1,3,4,5);
+ Eigen::DSizes<IndexType, 5> sizes2(1,1,2,2,3);
gpu3.device(sycl_device)=gpu1.slice(indices2, sizes2);
sycl_device.memcpyDeviceToHost(slice2.data(), gpu_data3,(slice2.size())*sizeof(DataType));
- for (int i = 0; i < 2; ++i) {
- for (int j = 0; j < 2; ++j) {
- for (int k = 0; k < 3; ++k) {
+ for (IndexType i = 0; i < 2; ++i) {
+ for (IndexType j = 0; j < 2; ++j) {
+ for (IndexType k = 0; k < 3; ++k) {
VERIFY_IS_EQUAL(slice2(0,0,i,j,k), tensor(1,1,3+i,4+j,5+k));
}
}
@@ -219,7 +219,8 @@ static void test_strided_slice_write_sycl(const Eigen::SyclDevice& sycl_device)
sycl_device.memcpyDeviceToHost(tensor.data(), gpu_data1,(tensor.size())*sizeof(DataType));
sycl_device.memcpyDeviceToHost(tensor2.data(), gpu_data2,(tensor2.size())*sizeof(DataType));
- for(int i=0;i<sizeDim1;i++) for(int j=0;j<sizeDim2;j++){
+ for(IndexType i=0;i<sizeDim1;i++)
+ for(IndexType j=0;j<sizeDim2;j++){
VERIFY_IS_EQUAL(tensor(i,j), tensor2(i,j));
}
sycl_device.deallocate(gpu_data1);
@@ -230,12 +231,12 @@ static void test_strided_slice_write_sycl(const Eigen::SyclDevice& sycl_device)
template<typename DataType, typename dev_Selector> void sycl_morphing_test_per_device(dev_Selector s){
QueueInterface queueInterface(s);
auto sycl_device = Eigen::SyclDevice(&queueInterface);
- test_simple_slice<DataType, RowMajor>(sycl_device);
- test_simple_slice<DataType, ColMajor>(sycl_device);
- test_simple_reshape<DataType, RowMajor>(sycl_device);
- test_simple_reshape<DataType, ColMajor>(sycl_device);
- test_reshape_as_lvalue<DataType, RowMajor>(sycl_device);
- test_reshape_as_lvalue<DataType, ColMajor>(sycl_device);
+ test_simple_slice<DataType, RowMajor, int64_t>(sycl_device);
+ test_simple_slice<DataType, ColMajor, int64_t>(sycl_device);
+ test_simple_reshape<DataType, RowMajor, int64_t>(sycl_device);
+ test_simple_reshape<DataType, ColMajor, int64_t>(sycl_device);
+ test_reshape_as_lvalue<DataType, RowMajor, int64_t>(sycl_device);
+ test_reshape_as_lvalue<DataType, ColMajor, int64_t>(sycl_device);
test_strided_slice_write_sycl<DataType, ColMajor, int64_t>(sycl_device);
test_strided_slice_write_sycl<DataType, RowMajor, int64_t>(sycl_device);
}
diff --git a/unsupported/test/cxx11_tensor_padding_sycl.cpp b/unsupported/test/cxx11_tensor_padding_sycl.cpp
index 9e86e4b52..dc748b73e 100644
--- a/unsupported/test/cxx11_tensor_padding_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_padding_sycl.cpp
@@ -16,7 +16,7 @@
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
#define EIGEN_TEST_FUNC cxx11_tensor_padding_sycl
-#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
#define EIGEN_USE_SYCL
@@ -69,10 +69,10 @@ static void test_simple_padding(const Eigen::SyclDevice& sycl_device)
sycl_device.memcpyHostToDevice(gpu_data1, tensor.data(),(tensor.size())*sizeof(DataType));
gpu2.device(sycl_device)=gpu1.pad(paddings);
sycl_device.memcpyDeviceToHost(padded.data(), gpu_data2,(padded.size())*sizeof(DataType));
- for (int i = 0; i < padedSizeDim1; ++i) {
- for (int j = 0; j < padedSizeDim2; ++j) {
- for (int k = 0; k < padedSizeDim3; ++k) {
- for (int l = 0; l < padedSizeDim4; ++l) {
+ for (IndexType i = 0; i < padedSizeDim1; ++i) {
+ for (IndexType j = 0; j < padedSizeDim2; ++j) {
+ for (IndexType k = 0; k < padedSizeDim3; ++k) {
+ for (IndexType l = 0; l < padedSizeDim4; ++l) {
if (j >= 2 && j < 5 && k >= 3 && k < 8) {
VERIFY_IS_EQUAL(padded(i,j,k,l), tensor(i,j-2,k-3,l));
} else {
@@ -121,10 +121,10 @@ static void test_padded_expr(const Eigen::SyclDevice& sycl_device)
gpu2.device(sycl_device)=gpu1.pad(paddings).reshape(reshape_dims);
sycl_device.memcpyDeviceToHost(result.data(), gpu_data2,(result.size())*sizeof(DataType));
- for (int i = 0; i < 2; ++i) {
- for (int j = 0; j < 6; ++j) {
- for (int k = 0; k < 12; ++k) {
- for (int l = 0; l < 7; ++l) {
+ for (IndexType i = 0; i < 2; ++i) {
+ for (IndexType j = 0; j < 6; ++j) {
+ for (IndexType k = 0; k < 12; ++k) {
+ for (IndexType l = 0; l < 7; ++l) {
const float result_value = DataLayout == ColMajor ?
result(i+2*j,k+12*l) : result(j+6*i,l+7*k);
if (j >= 2 && j < 5 && k >= 3 && k < 8) {
@@ -143,10 +143,6 @@ static void test_padded_expr(const Eigen::SyclDevice& sycl_device)
template<typename DataType, typename dev_Selector> void sycl_padding_test_per_device(dev_Selector s){
QueueInterface queueInterface(s);
auto sycl_device = Eigen::SyclDevice(&queueInterface);
- test_simple_padding<DataType, RowMajor, int>(sycl_device);
- test_simple_padding<DataType, ColMajor, int>(sycl_device);
- test_padded_expr<DataType, RowMajor, int>(sycl_device);
- test_padded_expr<DataType, ColMajor, int>(sycl_device);
test_simple_padding<DataType, RowMajor, int64_t>(sycl_device);
test_simple_padding<DataType, ColMajor, int64_t>(sycl_device);
test_padded_expr<DataType, RowMajor, int64_t>(sycl_device);
diff --git a/unsupported/test/cxx11_tensor_reduction_sycl.cpp b/unsupported/test/cxx11_tensor_reduction_sycl.cpp
index 941469029..440d48bca 100644
--- a/unsupported/test/cxx11_tensor_reduction_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_reduction_sycl.cpp
@@ -14,97 +14,129 @@
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
#define EIGEN_TEST_FUNC cxx11_tensor_reduction_sycl
-#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
#define EIGEN_USE_SYCL
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>
-template <typename DataType, int DataLayout>
-static void test_full_reductions_sycl(const Eigen::SyclDevice& sycl_device) {
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_full_reductions_mean_sycl(const Eigen::SyclDevice& sycl_device) {
- const int num_rows = 452;
- const int num_cols = 765;
- array<int, 2> tensorRange = {{num_rows, num_cols}};
+ const IndexType num_rows = 452;
+ const IndexType num_cols = 765;
+ array<IndexType, 2> tensorRange = {{num_rows, num_cols}};
- Tensor<DataType, 2, DataLayout> in(tensorRange);
- Tensor<DataType, 0, DataLayout> full_redux;
- Tensor<DataType, 0, DataLayout> full_redux_gpu;
+ Tensor<DataType, 2, DataLayout, IndexType> in(tensorRange);
+ Tensor<DataType, 0, DataLayout, IndexType> full_redux;
+ Tensor<DataType, 0, DataLayout, IndexType> full_redux_gpu;
in.setRandom();
- full_redux = in.sum();
+ full_redux = in.mean();
DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(in.dimensions().TotalSize()*sizeof(DataType)));
DataType* gpu_out_data =(DataType*)sycl_device.allocate(sizeof(DataType));
- TensorMap<Tensor<DataType, 2, DataLayout> > in_gpu(gpu_in_data, tensorRange);
- TensorMap<Tensor<DataType, 0, DataLayout> > out_gpu(gpu_out_data);
+ TensorMap<Tensor<DataType, 2, DataLayout, IndexType> > in_gpu(gpu_in_data, tensorRange);
+ TensorMap<Tensor<DataType, 0, DataLayout, IndexType> > out_gpu(gpu_out_data);
sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.dimensions().TotalSize())*sizeof(DataType));
- out_gpu.device(sycl_device) = in_gpu.sum();
+ out_gpu.device(sycl_device) = in_gpu.mean();
sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data, sizeof(DataType));
// Check that the CPU and GPU reductions return the same result.
VERIFY_IS_APPROX(full_redux_gpu(), full_redux());
+ sycl_device.deallocate(gpu_in_data);
+ sycl_device.deallocate(gpu_out_data);
+}
+
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_full_reductions_min_sycl(const Eigen::SyclDevice& sycl_device) {
+
+ const IndexType num_rows = 876;
+ const IndexType num_cols = 953;
+ array<IndexType, 2> tensorRange = {{num_rows, num_cols}};
+
+ Tensor<DataType, 2, DataLayout, IndexType> in(tensorRange);
+ Tensor<DataType, 0, DataLayout, IndexType> full_redux;
+ Tensor<DataType, 0, DataLayout, IndexType> full_redux_gpu;
+
+ in.setRandom();
+
+ full_redux = in.minimum();
+
+ DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(in.dimensions().TotalSize()*sizeof(DataType)));
+ DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(sizeof(DataType)));
+
+ TensorMap<Tensor<DataType, 2, DataLayout, IndexType> > in_gpu(gpu_in_data, tensorRange);
+ TensorMap<Tensor<DataType, 0, DataLayout, IndexType> > out_gpu(gpu_out_data);
+ sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.dimensions().TotalSize())*sizeof(DataType));
+ out_gpu.device(sycl_device) = in_gpu.minimum();
+ sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data, sizeof(DataType));
+ // Check that the CPU and GPU reductions return the same result.
+ VERIFY_IS_APPROX(full_redux_gpu(), full_redux());
sycl_device.deallocate(gpu_in_data);
sycl_device.deallocate(gpu_out_data);
}
-template <typename DataType, int DataLayout>
-static void test_first_dim_reductions_sycl(const Eigen::SyclDevice& sycl_device) {
- int dim_x = 145;
- int dim_y = 1;
- int dim_z = 67;
- array<int, 3> tensorRange = {{dim_x, dim_y, dim_z}};
- Eigen::array<int, 1> red_axis;
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_first_dim_reductions_max_sycl(const Eigen::SyclDevice& sycl_device) {
+
+ IndexType dim_x = 145;
+ IndexType dim_y = 1;
+ IndexType dim_z = 67;
+
+ array<IndexType, 3> tensorRange = {{dim_x, dim_y, dim_z}};
+ Eigen::array<IndexType, 1> red_axis;
red_axis[0] = 0;
- array<int, 2> reduced_tensorRange = {{dim_y, dim_z}};
+ array<IndexType, 2> reduced_tensorRange = {{dim_y, dim_z}};
- Tensor<DataType, 3, DataLayout> in(tensorRange);
- Tensor<DataType, 2, DataLayout> redux(reduced_tensorRange);
- Tensor<DataType, 2, DataLayout> redux_gpu(reduced_tensorRange);
+ Tensor<DataType, 3, DataLayout, IndexType> in(tensorRange);
+ Tensor<DataType, 2, DataLayout, IndexType> redux(reduced_tensorRange);
+ Tensor<DataType, 2, DataLayout, IndexType> redux_gpu(reduced_tensorRange);
in.setRandom();
- redux= in.sum(red_axis);
+ redux = in.maximum(red_axis);
DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(in.dimensions().TotalSize()*sizeof(DataType)));
DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(redux_gpu.dimensions().TotalSize()*sizeof(DataType)));
- TensorMap<Tensor<DataType, 3, DataLayout> > in_gpu(gpu_in_data, tensorRange);
- TensorMap<Tensor<DataType, 2, DataLayout> > out_gpu(gpu_out_data, reduced_tensorRange);
+ TensorMap<Tensor<DataType, 3, DataLayout, IndexType> > in_gpu(gpu_in_data, tensorRange);
+ TensorMap<Tensor<DataType, 2, DataLayout, IndexType> > out_gpu(gpu_out_data, reduced_tensorRange);
sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.dimensions().TotalSize())*sizeof(DataType));
- out_gpu.device(sycl_device) = in_gpu.sum(red_axis);
+ out_gpu.device(sycl_device) = in_gpu.maximum(red_axis);
sycl_device.memcpyDeviceToHost(redux_gpu.data(), gpu_out_data, redux_gpu.dimensions().TotalSize()*sizeof(DataType));
// Check that the CPU and GPU reductions return the same result.
- for(int j=0; j<reduced_tensorRange[0]; j++ )
- for(int k=0; k<reduced_tensorRange[1]; k++ )
+ for(IndexType j=0; j<reduced_tensorRange[0]; j++ )
+ for(IndexType k=0; k<reduced_tensorRange[1]; k++ )
VERIFY_IS_APPROX(redux_gpu(j,k), redux(j,k));
sycl_device.deallocate(gpu_in_data);
sycl_device.deallocate(gpu_out_data);
}
-template <typename DataType, int DataLayout>
-static void test_last_dim_reductions_sycl(const Eigen::SyclDevice &sycl_device) {
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_last_dim_reductions_sum_sycl(const Eigen::SyclDevice &sycl_device) {
- int dim_x = 567;
- int dim_y = 1;
- int dim_z = 47;
+ IndexType dim_x = 567;
+ IndexType dim_y = 1;
+ IndexType dim_z = 47;
- array<int, 3> tensorRange = {{dim_x, dim_y, dim_z}};
- Eigen::array<int, 1> red_axis;
+ array<IndexType, 3> tensorRange = {{dim_x, dim_y, dim_z}};
+ Eigen::array<IndexType, 1> red_axis;
red_axis[0] = 2;
- array<int, 2> reduced_tensorRange = {{dim_x, dim_y}};
+ array<IndexType, 2> reduced_tensorRange = {{dim_x, dim_y}};
- Tensor<DataType, 3, DataLayout> in(tensorRange);
- Tensor<DataType, 2, DataLayout> redux(reduced_tensorRange);
- Tensor<DataType, 2, DataLayout> redux_gpu(reduced_tensorRange);
+ Tensor<DataType, 3, DataLayout, IndexType> in(tensorRange);
+ Tensor<DataType, 2, DataLayout, IndexType> redux(reduced_tensorRange);
+ Tensor<DataType, 2, DataLayout, IndexType> redux_gpu(reduced_tensorRange);
in.setRandom();
@@ -113,15 +145,15 @@ static void test_last_dim_reductions_sycl(const Eigen::SyclDevice &sycl_device)
DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(in.dimensions().TotalSize()*sizeof(DataType)));
DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(redux_gpu.dimensions().TotalSize()*sizeof(DataType)));
- TensorMap<Tensor<DataType, 3, DataLayout> > in_gpu(gpu_in_data, tensorRange);
- TensorMap<Tensor<DataType, 2, DataLayout> > out_gpu(gpu_out_data, reduced_tensorRange);
+ TensorMap<Tensor<DataType, 3, DataLayout, IndexType> > in_gpu(gpu_in_data, tensorRange);
+ TensorMap<Tensor<DataType, 2, DataLayout, IndexType> > out_gpu(gpu_out_data, reduced_tensorRange);
sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.dimensions().TotalSize())*sizeof(DataType));
out_gpu.device(sycl_device) = in_gpu.sum(red_axis);
sycl_device.memcpyDeviceToHost(redux_gpu.data(), gpu_out_data, redux_gpu.dimensions().TotalSize()*sizeof(DataType));
// Check that the CPU and GPU reductions return the same result.
- for(int j=0; j<reduced_tensorRange[0]; j++ )
- for(int k=0; k<reduced_tensorRange[1]; k++ )
+ for(IndexType j=0; j<reduced_tensorRange[0]; j++ )
+ for(IndexType k=0; k<reduced_tensorRange[1]; k++ )
VERIFY_IS_APPROX(redux_gpu(j,k), redux(j,k));
sycl_device.deallocate(gpu_in_data);
@@ -133,12 +165,14 @@ template<typename DataType> void sycl_reduction_test_per_device(const cl::sycl::
QueueInterface queueInterface(d);
auto sycl_device = Eigen::SyclDevice(&queueInterface);
- test_full_reductions_sycl<DataType, RowMajor>(sycl_device);
- test_first_dim_reductions_sycl<DataType, RowMajor>(sycl_device);
- test_last_dim_reductions_sycl<DataType, RowMajor>(sycl_device);
- test_full_reductions_sycl<DataType, ColMajor>(sycl_device);
- test_first_dim_reductions_sycl<DataType, ColMajor>(sycl_device);
- test_last_dim_reductions_sycl<DataType, ColMajor>(sycl_device);
+ test_full_reductions_mean_sycl<DataType, RowMajor, int64_t>(sycl_device);
+ test_full_reductions_min_sycl<DataType, RowMajor, int64_t>(sycl_device);
+ test_first_dim_reductions_max_sycl<DataType, RowMajor, int64_t>(sycl_device);
+ test_last_dim_reductions_sum_sycl<DataType, RowMajor, int64_t>(sycl_device);
+ test_full_reductions_mean_sycl<DataType, ColMajor, int64_t>(sycl_device);
+ test_full_reductions_min_sycl<DataType, ColMajor, int64_t>(sycl_device);
+ test_first_dim_reductions_max_sycl<DataType, ColMajor, int64_t>(sycl_device);
+ test_last_dim_reductions_sum_sycl<DataType, ColMajor, int64_t>(sycl_device);
}
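These four variants cover the two shapes the SYCL reduction path must handle: full reductions (mean, minimum) that collapse to a rank-0 tensor, and single-axis reductions (maximum over dim 0, sum over dim 2) that drop one dimension. A host-side shape sketch with the sizes used above (the device plumbing is identical to the tests):

void reduction_shape_sketch() {
  Eigen::Tensor<float, 3, Eigen::RowMajor, int64_t> in(145, 1, 67);
  in.setRandom();
  Eigen::Tensor<float, 0, Eigen::RowMajor, int64_t> full = in.mean();        // rank-0 scalar
  Eigen::array<int64_t, 1> axis = {{0}};
  Eigen::Tensor<float, 2, Eigen::RowMajor, int64_t> part = in.maximum(axis); // shape (1, 67)
}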
void test_cxx11_tensor_reduction_sycl() {
for (const auto& device :Eigen::get_sycl_supported_devices()) {
diff --git a/unsupported/test/cxx11_tensor_reverse_sycl.cpp b/unsupported/test/cxx11_tensor_reverse_sycl.cpp
new file mode 100644
index 000000000..2f5484484
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_reverse_sycl.cpp
@@ -0,0 +1,221 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015
+// Mehdi Goli Codeplay Software Ltd.
+// Ralph Potter Codeplay Software Ltd.
+// Luke Iwanski Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+#define EIGEN_TEST_FUNC cxx11_tensor_reverse_sycl
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_simple_reverse(const Eigen::SyclDevice& sycl_device) {
+
+ IndexType dim1 = 2;
+ IndexType dim2 = 3;
+ IndexType dim3 = 5;
+ IndexType dim4 = 7;
+
+ array<IndexType, 4> tensorRange = {{dim1, dim2, dim3, dim4}};
+ Tensor<DataType, 4, DataLayout, IndexType> tensor(tensorRange);
+ Tensor<DataType, 4, DataLayout, IndexType> reversed_tensor(tensorRange);
+ tensor.setRandom();
+
+ array<bool, 4> dim_rev;
+ dim_rev[0] = false;
+ dim_rev[1] = true;
+ dim_rev[2] = true;
+ dim_rev[3] = false;
+
+ DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(tensor.dimensions().TotalSize()*sizeof(DataType)));
+ DataType* gpu_out_data =static_cast<DataType*>(sycl_device.allocate(reversed_tensor.dimensions().TotalSize()*sizeof(DataType)));
+
+ TensorMap<Tensor<DataType, 4, DataLayout, IndexType> > in_gpu(gpu_in_data, tensorRange);
+ TensorMap<Tensor<DataType, 4, DataLayout, IndexType> > out_gpu(gpu_out_data, tensorRange);
+
+ sycl_device.memcpyHostToDevice(gpu_in_data, tensor.data(),(tensor.dimensions().TotalSize())*sizeof(DataType));
+ out_gpu.device(sycl_device) = in_gpu.reverse(dim_rev);
+ sycl_device.memcpyDeviceToHost(reversed_tensor.data(), gpu_out_data, reversed_tensor.dimensions().TotalSize()*sizeof(DataType));
+ // Check that the GPU reverse matches the expected host-side result.
+ for (IndexType i = 0; i < 2; ++i) {
+ for (IndexType j = 0; j < 3; ++j) {
+ for (IndexType k = 0; k < 5; ++k) {
+ for (IndexType l = 0; l < 7; ++l) {
+ VERIFY_IS_EQUAL(tensor(i,j,k,l), reversed_tensor(i,2-j,4-k,l));
+ }
+ }
+ }
+ }
+ dim_rev[0] = true;
+ dim_rev[1] = false;
+ dim_rev[2] = false;
+ dim_rev[3] = false;
+
+ out_gpu.device(sycl_device) = in_gpu.reverse(dim_rev);
+ sycl_device.memcpyDeviceToHost(reversed_tensor.data(), gpu_out_data, reversed_tensor.dimensions().TotalSize()*sizeof(DataType));
+
+ for (IndexType i = 0; i < 2; ++i) {
+ for (IndexType j = 0; j < 3; ++j) {
+ for (IndexType k = 0; k < 5; ++k) {
+ for (IndexType l = 0; l < 7; ++l) {
+ VERIFY_IS_EQUAL(tensor(i,j,k,l), reversed_tensor(1-i,j,k,l));
+ }
+ }
+ }
+ }
+
+ dim_rev[0] = true;
+ dim_rev[1] = false;
+ dim_rev[2] = false;
+ dim_rev[3] = true;
+ out_gpu.device(sycl_device) = in_gpu.reverse(dim_rev);
+ sycl_device.memcpyDeviceToHost(reversed_tensor.data(), gpu_out_data, reversed_tensor.dimensions().TotalSize()*sizeof(DataType));
+
+ for (IndexType i = 0; i < 2; ++i) {
+ for (IndexType j = 0; j < 3; ++j) {
+ for (IndexType k = 0; k < 5; ++k) {
+ for (IndexType l = 0; l < 7; ++l) {
+ VERIFY_IS_EQUAL(tensor(i,j,k,l), reversed_tensor(1-i,j,k,6-l));
+ }
+ }
+ }
+ }
+
+ sycl_device.deallocate(gpu_in_data);
+ sycl_device.deallocate(gpu_out_data);
+}
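Reversing a dimension of extent d maps index x to d-1-x. With shape (2, 3, 5, 7), the first pass reverses dimensions 1 and 2, hence the check tensor(i,j,k,l) == reversed_tensor(i,2-j,4-k,l); the later passes re-apply the same rule for dimension 0 alone and for dimensions 0 and 3 together.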
+
+
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_expr_reverse(const Eigen::SyclDevice& sycl_device, bool LValue)
+{
+ IndexType dim1 = 2;
+ IndexType dim2 = 3;
+ IndexType dim3 = 5;
+ IndexType dim4 = 7;
+
+ array<IndexType, 4> tensorRange = {{dim1, dim2, dim3, dim4}};
+ Tensor<DataType, 4, DataLayout, IndexType> tensor(tensorRange);
+ Tensor<DataType, 4, DataLayout, IndexType> expected(tensorRange);
+ Tensor<DataType, 4, DataLayout, IndexType> result(tensorRange);
+ tensor.setRandom();
+
+ array<bool, 4> dim_rev;
+ dim_rev[0] = false;
+ dim_rev[1] = true;
+ dim_rev[2] = false;
+ dim_rev[3] = true;
+
+ DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(tensor.dimensions().TotalSize()*sizeof(DataType)));
+ DataType* gpu_out_data_expected =static_cast<DataType*>(sycl_device.allocate(expected.dimensions().TotalSize()*sizeof(DataType)));
+ DataType* gpu_out_data_result =static_cast<DataType*>(sycl_device.allocate(result.dimensions().TotalSize()*sizeof(DataType)));
+
+ TensorMap<Tensor<DataType, 4, DataLayout, IndexType> > in_gpu(gpu_in_data, tensorRange);
+ TensorMap<Tensor<DataType, 4, DataLayout, IndexType> > out_gpu_expected(gpu_out_data_expected, tensorRange);
+ TensorMap<Tensor<DataType, 4, DataLayout, IndexType> > out_gpu_result(gpu_out_data_result, tensorRange);
+
+
+ sycl_device.memcpyHostToDevice(gpu_in_data, tensor.data(),(tensor.dimensions().TotalSize())*sizeof(DataType));
+
+ if (LValue) {
+ out_gpu_expected.reverse(dim_rev).device(sycl_device) = in_gpu;
+ } else {
+ out_gpu_expected.device(sycl_device) = in_gpu.reverse(dim_rev);
+ }
+ sycl_device.memcpyDeviceToHost(expected.data(), gpu_out_data_expected, expected.dimensions().TotalSize()*sizeof(DataType));
+
+
+ array<IndexType, 4> src_slice_dim;
+ src_slice_dim[0] = 2;
+ src_slice_dim[1] = 3;
+ src_slice_dim[2] = 1;
+ src_slice_dim[3] = 7;
+ array<IndexType, 4> src_slice_start;
+ src_slice_start[0] = 0;
+ src_slice_start[1] = 0;
+ src_slice_start[2] = 0;
+ src_slice_start[3] = 0;
+ array<IndexType, 4> dst_slice_dim = src_slice_dim;
+ array<IndexType, 4> dst_slice_start = src_slice_start;
+
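+  // Copy the tensor one slab at a time along dim 2 (five slices of
+  // thickness 1), reversing each slab on the way; the assembled result must
+  // match reversing the whole tensor in one shot.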
+ for (IndexType i = 0; i < 5; ++i) {
+ if (LValue) {
+ out_gpu_result.slice(dst_slice_start, dst_slice_dim).reverse(dim_rev).device(sycl_device) =
+ in_gpu.slice(src_slice_start, src_slice_dim);
+ } else {
+ out_gpu_result.slice(dst_slice_start, dst_slice_dim).device(sycl_device) =
+ in_gpu.slice(src_slice_start, src_slice_dim).reverse(dim_rev);
+ }
+ src_slice_start[2] += 1;
+ dst_slice_start[2] += 1;
+ }
+ sycl_device.memcpyDeviceToHost(result.data(), gpu_out_data_result, result.dimensions().TotalSize()*sizeof(DataType));
+
+ for (IndexType i = 0; i < expected.dimension(0); ++i) {
+ for (IndexType j = 0; j < expected.dimension(1); ++j) {
+ for (IndexType k = 0; k < expected.dimension(2); ++k) {
+ for (IndexType l = 0; l < expected.dimension(3); ++l) {
+ VERIFY_IS_EQUAL(result(i,j,k,l), expected(i,j,k,l));
+ }
+ }
+ }
+ }
+
+ dst_slice_start[2] = 0;
+ result.setRandom();
+ sycl_device.memcpyHostToDevice(gpu_out_data_result, result.data(),(result.dimensions().TotalSize())*sizeof(DataType));
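+  // Second pass: reverse before slicing instead of after; since dim 2 is not
+  // reversed, slicing along it commutes with the reverse, so the result must
+  // still match `expected`.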
+ for (IndexType i = 0; i < 5; ++i) {
+ if (LValue) {
+ out_gpu_result.slice(dst_slice_start, dst_slice_dim).reverse(dim_rev).device(sycl_device) =
+ in_gpu.slice(dst_slice_start, dst_slice_dim);
+ } else {
+ out_gpu_result.slice(dst_slice_start, dst_slice_dim).device(sycl_device) =
+ in_gpu.reverse(dim_rev).slice(dst_slice_start, dst_slice_dim);
+ }
+ dst_slice_start[2] += 1;
+ }
+ sycl_device.memcpyDeviceToHost(result.data(), gpu_out_data_result, result.dimensions().TotalSize()*sizeof(DataType));
+
+ for (IndexType i = 0; i < expected.dimension(0); ++i) {
+ for (IndexType j = 0; j < expected.dimension(1); ++j) {
+ for (IndexType k = 0; k < expected.dimension(2); ++k) {
+ for (IndexType l = 0; l < expected.dimension(3); ++l) {
+ VERIFY_IS_EQUAL(result(i,j,k,l), expected(i,j,k,l));
+ }
+ }
+ }
+ }
+}
+
+
+
+template<typename DataType> void sycl_reverse_test_per_device(const cl::sycl::device& d){
+ std::cout << "Running on " << d.template get_info<cl::sycl::info::device::name>() << std::endl;
+ QueueInterface queueInterface(d);
+ auto sycl_device = Eigen::SyclDevice(&queueInterface);
+ test_simple_reverse<DataType, RowMajor, int64_t>(sycl_device);
+ test_simple_reverse<DataType, ColMajor, int64_t>(sycl_device);
+ test_expr_reverse<DataType, RowMajor, int64_t>(sycl_device, false);
+ test_expr_reverse<DataType, ColMajor, int64_t>(sycl_device, false);
+ test_expr_reverse<DataType, RowMajor, int64_t>(sycl_device, true);
+ test_expr_reverse<DataType, ColMajor, int64_t>(sycl_device, true);
+}
+void test_cxx11_tensor_reverse_sycl() {
+  for (const auto& device : Eigen::get_sycl_supported_devices()) {
+ CALL_SUBTEST(sycl_reverse_test_per_device<float>(device));
+ }
+}
diff --git a/unsupported/test/cxx11_tensor_shuffling_sycl.cpp b/unsupported/test/cxx11_tensor_shuffling_sycl.cpp
index c4521aac8..c88db7c72 100644
--- a/unsupported/test/cxx11_tensor_shuffling_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_shuffling_sycl.cpp
@@ -16,7 +16,7 @@
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
#define EIGEN_TEST_FUNC cxx11_tensor_shuffling_sycl
-#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
#define EIGEN_USE_SYCL
@@ -28,20 +28,20 @@ using Eigen::SyclDevice;
using Eigen::Tensor;
using Eigen::TensorMap;
-template <typename DataType, int DataLayout, typename IndexTypes>
+template <typename DataType, int DataLayout, typename IndexType>
static void test_simple_shuffling_sycl(const Eigen::SyclDevice& sycl_device)
{
- IndexTypes sizeDim1 = 2;
- IndexTypes sizeDim2 = 3;
- IndexTypes sizeDim3 = 5;
- IndexTypes sizeDim4 = 7;
- array<IndexTypes, 4> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}};
- Tensor<DataType, 4, DataLayout,IndexTypes> tensor(tensorRange);
- Tensor<DataType, 4, DataLayout,IndexTypes> no_shuffle(tensorRange);
+ IndexType sizeDim1 = 2;
+ IndexType sizeDim2 = 3;
+ IndexType sizeDim3 = 5;
+ IndexType sizeDim4 = 7;
+ array<IndexType, 4> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}};
+ Tensor<DataType, 4, DataLayout,IndexType> tensor(tensorRange);
+ Tensor<DataType, 4, DataLayout,IndexType> no_shuffle(tensorRange);
tensor.setRandom();
const size_t buffSize =tensor.size()*sizeof(DataType);
- array<IndexTypes, 4> shuffles;
+ array<IndexType, 4> shuffles;
shuffles[0] = 0;
shuffles[1] = 1;
shuffles[2] = 2;
@@ -50,8 +50,8 @@ static void test_simple_shuffling_sycl(const Eigen::SyclDevice& sycl_device)
DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(buffSize));
- TensorMap<Tensor<DataType, 4, DataLayout,IndexTypes>> gpu1(gpu_data1, tensorRange);
- TensorMap<Tensor<DataType, 4, DataLayout,IndexTypes>> gpu2(gpu_data2, tensorRange);
+ TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu1(gpu_data1, tensorRange);
+ TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu2(gpu_data2, tensorRange);
sycl_device.memcpyHostToDevice(gpu_data1, tensor.data(), buffSize);
@@ -64,10 +64,10 @@ static void test_simple_shuffling_sycl(const Eigen::SyclDevice& sycl_device)
VERIFY_IS_EQUAL(no_shuffle.dimension(2), sizeDim3);
VERIFY_IS_EQUAL(no_shuffle.dimension(3), sizeDim4);
- for (int i = 0; i < sizeDim1; ++i) {
- for (int j = 0; j < sizeDim2; ++j) {
- for (int k = 0; k < sizeDim3; ++k) {
- for (int l = 0; l < sizeDim4; ++l) {
+ for (IndexType i = 0; i < sizeDim1; ++i) {
+ for (IndexType j = 0; j < sizeDim2; ++j) {
+ for (IndexType k = 0; k < sizeDim3; ++k) {
+ for (IndexType l = 0; l < sizeDim4; ++l) {
VERIFY_IS_EQUAL(tensor(i,j,k,l), no_shuffle(i,j,k,l));
}
}
@@ -78,10 +78,10 @@ static void test_simple_shuffling_sycl(const Eigen::SyclDevice& sycl_device)
shuffles[1] = 3;
shuffles[2] = 1;
shuffles[3] = 0;
- array<IndexTypes, 4> tensorrangeShuffle = {{sizeDim3, sizeDim4, sizeDim2, sizeDim1}};
- Tensor<DataType, 4, DataLayout,IndexTypes> shuffle(tensorrangeShuffle);
+ array<IndexType, 4> tensorrangeShuffle = {{sizeDim3, sizeDim4, sizeDim2, sizeDim1}};
+ Tensor<DataType, 4, DataLayout,IndexType> shuffle(tensorrangeShuffle);
DataType* gpu_data3 = static_cast<DataType*>(sycl_device.allocate(buffSize));
- TensorMap<Tensor<DataType, 4,DataLayout,IndexTypes>> gpu3(gpu_data3, tensorrangeShuffle);
+ TensorMap<Tensor<DataType, 4,DataLayout,IndexType>> gpu3(gpu_data3, tensorrangeShuffle);
gpu3.device(sycl_device)=gpu1.shuffle(shuffles);
sycl_device.memcpyDeviceToHost(shuffle.data(), gpu_data3, buffSize);
@@ -92,10 +92,10 @@ static void test_simple_shuffling_sycl(const Eigen::SyclDevice& sycl_device)
VERIFY_IS_EQUAL(shuffle.dimension(2), sizeDim2);
VERIFY_IS_EQUAL(shuffle.dimension(3), sizeDim1);
- for (int i = 0; i < sizeDim1; ++i) {
- for (int j = 0; j < sizeDim2; ++j) {
- for (int k = 0; k < sizeDim3; ++k) {
- for (int l = 0; l < sizeDim4; ++l) {
+ for (IndexType i = 0; i < sizeDim1; ++i) {
+ for (IndexType j = 0; j < sizeDim2; ++j) {
+ for (IndexType k = 0; k < sizeDim3; ++k) {
+ for (IndexType l = 0; l < sizeDim4; ++l) {
VERIFY_IS_EQUAL(tensor(i,j,k,l), shuffle(k,l,j,i));
}
}
@@ -107,9 +107,6 @@ static void test_simple_shuffling_sycl(const Eigen::SyclDevice& sycl_device)
template<typename DataType, typename dev_Selector> void sycl_shuffling_test_per_device(dev_Selector s){
QueueInterface queueInterface(s);
auto sycl_device = Eigen::SyclDevice(&queueInterface);
- test_simple_shuffling_sycl<DataType, RowMajor, int>(sycl_device);
- test_simple_shuffling_sycl<DataType, ColMajor, int>(sycl_device);
-
test_simple_shuffling_sycl<DataType, RowMajor, int64_t>(sycl_device);
test_simple_shuffling_sycl<DataType, ColMajor, int64_t>(sycl_device);
diff --git a/unsupported/test/cxx11_tensor_striding_sycl.cpp b/unsupported/test/cxx11_tensor_striding_sycl.cpp
new file mode 100644
index 000000000..603c3746f
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_striding_sycl.cpp
@@ -0,0 +1,203 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli Codeplay Software Ltd.
+// Ralph Potter Codeplay Software Ltd.
+// Luke Iwanski Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+#define EIGEN_TEST_FUNC cxx11_tensor_striding_sycl
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+#include <iostream>
+#include <chrono>
+#include <ctime>
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::array;
+using Eigen::SyclDevice;
+using Eigen::Tensor;
+using Eigen::TensorMap;
+
+
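+// Strides a fixed 2x3x5x7 tensor on the device, first with unit strides
+// (which must be the identity) and then with strides {2,4,2,3}, checking
+// the result against direct indexing on the host.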
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_simple_striding(const Eigen::SyclDevice& sycl_device)
+{
+
+ Eigen::array<IndexType, 4> tensor_dims = {{2,3,5,7}};
+ Eigen::array<IndexType, 4> stride_dims = {{1,1,3,3}};
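+  // stride(s) keeps every s-th entry along each dimension, so each output
+  // dimension is ceil(dim/stride): strides of {2,4,2,3} shrink 2x3x5x7 down
+  // to the 1x1x3x3 extent held in stride_dims.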
+
+
+ Tensor<DataType, 4, DataLayout, IndexType> tensor(tensor_dims);
+ Tensor<DataType, 4, DataLayout,IndexType> no_stride(tensor_dims);
+ Tensor<DataType, 4, DataLayout,IndexType> stride(stride_dims);
+
+
+ std::size_t tensor_bytes = tensor.size() * sizeof(DataType);
+ std::size_t no_stride_bytes = no_stride.size() * sizeof(DataType);
+ std::size_t stride_bytes = stride.size() * sizeof(DataType);
+ DataType * d_tensor = static_cast<DataType*>(sycl_device.allocate(tensor_bytes));
+ DataType * d_no_stride = static_cast<DataType*>(sycl_device.allocate(no_stride_bytes));
+ DataType * d_stride = static_cast<DataType*>(sycl_device.allocate(stride_bytes));
+
+ Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, IndexType> > gpu_tensor(d_tensor, tensor_dims);
+ Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, IndexType> > gpu_no_stride(d_no_stride, tensor_dims);
+ Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, IndexType> > gpu_stride(d_stride, stride_dims);
+
+
+ tensor.setRandom();
+ array<IndexType, 4> strides;
+ strides[0] = 1;
+ strides[1] = 1;
+ strides[2] = 1;
+ strides[3] = 1;
+ sycl_device.memcpyHostToDevice(d_tensor, tensor.data(), tensor_bytes);
+ gpu_no_stride.device(sycl_device)=gpu_tensor.stride(strides);
+ sycl_device.memcpyDeviceToHost(no_stride.data(), d_no_stride, no_stride_bytes);
+
+  // Host-side equivalent: no_stride = tensor.stride(strides);
+
+ VERIFY_IS_EQUAL(no_stride.dimension(0), 2);
+ VERIFY_IS_EQUAL(no_stride.dimension(1), 3);
+ VERIFY_IS_EQUAL(no_stride.dimension(2), 5);
+ VERIFY_IS_EQUAL(no_stride.dimension(3), 7);
+
+ for (IndexType i = 0; i < 2; ++i) {
+ for (IndexType j = 0; j < 3; ++j) {
+ for (IndexType k = 0; k < 5; ++k) {
+ for (IndexType l = 0; l < 7; ++l) {
+ VERIFY_IS_EQUAL(tensor(i,j,k,l), no_stride(i,j,k,l));
+ }
+ }
+ }
+ }
+
+ strides[0] = 2;
+ strides[1] = 4;
+ strides[2] = 2;
+ strides[3] = 3;
+  // Host-side equivalent: stride = tensor.stride(strides);
+
+ gpu_stride.device(sycl_device)=gpu_tensor.stride(strides);
+ sycl_device.memcpyDeviceToHost(stride.data(), d_stride, stride_bytes);
+
+ VERIFY_IS_EQUAL(stride.dimension(0), 1);
+ VERIFY_IS_EQUAL(stride.dimension(1), 1);
+ VERIFY_IS_EQUAL(stride.dimension(2), 3);
+ VERIFY_IS_EQUAL(stride.dimension(3), 3);
+
+ for (IndexType i = 0; i < 1; ++i) {
+ for (IndexType j = 0; j < 1; ++j) {
+ for (IndexType k = 0; k < 3; ++k) {
+ for (IndexType l = 0; l < 3; ++l) {
+ VERIFY_IS_EQUAL(tensor(2*i,4*j,2*k,3*l), stride(i,j,k,l));
+ }
+ }
+ }
+ }
+
+ sycl_device.deallocate(d_tensor);
+ sycl_device.deallocate(d_no_stride);
+ sycl_device.deallocate(d_stride);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_striding_as_lvalue(const Eigen::SyclDevice& sycl_device)
+{
+
+ Eigen::array<IndexType, 4> tensor_dims = {{2,3,5,7}};
+ Eigen::array<IndexType, 4> stride_dims = {{3,12,10,21}};
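+  // Used as an lvalue, stride() scatters: input element (i,j,k,l) lands at
+  // (2*i, 4*j, 2*k, 3*l) in the 3x12x10x21 destination with the strides
+  // {2,4,2,3} applied below; only the strided positions are written.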
+
+
+ Tensor<DataType, 4, DataLayout, IndexType> tensor(tensor_dims);
+ Tensor<DataType, 4, DataLayout,IndexType> no_stride(stride_dims);
+ Tensor<DataType, 4, DataLayout,IndexType> stride(stride_dims);
+
+
+ std::size_t tensor_bytes = tensor.size() * sizeof(DataType);
+ std::size_t no_stride_bytes = no_stride.size() * sizeof(DataType);
+ std::size_t stride_bytes = stride.size() * sizeof(DataType);
+
+ DataType * d_tensor = static_cast<DataType*>(sycl_device.allocate(tensor_bytes));
+ DataType * d_no_stride = static_cast<DataType*>(sycl_device.allocate(no_stride_bytes));
+ DataType * d_stride = static_cast<DataType*>(sycl_device.allocate(stride_bytes));
+
+ Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, IndexType> > gpu_tensor(d_tensor, tensor_dims);
+ Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, IndexType> > gpu_no_stride(d_no_stride, stride_dims);
+ Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, IndexType> > gpu_stride(d_stride, stride_dims);
+
+ tensor.setRandom();
+ array<IndexType, 4> strides;
+ strides[0] = 2;
+ strides[1] = 4;
+ strides[2] = 2;
+ strides[3] = 3;
+
+  // Host-side equivalent: stride.stride(strides) = tensor;
+ sycl_device.memcpyHostToDevice(d_tensor, tensor.data(), tensor_bytes);
+ gpu_stride.stride(strides).device(sycl_device)=gpu_tensor;
+ sycl_device.memcpyDeviceToHost(stride.data(), d_stride, stride_bytes);
+
+ for (IndexType i = 0; i < 2; ++i) {
+ for (IndexType j = 0; j < 3; ++j) {
+ for (IndexType k = 0; k < 5; ++k) {
+ for (IndexType l = 0; l < 7; ++l) {
+ VERIFY_IS_EQUAL(tensor(i,j,k,l), stride(2*i,4*j,2*k,3*l));
+ }
+ }
+ }
+ }
+
+ array<IndexType, 4> no_strides;
+ no_strides[0] = 1;
+ no_strides[1] = 1;
+ no_strides[2] = 1;
+ no_strides[3] = 1;
+  // Host-side equivalent: no_stride.stride(strides) = tensor.stride(no_strides);
+
+ gpu_no_stride.stride(strides).device(sycl_device)=gpu_tensor.stride(no_strides);
+ sycl_device.memcpyDeviceToHost(no_stride.data(), d_no_stride, no_stride_bytes);
+
+ for (IndexType i = 0; i < 2; ++i) {
+ for (IndexType j = 0; j < 3; ++j) {
+ for (IndexType k = 0; k < 5; ++k) {
+ for (IndexType l = 0; l < 7; ++l) {
+ VERIFY_IS_EQUAL(tensor(i,j,k,l), no_stride(2*i,4*j,2*k,3*l));
+ }
+ }
+ }
+ }
+ sycl_device.deallocate(d_tensor);
+ sycl_device.deallocate(d_no_stride);
+ sycl_device.deallocate(d_stride);
+}
+
+
+template <typename Dev_selector> void tensorStridingPerDevice(Dev_selector& s){
+ QueueInterface queueInterface(s);
+ auto sycl_device=Eigen::SyclDevice(&queueInterface);
+ test_simple_striding<float, ColMajor, int64_t>(sycl_device);
+ test_simple_striding<float, RowMajor, int64_t>(sycl_device);
+ test_striding_as_lvalue<float, ColMajor, int64_t>(sycl_device);
+ test_striding_as_lvalue<float, RowMajor, int64_t>(sycl_device);
+}
+
+void test_cxx11_tensor_striding_sycl() {
+  for (const auto& device : Eigen::get_sycl_supported_devices()) {
+ CALL_SUBTEST(tensorStridingPerDevice(device));
+ }
+}
diff --git a/unsupported/test/cxx11_tensor_sycl.cpp b/unsupported/test/cxx11_tensor_sycl.cpp
index d5c0cbaad..5cd0f4c71 100644
--- a/unsupported/test/cxx11_tensor_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_sycl.cpp
@@ -16,7 +16,7 @@
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
#define EIGEN_TEST_FUNC cxx11_tensor_sycl
-#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
#define EIGEN_USE_SYCL
#include "main.h"
@@ -27,24 +27,24 @@ using Eigen::SyclDevice;
using Eigen::Tensor;
using Eigen::TensorMap;
-template <typename DataType, int DataLayout>
+template <typename DataType, int DataLayout, typename IndexType>
void test_sycl_mem_transfers(const Eigen::SyclDevice &sycl_device) {
- int sizeDim1 = 100;
- int sizeDim2 = 10;
- int sizeDim3 = 20;
- array<int, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}};
- Tensor<DataType, 3, DataLayout> in1(tensorRange);
- Tensor<DataType, 3, DataLayout> out1(tensorRange);
- Tensor<DataType, 3, DataLayout> out2(tensorRange);
- Tensor<DataType, 3, DataLayout> out3(tensorRange);
+ IndexType sizeDim1 = 100;
+ IndexType sizeDim2 = 10;
+ IndexType sizeDim3 = 20;
+ array<IndexType, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}};
+ Tensor<DataType, 3, DataLayout, IndexType> in1(tensorRange);
+ Tensor<DataType, 3, DataLayout, IndexType> out1(tensorRange);
+ Tensor<DataType, 3, DataLayout, IndexType> out2(tensorRange);
+ Tensor<DataType, 3, DataLayout, IndexType> out3(tensorRange);
in1 = in1.random();
DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(in1.size()*sizeof(DataType)));
DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(out1.size()*sizeof(DataType)));
- TensorMap<Tensor<DataType, 3, DataLayout>> gpu1(gpu_data1, tensorRange);
- TensorMap<Tensor<DataType, 3, DataLayout>> gpu2(gpu_data2, tensorRange);
+ TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu1(gpu_data1, tensorRange);
+ TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu2(gpu_data2, tensorRange);
sycl_device.memcpyHostToDevice(gpu_data1, in1.data(),(in1.size())*sizeof(DataType));
sycl_device.memcpyHostToDevice(gpu_data2, in1.data(),(in1.size())*sizeof(DataType));
@@ -55,7 +55,7 @@ void test_sycl_mem_transfers(const Eigen::SyclDevice &sycl_device) {
sycl_device.memcpyDeviceToHost(out3.data(), gpu_data2,(out3.size())*sizeof(DataType));
sycl_device.synchronize();
- for (int i = 0; i < in1.size(); ++i) {
+ for (IndexType i = 0; i < in1.size(); ++i) {
VERIFY_IS_APPROX(out1(i), in1(i) * 3.14f);
VERIFY_IS_APPROX(out2(i), in1(i) * 3.14f);
VERIFY_IS_APPROX(out3(i), in1(i) * 2.7f);
@@ -65,20 +65,20 @@ void test_sycl_mem_transfers(const Eigen::SyclDevice &sycl_device) {
sycl_device.deallocate(gpu_data2);
}
-template <typename DataType, int DataLayout>
+template <typename DataType, int DataLayout, typename IndexType>
void test_sycl_mem_sync(const Eigen::SyclDevice &sycl_device) {
- int size = 20;
- array<int, 1> tensorRange = {{size}};
- Tensor<DataType, 1, DataLayout> in1(tensorRange);
- Tensor<DataType, 1, DataLayout> in2(tensorRange);
- Tensor<DataType, 1, DataLayout> out(tensorRange);
+ IndexType size = 20;
+ array<IndexType, 1> tensorRange = {{size}};
+ Tensor<DataType, 1, DataLayout, IndexType> in1(tensorRange);
+ Tensor<DataType, 1, DataLayout, IndexType> in2(tensorRange);
+ Tensor<DataType, 1, DataLayout, IndexType> out(tensorRange);
in1 = in1.random();
in2 = in1;
DataType* gpu_data = static_cast<DataType*>(sycl_device.allocate(in1.size()*sizeof(DataType)));
- TensorMap<Tensor<DataType, 1, DataLayout>> gpu1(gpu_data, tensorRange);
+ TensorMap<Tensor<DataType, 1, DataLayout, IndexType>> gpu1(gpu_data, tensorRange);
sycl_device.memcpyHostToDevice(gpu_data, in1.data(),(in1.size())*sizeof(DataType));
sycl_device.synchronize();
in1.setZero();
@@ -86,24 +86,24 @@ void test_sycl_mem_sync(const Eigen::SyclDevice &sycl_device) {
sycl_device.memcpyDeviceToHost(out.data(), gpu_data, out.size()*sizeof(DataType));
sycl_device.synchronize();
- for (int i = 0; i < in1.size(); ++i) {
+ for (IndexType i = 0; i < in1.size(); ++i) {
VERIFY_IS_APPROX(out(i), in2(i));
}
sycl_device.deallocate(gpu_data);
}
-template <typename DataType, int DataLayout>
+template <typename DataType, int DataLayout, typename IndexType>
void test_sycl_computations(const Eigen::SyclDevice &sycl_device) {
- int sizeDim1 = 100;
- int sizeDim2 = 10;
- int sizeDim3 = 20;
- array<int, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}};
- Tensor<DataType, 3,DataLayout> in1(tensorRange);
- Tensor<DataType, 3,DataLayout> in2(tensorRange);
- Tensor<DataType, 3,DataLayout> in3(tensorRange);
- Tensor<DataType, 3,DataLayout> out(tensorRange);
+ IndexType sizeDim1 = 100;
+ IndexType sizeDim2 = 10;
+ IndexType sizeDim3 = 20;
+ array<IndexType, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}};
+ Tensor<DataType, 3,DataLayout, IndexType> in1(tensorRange);
+ Tensor<DataType, 3,DataLayout, IndexType> in2(tensorRange);
+ Tensor<DataType, 3,DataLayout, IndexType> in3(tensorRange);
+ Tensor<DataType, 3,DataLayout, IndexType> out(tensorRange);
in2 = in2.random();
in3 = in3.random();
@@ -113,19 +113,19 @@ void test_sycl_computations(const Eigen::SyclDevice &sycl_device) {
DataType * gpu_in3_data = static_cast<DataType*>(sycl_device.allocate(in3.size()*sizeof(DataType)));
DataType * gpu_out_data = static_cast<DataType*>(sycl_device.allocate(out.size()*sizeof(DataType)));
- TensorMap<Tensor<DataType, 3, DataLayout>> gpu_in1(gpu_in1_data, tensorRange);
- TensorMap<Tensor<DataType, 3, DataLayout>> gpu_in2(gpu_in2_data, tensorRange);
- TensorMap<Tensor<DataType, 3, DataLayout>> gpu_in3(gpu_in3_data, tensorRange);
- TensorMap<Tensor<DataType, 3, DataLayout>> gpu_out(gpu_out_data, tensorRange);
+ TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu_in1(gpu_in1_data, tensorRange);
+ TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu_in2(gpu_in2_data, tensorRange);
+ TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu_in3(gpu_in3_data, tensorRange);
+ TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu_out(gpu_out_data, tensorRange);
/// a=1.2f
gpu_in1.device(sycl_device) = gpu_in1.constant(1.2f);
sycl_device.memcpyDeviceToHost(in1.data(), gpu_in1_data ,(in1.size())*sizeof(DataType));
sycl_device.synchronize();
- for (int i = 0; i < sizeDim1; ++i) {
- for (int j = 0; j < sizeDim2; ++j) {
- for (int k = 0; k < sizeDim3; ++k) {
+ for (IndexType i = 0; i < sizeDim1; ++i) {
+ for (IndexType j = 0; j < sizeDim2; ++j) {
+ for (IndexType k = 0; k < sizeDim3; ++k) {
VERIFY_IS_APPROX(in1(i,j,k), 1.2f);
}
}
@@ -137,9 +137,9 @@ void test_sycl_computations(const Eigen::SyclDevice &sycl_device) {
sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data ,(out.size())*sizeof(DataType));
sycl_device.synchronize();
- for (int i = 0; i < sizeDim1; ++i) {
- for (int j = 0; j < sizeDim2; ++j) {
- for (int k = 0; k < sizeDim3; ++k) {
+ for (IndexType i = 0; i < sizeDim1; ++i) {
+ for (IndexType j = 0; j < sizeDim2; ++j) {
+ for (IndexType k = 0; k < sizeDim3; ++k) {
VERIFY_IS_APPROX(out(i,j,k),
in1(i,j,k) * 1.2f);
}
@@ -153,9 +153,9 @@ void test_sycl_computations(const Eigen::SyclDevice &sycl_device) {
sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.size())*sizeof(DataType));
sycl_device.synchronize();
- for (int i = 0; i < sizeDim1; ++i) {
- for (int j = 0; j < sizeDim2; ++j) {
- for (int k = 0; k < sizeDim3; ++k) {
+ for (IndexType i = 0; i < sizeDim1; ++i) {
+ for (IndexType j = 0; j < sizeDim2; ++j) {
+ for (IndexType k = 0; k < sizeDim3; ++k) {
VERIFY_IS_APPROX(out(i,j,k),
in1(i,j,k) *
in2(i,j,k));
@@ -168,9 +168,9 @@ void test_sycl_computations(const Eigen::SyclDevice &sycl_device) {
gpu_out.device(sycl_device) = gpu_in1 + gpu_in2;
sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.size())*sizeof(DataType));
sycl_device.synchronize();
- for (int i = 0; i < sizeDim1; ++i) {
- for (int j = 0; j < sizeDim2; ++j) {
- for (int k = 0; k < sizeDim3; ++k) {
+ for (IndexType i = 0; i < sizeDim1; ++i) {
+ for (IndexType j = 0; j < sizeDim2; ++j) {
+ for (IndexType k = 0; k < sizeDim3; ++k) {
VERIFY_IS_APPROX(out(i,j,k),
in1(i,j,k) +
in2(i,j,k));
@@ -183,9 +183,9 @@ void test_sycl_computations(const Eigen::SyclDevice &sycl_device) {
gpu_out.device(sycl_device) = gpu_in1 * gpu_in1;
sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.size())*sizeof(DataType));
sycl_device.synchronize();
- for (int i = 0; i < sizeDim1; ++i) {
- for (int j = 0; j < sizeDim2; ++j) {
- for (int k = 0; k < sizeDim3; ++k) {
+ for (IndexType i = 0; i < sizeDim1; ++i) {
+ for (IndexType j = 0; j < sizeDim2; ++j) {
+ for (IndexType k = 0; k < sizeDim3; ++k) {
VERIFY_IS_APPROX(out(i,j,k),
in1(i,j,k) *
in1(i,j,k));
@@ -198,9 +198,9 @@ void test_sycl_computations(const Eigen::SyclDevice &sycl_device) {
gpu_out.device(sycl_device) = gpu_in1 * gpu_in1.constant(3.14f) + gpu_in2 * gpu_in2.constant(2.7f);
sycl_device.memcpyDeviceToHost(out.data(),gpu_out_data,(out.size())*sizeof(DataType));
sycl_device.synchronize();
- for (int i = 0; i < sizeDim1; ++i) {
- for (int j = 0; j < sizeDim2; ++j) {
- for (int k = 0; k < sizeDim3; ++k) {
+ for (IndexType i = 0; i < sizeDim1; ++i) {
+ for (IndexType j = 0; j < sizeDim2; ++j) {
+ for (IndexType k = 0; k < sizeDim3; ++k) {
VERIFY_IS_APPROX(out(i,j,k),
in1(i,j,k) * 3.14f
+ in2(i,j,k) * 2.7f);
@@ -214,9 +214,9 @@ void test_sycl_computations(const Eigen::SyclDevice &sycl_device) {
gpu_out.device(sycl_device) =(gpu_in1 > gpu_in1.constant(0.5f)).select(gpu_in2, gpu_in3);
sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.size())*sizeof(DataType));
sycl_device.synchronize();
- for (int i = 0; i < sizeDim1; ++i) {
- for (int j = 0; j < sizeDim2; ++j) {
- for (int k = 0; k < sizeDim3; ++k) {
+ for (IndexType i = 0; i < sizeDim1; ++i) {
+ for (IndexType j = 0; j < sizeDim2; ++j) {
+ for (IndexType k = 0; k < sizeDim3; ++k) {
VERIFY_IS_APPROX(out(i, j, k), (in1(i, j, k) > 0.5f)
? in2(i, j, k)
: in3(i, j, k));
@@ -229,15 +229,44 @@ void test_sycl_computations(const Eigen::SyclDevice &sycl_device) {
sycl_device.deallocate(gpu_in3_data);
sycl_device.deallocate(gpu_out_data);
}
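+// Casts a tensor on the device via cast<Scalar2>() and checks the result
+// against the same cast performed on the host.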
+template<typename Scalar1, typename Scalar2, int DataLayout, typename IndexType>
+static void test_sycl_cast(const Eigen::SyclDevice& sycl_device){
+ IndexType size = 20;
+ array<IndexType, 1> tensorRange = {{size}};
+ Tensor<Scalar1, 1, DataLayout, IndexType> in(tensorRange);
+ Tensor<Scalar2, 1, DataLayout, IndexType> out(tensorRange);
+ Tensor<Scalar2, 1, DataLayout, IndexType> out_host(tensorRange);
+
+ in = in.random();
+
+ Scalar1* gpu_in_data = static_cast<Scalar1*>(sycl_device.allocate(in.size()*sizeof(Scalar1)));
+  Scalar2* gpu_out_data = static_cast<Scalar2*>(sycl_device.allocate(out.size()*sizeof(Scalar2)));
+
+ TensorMap<Tensor<Scalar1, 1, DataLayout, IndexType>> gpu_in(gpu_in_data, tensorRange);
+ TensorMap<Tensor<Scalar2, 1, DataLayout, IndexType>> gpu_out(gpu_out_data, tensorRange);
+ sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.size())*sizeof(Scalar1));
+  gpu_out.device(sycl_device) = gpu_in.template cast<Scalar2>();
+ sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data, out.size()*sizeof(Scalar2));
+  out_host = in.template cast<Scalar2>();
+  for (IndexType i = 0; i < size; i++) {
+ VERIFY_IS_APPROX(out(i), out_host(i));
+ }
+ printf("cast Test Passed\n");
+ sycl_device.deallocate(gpu_in_data);
+ sycl_device.deallocate(gpu_out_data);
+}
template<typename DataType, typename dev_Selector> void sycl_computing_test_per_device(dev_Selector s){
QueueInterface queueInterface(s);
auto sycl_device = Eigen::SyclDevice(&queueInterface);
- test_sycl_mem_transfers<DataType, RowMajor>(sycl_device);
- test_sycl_computations<DataType, RowMajor>(sycl_device);
- test_sycl_mem_sync<DataType, RowMajor>(sycl_device);
- test_sycl_mem_transfers<DataType, ColMajor>(sycl_device);
- test_sycl_computations<DataType, ColMajor>(sycl_device);
- test_sycl_mem_sync<DataType, ColMajor>(sycl_device);
+ test_sycl_mem_transfers<DataType, RowMajor, int64_t>(sycl_device);
+ test_sycl_computations<DataType, RowMajor, int64_t>(sycl_device);
+ test_sycl_mem_sync<DataType, RowMajor, int64_t>(sycl_device);
+ test_sycl_mem_transfers<DataType, ColMajor, int64_t>(sycl_device);
+ test_sycl_computations<DataType, ColMajor, int64_t>(sycl_device);
+ test_sycl_mem_sync<DataType, ColMajor, int64_t>(sycl_device);
+ test_sycl_cast<DataType, int, RowMajor, int64_t>(sycl_device);
+ test_sycl_cast<DataType, int, ColMajor, int64_t>(sycl_device);
}
void test_cxx11_tensor_sycl() {