author     Mehdi Goli <mehdi.goli@codeplay.com>    2016-12-16 19:46:45 +0000
committer  Mehdi Goli <mehdi.goli@codeplay.com>    2016-12-16 19:46:45 +0000
commit     35bae513a0094f986c810c3f839e5a954caabd4b (patch)
tree       e0e89ec837c10509923e04fbeab7fd1f90563279 /unsupported
parent     7949849ebcca49ce2730e767552eadfae0eb6e1a (diff)
Converting all parallel_for lambdas to functors in order to prevent kernel name duplication errors; adding TensorConcatenationOp backend for SYCL.
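For context on the first half of that message: a SYCL compiler identifies every kernel by a unique C++ type. A lambda has no portable name of its own, so the previous code passed an explicit name type as the template argument of parallel_for (here, the placeholder expression type, via KernelNameConstructor); when the same name type ended up naming more than one kernel, compilation failed with a duplicate kernel name error. A function object avoids this because its own type names the kernel, so parallel_for needs no template argument at all. The sketch below shows only the shape of the pattern; FillFunctor and fill are invented stand-ins, not types from this patch.

#include <CL/sycl.hpp>
#include <cstddef>

// Invented illustration of the lambda-to-functor pattern; not Eigen code.
struct FillFunctor {
  typedef cl::sycl::accessor<int, 1, cl::sycl::access::mode::discard_write,
                             cl::sycl::access::target::global_buffer> AccType;
  AccType m_acc;
  size_t m_rng;
  int m_value;
  FillFunctor(AccType acc, size_t rng, int value)
      : m_acc(acc), m_rng(rng), m_value(value) {}
  void operator()(cl::sycl::nd_item<1> itemID) {
    size_t globalid = itemID.get_global_linear_id();
    if (globalid < m_rng) m_acc[globalid] = m_value;
  }
};

void fill(cl::sycl::queue& q, cl::sycl::buffer<int, 1>& buf, size_t rng) {
  const size_t tile = 64;                                // arbitrary work-group size
  const size_t grange = ((rng + tile - 1) / tile) * tile; // round range up to a tile
  q.submit([&](cl::sycl::handler& cgh) {
    auto acc = buf.get_access<cl::sycl::access::mode::discard_write>(cgh);
    // Before this commit the body lived in a lambda and the launch was
    // cgh.parallel_for<SomeName>(...); reusing SomeName across expressions
    // triggered the duplication error. The functor's type is the name now.
    cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(grange),
                                           cl::sycl::range<1>(tile)),
                     FillFunctor(acc, rng, 42));
  });
  q.wait_and_throw();
}

The same shape recurs throughout the patch: KernelConstructor for contraction, memsetkernelFunctor and memsetCghFunctor for memset, GenericKernelReducer, ReductionFunctor and FullReductionKernelFunctor for reductions, and ExecExprFunctorKernel for generic expression evaluation.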
Diffstat (limited to 'unsupported')
16 files changed, 588 insertions, 264 deletions
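Besides the two changes named in the message, the patch threads a new RefType typedef through the MakePointer traits (TensorTraits.h, TensorForwardDeclarations.h, TensorSycl.h) so that coeffRef in TensorEvaluator.h can return an address-space-aware reference instead of a hard-coded Scalar&. The reduced model below shows what that indirection buys; Evaluator is an invented stand-in for Eigen's TensorEvaluator, not code from the patch.

#include <cstddef>

template <class T> struct MakePointer {
  typedef T* Type;    // Eigen's host default, as in TensorForwardDeclarations.h
  typedef T& RefType; // the typedef this patch adds alongside Type
};

// On the device, MakeGlobalPointer fills the same two slots from
// cl::sycl::global_ptr<T>::pointer_t and ::reference_t (see TensorSycl.h in
// the diff below), so one evaluator definition binds to either address space.
template <class Scalar, template <class> class MakePointer_ = MakePointer>
struct Evaluator {
  typename MakePointer_<Scalar>::Type m_data;
  typename MakePointer_<Scalar>::RefType coeffRef(std::size_t index) {
    return m_data[index];  // returns RefType, not a hard-coded Scalar&
  }
};

int main() {
  float storage[4] = {0.f, 0.f, 0.f, 0.f};
  Evaluator<float> ev{storage};
  ev.coeffRef(2) = 3.f;  // write through the trait-provided reference
  return storage[2] == 3.f ? 0 : 1;
}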
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h index 59bf90d93..2c7ba961c 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h @@ -276,6 +276,12 @@ struct TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgTy } EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + /// required by sycl in order to extract the accessor + const TensorEvaluator<LeftArgType, Device>& left_impl() const { return m_leftImpl; } + /// required by sycl in order to extract the accessor + const TensorEvaluator<RightArgType, Device>& right_impl() const { return m_rightImpl; } + /// required by sycl in order to extract the accessor + const Axis& axis() const { return m_axis; } protected: Dimensions m_dimensions; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h index 0cc97c59d..b170a1a5c 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h @@ -190,16 +190,168 @@ LeftEvaluator m_leftImpl; RightEvaluator m_rightImpl; }; -template <typename PLEXPR, bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered> struct KernelNameConstructor; + +template <typename HostExpr, typename OutScalar, typename LhsScalar, typename RhsScalar, typename FunctorExpr, typename LhsLocalAcc, typename RhsLocalAcc, typename OutAccessor, typename Index, typename ContractT, typename LeftNocontractT, +typename RightNocontractT, bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, +int TileSizeDimM, int TileSizeDimN,int TileSizeDimK, int WorkLoadPerThreadM,int WorkLoadPerThreadN, +int LocalThreadSizeM, int LocalThreadSizeN, int LoadPerThreadLhs, int LoadPerThreadRhs, typename TupleType> struct KernelConstructor{ + + typedef typename Eigen::TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr; + + FunctorExpr functors; + LhsLocalAcc localLhs; + RhsLocalAcc localRhs; + OutAccessor out_res; + Index roundUpK, M, N, K; + ContractT m_k_strides, m_left_contracting_strides, m_right_contracting_strides; + LeftNocontractT m_i_strides, m_left_nocontract_strides; + RightNocontractT m_j_strides, m_right_nocontract_strides; + TupleType tuple_of_accessors; + + KernelConstructor(FunctorExpr functors_, LhsLocalAcc localLhs_, RhsLocalAcc localRhs_, OutAccessor out_res_, + Index roundUpK_, Index M_, Index N_, Index K_, ContractT m_k_strides_, ContractT m_left_contracting_strides_, + ContractT m_right_contracting_strides_, LeftNocontractT m_i_strides_, RightNocontractT m_j_strides_, + LeftNocontractT m_left_nocontract_strides_, RightNocontractT m_right_nocontract_strides_, TupleType tuple_of_accessors_) + :functors(functors_), localLhs(localLhs_), localRhs(localRhs_), out_res(out_res_), roundUpK(roundUpK_), M(M_), N(N_), K(K_), + m_k_strides(m_k_strides_), m_left_contracting_strides(m_left_contracting_strides_), + m_right_contracting_strides(m_right_contracting_strides_), + m_i_strides(m_i_strides_), m_left_nocontract_strides(m_left_nocontract_strides_), + m_j_strides(m_j_strides_), m_right_nocontract_strides(m_right_nocontract_strides_), + tuple_of_accessors(tuple_of_accessors_){} + + void operator()(cl::sycl::nd_item<1> itemID) { + typedef typename Eigen::TensorSycl::internal::ConvertToDeviceExpression<HostExpr>::Type 
DevExpr; + auto device_expr =Eigen::TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors); + auto device_evaluator = TensorEvaluatorContainer<DevExpr>(device_expr.expr, Eigen::DefaultDevice()); + typedef TensorEvaluatorContainer<DevExpr> DevEvaluator; + typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs, + typename DevEvaluator::LeftEvaluator, LeftNocontractT, + ContractT, 1, + lhs_inner_dim_contiguous, + false, Unaligned, MakeGlobalPointer> LhsMapper; + + typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs, + typename DevEvaluator::RightEvaluator, RightNocontractT, + ContractT, 1, + rhs_inner_dim_contiguous, + rhs_inner_dim_reordered, Unaligned, MakeGlobalPointer> RhsMapper; + // initialize data mappers must happen inside the kernel for device eval + LhsMapper lhs(device_evaluator.m_leftImpl, m_left_nocontract_strides, m_i_strides, m_left_contracting_strides, m_k_strides); + RhsMapper rhs(device_evaluator.m_rightImpl, m_right_nocontract_strides, m_j_strides, m_right_contracting_strides, m_k_strides); + auto out_ptr = ConvertToActualTypeSycl(OutScalar, out_res); + // Matmul Kernel + // Thread identifiers + const int mLocalThreadId = itemID.get_local(0); // Local ID row + const int nLocalThreadId = itemID.get_local(1); // Local ID col + const int mGroupId = itemID.get_group(0); // Work-group ID row + const int nGroupId = itemID.get_group(1); // Work-group ID localCol + const int linearLocalThreadId = nLocalThreadId*LocalThreadSizeM + mLocalThreadId; // linear local thread ID + // Allocate register space + float privateLhs; + float privateRhs[WorkLoadPerThreadN]; + float privateRes[WorkLoadPerThreadM][WorkLoadPerThreadN]; + // Initialise the privateResumulation registers + for (int wLPTM=0; wLPTM<WorkLoadPerThreadM; wLPTM++) { + for (int wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) { + privateRes[wLPTM][wLPTN] = 0.0f; + } + } + + // Tile Lhs + for (int lPTL=0; lPTL<LoadPerThreadLhs; lPTL++) { + int + localLhsLinearId = lPTL*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId; + int localLhsRow = localLhsLinearId% TileSizeDimM; + int localLhsCol = localLhsLinearId/TileSizeDimM; + // Load the value (wide vector load) + int GlobalLhsColId = TileSizeDimK*0 + localLhsCol; + localLhs[0 + ((localLhsCol*TileSizeDimM + localLhsRow)*2)] =((GlobalLhsColId < K)&& (mGroupId*(TileSizeDimM)+ localLhsRow <M))? lhs(mGroupId*(TileSizeDimM) + localLhsRow, GlobalLhsColId):static_cast<OutScalar>(0); + } + // Tile Rhs + for (int lPTR=0; lPTR<LoadPerThreadRhs; lPTR++) { + int localRhsLinearId = lPTR*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId; + int localRhsRow = localRhsLinearId% TileSizeDimN; + int localRhsCol = localRhsLinearId/TileSizeDimN; + // Load the value (wide vector load) + int GlobalRhsRowId = TileSizeDimK*0 + localRhsCol; + localRhs[0 + ((localRhsCol*TileSizeDimN + localRhsRow) *2)] = ((GlobalRhsRowId < K)&& ((nGroupId*(TileSizeDimN) + localRhsRow)< N))? 
rhs(GlobalRhsRowId, nGroupId*(TileSizeDimN) + localRhsRow): static_cast<OutScalar>(0); + + } + // Loop over all tiles + const int numTiles = roundUpK/TileSizeDimK; + int firstHalf=0; + do { + // Synchronise + itemID.barrier(cl::sycl::access::fence_space::local_space); + // Load the next tile of Lhs and Rhs into local memory + int nextHalf = firstHalf + 1; + if (nextHalf < numTiles) { + // Tile A + for (int lPTL=0; lPTL<LoadPerThreadLhs; lPTL++) { + int localLhsLinearId = lPTL*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId; + int localLhsRow = localLhsLinearId% TileSizeDimM; + int localLhsCol = localLhsLinearId/TileSizeDimM; + // global K id + int GlobalLhsColId = TileSizeDimK*nextHalf + localLhsCol; + // Store the loaded value into local memory + localLhs[(nextHalf%2) + ((localLhsCol*TileSizeDimM + localLhsRow) *2)] = ((GlobalLhsColId < K)&& (mGroupId*(TileSizeDimM)+ localLhsRow <M))? lhs(mGroupId*(TileSizeDimM) + localLhsRow, GlobalLhsColId): static_cast<OutScalar>(0); + } + // Tile B + for (int lPTR=0; lPTR<LoadPerThreadRhs; lPTR++) { + int localRhsLinearId = lPTR*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId; + int localRhsRow = localRhsLinearId% TileSizeDimN; + int localRhsCol = localRhsLinearId/TileSizeDimN; + // Load the value (wide vector load) + int GlobalRhsRowId = TileSizeDimK*nextHalf + localRhsCol; + // Store the loaded vector into local memory + localRhs[(nextHalf%2) +((localRhsCol*TileSizeDimN + localRhsRow)*2)] = ((GlobalRhsRowId < K)&& ((nGroupId*(TileSizeDimN) + localRhsRow)< N))? rhs(GlobalRhsRowId, nGroupId*(TileSizeDimN) + localRhsRow):static_cast<OutScalar>(0); + } + } + // Loop over the values of a single tile + for (int k=0; k<TileSizeDimK; k++) { + // Cache the values of localRhs in registers + for (int wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) { + int localRhsCol = nLocalThreadId + wLPTN*LocalThreadSizeN; + privateRhs[wLPTN] = localRhs[(firstHalf%2) +((k*TileSizeDimN + localRhsCol)*2)]; + } + // Perform the computation + for (int wLPTM=0; wLPTM<WorkLoadPerThreadM; wLPTM++) { + int localLhsRow = mLocalThreadId + wLPTM*LocalThreadSizeM; + privateLhs = localLhs[(firstHalf%2)+ ((k*TileSizeDimM + localLhsRow)*2)]; + for (int wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) { + privateRes[wLPTM][wLPTN] += privateLhs * privateRhs[wLPTN]; + } + } + } + // Next tile + firstHalf++; + } while (firstHalf<numTiles); + + + // Store the final results in C + for (int wLPTM=0; wLPTM<WorkLoadPerThreadM; wLPTM++) { + int globalRow = mGroupId*TileSizeDimM + mLocalThreadId + wLPTM*LocalThreadSizeM; + if (globalRow< M){ + for (int wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) { + int globalCol = nGroupId*TileSizeDimN + nLocalThreadId + wLPTN*LocalThreadSizeN; + if(globalCol<N) + out_ptr[globalCol*M + globalRow] = privateRes[wLPTM][wLPTN]; + } + } + } + + } + +}; template <typename LhsScalar, typename RhsScalar, bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered> struct LaunchSyclKernels { static const int TileSizeDimM = 32; // Tile size for dimension M static const int TileSizeDimN = 32; // Tile size for dimension N -static const int TileSizeDimK = 16; // Tile size for dimension K +static const int TileSizeDimK = 16; // Tile size for dimension K static const int WorkLoadPerThreadM = 4; // Work load per thread in dimension M static const int WorkLoadPerThreadN = 4; // work load per thread in dimension N -static const int LocalThreadSizeM = (TileSizeDimM/WorkLoadPerThreadM); // Local thread size for the first dimension (M here) -static 
const int LocalThreadSizeN = (TileSizeDimN/WorkLoadPerThreadN); // Local thread size for the second dimension (N here) +static const int LocalThreadSizeM = (TileSizeDimM/WorkLoadPerThreadM); // Local thread size for the first dimension (M here) +static const int LocalThreadSizeN = (TileSizeDimN/WorkLoadPerThreadN); // Local thread size for the second dimension (N here) static const int LoadPerThreadLhs = ((TileSizeDimK*WorkLoadPerThreadM*WorkLoadPerThreadN)/(TileSizeDimN)); // workload per thread for Lhs expression static const int LoadPerThreadRhs = ((TileSizeDimK*WorkLoadPerThreadM*WorkLoadPerThreadN)/(TileSizeDimM)); // workload per thread for Rhs expression @@ -208,149 +360,39 @@ static int RoundUp(int x, int y) { return ((((x) + (y) - 1) / (y))*(y)); } -template< typename Self, typename Output, typename Index, typename ContractT, typename LeftNocontractT, typename RightNocontractT> - static void Run(const Self& self, Output* buffer, Index M, Index N, Index K, +template< typename Self, typename OutScalar, typename Index, typename ContractT, typename LeftNocontractT, typename RightNocontractT> + static void Run(const Self& self, OutScalar* buffer, Index M, Index N, Index K, ContractT m_k_strides, ContractT m_left_contracting_strides, ContractT m_right_contracting_strides, LeftNocontractT m_i_strides, RightNocontractT m_j_strides, LeftNocontractT m_left_nocontract_strides, RightNocontractT m_right_nocontract_strides){ // create a tuple of accessors from Evaluator - typedef typename Eigen::TensorSycl::internal::createPlaceHolderExpression<typename Self::XprType>::Type PlaceHolderExpr; - typedef KernelNameConstructor<PlaceHolderExpr, lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered> KernelName; + typedef typename Self::XprType HostExpr; + // typedef typename Eigen::TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr; + // typedef KernelNameConstructor<PlaceHolderExpr, lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered> KernelName; auto functors = Eigen::TensorSycl::internal::extractFunctors(self); + typedef decltype(functors) FunctorExpr; Index roundUpK = RoundUp(K, TileSizeDimK); Index roundUpM = RoundUp(M, TileSizeDimM); Index roundUpN = RoundUp(N, TileSizeDimN); self.device().sycl_queue().submit([&](cl::sycl::handler &cgh) { auto tuple_of_accessors = Eigen::TensorSycl::internal::createTupleOfAccessors<Self>(cgh, self); + typedef decltype(tuple_of_accessors) TupleType; // Local memory for elements of Lhs - cl::sycl::accessor<LhsScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> localLhs(cl::sycl::range<1>(2* TileSizeDimM * TileSizeDimK), cgh); + typedef cl::sycl::accessor<LhsScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> LhsLocalAcc; + LhsLocalAcc localLhs(cl::sycl::range<1>(2* TileSizeDimM * TileSizeDimK), cgh); // Local memory for elements of Rhs - cl::sycl::accessor<RhsScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> localRhs(cl::sycl::range<1>(2* TileSizeDimK * TileSizeDimN), cgh); - //Output memory - auto out_privateRes= self.device(). template get_sycl_accessor<cl::sycl::access::mode::write>(cgh, buffer); + typedef cl::sycl::accessor<RhsScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> RhsLocalAcc; + RhsLocalAcc localRhs(cl::sycl::range<1>(2* TileSizeDimK * TileSizeDimN), cgh); + //OutScalar memory + auto out_res= self.device(). 
template get_sycl_accessor<cl::sycl::access::mode::write>(cgh, buffer); + typedef decltype(out_res) OutAccessor; // sycl parallel for - cgh.parallel_for<KernelName>( cl::sycl::nd_range<2>(cl::sycl::range<2>(roundUpM/WorkLoadPerThreadM, roundUpN/WorkLoadPerThreadN), cl::sycl::range<2>(LocalThreadSizeM, LocalThreadSizeN)), [=](cl::sycl::nd_item<2> itemID) { - typedef typename Eigen::TensorSycl::internal::ConvertToDeviceExpression<typename Self::XprType>::Type DevExpr; - auto device_expr =Eigen::TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors); - auto device_evaluator = TensorEvaluatorContainer<DevExpr>(device_expr.expr, Eigen::DefaultDevice()); - typedef TensorEvaluatorContainer<DevExpr> DevEvaluator; - typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs, - typename DevEvaluator::LeftEvaluator, LeftNocontractT, - ContractT, 1, - lhs_inner_dim_contiguous, - false, Unaligned, MakeGlobalPointer> LhsMapper; - - typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs, - typename DevEvaluator::RightEvaluator, RightNocontractT, - ContractT, 1, - rhs_inner_dim_contiguous, - rhs_inner_dim_reordered, Unaligned, MakeGlobalPointer> RhsMapper; - // initialize data mappers must happen inside the kernel for device eval - LhsMapper lhs(device_evaluator.m_leftImpl, m_left_nocontract_strides, m_i_strides, m_left_contracting_strides, m_k_strides); - RhsMapper rhs(device_evaluator.m_rightImpl, m_right_nocontract_strides, m_j_strides, m_right_contracting_strides, m_k_strides); - auto out_ptr = ConvertToActualTypeSycl(Output, out_privateRes); - // Matmul Kernel - // Thread identifiers - const int mLocalThreadId = itemID.get_local(0); // Local ID row - const int nLocalThreadId = itemID.get_local(1); // Local ID col - const int mGroupId = itemID.get_group(0); // Work-group ID row - const int nGroupId = itemID.get_group(1); // Work-group ID localCol - const int linearLocalThreadId = nLocalThreadId*LocalThreadSizeM + mLocalThreadId; // linear local thread ID - // Allocate register space - float privateLhs; - float privateRhs[WorkLoadPerThreadN]; - float privateRes[WorkLoadPerThreadM][WorkLoadPerThreadN]; - // Initialise the privateResumulation registers - for (int wLPTM=0; wLPTM<WorkLoadPerThreadM; wLPTM++) { - for (int wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) { - privateRes[wLPTM][wLPTN] = 0.0f; - } - } - - // Tile Lhs - for (int lPTL=0; lPTL<LoadPerThreadLhs; lPTL++) { - int - localLhsLinearId = lPTL*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId; - int localLhsRow = localLhsLinearId% TileSizeDimM; - int localLhsCol = localLhsLinearId/TileSizeDimM; - // Load the value (wide vector load) - int GlobalLhsColId = TileSizeDimK*0 + localLhsCol; - localLhs[0 + ((localLhsCol*TileSizeDimM + localLhsRow)*2)] =((GlobalLhsColId < K)&& (mGroupId*(TileSizeDimM)+ localLhsRow <M))? lhs(mGroupId*(TileSizeDimM) + localLhsRow, GlobalLhsColId):static_cast<Output>(0); - } - // Tile Rhs - for (int lPTR=0; lPTR<LoadPerThreadRhs; lPTR++) { - int localRhsLinearId = lPTR*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId; - int localRhsRow = localRhsLinearId% TileSizeDimN; - int localRhsCol = localRhsLinearId/TileSizeDimN; - // Load the value (wide vector load) - int GlobalRhsRowId = TileSizeDimK*0 + localRhsCol; - localRhs[0 + ((localRhsCol*TileSizeDimN + localRhsRow) *2)] = ((GlobalRhsRowId < K)&& ((nGroupId*(TileSizeDimN) + localRhsRow)< N))? 
rhs(GlobalRhsRowId, nGroupId*(TileSizeDimN) + localRhsRow): static_cast<Output>(0); - - } - // Loop over all tiles - const int numTiles = roundUpK/TileSizeDimK; - int firstHalf=0; - do { - // Synchronise - itemID.barrier(cl::sycl::access::fence_space::local_space); - // Load the next tile of Lhs and Rhs into local memory - int nextHalf = firstHalf + 1; - if (nextHalf < numTiles) { - // Tile A - for (int lPTL=0; lPTL<LoadPerThreadLhs; lPTL++) { - int localLhsLinearId = lPTL*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId; - int localLhsRow = localLhsLinearId% TileSizeDimM; - int localLhsCol = localLhsLinearId/TileSizeDimM; - // global K id - int GlobalLhsColId = TileSizeDimK*nextHalf + localLhsCol; - // Store the loaded value into local memory - localLhs[(nextHalf%2) + ((localLhsCol*TileSizeDimM + localLhsRow) *2)] = ((GlobalLhsColId < K)&& (mGroupId*(TileSizeDimM)+ localLhsRow <M))? lhs(mGroupId*(TileSizeDimM) + localLhsRow, GlobalLhsColId): static_cast<Output>(0); - } - // Tile B - for (int lPTR=0; lPTR<LoadPerThreadRhs; lPTR++) { - int localRhsLinearId = lPTR*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId; - int localRhsRow = localRhsLinearId% TileSizeDimN; - int localRhsCol = localRhsLinearId/TileSizeDimN; - // Load the value (wide vector load) - int GlobalRhsRowId = TileSizeDimK*nextHalf + localRhsCol; - // Store the loaded vector into local memory - localRhs[(nextHalf%2) +((localRhsCol*TileSizeDimN + localRhsRow)*2)] = ((GlobalRhsRowId < K)&& ((nGroupId*(TileSizeDimN) + localRhsRow)< N))? rhs(GlobalRhsRowId, nGroupId*(TileSizeDimN) + localRhsRow):static_cast<Output>(0); - } - } - // Loop over the values of a single tile - for (int k=0; k<TileSizeDimK; k++) { - // Cache the values of localRhs in registers - for (int wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) { - int localRhsCol = nLocalThreadId + wLPTN*LocalThreadSizeN; - privateRhs[wLPTN] = localRhs[(firstHalf%2) +((k*TileSizeDimN + localRhsCol)*2)]; - } - // Perform the computation - for (int wLPTM=0; wLPTM<WorkLoadPerThreadM; wLPTM++) { - int localLhsRow = mLocalThreadId + wLPTM*LocalThreadSizeM; - privateLhs = localLhs[(firstHalf%2)+ ((k*TileSizeDimM + localLhsRow)*2)]; - for (int wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) { - privateRes[wLPTM][wLPTN] += privateLhs * privateRhs[wLPTN]; - } - } - } - // Next tile - firstHalf++; - } while (firstHalf<numTiles); - - - // Store the final results in C - for (int wLPTM=0; wLPTM<WorkLoadPerThreadM; wLPTM++) { - int globalRow = mGroupId*TileSizeDimM + mLocalThreadId + wLPTM*LocalThreadSizeM; - if (globalRow< M){ - for (int wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) { - int globalCol = nGroupId*TileSizeDimN + nLocalThreadId + wLPTN*LocalThreadSizeN; - if(globalCol<N) - out_ptr[globalCol*M + globalRow] = privateRes[wLPTM][wLPTN]; - } - } - } - - /// End the kernel - }); + cgh.parallel_for(cl::sycl::nd_range<2>(cl::sycl::range<2>(roundUpM/WorkLoadPerThreadM, roundUpN/WorkLoadPerThreadN), + cl::sycl::range<2>(LocalThreadSizeM, LocalThreadSizeN)), + KernelConstructor<HostExpr, OutScalar, LhsScalar, RhsScalar, FunctorExpr, LhsLocalAcc, RhsLocalAcc, OutAccessor, Index, ContractT, LeftNocontractT, + RightNocontractT, lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, TileSizeDimM, TileSizeDimN, TileSizeDimK, + WorkLoadPerThreadM, WorkLoadPerThreadN, LocalThreadSizeM, LocalThreadSizeN, LoadPerThreadLhs, LoadPerThreadRhs, TupleType>(functors, + localLhs, localRhs, out_res, roundUpK, M, N, K, m_k_strides, m_left_contracting_strides, 
m_right_contracting_strides,m_i_strides, m_j_strides, + m_left_nocontract_strides,m_right_nocontract_strides, tuple_of_accessors)); }); self.device().asynchronousExec(); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h index 96c95e294..d444f3cd8 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h @@ -43,6 +43,18 @@ namespace Eigen { size_t m_offset; }; + struct memsetkernelFunctor{ + typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer> AccType; + AccType m_acc; + const size_t m_rng, m_c; + memsetkernelFunctor(AccType acc, const size_t rng, const size_t c):m_acc(acc), m_rng(rng), m_c(c){} + void operator()(cl::sycl::nd_item<1> itemID) { + auto globalid=itemID.get_global_linear_id(); + if (globalid< m_rng) m_acc[globalid] = m_c; + } + + }; + EIGEN_STRONG_INLINE auto get_sycl_supported_devices()->decltype(cl::sycl::device::get_devices()){ auto devices = cl::sycl::device::get_devices(); std::vector<cl::sycl::device>::iterator it =devices.begin(); @@ -88,15 +100,17 @@ struct QueueInterface { } } })) - #else - m_queue(cl::sycl::queue(s, [&](cl::sycl::exception_list l) { - for (const auto& e : l) { - if (e) { - exception_caught_ = true; - } - } - })) - #endif +#else +m_queue(cl::sycl::queue(s, [&](cl::sycl::exception_list l) { + for (const auto& e : l) { + if (e) { + exception_caught_ = true; + std::cerr << "Error detected Inside Sycl Device."<< std::endl; + + } + } +})) +#endif {} /// Allocating device pointer. This pointer is actually an 8 bytes host pointer used as key to access the sycl device buffer. @@ -256,22 +270,26 @@ struct SyclDevice { /// returning the sycl queue EIGEN_STRONG_INLINE cl::sycl::queue& sycl_queue() const { return m_queue_stream->m_queue;} /// Here is the implementation of memset function on sycl. - template<typename T> EIGEN_STRONG_INLINE void memset(T *data, int c, size_t n) const { + EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const { size_t rng, GRange, tileSize; - parallel_for_setup(n/sizeof(T), tileSize, rng, GRange); - sycl_queue().submit([&](cl::sycl::handler &cgh) { - auto buf_acc =get_sycl_buffer(static_cast<uint8_t*>(static_cast<void*>(data))). 
template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer>(cgh); - cgh.parallel_for<SyclDevice>( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), [=](cl::sycl::nd_item<1> itemID) { - auto globalid=itemID.get_global_linear_id(); - if (globalid< rng) { - for(size_t i=0; i<sizeof(T); i++) - buf_acc[globalid*sizeof(T) + i] = c; - } - }); - }); + parallel_for_setup(n, tileSize, rng, GRange); + sycl_queue().submit(memsetCghFunctor(get_sycl_buffer(static_cast<uint8_t*>(static_cast<void*>(data))),rng, GRange, tileSize, c )); asynchronousExec(); } + struct memsetCghFunctor{ + cl::sycl::buffer<uint8_t, 1>& m_buf; + const size_t& rng , GRange, tileSize; + const int &c; + memsetCghFunctor(cl::sycl::buffer<uint8_t, 1>& buff, const size_t& rng_, const size_t& GRange_, const size_t& tileSize_, const int& c_) + :m_buf(buff), rng(rng_), GRange(GRange_), tileSize(tileSize_), c(c_){} + + void operator()(cl::sycl::handler &cgh) const { + auto buf_acc = m_buf.template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer>(cgh); + cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), memsetkernelFunctor(buf_acc, rng, c)); + } + }; + EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { // FIXME return 48*1024; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h index 06987132b..82dd1e640 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h @@ -41,6 +41,9 @@ struct traits<TensorEvalToOp<XprType, MakePointer_> > // Intermediate typedef to workaround MSVC issue. typedef MakePointer_<T> MakePointerT; typedef typename MakePointerT::Type Type; + typedef typename MakePointerT::RefType RefType; + + }; }; @@ -117,7 +120,7 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType, MakePointer_>, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const XprType& op() const { return m_op; } - + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ~TensorEvaluator() { } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index a68010c55..d6415817b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -69,7 +69,9 @@ struct TensorEvaluator return m_data[index]; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename internal::traits<Derived>::template MakePointer<Scalar>::RefType + coeffRef(Index index) { eigen_assert(m_data); return m_data[index]; } @@ -95,7 +97,9 @@ struct TensorEvaluator } } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(const array<DenseIndex, NumCoords>& coords) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename internal::traits<Derived>::template MakePointer<Scalar>::RefType + coeffRef(const array<DenseIndex, NumCoords>& coords) { eigen_assert(m_data); if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { return m_data[m_dims.IndexOfColMajor(coords)]; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h index bbd5eb374..930837021 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h @@ -46,6 +46,8 @@ struct traits<TensorForcedEvalOp<XprType, MakePointer_> > // Intermediate 
typedef to workaround MSVC issue. typedef MakePointer_<T> MakePointerT; typedef typename MakePointerT::Type Type; + typedef typename MakePointerT::RefType RefType; + }; }; @@ -107,7 +109,7 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType, MakePointer_>, Device> }; EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) - /// op_ is used for sycl + /// op_ is used for sycl : m_impl(op.expression(), device), m_op(op.expression()), m_device(device), m_buffer(NULL) { } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h index 499582a4c..e6aa0f334 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h @@ -20,16 +20,20 @@ namespace Eigen { // map_allocator. template<typename T> struct MakePointer { typedef T* Type; + typedef T& RefType; }; #if defined(EIGEN_USE_SYCL) namespace TensorSycl { namespace internal{ -template <typename HostExpr, typename PlaceHolderExpr, typename FunctorExpr, typename Tuple_of_Acc, typename Dims, typename Op, typename Index> class ReductionFunctor; +template <typename HostExpr, typename FunctorExpr, typename Tuple_of_Acc, typename Dims, typename Op, typename Index> class ReductionFunctor; +template<typename CoeffReturnType ,typename OutAccessor, typename HostExpr, typename FunctorExpr, typename Op, typename Dims, typename Index, typename TupleType> +struct FullReductionKernelFunctor; } } #endif + template<typename PlainObjectType, int Options_ = Unaligned, template <class> class MakePointer_ = MakePointer> class TensorMap; template<typename Scalar_, int NumIndices_, int Options_ = 0, typename IndexType = DenseIndex> class Tensor; template<typename Scalar_, typename Dimensions, int Options_ = 0, typename IndexType = DenseIndex> class TensorFixedSize; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index 75518a854..c841786b8 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -693,10 +693,11 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, #endif #if defined(EIGEN_USE_SYCL) - template < typename HostExpr_, typename PlaceHolderExpr_, typename FunctorExpr_, typename Tuple_of_Acc_, typename Dims_, typename Op_, typename Index_> friend class TensorSycl::internal::ReductionFunctor; - + template < typename HostExpr_, typename FunctorExpr_, typename Tuple_of_Acc_, typename Dims_, typename Op_, typename Index_> friend class TensorSycl::internal::ReductionFunctor; + template<typename CoeffReturnType_ ,typename OutAccessor_, typename HostExpr_, typename FunctorExpr_, typename Op_, typename Dims_, typename Index_, typename TupleType_> friend class TensorSycl::internal::FullReductionKernelFunctor; #endif + template <typename S, typename O, typename D> friend struct internal::InnerReducer; // Returns the Index in the input tensor of the first value that needs to be diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h index d5bc7b71b..c9912d9d4 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h @@ -25,8 +25,7 @@ namespace Eigen { namespace internal { - -template<typename CoeffReturnType, typename KernelName> struct syclGenericBufferReducer{ 
+template<typename CoeffReturnType> struct syclGenericBufferReducer{ template<typename BufferTOut, typename BufferTIn> static void run(BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){ do { @@ -35,50 +34,16 @@ static void run(BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDevice& de cl::sycl::range<1>{std::min(length, local)}}; /* Two accessors are used: one to the buffer that is being reduced, * and a second to local memory, used to store intermediate data. */ - auto aI = - bufI.template get_access<cl::sycl::access::mode::read_write>(h); - auto aOut = - bufOut.template get_access<cl::sycl::access::mode::discard_write>(h); - cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, - cl::sycl::access::target::local> - scratch(cl::sycl::range<1>(local), h); + auto aI =bufI.template get_access<cl::sycl::access::mode::read_write>(h); + auto aOut =bufOut.template get_access<cl::sycl::access::mode::discard_write>(h); + typedef decltype(aI) InputAccessor; + typedef decltype(aOut) OutputAccessor; + typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write,cl::sycl::access::target::local> LocalAccessor; + LocalAccessor scratch(cl::sycl::range<1>(local), h); /* The parallel_for invocation chosen is the variant with an nd_item * parameter, since the code requires barriers for correctness. */ - h.parallel_for<KernelName>( - r, [aOut, aI, scratch, local, length](cl::sycl::nd_item<1> id) { - size_t globalid = id.get_global(0); - size_t localid = id.get_local(0); - /* All threads collectively read from global memory into local. - * The barrier ensures all threads' IO is resolved before - * execution continues (strictly speaking, all threads within - * a single work-group - there is no co-ordination between - * work-groups, only work-items). */ - if (globalid < length) { - scratch[localid] = aI[globalid]; - } - id.barrier(cl::sycl::access::fence_space::local_space); - - /* Apply the reduction operation between the current local - * id and the one on the other half of the vector. */ - if (globalid < length) { - auto min = (length < local) ? length : local; - for (size_t offset = min / 2; offset > 0; offset /= 2) { - if (localid < offset) { - scratch[localid] += scratch[localid + offset]; - } - id.barrier(cl::sycl::access::fence_space::local_space); - } - /* The final result will be stored in local id 0. */ - if (localid == 0) { - aI[id.get_group(0)] = scratch[localid]; - if((length<=local) && globalid ==0){ - auto aOutPtr = ConvertToActualTypeSycl(CoeffReturnType, aOut); - aOutPtr[0]=scratch[0]; - } - } - } - }); + h.parallel_for(r, TensorSycl::internal::GenericKernelReducer< CoeffReturnType, OutputAccessor, InputAccessor, LocalAccessor>(aOut, aI, scratch, length, local)); }; dev.sycl_queue().submit(f); dev.asynchronousExec(); @@ -96,11 +61,11 @@ static void run(BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDevice& de }; -/// For now let's start with a full reducer /// Self is useless here because in expression construction we are going to treat reduction as a leafnode. /// we want to take reduction child and then build a construction and apply the full reducer function on it. Fullreducre applies the /// reduction operation on the child of the reduction. once it is done the reduction is an empty shell and can be thrown away and treated as // a leafNode. 
+ template <typename Self, typename Op, bool Vectorizable> struct FullReducer<Self, Op, const Eigen::SyclDevice, Vectorizable> { @@ -109,8 +74,8 @@ struct FullReducer<Self, Op, const Eigen::SyclDevice, Vectorizable> { static void run(const Self& self, Op& reducer, const Eigen::SyclDevice& dev, CoeffReturnType* output) { typedef const typename Self::ChildType HostExpr; /// this is the child of reduction - typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr; auto functors = TensorSycl::internal::extractFunctors(self.impl()); + typedef decltype(functors) FunctorExpr; int red_factor =256; /// initial reduction. If the size is less than red_factor we only creates one thread. size_t inputSize =self.impl().dimensions().TotalSize(); size_t rng = inputSize/red_factor; // the total number of thread initially is half the size of the input @@ -135,48 +100,29 @@ struct FullReducer<Self, Op, const Eigen::SyclDevice, Vectorizable> { size_t outTileSize = tileSize; /// if the shared memory is less than the GRange, we set shared_mem size to the TotalSize and in this case one kernel would be created for recursion to reduce all to one. if (GRange < outTileSize) outTileSize=GRange; - // getting final out buffer at the moment the created buffer is true because there is no need for assign - auto out_buffer =dev.get_sycl_buffer(output); - /// creating the shared memory for calculating reduction. /// This one is used to collect all the reduced value of shared memory as we dont have global barrier on GPU. Once it is saved we can /// recursively apply reduction on it in order to reduce the whole. auto temp_global_buffer =cl::sycl::buffer<CoeffReturnType, 1>(cl::sycl::range<1>(GRange)); typedef typename Eigen::internal::remove_all<decltype(self.xprDims())>::type Dims; - Dims dims= self.xprDims(); - Op functor = reducer; + // Dims dims= self.xprDims(); + //Op functor = reducer; dev.sycl_queue().submit([&](cl::sycl::handler &cgh) { // create a tuple of accessors from Evaluator auto tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl()); + typedef decltype(tuple_of_accessors) TupleType; auto tmp_global_accessor = temp_global_buffer. template get_access<cl::sycl::access::mode::read_write, cl::sycl::access::target::global_buffer>(cgh); - - cgh.parallel_for<PlaceHolderExpr>( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(outTileSize)), [=](cl::sycl::nd_item<1> itemID) { - typedef typename TensorSycl::internal::ConvertToDeviceExpression<const HostExpr>::Type DevExpr; - auto device_expr = TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors); - /// reduction cannot be captured automatically through our device conversion recursion. The reason is that reduction has two behaviour - /// the first behaviour is when it is used as a root to lauch the sub-kernel. The second one is when it is treated as a leafnode to pass the - /// calculated result to its parent kernel. While the latter is automatically detected through our device expression generator. The former is created here. - const auto device_self_expr= TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, functor); - /// This is the evaluator for device_self_expr. This is exactly similar to the self which has been passed to run function. The difference is - /// the device_evaluator is detectable and recognisable on the device. 
- auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice()); - /// const cast added as a naive solution to solve the qualifier drop error - auto globalid=itemID.get_global_linear_id(); - - if(globalid<rng) - tmp_global_accessor.get_pointer()[globalid]=InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, static_cast<typename DevExpr::Index>(red_factor*globalid), red_factor, const_cast<Op&>(functor)); - else - tmp_global_accessor.get_pointer()[globalid]=static_cast<CoeffReturnType>(0); - - if(remaining!=0 && globalid==0 ) - // this will add the rest of input buffer when the input size is not devidable to red_factor. - tmp_global_accessor.get_pointer()[0]+=InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, static_cast<typename DevExpr::Index>(red_factor*(rng)), static_cast<typename DevExpr::Index>(remaining), const_cast<Op&>(functor)); - }); + typedef decltype(tmp_global_accessor) OutAccessor; + cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(outTileSize)), + TensorSycl::internal::FullReductionKernelFunctor<CoeffReturnType, OutAccessor, HostExpr, FunctorExpr, Op, Dims, size_t, TupleType> + (tmp_global_accessor, rng, remaining, red_factor, reducer, self.xprDims(), functors, tuple_of_accessors)); }); dev.asynchronousExec(); -/// This is used to recursively reduce the tmp value to an element of 1; - syclGenericBufferReducer<CoeffReturnType,HostExpr>::run(out_buffer, temp_global_buffer,dev, GRange, outTileSize); + // getting final out buffer at the moment the created buffer is true because there is no need for assign + auto out_buffer =dev.get_sycl_buffer(output); + /// This is used to recursively reduce the tmp value to an element of 1; + syclGenericBufferReducer<CoeffReturnType>::run(out_buffer, temp_global_buffer,dev, GRange, outTileSize); } }; @@ -190,7 +136,6 @@ struct InnerReducer<Self, Op, const Eigen::SyclDevice> { static bool run(const Self& self, Op& reducer, const Eigen::SyclDevice& dev, CoeffReturnType* output, typename Self::Index , typename Self::Index num_coeffs_to_preserve) { typedef const typename Self::ChildType HostExpr; /// this is the child of reduction - typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr; auto functors = TensorSycl::internal::extractFunctors(self.impl()); typedef decltype(functors) FunctorExpr; typename Self::Index range, GRange, tileSize; @@ -208,7 +153,7 @@ struct InnerReducer<Self, Op, const Eigen::SyclDevice> { auto output_accessor = dev.template get_sycl_accessor<cl::sycl::access::mode::discard_write>(cgh, output); cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), - TensorSycl::internal::ReductionFunctor<HostExpr, PlaceHolderExpr, FunctorExpr, Tuple_of_Acc, Dims, Op, typename Self::Index> + TensorSycl::internal::ReductionFunctor<HostExpr, FunctorExpr, Tuple_of_Acc, Dims, Op, typename Self::Index> (output_accessor, functors, tuple_of_accessors, self.xprDims(), reducer, range)); }); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h index d7cbb420f..2e61ee049 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h @@ -20,12 +20,14 @@ template <class T> struct MakeGlobalPointer { typedef typename cl::sycl::global_ptr<T>::pointer_t Type; 
+ typedef typename cl::sycl::global_ptr<T>::reference_t RefType; }; // global pointer to set different attribute state for a class template <class T> struct MakeLocalPointer { typedef typename cl::sycl::local_ptr<T>::pointer_t Type; + typedef typename cl::sycl::local_ptr<T>::reference_t RefType; }; @@ -33,6 +35,9 @@ namespace Eigen { namespace TensorSycl { namespace internal { + template<typename CoeffReturnType, typename OutputAccessor, typename InputAccessor, typename LocalAccessor> struct GenericKernelReducer; + + /// This struct is used for special expression nodes with no operations (for example assign and selectOP). struct NoOP; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h index 4376a0e3c..6f9ab57af 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h @@ -241,21 +241,25 @@ PADDINGOPFUNCEXT(TensorPaddingOp, padding(), padding_value(), const) PADDINGOPFUNCEXT(TensorPaddingOp, padding(), padding_value(), ) #undef PADDINGOPFUNCEXT -/// specialisation of the \ref FunctorExtractor struct when the node type is -/// TensorContractionOp The LHS and RHS here are the original one no need to apply condition on their type. -#define SYCLEXTRFUNCCONTRACT(CVQual)\ -template <typename Indices, typename LHSExpr, typename RHSExpr, typename Dev>\ -struct FunctorExtractor<TensorEvaluator<CVQual TensorContractionOp<Indices, LHSExpr, RHSExpr>, Dev> > {\ +/// specialisation of the \ref FunctorExtractor struct when the node type is TensorContractionOp and TensorConcatenationOp +/// for TensorContractionOp the LHS and RHS here are the original one no need to apply condition on their type. 
+#define SYCLEXTRFUNCCONTRACTCONCAT(OPEXPR, FUNCCALL, CVQual)\ +template <typename Param, typename LHSExpr, typename RHSExpr, typename Dev>\ +struct FunctorExtractor<TensorEvaluator<CVQual OPEXPR<Param, LHSExpr, RHSExpr>, Dev> > {\ FunctorExtractor<TensorEvaluator<LHSExpr, Dev> > lhsExpr;\ FunctorExtractor<TensorEvaluator<RHSExpr, Dev> > rhsExpr;\ - const Indices func;\ - FunctorExtractor(const TensorEvaluator<CVQual TensorContractionOp<Indices, LHSExpr, RHSExpr>, Dev>& expr)\ - : lhsExpr(expr.left_impl()),rhsExpr(expr.right_impl()),func(expr.indices()) {}\ + const Param func;\ + FunctorExtractor(const TensorEvaluator<CVQual OPEXPR<Param, LHSExpr, RHSExpr>, Dev>& expr)\ + : lhsExpr(expr.left_impl()),rhsExpr(expr.right_impl()),func(expr.FUNCCALL) {}\ }; -SYCLEXTRFUNCCONTRACT(const) -SYCLEXTRFUNCCONTRACT() -#undef SYCLEXTRFUNCCONTRACT +// TensorContractionOp +SYCLEXTRFUNCCONTRACTCONCAT(TensorContractionOp, indices(), const) +SYCLEXTRFUNCCONTRACTCONCAT(TensorContractionOp, indices(),) +// TensorConcatenationOp +SYCLEXTRFUNCCONTRACTCONCAT(TensorConcatenationOp, axis(), const) +SYCLEXTRFUNCCONTRACTCONCAT(TensorConcatenationOp, axis(),) +#undef SYCLEXTRFUNCCONTRACTCONCAT /// template deduction function for FunctorExtractor diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h index 56488d5d7..85c280588 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h @@ -18,9 +18,53 @@ namespace Eigen { namespace TensorSycl { namespace internal { + template<typename CoeffReturnType, typename OutputAccessor, typename InputAccessor, typename LocalAccessor> struct GenericKernelReducer{ + OutputAccessor aOut; + InputAccessor aI; + LocalAccessor scratch; + size_t length, local; + GenericKernelReducer(OutputAccessor aOut_, InputAccessor aI_, LocalAccessor scratch_, size_t length_, size_t local_) + : aOut(aOut_), aI(aI_), scratch(scratch_), length(length_), local(local_){} + void operator()(cl::sycl::nd_item<1> itemID) { + size_t globalid = itemID.get_global(0); + size_t localid = itemID.get_local(0); + /* All threads collectively read from global memory into local. + * The barrier ensures all threads' IO is resolved before + * execution continues (strictly speaking, all threads within + * a single work-group - there is no co-ordination between + * work-groups, only work-items). */ + if (globalid < length) { + scratch[localid] = aI[globalid]; + } + itemID.barrier(cl::sycl::access::fence_space::local_space); + + /* Apply the reduction operation between the current local + * id and the one on the other half of the vector. */ + if (globalid < length) { + auto min = (length < local) ? length : local; + for (size_t offset = min / 2; offset > 0; offset /= 2) { + if (localid < offset) { + scratch[localid] += scratch[localid + offset]; + } + itemID.barrier(cl::sycl::access::fence_space::local_space); + } + /* The final result will be stored in local id 0. 
*/ + if (localid == 0) { + aI[itemID.get_group(0)] = scratch[localid]; + if((length<=local) && globalid ==0){ + auto aOutPtr = ConvertToActualTypeSycl(CoeffReturnType, aOut); + aOutPtr[0]=scratch[0]; + } + } + } + } + + }; + /// ReductionFunctor -template < typename HostExpr, typename PlaceHolderExpr, typename FunctorExpr, typename Tuple_of_Acc, typename Dims, typename Op, typename Index> class ReductionFunctor { +template < typename HostExpr, typename FunctorExpr, typename Tuple_of_Acc, typename Dims, typename Op, typename Index> class ReductionFunctor { public: + typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr; typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer> write_accessor; ReductionFunctor(write_accessor output_accessor_, FunctorExpr functors_, Tuple_of_Acc tuple_of_accessors_,Dims dims_, Op functor_, Index range_) :output_accessor(output_accessor_), functors(functors_), tuple_of_accessors(tuple_of_accessors_), dims(dims_), functor(functor_), range(range_) {} @@ -56,6 +100,46 @@ template < typename HostExpr, typename PlaceHolderExpr, typename FunctorExpr, ty }; +template<typename CoeffReturnType ,typename OutAccessor, typename HostExpr, typename FunctorExpr, typename Op, typename Dims, typename Index, typename TupleType> +struct FullReductionKernelFunctor{ + typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr; + OutAccessor tmp_global_accessor; + Index rng , remaining, red_factor; + Op op; + Dims dims; + FunctorExpr functors; + TupleType tuple_of_accessors; + + FullReductionKernelFunctor(OutAccessor acc, Index rng_, Index remaining_, Index red_factor_, Op op_, Dims dims_, FunctorExpr functors_, TupleType t_acc) + :tmp_global_accessor(acc), rng(rng_), remaining(remaining_), red_factor(red_factor_),op(op_), dims(dims_), functors(functors_), tuple_of_accessors(t_acc){} + + void operator()(cl::sycl::nd_item<1> itemID) { + + typedef typename TensorSycl::internal::ConvertToDeviceExpression<const HostExpr>::Type DevExpr; + auto device_expr = TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors); + /// reduction cannot be captured automatically through our device conversion recursion. The reason is that reduction has two behaviour + /// the first behaviour is when it is used as a root to lauch the sub-kernel. The second one is when it is treated as a leafnode to pass the + /// calculated result to its parent kernel. While the latter is automatically detected through our device expression generator. The former is created here. + const auto device_self_expr= Eigen::TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, op); + /// This is the evaluator for device_self_expr. This is exactly similar to the self which has been passed to run function. The difference is + /// the device_evaluator is detectable and recognisable on the device. 
+ auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice()); + /// const cast added as a naive solution to solve the qualifier drop error + auto globalid=itemID.get_global_linear_id(); + + if(globalid<rng) + tmp_global_accessor.get_pointer()[globalid]=Eigen::internal::InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, static_cast<typename DevExpr::Index>(red_factor*globalid), red_factor, const_cast<Op&>(op)); + else + tmp_global_accessor.get_pointer()[globalid]=static_cast<CoeffReturnType>(0); + + if(remaining!=0 && globalid==0 ) + // this will add the rest of input buffer when the input size is not devidable to red_factor. + tmp_global_accessor.get_pointer()[0]+=Eigen::internal::InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, static_cast<typename DevExpr::Index>(red_factor*(rng)), static_cast<typename DevExpr::Index>(remaining), const_cast<Op&>(op)); + } +}; + + + } } } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h index c941abf5c..5862c9795 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h @@ -25,6 +25,31 @@ namespace Eigen { namespace TensorSycl { + + + template<typename Expr, typename FunctorExpr, typename TupleType > struct ExecExprFunctorKernel{ + typedef typename internal::createPlaceHolderExpression<Expr>::Type PlaceHolderExpr; + + typedef typename Expr::Index Index; + Index range; + FunctorExpr functors; + TupleType tuple_of_accessors; + ExecExprFunctorKernel(Index range_ + , + FunctorExpr functors_, TupleType tuple_of_accessors_ + ) + :range(range_) + , functors(functors_), tuple_of_accessors(tuple_of_accessors_) + {} + void operator()(cl::sycl::nd_item<1> itemID) { + typedef typename internal::ConvertToDeviceExpression<Expr>::Type DevExpr; + auto device_expr =internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors); + auto device_evaluator = Eigen::TensorEvaluator<decltype(device_expr.expr), Eigen::DefaultDevice>(device_expr.expr, Eigen::DefaultDevice()); + typename DevExpr::Index gId = static_cast<typename DevExpr::Index>(itemID.get_global_linear_id()); + if (gId < range) + device_evaluator.evalScalar(gId); + } + }; /// The run function in tensor sycl convert the expression tree to a buffer /// based expression tree; /// creates the expression tree for the device with accessor to buffers; @@ -34,25 +59,19 @@ void run(Expr &expr, Dev &dev) { Eigen::TensorEvaluator<Expr, Dev> evaluator(expr, dev); const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); if (needs_assign) { - typedef typename internal::createPlaceHolderExpression<Expr>::Type PlaceHolderExpr; auto functors = internal::extractFunctors(evaluator); - + typedef decltype(functors) FunctorExpr; dev.sycl_queue().submit([&](cl::sycl::handler &cgh) { // create a tuple of accessors from Evaluator auto tuple_of_accessors = internal::createTupleOfAccessors<decltype(evaluator)>(cgh, evaluator); + typedef decltype(tuple_of_accessors) TupleType; typename Expr::Index range, GRange, tileSize; dev.parallel_for_setup(static_cast<typename Expr::Index>(evaluator.dimensions().TotalSize()), tileSize, range, GRange); - // run the kernel - cgh.parallel_for<PlaceHolderExpr>( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), [=](cl::sycl::nd_item<1> itemID) { 
- typedef typename internal::ConvertToDeviceExpression<Expr>::Type DevExpr; - auto device_expr =internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors); - auto device_evaluator = Eigen::TensorEvaluator<decltype(device_expr.expr), Eigen::DefaultDevice>(device_expr.expr, Eigen::DefaultDevice()); - typename DevExpr::Index gId = static_cast<typename DevExpr::Index>(itemID.get_global_linear_id()); - if (gId < range) { - device_evaluator.evalScalar(gId); - } - }); + cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), + ExecExprFunctorKernel<Expr,FunctorExpr,TupleType>(range + , functors, tuple_of_accessors + )); }); dev.asynchronousExec(); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h b/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h index ffcf8b00f..a1e944e59 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h @@ -58,6 +58,8 @@ struct traits<Tensor<Scalar_, NumIndices_, Options_, IndexType_> > }; template <typename T> struct MakePointer { typedef T* Type; + typedef T& RefType; + }; }; @@ -76,6 +78,8 @@ struct traits<TensorFixedSize<Scalar_, Dimensions, Options_, IndexType_> > }; template <typename T> struct MakePointer { typedef T* Type; + typedef T& RefType; + }; }; @@ -98,6 +102,8 @@ struct traits<TensorMap<PlainObjectType, Options_, MakePointer_> > // Intermediate typedef to workaround MSVC issue. typedef MakePointer_<T> MakePointerT; typedef typename MakePointerT::Type Type; + typedef typename MakePointerT::RefType RefType; + }; }; diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index 0405ee9fa..daedb671c 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -151,6 +151,7 @@ if(EIGEN_TEST_CXX11) ei_add_test_sycl(cxx11_tensor_padding_sycl "-std=c++11") ei_add_test_sycl(cxx11_tensor_builtins_sycl "-std=c++11") ei_add_test_sycl(cxx11_tensor_contract_sycl "-std=c++11") + ei_add_test_sycl(cxx11_tensor_concatenation_sycl "-std=c++11") endif(EIGEN_TEST_SYCL) # It should be safe to always run these tests as there is some fallback code for # older compiler that don't support cxx11. diff --git a/unsupported/test/cxx11_tensor_concatenation_sycl.cpp b/unsupported/test/cxx11_tensor_concatenation_sycl.cpp new file mode 100644 index 000000000..5a324b44c --- /dev/null +++ b/unsupported/test/cxx11_tensor_concatenation_sycl.cpp @@ -0,0 +1,180 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: <eigen@codeplay.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX +#define EIGEN_TEST_FUNC cxx11_tensor_concatenation_sycl +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int +#define EIGEN_USE_SYCL + +#include "main.h" +#include <unsupported/Eigen/CXX11/Tensor> + +using Eigen::Tensor; + +template<typename DataType, int DataLayout, typename Index> +static void test_simple_concatenation(const Eigen::SyclDevice& sycl_device) +{ + Index leftDim1 = 2; + Index leftDim2 = 3; + Index leftDim3 = 1; + Eigen::array<Index, 3> leftRange = {{leftDim1, leftDim2, leftDim3}}; + Index rightDim1 = 2; + Index rightDim2 = 3; + Index rightDim3 = 1; + Eigen::array<Index, 3> rightRange = {{rightDim1, rightDim2, rightDim3}}; + + //Index concatDim1 = 3; +// Index concatDim2 = 3; +// Index concatDim3 = 1; + //Eigen::array<Index, 3> concatRange = {{concatDim1, concatDim2, concatDim3}}; + + Tensor<DataType, 3, DataLayout, Index> left(leftRange); + Tensor<DataType, 3, DataLayout, Index> right(rightRange); + left.setRandom(); + right.setRandom(); + + DataType * gpu_in1_data = static_cast<DataType*>(sycl_device.allocate(left.dimensions().TotalSize()*sizeof(DataType))); + DataType * gpu_in2_data = static_cast<DataType*>(sycl_device.allocate(right.dimensions().TotalSize()*sizeof(DataType))); + + Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, Index>> gpu_in1(gpu_in1_data, leftRange); + Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, Index>> gpu_in2(gpu_in2_data, rightRange); + sycl_device.memcpyHostToDevice(gpu_in1_data, left.data(),(left.dimensions().TotalSize())*sizeof(DataType)); + sycl_device.memcpyHostToDevice(gpu_in2_data, right.data(),(right.dimensions().TotalSize())*sizeof(DataType)); + /// + Tensor<DataType, 3, DataLayout, Index> concatenation1(leftDim1+rightDim1, leftDim2, leftDim3); + DataType * gpu_out_data1 = static_cast<DataType*>(sycl_device.allocate(concatenation1.dimensions().TotalSize()*sizeof(DataType))); + Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, Index>> gpu_out1(gpu_out_data1, concatenation1.dimensions()); + + //concatenation = left.concatenate(right, 0); + gpu_out1.device(sycl_device) =gpu_in1.concatenate(gpu_in2, 0); + sycl_device.memcpyDeviceToHost(concatenation1.data(), gpu_out_data1,(concatenation1.dimensions().TotalSize())*sizeof(DataType)); + + VERIFY_IS_EQUAL(concatenation1.dimension(0), 4); + VERIFY_IS_EQUAL(concatenation1.dimension(1), 3); + VERIFY_IS_EQUAL(concatenation1.dimension(2), 1); + for (int j = 0; j < 3; ++j) { + for (int i = 0; i < 2; ++i) { + VERIFY_IS_EQUAL(concatenation1(i, j, 0), left(i, j, 0)); + } + for (int i = 2; i < 4; ++i) { + VERIFY_IS_EQUAL(concatenation1(i, j, 0), right(i - 2, j, 0)); + } + } + + sycl_device.deallocate(gpu_out_data1); + Tensor<DataType, 3, DataLayout, Index> concatenation2(leftDim1, leftDim2 +rightDim2, leftDim3); + DataType * gpu_out_data2 = static_cast<DataType*>(sycl_device.allocate(concatenation2.dimensions().TotalSize()*sizeof(DataType))); + Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, Index>> gpu_out2(gpu_out_data2, concatenation2.dimensions()); + gpu_out2.device(sycl_device) =gpu_in1.concatenate(gpu_in2, 1); + sycl_device.memcpyDeviceToHost(concatenation2.data(), gpu_out_data2,(concatenation2.dimensions().TotalSize())*sizeof(DataType)); + + //concatenation = left.concatenate(right, 1); + VERIFY_IS_EQUAL(concatenation2.dimension(0), 2); + VERIFY_IS_EQUAL(concatenation2.dimension(1), 6); + VERIFY_IS_EQUAL(concatenation2.dimension(2), 1); + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + 
VERIFY_IS_EQUAL(concatenation2(i, j, 0), left(i, j, 0)); + } + for (int j = 3; j < 6; ++j) { + VERIFY_IS_EQUAL(concatenation2(i, j, 0), right(i, j - 3, 0)); + } + } + sycl_device.deallocate(gpu_out_data2); + Tensor<DataType, 3, DataLayout, Index> concatenation3(leftDim1, leftDim2, leftDim3+rightDim3); + DataType * gpu_out_data3 = static_cast<DataType*>(sycl_device.allocate(concatenation3.dimensions().TotalSize()*sizeof(DataType))); + Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, Index>> gpu_out3(gpu_out_data3, concatenation3.dimensions()); + gpu_out3.device(sycl_device) =gpu_in1.concatenate(gpu_in2, 2); + sycl_device.memcpyDeviceToHost(concatenation3.data(), gpu_out_data3,(concatenation3.dimensions().TotalSize())*sizeof(DataType)); + + //concatenation = left.concatenate(right, 2); + VERIFY_IS_EQUAL(concatenation3.dimension(0), 2); + VERIFY_IS_EQUAL(concatenation3.dimension(1), 3); + VERIFY_IS_EQUAL(concatenation3.dimension(2), 2); + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + VERIFY_IS_EQUAL(concatenation3(i, j, 0), left(i, j, 0)); + VERIFY_IS_EQUAL(concatenation3(i, j, 1), right(i, j, 0)); + } + } + sycl_device.deallocate(gpu_out_data3); + sycl_device.deallocate(gpu_in1_data); + sycl_device.deallocate(gpu_in2_data); +} +template<typename DataType, int DataLayout, typename Index> +static void test_concatenation_as_lvalue(const Eigen::SyclDevice& sycl_device) +{ + + Index leftDim1 = 2; + Index leftDim2 = 3; + Eigen::array<Index, 2> leftRange = {{leftDim1, leftDim2}}; + + Index rightDim1 = 2; + Index rightDim2 = 3; + Eigen::array<Index, 2> rightRange = {{rightDim1, rightDim2}}; + + Index concatDim1 = 4; + Index concatDim2 = 3; + Eigen::array<Index, 2> resRange = {{concatDim1, concatDim2}}; + + Tensor<DataType, 2, DataLayout, Index> left(leftRange); + Tensor<DataType, 2, DataLayout, Index> right(rightRange); + Tensor<DataType, 2, DataLayout, Index> result(resRange); + + left.setRandom(); + right.setRandom(); + result.setRandom(); + + DataType * gpu_in1_data = static_cast<DataType*>(sycl_device.allocate(left.dimensions().TotalSize()*sizeof(DataType))); + DataType * gpu_in2_data = static_cast<DataType*>(sycl_device.allocate(right.dimensions().TotalSize()*sizeof(DataType))); + DataType * gpu_out_data = static_cast<DataType*>(sycl_device.allocate(result.dimensions().TotalSize()*sizeof(DataType))); + + + Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, Index>> gpu_in1(gpu_in1_data, leftRange); + Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, Index>> gpu_in2(gpu_in2_data, rightRange); + Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, Index>> gpu_out(gpu_out_data, resRange); + + sycl_device.memcpyHostToDevice(gpu_in1_data, left.data(),(left.dimensions().TotalSize())*sizeof(DataType)); + sycl_device.memcpyHostToDevice(gpu_in2_data, right.data(),(right.dimensions().TotalSize())*sizeof(DataType)); + sycl_device.memcpyHostToDevice(gpu_out_data, result.data(),(result.dimensions().TotalSize())*sizeof(DataType)); + +// t1.concatenate(t2, 0) = result; + gpu_in1.concatenate(gpu_in2, 0).device(sycl_device) =gpu_out; + sycl_device.memcpyDeviceToHost(left.data(), gpu_in1_data,(left.dimensions().TotalSize())*sizeof(DataType)); + sycl_device.memcpyDeviceToHost(right.data(), gpu_in2_data,(right.dimensions().TotalSize())*sizeof(DataType)); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + VERIFY_IS_EQUAL(left(i, j), result(i, j)); + VERIFY_IS_EQUAL(right(i, j), result(i+2, j)); + } + } + sycl_device.deallocate(gpu_in1_data); + 
sycl_device.deallocate(gpu_in2_data); + sycl_device.deallocate(gpu_out_data); +} + + +template <typename DataType, typename Dev_selector> void tensorConcat_perDevice(Dev_selector s){ + QueueInterface queueInterface(s); + auto sycl_device = Eigen::SyclDevice(&queueInterface); + test_simple_concatenation<DataType, RowMajor, int>(sycl_device); + test_simple_concatenation<DataType, ColMajor, int>(sycl_device); + test_concatenation_as_lvalue<DataType, ColMajor, int>(sycl_device); +} +void test_cxx11_tensor_concatenation_sycl() { + for (const auto& device :Eigen::get_sycl_supported_devices()) { + CALL_SUBTEST(tensorConcat_perDevice<float>(device)); + } +}
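The GenericKernelReducer functor hoisted into TensorSyclFunctors.h above keeps the classic work-group tree reduction its comments describe: stage into local memory, barrier, fold the upper half of the active range onto the lower half until local id 0 holds the group's partial sum, and let the host resubmit until a single value remains. The standalone sketch below reproduces that scheme under simplifying assumptions (a float sum; the length and the work-group size are powers of two); SumKernel and sum_reduce are invented names, not part of the patch.

#include <CL/sycl.hpp>
#include <algorithm>
#include <cstddef>
#include <vector>

struct SumKernel {
  typedef cl::sycl::accessor<float, 1, cl::sycl::access::mode::read_write,
                             cl::sycl::access::target::global_buffer> InAcc;
  typedef cl::sycl::accessor<float, 1, cl::sycl::access::mode::read_write,
                             cl::sycl::access::target::local> ScratchAcc;
  InAcc m_in;
  ScratchAcc m_scratch;
  size_t m_length, m_local;
  SumKernel(InAcc in, ScratchAcc scratch, size_t length, size_t local)
      : m_in(in), m_scratch(scratch), m_length(length), m_local(local) {}
  void operator()(cl::sycl::nd_item<1> id) {
    size_t globalid = id.get_global(0);
    size_t localid = id.get_local(0);
    // Stage one element per work item from global into local memory.
    if (globalid < m_length) m_scratch[localid] = m_in[globalid];
    id.barrier(cl::sycl::access::fence_space::local_space);
    if (globalid < m_length) {
      size_t min = (m_length < m_local) ? m_length : m_local;
      // Tree reduction: fold the upper half onto the lower half each step.
      for (size_t offset = min / 2; offset > 0; offset /= 2) {
        if (localid < offset) m_scratch[localid] += m_scratch[localid + offset];
        id.barrier(cl::sycl::access::fence_space::local_space);
      }
      // Work item 0 writes the group's partial sum back into the input
      // buffer, so the next submission reduces the partial sums.
      if (localid == 0) m_in[id.get_group(0)] = m_scratch[0];
    }
  }
};

float sum_reduce(cl::sycl::queue& q, std::vector<float>& data, size_t local) {
  size_t length = data.size();
  cl::sycl::buffer<float, 1> buf(data.data(), cl::sycl::range<1>(length));
  do {
    q.submit([&](cl::sycl::handler& h) {
      auto in = buf.get_access<cl::sycl::access::mode::read_write>(h);
      SumKernel::ScratchAcc scratch(cl::sycl::range<1>(local), h);
      // Same launch shape as syclGenericBufferReducer::run above.
      h.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(std::max(length, local)),
                                           cl::sycl::range<1>(std::min(length, local))),
                     SumKernel(in, scratch, length, local));
    });
    q.wait_and_throw();
    length = length / local;  // each work-group left one partial sum
  } while (length > 1);
  return buf.get_access<cl::sycl::access::mode::read>()[0];
}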