diff options
author | Mehdi Goli <mehdi.goli@codeplay.com> | 2017-02-28 17:16:14 +0000 |
---|---|---|
committer | Mehdi Goli <mehdi.goli@codeplay.com> | 2017-02-28 17:16:14 +0000 |
commit | 8296b87d7bd98c19c6064241880691f164790ede (patch) | |
tree | bbd18de82debbf021c7643017f9588a16374934f /unsupported/Eigen/CXX11/src/Tensor | |
parent | e0bd6f5738b94e8d7a4b17b61bf9cb6418685f28 (diff) |
Adding SYCL backend for TensorCustomOp; fixing the partial LHS modification issue on SYCL when the RHS is a TensorContraction, reduction, or convolution; fixing the partial modification for memset when the SYCL backend is used.
Diffstat (limited to 'unsupported/Eigen/CXX11/src/Tensor')
12 files changed, 230 insertions, 66 deletions
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h index abc7ba551..fcd7d4d00 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h @@ -84,7 +84,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { this->m_leftImpl.evalSubExprsIfNeeded(NULL); this->m_rightImpl.evalSubExprsIfNeeded(NULL); - if (data) { + if (data) { evalTo(data); return false; } else { @@ -173,6 +173,7 @@ typename HostExpr::Index LocalThreadSizeM, typename HostExpr::Index LocalThreadS LhsLocalAcc localLhs; RhsLocalAcc localRhs; OutAccessor out_res; + size_t out_offset; Index roundUpK, M, N, K; ContractT m_k_strides, m_left_contracting_strides, m_right_contracting_strides; LeftNocontractT m_i_strides, m_left_nocontract_strides; @@ -182,11 +183,12 @@ typename HostExpr::Index LocalThreadSizeM, typename HostExpr::Index LocalThreadS Device dev; - KernelConstructor(LHSFunctorExpr lhs_functors_, RHSFunctorExpr rhs_functors_, LhsLocalAcc localLhs_, RhsLocalAcc localRhs_, OutAccessor out_res_, + KernelConstructor(LHSFunctorExpr lhs_functors_, RHSFunctorExpr rhs_functors_, LhsLocalAcc localLhs_, RhsLocalAcc localRhs_, OutAccessor out_res_, size_t out_offset_, Index roundUpK_, Index M_, Index N_, Index K_, ContractT m_k_strides_, ContractT m_left_contracting_strides_, ContractT m_right_contracting_strides_, LeftNocontractT m_i_strides_, RightNocontractT m_j_strides_, LeftNocontractT m_left_nocontract_strides_, RightNocontractT m_right_nocontract_strides_, LHSTupleType left_tuple_of_accessors_, RHSTupleType right_tuple_of_accessors_, Device dev_) - :lhs_functors(lhs_functors_), rhs_functors(rhs_functors_), localLhs(localLhs_), localRhs(localRhs_), out_res(out_res_), roundUpK(roundUpK_), M(M_), N(N_), K(K_), + :lhs_functors(lhs_functors_), 
rhs_functors(rhs_functors_), localLhs(localLhs_), localRhs(localRhs_), out_res(out_res_), + out_offset(out_offset_), roundUpK(roundUpK_), M(M_), N(N_), K(K_), m_k_strides(m_k_strides_), m_left_contracting_strides(m_left_contracting_strides_), m_right_contracting_strides(m_right_contracting_strides_), m_i_strides(m_i_strides_), m_left_nocontract_strides(m_left_nocontract_strides_), @@ -316,7 +318,7 @@ typename HostExpr::Index LocalThreadSizeM, typename HostExpr::Index LocalThreadS for (Index wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) { Index globalCol = nGroupId*TileSizeDimN + nLocalThreadId + wLPTN*LocalThreadSizeN; if(globalCol<N) - out_ptr[globalCol*M + globalRow] = privateRes[wLPTM][wLPTN]; + out_ptr[globalCol*M + globalRow +ConvertToActualSyclOffset(OutScalar, out_offset)] = privateRes[wLPTM][wLPTN]; } } } @@ -356,12 +358,12 @@ template< typename Self, typename OutScalar, typename ContractT, typename LeftNo // extract lhs functor list LHSFunctorExpr lhs_functors = Eigen::TensorSycl::internal::extractFunctors(self.left_impl()); // extract rhs functor list - RHSFunctorExpr rhs_functors = Eigen::TensorSycl::internal::extractFunctors(self.left_impl()); + RHSFunctorExpr rhs_functors = Eigen::TensorSycl::internal::extractFunctors(self.right_impl()); Index roundUpK = RoundUp(K, TileSizeDimK); Index roundUpM = RoundUp(M, TileSizeDimM); Index roundUpN = RoundUp(N, TileSizeDimN); - + ptrdiff_t out_offset = self.device().get_offset(buffer); self.device().sycl_queue().submit([&](cl::sycl::handler &cgh) { /// work-around for gcc bug typedef decltype(Eigen::TensorSycl::internal::createTupleOfAccessors<OrigLHSExpr>(cgh, self.left_impl())) LHSTupleType; @@ -379,17 +381,16 @@ template< typename Self, typename OutScalar, typename ContractT, typename LeftNo typedef cl::sycl::accessor<RhsScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> RhsLocalAcc; RhsLocalAcc localRhs(cl::sycl::range<1>(2* TileSizeDimK * TileSizeDimN), cgh); - typedef 
cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer> OutAccessor; + typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::global_buffer> OutAccessor; //OutScalar memory - OutAccessor out_res= self.device(). template get_sycl_accessor<cl::sycl::access::mode::write>(cgh, buffer); - + OutAccessor out_res= self.device(). template get_sycl_accessor<cl::sycl::access::mode::read_write>(cgh, buffer); // sycl parallel for cgh.parallel_for(cl::sycl::nd_range<2>(cl::sycl::range<2>(roundUpM/WorkLoadPerThreadM, roundUpN/WorkLoadPerThreadN), cl::sycl::range<2>(LocalThreadSizeM, LocalThreadSizeN)), KernelConstructor<HostExpr, OutScalar, LhsScalar, RhsScalar, LHSFunctorExpr, RHSFunctorExpr, LhsLocalAcc, RhsLocalAcc, OutAccessor, Index, ContractT, LeftNocontractT, RightNocontractT, lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, TileSizeDimM, TileSizeDimN, TileSizeDimK, WorkLoadPerThreadM, WorkLoadPerThreadN, LocalThreadSizeM, LocalThreadSizeN, LoadPerThreadLhs, LoadPerThreadRhs, LHSTupleType, RHSTupleType, Eigen::DefaultDevice>(lhs_functors, rhs_functors, - localLhs, localRhs, out_res, roundUpK, M, N, K, m_k_strides, m_left_contracting_strides, m_right_contracting_strides,m_i_strides, m_j_strides, + localLhs, localRhs, out_res, out_offset, roundUpK, M, N, K, m_k_strides, m_left_contracting_strides, m_right_contracting_strides,m_i_strides, m_j_strides, m_left_nocontract_strides,m_right_nocontract_strides, left_tuple_of_accessors, right_tuple_of_accessors, Eigen::DefaultDevice())); }); self.device().asynchronousExec(); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h index 4247c1c4a..66ffd819f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h @@ -32,14 +32,15 @@ internal::IndexMapper<Index, 
InputDims, 1, Eigen::internal::traits<HostExpr>::La Kernel_accessor kernel_filter; const size_t kernelSize, range_x, range_y; Buffer_accessor buffer_acc; +ptrdiff_t out_offset; Local_accessor local_acc; FunctorExpr functors; TupleType tuple_of_accessors; EigenConvolutionKernel1D(internal::IndexMapper<Index, InputDims, 1, Eigen::internal::traits<HostExpr>::Layout> indexMapper_, Kernel_accessor kernel_filter_, const size_t kernelSize_, const size_t range_x_, const size_t range_y_, - Buffer_accessor buffer_acc_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_) + Buffer_accessor buffer_acc_, ptrdiff_t out_offset_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_) :indexMapper(indexMapper_), kernel_filter(kernel_filter_), kernelSize(kernelSize_), range_x(range_x_), range_y(range_y_), - buffer_acc(buffer_acc_), local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {} + buffer_acc(buffer_acc_), out_offset(out_offset_),local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {} void operator()(cl::sycl::nd_item<2> itemID) { typedef typename TensorSycl::internal::ConvertToDeviceExpression<HostExpr>::Type DevExpr; @@ -75,7 +76,7 @@ EigenConvolutionKernel1D(internal::IndexMapper<Index, InputDims, 1, Eigen::inter } const size_t tensor_index = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(itemID.get_global(1)) +indexMapper.mapCudaOutputKernelToTensorOutputOffset(itemID.get_local(0) + first_output_start); - buffer_ptr[tensor_index] = result; + buffer_ptr[tensor_index+ConvertToActualSyclOffset(CoeffReturnType, out_offset)] = result; } } }; @@ -89,14 +90,15 @@ internal::IndexMapper<Index, InputDims, 2, Eigen::internal::traits<HostExpr>::La Kernel_accessor kernel_filter; const size_t kernelSize_x, kernelSize_y, range_x, range_y , range_z; Buffer_accessor buffer_acc; +ptrdiff_t out_offset; Local_accessor local_acc; FunctorExpr functors; TupleType 
tuple_of_accessors; EigenConvolutionKernel2D(internal::IndexMapper<Index, InputDims, 2, Eigen::internal::traits<HostExpr>::Layout> indexMapper_, Kernel_accessor kernel_filter_, const size_t kernelSize_x_, const size_t kernelSize_y_ ,const size_t range_x_, const size_t range_y_, const size_t range_z_, - Buffer_accessor buffer_acc_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_) + Buffer_accessor buffer_acc_, ptrdiff_t out_offset_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_) :indexMapper(indexMapper_), kernel_filter(kernel_filter_), kernelSize_x(kernelSize_x_), kernelSize_y(kernelSize_y_), range_x(range_x_), range_y(range_y_), range_z(range_z_), - buffer_acc(buffer_acc_), local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {} + buffer_acc(buffer_acc_), out_offset(out_offset_), local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {} void operator()(cl::sycl::nd_item<3> itemID) { typedef typename TensorSycl::internal::ConvertToDeviceExpression<HostExpr>::Type DevExpr; @@ -141,7 +143,7 @@ EigenConvolutionKernel2D(internal::IndexMapper<Index, InputDims, 2, Eigen::inter } const size_t tensor_index = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(itemID.get_global(2)) +indexMapper.mapCudaOutputKernelToTensorOutputOffset(itemID.get_local(0) + fitst_x_output_start, itemID.get_local(1) + fitst_y_output_start); - buffer_ptr[tensor_index] = result; + buffer_ptr[tensor_index +ConvertToActualSyclOffset(CoeffReturnType, out_offset)] = result; } } }; @@ -156,16 +158,17 @@ internal::IndexMapper<Index, InputDims, 3, Eigen::internal::traits<HostExpr>::La Kernel_accessor kernel_filter; const size_t kernelSize_x, kernelSize_y, kernelSize_z, range_x, range_y , range_z, numP; Buffer_accessor buffer_acc; +ptrdiff_t out_offset; Local_accessor local_acc; FunctorExpr functors; TupleType tuple_of_accessors; 
EigenConvolutionKernel3D(internal::IndexMapper<Index, InputDims, 3, Eigen::internal::traits<HostExpr>::Layout> indexMapper_, Kernel_accessor kernel_filter_, const size_t kernelSize_x_, const size_t kernelSize_y_ , const size_t kernelSize_z_ , const size_t range_x_, const size_t range_y_, const size_t range_z_, const size_t numP_, - Buffer_accessor buffer_acc_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_) + Buffer_accessor buffer_acc_, ptrdiff_t out_offset_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_) :indexMapper(indexMapper_), kernel_filter(kernel_filter_), kernelSize_x(kernelSize_x_), kernelSize_y(kernelSize_y_), kernelSize_z(kernelSize_z_), range_x(range_x_), range_y(range_y_), range_z(range_z_), numP(numP_), - buffer_acc(buffer_acc_), local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {} + buffer_acc(buffer_acc_), out_offset(out_offset_), local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {} void operator()(cl::sycl::nd_item<3> itemID) { typedef typename TensorSycl::internal::ConvertToDeviceExpression<HostExpr>::Type DevExpr; @@ -215,7 +218,7 @@ EigenConvolutionKernel3D(internal::IndexMapper<Index, InputDims, 3, Eigen::inter } const size_t tensor_index = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p) +indexMapper.mapCudaOutputKernelToTensorOutputOffset(itemID.get_local(0) + fitst_x_output_start, itemID.get_local(1) + fitst_y_output_start, itemID.get_local(2) + fitst_z_output_start ); - buffer_ptr[tensor_index] = result; + buffer_ptr[tensor_index+ConvertToActualSyclOffset(CoeffReturnType, out_offset)] = result; } itemID.barrier(cl::sycl::access::fence_space::local_space); @@ -307,7 +310,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr m_kernel = in_place; m_local_kernel = false; } else { - size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar); + ptrdiff_t 
kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar); Scalar* local = (Scalar*)m_device.allocate(kernel_sz); typedef TensorEvalToOp<const KernelArgType> EvalTo; EvalTo evalToTmp(local, m_kernelArg); @@ -325,6 +328,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr typedef Eigen::TensorSycl::internal::FunctorExtractor<InputEvaluator> InputFunctorExpr; // extract input functor list InputFunctorExpr input_functors = Eigen::TensorSycl::internal::extractFunctors(m_inputImpl); + ptrdiff_t out_offset = m_device.get_offset(data); m_device.sycl_queue().submit([&](cl::sycl::handler &cgh) { @@ -358,7 +362,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr cgh.parallel_for(cl::sycl::nd_range<2>(global_range, local_range), EigenConvolutionKernel1D<CoeffReturnType, Scalar, InputArgType, InputFunctorExpr, Index, InputDims, KernelAccessorType, OutputAccessorType, InputLocalAcc, InputTupleType>( - indexMapper,kernel_acc, kernel_size, numX, numP, out_res, local_acc, input_functors, tuple_of_accessors)); + indexMapper,kernel_acc, kernel_size, numX, numP, out_res, out_offset, local_acc, input_functors, tuple_of_accessors)); break; } @@ -383,7 +387,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr cgh.parallel_for(cl::sycl::nd_range<3>(global_range, local_range), EigenConvolutionKernel2D<CoeffReturnType, Scalar, InputArgType, InputFunctorExpr, Index, InputDims, KernelAccessorType, OutputAccessorType, InputLocalAcc, InputTupleType>( - indexMapper,kernel_acc, kernel_size_x, kernel_size_y, numX, numY, numP, out_res, local_acc, input_functors, tuple_of_accessors)); + indexMapper,kernel_acc, kernel_size_x, kernel_size_y, numX, numY, numP, out_res, out_offset, local_acc, input_functors, tuple_of_accessors)); break; } @@ -412,7 +416,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr EigenConvolutionKernel3D<CoeffReturnType, Scalar, 
InputArgType, InputFunctorExpr, Index, InputDims, KernelAccessorType, OutputAccessorType, InputLocalAcc, InputTupleType>( indexMapper,kernel_acc, kernel_size_x, kernel_size_y, kernel_size_z, numX, numY, - numZ, numP, out_res, local_acc, input_functors, tuple_of_accessors)); + numZ, numP, out_res, out_offset, local_acc, input_functors, tuple_of_accessors)); break; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h b/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h index e020d076f..c72d79435 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h @@ -140,6 +140,9 @@ struct TensorEvaluator<const TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Devi EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_result; } + /// used by sycl in order to build the sycl buffer + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const{return m_device;} + protected: EIGEN_DEVICE_FUNC void evalTo(Scalar* data) { TensorMap<Tensor<CoeffReturnType, NumDims, Layout, Index> > result( @@ -295,6 +298,9 @@ struct TensorEvaluator<const TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_result; } + /// used by sycl in order to build the sycl buffer + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const{return m_device;} + protected: EIGEN_DEVICE_FUNC void evalTo(Scalar* data) { TensorMap<Tensor<Scalar, NumDims, Layout> > result(data, m_dimensions); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h index e209799bb..964222a15 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h @@ -18,6 +18,8 @@ namespace Eigen { #define ConvertToActualTypeSycl(Scalar, buf_acc) reinterpret_cast<typename cl::sycl::global_ptr<Scalar>::pointer_t>((&(*buf_acc.get_pointer()))) + #define 
ConvertToActualSyclOffset(Scalar, offset) offset/sizeof(Scalar) + template <typename Scalar, typename read_accessor, typename write_accessor> class MemCopyFunctor { public: @@ -43,11 +45,12 @@ namespace Eigen { struct memsetkernelFunctor{ typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer> AccType; AccType m_acc; + const ptrdiff_t buff_offset; const size_t m_rng, m_c; - memsetkernelFunctor(AccType acc, const size_t rng, const size_t c):m_acc(acc), m_rng(rng), m_c(c){} + memsetkernelFunctor(AccType acc, const ptrdiff_t buff_offset_, const size_t rng, const size_t c):m_acc(acc), buff_offset(buff_offset_), m_rng(rng), m_c(c){} void operator()(cl::sycl::nd_item<1> itemID) { auto globalid=itemID.get_global_linear_id(); - if (globalid< m_rng) m_acc[globalid] = m_c; + if (globalid< m_rng) m_acc[globalid + buff_offset] = m_c; } }; @@ -305,6 +308,11 @@ struct SyclDevice { synchronize(); } + EIGEN_STRONG_INLINE ptrdiff_t get_offset(const void *ptr) const { + auto it = m_queue_stream->find_buffer(ptr); + return (static_cast<const uint8_t*>(ptr))-it->first; + + } /// The memcpyHostToDevice is used to copy the device only pointer to a host pointer. Using the device /// pointer created as a key we find the sycl buffer and get the host accessor with discard_write mode /// on it. 
Using a discard_write accessor guarantees that we do not bring back the current value of the @@ -343,20 +351,23 @@ struct SyclDevice { EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const { size_t rng, GRange, tileSize; parallel_for_setup(n, tileSize, rng, GRange); - sycl_queue().submit(memsetCghFunctor(get_sycl_buffer(static_cast<uint8_t*>(static_cast<void*>(data))),rng, GRange, tileSize, c )); + auto it1 = m_queue_stream->find_buffer(static_cast<const void*>(data)); + ptrdiff_t buff_offset= (static_cast<const uint8_t*>(data)) - it1->first; + sycl_queue().submit(memsetCghFunctor(it1->second, buff_offset, rng, GRange, tileSize, c )); synchronize(); } struct memsetCghFunctor{ cl::sycl::buffer<uint8_t, 1>& m_buf; + const ptrdiff_t& buff_offset; const size_t& rng , GRange, tileSize; const int &c; - memsetCghFunctor(cl::sycl::buffer<uint8_t, 1>& buff, const size_t& rng_, const size_t& GRange_, const size_t& tileSize_, const int& c_) - :m_buf(buff), rng(rng_), GRange(GRange_), tileSize(tileSize_), c(c_){} + memsetCghFunctor(cl::sycl::buffer<uint8_t, 1>& buff, const ptrdiff_t& buff_offset_, const size_t& rng_, const size_t& GRange_, const size_t& tileSize_, const int& c_) + :m_buf(buff), buff_offset(buff_offset_), rng(rng_), GRange(GRange_), tileSize(tileSize_), c(c_){} void operator()(cl::sycl::handler &cgh) const { auto buf_acc = m_buf.template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer>(cgh); - cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), memsetkernelFunctor(buf_acc, rng, c)); + cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), memsetkernelFunctor(buf_acc, buff_offset, rng, c)); } }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h index c3ca129e2..c9c7acfdc 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h +++ 
b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h @@ -27,9 +27,9 @@ namespace internal { template<typename OP, typename CoeffReturnType> struct syclGenericBufferReducer{ template<typename BufferTOut, typename BufferTIn> -static void run(OP op, BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){ +static void run(OP op, BufferTOut& bufOut, ptrdiff_t out_offset, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){ do { - auto f = [length, local, op, &bufOut, &bufI](cl::sycl::handler& h) mutable { + auto f = [length, local, op, out_offset, &bufOut, &bufI](cl::sycl::handler& h) mutable { cl::sycl::nd_range<1> r{cl::sycl::range<1>{std::max(length, local)}, cl::sycl::range<1>{std::min(length, local)}}; /* Two accessors are used: one to the buffer that is being reduced, @@ -43,7 +43,7 @@ static void run(OP op, BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDev /* The parallel_for invocation chosen is the variant with an nd_item * parameter, since the code requires barriers for correctness. 
*/ - h.parallel_for(r, TensorSycl::internal::GenericKernelReducer<CoeffReturnType, OP, OutputAccessor, InputAccessor, LocalAccessor>(op, aOut, aI, scratch, length, local)); + h.parallel_for(r, TensorSycl::internal::GenericKernelReducer<CoeffReturnType, OP, OutputAccessor, InputAccessor, LocalAccessor>(op, aOut, out_offset, aI, scratch, length, local)); }; dev.sycl_queue().submit(f); dev.asynchronousExec(); @@ -60,9 +60,9 @@ static void run(OP op, BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDev template<typename CoeffReturnType> struct syclGenericBufferReducer<Eigen::internal::MeanReducer<CoeffReturnType>, CoeffReturnType>{ template<typename BufferTOut, typename BufferTIn> -static void run(Eigen::internal::MeanReducer<CoeffReturnType>, BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){ +static void run(Eigen::internal::MeanReducer<CoeffReturnType>, BufferTOut& bufOut,ptrdiff_t out_offset, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){ syclGenericBufferReducer<Eigen::internal::SumReducer<CoeffReturnType>, CoeffReturnType>::run(Eigen::internal::SumReducer<CoeffReturnType>(), - bufOut, bufI, dev, length, local); + bufOut, out_offset, bufI, dev, length, local); } }; @@ -127,8 +127,9 @@ struct FullReducer<Self, Op, const Eigen::SyclDevice, Vectorizable> { // getting final out buffer at the moment the created buffer is true because there is no need for assign auto out_buffer =dev.get_sycl_buffer(output); + ptrdiff_t out_offset = dev.get_offset(output); /// This is used to recursively reduce the tmp value to an element of 1; - syclGenericBufferReducer<Op, CoeffReturnType>::run(reducer, out_buffer, temp_global_buffer,dev, GRange, outTileSize); + syclGenericBufferReducer<Op, CoeffReturnType>::run(reducer, out_buffer, out_offset, temp_global_buffer,dev, GRange, outTileSize); } }; @@ -158,10 +159,11 @@ struct InnerReducer<Self, Op, const Eigen::SyclDevice> { // create a tuple of accessors 
from Evaluator Tuple_of_Acc tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl()); auto output_accessor = dev.template get_sycl_accessor<cl::sycl::access::mode::discard_write>(cgh, output); + ptrdiff_t out_offset = dev.get_offset(output); Index red_size = (num_values_to_reduce!=0)? num_values_to_reduce : static_cast<Index>(1); cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), TensorSycl::internal::ReductionFunctor<HostExpr, FunctorExpr, Tuple_of_Acc, Dims, Op, typename Self::Index> - (output_accessor, functors, tuple_of_accessors, self.xprDims(), reducer, range, red_size)); + (output_accessor, out_offset, functors, tuple_of_accessors, self.xprDims(), reducer, range, red_size)); }); dev.asynchronousExec(); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h index dd63a2e2f..9476c0ea8 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h @@ -109,6 +109,7 @@ struct ConvertToDeviceExpression<CVQual ExprNode<Expr> > {\ typedef CVQual ExprNode< typename ConvertToDeviceExpression<Expr>::Type> Type;\ }; + // TensorForcedEvalOp KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAP(const,TensorForcedEvalOp) KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAP(,TensorForcedEvalOp) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h index 117b368ec..af4eb5f13 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h @@ -236,8 +236,12 @@ EVALTO() template <typename OrigExpr, typename DevExpr, size_t N, typename... 
Params>\ struct ExprConstructor<CVQual TensorForcedEvalOp<OrigExpr>,\ CVQual PlaceHolder<CVQual TensorForcedEvalOp<DevExpr>, N>, Params...> {\ - typedef CVQual TensorMap<Tensor<typename TensorForcedEvalOp<DevExpr>::Scalar,\ - TensorForcedEvalOp<DevExpr>::NumDimensions, Eigen::internal::traits<TensorForcedEvalOp<DevExpr>>::Layout, typename TensorForcedEvalOp<DevExpr>::Index>, Eigen::internal::traits<TensorForcedEvalOp<DevExpr>>::Layout, MakeGlobalPointer> Type;\ + typedef TensorForcedEvalOp<OrigExpr> XprType;\ + typedef CVQual TensorMap<\ + Tensor<typename XprType::Scalar,XprType::NumDimensions, Eigen::internal::traits<XprType>::Layout,typename XprType::Index>,\ + Eigen::internal::traits<XprType>::Layout, \ + MakeGlobalPointer\ + > Type;\ Type expr;\ template <typename FuncDetector>\ ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple<Params...> &t)\ @@ -248,6 +252,28 @@ FORCEDEVAL(const) FORCEDEVAL() #undef FORCEDEVAL + + +#define TENSORCUSTOMUNARYOP(CVQual)\ +template <typename CustomUnaryFunc, typename OrigExpr, typename DevExpr, size_t N, typename... 
Params>\ +struct ExprConstructor<CVQual TensorCustomUnaryOp<CustomUnaryFunc, OrigExpr>,\ +CVQual PlaceHolder<CVQual TensorCustomUnaryOp<CustomUnaryFunc, DevExpr>, N>, Params...> {\ + typedef TensorCustomUnaryOp<CustomUnaryFunc, OrigExpr> XprType;\ + typedef CVQual TensorMap<\ + Tensor<typename XprType::Scalar,XprType::NumDimensions, Eigen::internal::traits<XprType>::Layout,typename XprType::Index>,\ + Eigen::internal::traits<XprType>::Layout, \ + MakeGlobalPointer\ + > Type;\ + Type expr;\ + template <typename FuncDetector>\ + ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple<Params...> &t)\ + : expr(Type(ConvertToActualTypeSycl(typename Type::Scalar, utility::tuple::get<N>(t)), fd.dimensions())) {}\ +}; + +TENSORCUSTOMUNARYOP(const) +TENSORCUSTOMUNARYOP() +#undef TENSORCUSTOMUNARYOP + template <bool Conds, size_t X , size_t Y > struct ValueCondition { static const size_t Res =X; }; @@ -260,7 +286,7 @@ template<size_t X, size_t Y> struct ValueCondition<false, X , Y> { template <typename OP, typename Dim, typename OrigExpr, typename DevExpr, size_t N, typename... 
Params>\ struct ExprConstructor<CVQual TensorReductionOp<OP, Dim, OrigExpr, MakeGlobalPointer>,\ CVQual PlaceHolder<CVQual TensorReductionOp<OP, Dim, DevExpr>, N>, Params...> {\ - static const size_t NumIndices= ValueCondition< TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>::NumDimensions==0, 1, TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>::NumDimensions >::Res;\ + static const auto NumIndices= ValueCondition< TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>::NumDimensions==0, 1, TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>::NumDimensions >::Res;\ typedef CVQual TensorMap<Tensor<typename TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>::Scalar,\ NumIndices, Eigen::internal::traits<TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>>::Layout, typename TensorReductionOp<OP, Dim, DevExpr>::Index>, Eigen::internal::traits<TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>>::Layout, MakeGlobalPointer> Type;\ Type expr;\ @@ -275,28 +301,31 @@ SYCLREDUCTIONEXPR() /// specialisation of the \ref ExprConstructor struct when the node type is -/// TensorContractionOp -#define SYCLCONTRACTIONCONVOLUTION(CVQual, ExprNode)\ +/// TensorContractionOp, TensorConvolutionOp TensorCustomBinaryOp +#define SYCLCONTRACTCONVCUSBIOPS(CVQual, ExprNode)\ template <typename Indices, typename OrigLhsXprType, typename OrigRhsXprType, typename LhsXprType, typename RhsXprType, size_t N, typename... 
Params>\ struct ExprConstructor<CVQual ExprNode<Indices, OrigLhsXprType, OrigRhsXprType>,\ CVQual PlaceHolder<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, N>, Params...> {\ - static const size_t NumIndices= Eigen::internal::traits<ExprNode<Indices, OrigLhsXprType, OrigRhsXprType> >::NumDimensions;\ - typedef CVQual TensorMap<Tensor<typename ExprNode<Indices, OrigLhsXprType, OrigRhsXprType>::Scalar,\ - NumIndices, Eigen::internal::traits<ExprNode<Indices, OrigRhsXprType, OrigRhsXprType> >::Layout,\ - typename ExprNode<Indices, OrigRhsXprType, OrigRhsXprType>::Index>,\ - Eigen::internal::traits<ExprNode<Indices, OrigRhsXprType, OrigRhsXprType>>::Layout, MakeGlobalPointer> Type;\ + typedef ExprNode<Indices, OrigLhsXprType, OrigRhsXprType> XprTyp;\ + static const auto NumIndices= Eigen::internal::traits<XprTyp>::NumDimensions;\ + typedef CVQual TensorMap<\ + Tensor<typename XprTyp::Scalar,NumIndices, Eigen::internal::traits<XprTyp>::Layout, typename XprTyp::Index>,\ + Eigen::internal::traits<XprTyp>::Layout, \ + MakeGlobalPointer\ + > Type;\ Type expr;\ template <typename FuncDetector>\ ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple<Params...> &t)\ :expr(Type(ConvertToActualTypeSycl(typename Type::Scalar, utility::tuple::get<N>(t)), fd.dimensions())) {}\ }; -SYCLCONTRACTIONCONVOLUTION(const, TensorContractionOp) -SYCLCONTRACTIONCONVOLUTION(, TensorContractionOp) -SYCLCONTRACTIONCONVOLUTION(const, TensorConvolutionOp) -SYCLCONTRACTIONCONVOLUTION(, TensorConvolutionOp) -#undef SYCLCONTRACTIONCONVOLUTION - +SYCLCONTRACTCONVCUSBIOPS(const, TensorContractionOp) +SYCLCONTRACTCONVCUSBIOPS(, TensorContractionOp) +SYCLCONTRACTCONVCUSBIOPS(const, TensorConvolutionOp) +SYCLCONTRACTCONVCUSBIOPS(, TensorConvolutionOp) +SYCLCONTRACTCONVCUSBIOPS(const, TensorCustomBinaryOp) +SYCLCONTRACTCONVCUSBIOPS(, TensorCustomBinaryOp) +#undef SYCLCONTRACTCONVCUSBIOPS #define SYCLSLICEOPEXPR(CVQual)\ diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h 
b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h index 4a6322d44..5a6a8f4c5 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h @@ -148,6 +148,33 @@ SYCLFORCEDEVALEXTACC() #undef SYCLFORCEDEVALEXTACC +#define SYCLCUSTOMUNARYOPEXTACC(CVQual)\ +template <typename CustomUnaryFunc, typename XprType, typename Dev >\ +struct ExtractAccessor<TensorEvaluator<CVQual TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Dev> > {\ + static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Dev>& eval)\ + RETURN_CPP11(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval))\ +}; + + +SYCLCUSTOMUNARYOPEXTACC(const) +SYCLCUSTOMUNARYOPEXTACC() +#undef SYCLCUSTOMUNARYOPEXTACC + + +#define SYCLCUSTOMBINARYOPEXTACC(CVQual)\ +template <typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType , typename Dev>\ +struct ExtractAccessor<TensorEvaluator<CVQual TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType>, Dev> > {\ + static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType>, Dev>& eval)\ + RETURN_CPP11(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval))\ +}; + +SYCLCUSTOMBINARYOPEXTACC(const) +SYCLCUSTOMBINARYOPEXTACC() +#undef SYCLCUSTOMBIBARYOPEXTACC + + + + /// specialisation of the \ref ExtractAccessor struct when the node type is TensorEvalToOp #define SYCLEVALTOEXTACC(CVQual)\ template <typename Expr, typename Dev>\ diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h index 8828a0495..9fcac5ecb 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h @@ -33,14 +33,17 @@ 
namespace internal { /// re-instantiate them on the device. /// We have to pass instantiated functors to the device. // This struct is used for leafNode (TensorMap) and nodes behaving like leafNode (TensorForcedEval). +#define DEFALTACTION(Evaluator)\ +typedef typename Evaluator::Dimensions Dimensions;\ +const Dimensions m_dimensions;\ +EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }\ +FunctorExtractor(const Evaluator& expr): m_dimensions(expr.dimensions()) {} + template <typename Evaluator> struct FunctorExtractor{ - typedef typename Evaluator::Dimensions Dimensions; - const Dimensions m_dimensions; - EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - FunctorExtractor(const Evaluator& expr) - : m_dimensions(expr.dimensions()) {} + DEFALTACTION(Evaluator) }; + /// specialisation of the \ref FunctorExtractor struct when the node type does not require anything ///TensorConversionOp #define SYCLEXTRFUNCCONVERSION(ExprNode, CVQual)\ @@ -112,6 +115,36 @@ SYCLEXTRFUNCTERNARY(const) SYCLEXTRFUNCTERNARY() #undef SYCLEXTRFUNCTERNARY + + +//TensorCustomOp must be specialised otherwise it will be captured by UnaryCategory while its action is different +//from the UnaryCategory and it is similar to the general FunctorExtractor. 
+/// specialisation of TensorCustomOp +#define SYCLEXTRFUNCCUSTOMUNARYOP(CVQual)\ +template <typename CustomUnaryFunc, typename ArgType, typename Dev >\ +struct FunctorExtractor<TensorEvaluator<CVQual TensorCustomUnaryOp<CustomUnaryFunc, ArgType>, Dev> > {\ + typedef TensorEvaluator<CVQual TensorCustomUnaryOp<CustomUnaryFunc, ArgType>, Dev> Evaluator;\ + DEFALTACTION(Evaluator)\ +}; + +SYCLEXTRFUNCCUSTOMUNARYOP(const) +SYCLEXTRFUNCCUSTOMUNARYOP() +#undef SYCLEXTRFUNCCUSTOMUNARYOP + + +#define SYCLEXTRFUNCCUSTOMBINARYOP(CVQual)\ +template <typename CustomBinaryFunc, typename ArgType1, typename ArgType2, typename Dev >\ +struct FunctorExtractor<TensorEvaluator<CVQual TensorCustomBinaryOp<CustomBinaryFunc, ArgType1, ArgType2>, Dev> > {\ + typedef TensorEvaluator<CVQual TensorCustomBinaryOp<CustomBinaryFunc, ArgType1, ArgType2>, Dev> Evaluator;\ + DEFALTACTION(Evaluator)\ +}; + +SYCLEXTRFUNCCUSTOMBINARYOP(const) +SYCLEXTRFUNCCUSTOMBINARYOP() +#undef SYCLEXTRFUNCCUSTOMBINARYOP + + + /// specialisation of the \ref FunctorExtractor struct when the node type is /// TensorCwiseSelectOp. This is an specialisation without OP so it has to be separated. 
#define SYCLEXTRFUNCSELECTOP(CVQual)\ diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h index 2f7779036..12237bfab 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h @@ -21,11 +21,12 @@ namespace internal { template<typename CoeffReturnType, typename OP, typename OutputAccessor, typename InputAccessor, typename LocalAccessor> struct GenericKernelReducer{ OP op; OutputAccessor aOut; + ptrdiff_t out_offset; InputAccessor aI; LocalAccessor scratch; size_t length, local; - GenericKernelReducer(OP op_, OutputAccessor aOut_, InputAccessor aI_, LocalAccessor scratch_, size_t length_, size_t local_) - : op(op_), aOut(aOut_), aI(aI_), scratch(scratch_), length(length_), local(local_){} + GenericKernelReducer(OP op_, OutputAccessor aOut_, ptrdiff_t out_offset_, InputAccessor aI_, LocalAccessor scratch_, size_t length_, size_t local_) + : op(op_), aOut(aOut_), out_offset(out_offset_), aI(aI_), scratch(scratch_), length(length_), local(local_){} void operator()(cl::sycl::nd_item<1> itemID) { size_t globalid = itemID.get_global(0); size_t localid = itemID.get_local(0); @@ -59,7 +60,7 @@ namespace internal { aI[itemID.get_group(0)] = scratch[localid]; if((length<=local) && globalid ==0){ auto aOutPtr = ConvertToActualTypeSycl(CoeffReturnType, aOut); - aOutPtr[0]=scratch[0]; + aOutPtr[0 + ConvertToActualSyclOffset(CoeffReturnType, out_offset)]=scratch[0]; } } } @@ -72,8 +73,8 @@ template < typename HostExpr, typename FunctorExpr, typename Tuple_of_Acc, typen public: typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr; typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer> write_accessor; - ReductionFunctor(write_accessor output_accessor_, FunctorExpr functors_, Tuple_of_Acc tuple_of_accessors_,Dims dims_, Op 
functor_, Index range_, Index) - :output_accessor(output_accessor_), functors(functors_), tuple_of_accessors(tuple_of_accessors_), dims(dims_), functor(functor_), range(range_) {} + ReductionFunctor(write_accessor output_accessor_, ptrdiff_t out_offset_, FunctorExpr functors_, Tuple_of_Acc tuple_of_accessors_,Dims dims_, Op functor_, Index range_, Index) + :output_accessor(output_accessor_), out_offset(out_offset_), functors(functors_), tuple_of_accessors(tuple_of_accessors_), dims(dims_), functor(functor_), range(range_) {} void operator()(cl::sycl::nd_item<1> itemID) { typedef typename ConvertToDeviceExpression<const HostExpr>::Type DevExpr; @@ -93,11 +94,12 @@ template < typename HostExpr, typename FunctorExpr, typename Tuple_of_Acc, typen typename DeviceSelf::CoeffReturnType accum = functor.initialize(); Eigen::internal::GenericDimReducer<DeviceSelf::NumReducedDims-1, DeviceSelf, Op>::reduce(device_self_evaluator, device_self_evaluator.firstInput(static_cast<typename DevExpr::Index>(globalid)),const_cast<Op&>(functor), &accum); functor.finalize(accum); - output_accessor_ptr[globalid]= accum; + output_accessor_ptr[globalid + ConvertToActualSyclOffset(typename DeviceSelf::CoeffReturnType, out_offset)]= accum; } } private: write_accessor output_accessor; + ptrdiff_t out_offset; FunctorExpr functors; Tuple_of_Acc tuple_of_accessors; Dims dims; @@ -111,9 +113,9 @@ class ReductionFunctor<HostExpr, FunctorExpr, Tuple_of_Acc, Dims, Eigen::interna typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr; typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer> write_accessor; typedef Eigen::internal::SumReducer<typename HostExpr::CoeffReturnType> Op; - ReductionFunctor(write_accessor output_accessor_, FunctorExpr functors_, Tuple_of_Acc tuple_of_accessors_,Dims dims_, + ReductionFunctor(write_accessor output_accessor_, ptrdiff_t out_offset_, FunctorExpr functors_, 
Tuple_of_Acc tuple_of_accessors_,Dims dims_, Eigen::internal::MeanReducer<typename HostExpr::CoeffReturnType>, Index range_, Index num_values_to_reduce_) - :output_accessor(output_accessor_), functors(functors_), tuple_of_accessors(tuple_of_accessors_), dims(dims_), functor(Op()), range(range_), num_values_to_reduce(num_values_to_reduce_) {} + :output_accessor(output_accessor_), out_offset(out_offset_), functors(functors_), tuple_of_accessors(tuple_of_accessors_), dims(dims_), functor(Op()), range(range_), num_values_to_reduce(num_values_to_reduce_) {} void operator()(cl::sycl::nd_item<1> itemID) { typedef typename ConvertToDeviceExpression<const HostExpr>::Type DevExpr; @@ -133,11 +135,12 @@ class ReductionFunctor<HostExpr, FunctorExpr, Tuple_of_Acc, Dims, Eigen::interna typename DeviceSelf::CoeffReturnType accum = functor.initialize(); Eigen::internal::GenericDimReducer<DeviceSelf::NumReducedDims-1, DeviceSelf, Op>::reduce(device_self_evaluator, device_self_evaluator.firstInput(static_cast<typename DevExpr::Index>(globalid)),const_cast<Op&>(functor), &accum); functor.finalize(accum); - output_accessor_ptr[globalid]= accum/num_values_to_reduce; + output_accessor_ptr[globalid+ ConvertToActualSyclOffset(typename DeviceSelf::CoeffReturnType, out_offset)]= accum/num_values_to_reduce; } } private: write_accessor output_accessor; + ptrdiff_t out_offset; FunctorExpr functors; Tuple_of_Acc tuple_of_accessors; Dims dims; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h index 50f4595fc..330283b39 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h @@ -93,6 +93,26 @@ SYCLFORCEDEVALLEAFCOUNT(const) SYCLFORCEDEVALLEAFCOUNT() #undef SYCLFORCEDEVALLEAFCOUNT +#define SYCLCUSTOMUNARYOPLEAFCOUNT(CVQual)\ +template <typename CustomUnaryFunc, typename XprType>\ +struct LeafCount<CVQual TensorCustomUnaryOp<CustomUnaryFunc, 
XprType> > {\ +static const size_t Count =1;\ +}; + +SYCLCUSTOMUNARYOPLEAFCOUNT(const) +SYCLCUSTOMUNARYOPLEAFCOUNT() +#undef SYCLCUSTOMUNARYOPLEAFCOUNT + + +#define SYCLCUSTOMBINARYOPLEAFCOUNT(CVQual)\ +template <typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType>\ +struct LeafCount<CVQual TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType> > {\ +static const size_t Count =1;\ +}; +SYCLCUSTOMBINARYOPLEAFCOUNT( const) +SYCLCUSTOMBINARYOPLEAFCOUNT() +#undef SYCLCUSTOMBINARYOPLEAFCOUNT + /// specialisation of the \ref LeafCount struct when the node type is TensorEvalToOp #define EVALTOLAYOUTSWAPLEAFCOUNT(CVQual , ExprNode, Num)\ template <typename Expr>\ diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h index fcef0be04..99d528963 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h @@ -143,6 +143,33 @@ FORCEDEVAL(const) FORCEDEVAL() #undef FORCEDEVAL + +/// specialisation of the \ref PlaceHolderExpression when the node is +/// TensorCustomUnaryOp +#define CUSTOMUNARYOPEVAL(CVQual)\ +template <typename CustomUnaryFunc, typename XprType, size_t N>\ +struct PlaceHolderExpression<CVQual TensorCustomUnaryOp<CustomUnaryFunc, XprType>, N> {\ + typedef CVQual PlaceHolder<CVQual TensorCustomUnaryOp<CustomUnaryFunc, XprType>, N> Type;\ +}; + +CUSTOMUNARYOPEVAL(const) +CUSTOMUNARYOPEVAL() +#undef CUSTOMUNARYOPEVAL + + +/// specialisation of the \ref PlaceHolderExpression when the node is +/// TensorCustomBinaryOp +#define CUSTOMBINARYOPEVAL(CVQual)\ +template <typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType, size_t N>\ +struct PlaceHolderExpression<CVQual TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType>, N> {\ + typedef CVQual PlaceHolder<CVQual TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType>, N> Type;\ +}; + 
+CUSTOMBINARYOPEVAL(const) +CUSTOMBINARYOPEVAL() +#undef CUSTOMBINARYOPEVAL + + /// specialisation of the \ref PlaceHolderExpression when the node is /// TensorEvalToOp, TensorLayoutSwapOp #define EVALTOLAYOUTSWAP(CVQual, ExprNode)\ |