author    | Mehdi Goli <mehdi.goli@codeplay.com> | 2017-03-07 14:27:10 +0000
committer | Mehdi Goli <mehdi.goli@codeplay.com> | 2017-03-07 14:27:10 +0000
commit    | f84963ed95ff277bf3abb2e2517b3017a25ccf3f (patch)
tree      | b9616be8fe4f8048287a147d070288701457ea3c /unsupported/Eigen
parent    | 8296b87d7bd98c19c6064241880691f164790ede (diff)
Adding TensorIndexTuple and TensorTupleReducerOp backend (ArgMax/Min) for SYCL; fixing the address-space issue for const TensorMap; converting all discard_write to write due to a data mismatch.
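The discard_write-to-write conversion mentioned in the message is easiest to see in isolation. The sketch below is not from the patch; it is a minimal SYCL 1.2.x-style program (illustrative names only) showing why discard_write can lose data when one buffer is filled across several kernel launches: discard_write tells the runtime that previous contents may be dropped, so a kernel that writes only part of the buffer can clobber the rest, whereas write preserves untouched elements.

    #include <CL/sycl.hpp>
    #include <cstddef>
    #include <vector>

    int main() {
      std::vector<int> host(8, -1);
      {
        cl::sycl::queue q;
        cl::sycl::buffer<int, 1> buf(host.data(), cl::sycl::range<1>(8));
        for (std::size_t half = 0; half < 2; ++half) {
          q.submit([&](cl::sycl::handler& cgh) {
            // mode::write keeps the untouched half of the buffer intact between
            // the two launches; mode::discard_write would allow the runtime to
            // drop it, which is the data mismatch the commit message refers to.
            auto acc = buf.get_access<cl::sycl::access::mode::write>(cgh);
            cgh.parallel_for<class fill_half>(
                cl::sycl::range<1>(4),
                [=](cl::sycl::id<1> i) { acc[half * 4 + i[0]] = 1; });
          });
        }
      }  // buffer destructor copies the result back to `host`
      return 0;
    }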
Diffstat (limited to 'unsupported/Eigen')
17 files changed, 564 insertions, 226 deletions
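For orientation, this is roughly how the new SYCL argmax path is exercised from user code. The sketch is modeled on Eigen's SYCL tests of this period (cxx11_tensor_argmax_sycl); the selector and device plumbing are assumptions for illustration, not part of this diff.

    #define EIGEN_USE_SYCL
    #include <unsupported/Eigen/CXX11/Tensor>

    int main() {
      // Device setup as in the SYCL tests of this era; adjust the selector as needed.
      cl::sycl::gpu_selector selector;
      Eigen::QueueInterface queue_interface(selector);
      Eigen::SyclDevice sycl_device(&queue_interface);

      Eigen::Tensor<float, 2> in(4, 7);
      in.setRandom();
      Eigen::Tensor<Eigen::DenseIndex, 1> out(7);  // argmax along dimension 0

      float* gpu_in = static_cast<float*>(sycl_device.allocate(in.size() * sizeof(float)));
      Eigen::DenseIndex* gpu_out = static_cast<Eigen::DenseIndex*>(
          sycl_device.allocate(out.size() * sizeof(Eigen::DenseIndex)));
      Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_in_map(gpu_in, 4, 7);
      Eigen::TensorMap<Eigen::Tensor<Eigen::DenseIndex, 1> > gpu_out_map(gpu_out, 7);

      sycl_device.memcpyHostToDevice(gpu_in, in.data(), in.size() * sizeof(float));
      // argmax(0) builds a TensorTupleReducerOp; on the device it is evaluated
      // through the TensorTupleReducerDeviceOp / SyclKernelDevice path added here.
      gpu_out_map.device(sycl_device) = gpu_in_map.argmax(0);
      sycl_device.memcpyDeviceToHost(out.data(), gpu_out, out.size() * sizeof(Eigen::DenseIndex));
      sycl_device.synchronize();
      sycl_device.deallocate(gpu_in);
      sycl_device.deallocate(gpu_out);
      return 0;
    }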
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h
index d06f40cd8..e81001c6e 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h
@@ -119,6 +119,12 @@ struct TensorEvaluator<const TensorIndexTupleOp<ArgType>, Device>
   EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
 
+  // required by sycl
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const {
+    return m_impl;
+  }
+
+
 protected:
   TensorEvaluator<ArgType, Device> m_impl;
 };
@@ -222,7 +228,7 @@ struct TensorEvaluator<const TensorTupleReducerOp<ReduceOp, Dims, ArgType>, Devi
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
       : m_orig_impl(op.expression(), device),
         m_impl(op.expression().index_tuples().reduce(op.reduce_dims(), op.reduce_op()), device),
-        m_return_dim(op.return_dim()) {
+        m_return_dim(op.return_dim()), m_device(device) {
 
     gen_strides(m_orig_impl.dimensions(), m_strides);
     if (Layout == static_cast<int>(ColMajor)) {
@@ -252,7 +258,16 @@ struct TensorEvaluator<const TensorTupleReducerOp<ReduceOp, Dims, ArgType>, Devi
     return (m_return_dim < 0) ? v.first : (v.first % m_stride_mod) / m_stride_div;
   }
 
+#ifndef EIGEN_USE_SYCL
   EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+#else // following functions are required by sycl
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TupleType* data() const { return m_impl.data(); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int return_dim() const {return m_return_dim;}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const StrideDims& strides() const {return m_strides;}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& stride_mod() const {return m_stride_mod;}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& stride_div() const {return m_stride_div;}
+  const Device& device() const{return m_device;}
+#endif
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
@@ -292,6 +307,8 @@ struct TensorEvaluator<const TensorTupleReducerOp<ReduceOp, Dims, ArgType>, Devi
   StrideDims m_strides;
   Index m_stride_mod;
   Index m_stride_div;
+  // required by sycl
+  const Device& m_device;
 };
 
 } // end namespace Eigen
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h
new file mode 100644
index 000000000..90cbe004f
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h
@@ -0,0 +1,146 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+/*****************************************************************
+ * TensorArgMaxSycl.h
+ * \brief:
+ *  TensorArgMaxSycl
+ *
+*****************************************************************/
+
+#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_ARGMAX_SYCL_HPP
+#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_ARGMAX_SYCL_HPP
+namespace Eigen {
+namespace internal {
+  template<typename Dims, typename XprType>
+  struct eval<TensorTupleReducerDeviceOp<Dims, XprType>, Eigen::Dense>
+  {
+    typedef const TensorTupleReducerDeviceOp<Dims, XprType>& type;
+  };
+
+  template<typename Dims, typename XprType>
+  struct nested<TensorTupleReducerDeviceOp<Dims, XprType>, 1,
+    typename eval<TensorTupleReducerDeviceOp<Dims, XprType> >::type>
+  {
+    typedef TensorTupleReducerDeviceOp<Dims, XprType> type;
+  };
+
+template<typename StrideDims, typename XprType>
+struct traits<TensorTupleReducerDeviceOp<StrideDims, XprType> > : public traits<XprType>
+{
+  typedef traits<XprType> XprTraits;
+  typedef typename XprTraits::StorageKind StorageKind;
+  typedef typename XprTraits::Index Index;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::Nested Nested;
+  typedef typename remove_reference<Nested>::type _Nested;
+  static const int NumDimensions = XprTraits::NumDimensions;
+  static const int Layout = XprTraits::Layout;
+};
+
+
+}// end namespace internal
+template<typename StrideDims, typename XprType>
+class TensorTupleReducerDeviceOp : public TensorBase<TensorTupleReducerDeviceOp<StrideDims, XprType>, ReadOnlyAccessors>
+{
+  public:
+  typedef typename Eigen::internal::traits<TensorTupleReducerDeviceOp>::Scalar Scalar;
+  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+  typedef typename Eigen::internal::nested<TensorTupleReducerDeviceOp>::type Nested;
+  typedef typename Eigen::internal::traits<TensorTupleReducerDeviceOp>::StorageKind StorageKind;
+  typedef typename Eigen::internal::traits<TensorTupleReducerDeviceOp>::Index Index;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorTupleReducerDeviceOp(XprType expr,
+                                                                   const int return_dim,
+                                                                   const StrideDims& strides,
+                                                                   const Index& stride_mod, const Index& stride_div)
+    :m_xpr(expr), m_return_dim(return_dim), m_strides(strides), m_stride_mod(stride_mod), m_stride_div(stride_div) {}
+
+  EIGEN_DEVICE_FUNC
+  const typename internal::remove_all<typename XprType::Nested>::type&
+  expression() const { return m_xpr; }
+
+  EIGEN_DEVICE_FUNC
+  int return_dim() const { return m_return_dim; }
+
+  EIGEN_DEVICE_FUNC
+  const StrideDims& strides() const { return m_strides; }
+
+  EIGEN_DEVICE_FUNC
+  const Index& stride_mod() const { return m_stride_mod; }
+
+  EIGEN_DEVICE_FUNC
+  const Index& stride_div() const { return m_stride_div; }
+
+  protected:
+    typename Eigen::internal::remove_all<typename
+    XprType::Nested
+    >::type m_xpr;
+    const int m_return_dim;
+    const StrideDims& m_strides;
+    const Index m_stride_mod;
+    const Index m_stride_div;
+};
+
+
+// Eval as rvalue
+template<typename StrideDims, typename ArgType>
+struct TensorEvaluator<const TensorTupleReducerDeviceOp<StrideDims, ArgType>, SyclKernelDevice>
+{
+  typedef TensorTupleReducerDeviceOp<StrideDims, ArgType> XprType;
+  typedef typename XprType::Index Index;
+  typedef typename XprType::Index Scalar;
+  typedef Index CoeffReturnType;
+  typedef typename XprType::CoeffReturnType TupleType;
+  typedef typename TensorEvaluator<ArgType, SyclKernelDevice>::Dimensions Dimensions;
+
+  enum {
+    IsAligned = false,
+    PacketAccess = false,
+    BlockAccess = false,
+    Layout = TensorEvaluator<ArgType, SyclKernelDevice>::Layout,
+    CoordAccess = false,
+    RawAccess = false
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const SyclKernelDevice& device)
+    : m_impl(op.expression(), device), m_return_dim(op.return_dim()), m_strides(op.strides()),
+      m_stride_mod(op.stride_mod()), m_stride_div(op.stride_div()){}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const {
+    return m_impl.dimensions();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
+    m_impl.evalSubExprsIfNeeded(NULL);
+    return true;
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+    m_impl.cleanup();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
+    const TupleType v = m_impl.coeff(index);
+    return (m_return_dim < 0) ? v.first : (v.first % m_stride_mod) / m_stride_div;
+  }
+
+typedef typename MakeGlobalPointer<typename TensorEvaluator<ArgType , SyclKernelDevice>::CoeffReturnType >::Type ptr_Dev_type;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ptr_Dev_type data() const { return const_cast<ptr_Dev_type>(m_impl.data()); }
+
+protected:
+  TensorEvaluator<ArgType , SyclKernelDevice> m_impl;
+  const int m_return_dim;
+  const StrideDims& m_strides;
+  const Index& m_stride_mod;
+  const Index& m_stride_div;
+};
+} // end namespace Eigen
+#endif //UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_ARGMAX_SYCL_HPP
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h
index fcd7d4d00..5b4c3c5bd 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h
@@ -11,7 +11,7 @@
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 /*****************************************************************
- * TensorSyclConvertToDeviceExpression.h
+ * TensorContractionSycl.h
 *
 * \brief:
 *  TensorContractionsycl
@@ -389,9 +389,9 @@ template< typename Self, typename OutScalar, typename ContractT, typename LeftNo
       cl::sycl::range<2>(LocalThreadSizeM, LocalThreadSizeN)),
       KernelConstructor<HostExpr, OutScalar, LhsScalar, RhsScalar, LHSFunctorExpr, RHSFunctorExpr, LhsLocalAcc, RhsLocalAcc, OutAccessor, Index, ContractT, LeftNocontractT,
       RightNocontractT, lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, TileSizeDimM, TileSizeDimN, TileSizeDimK,
-      WorkLoadPerThreadM, WorkLoadPerThreadN, LocalThreadSizeM, LocalThreadSizeN, LoadPerThreadLhs, LoadPerThreadRhs, LHSTupleType, RHSTupleType, Eigen::DefaultDevice>(lhs_functors, rhs_functors,
+      WorkLoadPerThreadM, WorkLoadPerThreadN, LocalThreadSizeM, LocalThreadSizeN, LoadPerThreadLhs, LoadPerThreadRhs, LHSTupleType, RHSTupleType, Eigen::SyclKernelDevice>(lhs_functors, rhs_functors,
       localLhs, localRhs, out_res, out_offset, roundUpK, M, N, K, m_k_strides, m_left_contracting_strides, m_right_contracting_strides,m_i_strides, m_j_strides,
-      m_left_nocontract_strides,m_right_nocontract_strides, left_tuple_of_accessors, right_tuple_of_accessors, Eigen::DefaultDevice()));
+      m_left_nocontract_strides,m_right_nocontract_strides, left_tuple_of_accessors, right_tuple_of_accessors, Eigen::SyclKernelDevice()));
     });
     self.device().asynchronousExec();
   }
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h
index 66ffd819f..5db16d559 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h
@@ -45,7 +45,7 @@ EigenConvolutionKernel1D(internal::IndexMapper<Index, InputDims, 1, Eigen::inter
   void operator()(cl::sycl::nd_item<2> itemID) {
     typedef typename TensorSycl::internal::ConvertToDeviceExpression<HostExpr>::Type DevExpr;
     auto device_expr =TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
-    auto device_evaluator = Eigen::TensorEvaluator<DevExpr, Eigen::DefaultDevice>(device_expr.expr, Eigen::DefaultDevice());
+    auto device_evaluator = Eigen::TensorEvaluator<DevExpr, Eigen::SyclKernelDevice>(device_expr.expr, Eigen::SyclKernelDevice());
 
     auto buffer_ptr = ConvertToActualTypeSycl(CoeffReturnType, buffer_acc);
     auto kernel_ptr = ConvertToActualTypeSycl(KernelType, kernel_filter);
@@ -103,7 +103,7 @@ EigenConvolutionKernel2D(internal::IndexMapper<Index, InputDims, 2, Eigen::inter
   void operator()(cl::sycl::nd_item<3> itemID) {
     typedef typename TensorSycl::internal::ConvertToDeviceExpression<HostExpr>::Type DevExpr;
     auto device_expr =TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
-    auto device_evaluator = Eigen::TensorEvaluator<DevExpr, Eigen::DefaultDevice>(device_expr.expr, Eigen::DefaultDevice());
+    auto device_evaluator = Eigen::TensorEvaluator<DevExpr, Eigen::SyclKernelDevice>(device_expr.expr, Eigen::SyclKernelDevice());
 
     auto buffer_ptr = ConvertToActualTypeSycl(CoeffReturnType, buffer_acc);
     auto kernel_ptr = ConvertToActualTypeSycl(KernelType, kernel_filter);
@@ -173,7 +173,7 @@ EigenConvolutionKernel3D(internal::IndexMapper<Index, InputDims, 3, Eigen::inter
   void operator()(cl::sycl::nd_item<3> itemID) {
     typedef typename TensorSycl::internal::ConvertToDeviceExpression<HostExpr>::Type DevExpr;
     auto device_expr =TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
-    auto device_evaluator = Eigen::TensorEvaluator<DevExpr, Eigen::DefaultDevice>(device_expr.expr, Eigen::DefaultDevice());
+    auto device_evaluator = Eigen::TensorEvaluator<DevExpr, Eigen::SyclKernelDevice>(device_expr.expr, Eigen::SyclKernelDevice());
 
     auto buffer_ptr = ConvertToActualTypeSycl(CoeffReturnType, buffer_acc);
     auto kernel_ptr = ConvertToActualTypeSycl(KernelType, kernel_filter);
@@ -339,8 +339,8 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
     // create input tuple of accessors
     InputTupleType tuple_of_accessors = Eigen::TensorSycl::internal::createTupleOfAccessors<InputEvaluator>(cgh, m_inputImpl);
 
-    typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer> OutputAccessorType;
-    OutputAccessorType out_res= m_device. template get_sycl_accessor<cl::sycl::access::mode::discard_write>(cgh, data);
+    typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer> OutputAccessorType;
+    OutputAccessorType out_res= m_device. template get_sycl_accessor<cl::sycl::access::mode::write>(cgh, data);
 
     typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::read, cl::sycl::access::target::global_buffer> KernelAccessorType;
     KernelAccessorType kernel_acc= m_device. template get_sycl_accessor<cl::sycl::access::mode::read>(cgh, m_kernel);
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
index 964222a15..258218463 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
@@ -41,9 +41,8 @@ namespace Eigen {
   size_t m_i;
   size_t m_offset;
   };
-
+template<typename AccType>
 struct memsetkernelFunctor{
-  typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer> AccType;
   AccType m_acc;
   const ptrdiff_t buff_offset;
   const size_t m_rng, m_c;
@@ -55,15 +54,19 @@ namespace Eigen {
 
   };
 
+  //get_devices returns all the available opencl devices. Either use device_selector or exclude devices that computecpp does not support (AMD OpenCL for CPU and intel GPU)
 EIGEN_STRONG_INLINE auto get_sycl_supported_devices()->decltype(cl::sycl::device::get_devices()){
   auto devices = cl::sycl::device::get_devices();
   std::vector<cl::sycl::device>::iterator it =devices.begin();
   while(it!=devices.end()) {
-    /// get_devices returns all the available opencl devices. Either use device_selector or exclude devices that computecpp does not support (AMD OpenCL for CPU )
+    ///FIXME: Currently there is a bug in amd cpu OpenCL
     auto s= (*it).template get_info<cl::sycl::info::device::vendor>();
     std::transform(s.begin(), s.end(), s.begin(), ::tolower);
     if((*it).is_cpu() && s.find("amd")!=std::string::npos && s.find("apu") == std::string::npos){ // remove amd cpu as it is not supported by computecpp allow APUs
       it=devices.erase(it);
+    //FIXME: currently there is a bug in intel gpu driver regarding memory alignment issue.
+    }else if((*it).is_gpu() && s.find("intel")!=std::string::npos){
+      it=devices.erase(it);
     }
     else{
       ++it;
@@ -112,6 +115,154 @@ m_queue(cl::sycl::queue(s, [&](cl::sycl::exception_list l) {
 }))
 #endif
 {}
 
+//FIXME: currently we have to switch back to write as discard_write does not work in a for-loop
+template<typename Index> EIGEN_STRONG_INLINE void memcpyHostToDevice(Index *dst, const Index *src, size_t n) const {
+  std::lock_guard<std::mutex> lock(mutex_);
+  auto host_acc= find_buffer(dst)->second. template get_access<cl::sycl::access::mode::write, cl::sycl::access::target::host_buffer>();
+  ::memcpy(host_acc.get_pointer(), src, n);
+}
+
+template<typename Index> EIGEN_STRONG_INLINE void memcpyDeviceToHost(void *dst, const Index *src, size_t n) const {
+  std::lock_guard<std::mutex> lock(mutex_);
+  // Assuming that the dst is the start of the destination pointer
+  auto it =find_buffer(src);
+  auto offset =static_cast<const uint8_t*>(static_cast<const void*>(src))- it->first;
+  offset/=sizeof(Index);
+  size_t rng, GRange, tileSize;
+  parallel_for_setup(n/sizeof(Index), tileSize, rng, GRange);
+  auto dest_buf = cl::sycl::buffer<uint8_t, 1, cl::sycl::map_allocator<uint8_t> >(static_cast<uint8_t*>(dst), cl::sycl::range<1>(n));
+  m_queue.submit([&](cl::sycl::handler &cgh) {
+    auto src_acc= it->second.template get_access<cl::sycl::access::mode::read, cl::sycl::access::target::global_buffer>(cgh);
+    auto dst_acc =dest_buf.template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer>(cgh);
+    typedef decltype(src_acc) read_accessor;
+    typedef decltype(dst_acc) write_accessor;
+    cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor<Index, read_accessor, write_accessor>(src_acc, dst_acc, rng, 0, offset));
+  });
+  synchronize();
+}
+
+EIGEN_STRONG_INLINE void synchronize() const {
+  std::lock_guard<std::mutex> lock(mutex_);
+  m_queue.wait_and_throw(); //pass
+}
+EIGEN_STRONG_INLINE void asynchronousExec() const {
+  ///FIXME: currently there is a race condition regarding the async scheduler.
+  //sycl_queue().throw_asynchronous();// does not pass. Temporarily disabled
+  std::lock_guard<std::mutex> lock(mutex_);
+  m_queue.wait_and_throw(); //pass
+
+}
+
+template<typename Index>
+EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize, Index &rng, Index &GRange) const {
+  tileSize =static_cast<Index>(m_queue.get_device(). template get_info<cl::sycl::info::device::max_work_group_size>());
+  auto s= m_queue.get_device().template get_info<cl::sycl::info::device::vendor>();
+  std::transform(s.begin(), s.end(), s.begin(), ::tolower);
+  if(m_queue.get_device().is_cpu()){ // intel does not allow using the max workgroup size
+    tileSize=std::min(static_cast<Index>(256), static_cast<Index>(tileSize));
+  }
+  rng = n;
+  if (rng==0) rng=static_cast<Index>(1);
+  GRange=rng;
+  if (tileSize>GRange) tileSize=GRange;
+  else if(GRange>tileSize){
+    Index xMode = static_cast<Index>(GRange % tileSize);
+    if (xMode != 0) GRange += static_cast<Index>(tileSize - xMode);
+  }
+}
+
+/// This is used to prepare the number of threads and also the number of threads per block for sycl kernels
+template<typename Index>
+EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1, Index &tileSize0, Index &tileSize1, Index &rng0, Index &rng1, Index &GRange0, Index &GRange1) const {
+  Index max_workgroup_Size = static_cast<Index>(maxSyclThreadsPerBlock());
+  if(m_queue.get_device().is_cpu()){ // intel does not allow using the max workgroup size
+    max_workgroup_Size=std::min(static_cast<Index>(256), static_cast<Index>(max_workgroup_Size));
+  }
+  Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
+  tileSize1 =static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2/2)));
+  rng1=dim1;
+  if (rng1==0 ) rng1=static_cast<Index>(1);
+  GRange1=rng1;
+  if (tileSize1>GRange1) tileSize1=GRange1;
+  else if(GRange1>tileSize1){
+    Index xMode = static_cast<Index>(GRange1 % tileSize1);
+    if (xMode != 0) GRange1 += static_cast<Index>(tileSize1 - xMode);
+  }
+  tileSize0 = static_cast<Index>(max_workgroup_Size/tileSize1);
+  rng0 = dim0;
+  if (rng0==0 ) rng0=static_cast<Index>(1);
+  GRange0=rng0;
+  if (tileSize0>GRange0) tileSize0=GRange0;
+  else if(GRange0>tileSize0){
+    Index xMode = static_cast<Index>(GRange0 % tileSize0);
+    if (xMode != 0) GRange0 += static_cast<Index>(tileSize0 - xMode);
+  }
+}
+
+
+
+/// This is used to prepare the number of threads and also the number of threads per block for sycl kernels
+template<typename Index>
+EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1,Index dim2, Index &tileSize0, Index &tileSize1, Index &tileSize2, Index &rng0, Index &rng1, Index &rng2, Index &GRange0, Index &GRange1, Index &GRange2) const {
+  Index max_workgroup_Size = static_cast<Index>(maxSyclThreadsPerBlock());
+  if(m_queue.get_device().is_cpu()){ // intel does not allow using the max workgroup size
+    max_workgroup_Size=std::min(static_cast<Index>(256), static_cast<Index>(max_workgroup_Size));
+  }
+  Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
+  tileSize2 =static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2/3)));
+  rng2=dim2;
+  if (rng2==0 ) rng2=static_cast<Index>(1);
+  GRange2=rng2;
+  if (tileSize2>GRange2) tileSize2=GRange2;
+  else if(GRange2>tileSize2){
+    Index xMode = static_cast<Index>(GRange2 % tileSize2);
+    if (xMode != 0) GRange2 += static_cast<Index>(tileSize2 - xMode);
+  }
+  pow_of_2 = static_cast<Index>(std::log2(static_cast<Index>(max_workgroup_Size/tileSize2)));
+  tileSize1 =static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2/2)));
+  rng1=dim1;
+  if (rng1==0 ) rng1=static_cast<Index>(1);
+  GRange1=rng1;
+  if (tileSize1>GRange1) tileSize1=GRange1;
+  else if(GRange1>tileSize1){
+    Index xMode = static_cast<Index>(GRange1 % tileSize1);
+    if (xMode != 0) GRange1 += static_cast<Index>(tileSize1 - xMode);
+  }
+  tileSize0 = static_cast<Index>(max_workgroup_Size/(tileSize1*tileSize2));
+  rng0 = dim0;
+  if (rng0==0 ) rng0=static_cast<Index>(1);
+  GRange0=rng0;
+  if (tileSize0>GRange0) tileSize0=GRange0;
+  else if(GRange0>tileSize0){
+    Index xMode = static_cast<Index>(GRange0 % tileSize0);
+    if (xMode != 0) GRange0 += static_cast<Index>(tileSize0 - xMode);
+  }
+}
+
+
+EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const {
+  std::lock_guard<std::mutex> lock(mutex_);
+  return m_queue.get_device(). template get_info<cl::sycl::info::device::max_compute_units>();
+// return stream_->deviceProperties().multiProcessorCount;
+}
+EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const {
+  std::lock_guard<std::mutex> lock(mutex_);
+  return m_queue.get_device(). template get_info<cl::sycl::info::device::max_work_group_size>();
+
+// return stream_->deviceProperties().maxThreadsPerBlock;
+}
+EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const {
+  std::lock_guard<std::mutex> lock(mutex_);
+  // OpenCL does not have such a concept
+  return 2;//sycl_queue().get_device(). template get_info<cl::sycl::info::device::max_work_group_size>();
+// return stream_->deviceProperties().maxThreadsPerMultiProcessor;
+}
+EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const {
+  std::lock_guard<std::mutex> lock(mutex_);
+  return m_queue.get_device(). template get_info<cl::sycl::info::device::local_mem_size>();
+// return stream_->deviceProperties().sharedMemPerBlock;
+}
 
 /// Allocating device pointer. This pointer is actually an 8 bytes host pointer used as key to access the sycl device buffer.
 /// The reason is that we cannot use device buffer as a pointer as a m_data in Eigen leafNode expressions. So we create a key
 /// use this pointer as a key in our buffer_map and we make sure that we dedicate only one buffer only for this pointer.
 /// The device pointer would be deleted by calling deallocate function.
 EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
+  std::lock_guard<std::mutex> lock(mutex_);
   auto buf = cl::sycl::buffer<uint8_t,1>(cl::sycl::range<1>(num_bytes));
   auto ptr =buf.get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::host_buffer>().get_pointer();
   buf.set_final_data(nullptr);
-  std::lock_guard<std::mutex> lock(mutex_);
   buffer_map.insert(std::pair<const uint8_t *, cl::sycl::buffer<uint8_t, 1>>(static_cast<const uint8_t*>(ptr),buf));
   return static_cast<void*>(ptr);
 }
@@ -193,48 +344,13 @@ struct SyclDevice {
 /// This is used to prepare the number of threads and also the number of threads per block for sycl kernels
 template<typename Index>
 EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize, Index &rng, Index &GRange) const {
-  tileSize =static_cast<Index>(sycl_queue().get_device(). template get_info<cl::sycl::info::device::max_work_group_size>());
-  auto s= sycl_queue().get_device().template get_info<cl::sycl::info::device::vendor>();
-  std::transform(s.begin(), s.end(), s.begin(), ::tolower);
-  if(sycl_queue().get_device().is_cpu()){ // intel doesnot allow to use max workgroup size
-    tileSize=std::min(static_cast<Index>(256), static_cast<Index>(tileSize));
-  }
-  rng = n;
-  if (rng==0) rng=static_cast<Index>(1);
-  GRange=rng;
-  if (tileSize>GRange) tileSize=GRange;
-  else if(GRange>tileSize){
-    Index xMode = static_cast<Index>(GRange % tileSize);
-    if (xMode != 0) GRange += static_cast<Index>(tileSize - xMode);
-  }
+  m_queue_stream->parallel_for_setup(n, tileSize, rng, GRange);
 }
 
 /// This is used to prepare the number of threads and also the number of threads per block for sycl kernels
 template<typename Index>
 EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1, Index &tileSize0, Index &tileSize1, Index &rng0, Index &rng1, Index &GRange0, Index &GRange1) const {
-  Index max_workgroup_Size = static_cast<Index>(maxSyclThreadsPerBlock());
-  if(sycl_queue().get_device().is_cpu()){ // intel doesnot allow to use max workgroup size
-    max_workgroup_Size=std::min(static_cast<Index>(256), static_cast<Index>(max_workgroup_Size));
-  }
-  Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
-  tileSize1 =static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2/2)));
-  rng1=dim1;
-  if (rng1==0 ) rng1=static_cast<Index>(1);
-  GRange1=rng1;
-  if (tileSize1>GRange1) tileSize1=GRange1;
-  else if(GRange1>tileSize1){
-    Index xMode = static_cast<Index>(GRange1 % tileSize1);
-    if (xMode != 0) GRange1 += static_cast<Index>(tileSize1 - xMode);
-  }
-  tileSize0 = static_cast<Index>(max_workgroup_Size/tileSize1);
-  rng0 = dim0;
-  if (rng0==0 ) rng0=static_cast<Index>(1);
-  GRange0=rng0;
-  if (tileSize0>GRange0) tileSize0=GRange0;
-  else if(GRange0>tileSize0){
-    Index xMode = static_cast<Index>(GRange0 % tileSize0);
-    if (xMode != 0) GRange0 += static_cast<Index>(tileSize0 - xMode);
-  }
+  m_queue_stream->parallel_for_setup(dim0, dim1, tileSize0, tileSize1, rng0, rng1, GRange0, GRange1);
 }
 
@@ -242,39 +358,8 @@ struct SyclDevice {
 /// This is used to prepare the number of threads and also the number of threads per block for sycl kernels
 template<typename Index>
 EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1,Index dim2, Index &tileSize0, Index &tileSize1, Index &tileSize2, Index &rng0, Index &rng1, Index &rng2, Index &GRange0, Index &GRange1, Index &GRange2) const {
-  Index max_workgroup_Size = static_cast<Index>(maxSyclThreadsPerBlock());
-  if(sycl_queue().get_device().is_cpu()){ // intel doesnot allow to use max workgroup size
-    max_workgroup_Size=std::min(static_cast<Index>(256), static_cast<Index>(max_workgroup_Size));
-  }
-  Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
-  tileSize2 =static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2/3)));
-  rng2=dim2;
-  if (rng2==0 ) rng1=static_cast<Index>(1);
-  GRange2=rng2;
-  if (tileSize2>GRange2) tileSize2=GRange2;
-  else if(GRange2>tileSize2){
-    Index xMode = static_cast<Index>(GRange2 % tileSize2);
-    if (xMode != 0) GRange2 += static_cast<Index>(tileSize2 - xMode);
-  }
-  pow_of_2 = static_cast<Index>(std::log2(static_cast<Index>(max_workgroup_Size/tileSize2)));
-  tileSize1 =static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2/2)));
-  rng1=dim1;
-  if (rng1==0 ) rng1=static_cast<Index>(1);
-  GRange1=rng1;
-  if (tileSize1>GRange1) tileSize1=GRange1;
-  else if(GRange1>tileSize1){
-    Index xMode = static_cast<Index>(GRange1 % tileSize1);
-    if (xMode != 0) GRange1 += static_cast<Index>(tileSize1 - xMode);
-  }
-  tileSize0 = static_cast<Index>(max_workgroup_Size/(tileSize1*tileSize2));
-  rng0 = dim0;
-  if (rng0==0 ) rng0=static_cast<Index>(1);
-  GRange0=rng0;
-  if (tileSize0>GRange0) tileSize0=GRange0;
-  else if(GRange0>tileSize0){
-    Index xMode = static_cast<Index>(GRange0 % tileSize0);
-    if (xMode != 0) GRange0 += static_cast<Index>(tileSize0 - xMode);
-  }
+  m_queue_stream->parallel_for_setup(dim0, dim1, dim2, tileSize0, tileSize1, tileSize2, rng0, rng1, rng2, GRange0, GRange1, GRange2);
+
 }
 
 /// allocate device memory
 EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const {
@@ -319,8 +404,7 @@ struct SyclDevice {
 /// buffer to host. Then we use the memcpy to copy the data to the host accessor. The first time that
 /// this buffer is accessed, the data will be copied to the device.
 template<typename Index> EIGEN_STRONG_INLINE void memcpyHostToDevice(Index *dst, const Index *src, size_t n) const {
-  auto host_acc= get_sycl_buffer(dst). template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::host_buffer>();
-  ::memcpy(host_acc.get_pointer(), src, n);
+  m_queue_stream->memcpyHostToDevice(dst,src,n);
 }
 /// The memcpyDeviceToHost is used to copy the data from host to device. Here, in order to avoid double copying the data. We create a sycl
 /// buffer with map_allocator for the destination pointer with a discard_write accessor on it. The lifespan of the buffer is bound to the
 /// would be available on the dst pointer using fast copy technique (map_allocator). In this case we can make sure that we copy the data back
 /// to the cpu only once per function call.
 template<typename Index> EIGEN_STRONG_INLINE void memcpyDeviceToHost(void *dst, const Index *src, size_t n) const {
-  auto it = m_queue_stream->find_buffer(src);
-  auto offset =static_cast<const uint8_t*>(static_cast<const void*>(src))- it->first;
-  offset/=sizeof(Index);
-  size_t rng, GRange, tileSize;
-  parallel_for_setup(n/sizeof(Index), tileSize, rng, GRange);
-  // Assuming that the dst is the start of the destination pointer
-  auto dest_buf = cl::sycl::buffer<uint8_t, 1, cl::sycl::map_allocator<uint8_t> >(static_cast<uint8_t*>(dst), cl::sycl::range<1>(n));
-  sycl_queue().submit([&](cl::sycl::handler &cgh) {
-    auto src_acc= it->second.template get_access<cl::sycl::access::mode::read, cl::sycl::access::target::global_buffer>(cgh);
-    auto dst_acc =dest_buf.template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer>(cgh);
-    typedef decltype(src_acc) read_accessor;
-    typedef decltype(dst_acc) write_accessor;
-    cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor<Index, read_accessor, write_accessor>(src_acc, dst_acc, rng, 0, offset));
-  });
-  synchronize();
+  m_queue_stream->memcpyDeviceToHost(dst,src,n);
 }
 /// returning the sycl queue
 EIGEN_STRONG_INLINE cl::sycl::queue& sycl_queue() const { return m_queue_stream->m_queue;}
@@ -366,8 +436,9 @@ struct SyclDevice {
   :m_buf(buff), buff_offset(buff_offset_), rng(rng_), GRange(GRange_), tileSize(tileSize_), c(c_){}
   void operator()(cl::sycl::handler &cgh) const {
-    auto buf_acc = m_buf.template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer>(cgh);
-    cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), memsetkernelFunctor(buf_acc, buff_offset, rng, c));
+    auto buf_acc = m_buf.template get_access<cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer>(cgh);
+    typedef decltype(buf_acc) AccType;
+    cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), memsetkernelFunctor<AccType>(buf_acc, buff_offset, rng, c));
   }
 };
@@ -403,14 +474,13 @@ struct SyclDevice {
 EIGEN_STRONG_INLINE int majorDeviceVersion() const { return 1; }
 
 EIGEN_STRONG_INLINE void synchronize() const {
-  sycl_queue().wait_and_throw(); //pass
+  m_queue_stream->synchronize(); //pass
 }
 
 EIGEN_STRONG_INLINE void asynchronousExec() const {
   ///FIXEDME:: currently there is a race condition regarding the asynch scheduler.
   //sycl_queue().throw_asynchronous();// does not pass. Temporarily disabled
-  sycl_queue().wait_and_throw(); //pass
-
+  m_queue_stream->asynchronousExec();
 }
 // This function checks if the runtime recorded an error for the
 // underlying stream device.
@@ -418,8 +488,10 @@ struct SyclDevice {
   return m_queue_stream->ok();
 }
 };
-
-
+// This is used as a distinguishable device inside the kernel as the sycl device class is not Standard layout.
+// This is internal and must not be used by users. This dummy device allows us to specialise the tensor evaluator
+// inside the kernel. So we can have two types of eval for host and device. This is required for the TensorArgMax operation.
+struct SyclKernelDevice:DefaultDevice{};
 } // end namespace Eigen
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
index d6415817b..8516b37b3 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
@@ -193,7 +193,12 @@ struct TensorEvaluator<const Derived, Device>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
     eigen_assert(m_data);
+#ifndef __SYCL_DEVICE_ONLY__
     return loadConstant(m_data+index);
+#else
+    CoeffReturnType tmp = m_data[index];
+    return tmp;
+#endif
   }
 
   template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
index b5ef31d55..77c9c6c6e 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
@@ -124,7 +124,9 @@ template <typename U, typename V> struct Tuple {
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   Tuple& operator= (const Tuple& rhs) {
+  #ifndef __SYCL_DEVICE_ONLY__
     if (&rhs == this) return *this;
+  #endif
     first = rhs.first;
     second = rhs.second;
     return *this;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
index c9c7acfdc..94899252b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
@@ -35,7 +35,7 @@ static void run(OP op, BufferTOut& bufOut, ptrdiff_t out_offset, BufferTIn& bufI
   /* Two accessors are used: one to the buffer that is being reduced,
    * and a second to local memory, used to store intermediate data. */
   auto aI =bufI.template get_access<cl::sycl::access::mode::read_write>(h);
-  auto aOut =bufOut.template get_access<cl::sycl::access::mode::discard_write>(h);
+  auto aOut =bufOut.template get_access<cl::sycl::access::mode::write>(h);
   typedef decltype(aI) InputAccessor;
   typedef decltype(aOut) OutputAccessor;
   typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write,cl::sycl::access::target::local> LocalAccessor;
@@ -158,7 +158,7 @@ struct InnerReducer<Self, Op, const Eigen::SyclDevice> {
       typedef decltype(TensorSycl::internal::createTupleOfAccessors(cgh, self.impl())) Tuple_of_Acc;
       // create a tuple of accessors from Evaluator
       Tuple_of_Acc tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl());
-      auto output_accessor = dev.template get_sycl_accessor<cl::sycl::access::mode::discard_write>(cgh, output);
+      auto output_accessor = dev.template get_sycl_accessor<cl::sycl::access::mode::write>(cgh, output);
       ptrdiff_t out_offset = dev.get_offset(output);
       Index red_size = (num_values_to_reduce!=0)? num_values_to_reduce : static_cast<Index>(1);
       cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)),
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h
index 9d5a6d4c1..3d6270614 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h
@@ -32,6 +32,8 @@ struct MakeLocalPointer {
 
 namespace Eigen {
+  template<typename StrideDims, typename XprType> class TensorTupleReducerDeviceOp;
+  template<typename StrideDims, typename ArgType> struct TensorEvaluator<const TensorTupleReducerDeviceOp<StrideDims, ArgType>, SyclKernelDevice>;
 namespace TensorSycl {
 namespace internal {
@@ -48,6 +50,13 @@ template<typename T> struct GetType<false, T>{
   typedef T Type;
 };
 
+template <bool Conds, size_t X , size_t Y > struct ValueCondition {
+  static const size_t Res =X;
+};
+template<size_t X, size_t Y> struct ValueCondition<false, X , Y> {
+  static const size_t Res =Y;
+};
+
 }
 }
 }
@@ -80,6 +89,9 @@ template<typename T> struct GetType<false, T>{
 /// this is used for extracting tensor reduction
 #include "TensorReductionSycl.h"
 
+// TensorArgMaxSycl.h
+#include "TensorArgMaxSycl.h"
+
 /// this is used for extracting tensor convolution
 #include "TensorConvolutionSycl.h"
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h
index 9476c0ea8..d6ac7b91f 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h
@@ -103,7 +103,7 @@ KERNELBROKERCONVERT(, false, TensorEvalToOp)
 #undef KERNELBROKERCONVERT
 
 /// specialisation of the \ref ConvertToDeviceExpression struct when the node types are TensorForcedEvalOp and TensorLayoutSwapOp
-#define KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAP(CVQual, ExprNode)\
+#define KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP(CVQual, ExprNode)\
 template <typename Expr>\
 struct ConvertToDeviceExpression<CVQual ExprNode<Expr> > {\
   typedef CVQual ExprNode< typename ConvertToDeviceExpression<Expr>::Type> Type;\
@@ -111,15 +111,17 @@ struct ConvertToDeviceExpression<CVQual ExprNode<Expr> > {\
 
 // TensorForcedEvalOp
-KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAP(const,TensorForcedEvalOp)
-KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAP(,TensorForcedEvalOp)
+KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP(const,TensorForcedEvalOp)
+KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP(,TensorForcedEvalOp)
 // TensorLayoutSwapOp
-KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAP(const,TensorLayoutSwapOp)
-KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAP(,TensorLayoutSwapOp)
-#undef KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAP
-
+KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP(const,TensorLayoutSwapOp)
+KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP(,TensorLayoutSwapOp)
+//TensorIndexTupleOp
+KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP(const,TensorIndexTupleOp)
+KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP(,TensorIndexTupleOp)
+#undef KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP
 
 /// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorReductionOp
 #define KERNELBROKERCONVERTREDUCTION(CVQual)\
@@ -132,6 +134,18 @@ KERNELBROKERCONVERTREDUCTION(const)
 KERNELBROKERCONVERTREDUCTION()
 #undef KERNELBROKERCONVERTREDUCTION
 
+/// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorTupleReducerOp
+#define KERNELBROKERCONVERTTUPLEREDUCTION(CVQual)\
+template <typename OP, typename Dim, typename subExpr>\
+struct ConvertToDeviceExpression<CVQual TensorTupleReducerOp<OP, Dim, subExpr> > {\
+  typedef CVQual TensorTupleReducerOp<OP, Dim, typename ConvertToDeviceExpression<subExpr>::Type> Type;\
+};
+
+KERNELBROKERCONVERTTUPLEREDUCTION(const)
+KERNELBROKERCONVERTTUPLEREDUCTION()
+#undef KERNELBROKERCONVERTTUPLEREDUCTION
+
+//TensorSlicingOp
 #define KERNELBROKERCONVERTSLICEOP(CVQual)\
 template<typename StartIndices, typename Sizes, typename XprType>\
 struct ConvertToDeviceExpression<CVQual TensorSlicingOp <StartIndices, Sizes, XprType> >{\
@@ -142,7 +156,7 @@ KERNELBROKERCONVERTSLICEOP(const)
 KERNELBROKERCONVERTSLICEOP()
 #undef KERNELBROKERCONVERTSLICEOP
 
-
+//TensorStridingSlicingOp
 #define KERNELBROKERCONVERTERSLICESTRIDEOP(CVQual)\
 template<typename StartIndices, typename StopIndices, typename Strides, typename XprType>\
 struct ConvertToDeviceExpression<CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType> >{\
@@ -153,7 +167,6 @@ KERNELBROKERCONVERTERSLICESTRIDEOP(const)
 KERNELBROKERCONVERTERSLICESTRIDEOP()
 #undef KERNELBROKERCONVERTERSLICESTRIDEOP
 
-
 /// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorChippingOp
 #define KERNELBROKERCONVERTCHIPPINGOP(CVQual)\
 template <DenseIndex DimId, typename Expr>\
@@ -164,9 +177,6 @@ KERNELBROKERCONVERTCHIPPINGOP(const)
 KERNELBROKERCONVERTCHIPPINGOP()
 #undef KERNELBROKERCONVERTCHIPPINGOP
 
-
-
 /// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorImagePatchOp
 #define KERNELBROKERCONVERTIMAGEPATCHOP(CVQual)\
 template<DenseIndex Rows, DenseIndex Cols, typename XprType>\
@@ -188,8 +198,6 @@ KERNELBROKERCONVERTVOLUMEPATCHOP(const)
 KERNELBROKERCONVERTVOLUMEPATCHOP()
 #undef KERNELBROKERCONVERTVOLUMEPATCHOP
 
-
-
 } // namespace internal
 } // namespace TensorSycl
 } // namespace Eigen
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h
index af4eb5f13..24cc23f45 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h
@@ -65,7 +65,6 @@ CVQual PlaceHolder<CVQual TensorMap<T, Options_, MakePointer_>, N>, Params...>{\
   : expr(Type(ConvertToActualTypeSycl(typename Type::Scalar, utility::tuple::get<N>(t)), fd.dimensions())){}\
 };
 
-
 TENSORMAP(const)
 TENSORMAP()
 #undef TENSORMAP
@@ -83,6 +82,7 @@ CVQual PlaceHolder<CVQual TensorMap<TensorFixedSize<Scalar_, Dimensions_, Option
   ExprConstructor(FuncDetector &, const utility::tuple::Tuple<Params...> &t)\
   : expr(DeviceFixedSizeTensor<Type,Dimensions_>::instantiate(utility::tuple::get<N>(t))){}\
 };
+
 TENSORMAPFIXEDSIZE(const)
 TENSORMAPFIXEDSIZE()
 #undef TENSORMAPFIXEDSIZE
@@ -189,9 +189,6 @@ struct ExprConstructor<CVQual TensorAssignOp<OrigLHSExpr, OrigRHSExpr>, CVQual
 ASSIGN()
 #undef ASSIGN
 
-
-
-
 /// specialisation of the \ref ExprConstructor struct when the node type is
 /// const TensorAssignOp
 #define CONVERSIONEXPRCONST(CVQual)\
@@ -252,8 +249,6 @@ FORCEDEVAL(const)
 FORCEDEVAL()
 #undef FORCEDEVAL
 
-
-
 #define TENSORCUSTOMUNARYOP(CVQual)\
 template <typename CustomUnaryFunc, typename OrigExpr, typename DevExpr, size_t N, typename... Params>\
 struct ExprConstructor<CVQual TensorCustomUnaryOp<CustomUnaryFunc, OrigExpr>,\
@@ -274,13 +269,6 @@ TENSORCUSTOMUNARYOP(const)
 TENSORCUSTOMUNARYOP()
 #undef TENSORCUSTOMUNARYOP
 
-template <bool Conds, size_t X , size_t Y > struct ValueCondition {
-  static const size_t Res =X;
-};
-template<size_t X, size_t Y> struct ValueCondition<false, X , Y> {
-  static const size_t Res =Y;
-};
-
 /// specialisation of the \ref ExprConstructor struct when the node type is TensorReductionOp
 #define SYCLREDUCTIONEXPR(CVQual)\
 template <typename OP, typename Dim, typename OrigExpr, typename DevExpr, size_t N, typename... Params>\
@@ -299,6 +287,35 @@ SYCLREDUCTIONEXPR(const)
 SYCLREDUCTIONEXPR()
 #undef SYCLREDUCTIONEXPR
 
+/// specialisation of the \ref ExprConstructor struct when the node type is TensorTupleReducerOp
+/// Use the reduction op instead of the TensorTupleReducerOp in order to build the tensor map, because the TensorMap is the output of the TensorReductionOp.
+#define SYCLTUPLEREDUCTIONEXPR(CVQual)\
+template <typename OP, typename Dim, typename OrigExpr, typename DevExpr, size_t N, typename... Params>\
+struct ExprConstructor<CVQual TensorTupleReducerOp<OP, Dim, OrigExpr>,\
+CVQual PlaceHolder<CVQual TensorTupleReducerOp<OP, Dim, DevExpr>, N>, Params...> {\
+  static const auto NumRedDims= TensorReductionOp<OP, Dim, const TensorIndexTupleOp<OrigExpr> , MakeGlobalPointer>::NumDimensions;\
+  static const auto NumIndices= ValueCondition<NumRedDims==0, 1, NumRedDims>::Res;\
+static const int Layout =static_cast<int>(Eigen::internal::traits<TensorReductionOp<OP, Dim, const TensorIndexTupleOp<OrigExpr>, MakeGlobalPointer>>::Layout);\
+  typedef CVQual TensorMap<\
+    Tensor<typename TensorIndexTupleOp<OrigExpr>::CoeffReturnType,NumIndices, Layout, typename TensorTupleReducerOp<OP, Dim, OrigExpr>::Index>,\
+    Layout,\
+    MakeGlobalPointer\
+  > XprType;\
+  typedef typename TensorEvaluator<const TensorIndexTupleOp<OrigExpr> , SyclKernelDevice>::Dimensions InputDimensions;\
+  static const int NumDims = Eigen::internal::array_size<InputDimensions>::value;\
+  typedef array<Index, NumDims> StrideDims;\
+  typedef const TensorTupleReducerDeviceOp<StrideDims, XprType> Type;\
+  Type expr;\
+  template <typename FuncDetector>\
+  ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple<Params...> &t)\
+  :expr(Type(XprType(ConvertToActualTypeSycl(typename XprType::CoeffReturnType, utility::tuple::get<N>(t)), fd.dimensions()),\
+   fd.return_dim(), fd.strides(), fd.stride_mod(), fd.stride_div())) {\
+  }\
+};
+
+SYCLTUPLEREDUCTIONEXPR(const)
+SYCLTUPLEREDUCTIONEXPR()
+#undef SYCLTUPLEREDUCTIONEXPR
 
 /// specialisation of the \ref ExprConstructor struct when the node type is
 /// TensorContractionOp, TensorConvolutionOp TensorCustomBinaryOp
@@ -319,15 +336,18 @@ CVQual PlaceHolder<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, N>, Params
   :expr(Type(ConvertToActualTypeSycl(typename Type::Scalar, utility::tuple::get<N>(t)), fd.dimensions())) {}\
 };
 
+//TensorContractionOp
 SYCLCONTRACTCONVCUSBIOPS(const, TensorContractionOp)
 SYCLCONTRACTCONVCUSBIOPS(, TensorContractionOp)
+//TensorConvolutionOp
 SYCLCONTRACTCONVCUSBIOPS(const, TensorConvolutionOp)
 SYCLCONTRACTCONVCUSBIOPS(, TensorConvolutionOp)
+//TensorCustomBinaryOp
 SYCLCONTRACTCONVCUSBIOPS(const, TensorCustomBinaryOp)
 SYCLCONTRACTCONVCUSBIOPS(, TensorCustomBinaryOp)
 #undef SYCLCONTRACTCONVCUSBIOPS
 
-
+//TensorSlicingOp
 #define SYCLSLICEOPEXPR(CVQual)\
 template<typename StartIndices, typename Sizes, typename OrigXprType, typename XprType, typename... Params>\
 struct ExprConstructor<CVQual TensorSlicingOp <StartIndices, Sizes, OrigXprType> , CVQual TensorSlicingOp<StartIndices, Sizes, XprType>, Params... >{\
@@ -344,7 +364,7 @@ SYCLSLICEOPEXPR(const)
 SYCLSLICEOPEXPR()
 #undef SYCLSLICEOPEXPR
 
-
+//TensorStridingSlicingOp
 #define SYCLSLICESTRIDEOPEXPR(CVQual)\
 template<typename StartIndices, typename StopIndices, typename Strides, typename OrigXprType, typename XprType, typename... Params>\
 struct ExprConstructor<CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, OrigXprType>, CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>, Params... >{\
@@ -361,6 +381,7 @@ SYCLSLICESTRIDEOPEXPR(const)
 SYCLSLICESTRIDEOPEXPR()
 #undef SYCLSLICESTRIDEOPEXPR
 
+//TensorReshapingOp and TensorShufflingOp
 #define SYCLRESHAPEANDSHUFFLEOPEXPRCONST(OPEXPR, CVQual)\
 template<typename Param, typename OrigXprType, typename XprType, typename... Params>\
 struct ExprConstructor<CVQual OPEXPR <Param, OrigXprType> , CVQual OPEXPR <Param, XprType>, Params... >{\
@@ -373,13 +394,15 @@ struct ExprConstructor<CVQual OPEXPR <Param, OrigXprType> , CVQual OPEXPR <Param
   : xprExpr(funcD.xprExpr, t), expr(xprExpr.expr, funcD.param()) {}\
 };
 
+// TensorReshapingOp
 SYCLRESHAPEANDSHUFFLEOPEXPRCONST(TensorReshapingOp, const)
 SYCLRESHAPEANDSHUFFLEOPEXPRCONST(TensorReshapingOp, )
-
+// TensorShufflingOp
 SYCLRESHAPEANDSHUFFLEOPEXPRCONST(TensorShufflingOp, const)
 SYCLRESHAPEANDSHUFFLEOPEXPRCONST(TensorShufflingOp, )
 #undef SYCLRESHAPEANDSHUFFLEOPEXPRCONST
 
+//TensorPaddingOp
 #define SYCLPADDINGOPEXPRCONST(OPEXPR, CVQual)\
 template<typename Param, typename OrigXprType, typename XprType, typename... Params>\
 struct ExprConstructor<CVQual OPEXPR <Param, OrigXprType> , CVQual OPEXPR <Param, XprType>, Params... >{\
@@ -392,11 +415,11 @@ struct ExprConstructor<CVQual OPEXPR <Param, OrigXprType> , CVQual OPEXPR <Param
   : xprExpr(funcD.xprExpr, t), expr(xprExpr.expr, funcD.param() , funcD.scalar_param()) {}\
 };
 
+//TensorPaddingOp
 SYCLPADDINGOPEXPRCONST(TensorPaddingOp, const)
 SYCLPADDINGOPEXPRCONST(TensorPaddingOp, )
 #undef SYCLPADDINGOPEXPRCONST
 
-
 // TensorChippingOp
 #define SYCLTENSORCHIPPINGOPEXPR(CVQual)\
 template<DenseIndex DimId, typename OrigXprType, typename XprType, typename... Params>\
@@ -454,14 +477,12 @@ SYCLTENSORVOLUMEPATCHOPEXPR(const)
 SYCLTENSORVOLUMEPATCHOPEXPR()
 #undef SYCLTENSORVOLUMEPATCHOPEXPR
 
-
-
-// TensorLayoutSwapOp
-#define SYCLTENSORLAYOUTSWAPOPEXPR(CVQual)\
+// TensorLayoutSwapOp and TensorIndexTupleOp
+#define SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXPR(CVQual, ExprNode)\
 template<typename OrigXprType, typename XprType, typename... Params>\
-struct ExprConstructor<CVQual TensorLayoutSwapOp <OrigXprType> , CVQual TensorLayoutSwapOp<XprType>, Params... >{\
+struct ExprConstructor<CVQual ExprNode <OrigXprType> , CVQual ExprNode<XprType>, Params... >{\
   typedef ExprConstructor<OrigXprType, XprType, Params...> my_xpr_type;\
-  typedef CVQual TensorLayoutSwapOp<typename my_xpr_type::Type> Type;\
+  typedef CVQual ExprNode<typename my_xpr_type::Type> Type;\
   my_xpr_type xprExpr;\
   Type expr;\
   template <typename FuncDetector>\
@@ -469,10 +490,14 @@ struct ExprConstructor<CVQual TensorLayoutSwapOp <OrigXprType> , CVQual TensorLa
   : xprExpr(funcD.xprExpr, t), expr(xprExpr.expr) {}\
 };
 
-SYCLTENSORLAYOUTSWAPOPEXPR(const)
-SYCLTENSORLAYOUTSWAPOPEXPR()
-#undef SYCLTENSORLAYOUTSWAPOPEXPR
+//TensorLayoutSwapOp
+SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXPR(const, TensorLayoutSwapOp)
+SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXPR(, TensorLayoutSwapOp)
+//TensorIndexTupleOp
+SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXPR(const, TensorIndexTupleOp)
+SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXPR(, TensorIndexTupleOp)
+#undef SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXPR
 
 /// template deduction for \ref ExprConstructor struct
 template <typename OrigExpr, typename IndexExpr, typename FuncD, typename... Params>
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h
index 5a6a8f4c5..fb95af59e 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h
@@ -147,7 +147,7 @@ SYCLFORCEDEVALEXTACC(const)
 SYCLFORCEDEVALEXTACC()
 #undef SYCLFORCEDEVALEXTACC
 
-
+//TensorCustomUnaryOp
 #define SYCLCUSTOMUNARYOPEXTACC(CVQual)\
 template <typename CustomUnaryFunc, typename XprType, typename Dev >\
 struct ExtractAccessor<TensorEvaluator<CVQual TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Dev> > {\
@@ -160,7 +160,7 @@ SYCLCUSTOMUNARYOPEXTACC(const)
 SYCLCUSTOMUNARYOPEXTACC()
 #undef SYCLCUSTOMUNARYOPEXTACC
 
-
+//TensorCustomBinaryOp
 #define SYCLCUSTOMBINARYOPEXTACC(CVQual)\
 template <typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType , typename Dev>\
 struct ExtractAccessor<TensorEvaluator<CVQual TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType>, Dev> > {\
@@ -172,9 +172,6 @@ SYCLCUSTOMBINARYOPEXTACC(const)
 SYCLCUSTOMBINARYOPEXTACC()
 #undef SYCLCUSTOMBIBARYOPEXTACC
 
-
-
-
 /// specialisation of the \ref ExtractAccessor struct when the node type is TensorEvalToOp
 #define SYCLEVALTOEXTACC(CVQual)\
 template <typename Expr, typename Dev>\
 SYCLEVALTOEXTACC()
 #undef SYCLEVALTOEXTACC
 
 /// specialisation of the \ref ExtractAccessor struct when the node type is TensorReductionOp
-#define SYCLREDUCTIONEXTACC(CVQual)\
+#define SYCLREDUCTIONEXTACC(CVQual, ExprNode)\
 template <typename OP, typename Dim, typename Expr, typename Dev>\
-struct ExtractAccessor<TensorEvaluator<CVQual TensorReductionOp<OP, Dim, Expr>, Dev> > {\
-  static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorReductionOp<OP, Dim, Expr>, Dev>& eval)\
+struct ExtractAccessor<TensorEvaluator<CVQual ExprNode<OP, Dim, Expr>, Dev> > {\
+  static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual ExprNode<OP, Dim, Expr>, Dev>& eval)\
   RETURN_CPP11(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval))\
 };
+// TensorReductionOp
+SYCLREDUCTIONEXTACC(const,TensorReductionOp)
+SYCLREDUCTIONEXTACC(,TensorReductionOp)
 
-SYCLREDUCTIONEXTACC(const)
-SYCLREDUCTIONEXTACC()
+// TensorTupleReducerOp
+SYCLREDUCTIONEXTACC(const,TensorTupleReducerOp)
+SYCLREDUCTIONEXTACC(,TensorTupleReducerOp)
 #undef SYCLREDUCTIONEXTACC
 
 /// specialisation of the \ref ExtractAccessor struct when the node type is TensorContractionOp and TensorConvolutionOp
TensorContractionOp and TensorConvolutionOp @@ -206,14 +207,14 @@ template<typename Indices, typename LhsXprType, typename RhsXprType, typename De static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, Dev>& eval)\ RETURN_CPP11(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval))\ }; - +//TensorContractionOp SYCLCONTRACTIONCONVOLUTIONEXTACC(const,TensorContractionOp) SYCLCONTRACTIONCONVOLUTIONEXTACC(,TensorContractionOp) +//TensorConvolutionOp SYCLCONTRACTIONCONVOLUTIONEXTACC(const,TensorConvolutionOp) SYCLCONTRACTIONCONVOLUTIONEXTACC(,TensorConvolutionOp) #undef SYCLCONTRACTIONCONVOLUTIONEXTACC - /// specialisation of the \ref ExtractAccessor struct when the node type is /// const TensorSlicingOp. #define SYCLSLICEOPEXTACC(CVQual)\ @@ -252,7 +253,6 @@ SYCLTENSORCHIPPINGOPEXTACC(const) SYCLTENSORCHIPPINGOPEXTACC() #undef SYCLTENSORCHIPPINGOPEXTACC - // specialisation of the \ref ExtractAccessor struct when the node type is /// TensorImagePatchOp. #define SYCLTENSORIMAGEPATCHOPEXTACC(CVQual)\ @@ -266,8 +266,6 @@ SYCLTENSORIMAGEPATCHOPEXTACC(const) SYCLTENSORIMAGEPATCHOPEXTACC() #undef SYCLTENSORIMAGEPATCHOPEXTACC - - // specialisation of the \ref ExtractAccessor struct when the node type is /// TensorVolumePatchOp. #define SYCLTENSORVOLUMEPATCHOPEXTACC(CVQual)\ @@ -281,21 +279,23 @@ SYCLTENSORVOLUMEPATCHOPEXTACC(const) SYCLTENSORVOLUMEPATCHOPEXTACC() #undef SYCLTENSORVOLUMEPATCHOPEXTACC - // specialisation of the \ref ExtractAccessor struct when the node type is -/// TensorLayoutSwapOp. -#define SYCLTENSORLAYOUTSWAPOPEXTACC(CVQual)\ +/// TensorLayoutSwapOp, TensorIndexTupleOp +#define SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXTACC(CVQual, ExprNode)\ template<typename XprType, typename Dev>\ -struct ExtractAccessor<TensorEvaluator<CVQual TensorLayoutSwapOp<XprType>, Dev> >{\ - static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorLayoutSwapOp<XprType>, Dev>& eval)\ +struct ExtractAccessor<TensorEvaluator<CVQual ExprNode<XprType>, Dev> >{\ + static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual ExprNode<XprType>, Dev>& eval)\ RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.impl()))\ }; -SYCLTENSORLAYOUTSWAPOPEXTACC(const) -SYCLTENSORLAYOUTSWAPOPEXTACC() -#undef SYCLTENSORLAYOUTSWAPOPEXTACC - +// TensorLayoutSwapOp +SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXTACC(const,TensorLayoutSwapOp) +SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXTACC(,TensorLayoutSwapOp) +//TensorIndexTupleOp +SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXTACC(const,TensorIndexTupleOp) +SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXTACC(,TensorIndexTupleOp) +#undef SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXTACC /// template deduction for \ref ExtractAccessor template <typename Evaluator> diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h index 9fcac5ecb..942e9d307 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h @@ -126,19 +126,19 @@ struct FunctorExtractor<TensorEvaluator<CVQual TensorCustomUnaryOp<CustomUnaryFu typedef TensorEvaluator<CVQual TensorCustomUnaryOp<CustomUnaryFunc, ArgType>, Dev> Evaluator;\ DEFALTACTION(Evaluator)\ }; - +//TensorCustomUnaryOp SYCLEXTRFUNCCUSTOMUNARYOP(const) SYCLEXTRFUNCCUSTOMUNARYOP() #undef SYCLEXTRFUNCCUSTOMUNARYOP - +//TensorCustomBinaryOp #define SYCLEXTRFUNCCUSTOMBIBARYOP(CVQual)\ template 
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h
index 9fcac5ecb..942e9d307 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h
@@ -126,19 +126,19 @@ struct FunctorExtractor<TensorEvaluator<CVQual TensorCustomUnaryOp<CustomUnaryFu
typedef TensorEvaluator<CVQual TensorCustomUnaryOp<CustomUnaryFunc, ArgType>, Dev> Evaluator;\
DEFALTACTION(Evaluator)\
};
-
+//TensorCustomUnaryOp
SYCLEXTRFUNCCUSTOMUNARYOP(const)
SYCLEXTRFUNCCUSTOMUNARYOP()
#undef SYCLEXTRFUNCCUSTOMUNARYOP
-
+//TensorCustomBinaryOp
#define SYCLEXTRFUNCCUSTOMBIBARYOP(CVQual)\
template <typename CustomBinaryFunc, typename ArgType1, typename ArgType2, typename Dev >\
struct FunctorExtractor<TensorEvaluator<CVQual TensorCustomBinaryOp<CustomBinaryFunc, ArgType1, ArgType2>, Dev> > {\
typedef TensorEvaluator<CVQual TensorCustomBinaryOp<CustomBinaryFunc, ArgType1, ArgType2>, Dev> Evaluator;\
DEFALTACTION(Evaluator)\
};
-
+//TensorCustomBinaryOp
SYCLEXTRFUNCCUSTOMBIBARYOP(const)
SYCLEXTRFUNCCUSTOMBIBARYOP()
#undef SYCLEXTRFUNCCUSTOMBIBARYOP
@@ -177,7 +177,7 @@ SYCLEXTRFUNCASSIGNOP()
/// specialisation of the \ref FunctorExtractor struct when the node types are
/// TensorEvalToOp, TensorLayoutSwapOp. This is a specialisation without OP so it has to be separated.
-#define SYCLEXTRFUNCEVALTOOPSWAPLAYOUT(CVQual, ExprNode)\
+#define SYCLEXTRFUNCEVALTOOPSWAPLAYOUTINDEXTUPLE(CVQual, ExprNode)\
template <typename Expr, typename Dev>\
struct FunctorExtractor<TensorEvaluator<CVQual ExprNode<Expr>, Dev> > {\
FunctorExtractor<TensorEvaluator<Expr, Dev> > xprExpr;\
@@ -185,13 +185,16 @@ struct FunctorExtractor<TensorEvaluator<CVQual ExprNode<Expr>, Dev> > {\
: xprExpr(expr.impl()) {}\
};
//TensorEvalToOp
-SYCLEXTRFUNCEVALTOOPSWAPLAYOUT(const, TensorEvalToOp)
-SYCLEXTRFUNCEVALTOOPSWAPLAYOUT(, TensorEvalToOp)
+SYCLEXTRFUNCEVALTOOPSWAPLAYOUTINDEXTUPLE(const, TensorEvalToOp)
+SYCLEXTRFUNCEVALTOOPSWAPLAYOUTINDEXTUPLE(, TensorEvalToOp)
// TensorLayoutSwapOp
-SYCLEXTRFUNCEVALTOOPSWAPLAYOUT(const, TensorLayoutSwapOp)
-SYCLEXTRFUNCEVALTOOPSWAPLAYOUT(, TensorLayoutSwapOp)
+SYCLEXTRFUNCEVALTOOPSWAPLAYOUTINDEXTUPLE(const, TensorLayoutSwapOp)
+SYCLEXTRFUNCEVALTOOPSWAPLAYOUTINDEXTUPLE(, TensorLayoutSwapOp)
+// TensorIndexTupleOp
+SYCLEXTRFUNCEVALTOOPSWAPLAYOUTINDEXTUPLE(const, TensorIndexTupleOp)
+SYCLEXTRFUNCEVALTOOPSWAPLAYOUTINDEXTUPLE(, TensorIndexTupleOp)
-#undef SYCLEXTRFUNCEVALTOOPSWAPLAYOUT
+#undef SYCLEXTRFUNCEVALTOOPSWAPLAYOUTINDEXTUPLE
template<typename Dim, size_t NumOutputDim> struct DimConstr {
template<typename InDim>
@@ -202,10 +205,10 @@ template<typename Dim> struct DimConstr<Dim, 0> {
template<typename InDim>
static EIGEN_STRONG_INLINE Dim getDim(InDim dims ) {return Dim(static_cast<Dim>(dims.TotalSize()));}
};
-
+//TensorReductionOp
#define SYCLEXTRFUNCREDUCTIONOP(CVQual)\
template<typename Op, typename Dims, typename ArgType, template <class> class MakePointer_, typename Device>\
-struct FunctorExtractor<TensorEvaluator<CVQual TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>>{\
+struct FunctorExtractor<TensorEvaluator<CVQual TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> >{\
typedef TensorEvaluator<CVQual TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> Evaluator;\
typedef typename Eigen::internal::conditional<Evaluator::NumOutputDims==0, DSizes<typename Evaluator::Index, 1>, typename Evaluator::Dimensions >::type Dimensions;\
const Dimensions m_dimensions;\
@@ -213,12 +216,39 @@ struct FunctorExtractor<TensorEvaluator<CVQual TensorReductionOp<Op, Dims, ArgTy
FunctorExtractor(const TensorEvaluator<CVQual TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>& expr)\
: m_dimensions(DimConstr<Dimensions, Evaluator::NumOutputDims>::getDim(expr.dimensions())) {}\
};
-
-
SYCLEXTRFUNCREDUCTIONOP(const)
SYCLEXTRFUNCREDUCTIONOP()
#undef SYCLEXTRFUNCREDUCTIONOP
+//TensorTupleReducerOp
+#define SYCLEXTRFUNCTUPLEREDUCTIONOP(CVQual)\
+template<typename ReduceOp, typename Dims, typename ArgType, typename Device>\
+ struct FunctorExtractor<TensorEvaluator<CVQual TensorTupleReducerOp<ReduceOp, Dims, ArgType>, Device> >{\
+ typedef TensorEvaluator<CVQual TensorTupleReducerOp<ReduceOp, Dims, ArgType>, Device> Evaluator;\
+ static const int NumOutputDims= Eigen::internal::traits<TensorTupleReducerOp<ReduceOp, Dims, ArgType> >::NumDimensions;\
+ typedef typename Evaluator::StrideDims StrideDims;\
+ typedef typename Evaluator::Index Index;\
+ typedef typename Eigen::internal::conditional<NumOutputDims==0, DSizes<Index, 1>, typename Evaluator::Dimensions >::type Dimensions;\
+ const Dimensions m_dimensions;\
+ const int m_return_dim;\
+ const StrideDims m_strides;\
+ const Index m_stride_mod;\
+ const Index m_stride_div;\
+ EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }\
+ EIGEN_STRONG_INLINE int return_dim() const {return m_return_dim;}\
+ EIGEN_STRONG_INLINE const StrideDims& strides() const {return m_strides;}\
+ EIGEN_STRONG_INLINE const Index& stride_mod() const {return m_stride_mod;}\
+ EIGEN_STRONG_INLINE const Index& stride_div() const {return m_stride_div;}\
+ FunctorExtractor(const TensorEvaluator<CVQual TensorTupleReducerOp<ReduceOp, Dims, ArgType>, Device>& expr)\
+ : m_dimensions(DimConstr<Dimensions, NumOutputDims>::getDim(expr.dimensions())), m_return_dim(expr.return_dim()),\
+ m_strides(expr.strides()), m_stride_mod(expr.stride_mod()), m_stride_div(expr.stride_div()){}\
+};
+
+SYCLEXTRFUNCTUPLEREDUCTIONOP(const)
+SYCLEXTRFUNCTUPLEREDUCTIONOP()
+#undef SYCLEXTRFUNCTUPLEREDUCTIONOP
+
+//TensorContractionOp and TensorConvolutionOp
#define SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(CVQual, ExprNode)\
template<typename Indices, typename LhsXprType, typename RhsXprType, typename Device>\
struct FunctorExtractor<TensorEvaluator<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, Device>>{\
@@ -230,9 +260,10 @@ struct FunctorExtractor<TensorEvaluator<CVQual ExprNode<Indices, LhsXprType, Rhs
: m_dimensions(expr.dimensions()) {}\
};
-
+//TensorContractionOp
SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(const,TensorContractionOp)
SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(,TensorContractionOp)
+//TensorConvolutionOp
SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(const,TensorConvolutionOp)
SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(,TensorConvolutionOp)
#undef SYCLEXTRFUNCCONTRACTCONVOLUTIONOP
@@ -255,6 +286,7 @@ SYCLEXTRFUNCTSLICEOP(const)
SYCLEXTRFUNCTSLICEOP()
#undef SYCLEXTRFUNCTSLICEOP
+//TensorStridingSlicingOp
#define SYCLEXTRFUNCTSLICESTRIDEOP(CVQual)\
template<typename StartIndices, typename StopIndices, typename Strides, typename XprType, typename Dev>\
struct FunctorExtractor<TensorEvaluator<CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>, Dev> >{\
@@ -273,7 +305,7 @@ SYCLEXTRFUNCTSLICESTRIDEOP(const)
SYCLEXTRFUNCTSLICESTRIDEOP()
#undef SYCLEXTRFUNCTSLICESTRIDEOP
-// Had to separate reshapeOP otherwise it will be mistaken by UnaryCategory
+// Had to separate TensorReshapingOp and TensorShufflingOp. Otherwise they would be mistaken for UnaryCategory.
#define SYCLRESHAPEANDSHUFFLEOPFUNCEXT(OPEXPR, FUNCCALL, CVQual)\
template<typename Param, typename XprType, typename Dev>\
struct FunctorExtractor<Eigen::TensorEvaluator<CVQual Eigen::OPEXPR<Param, XprType>, Dev> > {\
@@ -284,9 +316,11 @@ struct FunctorExtractor<Eigen::TensorEvaluator<CVQual Eigen::OPEXPR<Param, XprTy
: xprExpr(expr.impl()), m_param(expr.FUNCCALL) {}\
};
+//TensorReshapingOp
SYCLRESHAPEANDSHUFFLEOPFUNCEXT(TensorReshapingOp, dimensions(), const)
SYCLRESHAPEANDSHUFFLEOPFUNCEXT(TensorReshapingOp, dimensions(), )
+//TensorShufflingOp
SYCLRESHAPEANDSHUFFLEOPFUNCEXT(TensorShufflingOp, shufflePermutation(), const)
SYCLRESHAPEANDSHUFFLEOPFUNCEXT(TensorShufflingOp, shufflePermutation(), )
#undef SYCLRESHAPEANDSHUFFLEOPFUNCEXT
@@ -343,6 +377,7 @@ SYCLEXTRFUNCCHIPPINGOP(const)
SYCLEXTRFUNCCHIPPINGOP()
#undef SYCLEXTRFUNCCHIPPINGOP
+//TensorImagePatchOp
#define SYCLEXTRFUNCIMAGEPATCHOP(CVQual)\
template<DenseIndex Rows, DenseIndex Cols, typename XprType, typename Device>\
struct FunctorExtractor<TensorEvaluator<CVQual TensorImagePatchOp<Rows, Cols, XprType>, Device> >{\
@@ -420,7 +455,6 @@ SYCLEXTRFUNCVOLUMEPATCHOP()
#undef SYCLEXTRFUNCVOLUMEPATCHOP
-
/// template deduction function for FunctorExtractor
template <typename Evaluator>
auto inline extractFunctors(const Evaluator& evaluator)-> FunctorExtractor<Evaluator> {
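For reviewers tracing the data flow (illustrative, not part of the diff): extractFunctors and the ExtractAccessor machinery above are consumed on the host when a kernel is scheduled. A rough sketch, assuming an evaluator eval and a SyclDevice dev are in scope; the kernel that actually consumes the pair is enqueued in TensorSyclRun.h further down:

    // Sketch only: gather the device-copyable functor data and one accessor
    // per leaf of the expression tree (leaf counting is done in
    // TensorSyclLeafCount.h below). Not a complete command group.
    auto functors = Eigen::TensorSycl::internal::extractFunctors(eval);
    dev.sycl_queue().submit([&](cl::sycl::handler& cgh) {
      auto tuple_of_accessors =
          Eigen::TensorSycl::internal::createTupleOfAccessors(cgh, eval);
      // ... the enqueued kernel rebuilds the placeholder expression from
      // `functors` and `tuple_of_accessors` (see TensorSyclRun.h) ...
    });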
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h
index 12237bfab..e5b892f2e 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h
@@ -72,7 +72,7 @@ namespace internal {
template < typename HostExpr, typename FunctorExpr, typename Tuple_of_Acc, typename Dims, typename Op, typename Index> class ReductionFunctor {
public:
typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
- typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer> write_accessor;
+ typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer> write_accessor;
ReductionFunctor(write_accessor output_accessor_, ptrdiff_t out_offset_, FunctorExpr functors_, Tuple_of_Acc tuple_of_accessors_,Dims dims_, Op functor_, Index range_, Index)
:output_accessor(output_accessor_), out_offset(out_offset_), functors(functors_), tuple_of_accessors(tuple_of_accessors_), dims(dims_), functor(functor_), range(range_) {}
void operator()(cl::sycl::nd_item<1> itemID) {
@@ -85,8 +85,8 @@ template < typename HostExpr, typename FunctorExpr, typename Tuple_of_Acc, typen
const auto device_self_expr= Eigen::TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, functor);
/// This is the evaluator for device_self_expr. This is exactly similar to the self which has been passed to run function. The difference is
/// the device_evaluator is detectable and recognisable on the device.
- typedef Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice> DeviceSelf;
- auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice());
+ typedef Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::SyclKernelDevice> DeviceSelf;
+ auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::SyclKernelDevice>(device_self_expr, Eigen::SyclKernelDevice());
auto output_accessor_ptr =ConvertToActualTypeSycl(typename DeviceSelf::CoeffReturnType, output_accessor);
/// const cast added as a naive solution to solve the qualifier drop error
auto globalid=static_cast<Index>(itemID.get_global_linear_id());
@@ -111,7 +111,7 @@ template < typename HostExpr, typename FunctorExpr, typename Tuple_of_Acc, typen
class ReductionFunctor<HostExpr, FunctorExpr, Tuple_of_Acc, Dims, Eigen::internal::MeanReducer<typename HostExpr::CoeffReturnType>, Index> {
public:
typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
- typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer> write_accessor;
+ typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer> write_accessor;
typedef Eigen::internal::SumReducer<typename HostExpr::CoeffReturnType> Op;
ReductionFunctor(write_accessor output_accessor_, ptrdiff_t out_offset_, FunctorExpr functors_, Tuple_of_Acc tuple_of_accessors_,Dims dims_, Eigen::internal::MeanReducer<typename HostExpr::CoeffReturnType>, Index range_, Index num_values_to_reduce_)
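The discard_write-to-write switch above is the fix named in the commit message: discard_write lets the runtime treat a buffer's previous contents as undefined, which produced data mismatches whenever an output buffer was written only partially or by more than one kernel. A minimal standalone illustration of the difference (buffer name and sizes are made up):

    #include <CL/sycl.hpp>
    #include <cstdint>

    int main() {
      cl::sycl::queue q;
      cl::sycl::buffer<uint8_t, 1> out(cl::sycl::range<1>(64));
      q.submit([&](cl::sycl::handler& cgh) {
        // mode::write preserves the 32 bytes this kernel never touches;
        // mode::discard_write would allow the runtime to drop them.
        auto acc = out.get_access<cl::sycl::access::mode::write>(cgh);
        cgh.parallel_for<class partial_write>(cl::sycl::range<1>(32),
            [=](cl::sycl::id<1> i) { acc[i] = 1; });
      });
      q.wait();
      return 0;
    }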
@@ -126,8 +126,8 @@ class ReductionFunctor<HostExpr, FunctorExpr, Tuple_of_Acc, Dims, Eigen::interna
const auto device_self_expr= Eigen::TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, functor);
/// This is the evaluator for device_self_expr. This is exactly similar to the self which has been passed to run function. The difference is
/// the device_evaluator is detectable and recognisable on the device.
- typedef Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice> DeviceSelf;
- auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice());
+ typedef Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::SyclKernelDevice> DeviceSelf;
+ auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::SyclKernelDevice>(device_self_expr, Eigen::SyclKernelDevice());
auto output_accessor_ptr =ConvertToActualTypeSycl(typename DeviceSelf::CoeffReturnType, output_accessor);
/// const cast added as a naive solution to solve the qualifier drop error
auto globalid=static_cast<Index>(itemID.get_global_linear_id());
@@ -173,7 +173,7 @@ public:
const auto device_self_expr= Eigen::TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, op);
/// This is the evaluator for device_self_expr. This is exactly similar to the self which has been passed to run function. The difference is
/// the device_evaluator is detectable and recognisable on the device.
- auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice());
+ auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::SyclKernelDevice>(device_self_expr, Eigen::SyclKernelDevice());
/// const cast added as a naive solution to solve the qualifier drop error
auto globalid=itemID.get_global_linear_id();
@@ -220,7 +220,7 @@ public:
const auto device_self_expr= Eigen::TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, op);
/// This is the evaluator for device_self_expr. This is exactly similar to the self which has been passed to run function. The difference is
/// the device_evaluator is detectable and recognisable on the device.
- auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice());
+ auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::SyclKernelDevice>(device_self_expr, Eigen::SyclKernelDevice());
/// const cast added as a naive solution to solve the qualifier drop error
auto globalid=itemID.get_global_linear_id();
auto scale = (rng*red_factor) + remaining;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h
index 330283b39..234580c7c 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h
@@ -114,27 +114,37 @@ SYCLCUSTOMBINARYOPLEAFCOUNT()
#undef SYCLCUSTOMBINARYOPLEAFCOUNT
/// specialisation of the \ref LeafCount struct when the node type is TensorEvalToOp
-#define EVALTOLAYOUTSWAPLEAFCOUNT(CVQual , ExprNode, Num)\
+#define EVALTOLAYOUTSWAPINDEXTUPLELEAFCOUNT(CVQual , ExprNode, Num)\
template <typename Expr>\
struct LeafCount<CVQual ExprNode<Expr> > {\
static const size_t Count = Num + CategoryCount<Expr>::Count;\
};
-EVALTOLAYOUTSWAPLEAFCOUNT(const, TensorEvalToOp, 1)
-EVALTOLAYOUTSWAPLEAFCOUNT(, TensorEvalToOp, 1)
-EVALTOLAYOUTSWAPLEAFCOUNT(const, TensorLayoutSwapOp, 0)
-EVALTOLAYOUTSWAPLEAFCOUNT(, TensorLayoutSwapOp, 0)
-#undef EVALTOLAYOUTSWAPLEAFCOUNT
+EVALTOLAYOUTSWAPINDEXTUPLELEAFCOUNT(const, TensorEvalToOp, 1)
+EVALTOLAYOUTSWAPINDEXTUPLELEAFCOUNT(, TensorEvalToOp, 1)
+EVALTOLAYOUTSWAPINDEXTUPLELEAFCOUNT(const, TensorLayoutSwapOp, 0)
+EVALTOLAYOUTSWAPINDEXTUPLELEAFCOUNT(, TensorLayoutSwapOp, 0)
+
+EVALTOLAYOUTSWAPINDEXTUPLELEAFCOUNT(const, TensorIndexTupleOp, 0)
+EVALTOLAYOUTSWAPINDEXTUPLELEAFCOUNT(, TensorIndexTupleOp, 0)
+
+#undef EVALTOLAYOUTSWAPINDEXTUPLELEAFCOUNT
/// specialisation of the \ref LeafCount struct when the node type is const TensorReductionOp
-#define REDUCTIONLEAFCOUNT(CVQual)\
+#define REDUCTIONLEAFCOUNT(CVQual, ExprNode)\
template <typename OP, typename Dim, typename Expr>\
-struct LeafCount<CVQual TensorReductionOp<OP, Dim, Expr> > {\
+struct LeafCount<CVQual ExprNode<OP, Dim, Expr> > {\
static const size_t Count =1;\
};
-REDUCTIONLEAFCOUNT(const)
-REDUCTIONLEAFCOUNT()
+// TensorReductionOp
+REDUCTIONLEAFCOUNT(const,TensorReductionOp)
+REDUCTIONLEAFCOUNT(,TensorReductionOp)
+
+// tensor ArgMax - TensorTupleReducerOp
+REDUCTIONLEAFCOUNT(const, TensorTupleReducerOp)
+REDUCTIONLEAFCOUNT(, TensorTupleReducerOp)
+
#undef REDUCTIONLEAFCOUNT
/// specialisation of the \ref LeafCount struct when the node type is const TensorContractionOp
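Expanding REDUCTIONLEAFCOUNT(const, TensorTupleReducerOp) mechanically gives the following (not part of the diff). The whole ArgMax/ArgMin subtree counts as a single leaf, consistent with the PlaceHolderExpression specialisation further down that replaces the node with one PlaceHolder:

    template <typename OP, typename Dim, typename Expr>
    struct LeafCount<const TensorTupleReducerOp<OP, Dim, Expr> > {
      // The tuple-reduction is evaluated into its own device buffer, so the
      // kernel sees it as one leaf rather than as its sub-expression tree.
      static const size_t Count = 1;
    };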
@@ -150,8 +160,6 @@ CONTRACTIONCONVOLUTIONLEAFCOUNT(const,TensorConvolutionOp)
CONTRACTIONCONVOLUTIONLEAFCOUNT(,TensorConvolutionOp)
#undef CONTRACTIONCONVOLUTIONLEAFCOUNT
-
-
/// specialisation of the \ref LeafCount struct when the node type is TensorSlicingOp
#define SLICEOPLEAFCOUNT(CVQual)\
template <typename StartIndices, typename Sizes, typename XprType>\
@@ -161,7 +169,6 @@ SLICEOPLEAFCOUNT(const)
SLICEOPLEAFCOUNT()
#undef SLICEOPLEAFCOUNT
-
/// specialisation of the \ref LeafCount struct when the node type is TensorChippingOp
#define CHIPPINGOPLEAFCOUNT(CVQual)\
template <DenseIndex DimId, typename XprType>\
@@ -195,7 +202,6 @@ TENSORIMAGEPATCHOPLEAFCOUNT()
template<DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename XprType>\
struct LeafCount<CVQual TensorVolumePatchOp<Planes, Rows, Cols, XprType> >:CategoryCount<XprType>{};
-
TENSORVOLUMEPATCHOPLEAFCOUNT(const)
TENSORVOLUMEPATCHOPLEAFCOUNT()
#undef TENSORVOLUMEPATCHOPLEAFCOUNT
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h
index 99d528963..9d5708fc5 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h
@@ -171,19 +171,24 @@ CUSTOMBINARYOPEVAL()
/// specialisation of the \ref PlaceHolderExpression when the node is
-/// TensorEvalToOp, TensorLayoutSwapOp
-#define EVALTOLAYOUTSWAP(CVQual, ExprNode)\
+/// TensorEvalToOp, TensorLayoutSwapOp, and TensorIndexTupleOp
+#define EVALTOLAYOUTSWAPINDEXTUPLE(CVQual, ExprNode)\
template <typename Expr, size_t N>\
struct PlaceHolderExpression<CVQual ExprNode<Expr>, N> {\
typedef CVQual ExprNode<typename CalculateIndex <N, Expr>::ArgType> Type;\
};
-EVALTOLAYOUTSWAP(const, TensorEvalToOp)
-EVALTOLAYOUTSWAP(, TensorEvalToOp)
-EVALTOLAYOUTSWAP(const, TensorLayoutSwapOp)
-EVALTOLAYOUTSWAP(, TensorLayoutSwapOp)
+// TensorEvalToOp
+EVALTOLAYOUTSWAPINDEXTUPLE(const, TensorEvalToOp)
+EVALTOLAYOUTSWAPINDEXTUPLE(, TensorEvalToOp)
+//TensorLayoutSwapOp
+EVALTOLAYOUTSWAPINDEXTUPLE(const, TensorLayoutSwapOp)
+EVALTOLAYOUTSWAPINDEXTUPLE(, TensorLayoutSwapOp)
+//TensorIndexTupleOp
+EVALTOLAYOUTSWAPINDEXTUPLE(const, TensorIndexTupleOp)
+EVALTOLAYOUTSWAPINDEXTUPLE(, TensorIndexTupleOp)
-#undef EVALTOLAYOUTSWAP
+#undef EVALTOLAYOUTSWAPINDEXTUPLE
/// specialisation of the \ref PlaceHolderExpression when the node is
@@ -199,17 +204,24 @@ CHIPPINGOP()
#undef CHIPPINGOP
/// specialisation of the \ref PlaceHolderExpression when the node is
-/// TensorReductionOp
-#define SYCLREDUCTION(CVQual)\
+/// TensorReductionOp and TensorTupleReducerOp (ArgMax)
+#define SYCLREDUCTION(CVQual, ExprNode)\
template <typename OP, typename Dims, typename Expr, size_t N>\
-struct PlaceHolderExpression<CVQual TensorReductionOp<OP, Dims, Expr>, N>{\
- typedef CVQual PlaceHolder<CVQual TensorReductionOp<OP, Dims,Expr>, N> Type;\
+struct PlaceHolderExpression<CVQual ExprNode<OP, Dims, Expr>, N>{\
+ typedef CVQual PlaceHolder<CVQual ExprNode<OP, Dims,Expr>, N> Type;\
};
-SYCLREDUCTION(const)
-SYCLREDUCTION()
+
+// tensor reduction
+SYCLREDUCTION(const, TensorReductionOp)
+SYCLREDUCTION(, TensorReductionOp)
+
+// tensor ArgMax - TensorTupleReducerOp
+SYCLREDUCTION(const, TensorTupleReducerOp)
+SYCLREDUCTION(, TensorTupleReducerOp)
#undef SYCLREDUCTION
+
/// specialisation of the \ref PlaceHolderExpression when the node is
/// TensorContractionOp and TensorConvolutionOp
#define SYCLCONTRACTIONCONVOLUTIONPLH(CVQual, ExprNode)\
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h
index cac785540..29c78184d 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h
@@ -25,7 +25,6 @@ namespace Eigen {
namespace TensorSycl {
-
template<typename Expr, typename FunctorExpr, typename TupleType > struct ExecExprFunctorKernel{
typedef typename internal::createPlaceHolderExpression<Expr>::Type PlaceHolderExpr;
@@ -38,7 +37,7 @@ template<typename Expr, typename FunctorExpr, typename TupleType > struct ExecEx
void operator()(cl::sycl::nd_item<1> itemID) {
typedef typename internal::ConvertToDeviceExpression<Expr>::Type DevExpr;
auto device_expr =internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
- auto device_evaluator = Eigen::TensorEvaluator<decltype(device_expr.expr), Eigen::DefaultDevice>(device_expr.expr, Eigen::DefaultDevice());
+ auto device_evaluator = Eigen::TensorEvaluator<decltype(device_expr.expr), Eigen::SyclKernelDevice>(device_expr.expr, Eigen::SyclKernelDevice());
typename DevExpr::Index gId = static_cast<typename DevExpr::Index>(itemID.get_global_linear_id());
if (gId < range)
device_evaluator.evalScalar(gId);
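Taken together, these specialisations are what allow an ArgMax/ArgMin expression to run on a SYCL device end to end. A rough usage sketch in the style of Eigen's SYCL tests (device selector, sizes, and buffer management are illustrative, not part of this patch):

    #define EIGEN_USE_SYCL
    #include <unsupported/Eigen/CXX11/Tensor>

    int main() {
      cl::sycl::default_selector selector;
      Eigen::QueueInterface queueInterface(selector);
      Eigen::SyclDevice sycl_device(&queueInterface);

      Eigen::Tensor<float, 2> in(3, 4);
      in.setRandom();
      Eigen::Tensor<Eigen::DenseIndex, 1> out(4);

      float* gpu_in = static_cast<float*>(
          sycl_device.allocate(in.size() * sizeof(float)));
      Eigen::DenseIndex* gpu_out = static_cast<Eigen::DenseIndex*>(
          sycl_device.allocate(out.size() * sizeof(Eigen::DenseIndex)));
      Eigen::TensorMap<Eigen::Tensor<float, 2> > in_gpu(gpu_in, 3, 4);
      Eigen::TensorMap<Eigen::Tensor<Eigen::DenseIndex, 1> > out_gpu(gpu_out, 4);

      sycl_device.memcpyHostToDevice(gpu_in, in.data(), in.size() * sizeof(float));
      // argmax(0) builds a TensorTupleReducerOp (index_tuples() + tuple-reduce),
      // which is exactly the node type these specialisations now cover.
      out_gpu.device(sycl_device) = in_gpu.argmax(0);
      sycl_device.memcpyDeviceToHost(out.data(), gpu_out,
                                     out.size() * sizeof(Eigen::DenseIndex));

      sycl_device.deallocate(gpu_in);
      sycl_device.deallocate(gpu_out);
      return 0;
    }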