author | Mehdi Goli <mehdi.goli@codeplay.com> | 2017-01-19 11:30:59 +0000
---|---|---
committer | Mehdi Goli <mehdi.goli@codeplay.com> | 2017-01-19 11:30:59 +0000
commit | 6bdd15f572c0b8cd21f5acba3671d536f50a9b53 (patch)
tree | 8343c43748cfbdefdac6e7b4e52aec7196669589 /unsupported/Eigen/CXX11
parent | e46e7223817cfd982edec6d8e25c77e8e2493d78 (diff)
Adding non-dereferenceable pointer tracking for the ComputeCpp backend; adding TensorConvolutionOp for ComputeCpp; fixing typos; modifying TensorDeviceSycl to use the LegacyPointer class.
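The "non-dereferenceable pointer tracking" replaces the old per-pointer std::map of SYCL buffers with ComputeCpp's codeplay::legacy pointer mapper: allocate() now hands out opaque host-side keys that identify a buffer plus an offset and must never be dereferenced on the host. A minimal sketch of the idea follows; the names mirror the calls visible in the diff (malloc, free, get_buffer, get_offset), but the implementation here is illustrative, not ComputeCpp's:

```cpp
#include <CL/sycl.hpp>
#include <cstdint>
#include <map>

// Illustrative stand-in for a legacy PointerMapper: the "pointers" handed out
// are integer keys laid out like addresses (so pointer arithmetic yields
// offsets), each mapping to a SYCL buffer. They are never real host memory.
class PointerMapper {
  std::map<std::uintptr_t, cl::sycl::buffer<uint8_t, 1>> buffers_;
  std::uintptr_t next_ = 0x1000;  // not a dereferenceable address
 public:
  void* add(size_t bytes) {
    std::uintptr_t key = next_;
    next_ += bytes;  // reserve a key range so interior pointers resolve
    buffers_.emplace(key, cl::sycl::buffer<uint8_t, 1>(cl::sycl::range<1>(bytes)));
    return reinterpret_cast<void*>(key);
  }
  // Assumes ptr was produced by add(): find the buffer whose key range holds it.
  cl::sycl::buffer<uint8_t, 1> get_buffer(void* ptr) {
    auto it = std::prev(buffers_.upper_bound(reinterpret_cast<std::uintptr_t>(ptr)));
    return it->second;
  }
  size_t get_offset(void* ptr) {
    auto it = std::prev(buffers_.upper_bound(reinterpret_cast<std::uintptr_t>(ptr)));
    return reinterpret_cast<std::uintptr_t>(ptr) - it->first;
  }
  void remove(void* ptr) { buffers_.erase(reinterpret_cast<std::uintptr_t>(ptr)); }
  void clear() { buffers_.clear(); }
};
```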
Diffstat (limited to 'unsupported/Eigen/CXX11')
12 files changed, 913 insertions, 140 deletions
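Taken together, the patch lets a tensor convolution expression execute on a SYCL device. A hedged usage sketch, modeled on how Eigen's SYCL tensor tests drive the device API that appears in this diff (allocate, memcpyHostToDevice, memcpyDeviceToHost); selector and header details may differ by ComputeCpp version:

```cpp
#define EIGEN_USE_SYCL
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::QueueInterface queue_interface((cl::sycl::default_selector()));
  Eigen::SyclDevice sycl_device(&queue_interface);

  Eigen::Tensor<float, 1> input(64), kernel(3), result(62);  // 64 - 3 + 1 = 62
  input.setRandom(); kernel.setRandom();

  // Opaque device "pointers" from the legacy pointer mapper.
  float* d_in  = static_cast<float*>(sycl_device.allocate(input.size()  * sizeof(float)));
  float* d_k   = static_cast<float*>(sycl_device.allocate(kernel.size() * sizeof(float)));
  float* d_out = static_cast<float*>(sycl_device.allocate(result.size() * sizeof(float)));
  sycl_device.memcpyHostToDevice(d_in, input.data(),  input.size()  * sizeof(float));
  sycl_device.memcpyHostToDevice(d_k,  kernel.data(), kernel.size() * sizeof(float));

  Eigen::TensorMap<Eigen::Tensor<float, 1>> gpu_in(d_in, 64), gpu_k(d_k, 3), gpu_out(d_out, 62);
  Eigen::array<Eigen::Tensor<float, 1>::Index, 1> dims{{0}};  // convolve along dim 0
  gpu_out.device(sycl_device) = gpu_in.convolve(gpu_k, dims); // dispatches EigenConvolutionKernel1D

  sycl_device.memcpyDeviceToHost(result.data(), d_out, result.size() * sizeof(float));
  sycl_device.deallocate(d_in); sycl_device.deallocate(d_k); sycl_device.deallocate(d_out);
}
```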
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h index abdf742c6..378f5cccb 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h @@ -100,7 +100,7 @@ class IndexMapper { } } else { for (int i = NumDims - 1; i >= 0; --i) { - if (i + 1 < offset) { + if (static_cast<size_t>(i + 1) < offset) { m_cudaInputStrides[i] = m_cudaInputStrides[i + 1] * cudaInputDimensions[i + 1]; m_cudaOutputStrides[i] = diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h new file mode 100644 index 000000000..7774342d8 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h @@ -0,0 +1,476 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: <eigen@codeplay.com> +// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com> + +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H + +namespace Eigen { + +/** \class TensorConvolution + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor convolution class. + * + * + */ +template <typename CoeffReturnType, typename KernelType, typename HostExpr, typename FunctorExpr, typename Index, +typename InputDims, typename Kernel_accessor, typename Buffer_accessor, typename Local_accessor, typename TupleType> +struct EigenConvolutionKernel1D{ +typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr; +internal::IndexMapper<Index, InputDims, 1, Eigen::internal::traits<HostExpr>::Layout> indexMapper; +Kernel_accessor kernel_filter; +const size_t kernelSize, range_x, range_y; +Buffer_accessor buffer_acc; +Local_accessor local_acc; +FunctorExpr functors; +TupleType tuple_of_accessors; +EigenConvolutionKernel1D(internal::IndexMapper<Index, InputDims, 1, Eigen::internal::traits<HostExpr>::Layout> indexMapper_, + Kernel_accessor kernel_filter_, const size_t kernelSize_, const size_t range_x_, const size_t range_y_, + Buffer_accessor buffer_acc_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_) + :indexMapper(indexMapper_), kernel_filter(kernel_filter_), kernelSize(kernelSize_), range_x(range_x_), range_y(range_y_), + buffer_acc(buffer_acc_), local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {} + + void operator()(cl::sycl::nd_item<2> itemID) { + typedef typename TensorSycl::internal::ConvertToDeviceExpression<HostExpr>::Type DevExpr; + auto device_expr =TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors); + auto device_evaluator = Eigen::TensorEvaluator<DevExpr, Eigen::DefaultDevice>(device_expr.expr, Eigen::DefaultDevice()); + + auto buffer_ptr = ConvertToActualTypeSycl(CoeffReturnType, buffer_acc); + auto kernel_ptr = ConvertToActualTypeSycl(KernelType, kernel_filter); + + const size_t num_x_input = (itemID.get_local_range()[0] +kernelSize -1); // number of input elements in x needed per plane in shared memory
+ const size_t plane_kernel_offset = itemID.get_local(1) * num_x_input; + const size_t first_input_start = itemID.get_group(0)*itemID.get_local_range()[0]; + const size_t plane_tensor_offset =indexMapper.mapCudaInputPlaneToTensorInputOffset(itemID.get_global(1)); + /// fill the shared memory + for (size_t i = itemID.get_local(0); i < num_x_input ; i += itemID.get_local_range()[0]) { + const size_t local_index = i + plane_kernel_offset ; + const size_t tensor_index = plane_tensor_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i + first_input_start); + if(((i + first_input_start) < (range_x +kernelSize-1)) && itemID.get_global(1)< range_y){ + local_acc[local_index] = device_evaluator.coeff(tensor_index); + } + else local_acc[local_index]=0.0f; + } + + itemID.barrier(cl::sycl::access::fence_space::local_space); + + // calculate the convolution + const int first_output_start =itemID.get_group(0)*(itemID.get_local_range()[0]); // output start x + if(itemID.get_global(0)< range_x && itemID.get_global(1)< range_y){ + CoeffReturnType result = static_cast<CoeffReturnType>(0); + const size_t index = plane_kernel_offset+ itemID.get_local(0); + for (size_t k = 0; k < kernelSize; ++k) { + result += (local_acc[k + index] * kernel_ptr[k]); + } + const size_t tensor_index = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(itemID.get_global(1)) + +indexMapper.mapCudaOutputKernelToTensorOutputOffset(itemID.get_local(0) + first_output_start); + buffer_ptr[tensor_index] = result; + } + } +}; + + +template <typename CoeffReturnType, typename KernelType, typename HostExpr, typename FunctorExpr, typename Index, +typename InputDims, typename Kernel_accessor, typename Buffer_accessor, typename Local_accessor, typename TupleType> +struct EigenConvolutionKernel2D{ +typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr; +internal::IndexMapper<Index, InputDims, 2, Eigen::internal::traits<HostExpr>::Layout> indexMapper; +Kernel_accessor kernel_filter; +const size_t kernelSize_x, kernelSize_y, range_x, range_y , range_z; +Buffer_accessor buffer_acc; +Local_accessor local_acc; +FunctorExpr functors; +TupleType tuple_of_accessors; +EigenConvolutionKernel2D(internal::IndexMapper<Index, InputDims, 2, Eigen::internal::traits<HostExpr>::Layout> indexMapper_, + Kernel_accessor kernel_filter_, const size_t kernelSize_x_, const size_t kernelSize_y_ ,const size_t range_x_, const size_t range_y_, const size_t range_z_, + Buffer_accessor buffer_acc_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_) + :indexMapper(indexMapper_), kernel_filter(kernel_filter_), kernelSize_x(kernelSize_x_), kernelSize_y(kernelSize_y_), range_x(range_x_), range_y(range_y_), range_z(range_z_), + buffer_acc(buffer_acc_), local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {} + + void operator()(cl::sycl::nd_item<3> itemID) { + typedef typename TensorSycl::internal::ConvertToDeviceExpression<HostExpr>::Type DevExpr; + auto device_expr =TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors); + auto device_evaluator = Eigen::TensorEvaluator<DevExpr, Eigen::DefaultDevice>(device_expr.expr, Eigen::DefaultDevice()); + + auto buffer_ptr = ConvertToActualTypeSycl(CoeffReturnType, buffer_acc); + auto kernel_ptr = ConvertToActualTypeSycl(KernelType, kernel_filter); + const size_t num_x_input = (itemID.get_local_range()[0] +kernelSize_x -1); // number of input elements in x needed per plane in shared memory + const size_t num_y_input = (itemID.get_local_range()[1] +kernelSize_y -1); // number of input elements in y needed per plane in shared memory + const size_t plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(itemID.get_global(2)); + const size_t plane_kernel_offset = itemID.get_local(2) * num_y_input; + + /// fill the shared memory + const size_t first_x_input_start = itemID.get_group(0)*itemID.get_local_range()[0]; + const size_t first_y_input_start = itemID.get_group(1)*itemID.get_local_range()[1]; + for (size_t j = itemID.get_local(1); j < num_y_input; j += itemID.get_local_range()[1]) { + const size_t local_input_offset = num_x_input * (j + plane_kernel_offset); + for (size_t i = itemID.get_local(0); i < num_x_input ; i += itemID.get_local_range()[0]) { + const size_t local_index = i + local_input_offset; + const size_t tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i + first_x_input_start, j+ first_y_input_start ); + if(((i + first_x_input_start) < (range_x +kernelSize_x-1)) &&((j + first_y_input_start) < (range_y +kernelSize_y-1)) && itemID.get_global(2)< range_z){ + local_acc[local_index] = device_evaluator.coeff(tensor_index); + } + else local_acc[local_index]=0.0f; + } + } + + itemID.barrier(cl::sycl::access::fence_space::local_space); + + // calculate the convolution + const size_t first_x_output_start =itemID.get_group(0)*(itemID.get_local_range()[0]); // output start x + const size_t first_y_output_start =itemID.get_group(1)*(itemID.get_local_range()[1]); // output start y + if(itemID.get_global(0)< range_x && itemID.get_global(1)< range_y && itemID.get_global(2)< range_z){ + CoeffReturnType result = static_cast<CoeffReturnType>(0); + for (size_t j = 0; j < kernelSize_y; j++) { + size_t kernel_offset =kernelSize_x * j; + const size_t index = (num_x_input*(plane_kernel_offset + j+ itemID.get_local(1))) + itemID.get_local(0); + for (size_t i = 0; i < kernelSize_x; i++) { + result += (local_acc[i + index] * kernel_ptr[i+kernel_offset]); + } + } + const size_t tensor_index = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(itemID.get_global(2)) + +indexMapper.mapCudaOutputKernelToTensorOutputOffset(itemID.get_local(0) + first_x_output_start, itemID.get_local(1) + first_y_output_start); + buffer_ptr[tensor_index] = result; + } + } +};
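Both kernels above (and the 3D one that follows) stage a tile of the input plus a halo of kernelSize - 1 neighbours per convolved dimension in local memory before computing, which is why each staging extent is local_range + kernelSize - 1. A small worked sketch of that arithmetic (illustrative only, not part of the patch):

```cpp
#include <cstddef>

// Halo arithmetic used by the convolution kernels: a work-group of `tile`
// threads produces `tile` outputs, but a convolution of width `k` also reads
// k - 1 extra neighbours, so the local-memory staging area needs tile + k - 1
// inputs per convolved dimension.
std::size_t staged_inputs(std::size_t tile, std::size_t k) {
  return tile + k - 1;
}
// e.g. tile = 64, k = 3 -> 66 inputs staged per work-group, matching
// num_x_input = itemID.get_local_range()[0] + kernelSize - 1.
```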
+ + +template <typename CoeffReturnType, typename KernelType, typename HostExpr, typename FunctorExpr, typename Index, +typename InputDims, typename Kernel_accessor, typename Buffer_accessor, typename Local_accessor, typename TupleType> +struct EigenConvolutionKernel3D{ +typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr; +internal::IndexMapper<Index, InputDims, 3, Eigen::internal::traits<HostExpr>::Layout> indexMapper; +Kernel_accessor kernel_filter; +const size_t kernelSize_x, kernelSize_y, kernelSize_z, range_x, range_y , range_z, numP; +Buffer_accessor buffer_acc; +Local_accessor local_acc; +FunctorExpr functors; +TupleType tuple_of_accessors; +EigenConvolutionKernel3D(internal::IndexMapper<Index, InputDims, 3, Eigen::internal::traits<HostExpr>::Layout> indexMapper_, + Kernel_accessor kernel_filter_, const size_t kernelSize_x_, const size_t kernelSize_y_ , const size_t kernelSize_z_ , + const size_t range_x_, const size_t range_y_, const size_t range_z_, const size_t numP_, + Buffer_accessor buffer_acc_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_) + :indexMapper(indexMapper_), kernel_filter(kernel_filter_), kernelSize_x(kernelSize_x_), kernelSize_y(kernelSize_y_), + kernelSize_z(kernelSize_z_), range_x(range_x_), range_y(range_y_), range_z(range_z_), numP(numP_), + buffer_acc(buffer_acc_), local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {} + + void operator()(cl::sycl::nd_item<3> itemID) { + typedef typename TensorSycl::internal::ConvertToDeviceExpression<HostExpr>::Type DevExpr; + auto device_expr =TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors); + auto device_evaluator = Eigen::TensorEvaluator<DevExpr, Eigen::DefaultDevice>(device_expr.expr, Eigen::DefaultDevice()); + + auto buffer_ptr = ConvertToActualTypeSycl(CoeffReturnType, buffer_acc); + auto kernel_ptr = ConvertToActualTypeSycl(KernelType, kernel_filter); + const size_t num_x_input = (itemID.get_local_range()[0] +kernelSize_x -1); // number of input elements in x needed per plane in shared memory + const size_t num_y_input = (itemID.get_local_range()[1] +kernelSize_y -1); // number of input elements in y needed per plane in shared memory + const size_t num_z_input = (itemID.get_local_range()[2] +kernelSize_z -1); // number of input elements in z needed per plane in shared memory + const size_t first_x_input_start = itemID.get_group(0)*itemID.get_local_range()[0]; + const size_t first_y_input_start = itemID.get_group(1)*itemID.get_local_range()[1]; + const size_t first_z_input_start = itemID.get_group(2)*itemID.get_local_range()[2]; + for(size_t p=0; p<numP; p++){ + /// fill the shared memory + const size_t plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p); + for (size_t k = itemID.get_local(2); k < num_z_input; k += itemID.get_local_range()[2]) { + for (size_t j = itemID.get_local(1); j < num_y_input; j += itemID.get_local_range()[1]) { + for (size_t i = itemID.get_local(0); i < num_x_input ; i += itemID.get_local_range()[0]) { + const size_t local_index = i + (num_x_input * (j + (num_y_input * k))); + const size_t tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i + first_x_input_start, j+ first_y_input_start , k+ first_z_input_start ); + if(((i + first_x_input_start) < (range_x +kernelSize_x-1)) && ((j + first_y_input_start) < (range_y +kernelSize_y-1)) && ((k + first_z_input_start) < (range_z +kernelSize_z-1)) ){ + local_acc[local_index] = device_evaluator.coeff(tensor_index); + } + else local_acc[local_index]=0.0f; + } + } + } + itemID.barrier(cl::sycl::access::fence_space::local_space); + + // calculate the convolution + const size_t first_x_output_start =itemID.get_group(0)*(itemID.get_local_range()[0]); // x + const size_t first_y_output_start =itemID.get_group(1)*(itemID.get_local_range()[1]); // y + const size_t first_z_output_start =itemID.get_group(2)*(itemID.get_local_range()[2]); // z + + if(itemID.get_global(0)< range_x && itemID.get_global(1)< range_y && itemID.get_global(2)< range_z){ + CoeffReturnType result = static_cast<CoeffReturnType>(0); + for (size_t k = 0; k < kernelSize_z; k++) { + for (size_t j = 0; j < kernelSize_y; j++) { + for (size_t i = 0; i < kernelSize_x; i++) { + const size_t kernel_index =i + kernelSize_x * (j + kernelSize_y * k); + const size_t local_index = ((i+ itemID.get_local(0))+ num_x_input*((j+ itemID.get_local(1)) + num_y_input * (k+ itemID.get_local(2)))); + result += (local_acc[local_index] * kernel_ptr[kernel_index]); + } + } + } + const size_t tensor_index = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p) + +indexMapper.mapCudaOutputKernelToTensorOutputOffset(itemID.get_local(0) + first_x_output_start, itemID.get_local(1) + first_y_output_start, itemID.get_local(2) + first_z_output_start ); + buffer_ptr[tensor_index] = result; + } + + itemID.barrier(cl::sycl::access::fence_space::local_space); + } + } +};
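Before launching any of these kernels, the evaluator below sizes the local-memory allocation from the tile and kernel extents and asserts that it fits the device limit reported by sharedMemPerBlock(). A worked sketch of the sizing formula (numbers are illustrative):

```cpp
#include <cstddef>

// Mirror of the local-memory sizing in executeEval (3D case): one halo per
// convolved dimension. The evaluator allocates this many CoeffReturnType
// elements and asserts the count against m_device.sharedMemPerBlock().
std::size_t shared_mem_elems(std::size_t tx, std::size_t ty, std::size_t tz,
                             std::size_t kx, std::size_t ky, std::size_t kz) {
  return (tx + kx - 1) * (ty + ky - 1) * (tz + kz - 1);
}
// e.g. a 16x16x1 tile with a 3x3x1 kernel stages 18 * 18 * 1 = 324 elements.
```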
+ + +template<typename Indices, typename InputArgType, typename KernelArgType> +struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, const Eigen::SyclDevice> +{ + typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType; + + static const int NumDims = internal::array_size<typename TensorEvaluator<InputArgType, const Eigen::SyclDevice>::Dimensions>::value; + static const int NumKernelDims = internal::array_size<Indices>::value; + typedef typename XprType::Index Index; + typedef DSizes<Index, NumDims> Dimensions; + typedef typename TensorEvaluator<KernelArgType, const Eigen::SyclDevice>::Dimensions KernelDimensions; + typedef const Eigen::SyclDevice Device; + + enum { + IsAligned = TensorEvaluator<InputArgType, const Eigen::SyclDevice>::IsAligned & TensorEvaluator<KernelArgType, const Eigen::SyclDevice>::IsAligned, + PacketAccess = false, + Layout = TensorEvaluator<InputArgType, const Eigen::SyclDevice>::Layout, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Eigen::SyclDevice& device) + : m_inputImpl(op.inputExpression(), device), m_kernelArg(op.kernelExpression()), m_kernelImpl(op.kernelExpression(), device), m_indices(op.indices()), m_buf(NULL), m_kernel(NULL), m_local_kernel(false), m_device(device) + { + EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, const Eigen::SyclDevice>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, const Eigen::SyclDevice>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); + + const typename TensorEvaluator<InputArgType, const Eigen::SyclDevice>::Dimensions& input_dims = m_inputImpl.dimensions(); + const typename TensorEvaluator<KernelArgType, const Eigen::SyclDevice>::Dimensions& kernel_dims = m_kernelImpl.dimensions(); + + m_dimensions = m_inputImpl.dimensions(); + for (int i = 0; i < NumKernelDims; ++i) { + const Index index = op.indices()[i]; + const Index input_dim = input_dims[index]; + const Index kernel_dim = kernel_dims[i]; + const Index result_dim = input_dim - kernel_dim + 1; + m_dimensions[index] = result_dim; + } + } + + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType<CoeffReturnType, const Eigen::SyclDevice>::type PacketReturnType; + typedef typename InputArgType::Scalar Scalar; + static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { + preloadKernel(); + m_inputImpl.evalSubExprsIfNeeded(NULL); + if (data) { + executeEval(data); + return false; + } else { + m_buf = (Scalar*)m_device.allocate(dimensions().TotalSize() * sizeof(Scalar)); + executeEval(m_buf); + return true; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_inputImpl.cleanup(); + if (m_buf) { + m_device.deallocate(m_buf); + m_buf = NULL; + } + if (m_local_kernel) { + m_device.deallocate((void*)m_kernel); + m_local_kernel = false; + } + m_kernel = NULL; + } + /// used by sycl in order to build the sycl buffer + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const{return m_device;} + /// used by sycl in order to build the sycl buffer + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const { return m_buf; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() { + // Don't make a local copy of the kernel unless we have to (i.e. it's an + // expression that needs to be evaluated) + const Scalar* in_place = m_kernelImpl.data(); + if (in_place) { + m_kernel = in_place; + m_local_kernel = false; + } else { + size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar); + Scalar* local = (Scalar*)m_device.allocate(kernel_sz); + typedef TensorEvalToOp<const KernelArgType> EvalTo; + EvalTo evalToTmp(local, m_kernelArg); + const bool PacketAccess = internal::IsVectorizable<const Eigen::SyclDevice, KernelArgType>::value; + internal::TensorExecutor<const EvalTo, const Eigen::SyclDevice, PacketAccess>::run(evalToTmp, m_device); + m_kernel = local; + m_local_kernel = true; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void executeEval(Scalar* data) const { + typedef TensorEvaluator<InputArgType, const Eigen::SyclDevice> InputEvaluator; + typedef typename InputEvaluator::Dimensions InputDims; + + typedef Eigen::TensorSycl::internal::FunctorExtractor<InputEvaluator> InputFunctorExpr; + // extract input functor list + InputFunctorExpr input_functors = Eigen::TensorSycl::internal::extractFunctors(m_inputImpl); + + const unsigned long maxSharedMem = m_device.sharedMemPerBlock(); // SYCL local memory size + m_device.sycl_queue().submit([&](cl::sycl::handler &cgh) { + + typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> InputLocalAcc; + /// work-around for gcc 4.8 auto bug + typedef decltype(Eigen::TensorSycl::internal::createTupleOfAccessors<InputEvaluator>(cgh, m_inputImpl)) InputTupleType; + // create input tuple of accessors + InputTupleType tuple_of_accessors = Eigen::TensorSycl::internal::createTupleOfAccessors<InputEvaluator>(cgh, m_inputImpl); + + typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer> OutputAccessorType; + OutputAccessorType out_res= m_device. template get_sycl_accessor<cl::sycl::access::mode::discard_write>(cgh, data); + typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::read, cl::sycl::access::target::global_buffer> KernelAccessorType; + KernelAccessorType kernel_acc= m_device. 
template get_sycl_accessor<cl::sycl::access::mode::read>(cgh, m_kernel); + + switch (NumKernelDims) { + case 1: { + const size_t numX = dimensions()[m_indices[0]]; + const size_t numP = dimensions().TotalSize() / numX; + const size_t kernel_size = m_kernelImpl.dimensions().TotalSize(); + size_t range_x, GRange_x, tileSize_x, range_y, GRange_y, tileSize_y; + m_device.parallel_for_setup(numX, numP, tileSize_x,tileSize_y,range_x,range_y, GRange_x, GRange_y ); + const size_t shared_mem =(tileSize_x +kernel_size -1)*(tileSize_y); + assert(static_cast<unsigned long>(shared_mem) <= maxSharedMem); + auto global_range=cl::sycl::range<2>(GRange_x, GRange_y); // global range + auto local_range=cl::sycl::range<2>(tileSize_x, tileSize_y); // local range + InputLocalAcc local_acc(cl::sycl::range<1>(shared_mem), cgh); + const array<Index, 1> indices{m_indices[0]}; + const array<Index, 1> kernel_dims{{m_kernelImpl.dimensions()[0]}}; + internal::IndexMapper<Index, InputDims, 1, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices); + cgh.parallel_for(cl::sycl::nd_range<2>(global_range, local_range), + EigenConvolutionKernel1D<CoeffReturnType, Scalar, InputArgType, InputFunctorExpr, Index, + InputDims, KernelAccessorType, OutputAccessorType, InputLocalAcc, InputTupleType>( + indexMapper,kernel_acc, kernel_size, numX, numP, out_res, local_acc, input_functors, tuple_of_accessors)); + break; + } + + case 2: { + const size_t idxX =static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 1; + const size_t idxY =static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 0; + const size_t kernel_size_x = m_kernelImpl.dimensions()[idxX]; + const size_t kernel_size_y = m_kernelImpl.dimensions()[idxY]; + const size_t numX = dimensions()[m_indices[idxX]]; + const size_t numY = dimensions()[m_indices[idxY]]; + const size_t numP = dimensions().TotalSize() / (numX*numY); + size_t range_x, GRange_x, tileSize_x, range_y, GRange_y, tileSize_y, range_z, GRange_z, tileSize_z; + m_device.parallel_for_setup(numX, numY, numP, tileSize_x, tileSize_y, tileSize_z, range_x, range_y, range_z, GRange_x, GRange_y, GRange_z ); + const size_t shared_mem =(tileSize_x +kernel_size_x -1)*(tileSize_y +kernel_size_y -1) * tileSize_z; + assert(static_cast<unsigned long>(shared_mem) <= maxSharedMem); + auto global_range=cl::sycl::range<3>(GRange_x, GRange_y, GRange_z); // global range + auto local_range=cl::sycl::range<3>(tileSize_x, tileSize_y, tileSize_z); // local range + InputLocalAcc local_acc(cl::sycl::range<1>(shared_mem), cgh); + const array<Index, 2> indices {{m_indices[idxX], m_indices[idxY]}}; + const array<Index, 2> kernel_dims{{m_kernelImpl.dimensions()[idxX], m_kernelImpl.dimensions()[idxY]}}; + internal::IndexMapper<Index, InputDims, 2, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices); + cgh.parallel_for(cl::sycl::nd_range<3>(global_range, local_range), + EigenConvolutionKernel2D<CoeffReturnType, Scalar, InputArgType, InputFunctorExpr, Index, + InputDims, KernelAccessorType, OutputAccessorType, InputLocalAcc, InputTupleType>( + indexMapper,kernel_acc, kernel_size_x, kernel_size_y, numX, numY, numP, out_res, local_acc, input_functors, tuple_of_accessors)); + break; + } + + case 3: { + const size_t idxX =static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 2; + const size_t idxY =static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 1; + const size_t idxZ =static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 
2 : 0; + const size_t kernel_size_x = m_kernelImpl.dimensions()[idxX]; + const size_t kernel_size_y = m_kernelImpl.dimensions()[idxY]; + const size_t kernel_size_z = m_kernelImpl.dimensions()[idxZ]; + const size_t numX = dimensions()[m_indices[idxX]]; + const size_t numY = dimensions()[m_indices[idxY]]; + const size_t numZ = dimensions()[m_indices[idxZ]]; + const size_t numP = dimensions().TotalSize() / (numX*numY*numZ); + const array<Index, 3> indices{{m_indices[idxX], m_indices[idxY], m_indices[idxZ]}}; + const array<Index, 3> kernel_dims{{m_kernelImpl.dimensions()[idxX],m_kernelImpl.dimensions()[idxY], m_kernelImpl.dimensions()[idxZ]}}; + internal::IndexMapper<Index, InputDims, 3, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices); + size_t range_x, GRange_x, tileSize_x, range_y, GRange_y, tileSize_y, range_z, GRange_z, tileSize_z; + m_device.parallel_for_setup(numX, numY, numZ, tileSize_x, tileSize_y, tileSize_z, range_x, range_y, range_z, GRange_x, GRange_y, GRange_z ); + const size_t shared_mem =(tileSize_x +kernel_size_x -1)*(tileSize_y +kernel_size_y -1) * (tileSize_z +kernel_size_z -1); + assert(static_cast<unsigned long>(shared_mem) <= maxSharedMem); + auto global_range=cl::sycl::range<3>(GRange_x, GRange_y, GRange_z); // global range + auto local_range=cl::sycl::range<3>(tileSize_x, tileSize_y, tileSize_z); // local range + InputLocalAcc local_acc(cl::sycl::range<1>(shared_mem), cgh); + cgh.parallel_for(cl::sycl::nd_range<3>(global_range, local_range), + EigenConvolutionKernel3D<CoeffReturnType, Scalar, InputArgType, InputFunctorExpr, Index, + InputDims, KernelAccessorType, OutputAccessorType, InputLocalAcc, InputTupleType>( + indexMapper,kernel_acc, kernel_size_x, kernel_size_y, kernel_size_z, numX, numY, + numZ, numP, out_res, local_acc, input_functors, tuple_of_accessors)); + break; + } + + default: { + EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3), THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE); + } + } + }); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + eigen_assert(m_buf); + eigen_assert(index < m_dimensions.TotalSize()); + return m_buf[index]; + } + + template<int LoadMode> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const + { + eigen_assert(m_buf); + eigen_assert(index < m_dimensions.TotalSize()); + return internal::ploadt<PacketReturnType, LoadMode>(m_buf+index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + // TODO(rmlarsen): FIXME: For now, this is just a copy of the CPU cost + // model. + const double kernel_size = m_kernelImpl.dimensions().TotalSize(); + // We ignore the use of fused multiply-add. 
+ const double convolve_compute_cost = + TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>(); + const double firstIndex_compute_cost = + NumDims * + (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() + + TensorOpCost::DivCost<Index>()); + return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) + + kernel_size * (m_inputImpl.costPerCoeff(vectorized) + + m_kernelImpl.costPerCoeff(vectorized) + + TensorOpCost(0, 0, convolve_compute_cost, vectorized, + PacketSize)); + } + + private: + // No assignment (copies are needed by the kernels) + TensorEvaluator& operator = (const TensorEvaluator&); + TensorEvaluator<InputArgType, const Eigen::SyclDevice> m_inputImpl; + KernelArgType m_kernelArg; + TensorEvaluator<KernelArgType, const Eigen::SyclDevice> m_kernelImpl; + Indices m_indices; + Dimensions m_dimensions; + Scalar* m_buf; + const Scalar* m_kernel; + bool m_local_kernel; + const Eigen::SyclDevice& m_device; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h index 268d9d148..ae8a9f667 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h @@ -15,16 +15,16 @@ #if defined(EIGEN_USE_SYCL) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H) #define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H +#include "TensorSyclLegacyPointer.h" + namespace Eigen { #define ConvertToActualTypeSycl(Scalar, buf_acc) reinterpret_cast<typename cl::sycl::global_ptr<Scalar>::pointer_t>((&(*buf_acc.get_pointer()))) - template <typename Scalar> class MemCopyFunctor { + template <typename Scalar, typename read_accessor, typename write_accessor> class MemCopyFunctor { public: - typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::read, cl::sycl::access::target::global_buffer> read_accessor; - typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer> write_accessor; - - MemCopyFunctor(read_accessor src_acc, write_accessor dst_acc, size_t rng, size_t i, size_t offset): m_src_acc(src_acc), m_dst_acc(dst_acc), m_rng(rng), m_i(i), m_offset(offset) {} + MemCopyFunctor(read_accessor src_acc, write_accessor dst_acc, size_t rng, size_t i, size_t offset) + : m_src_acc(src_acc), m_dst_acc(dst_acc), m_rng(rng), m_i(i), m_offset(offset) {} void operator()(cl::sycl::nd_item<1> itemID) { auto src_ptr = ConvertToActualTypeSycl(Scalar, m_src_acc); @@ -55,6 +55,7 @@ namespace Eigen { }; + EIGEN_STRONG_INLINE auto get_sycl_supported_devices()->decltype(cl::sycl::device::get_devices()){ auto devices = cl::sycl::device::get_devices(); std::vector<cl::sycl::device>::iterator it =devices.begin(); @@ -77,11 +78,10 @@ struct QueueInterface { bool exception_caught_ = false; mutable std::mutex mutex_; - /// std::map is the container used to make sure that we create only one buffer /// per pointer. The lifespan of the buffer now depends on the lifespan of SyclDevice. /// If a non-read-only pointer is needed to be accessed on the host we should manually deallocate it. 
- mutable std::map<const uint8_t *, cl::sycl::buffer<uint8_t, 1>> buffer_map; + //mutable std::map<const uint8_t *, cl::sycl::buffer<uint8_t, 1>> buffer_map; /// sycl queue mutable cl::sycl::queue m_queue; /// creating device by using cl::sycl::selector or cl::sycl::device both are the same and can be captured through dev_Selector typename @@ -119,49 +119,42 @@ m_queue(cl::sycl::queue(s, [&](cl::sycl::exception_list l) { /// use this pointer as a key in our buffer_map and we make sure that we dedicate only one buffer only for this pointer. /// The device pointer would be deleted by calling deallocate function. EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { - auto buf = cl::sycl::buffer<uint8_t,1>(cl::sycl::range<1>(num_bytes)); - auto ptr =buf.get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::host_buffer>().get_pointer(); - buf.set_final_data(nullptr); std::lock_guard<std::mutex> lock(mutex_); - buffer_map.insert(std::pair<const uint8_t *, cl::sycl::buffer<uint8_t, 1>>(static_cast<const uint8_t*>(ptr),buf)); - return static_cast<void*>(ptr); + return codeplay::legacy::malloc(num_bytes); } /// This is used to deallocate the device pointer. p is used as a key inside /// the map to find the device buffer and delete it. EIGEN_STRONG_INLINE void deallocate(void *p) const { std::lock_guard<std::mutex> lock(mutex_); - auto it = buffer_map.find(static_cast<const uint8_t*>(p)); - if (it != buffer_map.end()) { - auto num_bytes =it->second.get_size(); - buffer_map.erase(it); - // Temporary solution for memory leak in computecpp. It will be fixed in the next computecpp version - std::allocator<uint8_t> a1; // Default allocator for buffer<uint8_t,1> - a1.deallocate(static_cast<uint8_t*>(p), num_bytes); - } + return codeplay::legacy::free(p); } EIGEN_STRONG_INLINE void deallocate_all() const { std::lock_guard<std::mutex> lock(mutex_); - buffer_map.clear(); + codeplay::legacy::clear(); } - EIGEN_STRONG_INLINE std::map<const uint8_t *, cl::sycl::buffer<uint8_t,1>>::iterator find_buffer(const void* ptr) const { + EIGEN_STRONG_INLINE codeplay::legacy::PointerMapper& pointerMapper() const { std::lock_guard<std::mutex> lock(mutex_); - auto it1 = buffer_map.find(static_cast<const uint8_t*>(ptr)); - if (it1 != buffer_map.end()){ - return it1; - } - else{ - for(std::map<const uint8_t *, cl::sycl::buffer<uint8_t,1>>::iterator it=buffer_map.begin(); it!=buffer_map.end(); ++it){ - auto size = it->second.get_size(); - if((it->first < (static_cast<const uint8_t*>(ptr))) && ((static_cast<const uint8_t*>(ptr)) < (it->first + size)) ) return it; - } - } - std::cerr << "No sycl buffer found. Make sure that you have allocated memory for your buffer by calling allocate function in SyclDevice"<< std::endl; - abort(); + return codeplay::legacy::getPointerMapper(); + } + + EIGEN_STRONG_INLINE cl::sycl::buffer<uint8_t,1> get_buffer(void* ptr) const { + std::lock_guard<std::mutex> lock(mutex_); + return pointerMapper().get_buffer(pointerMapper().get_buffer_id(ptr)); } + EIGEN_STRONG_INLINE size_t get_buffer_offset(void* ptr) const { + std::lock_guard<std::mutex> lock(mutex_); + return pointerMapper().get_offset(ptr); + } + + /*EIGEN_STRONG_INLINE void* get_buffer_id(void* ptr) const { + std::lock_guard<std::mutex> lock(mutex_); + return static_cast<void*>(pointerMapper().get_buffer_id(ptr)); + }*/ + // This function checks if the runtime recorded an error for the // underlying stream device. 
EIGEN_STRONG_INLINE bool ok() const { @@ -172,7 +165,7 @@ m_queue(cl::sycl::queue(s, [&](cl::sycl::exception_list l) { } // destructor - ~QueueInterface() { buffer_map.clear(); } + ~QueueInterface() { codeplay::legacy::clear(); } }; struct SyclDevice { @@ -190,14 +183,20 @@ struct SyclDevice { } /// Accessing the created sycl device buffer for the device pointer - EIGEN_STRONG_INLINE cl::sycl::buffer<uint8_t, 1>& get_sycl_buffer(const void * ptr) const { - return m_queue_stream->find_buffer(ptr)->second; + EIGEN_STRONG_INLINE cl::sycl::buffer<uint8_t, 1> get_sycl_buffer(const void * ptr) const { + return m_queue_stream->get_buffer(const_cast<void*>(ptr)); } + /// This is used to prepare the number of threads and also the number of threads per block for sycl kernels template<typename Index> EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize, Index &rng, Index &GRange) const { - tileSize =static_cast<Index>(sycl_queue().get_device(). template get_info<cl::sycl::info::device::max_work_group_size>()/2); + tileSize =static_cast<Index>(sycl_queue().get_device(). template get_info<cl::sycl::info::device::max_work_group_size>()); + auto s= sycl_queue().get_device().template get_info<cl::sycl::info::device::vendor>(); + std::transform(s.begin(), s.end(), s.begin(), ::tolower); + if(sycl_queue().get_device().is_cpu()){ // Intel does not allow using the max work-group size + tileSize=std::min(static_cast<size_t>(256), static_cast<size_t>(tileSize)); + } rng = n; if (rng==0) rng=static_cast<Index>(1); GRange=rng; @@ -207,6 +206,76 @@ struct SyclDevice { if (xMode != 0) GRange += static_cast<Index>(tileSize - xMode); } }
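The overloads of parallel_for_setup all round the global range up to the next multiple of the tile (work-group) size, since SYCL requires the global nd-range to be divisible by the local range; out-of-range work-items are masked off inside the kernels. A worked example of that rounding (illustrative, not part of the patch):

```cpp
#include <cstddef>

// Rounding logic used by parallel_for_setup: n = 1000 elements with
// tileSize = 256 gives 1000 % 256 == 232, so GRange is padded by
// 256 - 232 = 24 up to 1024, a multiple of the work-group size.
std::size_t round_up_global_range(std::size_t rng, std::size_t tileSize) {
  std::size_t xMode = rng % tileSize;
  return (xMode == 0) ? rng : rng + (tileSize - xMode);
}
```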
+ + /// This is used to prepare the number of threads and also the number of threads per block for sycl kernels + template<typename Index> + EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1, Index &tileSize0, Index &tileSize1, Index &rng0, Index &rng1, Index &GRange0, Index &GRange1) const { + Index max_workgroup_Size = static_cast<Index>(maxSyclThreadsPerBlock()); + if(sycl_queue().get_device().is_cpu()){ // Intel does not allow using the max work-group size + max_workgroup_Size=std::min(static_cast<size_t>(256), static_cast<size_t>(max_workgroup_Size)); + } + size_t pow_of_2 = static_cast<size_t>(std::log2(max_workgroup_Size)); + tileSize1 =static_cast<Index>(std::pow(2, static_cast<size_t>(pow_of_2/2))); + rng1=dim1; + if (rng1==0 ) rng1=static_cast<Index>(1); + GRange1=rng1; + if (tileSize1>GRange1) tileSize1=GRange1; + else if(GRange1>tileSize1){ + Index xMode = static_cast<Index>(GRange1 % tileSize1); + if (xMode != 0) GRange1 += static_cast<Index>(tileSize1 - xMode); + } + tileSize0 = static_cast<Index>(max_workgroup_Size/tileSize1); + rng0 = dim0; + if (rng0==0 ) rng0=static_cast<Index>(1); + GRange0=rng0; + if (tileSize0>GRange0) tileSize0=GRange0; + else if(GRange0>tileSize0){ + Index xMode = static_cast<Index>(GRange0 % tileSize0); + if (xMode != 0) GRange0 += static_cast<Index>(tileSize0 - xMode); + } + } + + + + /// This is used to prepare the number of threads and also the number of threads per block for sycl kernels + template<typename Index> + EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1,Index dim2, Index &tileSize0, Index &tileSize1, Index &tileSize2, Index &rng0, Index &rng1, Index &rng2, Index &GRange0, Index &GRange1, Index &GRange2) const { + Index max_workgroup_Size = static_cast<Index>(maxSyclThreadsPerBlock()); + if(sycl_queue().get_device().is_cpu()){ // Intel does not allow using the max work-group size + max_workgroup_Size=std::min(static_cast<size_t>(256), static_cast<size_t>(max_workgroup_Size)); + } + size_t pow_of_2 = static_cast<size_t>(std::log2(max_workgroup_Size)); + tileSize2 =static_cast<Index>(std::pow(2, static_cast<size_t>(pow_of_2/3))); + rng2=dim2; + if (rng2==0 ) rng2=static_cast<Index>(1); + GRange2=rng2; + if (tileSize2>GRange2) tileSize2=GRange2; + else if(GRange2>tileSize2){ + Index xMode = static_cast<Index>(GRange2 % tileSize2); + if (xMode != 0) GRange2 += static_cast<Index>(tileSize2 - xMode); + } + pow_of_2 = static_cast<size_t>(std::log2(static_cast<Index>(max_workgroup_Size/tileSize2))); + tileSize1 =static_cast<Index>(std::pow(2, static_cast<size_t>(pow_of_2/2))); + rng1=dim1; + if (rng1==0 ) rng1=static_cast<Index>(1); + GRange1=rng1; + if (tileSize1>GRange1) tileSize1=GRange1; + else if(GRange1>tileSize1){ + Index xMode = static_cast<Index>(GRange1 % tileSize1); + if (xMode != 0) GRange1 += static_cast<Index>(tileSize1 - xMode); + } + tileSize0 = static_cast<Index>(max_workgroup_Size/(tileSize1*tileSize2)); + rng0 = dim0; + if (rng0==0 ) rng0=static_cast<Index>(1); + GRange0=rng0; + if (tileSize0>GRange0) tileSize0=GRange0; + else if(GRange0>tileSize0){ + Index xMode = static_cast<Index>(GRange0 % tileSize0); + if (xMode != 0) GRange0 += static_cast<Index>(tileSize0 - xMode); + } + } + + /// allocate device memory EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const { return m_queue_stream->allocate(num_bytes); @@ -220,21 +289,21 @@ struct SyclDevice { EIGEN_STRONG_INLINE bool isDeviceSuitable() const { return true; } /// the memcpy function - template<typename T> EIGEN_STRONG_INLINE void memcpy(void *dst, const T *src, size_t n) const { - auto it1 = m_queue_stream->find_buffer((void*)src); - auto it2 = m_queue_stream->find_buffer(dst); - auto offset= (static_cast<const uint8_t*>(static_cast<const void*>(src))) - it1->first; - auto i= (static_cast<const uint8_t*>(dst)) - it2->first; - offset/=sizeof(T); - i/=sizeof(T); + template<typename Index> EIGEN_STRONG_INLINE void memcpy(void *dst, const Index *src, size_t n) const { + auto offset= m_queue_stream->get_buffer_offset((void*)src); + auto i= m_queue_stream->get_buffer_offset(dst); + offset/=sizeof(Index); + i/=sizeof(Index); size_t rng, GRange, tileSize; - parallel_for_setup(n/sizeof(T), tileSize, rng, GRange); + parallel_for_setup(n/sizeof(Index), tileSize, rng, GRange); sycl_queue().submit([&](cl::sycl::handler &cgh) { - auto src_acc =it1->second.template get_access<cl::sycl::access::mode::read, cl::sycl::access::target::global_buffer>(cgh); - auto dst_acc =it2->second.template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer>(cgh); - cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor<T>(src_acc, dst_acc, rng, i, offset)); + auto src_acc =get_sycl_accessor<cl::sycl::access::mode::read>(cgh, src); + auto dst_acc =get_sycl_accessor<cl::sycl::access::mode::write>(cgh, dst); + typedef decltype(src_acc) read_accessor; + typedef decltype(dst_acc) write_accessor; + cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor<Index, read_accessor, write_accessor>(src_acc, dst_acc, rng, i, offset)); }); - asynchronousExec(); + synchronize(); } /// The memcpyHostToDevice is used to copy data from a host pointer to a device-only pointer. Using the device @@ -246,26 +315,28 @@ struct SyclDevice { auto host_acc= get_sycl_buffer(dst). 
template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::host_buffer>(); ::memcpy(host_acc.get_pointer(), src, n); } + /// The memcpyDeviceToHost is used to copy the data from device to host. Here, in order to avoid double copying the data, we create a sycl /// buffer with map_allocator for the destination pointer with a discard_write accessor on it. The lifespan of the buffer is bound to the /// lifespan of the memcpyDeviceToHost function. We create a kernel to copy the data, from the device-only source buffer to the destination /// buffer with map_allocator on the gpu in parallel. At the end of the function call the destination buffer would be destroyed and the data /// would be available on the dst pointer using fast copy technique (map_allocator). In this case we can make sure that we copy the data back /// to the cpu only once per function call. - template<typename T> EIGEN_STRONG_INLINE void memcpyDeviceToHost(void *dst, const T *src, size_t n) const { - auto it = m_queue_stream->find_buffer(src); - auto offset =static_cast<const uint8_t*>(static_cast<const void*>(src))- it->first; - offset/=sizeof(T); + template<typename Index> EIGEN_STRONG_INLINE void memcpyDeviceToHost(void *dst, const Index *src, size_t n) const { + auto offset =m_queue_stream->get_buffer_offset((void *)src); + offset/=sizeof(Index); size_t rng, GRange, tileSize; - parallel_for_setup(n/sizeof(T), tileSize, rng, GRange); + parallel_for_setup(n/sizeof(Index), tileSize, rng, GRange); // Assuming that the dst is the start of the destination pointer auto dest_buf = cl::sycl::buffer<uint8_t, 1, cl::sycl::map_allocator<uint8_t> >(static_cast<uint8_t*>(dst), cl::sycl::range<1>(n)); sycl_queue().submit([&](cl::sycl::handler &cgh) { - auto src_acc= it->second.template get_access<cl::sycl::access::mode::read, cl::sycl::access::target::global_buffer>(cgh); + auto src_acc= get_sycl_accessor<cl::sycl::access::mode::read>(cgh, src); auto dst_acc =dest_buf.template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer>(cgh); + typedef decltype(src_acc) read_accessor; + typedef decltype(dst_acc) write_accessor; + cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor<Index, read_accessor, write_accessor>(src_acc, dst_acc, rng, 0, offset)); }); - asynchronousExec(); + synchronize(); } /// returning the sycl queue EIGEN_STRONG_INLINE cl::sycl::queue& sycl_queue() const { return m_queue_stream->m_queue;} @@ -273,8 +344,9 @@ struct SyclDevice { EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const { size_t rng, GRange, tileSize; parallel_for_setup(n, tileSize, rng, GRange); - sycl_queue().submit(memsetCghFunctor(get_sycl_buffer(static_cast<uint8_t*>(static_cast<void*>(data))),rng, GRange, tileSize, c )); - asynchronousExec(); + auto buf =get_sycl_buffer(static_cast<uint8_t*>(static_cast<void*>(data))); + sycl_queue().submit(memsetCghFunctor(buf,rng, GRange, tileSize, c )); + synchronize(); } struct memsetCghFunctor{ @@ -300,6 +372,24 @@ struct SyclDevice { // there is no l3 cache on sycl devices. return firstLevelCacheSize(); } + EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const { + return sycl_queue().get_device(). 
template get_info<cl::sycl::info::device::max_compute_units>(); + // return stream_->deviceProperties().multiProcessorCount; + } + EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const { + return sycl_queue().get_device(). template get_info<cl::sycl::info::device::max_work_group_size>(); + + // return stream_->deviceProperties().maxThreadsPerBlock; + } + EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const { + // OpenCL does not have such a concept + return 2;//sycl_queue().get_device(). template get_info<cl::sycl::info::device::max_work_group_size>(); + // return stream_->deviceProperties().maxThreadsPerMultiProcessor; + } + EIGEN_STRONG_INLINE int sharedMemPerBlock() const { + return sycl_queue().get_device(). template get_info<cl::sycl::info::device::local_mem_size>(); + // return stream_->deviceProperties().sharedMemPerBlock; + } /// No need for sycl it should act the same as CPU version EIGEN_STRONG_INLINE int majorDeviceVersion() const { return 1; } @@ -308,7 +398,7 @@ struct SyclDevice { } EIGEN_STRONG_INLINE void asynchronousExec() const { - ///FIXEDME:: currently there is a race condition regarding the asynch scheduler. + ///FIXME: currently there is a race condition regarding the async scheduler. //sycl_queue().throw_asynchronous();// does not pass. Temporarily disabled sycl_queue().wait_and_throw(); //pass diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h index 822e22c2d..abe85c860 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h @@ -143,12 +143,12 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device> return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize); } - CoeffReturnType* data() const { return m_buffer; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const { return m_buffer; } /// required by sycl in order to extract the sycl accessor - const TensorEvaluator<ArgType, Device>& impl() { return m_impl; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() { return m_impl; } /// used by sycl in order to build the sycl buffer - const Device& device() const{return m_device;} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const{return m_device;} private: TensorEvaluator<ArgType, Device> m_impl; const ArgType m_op; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index dbe11c7af..6ddd2ca18 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -736,22 +736,12 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices, for (size_t i = 0; i < internal::array_size<Dimensions>::value; ++i) { eigen_assert(m_strides[i] != 0 && "0 stride is invalid"); if(m_strides[i]>0){ - #ifndef __SYCL_DEVICE_ONLY__ startIndicesClamped[i] = clamp(op.startIndices()[i], 0, m_impl.dimensions()[i]); stopIndicesClamped[i] = clamp(op.stopIndices()[i], 0, m_impl.dimensions()[i]); - #else - startIndicesClamped[i] = cl::sycl::clamp(static_cast<Index>(op.startIndices()[i]), static_cast<Index>(0), static_cast<Index>(m_impl.dimensions()[i])); - stopIndicesClamped[i] = cl::sycl::clamp(static_cast<Index>(op.stopIndices()[i]), static_cast<Index>(0), static_cast<Index>(m_impl.dimensions()[i])); - #endif }else{ /* implies m_strides[i]<0 by assert */ - #ifndef 
__SYCL_DEVICE_ONLY__ startIndicesClamped[i] = clamp(op.startIndices()[i], -1, m_impl.dimensions()[i] - 1); stopIndicesClamped[i] = clamp(op.stopIndices()[i], -1, m_impl.dimensions()[i] - 1); - #else - startIndicesClamped[i] = cl::sycl::clamp(static_cast<Index>(op.startIndices()[i]), static_cast<Index>(-1), static_cast<Index>(m_impl.dimensions()[i] - 1)); - stopIndicesClamped[i] = cl::sycl::clamp(static_cast<Index>(op.stopIndices()[i]), static_cast<Index>(-1), static_cast<Index>(m_impl.dimensions()[i] - 1)); - #endif } m_startIndices[i] = startIndicesClamped[i]; } @@ -867,7 +857,11 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices, } static EIGEN_STRONG_INLINE Index clamp(Index value, Index min, Index max) { +#ifndef __SYCL_DEVICE_ONLY__ return numext::maxi(min, numext::mini(max,value)); +#else + return cl::sycl::clamp(value, min, max); +#endif } array<Index, NumDims> m_outputStrides; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h index 82ca71215..8ecef59a8 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h @@ -108,7 +108,7 @@ struct FullReducer<Self, Op, const Eigen::SyclDevice, Vectorizable> { // Dims dims= self.xprDims(); //Op functor = reducer; dev.sycl_queue().submit([&](cl::sycl::handler &cgh) { - // this is a work around for gcc bug + // this is a workaround for gcc 4.8 bug typedef decltype(TensorSycl::internal::createTupleOfAccessors(cgh, self.impl())) TupleType; // create a tuple of accessors from Evaluator TupleType tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl()); @@ -148,7 +148,7 @@ struct InnerReducer<Self, Op, const Eigen::SyclDevice> { /// recursively apply reduction on it in order to reduce the whole. dev.parallel_for_setup(num_coeffs_to_preserve, tileSize, range, GRange); dev.sycl_queue().submit([&](cl::sycl::handler &cgh) { - // this is work around for gcc bug. + // this is workaround for gcc 4.8 bug. 
typedef decltype(TensorSycl::internal::createTupleOfAccessors(cgh, self.impl())) Tuple_of_Acc; // create a tuple of accessors from Evaluator Tuple_of_Acc tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl()); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h index 93615e5c2..e846257a9 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h @@ -121,11 +121,7 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device> { m_dimensions = m_impl.dimensions(); for (int i = 0; i < NumDims; ++i) { -#ifndef __SYCL_DEVICE_ONLY__ - m_dimensions[i] = ceilf(static_cast<float>(m_dimensions[i]) / op.strides()[i]); -#else - m_dimensions[i] = cl::sycl::ceil(static_cast<float>(m_dimensions[i]) / op.strides()[i]); -#endif + m_dimensions[i] =Eigen::numext::ceil(static_cast<float>(m_dimensions[i]) / op.strides()[i]); } const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions(); @@ -233,8 +229,6 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device> /// required by sycl in order to extract the accessor Strides functor() const { return m_strides; } - - protected: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const { @@ -299,10 +293,9 @@ struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device> } /// required by sycl in order to extract the accessor - const TensorEvaluator<ArgType, Device>& impl() const { return this->m_impl; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const { return this->m_impl; } /// required by sycl in order to extract the accessor - Strides functor() const { return this->m_strides; } - + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Strides functor() const { return this->m_strides; } template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h index 2e61ee049..84f660597 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h @@ -80,6 +80,9 @@ template<typename T> struct GetType<false, T>{ /// this is used for extracting tensor reduction #include "TensorReductionSycl.h" +/// this is used for extracting tensor convolution +#include "TensorConvolutionSycl.h" + // kernel execution using fusion #include "TensorSyclRun.h" //sycl functors diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h index 29f362ade..c0bcf26cd 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h @@ -97,7 +97,7 @@ template <typename Expr>\ struct ConvertToDeviceExpression<CVQual ExprNode<Expr> > \ : DeviceConvertor<ExprNode, Res, Expr>{}; -/// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorReductionOp +/// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorForcedEvalOp #define KERNELBROKERCONVERTFORCEDEVAL(CVQual)\ template <typename Expr>\ struct ConvertToDeviceExpression<CVQual TensorForcedEvalOp<Expr> > {\ diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h 
b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h index e4658eda5..3fd607941 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h @@ -35,6 +35,8 @@ namespace Eigen { namespace TensorSycl { namespace internal { +#define RETURN_CPP11(expr) ->decltype(expr) {return expr;} + /// \struct ExtractAccessor: Extract Accessor Class is used to extract the /// accessor from a buffer. /// Depending on the type of the leaf node we can get a read accessor or a @@ -44,22 +46,16 @@ struct ExtractAccessor; struct AccessorConstructor{ template<typename Arg> static inline auto getTuple(cl::sycl::handler& cgh, const Arg& eval) - -> decltype(ExtractAccessor<Arg>::getTuple(cgh, eval)) { - return ExtractAccessor<Arg>::getTuple(cgh, eval); - } + RETURN_CPP11(ExtractAccessor<Arg>::getTuple(cgh, eval)) template<typename Arg1, typename Arg2> static inline auto getTuple(cl::sycl::handler& cgh, const Arg1& eval1, const Arg2& eval2) - -> decltype(utility::tuple::append(ExtractAccessor<Arg1>::getTuple(cgh, eval1), ExtractAccessor<Arg2>::getTuple(cgh, eval2))) { - return utility::tuple::append(ExtractAccessor<Arg1>::getTuple(cgh, eval1), ExtractAccessor<Arg2>::getTuple(cgh, eval2)); - } + RETURN_CPP11(utility::tuple::append(ExtractAccessor<Arg1>::getTuple(cgh, eval1), ExtractAccessor<Arg2>::getTuple(cgh, eval2))) + template<typename Arg1, typename Arg2, typename Arg3> static inline auto getTuple(cl::sycl::handler& cgh, const Arg1& eval1 , const Arg2& eval2 , const Arg3& eval3) - -> decltype(utility::tuple::append(ExtractAccessor<Arg1>::getTuple(cgh, eval1),utility::tuple::append(ExtractAccessor<Arg2>::getTuple(cgh, eval2), ExtractAccessor<Arg3>::getTuple(cgh, eval3)))) { - return utility::tuple::append(ExtractAccessor<Arg1>::getTuple(cgh, eval1),utility::tuple::append(ExtractAccessor<Arg2>::getTuple(cgh, eval2), ExtractAccessor<Arg3>::getTuple(cgh, eval3))); - } + RETURN_CPP11(utility::tuple::append(ExtractAccessor<Arg1>::getTuple(cgh, eval1),utility::tuple::append(ExtractAccessor<Arg2>::getTuple(cgh, eval2), ExtractAccessor<Arg3>::getTuple(cgh, eval3)))) + template< cl::sycl::access::mode AcM, typename Arg> static inline auto getAccessor(cl::sycl::handler& cgh, const Arg& eval) - -> decltype(utility::tuple::make_tuple( eval.device().template get_sycl_accessor<AcM>(cgh,eval.data()))){ - return utility::tuple::make_tuple(eval.device().template get_sycl_accessor<AcM>(cgh,eval.data())); - } + RETURN_CPP11(utility::tuple::make_tuple(eval.device().template get_sycl_accessor<AcM>(cgh,eval.data()))) }; /// specialisation of the \ref ExtractAccessor struct when the node type is @@ -68,9 +64,7 @@ struct AccessorConstructor{ template <template<class, class> class UnaryCategory, typename OP, typename RHSExpr, typename Dev>\ struct ExtractAccessor<TensorEvaluator<CVQual UnaryCategory<OP, RHSExpr>, Dev> > {\ static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual UnaryCategory<OP, RHSExpr>, Dev>& eval)\ - -> decltype(AccessorConstructor::getTuple(cgh, eval.impl())){\ - return AccessorConstructor::getTuple(cgh, eval.impl());\ - }\ +RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.impl()))\ }; SYCLUNARYCATEGORYEXTACC(const) @@ -83,9 +77,7 @@ SYCLUNARYCATEGORYEXTACC() template <template<class, class, class> class BinaryCategory, typename OP, typename LHSExpr, typename RHSExpr, typename Dev>\ struct ExtractAccessor<TensorEvaluator<CVQual BinaryCategory<OP, LHSExpr, RHSExpr>, Dev> > {\ static inline auto 
 
 /// specialisation of the \ref ExtractAccessor struct when the node type is
@@ -68,9 +64,7 @@ struct AccessorConstructor{
 template <template<class, class> class UnaryCategory, typename OP, typename RHSExpr, typename Dev>\
 struct ExtractAccessor<TensorEvaluator<CVQual UnaryCategory<OP, RHSExpr>, Dev> > {\
   static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual UnaryCategory<OP, RHSExpr>, Dev>& eval)\
-  -> decltype(AccessorConstructor::getTuple(cgh, eval.impl())){\
-    return AccessorConstructor::getTuple(cgh, eval.impl());\
-  }\
+  RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.impl()))\
 };
 
 SYCLUNARYCATEGORYEXTACC(const)
@@ -83,9 +77,7 @@ SYCLUNARYCATEGORYEXTACC()
 template <template<class, class, class> class BinaryCategory, typename OP, typename LHSExpr, typename RHSExpr, typename Dev>\
 struct ExtractAccessor<TensorEvaluator<CVQual BinaryCategory<OP, LHSExpr, RHSExpr>, Dev> > {\
   static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual BinaryCategory<OP, LHSExpr, RHSExpr>, Dev>& eval)\
-  -> decltype(AccessorConstructor::getTuple(cgh, eval.left_impl(), eval.right_impl())){\
-    return AccessorConstructor::getTuple(cgh, eval.left_impl(), eval.right_impl());\
-  }\
+  RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.left_impl(), eval.right_impl()))\
 };
 
 SYCLBINARYCATEGORYEXTACC(const)
@@ -98,9 +90,7 @@ SYCLBINARYCATEGORYEXTACC()
 template <template<class, class, class, class> class TernaryCategory, typename OP, typename Arg1Expr, typename Arg2Expr, typename Arg3Expr, typename Dev>\
 struct ExtractAccessor<TensorEvaluator<CVQual TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev> > {\
   static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev>& eval)\
-  -> decltype(AccessorConstructor::getTuple(cgh, eval.arg1Impl(), eval.arg2Impl(), eval.arg3Impl())){\
-    return AccessorConstructor::getTuple(cgh, eval.arg1Impl(), eval.arg2Impl(), eval.arg3Impl());\
-  }\
+  RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.arg1Impl(), eval.arg2Impl(), eval.arg3Impl()))\
 };
 
 SYCLTERNARYCATEGORYEXTACC(const)
@@ -114,9 +104,7 @@ SYCLTERNARYCATEGORYEXTACC()
 template <typename IfExpr, typename ThenExpr, typename ElseExpr, typename Dev>\
 struct ExtractAccessor<TensorEvaluator<CVQual TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev> > {\
   static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev>& eval)\
-  -> decltype(AccessorConstructor::getTuple(cgh, eval.cond_impl(), eval.then_impl(), eval.else_impl())){\
-    return AccessorConstructor::getTuple(cgh, eval.cond_impl(), eval.then_impl(), eval.else_impl());\
-  }\
+  RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.cond_impl(), eval.then_impl(), eval.else_impl()))\
 };
 
 SYCLSELECTOPEXTACC(const)
@@ -128,9 +116,7 @@ SYCLSELECTOPEXTACC()
 template <typename LHSExpr, typename RHSExpr, typename Dev>\
 struct ExtractAccessor<TensorEvaluator<CVQual TensorAssignOp<LHSExpr, RHSExpr>, Dev> > {\
   static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorAssignOp<LHSExpr, RHSExpr>, Dev>& eval)\
-  -> decltype(AccessorConstructor::getTuple(cgh, eval.left_impl(), eval.right_impl())){\
-    return AccessorConstructor::getTuple(cgh, eval.left_impl(), eval.right_impl());\
-  }\
+  RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.left_impl(), eval.right_impl()))\
 };
 
 SYCLTENSORASSIGNOPEXTACC(const)
@@ -142,9 +128,7 @@ struct ExtractAccessor<TensorEvaluator<CVQual TensorAssignOp<LHSExpr, RHSExpr>,
 template <typename PlainObjectType, int Options_, typename Dev>\
 struct ExtractAccessor<TensorEvaluator<CVQual TensorMap<PlainObjectType, Options_>, Dev> > {\
   static inline auto getTuple(cl::sycl::handler& cgh,const TensorEvaluator<CVQual TensorMap<PlainObjectType, Options_>, Dev>& eval)\
-  -> decltype(AccessorConstructor::template getAccessor<ACCType>(cgh, eval)){\
-    return AccessorConstructor::template getAccessor<ACCType>(cgh, eval);\
-  }\
+  RETURN_CPP11(AccessorConstructor::template getAccessor<ACCType>(cgh, eval))\
 };
 
 TENSORMAPEXPR(const, cl::sycl::access::mode::read)
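TENSORMAPEXPR is instantiated twice so that the access mode of the extracted accessor follows the constness of the TensorMap leaf: const maps request read access, mutable ones read_write. Hand-expanding TENSORMAPEXPR(const, cl::sycl::access::mode::read), with formatting approximated, gives:

template <typename PlainObjectType, int Options_, typename Dev>
struct ExtractAccessor<TensorEvaluator<const TensorMap<PlainObjectType, Options_>, Dev> > {
  // Requests a read-only SYCL accessor for the buffer behind the map.
  static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<const TensorMap<PlainObjectType, Options_>, Dev>& eval)
  RETURN_CPP11(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval))
};

The read_write instantiation is identical apart from the access mode and the missing const qualifier.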
@@ -156,9 +140,7 @@ TENSORMAPEXPR(, cl::sycl::access::mode::read_write)
 template <typename Expr, typename Dev>\
 struct ExtractAccessor<TensorEvaluator<CVQual TensorForcedEvalOp<Expr>, Dev> > {\
   static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorForcedEvalOp<Expr>, Dev>& eval)\
-  -> decltype(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval)){\
-    return AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval);\
-  }\
+  RETURN_CPP11(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval))\
 };
 
 SYCLFORCEDEVALEXTACC(const)
@@ -171,9 +153,7 @@ SYCLFORCEDEVALEXTACC()
 template <typename Expr, typename Dev>\
 struct ExtractAccessor<TensorEvaluator<CVQual TensorEvalToOp<Expr>, Dev> > {\
   static inline auto getTuple(cl::sycl::handler& cgh,const TensorEvaluator<CVQual TensorEvalToOp<Expr>, Dev>& eval)\
-  -> decltype(utility::tuple::append(AccessorConstructor::template getAccessor<cl::sycl::access::mode::write>(cgh, eval), AccessorConstructor::getTuple(cgh, eval.impl()))){\
-    return utility::tuple::append(AccessorConstructor::template getAccessor<cl::sycl::access::mode::write>(cgh, eval), AccessorConstructor::getTuple(cgh, eval.impl()));\
-  }\
+  RETURN_CPP11(utility::tuple::append(AccessorConstructor::template getAccessor<cl::sycl::access::mode::write>(cgh, eval), AccessorConstructor::getTuple(cgh, eval.impl())))\
 };
 
 SYCLEVALTOEXTACC(const)
@@ -185,23 +165,19 @@ SYCLEVALTOEXTACC()
 template <typename OP, typename Dim, typename Expr, typename Dev>\
 struct ExtractAccessor<TensorEvaluator<CVQual TensorReductionOp<OP, Dim, Expr>, Dev> > {\
   static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorReductionOp<OP, Dim, Expr>, Dev>& eval)\
-  -> decltype(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval)){\
-    return AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval);\
-  }\
+  RETURN_CPP11(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval))\
 };
 
 SYCLREDUCTIONEXTACC(const)
 SYCLREDUCTIONEXTACC()
 #undef SYCLREDUCTIONEXTACC
 
-/// specialisation of the \ref ExtractAccessor struct when the node type is TensorReductionOp
+/// specialisation of the \ref ExtractAccessor struct when the node type is TensorContractionOp and TensorConvolutionOp
 #define SYCLCONTRACTIONCONVOLUTIONEXTACC(CVQual, ExprNode)\
 template<typename Indices, typename LhsXprType, typename RhsXprType, typename Dev>\
 struct ExtractAccessor<TensorEvaluator<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, Dev> > {\
   static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, Dev>& eval)\
-  -> decltype(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval)){\
-    return AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval);\
-  }\
+  RETURN_CPP11(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval))\
 };
 
 SYCLCONTRACTIONCONVOLUTIONEXTACC(const,TensorContractionOp)
@@ -212,27 +188,24 @@ SYCLCONTRACTIONCONVOLUTIONEXTACC(,TensorConvolutionOp)
 
 /// specialisation of the \ref ExtractAccessor struct when the node type is
-/// const TensorSlicingOp. This is a special case where there is no OP
+/// const TensorSlicingOp.
 #define SYCLSLICEOPEXTACC(CVQual)\
 template <typename StartIndices, typename Sizes, typename XprType, typename Dev>\
 struct ExtractAccessor<TensorEvaluator<CVQual TensorSlicingOp<StartIndices, Sizes, XprType>, Dev> > {\
   static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorSlicingOp<StartIndices, Sizes, XprType>, Dev>& eval)\
-  -> decltype(AccessorConstructor::getTuple(cgh, eval.impl())){\
-    return AccessorConstructor::getTuple(cgh, eval.impl());\
-  }\
+  RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.impl()))\
 };
 
 SYCLSLICEOPEXTACC(const)
 SYCLSLICEOPEXTACC()
 #undef SYCLSLICEOPEXTACC
 
-
+/// specialisation of the \ref ExtractAccessor struct when the node type is
+/// const TensorStridingSlicingOp.
 #define SYCLSLICESTRIDEOPEXTACC(CVQual)\
 template<typename StartIndices, typename StopIndices, typename Strides, typename XprType, typename Dev>\
 struct ExtractAccessor<TensorEvaluator<CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>, Dev> >{\
   static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>, Dev>& eval)\
-  -> decltype(AccessorConstructor::getTuple(cgh, eval.impl())){\
-    return AccessorConstructor::getTuple(cgh, eval.impl());\
-  }\
+  RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.impl()))\
 };
 
 SYCLSLICESTRIDEOPEXTACC(const)
 SYCLSLICESTRIDEOPEXTACC()
 #undef SYCLSLICESTRIDEOPEXTACC
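The extraction is organised as struct specialisations with a static getTuple, rather than as plain function overloads, because in C++11 each specialisation must be able to name the others inside its deduced return type; an overload set cannot refer to itself from its own trailing decltype. A host-only toy analogue of the recursion, using std::tuple in place of utility::tuple (every name in it is illustrative, not from the patch):

#include <tuple>
#include <string>

struct Leaf { std::string name; };
template <typename L, typename R> struct Node { L left; R right; };

// One "ExtractAccessor" per node shape, like the specialisations above.
template <typename T> struct Extract;

template <> struct Extract<Leaf> {
  static std::tuple<std::string> getTuple(const Leaf& l) { return std::make_tuple(l.name); }
};

template <typename L, typename R> struct Extract<Node<L, R> > {
  // A binary node concatenates the tuples of its children, so the
  // leaves end up in left-to-right traversal order.
  static auto getTuple(const Node<L, R>& n)
      -> decltype(std::tuple_cat(Extract<L>::getTuple(n.left), Extract<R>::getTuple(n.right))) {
    return std::tuple_cat(Extract<L>::getTuple(n.left), Extract<R>::getTuple(n.right));
  }
};

int main() {
  Node<Leaf, Node<Leaf, Leaf> > tree{{"out"}, {{"a"}, {"b"}}};
  auto acc = Extract<decltype(tree)>::getTuple(tree);  // ("out", "a", "b")
  return std::tuple_size<decltype(acc)>::value == 3 ? 0 : 1;
}

In the real code, AccessorConstructor plays the dispatcher role and the leaves produce SYCL accessors through get_sycl_accessor rather than strings.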
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLegacyPointer.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLegacyPointer.h
new file mode 100644
index 000000000..b723592cd
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLegacyPointer.h
@@ -0,0 +1,244 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Ruyman Reyes Codeplay Software Ltd
+// Mehdi Goli Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+/*****************************************************************
+ * TensorSyclLegacyPointer.h
+ *
+ * \brief:
+ *  Interface for SYCL buffers to behave as a non-dereferenceable pointer.
+ *  This can be found in Codeplay's ComputeCpp SDK : legacy_pointer.h
+ *
+ **************************************************************************/
+
+namespace codeplay {
+namespace legacy {
+
+/**
+ * PointerMapper
+ * Associates fake pointers with buffers.
+ */
+class PointerMapper {
+ public:
+  /* pointer information definitions
+   */
+  static const unsigned long ADDRESS_BITS = sizeof(void *) * 8;
+  static const unsigned long BUFFER_ID_BITSIZE = 16u;
+  static const unsigned long MAX_NUMBER_BUFFERS = (1UL << BUFFER_ID_BITSIZE) - 1;
+  static const unsigned long MAX_OFFSET = (1UL << (ADDRESS_BITS - BUFFER_ID_BITSIZE)) - 1;
+
+  using base_ptr_t = uintptr_t;
+
+  /* Fake pointers are constructed using an integer index plus
+   * the offset:
+   *
+   * |== MAX_BUFFERS ==|======== MAX_OFFSET ========|
+   * |    Buffer Id    |      Offset in buffer      |
+   * |=================|============================|
+   */
+  struct legacy_pointer_t {
+    /* Type for the pointers
+     */
+    base_ptr_t _contents;
+
+    /** Conversion from legacy_pointer_t to void *
+     * just reinterpret_casts the integer number.
+     */
+    operator void *() const { return reinterpret_cast<void *>(_contents); }
+
+    /**
+     * Convert back to the integer number.
+     */
+    operator base_ptr_t() const { return _contents; }
+
+    /**
+     * Converts a void * into a legacy pointer structure.
+     * Note that this will only work if the void * was
+     * already a legacy_pointer_t, but we have no way of
+     * checking.
+     */
+    legacy_pointer_t(void *ptr)
+        : _contents(reinterpret_cast<base_ptr_t>(ptr)){};
+
+    /**
+     * Creates a legacy_pointer_t from the given integer
+     * number.
+     */
+    legacy_pointer_t(base_ptr_t u) : _contents(u){};
+  };
+
+  /* Whether a pointer is null or not.
+   *
+   * A pointer is nullptr if the buffer id is 0,
+   * i.e. the top BUFFER_ID_BITSIZE bits are zero.
+   */
+  static inline bool is_nullptr(legacy_pointer_t ptr) {
+    return ((MAX_OFFSET & ptr) == ptr);
+  }
+
+  /* Base nullptr
+   */
+  const legacy_pointer_t null_legacy_ptr = nullptr;
+
+  /* Data type to create buffer of byte-size elements
+   */
+  using buffer_data_type = uint8_t;
+
+  /* basic type for all buffers
+   */
+  using buffer_t = cl::sycl::buffer<buffer_data_type, 1>;
+
+  /* id of a buffer in the map
+   */
+  typedef short buffer_id;
+
+  /* get_buffer_id
+   */
+  inline buffer_id get_buffer_id(legacy_pointer_t ptr) const {
+    return ptr >> (ADDRESS_BITS - BUFFER_ID_BITSIZE);
+  }
+
+  /* get_buffer_offset
+   */
+  inline off_t get_offset(legacy_pointer_t ptr) const {
+    return ptr & MAX_OFFSET;
+  }
+
+  /**
+   * Constructs the PointerMapper structure.
+   */
+  PointerMapper()
+      : __pointer_list{}, rng_(std::random_device()()), uni_(1, 256){};
+
+  /**
+   * PointerMapper cannot be copied or moved.
+   */
+  PointerMapper(const PointerMapper &) = delete;
+
+  /**
+   * Empty the pointer list.
+   */
+  inline void clear() {
+    __pointer_list.clear();
+  }
+
+  /* generate_id
+   * Generates a unique id for a buffer.
+   */
+  buffer_id generate_id() {
+    // Bound the number of attempts just to avoid an infinite loop
+    int numberOfAttempts = 1ul << (BUFFER_ID_BITSIZE / 2);
+    buffer_id bId;
+    do {
+      bId = uni_(rng_);
+    } while (__pointer_list.find(bId) != __pointer_list.end() &&
+             numberOfAttempts--);
+    return bId;
+  }
+
+  /* add_pointer.
+   * Adds a pointer to the map and returns the fake pointer id.
+   * This will be the bufferId on the most significant bits and 0 elsewhere.
+   */
+  legacy_pointer_t add_pointer(buffer_t &&b) {
+    auto nextNumber = __pointer_list.size();
+    buffer_id bId = generate_id();
+    __pointer_list.emplace(bId, b);
+    if (nextNumber > MAX_NUMBER_BUFFERS) {
+      return null_legacy_ptr;
+    }
+    base_ptr_t retVal = bId;
+    retVal <<= (ADDRESS_BITS - BUFFER_ID_BITSIZE);
+    return retVal;
+  }
+
+  /* get_buffer.
+   * Returns a buffer from the map using the buffer id.
+   */
+  buffer_t get_buffer(buffer_id bId) const {
+    auto it = __pointer_list.find(bId);
+    if (it != __pointer_list.end())
+      return it->second;
+    std::cerr << "No SYCL buffer found. Make sure that you have allocated memory for this pointer with the legacy malloc function." << std::endl;
+    abort();
+  }
+
+  /* remove_pointer.
+   * Removes the given pointer from the map.
+   */
+  void remove_pointer(void *ptr) {
+    buffer_id bId = this->get_buffer_id(ptr);
+    __pointer_list.erase(bId);
+  }
+
+  /* count.
+   * Returns the number of active pointers (i.e. pointers that
+   * have been malloc-ed but not freed).
+   */
+  size_t count() const { return __pointer_list.size(); }
+
+ private:
+  /* Maps the buffer id numbers to the actual buffer
+   * instances.
+   */
+  std::map<buffer_id, buffer_t> __pointer_list;
+
+  /* Random-number engine for the buffer ids
+   */
+  std::mt19937 rng_;
+
+  /* Uniform distribution for the buffer ids
+   */
+  std::uniform_int_distribution<short> uni_;
+};
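The arithmetic behind the fake pointers is plain bit packing: on a 64-bit host the top 16 bits carry the buffer id and the low 48 bits the byte offset, which is exactly what get_buffer_id and get_offset unpack. A self-contained round-trip check under that 64-bit assumption (the id and offset values are arbitrary):

#include <cassert>
#include <cstdint>

int main() {
  // Mirrors PointerMapper's constants: 16 id bits on top, 48 offset bits below.
  const std::uintptr_t ADDRESS_BITS = sizeof(void *) * 8;              // 64 assumed
  const std::uintptr_t BUFFER_ID_BITSIZE = 16u;
  const std::uintptr_t OFFSET_BITS = ADDRESS_BITS - BUFFER_ID_BITSIZE; // 48
  const std::uintptr_t MAX_OFFSET = (std::uintptr_t(1) << OFFSET_BITS) - 1;

  std::uintptr_t id = 42, offset = 1024;
  std::uintptr_t fake = (id << OFFSET_BITS) | offset;  // what add_pointer builds
  assert((fake >> OFFSET_BITS) == id);    // get_buffer_id
  assert((fake & MAX_OFFSET) == offset);  // get_offset
  assert((fake & MAX_OFFSET) != fake);    // is_nullptr is false: id bits non-zero
  return 0;
}

Ordinary pointer arithmetic only touches the low offset bits (until it exceeds MAX_OFFSET), so offsetting a fake pointer leaves the buffer id intact; that is what lets these values flow through code written for real pointers.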
 
+/**
+ * Singleton interface to the pointer mapper to implement
+ * the generic malloc/free C interface without extra
+ * parameters.
+ */
+inline PointerMapper &getPointerMapper() {
+  static PointerMapper thePointerMapper;
+  return thePointerMapper;
+}
+
+/**
+ * Malloc-like interface to the pointer-mapper.
+ * Given a size, creates a byte-typed buffer and returns a
+ * fake pointer to keep track of it.
+ */
+inline void *malloc(size_t size) {
+  // Create a generic buffer of the given size
+  auto thePointer = getPointerMapper().add_pointer(
+      PointerMapper::buffer_t(cl::sycl::range<1>{size}));
+  // Store the buffer on the global list
+  return static_cast<void *>(thePointer);
+}
+
+/**
+ * Free-like interface to the pointer mapper.
+ * Given a fake pointer created with the legacy-pointer malloc,
+ * destroys the buffer and removes it from the list.
+ */
+inline void free(void *ptr) { getPointerMapper().remove_pointer(ptr); }
+
+/**
+ * Clear the pointer list.
+ */
+inline void clear() {
+  getPointerMapper().clear();
+}
+
+}  // legacy
+}  // codeplay
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h
index 6ce41b0ab..94692be56 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h
@@ -49,7 +49,7 @@ template<typename Expr, typename FunctorExpr, typename TupleType > struct ExecEx
 /// based expression tree;
 /// creates the expression tree for the device with accessor to buffers;
 /// construct the kernel and submit it to the sycl queue.
-/// std::array does not have TotalSize. So I have to get the size throgh template specialisation.
+/// std::array does not have TotalSize. So I have to get the size through template specialisation.
 template<typename Index, typename Dimensions> struct DimensionSize{
   static Index getDimSize(const Dimensions& dim){
     return dim.TotalSize();
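To close, a usage sketch for the legacy-pointer interface defined above. It assumes a SYCL/ComputeCpp toolchain with the new header included; the element count is arbitrary, and the fake pointer must never be dereferenced on the host:

#include <CL/sycl.hpp>
// ...plus the TensorSyclLegacyPointer.h definitions above.

int main() {
  // Allocates a byte-typed cl::sycl::buffer and returns a fake pointer.
  float* data = static_cast<float*>(codeplay::legacy::malloc(1024 * sizeof(float)));
  // Offsetting stays within the low bits, so the buffer id is preserved.
  float* shifted = data + 256;
  // Device code would look the real buffer up via getPointerMapper(),
  // using the id encoded in the pointer.
  codeplay::legacy::free(data);  // destroys the buffer, unregisters the id
  return (shifted != data) ? 0 : 1;
}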