// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Mehdi Goli    Codeplay Software Ltd.
// Ralph Potter  Codeplay Software Ltd.
// Luke Iwanski  Codeplay Software Ltd.
// Contact: <eigen@codeplay.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

/*****************************************************************
 * TensorReductionSycl.h
 *
 * \brief:
 *  This is the specialisation of the reduction operation for the SYCL
 *  backend, based on the operation type.
 *
 *****************************************************************/

#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP
#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP

namespace Eigen {
namespace internal {

template <typename OP, typename CoeffReturnType> struct syclGenericBufferReducer {
  template <typename BufferTOut, typename BufferTIn>
  static void run(OP op, BufferTOut& bufOut, ptrdiff_t out_offset, BufferTIn& bufI,
                  const Eigen::SyclDevice& dev, size_t length, size_t local) {
    do {
      auto f = [length, local, op, out_offset, &bufOut, &bufI](cl::sycl::handler& h) mutable {
        cl::sycl::nd_range<1> r{cl::sycl::range<1>{std::max(length, local)},
                                cl::sycl::range<1>{std::min(length, local)}};
        /* Two accessors are used: one to the buffer that is being reduced,
         * and a second to local memory, used to store intermediate data. */
        auto aI = bufI.template get_access<cl::sycl::access::mode::read_write>(h);
        auto aOut = bufOut.template get_access<cl::sycl::access::mode::discard_write>(h);
        typedef decltype(aI) InputAccessor;
        typedef decltype(aOut) OutputAccessor;
        typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write,
                                   cl::sycl::access::target::local> LocalAccessor;
        LocalAccessor scratch(cl::sycl::range<1>(local), h);

        /* The parallel_for invocation chosen is the variant with an nd_item
         * parameter, since the code requires barriers for correctness. */
        h.parallel_for(r, TensorSycl::internal::GenericKernelReducer<CoeffReturnType, OutputAccessor, InputAccessor, LocalAccessor, OP>(
                              op, aOut, out_offset, aI, scratch, length, local));
      };
      dev.sycl_queue().submit(f);
      dev.asynchronousExec();

      /* At this point, you could queue::wait_and_throw() to ensure that
       * errors are caught quickly. However, this would likely impact
       * performance negatively. */
      length = length / local;

    } while (length > 1);
  }
};

/// Specialisation for the mean reducer: the recursive pass is a plain sum
/// reduction; the division by the number of elements happens elsewhere.
template <typename CoeffReturnType>
struct syclGenericBufferReducer<Eigen::internal::MeanReducer<CoeffReturnType>, CoeffReturnType> {
  template <typename BufferTOut, typename BufferTIn>
  static void run(Eigen::internal::MeanReducer<CoeffReturnType>, BufferTOut& bufOut, ptrdiff_t out_offset,
                  BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local) {
    syclGenericBufferReducer<Eigen::internal::SumReducer<CoeffReturnType>, CoeffReturnType>::run(
        Eigen::internal::SumReducer<CoeffReturnType>(), bufOut, out_offset, bufI, dev, length, local);
  }
};
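/// The generic reducer above implements a classic two-level tree reduction: each kernel launch reduces
/// `local`-sized chunks into one partial result per work-group, and the do/while loop relaunches the kernel on the
/// shrinking buffer until a single element remains. A minimal self-contained sketch of the same pattern in plain
/// SYCL 1.2.1 is shown below. It uses a sum for concreteness (Eigen passes the reducer functor instead) and
/// assumes, as the callers in this file arrange, that `length` is a power of two and a multiple of
/// `workgroup_size`; `TreeReduceKernel` and `tree_reduce` are illustrative names, not part of Eigen.
/// \code
/// #include <CL/sycl.hpp>
/// #include <algorithm>
///
/// template <typename T> class TreeReduceKernel; // kernel name, required for SYCL 1.2.1 lambdas
///
/// template <typename T>
/// void tree_reduce(cl::sycl::queue& q, cl::sycl::buffer<T, 1>& data, size_t length, size_t workgroup_size) {
///   do {
///     q.submit([&](cl::sycl::handler& h) {
///       auto in = data.template get_access<cl::sycl::access::mode::read_write>(h);
///       cl::sycl::accessor<T, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
///           scratch(cl::sycl::range<1>(workgroup_size), h);
///       h.parallel_for<TreeReduceKernel<T> >(
///           cl::sycl::nd_range<1>(cl::sycl::range<1>(std::max(length, workgroup_size)),
///                                 cl::sycl::range<1>(std::min(length, workgroup_size))),
///           [=](cl::sycl::nd_item<1> it) {
///             size_t lid = it.get_local_id(0);
///             size_t gid = it.get_global_id(0);
///             scratch[lid] = (gid < length) ? in[gid] : T(0);
///             it.barrier(cl::sycl::access::fence_space::local_space);
///             // Halve the active range inside the work-group until one value remains.
///             for (size_t stride = it.get_local_range(0) / 2; stride > 0; stride /= 2) {
///               if (lid < stride) scratch[lid] += scratch[lid + stride];
///               it.barrier(cl::sycl::access::fence_space::local_space);
///             }
///             if (lid == 0) in[it.get_group(0)] = scratch[0]; // one partial result per group
///           });
///     });
///     length = length / workgroup_size;
///   } while (length > 1);
///   q.wait_and_throw(); // the Eigen code instead relies on dev.asynchronousExec()
/// }
/// \endcode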
/// Self is useless here because in expression construction we are going to treat reduction as a leaf node.
/// We want to take the reduction child, build a construction from it, and apply the full reducer function on it.
/// FullReducer applies the reduction operation to the child of the reduction. Once that is done, the reduction is an
/// empty shell that can be thrown away and treated as a leaf node.
template <typename Self, typename Op, bool Vectorizable>
struct FullReducer<Self, Op, const Eigen::SyclDevice, Vectorizable> {

  typedef typename Self::CoeffReturnType CoeffReturnType;
  static const bool HasOptimizedImplementation = false;

  static void run(const Self& self, Op& reducer, const Eigen::SyclDevice& dev, CoeffReturnType* output) {
    typedef const typename Self::ChildType HostExpr; /// this is the child of the reduction
    typedef Eigen::TensorSycl::internal::FunctorExtractor<TensorEvaluator<HostExpr, const Eigen::SyclDevice> > FunctorExpr;
    FunctorExpr functors = TensorSycl::internal::extractFunctors(self.impl());
    int red_factor = 256; /// initial reduction factor. If the size is less than red_factor we only create one thread.
    size_t inputSize = self.impl().dimensions().TotalSize();
    size_t rng = inputSize / red_factor; // the initial number of threads is the input size divided by red_factor
    size_t remaining = inputSize % red_factor;
    if (rng == 0) {
      red_factor = 1;
    }
    size_t tileSize = dev.sycl_queue().get_device().template get_info<cl::sycl::info::device::max_work_group_size>() / 2;
    size_t GRange = std::max((size_t)1, rng);

    // round the global range up to a power of 2 for the reduction
    GRange--;
    GRange |= GRange >> 1;
    GRange |= GRange >> 2;
    GRange |= GRange >> 4;
    GRange |= GRange >> 8;
    GRange |= GRange >> 16;
#if __x86_64__ || __ppc64__ || _WIN64
    GRange |= GRange >> 32;
#endif
    GRange++;
    size_t outTileSize = tileSize;
    /// if GRange is smaller than the tile size, shrink the tile size to GRange; in that case a single kernel launch
    /// is enough for the recursion to reduce everything to one element.
    if (GRange < outTileSize) outTileSize = GRange;
    /// Create the temporary global buffer used by the reduction. It collects the partially reduced value of each
    /// work-group, since there is no global barrier on the GPU. Once those values are saved, the reduction can be
    /// applied to this buffer recursively in order to reduce the whole tensor.
    auto temp_global_buffer = cl::sycl::buffer<CoeffReturnType, 1>(cl::sycl::range<1>(GRange));
    typedef typename Eigen::internal::remove_all<decltype(self.xprDims())>::type Dims;
    dev.sycl_queue().submit([&](cl::sycl::handler& cgh) {
      // this is a workaround for a gcc 4.8 bug
      typedef decltype(TensorSycl::internal::createTupleOfAccessors(cgh, self.impl())) TupleType;
      // create a tuple of accessors from the evaluator
      TupleType tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl());
      auto tmp_global_accessor =
          temp_global_buffer.template get_access<cl::sycl::access::mode::read_write, cl::sycl::access::target::global_buffer>(cgh);
      typedef decltype(tmp_global_accessor) OutAccessor;
      cgh.parallel_for(
          cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(outTileSize)),
          TensorSycl::internal::FullReductionKernelFunctor<CoeffReturnType, OutAccessor, HostExpr, FunctorExpr, Op, Dims, size_t, typename TupleType::Type>(
              tmp_global_accessor, rng, remaining, red_factor, reducer, self.xprDims(), functors, tuple_of_accessors));
    });
    dev.asynchronousExec();

    // get the final output buffer; at this point the buffer already exists, so there is no need for an assign
    auto out_buffer = dev.get_sycl_buffer(output);
    ptrdiff_t out_offset = dev.get_offset(output);
    /// This recursively reduces the temporary buffer down to a single element.
    syclGenericBufferReducer<Op, CoeffReturnType>::run(reducer, out_buffer, out_offset, temp_global_buffer, dev, GRange, outTileSize);
  }
};
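/// The bit-twiddling cascade in FullReducer::run rounds `GRange` up to the next power of two: propagating the
/// highest set bit of `GRange - 1` into all lower positions produces a value of the form 2^k - 1, and the final
/// increment yields 2^k. For example, for rng = 100: the decrement gives 99 (0b1100011), the OR cascade fills it to
/// 127 (0b1111111), and the increment yields 128. A standalone equivalent (with a hypothetical helper name, not
/// part of Eigen) would be:
/// \code
/// #include <cstddef>
///
/// inline size_t round_up_pow2(size_t v) {
///   v--;
///   v |= v >> 1;
///   v |= v >> 2;
///   v |= v >> 4;
///   v |= v >> 8;
///   v |= v >> 16;
/// #if __x86_64__ || __ppc64__ || _WIN64
///   v |= v >> 32; // only needed when size_t is 64 bits wide
/// #endif
///   return ++v;
/// }
/// // round_up_pow2(100) == 128; round_up_pow2(128) == 128 (values already a power of two are unchanged)
/// \endcode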
template <typename Self, typename Op>
struct InnerReducer<Self, Op, const Eigen::SyclDevice> {

  typedef typename Self::CoeffReturnType CoeffReturnType;
  static const bool HasOptimizedImplementation = false;

  static bool run(const Self& self, Op& reducer, const Eigen::SyclDevice& dev, CoeffReturnType* output,
                  typename Self::Index num_values_to_reduce, typename Self::Index num_coeffs_to_preserve) {
    typedef const typename Self::ChildType HostExpr; /// this is the child of the reduction
    typedef Eigen::TensorSycl::internal::FunctorExtractor<TensorEvaluator<HostExpr, const Eigen::SyclDevice> > FunctorExpr;
    FunctorExpr functors = TensorSycl::internal::extractFunctors(self.impl());
    typename Self::Index range, GRange, tileSize;
    typedef typename Eigen::internal::remove_all<decltype(self.xprDims())>::type Dims;

    // A single kernel launch is enough here: each work-item reduces the `num_values_to_reduce` inner values of one
    // preserved output coefficient, so no recursive second pass over a temporary buffer is needed.
    dev.parallel_for_setup(num_coeffs_to_preserve, tileSize, range, GRange);
    dev.sycl_queue().submit([&](cl::sycl::handler& cgh) {
      // this is a workaround for a gcc 4.8 bug
      typedef decltype(TensorSycl::internal::createTupleOfAccessors(cgh, self.impl())) Tuple_of_Acc;
      // create a tuple of accessors from the evaluator
      Tuple_of_Acc tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl());
      auto output_accessor = dev.template get_sycl_accessor<cl::sycl::access::mode::discard_write>(cgh, output);
      ptrdiff_t out_offset = dev.get_offset(output);
      Index red_size = (num_values_to_reduce != 0) ? num_values_to_reduce : static_cast<Index>(1);
      cgh.parallel_for(
          cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)),
          TensorSycl::internal::ReductionFunctor<HostExpr, FunctorExpr, Tuple_of_Acc, Dims, Op, typename Self::Index>(
              output_accessor, out_offset, functors, tuple_of_accessors, self.xprDims(), reducer, range, red_size));
    });
    dev.asynchronousExec();
    return false;
  }
};

} // end namespace internal
} // end namespace Eigen

#endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP
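// Usage note: both reducers above are reached through the regular Tensor API once an expression is bound to a
// SyclDevice. The sketch below follows the pattern of Eigen's SYCL tests; it is illustrative only, and the exact
// construction of QueueInterface/SyclDevice has varied between Eigen versions.
//
// \code
// #include <unsupported/Eigen/CXX11/Tensor>
//
// Eigen::QueueInterface queue_interface((cl::sycl::default_selector()));
// Eigen::SyclDevice sycl_device(&queue_interface);
//
// Eigen::Tensor<float, 2> in(64, 128);
// in.setRandom();
// Eigen::Tensor<float, 0> full_redux;
//
// float* gpu_in = static_cast<float*>(sycl_device.allocate(in.size() * sizeof(float)));
// float* gpu_out = static_cast<float*>(sycl_device.allocate(sizeof(float)));
// Eigen::TensorMap<Eigen::Tensor<float, 2> > in_gpu(gpu_in, 64, 128);
// Eigen::TensorMap<Eigen::Tensor<float, 0> > out_gpu(gpu_out);
//
// sycl_device.memcpyHostToDevice(gpu_in, in.data(), in.size() * sizeof(float));
// out_gpu.device(sycl_device) = in_gpu.sum(); // full reduction: dispatches to FullReducer
// // Partial reductions that preserve some dimensions, e.g. in_gpu.sum(Eigen::array<int, 1>{{0}}),
// // dispatch to InnerReducer when the reduced dimensions are the innermost ones.
// sycl_device.memcpyDeviceToHost(full_redux.data(), gpu_out, sizeof(float));
// sycl_device.deallocate(gpu_in);
// sycl_device.deallocate(gpu_out);
// \endcode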