From 0ebe3808ca8b2c96d9d77024ba8d4d0bdfb7e23c Mon Sep 17 00:00:00 2001
From: Mehdi Goli
Date: Fri, 4 Nov 2016 18:18:19 +0000
Subject: Removed the sycl include from Eigen/Core and moved it to Unsupported/Eigen/CXX11/Tensor; added TensorReduction for sycl (full reduction and partial reduction); added TensorReduction test case for sycl (full reduction and partial reduction); fixed the tile size on TensorSyclRun.h based on the device max work group size;
---
 .../Eigen/CXX11/src/Tensor/TensorReductionSycl.h | 242 +++++++++++++++++++++
 1 file changed, 242 insertions(+)
 create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
new file mode 100644
index 000000000..1c89132db
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
@@ -0,0 +1,242 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact:
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+/*****************************************************************
+ * TensorReductionSycl.h
+ *
+ * \brief:
+ *  This is the specialisation of the reduction operation (full and
+ *  partial reduction) for the SYCL backend.
+ *
+*****************************************************************/
+
+#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP
+#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP
+
+namespace Eigen {
+namespace internal {
+
+template struct syclGenericBufferReducer{
+template
+static void run(BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){
+  do {
+    auto f = [length, local, &bufOut, &bufI](cl::sycl::handler& h) mutable {
+      cl::sycl::nd_range<1> r{cl::sycl::range<1>{std::max(length, local)},
+                              cl::sycl::range<1>{std::min(length, local)}};
+      /* Two accessors are used: one to the buffer that is being reduced,
+       * and a second to local memory, used to store intermediate data. */
+      auto aI =
+          bufI.template get_access(h);
+      auto aOut =
+          bufOut.template get_access(h);
+      cl::sycl::accessor
+          scratch(cl::sycl::range<1>(local), h);
+
+      /* The parallel_for invocation chosen is the variant with an nd_item
+       * parameter, since the code requires barriers for correctness. */
+      h.parallel_for(
+          r, [aOut, aI, scratch, local, length](cl::sycl::nd_item<1> id) {
+            size_t globalid = id.get_global(0);
+            size_t localid = id.get_local(0);
+            /* All threads collectively read from global memory into local.
+             * The barrier ensures all threads' IO is resolved before
+             * execution continues (strictly speaking, all threads within
+             * a single work-group - there is no co-ordination between
+             * work-groups, only work-items). */
+            if (globalid < length) {
+              scratch[localid] = aI[globalid];
+            }
+            id.barrier(cl::sycl::access::fence_space::local_space);
+
+            /* Apply the reduction operation between the current local
+             * id and the one on the other half of the vector. */
+            if (globalid < length) {
+              int min = (length < local) ? length : local;
+              for (size_t offset = min / 2; offset > 0; offset /= 2) {
+                if (localid < offset) {
+                  scratch[localid] += scratch[localid + offset];
+                }
+                id.barrier(cl::sycl::access::fence_space::local_space);
+              }
+              /* The final result will be stored in local id 0. */
+              if (localid == 0) {
+                aI[id.get_group(0)] = scratch[localid];
+                if((length<=local) && globalid ==0){
+                  aOut[globalid]=scratch[localid];
+                }
+              }
+            }
+          });
+    };
+    dev.m_queue.submit(f);
+    dev.m_queue.throw_asynchronous();
+
+    /* At this point, you could queue::wait_and_throw() to ensure that
+     * errors are caught quickly. However, this would likely impact
+     * performance negatively. */
+    length = length / local;
+
+  } while (length > 1);
+
+}
+
+};
+
+/// For now let's start with a full reducer.
+/// Self is useless here because in expression construction we are going to treat the reduction as a leaf node.
+/// We want to take the reduction child, build an expression from it, and apply the full reducer function on it. The full reducer applies the
+/// reduction operation on the child of the reduction. Once it is done, the reduction is an empty shell and can be thrown away and treated as
+/// a leaf node.
+template
+struct FullReducer {
+
+  typedef typename Self::CoeffReturnType CoeffReturnType;
+  static const bool HasOptimizedImplementation = false;
+
+  static void run(const Self& self, Op& reducer, const Eigen::SyclDevice& dev, CoeffReturnType* output) {
+    typedef const typename Self::ChildType HostExpr; /// this is the child of the reduction
+    typedef typename TensorSycl::internal::createPlaceHolderExpression::Type PlaceHolderExpr;
+    auto functors = TensorSycl::internal::extractFunctors(self.impl());
+    int red_factor =256; /// initial reduction factor. If the size is less than red_factor we only create one thread.
+    size_t inputSize =self.impl().dimensions().TotalSize();
+    size_t rng = inputSize/red_factor; // the initial number of threads is the input size divided by red_factor
+    size_t remaining = inputSize% red_factor;
+    if(rng ==0) {
+      red_factor=1;
+    };
+    size_t tileSize =dev.m_queue.get_device(). template get_info()/2;
+    size_t GRange=std::max((size_t )1, rng);
+
+    // round the global range up to the next power of 2 for the recursive reduction
+    GRange--;
+    GRange |= GRange >> 1;
+    GRange |= GRange >> 2;
+    GRange |= GRange >> 4;
+    GRange |= GRange >> 8;
+    GRange |= GRange >> 16;
+#if __x86_64__ || __ppc64__ || _WIN64
+    GRange |= GRange >> 32;
+#endif
+    GRange++;
+    size_t outTileSize = tileSize;
+    /// if GRange is smaller than the work-group tile size, shrink the tile size to GRange; in that case a single kernel launch is enough to reduce everything to one element.
+    if (GRange < outTileSize) outTileSize=GRange;
+    // getting the final output buffer; at the moment the buffer is created directly because there is no need for an assign
+    auto out_buffer =dev.template get_sycl_buffer::type>(self.dimensions().TotalSize(), output);
+    /// creating the shared memory for calculating the reduction.
+    /// This one is used to collect all the reduced values from each work-group, as we don't have a global barrier on the GPU. Once it is saved we can
+    /// recursively apply the reduction on it in order to reduce the whole.
+    auto temp_global_buffer =cl::sycl::buffer(cl::sycl::range<1>(GRange));
+    typedef typename Eigen::internal::remove_all::type Dims;
+    Dims dims= self.xprDims();
+    Op functor = reducer;
+    dev.m_queue.submit([&](cl::sycl::handler &cgh) {
+      // create a tuple of accessors from the Evaluator
+      auto tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl());
+      auto tmp_global_accessor = temp_global_buffer. template get_access(cgh);
+
+      cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(outTileSize)), [=](cl::sycl::nd_item<1> itemID) {
+        typedef typename TensorSycl::internal::ConvertToDeviceExpression::Type DevExpr;
+        auto device_expr = TensorSycl::internal::createDeviceExpression(functors, tuple_of_accessors);
+        /// The reduction cannot be captured automatically through our device conversion recursion. The reason is that the reduction has two behaviours:
+        /// the first is when it is used as a root to launch the sub-kernel, and the second is when it is treated as a leaf node to pass the
+        /// calculated result to its parent kernel. While the latter is automatically detected through our device expression generator, the former is created here.
+        const auto device_self_expr= TensorReductionOp(device_expr.expr, dims, functor);
+        /// This is the evaluator for device_self_expr. It is exactly like the self that has been passed to the run function, except that
+        /// the device evaluator is detectable and recognisable on the device.
+        auto device_self_evaluator = Eigen::TensorEvaluator(device_self_expr, Eigen::DefaultDevice());
+        /// a const_cast is added below as a naive solution to the qualifier-drop error
+        auto globalid=itemID.get_global_linear_id();
+
+        if(globalid<rng)
+          tmp_global_accessor.get_pointer()[globalid]=InnerMostDimReducer::reduce(device_self_evaluator, red_factor*globalid, red_factor, const_cast(functor));
+        else
+          tmp_global_accessor.get_pointer()[globalid]=static_cast(0);
+
+        if(remaining!=0 && globalid==0 )
+          // this adds the rest of the input buffer when the input size is not divisible by red_factor.
+          tmp_global_accessor.get_pointer()[globalid]+=InnerMostDimReducer::reduce(device_self_evaluator, red_factor*(rng), remaining, const_cast(functor));
+      });
+    });
+    dev.m_queue.throw_asynchronous();
+
+    /// This is used to recursively reduce the temporary values down to a single element.
+    syclGenericBufferReducer::run(out_buffer, temp_global_buffer,dev, GRange, outTileSize);
+  }
+
+};
+
+template
+struct InnerReducer {
+
+  typedef typename Self::CoeffReturnType CoeffReturnType;
+  static const bool HasOptimizedImplementation = false;
+
+  static bool run(const Self& self, Op& reducer, const Eigen::SyclDevice& dev, CoeffReturnType* output, typename Self::Index , typename Self::Index num_coeffs_to_preserve) {
+    typedef const typename Self::ChildType HostExpr; /// this is the child of the reduction
+    typedef typename TensorSycl::internal::createPlaceHolderExpression::Type PlaceHolderExpr;
+    auto functors = TensorSycl::internal::extractFunctors(self.impl());
+
+    size_t tileSize =dev.m_queue.get_device(). template get_info()/2;
+
+    size_t GRange=num_coeffs_to_preserve;
+    if (tileSize>GRange) tileSize=GRange;
+    else if(GRange>tileSize){
+      size_t xMode = GRange % tileSize;
+      if (xMode != 0) GRange += (tileSize - xMode);
+    }
+    // getting the final output buffer; at the moment the buffer is created directly because there is no need for an assign
+    /// creating the shared memory for calculating the reduction.
+    /// This one is used to collect all the reduced values from each work-group, as we don't have a global barrier on the GPU. Once it is saved we can
+    /// recursively apply the reduction on it in order to reduce the whole.
+    typedef typename Eigen::internal::remove_all::type Dims;
+    Dims dims= self.xprDims();
+    Op functor = reducer;
+
+    dev.m_queue.submit([&](cl::sycl::handler &cgh) {
+      // create a tuple of accessors from the Evaluator
+      auto tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl());
+      auto output_accessor = dev.template get_sycl_accessor(num_coeffs_to_preserve,cgh, output);
+
+      cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), [=](cl::sycl::nd_item<1> itemID) {
+        typedef typename TensorSycl::internal::ConvertToDeviceExpression::Type DevExpr;
+        auto device_expr = TensorSycl::internal::createDeviceExpression(functors, tuple_of_accessors);
+        /// The reduction cannot be captured automatically through our device conversion recursion. The reason is that the reduction has two behaviours:
+        /// the first is when it is used as a root to launch the sub-kernel, and the second is when it is treated as a leaf node to pass the
+        /// calculated result to its parent kernel. While the latter is automatically detected through our device expression generator, the former is created here.
+        const auto device_self_expr= TensorReductionOp(device_expr.expr, dims, functor);
+        /// This is the evaluator for device_self_expr. It is exactly like the self that has been passed to the run function, except that
+        /// the device evaluator is detectable and recognisable on the device.
+        typedef Eigen::TensorEvaluator DeviceSelf;
+        auto device_self_evaluator = Eigen::TensorEvaluator(device_self_expr, Eigen::DefaultDevice());
+        /// a const_cast is added below as a naive solution to the qualifier-drop error
+        auto globalid=itemID.get_global_linear_id();
+        if (globalid< static_cast(num_coeffs_to_preserve)) {
+          typename DeviceSelf::CoeffReturnType accum = functor.initialize();
+          GenericDimReducer::reduce(device_self_evaluator, device_self_evaluator.firstInput(globalid),const_cast(functor), &accum);
+          functor.finalize(accum);
+          output_accessor.get_pointer()[globalid]= accum;
+        }
+      });
+    });
+    dev.m_queue.throw_asynchronous();
+    return false;
+  }
+};
+
+} // end namespace internal
+} // namespace Eigen
+
+#endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP
--
cgit v1.2.3
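For orientation, the host-side C++ sketch below (not part of the patch) illustrates the two ideas the full-reduction kernels rely on: rounding the global range up to the next power of two (the GRange bit-twiddling in FullReducer::run) and the two-level sum, where each work-item first reduces red_factor consecutive inputs and the partial results are then folded with halving offsets, as syclGenericBufferReducer does in local memory. The helper names roundUpToPowerOfTwo and reduceTwoLevel are invented for this example, and a plain std::vector stands in for the SYCL buffers and work-items.

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <iostream>
#include <numeric>
#include <vector>

// Round a work size up to the next power of two, mirroring the GRange
// bit-twiddling in FullReducer::run.
static std::size_t roundUpToPowerOfTwo(std::size_t n) {
  if (n == 0) return 1;
  n--;
  n |= n >> 1;
  n |= n >> 2;
  n |= n >> 4;
  n |= n >> 8;
  n |= n >> 16;
#if defined(__x86_64__) || defined(__ppc64__) || defined(_WIN64)
  n |= n >> 32;
#endif
  return n + 1;
}

// Host-side emulation of the two-level full reduction:
//  1. each "work-item" i sums red_factor consecutive inputs; item 0 also folds
//     in the tail that does not divide evenly (the `remaining` path in the patch);
//  2. the partial results are folded pairwise with halving offsets, which is
//     what the barrier-synchronised loop in syclGenericBufferReducer does
//     inside a work-group using local memory.
static float reduceTwoLevel(const std::vector<float>& in, std::size_t red_factor) {
  const std::size_t n = in.size();
  const std::size_t rng = n / red_factor;        // first-level "work-items"
  const std::size_t remaining = n % red_factor;  // tail handled by item 0

  std::vector<float> partial(roundUpToPowerOfTwo(std::max<std::size_t>(1, rng)), 0.0f);
  for (std::size_t i = 0; i < rng; ++i)          // level 1: coarse per-item sums
    partial[i] = std::accumulate(in.begin() + i * red_factor,
                                 in.begin() + (i + 1) * red_factor, 0.0f);
  if (remaining != 0)
    partial[0] += std::accumulate(in.end() - remaining, in.end(), 0.0f);

  // level 2: tree reduction with halving offsets (no barriers needed on the host)
  for (std::size_t offset = partial.size() / 2; offset > 0; offset /= 2)
    for (std::size_t i = 0; i < offset; ++i)
      partial[i] += partial[i + offset];
  return partial[0];
}

int main() {
  assert(roundUpToPowerOfTwo(1000) == 1024);
  std::vector<float> data(1000, 1.0f);
  std::cout << reduceTwoLevel(data, 256) << "\n";  // prints 1000
  return 0;
}

Rounding the partial-result buffer up to a power of two matters because the offset-halving loop assumes the length keeps dividing evenly by two; the padding slots are initialised to zero (as the kernel does with static_cast(0) for work-items beyond rng) so they do not perturb the sum.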