From d57430dd73ab2f88aa5e45c370f6ab91103ff18a Mon Sep 17 00:00:00 2001 From: Mehdi Goli Date: Tue, 8 Nov 2016 17:08:02 +0000 Subject: Converting all sycl buffers to uninitialised device only buffers; adding memcpyHostToDevice and memcpyDeviceToHost on syclDevice; modifying all examples to obey the new rules; moving sycl queue creating to the device based on Benoit suggestion; removing the sycl specefic condition for returning m_result in TensorReduction.h according to Benoit suggestion. --- .../Eigen/CXX11/src/Tensor/TensorDeviceSycl.h | 115 ++++++++++----------- .../Eigen/CXX11/src/Tensor/TensorReduction.h | 8 +- .../Eigen/CXX11/src/Tensor/TensorReductionSycl.h | 10 +- .../CXX11/src/Tensor/TensorSyclExtractAccessor.h | 22 ++-- 4 files changed, 74 insertions(+), 81 deletions(-) (limited to 'unsupported/Eigen/CXX11/src') diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h index 4231a11ff..8333301ea 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h @@ -16,95 +16,93 @@ #define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H namespace Eigen { -/// \struct BufferT is used to specialise add_sycl_buffer function for -// two types of buffer we have. When the MapAllocator is true, we create the -// sycl buffer with MapAllocator. -/// We have to const_cast the input pointer in order to work around the fact -/// that sycl does not accept map allocator for const pointer. -template -struct BufferT { - using Type = cl::sycl::buffer>; - static inline void add_sycl_buffer(const T *ptr, size_t num_bytes,std::map> &buffer_map) { - buffer_map.insert(std::pair>(ptr, std::shared_ptr(std::make_shared(Type(const_cast(ptr), cl::sycl::range<1>(num_bytes)))))); - } -}; - -/// specialisation of the \ref BufferT when the MapAllocator is false. In this -/// case we only create the device-only buffer. -template -struct BufferT { - using Type = cl::sycl::buffer; - static inline void add_sycl_buffer(const T *ptr, size_t num_bytes, std::map> &buffer_map) { - buffer_map.insert(std::pair>(ptr, std::shared_ptr(std::make_shared(Type(cl::sycl::range<1>(num_bytes)))))); - } -}; - struct SyclDevice { /// class members /// sycl queue - cl::sycl::queue &m_queue; + mutable cl::sycl::queue m_queue; /// std::map is the container used to make sure that we create only one buffer - /// per pointer. The lifespan of the buffer - /// now depends on the lifespan of SyclDevice. If a non-read-only pointer is - /// needed to be accessed on the host we should manually deallocate it. + /// per pointer. The lifespan of the buffer now depends on the lifespan of SyclDevice. + /// If a non-read-only pointer is needed to be accessed on the host we should manually deallocate it. mutable std::map> buffer_map; - - SyclDevice(cl::sycl::queue &q) : m_queue(q) {} + /// creating device by using selector + template SyclDevice(dev_Selector s) + :m_queue(cl::sycl::queue(s, [=](cl::sycl::exception_list l) { + for (const auto& e : l) { + try { + std::rethrow_exception(e); + } catch (cl::sycl::exception e) { + std::cout << e.what() << std::endl; + } + } + })) {} // destructor ~SyclDevice() { deallocate_all(); } - template - void deallocate(const T *p) const { + template void deallocate(T *p) const { auto it = buffer_map.find(p); if (it != buffer_map.end()) { buffer_map.erase(it); + internal::aligned_free(p); + } + } + void deallocate_all() const { + std::map>::iterator it=buffer_map.begin(); + while (it!=buffer_map.end()) { + auto p=it->first; + buffer_map.erase(it); + internal::aligned_free(const_cast(p)); + it=buffer_map.begin(); } + buffer_map.clear(); } - void deallocate_all() const { buffer_map.clear(); } /// creation of sycl accessor for a buffer. This function first tries to find - /// the buffer in the buffer_map. - /// If found it gets the accessor from it, if not, the function then adds an - /// entry by creating a sycl buffer - /// for that particular pointer. - template - inline cl::sycl::accessor + /// the buffer in the buffer_map. If found it gets the accessor from it, if not, + ///the function then adds an entry by creating a sycl buffer for that particular pointer. + template inline cl::sycl::accessor get_sycl_accessor(size_t num_bytes, cl::sycl::handler &cgh, const T * ptr) const { - return (get_sycl_buffer(num_bytes, ptr).template get_access(cgh)); + return (get_sycl_buffer(num_bytes, ptr)->template get_access(cgh)); } -template - inline typename BufferT::Type - get_sycl_buffer(size_t num_bytes,const T * ptr) const { - if(MapAllocator && !ptr){ - eigen_assert("pointer with map_Allocator cannot be null. Please initialise the input pointer"); } - auto it = buffer_map.find(ptr); - if (it == buffer_map.end()) { - BufferT::add_sycl_buffer(ptr, num_bytes, buffer_map); - } - return (*((typename BufferT::Type*)((buffer_map.at(ptr).get())))); + template inline std::pair>::iterator,bool> add_sycl_buffer(const T *ptr, size_t num_bytes) const { + using Type = cl::sycl::buffer; + std::pair>::iterator,bool> ret = buffer_map.insert(std::pair>(ptr, std::shared_ptr(new Type(cl::sycl::range<1>(num_bytes)), + [](void *dataMem) { delete static_cast(dataMem); }))); + (static_cast(buffer_map.at(ptr).get()))->set_final_data(nullptr); + return ret; + } + + template inline cl::sycl::buffer* get_sycl_buffer(size_t num_bytes,const T * ptr) const { + return static_cast*>(add_sycl_buffer(ptr, num_bytes).first->second.get()); } /// allocating memory on the cpu - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const { - return internal::aligned_malloc(num_bytes); + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void *allocate(size_t) const { + return internal::aligned_malloc(8); } // some runtime conditions that can be applied here bool isDeviceSuitable() const { return true; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate(void *buffer) const { - internal::aligned_free(buffer); - } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void *dst, const void *src, size_t n) const { ::memcpy(dst, src, n); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyHostToDevice(void *dst, const void *src, size_t n) const { - memcpy(dst, src, n); + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyHostToDevice(T *dst, const T *src, size_t n) const { + auto host_acc= (static_cast*>(add_sycl_buffer(dst, n).first->second.get()))-> template get_access(); + memcpy(host_acc.get_pointer(), src, n); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyDeviceToHost(void *dst, const void *src, size_t n) const { - memcpy(dst, src, n); + /// whith the current implementation of sycl, the data is copied twice from device to host. This will be fixed soon. + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyDeviceToHost(T *dst, const T *src, size_t n) const { + auto it = buffer_map.find(src); + if (it != buffer_map.end()) { + auto host_acc= (static_cast*>(it->second.get()))-> template get_access(); + memcpy(dst,host_acc.get_pointer(), n); + } else{ + eigen_assert("no device memory found. The memory might be destroyed before creation"); + } } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void *buffer, int c, size_t n) const { ::memset(buffer, c, n); } @@ -112,6 +110,7 @@ template return 1; } }; + } // end namespace Eigen #endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index 367bccf63..f731bf17e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -662,13 +662,7 @@ struct TensorEvaluator, } } - /// required by sycl in order to extract the output accessor -#ifndef EIGEN_USE_SYCL - EIGEN_DEVICE_FUNC typename MakePointer_::Type data() const { return NULL; } -#else - EIGEN_DEVICE_FUNC typename MakePointer_::Type data() const { - return m_result; } -#endif + EIGEN_DEVICE_FUNC typename MakePointer_::Type data() const { return m_result; } /// required by sycl in order to extract the accessor const TensorEvaluator& impl() const { return m_impl; } /// added for sycl in order to construct the buffer from the sycl device diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h index 1c89132db..3daecb045 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h @@ -27,9 +27,9 @@ namespace internal { template struct syclGenericBufferReducer{ template -static void run(BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){ +static void run(BufferTOut* bufOut, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){ do { - auto f = [length, local, &bufOut, &bufI](cl::sycl::handler& h) mutable { + auto f = [length, local, bufOut, &bufI](cl::sycl::handler& h) mutable { cl::sycl::nd_range<1> r{cl::sycl::range<1>{std::max(length, local)}, cl::sycl::range<1>{std::min(length, local)}}; /* Two accessors are used: one to the buffer that is being reduced, @@ -37,7 +37,7 @@ static void run(BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDevice& de auto aI = bufI.template get_access(h); auto aOut = - bufOut.template get_access(h); + bufOut->template get_access(h); cl::sycl::accessor scratch(cl::sycl::range<1>(local), h); @@ -134,7 +134,7 @@ struct FullReducer { /// if the shared memory is less than the GRange, we set shared_mem size to the TotalSize and in this case one kernel would be created for recursion to reduce all to one. if (GRange < outTileSize) outTileSize=GRange; // getting final out buffer at the moment the created buffer is true because there is no need for assign - auto out_buffer =dev.template get_sycl_buffer::type>(self.dimensions().TotalSize(), output); + auto out_buffer =dev.template get_sycl_buffer::type>(self.dimensions().TotalSize(), output); /// creating the shared memory for calculating reduction. /// This one is used to collect all the reduced value of shared memory as we dont have global barrier on GPU. Once it is saved we can /// recursively apply reduction on it in order to reduce the whole. @@ -208,7 +208,7 @@ struct InnerReducer { dev.m_queue.submit([&](cl::sycl::handler &cgh) { // create a tuple of accessors from Evaluator auto tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl()); - auto output_accessor = dev.template get_sycl_accessor(num_coeffs_to_preserve,cgh, output); + auto output_accessor = dev.template get_sycl_accessor(num_coeffs_to_preserve,cgh, output); cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), [=](cl::sycl::nd_item<1> itemID) { typedef typename TensorSycl::internal::ConvertToDeviceExpression::Type DevExpr; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h index 3af5f8cfc..b1da6858e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h @@ -56,10 +56,10 @@ struct AccessorConstructor{ -> decltype(utility::tuple::append(ExtractAccessor::getTuple(cgh, eval1),utility::tuple::append(ExtractAccessor::getTuple(cgh, eval2), ExtractAccessor::getTuple(cgh, eval3)))) { return utility::tuple::append(ExtractAccessor::getTuple(cgh, eval1),utility::tuple::append(ExtractAccessor::getTuple(cgh, eval2), ExtractAccessor::getTuple(cgh, eval3))); } - template< cl::sycl::access::mode AcM, bool MapAllocator, typename Arg> static inline auto getAccessor(cl::sycl::handler& cgh, Arg eval) - -> decltype(utility::tuple::make_tuple( eval.device().template get_sycl_accessor static inline auto getAccessor(cl::sycl::handler& cgh, Arg eval) + -> decltype(utility::tuple::make_tuple( eval.device().template get_sycl_accessor::type>(eval.dimensions().TotalSize(), cgh,eval.data()))){ - return utility::tuple::make_tuple(eval.device().template get_sycl_accessor::type>(eval.dimensions().TotalSize(), cgh,eval.data())); + return utility::tuple::make_tuple(eval.device().template get_sycl_accessor::type>(eval.dimensions().TotalSize(), cgh,eval.data())); } }; @@ -141,8 +141,8 @@ struct ExtractAccessor, Dev> > template \ struct ExtractAccessor, Dev> > {\ static inline auto getTuple(cl::sycl::handler& cgh,const TensorEvaluator, Dev> eval)\ - -> decltype(AccessorConstructor::template getAccessor(cgh, eval)){\ - return AccessorConstructor::template getAccessor(cgh, eval);\ + -> decltype(AccessorConstructor::template getAccessor(cgh, eval)){\ + return AccessorConstructor::template getAccessor(cgh, eval);\ }\ }; TENSORMAPEXPR(const, cl::sycl::access::mode::read) @@ -153,8 +153,8 @@ TENSORMAPEXPR(, cl::sycl::access::mode::read_write) template struct ExtractAccessor, Dev> > { static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator, Dev> eval) - -> decltype(AccessorConstructor::template getAccessor(cgh, eval)){ - return AccessorConstructor::template getAccessor(cgh, eval); + -> decltype(AccessorConstructor::template getAccessor(cgh, eval)){ + return AccessorConstructor::template getAccessor(cgh, eval); } }; @@ -167,8 +167,8 @@ struct ExtractAccessor, Dev> > template struct ExtractAccessor, Dev> > { static inline auto getTuple(cl::sycl::handler& cgh,const TensorEvaluator, Dev> eval) - -> decltype(utility::tuple::append(AccessorConstructor::template getAccessor(cgh, eval), AccessorConstructor::getTuple(cgh, eval.impl()))){ - return utility::tuple::append(AccessorConstructor::template getAccessor(cgh, eval), AccessorConstructor::getTuple(cgh, eval.impl())); + -> decltype(utility::tuple::append(AccessorConstructor::template getAccessor(cgh, eval), AccessorConstructor::getTuple(cgh, eval.impl()))){ + return utility::tuple::append(AccessorConstructor::template getAccessor(cgh, eval), AccessorConstructor::getTuple(cgh, eval.impl())); } }; @@ -181,8 +181,8 @@ struct ExtractAccessor, Dev> > template struct ExtractAccessor, Dev> > { static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator, Dev> eval) - -> decltype(AccessorConstructor::template getAccessor(cgh, eval)){ - return AccessorConstructor::template getAccessor(cgh, eval); + -> decltype(AccessorConstructor::template getAccessor(cgh, eval)){ + return AccessorConstructor::template getAccessor(cgh, eval); } }; -- cgit v1.2.3