From 2d4a091beb9e55664c1475137af7166d524cbc1d Mon Sep 17 00:00:00 2001 From: Mehdi Goli Date: Wed, 14 Dec 2016 15:30:37 +0000 Subject: Adding tensor contraction operation backend for Sycl; adding test for contractionOp sycl backend; adding temporary solution to prevent memory leak in buffer; cleaning up cxx11_tensor_buildins_sycl.h --- .../Eigen/CXX11/src/Tensor/TensorDeviceSycl.h | 28 ++++++++++++---------- 1 file changed, 16 insertions(+), 12 deletions(-) (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h') diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h index 40dd5d81a..f92ea1d7b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h @@ -31,7 +31,7 @@ namespace Eigen { auto dst_ptr = ConvertToActualTypeSycl(Scalar, m_dst_acc); auto globalid = itemID.get_global_linear_id(); if (globalid < m_rng) { - dst_ptr[globalid + m_i] = src_ptr[globalid + m_offset]; + dst_ptr[globalid + m_i] = src_ptr[globalid + m_offset]; } } @@ -50,7 +50,7 @@ EIGEN_STRONG_INLINE auto get_sycl_supported_devices()->decltype(cl::sycl::device /// get_devices returns all the available opencl devices. Either use device_selector or exclude devices that computecpp does not support (AMD OpenCL for CPU ) auto s= (*it).template get_info(); std::transform(s.begin(), s.end(), s.begin(), ::tolower); - if((*it).is_cpu() && s.find("amd")!=std::string::npos){ + if((*it).is_cpu() && s.find("amd")!=std::string::npos){ // remove amd cpu as it is not supported by computecpp it=devices.erase(it); } else{ @@ -72,9 +72,9 @@ struct QueueInterface { mutable std::map> buffer_map; /// sycl queue mutable cl::sycl::queue m_queue; - /// creating device by using cl::sycl::selector or cl::sycl::device both are the same and can be captured throufh dev_Selector typename + /// creating device by using cl::sycl::selector or cl::sycl::device both are the same and can be captured through dev_Selector typename /// SyclStreamDevice is not owned. it is the caller's responsibility to destroy it. - template explicit QueueInterface(dev_Selector s): + template explicit QueueInterface(const dev_Selector& s): #ifdef EIGEN_EXCEPTIONS m_queue(cl::sycl::queue(s, [&](cl::sycl::exception_list l) { for (const auto& e : l) { @@ -103,17 +103,21 @@ struct QueueInterface { auto ptr =buf.get_access().get_pointer(); buf.set_final_data(nullptr); std::lock_guard lock(mutex_); - buffer_map.insert(std::pair>(ptr,buf)); + buffer_map.insert(std::pair>(static_cast(ptr),buf)); return static_cast(ptr); } /// This is used to deallocate the device pointer. p is used as a key inside /// the map to find the device buffer and delete it. - EIGEN_STRONG_INLINE void deallocate(const void *p) const { + EIGEN_STRONG_INLINE void deallocate(void *p) const { std::lock_guard lock(mutex_); auto it = buffer_map.find(static_cast(p)); if (it != buffer_map.end()) { + auto num_bytes =it->second.get_size(); buffer_map.erase(it); + // Temporary solution for memory leak in computecpp. It will be fixed in the next computecpp version + std::allocator a1; // Default allocator for buffer + a1.deallocate(static_cast(p), num_bytes); } } @@ -188,7 +192,7 @@ struct SyclDevice { return m_queue_stream->allocate(num_bytes); } /// deallocate device memory - EIGEN_STRONG_INLINE void deallocate(const void *p) const { + EIGEN_STRONG_INLINE void deallocate(void *p) const { m_queue_stream->deallocate(p); } @@ -235,25 +239,25 @@ struct SyclDevice { size_t rng, GRange, tileSize; parallel_for_setup(n/sizeof(T), tileSize, rng, GRange); // Assuming that the dst is the start of the destination pointer - auto dest_buf = cl::sycl::buffer >(static_cast(dst), cl::sycl::range<1>(rng*sizeof(T))); + auto dest_buf = cl::sycl::buffer >(static_cast(dst), cl::sycl::range<1>(n)); sycl_queue().submit([&](cl::sycl::handler &cgh) { auto src_acc= it->second.template get_access(cgh); auto dst_acc =dest_buf.template get_access(cgh); - cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor(src_acc, dst_acc, rng, 0, offset)); + cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor(src_acc, dst_acc, rng, 0, 0)); }); synchronize(); } /// returning the sycl queue EIGEN_STRONG_INLINE cl::sycl::queue& sycl_queue() const { return m_queue_stream->m_queue;} /// Here is the implementation of memset function on sycl. - template EIGEN_STRONG_INLINE void memset(T *buff, int c, size_t n) const { + template EIGEN_STRONG_INLINE void memset(T *data, int c, size_t n) const { size_t rng, GRange, tileSize; parallel_for_setup(n/sizeof(T), tileSize, rng, GRange); sycl_queue().submit([&](cl::sycl::handler &cgh) { - auto buf_acc =get_sycl_buffer(static_cast(static_cast(buff))). template get_access(cgh); + auto buf_acc =get_sycl_buffer(static_cast(static_cast(data))). template get_access(cgh); cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), [=](cl::sycl::nd_item<1> itemID) { auto globalid=itemID.get_global_linear_id(); - if (globalid< n) { + if (globalid< rng) { for(size_t i=0; i