diff options
author | Mehdi Goli <mehdi.goli@codeplay.com> | 2016-12-14 15:30:37 +0000 |
---|---|---|
committer | Mehdi Goli <mehdi.goli@codeplay.com> | 2016-12-14 15:30:37 +0000 |
commit | 2d4a091beb9e55664c1475137af7166d524cbc1d (patch) | |
tree | d9e4baec0be3eb3c8a4bb2451701f7e49730daa1 /unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h | |
parent | 3d59a477201d4d4f34b4332fda699c21387cf726 (diff) |
Adding tensor contraction operation backend for Sycl; adding test for contractionOp sycl backend; adding temporary solution to prevent memory leak in buffer; cleaning up cxx11_tensor_buildins_sycl.h
Diffstat (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h')
-rw-r--r-- | unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h | 28 |
1 files changed, 16 insertions, 12 deletions
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h index 40dd5d81a..f92ea1d7b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h @@ -31,7 +31,7 @@ namespace Eigen { auto dst_ptr = ConvertToActualTypeSycl(Scalar, m_dst_acc); auto globalid = itemID.get_global_linear_id(); if (globalid < m_rng) { - dst_ptr[globalid + m_i] = src_ptr[globalid + m_offset]; + dst_ptr[globalid + m_i] = src_ptr[globalid + m_offset]; } } @@ -50,7 +50,7 @@ EIGEN_STRONG_INLINE auto get_sycl_supported_devices()->decltype(cl::sycl::device /// get_devices returns all the available opencl devices. Either use device_selector or exclude devices that computecpp does not support (AMD OpenCL for CPU ) auto s= (*it).template get_info<cl::sycl::info::device::vendor>(); std::transform(s.begin(), s.end(), s.begin(), ::tolower); - if((*it).is_cpu() && s.find("amd")!=std::string::npos){ + if((*it).is_cpu() && s.find("amd")!=std::string::npos){ // remove amd cpu as it is not supported by computecpp it=devices.erase(it); } else{ @@ -72,9 +72,9 @@ struct QueueInterface { mutable std::map<const uint8_t *, cl::sycl::buffer<uint8_t, 1>> buffer_map; /// sycl queue mutable cl::sycl::queue m_queue; - /// creating device by using cl::sycl::selector or cl::sycl::device both are the same and can be captured throufh dev_Selector typename + /// creating device by using cl::sycl::selector or cl::sycl::device both are the same and can be captured through dev_Selector typename /// SyclStreamDevice is not owned. it is the caller's responsibility to destroy it. - template<typename dev_Selector> explicit QueueInterface(dev_Selector s): + template<typename dev_Selector> explicit QueueInterface(const dev_Selector& s): #ifdef EIGEN_EXCEPTIONS m_queue(cl::sycl::queue(s, [&](cl::sycl::exception_list l) { for (const auto& e : l) { @@ -103,17 +103,21 @@ struct QueueInterface { auto ptr =buf.get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::host_buffer>().get_pointer(); buf.set_final_data(nullptr); std::lock_guard<std::mutex> lock(mutex_); - buffer_map.insert(std::pair<const uint8_t *, cl::sycl::buffer<uint8_t, 1>>(ptr,buf)); + buffer_map.insert(std::pair<const uint8_t *, cl::sycl::buffer<uint8_t, 1>>(static_cast<const uint8_t*>(ptr),buf)); return static_cast<void*>(ptr); } /// This is used to deallocate the device pointer. p is used as a key inside /// the map to find the device buffer and delete it. - EIGEN_STRONG_INLINE void deallocate(const void *p) const { + EIGEN_STRONG_INLINE void deallocate(void *p) const { std::lock_guard<std::mutex> lock(mutex_); auto it = buffer_map.find(static_cast<const uint8_t*>(p)); if (it != buffer_map.end()) { + auto num_bytes =it->second.get_size(); buffer_map.erase(it); + // Temporary solution for memory leak in computecpp. It will be fixed in the next computecpp version + std::allocator<uint8_t> a1; // Default allocator for buffer<uint8_t,1> + a1.deallocate(static_cast<uint8_t*>(p), num_bytes); } } @@ -188,7 +192,7 @@ struct SyclDevice { return m_queue_stream->allocate(num_bytes); } /// deallocate device memory - EIGEN_STRONG_INLINE void deallocate(const void *p) const { + EIGEN_STRONG_INLINE void deallocate(void *p) const { m_queue_stream->deallocate(p); } @@ -235,25 +239,25 @@ struct SyclDevice { size_t rng, GRange, tileSize; parallel_for_setup(n/sizeof(T), tileSize, rng, GRange); // Assuming that the dst is the start of the destination pointer - auto dest_buf = cl::sycl::buffer<uint8_t, 1, cl::sycl::map_allocator<uint8_t> >(static_cast<uint8_t*>(dst), cl::sycl::range<1>(rng*sizeof(T))); + auto dest_buf = cl::sycl::buffer<uint8_t, 1, cl::sycl::map_allocator<uint8_t> >(static_cast<uint8_t*>(dst), cl::sycl::range<1>(n)); sycl_queue().submit([&](cl::sycl::handler &cgh) { auto src_acc= it->second.template get_access<cl::sycl::access::mode::read, cl::sycl::access::target::global_buffer>(cgh); auto dst_acc =dest_buf.template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer>(cgh); - cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor<T>(src_acc, dst_acc, rng, 0, offset)); + cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor<T>(src_acc, dst_acc, rng, 0, 0)); }); synchronize(); } /// returning the sycl queue EIGEN_STRONG_INLINE cl::sycl::queue& sycl_queue() const { return m_queue_stream->m_queue;} /// Here is the implementation of memset function on sycl. - template<typename T> EIGEN_STRONG_INLINE void memset(T *buff, int c, size_t n) const { + template<typename T> EIGEN_STRONG_INLINE void memset(T *data, int c, size_t n) const { size_t rng, GRange, tileSize; parallel_for_setup(n/sizeof(T), tileSize, rng, GRange); sycl_queue().submit([&](cl::sycl::handler &cgh) { - auto buf_acc =get_sycl_buffer(static_cast<uint8_t*>(static_cast<void*>(buff))). template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer>(cgh); + auto buf_acc =get_sycl_buffer(static_cast<uint8_t*>(static_cast<void*>(data))). template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer>(cgh); cgh.parallel_for<SyclDevice>( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), [=](cl::sycl::nd_item<1> itemID) { auto globalid=itemID.get_global_linear_id(); - if (globalid< n) { + if (globalid< rng) { for(size_t i=0; i<sizeof(T); i++) buf_acc[globalid*sizeof(T) + i] = c; } |