From e2e3f785331cb90ae07b7ca7829be0ffecf6811b Mon Sep 17 00:00:00 2001
From: Mehdi Goli
Date: Tue, 7 Mar 2017 17:48:15 +0000
Subject: Fixing potential race condition on sycl device.

---
 .../Eigen/CXX11/src/Tensor/TensorDeviceSycl.h      | 502 +++++++++++----------
 1 file changed, 259 insertions(+), 243 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
index 258218463..23297a0a7 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
@@ -41,6 +41,7 @@ namespace Eigen {
   size_t m_i;
   size_t m_offset;
 };
+
 template <typename AccType> struct memsetkernelFunctor{
   AccType m_acc;
@@ -54,6 +55,21 @@
 };
 
+struct memsetCghFunctor{
+  cl::sycl::buffer<uint8_t, 1>& m_buf;
+  const ptrdiff_t& buff_offset;
+  const size_t& rng , GRange, tileSize;
+  const int &c;
+  memsetCghFunctor(cl::sycl::buffer<uint8_t, 1>& buff, const ptrdiff_t& buff_offset_, const size_t& rng_, const size_t& GRange_, const size_t& tileSize_, const int& c_)
+  :m_buf(buff), buff_offset(buff_offset_), rng(rng_), GRange(GRange_), tileSize(tileSize_), c(c_){}
+
+  void operator()(cl::sycl::handler &cgh) const {
+    auto buf_acc = m_buf.template get_access<cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer>(cgh);
+    typedef decltype(buf_acc) AccType;
+    cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), memsetkernelFunctor<AccType>(buf_acc, buff_offset, rng, c));
+  }
+};
+
 //get_devices returns all the available opencl devices. Either use device_selector or exclude devices that computecpp does not support (AMD OpenCL for CPU and intel GPU)
 EIGEN_STRONG_INLINE auto get_sycl_supported_devices()->decltype(cl::sycl::device::get_devices()){
   auto devices = cl::sycl::device::get_devices();
@@ -75,18 +91,8 @@ EIGEN_STRONG_INLINE auto get_sycl_supported_devices()->decltype(cl::sycl::device
   return devices;
 }
-struct QueueInterface {
-  /// class members:
-  bool exception_caught_ = false;
-
-  mutable std::mutex mutex_;
-
-  /// std::map is the container used to make sure that we create only one buffer
-  /// per pointer. The lifespan of the buffer now depends on the lifespan of SyclDevice.
-  /// If a non-read-only pointer is needed to be accessed on the host we should manually deallocate it.
-  mutable std::map<const uint8_t *, cl::sycl::buffer<uint8_t, 1>> buffer_map;
-  /// sycl queue
-  mutable cl::sycl::queue m_queue;
+class QueueInterface {
+public:
   /// creating device by using cl::sycl::selector or cl::sycl::device both are the same and can be captured through dev_Selector typename
   /// SyclStreamDevice is not owned. it is the caller's responsibility to destroy it.
   template <typename dev_Selector> explicit QueueInterface(const dev_Selector& s):
@@ -115,155 +121,6 @@ m_queue(cl::sycl::queue(s, [&](cl::sycl::exception_list l) {
 }))
 #endif
 {}
-//FIXME: currently we have to switch back to write as discard_write doesnot work in forloop
-template <typename Index> EIGEN_STRONG_INLINE void memcpyHostToDevice(Index *dst, const Index *src, size_t n) const {
-  std::lock_guard<std::mutex> lock(mutex_);
-  auto host_acc= find_buffer(dst)->second.template get_access<cl::sycl::access::mode::write, cl::sycl::access::target::host_buffer>();
-  ::memcpy(host_acc.get_pointer(), src, n);
-}
-
-template <typename Index> EIGEN_STRONG_INLINE void memcpyDeviceToHost(void *dst, const Index *src, size_t n) const {
-  std::lock_guard<std::mutex> lock(mutex_);
-  // Assuming that the dst is the start of the destination pointer
-  auto it =find_buffer(src);
-  auto offset =static_cast<const uint8_t*>(static_cast<const void*>(src))- it->first;
-  offset/=sizeof(Index);
-  size_t rng, GRange, tileSize;
-  parallel_for_setup(n/sizeof(Index), tileSize, rng, GRange);
-  auto dest_buf = cl::sycl::buffer<Index, 1, cl::sycl::map_allocator<Index> >(static_cast<Index*>(dst), cl::sycl::range<1>(n));
-  m_queue.submit([&](cl::sycl::handler &cgh) {
-    auto src_acc= it->second.template get_access<cl::sycl::access::mode::read, cl::sycl::access::target::global_buffer>(cgh);
-    auto dst_acc =dest_buf.template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer>(cgh);
-    typedef decltype(src_acc) read_accessor;
-    typedef decltype(dst_acc) write_accessor;
-    cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor<Index, read_accessor, write_accessor>(src_acc, dst_acc, rng, 0, offset));
-  });
-  synchronize();
-}
-
-EIGEN_STRONG_INLINE void synchronize() const {
-  std::lock_guard<std::mutex> lock(mutex_);
-  m_queue.wait_and_throw(); //pass
-}
-EIGEN_STRONG_INLINE void asynchronousExec() const {
-  ///FIXEDME:: currently there is a race condition regarding the asynch scheduler.
-  //sycl_queue().throw_asynchronous();// does not pass. Temporarily disabled
-  std::lock_guard<std::mutex> lock(mutex_);
-  m_queue.wait_and_throw(); //pass
-}
-
-template <typename Index>
-EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize, Index &rng, Index &GRange) const {
-  tileSize =static_cast<Index>(m_queue.get_device().template get_info<cl::sycl::info::device::max_work_group_size>());
-  auto s= m_queue.get_device().template get_info<cl::sycl::info::device::vendor>();
-  std::transform(s.begin(), s.end(), s.begin(), ::tolower);
-  if(m_queue.get_device().is_cpu()){ // intel doesnot allow to use max workgroup size
-    tileSize=std::min(static_cast<Index>(256), static_cast<Index>(tileSize));
-  }
-  rng = n;
-  if (rng==0) rng=static_cast<Index>(1);
-  GRange=rng;
-  if (tileSize>GRange) tileSize=GRange;
-  else if(GRange>tileSize){
-    Index xMode = static_cast<Index>(GRange % tileSize);
-    if (xMode != 0) GRange += static_cast<Index>(tileSize - xMode);
-  }
-}
-
-/// This is used to prepare the number of threads and also the number of threads per block for sycl kernels
-template <typename Index>
-EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1, Index &tileSize0, Index &tileSize1, Index &rng0, Index &rng1, Index &GRange0, Index &GRange1) const {
-  Index max_workgroup_Size = static_cast<Index>(maxSyclThreadsPerBlock());
-  if(m_queue.get_device().is_cpu()){ // intel doesnot allow to use max workgroup size
-    max_workgroup_Size=std::min(static_cast<Index>(256), static_cast<Index>(max_workgroup_Size));
-  }
-  Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
-  tileSize1 =static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2/2)));
-  rng1=dim1;
-  if (rng1==0 ) rng1=static_cast<Index>(1);
-  GRange1=rng1;
-  if (tileSize1>GRange1) tileSize1=GRange1;
-  else if(GRange1>tileSize1){
-    Index xMode = static_cast<Index>(GRange1 % tileSize1);
-    if (xMode != 0) GRange1 += static_cast<Index>(tileSize1 - xMode);
-  }
-  tileSize0 = static_cast<Index>(max_workgroup_Size/tileSize1);
-  rng0 = dim0;
-  if (rng0==0 ) rng0=static_cast<Index>(1);
-  GRange0=rng0;
-  if (tileSize0>GRange0) tileSize0=GRange0;
-  else if(GRange0>tileSize0){
-    Index xMode = static_cast<Index>(GRange0 % tileSize0);
-    if (xMode != 0) GRange0 += static_cast<Index>(tileSize0 - xMode);
-  }
-}
-
-
-
-/// This is used to prepare the number of threads and also the number of threads per block for sycl kernels
-template <typename Index>
-EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1,Index dim2, Index &tileSize0, Index &tileSize1, Index &tileSize2, Index &rng0, Index &rng1, Index &rng2, Index &GRange0, Index &GRange1, Index &GRange2) const {
-  Index max_workgroup_Size = static_cast<Index>(maxSyclThreadsPerBlock());
-  if(m_queue.get_device().is_cpu()){ // intel doesnot allow to use max workgroup size
-    max_workgroup_Size=std::min(static_cast<Index>(256), static_cast<Index>(max_workgroup_Size));
-  }
-  Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
-  tileSize2 =static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2/3)));
-  rng2=dim2;
-  if (rng2==0 ) rng1=static_cast<Index>(1);
-  GRange2=rng2;
-  if (tileSize2>GRange2) tileSize2=GRange2;
-  else if(GRange2>tileSize2){
-    Index xMode = static_cast<Index>(GRange2 % tileSize2);
-    if (xMode != 0) GRange2 += static_cast<Index>(tileSize2 - xMode);
-  }
-  pow_of_2 = static_cast<Index>(std::log2(static_cast<Index>(max_workgroup_Size/tileSize2)));
-  tileSize1 =static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2/2)));
-  rng1=dim1;
-  if (rng1==0 ) rng1=static_cast<Index>(1);
-  GRange1=rng1;
-  if (tileSize1>GRange1) tileSize1=GRange1;
-  else if(GRange1>tileSize1){
-    Index xMode = static_cast<Index>(GRange1 % tileSize1);
-    if (xMode != 0) GRange1 += static_cast<Index>(tileSize1 - xMode);
-  }
-  tileSize0 = static_cast<Index>(max_workgroup_Size/(tileSize1*tileSize2));
-  rng0 = dim0;
-  if (rng0==0 ) rng0=static_cast<Index>(1);
-  GRange0=rng0;
-  if (tileSize0>GRange0) tileSize0=GRange0;
-  else if(GRange0>tileSize0){
-    Index xMode = static_cast<Index>(GRange0 % tileSize0);
-    if (xMode != 0) GRange0 += static_cast<Index>(tileSize0 - xMode);
-  }
-}
-
-
-EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const {
-  std::lock_guard<std::mutex> lock(mutex_);
-  return m_queue.get_device().template get_info<cl::sycl::info::device::max_compute_units>();
-// return stream_->deviceProperties().multiProcessorCount;
-}
-EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const {
-  std::lock_guard<std::mutex> lock(mutex_);
-  return m_queue.get_device().template get_info<cl::sycl::info::device::max_work_group_size>();
-
-// return stream_->deviceProperties().maxThreadsPerBlock;
-}
-EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const {
-  std::lock_guard<std::mutex> lock(mutex_);
-  // OpenCL doesnot have such concept
-  return 2;//sycl_queue().get_device(). template get_info();
-// return stream_->deviceProperties().maxThreadsPerMultiProcessor;
-}
-EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const {
-  std::lock_guard<std::mutex> lock(mutex_);
-  return m_queue.get_device().template get_info<cl::sycl::info::device::local_mem_size>();
-// return stream_->deviceProperties().sharedMemPerBlock;
-}
-
 /// Allocating device pointer. This pointer is actually an 8 bytes host pointer used as key to access the sycl device buffer.
 /// The reason is that we cannot use device buffer as a pointer as a m_data in Eigen leafNode expressions. So we create a key
 /// pointer to be used in Eigen expression construction. When we convert the Eigen construction into the sycl construction we
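Everything deleted above reappears below inside QueueInterface, behind its mutex. The shape of the fix, reduced to its essentials, is sketched here (illustrative only: the real class stores cl::sycl::buffer objects in the map and submits kernels while holding the lock):

#include <cstdint>
#include <map>
#include <mutex>

// All mutable state is private; every public entry point takes the lock
// before touching buffer_map or the queue, which is the race this commit fixes.
class QueueInterfaceSketch {
 public:
  void deallocate_all() const {
    std::lock_guard<std::mutex> lock(mutex_);  // serialise all map access
    buffer_map.clear();
  }
 private:
  mutable std::mutex mutex_;
  mutable std::map<const uint8_t*, size_t> buffer_map;  // payload simplified
};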
@@ -292,23 +149,208 @@ EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const {
     std::lock_guard<std::mutex> lock(mutex_);
     buffer_map.clear();
   }
+  //FIXME: currently we have to switch back to write, as discard_write does not work in a for loop
+  /// The memcpyHostToDevice is used to copy data from a host pointer to the device-only pointer. Using the device
+  /// pointer created as a key we find the sycl buffer and get a host accessor on it. A discard_write accessor
+  /// would guarantee that we do not bring back the current value of the buffer to the host (see the FIXME above).
+  /// Then we use memcpy to copy the data into the host accessor. The first time the buffer is accessed on the
+  /// device afterwards, the data will be copied there.
+  template <typename Index> EIGEN_STRONG_INLINE void memcpyHostToDevice(Index *dst, const Index *src, size_t n) const {
+    std::lock_guard<std::mutex> lock(mutex_);
+    auto host_acc= find_buffer(dst)->second.template get_access<cl::sycl::access::mode::write, cl::sycl::access::target::host_buffer>();
+    ::memcpy(host_acc.get_pointer(), src, n);
+  }
+  /// The memcpyDeviceToHost is used to copy data from the device back to the host. In order to avoid copying the
+  /// data twice, we create a sycl buffer with map_allocator for the destination pointer, with a discard_write
+  /// accessor on it. The lifespan of that buffer is bound to the lifespan of the memcpyDeviceToHost call. We create
+  /// a kernel to copy the data from the device-only source buffer to the destination buffer with map_allocator on
+  /// the gpu in parallel. At the end of the function call the destination buffer is destroyed and the data becomes
+  /// available on the dst pointer through the fast copy technique (map_allocator). This way we make sure that the
+  /// data is copied back to the cpu only once per function call.
+  template <typename Index> EIGEN_STRONG_INLINE void memcpyDeviceToHost(void *dst, const Index *src, size_t n) const {
+    std::lock_guard<std::mutex> lock(mutex_);
+    auto it =find_buffer(src);
+    auto offset =static_cast<const uint8_t*>(static_cast<const void*>(src))- it->first;
+    offset/=sizeof(Index);
+    size_t rng, GRange, tileSize;
+    parallel_for_setup(n/sizeof(Index), tileSize, rng, GRange);
+    auto dest_buf = cl::sycl::buffer<Index, 1, cl::sycl::map_allocator<Index> >(static_cast<Index*>(dst), cl::sycl::range<1>(n));
+    m_queue.submit([&](cl::sycl::handler &cgh) {
+      auto src_acc= it->second.template get_access<cl::sycl::access::mode::read, cl::sycl::access::target::global_buffer>(cgh);
+      auto dst_acc =dest_buf.template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer>(cgh);
+      typedef decltype(src_acc) read_accessor;
+      typedef decltype(dst_acc) write_accessor;
+      cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor<Index, read_accessor, write_accessor>(src_acc, dst_acc, rng, 0, offset));
+    });
+    synchronize();
+  }
-  EIGEN_STRONG_INLINE std::map<const uint8_t *, cl::sycl::buffer<uint8_t, 1>>::iterator find_buffer(const void* ptr) const {
+  /// the device-to-device memcpy function
+  template <typename Index> EIGEN_STRONG_INLINE void memcpy(void *dst, const Index *src, size_t n) const {
     std::lock_guard<std::mutex> lock(mutex_);
-    auto it1 = buffer_map.find(static_cast<const uint8_t*>(ptr));
-    if (it1 != buffer_map.end()){
-      return it1;
+    auto it1 = find_buffer(static_cast<const void*>(src));
+    auto it2 = find_buffer(dst);
+    auto offset= (static_cast<const uint8_t*>(static_cast<const void*>(src))) - it1->first;
+    auto i= (static_cast<const uint8_t*>(dst)) - it2->first;
+    offset/=sizeof(Index);
+    i/=sizeof(Index);
+    size_t rng, GRange, tileSize;
+    parallel_for_setup(n/sizeof(Index), tileSize, rng, GRange);
+    m_queue.submit([&](cl::sycl::handler &cgh) {
+      auto src_acc =it1->second.template get_access<cl::sycl::access::mode::read, cl::sycl::access::target::global_buffer>(cgh);
+      auto dst_acc =it2->second.template get_access<cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer>(cgh);
+      typedef decltype(src_acc) read_accessor;
+      typedef decltype(dst_acc) write_accessor;
+      cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor<Index, read_accessor, write_accessor>(src_acc, dst_acc, rng, i, offset));
+    });
+    synchronize();
+  }
+
+  EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const {
+    std::lock_guard<std::mutex> lock(mutex_);
+    size_t rng, GRange, tileSize;
+    parallel_for_setup(n, tileSize, rng, GRange);
+    auto it1 = find_buffer(static_cast<const void*>(data));
+    ptrdiff_t buff_offset= (static_cast<const uint8_t*>(data)) - it1->first;
+    m_queue.submit(memsetCghFunctor(it1->second, buff_offset, rng, GRange, tileSize, c ));
+    synchronize();
+  }
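For reference, the memsetCghFunctor/memsetkernelFunctor pair defined at the top of this file boils down to roughly the following submission (a sketch in SYCL 1.2.1 style; the named lambda stands in for the functor-based kernel, and rng/GRange/tileSize come from parallel_for_setup):

#include <CL/sycl.hpp>
#include <cstddef>

// Pad-and-guard pattern: launch GRange work-items (a multiple of tileSize),
// let only the first rng of them write, offsetting into the shared byte buffer.
void memset_sketch(cl::sycl::queue& q, cl::sycl::buffer<uint8_t, 1>& buf,
                   std::ptrdiff_t buff_offset, size_t rng, size_t GRange,
                   size_t tileSize, int c) {
  q.submit([&](cl::sycl::handler& cgh) {
    auto acc = buf.get_access<cl::sycl::access::mode::write>(cgh);
    cgh.parallel_for<class MemsetSketch>(
        cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange),
                              cl::sycl::range<1>(tileSize)),
        [=](cl::sycl::nd_item<1> item) {
          size_t gid = item.get_global_linear_id();
          if (gid < rng) acc[gid + buff_offset] = static_cast<uint8_t>(c);
        });
  });
  q.wait_and_throw();  // mirrors the synchronize() call above
}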
+  /// Creation of sycl accessor for a buffer. This function first tries to find
+  /// the buffer in the buffer_map. If found it gets the accessor from it, if not,
+  /// the function then adds an entry by creating a sycl buffer for that particular pointer.
+  template <cl::sycl::access::mode AcMd> EIGEN_STRONG_INLINE cl::sycl::accessor<uint8_t, 1, AcMd, cl::sycl::access::target::global_buffer>
+  get_sycl_accessor(cl::sycl::handler &cgh, const void* ptr) const {
+    std::lock_guard<std::mutex> lock(mutex_);
+    return (find_buffer(ptr)->second.template get_access<AcMd>(cgh));
+  }
+
+  /// Accessing the created sycl device buffer for the device pointer
+  EIGEN_STRONG_INLINE cl::sycl::buffer<uint8_t, 1>& get_sycl_buffer(const void * ptr) const {
+    std::lock_guard<std::mutex> lock(mutex_);
+    return find_buffer(ptr)->second;
+  }
+
+  EIGEN_STRONG_INLINE ptrdiff_t get_offset(const void *ptr) const {
+    std::lock_guard<std::mutex> lock(mutex_);
+    return (static_cast<const uint8_t*>(ptr))-(find_buffer(ptr)->first);
+  }
+
+  EIGEN_STRONG_INLINE void synchronize() const {
+    m_queue.wait_and_throw(); //pass
+  }
+
+  EIGEN_STRONG_INLINE void asynchronousExec() const {
+    ///FIXME: currently there is a race condition regarding the asynch scheduler.
+    //sycl_queue().throw_asynchronous();// FIXME: does not pass. Temporarily disabled
+    m_queue.wait_and_throw(); //pass
+  }
+
+  template <typename Index>
+  EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize, Index &rng, Index &GRange) const {
+    tileSize =static_cast<Index>(m_queue.get_device().template get_info<cl::sycl::info::device::max_work_group_size>());
+    auto s= m_queue.get_device().template get_info<cl::sycl::info::device::vendor>();
+    std::transform(s.begin(), s.end(), s.begin(), ::tolower);
+    if(m_queue.get_device().is_cpu()){ // intel does not allow using the max workgroup size
+      tileSize=std::min(static_cast<Index>(256), static_cast<Index>(tileSize));
     }
-    else{
-      for(std::map<const uint8_t *, cl::sycl::buffer<uint8_t, 1>>::iterator it=buffer_map.begin(); it!=buffer_map.end(); ++it){
-        auto size = it->second.get_size();
-        if((it->first < (static_cast<const uint8_t*>(ptr))) && ((static_cast<const uint8_t*>(ptr)) < (it->first + size)) ) return it;
-      }
+    rng = n;
+    if (rng==0) rng=static_cast<Index>(1);
+    GRange=rng;
+    if (tileSize>GRange) tileSize=GRange;
+    else if(GRange>tileSize){
+      Index xMode = static_cast<Index>(GRange % tileSize);
+      if (xMode != 0) GRange += static_cast<Index>(tileSize - xMode);
     }
-    std::cerr << "No sycl buffer found. Make sure that you have allocated memory for your buffer by calling malloc-ed function."<< std::endl;
-    abort();
   }
+  /// This is used to prepare the number of threads and also the number of threads per block for sycl kernels
+  template <typename Index>
+  EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1, Index &tileSize0, Index &tileSize1, Index &rng0, Index &rng1, Index &GRange0, Index &GRange1) const {
+    Index max_workgroup_Size = static_cast<Index>(maxSyclThreadsPerBlock());
+    if(m_queue.get_device().is_cpu()){ // intel does not allow using the max workgroup size
+      max_workgroup_Size=std::min(static_cast<Index>(256), static_cast<Index>(max_workgroup_Size));
+    }
+    Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
+    tileSize1 =static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2/2)));
+    rng1=dim1;
+    if (rng1==0 ) rng1=static_cast<Index>(1);
+    GRange1=rng1;
+    if (tileSize1>GRange1) tileSize1=GRange1;
+    else if(GRange1>tileSize1){
+      Index xMode = static_cast<Index>(GRange1 % tileSize1);
+      if (xMode != 0) GRange1 += static_cast<Index>(tileSize1 - xMode);
+    }
+    tileSize0 = static_cast<Index>(max_workgroup_Size/tileSize1);
+    rng0 = dim0;
+    if (rng0==0 ) rng0=static_cast<Index>(1);
+    GRange0=rng0;
+    if (tileSize0>GRange0) tileSize0=GRange0;
+    else if(GRange0>tileSize0){
+      Index xMode = static_cast<Index>(GRange0 % tileSize0);
+      if (xMode != 0) GRange0 += static_cast<Index>(tileSize0 - xMode);
+    }
+  }
+
+  /// This is used to prepare the number of threads and also the number of threads per block for sycl kernels
+  template <typename Index>
+  EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1,Index dim2, Index &tileSize0, Index &tileSize1, Index &tileSize2, Index &rng0, Index &rng1, Index &rng2, Index &GRange0, Index &GRange1, Index &GRange2) const {
+    Index max_workgroup_Size = static_cast<Index>(maxSyclThreadsPerBlock());
+    if(m_queue.get_device().is_cpu()){ // intel does not allow using the max workgroup size
+      max_workgroup_Size=std::min(static_cast<Index>(256), static_cast<Index>(max_workgroup_Size));
+    }
+    Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
+    tileSize2 =static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2/3)));
+    rng2=dim2;
+    if (rng2==0 ) rng2=static_cast<Index>(1);
+    GRange2=rng2;
+    if (tileSize2>GRange2) tileSize2=GRange2;
+    else if(GRange2>tileSize2){
+      Index xMode = static_cast<Index>(GRange2 % tileSize2);
+      if (xMode != 0) GRange2 += static_cast<Index>(tileSize2 - xMode);
+    }
+    pow_of_2 = static_cast<Index>(std::log2(static_cast<Index>(max_workgroup_Size/tileSize2)));
+    tileSize1 =static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2/2)));
+    rng1=dim1;
+    if (rng1==0 ) rng1=static_cast<Index>(1);
+    GRange1=rng1;
+    if (tileSize1>GRange1) tileSize1=GRange1;
+    else if(GRange1>tileSize1){
+      Index xMode = static_cast<Index>(GRange1 % tileSize1);
+      if (xMode != 0) GRange1 += static_cast<Index>(tileSize1 - xMode);
+    }
+    tileSize0 = static_cast<Index>(max_workgroup_Size/(tileSize1*tileSize2));
+    rng0 = dim0;
+    if (rng0==0 ) rng0=static_cast<Index>(1);
+    GRange0=rng0;
+    if (tileSize0>GRange0) tileSize0=GRange0;
+    else if(GRange0>tileSize0){
+      Index xMode = static_cast<Index>(GRange0 % tileSize0);
+      if (xMode != 0) GRange0 += static_cast<Index>(tileSize0 - xMode);
+    }
+  }
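To see what the log2/pow arithmetic above actually produces, here is the 3D split worked through for max_workgroup_Size = 256 (a standalone sketch; the comments show the values the integer math yields):

#include <cmath>
#include <cstdio>

int main() {
  long maxWG = 256;
  long pow_of_2 = static_cast<long>(std::log2(maxWG));        // 8
  long tile2 = static_cast<long>(std::pow(2, pow_of_2 / 3));  // 2^2 = 4
  long p1 = static_cast<long>(std::log2(maxWG / tile2));      // log2(64) = 6
  long tile1 = static_cast<long>(std::pow(2, p1 / 2));        // 2^3 = 8
  long tile0 = maxWG / (tile1 * tile2);                       // 256/32 = 8
  std::printf("%ld x %ld x %ld = %ld threads per block\n",
              tile0, tile1, tile2, tile0 * tile1 * tile2);    // 8 x 8 x 4 = 256
  return 0;
}

Each per-dimension global range is then padded up to a multiple of its tile size, exactly as in the 1D case, and the kernels guard with a range check so the padding work-items do nothing.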
+  EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const {
+    return m_queue.get_device().template get_info<cl::sycl::info::device::max_compute_units>();
+  }
+
+  EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const {
+    return m_queue.get_device().template get_info<cl::sycl::info::device::max_work_group_size>();
+  }
+
+  /// No need for sycl; it should act the same as the CPU version
+  EIGEN_STRONG_INLINE int majorDeviceVersion() const { return 1; }
+
+  EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const {
+    // OpenCL does not have such a concept
+    return 2;
+  }
+
+  EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const {
+    return m_queue.get_device().template get_info<cl::sycl::info::device::local_mem_size>();
+  }
+
+  EIGEN_STRONG_INLINE cl::sycl::queue& sycl_queue() const { return m_queue;}
+
   // This function checks if the runtime recorded an error for the
   // underlying stream device.
   EIGEN_STRONG_INLINE bool ok() const {
@@ -320,25 +362,52 @@ EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const {
     return !exception_caught_;
   }
   // destructor
   ~QueueInterface() { buffer_map.clear(); }
+
+private:
+  /// class members:
+  bool exception_caught_ = false;
+
+  mutable std::mutex mutex_;
+
+  /// std::map is the container used to make sure that we create only one buffer
+  /// per pointer. The lifespan of the buffer now depends on the lifespan of SyclDevice.
+  /// If a non-read-only pointer is needed to be accessed on the host we should manually deallocate it.
+  mutable std::map<const uint8_t *, cl::sycl::buffer<uint8_t, 1>> buffer_map;
+  /// sycl queue
+  mutable cl::sycl::queue m_queue;
+
+  EIGEN_STRONG_INLINE std::map<const uint8_t *, cl::sycl::buffer<uint8_t, 1>>::iterator find_buffer(const void* ptr) const {
+    auto it1 = buffer_map.find(static_cast<const uint8_t*>(ptr));
+    if (it1 != buffer_map.end()){
+      return it1;
+    }
+    else{
+      for(std::map<const uint8_t *, cl::sycl::buffer<uint8_t, 1>>::iterator it=buffer_map.begin(); it!=buffer_map.end(); ++it){
+        auto size = it->second.get_size();
+        if((it->first < (static_cast<const uint8_t*>(ptr))) && ((static_cast<const uint8_t*>(ptr)) < (it->first + size)) ) return it;
+      }
+    }
+    std::cerr << "No sycl buffer found. Make sure that you have allocated memory for your buffer by calling the allocate function."<< std::endl;
+    abort();
+  }
 };
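Note that find_buffer accepts interior pointers, not just allocation bases: it tries an exact key match first, then falls back to a linear scan for a buffer whose byte range contains the pointer. The same lookup in isolation (a hypothetical simplification in which a byte size stands in for each cl::sycl::buffer):

#include <cstdint>
#include <map>

using BufferMap = std::map<const uint8_t*, size_t>;

// Exact match first; otherwise scan for an allocation [base, base + size)
// that contains ptr, mirroring the fallback loop in find_buffer above.
BufferMap::const_iterator find_alloc(const BufferMap& m, const void* ptr) {
  const uint8_t* key = static_cast<const uint8_t*>(ptr);
  BufferMap::const_iterator it = m.find(key);
  if (it != m.end()) return it;
  for (it = m.begin(); it != m.end(); ++it)
    if (it->first < key && key < it->first + it->second) return it;
  return m.end();  // the real code prints an error and aborts here
}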
+// Here is the sycl device struct, which accepts the sycl queue interface
+// as an input
 struct SyclDevice {
   // class member.
   QueueInterface* m_queue_stream;
   /// QueueInterface is not owned. it is the caller's responsibility to destroy it.
   explicit SyclDevice(QueueInterface* queue_stream) : m_queue_stream(queue_stream){}
-  /// Creation of sycl accessor for a buffer. This function first tries to find
-  /// the buffer in the buffer_map. If found it gets the accessor from it, if not,
-  /// the function then adds an entry by creating a sycl buffer for that particular pointer.
+  // get sycl accessor
   template <cl::sycl::access::mode AcMd> EIGEN_STRONG_INLINE cl::sycl::accessor<uint8_t, 1, AcMd, cl::sycl::access::target::global_buffer>
   get_sycl_accessor(cl::sycl::handler &cgh, const void* ptr) const {
-    return (get_sycl_buffer(ptr).template get_access<AcMd>(cgh));
+    return m_queue_stream->template get_sycl_accessor<AcMd>(cgh, ptr);
   }
 
   /// Accessing the created sycl device buffer for the device pointer
   EIGEN_STRONG_INLINE cl::sycl::buffer<uint8_t, 1>& get_sycl_buffer(const void * ptr) const {
-    return m_queue_stream->find_buffer(ptr)->second;
+    return m_queue_stream->get_sycl_buffer(ptr);
   }
 
   /// This is used to prepare the number of threads and also the number of threads per block for sycl kernels
@@ -353,8 +422,6 @@ struct SyclDevice {
     m_queue_stream->parallel_for_setup(dim0, dim1, tileSize0, tileSize1, rng0, rng1, GRange0, GRange1);
   }
-
-
   /// This is used to prepare the number of threads and also the number of threads per block for sycl kernels
   template <typename Index>
   EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1,Index dim2, Index &tileSize0, Index &tileSize1, Index &tileSize2, Index &rng0, Index &rng1, Index &rng2, Index &GRange0, Index &GRange1, Index &GRange2) const {
@@ -375,72 +442,27 @@ struct SyclDevice {
   /// the memcpy function
   template <typename Index> EIGEN_STRONG_INLINE void memcpy(void *dst, const Index *src, size_t n) const {
-    auto it1 = m_queue_stream->find_buffer(static_cast<const void*>(src));
-    auto it2 = m_queue_stream->find_buffer(dst);
-    auto offset= (static_cast<const uint8_t*>(static_cast<const void*>(src))) - it1->first;
-    auto i= (static_cast<const uint8_t*>(dst)) - it2->first;
-    offset/=sizeof(Index);
-    i/=sizeof(Index);
-    size_t rng, GRange, tileSize;
-    parallel_for_setup(n/sizeof(Index), tileSize, rng, GRange);
-    sycl_queue().submit([&](cl::sycl::handler &cgh) {
-      auto src_acc =it1->second.template get_access<cl::sycl::access::mode::read, cl::sycl::access::target::global_buffer>(cgh);
-      auto dst_acc =it2->second.template get_access<cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer>(cgh);
-      typedef decltype(src_acc) read_accessor;
-      typedef decltype(dst_acc) write_accessor;
-      cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor<Index, read_accessor, write_accessor>(src_acc, dst_acc, rng, i, offset));
-    });
-    synchronize();
+    m_queue_stream->memcpy(dst,src,n);
   }
 
   EIGEN_STRONG_INLINE ptrdiff_t get_offset(const void *ptr) const {
-    auto it = m_queue_stream->find_buffer(ptr);
-    return (static_cast<const uint8_t*>(ptr))-it->first;
+    return m_queue_stream->get_offset(ptr);
   }
-  /// The memcpyHostToDevice is used to copy the device only pointer to a host pointer. Using the device
-  /// pointer created as a key we find the sycl buffer and get the host accessor with discard_write mode
-  /// on it. Using a discard_write accessor guarantees that we do not bring back the current value of the
-  /// buffer to host. Then we use the memcpy to copy the data to the host accessor. The first time that
-  /// this buffer is accessed, the data will be copied to the device.
+  // memcpyHostToDevice
   template <typename Index> EIGEN_STRONG_INLINE void memcpyHostToDevice(Index *dst, const Index *src, size_t n) const {
     m_queue_stream->memcpyHostToDevice(dst,src,n);
   }
-  /// The memcpyDeviceToHost is used to copy the data from host to device. Here, in order to avoid double copying the data. We create a sycl
-  /// buffer with map_allocator for the destination pointer with a discard_write accessor on it. The lifespan of the buffer is bound to the
-  /// lifespan of the memcpyDeviceToHost function. We create a kernel to copy the data, from the device- only source buffer to the destination
-  /// buffer with map_allocator on the gpu in parallel. At the end of the function call the destination buffer would be destroyed and the data
-  /// would be available on the dst pointer using fast copy technique (map_allocator). In this case we can make sure that we copy the data back
-  /// to the cpu only once per function call.
+  /// Here is the memcpyDeviceToHost
   template <typename Index> EIGEN_STRONG_INLINE void memcpyDeviceToHost(void *dst, const Index *src, size_t n) const {
     m_queue_stream->memcpyDeviceToHost(dst,src,n);
   }
-  /// returning the sycl queue
-  EIGEN_STRONG_INLINE cl::sycl::queue& sycl_queue() const { return m_queue_stream->m_queue;}
   /// Here is the implementation of memset function on sycl.
   EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const {
-    size_t rng, GRange, tileSize;
-    parallel_for_setup(n, tileSize, rng, GRange);
-    auto it1 = m_queue_stream->find_buffer(static_cast<const void*>(data));
-    ptrdiff_t buff_offset= (static_cast<const uint8_t*>(data)) - it1->first;
-    sycl_queue().submit(memsetCghFunctor(it1->second, buff_offset, rng, GRange, tileSize, c ));
-    synchronize();
+    m_queue_stream->memset(data,c,n);
   }
-
-  struct memsetCghFunctor{
-    cl::sycl::buffer<uint8_t, 1>& m_buf;
-    const ptrdiff_t& buff_offset;
-    const size_t& rng , GRange, tileSize;
-    const int &c;
-    memsetCghFunctor(cl::sycl::buffer<uint8_t, 1>& buff, const ptrdiff_t& buff_offset_, const size_t& rng_, const size_t& GRange_, const size_t& tileSize_, const int& c_)
-    :m_buf(buff), buff_offset(buff_offset_), rng(rng_), GRange(GRange_), tileSize(tileSize_), c(c_){}
-
-    void operator()(cl::sycl::handler &cgh) const {
-      auto buf_acc = m_buf.template get_access<cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer>(cgh);
-      typedef decltype(buf_acc) AccType;
-      cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), memsetkernelFunctor<AccType>(buf_acc, buff_offset, rng, c));
-    }
-  };
+  /// returning the sycl queue
+  EIGEN_STRONG_INLINE cl::sycl::queue& sycl_queue() const { return m_queue_stream->sycl_queue();}
 
   EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
     // FIXME
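With the bodies moved out, SyclDevice is now a thin forwarding shim, so call sites stay unchanged. A usage sketch (assumptions: a ComputeCpp-era SYCL toolchain, EIGEN_USE_SYCL defined, and the allocate/deallocate members that are elided from this excerpt):

#define EIGEN_USE_SYCL
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  cl::sycl::default_selector s;
  Eigen::QueueInterface queue_interface(s);  // owns queue, mutex and buffer_map
  Eigen::SyclDevice dev(&queue_interface);   // non-owning wrapper
  float* p = static_cast<float*>(dev.allocate(64 * sizeof(float)));
  dev.memset(p, 0, 64 * sizeof(float));      // forwards to QueueInterface::memset
  dev.synchronize();
  dev.deallocate(p);
  return 0;
}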
@@ -449,37 +471,31 @@ struct SyclDevice {
   EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
     // We won't try to take advantage of the l2 cache for the time being, and
-    // there is no l3 cache on cuda devices.
+    // there is no l3 cache on sycl devices.
     return firstLevelCacheSize();
   }
   EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const {
-    return sycl_queue().get_device().template get_info<cl::sycl::info::device::max_compute_units>();
-    // return stream_->deviceProperties().multiProcessorCount;
+    return m_queue_stream->getNumSyclMultiProcessors();
   }
   EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const {
-    return sycl_queue().get_device().template get_info<cl::sycl::info::device::max_work_group_size>();
-
-    // return stream_->deviceProperties().maxThreadsPerBlock;
+    return m_queue_stream->maxSyclThreadsPerBlock();
   }
   EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const {
     // OpenCL doesnot have such concept
-    return 2;//sycl_queue().get_device(). template get_info();
+    return m_queue_stream->maxSyclThreadsPerMultiProcessor();
     // return stream_->deviceProperties().maxThreadsPerMultiProcessor;
   }
   EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const {
-    return sycl_queue().get_device().template get_info<cl::sycl::info::device::local_mem_size>();
-    // return stream_->deviceProperties().sharedMemPerBlock;
+    return m_queue_stream->sharedMemPerBlock();
   }
   /// No need for sycl it should act the same as CPU version
-  EIGEN_STRONG_INLINE int majorDeviceVersion() const { return 1; }
+  EIGEN_STRONG_INLINE int majorDeviceVersion() const { return m_queue_stream->majorDeviceVersion(); }
 
   EIGEN_STRONG_INLINE void synchronize() const {
     m_queue_stream->synchronize(); //pass
   }
   EIGEN_STRONG_INLINE void asynchronousExec() const {
-    ///FIXEDME:: currently there is a race condition regarding the asynch scheduler.
-    //sycl_queue().throw_asynchronous();// does not pass. Temporarily disabled
     m_queue_stream->asynchronousExec();
   }
   // This function checks if the runtime recorded an error for the
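One issue the patch documents rather than fixes: asynchronousExec() still degrades to a blocking wait because queue::throw_asynchronous() races with the scheduler. In SYCL 1.2.1 terms the two behaviours differ as sketched here (q is any cl::sycl::queue constructed with an async_handler):

#include <CL/sycl.hpp>

// throw_asynchronous() only rethrows errors the async handler has already
// captured; wait_and_throw() additionally blocks until submitted work ends.
void async_exec_sketch(cl::sycl::queue& q, bool blocking) {
  if (blocking)
    q.wait_and_throw();      // what this patch does everywhere (safe)
  else
    q.throw_asynchronous();  // the intended behaviour, disabled above
}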