diff options
author | Mehdi Goli <mehdi.goli@codeplay.com> | 2016-11-14 17:51:57 +0000 |
---|---|---|
committer | Mehdi Goli <mehdi.goli@codeplay.com> | 2016-11-14 17:51:57 +0000 |
commit | f8ca893976316df46791d2f088552fb2aea419bb (patch) | |
tree | 85a25dfa9e9e669334f5120e8085e70f1b2e3a3e /unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h | |
parent | a5c3f15682299495f98b6f5480c798fd3211f590 (diff) |
Adding TensorFixsize; adding sycl device memcpy; adding insial stage of slicing.
Diffstat (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h')
-rw-r--r-- | unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h | 67 |
1 files changed, 52 insertions, 15 deletions
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h index b2ddea2ba..7f0f16de3 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h @@ -123,9 +123,45 @@ struct SyclDevice { // some runtime conditions that can be applied here EIGEN_STRONG_INLINE bool isDeviceSuitable() const { return true; } + template <typename T> EIGEN_STRONG_INLINE std::map<const void *, std::shared_ptr<void>>::iterator find_nearest(const T* ptr) const { + auto it1 = buffer_map.find(ptr); + if (it1 != buffer_map.end()){ + return it1; + } + else{ + for(std::map<const void *, std::shared_ptr<void>>::iterator it=buffer_map.begin(); it!=buffer_map.end(); ++it){ + auto size = ((cl::sycl::buffer<T, 1>*)it->second.get())->get_size(); + if((static_cast<const T*>(it->first) < ptr) && (ptr < (static_cast<const T*>(it->first)) + size)) return it; + } + } + return buffer_map.end(); + } + /// the memcpy function - EIGEN_STRONG_INLINE void memcpy(void *dst, const void *src, size_t n) const { - ::memcpy(dst, src, n); + template<typename T> EIGEN_STRONG_INLINE void memcpy(void *dst, const T *src, size_t n) const { + auto it1 = find_nearest(src); + auto it2 = find_nearest(static_cast<T*>(dst)); + if ((it1 != buffer_map.end()) && (it2!=buffer_map.end())) { + auto offset= (src - (static_cast<const T*>(it1->first))); + auto i= ((static_cast<T*>(dst)) - const_cast<T*>((static_cast<const T*>(it2->first)))); + size_t rng, GRange, tileSize; + parallel_for_setup(n/sizeof(T), tileSize, rng, GRange); + m_queue.submit([&](cl::sycl::handler &cgh) { + auto src_acc =((cl::sycl::buffer<T, 1>*)it1->second.get())-> template get_access<cl::sycl::access::mode::read, cl::sycl::access::target::global_buffer>(cgh); + auto dst_acc =((cl::sycl::buffer<T, 1>*)it2->second.get())-> template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer>(cgh); + typedef decltype(src_acc) DevToDev; + cgh.parallel_for<DevToDev>( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), [=](cl::sycl::nd_item<1> itemID) { + auto globalid=itemID.get_global_linear_id(); + if (globalid< rng) { + dst_acc[globalid+i ]=src_acc[globalid+offset]; + } + }); + }); + m_queue.throw_asynchronous(); + } else{ + eigen_assert("no source or destination device memory found."); + } + //::memcpy(dst, src, n); } /// The memcpyHostToDevice is used to copy the device only pointer to a host pointer. Using the device @@ -136,7 +172,7 @@ struct SyclDevice { template<typename T> EIGEN_STRONG_INLINE void memcpyHostToDevice(T *dst, const T *src, size_t n) const { auto host_acc= get_sycl_buffer(n, dst)-> template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::host_buffer>(); - memcpy(host_acc.get_pointer(), src, n); + ::memcpy(host_acc.get_pointer(), src, n); } /// The memcpyDeviceToHost is used to copy the data from host to device. Here, in order to avoid double copying the data. We create a sycl /// buffer with map_allocator for the destination pointer with a discard_write accessor on it. The lifespan of the buffer is bound to the @@ -145,21 +181,22 @@ struct SyclDevice { /// would be available on the dst pointer using fast copy technique (map_allocator). In this case we can make sure that we copy the data back /// to the cpu only once per function call. template<typename T> EIGEN_STRONG_INLINE void memcpyDeviceToHost(T *dst, const T *src, size_t n) const { - auto it = buffer_map.find(src); + auto it = find_nearest(src); + auto offset = src- (static_cast<const T*>(it->first)); if (it != buffer_map.end()) { size_t rng, GRange, tileSize; parallel_for_setup(n/sizeof(T), tileSize, rng, GRange); - + // Assuming that the dst is the start of the destination pointer auto dest_buf = cl::sycl::buffer<T, 1, cl::sycl::map_allocator<T>>(dst, cl::sycl::range<1>(rng)); typedef decltype(dest_buf) SYCLDTOH; m_queue.submit([&](cl::sycl::handler &cgh) { auto src_acc= (static_cast<cl::sycl::buffer<T, 1>*>(it->second.get()))-> template get_access<cl::sycl::access::mode::read, cl::sycl::access::target::global_buffer>(cgh); auto dst_acc =dest_buf.template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer>(cgh); cgh.parallel_for<SYCLDTOH>( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), [=](cl::sycl::nd_item<1> itemID) { - auto globalid=itemID.get_global_linear_id(); - if (globalid< dst_acc.get_size()) { - dst_acc[globalid] = src_acc[globalid]; - } + auto globalid=itemID.get_global_linear_id(); + if (globalid< dst_acc.get_size()) { + dst_acc[globalid] = src_acc[globalid + offset]; + } }); }); m_queue.throw_asynchronous(); @@ -176,12 +213,12 @@ struct SyclDevice { m_queue.submit([&](cl::sycl::handler &cgh) { auto buf_acc =get_sycl_buffer(n, buff)-> template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer>(cgh); cgh.parallel_for<SyclDevice>( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), [=](cl::sycl::nd_item<1> itemID) { - auto globalid=itemID.get_global_linear_id(); - auto buf_ptr= reinterpret_cast<typename cl::sycl::global_ptr<unsigned char>::pointer_t>((&(*buf_acc.get_pointer()))); - if (globalid< buf_acc.get_size()) { - for(size_t i=0; i<sizeof(T); i++) - buf_ptr[globalid*sizeof(T) + i] = c; - } + auto globalid=itemID.get_global_linear_id(); + auto buf_ptr= reinterpret_cast<typename cl::sycl::global_ptr<unsigned char>::pointer_t>((&(*buf_acc.get_pointer()))); + if (globalid< buf_acc.get_size()) { + for(size_t i=0; i<sizeof(T); i++) + buf_ptr[globalid*sizeof(T) + i] = c; + } }); }); m_queue.throw_asynchronous(); |