From 7d08fa805a38f9ebb9e0e487c4e2d23d32a0fcde Mon Sep 17 00:00:00 2001 From: Mehdi Goli Date: Fri, 28 Jun 2019 10:08:23 +0100 Subject: [SYCL] This PR adds the minimum modifications to the Eigen unsupported module required to run it on devices supporting SYCL. * Abstracting the pointer type so that both SYCL memory and pointer can be captured. * Converting SYCL virtual pointer to SYCL device memory in Eigen evaluator class. * Binding SYCL placeholder accessor to command group handler by using bind method in Eigen evaluator node. * Adding SYCL macro for controlling loop unrolling. * Modifying the TensorDeviceSycl.h and SYCL executor method to adopt the above changes. --- .../Eigen/CXX11/src/Tensor/TensorDeviceSycl.h | 1155 +++++++++++++------- 1 file changed, 780 insertions(+), 375 deletions(-) (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h') diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h index e7beb2c82..93efe2f82 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h @@ -14,498 +14,879 @@ #if defined(EIGEN_USE_SYCL) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H) #define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H -template struct CheckAlignStatically { - static const bool Val= (((Align&(Align-1))==0) && (Align >= sizeof(void *))); -}; -template -struct Conditional_Allocate { +#include - EIGEN_ALWAYS_INLINE static void* conditional_allocate(std::size_t elements) { - return aligned_alloc(Align, elements); - } -}; -template -struct Conditional_Allocate { +namespace Eigen { - EIGEN_ALWAYS_INLINE static void* conditional_allocate(std::size_t elements){ - return malloc(elements); - } +namespace TensorSycl { +namespace internal { + +/// Cache all the device information needed +struct SyclDeviceInfo { + SyclDeviceInfo(cl::sycl::queue queue) + : local_mem_type( + queue.get_device() + .template get_info()), + max_work_item_sizes( + queue.get_device() + .template get_info< + cl::sycl::info::device::max_work_item_sizes>()), + max_mem_alloc_size( + queue.get_device() + .template get_info< + cl::sycl::info::device::max_mem_alloc_size>()), + max_compute_units(queue.get_device() + .template get_info< + cl::sycl::info::device::max_compute_units>()), + max_work_group_size( + queue.get_device() + .template get_info< + cl::sycl::info::device::max_work_group_size>()), + local_mem_size( + queue.get_device() + .template get_info()), + platform_name(queue.get_device() + .get_platform() + .template get_info()), + device_name(queue.get_device() + .template get_info()), + device_vendor( + queue.get_device() + .template get_info()) {} + + cl::sycl::info::local_mem_type local_mem_type; + cl::sycl::id<3> max_work_item_sizes; + unsigned long max_mem_alloc_size; + unsigned long max_compute_units; + unsigned long max_work_group_size; + size_t local_mem_size; + std::string platform_name; + std::string device_name; + std::string device_vendor; }; -template > -struct SyclAllocator { - typedef Scalar value_type; - typedef typename std::allocator_traits::pointer pointer; - typedef typename std::allocator_traits::size_type size_type; - SyclAllocator( ){}; - Scalar* allocate(std::size_t elements) { - return static_cast(Conditional_Allocate::Val, Align>::conditional_allocate(elements)); +} // end namespace internal +} // end namespace TensorSycl + +typedef TensorSycl::internal::buffer_data_type_t buffer_scalar_t; +// All devices (even AMD CPU with intel OpenCL 
runtime) that support OpenCL and +// can consume SPIR or SPIRV can use the Eigen SYCL backend and consequently +// TensorFlow via the Eigen SYCL Backend. +EIGEN_STRONG_INLINE auto get_sycl_supported_devices() + -> decltype(cl::sycl::device::get_devices()) { +#ifdef EIGEN_SYCL_USE_DEFAULT_SELECTOR + return {cl::sycl::device(cl::sycl::default_selector())}; +#else + std::vector supported_devices; + auto platform_list = cl::sycl::platform::get_platforms(); + for (const auto &platform : platform_list) { + auto device_list = platform.get_devices(); + auto platform_name = + platform.template get_info(); + std::transform(platform_name.begin(), platform_name.end(), + platform_name.begin(), ::tolower); + for (const auto &device : device_list) { + auto vendor = device.template get_info(); + std::transform(vendor.begin(), vendor.end(), vendor.begin(), ::tolower); + bool unsupported_condition = + (device.is_cpu() && platform_name.find("amd") != std::string::npos && + vendor.find("apu") == std::string::npos) || + (platform_name.find("experimental") != std::string::npos) || + device.is_host(); + if (!unsupported_condition) { + supported_devices.push_back(device); + } + } } - void deallocate(Scalar * p, std::size_t size) { EIGEN_UNUSED_VARIABLE(size); free(p); } -}; - -namespace Eigen { + return supported_devices; +#endif +} -#define ConvertToActualTypeSycl(Scalar, buf_acc) static_cast(static_cast(((buf_acc.get_pointer().get())))) -#define ConvertToActualSyclOffset(Scalar, offset) offset/sizeof(Scalar) +class QueueInterface { + public: + /// Creating device by using cl::sycl::selector or cl::sycl::device. + template + explicit QueueInterface( + const DeviceOrSelector &dev_or_sel, cl::sycl::async_handler handler, + unsigned num_threads = std::thread::hardware_concurrency()) + : m_queue(dev_or_sel, handler), +#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS + m_prog(m_queue.get_context(), get_sycl_supported_devices()), +#endif + m_thread_pool(num_threads), + m_device_info(m_queue) { +#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS + m_prog.build_with_kernel_type(); + auto f = [&](cl::sycl::handler &cgh) { + cgh.single_task(m_prog.get_kernel(), + [=]() {}) + }; + EIGEN_SYCL_TRY_CATCH(m_queue.submit(f)); +#endif + } + template + explicit QueueInterface( + const DeviceOrSelector &dev_or_sel, + unsigned num_threads = std::thread::hardware_concurrency()) + : QueueInterface(dev_or_sel, + [this](cl::sycl::exception_list l) { + this->exception_caught_ = this->sycl_async_handler(l); + }, + num_threads) {} - template class MemCopyFunctor { - public: - MemCopyFunctor(read_accessor src_acc, write_accessor dst_acc, size_t rng, size_t i, size_t offset) : m_src_acc(src_acc), m_dst_acc(dst_acc), m_rng(rng), m_i(i), m_offset(offset) {} +#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS + EIGEN_STRONG_INLINE cl::sycl::program &program() const { return m_prog; } +#endif - void operator()(cl::sycl::nd_item<1> itemID) { - auto src_ptr = ConvertToActualTypeSycl(Scalar, m_src_acc); - auto dst_ptr = ConvertToActualTypeSycl(Scalar, m_dst_acc); - auto globalid = itemID.get_global_linear_id(); - if (globalid < m_rng) { - dst_ptr[globalid + m_i] = src_ptr[globalid + m_offset]; - } - } + /// Attach an existing buffer to the pointer map, Eigen will not reuse it + EIGEN_STRONG_INLINE void *attach_buffer( + cl::sycl::buffer &buf) const { + std::lock_guard lock(pmapper_mutex_); + return static_cast(pMapper.add_pointer(buf)); + } - private: - read_accessor m_src_acc; - write_accessor m_dst_acc; - size_t m_rng; - size_t m_i; - size_t m_offset; - }; - -template - struct 
memsetkernelFunctor{ - AccType m_acc; - const ptrdiff_t buff_offset; - const size_t m_rng, m_c; - memsetkernelFunctor(AccType acc, const ptrdiff_t buff_offset_, const size_t rng, const size_t c):m_acc(acc), buff_offset(buff_offset_), m_rng(rng), m_c(c){} - void operator()(cl::sycl::nd_item<1> itemID) { - auto globalid=itemID.get_global_linear_id(); - if (globalid< m_rng) m_acc[globalid + buff_offset] = m_c; - } - - }; - -struct memsetCghFunctor{ - cl::sycl::buffer >& m_buf; - const ptrdiff_t& buff_offset; - const size_t& rng , GRange, tileSize; - const int &c; - memsetCghFunctor(cl::sycl::buffer >& buff, const ptrdiff_t& buff_offset_, const size_t& rng_, const size_t& GRange_, const size_t& tileSize_, const int& c_) - :m_buf(buff), buff_offset(buff_offset_), rng(rng_), GRange(GRange_), tileSize(tileSize_), c(c_){} - - void operator()(cl::sycl::handler &cgh) const { - auto buf_acc = m_buf.template get_access(cgh); - typedef decltype(buf_acc) AccType; - cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), memsetkernelFunctor(buf_acc, buff_offset, rng, c)); + /// Detach previously attached buffer + EIGEN_STRONG_INLINE void detach_buffer(void *p) const { + std::lock_guard lock(pmapper_mutex_); + TensorSycl::internal::SYCLfree(p, pMapper); } -}; -//get_devices returns all the available opencl devices. Either use device_selector or exclude devices that computecpp does not support (AMD OpenCL for CPU and intel GPU) -EIGEN_STRONG_INLINE auto get_sycl_supported_devices()->decltype(cl::sycl::device::get_devices()){ -std::vector supported_devices; -auto plafrom_list =cl::sycl::platform::get_platforms(); -for(const auto& platform : plafrom_list){ - auto device_list = platform.get_devices(); - auto platform_name =platform.template get_info(); - std::transform(platform_name.begin(), platform_name.end(), platform_name.begin(), ::tolower); - for(const auto& device : device_list){ - auto vendor = device.template get_info(); - std::transform(vendor.begin(), vendor.end(), vendor.begin(), ::tolower); - bool unsuported_condition = (device.is_cpu() && platform_name.find("amd")!=std::string::npos && vendor.find("apu") == std::string::npos) || - (device.is_gpu() && platform_name.find("intel")!=std::string::npos); - if(!unsuported_condition){ - std::cout << "Platform name "<< platform_name << std::endl; - supported_devices.push_back(device); + /// Allocating device pointer. This pointer is actually an 8 bytes host + /// pointer used as key to access the sycl device buffer. The reason is that + /// we cannot use device buffer as a pointer as a m_data in Eigen leafNode + /// expressions. So we create a key pointer to be used in Eigen expression + /// construction. When we convert the Eigen construction into the sycl + /// construction we use this pointer as a key in our buffer_map and we make + /// sure that we dedicate only one buffer only for this pointer. The device + /// pointer would be deleted by calling deallocate function. 
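  // Illustrative sketch (not part of this patch) of the key-pointer idea the
  // comment above describes: allocate() hands back a fake host address that
  // indexes a map of SYCL buffers, and any address inside an allocation can be
  // mapped back to its buffer plus a byte offset. The real implementation is
  // TensorSycl::internal::PointerMapper with SYCLmalloc/SYCLfree; the names
  // below are only for illustration and assume <map>, <cstdint>, <utility> and
  // CL/sycl.hpp are included.
  //
  //   class NaivePointerMapper {
  //     std::map<std::uintptr_t, cl::sycl::buffer<uint8_t, 1>> map_;
  //     std::uintptr_t next_key_ = 4096;  // never a dereferenceable address
  //    public:
  //     void *add_pointer(std::size_t bytes) {
  //       std::uintptr_t key = next_key_;
  //       map_.emplace(key, cl::sycl::buffer<uint8_t, 1>(cl::sycl::range<1>(bytes)));
  //       next_key_ += bytes;
  //       return reinterpret_cast<void *>(key);
  //     }
  //     // Recover the buffer and byte offset for any pointer inside an allocation.
  //     std::pair<cl::sycl::buffer<uint8_t, 1>, std::ptrdiff_t>
  //     get_buffer_and_offset(const void *p) const {
  //       std::uintptr_t key = reinterpret_cast<std::uintptr_t>(p);
  //       auto it = std::prev(map_.upper_bound(key));
  //       return {it->second, static_cast<std::ptrdiff_t>(key - it->first)};
  //     }
  //   };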
+ EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const { +#if EIGEN_MAX_ALIGN_BYTES > 0 + size_t align = num_bytes % EIGEN_MAX_ALIGN_BYTES; + if (align > 0) { + num_bytes += EIGEN_MAX_ALIGN_BYTES - align; } +#endif + std::lock_guard lock(pmapper_mutex_); + return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper); } -} -return supported_devices; -} -class QueueInterface { -public: - /// creating device by using cl::sycl::selector or cl::sycl::device both are the same and can be captured through dev_Selector typename - /// SyclStreamDevice is not owned. it is the caller's responsibility to destroy it. - template explicit QueueInterface(const dev_Selector& s): -#ifdef EIGEN_EXCEPTIONS - m_queue(cl::sycl::queue(s, [&](cl::sycl::exception_list l) { - for (const auto& e : l) { - try { - if (e) { - exception_caught_ = true; - std::rethrow_exception(e); + EIGEN_STRONG_INLINE void *allocate_temp(size_t num_bytes) const { +#if EIGEN_MAX_ALIGN_BYTES > 0 + size_t align = num_bytes % EIGEN_MAX_ALIGN_BYTES; + if (align > 0) { + num_bytes += EIGEN_MAX_ALIGN_BYTES - align; + } +#endif + std::lock_guard lock(pmapper_mutex_); +#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS + if (scratch_buffers.empty()) { + return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper); + ; + } else { + for (auto it = scratch_buffers.begin(); it != scratch_buffers.end();) { + auto buff = pMapper.get_buffer(*it); + if (buff.get_size() >= num_bytes) { + auto ptr = *it; + scratch_buffers.erase(it); + return ptr; + } else { + ++it; } - } catch (cl::sycl::exception e) { - std::cerr << e.what() << std::endl; } + return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper); } - })) #else -m_queue(cl::sycl::queue(s, [&](cl::sycl::exception_list l) { - for (const auto& e : l) { - if (e) { - exception_caught_ = true; - std::cerr << "Error detected Inside Sycl Device."<< std::endl; - - } + return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper); +#endif + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess< + cl::sycl::access::mode::read_write, data_t> + get(data_t *data) const { + return get_range_accessor(data); } -})) + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE data_t *get( + TensorSycl::internal::RangeAccess + data) const { + return static_cast(data.get_virtual_pointer()); + } + + EIGEN_STRONG_INLINE void deallocate_temp(void *p) const { + std::lock_guard lock(pmapper_mutex_); +#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS + scratch_buffers.insert(p); +#else + TensorSycl::internal::SYCLfree(p, pMapper); #endif - {} - - /// Allocating device pointer. This pointer is actually an 8 bytes host pointer used as key to access the sycl device buffer. - /// The reason is that we cannot use device buffer as a pointer as a m_data in Eigen leafNode expressions. So we create a key - /// pointer to be used in Eigen expression construction. When we convert the Eigen construction into the sycl construction we - /// use this pointer as a key in our buffer_map and we make sure that we dedicate only one buffer only for this pointer. - /// The device pointer would be deleted by calling deallocate function. 
- EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { - std::lock_guard lock(mutex_); - auto buf = cl::sycl::buffer >(cl::sycl::range<1>(num_bytes)); - auto ptr =buf.get_access().get_pointer(); - buf.set_final_data(nullptr); - buffer_map.insert(std::pair > >(static_cast(ptr),buf)); - return static_cast(ptr); + } + template + EIGEN_STRONG_INLINE void deallocate_temp( + const TensorSycl::internal::RangeAccess &p) const { + deallocate_temp(p.get_virtual_pointer()); } /// This is used to deallocate the device pointer. p is used as a key inside /// the map to find the device buffer and delete it. EIGEN_STRONG_INLINE void deallocate(void *p) const { - std::lock_guard lock(mutex_); - auto it = buffer_map.find(static_cast(p)); - if (it != buffer_map.end()) { - buffer_map.erase(it); - } + std::lock_guard lock(pmapper_mutex_); + TensorSycl::internal::SYCLfree(p, pMapper); } EIGEN_STRONG_INLINE void deallocate_all() const { - std::lock_guard lock(mutex_); - buffer_map.clear(); - } - /// The memcpyHostToDevice is used to copy the device only pointer to a host pointer. Using the device - /// pointer created as a key we find the sycl buffer and get the host accessor with write mode - /// on it. Then we use the memcpy to copy the data to the host accessor. The first time that - /// this buffer is accessed, the data will be copied to the device. - /// In this case we can separate the kernel actual execution from data transfer which is required for benchmark - /// Also, this is faster as it uses the map_allocator instead of memcpy - template EIGEN_STRONG_INLINE void memcpyHostToDevice(Index *dst, const Index *src, size_t n) const { - auto it =find_buffer(dst); - auto offset =static_cast(static_cast(dst))- it->first; - offset/=sizeof(Index); - size_t rng, GRange, tileSize; - parallel_for_setup(n/sizeof(Index), tileSize, rng, GRange); - auto src_buf = cl::sycl::buffer >(static_cast(static_cast(const_cast(src))), cl::sycl::range<1>(n)); - m_queue.submit([&](cl::sycl::handler &cgh) { - auto dst_acc= it->second.template get_access(cgh); - auto src_acc =src_buf.template get_access(cgh); - typedef decltype(src_acc) read_accessor; - typedef decltype(dst_acc) write_accessor; - cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor(src_acc, dst_acc, rng, offset, 0)); - }); - synchronize(); - } - /// The memcpyDeviceToHost is used to copy the data from host to device. Here, in order to avoid double copying the data. We create a sycl - /// buffer with map_allocator for the destination pointer with a discard_write accessor on it. The lifespan of the buffer is bound to the - /// lifespan of the memcpyDeviceToHost function. We create a kernel to copy the data, from the device- only source buffer to the destination - /// buffer with map_allocator on the gpu in parallel. At the end of the function call the destination buffer would be destroyed and the data - /// would be available on the dst pointer using fast copy technique (map_allocator). In this case we can make sure that we copy the data back - /// to the cpu only once per function call. 
- template EIGEN_STRONG_INLINE void memcpyDeviceToHost(void *dst, const Index *src, size_t n) const { - auto it =find_buffer(src); - auto offset =static_cast(static_cast(src))- it->first; - offset/=sizeof(Index); - size_t rng, GRange, tileSize; - parallel_for_setup(n/sizeof(Index), tileSize, rng, GRange); - auto dest_buf = cl::sycl::buffer >(static_cast(dst), cl::sycl::range<1>(n)); - m_queue.submit([&](cl::sycl::handler &cgh) { - auto src_acc= it->second.template get_access(cgh); - auto dst_acc =dest_buf.template get_access(cgh); - typedef decltype(src_acc) read_accessor; - typedef decltype(dst_acc) write_accessor; - cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor(src_acc, dst_acc, rng, 0, offset)); - }); - synchronize(); - } - - /// the memcpy function - template EIGEN_STRONG_INLINE void memcpy(void *dst, const Index *src, size_t n) const { - auto it1 = find_buffer(static_cast(src)); - auto it2 = find_buffer(dst); - auto offset= (static_cast(static_cast(src))) - it1->first; - auto i= (static_cast(dst)) - it2->first; - offset/=sizeof(Index); - i/=sizeof(Index); - size_t rng, GRange, tileSize; - parallel_for_setup(n/sizeof(Index), tileSize, rng, GRange); - m_queue.submit([&](cl::sycl::handler &cgh) { - auto src_acc =it1->second.template get_access(cgh); - auto dst_acc =it2->second.template get_access(cgh); - typedef decltype(src_acc) read_accessor; - typedef decltype(dst_acc) write_accessor; - cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor(src_acc, dst_acc, rng, i, offset)); - }); - synchronize(); + std::lock_guard lock(pmapper_mutex_); + TensorSycl::internal::SYCLfreeAll(pMapper); +#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS + scratch_buffers.clear(); +#endif } + /// The memcpyHostToDevice is used to copy the data from host to device + /// The destination pointer could be deleted before the copy happend which is + /// why a callback function is needed. By default if none is provided, the + /// function is blocking. + EIGEN_STRONG_INLINE void memcpyHostToDevice( + void *dst, const void *src, size_t n, + std::function callback) const { + static const auto write_mode = cl::sycl::access::mode::discard_write; + static const auto global_access = cl::sycl::access::target::global_buffer; + typedef cl::sycl::accessor + write_accessor; + if (n == 0) { + if (callback) callback(); + return; + } + n /= sizeof(buffer_scalar_t); + auto f = [&](cl::sycl::handler &cgh) { + write_accessor dst_acc = get_range_accessor(cgh, dst, n); + buffer_scalar_t const *ptr = static_cast(src); + auto non_deleter = [](buffer_scalar_t const *) {}; + std::shared_ptr s_ptr(ptr, non_deleter); + cgh.copy(s_ptr, dst_acc); + }; + cl::sycl::event e; + EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f)); + synchronize_and_callback(e, callback); + } + + /// The memcpyDeviceToHost is used to copy the data from device to host. + /// The source pointer could be deleted before the copy happend which is + /// why a callback function is needed. By default if none is provided, the + /// function is blocking. 
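  // The callback overloads behave as the comments above state: with a callback
  // the submit returns immediately and the callback runs on the internal thread
  // pool once the copy event completes; without one the call blocks. A rough
  // caller-side sketch, not taken from this patch (`dev` is assumed to be an
  // Eigen::SyclDevice built on this QueueInterface, sizes are in bytes):
  //
  //   std::vector<float> host_out(n);
  //   dev.memcpyHostToDevice(gpu_ptr, host_in.data(), n * sizeof(float));  // blocking
  //   dev.memcpyDeviceToHost(host_out.data(), gpu_ptr, n * sizeof(float),
  //                          [] { std::cout << "device -> host copy finished\n"; });
  //   dev.synchronize();  // or wait for the callback before touching host_out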
+ EIGEN_STRONG_INLINE void memcpyDeviceToHost( + void *dst, const void *src, size_t n, + std::function callback) const { + static const auto read_mode = cl::sycl::access::mode::read; + static const auto global_access = cl::sycl::access::target::global_buffer; + typedef cl::sycl::accessor + read_accessor; + if (n == 0) { + if (callback) callback(); + return; + } + n /= sizeof(buffer_scalar_t); + auto f = [&](cl::sycl::handler &cgh) { + read_accessor src_acc = get_range_accessor(cgh, src, n); + buffer_scalar_t *ptr = static_cast(dst); + auto non_deleter = [](buffer_scalar_t *) {}; + std::shared_ptr s_ptr(ptr, non_deleter); + cgh.copy(src_acc, s_ptr); + }; + cl::sycl::event e; + EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f)); + synchronize_and_callback(e, callback); + } + + /// The memcpy function. + /// No callback is required here as both arguments are on the device + /// and SYCL can handle the dependency. + EIGEN_STRONG_INLINE void memcpy(void *dst, const void *src, size_t n) const { + static const auto read_mode = cl::sycl::access::mode::read; + static const auto write_mode = cl::sycl::access::mode::discard_write; + if (n == 0) { + return; + } + n /= sizeof(buffer_scalar_t); + auto f = [&](cl::sycl::handler &cgh) { + auto src_acc = get_range_accessor(cgh, src, n); + auto dst_acc = get_range_accessor(cgh, dst, n); + cgh.copy(src_acc, dst_acc); + }; + cl::sycl::event e; + EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f)); + async_synchronize(e); + } + + /// the memset function. + /// No callback is required here as both arguments are on the device + /// and SYCL can handle the dependency. EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const { - size_t rng, GRange, tileSize; - parallel_for_setup(n, tileSize, rng, GRange); - auto it1 = find_buffer(static_cast(data)); - ptrdiff_t buff_offset= (static_cast(data)) - it1->first; - m_queue.submit(memsetCghFunctor(it1->second, buff_offset, rng, GRange, tileSize, c )); - synchronize(); + static const auto write_mode = cl::sycl::access::mode::discard_write; + if (n == 0) { + return; + } + n /= sizeof(buffer_scalar_t); + auto f = [&](cl::sycl::handler &cgh) { + auto dst_acc = get_range_accessor(cgh, data, n); + // The cast to uint8_t is here to match the behaviour of the standard + // memset. The cast to buffer_scalar_t is needed to match the type of the + // accessor (in case buffer_scalar_t is not uint8_t) + cgh.fill(dst_acc, static_cast(static_cast(c))); + }; + cl::sycl::event e; + EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f)); + async_synchronize(e); + } + + /// Get a range accessor to the virtual pointer's device memory. This range + /// accessor will allow access to the memory from the pointer to the end of + /// the buffer. + /// + /// NOTE: Inside a kernel the range accessor will always be indexed from the + /// start of the buffer, so the offset in the accessor is only used by + /// methods like handler::copy and will not be available inside a kernel. 
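  // Worked example for the pointer-based overload below (numbers illustrative):
  // if a virtual pointer lies 64 bytes into an allocation backed by a 1024-byte
  // buffer and T is float, then typed_offset = 64 / sizeof(float) = 16, the
  // reinterpreted buffer holds 1024 / sizeof(float) = 256 elements, and the
  // returned range covers the remaining 256 - 16 = 240 floats. As noted above,
  // indexing inside a kernel is still relative to the start of the buffer.
  //
  //   auto acc = queue_interface.get_range_accessor<cl::sycl::access::mode::read, float>(vptr);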
+ template + EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess + get_range_accessor(const void *ptr) const { + static const auto global_access = cl::sycl::access::target::global_buffer; + static const auto is_place_holder = cl::sycl::access::placeholder::true_t; + typedef TensorSycl::internal::RangeAccess ret_type; + typedef const TensorSycl::internal::buffer_data_type_t *internal_ptr_t; + + std::lock_guard lock(pmapper_mutex_); + + auto original_buffer = pMapper.get_buffer(ptr); + const ptrdiff_t offset = pMapper.get_offset(ptr); + const ptrdiff_t typed_offset = offset / sizeof(T); + eigen_assert(typed_offset >= 0); + const auto typed_size = original_buffer.get_size() / sizeof(T); + auto buffer = original_buffer.template reinterpret< + typename Eigen::internal::remove_const::type>( + cl::sycl::range<1>(typed_size)); + const ptrdiff_t size = buffer.get_count() - typed_offset; + eigen_assert(size >= 0); + typedef cl::sycl::accessor::type, + 1, AcMd, global_access, is_place_holder> + placeholder_accessor_t; + const auto start_ptr = static_cast(ptr) - offset; + return ret_type(placeholder_accessor_t(buffer, cl::sycl::range<1>(size), + cl::sycl::id<1>(typed_offset)), + static_cast(typed_offset), + reinterpret_cast(start_ptr)); + } + + /// Get a range accessor to the virtual pointer's device memory with a + /// specified size. + template + EIGEN_STRONG_INLINE cl::sycl::accessor< + buffer_scalar_t, 1, AcMd, cl::sycl::access::target::global_buffer> + get_range_accessor(cl::sycl::handler &cgh, const void *ptr, + const Index n_bytes) const { + static const auto global_access = cl::sycl::access::target::global_buffer; + eigen_assert(n_bytes >= 0); + std::lock_guard lock(pmapper_mutex_); + auto buffer = pMapper.get_buffer(ptr); + const ptrdiff_t offset = pMapper.get_offset(ptr); + eigen_assert(offset >= 0); + eigen_assert(offset + n_bytes <= buffer.get_size()); + return buffer.template get_access( + cgh, cl::sycl::range<1>(n_bytes), cl::sycl::id<1>(offset)); } /// Creation of sycl accessor for a buffer. This function first tries to find - /// the buffer in the buffer_map. If found it gets the accessor from it, if not, - /// the function then adds an entry by creating a sycl buffer for that particular pointer. - template EIGEN_STRONG_INLINE cl::sycl::accessor - get_sycl_accessor(cl::sycl::handler &cgh, const void* ptr) const { - return (find_buffer(ptr)->second.template get_access(cgh)); + /// the buffer in the buffer_map. If found it gets the accessor from it, if + /// not, the function then adds an entry by creating a sycl buffer for that + /// particular pointer. 
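  // A rough sketch of how the accessor below is typically requested inside a
  // command-group scope so that SYCL can track the buffer dependency. Not part
  // of this patch; `dev` is assumed to be an Eigen::SyclDevice and `vptr` a
  // pointer previously returned by dev.allocate():
  //
  //   dev.sycl_queue().submit([&](cl::sycl::handler &cgh) {
  //     auto acc = dev.get_sycl_accessor<cl::sycl::access::mode::read_write>(cgh, vptr);
  //     cgh.single_task<class zero_first_byte>([=]() { acc[0] = 0; });
  //   });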
+ template + EIGEN_STRONG_INLINE cl::sycl::accessor< + buffer_scalar_t, 1, AcMd, cl::sycl::access::target::global_buffer> + get_sycl_accessor(cl::sycl::handler &cgh, const void *ptr) const { + std::lock_guard lock(pmapper_mutex_); + return pMapper.get_buffer(ptr) + .template get_access( + cgh); } - /// Accessing the created sycl device buffer for the device pointer - EIGEN_STRONG_INLINE cl::sycl::buffer >& get_sycl_buffer(const void * ptr) const { - return find_buffer(ptr)->second; + EIGEN_STRONG_INLINE cl::sycl::buffer get_sycl_buffer( + const void *ptr) const { + std::lock_guard lock(pmapper_mutex_); + return pMapper.get_buffer(ptr); } EIGEN_STRONG_INLINE ptrdiff_t get_offset(const void *ptr) const { - return (static_cast(ptr))-(find_buffer(ptr)->first); + std::lock_guard lock(pmapper_mutex_); + return pMapper.get_offset(ptr); } EIGEN_STRONG_INLINE void synchronize() const { - m_queue.wait_and_throw(); //pass +#ifdef EIGEN_EXCEPTIONS + m_queue.wait_and_throw(); +#else + m_queue.wait(); +#endif } - EIGEN_STRONG_INLINE void asynchronousExec() const { - ///FIXEDME:: currently there is a race condition regarding the asynch scheduler. - //sycl_queue().throw_asynchronous();// FIXME::does not pass. Temporarily disabled - m_queue.wait_and_throw(); //pass + EIGEN_STRONG_INLINE void async_synchronize(cl::sycl::event e) const { + set_latest_event(e); +#ifndef EIGEN_SYCL_ASYNC_EXECUTION + synchronize(); +#endif } - template - EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize, Index &rng, Index &GRange) const { - tileSize =static_cast(m_queue.get_device(). template get_info()); - auto s= m_queue.get_device().template get_info(); - std::transform(s.begin(), s.end(), s.begin(), ::tolower); - if(m_queue.get_device().is_cpu()){ // intel doesn't allow to use max workgroup size - tileSize=std::min(static_cast(256), static_cast(tileSize)); - } + template + EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize, + Index &rng, Index &GRange) const { + tileSize = static_cast(getNearestPowerOfTwoWorkGroupSize()); + tileSize = std::min(static_cast(EIGEN_SYCL_LOCAL_THREAD_DIM0 * + EIGEN_SYCL_LOCAL_THREAD_DIM1), + static_cast(tileSize)); rng = n; - if (rng==0) rng=static_cast(1); - GRange=rng; - if (tileSize>GRange) tileSize=GRange; - else if(GRange>tileSize){ - Index xMode = static_cast(GRange % tileSize); + if (rng == 0) rng = static_cast(1); + GRange = rng; + if (tileSize > GRange) + tileSize = GRange; + else if (GRange > tileSize) { + Index xMode = static_cast(GRange % tileSize); if (xMode != 0) GRange += static_cast(tileSize - xMode); } } - /// This is used to prepare the number of threads and also the number of threads per block for sycl kernels - template - EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1, Index &tileSize0, Index &tileSize1, Index &rng0, Index &rng1, Index &GRange0, Index &GRange1) const { - Index max_workgroup_Size = static_cast(maxSyclThreadsPerBlock()); - if(m_queue.get_device().is_cpu()){ // intel doesn't allow to use max workgroup size - max_workgroup_Size=std::min(static_cast(256), static_cast(max_workgroup_Size)); - } + /// This is used to prepare the number of threads and also the number of + /// threads per block for sycl kernels + template + EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1, + Index &tileSize0, + Index &tileSize1, Index &rng0, + Index &rng1, Index &GRange0, + Index &GRange1) const { + Index max_workgroup_Size = + static_cast(getNearestPowerOfTwoWorkGroupSize()); + max_workgroup_Size = + 
std::min(static_cast(EIGEN_SYCL_LOCAL_THREAD_DIM0 * + EIGEN_SYCL_LOCAL_THREAD_DIM1), + static_cast(max_workgroup_Size)); Index pow_of_2 = static_cast(std::log2(max_workgroup_Size)); - tileSize1 =static_cast(std::pow(2, static_cast(pow_of_2/2))); - rng1=dim1; - if (rng1==0 ) rng1=static_cast(1); - GRange1=rng1; - if (tileSize1>GRange1) tileSize1=GRange1; - else if(GRange1>tileSize1){ - Index xMode = static_cast(GRange1 % tileSize1); + tileSize1 = + static_cast(std::pow(2, static_cast(pow_of_2 / 2))); + rng1 = dim1; + if (rng1 == 0) rng1 = static_cast(1); + GRange1 = rng1; + if (tileSize1 > GRange1) + tileSize1 = GRange1; + else if (GRange1 > tileSize1) { + Index xMode = static_cast(GRange1 % tileSize1); if (xMode != 0) GRange1 += static_cast(tileSize1 - xMode); } - tileSize0 = static_cast(max_workgroup_Size/tileSize1); + tileSize0 = static_cast(max_workgroup_Size / tileSize1); rng0 = dim0; - if (rng0==0 ) rng0=static_cast(1); - GRange0=rng0; - if (tileSize0>GRange0) tileSize0=GRange0; - else if(GRange0>tileSize0){ - Index xMode = static_cast(GRange0 % tileSize0); + if (rng0 == 0) rng0 = static_cast(1); + GRange0 = rng0; + if (tileSize0 > GRange0) + tileSize0 = GRange0; + else if (GRange0 > tileSize0) { + Index xMode = static_cast(GRange0 % tileSize0); if (xMode != 0) GRange0 += static_cast(tileSize0 - xMode); } } - /// This is used to prepare the number of threads and also the number of threads per block for sycl kernels - template - EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1,Index dim2, Index &tileSize0, Index &tileSize1, Index &tileSize2, Index &rng0, Index &rng1, Index &rng2, Index &GRange0, Index &GRange1, Index &GRange2) const { - Index max_workgroup_Size = static_cast(maxSyclThreadsPerBlock()); - if(m_queue.get_device().is_cpu()){ // intel doesn't allow to use max workgroup size - max_workgroup_Size=std::min(static_cast(256), static_cast(max_workgroup_Size)); - } + /// This is used to prepare the number of threads and also the number of + /// threads per block for sycl kernels + template + EIGEN_STRONG_INLINE void parallel_for_setup( + Index dim0, Index dim1, Index dim2, Index &tileSize0, Index &tileSize1, + Index &tileSize2, Index &rng0, Index &rng1, Index &rng2, Index &GRange0, + Index &GRange1, Index &GRange2) const { + Index max_workgroup_Size = + static_cast(getNearestPowerOfTwoWorkGroupSize()); + max_workgroup_Size = + std::min(static_cast(EIGEN_SYCL_LOCAL_THREAD_DIM0 * + EIGEN_SYCL_LOCAL_THREAD_DIM1), + static_cast(max_workgroup_Size)); Index pow_of_2 = static_cast(std::log2(max_workgroup_Size)); - tileSize2 =static_cast(std::pow(2, static_cast(pow_of_2/3))); - rng2=dim2; - if (rng2==0 ) rng1=static_cast(1); - GRange2=rng2; - if (tileSize2>GRange2) tileSize2=GRange2; - else if(GRange2>tileSize2){ - Index xMode = static_cast(GRange2 % tileSize2); + tileSize2 = + static_cast(std::pow(2, static_cast(pow_of_2 / 3))); + rng2 = dim2; + if (rng2 == 0) rng1 = static_cast(1); + GRange2 = rng2; + if (tileSize2 > GRange2) + tileSize2 = GRange2; + else if (GRange2 > tileSize2) { + Index xMode = static_cast(GRange2 % tileSize2); if (xMode != 0) GRange2 += static_cast(tileSize2 - xMode); } - pow_of_2 = static_cast(std::log2(static_cast(max_workgroup_Size/tileSize2))); - tileSize1 =static_cast(std::pow(2, static_cast(pow_of_2/2))); - rng1=dim1; - if (rng1==0 ) rng1=static_cast(1); - GRange1=rng1; - if (tileSize1>GRange1) tileSize1=GRange1; - else if(GRange1>tileSize1){ - Index xMode = static_cast(GRange1 % tileSize1); + pow_of_2 = static_cast( + 
std::log2(static_cast(max_workgroup_Size / tileSize2))); + tileSize1 = + static_cast(std::pow(2, static_cast(pow_of_2 / 2))); + rng1 = dim1; + if (rng1 == 0) rng1 = static_cast(1); + GRange1 = rng1; + if (tileSize1 > GRange1) + tileSize1 = GRange1; + else if (GRange1 > tileSize1) { + Index xMode = static_cast(GRange1 % tileSize1); if (xMode != 0) GRange1 += static_cast(tileSize1 - xMode); } - tileSize0 = static_cast(max_workgroup_Size/(tileSize1*tileSize2)); + tileSize0 = + static_cast(max_workgroup_Size / (tileSize1 * tileSize2)); rng0 = dim0; - if (rng0==0 ) rng0=static_cast(1); - GRange0=rng0; - if (tileSize0>GRange0) tileSize0=GRange0; - else if(GRange0>tileSize0){ - Index xMode = static_cast(GRange0 % tileSize0); + if (rng0 == 0) rng0 = static_cast(1); + GRange0 = rng0; + if (tileSize0 > GRange0) + tileSize0 = GRange0; + else if (GRange0 > tileSize0) { + Index xMode = static_cast(GRange0 % tileSize0); if (xMode != 0) GRange0 += static_cast(tileSize0 - xMode); } } + EIGEN_STRONG_INLINE bool has_local_memory() const { +#if !defined(EIGEN_SYCL_LOCA_MEM) && defined(EIGEN_SYCL_NO_LOCAL_MEM) + return false; +#elif defined(EIGEN_SYCL_LOCAL_MEM) && !defined(EIGEN_SYCL_NO_LOCAL_MEM) + return true; +#else + return m_device_info.local_mem_type == + cl::sycl::info::local_mem_type::local; +#endif + } + + EIGEN_STRONG_INLINE unsigned long max_buffer_size() const { + return m_device_info.max_mem_alloc_size; + } + EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const { - return m_queue.get_device(). template get_info(); + return m_device_info.max_compute_units; } EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const { - return m_queue.get_device(). template get_info(); + return m_device_info.max_work_group_size; + } + + EIGEN_STRONG_INLINE cl::sycl::id<3> maxWorkItemSizes() const { + return m_device_info.max_work_item_sizes; } /// No need for sycl it should act the same as CPU version EIGEN_STRONG_INLINE int majorDeviceVersion() const { return 1; } EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const { - // OpenCL doesn't have such concept + // OpenCL doesnot have such concept return 2; } EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const { - return m_queue.get_device(). template get_info(); + return m_device_info.local_mem_size; } - EIGEN_STRONG_INLINE cl::sycl::queue& sycl_queue() const { return m_queue;} + // This function returns the nearest power of 2 Work-group size which is <= + // maximum device workgroup size. + EIGEN_STRONG_INLINE size_t getNearestPowerOfTwoWorkGroupSize() const { + return getPowerOfTwo(m_device_info.max_work_group_size, false); + } + + EIGEN_STRONG_INLINE std::string getPlatformName() const { + return m_device_info.platform_name; + } + + EIGEN_STRONG_INLINE std::string getDeviceName() const { + return m_device_info.device_name; + } + + EIGEN_STRONG_INLINE std::string getDeviceVendor() const { + return m_device_info.device_vendor; + } + + // This function returns the nearest power of 2 + // if roundup is true returns result>=wgsize + // else it return result <= wgsize + EIGEN_STRONG_INLINE size_t getPowerOfTwo(size_t wGSize, bool roundUp) const { + if (roundUp) --wGSize; + wGSize |= (wGSize >> 1); + wGSize |= (wGSize >> 2); + wGSize |= (wGSize >> 4); + wGSize |= (wGSize >> 8); + wGSize |= (wGSize >> 16); +#if EIGEN_ARCH_x86_64 || EIGEN_ARCH_ARM64 || EIGEN_OS_WIN64 + wGSize |= (wGSize >> 32); +#endif + return ((!roundUp) ? 
(wGSize - (wGSize >> 1)) : ++wGSize); + } + + EIGEN_STRONG_INLINE cl::sycl::queue &sycl_queue() const { return m_queue; } // This function checks if the runtime recorded an error for the // underlying stream device. EIGEN_STRONG_INLINE bool ok() const { if (!exception_caught_) { - m_queue.wait_and_throw(); + synchronize(); } return !exception_caught_; } + EIGEN_STRONG_INLINE cl::sycl::event get_latest_event() const { +#ifdef EIGEN_SYCL_STORE_LATEST_EVENT + std::lock_guard lock(event_mutex_); + return latest_events_[std::this_thread::get_id()]; +#else + eigen_assert(false); + return cl::sycl::event(); +#endif + } + // destructor - ~QueueInterface() { buffer_map.clear(); } + ~QueueInterface() { + pMapper.clear(); +#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS + scratch_buffers.clear(); +#endif + } + + protected: + EIGEN_STRONG_INLINE void set_latest_event(cl::sycl::event e) const { +#ifdef EIGEN_SYCL_STORE_LATEST_EVENT + std::lock_guard lock(event_mutex_); + latest_events_[std::this_thread::get_id()] = e; +#else + EIGEN_UNUSED_VARIABLE(e); +#endif + } + + void synchronize_and_callback(cl::sycl::event e, + const std::function &callback) const { + set_latest_event(e); + if (callback) { + auto callback_ = [=]() { +#ifdef EIGEN_EXCEPTIONS + cl::sycl::event(e).wait_and_throw(); +#else + cl::sycl::event(e).wait(); +#endif + callback(); + }; + m_thread_pool.Schedule(std::move(callback_)); + } else { +#ifdef EIGEN_EXCEPTIONS + m_queue.wait_and_throw(); +#else + m_queue.wait(); +#endif + } + } + + bool sycl_async_handler(cl::sycl::exception_list l) const { + bool exception_caught = false; + for (const auto &e : l) { + if (e) { + exception_caught = true; +#ifdef EIGEN_EXCEPTIONS + try { + std::rethrow_exception(e); + } catch (const cl::sycl::exception &e) { + std::cerr << e.what() << std::endl; + } +#else + std::cerr << "Error detected inside Sycl device." << std::endl; + abort(); +#endif + } + } + return exception_caught; + } -private: /// class members: bool exception_caught_ = false; - mutable std::mutex mutex_; + mutable std::mutex pmapper_mutex_; + +#ifdef EIGEN_SYCL_STORE_LATEST_EVENT + mutable std::mutex event_mutex_; + mutable std::unordered_map latest_events_; +#endif /// std::map is the container used to make sure that we create only one buffer - /// per pointer. The lifespan of the buffer now depends on the lifespan of SyclDevice. - /// If a non-read-only pointer is needed to be accessed on the host we should manually deallocate it. - mutable std::map > > buffer_map; + /// per pointer. The lifespan of the buffer now depends on the lifespan of + /// SyclDevice. If a non-read-only pointer is needed to be accessed on the + /// host we should manually deallocate it. + mutable TensorSycl::internal::PointerMapper pMapper; +#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS + mutable std::unordered_set scratch_buffers; +#endif /// sycl queue mutable cl::sycl::queue m_queue; +#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS + mutable cl::sycl::program m_prog; +#endif - EIGEN_STRONG_INLINE std::map > >::iterator find_buffer(const void* ptr) const { - std::lock_guard lock(mutex_); - auto it1 = buffer_map.find(static_cast(ptr)); - if (it1 != buffer_map.end()){ - return it1; - } - else{ - for(std::map > >::iterator it=buffer_map.begin(); it!=buffer_map.end(); ++it){ - auto size = it->second.get_size(); - if((it->first < (static_cast(ptr))) && ((static_cast(ptr)) < (it->first + size)) ) return it; - } - } - std::cerr << "No sycl buffer found. 
Make sure that you have allocated memory for your buffer by calling malloc-ed function."<< std::endl; - abort(); + /// The thread pool is used to wait on events and call callbacks + /// asynchronously + mutable Eigen::ThreadPool m_thread_pool; + + const TensorSycl::internal::SyclDeviceInfo m_device_info; +}; + +struct SyclDeviceBase { + /// QueueInterface is not owned. it is the caller's responsibility to destroy + /// it + const QueueInterface *m_queue_stream; + explicit SyclDeviceBase(const QueueInterface *queue_stream) + : m_queue_stream(queue_stream) {} + EIGEN_STRONG_INLINE const QueueInterface *queue_stream() const { + return m_queue_stream; } }; -// Here is a sycl deviuce struct which accept the sycl queue interface +// Here is a sycl device struct which accept the sycl queue interface // as an input -struct SyclDevice { - // class member. - QueueInterface* m_queue_stream; - /// QueueInterface is not owned. it is the caller's responsibility to destroy it. - explicit SyclDevice(QueueInterface* queue_stream) : m_queue_stream(queue_stream){} +struct SyclDevice : public SyclDeviceBase { + explicit SyclDevice(const QueueInterface *queue_stream) + : SyclDeviceBase(queue_stream) {} + + // this is the accessor used to construct the evaluator + template + EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess + get_range_accessor(const void *ptr) const { + return queue_stream()->template get_range_accessor(ptr); + } // get sycl accessor - template EIGEN_STRONG_INLINE cl::sycl::accessor - get_sycl_accessor(cl::sycl::handler &cgh, const void* ptr) const { - return m_queue_stream->template get_sycl_accessor(cgh, ptr); + template + EIGEN_STRONG_INLINE cl::sycl::accessor< + buffer_scalar_t, 1, AcMd, cl::sycl::access::target::global_buffer> + get_sycl_accessor(cl::sycl::handler &cgh, const void *ptr) const { + return queue_stream()->template get_sycl_accessor(cgh, ptr); } /// Accessing the created sycl device buffer for the device pointer - EIGEN_STRONG_INLINE cl::sycl::buffer >& get_sycl_buffer(const void * ptr) const { - return m_queue_stream->get_sycl_buffer(ptr); + EIGEN_STRONG_INLINE cl::sycl::buffer get_sycl_buffer( + const void *ptr) const { + return queue_stream()->get_sycl_buffer(ptr); + } + + /// This is used to prepare the number of threads and also the number of + /// threads per block for sycl kernels + template + EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize, + Index &rng, Index &GRange) const { + queue_stream()->parallel_for_setup(n, tileSize, rng, GRange); + } + + /// This is used to prepare the number of threads and also the number of + /// threads per block for sycl kernels + template + EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1, + Index &tileSize0, + Index &tileSize1, Index &rng0, + Index &rng1, Index &GRange0, + Index &GRange1) const { + queue_stream()->parallel_for_setup(dim0, dim1, tileSize0, tileSize1, rng0, + rng1, GRange0, GRange1); + } + + /// This is used to prepare the number of threads and also the number of + /// threads per block for sycl kernels + template + EIGEN_STRONG_INLINE void parallel_for_setup( + Index dim0, Index dim1, Index dim2, Index &tileSize0, Index &tileSize1, + Index &tileSize2, Index &rng0, Index &rng1, Index &rng2, Index &GRange0, + Index &GRange1, Index &GRange2) const { + queue_stream()->parallel_for_setup(dim0, dim1, dim2, tileSize0, tileSize1, + tileSize2, rng0, rng1, rng2, GRange0, + GRange1, GRange2); } - /// This is used to prepare the number of threads and also the number of threads per block 
for sycl kernels - template - EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize, Index &rng, Index &GRange) const { - m_queue_stream->parallel_for_setup(n, tileSize, rng, GRange); + /// allocate device memory + EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const { + return queue_stream()->allocate(num_bytes); } - /// This is used to prepare the number of threads and also the number of threads per block for sycl kernels - template - EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1, Index &tileSize0, Index &tileSize1, Index &rng0, Index &rng1, Index &GRange0, Index &GRange1) const { - m_queue_stream->parallel_for_setup(dim0, dim1, tileSize0, tileSize1, rng0, rng1, GRange0, GRange1); + EIGEN_STRONG_INLINE void *allocate_temp(size_t num_bytes) const { + return queue_stream()->allocate_temp(num_bytes); } - /// This is used to prepare the number of threads and also the number of threads per block for sycl kernels - template - EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1,Index dim2, Index &tileSize0, Index &tileSize1, Index &tileSize2, Index &rng0, Index &rng1, Index &rng2, Index &GRange0, Index &GRange1, Index &GRange2) const { - m_queue_stream->parallel_for_setup(dim0, dim1, dim2, tileSize0, tileSize1, tileSize2, rng0, rng1, rng2, GRange0, GRange1, GRange2); - - } - /// allocate device memory - EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const { - return m_queue_stream->allocate(num_bytes); - } /// deallocate device memory EIGEN_STRONG_INLINE void deallocate(void *p) const { - m_queue_stream->deallocate(p); - } + queue_stream()->deallocate(p); + } - // some runtime conditions that can be applied here - EIGEN_STRONG_INLINE bool isDeviceSuitable() const { return true; } + EIGEN_STRONG_INLINE void deallocate_temp(void *buffer) const { + queue_stream()->deallocate_temp(buffer); + } + template + EIGEN_STRONG_INLINE void deallocate_temp( + const TensorSycl::internal::RangeAccess &buffer) const { + queue_stream()->deallocate_temp(buffer); + } + EIGEN_STRONG_INLINE void deallocate_all() const { + queue_stream()->deallocate_all(); + } - /// the memcpy function - template EIGEN_STRONG_INLINE void memcpy(void *dst, const Index *src, size_t n) const { - m_queue_stream->memcpy(dst,src,n); + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess< + cl::sycl::access::mode::read_write, data_t> + get(data_t *data) const { + return queue_stream()->get(data); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE data_t *get( + TensorSycl::internal::RangeAccess + data) const { + return queue_stream()->get(data); } + /// attach existing buffer + EIGEN_STRONG_INLINE void *attach_buffer( + cl::sycl::buffer &buf) const { + return queue_stream()->attach_buffer(buf); + } + /// detach buffer + EIGEN_STRONG_INLINE void detach_buffer(void *p) const { + queue_stream()->detach_buffer(p); + } EIGEN_STRONG_INLINE ptrdiff_t get_offset(const void *ptr) const { - return m_queue_stream->get_offset(ptr); + return queue_stream()->get_offset(ptr); + } + + // some runtime conditions that can be applied here + EIGEN_STRONG_INLINE bool isDeviceSuitable() const { return true; } + /// memcpyHostToDevice + template + EIGEN_STRONG_INLINE void memcpyHostToDevice( + Index *dst, const Index *src, size_t n, + std::function callback = {}) const { + queue_stream()->memcpyHostToDevice(dst, src, n, callback); } -// memcpyHostToDevice - template EIGEN_STRONG_INLINE void memcpyHostToDevice(Index *dst, const Index *src, size_t n) const { - 
m_queue_stream->memcpyHostToDevice(dst,src,n); + /// memcpyDeviceToHost + template + EIGEN_STRONG_INLINE void memcpyDeviceToHost( + void *dst, const Index *src, size_t n, + std::function callback = {}) const { + queue_stream()->memcpyDeviceToHost(dst, src, n, callback); } -/// here is the memcpyDeviceToHost - template EIGEN_STRONG_INLINE void memcpyDeviceToHost(void *dst, const Index *src, size_t n) const { - m_queue_stream->memcpyDeviceToHost(dst,src,n); + /// the memcpy function + template + EIGEN_STRONG_INLINE void memcpy(void *dst, const Index *src, size_t n) const { + queue_stream()->memcpy(dst, src, n); } - /// Here is the implementation of memset function on sycl. + /// the memset function EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const { - m_queue_stream->memset(data,c,n); + queue_stream()->memset(data, c, n); } /// returning the sycl queue - EIGEN_STRONG_INLINE cl::sycl::queue& sycl_queue() const { return m_queue_stream->sycl_queue();} - - EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { - // FIXME - return 48*1024; + EIGEN_STRONG_INLINE cl::sycl::queue &sycl_queue() const { + return queue_stream()->sycl_queue(); } +#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS + EIGEN_STRONG_INLINE cl::sycl::program &program() const { + return queue_stream()->program(); + } +#endif + + EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { return 48 * 1024; } EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const { // We won't try to take advantage of the l2 cache for the time being, and @@ -513,40 +894,64 @@ struct SyclDevice { return firstLevelCacheSize(); } EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const { - return m_queue_stream->getNumSyclMultiProcessors(); + return queue_stream()->getNumSyclMultiProcessors(); } EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const { - return m_queue_stream->maxSyclThreadsPerBlock(); + return queue_stream()->maxSyclThreadsPerBlock(); + } + EIGEN_STRONG_INLINE cl::sycl::id<3> maxWorkItemSizes() const { + return queue_stream()->maxWorkItemSizes(); } EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const { - // OpenCL doesn't have such concept - return m_queue_stream->maxSyclThreadsPerMultiProcessor(); - // return stream_->deviceProperties().maxThreadsPerMultiProcessor; + // OpenCL doesnot have such concept + return queue_stream()->maxSyclThreadsPerMultiProcessor(); } EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const { - return m_queue_stream->sharedMemPerBlock(); + return queue_stream()->sharedMemPerBlock(); + } + EIGEN_STRONG_INLINE size_t getNearestPowerOfTwoWorkGroupSize() const { + return queue_stream()->getNearestPowerOfTwoWorkGroupSize(); + } + + EIGEN_STRONG_INLINE size_t getPowerOfTwo(size_t val, bool roundUp) const { + return queue_stream()->getPowerOfTwo(val, roundUp); } /// No need for sycl it should act the same as CPU version - EIGEN_STRONG_INLINE int majorDeviceVersion() const { return m_queue_stream->majorDeviceVersion(); } + EIGEN_STRONG_INLINE int majorDeviceVersion() const { + return queue_stream()->majorDeviceVersion(); + } EIGEN_STRONG_INLINE void synchronize() const { - m_queue_stream->synchronize(); //pass + queue_stream()->synchronize(); } - - EIGEN_STRONG_INLINE void asynchronousExec() const { - m_queue_stream->asynchronousExec(); + EIGEN_STRONG_INLINE void async_synchronize( + cl::sycl::event e = cl::sycl::event()) const { + queue_stream()->async_synchronize(e); + } + EIGEN_STRONG_INLINE cl::sycl::event get_latest_event() const { + return 
queue_stream()->get_latest_event(); } + // This function checks if the runtime recorded an error for the // underlying stream device. - EIGEN_STRONG_INLINE bool ok() const { - return m_queue_stream->ok(); + EIGEN_STRONG_INLINE bool ok() const { return queue_stream()->ok(); } + + EIGEN_STRONG_INLINE bool has_local_memory() const { + return queue_stream()->has_local_memory(); + } + EIGEN_STRONG_INLINE long max_buffer_size() const { + return queue_stream()->max_buffer_size(); + } + EIGEN_STRONG_INLINE std::string getPlatformName() const { + return queue_stream()->getPlatformName(); + } + EIGEN_STRONG_INLINE std::string getDeviceName() const { + return queue_stream()->getDeviceName(); + } + EIGEN_STRONG_INLINE std::string getDeviceVendor() const { + return queue_stream()->getDeviceVendor(); } }; -// This is used as a distingushable device inside the kernel as the sycl device class is not Standard layout. -// This is internal and must not be used by user. This dummy device allow us to specialise the tensor evaluator -// inside the kernel. So we can have two types of eval for host and device. This is required for TensorArgMax operation -struct SyclKernelDevice:DefaultDevice{}; - } // end namespace Eigen #endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H -- cgit v1.2.3
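Usage sketch (illustrative, not part of the patch): with these changes in place,
the device interface is driven from host code roughly as follows. It only uses
members visible in the diff above; the selector choice, names and sizes are
placeholders, and the translation unit is assumed to be built with a SYCL
toolchain.

    #define EIGEN_USE_SYCL 1  // usually passed on the compiler command line
    #include <unsupported/Eigen/CXX11/Tensor>
    #include <vector>

    int main() {
      // Any device accepted by get_sycl_supported_devices() works; the default
      // selector is used here for brevity.
      cl::sycl::default_selector selector;
      Eigen::QueueInterface queue_interface(selector);
      Eigen::SyclDevice device(&queue_interface);

      const size_t n = 1024;
      std::vector<float> host_in(n, 1.0f), host_out(n, 0.0f);

      // allocate() returns a virtual pointer that keys the underlying SYCL buffer.
      float *gpu_data = static_cast<float *>(device.allocate(n * sizeof(float)));

      // No callback supplied, so both copies block until completion.
      device.memcpyHostToDevice(gpu_data, host_in.data(), n * sizeof(float));
      device.memcpyDeviceToHost(host_out.data(), gpu_data, n * sizeof(float));
      device.synchronize();

      device.deallocate(gpu_data);
      return 0;
    }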