From 7d08fa805a38f9ebb9e0e487c4e2d23d32a0fcde Mon Sep 17 00:00:00 2001 From: Mehdi Goli Date: Fri, 28 Jun 2019 10:08:23 +0100 Subject: [SYCL] This PR adds the minimum modifications to the Eigen unsupported module required to run it on devices supporting SYCL. * Abstracting the pointer type so that both SYCL memory and pointer can be captured. * Converting SYCL virtual pointer to SYCL device memory in Eigen evaluator class. * Binding SYCL placeholder accessor to command group handler by using bind method in Eigen evaluator node. * Adding SYCL macro for controlling loop unrolling. * Modifying the TensorDeviceSycl.h and SYCL executor method to adopt the above changes. --- .../Eigen/CXX11/src/Tensor/TensorDeviceSycl.h | 1155 +++++++++++++------- 1 file changed, 780 insertions(+), 375 deletions(-) (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h') diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h index e7beb2c82..93efe2f82 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h @@ -14,498 +14,879 @@ #if defined(EIGEN_USE_SYCL) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H) #define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H -template struct CheckAlignStatically { - static const bool Val= (((Align&(Align-1))==0) && (Align >= sizeof(void *))); -}; -template -struct Conditional_Allocate { +#include - EIGEN_ALWAYS_INLINE static void* conditional_allocate(std::size_t elements) { - return aligned_alloc(Align, elements); - } -}; -template -struct Conditional_Allocate { +namespace Eigen { - EIGEN_ALWAYS_INLINE static void* conditional_allocate(std::size_t elements){ - return malloc(elements); - } +namespace TensorSycl { +namespace internal { + +/// Cache all the device information needed +struct SyclDeviceInfo { + SyclDeviceInfo(cl::sycl::queue queue) + : local_mem_type( + queue.get_device() + .template get_info()), + max_work_item_sizes( + queue.get_device() + .template get_info< + cl::sycl::info::device::max_work_item_sizes>()), + max_mem_alloc_size( + queue.get_device() + .template get_info< + cl::sycl::info::device::max_mem_alloc_size>()), + max_compute_units(queue.get_device() + .template get_info< + cl::sycl::info::device::max_compute_units>()), + max_work_group_size( + queue.get_device() + .template get_info< + cl::sycl::info::device::max_work_group_size>()), + local_mem_size( + queue.get_device() + .template get_info()), + platform_name(queue.get_device() + .get_platform() + .template get_info()), + device_name(queue.get_device() + .template get_info()), + device_vendor( + queue.get_device() + .template get_info()) {} + + cl::sycl::info::local_mem_type local_mem_type; + cl::sycl::id<3> max_work_item_sizes; + unsigned long max_mem_alloc_size; + unsigned long max_compute_units; + unsigned long max_work_group_size; + size_t local_mem_size; + std::string platform_name; + std::string device_name; + std::string device_vendor; }; -template > -struct SyclAllocator { - typedef Scalar value_type; - typedef typename std::allocator_traits::pointer pointer; - typedef typename std::allocator_traits::size_type size_type; - SyclAllocator( ){}; - Scalar* allocate(std::size_t elements) { - return static_cast(Conditional_Allocate::Val, Align>::conditional_allocate(elements)); +} // end namespace internal +} // end namespace TensorSycl + +typedef TensorSycl::internal::buffer_data_type_t buffer_scalar_t; +// All devices (even AMD CPU with intel OpenCL 
runtime) that support OpenCL and +// can consume SPIR or SPIRV can use the Eigen SYCL backend and consequently +// TensorFlow via the Eigen SYCL Backend. +EIGEN_STRONG_INLINE auto get_sycl_supported_devices() + -> decltype(cl::sycl::device::get_devices()) { +#ifdef EIGEN_SYCL_USE_DEFAULT_SELECTOR + return {cl::sycl::device(cl::sycl::default_selector())}; +#else + std::vector supported_devices; + auto platform_list = cl::sycl::platform::get_platforms(); + for (const auto &platform : platform_list) { + auto device_list = platform.get_devices(); + auto platform_name = + platform.template get_info(); + std::transform(platform_name.begin(), platform_name.end(), + platform_name.begin(), ::tolower); + for (const auto &device : device_list) { + auto vendor = device.template get_info(); + std::transform(vendor.begin(), vendor.end(), vendor.begin(), ::tolower); + bool unsupported_condition = + (device.is_cpu() && platform_name.find("amd") != std::string::npos && + vendor.find("apu") == std::string::npos) || + (platform_name.find("experimental") != std::string::npos) || + device.is_host(); + if (!unsupported_condition) { + supported_devices.push_back(device); + } + } } - void deallocate(Scalar * p, std::size_t size) { EIGEN_UNUSED_VARIABLE(size); free(p); } -}; - -namespace Eigen { + return supported_devices; +#endif +} -#define ConvertToActualTypeSycl(Scalar, buf_acc) static_cast(static_cast(((buf_acc.get_pointer().get())))) -#define ConvertToActualSyclOffset(Scalar, offset) offset/sizeof(Scalar) +class QueueInterface { + public: + /// Creating device by using cl::sycl::selector or cl::sycl::device. + template + explicit QueueInterface( + const DeviceOrSelector &dev_or_sel, cl::sycl::async_handler handler, + unsigned num_threads = std::thread::hardware_concurrency()) + : m_queue(dev_or_sel, handler), +#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS + m_prog(m_queue.get_context(), get_sycl_supported_devices()), +#endif + m_thread_pool(num_threads), + m_device_info(m_queue) { +#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS + m_prog.build_with_kernel_type(); + auto f = [&](cl::sycl::handler &cgh) { + cgh.single_task(m_prog.get_kernel(), + [=]() {}) + }; + EIGEN_SYCL_TRY_CATCH(m_queue.submit(f)); +#endif + } + template + explicit QueueInterface( + const DeviceOrSelector &dev_or_sel, + unsigned num_threads = std::thread::hardware_concurrency()) + : QueueInterface(dev_or_sel, + [this](cl::sycl::exception_list l) { + this->exception_caught_ = this->sycl_async_handler(l); + }, + num_threads) {} - template class MemCopyFunctor { - public: - MemCopyFunctor(read_accessor src_acc, write_accessor dst_acc, size_t rng, size_t i, size_t offset) : m_src_acc(src_acc), m_dst_acc(dst_acc), m_rng(rng), m_i(i), m_offset(offset) {} +#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS + EIGEN_STRONG_INLINE cl::sycl::program &program() const { return m_prog; } +#endif - void operator()(cl::sycl::nd_item<1> itemID) { - auto src_ptr = ConvertToActualTypeSycl(Scalar, m_src_acc); - auto dst_ptr = ConvertToActualTypeSycl(Scalar, m_dst_acc); - auto globalid = itemID.get_global_linear_id(); - if (globalid < m_rng) { - dst_ptr[globalid + m_i] = src_ptr[globalid + m_offset]; - } - } + /// Attach an existing buffer to the pointer map, Eigen will not reuse it + EIGEN_STRONG_INLINE void *attach_buffer( + cl::sycl::buffer &buf) const { + std::lock_guard lock(pmapper_mutex_); + return static_cast(pMapper.add_pointer(buf)); + } - private: - read_accessor m_src_acc; - write_accessor m_dst_acc; - size_t m_rng; - size_t m_i; - size_t m_offset; - }; - -template - struct 
memsetkernelFunctor{ - AccType m_acc; - const ptrdiff_t buff_offset; - const size_t m_rng, m_c; - memsetkernelFunctor(AccType acc, const ptrdiff_t buff_offset_, const size_t rng, const size_t c):m_acc(acc), buff_offset(buff_offset_), m_rng(rng), m_c(c){} - void operator()(cl::sycl::nd_item<1> itemID) { - auto globalid=itemID.get_global_linear_id(); - if (globalid< m_rng) m_acc[globalid + buff_offset] = m_c; - } - - }; - -struct memsetCghFunctor{ - cl::sycl::buffer >& m_buf; - const ptrdiff_t& buff_offset; - const size_t& rng , GRange, tileSize; - const int &c; - memsetCghFunctor(cl::sycl::buffer >& buff, const ptrdiff_t& buff_offset_, const size_t& rng_, const size_t& GRange_, const size_t& tileSize_, const int& c_) - :m_buf(buff), buff_offset(buff_offset_), rng(rng_), GRange(GRange_), tileSize(tileSize_), c(c_){} - - void operator()(cl::sycl::handler &cgh) const { - auto buf_acc = m_buf.template get_access(cgh); - typedef decltype(buf_acc) AccType; - cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), memsetkernelFunctor(buf_acc, buff_offset, rng, c)); + /// Detach previously attached buffer + EIGEN_STRONG_INLINE void detach_buffer(void *p) const { + std::lock_guard lock(pmapper_mutex_); + TensorSycl::internal::SYCLfree(p, pMapper); } -}; -//get_devices returns all the available opencl devices. Either use device_selector or exclude devices that computecpp does not support (AMD OpenCL for CPU and intel GPU) -EIGEN_STRONG_INLINE auto get_sycl_supported_devices()->decltype(cl::sycl::device::get_devices()){ -std::vector supported_devices; -auto plafrom_list =cl::sycl::platform::get_platforms(); -for(const auto& platform : plafrom_list){ - auto device_list = platform.get_devices(); - auto platform_name =platform.template get_info(); - std::transform(platform_name.begin(), platform_name.end(), platform_name.begin(), ::tolower); - for(const auto& device : device_list){ - auto vendor = device.template get_info(); - std::transform(vendor.begin(), vendor.end(), vendor.begin(), ::tolower); - bool unsuported_condition = (device.is_cpu() && platform_name.find("amd")!=std::string::npos && vendor.find("apu") == std::string::npos) || - (device.is_gpu() && platform_name.find("intel")!=std::string::npos); - if(!unsuported_condition){ - std::cout << "Platform name "<< platform_name << std::endl; - supported_devices.push_back(device); + /// Allocating device pointer. This pointer is actually an 8 bytes host + /// pointer used as key to access the sycl device buffer. The reason is that + /// we cannot use device buffer as a pointer as a m_data in Eigen leafNode + /// expressions. So we create a key pointer to be used in Eigen expression + /// construction. When we convert the Eigen construction into the sycl + /// construction we use this pointer as a key in our buffer_map and we make + /// sure that we dedicate only one buffer only for this pointer. The device + /// pointer would be deleted by calling deallocate function. 
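  // Illustrative sketch (not part of this patch) of the key-pointer idea the
  // comment above describes: allocate() hands back a fake host address that
  // indexes a map of SYCL buffers, and any address inside an allocation can be
  // mapped back to its buffer plus a byte offset. The real implementation is
  // TensorSycl::internal::PointerMapper with SYCLmalloc/SYCLfree; the names
  // below are only for illustration and assume <map>, <cstdint>, <utility> and
  // CL/sycl.hpp are included.
  //
  //   class NaivePointerMapper {
  //     std::map<std::uintptr_t, cl::sycl::buffer<uint8_t, 1>> map_;
  //     std::uintptr_t next_key_ = 4096;  // never a dereferenceable address
  //    public:
  //     void *add_pointer(std::size_t bytes) {
  //       std::uintptr_t key = next_key_;
  //       map_.emplace(key, cl::sycl::buffer<uint8_t, 1>(cl::sycl::range<1>(bytes)));
  //       next_key_ += bytes;
  //       return reinterpret_cast<void *>(key);
  //     }
  //     // Recover the buffer and byte offset for any pointer inside an allocation.
  //     std::pair<cl::sycl::buffer<uint8_t, 1>, std::ptrdiff_t>
  //     get_buffer_and_offset(const void *p) const {
  //       std::uintptr_t key = reinterpret_cast<std::uintptr_t>(p);
  //       auto it = std::prev(map_.upper_bound(key));
  //       return {it->second, static_cast<std::ptrdiff_t>(key - it->first)};
  //     }
  //   };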
+ EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const { +#if EIGEN_MAX_ALIGN_BYTES > 0 + size_t align = num_bytes % EIGEN_MAX_ALIGN_BYTES; + if (align > 0) { + num_bytes += EIGEN_MAX_ALIGN_BYTES - align; } +#endif + std::lock_guard lock(pmapper_mutex_); + return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper); } -} -return supported_devices; -} -class QueueInterface { -public: - /// creating device by using cl::sycl::selector or cl::sycl::device both are the same and can be captured through dev_Selector typename - /// SyclStreamDevice is not owned. it is the caller's responsibility to destroy it. - template explicit QueueInterface(const dev_Selector& s): -#ifdef EIGEN_EXCEPTIONS - m_queue(cl::sycl::queue(s, [&](cl::sycl::exception_list l) { - for (const auto& e : l) { - try { - if (e) { - exception_caught_ = true; - std::rethrow_exception(e); + EIGEN_STRONG_INLINE void *allocate_temp(size_t num_bytes) const { +#if EIGEN_MAX_ALIGN_BYTES > 0 + size_t align = num_bytes % EIGEN_MAX_ALIGN_BYTES; + if (align > 0) { + num_bytes += EIGEN_MAX_ALIGN_BYTES - align; + } +#endif + std::lock_guard lock(pmapper_mutex_); +#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS + if (scratch_buffers.empty()) { + return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper); + ; + } else { + for (auto it = scratch_buffers.begin(); it != scratch_buffers.end();) { + auto buff = pMapper.get_buffer(*it); + if (buff.get_size() >= num_bytes) { + auto ptr = *it; + scratch_buffers.erase(it); + return ptr; + } else { + ++it; } - } catch (cl::sycl::exception e) { - std::cerr << e.what() << std::endl; } + return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper); } - })) #else -m_queue(cl::sycl::queue(s, [&](cl::sycl::exception_list l) { - for (const auto& e : l) { - if (e) { - exception_caught_ = true; - std::cerr << "Error detected Inside Sycl Device."<< std::endl; - - } + return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper); +#endif + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess< + cl::sycl::access::mode::read_write, data_t> + get(data_t *data) const { + return get_range_accessor(data); } -})) + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE data_t *get( + TensorSycl::internal::RangeAccess + data) const { + return static_cast(data.get_virtual_pointer()); + } + + EIGEN_STRONG_INLINE void deallocate_temp(void *p) const { + std::lock_guard lock(pmapper_mutex_); +#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS + scratch_buffers.insert(p); +#else + TensorSycl::internal::SYCLfree(p, pMapper); #endif - {} - - /// Allocating device pointer. This pointer is actually an 8 bytes host pointer used as key to access the sycl device buffer. - /// The reason is that we cannot use device buffer as a pointer as a m_data in Eigen leafNode expressions. So we create a key - /// pointer to be used in Eigen expression construction. When we convert the Eigen construction into the sycl construction we - /// use this pointer as a key in our buffer_map and we make sure that we dedicate only one buffer only for this pointer. - /// The device pointer would be deleted by calling deallocate function. 
- EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { - std::lock_guard lock(mutex_); - auto buf = cl::sycl::buffer >(cl::sycl::range<1>(num_bytes)); - auto ptr =buf.get_access().get_pointer(); - buf.set_final_data(nullptr); - buffer_map.insert(std::pair > >(static_cast(ptr),buf)); - return static_cast(ptr); + } + template + EIGEN_STRONG_INLINE void deallocate_temp( + const TensorSycl::internal::RangeAccess &p) const { + deallocate_temp(p.get_virtual_pointer()); } /// This is used to deallocate the device pointer. p is used as a key inside /// the map to find the device buffer and delete it. EIGEN_STRONG_INLINE void deallocate(void *p) const { - std::lock_guard lock(mutex_); - auto it = buffer_map.find(static_cast(p)); - if (it != buffer_map.end()) { - buffer_map.erase(it); - } + std::lock_guard lock(pmapper_mutex_); + TensorSycl::internal::SYCLfree(p, pMapper); } EIGEN_STRONG_INLINE void deallocate_all() const { - std::lock_guard lock(mutex_); - buffer_map.clear(); - } - /// The memcpyHostToDevice is used to copy the device only pointer to a host pointer. Using the device - /// pointer created as a key we find the sycl buffer and get the host accessor with write mode - /// on it. Then we use the memcpy to copy the data to the host accessor. The first time that - /// this buffer is accessed, the data will be copied to the device. - /// In this case we can separate the kernel actual execution from data transfer which is required for benchmark - /// Also, this is faster as it uses the map_allocator instead of memcpy - template EIGEN_STRONG_INLINE void memcpyHostToDevice(Index *dst, const Index *src, size_t n) const { - auto it =find_buffer(dst); - auto offset =static_cast(static_cast(dst))- it->first; - offset/=sizeof(Index); - size_t rng, GRange, tileSize; - parallel_for_setup(n/sizeof(Index), tileSize, rng, GRange); - auto src_buf = cl::sycl::buffer >(static_cast(static_cast(const_cast(src))), cl::sycl::range<1>(n)); - m_queue.submit([&](cl::sycl::handler &cgh) { - auto dst_acc= it->second.template get_access(cgh); - auto src_acc =src_buf.template get_access(cgh); - typedef decltype(src_acc) read_accessor; - typedef decltype(dst_acc) write_accessor; - cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor(src_acc, dst_acc, rng, offset, 0)); - }); - synchronize(); - } - /// The memcpyDeviceToHost is used to copy the data from host to device. Here, in order to avoid double copying the data. We create a sycl - /// buffer with map_allocator for the destination pointer with a discard_write accessor on it. The lifespan of the buffer is bound to the - /// lifespan of the memcpyDeviceToHost function. We create a kernel to copy the data, from the device- only source buffer to the destination - /// buffer with map_allocator on the gpu in parallel. At the end of the function call the destination buffer would be destroyed and the data - /// would be available on the dst pointer using fast copy technique (map_allocator). In this case we can make sure that we copy the data back - /// to the cpu only once per function call. 
- template EIGEN_STRONG_INLINE void memcpyDeviceToHost(void *dst, const Index *src, size_t n) const { - auto it =find_buffer(src); - auto offset =static_cast(static_cast(src))- it->first; - offset/=sizeof(Index); - size_t rng, GRange, tileSize; - parallel_for_setup(n/sizeof(Index), tileSize, rng, GRange); - auto dest_buf = cl::sycl::buffer >(static_cast(dst), cl::sycl::range<1>(n)); - m_queue.submit([&](cl::sycl::handler &cgh) { - auto src_acc= it->second.template get_access(cgh); - auto dst_acc =dest_buf.template get_access(cgh); - typedef decltype(src_acc) read_accessor; - typedef decltype(dst_acc) write_accessor; - cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor(src_acc, dst_acc, rng, 0, offset)); - }); - synchronize(); - } - - /// the memcpy function - template EIGEN_STRONG_INLINE void memcpy(void *dst, const Index *src, size_t n) const { - auto it1 = find_buffer(static_cast(src)); - auto it2 = find_buffer(dst); - auto offset= (static_cast(static_cast(src))) - it1->first; - auto i= (static_cast(dst)) - it2->first; - offset/=sizeof(Index); - i/=sizeof(Index); - size_t rng, GRange, tileSize; - parallel_for_setup(n/sizeof(Index), tileSize, rng, GRange); - m_queue.submit([&](cl::sycl::handler &cgh) { - auto src_acc =it1->second.template get_access(cgh); - auto dst_acc =it2->second.template get_access(cgh); - typedef decltype(src_acc) read_accessor; - typedef decltype(dst_acc) write_accessor; - cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor(src_acc, dst_acc, rng, i, offset)); - }); - synchronize(); + std::lock_guard lock(pmapper_mutex_); + TensorSycl::internal::SYCLfreeAll(pMapper); +#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS + scratch_buffers.clear(); +#endif } + /// The memcpyHostToDevice is used to copy the data from host to device + /// The destination pointer could be deleted before the copy happend which is + /// why a callback function is needed. By default if none is provided, the + /// function is blocking. + EIGEN_STRONG_INLINE void memcpyHostToDevice( + void *dst, const void *src, size_t n, + std::function callback) const { + static const auto write_mode = cl::sycl::access::mode::discard_write; + static const auto global_access = cl::sycl::access::target::global_buffer; + typedef cl::sycl::accessor + write_accessor; + if (n == 0) { + if (callback) callback(); + return; + } + n /= sizeof(buffer_scalar_t); + auto f = [&](cl::sycl::handler &cgh) { + write_accessor dst_acc = get_range_accessor(cgh, dst, n); + buffer_scalar_t const *ptr = static_cast(src); + auto non_deleter = [](buffer_scalar_t const *) {}; + std::shared_ptr s_ptr(ptr, non_deleter); + cgh.copy(s_ptr, dst_acc); + }; + cl::sycl::event e; + EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f)); + synchronize_and_callback(e, callback); + } + + /// The memcpyDeviceToHost is used to copy the data from device to host. + /// The source pointer could be deleted before the copy happend which is + /// why a callback function is needed. By default if none is provided, the + /// function is blocking. 
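  // The callback overloads behave as the comments above state: with a callback
  // the submit returns immediately and the callback runs on the internal thread
  // pool once the copy event completes; without one the call blocks. A rough
  // caller-side sketch, not taken from this patch (`dev` is assumed to be an
  // Eigen::SyclDevice built on this QueueInterface, sizes are in bytes):
  //
  //   std::vector<float> host_out(n);
  //   dev.memcpyHostToDevice(gpu_ptr, host_in.data(), n * sizeof(float));  // blocking
  //   dev.memcpyDeviceToHost(host_out.data(), gpu_ptr, n * sizeof(float),
  //                          [] { std::cout << "device -> host copy finished\n"; });
  //   dev.synchronize();  // or wait for the callback before touching host_out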
+ EIGEN_STRONG_INLINE void memcpyDeviceToHost( + void *dst, const void *src, size_t n, + std::function callback) const { + static const auto read_mode = cl::sycl::access::mode::read; + static const auto global_access = cl::sycl::access::target::global_buffer; + typedef cl::sycl::accessor + read_accessor; + if (n == 0) { + if (callback) callback(); + return; + } + n /= sizeof(buffer_scalar_t); + auto f = [&](cl::sycl::handler &cgh) { + read_accessor src_acc = get_range_accessor(cgh, src, n); + buffer_scalar_t *ptr = static_cast(dst); + auto non_deleter = [](buffer_scalar_t *) {}; + std::shared_ptr s_ptr(ptr, non_deleter); + cgh.copy(src_acc, s_ptr); + }; + cl::sycl::event e; + EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f)); + synchronize_and_callback(e, callback); + } + + /// The memcpy function. + /// No callback is required here as both arguments are on the device + /// and SYCL can handle the dependency. + EIGEN_STRONG_INLINE void memcpy(void *dst, const void *src, size_t n) const { + static const auto read_mode = cl::sycl::access::mode::read; + static const auto write_mode = cl::sycl::access::mode::discard_write; + if (n == 0) { + return; + } + n /= sizeof(buffer_scalar_t); + auto f = [&](cl::sycl::handler &cgh) { + auto src_acc = get_range_accessor(cgh, src, n); + auto dst_acc = get_range_accessor(cgh, dst, n); + cgh.copy(src_acc, dst_acc); + }; + cl::sycl::event e; + EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f)); + async_synchronize(e); + } + + /// the memset function. + /// No callback is required here as both arguments are on the device + /// and SYCL can handle the dependency. EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const { - size_t rng, GRange, tileSize; - parallel_for_setup(n, tileSize, rng, GRange); - auto it1 = find_buffer(static_cast(data)); - ptrdiff_t buff_offset= (static_cast(data)) - it1->first; - m_queue.submit(memsetCghFunctor(it1->second, buff_offset, rng, GRange, tileSize, c )); - synchronize(); + static const auto write_mode = cl::sycl::access::mode::discard_write; + if (n == 0) { + return; + } + n /= sizeof(buffer_scalar_t); + auto f = [&](cl::sycl::handler &cgh) { + auto dst_acc = get_range_accessor(cgh, data, n); + // The cast to uint8_t is here to match the behaviour of the standard + // memset. The cast to buffer_scalar_t is needed to match the type of the + // accessor (in case buffer_scalar_t is not uint8_t) + cgh.fill(dst_acc, static_cast(static_cast(c))); + }; + cl::sycl::event e; + EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f)); + async_synchronize(e); + } + + /// Get a range accessor to the virtual pointer's device memory. This range + /// accessor will allow access to the memory from the pointer to the end of + /// the buffer. + /// + /// NOTE: Inside a kernel the range accessor will always be indexed from the + /// start of the buffer, so the offset in the accessor is only used by + /// methods like handler::copy and will not be available inside a kernel. 
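  // Worked example for the pointer-based overload below (numbers illustrative):
  // if a virtual pointer lies 64 bytes into an allocation backed by a 1024-byte
  // buffer and T is float, then typed_offset = 64 / sizeof(float) = 16, the
  // reinterpreted buffer holds 1024 / sizeof(float) = 256 elements, and the
  // returned range covers the remaining 256 - 16 = 240 floats. As noted above,
  // indexing inside a kernel is still relative to the start of the buffer.
  //
  //   auto acc = queue_interface.get_range_accessor<cl::sycl::access::mode::read, float>(vptr);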
+ template + EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess + get_range_accessor(const void *ptr) const { + static const auto global_access = cl::sycl::access::target::global_buffer; + static const auto is_place_holder = cl::sycl::access::placeholder::true_t; + typedef TensorSycl::internal::RangeAccess ret_type; + typedef const TensorSycl::internal::buffer_data_type_t *internal_ptr_t; + + std::lock_guard lock(pmapper_mutex_); + + auto original_buffer = pMapper.get_buffer(ptr); + const ptrdiff_t offset = pMapper.get_offset(ptr); + const ptrdiff_t typed_offset = offset / sizeof(T); + eigen_assert(typed_offset >= 0); + const auto typed_size = original_buffer.get_size() / sizeof(T); + auto buffer = original_buffer.template reinterpret< + typename Eigen::internal::remove_const::type>( + cl::sycl::range<1>(typed_size)); + const ptrdiff_t size = buffer.get_count() - typed_offset; + eigen_assert(size >= 0); + typedef cl::sycl::accessor::type, + 1, AcMd, global_access, is_place_holder> + placeholder_accessor_t; + const auto start_ptr = static_cast(ptr) - offset; + return ret_type(placeholder_accessor_t(buffer, cl::sycl::range<1>(size), + cl::sycl::id<1>(typed_offset)), + static_cast(typed_offset), + reinterpret_cast(start_ptr)); + } + + /// Get a range accessor to the virtual pointer's device memory with a + /// specified size. + template + EIGEN_STRONG_INLINE cl::sycl::accessor< + buffer_scalar_t, 1, AcMd, cl::sycl::access::target::global_buffer> + get_range_accessor(cl::sycl::handler &cgh, const void *ptr, + const Index n_bytes) const { + static const auto global_access = cl::sycl::access::target::global_buffer; + eigen_assert(n_bytes >= 0); + std::lock_guard lock(pmapper_mutex_); + auto buffer = pMapper.get_buffer(ptr); + const ptrdiff_t offset = pMapper.get_offset(ptr); + eigen_assert(offset >= 0); + eigen_assert(offset + n_bytes <= buffer.get_size()); + return buffer.template get_access( + cgh, cl::sycl::range<1>(n_bytes), cl::sycl::id<1>(offset)); } /// Creation of sycl accessor for a buffer. This function first tries to find - /// the buffer in the buffer_map. If found it gets the accessor from it, if not, - /// the function then adds an entry by creating a sycl buffer for that particular pointer. - template EIGEN_STRONG_INLINE cl::sycl::accessor - get_sycl_accessor(cl::sycl::handler &cgh, const void* ptr) const { - return (find_buffer(ptr)->second.template get_access(cgh)); + /// the buffer in the buffer_map. If found it gets the accessor from it, if + /// not, the function then adds an entry by creating a sycl buffer for that + /// particular pointer. 
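  // A rough sketch of how the accessor below is typically requested inside a
  // command-group scope so that SYCL can track the buffer dependency. Not part
  // of this patch; `dev` is assumed to be an Eigen::SyclDevice and `vptr` a
  // pointer previously returned by dev.allocate():
  //
  //   dev.sycl_queue().submit([&](cl::sycl::handler &cgh) {
  //     auto acc = dev.get_sycl_accessor<cl::sycl::access::mode::read_write>(cgh, vptr);
  //     cgh.single_task<class zero_first_byte>([=]() { acc[0] = 0; });
  //   });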
+ template + EIGEN_STRONG_INLINE cl::sycl::accessor< + buffer_scalar_t, 1, AcMd, cl::sycl::access::target::global_buffer> + get_sycl_accessor(cl::sycl::handler &cgh, const void *ptr) const { + std::lock_guard lock(pmapper_mutex_); + return pMapper.get_buffer(ptr) + .template get_access( + cgh); } - /// Accessing the created sycl device buffer for the device pointer - EIGEN_STRONG_INLINE cl::sycl::buffer >& get_sycl_buffer(const void * ptr) const { - return find_buffer(ptr)->second; + EIGEN_STRONG_INLINE cl::sycl::buffer get_sycl_buffer( + const void *ptr) const { + std::lock_guard lock(pmapper_mutex_); + return pMapper.get_buffer(ptr); } EIGEN_STRONG_INLINE ptrdiff_t get_offset(const void *ptr) const { - return (static_cast(ptr))-(find_buffer(ptr)->first); + std::lock_guard lock(pmapper_mutex_); + return pMapper.get_offset(ptr); } EIGEN_STRONG_INLINE void synchronize() const { - m_queue.wait_and_throw(); //pass +#ifdef EIGEN_EXCEPTIONS + m_queue.wait_and_throw(); +#else + m_queue.wait(); +#endif } - EIGEN_STRONG_INLINE void asynchronousExec() const { - ///FIXEDME:: currently there is a race condition regarding the asynch scheduler. - //sycl_queue().throw_asynchronous();// FIXME::does not pass. Temporarily disabled - m_queue.wait_and_throw(); //pass + EIGEN_STRONG_INLINE void async_synchronize(cl::sycl::event e) const { + set_latest_event(e); +#ifndef EIGEN_SYCL_ASYNC_EXECUTION + synchronize(); +#endif } - template - EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize, Index &rng, Index &GRange) const { - tileSize =static_cast(m_queue.get_device(). template get_info()); - auto s= m_queue.get_device().template get_info(); - std::transform(s.begin(), s.end(), s.begin(), ::tolower); - if(m_queue.get_device().is_cpu()){ // intel doesn't allow to use max workgroup size - tileSize=std::min(static_cast(256), static_cast(tileSize)); - } + template + EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize, + Index &rng, Index &GRange) const { + tileSize = static_cast(getNearestPowerOfTwoWorkGroupSize()); + tileSize = std::min(static_cast(EIGEN_SYCL_LOCAL_THREAD_DIM0 * + EIGEN_SYCL_LOCAL_THREAD_DIM1), + static_cast(tileSize)); rng = n; - if (rng==0) rng=static_cast(1); - GRange=rng; - if (tileSize>GRange) tileSize=GRange; - else if(GRange>tileSize){ - Index xMode = static_cast(GRange % tileSize); + if (rng == 0) rng = static_cast(1); + GRange = rng; + if (tileSize > GRange) + tileSize = GRange; + else if (GRange > tileSize) { + Index xMode = static_cast(GRange % tileSize); if (xMode != 0) GRange += static_cast(tileSize - xMode); } } - /// This is used to prepare the number of threads and also the number of threads per block for sycl kernels - template - EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1, Index &tileSize0, Index &tileSize1, Index &rng0, Index &rng1, Index &GRange0, Index &GRange1) const { - Index max_workgroup_Size = static_cast(maxSyclThreadsPerBlock()); - if(m_queue.get_device().is_cpu()){ // intel doesn't allow to use max workgroup size - max_workgroup_Size=std::min(static_cast(256), static_cast(max_workgroup_Size)); - } + /// This is used to prepare the number of threads and also the number of + /// threads per block for sycl kernels + template + EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1, + Index &tileSize0, + Index &tileSize1, Index &rng0, + Index &rng1, Index &GRange0, + Index &GRange1) const { + Index max_workgroup_Size = + static_cast(getNearestPowerOfTwoWorkGroupSize()); + max_workgroup_Size = + 
std::min(static_cast(EIGEN_SYCL_LOCAL_THREAD_DIM0 * + EIGEN_SYCL_LOCAL_THREAD_DIM1), + static_cast(max_workgroup_Size)); Index pow_of_2 = static_cast(std::log2(max_workgroup_Size)); - tileSize1 =static_cast(std::pow(2, static_cast(pow_of_2/2))); - rng1=dim1; - if (rng1==0 ) rng1=static_cast(1); - GRange1=rng1; - if (tileSize1>GRange1) tileSize1=GRange1; - else if(GRange1>tileSize1){ - Index xMode = static_cast(GRange1 % tileSize1); + tileSize1 = + static_cast(std::pow(2, static_cast(pow_of_2 / 2))); + rng1 = dim1; + if (rng1 == 0) rng1 = static_cast(1); + GRange1 = rng1; + if (tileSize1 > GRange1) + tileSize1 = GRange1; + else if (GRange1 > tileSize1) { + Index xMode = static_cast(GRange1 % tileSize1); if (xMode != 0) GRange1 += static_cast(tileSize1 - xMode); } - tileSize0 = static_cast(max_workgroup_Size/tileSize1); + tileSize0 = static_cast(max_workgroup_Size / tileSize1); rng0 = dim0; - if (rng0==0 ) rng0=static_cast(1); - GRange0=rng0; - if (tileSize0>GRange0) tileSize0=GRange0; - else if(GRange0>tileSize0){ - Index xMode = static_cast(GRange0 % tileSize0); + if (rng0 == 0) rng0 = static_cast(1); + GRange0 = rng0; + if (tileSize0 > GRange0) + tileSize0 = GRange0; + else if (GRange0 > tileSize0) { + Index xMode = static_cast(GRange0 % tileSize0); if (xMode != 0) GRange0 += static_cast(tileSize0 - xMode); } } - /// This is used to prepare the number of threads and also the number of threads per block for sycl kernels - template - EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1,Index dim2, Index &tileSize0, Index &tileSize1, Index &tileSize2, Index &rng0, Index &rng1, Index &rng2, Index &GRange0, Index &GRange1, Index &GRange2) const { - Index max_workgroup_Size = static_cast(maxSyclThreadsPerBlock()); - if(m_queue.get_device().is_cpu()){ // intel doesn't allow to use max workgroup size - max_workgroup_Size=std::min(static_cast(256), static_cast(max_workgroup_Size)); - } + /// This is used to prepare the number of threads and also the number of + /// threads per block for sycl kernels + template + EIGEN_STRONG_INLINE void parallel_for_setup( + Index dim0, Index dim1, Index dim2, Index &tileSize0, Index &tileSize1, + Index &tileSize2, Index &rng0, Index &rng1, Index &rng2, Index &GRange0, + Index &GRange1, Index &GRange2) const { + Index max_workgroup_Size = + static_cast(getNearestPowerOfTwoWorkGroupSize()); + max_workgroup_Size = + std::min(static_cast(EIGEN_SYCL_LOCAL_THREAD_DIM0 * + EIGEN_SYCL_LOCAL_THREAD_DIM1), + static_cast(max_workgroup_Size)); Index pow_of_2 = static_cast(std::log2(max_workgroup_Size)); - tileSize2 =static_cast(std::pow(2, static_cast(pow_of_2/3))); - rng2=dim2; - if (rng2==0 ) rng1=static_cast(1); - GRange2=rng2; - if (tileSize2>GRange2) tileSize2=GRange2; - else if(GRange2>tileSize2){ - Index xMode = static_cast(GRange2 % tileSize2); + tileSize2 = + static_cast(std::pow(2, static_cast(pow_of_2 / 3))); + rng2 = dim2; + if (rng2 == 0) rng1 = static_cast(1); + GRange2 = rng2; + if (tileSize2 > GRange2) + tileSize2 = GRange2; + else if (GRange2 > tileSize2) { + Index xMode = static_cast(GRange2 % tileSize2); if (xMode != 0) GRange2 += static_cast(tileSize2 - xMode); } - pow_of_2 = static_cast(std::log2(static_cast(max_workgroup_Size/tileSize2))); - tileSize1 =static_cast(std::pow(2, static_cast(pow_of_2/2))); - rng1=dim1; - if (rng1==0 ) rng1=static_cast(1); - GRange1=rng1; - if (tileSize1>GRange1) tileSize1=GRange1; - else if(GRange1>tileSize1){ - Index xMode = static_cast(GRange1 % tileSize1); + pow_of_2 = static_cast( + 
std::log2(static_cast(max_workgroup_Size / tileSize2))); + tileSize1 = + static_cast(std::pow(2, static_cast(pow_of_2 / 2))); + rng1 = dim1; + if (rng1 == 0) rng1 = static_cast(1); + GRange1 = rng1; + if (tileSize1 > GRange1) + tileSize1 = GRange1; + else if (GRange1 > tileSize1) { + Index xMode = static_cast(GRange1 % tileSize1); if (xMode != 0) GRange1 += static_cast(tileSize1 - xMode); } - tileSize0 = static_cast(max_workgroup_Size/(tileSize1*tileSize2)); + tileSize0 = + static_cast(max_workgroup_Size / (tileSize1 * tileSize2)); rng0 = dim0; - if (rng0==0 ) rng0=static_cast(1); - GRange0=rng0; - if (tileSize0>GRange0) tileSize0=GRange0; - else if(GRange0>tileSize0){ - Index xMode = static_cast(GRange0 % tileSize0); + if (rng0 == 0) rng0 = static_cast(1); + GRange0 = rng0; + if (tileSize0 > GRange0) + tileSize0 = GRange0; + else if (GRange0 > tileSize0) { + Index xMode = static_cast(GRange0 % tileSize0); if (xMode != 0) GRange0 += static_cast(tileSize0 - xMode); } } + EIGEN_STRONG_INLINE bool has_local_memory() const { +#if !defined(EIGEN_SYCL_LOCA_MEM) && defined(EIGEN_SYCL_NO_LOCAL_MEM) + return false; +#elif defined(EIGEN_SYCL_LOCAL_MEM) && !defined(EIGEN_SYCL_NO_LOCAL_MEM) + return true; +#else + return m_device_info.local_mem_type == + cl::sycl::info::local_mem_type::local; +#endif + } + + EIGEN_STRONG_INLINE unsigned long max_buffer_size() const { + return m_device_info.max_mem_alloc_size; + } + EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const { - return m_queue.get_device(). template get_info(); + return m_device_info.max_compute_units; } EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const { - return m_queue.get_device(). template get_info(); + return m_device_info.max_work_group_size; + } + + EIGEN_STRONG_INLINE cl::sycl::id<3> maxWorkItemSizes() const { + return m_device_info.max_work_item_sizes; } /// No need for sycl it should act the same as CPU version EIGEN_STRONG_INLINE int majorDeviceVersion() const { return 1; } EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const { - // OpenCL doesn't have such concept + // OpenCL doesnot have such concept return 2; } EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const { - return m_queue.get_device(). template get_info(); + return m_device_info.local_mem_size; } - EIGEN_STRONG_INLINE cl::sycl::queue& sycl_queue() const { return m_queue;} + // This function returns the nearest power of 2 Work-group size which is <= + // maximum device workgroup size. + EIGEN_STRONG_INLINE size_t getNearestPowerOfTwoWorkGroupSize() const { + return getPowerOfTwo(m_device_info.max_work_group_size, false); + } + + EIGEN_STRONG_INLINE std::string getPlatformName() const { + return m_device_info.platform_name; + } + + EIGEN_STRONG_INLINE std::string getDeviceName() const { + return m_device_info.device_name; + } + + EIGEN_STRONG_INLINE std::string getDeviceVendor() const { + return m_device_info.device_vendor; + } + + // This function returns the nearest power of 2 + // if roundup is true returns result>=wgsize + // else it return result <= wgsize + EIGEN_STRONG_INLINE size_t getPowerOfTwo(size_t wGSize, bool roundUp) const { + if (roundUp) --wGSize; + wGSize |= (wGSize >> 1); + wGSize |= (wGSize >> 2); + wGSize |= (wGSize >> 4); + wGSize |= (wGSize >> 8); + wGSize |= (wGSize >> 16); +#if EIGEN_ARCH_x86_64 || EIGEN_ARCH_ARM64 || EIGEN_OS_WIN64 + wGSize |= (wGSize >> 32); +#endif + return ((!roundUp) ? 
(wGSize - (wGSize >> 1)) : ++wGSize); + } + + EIGEN_STRONG_INLINE cl::sycl::queue &sycl_queue() const { return m_queue; } // This function checks if the runtime recorded an error for the // underlying stream device. EIGEN_STRONG_INLINE bool ok() const { if (!exception_caught_) { - m_queue.wait_and_throw(); + synchronize(); } return !exception_caught_; } + EIGEN_STRONG_INLINE cl::sycl::event get_latest_event() const { +#ifdef EIGEN_SYCL_STORE_LATEST_EVENT + std::lock_guard lock(event_mutex_); + return latest_events_[std::this_thread::get_id()]; +#else + eigen_assert(false); + return cl::sycl::event(); +#endif + } + // destructor - ~QueueInterface() { buffer_map.clear(); } + ~QueueInterface() { + pMapper.clear(); +#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS + scratch_buffers.clear(); +#endif + } + + protected: + EIGEN_STRONG_INLINE void set_latest_event(cl::sycl::event e) const { +#ifdef EIGEN_SYCL_STORE_LATEST_EVENT + std::lock_guard lock(event_mutex_); + latest_events_[std::this_thread::get_id()] = e; +#else + EIGEN_UNUSED_VARIABLE(e); +#endif + } + + void synchronize_and_callback(cl::sycl::event e, + const std::function &callback) const { + set_latest_event(e); + if (callback) { + auto callback_ = [=]() { +#ifdef EIGEN_EXCEPTIONS + cl::sycl::event(e).wait_and_throw(); +#else + cl::sycl::event(e).wait(); +#endif + callback(); + }; + m_thread_pool.Schedule(std::move(callback_)); + } else { +#ifdef EIGEN_EXCEPTIONS + m_queue.wait_and_throw(); +#else + m_queue.wait(); +#endif + } + } + + bool sycl_async_handler(cl::sycl::exception_list l) const { + bool exception_caught = false; + for (const auto &e : l) { + if (e) { + exception_caught = true; +#ifdef EIGEN_EXCEPTIONS + try { + std::rethrow_exception(e); + } catch (const cl::sycl::exception &e) { + std::cerr << e.what() << std::endl; + } +#else + std::cerr << "Error detected inside Sycl device." << std::endl; + abort(); +#endif + } + } + return exception_caught; + } -private: /// class members: bool exception_caught_ = false; - mutable std::mutex mutex_; + mutable std::mutex pmapper_mutex_; + +#ifdef EIGEN_SYCL_STORE_LATEST_EVENT + mutable std::mutex event_mutex_; + mutable std::unordered_map latest_events_; +#endif /// std::map is the container used to make sure that we create only one buffer - /// per pointer. The lifespan of the buffer now depends on the lifespan of SyclDevice. - /// If a non-read-only pointer is needed to be accessed on the host we should manually deallocate it. - mutable std::map > > buffer_map; + /// per pointer. The lifespan of the buffer now depends on the lifespan of + /// SyclDevice. If a non-read-only pointer is needed to be accessed on the + /// host we should manually deallocate it. + mutable TensorSycl::internal::PointerMapper pMapper; +#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS + mutable std::unordered_set scratch_buffers; +#endif /// sycl queue mutable cl::sycl::queue m_queue; +#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS + mutable cl::sycl::program m_prog; +#endif - EIGEN_STRONG_INLINE std::map > >::iterator find_buffer(const void* ptr) const { - std::lock_guard lock(mutex_); - auto it1 = buffer_map.find(static_cast(ptr)); - if (it1 != buffer_map.end()){ - return it1; - } - else{ - for(std::map > >::iterator it=buffer_map.begin(); it!=buffer_map.end(); ++it){ - auto size = it->second.get_size(); - if((it->first < (static_cast(ptr))) && ((static_cast(ptr)) < (it->first + size)) ) return it; - } - } - std::cerr << "No sycl buffer found. 
Make sure that you have allocated memory for your buffer by calling malloc-ed function."<< std::endl; - abort(); + /// The thread pool is used to wait on events and call callbacks + /// asynchronously + mutable Eigen::ThreadPool m_thread_pool; + + const TensorSycl::internal::SyclDeviceInfo m_device_info; +}; + +struct SyclDeviceBase { + /// QueueInterface is not owned. it is the caller's responsibility to destroy + /// it + const QueueInterface *m_queue_stream; + explicit SyclDeviceBase(const QueueInterface *queue_stream) + : m_queue_stream(queue_stream) {} + EIGEN_STRONG_INLINE const QueueInterface *queue_stream() const { + return m_queue_stream; } }; -// Here is a sycl deviuce struct which accept the sycl queue interface +// Here is a sycl device struct which accept the sycl queue interface // as an input -struct SyclDevice { - // class member. - QueueInterface* m_queue_stream; - /// QueueInterface is not owned. it is the caller's responsibility to destroy it. - explicit SyclDevice(QueueInterface* queue_stream) : m_queue_stream(queue_stream){} +struct SyclDevice : public SyclDeviceBase { + explicit SyclDevice(const QueueInterface *queue_stream) + : SyclDeviceBase(queue_stream) {} + + // this is the accessor used to construct the evaluator + template + EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess + get_range_accessor(const void *ptr) const { + return queue_stream()->template get_range_accessor(ptr); + } // get sycl accessor - template EIGEN_STRONG_INLINE cl::sycl::accessor - get_sycl_accessor(cl::sycl::handler &cgh, const void* ptr) const { - return m_queue_stream->template get_sycl_accessor(cgh, ptr); + template + EIGEN_STRONG_INLINE cl::sycl::accessor< + buffer_scalar_t, 1, AcMd, cl::sycl::access::target::global_buffer> + get_sycl_accessor(cl::sycl::handler &cgh, const void *ptr) const { + return queue_stream()->template get_sycl_accessor(cgh, ptr); } /// Accessing the created sycl device buffer for the device pointer - EIGEN_STRONG_INLINE cl::sycl::buffer >& get_sycl_buffer(const void * ptr) const { - return m_queue_stream->get_sycl_buffer(ptr); + EIGEN_STRONG_INLINE cl::sycl::buffer get_sycl_buffer( + const void *ptr) const { + return queue_stream()->get_sycl_buffer(ptr); + } + + /// This is used to prepare the number of threads and also the number of + /// threads per block for sycl kernels + template + EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize, + Index &rng, Index &GRange) const { + queue_stream()->parallel_for_setup(n, tileSize, rng, GRange); + } + + /// This is used to prepare the number of threads and also the number of + /// threads per block for sycl kernels + template + EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1, + Index &tileSize0, + Index &tileSize1, Index &rng0, + Index &rng1, Index &GRange0, + Index &GRange1) const { + queue_stream()->parallel_for_setup(dim0, dim1, tileSize0, tileSize1, rng0, + rng1, GRange0, GRange1); + } + + /// This is used to prepare the number of threads and also the number of + /// threads per block for sycl kernels + template + EIGEN_STRONG_INLINE void parallel_for_setup( + Index dim0, Index dim1, Index dim2, Index &tileSize0, Index &tileSize1, + Index &tileSize2, Index &rng0, Index &rng1, Index &rng2, Index &GRange0, + Index &GRange1, Index &GRange2) const { + queue_stream()->parallel_for_setup(dim0, dim1, dim2, tileSize0, tileSize1, + tileSize2, rng0, rng1, rng2, GRange0, + GRange1, GRange2); } - /// This is used to prepare the number of threads and also the number of threads per block 
for sycl kernels - template - EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize, Index &rng, Index &GRange) const { - m_queue_stream->parallel_for_setup(n, tileSize, rng, GRange); + /// allocate device memory + EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const { + return queue_stream()->allocate(num_bytes); } - /// This is used to prepare the number of threads and also the number of threads per block for sycl kernels - template - EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1, Index &tileSize0, Index &tileSize1, Index &rng0, Index &rng1, Index &GRange0, Index &GRange1) const { - m_queue_stream->parallel_for_setup(dim0, dim1, tileSize0, tileSize1, rng0, rng1, GRange0, GRange1); + EIGEN_STRONG_INLINE void *allocate_temp(size_t num_bytes) const { + return queue_stream()->allocate_temp(num_bytes); } - /// This is used to prepare the number of threads and also the number of threads per block for sycl kernels - template - EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1,Index dim2, Index &tileSize0, Index &tileSize1, Index &tileSize2, Index &rng0, Index &rng1, Index &rng2, Index &GRange0, Index &GRange1, Index &GRange2) const { - m_queue_stream->parallel_for_setup(dim0, dim1, dim2, tileSize0, tileSize1, tileSize2, rng0, rng1, rng2, GRange0, GRange1, GRange2); - - } - /// allocate device memory - EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const { - return m_queue_stream->allocate(num_bytes); - } /// deallocate device memory EIGEN_STRONG_INLINE void deallocate(void *p) const { - m_queue_stream->deallocate(p); - } + queue_stream()->deallocate(p); + } - // some runtime conditions that can be applied here - EIGEN_STRONG_INLINE bool isDeviceSuitable() const { return true; } + EIGEN_STRONG_INLINE void deallocate_temp(void *buffer) const { + queue_stream()->deallocate_temp(buffer); + } + template + EIGEN_STRONG_INLINE void deallocate_temp( + const TensorSycl::internal::RangeAccess &buffer) const { + queue_stream()->deallocate_temp(buffer); + } + EIGEN_STRONG_INLINE void deallocate_all() const { + queue_stream()->deallocate_all(); + } - /// the memcpy function - template EIGEN_STRONG_INLINE void memcpy(void *dst, const Index *src, size_t n) const { - m_queue_stream->memcpy(dst,src,n); + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess< + cl::sycl::access::mode::read_write, data_t> + get(data_t *data) const { + return queue_stream()->get(data); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE data_t *get( + TensorSycl::internal::RangeAccess + data) const { + return queue_stream()->get(data); } + /// attach existing buffer + EIGEN_STRONG_INLINE void *attach_buffer( + cl::sycl::buffer &buf) const { + return queue_stream()->attach_buffer(buf); + } + /// detach buffer + EIGEN_STRONG_INLINE void detach_buffer(void *p) const { + queue_stream()->detach_buffer(p); + } EIGEN_STRONG_INLINE ptrdiff_t get_offset(const void *ptr) const { - return m_queue_stream->get_offset(ptr); + return queue_stream()->get_offset(ptr); + } + + // some runtime conditions that can be applied here + EIGEN_STRONG_INLINE bool isDeviceSuitable() const { return true; } + /// memcpyHostToDevice + template + EIGEN_STRONG_INLINE void memcpyHostToDevice( + Index *dst, const Index *src, size_t n, + std::function callback = {}) const { + queue_stream()->memcpyHostToDevice(dst, src, n, callback); } -// memcpyHostToDevice - template EIGEN_STRONG_INLINE void memcpyHostToDevice(Index *dst, const Index *src, size_t n) const { - 
m_queue_stream->memcpyHostToDevice(dst,src,n); + /// memcpyDeviceToHost + template + EIGEN_STRONG_INLINE void memcpyDeviceToHost( + void *dst, const Index *src, size_t n, + std::function callback = {}) const { + queue_stream()->memcpyDeviceToHost(dst, src, n, callback); } -/// here is the memcpyDeviceToHost - template EIGEN_STRONG_INLINE void memcpyDeviceToHost(void *dst, const Index *src, size_t n) const { - m_queue_stream->memcpyDeviceToHost(dst,src,n); + /// the memcpy function + template + EIGEN_STRONG_INLINE void memcpy(void *dst, const Index *src, size_t n) const { + queue_stream()->memcpy(dst, src, n); } - /// Here is the implementation of memset function on sycl. + /// the memset function EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const { - m_queue_stream->memset(data,c,n); + queue_stream()->memset(data, c, n); } /// returning the sycl queue - EIGEN_STRONG_INLINE cl::sycl::queue& sycl_queue() const { return m_queue_stream->sycl_queue();} - - EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { - // FIXME - return 48*1024; + EIGEN_STRONG_INLINE cl::sycl::queue &sycl_queue() const { + return queue_stream()->sycl_queue(); } +#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS + EIGEN_STRONG_INLINE cl::sycl::program &program() const { + return queue_stream()->program(); + } +#endif + + EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { return 48 * 1024; } EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const { // We won't try to take advantage of the l2 cache for the time being, and @@ -513,40 +894,64 @@ struct SyclDevice { return firstLevelCacheSize(); } EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const { - return m_queue_stream->getNumSyclMultiProcessors(); + return queue_stream()->getNumSyclMultiProcessors(); } EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const { - return m_queue_stream->maxSyclThreadsPerBlock(); + return queue_stream()->maxSyclThreadsPerBlock(); + } + EIGEN_STRONG_INLINE cl::sycl::id<3> maxWorkItemSizes() const { + return queue_stream()->maxWorkItemSizes(); } EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const { - // OpenCL doesn't have such concept - return m_queue_stream->maxSyclThreadsPerMultiProcessor(); - // return stream_->deviceProperties().maxThreadsPerMultiProcessor; + // OpenCL doesnot have such concept + return queue_stream()->maxSyclThreadsPerMultiProcessor(); } EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const { - return m_queue_stream->sharedMemPerBlock(); + return queue_stream()->sharedMemPerBlock(); + } + EIGEN_STRONG_INLINE size_t getNearestPowerOfTwoWorkGroupSize() const { + return queue_stream()->getNearestPowerOfTwoWorkGroupSize(); + } + + EIGEN_STRONG_INLINE size_t getPowerOfTwo(size_t val, bool roundUp) const { + return queue_stream()->getPowerOfTwo(val, roundUp); } /// No need for sycl it should act the same as CPU version - EIGEN_STRONG_INLINE int majorDeviceVersion() const { return m_queue_stream->majorDeviceVersion(); } + EIGEN_STRONG_INLINE int majorDeviceVersion() const { + return queue_stream()->majorDeviceVersion(); + } EIGEN_STRONG_INLINE void synchronize() const { - m_queue_stream->synchronize(); //pass + queue_stream()->synchronize(); } - - EIGEN_STRONG_INLINE void asynchronousExec() const { - m_queue_stream->asynchronousExec(); + EIGEN_STRONG_INLINE void async_synchronize( + cl::sycl::event e = cl::sycl::event()) const { + queue_stream()->async_synchronize(e); + } + EIGEN_STRONG_INLINE cl::sycl::event get_latest_event() const { + return 
queue_stream()->get_latest_event(); } + // This function checks if the runtime recorded an error for the // underlying stream device. - EIGEN_STRONG_INLINE bool ok() const { - return m_queue_stream->ok(); + EIGEN_STRONG_INLINE bool ok() const { return queue_stream()->ok(); } + + EIGEN_STRONG_INLINE bool has_local_memory() const { + return queue_stream()->has_local_memory(); + } + EIGEN_STRONG_INLINE long max_buffer_size() const { + return queue_stream()->max_buffer_size(); + } + EIGEN_STRONG_INLINE std::string getPlatformName() const { + return queue_stream()->getPlatformName(); + } + EIGEN_STRONG_INLINE std::string getDeviceName() const { + return queue_stream()->getDeviceName(); + } + EIGEN_STRONG_INLINE std::string getDeviceVendor() const { + return queue_stream()->getDeviceVendor(); } }; -// This is used as a distingushable device inside the kernel as the sycl device class is not Standard layout. -// This is internal and must not be used by user. This dummy device allow us to specialise the tensor evaluator -// inside the kernel. So we can have two types of eval for host and device. This is required for TensorArgMax operation -struct SyclKernelDevice:DefaultDevice{}; - } // end namespace Eigen #endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H -- cgit v1.2.3
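Usage sketch (illustrative, not part of the patch): with these changes in place,
the device interface is driven from host code roughly as follows. It only uses
members visible in the diff above; the selector choice, names and sizes are
placeholders, and the translation unit is assumed to be built with a SYCL
toolchain.

    #define EIGEN_USE_SYCL 1  // usually passed on the compiler command line
    #include <unsupported/Eigen/CXX11/Tensor>
    #include <vector>

    int main() {
      // Any device accepted by get_sycl_supported_devices() works; the default
      // selector is used here for brevity.
      cl::sycl::default_selector selector;
      Eigen::QueueInterface queue_interface(selector);
      Eigen::SyclDevice device(&queue_interface);

      const size_t n = 1024;
      std::vector<float> host_in(n, 1.0f), host_out(n, 0.0f);

      // allocate() returns a virtual pointer that keys the underlying SYCL buffer.
      float *gpu_data = static_cast<float *>(device.allocate(n * sizeof(float)));

      // No callback supplied, so both copies block until completion.
      device.memcpyHostToDevice(gpu_data, host_in.data(), n * sizeof(float));
      device.memcpyDeviceToHost(host_out.data(), gpu_data, n * sizeof(float));
      device.synchronize();

      device.deallocate(gpu_data);
      return 0;
    }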