From 7318daf887c4f06fa62e59e29fa675e48ad168f9 Mon Sep 17 00:00:00 2001
From: Mehdi Goli <mehdi.goli@codeplay.com>
Date: Fri, 25 Nov 2016 16:19:07 +0000
Subject: Fixing LLVM error on TensorMorphingSycl.h on GPU; fixing int64_t
 crash for tensor_broadcast_sycl on GPU; adding get_sycl_supported_devices()
 on syclDevice.h.

---
 Eigen/src/Core/util/Macros.h                       |  4 +-
 unsupported/Eigen/CXX11/Tensor                     |  2 +
 .../Eigen/CXX11/src/Tensor/TensorDeviceSycl.h      | 54 +++++++-------
 .../Eigen/CXX11/src/Tensor/TensorDimensions.h      | 15 +++-
 .../CXX11/src/Tensor/TensorForwardDeclarations.h   |  9 +++
 unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h  |  8 +++
 .../Eigen/CXX11/src/Tensor/TensorReduction.h       |  6 ++
 .../Eigen/CXX11/src/Tensor/TensorReductionSycl.h   | 48 ++++++++-----
 unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h    |  4 +-
 .../Eigen/CXX11/src/Tensor/TensorSyclFunctors.h    | 83 ++++++++++++++++++++++
 .../Eigen/CXX11/src/Tensor/TensorSyclTuple.h       |  3 +
 unsupported/test/cxx11_tensor_broadcast_sycl.cpp   | 13 +---
 unsupported/test/cxx11_tensor_builtins_sycl.cpp    | 15 ++--
 unsupported/test/cxx11_tensor_device_sycl.cpp      |  8 +--
 unsupported/test/cxx11_tensor_forced_eval_sycl.cpp |  8 +--
 unsupported/test/cxx11_tensor_morphing_sycl.cpp    |  9 +--
 unsupported/test/cxx11_tensor_reduction_sycl.cpp   |  8 +--
 unsupported/test/cxx11_tensor_sycl.cpp             |  9 +--
 18 files changed, 203 insertions(+), 103 deletions(-)
 create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h

diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h
index af3e4b5ef..df7cef051 100644
--- a/Eigen/src/Core/util/Macros.h
+++ b/Eigen/src/Core/util/Macros.h
@@ -400,10 +400,12 @@
 // Does the compiler support variadic templates?
 #ifndef EIGEN_HAS_VARIADIC_TEMPLATES
 #if EIGEN_MAX_CPP_VER>=11 && (__cplusplus > 199711L || EIGEN_COMP_MSVC >= 1900) \
-  && ( defined(__SYCL_DEVICE_ONLY__) ||  !defined(__NVCC__) || !EIGEN_ARCH_ARM_OR_ARM64 || (defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000) )
+  && (!defined(__NVCC__) || !EIGEN_ARCH_ARM_OR_ARM64 || (defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000) )
     // ^^ Disable the use of variadic templates when compiling with versions of nvcc older than 8.0 on ARM devices:
     //    this prevents nvcc from crashing when compiling Eigen on Tegra X1
 #define EIGEN_HAS_VARIADIC_TEMPLATES 1
+#elif  EIGEN_MAX_CPP_VER>=11 && (__cplusplus > 199711L || EIGEN_COMP_MSVC >= 1900) && defined(__SYCL_DEVICE_ONLY__)
+#define EIGEN_HAS_VARIADIC_TEMPLATES 1
 #else
 #define EIGEN_HAS_VARIADIC_TEMPLATES 0
 #endif
diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor
index 8b36093f0..2ee9e11a9 100644
--- a/unsupported/Eigen/CXX11/Tensor
+++ b/unsupported/Eigen/CXX11/Tensor
@@ -82,6 +82,8 @@ typedef unsigned __int64 uint64_t;
 #endif
 #endif
 
+// tuple construction
+#include "src/Tensor/TensorSyclTuple.h"
 #include "src/Tensor/TensorMacros.h"
 #include "src/Tensor/TensorForwardDeclarations.h"
 #include "src/Tensor/TensorMeta.h"
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
index c1a27b5d6..c0d94b4eb 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
@@ -17,6 +17,23 @@
 
 namespace Eigen {
 
+auto get_sycl_supported_devices()->decltype(cl::sycl::device::get_devices()){
+  auto devices = cl::sycl::device::get_devices();
+  std::vector<cl::sycl::device>::iterator it =devices.begin();
+  while(it!=devices.end()) {
+    /// get_devices returns all the available opencl devices. Either use device_selector or exclude devices that computecpp does not support (AMD OpenCL for CPU )
+    auto s=  (*it).template get_info<cl::sycl::info::device::vendor>();
+    std::transform(s.begin(), s.end(), s.begin(), ::tolower);
+    if((*it).is_cpu() && s.find("amd")!=std::string::npos){
+      it=devices.erase(it);
+    }
+    else{
+      ++it;
+    }
+  }
+  printf("Device size %ld\n", devices.size());
+  return devices;
+}
 #define ConvertToActualTypeSycl(T, buf_acc) reinterpret_cast<typename cl::sycl::global_ptr<T>::pointer_t>((&(*buf_acc.get_pointer())))
 
 struct QueueInterface {
@@ -109,27 +126,6 @@ struct QueueInterface {
   ~QueueInterface() { buffer_map.clear(); }
 };
 
-template <typename T> class MemCopyFunctor {
- public:
-  typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::read, cl::sycl::access::target::global_buffer> read_accessor;
-  typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer> write_accessor;
-  MemCopyFunctor(read_accessor src_acc, write_accessor dst_acc, size_t rng, size_t i, size_t offset): m_src_acc(src_acc), m_dst_acc(dst_acc), m_rng(rng), m_i(i), m_offset(offset) {}
-  void operator()(cl::sycl::nd_item<1> itemID) {
-    auto src_ptr = ConvertToActualTypeSycl(T, m_src_acc);
-    auto dst_ptr = ConvertToActualTypeSycl(T, m_dst_acc);
-    auto globalid = itemID.get_global_linear_id();
-    if (globalid < m_rng) {
-      dst_ptr[globalid + m_i] = src_ptr[globalid + m_offset];
-    }
-  }
- private:
-  read_accessor m_src_acc;
-  write_accessor m_dst_acc;
-  size_t m_rng;
-  size_t m_i;
-  size_t m_offset;
-};
-
 struct SyclDevice {
   // class member.
   QueueInterface* m_queue_stream;
@@ -150,16 +146,16 @@ struct SyclDevice {
   }
 
   /// This is used to prepare the number of threads and also the number of threads per block for sycl kernels
-  template<typename T>
-  EIGEN_STRONG_INLINE void parallel_for_setup(T n, T &tileSize, T &rng, T &GRange)  const {
-    tileSize =static_cast<T>(sycl_queue().get_device(). template get_info<cl::sycl::info::device::max_work_group_size>()/2);
+  template<typename Index>
+  EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize, Index &rng, Index &GRange)  const {
+    tileSize =static_cast<Index>(sycl_queue().get_device(). template get_info<cl::sycl::info::device::max_work_group_size>()/2);
     rng = n;
-    if (rng==0) rng=static_cast<T>(1);
+    if (rng==0) rng=static_cast<Index>(1);
     GRange=rng;
     if (tileSize>GRange) tileSize=GRange;
     else if(GRange>tileSize){
-      T xMode =  static_cast<T>(GRange % tileSize);
-      if (xMode != 0) GRange += static_cast<T>(tileSize - xMode);
+      Index xMode =  static_cast<Index>(GRange % tileSize);
+      if (xMode != 0) GRange += static_cast<Index>(tileSize - xMode);
     }
   }
   /// allocate device memory
@@ -188,7 +184,7 @@ struct SyclDevice {
     sycl_queue().submit([&](cl::sycl::handler &cgh) {
       auto src_acc =it1->second.template get_access<cl::sycl::access::mode::read, cl::sycl::access::target::global_buffer>(cgh);
       auto dst_acc =it2->second.template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer>(cgh);
-      cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor<T>(src_acc, dst_acc, rng, 0, offset));
+      cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), TensorSycl::internal::MemCopyFunctor<T>(src_acc, dst_acc, rng, 0, offset));
     });
     sycl_queue().throw_asynchronous();
   }
@@ -219,7 +215,7 @@ struct SyclDevice {
     sycl_queue().submit([&](cl::sycl::handler &cgh) {
       auto src_acc= it->second.template get_access<cl::sycl::access::mode::read, cl::sycl::access::target::global_buffer>(cgh);
       auto dst_acc =dest_buf.template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer>(cgh);
-      cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor<T>(src_acc, dst_acc, rng, 0, offset));
+      cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), TensorSycl::internal::MemCopyFunctor<T>(src_acc, dst_acc, rng, 0, offset));
     });
     sycl_queue().throw_asynchronous();
   }
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h
index b24cdebf1..ca45b542e 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h
@@ -68,7 +68,13 @@ struct fixed_size_tensor_index_extraction_helper
                           const Dimensions& dimensions)
   {
     const Index mult = (index == n-1) ? 1 : 0;
-    return array_get<n-1>(dimensions) * mult +
+    return
+#ifdef EIGEN_USE_SYCL
+      utility::tuple::get<n-1>(dimensions)
+#else
+      array_get<n-1>(dimensions)
+#endif
+      * mult +
         fixed_size_tensor_index_extraction_helper<Index, n - 1>::run(index, dimensions);
   }
 };
@@ -92,6 +98,9 @@ struct fixed_size_tensor_index_extraction_helper<Index, 0>
 template <typename std::ptrdiff_t... Indices>
 struct Sizes : internal::numeric_list<std::ptrdiff_t, Indices...> {
   typedef internal::numeric_list<std::ptrdiff_t, Indices...> Base;
+  #ifdef EIGEN_USE_SYCL
+  const decltype(utility::tuple::make_tuple(Indices...)) t= utility::tuple::make_tuple(Indices...);
+  #endif
   static const std::ptrdiff_t total_size = internal::arg_prod(Indices...);
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t rank() const {
@@ -120,7 +129,11 @@ struct Sizes : internal::numeric_list<std::ptrdiff_t, Indices...> {
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t operator[] (const std::size_t index) const {
+#ifdef EIGEN_USE_SYCL
+    return internal::fixed_size_tensor_index_extraction_helper<std::ptrdiff_t, Base::count>::run(index, t);
+#else
     return internal::fixed_size_tensor_index_extraction_helper<std::ptrdiff_t, Base::count>::run(index, *this);
+#endif
   }
 
   template <typename DenseIndex> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h
index 52b803d7f..8582f7d8a 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h
@@ -21,6 +21,15 @@ namespace Eigen {
 template<typename T> struct MakePointer {
   typedef T* Type;
 };
+#if defined(EIGEN_USE_SYCL)
+namespace TensorSycl {
+namespace internal{
+template < typename HostExpr, typename PlaceHolderExpr, typename FunctorExpr, typename Tuple_of_Acc, typename Dims, typename Op, typename Index> class ReductionFunctor;
+template <typename T> class MemCopyFunctor;
+}
+}
+#endif
+
 
 template<typename PlainObjectType, int Options_ = Unaligned, template <class> class MakePointer_ = MakePointer> class TensorMap;
 template<typename Scalar_, int NumIndices_, int Options_ = 0, typename IndexType = DenseIndex> class Tensor;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
index ede3939c2..eea25ac33 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
@@ -37,6 +37,8 @@ namespace {
   {
 #ifdef __CUDA_ARCH__
     return __clz(val);
+#elif defined(__SYCL_DEVICE_ONLY__)
+    return cl::sycl::clz(val);
 #elif EIGEN_COMP_MSVC
     unsigned long index;
     _BitScanReverse(&index, val);
@@ -53,6 +55,8 @@ namespace {
   {
 #ifdef __CUDA_ARCH__
     return __clzll(val);
+#elif defined(__SYCL_DEVICE_ONLY__)
+    return cl::sycl::clz(val);
 #elif EIGEN_COMP_MSVC && EIGEN_ARCH_x86_64
     unsigned long index;
     _BitScanReverse64(&index, val);
@@ -88,6 +92,8 @@ namespace {
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint32_t muluh(const uint32_t a, const T b) {
 #if defined(__CUDA_ARCH__)
     return __umulhi(a, b);
+#elif defined(__SYCL_DEVICE_ONLY__)
+    return cl::sycl::mul_hi(a, static_cast<uint32_t>(b));
 #else
     return (static_cast<uint64_t>(a) * b) >> 32;
 #endif
@@ -97,6 +103,8 @@ namespace {
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t muluh(const uint64_t a, const T b) {
 #if defined(__CUDA_ARCH__)
     return __umul64hi(a, b);
+#elif defined(__SYCL_DEVICE_ONLY__)
+    return cl::sycl::mul_hi(a, static_cast<uint64_t>(b));
 #elif defined(__SIZEOF_INT128__)
     __uint128_t v = static_cast<__uint128_t>(a) * static_cast<__uint128_t>(b);
     return static_cast<uint64_t>(v >> 64);
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
index 41d0d0022..75518a854 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -13,6 +13,7 @@
 
 namespace Eigen {
 
+
 /** \class TensorReduction
   * \ingroup CXX11_Tensor_Module
   *
@@ -691,6 +692,11 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
   template <int NPT, typename S, typename R, typename I> friend void internal::OuterReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);
 #endif
 
+#if defined(EIGEN_USE_SYCL)
+ template < typename HostExpr_, typename PlaceHolderExpr_, typename FunctorExpr_, typename Tuple_of_Acc_, typename Dims_, typename Op_, typename Index_> friend class TensorSycl::internal::ReductionFunctor;
+
+#endif
+
   template <typename S, typename O, typename D> friend struct internal::InnerReducer;
 
   // Returns the Index in the input tensor of the first value that needs to be
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
index 2f7468d56..00f8b70ed 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
@@ -25,6 +25,7 @@
 namespace Eigen {
 namespace internal {
 
+
 template<typename CoeffReturnType, typename KernelName> struct syclGenericBufferReducer{
 template<typename BufferTOut, typename BufferTIn>
 static void run(BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){
@@ -180,6 +181,7 @@ struct FullReducer<Self, Op, const Eigen::SyclDevice, Vectorizable> {
 
 };
 
+
 template <typename Self, typename Op>
 struct InnerReducer<Self, Op, const Eigen::SyclDevice> {
 
@@ -190,42 +192,50 @@ struct InnerReducer<Self, Op, const Eigen::SyclDevice> {
     typedef const typename Self::ChildType HostExpr; /// this is the child of reduction
     typedef  typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
     auto functors = TensorSycl::internal::extractFunctors(self.impl());
+    typedef decltype(functors) FunctorExpr;
     typename Self::Index range, GRange, tileSize;
-    dev.parallel_for_setup(num_coeffs_to_preserve, tileSize, range, GRange);
+    typedef typename Eigen::internal::remove_all<decltype(self.xprDims())>::type Dims;
+
     // getting final out buffer at the moment the created buffer is true because there is no need for assign
     /// creating the shared memory for calculating reduction.
     /// This one is used to collect all the reduced value of shared memory as we dont have global barrier on GPU. Once it is saved we can
     /// recursively apply reduction on it in order to reduce the whole.
-    typedef typename Eigen::internal::remove_all<decltype(self.xprDims())>::type Dims;
-    Dims dims= self.xprDims();
-    Op functor = reducer;
+  //  Dims dims= self.xprDims();
+    //Op functor = reducer;
 
+      dev.parallel_for_setup(num_coeffs_to_preserve, tileSize, range, GRange);
       dev.sycl_queue().submit([&](cl::sycl::handler &cgh) {
       // create a tuple of accessors from Evaluator
       auto tuple_of_accessors =  TensorSycl::internal::createTupleOfAccessors(cgh, self.impl());
+      typedef typename Eigen::internal::remove_all<decltype(tuple_of_accessors)>::type Tuple_of_Acc;
       auto output_accessor = dev.template get_sycl_accessor<cl::sycl::access::mode::discard_write>(cgh, output);
 
-      cgh.parallel_for<Self>( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), [=](cl::sycl::nd_item<1> itemID) {
-        typedef typename TensorSycl::internal::ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
-        auto device_expr = TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
+      cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)),
+      TensorSycl::internal::ReductionFunctor<HostExpr, PlaceHolderExpr,  FunctorExpr, Tuple_of_Acc, Dims, Op, typename Self::Index>
+      (output_accessor, functors, tuple_of_accessors, self.xprDims(), reducer, range));
+
+
+  //     [=](cl::sycl::nd_item<1> itemID) {
+  //      typedef typename TensorSycl::internal::ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
+  //      auto device_expr = TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
         /// reduction cannot be captured automatically through our device conversion recursion. The reason is that reduction has two behaviour
         /// the first behaviour is when it is used as a root to lauch the sub-kernel. The second one is when it is treated as a leafnode to pass the
         /// calculated result to its parent kernel. While the latter is automatically detected through our device expression generator. The former is created here.
-        const auto device_self_expr= TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, functor);
+  //      const auto device_self_expr= TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, functor);
         /// This is the evaluator for device_self_expr. This is exactly similar to the self which has been passed to run function. The difference is
         /// the device_evaluator is detectable and recognisable on the device.
-        typedef Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice> DeviceSelf;
-        auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice());
-        auto output_accessor_ptr =ConvertToActualTypeSycl(typename DeviceSelf::CoeffReturnType, output_accessor);
+  //      typedef Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice> DeviceSelf;
+  //      auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice());
+  //      auto output_accessor_ptr =ConvertToActualTypeSycl(typename DeviceSelf::CoeffReturnType, output_accessor);
         /// const cast added as a naive solution to solve the qualifier drop error
-        auto globalid=itemID.get_global_linear_id();
-        if (globalid< range) {
-          typename DeviceSelf::CoeffReturnType accum = functor.initialize();
-          GenericDimReducer<DeviceSelf::NumReducedDims-1, DeviceSelf, Op>::reduce(device_self_evaluator, device_self_evaluator.firstInput(static_cast<typename DevExpr::Index>(globalid)),const_cast<Op&>(functor), &accum);
-          functor.finalize(accum);
-          output_accessor_ptr[globalid]= accum;
-        }
-      });
+  //      auto globalid=itemID.get_global_linear_id();
+  //      if (globalid< range) {
+  //        typename DeviceSelf::CoeffReturnType accum = functor.initialize();
+  //        GenericDimReducer<DeviceSelf::NumReducedDims-1, DeviceSelf, Op>::reduce(device_self_evaluator, device_self_evaluator.firstInput(static_cast<typename DevExpr::Index>(globalid)),const_cast<Op&>(functor), &accum);
+  //        functor.finalize(accum);
+  //        output_accessor_ptr[globalid]= accum;
+  //      }
+  //    });
     });
     dev.sycl_queue().throw_asynchronous();
     return false;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h
index bb8800d45..c099cb42e 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h
@@ -47,8 +47,6 @@ template<typename T> struct GetType<false, T>{
 }
 }
 
-// tuple construction
-#include "TensorSyclTuple.h"
 
 // counting number of leaf at compile time
 #include "TensorSyclLeafCount.h"
@@ -77,6 +75,8 @@ template<typename T> struct GetType<false, T>{
 
 // kernel execution using fusion
 #include "TensorSyclRun.h"
+//sycl functors
+#include "TensorSyclFunctors.h"
 
 #endif  // end of EIGEN_USE_SYCL
 #endif  // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h
new file mode 100644
index 000000000..adbb2ae72
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h
@@ -0,0 +1,83 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: eigen@codeplay.com
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// General include header of SYCL target for Tensor Module
+#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCLFUNCTORS_H
+#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCLFUNCTORS_H
+
+namespace Eigen {
+namespace TensorSycl {
+namespace internal {
+
+/// ReductionFunctor
+template < typename HostExpr, typename PlaceHolderExpr, typename FunctorExpr, typename Tuple_of_Acc, typename Dims, typename Op, typename Index> class ReductionFunctor {
+ public:
+  typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer> write_accessor;
+  ReductionFunctor(write_accessor output_accessor_, FunctorExpr functors_, Tuple_of_Acc tuple_of_accessors_,Dims dims_, Op functor_, Index range_)
+  :output_accessor(output_accessor_), functors(functors_), tuple_of_accessors(tuple_of_accessors_), dims(dims_), functor(functor_), range(range_) {}
+  void operator()(cl::sycl::nd_item<1> itemID) {
+
+    typedef typename ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
+    auto device_expr = createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
+    /// reduction cannot be captured automatically through our device conversion recursion. The reason is that reduction has two behaviour
+    /// the first behaviour is when it is used as a root to lauch the sub-kernel. The second one is when it is treated as a leafnode to pass the
+    /// calculated result to its parent kernel. While the latter is automatically detected through our device expression generator. The former is created here.
+    const auto device_self_expr= Eigen::TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, functor);
+    /// This is the evaluator for device_self_expr. This is exactly similar to the self which has been passed to run function. The difference is
+    /// the device_evaluator is detectable and recognisable on the device.
+    typedef Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice> DeviceSelf;
+    auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice());
+    auto output_accessor_ptr =ConvertToActualTypeSycl(typename DeviceSelf::CoeffReturnType, output_accessor);
+    /// const cast added as a naive solution to solve the qualifier drop error
+    auto globalid=static_cast<Index>(itemID.get_global_linear_id());
+    if (globalid< range) {
+      typename DeviceSelf::CoeffReturnType accum = functor.initialize();
+      Eigen::internal::GenericDimReducer<DeviceSelf::NumReducedDims-1, DeviceSelf, Op>::reduce(device_self_evaluator, device_self_evaluator.firstInput(static_cast<typename DevExpr::Index>(globalid)),const_cast<Op&>(functor), &accum);
+      functor.finalize(accum);
+      output_accessor_ptr[globalid]= accum;
+    }
+  }
+ private:
+  write_accessor output_accessor;
+  FunctorExpr functors;
+  Tuple_of_Acc tuple_of_accessors;
+  Dims dims;
+  Op functor;
+  Index range;
+};
+
+/// Memcopyfuncdeveicetohost
+template <typename T> class MemCopyFunctor {
+ public:
+  typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::read, cl::sycl::access::target::global_buffer> read_accessor;
+  typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer> write_accessor;
+  MemCopyFunctor(read_accessor src_acc, write_accessor dst_acc, size_t rng, size_t i, size_t offset): m_src_acc(src_acc), m_dst_acc(dst_acc), m_rng(rng), m_i(i), m_offset(offset) {}
+  void operator()(cl::sycl::nd_item<1> itemID) {
+    auto src_ptr = ConvertToActualTypeSycl(T, m_src_acc);
+    auto dst_ptr = ConvertToActualTypeSycl(T, m_dst_acc);
+    auto globalid = itemID.get_global_linear_id();
+    if (globalid < m_rng) {
+      dst_ptr[globalid + m_i] = src_ptr[globalid + m_offset];
+    }
+  }
+ private:
+  read_accessor m_src_acc;
+  write_accessor m_dst_acc;
+  size_t m_rng;
+  size_t m_i;
+  size_t m_offset;
+};
+
+}
+}
+}
+#endif  // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCLFUNCTORS_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclTuple.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclTuple.h
index 063b027e8..c0e48524d 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclTuple.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclTuple.h
@@ -20,6 +20,8 @@
 
 #ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_TUPLE_HPP
 #define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_TUPLE_HPP
+#ifdef EIGEN_USE_SYCL
+
 namespace utility {
 namespace tuple {
 /// \struct StaticIf
@@ -231,4 +233,5 @@ Tuple<Args1..., Args2...> append(Tuple<Args1...> t1,Tuple<Args2...> t2) {
 }
 }  // tuple
 }  // utility
+#endif //EIGEN_USE_SYCL
 #endif  // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_TUPLE_HPP
diff --git a/unsupported/test/cxx11_tensor_broadcast_sycl.cpp b/unsupported/test/cxx11_tensor_broadcast_sycl.cpp
index 752a61f8a..c426549f1 100644
--- a/unsupported/test/cxx11_tensor_broadcast_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_broadcast_sycl.cpp
@@ -136,21 +136,14 @@ template<typename DataType> void sycl_broadcast_test_per_device(const cl::sycl::
   test_broadcast_sycl<DataType, RowMajor, int>(sycl_device);
   test_broadcast_sycl_fixed<DataType, ColMajor, int>(sycl_device);
   test_broadcast_sycl<DataType, ColMajor, int>(sycl_device);
-
-
   test_broadcast_sycl<DataType, RowMajor, int64_t>(sycl_device);
   test_broadcast_sycl<DataType, ColMajor, int64_t>(sycl_device);
-  // the folowing two test breaks the intel gpu and amd gpu driver (cannot create opencl kernel)
-  // test_broadcast_sycl_fixed<DataType, RowMajor, int64_t>(sycl_device);
-  // test_broadcast_sycl_fixed<DataType, ColMajor, int64_t>(sycl_device);
+  test_broadcast_sycl_fixed<DataType, RowMajor, int64_t>(sycl_device);
+  test_broadcast_sycl_fixed<DataType, ColMajor, int64_t>(sycl_device);
 }
 
 void test_cxx11_tensor_broadcast_sycl() {
-  for (const auto& device : cl::sycl::device::get_devices()) {
-    /// get_devices returns all the available opencl devices. Either use device_selector or exclude devices that computecpp does not support (AMD OpenCL for CPU )
-    auto s=  device.template get_info<cl::sycl::info::device::vendor>();
-    std::transform(s.begin(), s.end(), s.begin(), ::tolower);
-    if(!device.is_cpu() || s.find("amd")==std::string::npos)
+  for (const auto& device :Eigen::get_sycl_supported_devices()) {
     CALL_SUBTEST(sycl_broadcast_test_per_device<float>(device));
   }
 }
diff --git a/unsupported/test/cxx11_tensor_builtins_sycl.cpp b/unsupported/test/cxx11_tensor_builtins_sycl.cpp
index dd739f470..d8c2898ca 100644
--- a/unsupported/test/cxx11_tensor_builtins_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_builtins_sycl.cpp
@@ -264,15 +264,10 @@ static void test_builtin_binary_sycl(const Eigen::SyclDevice &sycl_device) {
 }
 
 void test_cxx11_tensor_builtins_sycl() {
-  for (const auto& device : cl::sycl::device::get_devices()) {
-    /// get_devices returns all the available opencl devices. Either use device_selector or exclude devices that computecpp does not support (AMD OpenCL for CPU )
-    auto s=  device.template get_info<cl::sycl::info::device::vendor>();
-    std::transform(s.begin(), s.end(), s.begin(), ::tolower);
-    if(!device.is_cpu() || s.find("amd")==std::string::npos){
-      QueueInterface queueInterface(device);
-      Eigen::SyclDevice sycl_device(&queueInterface);
-      CALL_SUBTEST(test_builtin_unary_sycl(sycl_device));
-      CALL_SUBTEST(test_builtin_binary_sycl(sycl_device));
-    }
+  for (const auto& device :Eigen::get_sycl_supported_devices()) {
+    QueueInterface queueInterface(device);
+    Eigen::SyclDevice sycl_device(&queueInterface);
+    CALL_SUBTEST(test_builtin_unary_sycl(sycl_device));
+    CALL_SUBTEST(test_builtin_binary_sycl(sycl_device));
   }
 }
diff --git a/unsupported/test/cxx11_tensor_device_sycl.cpp b/unsupported/test/cxx11_tensor_device_sycl.cpp
index 7f9372c04..190dba862 100644
--- a/unsupported/test/cxx11_tensor_device_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_device_sycl.cpp
@@ -71,11 +71,7 @@ template<typename DataType> void sycl_device_test_per_device(const cl::sycl::dev
 }
 
 void test_cxx11_tensor_device_sycl() {
-  for (const auto& device : cl::sycl::device::get_devices()) {
-    /// get_devices returns all the available opencl devices. Either use device_selector or exclude devices that computecpp does not support (AMD OpenCL for CPU )
-    auto s=  device.template get_info<cl::sycl::info::device::vendor>();
-    std::transform(s.begin(), s.end(), s.begin(), ::tolower);
-    if(!device.is_cpu() || s.find("amd")==std::string::npos)
-      CALL_SUBTEST(sycl_device_test_per_device<float>(device));
+  for (const auto& device :Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_device_test_per_device<float>(device));
   }
 }
diff --git a/unsupported/test/cxx11_tensor_forced_eval_sycl.cpp b/unsupported/test/cxx11_tensor_forced_eval_sycl.cpp
index 4ff218cb6..4d19a3b2a 100644
--- a/unsupported/test/cxx11_tensor_forced_eval_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_forced_eval_sycl.cpp
@@ -70,11 +70,7 @@ template <typename DataType, typename Dev_selector> void tensorForced_evalperDev
   test_forced_eval_sycl<DataType, ColMajor>(sycl_device);
 }
 void test_cxx11_tensor_forced_eval_sycl() {
-  for (const auto& device : cl::sycl::device::get_devices()) {
-    /// get_devices returns all the available opencl devices. Either use device_selector or exclude devices that computecpp does not support (AMD OpenCL for CPU )
-    auto s=  device.template get_info<cl::sycl::info::device::vendor>();
-    std::transform(s.begin(), s.end(), s.begin(), ::tolower);
-    if(!device.is_cpu() || s.find("amd")==std::string::npos)
-      CALL_SUBTEST(tensorForced_evalperDevice<float>(device));
+  for (const auto& device :Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(tensorForced_evalperDevice<float>(device));
   }
 }
diff --git a/unsupported/test/cxx11_tensor_morphing_sycl.cpp b/unsupported/test/cxx11_tensor_morphing_sycl.cpp
index 4ca73ea1b..9074c8331 100644
--- a/unsupported/test/cxx11_tensor_morphing_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_morphing_sycl.cpp
@@ -82,12 +82,7 @@ template<typename DataType, typename dev_Selector> void sycl_slicing_test_per_de
 }
 void test_cxx11_tensor_morphing_sycl()
 {
-  for (const auto& device : cl::sycl::device::get_devices()) {
-    /// get_devices returns all the available opencl devices. Either use device_selector or exclude devices that computecpp does not support (AMD OpenCL for CPU )
-    /// Currentlly it only works on cpu. Adding GPU cause LLVM ERROR in cunstructing OpenCL Kernel at runtime.
-    auto s=  device.template get_info<cl::sycl::info::device::vendor>();
-    std::transform(s.begin(), s.end(), s.begin(), ::tolower);
-    if(device.is_cpu() && s.find("amd")==std::string::npos)
-      CALL_SUBTEST(sycl_slicing_test_per_device<float>(device));
+  for (const auto& device :Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_slicing_test_per_device<float>(device));
   }
 }
diff --git a/unsupported/test/cxx11_tensor_reduction_sycl.cpp b/unsupported/test/cxx11_tensor_reduction_sycl.cpp
index 32cfb94c2..941469029 100644
--- a/unsupported/test/cxx11_tensor_reduction_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_reduction_sycl.cpp
@@ -141,11 +141,7 @@ template<typename DataType> void sycl_reduction_test_per_device(const cl::sycl::
   test_last_dim_reductions_sycl<DataType, ColMajor>(sycl_device);
 }
 void test_cxx11_tensor_reduction_sycl() {
-  for (const auto& device : cl::sycl::device::get_devices()) {
-    /// get_devices returns all the available opencl devices. Either use device_selector or exclude devices that computecpp does not support (AMD OpenCL for CPU )
-    auto s=  device.template get_info<cl::sycl::info::device::vendor>();
-    std::transform(s.begin(), s.end(), s.begin(), ::tolower);
-    if(!device.is_cpu() || s.find("amd")==std::string::npos)
-      CALL_SUBTEST(sycl_reduction_test_per_device<float>(device));
+  for (const auto& device :Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_reduction_test_per_device<float>(device));
   }
 }
diff --git a/unsupported/test/cxx11_tensor_sycl.cpp b/unsupported/test/cxx11_tensor_sycl.cpp
index 670b5f379..150414f15 100644
--- a/unsupported/test/cxx11_tensor_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_sycl.cpp
@@ -197,11 +197,8 @@ template<typename DataType, typename dev_Selector> void sycl_computing_test_per_
   test_sycl_computations<DataType, ColMajor>(sycl_device);
 }
 void test_cxx11_tensor_sycl() {
-  for (const auto& device : cl::sycl::device::get_devices()) {
-    /// get_devices returns all the available opencl devices. Either use device_selector or exclude devices that computecpp does not support (AMD OpenCL for CPU )
-    auto s=  device.template get_info<cl::sycl::info::device::vendor>();
-    std::transform(s.begin(), s.end(), s.begin(), ::tolower);
-    if(!device.is_cpu() || s.find("amd")==std::string::npos)
-      CALL_SUBTEST(sycl_computing_test_per_device<float>(device));
+  auto devices =Eigen::get_sycl_supported_devices();
+  for (const auto& device :Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_computing_test_per_device<float>(device));
   }
 }
-- 
cgit v1.2.3