diff options
15 files changed, 91 insertions, 95 deletions
diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index 40e283bad..af3e4b5ef 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -400,7 +400,7 @@ // Does the compiler support variadic templates? #ifndef EIGEN_HAS_VARIADIC_TEMPLATES #if EIGEN_MAX_CPP_VER>=11 && (__cplusplus > 199711L || EIGEN_COMP_MSVC >= 1900) \ - && ( !defined(__NVCC__) || !EIGEN_ARCH_ARM_OR_ARM64 || (defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000) ) + && ( defined(__SYCL_DEVICE_ONLY__) || !defined(__NVCC__) || !EIGEN_ARCH_ARM_OR_ARM64 || (defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000) ) // ^^ Disable the use of variadic templates when compiling with versions of nvcc older than 8.0 on ARM devices: // this prevents nvcc from crashing when compiling Eigen on Tegra X1 #define EIGEN_HAS_VARIADIC_TEMPLATES 1 @@ -412,7 +412,7 @@ // Does the compiler fully support const expressions? (as in c++14) #ifndef EIGEN_HAS_CONSTEXPR -#ifdef __CUDACC__ +#if defined(__CUDACC__) || defined(__SYCL_DEVICE_ONLY__) // Const expressions are supported provided that c++11 is enabled and we're using either clang or nvcc 7.5 or above #if EIGEN_MAX_CPP_VER>=14 && (__cplusplus > 199711L && defined(__CUDACC_VER__) && (EIGEN_COMP_CLANG || __CUDACC_VER__ >= 70500)) #define EIGEN_HAS_CONSTEXPR 1 diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h index 05459f1d2..c1a27b5d6 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h @@ -31,7 +31,7 @@ struct QueueInterface { mutable std::map<const uint8_t *, cl::sycl::buffer<uint8_t, 1>> buffer_map; /// sycl queue mutable cl::sycl::queue m_queue; - /// creating device by using selector + /// creating device by using cl::sycl::selector or cl::sycl::device both are the same and can be captured throufh dev_Selector typename /// SyclStreamDevice is not owned. it is the caller's responsibility to destroy it. template<typename dev_Selector> explicit QueueInterface(dev_Selector s): #ifdef EIGEN_EXCEPTIONS @@ -52,28 +52,6 @@ struct QueueInterface { #endif {} - /// creating device by using selector - /// SyclStreamDevice is not owned. it is the caller's responsibility to destroy it. - explicit QueueInterface(cl::sycl::device d): -#ifdef EIGEN_EXCEPTIONS - m_queue(cl::sycl::queue(d, [&](cl::sycl::exception_list l) { - for (const auto& e : l) { - try { - if (e) { - exception_caught_ = true; - std::rethrow_exception(e); - } - } catch (cl::sycl::exception e) { - std::cerr << e.what() << std::endl; - } - } - })) -#else - m_queue(cl::sycl::queue(d)) -#endif - {} - - /// Allocating device pointer. This pointer is actually an 8 bytes host pointer used as key to access the sycl device buffer. /// The reason is that we cannot use device buffer as a pointer as a m_data in Eigen leafNode expressions. So we create a key /// pointer to be used in Eigen expression construction. When we convert the Eigen construction into the sycl construction we @@ -162,27 +140,28 @@ struct SyclDevice { /// the buffer in the buffer_map. If found it gets the accessor from it, if not, /// the function then adds an entry by creating a sycl buffer for that particular pointer. template <cl::sycl::access::mode AcMd> EIGEN_STRONG_INLINE cl::sycl::accessor<uint8_t, 1, AcMd, cl::sycl::access::target::global_buffer> - get_sycl_accessor(size_t num_bytes, cl::sycl::handler &cgh, const void* ptr) const { - return (get_sycl_buffer(num_bytes, ptr).template get_access<AcMd, cl::sycl::access::target::global_buffer>(cgh)); + get_sycl_accessor(cl::sycl::handler &cgh, const void* ptr) const { + return (get_sycl_buffer(ptr).template get_access<AcMd, cl::sycl::access::target::global_buffer>(cgh)); } /// Accessing the created sycl device buffer for the device pointer - EIGEN_STRONG_INLINE cl::sycl::buffer<uint8_t, 1>& get_sycl_buffer(size_t , const void * ptr) const { + EIGEN_STRONG_INLINE cl::sycl::buffer<uint8_t, 1>& get_sycl_buffer(const void * ptr) const { return m_queue_stream->find_buffer(ptr)->second; } /// This is used to prepare the number of threads and also the number of threads per block for sycl kernels - EIGEN_STRONG_INLINE void parallel_for_setup(size_t n, size_t &tileSize, size_t &rng, size_t &GRange) const { - tileSize =sycl_queue().get_device(). template get_info<cl::sycl::info::device::max_work_group_size>()/2; - rng = n; - if (rng==0) rng=1; - GRange=rng; - if (tileSize>GRange) tileSize=GRange; - else if(GRange>tileSize){ - size_t xMode = GRange % tileSize; - if (xMode != 0) GRange += (tileSize - xMode); - } + template<typename T> + EIGEN_STRONG_INLINE void parallel_for_setup(T n, T &tileSize, T &rng, T &GRange) const { + tileSize =static_cast<T>(sycl_queue().get_device(). template get_info<cl::sycl::info::device::max_work_group_size>()/2); + rng = n; + if (rng==0) rng=static_cast<T>(1); + GRange=rng; + if (tileSize>GRange) tileSize=GRange; + else if(GRange>tileSize){ + T xMode = static_cast<T>(GRange % tileSize); + if (xMode != 0) GRange += static_cast<T>(tileSize - xMode); } + } /// allocate device memory EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const { return m_queue_stream->allocate(num_bytes); @@ -220,7 +199,7 @@ struct SyclDevice { /// buffer to host. Then we use the memcpy to copy the data to the host accessor. The first time that /// this buffer is accessed, the data will be copied to the device. template<typename T> EIGEN_STRONG_INLINE void memcpyHostToDevice(T *dst, const T *src, size_t n) const { - auto host_acc= get_sycl_buffer(n, dst). template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::host_buffer>(); + auto host_acc= get_sycl_buffer(dst). template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::host_buffer>(); ::memcpy(host_acc.get_pointer(), src, n); } /// The memcpyDeviceToHost is used to copy the data from host to device. Here, in order to avoid double copying the data. We create a sycl @@ -251,10 +230,10 @@ struct SyclDevice { size_t rng, GRange, tileSize; parallel_for_setup(n/sizeof(T), tileSize, rng, GRange); sycl_queue().submit([&](cl::sycl::handler &cgh) { - auto buf_acc =get_sycl_buffer(n, static_cast<uint8_t*>(static_cast<void*>(buff))). template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer>(cgh); + auto buf_acc =get_sycl_buffer(static_cast<uint8_t*>(static_cast<void*>(buff))). template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer>(cgh); cgh.parallel_for<SyclDevice>( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), [=](cl::sycl::nd_item<1> itemID) { auto globalid=itemID.get_global_linear_id(); - if (globalid< buf_acc.get_size()) { + if (globalid< n) { for(size_t i=0; i<sizeof(T); i++) buf_acc[globalid*sizeof(T) + i] = c; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h index f293869ee..2f7468d56 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h @@ -135,8 +135,7 @@ struct FullReducer<Self, Op, const Eigen::SyclDevice, Vectorizable> { /// if the shared memory is less than the GRange, we set shared_mem size to the TotalSize and in this case one kernel would be created for recursion to reduce all to one. if (GRange < outTileSize) outTileSize=GRange; // getting final out buffer at the moment the created buffer is true because there is no need for assign -// auto out_buffer =dev.template get_sycl_buffer<typename Eigen::internal::remove_all<CoeffReturnType>::type>(self.dimensions().TotalSize(), output); - auto out_buffer =dev.get_sycl_buffer(self.dimensions().TotalSize(), output); + auto out_buffer =dev.get_sycl_buffer(output); /// creating the shared memory for calculating reduction. /// This one is used to collect all the reduced value of shared memory as we dont have global barrier on GPU. Once it is saved we can @@ -191,7 +190,7 @@ struct InnerReducer<Self, Op, const Eigen::SyclDevice> { typedef const typename Self::ChildType HostExpr; /// this is the child of reduction typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr; auto functors = TensorSycl::internal::extractFunctors(self.impl()); - size_t range, GRange, tileSize; + typename Self::Index range, GRange, tileSize; dev.parallel_for_setup(num_coeffs_to_preserve, tileSize, range, GRange); // getting final out buffer at the moment the created buffer is true because there is no need for assign /// creating the shared memory for calculating reduction. @@ -204,7 +203,7 @@ struct InnerReducer<Self, Op, const Eigen::SyclDevice> { dev.sycl_queue().submit([&](cl::sycl::handler &cgh) { // create a tuple of accessors from Evaluator auto tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl()); - auto output_accessor = dev.template get_sycl_accessor<cl::sycl::access::mode::discard_write>(num_coeffs_to_preserve,cgh, output); + auto output_accessor = dev.template get_sycl_accessor<cl::sycl::access::mode::discard_write>(cgh, output); cgh.parallel_for<Self>( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), [=](cl::sycl::nd_item<1> itemID) { typedef typename TensorSycl::internal::ConvertToDeviceExpression<const HostExpr>::Type DevExpr; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h index bb847afad..0336c9866 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h @@ -48,9 +48,9 @@ struct DeviceConvertor{ /// specialisation of the \ref ConvertToDeviceExpression struct when the node /// type is TensorMap #define TENSORMAPCONVERT(CVQual)\ -template <typename T, int Options2_, template <class> class MakePointer_>\ -struct ConvertToDeviceExpression<CVQual TensorMap<T, Options2_, MakePointer_> > {\ - typedef CVQual TensorMap<T, Options2_, MakeGlobalPointer> Type;\ +template <typename T, int Options_, template <class> class MakePointer_>\ +struct ConvertToDeviceExpression<CVQual TensorMap<T, Options_, MakePointer_> > {\ + typedef CVQual TensorMap<T, Options_, MakeGlobalPointer> Type;\ }; TENSORMAPCONVERT(const) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h index d7551d94f..3d3142996 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h @@ -46,11 +46,11 @@ struct ExprConstructor; /// specialisation of the \ref ExprConstructor struct when the node type is /// TensorMap #define TENSORMAP(CVQual)\ -template <typename T, int Options2_, int Options3_,\ +template <typename T, int Options_,\ template <class> class MakePointer_, size_t N, typename... Params>\ -struct ExprConstructor< CVQual TensorMap<T, Options2_, MakeGlobalPointer>,\ -CVQual PlaceHolder<CVQual TensorMap<T, Options3_, MakePointer_>, N>, Params...>{\ - typedef CVQual TensorMap<T, Options2_, MakeGlobalPointer> Type;\ +struct ExprConstructor< CVQual TensorMap<T, Options_, MakeGlobalPointer>,\ +CVQual PlaceHolder<CVQual TensorMap<T, Options_, MakePointer_>, N>, Params...>{\ + typedef CVQual TensorMap<T, Options_, MakeGlobalPointer> Type;\ Type expr;\ template <typename FuncDetector>\ ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple<Params...> &t)\ diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h index 94a1452ec..06e2d5ae0 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h @@ -57,8 +57,8 @@ struct AccessorConstructor{ return utility::tuple::append(ExtractAccessor<Arg1>::getTuple(cgh, eval1),utility::tuple::append(ExtractAccessor<Arg2>::getTuple(cgh, eval2), ExtractAccessor<Arg3>::getTuple(cgh, eval3))); } template< cl::sycl::access::mode AcM, typename Arg> static inline auto getAccessor(cl::sycl::handler& cgh, Arg eval) - -> decltype(utility::tuple::make_tuple( eval.device().template get_sycl_accessor<AcM>(eval.dimensions().TotalSize(), cgh,eval.data()))){ - return utility::tuple::make_tuple(eval.device().template get_sycl_accessor<AcM>(eval.dimensions().TotalSize(), cgh,eval.data())); + -> decltype(utility::tuple::make_tuple( eval.device().template get_sycl_accessor<AcM>(cgh,eval.data()))){ + return utility::tuple::make_tuple(eval.device().template get_sycl_accessor<AcM>(cgh,eval.data())); } }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h index 0340b777f..5a2df7807 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h @@ -122,9 +122,9 @@ ASSIGNEXPR() /// specialisation of the \ref PlaceHolderExpression when the node is /// TensorMap #define TENSORMAPEXPR(CVQual)\ -template <typename T, int Options2_, template <class> class MakePointer_, size_t N>\ -struct PlaceHolderExpression< CVQual TensorMap< T, Options2_, MakePointer_>, N> {\ - typedef CVQual PlaceHolder<CVQual TensorMap<T, Options2_, MakePointer_>, N> Type;\ +template <typename T, int Options_, template <class> class MakePointer_, size_t N>\ +struct PlaceHolderExpression< CVQual TensorMap< T, Options_, MakePointer_>, N> {\ + typedef CVQual PlaceHolder<CVQual TensorMap<T, Options_, MakePointer_>, N> Type;\ }; TENSORMAPEXPR(const) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h index 5742592de..306250f7c 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h @@ -40,16 +40,17 @@ void run(Expr &expr, Dev &dev) { dev.sycl_queue().submit([&](cl::sycl::handler &cgh) { // create a tuple of accessors from Evaluator auto tuple_of_accessors = internal::createTupleOfAccessors<decltype(evaluator)>(cgh, evaluator); - size_t range, GRange, tileSize; - dev.parallel_for_setup(utility::tuple::get<0>(tuple_of_accessors).get_range()[0]/sizeof(typename Expr::Scalar), tileSize, range, GRange); + typename Expr::Index range, GRange, tileSize; + dev.parallel_for_setup(static_cast<typename Expr::Index>(evaluator.dimensions().TotalSize()), tileSize, range, GRange); // run the kernel cgh.parallel_for<PlaceHolderExpr>( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), [=](cl::sycl::nd_item<1> itemID) { typedef typename internal::ConvertToDeviceExpression<Expr>::Type DevExpr; auto device_expr =internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors); auto device_evaluator = Eigen::TensorEvaluator<decltype(device_expr.expr), Eigen::DefaultDevice>(device_expr.expr, Eigen::DefaultDevice()); - if (itemID.get_global_linear_id() < range) { - device_evaluator.evalScalar(static_cast<typename DevExpr::Index>(itemID.get_global_linear_id())); + typename DevExpr::Index gId = static_cast<typename DevExpr::Index>(itemID.get_global_linear_id()); + if (gId < range) { + device_evaluator.evalScalar(gId); } }); }); diff --git a/unsupported/test/cxx11_tensor_broadcast_sycl.cpp b/unsupported/test/cxx11_tensor_broadcast_sycl.cpp index 3dbb8d553..752a61f8a 100644 --- a/unsupported/test/cxx11_tensor_broadcast_sycl.cpp +++ b/unsupported/test/cxx11_tensor_broadcast_sycl.cpp @@ -137,14 +137,20 @@ template<typename DataType> void sycl_broadcast_test_per_device(const cl::sycl:: test_broadcast_sycl_fixed<DataType, ColMajor, int>(sycl_device); test_broadcast_sycl<DataType, ColMajor, int>(sycl_device); - test_broadcast_sycl_fixed<DataType, RowMajor, int64_t>(sycl_device); + test_broadcast_sycl<DataType, RowMajor, int64_t>(sycl_device); - test_broadcast_sycl_fixed<DataType, ColMajor, int64_t>(sycl_device); test_broadcast_sycl<DataType, ColMajor, int64_t>(sycl_device); + // the folowing two test breaks the intel gpu and amd gpu driver (cannot create opencl kernel) + // test_broadcast_sycl_fixed<DataType, RowMajor, int64_t>(sycl_device); + // test_broadcast_sycl_fixed<DataType, ColMajor, int64_t>(sycl_device); } void test_cxx11_tensor_broadcast_sycl() { for (const auto& device : cl::sycl::device::get_devices()) { + /// get_devices returns all the available opencl devices. Either use device_selector or exclude devices that computecpp does not support (AMD OpenCL for CPU ) + auto s= device.template get_info<cl::sycl::info::device::vendor>(); + std::transform(s.begin(), s.end(), s.begin(), ::tolower); + if(!device.is_cpu() || s.find("amd")==std::string::npos) CALL_SUBTEST(sycl_broadcast_test_per_device<float>(device)); } } diff --git a/unsupported/test/cxx11_tensor_builtins_sycl.cpp b/unsupported/test/cxx11_tensor_builtins_sycl.cpp index 989b335b2..dd739f470 100644 --- a/unsupported/test/cxx11_tensor_builtins_sycl.cpp +++ b/unsupported/test/cxx11_tensor_builtins_sycl.cpp @@ -264,9 +264,15 @@ static void test_builtin_binary_sycl(const Eigen::SyclDevice &sycl_device) { } void test_cxx11_tensor_builtins_sycl() { - cl::sycl::gpu_selector s; - QueueInterface queueInterface(s); - Eigen::SyclDevice sycl_device(&queueInterface); - CALL_SUBTEST(test_builtin_unary_sycl(sycl_device)); - CALL_SUBTEST(test_builtin_binary_sycl(sycl_device)); + for (const auto& device : cl::sycl::device::get_devices()) { + /// get_devices returns all the available opencl devices. Either use device_selector or exclude devices that computecpp does not support (AMD OpenCL for CPU ) + auto s= device.template get_info<cl::sycl::info::device::vendor>(); + std::transform(s.begin(), s.end(), s.begin(), ::tolower); + if(!device.is_cpu() || s.find("amd")==std::string::npos){ + QueueInterface queueInterface(device); + Eigen::SyclDevice sycl_device(&queueInterface); + CALL_SUBTEST(test_builtin_unary_sycl(sycl_device)); + CALL_SUBTEST(test_builtin_binary_sycl(sycl_device)); + } + } } diff --git a/unsupported/test/cxx11_tensor_device_sycl.cpp b/unsupported/test/cxx11_tensor_device_sycl.cpp index 9e13d2f1b..7f9372c04 100644 --- a/unsupported/test/cxx11_tensor_device_sycl.cpp +++ b/unsupported/test/cxx11_tensor_device_sycl.cpp @@ -72,6 +72,10 @@ template<typename DataType> void sycl_device_test_per_device(const cl::sycl::dev void test_cxx11_tensor_device_sycl() { for (const auto& device : cl::sycl::device::get_devices()) { - CALL_SUBTEST(sycl_device_test_per_device<float>(device)); + /// get_devices returns all the available opencl devices. Either use device_selector or exclude devices that computecpp does not support (AMD OpenCL for CPU ) + auto s= device.template get_info<cl::sycl::info::device::vendor>(); + std::transform(s.begin(), s.end(), s.begin(), ::tolower); + if(!device.is_cpu() || s.find("amd")==std::string::npos) + CALL_SUBTEST(sycl_device_test_per_device<float>(device)); } } diff --git a/unsupported/test/cxx11_tensor_forced_eval_sycl.cpp b/unsupported/test/cxx11_tensor_forced_eval_sycl.cpp index 70b182558..4ff218cb6 100644 --- a/unsupported/test/cxx11_tensor_forced_eval_sycl.cpp +++ b/unsupported/test/cxx11_tensor_forced_eval_sycl.cpp @@ -70,12 +70,11 @@ template <typename DataType, typename Dev_selector> void tensorForced_evalperDev test_forced_eval_sycl<DataType, ColMajor>(sycl_device); } void test_cxx11_tensor_forced_eval_sycl() { - - printf("Test on GPU: OpenCL\n"); - CALL_SUBTEST(tensorForced_evalperDevice<float>((cl::sycl::gpu_selector()))); - printf("repeating the test on CPU: OpenCL\n"); - CALL_SUBTEST(tensorForced_evalperDevice<float>((cl::sycl::cpu_selector()))); - printf("repeating the test on CPU: HOST\n"); - CALL_SUBTEST(tensorForced_evalperDevice<float>((cl::sycl::host_selector()))); - printf("Test Passed******************\n" ); + for (const auto& device : cl::sycl::device::get_devices()) { + /// get_devices returns all the available opencl devices. Either use device_selector or exclude devices that computecpp does not support (AMD OpenCL for CPU ) + auto s= device.template get_info<cl::sycl::info::device::vendor>(); + std::transform(s.begin(), s.end(), s.begin(), ::tolower); + if(!device.is_cpu() || s.find("amd")==std::string::npos) + CALL_SUBTEST(tensorForced_evalperDevice<float>(device)); + } } diff --git a/unsupported/test/cxx11_tensor_morphing_sycl.cpp b/unsupported/test/cxx11_tensor_morphing_sycl.cpp index a16e1caf5..4ca73ea1b 100644 --- a/unsupported/test/cxx11_tensor_morphing_sycl.cpp +++ b/unsupported/test/cxx11_tensor_morphing_sycl.cpp @@ -82,14 +82,12 @@ template<typename DataType, typename dev_Selector> void sycl_slicing_test_per_de } void test_cxx11_tensor_morphing_sycl() { - /// Currentlly it only works on cpu. Adding GPU cause LLVM ERROR in cunstructing OpenCL Kernel at runtime. -// printf("Test on GPU: OpenCL\n"); -// CALL_SUBTEST(sycl_device_test_per_device((cl::sycl::gpu_selector()))); - printf("repeating the test on CPU: OpenCL\n"); - CALL_SUBTEST(sycl_slicing_test_per_device<float>((cl::sycl::cpu_selector()))); - printf("repeating the test on CPU: HOST\n"); - CALL_SUBTEST(sycl_slicing_test_per_device<float>((cl::sycl::host_selector()))); - printf("Test Passed******************\n" ); - - + for (const auto& device : cl::sycl::device::get_devices()) { + /// get_devices returns all the available opencl devices. Either use device_selector or exclude devices that computecpp does not support (AMD OpenCL for CPU ) + /// Currentlly it only works on cpu. Adding GPU cause LLVM ERROR in cunstructing OpenCL Kernel at runtime. + auto s= device.template get_info<cl::sycl::info::device::vendor>(); + std::transform(s.begin(), s.end(), s.begin(), ::tolower); + if(device.is_cpu() && s.find("amd")==std::string::npos) + CALL_SUBTEST(sycl_slicing_test_per_device<float>(device)); + } } diff --git a/unsupported/test/cxx11_tensor_reduction_sycl.cpp b/unsupported/test/cxx11_tensor_reduction_sycl.cpp index 9e20f9cd0..32cfb94c2 100644 --- a/unsupported/test/cxx11_tensor_reduction_sycl.cpp +++ b/unsupported/test/cxx11_tensor_reduction_sycl.cpp @@ -142,6 +142,10 @@ template<typename DataType> void sycl_reduction_test_per_device(const cl::sycl:: } void test_cxx11_tensor_reduction_sycl() { for (const auto& device : cl::sycl::device::get_devices()) { - CALL_SUBTEST(sycl_reduction_test_per_device<float>(device)); + /// get_devices returns all the available opencl devices. Either use device_selector or exclude devices that computecpp does not support (AMD OpenCL for CPU ) + auto s= device.template get_info<cl::sycl::info::device::vendor>(); + std::transform(s.begin(), s.end(), s.begin(), ::tolower); + if(!device.is_cpu() || s.find("amd")==std::string::npos) + CALL_SUBTEST(sycl_reduction_test_per_device<float>(device)); } } diff --git a/unsupported/test/cxx11_tensor_sycl.cpp b/unsupported/test/cxx11_tensor_sycl.cpp index bf115d652..670b5f379 100644 --- a/unsupported/test/cxx11_tensor_sycl.cpp +++ b/unsupported/test/cxx11_tensor_sycl.cpp @@ -197,11 +197,11 @@ template<typename DataType, typename dev_Selector> void sycl_computing_test_per_ test_sycl_computations<DataType, ColMajor>(sycl_device); } void test_cxx11_tensor_sycl() { - printf("Test on GPU: OpenCL\n"); - CALL_SUBTEST(sycl_computing_test_per_device<float>((cl::sycl::gpu_selector()))); - printf("repeating the test on CPU: OpenCL\n"); - CALL_SUBTEST(sycl_computing_test_per_device<float>((cl::sycl::cpu_selector()))); - printf("repeating the test on CPU: HOST\n"); - CALL_SUBTEST(sycl_computing_test_per_device<float>((cl::sycl::host_selector()))); - printf("Test Passed******************\n" ); + for (const auto& device : cl::sycl::device::get_devices()) { + /// get_devices returns all the available opencl devices. Either use device_selector or exclude devices that computecpp does not support (AMD OpenCL for CPU ) + auto s= device.template get_info<cl::sycl::info::device::vendor>(); + std::transform(s.begin(), s.end(), s.begin(), ::tolower); + if(!device.is_cpu() || s.find("amd")==std::string::npos) + CALL_SUBTEST(sycl_computing_test_per_device<float>(device)); + } } |