From 00f32752f7d0b193c6788691c3cf0b76457a044d Mon Sep 17 00:00:00 2001
From: Mehdi Goli
Date: Thu, 28 Nov 2019 10:08:54 +0000
Subject: [SYCL] Rebasing the SYCL support branch on top of the Eigen upstream master branch.

* Unifying all loadLocalTile from lhs and rhs to an extract_block function.
* Adding the get_tensor operation which was missing in TensorContractionMapper.
* Adding the -D definition missing from CMake for the Disable_Skinny contraction operation.
* Wrapping all the indices in TensorScanSycl into a Scan parameter struct.
* Fixing a typo in Device SYCL.
* Unifying load to private register for tall/skinny no-shared.
* Unifying load to vector tile for tensor-vector/vector-tensor operations.
* Removing all the LHS/RHS classes for extracting data from global memory.
* Removing Outputfunction from TensorContractionSkinnyNoshared.
* Combining the local-memory version of tall/skinny and normal tensor contraction into one kernel.
* Combining the no-local-memory version of tall/skinny and normal tensor contraction into one kernel.
* Combining general tensor-vector and vector-tensor contraction into one kernel.
* Making double buffering optional for tensor contraction when the local memory version is used.
* Modifying the benchmark to accept custom reduction sizes.
* Disabling AVX optimization for the SYCL backend on the host to allow SSE optimization on the host.
* Adding tests for SYCL.
* Modifying the SYCL CMake.
---
 .../Eigen/CXX11/src/Tensor/TensorDeviceSycl.h | 245 +++++++++++++++------
 1 file changed, 172 insertions(+), 73 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
index 6f8b6f193..df591c21d 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
@@ -16,7 +16,6 @@
 #define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H
 
 #include <unordered_set>
-
 namespace Eigen {
 
 namespace TensorSycl {
@@ -70,9 +69,9 @@ struct SyclDeviceInfo {
 } // end namespace TensorSycl
 
 typedef TensorSycl::internal::buffer_data_type_t buffer_scalar_t;
-// All devices (even AMD CPU with intel OpenCL runtime) that support OpenCL and 
-// can consume SPIR or SPIRV can use the Eigen SYCL backend and consequently 
-// TensorFlow via the Eigen SYCL Backend. 
+// All devices (even AMD CPU with intel OpenCL runtime) that support OpenCL and
+// can consume SPIR or SPIRV can use the Eigen SYCL backend and consequently
+// TensorFlow via the Eigen SYCL Backend.
 EIGEN_STRONG_INLINE auto get_sycl_supported_devices()
     -> decltype(cl::sycl::device::get_devices()) {
 #ifdef EIGEN_SYCL_USE_DEFAULT_SELECTOR
@@ -421,6 +420,91 @@ class QueueInterface {
     return pMapper.get_offset(ptr);
   }
 
+  template <typename OutScalar, typename sycl_kernel, typename Lhs,
+            typename Rhs, typename OutPtr, typename Range, typename Index,
+            typename... T>
+  EIGEN_ALWAYS_INLINE void binary_kernel_launcher(const Lhs &lhs,
+                                                  const Rhs &rhs, OutPtr outptr,
+                                                  Range thread_range,
+                                                  Index scratchSize,
+                                                  T... var) const {
+    auto kernel_functor = [=](cl::sycl::handler &cgh) {
+      // binding the placeholder accessors to a commandgroup handler
+      lhs.bind(cgh);
+      rhs.bind(cgh);
+      outptr.bind(cgh);
+      typedef cl::sycl::accessor<OutScalar, 1,
+                                 cl::sycl::access::mode::read_write,
+                                 cl::sycl::access::target::local>
+          LocalAccessor;
+
+      LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
+      cgh.parallel_for(
+#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
+          program().template get_kernel<sycl_kernel>(),
+#endif
+          thread_range, sycl_kernel(scratch, lhs, rhs, outptr, var...));
+    };
+    cl::sycl::event e;
+    EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor));
+    async_synchronize(e);
+  }
+
+  template <typename OutScalar, typename sycl_kernel, typename InPtr,
+            typename OutPtr, typename Range, typename Index, typename... T>
+  EIGEN_ALWAYS_INLINE void unary_kernel_launcher(const InPtr &inptr,
+                                                 OutPtr &outptr,
+                                                 Range thread_range,
+                                                 Index scratchSize,
+                                                 T... var) const {
+    auto kernel_functor = [=](cl::sycl::handler &cgh) {
+      // binding the placeholder accessors to a commandgroup handler
+      inptr.bind(cgh);
+      outptr.bind(cgh);
+      typedef cl::sycl::accessor<OutScalar, 1,
+                                 cl::sycl::access::mode::read_write,
+                                 cl::sycl::access::target::local>
+          LocalAccessor;
+
+      LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
+      cgh.parallel_for(
+#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
+          program().template get_kernel<sycl_kernel>(),
+#endif
+          thread_range, sycl_kernel(scratch, inptr, outptr, var...));
+    };
+    cl::sycl::event e;
+    EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor));
+    async_synchronize(e);
+  }
+
+  template <typename OutScalar, typename sycl_kernel, typename InPtr,
+            typename Range, typename Index, typename... T>
+  EIGEN_ALWAYS_INLINE void nullary_kernel_launcher(const InPtr &inptr,
+                                                   Range thread_range,
+                                                   Index scratchSize,
+                                                   T... var) const {
+    auto kernel_functor = [=](cl::sycl::handler &cgh) {
+      // binding the placeholder accessors to a commandgroup handler
+      inptr.bind(cgh);
+      typedef cl::sycl::accessor<OutScalar, 1,
+                                 cl::sycl::access::mode::read_write,
+                                 cl::sycl::access::target::local>
+          LocalAccessor;
+
+      LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
+      cgh.parallel_for(
+#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
+          program().template get_kernel<sycl_kernel>(),
+#endif
+          thread_range, sycl_kernel(scratch, inptr, var...));
+    };
+    cl::sycl::event e;
+    EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor));
+    async_synchronize(e);
+  }
+
+  EIGEN_STRONG_INLINE void synchronize() const {
 #ifdef EIGEN_EXCEPTIONS
     m_queue.wait_and_throw();
@@ -429,6 +513,7 @@ class QueueInterface {
 #endif
   }
 
+
   EIGEN_STRONG_INLINE void async_synchronize(cl::sycl::event e) const {
     set_latest_event(e);
 #ifndef EIGEN_SYCL_ASYNC_EXECUTION
@@ -457,11 +542,10 @@ class QueueInterface {
   /// This is used to prepare the number of threads and also the number of
   /// threads per block for sycl kernels
   template <typename Index>
-  EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1,
-                                              Index &tileSize0,
-                                              Index &tileSize1, Index &rng0,
-                                              Index &rng1, Index &GRange0,
-                                              Index &GRange1) const {
+  EIGEN_STRONG_INLINE void parallel_for_setup(
+      const std::array<Index, 2> &input_dim, cl::sycl::range<2> &global_range,
+      cl::sycl::range<2> &local_range) const {
+    std::array<Index, 2> input_range = input_dim;
     Index max_workgroup_Size =
         static_cast<Index>(getNearestPowerOfTwoWorkGroupSize());
     max_workgroup_Size =
@@ -469,26 +553,28 @@ class QueueInterface {
                                     EIGEN_SYCL_LOCAL_THREAD_DIM1),
                  static_cast<Index>(max_workgroup_Size));
     Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
-    tileSize1 =
+    local_range[1] =
         static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 2)));
-    rng1 = dim1;
-    if (rng1 == 0) rng1 = static_cast<Index>(1);
-    GRange1 = rng1;
-    if (tileSize1 > GRange1)
-      tileSize1 = GRange1;
-    else if (GRange1 > tileSize1) {
-      Index xMode = static_cast<Index>(GRange1 % tileSize1);
-      if (xMode != 0) GRange1 += static_cast<Index>(tileSize1 - xMode);
+    input_range[1] = input_dim[1];
+    if (input_range[1] == 0) input_range[1] = static_cast<Index>(1);
+    global_range[1] = input_range[1];
+    if (local_range[1] > global_range[1])
+      local_range[1] = global_range[1];
+    else if (global_range[1] > local_range[1]) {
+      Index xMode = static_cast<Index>(global_range[1] % local_range[1]);
+      if (xMode != 0)
+        global_range[1] += static_cast<Index>(local_range[1] - xMode);
     }
-    tileSize0 = static_cast<Index>(max_workgroup_Size / tileSize1);
-    rng0 = dim0;
-    if (rng0 == 0) rng0 = static_cast<Index>(1);
-    GRange0 = rng0;
-    if (tileSize0 > GRange0)
-      tileSize0 = GRange0;
-    else if (GRange0 > tileSize0) {
-      Index xMode = static_cast<Index>(GRange0 % tileSize0);
-      if (xMode != 0) GRange0 += static_cast<Index>(tileSize0 - xMode);
+    local_range[0] = static_cast<Index>(max_workgroup_Size / local_range[1]);
+    input_range[0] = input_dim[0];
+    if (input_range[0] == 0) input_range[0] = static_cast<Index>(1);
+    global_range[0] = input_range[0];
+    if (local_range[0] > global_range[0])
+      local_range[0] = global_range[0];
+    else if (global_range[0] > local_range[0]) {
+      Index xMode = static_cast<Index>(global_range[0] % local_range[0]);
+      if (xMode != 0)
+        global_range[0] += static_cast<Index>(local_range[0] - xMode);
     }
   }
 
@@ -496,9 +582,9 @@ class QueueInterface {
   /// threads per block for sycl kernels
   template <typename Index>
   EIGEN_STRONG_INLINE void parallel_for_setup(
-      Index dim0, Index dim1, Index dim2, Index &tileSize0, Index &tileSize1,
-      Index &tileSize2, Index &rng0, Index &rng1, Index &rng2, Index &GRange0,
-      Index &GRange1, Index &GRange2) const {
+      const std::array<Index, 3> &input_dim, cl::sycl::range<3> &global_range,
+      cl::sycl::range<3> &local_range) const {
+    std::array<Index, 3> input_range = input_dim;
     Index max_workgroup_Size =
         static_cast<Index>(getNearestPowerOfTwoWorkGroupSize());
     max_workgroup_Size =
@@ -506,45 +592,48 @@ class QueueInterface {
                                     EIGEN_SYCL_LOCAL_THREAD_DIM1),
                  static_cast<Index>(max_workgroup_Size));
     Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
-    tileSize2 =
+    local_range[2] =
         static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 3)));
-    rng2 = dim2;
-    if (rng2 == 0) rng1 = static_cast<Index>(1);
-    GRange2 = rng2;
-    if (tileSize2 > GRange2)
-      tileSize2 = GRange2;
-    else if (GRange2 > tileSize2) {
-      Index xMode = static_cast<Index>(GRange2 % tileSize2);
-      if (xMode != 0) GRange2 += static_cast<Index>(tileSize2 - xMode);
+    input_range[2] = input_dim[2];
+    if (input_range[2] == 0) input_range[1] = static_cast<Index>(1);
+    global_range[2] = input_range[2];
+    if (local_range[2] > global_range[2])
+      local_range[2] = global_range[2];
+    else if (global_range[2] > local_range[2]) {
+      Index xMode = static_cast<Index>(global_range[2] % local_range[2]);
+      if (xMode != 0)
+        global_range[2] += static_cast<Index>(local_range[2] - xMode);
     }
     pow_of_2 = static_cast<Index>(
-        std::log2(static_cast<Index>(max_workgroup_Size / tileSize2)));
-    tileSize1 =
+        std::log2(static_cast<Index>(max_workgroup_Size / local_range[2])));
+    local_range[1] =
         static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 2)));
-    rng1 = dim1;
-    if (rng1 == 0) rng1 = static_cast<Index>(1);
-    GRange1 = rng1;
-    if (tileSize1 > GRange1)
-      tileSize1 = GRange1;
-    else if (GRange1 > tileSize1) {
-      Index xMode = static_cast<Index>(GRange1 % tileSize1);
-      if (xMode != 0) GRange1 += static_cast<Index>(tileSize1 - xMode);
+    input_range[1] = input_dim[1];
+    if (input_range[1] == 0) input_range[1] = static_cast<Index>(1);
+    global_range[1] = input_range[1];
+    if (local_range[1] > global_range[1])
+      local_range[1] = global_range[1];
+    else if (global_range[1] > local_range[1]) {
+      Index xMode = static_cast<Index>(global_range[1] % local_range[1]);
+      if (xMode != 0)
+        global_range[1] += static_cast<Index>(local_range[1] - xMode);
     }
-    tileSize0 =
-        static_cast<Index>(max_workgroup_Size / (tileSize1 * tileSize2));
-    rng0 = dim0;
-    if (rng0 == 0) rng0 = static_cast<Index>(1);
-    GRange0 = rng0;
-    if (tileSize0 > GRange0)
-      tileSize0 = GRange0;
-    else if (GRange0 > tileSize0) {
-      Index xMode = static_cast<Index>(GRange0 % tileSize0);
-      if (xMode != 0) GRange0 += static_cast<Index>(tileSize0 - xMode);
+    local_range[0] = static_cast<Index>(max_workgroup_Size /
+                                        (local_range[1] * local_range[2]));
+    input_range[0] = input_dim[0];
+    if (input_range[0] == 0) input_range[0] = static_cast<Index>(1);
+    global_range[0] = input_range[0];
+    if (local_range[0] > global_range[0])
+      local_range[0] = global_range[0];
+    else if (global_range[0] > local_range[0]) {
+      Index xMode = static_cast<Index>(global_range[0] % local_range[0]);
+      if (xMode != 0)
+        global_range[0] += static_cast<Index>(local_range[0] - xMode);
     }
   }
 
   EIGEN_STRONG_INLINE bool has_local_memory() const {
-#if !defined(EIGEN_SYCL_LOCA_MEM) && defined(EIGEN_SYCL_NO_LOCAL_MEM)
+#if !defined(EIGEN_SYCL_LOCAL_MEM) && defined(EIGEN_SYCL_NO_LOCAL_MEM)
   return false;
 #elif defined(EIGEN_SYCL_LOCAL_MEM) && !defined(EIGEN_SYCL_NO_LOCAL_MEM)
   return true;
@@ -768,25 +857,19 @@ struct SyclDevice : public SyclDeviceBase {
   /// This is used to prepare the number of threads and also the number of
   /// threads per block for sycl kernels
   template <typename Index>
-  EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1,
-                                              Index &tileSize0,
-                                              Index &tileSize1, Index &rng0,
-                                              Index &rng1, Index &GRange0,
-                                              Index &GRange1) const {
-    queue_stream()->parallel_for_setup(dim0, dim1, tileSize0, tileSize1, rng0,
-                                       rng1, GRange0, GRange1);
+  EIGEN_STRONG_INLINE void parallel_for_setup(
+      const std::array<Index, 2> &input_dim, cl::sycl::range<2> &global_range,
+      cl::sycl::range<2> &local_range) const {
+    queue_stream()->parallel_for_setup(input_dim, global_range, local_range);
   }
 
   /// This is used to prepare the number of threads and also the number of
   /// threads per block for sycl kernels
   template <typename Index>
   EIGEN_STRONG_INLINE void parallel_for_setup(
-      Index dim0, Index dim1, Index dim2, Index &tileSize0, Index &tileSize1,
-      Index &tileSize2, Index &rng0, Index &rng1, Index &rng2, Index &GRange0,
-      Index &GRange1, Index &GRange2) const {
-    queue_stream()->parallel_for_setup(dim0, dim1, dim2, tileSize0, tileSize1,
-                                       tileSize2, rng0, rng1, rng2, GRange0,
-                                       GRange1, GRange2);
+      const std::array<Index, 3> &input_dim, cl::sycl::range<3> &global_range,
+      cl::sycl::range<3> &local_range) const {
+    queue_stream()->parallel_for_setup(input_dim, global_range, local_range);
   }
 
   /// allocate device memory
@@ -943,6 +1026,22 @@ struct SyclDevice : public SyclDeviceBase {
   EIGEN_STRONG_INLINE std::string getDeviceVendor() const {
     return queue_stream()->getDeviceVendor();
   }
+  template <typename OutScalar, typename KernelType, typename... T>
+  EIGEN_ALWAYS_INLINE void binary_kernel_launcher(T... var) const {
+    queue_stream()->template binary_kernel_launcher<OutScalar, KernelType>(
+        var...);
+  }
+  template <typename OutScalar, typename KernelType, typename... T>
+  EIGEN_ALWAYS_INLINE void unary_kernel_launcher(T... var) const {
+    queue_stream()->template unary_kernel_launcher<OutScalar, KernelType>(
+        var...);
+  }
+
+  template <typename OutScalar, typename KernelType, typename... T>
+  EIGEN_ALWAYS_INLINE void nullary_kernel_launcher(T... var) const {
+    queue_stream()->template nullary_kernel_launcher<OutScalar, KernelType>(
+        var...);
+  }
 };
 
 } // end namespace Eigen
-- 
cgit v1.2.3
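
For readers coming from the old interface, the sketch below shows how a caller would use the array-based parallel_for_setup overload added by this patch to obtain a SYCL nd_range. This is a minimal usage sketch, not part of the commit: make_thread_range is a hypothetical helper, and it assumes an Eigen build with the SYCL backend enabled, as in the Eigen SYCL tests.

#define EIGEN_USE_SYCL
#include <unsupported/Eigen/CXX11/Tensor>

// Hypothetical helper: packs the problem dimensions into a std::array (the
// old overload took eight separate Index in/out parameters instead) and lets
// parallel_for_setup compute matching global/local SYCL ranges.
template <typename Index>
cl::sycl::nd_range<2> make_thread_range(const Eigen::SyclDevice &dev,
                                        Index dim0, Index dim1) {
  std::array<Index, 2> input_dim = {{dim0, dim1}};
  cl::sycl::range<2> global_range(1, 1);
  cl::sycl::range<2> local_range(1, 1);
  dev.parallel_for_setup(input_dim, global_range, local_range);
  // Each global dimension has been rounded up to a multiple of the chosen
  // work-group size, so the nd_range below is always valid.
  return cl::sycl::nd_range<2>(global_range, local_range);
}

Packing the dimensions this way lets one QueueInterface implementation serve both the 2D and 3D cases, which is why the SyclDevice wrappers reduce to one-line forwarders, and the resulting nd_range is the kind of thread_range argument the unary/binary/nullary kernel launchers introduced above take.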