From a725a3233c98185eb3e5db6186aea3a906b8411f Mon Sep 17 00:00:00 2001 From: mehdi-goli Date: Tue, 27 Oct 2020 16:31:33 +0000 Subject: [SYCL clean up the code] : removing extra #pragma unroll in SYCL which was causing issues in embedded systems --- unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'unsupported/Eigen/CXX11') diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h index 387c3edf4..474eba06f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h @@ -100,7 +100,6 @@ struct SecondStepFullReducer { CoeffReturnType accumulator = *aInPtr; scratchptr[localid] = op.finalize(accumulator); -#pragma unroll 8 for (Index offset = itemID.get_local_range(0) / 2; offset > 0; offset /= 2) { itemID.barrier(cl::sycl::access::fence_space::local_space); if (localid < offset) { @@ -154,7 +153,6 @@ class FullReductionKernelFunctor { Index start = Evaluator::PacketSize * globalid; // vectorizable parts PacketReturnType packetAccumulator = op.template initializePacket(); -#pragma unroll(8 / Evaluator::PacketSize) for (Index i = start; i < VectorizedRange; i += step) { op.template reducePacket(evaluator.impl().template packet(i), &packetAccumulator); } @@ -293,7 +291,6 @@ struct PartialReductionKernel { const Index per_thread_local_stride = PannelParameters::LocalThreadSizeR * reduce_elements_num_groups; const Index per_thread_global_stride = rt == reduction_dim::outer_most ? 
num_coeffs_to_preserve * per_thread_local_stride : per_thread_local_stride; -#pragma unroll 8 for (Index i = globalRId; i < num_coeffs_to_reduce; i += per_thread_local_stride) { op.reduce(evaluator.impl().coeff(global_offset), &accumulator); localOffset += per_thread_local_stride; @@ -391,7 +388,6 @@ struct SecondStepPartialReduction { OutScalar accumulator = op.initialize(); // num_coeffs_to_reduce is not bigger that 256 -#pragma unroll 8 for (Index i = 0; i < num_coeffs_to_reduce; i++) { op.reduce(*in_ptr, &accumulator); in_ptr += num_coeffs_to_preserve; -- cgit v1.2.3