author    Mehdi Goli <mehdi.goli@codeplay.com> 2017-02-06 18:05:23 +0000
committer Mehdi Goli <mehdi.goli@codeplay.com> 2017-02-06 18:05:23 +0000
commit    42bd5c4e7b8f4b5875ae256e7ac20310161d8470 (patch)
tree      61b825272a2a08f782ff453271e8488565e1738c
parent    bc128f9f3beff5a2a3225f03ec2e5eb111a15b87 (diff)
Fixing TensorReductionSycl for min and max.
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h | 10
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h          |  2
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h  | 30
-rw-r--r-- unsupported/test/cxx11_tensor_reduction_sycl.cpp         |  9
4 files changed, 33 insertions(+), 18 deletions(-)
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
index 8ecef59a8..9dcb42904 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
@@ -26,10 +26,10 @@ namespace Eigen {
namespace internal {
template<typename CoeffReturnType> struct syclGenericBufferReducer{
-template<typename BufferTOut, typename BufferTIn>
-static void run(BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){
+template<typename OP, typename BufferTOut, typename BufferTIn>
+static void run(OP op, BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){
do {
- auto f = [length, local, bufOut, &bufI](cl::sycl::handler& h) mutable {
+ auto f = [length, local, op, &bufOut, &bufI](cl::sycl::handler& h) mutable {
cl::sycl::nd_range<1> r{cl::sycl::range<1>{std::max(length, local)},
cl::sycl::range<1>{std::min(length, local)}};
/* Two accessors are used: one to the buffer that is being reduced,
@@ -43,7 +43,7 @@ static void run(BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDevice& de
/* The parallel_for invocation chosen is the variant with an nd_item
* parameter, since the code requires barriers for correctness. */
- h.parallel_for(r, TensorSycl::internal::GenericKernelReducer< CoeffReturnType, OutputAccessor, InputAccessor, LocalAccessor>(aOut, aI, scratch, length, local));
+ h.parallel_for(r, TensorSycl::internal::GenericKernelReducer<CoeffReturnType, OP, OutputAccessor, InputAccessor, LocalAccessor>(op, aOut, aI, scratch, length, local));
};
dev.sycl_queue().submit(f);
dev.asynchronousExec();
@@ -123,7 +123,7 @@ struct FullReducer<Self, Op, const Eigen::SyclDevice, Vectorizable> {
// getting the final output buffer; at this point the buffer has already been created, so there is no need for an assign
auto out_buffer =dev.get_sycl_buffer(output);
/// This is used to recursively reduce the temporary values down to a single element;
- syclGenericBufferReducer<CoeffReturnType>::run(out_buffer, temp_global_buffer,dev, GRange, outTileSize);
+ syclGenericBufferReducer<CoeffReturnType>::run(reducer, out_buffer, temp_global_buffer,dev, GRange, outTileSize);
}
};
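
For illustration only (not part of the patch): the change above threads the reducer functor through syclGenericBufferReducer so the tree reduction combines partial results with the reducer's own operation instead of a hard-coded sum. A minimal host-only C++ sketch of that pattern, using hypothetical SumReducer/MinReducer functors that mirror Eigen's initialize/reduce/finalize interface:

#include <algorithm>
#include <iostream>
#include <limits>
#include <vector>

// Hypothetical reducers mirroring Eigen's reducer interface.
struct SumReducer {
  float initialize() const { return 0.0f; }
  void reduce(float v, float* accum) const { *accum += v; }
  float finalize(float accum) const { return accum; }
};

struct MinReducer {
  float initialize() const { return std::numeric_limits<float>::max(); }
  void reduce(float v, float* accum) const { *accum = std::min(*accum, v); }
  float finalize(float accum) const { return accum; }
};

// Tree-style reduction: each pass halves the active range, combining pairs
// through the op instead of the '+=' the old code hard-coded.
template <typename Op>
float tree_reduce(std::vector<float> data, Op op) {
  for (size_t active = data.size(); active > 1; active = (active + 1) / 2) {
    size_t half = active / 2;
    for (size_t i = 0; i < half; ++i) {
      float accum = op.initialize();
      op.reduce(data[i], &accum);
      op.reduce(data[i + active - half], &accum);  // partner element
      data[i] = op.finalize(accum);
    }
  }
  return data[0];
}

int main() {
  std::vector<float> v = {3.f, 1.f, 4.f, 1.f, 5.f, 9.f, 2.f, 6.f};
  std::cout << tree_reduce(v, SumReducer{}) << "\n";  // 31: sum still works
  std::cout << tree_reduce(v, MinReducer{}) << "\n";  // 1: min now works too
}

With the old '+=' combine, the MinReducer call would have produced a sum of partial minima rather than the minimum itself.
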
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h
index 84f660597..9d5a6d4c1 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h
@@ -35,7 +35,7 @@ namespace Eigen {
namespace TensorSycl {
namespace internal {
- template<typename CoeffReturnType, typename OutputAccessor, typename InputAccessor, typename LocalAccessor> struct GenericKernelReducer;
+ template<typename CoeffReturnType, typename OP, typename OutputAccessor, typename InputAccessor, typename LocalAccessor> struct GenericKernelReducer;
/// This struct is used for special expression nodes with no operations (for example assign and selectOP).
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h
index 710e22474..a77f408de 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h
@@ -18,13 +18,14 @@ namespace Eigen {
namespace TensorSycl {
namespace internal {
- template<typename CoeffReturnType, typename OutputAccessor, typename InputAccessor, typename LocalAccessor> struct GenericKernelReducer{
+ template<typename CoeffReturnType, typename OP, typename OutputAccessor, typename InputAccessor, typename LocalAccessor> struct GenericKernelReducer{
+ OP op;
OutputAccessor aOut;
InputAccessor aI;
LocalAccessor scratch;
size_t length, local;
- GenericKernelReducer(OutputAccessor aOut_, InputAccessor aI_, LocalAccessor scratch_, size_t length_, size_t local_)
- : aOut(aOut_), aI(aI_), scratch(scratch_), length(length_), local(local_){}
+ GenericKernelReducer(OP op_, OutputAccessor aOut_, InputAccessor aI_, LocalAccessor scratch_, size_t length_, size_t local_)
+ : op(op_), aOut(aOut_), aI(aI_), scratch(scratch_), length(length_), local(local_){}
void operator()(cl::sycl::nd_item<1> itemID) {
size_t globalid = itemID.get_global(0);
size_t localid = itemID.get_local(0);
@@ -44,7 +45,12 @@ namespace internal {
auto min = (length < local) ? length : local;
for (size_t offset = min / 2; offset > 0; offset /= 2) {
if (localid < offset) {
- scratch[localid] += scratch[localid + offset];
+ auto accum = op.initialize();
+ op.reduce(scratch[localid], &accum);
+ op.reduce(scratch[localid + offset], &accum);
+ op.finalize(accum);
+ scratch[localid]=accum;
}
itemID.barrier(cl::sycl::access::fence_space::local_space);
}
@@ -131,11 +137,21 @@ public:
if(globalid<rng)
tmp_global_accessor.get_pointer()[globalid]=Eigen::internal::InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, static_cast<typename DevExpr::Index>(red_factor*globalid), red_factor, const_cast<Op&>(op));
else
- tmp_global_accessor.get_pointer()[globalid]=static_cast<CoeffReturnType>(0);
+ tmp_global_accessor.get_pointer()[globalid]=static_cast<CoeffReturnType>(op.initialize());
- if(remaining!=0 && globalid==0 )
+ if(remaining!=0 && globalid==0 ){
// this will add the rest of the input buffer when the input size is not divisible by red_factor.
- tmp_global_accessor.get_pointer()[0]+=Eigen::internal::InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, static_cast<typename DevExpr::Index>(red_factor*(rng)), static_cast<typename DevExpr::Index>(remaining), const_cast<Op&>(op));
+ auto remaining_reduce = Eigen::internal::InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::
+ reduce(device_self_evaluator, static_cast<typename DevExpr::Index>(red_factor*(rng)), static_cast<typename DevExpr::Index>(remaining), const_cast<Op&>(op));
+ auto accum = op.initialize();
+ op.reduce(tmp_global_accessor.get_pointer()[0], &accum);
+ op.reduce(remaining_reduce, &accum);
+ op.finalize(accum);
+ tmp_global_accessor.get_pointer()[0]=accum;
+ }
}
};
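
For illustration only (not part of the patch): the hunks above make two fixes that matter for min/max. Out-of-range slots are seeded with op.initialize() -- the reduction's identity -- instead of a literal 0, and the leftover tail (when the input size is not divisible by red_factor) is folded into slot 0 through the op rather than with '+='. A standalone sketch of why identity padding matters, using a hypothetical MaxReducer with the same interface:

#include <algorithm>
#include <iostream>
#include <limits>
#include <vector>

// Hypothetical MaxReducer following Eigen's initialize/reduce/finalize shape.
struct MaxReducer {
  float initialize() const { return std::numeric_limits<float>::lowest(); }
  void reduce(float v, float* accum) const { *accum = std::max(*accum, v); }
  float finalize(float accum) const { return accum; }
};

int main() {
  MaxReducer op;
  std::vector<float> input = {-3.f, -7.f, -1.f, -9.f, -4.f};
  const size_t padded_size = 8;  // e.g. input rounded up to the work-group size

  // Pad with the op's identity; a literal 0 would silently win a max
  // reduction over all-negative input.
  std::vector<float> slots(padded_size);
  for (size_t i = 0; i < padded_size; ++i)
    slots[i] = (i < input.size()) ? input[i] : op.initialize();

  float accum = op.initialize();
  for (float v : slots) op.reduce(v, &accum);
  std::cout << op.finalize(accum) << "\n";  // prints -1; 0-padding would print 0
}
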
diff --git a/unsupported/test/cxx11_tensor_reduction_sycl.cpp b/unsupported/test/cxx11_tensor_reduction_sycl.cpp
index 98a59a14c..251091f5b 100644
--- a/unsupported/test/cxx11_tensor_reduction_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_reduction_sycl.cpp
@@ -34,7 +34,7 @@ static void test_full_reductions_sycl(const Eigen::SyclDevice& sycl_device) {
in.setRandom();
- full_redux = in.sum();
+ full_redux = in.minimum();
DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(in.dimensions().TotalSize()*sizeof(DataType)));
DataType* gpu_out_data =(DataType*)sycl_device.allocate(sizeof(DataType));
@@ -43,11 +43,10 @@ static void test_full_reductions_sycl(const Eigen::SyclDevice& sycl_device) {
TensorMap<Tensor<DataType, 0, DataLayout, IndexType> > out_gpu(gpu_out_data);
sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.dimensions().TotalSize())*sizeof(DataType));
- out_gpu.device(sycl_device) = in_gpu.sum();
+ out_gpu.device(sycl_device) = in_gpu.minimum();
sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data, sizeof(DataType));
// Check that the CPU and GPU reductions return the same result.
VERIFY_IS_APPROX(full_redux_gpu(), full_redux());
-
sycl_device.deallocate(gpu_in_data);
sycl_device.deallocate(gpu_out_data);
}
@@ -69,7 +68,7 @@ static void test_first_dim_reductions_sycl(const Eigen::SyclDevice& sycl_device)
in.setRandom();
- redux= in.sum(red_axis);
+ redux= in.maximum(red_axis);
DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(in.dimensions().TotalSize()*sizeof(DataType)));
DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(redux_gpu.dimensions().TotalSize()*sizeof(DataType)));
@@ -78,7 +77,7 @@ static void test_first_dim_reductions_sycl(const Eigen::SyclDevice& sycl_device)
TensorMap<Tensor<DataType, 2, DataLayout, IndexType> > out_gpu(gpu_out_data, reduced_tensorRange);
sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.dimensions().TotalSize())*sizeof(DataType));
- out_gpu.device(sycl_device) = in_gpu.sum(red_axis);
+ out_gpu.device(sycl_device) = in_gpu.maximum(red_axis);
sycl_device.memcpyDeviceToHost(redux_gpu.data(), gpu_out_data, redux_gpu.dimensions().TotalSize()*sizeof(DataType));
// Check that the CPU and GPU reductions return the same result.
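
For reference (not part of the patch): the tests now exercise minimum() and maximum(red_axis) in place of sum(). A minimal host-only sketch of the same expressions with Eigen's unsupported Tensor module, assuming a standard Eigen checkout; the SYCL device path is expected to produce matching results:

#include <unsupported/Eigen/CXX11/Tensor>
#include <iostream>

int main() {
  Eigen::Tensor<float, 2> in(3, 4);
  in.setRandom();

  // Full reduction to a rank-0 tensor, as in test_full_reductions_sycl.
  Eigen::Tensor<float, 0> full_min = in.minimum();

  // Reduction over the first dimension, as in test_first_dim_reductions_sycl.
  Eigen::array<Eigen::Index, 1> red_axis;
  red_axis[0] = 0;
  Eigen::Tensor<float, 1> col_max = in.maximum(red_axis);

  std::cout << "min over all elements: " << full_min() << "\n";
  for (Eigen::Index i = 0; i < col_max.dimension(0); ++i)
    std::cout << "max over dim 0, column " << i << ": " << col_max(i) << "\n";
}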