aboutsummaryrefslogtreecommitdiffhomepage
path: root/unsupported/Eigen/CXX11/src
diff options
context:
space:
mode:
authorGravatar Mehdi Goli <mehdi.goli@codeplay.com>2017-02-28 17:16:14 +0000
committerGravatar Mehdi Goli <mehdi.goli@codeplay.com>2017-02-28 17:16:14 +0000
commit8296b87d7bd98c19c6064241880691f164790ede (patch)
treebbd18de82debbf021c7643017f9588a16374934f /unsupported/Eigen/CXX11/src
parente0bd6f5738b94e8d7a4b17b61bf9cb6418685f28 (diff)
Adding sycl backend for TensorCustomOp; fixing the partial lhs modification issue on sycl when the rhs is TensorContraction, reduction or convolution; Fixing the partial modification for memset when sycl backend is used.
Diffstat (limited to 'unsupported/Eigen/CXX11/src')
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h21
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h30
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h6
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h23
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h16
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h1
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h61
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h27
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h43
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h21
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h20
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h27
12 files changed, 230 insertions, 66 deletions
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h
index abc7ba551..fcd7d4d00 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h
@@ -84,7 +84,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
this->m_leftImpl.evalSubExprsIfNeeded(NULL);
this->m_rightImpl.evalSubExprsIfNeeded(NULL);
- if (data) {
+ if (data) {
evalTo(data);
return false;
} else {
@@ -173,6 +173,7 @@ typename HostExpr::Index LocalThreadSizeM, typename HostExpr::Index LocalThreadS
LhsLocalAcc localLhs;
RhsLocalAcc localRhs;
OutAccessor out_res;
+ size_t out_offset;
Index roundUpK, M, N, K;
ContractT m_k_strides, m_left_contracting_strides, m_right_contracting_strides;
LeftNocontractT m_i_strides, m_left_nocontract_strides;
@@ -182,11 +183,12 @@ typename HostExpr::Index LocalThreadSizeM, typename HostExpr::Index LocalThreadS
Device dev;
- KernelConstructor(LHSFunctorExpr lhs_functors_, RHSFunctorExpr rhs_functors_, LhsLocalAcc localLhs_, RhsLocalAcc localRhs_, OutAccessor out_res_,
+ KernelConstructor(LHSFunctorExpr lhs_functors_, RHSFunctorExpr rhs_functors_, LhsLocalAcc localLhs_, RhsLocalAcc localRhs_, OutAccessor out_res_, size_t out_offset_,
Index roundUpK_, Index M_, Index N_, Index K_, ContractT m_k_strides_, ContractT m_left_contracting_strides_,
ContractT m_right_contracting_strides_, LeftNocontractT m_i_strides_, RightNocontractT m_j_strides_,
LeftNocontractT m_left_nocontract_strides_, RightNocontractT m_right_nocontract_strides_, LHSTupleType left_tuple_of_accessors_, RHSTupleType right_tuple_of_accessors_, Device dev_)
- :lhs_functors(lhs_functors_), rhs_functors(rhs_functors_), localLhs(localLhs_), localRhs(localRhs_), out_res(out_res_), roundUpK(roundUpK_), M(M_), N(N_), K(K_),
+ :lhs_functors(lhs_functors_), rhs_functors(rhs_functors_), localLhs(localLhs_), localRhs(localRhs_), out_res(out_res_),
+ out_offset(out_offset_), roundUpK(roundUpK_), M(M_), N(N_), K(K_),
m_k_strides(m_k_strides_), m_left_contracting_strides(m_left_contracting_strides_),
m_right_contracting_strides(m_right_contracting_strides_),
m_i_strides(m_i_strides_), m_left_nocontract_strides(m_left_nocontract_strides_),
@@ -316,7 +318,7 @@ typename HostExpr::Index LocalThreadSizeM, typename HostExpr::Index LocalThreadS
for (Index wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) {
Index globalCol = nGroupId*TileSizeDimN + nLocalThreadId + wLPTN*LocalThreadSizeN;
if(globalCol<N)
- out_ptr[globalCol*M + globalRow] = privateRes[wLPTM][wLPTN];
+ out_ptr[globalCol*M + globalRow +ConvertToActualSyclOffset(OutScalar, out_offset)] = privateRes[wLPTM][wLPTN];
}
}
}
@@ -356,12 +358,12 @@ template< typename Self, typename OutScalar, typename ContractT, typename LeftNo
// extract lhs functor list
LHSFunctorExpr lhs_functors = Eigen::TensorSycl::internal::extractFunctors(self.left_impl());
// extract rhs functor list
- RHSFunctorExpr rhs_functors = Eigen::TensorSycl::internal::extractFunctors(self.left_impl());
+ RHSFunctorExpr rhs_functors = Eigen::TensorSycl::internal::extractFunctors(self.right_impl());
Index roundUpK = RoundUp(K, TileSizeDimK);
Index roundUpM = RoundUp(M, TileSizeDimM);
Index roundUpN = RoundUp(N, TileSizeDimN);
-
+ ptrdiff_t out_offset = self.device().get_offset(buffer);
self.device().sycl_queue().submit([&](cl::sycl::handler &cgh) {
/// work-around for gcc bug
typedef decltype(Eigen::TensorSycl::internal::createTupleOfAccessors<OrigLHSExpr>(cgh, self.left_impl())) LHSTupleType;
@@ -379,17 +381,16 @@ template< typename Self, typename OutScalar, typename ContractT, typename LeftNo
typedef cl::sycl::accessor<RhsScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> RhsLocalAcc;
RhsLocalAcc localRhs(cl::sycl::range<1>(2* TileSizeDimK * TileSizeDimN), cgh);
- typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer> OutAccessor;
+ typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::global_buffer> OutAccessor;
//OutScalar memory
- OutAccessor out_res= self.device(). template get_sycl_accessor<cl::sycl::access::mode::write>(cgh, buffer);
-
+ OutAccessor out_res= self.device(). template get_sycl_accessor<cl::sycl::access::mode::read_write>(cgh, buffer);
// sycl parallel for
cgh.parallel_for(cl::sycl::nd_range<2>(cl::sycl::range<2>(roundUpM/WorkLoadPerThreadM, roundUpN/WorkLoadPerThreadN),
cl::sycl::range<2>(LocalThreadSizeM, LocalThreadSizeN)),
KernelConstructor<HostExpr, OutScalar, LhsScalar, RhsScalar, LHSFunctorExpr, RHSFunctorExpr, LhsLocalAcc, RhsLocalAcc, OutAccessor, Index, ContractT, LeftNocontractT,
RightNocontractT, lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, TileSizeDimM, TileSizeDimN, TileSizeDimK,
WorkLoadPerThreadM, WorkLoadPerThreadN, LocalThreadSizeM, LocalThreadSizeN, LoadPerThreadLhs, LoadPerThreadRhs, LHSTupleType, RHSTupleType, Eigen::DefaultDevice>(lhs_functors, rhs_functors,
- localLhs, localRhs, out_res, roundUpK, M, N, K, m_k_strides, m_left_contracting_strides, m_right_contracting_strides,m_i_strides, m_j_strides,
+ localLhs, localRhs, out_res, out_offset, roundUpK, M, N, K, m_k_strides, m_left_contracting_strides, m_right_contracting_strides,m_i_strides, m_j_strides,
m_left_nocontract_strides,m_right_nocontract_strides, left_tuple_of_accessors, right_tuple_of_accessors, Eigen::DefaultDevice()));
});
self.device().asynchronousExec();
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h
index 4247c1c4a..66ffd819f 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h
@@ -32,14 +32,15 @@ internal::IndexMapper<Index, InputDims, 1, Eigen::internal::traits<HostExpr>::La
Kernel_accessor kernel_filter;
const size_t kernelSize, range_x, range_y;
Buffer_accessor buffer_acc;
+ptrdiff_t out_offset;
Local_accessor local_acc;
FunctorExpr functors;
TupleType tuple_of_accessors;
EigenConvolutionKernel1D(internal::IndexMapper<Index, InputDims, 1, Eigen::internal::traits<HostExpr>::Layout> indexMapper_,
Kernel_accessor kernel_filter_, const size_t kernelSize_, const size_t range_x_, const size_t range_y_,
- Buffer_accessor buffer_acc_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_)
+ Buffer_accessor buffer_acc_, ptrdiff_t out_offset_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_)
:indexMapper(indexMapper_), kernel_filter(kernel_filter_), kernelSize(kernelSize_), range_x(range_x_), range_y(range_y_),
- buffer_acc(buffer_acc_), local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {}
+ buffer_acc(buffer_acc_), out_offset(out_offset_),local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {}
void operator()(cl::sycl::nd_item<2> itemID) {
typedef typename TensorSycl::internal::ConvertToDeviceExpression<HostExpr>::Type DevExpr;
@@ -75,7 +76,7 @@ EigenConvolutionKernel1D(internal::IndexMapper<Index, InputDims, 1, Eigen::inter
}
const size_t tensor_index = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(itemID.get_global(1))
+indexMapper.mapCudaOutputKernelToTensorOutputOffset(itemID.get_local(0) + first_output_start);
- buffer_ptr[tensor_index] = result;
+ buffer_ptr[tensor_index+ConvertToActualSyclOffset(CoeffReturnType, out_offset)] = result;
}
}
};
@@ -89,14 +90,15 @@ internal::IndexMapper<Index, InputDims, 2, Eigen::internal::traits<HostExpr>::La
Kernel_accessor kernel_filter;
const size_t kernelSize_x, kernelSize_y, range_x, range_y , range_z;
Buffer_accessor buffer_acc;
+ptrdiff_t out_offset;
Local_accessor local_acc;
FunctorExpr functors;
TupleType tuple_of_accessors;
EigenConvolutionKernel2D(internal::IndexMapper<Index, InputDims, 2, Eigen::internal::traits<HostExpr>::Layout> indexMapper_,
Kernel_accessor kernel_filter_, const size_t kernelSize_x_, const size_t kernelSize_y_ ,const size_t range_x_, const size_t range_y_, const size_t range_z_,
- Buffer_accessor buffer_acc_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_)
+ Buffer_accessor buffer_acc_, ptrdiff_t out_offset_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_)
:indexMapper(indexMapper_), kernel_filter(kernel_filter_), kernelSize_x(kernelSize_x_), kernelSize_y(kernelSize_y_), range_x(range_x_), range_y(range_y_), range_z(range_z_),
- buffer_acc(buffer_acc_), local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {}
+ buffer_acc(buffer_acc_), out_offset(out_offset_), local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {}
void operator()(cl::sycl::nd_item<3> itemID) {
typedef typename TensorSycl::internal::ConvertToDeviceExpression<HostExpr>::Type DevExpr;
@@ -141,7 +143,7 @@ EigenConvolutionKernel2D(internal::IndexMapper<Index, InputDims, 2, Eigen::inter
}
const size_t tensor_index = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(itemID.get_global(2))
+indexMapper.mapCudaOutputKernelToTensorOutputOffset(itemID.get_local(0) + fitst_x_output_start, itemID.get_local(1) + fitst_y_output_start);
- buffer_ptr[tensor_index] = result;
+ buffer_ptr[tensor_index +ConvertToActualSyclOffset(CoeffReturnType, out_offset)] = result;
}
}
};
@@ -156,16 +158,17 @@ internal::IndexMapper<Index, InputDims, 3, Eigen::internal::traits<HostExpr>::La
Kernel_accessor kernel_filter;
const size_t kernelSize_x, kernelSize_y, kernelSize_z, range_x, range_y , range_z, numP;
Buffer_accessor buffer_acc;
+ptrdiff_t out_offset;
Local_accessor local_acc;
FunctorExpr functors;
TupleType tuple_of_accessors;
EigenConvolutionKernel3D(internal::IndexMapper<Index, InputDims, 3, Eigen::internal::traits<HostExpr>::Layout> indexMapper_,
Kernel_accessor kernel_filter_, const size_t kernelSize_x_, const size_t kernelSize_y_ , const size_t kernelSize_z_ ,
const size_t range_x_, const size_t range_y_, const size_t range_z_, const size_t numP_,
- Buffer_accessor buffer_acc_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_)
+ Buffer_accessor buffer_acc_, ptrdiff_t out_offset_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_)
:indexMapper(indexMapper_), kernel_filter(kernel_filter_), kernelSize_x(kernelSize_x_), kernelSize_y(kernelSize_y_),
kernelSize_z(kernelSize_z_), range_x(range_x_), range_y(range_y_), range_z(range_z_), numP(numP_),
- buffer_acc(buffer_acc_), local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {}
+ buffer_acc(buffer_acc_), out_offset(out_offset_), local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {}
void operator()(cl::sycl::nd_item<3> itemID) {
typedef typename TensorSycl::internal::ConvertToDeviceExpression<HostExpr>::Type DevExpr;
@@ -215,7 +218,7 @@ EigenConvolutionKernel3D(internal::IndexMapper<Index, InputDims, 3, Eigen::inter
}
const size_t tensor_index = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p)
+indexMapper.mapCudaOutputKernelToTensorOutputOffset(itemID.get_local(0) + fitst_x_output_start, itemID.get_local(1) + fitst_y_output_start, itemID.get_local(2) + fitst_z_output_start );
- buffer_ptr[tensor_index] = result;
+ buffer_ptr[tensor_index+ConvertToActualSyclOffset(CoeffReturnType, out_offset)] = result;
}
itemID.barrier(cl::sycl::access::fence_space::local_space);
@@ -307,7 +310,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
m_kernel = in_place;
m_local_kernel = false;
} else {
- size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
+ ptrdiff_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
Scalar* local = (Scalar*)m_device.allocate(kernel_sz);
typedef TensorEvalToOp<const KernelArgType> EvalTo;
EvalTo evalToTmp(local, m_kernelArg);
@@ -325,6 +328,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
typedef Eigen::TensorSycl::internal::FunctorExtractor<InputEvaluator> InputFunctorExpr;
// extract input functor list
InputFunctorExpr input_functors = Eigen::TensorSycl::internal::extractFunctors(m_inputImpl);
+ ptrdiff_t out_offset = m_device.get_offset(data);
m_device.sycl_queue().submit([&](cl::sycl::handler &cgh) {
@@ -358,7 +362,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
cgh.parallel_for(cl::sycl::nd_range<2>(global_range, local_range),
EigenConvolutionKernel1D<CoeffReturnType, Scalar, InputArgType, InputFunctorExpr, Index,
InputDims, KernelAccessorType, OutputAccessorType, InputLocalAcc, InputTupleType>(
- indexMapper,kernel_acc, kernel_size, numX, numP, out_res, local_acc, input_functors, tuple_of_accessors));
+ indexMapper,kernel_acc, kernel_size, numX, numP, out_res, out_offset, local_acc, input_functors, tuple_of_accessors));
break;
}
@@ -383,7 +387,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
cgh.parallel_for(cl::sycl::nd_range<3>(global_range, local_range),
EigenConvolutionKernel2D<CoeffReturnType, Scalar, InputArgType, InputFunctorExpr, Index,
InputDims, KernelAccessorType, OutputAccessorType, InputLocalAcc, InputTupleType>(
- indexMapper,kernel_acc, kernel_size_x, kernel_size_y, numX, numY, numP, out_res, local_acc, input_functors, tuple_of_accessors));
+ indexMapper,kernel_acc, kernel_size_x, kernel_size_y, numX, numY, numP, out_res, out_offset, local_acc, input_functors, tuple_of_accessors));
break;
}
@@ -412,7 +416,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
EigenConvolutionKernel3D<CoeffReturnType, Scalar, InputArgType, InputFunctorExpr, Index,
InputDims, KernelAccessorType, OutputAccessorType, InputLocalAcc, InputTupleType>(
indexMapper,kernel_acc, kernel_size_x, kernel_size_y, kernel_size_z, numX, numY,
- numZ, numP, out_res, local_acc, input_functors, tuple_of_accessors));
+ numZ, numP, out_res, out_offset, local_acc, input_functors, tuple_of_accessors));
break;
}
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h b/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h
index e020d076f..c72d79435 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h
@@ -140,6 +140,9 @@ struct TensorEvaluator<const TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Devi
EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_result; }
+ /// used by sycl in order to build the sycl buffer
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const{return m_device;}
+
protected:
EIGEN_DEVICE_FUNC void evalTo(Scalar* data) {
TensorMap<Tensor<CoeffReturnType, NumDims, Layout, Index> > result(
@@ -295,6 +298,9 @@ struct TensorEvaluator<const TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType,
EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_result; }
+ /// used by sycl in order to build the sycl buffer
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const{return m_device;}
+
protected:
EIGEN_DEVICE_FUNC void evalTo(Scalar* data) {
TensorMap<Tensor<Scalar, NumDims, Layout> > result(data, m_dimensions);
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
index e209799bb..964222a15 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
@@ -18,6 +18,8 @@
namespace Eigen {
#define ConvertToActualTypeSycl(Scalar, buf_acc) reinterpret_cast<typename cl::sycl::global_ptr<Scalar>::pointer_t>((&(*buf_acc.get_pointer())))
+ #define ConvertToActualSyclOffset(Scalar, offset) offset/sizeof(Scalar)
+
template <typename Scalar, typename read_accessor, typename write_accessor> class MemCopyFunctor {
public:
@@ -43,11 +45,12 @@ namespace Eigen {
struct memsetkernelFunctor{
typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer> AccType;
AccType m_acc;
+ const ptrdiff_t buff_offset;
const size_t m_rng, m_c;
- memsetkernelFunctor(AccType acc, const size_t rng, const size_t c):m_acc(acc), m_rng(rng), m_c(c){}
+ memsetkernelFunctor(AccType acc, const ptrdiff_t buff_offset_, const size_t rng, const size_t c):m_acc(acc), buff_offset(buff_offset_), m_rng(rng), m_c(c){}
void operator()(cl::sycl::nd_item<1> itemID) {
auto globalid=itemID.get_global_linear_id();
- if (globalid< m_rng) m_acc[globalid] = m_c;
+ if (globalid< m_rng) m_acc[globalid + buff_offset] = m_c;
}
};
@@ -305,6 +308,11 @@ struct SyclDevice {
synchronize();
}
+ EIGEN_STRONG_INLINE ptrdiff_t get_offset(const void *ptr) const {
+ auto it = m_queue_stream->find_buffer(ptr);
+ return (static_cast<const uint8_t*>(ptr))-it->first;
+
+ }
/// The memcpyHostToDevice is used to copy the device only pointer to a host pointer. Using the device
/// pointer created as a key we find the sycl buffer and get the host accessor with discard_write mode
/// on it. Using a discard_write accessor guarantees that we do not bring back the current value of the
@@ -343,20 +351,23 @@ struct SyclDevice {
EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const {
size_t rng, GRange, tileSize;
parallel_for_setup(n, tileSize, rng, GRange);
- sycl_queue().submit(memsetCghFunctor(get_sycl_buffer(static_cast<uint8_t*>(static_cast<void*>(data))),rng, GRange, tileSize, c ));
+ auto it1 = m_queue_stream->find_buffer(static_cast<const void*>(data));
+ ptrdiff_t buff_offset= (static_cast<const uint8_t*>(data)) - it1->first;
+ sycl_queue().submit(memsetCghFunctor(it1->second, buff_offset, rng, GRange, tileSize, c ));
synchronize();
}
struct memsetCghFunctor{
cl::sycl::buffer<uint8_t, 1>& m_buf;
+ const ptrdiff_t& buff_offset;
const size_t& rng , GRange, tileSize;
const int &c;
- memsetCghFunctor(cl::sycl::buffer<uint8_t, 1>& buff, const size_t& rng_, const size_t& GRange_, const size_t& tileSize_, const int& c_)
- :m_buf(buff), rng(rng_), GRange(GRange_), tileSize(tileSize_), c(c_){}
+ memsetCghFunctor(cl::sycl::buffer<uint8_t, 1>& buff, const ptrdiff_t& buff_offset_, const size_t& rng_, const size_t& GRange_, const size_t& tileSize_, const int& c_)
+ :m_buf(buff), buff_offset(buff_offset_), rng(rng_), GRange(GRange_), tileSize(tileSize_), c(c_){}
void operator()(cl::sycl::handler &cgh) const {
auto buf_acc = m_buf.template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer>(cgh);
- cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), memsetkernelFunctor(buf_acc, rng, c));
+ cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), memsetkernelFunctor(buf_acc, buff_offset, rng, c));
}
};
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
index c3ca129e2..c9c7acfdc 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
@@ -27,9 +27,9 @@ namespace internal {
template<typename OP, typename CoeffReturnType> struct syclGenericBufferReducer{
template<typename BufferTOut, typename BufferTIn>
-static void run(OP op, BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){
+static void run(OP op, BufferTOut& bufOut, ptrdiff_t out_offset, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){
do {
- auto f = [length, local, op, &bufOut, &bufI](cl::sycl::handler& h) mutable {
+ auto f = [length, local, op, out_offset, &bufOut, &bufI](cl::sycl::handler& h) mutable {
cl::sycl::nd_range<1> r{cl::sycl::range<1>{std::max(length, local)},
cl::sycl::range<1>{std::min(length, local)}};
/* Two accessors are used: one to the buffer that is being reduced,
@@ -43,7 +43,7 @@ static void run(OP op, BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDev
/* The parallel_for invocation chosen is the variant with an nd_item
* parameter, since the code requires barriers for correctness. */
- h.parallel_for(r, TensorSycl::internal::GenericKernelReducer<CoeffReturnType, OP, OutputAccessor, InputAccessor, LocalAccessor>(op, aOut, aI, scratch, length, local));
+ h.parallel_for(r, TensorSycl::internal::GenericKernelReducer<CoeffReturnType, OP, OutputAccessor, InputAccessor, LocalAccessor>(op, aOut, out_offset, aI, scratch, length, local));
};
dev.sycl_queue().submit(f);
dev.asynchronousExec();
@@ -60,9 +60,9 @@ static void run(OP op, BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDev
template<typename CoeffReturnType> struct syclGenericBufferReducer<Eigen::internal::MeanReducer<CoeffReturnType>, CoeffReturnType>{
template<typename BufferTOut, typename BufferTIn>
-static void run(Eigen::internal::MeanReducer<CoeffReturnType>, BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){
+static void run(Eigen::internal::MeanReducer<CoeffReturnType>, BufferTOut& bufOut,ptrdiff_t out_offset, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){
syclGenericBufferReducer<Eigen::internal::SumReducer<CoeffReturnType>, CoeffReturnType>::run(Eigen::internal::SumReducer<CoeffReturnType>(),
- bufOut, bufI, dev, length, local);
+ bufOut, out_offset, bufI, dev, length, local);
}
};
@@ -127,8 +127,9 @@ struct FullReducer<Self, Op, const Eigen::SyclDevice, Vectorizable> {
// getting final out buffer at the moment the created buffer is true because there is no need for assign
auto out_buffer =dev.get_sycl_buffer(output);
+ ptrdiff_t out_offset = dev.get_offset(output);
/// This is used to recursively reduce the tmp value to an element of 1;
- syclGenericBufferReducer<Op, CoeffReturnType>::run(reducer, out_buffer, temp_global_buffer,dev, GRange, outTileSize);
+ syclGenericBufferReducer<Op, CoeffReturnType>::run(reducer, out_buffer, out_offset, temp_global_buffer,dev, GRange, outTileSize);
}
};
@@ -158,10 +159,11 @@ struct InnerReducer<Self, Op, const Eigen::SyclDevice> {
// create a tuple of accessors from Evaluator
Tuple_of_Acc tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl());
auto output_accessor = dev.template get_sycl_accessor<cl::sycl::access::mode::discard_write>(cgh, output);
+ ptrdiff_t out_offset = dev.get_offset(output);
Index red_size = (num_values_to_reduce!=0)? num_values_to_reduce : static_cast<Index>(1);
cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)),
TensorSycl::internal::ReductionFunctor<HostExpr, FunctorExpr, Tuple_of_Acc, Dims, Op, typename Self::Index>
- (output_accessor, functors, tuple_of_accessors, self.xprDims(), reducer, range, red_size));
+ (output_accessor, out_offset, functors, tuple_of_accessors, self.xprDims(), reducer, range, red_size));
});
dev.asynchronousExec();
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h
index dd63a2e2f..9476c0ea8 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h
@@ -109,6 +109,7 @@ struct ConvertToDeviceExpression<CVQual ExprNode<Expr> > {\
typedef CVQual ExprNode< typename ConvertToDeviceExpression<Expr>::Type> Type;\
};
+
// TensorForcedEvalOp
KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAP(const,TensorForcedEvalOp)
KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAP(,TensorForcedEvalOp)
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h
index 117b368ec..af4eb5f13 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h
@@ -236,8 +236,12 @@ EVALTO()
template <typename OrigExpr, typename DevExpr, size_t N, typename... Params>\
struct ExprConstructor<CVQual TensorForcedEvalOp<OrigExpr>,\
CVQual PlaceHolder<CVQual TensorForcedEvalOp<DevExpr>, N>, Params...> {\
- typedef CVQual TensorMap<Tensor<typename TensorForcedEvalOp<DevExpr>::Scalar,\
- TensorForcedEvalOp<DevExpr>::NumDimensions, Eigen::internal::traits<TensorForcedEvalOp<DevExpr>>::Layout, typename TensorForcedEvalOp<DevExpr>::Index>, Eigen::internal::traits<TensorForcedEvalOp<DevExpr>>::Layout, MakeGlobalPointer> Type;\
+ typedef TensorForcedEvalOp<OrigExpr> XprType;\
+ typedef CVQual TensorMap<\
+ Tensor<typename XprType::Scalar,XprType::NumDimensions, Eigen::internal::traits<XprType>::Layout,typename XprType::Index>,\
+ Eigen::internal::traits<XprType>::Layout, \
+ MakeGlobalPointer\
+ > Type;\
Type expr;\
template <typename FuncDetector>\
ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple<Params...> &t)\
@@ -248,6 +252,28 @@ FORCEDEVAL(const)
FORCEDEVAL()
#undef FORCEDEVAL
+
+
+#define TENSORCUSTOMUNARYOP(CVQual)\
+template <typename CustomUnaryFunc, typename OrigExpr, typename DevExpr, size_t N, typename... Params>\
+struct ExprConstructor<CVQual TensorCustomUnaryOp<CustomUnaryFunc, OrigExpr>,\
+CVQual PlaceHolder<CVQual TensorCustomUnaryOp<CustomUnaryFunc, DevExpr>, N>, Params...> {\
+ typedef TensorCustomUnaryOp<CustomUnaryFunc, OrigExpr> XprType;\
+ typedef CVQual TensorMap<\
+ Tensor<typename XprType::Scalar,XprType::NumDimensions, Eigen::internal::traits<XprType>::Layout,typename XprType::Index>,\
+ Eigen::internal::traits<XprType>::Layout, \
+ MakeGlobalPointer\
+ > Type;\
+ Type expr;\
+ template <typename FuncDetector>\
+ ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple<Params...> &t)\
+ : expr(Type(ConvertToActualTypeSycl(typename Type::Scalar, utility::tuple::get<N>(t)), fd.dimensions())) {}\
+};
+
+TENSORCUSTOMUNARYOP(const)
+TENSORCUSTOMUNARYOP()
+#undef TENSORCUSTOMUNARYOP
+
template <bool Conds, size_t X , size_t Y > struct ValueCondition {
static const size_t Res =X;
};
@@ -260,7 +286,7 @@ template<size_t X, size_t Y> struct ValueCondition<false, X , Y> {
template <typename OP, typename Dim, typename OrigExpr, typename DevExpr, size_t N, typename... Params>\
struct ExprConstructor<CVQual TensorReductionOp<OP, Dim, OrigExpr, MakeGlobalPointer>,\
CVQual PlaceHolder<CVQual TensorReductionOp<OP, Dim, DevExpr>, N>, Params...> {\
- static const size_t NumIndices= ValueCondition< TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>::NumDimensions==0, 1, TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>::NumDimensions >::Res;\
+ static const auto NumIndices= ValueCondition< TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>::NumDimensions==0, 1, TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>::NumDimensions >::Res;\
typedef CVQual TensorMap<Tensor<typename TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>::Scalar,\
NumIndices, Eigen::internal::traits<TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>>::Layout, typename TensorReductionOp<OP, Dim, DevExpr>::Index>, Eigen::internal::traits<TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>>::Layout, MakeGlobalPointer> Type;\
Type expr;\
@@ -275,28 +301,31 @@ SYCLREDUCTIONEXPR()
/// specialisation of the \ref ExprConstructor struct when the node type is
-/// TensorContractionOp
-#define SYCLCONTRACTIONCONVOLUTION(CVQual, ExprNode)\
+/// TensorContractionOp, TensorConvolutionOp TensorCustomBinaryOp
+#define SYCLCONTRACTCONVCUSBIOPS(CVQual, ExprNode)\
template <typename Indices, typename OrigLhsXprType, typename OrigRhsXprType, typename LhsXprType, typename RhsXprType, size_t N, typename... Params>\
struct ExprConstructor<CVQual ExprNode<Indices, OrigLhsXprType, OrigRhsXprType>,\
CVQual PlaceHolder<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, N>, Params...> {\
- static const size_t NumIndices= Eigen::internal::traits<ExprNode<Indices, OrigLhsXprType, OrigRhsXprType> >::NumDimensions;\
- typedef CVQual TensorMap<Tensor<typename ExprNode<Indices, OrigLhsXprType, OrigRhsXprType>::Scalar,\
- NumIndices, Eigen::internal::traits<ExprNode<Indices, OrigRhsXprType, OrigRhsXprType> >::Layout,\
- typename ExprNode<Indices, OrigRhsXprType, OrigRhsXprType>::Index>,\
- Eigen::internal::traits<ExprNode<Indices, OrigRhsXprType, OrigRhsXprType>>::Layout, MakeGlobalPointer> Type;\
+ typedef ExprNode<Indices, OrigLhsXprType, OrigRhsXprType> XprTyp;\
+ static const auto NumIndices= Eigen::internal::traits<XprTyp>::NumDimensions;\
+ typedef CVQual TensorMap<\
+ Tensor<typename XprTyp::Scalar,NumIndices, Eigen::internal::traits<XprTyp>::Layout, typename XprTyp::Index>,\
+ Eigen::internal::traits<XprTyp>::Layout, \
+ MakeGlobalPointer\
+ > Type;\
Type expr;\
template <typename FuncDetector>\
ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple<Params...> &t)\
:expr(Type(ConvertToActualTypeSycl(typename Type::Scalar, utility::tuple::get<N>(t)), fd.dimensions())) {}\
};
-SYCLCONTRACTIONCONVOLUTION(const, TensorContractionOp)
-SYCLCONTRACTIONCONVOLUTION(, TensorContractionOp)
-SYCLCONTRACTIONCONVOLUTION(const, TensorConvolutionOp)
-SYCLCONTRACTIONCONVOLUTION(, TensorConvolutionOp)
-#undef SYCLCONTRACTIONCONVOLUTION
-
+SYCLCONTRACTCONVCUSBIOPS(const, TensorContractionOp)
+SYCLCONTRACTCONVCUSBIOPS(, TensorContractionOp)
+SYCLCONTRACTCONVCUSBIOPS(const, TensorConvolutionOp)
+SYCLCONTRACTCONVCUSBIOPS(, TensorConvolutionOp)
+SYCLCONTRACTCONVCUSBIOPS(const, TensorCustomBinaryOp)
+SYCLCONTRACTCONVCUSBIOPS(, TensorCustomBinaryOp)
+#undef SYCLCONTRACTCONVCUSBIOPS
#define SYCLSLICEOPEXPR(CVQual)\
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h
index 4a6322d44..5a6a8f4c5 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h
@@ -148,6 +148,33 @@ SYCLFORCEDEVALEXTACC()
#undef SYCLFORCEDEVALEXTACC
+#define SYCLCUSTOMUNARYOPEXTACC(CVQual)\
+template <typename CustomUnaryFunc, typename XprType, typename Dev >\
+struct ExtractAccessor<TensorEvaluator<CVQual TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Dev> > {\
+ static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Dev>& eval)\
+ RETURN_CPP11(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval))\
+};
+
+
+SYCLCUSTOMUNARYOPEXTACC(const)
+SYCLCUSTOMUNARYOPEXTACC()
+#undef SYCLCUSTOMUNARYOPEXTACC
+
+
+#define SYCLCUSTOMBINARYOPEXTACC(CVQual)\
+template <typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType , typename Dev>\
+struct ExtractAccessor<TensorEvaluator<CVQual TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType>, Dev> > {\
+ static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType>, Dev>& eval)\
+ RETURN_CPP11(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval))\
+};
+
+SYCLCUSTOMBINARYOPEXTACC(const)
+SYCLCUSTOMBINARYOPEXTACC()
+#undef SYCLCUSTOMBIBARYOPEXTACC
+
+
+
+
/// specialisation of the \ref ExtractAccessor struct when the node type is TensorEvalToOp
#define SYCLEVALTOEXTACC(CVQual)\
template <typename Expr, typename Dev>\
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h
index 8828a0495..9fcac5ecb 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h
@@ -33,14 +33,17 @@ namespace internal {
/// re-instantiate them on the device.
/// We have to pass instantiated functors to the device.
// This struct is used for leafNode (TensorMap) and nodes behaving like leafNode (TensorForcedEval).
+#define DEFALTACTION(Evaluator)\
+typedef typename Evaluator::Dimensions Dimensions;\
+const Dimensions m_dimensions;\
+EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }\
+FunctorExtractor(const Evaluator& expr): m_dimensions(expr.dimensions()) {}
+
template <typename Evaluator> struct FunctorExtractor{
- typedef typename Evaluator::Dimensions Dimensions;
- const Dimensions m_dimensions;
- EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
- FunctorExtractor(const Evaluator& expr)
- : m_dimensions(expr.dimensions()) {}
+ DEFALTACTION(Evaluator)
};
+
/// specialisation of the \ref FunctorExtractor struct when the node type does not require anything
///TensorConversionOp
#define SYCLEXTRFUNCCONVERSION(ExprNode, CVQual)\
@@ -112,6 +115,36 @@ SYCLEXTRFUNCTERNARY(const)
SYCLEXTRFUNCTERNARY()
#undef SYCLEXTRFUNCTERNARY
+
+
+//TensorCustomOp must be specialised otherewise it will be captured by UnaryCategory while its action is different
+//from the UnaryCategory and it is similar to the general FunctorExtractor.
+/// specialisation of TensorCustomOp
+#define SYCLEXTRFUNCCUSTOMUNARYOP(CVQual)\
+template <typename CustomUnaryFunc, typename ArgType, typename Dev >\
+struct FunctorExtractor<TensorEvaluator<CVQual TensorCustomUnaryOp<CustomUnaryFunc, ArgType>, Dev> > {\
+ typedef TensorEvaluator<CVQual TensorCustomUnaryOp<CustomUnaryFunc, ArgType>, Dev> Evaluator;\
+ DEFALTACTION(Evaluator)\
+};
+
+SYCLEXTRFUNCCUSTOMUNARYOP(const)
+SYCLEXTRFUNCCUSTOMUNARYOP()
+#undef SYCLEXTRFUNCCUSTOMUNARYOP
+
+
+#define SYCLEXTRFUNCCUSTOMBIBARYOP(CVQual)\
+template <typename CustomBinaryFunc, typename ArgType1, typename ArgType2, typename Dev >\
+struct FunctorExtractor<TensorEvaluator<CVQual TensorCustomBinaryOp<CustomBinaryFunc, ArgType1, ArgType2>, Dev> > {\
+ typedef TensorEvaluator<CVQual TensorCustomBinaryOp<CustomBinaryFunc, ArgType1, ArgType2>, Dev> Evaluator;\
+ DEFALTACTION(Evaluator)\
+};
+
+SYCLEXTRFUNCCUSTOMBIBARYOP(const)
+SYCLEXTRFUNCCUSTOMBIBARYOP()
+#undef SYCLEXTRFUNCCUSTOMBIBARYOP
+
+
+
/// specialisation of the \ref FunctorExtractor struct when the node type is
/// TensorCwiseSelectOp. This is an specialisation without OP so it has to be separated.
#define SYCLEXTRFUNCSELECTOP(CVQual)\
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h
index 2f7779036..12237bfab 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h
@@ -21,11 +21,12 @@ namespace internal {
template<typename CoeffReturnType, typename OP, typename OutputAccessor, typename InputAccessor, typename LocalAccessor> struct GenericKernelReducer{
OP op;
OutputAccessor aOut;
+ ptrdiff_t out_offset;
InputAccessor aI;
LocalAccessor scratch;
size_t length, local;
- GenericKernelReducer(OP op_, OutputAccessor aOut_, InputAccessor aI_, LocalAccessor scratch_, size_t length_, size_t local_)
- : op(op_), aOut(aOut_), aI(aI_), scratch(scratch_), length(length_), local(local_){}
+ GenericKernelReducer(OP op_, OutputAccessor aOut_, ptrdiff_t out_offset_, InputAccessor aI_, LocalAccessor scratch_, size_t length_, size_t local_)
+ : op(op_), aOut(aOut_), out_offset(out_offset_), aI(aI_), scratch(scratch_), length(length_), local(local_){}
void operator()(cl::sycl::nd_item<1> itemID) {
size_t globalid = itemID.get_global(0);
size_t localid = itemID.get_local(0);
@@ -59,7 +60,7 @@ namespace internal {
aI[itemID.get_group(0)] = scratch[localid];
if((length<=local) && globalid ==0){
auto aOutPtr = ConvertToActualTypeSycl(CoeffReturnType, aOut);
- aOutPtr[0]=scratch[0];
+ aOutPtr[0 + ConvertToActualSyclOffset(CoeffReturnType, out_offset)]=scratch[0];
}
}
}
@@ -72,8 +73,8 @@ template < typename HostExpr, typename FunctorExpr, typename Tuple_of_Acc, typen
public:
typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer> write_accessor;
- ReductionFunctor(write_accessor output_accessor_, FunctorExpr functors_, Tuple_of_Acc tuple_of_accessors_,Dims dims_, Op functor_, Index range_, Index)
- :output_accessor(output_accessor_), functors(functors_), tuple_of_accessors(tuple_of_accessors_), dims(dims_), functor(functor_), range(range_) {}
+ ReductionFunctor(write_accessor output_accessor_, ptrdiff_t out_offset_, FunctorExpr functors_, Tuple_of_Acc tuple_of_accessors_,Dims dims_, Op functor_, Index range_, Index)
+ :output_accessor(output_accessor_), out_offset(out_offset_), functors(functors_), tuple_of_accessors(tuple_of_accessors_), dims(dims_), functor(functor_), range(range_) {}
void operator()(cl::sycl::nd_item<1> itemID) {
typedef typename ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
@@ -93,11 +94,12 @@ template < typename HostExpr, typename FunctorExpr, typename Tuple_of_Acc, typen
typename DeviceSelf::CoeffReturnType accum = functor.initialize();
Eigen::internal::GenericDimReducer<DeviceSelf::NumReducedDims-1, DeviceSelf, Op>::reduce(device_self_evaluator, device_self_evaluator.firstInput(static_cast<typename DevExpr::Index>(globalid)),const_cast<Op&>(functor), &accum);
functor.finalize(accum);
- output_accessor_ptr[globalid]= accum;
+ output_accessor_ptr[globalid + ConvertToActualSyclOffset(typename DeviceSelf::CoeffReturnType, out_offset)]= accum;
}
}
private:
write_accessor output_accessor;
+ ptrdiff_t out_offset;
FunctorExpr functors;
Tuple_of_Acc tuple_of_accessors;
Dims dims;
@@ -111,9 +113,9 @@ class ReductionFunctor<HostExpr, FunctorExpr, Tuple_of_Acc, Dims, Eigen::interna
typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer> write_accessor;
typedef Eigen::internal::SumReducer<typename HostExpr::CoeffReturnType> Op;
- ReductionFunctor(write_accessor output_accessor_, FunctorExpr functors_, Tuple_of_Acc tuple_of_accessors_,Dims dims_,
+ ReductionFunctor(write_accessor output_accessor_, ptrdiff_t out_offset_, FunctorExpr functors_, Tuple_of_Acc tuple_of_accessors_,Dims dims_,
Eigen::internal::MeanReducer<typename HostExpr::CoeffReturnType>, Index range_, Index num_values_to_reduce_)
- :output_accessor(output_accessor_), functors(functors_), tuple_of_accessors(tuple_of_accessors_), dims(dims_), functor(Op()), range(range_), num_values_to_reduce(num_values_to_reduce_) {}
+ :output_accessor(output_accessor_), out_offset(out_offset_), functors(functors_), tuple_of_accessors(tuple_of_accessors_), dims(dims_), functor(Op()), range(range_), num_values_to_reduce(num_values_to_reduce_) {}
void operator()(cl::sycl::nd_item<1> itemID) {
typedef typename ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
@@ -133,11 +135,12 @@ class ReductionFunctor<HostExpr, FunctorExpr, Tuple_of_Acc, Dims, Eigen::interna
typename DeviceSelf::CoeffReturnType accum = functor.initialize();
Eigen::internal::GenericDimReducer<DeviceSelf::NumReducedDims-1, DeviceSelf, Op>::reduce(device_self_evaluator, device_self_evaluator.firstInput(static_cast<typename DevExpr::Index>(globalid)),const_cast<Op&>(functor), &accum);
functor.finalize(accum);
- output_accessor_ptr[globalid]= accum/num_values_to_reduce;
+ output_accessor_ptr[globalid+ ConvertToActualSyclOffset(typename DeviceSelf::CoeffReturnType, out_offset)]= accum/num_values_to_reduce;
}
}
private:
write_accessor output_accessor;
+ ptrdiff_t out_offset;
FunctorExpr functors;
Tuple_of_Acc tuple_of_accessors;
Dims dims;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h
index 50f4595fc..330283b39 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h
@@ -93,6 +93,26 @@ SYCLFORCEDEVALLEAFCOUNT(const)
SYCLFORCEDEVALLEAFCOUNT()
#undef SYCLFORCEDEVALLEAFCOUNT
+#define SYCLCUSTOMUNARYOPLEAFCOUNT(CVQual)\
+template <typename CustomUnaryFunc, typename XprType>\
+struct LeafCount<CVQual TensorCustomUnaryOp<CustomUnaryFunc, XprType> > {\
+static const size_t Count =1;\
+};
+
+SYCLCUSTOMUNARYOPLEAFCOUNT(const)
+SYCLCUSTOMUNARYOPLEAFCOUNT()
+#undef SYCLCUSTOMUNARYOPLEAFCOUNT
+
+
+#define SYCLCUSTOMBINARYOPLEAFCOUNT(CVQual)\
+template <typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType>\
+struct LeafCount<CVQual TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType> > {\
+static const size_t Count =1;\
+};
+SYCLCUSTOMBINARYOPLEAFCOUNT( const)
+SYCLCUSTOMBINARYOPLEAFCOUNT()
+#undef SYCLCUSTOMBINARYOPLEAFCOUNT
+
/// specialisation of the \ref LeafCount struct when the node type is TensorEvalToOp
#define EVALTOLAYOUTSWAPLEAFCOUNT(CVQual , ExprNode, Num)\
template <typename Expr>\
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h
index fcef0be04..99d528963 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h
@@ -143,6 +143,33 @@ FORCEDEVAL(const)
FORCEDEVAL()
#undef FORCEDEVAL
+
+/// specialisation of the \ref PlaceHolderExpression when the node is
+/// TensorForcedEvalOp
+#define CUSTOMUNARYOPEVAL(CVQual)\
+template <typename CustomUnaryFunc, typename XprType, size_t N>\
+struct PlaceHolderExpression<CVQual TensorCustomUnaryOp<CustomUnaryFunc, XprType>, N> {\
+ typedef CVQual PlaceHolder<CVQual TensorCustomUnaryOp<CustomUnaryFunc, XprType>, N> Type;\
+};
+
+CUSTOMUNARYOPEVAL(const)
+CUSTOMUNARYOPEVAL()
+#undef CUSTOMUNARYOPEVAL
+
+
+/// specialisation of the \ref PlaceHolderExpression when the node is
+/// TensorForcedEvalOp
+#define CUSTOMBINARYOPEVAL(CVQual)\
+template <typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType, size_t N>\
+struct PlaceHolderExpression<CVQual TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType>, N> {\
+ typedef CVQual PlaceHolder<CVQual TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType>, N> Type;\
+};
+
+CUSTOMBINARYOPEVAL(const)
+CUSTOMBINARYOPEVAL()
+#undef CUSTOMBINARYOPEVAL
+
+
/// specialisation of the \ref PlaceHolderExpression when the node is
/// TensorEvalToOp, TensorLayoutSwapOp
#define EVALTOLAYOUTSWAP(CVQual, ExprNode)\