Diffstat (limited to 'unsupported')
-rw-r--r--  unsupported/Eigen/CXX11/Tensor | 3
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/README.md | 83
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h | 32
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h | 147
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h | 3
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorBase.h | 20
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h | 3
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h | 3
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h | 4
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h | 4
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h | 13
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h | 37
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h | 3
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h | 6
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h | 43
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h | 6
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h | 15
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h | 12
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h | 10
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h | 443
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h | 6
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h | 1
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h | 17
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h | 4
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h | 18
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h | 3
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h | 2
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h | 4
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h | 17
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h | 8
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h | 18
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h | 70
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h | 8
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h | 12
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h | 3
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h | 2
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h | 4
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h | 9
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h | 3
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h | 15
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h | 70
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h | 30
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h | 93
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h | 20
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h | 3
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorScan.h | 7
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h | 3
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h | 3
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h | 30
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h | 66
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h | 175
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h | 85
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h | 198
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h | 37
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h | 75
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h | 96
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h | 3
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h | 288
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h | 4
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h | 71
-rw-r--r--  unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h | 4
-rw-r--r--  unsupported/Eigen/CXX11/src/util/EmulateArray.h | 2
-rwxr-xr-x  unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h | 7
-rw-r--r--  unsupported/Eigen/src/EulerAngles/EulerAngles.h | 2
-rw-r--r--  unsupported/Eigen/src/IterativeSolvers/Scaling.h | 6
-rw-r--r--  unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h | 41
-rw-r--r--  unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h | 14
-rw-r--r--  unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h | 2
-rw-r--r--  unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h | 4
-rw-r--r--  unsupported/Eigen/src/SpecialFunctions/arch/CUDA/CudaSpecialFunctions.h | 2
-rw-r--r--  unsupported/test/CMakeLists.txt | 52
-rw-r--r--  unsupported/test/EulerAngles.cpp | 3
-rw-r--r--  unsupported/test/autodiff_scalar.cpp | 15
-rw-r--r--  unsupported/test/cxx11_tensor_argmax_cuda.cu | 3
-rw-r--r--  unsupported/test/cxx11_tensor_argmax_sycl.cpp | 245
-rw-r--r--  unsupported/test/cxx11_tensor_cast_float16_cuda.cu | 3
-rw-r--r--  unsupported/test/cxx11_tensor_complex_cuda.cu | 39
-rw-r--r--  unsupported/test/cxx11_tensor_complex_cwise_ops_cuda.cu | 3
-rw-r--r--  unsupported/test/cxx11_tensor_contract_cuda.cu | 4
-rw-r--r--  unsupported/test/cxx11_tensor_cuda.cu | 3
-rw-r--r--  unsupported/test/cxx11_tensor_custom_op_sycl.cpp | 165
-rw-r--r--  unsupported/test/cxx11_tensor_device.cu | 4
-rw-r--r--  unsupported/test/cxx11_tensor_forced_eval_sycl.cpp | 2
-rw-r--r--  unsupported/test/cxx11_tensor_generator_sycl.cpp | 147
-rw-r--r--  unsupported/test/cxx11_tensor_image_patch_sycl.cpp | 1092
-rw-r--r--  unsupported/test/cxx11_tensor_inflation_sycl.cpp | 136
-rw-r--r--  unsupported/test/cxx11_tensor_layout_swap_sycl.cpp | 126
-rw-r--r--  unsupported/test/cxx11_tensor_of_float16_cuda.cu | 4
-rw-r--r--  unsupported/test/cxx11_tensor_patch_sycl.cpp | 249
-rw-r--r--  unsupported/test/cxx11_tensor_random_cuda.cu | 3
-rw-r--r--  unsupported/test/cxx11_tensor_reduction_cuda.cu | 3
-rw-r--r--  unsupported/test/cxx11_tensor_scan_cuda.cu | 4
-rw-r--r--  unsupported/test/cxx11_tensor_trace.cpp | 171
-rw-r--r--  unsupported/test/cxx11_tensor_volume_patch_sycl.cpp | 222
94 files changed, 4641 insertions, 612 deletions
diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor
index 39916092b..d243fe035 100644
--- a/unsupported/Eigen/CXX11/Tensor
+++ b/unsupported/Eigen/CXX11/Tensor
@@ -19,7 +19,7 @@
#undef isnan
#undef isinf
#undef isfinite
-#include <SYCL/sycl.hpp>
+#include <CL/sycl.hpp>
#include <iostream>
#include <map>
#include <memory>
@@ -141,6 +141,7 @@ typedef unsigned __int64 uint64_t;
#include "src/Tensor/TensorGenerator.h"
#include "src/Tensor/TensorAssign.h"
#include "src/Tensor/TensorScan.h"
+#include "src/Tensor/TensorTrace.h"
#include "src/Tensor/TensorSycl.h"
#include "src/Tensor/TensorExecutor.h"
diff --git a/unsupported/Eigen/CXX11/src/Tensor/README.md b/unsupported/Eigen/CXX11/src/Tensor/README.md
index 38cdb9c69..30d553af7 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/README.md
+++ b/unsupported/Eigen/CXX11/src/Tensor/README.md
@@ -83,8 +83,8 @@ large enough to hold all the data.
// You can also map fixed-size tensors. Here we get a 1d view of
// the 2d fixed-size tensor.
- Tensor<float, Sizes<4, 5>> t_4x3;
- TensorMap<Tensor<float, 1>> t_12(t_4x3, 12);
+ TensorFixedSize<float, Sizes<4, 5>> t_4x3;
+ TensorMap<Tensor<float, 1>> t_12(t_4x3.data(), 12);
#### Class TensorRef
@@ -272,7 +272,7 @@ Operation to a TensorFixedSize instead of a Tensor, which is a bit more
efficient.
// We know that the result is a 4x4x2 tensor!
- TensorFixedSize<float, 4, 4, 2> result = t5;
+ TensorFixedSize<float, Sizes<4, 4, 2>> result = t5;
Similarly, assigning an expression to a TensorMap causes its evaluation. Like
tensors of type TensorFixedSize, TensorMaps cannot be resized so they have to
@@ -296,7 +296,7 @@ the expression in a temporary Tensor of the right size. The code above in
effect does:
// .eval() knows the size!
- TensorFixedSize<float, 4, 4, 2> tmp = t1 + t2;
+ TensorFixedSize<float, Sizes<4, 4, 2>> tmp = t1 + t2;
Tensor<float, 3> result = (tmp * 0.2f).exp();
Note that the return value of ```eval()``` is itself an Operation, so the
@@ -567,11 +567,11 @@ to the rank of the tensor. The content of the tensor is not initialized.
### TensorFixedSize
-Creates a tensor of the specified size. The number of arguments in the Size<>
+Creates a tensor of the specified size. The number of arguments in the Sizes<>
template parameter determines the rank of the tensor. The content of the tensor
is not initialized.
- Eigen::TensorFixedSize<float, Size<3, 4>> a;
+ Eigen::TensorFixedSize<float, Sizes<3, 4>> a;
cout << "Rank: " << a.rank() << endl;
=> Rank: 2
cout << "NumRows: " << a.dimension(0) << " NumCols: " << a.dimension(1) << endl;
@@ -584,11 +584,11 @@ until the TensorMap is discarded, and the size of the data must be large enough
to accommodate all of the coefficients of the tensor.
float data[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
- Eigen::TensorMap<float, 2> a(data, 3, 4);
+ Eigen::TensorMap<Tensor<float, 2>> a(data, 3, 4);
cout << "NumRows: " << a.dimension(0) << " NumCols: " << a.dimension(1) << endl;
=> NumRows: 3 NumCols: 4
cout << "a(1, 2): " << a(1, 2) << endl;
- => a(1, 2): 9
+ => a(1, 2): 7
## Contents Initialization
@@ -1016,13 +1016,20 @@ multidimensional case.
a.setValues({{1, 2}, {4, 5}, {5, 6}});
// Compute the traditional matrix product
- array<IndexPair<int>, 1> product_dims = { IndexPair(1, 0) };
+ Eigen::array<Eigen::IndexPair<int>, 1> product_dims = { Eigen::IndexPair(1, 0) };
Eigen::Tensor<int, 2> AB = a.contract(b, product_dims);
// Compute the product of the transpose of the matrices
- array<IndexPair<int>, 1> transpose_product_dims = { IndexPair(0, 1) };
+ Eigen::array<Eigen::IndexPair<int>, 1> transpose_product_dims = { Eigen::IndexPair(0, 1) };
Eigen::Tensor<int, 2> AtBt = a.contract(b, transpose_product_dims);
-
+
+ // Contraction to scalar value using a double contraction
+ // The first coordinates of both tensors are contracted, as well as both second coordinates
+ Eigen::array<Eigen::IndexPair<int>, 2> double_contraction_product_dims = { Eigen::IndexPair<int>(0, 0), Eigen::IndexPair<int>(1, 1) };
+ Eigen::Tensor<int, 0> AdoublecontractedA = a.contract(a, double_contraction_product_dims);
+
+ // Extracting the scalar value of the tensor contraction for further usage
+ int value = AdoublecontractedA(0);
## Reduction Operations
@@ -1168,6 +1175,58 @@ Reduce a tensor using a user-defined reduction operator. See ```SumReducer```
in TensorFunctors.h for information on how to implement a reduction operator.
+## Trace
+
+A *Trace* operation returns a tensor with fewer dimensions than the original
+tensor. It returns a tensor whose elements are the sum of the elements of the
+original tensor along the main diagonal for a list of specified dimensions, the
+"trace dimensions". Similar to the ```Reduction Dimensions```, the trace dimensions
+are passed as an input parameter to the operation, are of type ```<TensorType>::Dimensions```
+, and have the same requirements when passed as an input parameter. In addition,
+the trace dimensions must have the same size.
+
+Example: Trace along 2 dimensions.
+
+ // Create a tensor of 3 dimensions
+ Eigen::Tensor<int, 3> a(2, 2, 3);
+ a.setValues({{{1, 2, 3}, {4, 5, 6}}, {{7, 8, 9}, {10, 11, 12}}});
+ // Specify the dimensions along which the trace will be computed.
+ // In this example, the trace can only be computed along the dimensions
+ // with indices 0 and 1
+ Eigen::array<int, 2> dims({0, 1});
+ // The output tensor contains all but the trace dimensions.
+ Tensor<int, 1> a_trace = a.trace(dims);
+ cout << "a_trace:" << endl;
+ cout << a_trace << endl;
+ =>
+ a_trace:
+ 11
+ 13
+ 15
+
+
+### <Operation> trace(const Dimensions& new_dims)
+### <Operation> trace()
+
+As a special case, if no parameter is passed to the operation, trace is computed
+along *all* dimensions of the input tensor.
+
+Example: Trace along all dimensions.
+
+ // Create a tensor of 3 dimensions, with all dimensions having the same size.
+ Eigen::Tensor<int, 3> a(3, 3, 3);
+ a.setValues({{{1, 2, 3}, {4, 5, 6}, {7, 8, 9}},
+ {{10, 11, 12}, {13, 14, 15}, {16, 17, 18}},
+ {{19, 20, 21}, {22, 23, 24}, {25, 26, 27}}});
+ // Result is a zero dimension tensor
+ Tensor<int, 0> a_trace = a.trace();
+ cout<<"a_trace:"<<endl;
+ cout<<a_trace<<endl;
+ =>
+ a_trace:
+ 42
+
+
## Scan Operations
A *Scan* operation returns a tensor with the same dimensions as the original
@@ -1314,7 +1373,7 @@ The previous example can be rewritten as follow:
Eigen::Tensor<float, 2, Eigen::ColMajor> a(2, 3);
a.setValues({{0.0f, 100.0f, 200.0f}, {300.0f, 400.0f, 500.0f}});
Eigen::array<Eigen::DenseIndex, 2> two_dim({2, 3});
- Eigen::Tensor<float, 1, Eigen::ColMajor> b;
+ Eigen::Tensor<float, 1, Eigen::ColMajor> b(6);
b.reshape(two_dim) = a;
cout << "b" << endl << b << endl;
=>
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h
index d06f40cd8..c0f33ba2d 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h
@@ -119,6 +119,12 @@ struct TensorEvaluator<const TensorIndexTupleOp<ArgType>, Device>
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+#ifdef EIGEN_USE_SYCL
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const {
+ return m_impl;
+ }
+#endif
+
protected:
TensorEvaluator<ArgType, Device> m_impl;
};
@@ -172,7 +178,7 @@ class TensorTupleReducerOp : public TensorBase<TensorTupleReducerOp<ReduceOp, Di
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorTupleReducerOp(const XprType& expr,
const ReduceOp& reduce_op,
- const int return_dim,
+ const Index return_dim,
const Dims& reduce_dims)
: m_xpr(expr), m_reduce_op(reduce_op), m_return_dim(return_dim), m_reduce_dims(reduce_dims) {}
@@ -187,12 +193,12 @@ class TensorTupleReducerOp : public TensorBase<TensorTupleReducerOp<ReduceOp, Di
const Dims& reduce_dims() const { return m_reduce_dims; }
EIGEN_DEVICE_FUNC
- int return_dim() const { return m_return_dim; }
+ Index return_dim() const { return m_return_dim; }
protected:
typename XprType::Nested m_xpr;
const ReduceOp m_reduce_op;
- const int m_return_dim;
+ const Index m_return_dim;
const Dims m_reduce_dims;
};
@@ -222,7 +228,11 @@ struct TensorEvaluator<const TensorTupleReducerOp<ReduceOp, Dims, ArgType>, Devi
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_orig_impl(op.expression(), device),
m_impl(op.expression().index_tuples().reduce(op.reduce_dims(), op.reduce_op()), device),
- m_return_dim(op.return_dim()) {
+ m_return_dim(op.return_dim())
+#ifdef EIGEN_USE_SYCL
+ ,m_device(device)
+#endif
+ {
gen_strides(m_orig_impl.dimensions(), m_strides);
if (Layout == static_cast<int>(ColMajor)) {
@@ -252,7 +262,16 @@ struct TensorEvaluator<const TensorTupleReducerOp<ReduceOp, Dims, ArgType>, Devi
return (m_return_dim < 0) ? v.first : (v.first % m_stride_mod) / m_stride_div;
}
+ #ifndef EIGEN_USE_SYCL
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+ #else // following functions are required by sycl
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TupleType* data() const { return m_impl.data(); }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index return_dim() const {return m_return_dim;}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const StrideDims& strides() const {return m_strides;}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& stride_mod() const {return m_stride_mod;}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& stride_div() const {return m_stride_div;}
+ const Device& device() const{return m_device;}
+ #endif
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
costPerCoeff(bool vectorized) const {
@@ -288,10 +307,13 @@ struct TensorEvaluator<const TensorTupleReducerOp<ReduceOp, Dims, ArgType>, Devi
protected:
TensorEvaluator<const TensorIndexTupleOp<ArgType>, Device> m_orig_impl;
TensorEvaluator<const TensorReductionOp<ReduceOp, Dims, const TensorIndexTupleOp<ArgType> >, Device> m_impl;
- const int m_return_dim;
+ const Index m_return_dim;
StrideDims m_strides;
Index m_stride_mod;
Index m_stride_div;
+#ifdef EIGEN_USE_SYCL
+ const Device& m_device;
+#endif
};
} // end namespace Eigen
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h
new file mode 100644
index 000000000..442639868
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h
@@ -0,0 +1,147 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Mehdi Goli Codeplay Software Ltd.
+// Ralph Potter Codeplay Software Ltd.
+// Luke Iwanski Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+/*****************************************************************
+ * TensorArgMaxSycl.h
+ * \brief:
+ * TensorArgMaxSycl
+ *
+*****************************************************************/
+
+#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_ARGMAX_SYCL_HPP
+#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_ARGMAX_SYCL_HPP
+namespace Eigen {
+namespace internal {
+ template<typename Dims, typename XprType>
+ struct eval<TensorTupleReducerDeviceOp<Dims, XprType>, Eigen::Dense>
+ {
+ typedef const TensorTupleReducerDeviceOp<Dims, XprType>& type;
+ };
+
+ template<typename Dims, typename XprType>
+ struct nested<TensorTupleReducerDeviceOp<Dims, XprType>, 1,
+ typename eval<TensorTupleReducerDeviceOp<Dims, XprType> >::type>
+ {
+ typedef TensorTupleReducerDeviceOp<Dims, XprType> type;
+ };
+
+template<typename StrideDims, typename XprType>
+struct traits<TensorTupleReducerDeviceOp<StrideDims, XprType> > : public traits<XprType>
+{
+ typedef traits<XprType> XprTraits;
+ typedef typename XprTraits::StorageKind StorageKind;
+ typedef typename XprTraits::Index Index;
+ typedef Index Scalar;
+ typedef typename XprType::Nested Nested;
+ typedef typename remove_reference<Nested>::type _Nested;
+ static const int NumDimensions = XprTraits::NumDimensions;
+ static const int Layout = XprTraits::Layout;
+};
+
+
+}// end namespace internal
+template<typename StrideDims, typename XprType>
+class TensorTupleReducerDeviceOp : public TensorBase<TensorTupleReducerDeviceOp<StrideDims, XprType>, ReadOnlyAccessors>
+{
+ public:
+ typedef typename Eigen::internal::traits<TensorTupleReducerDeviceOp>::Scalar Scalar;
+ typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+ typedef typename Eigen::internal::nested<TensorTupleReducerDeviceOp>::type Nested;
+ typedef typename Eigen::internal::traits<TensorTupleReducerDeviceOp>::StorageKind StorageKind;
+ typedef typename Eigen::internal::traits<TensorTupleReducerDeviceOp>::Index Index;
+ typedef typename XprType::CoeffReturnType TupleType;
+ typedef Index CoeffReturnType;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorTupleReducerDeviceOp(XprType expr,
+ const Index return_dim,
+ const StrideDims strides,
+ const Index stride_mod, const Index stride_div)
+ :m_xpr(expr), m_return_dim(return_dim), m_strides(strides), m_stride_mod(stride_mod), m_stride_div(stride_div) {}
+
+ EIGEN_DEVICE_FUNC
+ const typename internal::remove_all<typename XprType::Nested>::type&
+ expression() const { return m_xpr; }
+
+ EIGEN_DEVICE_FUNC
+ Index return_dim() const { return m_return_dim; }
+
+ EIGEN_DEVICE_FUNC
+ const StrideDims& strides() const { return m_strides; }
+
+ EIGEN_DEVICE_FUNC
+ const Index& stride_mod() const { return m_stride_mod; }
+
+ EIGEN_DEVICE_FUNC
+ const Index& stride_div() const { return m_stride_div; }
+
+ protected:
+ typename Eigen::internal::remove_all<typename
+ XprType::Nested
+ >::type m_xpr;
+ const Index m_return_dim;
+ const StrideDims m_strides;
+ const Index m_stride_mod;
+ const Index m_stride_div;
+};
+
+
+// Eval as rvalue
+template<typename StrideDims, typename ArgType>
+struct TensorEvaluator<const TensorTupleReducerDeviceOp<StrideDims, ArgType>, SyclKernelDevice>
+{
+ typedef TensorTupleReducerDeviceOp<StrideDims, ArgType> XprType;
+ typedef typename XprType::Index Index;
+ typedef typename XprType::Scalar Scalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename XprType::TupleType TupleType;
+ typedef typename TensorEvaluator<ArgType, SyclKernelDevice>::Dimensions Dimensions;
+
+ enum {
+ IsAligned = false,
+ PacketAccess = false,
+ BlockAccess = false,
+ Layout = TensorEvaluator<ArgType, SyclKernelDevice>::Layout,
+ CoordAccess = false,
+ RawAccess = false
+ };
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const SyclKernelDevice& device)
+ : m_impl(op.expression(), device), m_return_dim(op.return_dim()), m_strides(op.strides()), m_stride_mod(op.stride_mod()),
+ m_stride_div(op.stride_div()){}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const {
+ return m_impl.dimensions();
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
+ m_impl.evalSubExprsIfNeeded(NULL);
+ return true;
+ }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+ m_impl.cleanup();
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
+ const TupleType v = m_impl.coeff(index);
+ return (m_return_dim < 0) ? v.first : (v.first % m_stride_mod) / m_stride_div;
+ }
+typedef typename MakeGlobalPointer<typename TensorEvaluator<ArgType , SyclKernelDevice>::CoeffReturnType >::Type ptr_Dev_type;
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ptr_Dev_type data() const { return const_cast<ptr_Dev_type>(m_impl.data()); }
+
+protected:
+ TensorEvaluator<ArgType , SyclKernelDevice> m_impl;
+ const Index m_return_dim;
+ const StrideDims m_strides;
+ const Index m_stride_mod;
+ const Index m_stride_div;
+};
+} // end namespace Eigen
+#endif //UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_ARGMAX_SYCL_HPP
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h
index 166be200c..027305586 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h
@@ -34,6 +34,7 @@ struct traits<TensorAssignOp<LhsXprType, RhsXprType> >
typedef typename remove_reference<RhsNested>::type _RhsNested;
static const std::size_t NumDimensions = internal::traits<LhsXprType>::NumDimensions;
static const int Layout = internal::traits<LhsXprType>::Layout;
+ typedef typename traits<LhsXprType>::PointerType PointerType;
enum {
Flags = 0
@@ -168,7 +169,7 @@ struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device>
/// required by sycl in order to extract the accessor
const TensorEvaluator<RightArgType, Device>& right_impl() const { return m_rightImpl; }
- EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_leftImpl.data(); }
+ EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return m_leftImpl.data(); }
private:
TensorEvaluator<LeftArgType, Device> m_leftImpl;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
index fbe340820..0d6331e9c 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
@@ -619,7 +619,7 @@ class TensorBase<Derived, ReadOnlyAccessors>
const array<Index, NumDimensions>, const Derived>
argmax() const {
array<Index, NumDimensions> in_dims;
- for (int d = 0; d < NumDimensions; ++d) in_dims[d] = d;
+ for (Index d = 0; d < NumDimensions; ++d) in_dims[d] = d;
return TensorTupleReducerOp<
internal::ArgMaxTupleReducer<Tuple<Index, CoeffReturnType> >,
const array<Index, NumDimensions>,
@@ -632,7 +632,7 @@ class TensorBase<Derived, ReadOnlyAccessors>
const array<Index, NumDimensions>, const Derived>
argmin() const {
array<Index, NumDimensions> in_dims;
- for (int d = 0; d < NumDimensions; ++d) in_dims[d] = d;
+ for (Index d = 0; d < NumDimensions; ++d) in_dims[d] = d;
return TensorTupleReducerOp<
internal::ArgMinTupleReducer<Tuple<Index, CoeffReturnType> >,
const array<Index, NumDimensions>,
@@ -643,7 +643,7 @@ class TensorBase<Derived, ReadOnlyAccessors>
const TensorTupleReducerOp<
internal::ArgMaxTupleReducer<Tuple<Index, CoeffReturnType> >,
const array<Index, 1>, const Derived>
- argmax(const int return_dim) const {
+ argmax(const Index return_dim) const {
array<Index, 1> in_dims;
in_dims[0] = return_dim;
return TensorTupleReducerOp<
@@ -656,7 +656,7 @@ class TensorBase<Derived, ReadOnlyAccessors>
const TensorTupleReducerOp<
internal::ArgMinTupleReducer<Tuple<Index, CoeffReturnType> >,
const array<Index, 1>, const Derived>
- argmin(const int return_dim) const {
+ argmin(const Index return_dim) const {
array<Index, 1> in_dims;
in_dims[0] = return_dim;
return TensorTupleReducerOp<
@@ -671,6 +671,18 @@ class TensorBase<Derived, ReadOnlyAccessors>
return TensorReductionOp<Reducer, const Dims, const Derived>(derived(), dims, reducer);
}
+ template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorTraceOp<const Dims, const Derived>
+ trace(const Dims& dims) const {
+ return TensorTraceOp<const Dims, const Derived>(derived(), dims);
+ }
+
+ const TensorTraceOp<const DimensionList<Index, NumDimensions>, const Derived>
+ trace() const {
+ DimensionList<Index, NumDimensions> in_dims;
+ return TensorTraceOp<const DimensionList<Index, NumDimensions>, const Derived>(derived(), in_dims);
+ }
+
template <typename Broadcast> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const TensorBroadcastingOp<const Broadcast, const Derived>
broadcast(const Broadcast& broadcast) const {
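A minimal, self-contained sketch (not part of the patch) of how the two trace() overloads added to TensorBase above behave, assuming the Eigen source root is on the include path so the unsupported CXX11 Tensor header resolves:

    #include <unsupported/Eigen/CXX11/Tensor>
    #include <iostream>

    int main() {
      // Rank-3 tensor; dimensions 0 and 1 have the same size, as trace requires.
      Eigen::Tensor<int, 3> a(2, 2, 3);
      a.setValues({{{1, 2, 3}, {4, 5, 6}}, {{7, 8, 9}, {10, 11, 12}}});

      // trace(dims): sum along the diagonal of the listed dimensions.
      Eigen::array<int, 2> dims({0, 1});
      Eigen::Tensor<int, 1> a_trace = a.trace(dims);  // {1+10, 2+11, 3+12} = {11, 13, 15}
      std::cout << a_trace << std::endl;

      // trace() with no argument reduces over all dimensions, giving a rank-0 tensor.
      Eigen::Tensor<int, 3> b(3, 3, 3);
      b.setZero();
      Eigen::Tensor<int, 0> b_trace = b.trace();
      std::cout << b_trace(0) << std::endl;
      return 0;
    }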
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
index 23a74460e..b6c93aff9 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
@@ -31,6 +31,7 @@ struct traits<TensorBroadcastingOp<Broadcast, XprType> > : public traits<XprType
typedef typename remove_reference<Nested>::type _Nested;
static const int NumDimensions = XprTraits::NumDimensions;
static const int Layout = XprTraits::Layout;
+ typedef typename XprTraits::PointerType PointerType;
};
template<typename Broadcast, typename XprType>
@@ -372,7 +373,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
}
- EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+ EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; }
const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
index c46a778b5..21ffa2872 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
@@ -32,6 +32,7 @@ struct traits<TensorChippingOp<DimId, XprType> > : public traits<XprType>
typedef typename remove_reference<Nested>::type _Nested;
static const int NumDimensions = XprTraits::NumDimensions - 1;
static const int Layout = XprTraits::Layout;
+ typedef typename XprTraits::PointerType PointerType;
};
template<DenseIndex DimId, typename XprType>
@@ -264,7 +265,7 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
TensorOpCost(0, 0, cost, vectorized, PacketSize);
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Eigen::internal::traits<XprType>::PointerType data() const {
CoeffReturnType* result = const_cast<CoeffReturnType*>(m_impl.data());
if (((static_cast<int>(Layout) == static_cast<int>(ColMajor) && m_dim.actualDim() == NumDims) ||
(static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == 0)) &&
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h
index 2c7ba961c..a7c1380b8 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h
@@ -37,6 +37,8 @@ struct traits<TensorConcatenationOp<Axis, LhsXprType, RhsXprType> >
static const int NumDimensions = traits<LhsXprType>::NumDimensions;
static const int Layout = traits<LhsXprType>::Layout;
enum { Flags = 0 };
+ typedef typename conditional<Pointer_type_promotion<typename LhsXprType::Scalar, Scalar>::val,
+ typename traits<LhsXprType>::PointerType, typename traits<RhsXprType>::PointerType>::type PointerType;
};
template<typename Axis, typename LhsXprType, typename RhsXprType>
@@ -275,7 +277,7 @@ struct TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgTy
TensorOpCost(0, 0, compute_cost);
}
- EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+ EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; }
/// required by sycl in order to extract the accessor
const TensorEvaluator<LeftArgType, Device>& left_impl() const { return m_leftImpl; }
/// required by sycl in order to extract the accessor
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
index bf4a476d9..e72ddb4a9 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
@@ -104,6 +104,8 @@ struct traits<TensorContractionOp<Dimensions, LhsXprType, RhsXprType> >
// From NumDims below.
static const int NumDimensions = traits<RhsXprType>::NumDimensions + traits<RhsXprType>::NumDimensions - 2 * array_size<Dimensions>::value;
static const int Layout = traits<LhsXprType>::Layout;
+ typedef typename conditional<Pointer_type_promotion<typename LhsXprType::Scalar, Scalar>::val,
+ typename traits<LhsXprType>::PointerType, typename traits<RhsXprType>::PointerType>::type PointerType;
enum {
Flags = 0
@@ -609,7 +611,7 @@ struct TensorContractionEvaluatorBase
return internal::ploadt<PacketReturnType, LoadMode>(m_result + index);
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const { return m_result; }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Eigen::internal::traits<XprType>::PointerType data() const { return m_result; }
protected:
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void EnableXSMMIfPossible(const array<IndexPair<Index>, ContractDims>& eval_op_indices) {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
index c04b784a4..903bc51cc 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
@@ -12,7 +12,7 @@
#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_CUDA_H
#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_CUDA_H
-#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
+#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC)
namespace Eigen {
@@ -388,7 +388,11 @@ EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
// the sum across all big k blocks of the product of little k block of index (x, y)
// with block of index (y, z). To compute the final output, we need to reduce
// the 8 threads over y by summation.
+#if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
#define shuffleInc(i, j, mask) res(i, j) += __shfl_xor(res(i, j), mask)
+#else
+#define shuffleInc(i, j, mask) res(i, j) += __shfl_xor_sync(0xFFFFFFFF, res(i, j), mask)
+#endif
#define reduceRow(i, mask) \
shuffleInc(i, 0, mask); \
@@ -614,8 +618,13 @@ EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rh
x1 = rhs_pf0.x;
x2 = rhs_pf0.z;
}
+ #if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
x1 = __shfl_xor(x1, 4);
x2 = __shfl_xor(x2, 4);
+ #else
+ x1 = __shfl_xor_sync(0xFFFFFFFF, x1, 4);
+ x2 = __shfl_xor_sync(0xFFFFFFFF, x2, 4);
+ #endif
if((threadIdx.x%8) < 4) {
rhs_pf0.y = x1;
rhs_pf0.w = x2;
@@ -1382,5 +1391,5 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
} // end namespace Eigen
-#endif // EIGEN_USE_GPU and __CUDACC__
+#endif // EIGEN_USE_GPU and EIGEN_CUDACC
#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_CUDA_H
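For reference, a minimal sketch (not from the patch) of the warp-reduction pattern the guards above switch between: toolkits before CUDA 9 only provide __shfl_xor, while CUDA 9 and later require the *_sync variants with an explicit participation mask, which is what the EIGEN_CUDACC_VER check selects. It assumes compilation with nvcc and Eigen's EIGEN_CUDACC_VER macro (major*10000 + minor*100) being defined:

    // Illustrative device helper: butterfly-sum a value across a full warp.
    __device__ inline float warpButterflySum(float v) {
      for (int mask = 16; mask > 0; mask >>= 1) {
    #if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
        v += __shfl_xor(v, mask);                   // pre-CUDA-9: implicit full-warp shuffle
    #else
        v += __shfl_xor_sync(0xFFFFFFFF, v, mask);  // CUDA 9+: explicit full-warp mask
    #endif
      }
      return v;
    }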
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h
index e87de0c57..e6840bc87 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h
@@ -11,7 +11,7 @@
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
/*****************************************************************
- * TensorSyclConvertToDeviceExpression.h
+ * TensorContractionSycl.h
*
* \brief:
* TensorContractionsycl
@@ -84,7 +84,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
this->m_leftImpl.evalSubExprsIfNeeded(NULL);
this->m_rightImpl.evalSubExprsIfNeeded(NULL);
- if (data) {
+ if (data) {
evalTo(data);
return false;
} else {
@@ -173,6 +173,7 @@ typename HostExpr::Index LocalThreadSizeM, typename HostExpr::Index LocalThreadS
LhsLocalAcc localLhs;
RhsLocalAcc localRhs;
OutAccessor out_res;
+ size_t out_offset;
Index roundUpK, M, N, K;
ContractT m_k_strides, m_left_contracting_strides, m_right_contracting_strides;
LeftNocontractT m_i_strides, m_left_nocontract_strides;
@@ -182,18 +183,19 @@ typename HostExpr::Index LocalThreadSizeM, typename HostExpr::Index LocalThreadS
Device dev;
- KernelConstructor(LHSFunctorExpr lhs_functors_, RHSFunctorExpr rhs_functors_, LhsLocalAcc localLhs_, RhsLocalAcc localRhs_, OutAccessor out_res_,
+ KernelConstructor(LHSFunctorExpr lhs_functors_, RHSFunctorExpr rhs_functors_, LhsLocalAcc localLhs_, RhsLocalAcc localRhs_, OutAccessor out_res_, size_t out_offset_,
Index roundUpK_, Index M_, Index N_, Index K_, ContractT m_k_strides_, ContractT m_left_contracting_strides_,
ContractT m_right_contracting_strides_, LeftNocontractT m_i_strides_, RightNocontractT m_j_strides_,
LeftNocontractT m_left_nocontract_strides_, RightNocontractT m_right_nocontract_strides_, LHSTupleType left_tuple_of_accessors_, RHSTupleType right_tuple_of_accessors_, Device dev_)
- :lhs_functors(lhs_functors_), rhs_functors(rhs_functors_), localLhs(localLhs_), localRhs(localRhs_), out_res(out_res_), roundUpK(roundUpK_), M(M_), N(N_), K(K_),
+ :lhs_functors(lhs_functors_), rhs_functors(rhs_functors_), localLhs(localLhs_), localRhs(localRhs_), out_res(out_res_),
+ out_offset(out_offset_), roundUpK(roundUpK_), M(M_), N(N_), K(K_),
m_k_strides(m_k_strides_), m_left_contracting_strides(m_left_contracting_strides_),
m_right_contracting_strides(m_right_contracting_strides_),
m_i_strides(m_i_strides_), m_left_nocontract_strides(m_left_nocontract_strides_),
m_j_strides(m_j_strides_), m_right_nocontract_strides(m_right_nocontract_strides_),
left_tuple_of_accessors(left_tuple_of_accessors_), right_tuple_of_accessors(right_tuple_of_accessors_), dev(dev_){}
- void operator()(cl::sycl::nd_item<1> itemID) {
+ void operator()(cl::sycl::nd_item<2> itemID) {
typedef typename Eigen::TensorSycl::internal::ConvertToDeviceExpression<HostExpr>::Type DevExpr;
typedef typename Eigen::TensorSycl::internal::ConvertToDeviceExpression<LHSHostExpr>::Type LHSDevExpr;
typedef typename Eigen::TensorSycl::internal::ConvertToDeviceExpression<RHSHostExpr>::Type RHSDevExpr;
@@ -230,13 +232,13 @@ typename HostExpr::Index LocalThreadSizeM, typename HostExpr::Index LocalThreadS
const Index nGroupId = itemID.get_group(1); // Work-group ID localCol
const Index linearLocalThreadId = nLocalThreadId*LocalThreadSizeM + mLocalThreadId; // linear local thread ID
// Allocate register space
- float privateLhs;
- float privateRhs[WorkLoadPerThreadN];
- float privateRes[WorkLoadPerThreadM][WorkLoadPerThreadN];
+ LhsScalar privateLhs;
+ RhsScalar privateRhs[WorkLoadPerThreadN];
+ OutScalar privateRes[WorkLoadPerThreadM][WorkLoadPerThreadN];
// Initialise the privateRes accumulation registers
for (Index wLPTM=0; wLPTM<WorkLoadPerThreadM; wLPTM++) {
for (Index wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) {
- privateRes[wLPTM][wLPTN] = 0.0f;
+ privateRes[wLPTM][wLPTN] = static_cast<OutScalar>(0);
}
}
@@ -316,7 +318,7 @@ typename HostExpr::Index LocalThreadSizeM, typename HostExpr::Index LocalThreadS
for (Index wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) {
Index globalCol = nGroupId*TileSizeDimN + nLocalThreadId + wLPTN*LocalThreadSizeN;
if(globalCol<N)
- out_ptr[globalCol*M + globalRow] = privateRes[wLPTM][wLPTN];
+ out_ptr[globalCol*M + globalRow +ConvertToActualSyclOffset(OutScalar, out_offset)] = privateRes[wLPTM][wLPTN];
}
}
}
@@ -356,12 +358,12 @@ template< typename Self, typename OutScalar, typename ContractT, typename LeftNo
// extract lhs functor list
LHSFunctorExpr lhs_functors = Eigen::TensorSycl::internal::extractFunctors(self.left_impl());
// extract rhs functor list
- RHSFunctorExpr rhs_functors = Eigen::TensorSycl::internal::extractFunctors(self.left_impl());
+ RHSFunctorExpr rhs_functors = Eigen::TensorSycl::internal::extractFunctors(self.right_impl());
Index roundUpK = RoundUp(K, TileSizeDimK);
Index roundUpM = RoundUp(M, TileSizeDimM);
Index roundUpN = RoundUp(N, TileSizeDimN);
-
+ ptrdiff_t out_offset = self.device().get_offset(buffer);
self.device().sycl_queue().submit([&](cl::sycl::handler &cgh) {
/// work-around for gcc bug
typedef decltype(Eigen::TensorSycl::internal::createTupleOfAccessors<OrigLHSExpr>(cgh, self.left_impl())) LHSTupleType;
@@ -379,18 +381,17 @@ template< typename Self, typename OutScalar, typename ContractT, typename LeftNo
typedef cl::sycl::accessor<RhsScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> RhsLocalAcc;
RhsLocalAcc localRhs(cl::sycl::range<1>(2* TileSizeDimK * TileSizeDimN), cgh);
- typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer> OutAccessor;
+ typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::global_buffer> OutAccessor;
//OutScalar memory
- OutAccessor out_res= self.device(). template get_sycl_accessor<cl::sycl::access::mode::write>(cgh, buffer);
-
+ OutAccessor out_res= self.device(). template get_sycl_accessor<cl::sycl::access::mode::read_write>(cgh, buffer);
// sycl parallel for
cgh.parallel_for(cl::sycl::nd_range<2>(cl::sycl::range<2>(roundUpM/WorkLoadPerThreadM, roundUpN/WorkLoadPerThreadN),
cl::sycl::range<2>(LocalThreadSizeM, LocalThreadSizeN)),
KernelConstructor<HostExpr, OutScalar, LhsScalar, RhsScalar, LHSFunctorExpr, RHSFunctorExpr, LhsLocalAcc, RhsLocalAcc, OutAccessor, Index, ContractT, LeftNocontractT,
RightNocontractT, lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, TileSizeDimM, TileSizeDimN, TileSizeDimK,
- WorkLoadPerThreadM, WorkLoadPerThreadN, LocalThreadSizeM, LocalThreadSizeN, LoadPerThreadLhs, LoadPerThreadRhs, LHSTupleType, RHSTupleType, Eigen::DefaultDevice>(lhs_functors, rhs_functors,
- localLhs, localRhs, out_res, roundUpK, M, N, K, m_k_strides, m_left_contracting_strides, m_right_contracting_strides,m_i_strides, m_j_strides,
- m_left_nocontract_strides,m_right_nocontract_strides, left_tuple_of_accessors, right_tuple_of_accessors, Eigen::DefaultDevice()));
+ WorkLoadPerThreadM, WorkLoadPerThreadN, LocalThreadSizeM, LocalThreadSizeN, LoadPerThreadLhs, LoadPerThreadRhs, LHSTupleType, RHSTupleType, Eigen::SyclKernelDevice>(lhs_functors, rhs_functors,
+ localLhs, localRhs, out_res, out_offset, roundUpK, M, N, K, m_k_strides, m_left_contracting_strides, m_right_contracting_strides,m_i_strides, m_j_strides,
+ m_left_nocontract_strides,m_right_nocontract_strides, left_tuple_of_accessors, right_tuple_of_accessors, Eigen::SyclKernelDevice()));
});
self.device().asynchronousExec();
}
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h
index b29968b63..182bef918 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h
@@ -32,6 +32,7 @@ struct traits<TensorConversionOp<TargetType, XprType> >
static const int NumDimensions = traits<XprType>::NumDimensions;
static const int Layout = traits<XprType>::Layout;
enum { Flags = 0 };
+ typedef typename TypeConversion<Scalar, typename traits<XprType>::PointerType>::type PointerType;
};
template<typename TargetType, typename XprType>
@@ -244,7 +245,7 @@ struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device>
}
}
- EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+ EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; }
/// required by sycl in order to extract the sycl accessor
const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
index 378f5cccb..84d5be173 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
@@ -231,6 +231,8 @@ struct traits<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> >
typedef typename remove_reference<RhsNested>::type _RhsNested;
static const int NumDimensions = traits<InputXprType>::NumDimensions;
static const int Layout = traits<InputXprType>::Layout;
+ typedef typename conditional<Pointer_type_promotion<typename InputXprType::Scalar, Scalar>::val,
+ typename traits<InputXprType>::PointerType, typename traits<KernelXprType>::PointerType>::type PointerType;
enum {
Flags = 0
@@ -465,7 +467,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
PacketSize));
}
- EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+ EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; }
private:
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
@@ -551,7 +553,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
// Use an optimized implementation of the evaluation code for GPUs whenever possible.
-#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
+#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC)
template <int StaticKernelSize>
struct GetKernelSize {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h
index 4247c1c4a..da88bcb3b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h
@@ -32,19 +32,20 @@ internal::IndexMapper<Index, InputDims, 1, Eigen::internal::traits<HostExpr>::La
Kernel_accessor kernel_filter;
const size_t kernelSize, range_x, range_y;
Buffer_accessor buffer_acc;
+ptrdiff_t out_offset;
Local_accessor local_acc;
FunctorExpr functors;
TupleType tuple_of_accessors;
EigenConvolutionKernel1D(internal::IndexMapper<Index, InputDims, 1, Eigen::internal::traits<HostExpr>::Layout> indexMapper_,
Kernel_accessor kernel_filter_, const size_t kernelSize_, const size_t range_x_, const size_t range_y_,
- Buffer_accessor buffer_acc_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_)
+ Buffer_accessor buffer_acc_, ptrdiff_t out_offset_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_)
:indexMapper(indexMapper_), kernel_filter(kernel_filter_), kernelSize(kernelSize_), range_x(range_x_), range_y(range_y_),
- buffer_acc(buffer_acc_), local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {}
+ buffer_acc(buffer_acc_), out_offset(out_offset_),local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {}
void operator()(cl::sycl::nd_item<2> itemID) {
typedef typename TensorSycl::internal::ConvertToDeviceExpression<HostExpr>::Type DevExpr;
auto device_expr =TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
- auto device_evaluator = Eigen::TensorEvaluator<DevExpr, Eigen::DefaultDevice>(device_expr.expr, Eigen::DefaultDevice());
+ auto device_evaluator = Eigen::TensorEvaluator<DevExpr, Eigen::SyclKernelDevice>(device_expr.expr, Eigen::SyclKernelDevice());
auto buffer_ptr = ConvertToActualTypeSycl(CoeffReturnType, buffer_acc);
auto kernel_ptr = ConvertToActualTypeSycl(KernelType, kernel_filter);
@@ -75,7 +76,7 @@ EigenConvolutionKernel1D(internal::IndexMapper<Index, InputDims, 1, Eigen::inter
}
const size_t tensor_index = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(itemID.get_global(1))
+indexMapper.mapCudaOutputKernelToTensorOutputOffset(itemID.get_local(0) + first_output_start);
- buffer_ptr[tensor_index] = result;
+ buffer_ptr[tensor_index+ConvertToActualSyclOffset(CoeffReturnType, out_offset)] = result;
}
}
};
@@ -89,19 +90,20 @@ internal::IndexMapper<Index, InputDims, 2, Eigen::internal::traits<HostExpr>::La
Kernel_accessor kernel_filter;
const size_t kernelSize_x, kernelSize_y, range_x, range_y , range_z;
Buffer_accessor buffer_acc;
+ptrdiff_t out_offset;
Local_accessor local_acc;
FunctorExpr functors;
TupleType tuple_of_accessors;
EigenConvolutionKernel2D(internal::IndexMapper<Index, InputDims, 2, Eigen::internal::traits<HostExpr>::Layout> indexMapper_,
Kernel_accessor kernel_filter_, const size_t kernelSize_x_, const size_t kernelSize_y_ ,const size_t range_x_, const size_t range_y_, const size_t range_z_,
- Buffer_accessor buffer_acc_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_)
+ Buffer_accessor buffer_acc_, ptrdiff_t out_offset_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_)
:indexMapper(indexMapper_), kernel_filter(kernel_filter_), kernelSize_x(kernelSize_x_), kernelSize_y(kernelSize_y_), range_x(range_x_), range_y(range_y_), range_z(range_z_),
- buffer_acc(buffer_acc_), local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {}
+ buffer_acc(buffer_acc_), out_offset(out_offset_), local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {}
void operator()(cl::sycl::nd_item<3> itemID) {
typedef typename TensorSycl::internal::ConvertToDeviceExpression<HostExpr>::Type DevExpr;
auto device_expr =TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
- auto device_evaluator = Eigen::TensorEvaluator<DevExpr, Eigen::DefaultDevice>(device_expr.expr, Eigen::DefaultDevice());
+ auto device_evaluator = Eigen::TensorEvaluator<DevExpr, Eigen::SyclKernelDevice>(device_expr.expr, Eigen::SyclKernelDevice());
auto buffer_ptr = ConvertToActualTypeSycl(CoeffReturnType, buffer_acc);
auto kernel_ptr = ConvertToActualTypeSycl(KernelType, kernel_filter);
@@ -141,7 +143,7 @@ EigenConvolutionKernel2D(internal::IndexMapper<Index, InputDims, 2, Eigen::inter
}
const size_t tensor_index = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(itemID.get_global(2))
+indexMapper.mapCudaOutputKernelToTensorOutputOffset(itemID.get_local(0) + fitst_x_output_start, itemID.get_local(1) + fitst_y_output_start);
- buffer_ptr[tensor_index] = result;
+ buffer_ptr[tensor_index +ConvertToActualSyclOffset(CoeffReturnType, out_offset)] = result;
}
}
};
@@ -156,21 +158,22 @@ internal::IndexMapper<Index, InputDims, 3, Eigen::internal::traits<HostExpr>::La
Kernel_accessor kernel_filter;
const size_t kernelSize_x, kernelSize_y, kernelSize_z, range_x, range_y , range_z, numP;
Buffer_accessor buffer_acc;
+ptrdiff_t out_offset;
Local_accessor local_acc;
FunctorExpr functors;
TupleType tuple_of_accessors;
EigenConvolutionKernel3D(internal::IndexMapper<Index, InputDims, 3, Eigen::internal::traits<HostExpr>::Layout> indexMapper_,
Kernel_accessor kernel_filter_, const size_t kernelSize_x_, const size_t kernelSize_y_ , const size_t kernelSize_z_ ,
const size_t range_x_, const size_t range_y_, const size_t range_z_, const size_t numP_,
- Buffer_accessor buffer_acc_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_)
+ Buffer_accessor buffer_acc_, ptrdiff_t out_offset_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_)
:indexMapper(indexMapper_), kernel_filter(kernel_filter_), kernelSize_x(kernelSize_x_), kernelSize_y(kernelSize_y_),
kernelSize_z(kernelSize_z_), range_x(range_x_), range_y(range_y_), range_z(range_z_), numP(numP_),
- buffer_acc(buffer_acc_), local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {}
+ buffer_acc(buffer_acc_), out_offset(out_offset_), local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {}
void operator()(cl::sycl::nd_item<3> itemID) {
typedef typename TensorSycl::internal::ConvertToDeviceExpression<HostExpr>::Type DevExpr;
auto device_expr =TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
- auto device_evaluator = Eigen::TensorEvaluator<DevExpr, Eigen::DefaultDevice>(device_expr.expr, Eigen::DefaultDevice());
+ auto device_evaluator = Eigen::TensorEvaluator<DevExpr, Eigen::SyclKernelDevice>(device_expr.expr, Eigen::SyclKernelDevice());
auto buffer_ptr = ConvertToActualTypeSycl(CoeffReturnType, buffer_acc);
auto kernel_ptr = ConvertToActualTypeSycl(KernelType, kernel_filter);
@@ -215,7 +218,7 @@ EigenConvolutionKernel3D(internal::IndexMapper<Index, InputDims, 3, Eigen::inter
}
const size_t tensor_index = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p)
+indexMapper.mapCudaOutputKernelToTensorOutputOffset(itemID.get_local(0) + fitst_x_output_start, itemID.get_local(1) + fitst_y_output_start, itemID.get_local(2) + fitst_z_output_start );
- buffer_ptr[tensor_index] = result;
+ buffer_ptr[tensor_index+ConvertToActualSyclOffset(CoeffReturnType, out_offset)] = result;
}
itemID.barrier(cl::sycl::access::fence_space::local_space);
@@ -297,7 +300,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
/// used by sycl in order to build the sycl buffer
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const{return m_device;}
/// used by sycl in order to build the sycl buffer
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const { return m_buf; }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Eigen::internal::traits<XprType>::PointerType data() const { return m_buf; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() {
// Don't make a local copy of the kernel unless we have to (i.e. it's an
@@ -307,7 +310,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
m_kernel = in_place;
m_local_kernel = false;
} else {
- size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
+ ptrdiff_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
Scalar* local = (Scalar*)m_device.allocate(kernel_sz);
typedef TensorEvalToOp<const KernelArgType> EvalTo;
EvalTo evalToTmp(local, m_kernelArg);
@@ -325,6 +328,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
typedef Eigen::TensorSycl::internal::FunctorExtractor<InputEvaluator> InputFunctorExpr;
// extract input functor list
InputFunctorExpr input_functors = Eigen::TensorSycl::internal::extractFunctors(m_inputImpl);
+ ptrdiff_t out_offset = m_device.get_offset(data);
m_device.sycl_queue().submit([&](cl::sycl::handler &cgh) {
@@ -335,8 +339,8 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
// create input tuple of accessors
InputTupleType tuple_of_accessors = Eigen::TensorSycl::internal::createTupleOfAccessors<InputEvaluator>(cgh, m_inputImpl);
- typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer> OutputAccessorType;
- OutputAccessorType out_res= m_device. template get_sycl_accessor<cl::sycl::access::mode::discard_write>(cgh, data);
+ typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer> OutputAccessorType;
+ OutputAccessorType out_res= m_device. template get_sycl_accessor<cl::sycl::access::mode::write>(cgh, data);
typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::read, cl::sycl::access::target::global_buffer> KernelAccessorType;
KernelAccessorType kernel_acc= m_device. template get_sycl_accessor<cl::sycl::access::mode::read>(cgh, m_kernel);
@@ -358,7 +362,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
cgh.parallel_for(cl::sycl::nd_range<2>(global_range, local_range),
EigenConvolutionKernel1D<CoeffReturnType, Scalar, InputArgType, InputFunctorExpr, Index,
InputDims, KernelAccessorType, OutputAccessorType, InputLocalAcc, InputTupleType>(
- indexMapper,kernel_acc, kernel_size, numX, numP, out_res, local_acc, input_functors, tuple_of_accessors));
+ indexMapper,kernel_acc, kernel_size, numX, numP, out_res, out_offset, local_acc, input_functors, tuple_of_accessors));
break;
}
@@ -383,7 +387,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
cgh.parallel_for(cl::sycl::nd_range<3>(global_range, local_range),
EigenConvolutionKernel2D<CoeffReturnType, Scalar, InputArgType, InputFunctorExpr, Index,
InputDims, KernelAccessorType, OutputAccessorType, InputLocalAcc, InputTupleType>(
- indexMapper,kernel_acc, kernel_size_x, kernel_size_y, numX, numY, numP, out_res, local_acc, input_functors, tuple_of_accessors));
+ indexMapper,kernel_acc, kernel_size_x, kernel_size_y, numX, numY, numP, out_res, out_offset, local_acc, input_functors, tuple_of_accessors));
break;
}
@@ -412,7 +416,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
EigenConvolutionKernel3D<CoeffReturnType, Scalar, InputArgType, InputFunctorExpr, Index,
InputDims, KernelAccessorType, OutputAccessorType, InputLocalAcc, InputTupleType>(
indexMapper,kernel_acc, kernel_size_x, kernel_size_y, kernel_size_z, numX, numY,
- numZ, numP, out_res, local_acc, input_functors, tuple_of_accessors));
+ numZ, numP, out_res, out_offset, local_acc, input_functors, tuple_of_accessors));
break;
}
@@ -421,6 +425,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
}
}
});
+ m_device.asynchronousExec();
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h b/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h
index 83c449cf1..b148dae39 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h
@@ -174,8 +174,10 @@ class TensorCostModel {
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int numThreads(
double output_size, const TensorOpCost& cost_per_coeff, int max_threads) {
double cost = totalCost(output_size, cost_per_coeff);
- int threads = (cost - kStartupCycles) / kPerThreadCycles + 0.9;
- return numext::mini(max_threads, numext::maxi(1, threads));
+ double threads = (cost - kStartupCycles) / kPerThreadCycles + 0.9;
+ // Make sure we don't invoke undefined behavior when we convert to an int.
+ threads = numext::mini<double>(threads, GenericNumTraits<int>::highest());
+ return numext::mini(max_threads, numext::maxi<int>(1, threads));
}
// taskSize assesses parallel task size.
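A standalone sketch of the clamping pattern introduced in numThreads() above (illustrative names, not Eigen's internals): the estimate is kept as a double and bounded by INT_MAX before narrowing, since converting an out-of-range double to int is undefined behavior:

    #include <algorithm>
    #include <limits>

    // Illustrative only: mirrors the TensorCostModel::numThreads change.
    inline int clampedNumThreads(double cost, double startupCycles,
                                 double perThreadCycles, int maxThreads) {
      double threads = (cost - startupCycles) / perThreadCycles + 0.9;
      // Clamp while still in double precision so the cast below cannot overflow.
      threads = std::min(threads, static_cast<double>(std::numeric_limits<int>::max()));
      return std::min(maxThreads, std::max(1, static_cast<int>(threads)));
    }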
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h b/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h
index e020d076f..0e4db46de 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h
@@ -30,6 +30,7 @@ struct traits<TensorCustomUnaryOp<CustomUnaryFunc, XprType> >
typedef typename remove_reference<Nested>::type _Nested;
static const int NumDimensions = traits<XprType>::NumDimensions;
static const int Layout = traits<XprType>::Layout;
+ typedef typename traits<XprType>::PointerType PointerType;
};
template<typename CustomUnaryFunc, typename XprType>
@@ -138,7 +139,11 @@ struct TensorEvaluator<const TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Devi
return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
}
- EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_result; }
+ EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return m_result; }
+
+#ifdef EIGEN_USE_SYCL
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const { return m_device; }
+#endif
protected:
EIGEN_DEVICE_FUNC void evalTo(Scalar* data) {
@@ -180,6 +185,8 @@ struct traits<TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType> >
typedef typename remove_reference<RhsNested>::type _RhsNested;
static const int NumDimensions = traits<LhsXprType>::NumDimensions;
static const int Layout = traits<LhsXprType>::Layout;
+ typedef typename conditional<Pointer_type_promotion<typename LhsXprType::Scalar, Scalar>::val,
+ typename traits<LhsXprType>::PointerType, typename traits<RhsXprType>::PointerType>::type PointerType;
};
template<typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType>
@@ -293,7 +300,11 @@ struct TensorEvaluator<const TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType,
return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
}
- EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_result; }
+ EIGEN_DEVICE_FUNC typename internal::traits<XprType>::PointerType data() const { return m_result; }
+
+#ifdef EIGEN_USE_SYCL
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const { return m_device; }
+#endif
protected:
EIGEN_DEVICE_FUNC void evalTo(Scalar* data) {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
index be8d69386..ded7129da 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
@@ -211,7 +211,7 @@ struct GpuDevice {
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
-#ifndef __CUDA_ARCH__
+#ifndef EIGEN_CUDA_ARCH
cudaError_t err = cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToDevice,
stream_->stream());
EIGEN_UNUSED_VARIABLE(err)
@@ -239,7 +239,7 @@ struct GpuDevice {
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
-#ifndef __CUDA_ARCH__
+#ifndef EIGEN_CUDA_ARCH
cudaError_t err = cudaMemsetAsync(buffer, c, n, stream_->stream());
EIGEN_UNUSED_VARIABLE(err)
assert(err == cudaSuccess);
@@ -265,7 +265,7 @@ struct GpuDevice {
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const {
-#if defined(__CUDACC__) && !defined(__CUDA_ARCH__)
+#if defined(EIGEN_CUDACC) && !defined(EIGEN_CUDA_ARCH)
cudaError_t err = cudaStreamSynchronize(stream_->stream());
if (err != cudaSuccess) {
std::cerr << "Error detected in CUDA stream: "
@@ -304,7 +304,7 @@ struct GpuDevice {
// This function checks if the CUDA runtime recorded an error for the
// underlying stream device.
inline bool ok() const {
-#ifdef __CUDACC__
+#ifdef EIGEN_CUDACC
cudaError_t error = cudaStreamQuery(stream_->stream());
return (error == cudaSuccess) || (error == cudaErrorNotReady);
#else
@@ -323,9 +323,9 @@ struct GpuDevice {
// FIXME: Should be device and kernel specific.
-#ifdef __CUDACC__
+#ifdef EIGEN_CUDACC
static EIGEN_DEVICE_FUNC inline void setCudaSharedMemConfig(cudaSharedMemConfig config) {
-#ifndef __CUDA_ARCH__
+#ifndef EIGEN_CUDA_ARCH
cudaError_t status = cudaDeviceSetSharedMemConfig(config);
EIGEN_UNUSED_VARIABLE(status)
assert(status == cudaSuccess);
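Throughout this patch the raw compiler macros __CUDACC__ and __CUDA_ARCH__ are replaced by Eigen-prefixed names. Presumably those aliases are established once in Eigen's macro headers along the lines of the sketch below (the defining header is not shown in this diff), so the rest of the code only ever tests the project-level macros:

    // Hypothetical sketch of such an alias, not taken verbatim from the patch.
    #if defined(__CUDACC__) && !defined(EIGEN_CUDACC)
    #define EIGEN_CUDACC __CUDACC__
    #endif
    #if defined(__CUDA_ARCH__) && !defined(EIGEN_CUDA_ARCH)
    #define EIGEN_CUDA_ARCH __CUDA_ARCH__
    #endif

    #include <cstdio>
    int main() {
    #ifdef EIGEN_CUDA_ARCH
      std::printf("device compilation pass, arch %d\n", EIGEN_CUDA_ARCH / 100);
    #else
      std::printf("host compilation pass\n");
    #endif
    }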
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h
index ccaaa6cb2..341889e88 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h
@@ -35,7 +35,7 @@ struct DefaultDevice {
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const {
-#ifndef __CUDA_ARCH__
+#ifndef EIGEN_CUDA_ARCH
// Running on the host CPU
return 1;
#else
@@ -45,7 +45,7 @@ struct DefaultDevice {
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
-#if !defined(__CUDA_ARCH__) && !defined(__SYCL_DEVICE_ONLY__)
+#if !defined(EIGEN_CUDA_ARCH) && !defined(__SYCL_DEVICE_ONLY__)
// Running on the host CPU
return l1CacheSize();
#else
@@ -55,7 +55,7 @@ struct DefaultDevice {
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
-#if !defined(__CUDA_ARCH__) && !defined(__SYCL_DEVICE_ONLY__)
+#if !defined(EIGEN_CUDA_ARCH) && !defined(__SYCL_DEVICE_ONLY__)
// Running single threaded on the host CPU
return l3CacheSize();
#else
@@ -65,13 +65,13 @@ struct DefaultDevice {
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const {
-#ifndef __CUDA_ARCH__
+#ifndef EIGEN_CUDA_ARCH
// Running single threaded on the host CPU
// Should return an enum that encodes the ISA supported by the CPU
return 1;
#else
// Running on a CUDA device
- return __CUDA_ARCH__ / 100;
+ return EIGEN_CUDA_ARCH / 100;
#endif
}
};
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
index e209799bb..6158acbd9 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
@@ -14,10 +14,41 @@
#if defined(EIGEN_USE_SYCL) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H)
#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H
+template<size_t Align> struct CheckAlignStatically {
+ static const bool Val= (((Align&(Align-1))==0) && (Align >= sizeof(void *)));
+};
+template <bool IsAligned, size_t Align>
+struct Conditional_Allocate {
+
+ EIGEN_ALWAYS_INLINE static void* conditional_allocate(std::size_t elements) {
+ return aligned_alloc(Align, elements);
+ }
+};
+template <size_t Align>
+struct Conditional_Allocate<false, Align> {
+
+ EIGEN_ALWAYS_INLINE static void* conditional_allocate(std::size_t elements){
+ return malloc(elements);
+ }
+};
+template <typename Scalar, size_t Align = EIGEN_MAX_ALIGN_BYTES, class Allocator = std::allocator<Scalar>>
+struct SyclAllocator {
+ typedef Scalar value_type;
+ typedef typename std::allocator_traits<Allocator>::pointer pointer;
+ typedef typename std::allocator_traits<Allocator>::size_type size_type;
+
+ SyclAllocator( ){};
+ Scalar* allocate(std::size_t elements) {
+ return static_cast<Scalar*>(Conditional_Allocate<CheckAlignStatically<Align>::Val, Align>::conditional_allocate(elements));
+ }
+ void deallocate(Scalar * p, std::size_t size) { EIGEN_UNUSED_VARIABLE(size); free(p); }
+};
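The three helpers above pick the allocation routine at compile time: when the requested alignment is a power of two and at least sizeof(void*), aligned_alloc is used, otherwise plain malloc. A standalone sketch of the same dispatch (not the Eigen code itself, and assuming a C++17/POSIX aligned_alloc is available):

    #include <cstdint>
    #include <cstdio>
    #include <cstdlib>

    template <std::size_t Align>
    struct IsValidAlignment {
      static const bool value = ((Align & (Align - 1)) == 0) && (Align >= sizeof(void*));
    };

    template <bool Aligned, std::size_t Align>
    struct AllocateImpl {  // valid alignment: use aligned_alloc
      static void* run(std::size_t bytes) { return aligned_alloc(Align, bytes); }
    };
    template <std::size_t Align>
    struct AllocateImpl<false, Align> {  // otherwise fall back to malloc
      static void* run(std::size_t bytes) { return malloc(bytes); }
    };

    int main() {
      void* p = AllocateImpl<IsValidAlignment<64>::value, 64>::run(1024);  // 1024 is a multiple of 64
      std::printf("64-byte aligned: %d\n",
                  static_cast<int>(reinterpret_cast<std::uintptr_t>(p) % 64 == 0));
      free(p);
    }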
namespace Eigen {
- #define ConvertToActualTypeSycl(Scalar, buf_acc) reinterpret_cast<typename cl::sycl::global_ptr<Scalar>::pointer_t>((&(*buf_acc.get_pointer())))
+#define ConvertToActualTypeSycl(Scalar, buf_acc) static_cast<Scalar*>(static_cast<void*>(((buf_acc.get_pointer().get()))))
+#define ConvertToActualSyclOffset(Scalar, offset) offset/sizeof(Scalar)
+
template <typename Scalar, typename read_accessor, typename write_accessor> class MemCopyFunctor {
public:
@@ -40,47 +71,58 @@ namespace Eigen {
size_t m_offset;
};
+template<typename AccType>
struct memsetkernelFunctor{
- typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer> AccType;
AccType m_acc;
+ const ptrdiff_t buff_offset;
const size_t m_rng, m_c;
- memsetkernelFunctor(AccType acc, const size_t rng, const size_t c):m_acc(acc), m_rng(rng), m_c(c){}
+ memsetkernelFunctor(AccType acc, const ptrdiff_t buff_offset_, const size_t rng, const size_t c):m_acc(acc), buff_offset(buff_offset_), m_rng(rng), m_c(c){}
void operator()(cl::sycl::nd_item<1> itemID) {
auto globalid=itemID.get_global_linear_id();
- if (globalid< m_rng) m_acc[globalid] = m_c;
+ if (globalid< m_rng) m_acc[globalid + buff_offset] = m_c;
}
};
+struct memsetCghFunctor{
+ cl::sycl::buffer<uint8_t, 1, SyclAllocator<uint8_t, EIGEN_MAX_ALIGN_BYTES> >& m_buf;
+ const ptrdiff_t& buff_offset;
+ const size_t& rng , GRange, tileSize;
+ const int &c;
+ memsetCghFunctor(cl::sycl::buffer<uint8_t, 1, SyclAllocator<uint8_t, EIGEN_MAX_ALIGN_BYTES> >& buff, const ptrdiff_t& buff_offset_, const size_t& rng_, const size_t& GRange_, const size_t& tileSize_, const int& c_)
+ :m_buf(buff), buff_offset(buff_offset_), rng(rng_), GRange(GRange_), tileSize(tileSize_), c(c_){}
+
+ void operator()(cl::sycl::handler &cgh) const {
+ auto buf_acc = m_buf.template get_access<cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer>(cgh);
+ typedef decltype(buf_acc) AccType;
+ cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), memsetkernelFunctor<AccType>(buf_acc, buff_offset, rng, c));
+ }
+};
+
+// get_devices returns all the available OpenCL devices. Either use device_selector or exclude devices that ComputeCpp does not support (AMD OpenCL for CPU and Intel GPU).
EIGEN_STRONG_INLINE auto get_sycl_supported_devices()->decltype(cl::sycl::device::get_devices()){
- auto devices = cl::sycl::device::get_devices();
- std::vector<cl::sycl::device>::iterator it =devices.begin();
- while(it!=devices.end()) {
- /// get_devices returns all the available opencl devices. Either use device_selector or exclude devices that computecpp does not support (AMD OpenCL for CPU )
- auto s= (*it).template get_info<cl::sycl::info::device::vendor>();
- std::transform(s.begin(), s.end(), s.begin(), ::tolower);
- if((*it).is_cpu() && s.find("amd")!=std::string::npos && s.find("apu") == std::string::npos){ // remove amd cpu as it is not supported by computecpp allow APUs
- it=devices.erase(it);
- }
- else{
- ++it;
+std::vector<cl::sycl::device> supported_devices;
+auto platform_list = cl::sycl::platform::get_platforms();
+for(const auto& platform : platform_list){
+ auto device_list = platform.get_devices();
+ auto platform_name =platform.template get_info<cl::sycl::info::platform::name>();
+ std::transform(platform_name.begin(), platform_name.end(), platform_name.begin(), ::tolower);
+ for(const auto& device : device_list){
+ auto vendor = device.template get_info<cl::sycl::info::device::vendor>();
+ std::transform(vendor.begin(), vendor.end(), vendor.begin(), ::tolower);
+    bool unsupported_condition = (device.is_cpu() && platform_name.find("amd")!=std::string::npos && vendor.find("apu") == std::string::npos) ||
+      (device.is_gpu() && platform_name.find("intel")!=std::string::npos);
+    if(!unsupported_condition){
+ std::cout << "Platform name "<< platform_name << std::endl;
+ supported_devices.push_back(device);
}
}
- return devices;
+}
+return supported_devices;
}
-struct QueueInterface {
- /// class members:
- bool exception_caught_ = false;
-
- mutable std::mutex mutex_;
-
- /// std::map is the container used to make sure that we create only one buffer
- /// per pointer. The lifespan of the buffer now depends on the lifespan of SyclDevice.
- /// If a non-read-only pointer is needed to be accessed on the host we should manually deallocate it.
- mutable std::map<const uint8_t *, cl::sycl::buffer<uint8_t, 1>> buffer_map;
- /// sycl queue
- mutable cl::sycl::queue m_queue;
+class QueueInterface {
+public:
/// creating device by using cl::sycl::selector or cl::sycl::device both are the same and can be captured through dev_Selector typename
/// SyclStreamDevice is not owned. it is the caller's responsibility to destroy it.
template<typename dev_Selector> explicit QueueInterface(const dev_Selector& s):
@@ -116,11 +158,11 @@ m_queue(cl::sycl::queue(s, [&](cl::sycl::exception_list l) {
/// use this pointer as a key in our buffer_map and we make sure that we dedicate only one buffer only for this pointer.
/// The device pointer would be deleted by calling deallocate function.
EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
- auto buf = cl::sycl::buffer<uint8_t,1>(cl::sycl::range<1>(num_bytes));
+ std::lock_guard<std::mutex> lock(mutex_);
+ auto buf = cl::sycl::buffer<uint8_t,1, SyclAllocator<uint8_t, EIGEN_MAX_ALIGN_BYTES> >(cl::sycl::range<1>(num_bytes));
auto ptr =buf.get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::host_buffer>().get_pointer();
buf.set_final_data(nullptr);
- std::lock_guard<std::mutex> lock(mutex_);
- buffer_map.insert(std::pair<const uint8_t *, cl::sycl::buffer<uint8_t, 1>>(static_cast<const uint8_t*>(ptr),buf));
+ buffer_map.insert(std::pair<const uint8_t *, cl::sycl::buffer<uint8_t, 1, SyclAllocator<uint8_t, EIGEN_MAX_ALIGN_BYTES> > >(static_cast<const uint8_t*>(ptr),buf));
return static_cast<void*>(ptr);
}
@@ -138,62 +180,113 @@ m_queue(cl::sycl::queue(s, [&](cl::sycl::exception_list l) {
std::lock_guard<std::mutex> lock(mutex_);
buffer_map.clear();
}
-
- EIGEN_STRONG_INLINE std::map<const uint8_t *, cl::sycl::buffer<uint8_t,1>>::iterator find_buffer(const void* ptr) const {
- std::lock_guard<std::mutex> lock(mutex_);
- auto it1 = buffer_map.find(static_cast<const uint8_t*>(ptr));
- if (it1 != buffer_map.end()){
- return it1;
- }
- else{
- for(std::map<const uint8_t *, cl::sycl::buffer<uint8_t,1>>::iterator it=buffer_map.begin(); it!=buffer_map.end(); ++it){
- auto size = it->second.get_size();
- if((it->first < (static_cast<const uint8_t*>(ptr))) && ((static_cast<const uint8_t*>(ptr)) < (it->first + size)) ) return it;
- }
- }
- std::cerr << "No sycl buffer found. Make sure that you have allocated memory for your buffer by calling malloc-ed function."<< std::endl;
- abort();
+  /// memcpyHostToDevice copies data from a host pointer into a device-only buffer. Using the device
+  /// pointer as a key we find the corresponding sycl buffer and create a write accessor on it on the device.
+  /// The host source pointer is wrapped in a temporary sycl buffer with map_allocator, and a kernel then
+  /// copies the data into the device buffer in parallel.
+  /// In this case we can separate the actual kernel execution from the data transfer, which is required for benchmarking.
+  /// Also, this is faster as it uses the map_allocator instead of a plain memcpy.
+ template<typename Index> EIGEN_STRONG_INLINE void memcpyHostToDevice(Index *dst, const Index *src, size_t n) const {
+ auto it =find_buffer(dst);
+ auto offset =static_cast<const uint8_t*>(static_cast<const void*>(dst))- it->first;
+ offset/=sizeof(Index);
+ size_t rng, GRange, tileSize;
+ parallel_for_setup(n/sizeof(Index), tileSize, rng, GRange);
+ auto src_buf = cl::sycl::buffer<uint8_t, 1, cl::sycl::map_allocator<uint8_t> >(static_cast<uint8_t*>(static_cast<void*>(const_cast<Index*>(src))), cl::sycl::range<1>(n));
+ m_queue.submit([&](cl::sycl::handler &cgh) {
+ auto dst_acc= it->second.template get_access<cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer>(cgh);
+ auto src_acc =src_buf.template get_access<cl::sycl::access::mode::read, cl::sycl::access::target::global_buffer>(cgh);
+ typedef decltype(src_acc) read_accessor;
+ typedef decltype(dst_acc) write_accessor;
+ cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor<Index, read_accessor, write_accessor>(src_acc, dst_acc, rng, offset, 0));
+ });
+ synchronize();
}
-
- // This function checks if the runtime recorded an error for the
- // underlying stream device.
- EIGEN_STRONG_INLINE bool ok() const {
- if (!exception_caught_) {
- m_queue.wait_and_throw();
- }
- return !exception_caught_;
+  /// The memcpyDeviceToHost is used to copy data from the device to the host. Here, in order to avoid copying the data twice, we create a sycl
+  /// buffer with map_allocator for the destination pointer with a discard_write accessor on it. The lifespan of the buffer is bound to the
+  /// lifespan of the memcpyDeviceToHost function. We create a kernel to copy the data from the device-only source buffer to the destination
+  /// buffer with map_allocator on the gpu in parallel. At the end of the function call the destination buffer is destroyed and the data
+  /// is available on the dst pointer using the fast copy technique (map_allocator). In this way we make sure that we copy the data back
+  /// to the cpu only once per function call.
+ template<typename Index> EIGEN_STRONG_INLINE void memcpyDeviceToHost(void *dst, const Index *src, size_t n) const {
+ auto it =find_buffer(src);
+ auto offset =static_cast<const uint8_t*>(static_cast<const void*>(src))- it->first;
+ offset/=sizeof(Index);
+ size_t rng, GRange, tileSize;
+ parallel_for_setup(n/sizeof(Index), tileSize, rng, GRange);
+ auto dest_buf = cl::sycl::buffer<uint8_t, 1, cl::sycl::map_allocator<uint8_t> >(static_cast<uint8_t*>(dst), cl::sycl::range<1>(n));
+ m_queue.submit([&](cl::sycl::handler &cgh) {
+ auto src_acc= it->second.template get_access<cl::sycl::access::mode::read, cl::sycl::access::target::global_buffer>(cgh);
+ auto dst_acc =dest_buf.template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer>(cgh);
+ typedef decltype(src_acc) read_accessor;
+ typedef decltype(dst_acc) write_accessor;
+ cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor<Index, read_accessor, write_accessor>(src_acc, dst_acc, rng, 0, offset));
+ });
+ synchronize();
}
- // destructor
- ~QueueInterface() { buffer_map.clear(); }
-};
+ /// the memcpy function
+ template<typename Index> EIGEN_STRONG_INLINE void memcpy(void *dst, const Index *src, size_t n) const {
+ auto it1 = find_buffer(static_cast<const void*>(src));
+ auto it2 = find_buffer(dst);
+ auto offset= (static_cast<const uint8_t*>(static_cast<const void*>(src))) - it1->first;
+ auto i= (static_cast<const uint8_t*>(dst)) - it2->first;
+ offset/=sizeof(Index);
+ i/=sizeof(Index);
+ size_t rng, GRange, tileSize;
+ parallel_for_setup(n/sizeof(Index), tileSize, rng, GRange);
+ m_queue.submit([&](cl::sycl::handler &cgh) {
+ auto src_acc =it1->second.template get_access<cl::sycl::access::mode::read, cl::sycl::access::target::global_buffer>(cgh);
+ auto dst_acc =it2->second.template get_access<cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer>(cgh);
+ typedef decltype(src_acc) read_accessor;
+ typedef decltype(dst_acc) write_accessor;
+ cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor<Index, read_accessor, write_accessor>(src_acc, dst_acc, rng, i, offset));
+ });
+ synchronize();
+ }
-struct SyclDevice {
- // class member.
- QueueInterface* m_queue_stream;
- /// QueueInterface is not owned. it is the caller's responsibility to destroy it.
- explicit SyclDevice(QueueInterface* queue_stream) : m_queue_stream(queue_stream){}
+ EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const {
+ size_t rng, GRange, tileSize;
+ parallel_for_setup(n, tileSize, rng, GRange);
+ auto it1 = find_buffer(static_cast<const void*>(data));
+ ptrdiff_t buff_offset= (static_cast<const uint8_t*>(data)) - it1->first;
+ m_queue.submit(memsetCghFunctor(it1->second, buff_offset, rng, GRange, tileSize, c ));
+ synchronize();
+ }
/// Creation of sycl accessor for a buffer. This function first tries to find
/// the buffer in the buffer_map. If found it gets the accessor from it, if not,
/// the function then adds an entry by creating a sycl buffer for that particular pointer.
template <cl::sycl::access::mode AcMd> EIGEN_STRONG_INLINE cl::sycl::accessor<uint8_t, 1, AcMd, cl::sycl::access::target::global_buffer>
get_sycl_accessor(cl::sycl::handler &cgh, const void* ptr) const {
- return (get_sycl_buffer(ptr).template get_access<AcMd, cl::sycl::access::target::global_buffer>(cgh));
+ return (find_buffer(ptr)->second.template get_access<AcMd, cl::sycl::access::target::global_buffer>(cgh));
}
/// Accessing the created sycl device buffer for the device pointer
- EIGEN_STRONG_INLINE cl::sycl::buffer<uint8_t, 1>& get_sycl_buffer(const void * ptr) const {
- return m_queue_stream->find_buffer(ptr)->second;
+ EIGEN_STRONG_INLINE cl::sycl::buffer<uint8_t, 1, SyclAllocator<uint8_t, EIGEN_MAX_ALIGN_BYTES> >& get_sycl_buffer(const void * ptr) const {
+ return find_buffer(ptr)->second;
+ }
+
+ EIGEN_STRONG_INLINE ptrdiff_t get_offset(const void *ptr) const {
+ return (static_cast<const uint8_t*>(ptr))-(find_buffer(ptr)->first);
+ }
+
+ EIGEN_STRONG_INLINE void synchronize() const {
+ m_queue.wait_and_throw(); //pass
+ }
+
+ EIGEN_STRONG_INLINE void asynchronousExec() const {
+    /// FIXME: currently there is a race condition regarding the async scheduler.
+    //sycl_queue().throw_asynchronous(); // FIXME: does not pass. Temporarily disabled.
+ m_queue.wait_and_throw(); //pass
}
- /// This is used to prepare the number of threads and also the number of threads per block for sycl kernels
template<typename Index>
EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize, Index &rng, Index &GRange) const {
- tileSize =static_cast<Index>(sycl_queue().get_device(). template get_info<cl::sycl::info::device::max_work_group_size>());
- auto s= sycl_queue().get_device().template get_info<cl::sycl::info::device::vendor>();
+ tileSize =static_cast<Index>(m_queue.get_device(). template get_info<cl::sycl::info::device::max_work_group_size>());
+ auto s= m_queue.get_device().template get_info<cl::sycl::info::device::vendor>();
std::transform(s.begin(), s.end(), s.begin(), ::tolower);
- if(sycl_queue().get_device().is_cpu()){ // intel doesnot allow to use max workgroup size
+    if(m_queue.get_device().is_cpu()){ // Intel does not allow using the max workgroup size
tileSize=std::min(static_cast<Index>(256), static_cast<Index>(tileSize));
}
rng = n;
@@ -210,7 +303,7 @@ struct SyclDevice {
template<typename Index>
EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1, Index &tileSize0, Index &tileSize1, Index &rng0, Index &rng1, Index &GRange0, Index &GRange1) const {
Index max_workgroup_Size = static_cast<Index>(maxSyclThreadsPerBlock());
- if(sycl_queue().get_device().is_cpu()){ // intel doesnot allow to use max workgroup size
+    if(m_queue.get_device().is_cpu()){ // Intel does not allow using the max workgroup size
max_workgroup_Size=std::min(static_cast<Index>(256), static_cast<Index>(max_workgroup_Size));
}
Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
@@ -234,13 +327,11 @@ struct SyclDevice {
}
}
-
-
/// This is used to prepare the number of threads and also the number of threads per block for sycl kernels
template<typename Index>
EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1,Index dim2, Index &tileSize0, Index &tileSize1, Index &tileSize2, Index &rng0, Index &rng1, Index &rng2, Index &GRange0, Index &GRange1, Index &GRange2) const {
Index max_workgroup_Size = static_cast<Index>(maxSyclThreadsPerBlock());
- if(sycl_queue().get_device().is_cpu()){ // intel doesnot allow to use max workgroup size
+    if(m_queue.get_device().is_cpu()){ // Intel does not allow using the max workgroup size
max_workgroup_Size=std::min(static_cast<Index>(256), static_cast<Index>(max_workgroup_Size));
}
Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
@@ -273,6 +364,108 @@ struct SyclDevice {
if (xMode != 0) GRange0 += static_cast<Index>(tileSize0 - xMode);
}
}
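All of the parallel_for_setup overloads round the global range up to a whole multiple of the work-group (tile) size; the kernels then mask off the out-of-range work items with a `globalid < rng` check. A minimal sketch of that rounding:

    #include <cstddef>
    #include <cstdio>

    std::size_t round_up_to_tile(std::size_t n, std::size_t tile) {
      const std::size_t rem = n % tile;
      return rem == 0 ? n : n + (tile - rem);  // pad the range to the next tile boundary
    }

    int main() {
      std::printf("%zu\n", round_up_to_tile(1000, 256));  // 1024: four full tiles of 256
    }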
+
+ EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const {
+ return m_queue.get_device(). template get_info<cl::sycl::info::device::max_compute_units>();
+ }
+
+ EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const {
+ return m_queue.get_device(). template get_info<cl::sycl::info::device::max_work_group_size>();
+ }
+
+  /// Not needed for SYCL; it should behave the same as the CPU version.
+ EIGEN_STRONG_INLINE int majorDeviceVersion() const { return 1; }
+
+ EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const {
+    // OpenCL does not have such a concept
+ return 2;
+ }
+
+ EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const {
+ return m_queue.get_device(). template get_info<cl::sycl::info::device::local_mem_size>();
+ }
+
+ EIGEN_STRONG_INLINE cl::sycl::queue& sycl_queue() const { return m_queue;}
+
+ // This function checks if the runtime recorded an error for the
+ // underlying stream device.
+ EIGEN_STRONG_INLINE bool ok() const {
+ if (!exception_caught_) {
+ m_queue.wait_and_throw();
+ }
+ return !exception_caught_;
+ }
+
+ // destructor
+ ~QueueInterface() { buffer_map.clear(); }
+
+private:
+ /// class members:
+ bool exception_caught_ = false;
+
+ mutable std::mutex mutex_;
+
+ /// std::map is the container used to make sure that we create only one buffer
+ /// per pointer. The lifespan of the buffer now depends on the lifespan of SyclDevice.
+  /// If a non-read-only pointer needs to be accessed on the host, it must be deallocated manually.
+ mutable std::map<const uint8_t *, cl::sycl::buffer<uint8_t, 1, SyclAllocator<uint8_t, EIGEN_MAX_ALIGN_BYTES> > > buffer_map;
+ /// sycl queue
+ mutable cl::sycl::queue m_queue;
+
+ EIGEN_STRONG_INLINE std::map<const uint8_t *, cl::sycl::buffer<uint8_t,1, SyclAllocator<uint8_t, EIGEN_MAX_ALIGN_BYTES> > >::iterator find_buffer(const void* ptr) const {
+ std::lock_guard<std::mutex> lock(mutex_);
+ auto it1 = buffer_map.find(static_cast<const uint8_t*>(ptr));
+ if (it1 != buffer_map.end()){
+ return it1;
+ }
+ else{
+ for(std::map<const uint8_t *, cl::sycl::buffer<uint8_t,1, SyclAllocator<uint8_t, EIGEN_MAX_ALIGN_BYTES> > >::iterator it=buffer_map.begin(); it!=buffer_map.end(); ++it){
+ auto size = it->second.get_size();
+ if((it->first < (static_cast<const uint8_t*>(ptr))) && ((static_cast<const uint8_t*>(ptr)) < (it->first + size)) ) return it;
+ }
+ }
+ std::cerr << "No sycl buffer found. Make sure that you have allocated memory for your buffer by calling malloc-ed function."<< std::endl;
+ abort();
+ }
+};
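find_buffer above resolves an interior device pointer to the buffer that contains it: first by exact key lookup, then by scanning for the entry whose [base, base + size) range covers the pointer. A standalone sketch of that lookup and of the byte-offset computation it enables, using a plain std::map of sizes instead of sycl buffers:

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>
    #include <map>

    typedef std::map<const uint8_t*, std::size_t> BufferMap;

    // Return the entry whose range contains ptr, and the byte offset of ptr inside it.
    BufferMap::const_iterator find_buffer(const BufferMap& m, const void* ptr,
                                          std::ptrdiff_t& offset) {
      const uint8_t* p = static_cast<const uint8_t*>(ptr);
      for (BufferMap::const_iterator it = m.begin(); it != m.end(); ++it) {
        if (it->first <= p && p < it->first + it->second) {
          offset = p - it->first;
          return it;
        }
      }
      offset = -1;  // not found
      return m.end();
    }

    int main() {
      uint8_t storage[256];
      BufferMap m;
      m[storage] = sizeof(storage);
      std::ptrdiff_t off = 0;
      find_buffer(m, storage + 40, off);
      std::printf("offset = %td\n", off);  // 40
    }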
+
+// Here is a sycl device struct which accepts the sycl queue interface
+// as an input.
+struct SyclDevice {
+ // class member.
+ QueueInterface* m_queue_stream;
+ /// QueueInterface is not owned. it is the caller's responsibility to destroy it.
+ explicit SyclDevice(QueueInterface* queue_stream) : m_queue_stream(queue_stream){}
+
+ // get sycl accessor
+ template <cl::sycl::access::mode AcMd> EIGEN_STRONG_INLINE cl::sycl::accessor<uint8_t, 1, AcMd, cl::sycl::access::target::global_buffer>
+ get_sycl_accessor(cl::sycl::handler &cgh, const void* ptr) const {
+ return m_queue_stream->template get_sycl_accessor<AcMd>(cgh, ptr);
+ }
+
+ /// Accessing the created sycl device buffer for the device pointer
+ EIGEN_STRONG_INLINE cl::sycl::buffer<uint8_t, 1, SyclAllocator<uint8_t, EIGEN_MAX_ALIGN_BYTES> >& get_sycl_buffer(const void * ptr) const {
+ return m_queue_stream->get_sycl_buffer(ptr);
+ }
+
+ /// This is used to prepare the number of threads and also the number of threads per block for sycl kernels
+ template<typename Index>
+ EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize, Index &rng, Index &GRange) const {
+ m_queue_stream->parallel_for_setup(n, tileSize, rng, GRange);
+ }
+
+ /// This is used to prepare the number of threads and also the number of threads per block for sycl kernels
+ template<typename Index>
+ EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1, Index &tileSize0, Index &tileSize1, Index &rng0, Index &rng1, Index &GRange0, Index &GRange1) const {
+ m_queue_stream->parallel_for_setup(dim0, dim1, tileSize0, tileSize1, rng0, rng1, GRange0, GRange1);
+ }
+
+ /// This is used to prepare the number of threads and also the number of threads per block for sycl kernels
+ template<typename Index>
+ EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1,Index dim2, Index &tileSize0, Index &tileSize1, Index &tileSize2, Index &rng0, Index &rng1, Index &rng2, Index &GRange0, Index &GRange1, Index &GRange2) const {
+ m_queue_stream->parallel_for_setup(dim0, dim1, dim2, tileSize0, tileSize1, tileSize2, rng0, rng1, rng2, GRange0, GRange1, GRange2);
+
+ }
/// allocate device memory
EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const {
return m_queue_stream->allocate(num_bytes);
@@ -287,78 +480,27 @@ struct SyclDevice {
/// the memcpy function
template<typename Index> EIGEN_STRONG_INLINE void memcpy(void *dst, const Index *src, size_t n) const {
- auto it1 = m_queue_stream->find_buffer(static_cast<const void*>(src));
- auto it2 = m_queue_stream->find_buffer(dst);
- auto offset= (static_cast<const uint8_t*>(static_cast<const void*>(src))) - it1->first;
- auto i= (static_cast<const uint8_t*>(dst)) - it2->first;
- offset/=sizeof(Index);
- i/=sizeof(Index);
- size_t rng, GRange, tileSize;
- parallel_for_setup(n/sizeof(Index), tileSize, rng, GRange);
- sycl_queue().submit([&](cl::sycl::handler &cgh) {
- auto src_acc =it1->second.template get_access<cl::sycl::access::mode::read, cl::sycl::access::target::global_buffer>(cgh);
- auto dst_acc =it2->second.template get_access<cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer>(cgh);
- typedef decltype(src_acc) read_accessor;
- typedef decltype(dst_acc) write_accessor;
- cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor<Index, read_accessor, write_accessor>(src_acc, dst_acc, rng, i, offset));
- });
- synchronize();
+ m_queue_stream->memcpy(dst,src,n);
}
- /// The memcpyHostToDevice is used to copy the device only pointer to a host pointer. Using the device
- /// pointer created as a key we find the sycl buffer and get the host accessor with discard_write mode
- /// on it. Using a discard_write accessor guarantees that we do not bring back the current value of the
- /// buffer to host. Then we use the memcpy to copy the data to the host accessor. The first time that
- /// this buffer is accessed, the data will be copied to the device.
+ EIGEN_STRONG_INLINE ptrdiff_t get_offset(const void *ptr) const {
+ return m_queue_stream->get_offset(ptr);
+
+ }
+// memcpyHostToDevice
template<typename Index> EIGEN_STRONG_INLINE void memcpyHostToDevice(Index *dst, const Index *src, size_t n) const {
- auto host_acc= get_sycl_buffer(dst). template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::host_buffer>();
- ::memcpy(host_acc.get_pointer(), src, n);
+ m_queue_stream->memcpyHostToDevice(dst,src,n);
}
- /// The memcpyDeviceToHost is used to copy the data from host to device. Here, in order to avoid double copying the data. We create a sycl
- /// buffer with map_allocator for the destination pointer with a discard_write accessor on it. The lifespan of the buffer is bound to the
- /// lifespan of the memcpyDeviceToHost function. We create a kernel to copy the data, from the device- only source buffer to the destination
- /// buffer with map_allocator on the gpu in parallel. At the end of the function call the destination buffer would be destroyed and the data
- /// would be available on the dst pointer using fast copy technique (map_allocator). In this case we can make sure that we copy the data back
- /// to the cpu only once per function call.
+/// here is the memcpyDeviceToHost
template<typename Index> EIGEN_STRONG_INLINE void memcpyDeviceToHost(void *dst, const Index *src, size_t n) const {
- auto it = m_queue_stream->find_buffer(src);
- auto offset =static_cast<const uint8_t*>(static_cast<const void*>(src))- it->first;
- offset/=sizeof(Index);
- size_t rng, GRange, tileSize;
- parallel_for_setup(n/sizeof(Index), tileSize, rng, GRange);
- // Assuming that the dst is the start of the destination pointer
- auto dest_buf = cl::sycl::buffer<uint8_t, 1, cl::sycl::map_allocator<uint8_t> >(static_cast<uint8_t*>(dst), cl::sycl::range<1>(n));
- sycl_queue().submit([&](cl::sycl::handler &cgh) {
- auto src_acc= it->second.template get_access<cl::sycl::access::mode::read, cl::sycl::access::target::global_buffer>(cgh);
- auto dst_acc =dest_buf.template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer>(cgh);
- typedef decltype(src_acc) read_accessor;
- typedef decltype(dst_acc) write_accessor;
- cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor<Index, read_accessor, write_accessor>(src_acc, dst_acc, rng, 0, offset));
- });
- synchronize();
+ m_queue_stream->memcpyDeviceToHost(dst,src,n);
}
- /// returning the sycl queue
- EIGEN_STRONG_INLINE cl::sycl::queue& sycl_queue() const { return m_queue_stream->m_queue;}
/// Here is the implementation of memset function on sycl.
EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const {
- size_t rng, GRange, tileSize;
- parallel_for_setup(n, tileSize, rng, GRange);
- sycl_queue().submit(memsetCghFunctor(get_sycl_buffer(static_cast<uint8_t*>(static_cast<void*>(data))),rng, GRange, tileSize, c ));
- synchronize();
+ m_queue_stream->memset(data,c,n);
}
-
- struct memsetCghFunctor{
- cl::sycl::buffer<uint8_t, 1>& m_buf;
- const size_t& rng , GRange, tileSize;
- const int &c;
- memsetCghFunctor(cl::sycl::buffer<uint8_t, 1>& buff, const size_t& rng_, const size_t& GRange_, const size_t& tileSize_, const int& c_)
- :m_buf(buff), rng(rng_), GRange(GRange_), tileSize(tileSize_), c(c_){}
-
- void operator()(cl::sycl::handler &cgh) const {
- auto buf_acc = m_buf.template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer>(cgh);
- cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), memsetkernelFunctor(buf_acc, rng, c));
- }
- };
+ /// returning the sycl queue
+ EIGEN_STRONG_INLINE cl::sycl::queue& sycl_queue() const { return m_queue_stream->sycl_queue();}
EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
// FIXME
@@ -367,39 +509,32 @@ struct SyclDevice {
EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
// We won't try to take advantage of the l2 cache for the time being, and
- // there is no l3 cache on cuda devices.
+ // there is no l3 cache on sycl devices.
return firstLevelCacheSize();
}
EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const {
- return sycl_queue().get_device(). template get_info<cl::sycl::info::device::max_compute_units>();
- // return stream_->deviceProperties().multiProcessorCount;
+ return m_queue_stream->getNumSyclMultiProcessors();
}
EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const {
- return sycl_queue().get_device(). template get_info<cl::sycl::info::device::max_work_group_size>();
-
- // return stream_->deviceProperties().maxThreadsPerBlock;
+ return m_queue_stream->maxSyclThreadsPerBlock();
}
EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const {
// OpenCL doesnot have such concept
- return 2;//sycl_queue().get_device(). template get_info<cl::sycl::info::device::max_work_group_size>();
+ return m_queue_stream->maxSyclThreadsPerMultiProcessor();
// return stream_->deviceProperties().maxThreadsPerMultiProcessor;
}
EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const {
- return sycl_queue().get_device(). template get_info<cl::sycl::info::device::local_mem_size>();
- // return stream_->deviceProperties().sharedMemPerBlock;
+ return m_queue_stream->sharedMemPerBlock();
}
/// No need for sycl it should act the same as CPU version
- EIGEN_STRONG_INLINE int majorDeviceVersion() const { return 1; }
+ EIGEN_STRONG_INLINE int majorDeviceVersion() const { return m_queue_stream->majorDeviceVersion(); }
EIGEN_STRONG_INLINE void synchronize() const {
- sycl_queue().wait_and_throw(); //pass
+ m_queue_stream->synchronize(); //pass
}
EIGEN_STRONG_INLINE void asynchronousExec() const {
- ///FIXEDME:: currently there is a race condition regarding the asynch scheduler.
- //sycl_queue().throw_asynchronous();// does not pass. Temporarily disabled
- sycl_queue().wait_and_throw(); //pass
-
+ m_queue_stream->asynchronousExec();
}
// This function checks if the runtime recorded an error for the
// underlying stream device.
@@ -407,8 +542,10 @@ struct SyclDevice {
return m_queue_stream->ok();
}
};
-
-
+// This is used as a distinguishable device inside the kernel, as the sycl device class is not standard layout.
+// This is internal and must not be used by users. This dummy device allows us to specialise the tensor evaluator
+// inside the kernel, so we can have two types of eval, for host and device. This is required for the TensorArgMax operation.
+struct SyclKernelDevice:DefaultDevice{};
} // end namespace Eigen
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
index 16180ca69..ec6802e85 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
@@ -154,7 +154,11 @@ struct ThreadPoolDevice {
template <class Function, class... Args>
EIGEN_STRONG_INLINE void enqueueNoNotification(Function&& f, Args&&... args) const {
- pool_->Schedule(std::bind(f, args...));
+ if (sizeof...(args) > 0) {
+ pool_->Schedule(std::bind(f, args...));
+ } else {
+ pool_->Schedule(f);
+ }
}
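The hunk above skips std::bind when no arguments are passed, so a plain closure is handed to the scheduler without the extra bind wrapper. A minimal sketch of the idea with a hypothetical ThreadPool that runs tasks inline; the sketch uses C++17 `if constexpr` so that both branches stay well-formed for every call, whereas the patch relies on a plain runtime check:

    #include <cstdio>
    #include <functional>

    struct ThreadPool {
      void Schedule(std::function<void()> fn) { fn(); }  // runs the task inline for the sketch
    };

    template <class Function, class... Args>
    void enqueue(ThreadPool& pool, Function&& f, Args&&... args) {
      if constexpr (sizeof...(args) > 0) {
        pool.Schedule(std::bind(f, args...));  // wrap callable and arguments together
      } else {
        pool.Schedule(f);                      // no arguments: pass the callable through directly
      }
    }

    int main() {
      ThreadPool pool;
      enqueue(pool, [](int x) { std::printf("with argument: %d\n", x); }, 7);
      enqueue(pool, [] { std::printf("without arguments\n"); });
    }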
// Returns a logical thread index between 0 and pool_->NumThreads() - 1 if
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
index 82dd1e640..d0c027890 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
@@ -32,6 +32,7 @@ struct traits<TensorEvalToOp<XprType, MakePointer_> >
typedef typename remove_reference<Nested>::type _Nested;
static const int NumDimensions = XprTraits::NumDimensions;
static const int Layout = XprTraits::Layout;
+ typedef typename MakePointer_<Scalar>::Type PointerType;
enum {
Flags = 0
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
index d6415817b..2264be391 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
@@ -131,7 +131,7 @@ T loadConstant(const T* address) {
return *address;
}
// Use the texture cache on CUDA devices whenever possible
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
+#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350
template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float loadConstant(const float* address) {
return __ldg(address);
@@ -193,7 +193,12 @@ struct TensorEvaluator<const Derived, Device>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
eigen_assert(m_data);
+#ifndef __SYCL_DEVICE_ONLY__
return loadConstant(m_data+index);
+#else
+ CoeffReturnType tmp = m_data[index];
+ return tmp;
+#endif
}
template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
@@ -278,7 +283,7 @@ struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device>
internal::unpacket_traits<PacketReturnType>::size);
}
- EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; }
+ EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; }
/// required by sycl in order to extract the accessor
const TensorEvaluator<ArgType, Device>& impl() const { return m_argImpl; }
@@ -348,7 +353,7 @@ struct TensorEvaluator<const TensorCwiseUnaryOp<UnaryOp, ArgType>, Device>
TensorOpCost(0, 0, functor_cost, vectorized, PacketSize);
}
- EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; }
+ EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; }
/// required by sycl in order to extract the accessor
const TensorEvaluator<ArgType, Device> & impl() const { return m_argImpl; }
@@ -428,7 +433,7 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArg
TensorOpCost(0, 0, functor_cost, vectorized, PacketSize);
}
- EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; }
+ EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; }
/// required by sycl in order to extract the accessor
const TensorEvaluator<LeftArgType, Device>& left_impl() const { return m_leftImpl; }
/// required by sycl in order to extract the accessor
@@ -528,7 +533,7 @@ struct TensorEvaluator<const TensorCwiseTernaryOp<TernaryOp, Arg1Type, Arg2Type,
TensorOpCost(0, 0, functor_cost, vectorized, PacketSize);
}
- EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; }
+ EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; }
/// required by sycl in order to extract the accessor
const TensorEvaluator<Arg1Type, Device> & arg1Impl() const { return m_arg1Impl; }
@@ -620,7 +625,7 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>
.cwiseMax(m_elseImpl.costPerCoeff(vectorized));
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const { return NULL; }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; }
/// required by sycl in order to extract the accessor
const TensorEvaluator<IfArgType, Device> & cond_impl() const { return m_condImpl; }
/// required by sycl in order to extract the accessor
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index f01d77c0a..0ffe68ab3 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -201,7 +201,7 @@ class TensorExecutor<Expression, GpuDevice, Vectorizable> {
};
-#if defined(__CUDACC__)
+#if defined(EIGEN_CUDACC)
template <typename Evaluator, typename Index, bool Vectorizable>
struct EigenMetaKernelEval {
static __device__ EIGEN_ALWAYS_INLINE
@@ -264,7 +264,7 @@ inline void TensorExecutor<Expression, GpuDevice, Vectorizable>::run(
evaluator.cleanup();
}
-#endif // __CUDACC__
+#endif // EIGEN_CUDACC
#endif // EIGEN_USE_GPU
// SYCL Executor policy
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h
index 85dfc7a69..4b6540c07 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h
@@ -38,7 +38,7 @@ struct traits<TensorCwiseNullaryOp<NullaryOp, XprType> >
typedef typename remove_reference<XprTypeNested>::type _XprTypeNested;
static const int NumDimensions = XprTraits::NumDimensions;
static const int Layout = XprTraits::Layout;
-
+ typedef typename XprTraits::PointerType PointerType;
enum {
Flags = 0
};
@@ -89,6 +89,7 @@ struct traits<TensorCwiseUnaryOp<UnaryOp, XprType> >
typedef typename remove_reference<XprTypeNested>::type _XprTypeNested;
static const int NumDimensions = XprTraits::NumDimensions;
static const int Layout = XprTraits::Layout;
+ typedef typename TypeConversion<Scalar, typename XprTraits::PointerType>::type PointerType;
};
template<typename UnaryOp, typename XprType>
@@ -161,7 +162,11 @@ struct traits<TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType> >
typedef typename remove_reference<RhsNested>::type _RhsNested;
static const int NumDimensions = XprTraits::NumDimensions;
static const int Layout = XprTraits::Layout;
-
+ typedef typename TypeConversion<Scalar,
+ typename conditional<Pointer_type_promotion<typename LhsXprType::Scalar, Scalar>::val,
+ typename traits<LhsXprType>::PointerType,
+ typename traits<RhsXprType>::PointerType>::type
+ >::type PointerType;
enum {
Flags = 0
};
@@ -238,7 +243,11 @@ struct traits<TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprT
typedef typename remove_reference<Arg3Nested>::type _Arg3Nested;
static const int NumDimensions = XprTraits::NumDimensions;
static const int Layout = XprTraits::Layout;
-
+ typedef typename TypeConversion<Scalar,
+ typename conditional<Pointer_type_promotion<typename Arg2XprType::Scalar, Scalar>::val,
+ typename traits<Arg2XprType>::PointerType,
+ typename traits<Arg3XprType>::PointerType>::type
+ >::type PointerType;
enum {
Flags = 0
};
@@ -314,6 +323,9 @@ struct traits<TensorSelectOp<IfXprType, ThenXprType, ElseXprType> >
typedef typename ElseXprType::Nested ElseNested;
static const int NumDimensions = XprTraits::NumDimensions;
static const int Layout = XprTraits::Layout;
+ typedef typename conditional<Pointer_type_promotion<typename ThenXprType::Scalar, Scalar>::val,
+ typename traits<ThenXprType>::PointerType,
+ typename traits<ElseXprType>::PointerType>::type PointerType;
};
template<typename IfXprType, typename ThenXprType, typename ElseXprType>
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h
index f060191ab..10e0a8a6b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h
@@ -71,6 +71,7 @@ struct traits<TensorFFTOp<FFT, XprType, FFTResultType, FFTDir> > : public traits
typedef typename remove_reference<Nested>::type _Nested;
static const int NumDimensions = XprTraits::NumDimensions;
static const int Layout = XprTraits::Layout;
+ typedef typename traits<XprType>::PointerType PointerType;
};
template <typename FFT, typename XprType, int FFTResultType, int FFTDirection>
@@ -234,7 +235,7 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D
if (line_len > 1) {
const RealScalar pi_over_len(EIGEN_PI / line_len);
const ComplexScalar pos_j_base = ComplexScalar(
- std::cos(pi_over_len), std::sin(pi_over_len));
+ std::cos(pi_over_len), std::sin(pi_over_len));
pos_j_base_powered[1] = pos_j_base;
if (line_len > 2) {
const ComplexScalar pos_j_base_sq = pos_j_base * pos_j_base;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h
index fcee5f60d..e943757ad 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h
@@ -20,7 +20,7 @@ namespace Eigen {
* The fixed sized equivalent of
* Eigen::Tensor<float, 3> t(3, 5, 7);
* is
- * Eigen::TensorFixedSize<float, Size<3,5,7>> t;
+ * Eigen::TensorFixedSize<float, Sizes<3,5,7>> t;
*/
template<typename Scalar_, typename Dimensions_, int Options_, typename IndexType>
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
index abe85c860..c015ce196 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
@@ -38,6 +38,7 @@ struct traits<TensorForcedEvalOp<XprType> >
typedef typename remove_reference<Nested>::type _Nested;
static const int NumDimensions = XprTraits::NumDimensions;
static const int Layout = XprTraits::Layout;
+ typedef typename XprTraits::PointerType PointerType;
enum {
Flags = 0
@@ -143,7 +144,8 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device>
return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const { return m_buffer; }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ typename Eigen::internal::traits<XprType>::PointerType data() const { return m_buffer; }
/// required by sycl in order to extract the sycl accessor
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() { return m_impl; }
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h
index 2e638992a..354bbe8d1 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h
@@ -22,6 +22,22 @@ template<typename T> struct MakePointer {
typedef T* Type;
typedef T& RefType;
};
+
+namespace internal{
+template<typename A, typename B> struct Pointer_type_promotion {
+ static const bool val=false;
+};
+template<typename A> struct Pointer_type_promotion<A, A> {
+ static const bool val = true;
+};
+template<typename A, typename B> struct TypeConversion;
+#ifndef __SYCL_DEVICE_ONLY__
+template<typename A, typename B> struct TypeConversion{
+ typedef A* type;
+};
+#endif
+}
+
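The Pointer_type_promotion and TypeConversion helpers introduced above drive the new PointerType typedefs added to the expression traits later in this patch: when the left operand's scalar matches the expression's result scalar, the left-hand side's pointer type is propagated, otherwise the right-hand side's. A standalone sketch of that selection (illustrative names, not Eigen's):

    #include <type_traits>

    template <typename A, typename B> struct SameScalar { static const bool val = false; };
    template <typename A>             struct SameScalar<A, A> { static const bool val = true; };

    template <typename ResultScalar, typename LhsScalar, typename LhsPtr, typename RhsPtr>
    struct BinaryPointerType {
      typedef typename std::conditional<SameScalar<LhsScalar, ResultScalar>::val,
                                        LhsPtr, RhsPtr>::type type;
    };

    static_assert(SameScalar<float, float>::val, "identical scalar types match");
    static_assert(!SameScalar<float, double>::val, "different scalar types do not");
    static_assert(std::is_same<BinaryPointerType<float, float, float*, double*>::type, float*>::value,
                  "lhs pointer type wins when its scalar matches the result scalar");
    static_assert(std::is_same<BinaryPointerType<float, int, int*, float*>::type, float*>::value,
                  "otherwise the rhs pointer type is used");

    int main() { return 0; }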
#if defined(EIGEN_USE_SYCL)
namespace TensorSycl {
namespace internal{
@@ -70,6 +86,7 @@ template<typename Strides, typename XprType> class TensorInflationOp;
template<typename Generator, typename XprType> class TensorGeneratorOp;
template<typename LeftXprType, typename RightXprType> class TensorAssignOp;
template<typename Op, typename XprType> class TensorScanOp;
+template<typename Dims, typename XprType> class TensorTraceOp;
template<typename CustomUnaryFunc, typename XprType> class TensorCustomUnaryOp;
template<typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType> class TensorCustomBinaryOp;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
index 3b4f8eda1..5dcc3794c 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
@@ -166,7 +166,8 @@ template <typename T> struct MeanReducer
return pset1<Packet>(initialize());
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const {
- return accum / scalarCount_;
+ internal::scalar_quotient_op<T> quotient_op;
+ return quotient_op(accum, T(scalarCount_));
}
template <typename Packet>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const {
@@ -175,7 +176,10 @@ template <typename T> struct MeanReducer
template <typename Packet>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const {
internal::scalar_sum_op<T> sum_op;
- return sum_op(saccum, predux(vaccum)) / (scalarCount_ + packetCount_ * unpacket_traits<Packet>::size);
+ internal::scalar_quotient_op<T> quotient_op;
+ return quotient_op(
+ sum_op(saccum, predux(vaccum)),
+ T(scalarCount_ + packetCount_ * unpacket_traits<Packet>::size));
}
protected:
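The two MeanReducer hunks above route the final division through internal::scalar_quotient_op instead of the raw operator/, presumably so scalar types whose arithmetic is supplied through Eigen's functor specialisations keep working. A standalone sketch of that indirection, with plain functors rather than Eigen's:

    #include <cstdio>

    // Division goes through a functor object, so a specialisation can redefine it per scalar type.
    template <typename T> struct quotient_op {
      T operator()(const T& a, const T& b) const { return a / b; }
    };

    template <typename T>
    T finalize_mean(const T& accum, long count) {
      quotient_op<T> div;
      return div(accum, T(count));  // same shape as MeanReducer::finalize()
    }

    int main() { std::printf("%g\n", finalize_mean(10.0, 4L)); }  // prints 2.5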
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h
index eb1d4934e..fa269b8c6 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h
@@ -31,6 +31,7 @@ struct traits<TensorGeneratorOp<Generator, XprType> > : public traits<XprType>
typedef typename remove_reference<Nested>::type _Nested;
static const int NumDimensions = XprTraits::NumDimensions;
static const int Layout = XprTraits::Layout;
+ typedef typename XprTraits::PointerType PointerType;
};
template<typename Generator, typename XprType>
@@ -98,9 +99,12 @@ struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_generator(op.generator())
+#ifdef EIGEN_USE_SYCL
+ , m_argImpl(op.expression(), device)
+#endif
{
- TensorEvaluator<ArgType, Device> impl(op.expression(), device);
- m_dimensions = impl.dimensions();
+ TensorEvaluator<ArgType, Device> argImpl(op.expression(), device);
+ m_dimensions = argImpl.dimensions();
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
m_strides[0] = 1;
@@ -153,7 +157,12 @@ struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device>
TensorOpCost::MulCost<Scalar>());
}
- EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+ EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; }
+
+#ifdef EIGEN_USE_SYCL
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const { return m_argImpl; }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Generator& functor() const { return m_generator; }
+#endif
protected:
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
@@ -178,6 +187,9 @@ struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device>
Dimensions m_dimensions;
array<Index, NumDims> m_strides;
Generator m_generator;
+#ifdef EIGEN_USE_SYCL
+ TensorEvaluator<ArgType, Device> m_argImpl;
+#endif
};
} // end namespace Eigen
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h
index 566856ed2..3c6a2e091 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h
@@ -27,6 +27,7 @@ namespace Eigen {
* patch_cols, and 1 for all the additional dimensions.
*/
namespace internal {
+
template<DenseIndex Rows, DenseIndex Cols, typename XprType>
struct traits<TensorImagePatchOp<Rows, Cols, XprType> > : public traits<XprType>
{
@@ -38,6 +39,7 @@ struct traits<TensorImagePatchOp<Rows, Cols, XprType> > : public traits<XprType>
typedef typename remove_reference<Nested>::type _Nested;
static const int NumDimensions = XprTraits::NumDimensions + 1;
static const int Layout = XprTraits::Layout;
+ typedef typename XprTraits::PointerType PointerType;
};
template<DenseIndex Rows, DenseIndex Cols, typename XprType>
@@ -70,12 +72,12 @@ class TensorImagePatchOp : public TensorBase<TensorImagePatchOp<Rows, Cols, XprT
DenseIndex in_row_strides, DenseIndex in_col_strides,
DenseIndex row_inflate_strides, DenseIndex col_inflate_strides,
PaddingType padding_type, Scalar padding_value)
- : m_xpr(expr), m_patch_rows(patch_rows), m_patch_cols(patch_cols),
- m_row_strides(row_strides), m_col_strides(col_strides),
- m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides),
- m_row_inflate_strides(row_inflate_strides), m_col_inflate_strides(col_inflate_strides),
- m_padding_explicit(false), m_padding_top(0), m_padding_bottom(0), m_padding_left(0), m_padding_right(0),
- m_padding_type(padding_type), m_padding_value(padding_value) {}
+ : m_xpr(expr), m_patch_rows(patch_rows), m_patch_cols(patch_cols),
+ m_row_strides(row_strides), m_col_strides(col_strides),
+ m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides),
+ m_row_inflate_strides(row_inflate_strides), m_col_inflate_strides(col_inflate_strides),
+ m_padding_explicit(false), m_padding_top(0), m_padding_bottom(0), m_padding_left(0), m_padding_right(0),
+ m_padding_type(padding_type), m_padding_value(padding_value) {}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorImagePatchOp(const XprType& expr, DenseIndex patch_rows, DenseIndex patch_cols,
DenseIndex row_strides, DenseIndex col_strides,
@@ -84,13 +86,31 @@ class TensorImagePatchOp : public TensorBase<TensorImagePatchOp<Rows, Cols, XprT
DenseIndex padding_top, DenseIndex padding_bottom,
DenseIndex padding_left, DenseIndex padding_right,
Scalar padding_value)
- : m_xpr(expr), m_patch_rows(patch_rows), m_patch_cols(patch_cols),
- m_row_strides(row_strides), m_col_strides(col_strides),
- m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides),
- m_row_inflate_strides(row_inflate_strides), m_col_inflate_strides(col_inflate_strides),
- m_padding_explicit(true), m_padding_top(padding_top), m_padding_bottom(padding_bottom),
- m_padding_left(padding_left), m_padding_right(padding_right),
- m_padding_type(PADDING_VALID), m_padding_value(padding_value) {}
+ : m_xpr(expr), m_patch_rows(patch_rows), m_patch_cols(patch_cols),
+ m_row_strides(row_strides), m_col_strides(col_strides),
+ m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides),
+ m_row_inflate_strides(row_inflate_strides), m_col_inflate_strides(col_inflate_strides),
+ m_padding_explicit(true), m_padding_top(padding_top), m_padding_bottom(padding_bottom),
+ m_padding_left(padding_left), m_padding_right(padding_right),
+ m_padding_type(PADDING_VALID), m_padding_value(padding_value) {}
+
+#ifdef EIGEN_USE_SYCL // this is a workaround for SYCL, as Eigen cannot use the C++11 delegating constructor feature
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorImagePatchOp(const XprType& expr, DenseIndex patch_rows, DenseIndex patch_cols,
+ DenseIndex row_strides, DenseIndex col_strides,
+ DenseIndex in_row_strides, DenseIndex in_col_strides,
+ DenseIndex row_inflate_strides, DenseIndex col_inflate_strides,
+ bool padding_explicit, DenseIndex padding_top, DenseIndex padding_bottom,
+ DenseIndex padding_left, DenseIndex padding_right, PaddingType padding_type,
+ Scalar padding_value)
+ : m_xpr(expr), m_patch_rows(patch_rows), m_patch_cols(patch_cols),
+ m_row_strides(row_strides), m_col_strides(col_strides),
+ m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides),
+ m_row_inflate_strides(row_inflate_strides), m_col_inflate_strides(col_inflate_strides),
+ m_padding_explicit(padding_explicit), m_padding_top(padding_top), m_padding_bottom(padding_bottom),
+ m_padding_left(padding_left), m_padding_right(padding_right),
+ m_padding_type(padding_type), m_padding_value(padding_value) {}
+
+#endif
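As noted in the #ifdef above, the SYCL path cannot rely on C++11 delegating constructors, so a single constructor taking every parameter explicitly is added. A small sketch contrasting the delegating form with the fully spelled-out one the workaround uses (hypothetical Patch type, not the Eigen class):

    #include <cstdio>

    struct Patch {
      int rows, cols, pad_top;
      // Usual C++11 style: delegate to the full constructor.
      Patch(int r, int c) : Patch(r, c, /*pad*/ 0) {}
      // Workaround style: one constructor that initialises every member itself.
      Patch(int r, int c, int pad) : rows(r), cols(c), pad_top(pad) {}
    };

    int main() {
      Patch a(3, 5);     // delegating constructor
      Patch b(3, 5, 1);  // full constructor, usable even where delegation is unavailable
      std::printf("%d %d\n", a.pad_top, b.pad_top);  // 0 1
    }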
EIGEN_DEVICE_FUNC
DenseIndex patch_rows() const { return m_patch_rows; }
@@ -171,8 +191,15 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
RawAccess = false
};
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+ #ifdef __SYCL_DEVICE_ONLY__
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator( const XprType op, const Device& device)
+ #else
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator( const XprType& op, const Device& device)
+ #endif
: m_impl(op.expression(), device)
+#ifdef EIGEN_USE_SYCL
+ , m_op(op)
+#endif
{
EIGEN_STATIC_ASSERT((NumDims >= 4), YOU_MADE_A_PROGRAMMING_MISTAKE);
@@ -241,6 +268,8 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
break;
default:
eigen_assert(false && "unexpected padding");
+        m_outputCols=0; // silence the uninitialised warning
+        m_outputRows=0; // silence the uninitialised warning
}
}
eigen_assert(m_outputRows > 0);
@@ -418,9 +447,14 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
return packetWithPossibleZero(index);
}
- EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+ EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
- const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
+#ifdef EIGEN_USE_SYCL
+ // Required by SYCL in order to construct the expression tree on the device
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const XprType& xpr() const { return m_op; }
+#endif
Index rowPaddingTop() const { return m_rowPaddingTop; }
Index colPaddingLeft() const { return m_colPaddingLeft; }
@@ -501,6 +535,10 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
Scalar m_paddingValue;
TensorEvaluator<ArgType, Device> m_impl;
+ #ifdef EIGEN_USE_SYCL
+ // Required for SYCL in order to construct the expression tree on the device
+ XprType m_op;
+ #endif
};
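
The SYCL-only constructor above repeats the full member-initialiser list because, as its comment notes, delegating constructors could not be used with the targeted SYCL toolchain. A minimal standalone sketch of the two forms (illustrative class, not Eigen's):

struct PatchParamsSketch {
  int rows, cols;
  bool padding_explicit;

  // Full constructor: every member initialised explicitly.
  PatchParamsSketch(int r, int c, bool explicit_pad)
      : rows(r), cols(c), padding_explicit(explicit_pad) {}

#ifndef EIGEN_USE_SYCL
  // What one would normally write: delegate to the full constructor.
  PatchParamsSketch(int r, int c) : PatchParamsSketch(r, c, false) {}
#else
  // SYCL-friendly form: repeat the initialiser list instead of delegating.
  PatchParamsSketch(int r, int c) : rows(r), cols(c), padding_explicit(false) {}
#endif
};
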
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h
index f391fb9ee..6147fbdf1 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h
@@ -31,6 +31,7 @@ struct traits<TensorInflationOp<Strides, XprType> > : public traits<XprType>
typedef typename remove_reference<Nested>::type _Nested;
static const int NumDimensions = XprTraits::NumDimensions;
static const int Layout = XprTraits::Layout;
+ typedef typename XprTraits::PointerType PointerType;
};
template<typename Strides, typename XprType>
@@ -213,7 +214,12 @@ struct TensorEvaluator<const TensorInflationOp<Strides, ArgType>, Device>
compute_cost, vectorized, PacketSize);
}
- EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+ EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; }
+
+#ifdef EIGEN_USE_SYCL
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Strides& functor() const { return m_strides; }
+#endif
protected:
Dimensions m_dimensions;
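
Many traits specialisations in this patch gain a PointerType typedef, and the corresponding evaluators report it from data() instead of hard-coding Scalar*. A self-contained sketch of that plumbing (simplified names, not Eigen's; a device back-end would specialise the traits to hand back a different pointer-like type):

#include <cstddef>

template <typename Xpr> struct traits_sketch;        // per-expression traits

struct PlainExpr { typedef float Scalar; };
template <> struct traits_sketch<PlainExpr> {
  typedef PlainExpr::Scalar* PointerType;            // host: an ordinary pointer
};

template <typename Xpr>
struct evaluator_sketch {
  // data() reports whatever pointer type the expression's traits declare.
  typename traits_sketch<Xpr>::PointerType data() const { return NULL; }
};
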
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
index ef1c9c42c..fb6454623 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
@@ -35,7 +35,7 @@ namespace {
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
typename internal::enable_if<sizeof(T)==4,int>::type count_leading_zeros(const T val)
{
-#ifdef __CUDA_ARCH__
+#ifdef EIGEN_CUDA_ARCH
return __clz(val);
#elif defined(__SYCL_DEVICE_ONLY__)
return cl::sycl::clz(val);
@@ -53,7 +53,7 @@ namespace {
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
typename internal::enable_if<sizeof(T)==8,int>::type count_leading_zeros(const T val)
{
-#ifdef __CUDA_ARCH__
+#ifdef EIGEN_CUDA_ARCH
return __clzll(val);
#elif defined(__SYCL_DEVICE_ONLY__)
return cl::sycl::clz(val);
@@ -90,7 +90,7 @@ namespace {
template <typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint32_t muluh(const uint32_t a, const T b) {
-#if defined(__CUDA_ARCH__)
+#if defined(EIGEN_CUDA_ARCH)
return __umulhi(a, b);
#elif defined(__SYCL_DEVICE_ONLY__)
return cl::sycl::mul_hi(a, static_cast<uint32_t>(b));
@@ -101,7 +101,7 @@ namespace {
template <typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t muluh(const uint64_t a, const T b) {
-#if defined(__CUDA_ARCH__)
+#if defined(EIGEN_CUDA_ARCH)
return __umul64hi(a, b);
#elif defined(__SYCL_DEVICE_ONLY__)
return cl::sycl::mul_hi(a, static_cast<uint64_t>(b));
@@ -124,7 +124,7 @@ namespace {
template <typename T>
struct DividerHelper<64, T> {
static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t computeMultiplier(const int log_div, const T divider) {
-#if defined(__SIZEOF_INT128__) && !defined(__CUDA_ARCH__) && !defined(__SYCL_DEVICE_ONLY__)
+#if defined(__SIZEOF_INT128__) && !defined(EIGEN_CUDA_ARCH) && !defined(__SYCL_DEVICE_ONLY__)
return static_cast<uint64_t>((static_cast<__uint128_t>(1) << (64+log_div)) / static_cast<__uint128_t>(divider) - (static_cast<__uint128_t>(1) << 64) + 1);
#else
const uint64_t shift = 1ULL << log_div;
@@ -203,7 +203,7 @@ class TensorIntDivisor<int32_t, true> {
}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int divide(const int32_t n) const {
-#ifdef __CUDA_ARCH__
+#ifdef EIGEN_CUDA_ARCH
return (__umulhi(magic, n) >> shift);
#elif defined(__SYCL_DEVICE_ONLY__)
return (cl::sycl::mul_hi(static_cast<uint64_t>(magic), static_cast<uint64_t>(n)) >> shift);
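
TensorIntDivisor, which these device-macro changes touch, replaces an integer division by a run-time-constant divisor with a precomputed multiply-high and shift. A minimal sketch of the idea (illustrative constants and API, not Eigen's exact scheme; uses GCC/Clang's __uint128_t):

#include <cassert>
#include <cstdint>

struct FastDivider {
  uint64_t magic;   // ceil(2^(32 + log_div) / d)
  int log_div;      // smallest l with 2^l >= d

  explicit FastDivider(uint32_t d) {
    assert(d >= 1);
    log_div = 0;
    while ((uint64_t(1) << log_div) < d) ++log_div;
    magic = static_cast<uint64_t>(((__uint128_t(1) << (32 + log_div)) + d - 1) / d);
  }

  // floor(n / d) without a hardware divide: one multiply-high plus a shift.
  uint32_t divide(uint32_t n) const {
    return static_cast<uint32_t>((__uint128_t(magic) * n) >> (32 + log_div));
  }
};

// Usage: FastDivider div(7); div.divide(100) == 14.
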
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h
index cd0109ef4..4e384f9b9 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h
@@ -46,6 +46,7 @@ struct traits<TensorLayoutSwapOp<XprType> > : public traits<XprType>
typedef typename remove_reference<Nested>::type _Nested;
static const int NumDimensions = traits<XprType>::NumDimensions;
static const int Layout = (traits<XprType>::Layout == ColMajor) ? RowMajor : ColMajor;
+ typedef typename XprTraits::PointerType PointerType;
};
template<typename XprType>
@@ -159,7 +160,7 @@ struct TensorEvaluator<const TensorLayoutSwapOp<ArgType>, Device>
return m_impl.costPerCoeff(vectorized);
}
- EIGEN_DEVICE_FUNC Scalar* data() const { return m_impl.data(); }
+ EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return m_impl.data(); }
const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h
index f92e39d69..c9e61f359 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h
@@ -27,7 +27,7 @@
*/
// SFINAE requires variadic templates
-#ifndef __CUDACC__
+#ifndef EIGEN_CUDACC
#if EIGEN_HAS_VARIADIC_TEMPLATES
// SFINAE doesn't work for gcc <= 4.7
#ifdef EIGEN_COMP_GNUC
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
index b5ef31d55..5431eb740 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
@@ -52,7 +52,7 @@ struct PacketType : internal::packet_traits<Scalar> {
};
// For CUDA packet types when using a GpuDevice
-#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) && defined(EIGEN_HAS_CUDA_FP16)
+#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC) && defined(EIGEN_HAS_CUDA_FP16)
template <>
struct PacketType<half, GpuDevice> {
typedef half2 type;
@@ -124,7 +124,9 @@ template <typename U, typename V> struct Tuple {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
Tuple& operator= (const Tuple& rhs) {
+ #ifndef __SYCL_DEVICE_ONLY__
if (&rhs == this) return *this;
+ #endif
first = rhs.first;
second = rhs.second;
return *this;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
index 6ddd2ca18..329655817 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
@@ -31,6 +31,7 @@ struct traits<TensorReshapingOp<NewDimensions, XprType> > : public traits<XprTyp
typedef typename remove_reference<Nested>::type _Nested;
static const int NumDimensions = array_size<NewDimensions>::value;
static const int Layout = XprTraits::Layout;
+ typedef typename XprTraits::PointerType PointerType;
};
template<typename NewDimensions, typename XprType>
@@ -146,7 +147,7 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>
return m_impl.costPerCoeff(vectorized);
}
- EIGEN_DEVICE_FUNC Scalar* data() const { return const_cast<Scalar*>(m_impl.data()); }
+ EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return const_cast<Scalar*>(m_impl.data()); }
EIGEN_DEVICE_FUNC const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
@@ -214,6 +215,7 @@ struct traits<TensorSlicingOp<StartIndices, Sizes, XprType> > : public traits<Xp
typedef typename remove_reference<Nested>::type _Nested;
static const int NumDimensions = array_size<StartIndices>::value;
static const int Layout = XprTraits::Layout;
+ typedef typename XprTraits::PointerType PointerType;
};
template<typename StartIndices, typename Sizes, typename XprType>
@@ -468,7 +470,7 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Eigen::internal::traits<XprType>::PointerType data() const {
Scalar* result = m_impl.data();
if (result) {
Index offset = 0;
@@ -633,6 +635,7 @@ struct traits<TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprTyp
typedef typename remove_reference<Nested>::type _Nested;
static const int NumDimensions = array_size<StartIndices>::value;
static const int Layout = XprTraits::Layout;
+ typedef typename XprTraits::PointerType PointerType;
};
template<typename StartIndices, typename StopIndices, typename Strides, typename XprType>
@@ -823,7 +826,7 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices,
return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, NumDims);
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Eigen::internal::traits<XprType>::PointerType data() const {
return NULL;
}
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
index a8e255246..5956e513d 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
@@ -31,6 +31,7 @@ struct traits<TensorPaddingOp<PaddingDimensions, XprType> > : public traits<XprT
typedef typename remove_reference<Nested>::type _Nested;
static const int NumDimensions = XprTraits::NumDimensions;
static const int Layout = XprTraits::Layout;
+ typedef typename XprTraits::PointerType PointerType;
};
template<typename PaddingDimensions, typename XprType>
@@ -198,7 +199,7 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
return cost;
}
- EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; }
/// used by sycl
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const PaddingDimensions& padding() const { return m_padding; }
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h
index 886a254f6..9e0a20abf 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h
@@ -31,6 +31,7 @@ struct traits<TensorPatchOp<PatchDim, XprType> > : public traits<XprType>
typedef typename remove_reference<Nested>::type _Nested;
static const int NumDimensions = XprTraits::NumDimensions + 1;
static const int Layout = XprTraits::Layout;
+ typedef typename XprTraits::PointerType PointerType;
};
template<typename PatchDim, typename XprType>
@@ -100,6 +101,9 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_impl(op.expression(), device)
+#ifdef EIGEN_USE_SYCL
+ , m_patch_dims(op.patch_dims())
+#endif
{
Index num_patches = 1;
const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
@@ -253,7 +257,12 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>
TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
}
- EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+ EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; }
+
+#ifdef EIGEN_USE_SYCL
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const PatchDim& functor() const { return m_patch_dims; }
+#endif
protected:
Dimensions m_dimensions;
@@ -262,6 +271,10 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>
array<Index, NumDims-1> m_patchStrides;
TensorEvaluator<ArgType, Device> m_impl;
+
+#ifdef EIGEN_USE_SYCL
+ const PatchDim m_patch_dims;
+#endif
};
} // end namespace Eigen
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h b/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h
index 1655a813e..230915db2 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h
@@ -16,7 +16,7 @@ namespace internal {
namespace {
EIGEN_DEVICE_FUNC uint64_t get_random_seed() {
-#ifdef __CUDA_ARCH__
+#ifdef EIGEN_CUDA_ARCH
// We don't support 3d kernels since we currently only use 1 and
// 2d kernels.
assert(threadIdx.z == 0);
@@ -55,11 +55,11 @@ EIGEN_DEVICE_FUNC uint64_t get_random_seed() {
#endif
}
-static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE unsigned PCG_XSH_RS_generator(uint64_t* state) {
+static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE unsigned PCG_XSH_RS_generator(uint64_t* state, uint64_t stream) {
// TODO: Unify with the implementation in the non blocking thread pool.
uint64_t current = *state;
// Update the internal state
- *state = current * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL;
+ *state = current * 6364136223846793005ULL + (stream << 1 | 1);
// Generate the random output (using the PCG-XSH-RS scheme)
return static_cast<unsigned>((current ^ (current >> 22)) >> (22 + (current >> 61)));
}
@@ -73,17 +73,17 @@ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE uint64_t PCG_XSH_RS_state(uint64_t
template <typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-T RandomToTypeUniform(uint64_t* state) {
- unsigned rnd = PCG_XSH_RS_generator(state);
+T RandomToTypeUniform(uint64_t* state, uint64_t stream) {
+ unsigned rnd = PCG_XSH_RS_generator(state, stream);
return static_cast<T>(rnd);
}
template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-Eigen::half RandomToTypeUniform<Eigen::half>(uint64_t* state) {
+Eigen::half RandomToTypeUniform<Eigen::half>(uint64_t* state, uint64_t stream) {
Eigen::half result;
// Generate 10 random bits for the mantissa
- unsigned rnd = PCG_XSH_RS_generator(state);
+ unsigned rnd = PCG_XSH_RS_generator(state, stream);
result.x = static_cast<uint16_t>(rnd & 0x3ffu);
// Set the exponent
result.x |= (static_cast<uint16_t>(15) << 10);
@@ -93,14 +93,14 @@ Eigen::half RandomToTypeUniform<Eigen::half>(uint64_t* state) {
template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-float RandomToTypeUniform<float>(uint64_t* state) {
+float RandomToTypeUniform<float>(uint64_t* state, uint64_t stream) {
typedef union {
uint32_t raw;
float fp;
} internal;
internal result;
  // Generate 23 random bits for the mantissa
- const unsigned rnd = PCG_XSH_RS_generator(state);
+ const unsigned rnd = PCG_XSH_RS_generator(state, stream);
result.raw = rnd & 0x7fffffu;
// Set the exponent
result.raw |= (static_cast<uint32_t>(127) << 23);
@@ -109,7 +109,7 @@ float RandomToTypeUniform<float>(uint64_t* state) {
}
template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-double RandomToTypeUniform<double>(uint64_t* state) {
+double RandomToTypeUniform<double>(uint64_t* state, uint64_t stream) {
typedef union {
uint64_t raw;
double dp;
@@ -118,9 +118,9 @@ double RandomToTypeUniform<double>(uint64_t* state) {
result.raw = 0;
// Generate 52 random bits for the mantissa
// First generate the upper 20 bits
- unsigned rnd1 = PCG_XSH_RS_generator(state) & 0xfffffu;
+ unsigned rnd1 = PCG_XSH_RS_generator(state, stream) & 0xfffffu;
  // Then generate the lower 32 bits
- unsigned rnd2 = PCG_XSH_RS_generator(state);
+ unsigned rnd2 = PCG_XSH_RS_generator(state, stream);
result.raw = (static_cast<uint64_t>(rnd1) << 32) | rnd2;
// Set the exponent
result.raw |= (static_cast<uint64_t>(1023) << 52);
@@ -129,14 +129,14 @@ double RandomToTypeUniform<double>(uint64_t* state) {
}
template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-std::complex<float> RandomToTypeUniform<std::complex<float> >(uint64_t* state) {
- return std::complex<float>(RandomToTypeUniform<float>(state),
- RandomToTypeUniform<float>(state));
+std::complex<float> RandomToTypeUniform<std::complex<float> >(uint64_t* state, uint64_t stream) {
+ return std::complex<float>(RandomToTypeUniform<float>(state, stream),
+ RandomToTypeUniform<float>(state, stream));
}
template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-std::complex<double> RandomToTypeUniform<std::complex<double> >(uint64_t* state) {
- return std::complex<double>(RandomToTypeUniform<double>(state),
- RandomToTypeUniform<double>(state));
+std::complex<double> RandomToTypeUniform<std::complex<double> >(uint64_t* state, uint64_t stream) {
+ return std::complex<double>(RandomToTypeUniform<double>(state, stream),
+ RandomToTypeUniform<double>(state, stream));
}
template <typename T> class UniformRandomGenerator {
@@ -155,9 +155,7 @@ template <typename T> class UniformRandomGenerator {
template<typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
T operator()(Index i) const {
- uint64_t local_state = m_state + i;
- T result = RandomToTypeUniform<T>(&local_state);
- m_state = local_state;
+ T result = RandomToTypeUniform<T>(&m_state, i);
return result;
}
@@ -165,11 +163,9 @@ template <typename T> class UniformRandomGenerator {
Packet packetOp(Index i) const {
const int packetSize = internal::unpacket_traits<Packet>::size;
EIGEN_ALIGN_MAX T values[packetSize];
- uint64_t local_state = m_state + i;
for (int j = 0; j < packetSize; ++j) {
- values[j] = RandomToTypeUniform<T>(&local_state);
+ values[j] = RandomToTypeUniform<T>(&m_state, i);
}
- m_state = local_state;
return internal::pload<Packet>(values);
}
@@ -190,14 +186,14 @@ struct functor_traits<UniformRandomGenerator<Scalar> > {
template <typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-T RandomToTypeNormal(uint64_t* state) {
+T RandomToTypeNormal(uint64_t* state, uint64_t stream) {
// Use the ratio of uniform method to generate numbers following a normal
// distribution. See for example Numerical Recipes chapter 7.3.9 for the
// details.
T u, v, q;
do {
- u = RandomToTypeUniform<T>(state);
- v = T(1.7156) * (RandomToTypeUniform<T>(state) - T(0.5));
+ u = RandomToTypeUniform<T>(state, stream);
+ v = T(1.7156) * (RandomToTypeUniform<T>(state, stream) - T(0.5));
const T x = u - T(0.449871);
const T y = numext::abs(v) + T(0.386595);
q = x*x + y * (T(0.196)*y - T(0.25472)*x);
@@ -208,14 +204,14 @@ T RandomToTypeNormal(uint64_t* state) {
}
template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-std::complex<float> RandomToTypeNormal<std::complex<float> >(uint64_t* state) {
- return std::complex<float>(RandomToTypeNormal<float>(state),
- RandomToTypeNormal<float>(state));
+std::complex<float> RandomToTypeNormal<std::complex<float> >(uint64_t* state, uint64_t stream) {
+ return std::complex<float>(RandomToTypeNormal<float>(state, stream),
+ RandomToTypeNormal<float>(state, stream));
}
template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-std::complex<double> RandomToTypeNormal<std::complex<double> >(uint64_t* state) {
- return std::complex<double>(RandomToTypeNormal<double>(state),
- RandomToTypeNormal<double>(state));
+std::complex<double> RandomToTypeNormal<std::complex<double> >(uint64_t* state, uint64_t stream) {
+ return std::complex<double>(RandomToTypeNormal<double>(state, stream),
+ RandomToTypeNormal<double>(state, stream));
}
@@ -234,9 +230,7 @@ template <typename T> class NormalRandomGenerator {
template<typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
T operator()(Index i) const {
- uint64_t local_state = m_state + i;
- T result = RandomToTypeNormal<T>(&local_state);
- m_state = local_state;
+ T result = RandomToTypeNormal<T>(&m_state, i);
return result;
}
@@ -244,11 +238,9 @@ template <typename T> class NormalRandomGenerator {
Packet packetOp(Index i) const {
const int packetSize = internal::unpacket_traits<Packet>::size;
EIGEN_ALIGN_MAX T values[packetSize];
- uint64_t local_state = m_state + i;
for (int j = 0; j < packetSize; ++j) {
- values[j] = RandomToTypeNormal<T>(&local_state);
+ values[j] = RandomToTypeNormal<T>(&m_state, i);
}
- m_state = local_state;
return internal::pload<Packet>(values);
}
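
The generator change above threads the coefficient index through as a PCG "stream": the additive constant of the LCG step is derived from the index (kept odd via (stream << 1) | 1), so each coefficient draws from its own sequence without copying the state in and out. A standalone sketch of that step (names illustrative):

#include <cstdint>
#include <cstdio>

static inline unsigned pcg_xsh_rs(uint64_t* state, uint64_t stream) {
  const uint64_t current = *state;
  // LCG step; (stream << 1) | 1 keeps the increment odd, as PCG requires.
  *state = current * 6364136223846793005ULL + ((stream << 1) | 1);
  // XSH-RS output function, as in the generator above.
  return static_cast<unsigned>((current ^ (current >> 22)) >> (22 + (current >> 61)));
}

int main() {
  uint64_t state = 0x853c49e6748fea9bULL;  // arbitrary seed
  for (uint64_t i = 0; i < 4; ++i)
    std::printf("coeff %llu -> %u\n", static_cast<unsigned long long>(i), pcg_xsh_rs(&state, i));
  return 0;
}
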
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
index e341e2e9b..da0ffe728 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -44,6 +44,7 @@ namespace internal {
typedef typename XprType::Nested Nested;
static const int NumDimensions = XprTraits::NumDimensions - array_size<Dims>::value;
static const int Layout = XprTraits::Layout;
+ typedef typename XprTraits::PointerType PointerType;
template <class T> struct MakePointer {
// Intermediate typedef to workaround MSVC issue.
@@ -333,7 +334,7 @@ struct OuterReducer {
};
-#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
+#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC)
template <int B, int N, typename S, typename R, typename I>
__global__ void FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*);
@@ -421,7 +422,10 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
static const bool RunningFullReduction = (NumOutputDims==0);
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
- : m_impl(op.expression(), device), m_reducer(op.reducer()), m_result(NULL), m_device(device), m_xpr_dims(op.dims())
+ : m_impl(op.expression(), device), m_reducer(op.reducer()), m_result(NULL), m_device(device)
+#if defined(EIGEN_USE_SYCL)
+ , m_xpr_dims(op.dims())
+#endif
{
EIGEN_STATIC_ASSERT((NumInputDims >= NumReducedDims), YOU_MADE_A_PROGRAMMING_MISTAKE);
EIGEN_STATIC_ASSERT((!ReducingInnerMostDims | !PreservingInnerMostDims | (NumReducedDims == NumInputDims)),
@@ -674,14 +678,13 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
}
}
- EIGEN_DEVICE_FUNC typename MakePointer_<Scalar>::Type data() const { return m_result; }
- /// required by sycl in order to extract the accessor
- const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
- /// added for sycl in order to construct the buffer from the sycl device
- const Device& device() const{return m_device;}
- /// added for sycl in order to re-construct the reduction eval on the device for the sub-kernel
- const Dims& xprDims() const {return m_xpr_dims;}
+ EIGEN_DEVICE_FUNC typename MakePointer_<CoeffReturnType>::Type data() const { return m_result; }
+#if defined(EIGEN_USE_SYCL)
+ const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
+ const Device& device() const { return m_device; }
+ const Dims& xprDims() const { return m_xpr_dims; }
+#endif
private:
template <int, typename, typename> friend struct internal::GenericDimReducer;
@@ -691,7 +694,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
#ifdef EIGEN_USE_THREADS
template <typename S, typename O, bool V> friend struct internal::FullReducerShard;
#endif
-#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
+#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC)
template <int B, int N, typename S, typename R, typename I> KERNEL_FRIEND void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*);
#ifdef EIGEN_HAS_CUDA_FP16
template <typename S, typename R, typename I> KERNEL_FRIEND void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*);
@@ -778,7 +781,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
Op m_reducer;
// For full reductions
-#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
+#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC)
static const bool RunningOnGPU = internal::is_same<Device, Eigen::GpuDevice>::value;
static const bool RunningOnSycl = false;
#elif defined(EIGEN_USE_SYCL)
@@ -791,7 +794,10 @@ static const bool RunningOnGPU = false;
typename MakePointer_<CoeffReturnType>::Type m_result;
const Device& m_device;
- const Dims& m_xpr_dims;
+
+#if defined(EIGEN_USE_SYCL)
+ const Dims m_xpr_dims;
+#endif
};
} // end namespace Eigen
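
The reduction evaluator above now keeps the reduction dimensions only on the SYCL path, stored by value instead of the reference kept previously, which avoids holding a reference into the host-side expression when the evaluator is rebuilt on the device. A minimal sketch of that conditional-member pattern (illustrative class, not Eigen's):

struct DimsHolder { int d0, d1; };

class ReductionEvaluatorSketch {
 public:
  explicit ReductionEvaluatorSketch(const DimsHolder& dims)
#if defined(EIGEN_USE_SYCL)
      : m_xpr_dims(dims)   // copied, so the device-side rebuild owns the dims
#endif
  { (void)dims; }

#if defined(EIGEN_USE_SYCL)
  const DimsHolder& xprDims() const { return m_xpr_dims; }
 private:
  const DimsHolder m_xpr_dims;
#endif
};
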
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
index edb0ab280..ebcbd6f41 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
@@ -14,7 +14,7 @@ namespace Eigen {
namespace internal {
-#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
+#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC)
// Full reducers for GPU, don't vectorize for now
// Reducer function that enables multiple cuda thread to safely accumulate at the same
@@ -23,7 +23,7 @@ namespace internal {
// updated the content of the output address it will try again.
template <typename T, typename R>
__device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) {
-#if __CUDA_ARCH__ >= 300
+#if EIGEN_CUDA_ARCH >= 300
if (sizeof(T) == 4)
{
unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
@@ -62,9 +62,9 @@ __device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer)
else {
assert(0 && "Wordsize not supported");
}
-#else
+#else // EIGEN_CUDA_ARCH >= 300
assert(0 && "Shouldn't be called on unsupported device");
-#endif
+#endif // EIGEN_CUDA_ARCH >= 300
}
// We extend atomicExch to support extra data types
@@ -98,15 +98,15 @@ __device__ inline void atomicReduce(half2* output, half2 accum, R<half>& reducer
}
}
}
-#endif
+#endif // EIGEN_HAS_CUDA_FP16
template <>
__device__ inline void atomicReduce(float* output, float accum, SumReducer<float>&) {
-#if __CUDA_ARCH__ >= 300
+#if EIGEN_CUDA_ARCH >= 300
atomicAdd(output, accum);
-#else
+#else // EIGEN_CUDA_ARCH >= 300
assert(0 && "Shouldn't be called on unsupported device");
-#endif
+#endif // EIGEN_CUDA_ARCH >= 300
}
@@ -124,7 +124,7 @@ template <int BlockSize, int NumPerThread, typename Self,
typename Reducer, typename Index>
__global__ void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs,
typename Self::CoeffReturnType* output, unsigned int* semaphore) {
-#if __CUDA_ARCH__ >= 300
+#if EIGEN_CUDA_ARCH >= 300
// Initialize the output value
const Index first_index = blockIdx.x * BlockSize * NumPerThread + threadIdx.x;
if (gridDim.x == 1) {
@@ -168,7 +168,11 @@ __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num
#pragma unroll
for (int offset = warpSize/2; offset > 0; offset /= 2) {
+ #if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
reducer.reduce(__shfl_down(accum, offset, warpSize), &accum);
+ #else
+ reducer.reduce(__shfl_down_sync(0xFFFFFFFF, accum, offset, warpSize), &accum);
+ #endif
}
if ((threadIdx.x & (warpSize - 1)) == 0) {
@@ -179,9 +183,9 @@ __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num
// Let the last block reset the semaphore
atomicInc(semaphore, gridDim.x + 1);
}
-#else
+#else // EIGEN_CUDA_ARCH >= 300
assert(0 && "Shouldn't be called on unsupported device");
-#endif
+#endif // EIGEN_CUDA_ARCH >= 300
}
@@ -223,12 +227,14 @@ __global__ void FullReductionKernelHalfFloat(Reducer reducer, const Self input,
const Index first_index = blockIdx.x * BlockSize * NumPerThread + 2*threadIdx.x;
// Initialize the output value if it wasn't initialized by the ReductionInitKernel
- if (gridDim.x == 1 && first_index == 0) {
- if (num_coeffs % 2 != 0) {
- half last = input.m_impl.coeff(num_coeffs-1);
- *scratch = __halves2half2(last, reducer.initialize());
- } else {
- *scratch = reducer.template initializePacket<half2>();
+ if (gridDim.x == 1) {
+ if (first_index == 0) {
+ if (num_coeffs % 2 != 0) {
+ half last = input.m_impl.coeff(num_coeffs-1);
+ *scratch = __halves2half2(last, reducer.initialize());
+ } else {
+ *scratch = reducer.template initializePacket<half2>();
+ }
}
__syncthreads();
}
@@ -244,19 +250,25 @@ __global__ void FullReductionKernelHalfFloat(Reducer reducer, const Self input,
#pragma unroll
for (int offset = warpSize/2; offset > 0; offset /= 2) {
+ #if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
reducer.reducePacket(__shfl_down(accum, offset, warpSize), &accum);
+ #else
+ int temp = __shfl_down_sync(0xFFFFFFFF, *(int*)(&accum), (unsigned)offset, warpSize);
+ reducer.reducePacket(*(half2*)(&temp), &accum);
+ #endif
}
if ((threadIdx.x & (warpSize - 1)) == 0) {
atomicReduce(scratch, accum, reducer);
}
- __syncthreads();
-
- if (gridDim.x == 1 && first_index == 0) {
- half tmp = __low2half(*scratch);
- reducer.reduce(__high2half(*scratch), &tmp);
- *output = tmp;
+ if (gridDim.x == 1) {
+ __syncthreads();
+ if (first_index == 0) {
+ half tmp = __low2half(*scratch);
+ reducer.reduce(__high2half(*scratch), &tmp);
+ *output = tmp;
+ }
}
}
@@ -268,7 +280,7 @@ __global__ void ReductionCleanupKernelHalfFloat(Op& reducer, half* output, half2
*output = tmp;
}
-#endif
+#endif // EIGEN_HAS_CUDA_FP16
template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
struct FullReductionLauncher {
@@ -335,7 +347,7 @@ struct FullReductionLauncher<Self, Op, Eigen::half, true> {
}
}
};
-#endif
+#endif // EIGEN_HAS_CUDA_FP16
template <typename Self, typename Op, bool Vectorizable>
@@ -348,11 +360,11 @@ struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
(internal::is_same<typename Self::CoeffReturnType, float>::value ||
internal::is_same<typename Self::CoeffReturnType, double>::value ||
(internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
-#else
+#else // EIGEN_HAS_CUDA_FP16
static const bool HasOptimizedImplementation = !Op::IsStateful &&
(internal::is_same<typename Self::CoeffReturnType, float>::value ||
internal::is_same<typename Self::CoeffReturnType, double>::value);
-#endif
+#endif // EIGEN_HAS_CUDA_FP16
template <typename OutputType>
static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) {
@@ -372,7 +384,7 @@ template <int NumPerThread, typename Self,
typename Reducer, typename Index>
__global__ void InnerReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
typename Self::CoeffReturnType* output) {
-#if __CUDA_ARCH__ >= 300
+#if EIGEN_CUDA_ARCH >= 300
typedef typename Self::CoeffReturnType Type;
eigen_assert(blockDim.y == 1);
eigen_assert(blockDim.z == 1);
@@ -425,7 +437,11 @@ __global__ void InnerReductionKernel(Reducer reducer, const Self input, Index nu
#pragma unroll
for (int offset = warpSize/2; offset > 0; offset /= 2) {
+ #if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
reducer.reduce(__shfl_down(reduced_val, offset), &reduced_val);
+ #else
+ reducer.reduce(__shfl_down_sync(0xFFFFFFFF, reduced_val, offset), &reduced_val);
+ #endif
}
if ((threadIdx.x & (warpSize - 1)) == 0) {
@@ -433,9 +449,9 @@ __global__ void InnerReductionKernel(Reducer reducer, const Self input, Index nu
}
}
}
-#else
+#else // EIGEN_CUDA_ARCH >= 300
assert(0 && "Shouldn't be called on unsupported device");
-#endif
+#endif // EIGEN_CUDA_ARCH >= 300
}
#ifdef EIGEN_HAS_CUDA_FP16
@@ -515,8 +531,15 @@ __global__ void InnerReductionKernelHalfFloat(Reducer reducer, const Self input,
#pragma unroll
for (int offset = warpSize/2; offset > 0; offset /= 2) {
+ #if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
reducer.reducePacket(__shfl_down(reduced_val1, offset, warpSize), &reduced_val1);
reducer.reducePacket(__shfl_down(reduced_val2, offset, warpSize), &reduced_val2);
+ #else
+ int temp1 = __shfl_down_sync(0xFFFFFFFF, *(int*)(&reduced_val1), (unsigned)offset, warpSize);
+ int temp2 = __shfl_down_sync(0xFFFFFFFF, *(int*)(&reduced_val2), (unsigned)offset, warpSize);
+ reducer.reducePacket(*(half2*)(&temp1), &reduced_val1);
+ reducer.reducePacket(*(half2*)(&temp2), &reduced_val2);
+ #endif
}
half val1 = __low2half(reduced_val1);
@@ -533,7 +556,7 @@ __global__ void InnerReductionKernelHalfFloat(Reducer reducer, const Self input,
}
}
-#endif
+#endif // EIGEN_HAS_CUDA_FP16
template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
struct InnerReductionLauncher {
@@ -625,7 +648,7 @@ struct InnerReductionLauncher<Self, Op, Eigen::half, true> {
return false;
}
};
-#endif
+#endif // EIGEN_HAS_CUDA_FP16
template <typename Self, typename Op>
@@ -638,11 +661,11 @@ struct InnerReducer<Self, Op, GpuDevice> {
(internal::is_same<typename Self::CoeffReturnType, float>::value ||
internal::is_same<typename Self::CoeffReturnType, double>::value ||
(internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
-#else
+#else // EIGEN_HAS_CUDA_FP16
static const bool HasOptimizedImplementation = !Op::IsStateful &&
(internal::is_same<typename Self::CoeffReturnType, float>::value ||
internal::is_same<typename Self::CoeffReturnType, double>::value);
-#endif
+#endif // EIGEN_HAS_CUDA_FP16
template <typename OutputType>
static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
@@ -740,7 +763,7 @@ struct OuterReducer<Self, Op, GpuDevice> {
}
};
-#endif
+#endif // defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC)
} // end namespace internal
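
The kernels above switch from __shfl_down to the CUDA 9 __shfl_down_sync (with a full-warp mask) inside the usual warp-level tree reduction. A minimal CUDA sketch of that pattern, with illustrative kernel and buffer names:

__global__ void warp_sum_sketch(const float* in, float* out) {
  float v = in[threadIdx.x];
  // Halve the stride each step; after log2(warpSize) steps lane 0 holds the sum.
  for (int offset = warpSize / 2; offset > 0; offset /= 2) {
#if defined(__CUDACC_VER_MAJOR__) && __CUDACC_VER_MAJOR__ < 9
    v += __shfl_down(v, offset, warpSize);                  // pre-CUDA-9 intrinsic
#else
    v += __shfl_down_sync(0xFFFFFFFF, v, offset, warpSize); // CUDA 9+ synchronising form
#endif
  }
  if ((threadIdx.x & (warpSize - 1)) == 0)
    out[threadIdx.x / warpSize] = v;                         // one partial sum per warp
}
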
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
index c3ca129e2..94899252b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
@@ -27,15 +27,15 @@ namespace internal {
template<typename OP, typename CoeffReturnType> struct syclGenericBufferReducer{
template<typename BufferTOut, typename BufferTIn>
-static void run(OP op, BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){
+static void run(OP op, BufferTOut& bufOut, ptrdiff_t out_offset, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){
do {
- auto f = [length, local, op, &bufOut, &bufI](cl::sycl::handler& h) mutable {
+ auto f = [length, local, op, out_offset, &bufOut, &bufI](cl::sycl::handler& h) mutable {
cl::sycl::nd_range<1> r{cl::sycl::range<1>{std::max(length, local)},
cl::sycl::range<1>{std::min(length, local)}};
/* Two accessors are used: one to the buffer that is being reduced,
* and a second to local memory, used to store intermediate data. */
auto aI =bufI.template get_access<cl::sycl::access::mode::read_write>(h);
- auto aOut =bufOut.template get_access<cl::sycl::access::mode::discard_write>(h);
+ auto aOut =bufOut.template get_access<cl::sycl::access::mode::write>(h);
typedef decltype(aI) InputAccessor;
typedef decltype(aOut) OutputAccessor;
typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write,cl::sycl::access::target::local> LocalAccessor;
@@ -43,7 +43,7 @@ static void run(OP op, BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDev
/* The parallel_for invocation chosen is the variant with an nd_item
* parameter, since the code requires barriers for correctness. */
- h.parallel_for(r, TensorSycl::internal::GenericKernelReducer<CoeffReturnType, OP, OutputAccessor, InputAccessor, LocalAccessor>(op, aOut, aI, scratch, length, local));
+ h.parallel_for(r, TensorSycl::internal::GenericKernelReducer<CoeffReturnType, OP, OutputAccessor, InputAccessor, LocalAccessor>(op, aOut, out_offset, aI, scratch, length, local));
};
dev.sycl_queue().submit(f);
dev.asynchronousExec();
@@ -60,9 +60,9 @@ static void run(OP op, BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDev
template<typename CoeffReturnType> struct syclGenericBufferReducer<Eigen::internal::MeanReducer<CoeffReturnType>, CoeffReturnType>{
template<typename BufferTOut, typename BufferTIn>
-static void run(Eigen::internal::MeanReducer<CoeffReturnType>, BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){
+static void run(Eigen::internal::MeanReducer<CoeffReturnType>, BufferTOut& bufOut, ptrdiff_t out_offset, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){
syclGenericBufferReducer<Eigen::internal::SumReducer<CoeffReturnType>, CoeffReturnType>::run(Eigen::internal::SumReducer<CoeffReturnType>(),
- bufOut, bufI, dev, length, local);
+ bufOut, out_offset, bufI, dev, length, local);
}
};
@@ -127,8 +127,9 @@ struct FullReducer<Self, Op, const Eigen::SyclDevice, Vectorizable> {
// getting final out buffer at the moment the created buffer is true because there is no need for assign
auto out_buffer =dev.get_sycl_buffer(output);
+ ptrdiff_t out_offset = dev.get_offset(output);
/// This is used to recursively reduce the tmp value to an element of 1;
- syclGenericBufferReducer<Op, CoeffReturnType>::run(reducer, out_buffer, temp_global_buffer,dev, GRange, outTileSize);
+ syclGenericBufferReducer<Op, CoeffReturnType>::run(reducer, out_buffer, out_offset, temp_global_buffer,dev, GRange, outTileSize);
}
};
@@ -157,11 +158,12 @@ struct InnerReducer<Self, Op, const Eigen::SyclDevice> {
typedef decltype(TensorSycl::internal::createTupleOfAccessors(cgh, self.impl())) Tuple_of_Acc;
// create a tuple of accessors from Evaluator
Tuple_of_Acc tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl());
- auto output_accessor = dev.template get_sycl_accessor<cl::sycl::access::mode::discard_write>(cgh, output);
+ auto output_accessor = dev.template get_sycl_accessor<cl::sycl::access::mode::write>(cgh, output);
+ ptrdiff_t out_offset = dev.get_offset(output);
Index red_size = (num_values_to_reduce!=0)? num_values_to_reduce : static_cast<Index>(1);
cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)),
TensorSycl::internal::ReductionFunctor<HostExpr, FunctorExpr, Tuple_of_Acc, Dims, Op, typename Self::Index>
- (output_accessor, functors, tuple_of_accessors, self.xprDims(), reducer, range, red_size));
+ (output_accessor, out_offset, functors, tuple_of_accessors, self.xprDims(), reducer, range, red_size));
});
dev.asynchronousExec();
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
index e430b0826..14a50a029 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
@@ -31,6 +31,7 @@ struct traits<TensorReverseOp<ReverseDimensions,
typedef typename remove_reference<Nested>::type _Nested;
static const int NumDimensions = XprTraits::NumDimensions;
static const int Layout = XprTraits::Layout;
+ typedef typename XprTraits::PointerType PointerType;
};
template<typename ReverseDimensions, typename XprType>
@@ -222,7 +223,7 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
TensorOpCost(0, 0, compute_cost, false /* vectorized */, PacketSize);
}
- EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+ EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; }
/// required by sycl in order to extract the accessor
const TensorEvaluator<ArgType, Device> & impl() const { return m_impl; }
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
index 8501466ce..1f545ef1a 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
@@ -24,6 +24,7 @@ struct traits<TensorScanOp<Op, XprType> >
typedef typename remove_reference<Nested>::type _Nested;
static const int NumDimensions = XprTraits::NumDimensions;
static const int Layout = XprTraits::Layout;
+ typedef typename XprTraits::PointerType PointerType;
};
template<typename Op, typename XprType>
@@ -175,7 +176,7 @@ struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> {
return internal::ploadt<PacketReturnType, LoadMode>(m_output + index);
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Eigen::internal::traits<XprType>::PointerType data() const
{
return m_output;
}
@@ -241,7 +242,7 @@ struct ScanLauncher {
}
};
-#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
+#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC)
// GPU implementation of scan
// TODO(ibab) This placeholder implementation performs multiple scans in
@@ -280,7 +281,7 @@ struct ScanLauncher<Self, Reducer, GpuDevice> {
LAUNCH_CUDA_KERNEL((ScanKernel<Self, Reducer>), num_blocks, block_size, 0, self.device(), self, total_size, data);
}
};
-#endif // EIGEN_USE_GPU && __CUDACC__
+#endif // EIGEN_USE_GPU && EIGEN_CUDACC
} // end namespace Eigen
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
index edc9dd3f3..0697fd1ce 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
@@ -31,6 +31,7 @@ struct traits<TensorShufflingOp<Shuffle, XprType> > : public traits<XprType>
typedef typename remove_reference<Nested>::type _Nested;
static const int NumDimensions = XprTraits::NumDimensions;
static const int Layout = XprTraits::Layout;
+ typedef typename XprTraits::PointerType PointerType;
};
template<typename Shuffle, typename XprType>
@@ -185,7 +186,7 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
TensorOpCost(0, 0, compute_cost, false /* vectorized */, PacketSize);
}
- EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+ EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; }
// required by sycl
EIGEN_STRONG_INLINE const Shuffle& shufflePermutation() const {return m_shuffle;}
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
index 2237140e7..a7eea99b6 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
@@ -31,6 +31,7 @@ struct traits<TensorStridingOp<Strides, XprType> > : public traits<XprType>
typedef typename remove_reference<Nested>::type _Nested;
static const int NumDimensions = XprTraits::NumDimensions;
static const int Layout = XprTraits::Layout;
+ typedef typename XprTraits::PointerType PointerType;
};
template<typename Strides, typename XprType>
@@ -222,7 +223,7 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
}
- EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+ EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; }
/// required by sycl in order to extract the accessor
const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h
index 9d5a6d4c1..7b8bd2df7 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h
@@ -32,12 +32,28 @@ struct MakeLocalPointer {
namespace Eigen {
-namespace TensorSycl {
+ template<typename StrideDims, typename XprType> class TensorTupleReducerDeviceOp;
+ template<typename StrideDims, typename ArgType> struct TensorEvaluator<const TensorTupleReducerDeviceOp<StrideDims, ArgType>, SyclKernelDevice>;
namespace internal {
- template<typename CoeffReturnType, typename OP, typename OutputAccessor, typename InputAccessor, typename LocalAccessor> struct GenericKernelReducer;
+#ifdef __SYCL_DEVICE_ONLY__
+template<typename A, typename B> struct TypeConversion {
+ template<typename T>
+ static typename MakeGlobalPointer<A>::Type get_address_space_pointer(typename MakeGlobalPointer<T>::Type p);
+ template<typename T>
+ static typename MakeLocalPointer<A>::Type get_address_space_pointer(typename MakeLocalPointer<T>::Type p);
+
+ template<typename T>
+ static A* get_address_space_pointer(T* p);
+ typedef decltype(get_address_space_pointer(B())) type;
+};
+#endif
+}
+namespace TensorSycl {
+namespace internal {
+ template<typename CoeffReturnType, typename OP, typename OutputAccessor, typename InputAccessor, typename LocalAccessor> struct GenericKernelReducer;
/// This struct is used for special expression nodes with no operations (for example assign and selectOP).
struct NoOP;
@@ -48,6 +64,13 @@ template<typename T> struct GetType<false, T>{
typedef T Type;
};
+template <bool Conds, size_t X , size_t Y > struct ValueCondition {
+ static constexpr size_t Res =X;
+};
+template<size_t X, size_t Y> struct ValueCondition<false, X, Y> {
+ static constexpr size_t Res =Y;
+};
+
}
}
}
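
ValueCondition above is a compile-time selector between two size_t values; the expression constructors below use it, for example, to view a rank-0 (fully reduced) result as a rank-1 TensorMap. A standalone usage sketch (the helper is restated here so the snippet compiles on its own; values are illustrative):

#include <cstddef>

template <bool Conds, std::size_t X, std::size_t Y> struct ValueCondition {
  static constexpr std::size_t Res = X;
};
template <std::size_t X, std::size_t Y> struct ValueCondition<false, X, Y> {
  static constexpr std::size_t Res = Y;
};

constexpr std::size_t NumDimensions = 0;   // e.g. a full reduction collapses all dims
constexpr std::size_t NumIndices =
    ValueCondition<NumDimensions == 0, 1, NumDimensions>::Res;
static_assert(NumIndices == 1, "a rank-0 result is mapped as a rank-1 tensor");
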
@@ -80,6 +103,9 @@ template<typename T> struct GetType<false, T>{
/// this is used for extracting tensor reduction
#include "TensorReductionSycl.h"
+// TensorArgMaxSycl.h
+#include "TensorArgMaxSycl.h"
+
/// this is used for extracting tensor convolution
#include "TensorConvolutionSycl.h"
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h
index ee8f3c9c2..d6ac7b91f 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h
@@ -91,27 +91,37 @@ ASSIGNCONVERT(, false)
#undef ASSIGNCONVERT
/// specialisation of the \ref ConvertToDeviceExpression struct when the node
-/// type is either TensorForcedEvalOp or TensorEvalToOp
+/// type is TensorEvalToOp
#define KERNELBROKERCONVERT(CVQual, Res, ExprNode)\
template <typename Expr>\
struct ConvertToDeviceExpression<CVQual ExprNode<Expr> > \
: DeviceConvertor<ExprNode, Res, Expr>{};
-/// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorForcedEvalOp
-#define KERNELBROKERCONVERTFORCEDEVAL(CVQual)\
+
+KERNELBROKERCONVERT(const, true, TensorEvalToOp)
+KERNELBROKERCONVERT(, false, TensorEvalToOp)
+#undef KERNELBROKERCONVERT
+
+/// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorForcedEvalOp, TensorLayoutSwapOp or TensorIndexTupleOp
+#define KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP(CVQual, ExprNode)\
template <typename Expr>\
-struct ConvertToDeviceExpression<CVQual TensorForcedEvalOp<Expr> > {\
- typedef CVQual TensorForcedEvalOp< typename ConvertToDeviceExpression<Expr>::Type> Type;\
+struct ConvertToDeviceExpression<CVQual ExprNode<Expr> > {\
+ typedef CVQual ExprNode< typename ConvertToDeviceExpression<Expr>::Type> Type;\
};
-KERNELBROKERCONVERTFORCEDEVAL(const)
-KERNELBROKERCONVERTFORCEDEVAL()
-#undef KERNELBROKERCONVERTFORCEDEVAL
+// TensorForcedEvalOp
+KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP(const,TensorForcedEvalOp)
+KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP(,TensorForcedEvalOp)
-KERNELBROKERCONVERT(const, true, TensorEvalToOp)
-KERNELBROKERCONVERT(, false, TensorEvalToOp)
-#undef KERNELBROKERCONVERT
+// TensorLayoutSwapOp
+KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP(const,TensorLayoutSwapOp)
+KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP(,TensorLayoutSwapOp)
+
+//TensorIndexTupleOp
+KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP(const,TensorIndexTupleOp)
+KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP(,TensorIndexTupleOp)
+#undef KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP
/// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorReductionOp
#define KERNELBROKERCONVERTREDUCTION(CVQual)\
@@ -124,6 +134,18 @@ KERNELBROKERCONVERTREDUCTION(const)
KERNELBROKERCONVERTREDUCTION()
#undef KERNELBROKERCONVERTREDUCTION
+/// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorTupleReducerOp
+#define KERNELBROKERCONVERTTUPLEREDUCTION(CVQual)\
+template <typename OP, typename Dim, typename subExpr>\
+struct ConvertToDeviceExpression<CVQual TensorTupleReducerOp<OP, Dim, subExpr> > {\
+ typedef CVQual TensorTupleReducerOp<OP, Dim, typename ConvertToDeviceExpression<subExpr>::Type> Type;\
+};
+
+KERNELBROKERCONVERTTUPLEREDUCTION(const)
+KERNELBROKERCONVERTTUPLEREDUCTION()
+#undef KERNELBROKERCONVERTTUPLEREDUCTION
+
+//TensorSlicingOp
#define KERNELBROKERCONVERTSLICEOP(CVQual)\
template<typename StartIndices, typename Sizes, typename XprType>\
struct ConvertToDeviceExpression<CVQual TensorSlicingOp <StartIndices, Sizes, XprType> >{\
@@ -134,7 +156,7 @@ KERNELBROKERCONVERTSLICEOP(const)
KERNELBROKERCONVERTSLICEOP()
#undef KERNELBROKERCONVERTSLICEOP
-
+//TensorStridingSlicingOp
#define KERNELBROKERCONVERTERSLICESTRIDEOP(CVQual)\
template<typename StartIndices, typename StopIndices, typename Strides, typename XprType>\
struct ConvertToDeviceExpression<CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType> >{\
@@ -145,7 +167,6 @@ KERNELBROKERCONVERTERSLICESTRIDEOP(const)
KERNELBROKERCONVERTERSLICESTRIDEOP()
#undef KERNELBROKERCONVERTERSLICESTRIDEOP
-
/// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorChippingOp
#define KERNELBROKERCONVERTCHIPPINGOP(CVQual)\
template <DenseIndex DimId, typename Expr>\
@@ -156,7 +177,26 @@ KERNELBROKERCONVERTCHIPPINGOP(const)
KERNELBROKERCONVERTCHIPPINGOP()
#undef KERNELBROKERCONVERTCHIPPINGOP
+/// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorImagePatchOp
+#define KERNELBROKERCONVERTIMAGEPATCHOP(CVQual)\
+template<DenseIndex Rows, DenseIndex Cols, typename XprType>\
+struct ConvertToDeviceExpression<CVQual TensorImagePatchOp<Rows, Cols, XprType> >{\
+ typedef CVQual TensorImagePatchOp<Rows, Cols, typename ConvertToDeviceExpression<XprType>::Type> Type;\
+};
+KERNELBROKERCONVERTIMAGEPATCHOP(const)
+KERNELBROKERCONVERTIMAGEPATCHOP()
+#undef KERNELBROKERCONVERTIMAGEPATCHOP
+
+/// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorVolumePatchOp
+#define KERNELBROKERCONVERTVOLUMEPATCHOP(CVQual)\
+template<DenseIndex Plannes, DenseIndex Rows, DenseIndex Cols, typename XprType>\
+struct ConvertToDeviceExpression<CVQual TensorVolumePatchOp<Plannes, Rows, Cols, XprType> >{\
+ typedef CVQual TensorVolumePatchOp<Plannes, Rows, Cols, typename ConvertToDeviceExpression<XprType>::Type> Type;\
+};
+KERNELBROKERCONVERTVOLUMEPATCHOP(const)
+KERNELBROKERCONVERTVOLUMEPATCHOP()
+#undef KERNELBROKERCONVERTVOLUMEPATCHOP
} // namespace internal
} // namespace TensorSycl
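
Each of these CVQual macros stamps out a const and a non-const specialisation that rebuilds the node with its child expression converted for the device. A self-contained sketch of the recursion they generate (simplified placeholder types, not Eigen's):

#include <type_traits>

struct HostLeaf {};                           // stands in for a host-side leaf
struct DeviceLeaf {};                         // its device-side counterpart
template <typename Child> struct PatchOp {};  // stands in for any wrapper node

template <typename Expr> struct ConvertToDeviceExpression;   // primary template

// Leaf: swap the host placeholder for the device one.
template <> struct ConvertToDeviceExpression<HostLeaf> { typedef DeviceLeaf Type; };

// Node: rebuild with the converted child (this is what e.g.
// KERNELBROKERCONVERTIMAGEPATCHOP(const) expands to, in spirit).
template <typename Child>
struct ConvertToDeviceExpression<const PatchOp<Child> > {
  typedef const PatchOp<typename ConvertToDeviceExpression<Child>::Type> Type;
};

static_assert(std::is_same<
    ConvertToDeviceExpression<const PatchOp<HostLeaf> >::Type,
    const PatchOp<DeviceLeaf> >::value,
  "the child expression is converted in place");
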
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h
index 3b83b1d2c..cbae4ea1d 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h
@@ -65,7 +65,6 @@ CVQual PlaceHolder<CVQual TensorMap<T, Options_, MakePointer_>, N>, Params...>{\
: expr(Type(ConvertToActualTypeSycl(typename Type::Scalar, utility::tuple::get<N>(t)), fd.dimensions())){}\
};
-
TENSORMAP(const)
TENSORMAP()
#undef TENSORMAP
@@ -83,6 +82,7 @@ CVQual PlaceHolder<CVQual TensorMap<TensorFixedSize<Scalar_, Dimensions_, Option
ExprConstructor(FuncDetector &, const utility::tuple::Tuple<Params...> &t)\
: expr(DeviceFixedSizeTensor<Type,Dimensions_>::instantiate(utility::tuple::get<N>(t))){}\
};
+
TENSORMAPFIXEDSIZE(const)
TENSORMAPFIXEDSIZE()
#undef TENSORMAPFIXEDSIZE
@@ -189,9 +189,6 @@ struct ExprConstructor<CVQual TensorAssignOp<OrigLHSExpr, OrigRHSExpr>, CVQual
ASSIGN()
#undef ASSIGN
-
-
-
/// specialisation of the \ref ExprConstructor struct when the node type is
/// const TensorAssignOp
#define CONVERSIONEXPRCONST(CVQual)\
@@ -223,7 +220,7 @@ struct ExprConstructor<CVQual TensorEvalToOp<OrigExpr, MakeGlobalPointer>, CVQua
Type expr;\
template <typename FuncDetector>\
ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\
- : nestedExpression(funcD.rhsExpr, t), buffer(t), expr(buffer.expr, nestedExpression.expr) {}\
+ : nestedExpression(funcD.xprExpr, t), buffer(t), expr(buffer.expr, nestedExpression.expr) {}\
};
EVALTO(const)
@@ -236,8 +233,12 @@ EVALTO()
template <typename OrigExpr, typename DevExpr, size_t N, typename... Params>\
struct ExprConstructor<CVQual TensorForcedEvalOp<OrigExpr>,\
CVQual PlaceHolder<CVQual TensorForcedEvalOp<DevExpr>, N>, Params...> {\
- typedef CVQual TensorMap<Tensor<typename TensorForcedEvalOp<DevExpr>::Scalar,\
- TensorForcedEvalOp<DevExpr>::NumDimensions, Eigen::internal::traits<TensorForcedEvalOp<DevExpr>>::Layout, typename TensorForcedEvalOp<DevExpr>::Index>, Eigen::internal::traits<TensorForcedEvalOp<DevExpr>>::Layout, MakeGlobalPointer> Type;\
+ typedef TensorForcedEvalOp<OrigExpr> XprType;\
+ typedef CVQual TensorMap<\
+ Tensor<typename XprType::Scalar,XprType::NumDimensions, Eigen::internal::traits<XprType>::Layout,typename XprType::Index>,\
+ Eigen::internal::traits<XprType>::Layout, \
+ MakeGlobalPointer\
+ > Type;\
Type expr;\
template <typename FuncDetector>\
ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple<Params...> &t)\
@@ -248,19 +249,32 @@ FORCEDEVAL(const)
FORCEDEVAL()
#undef FORCEDEVAL
-template <bool Conds, size_t X , size_t Y > struct ValueCondition {
- static const size_t Res =X;
-};
-template<size_t X, size_t Y> struct ValueCondition<false, X , Y> {
- static const size_t Res =Y;
+#define TENSORCUSTOMUNARYOP(CVQual)\
+template <typename CustomUnaryFunc, typename OrigExpr, typename DevExpr, size_t N, typename... Params>\
+struct ExprConstructor<CVQual TensorCustomUnaryOp<CustomUnaryFunc, OrigExpr>,\
+CVQual PlaceHolder<CVQual TensorCustomUnaryOp<CustomUnaryFunc, DevExpr>, N>, Params...> {\
+ typedef TensorCustomUnaryOp<CustomUnaryFunc, OrigExpr> XprType;\
+ typedef CVQual TensorMap<\
+ Tensor<typename XprType::Scalar,XprType::NumDimensions, Eigen::internal::traits<XprType>::Layout,typename XprType::Index>,\
+ Eigen::internal::traits<XprType>::Layout, \
+ MakeGlobalPointer\
+ > Type;\
+ Type expr;\
+ template <typename FuncDetector>\
+ ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple<Params...> &t)\
+ : expr(Type(ConvertToActualTypeSycl(typename Type::Scalar, utility::tuple::get<N>(t)), fd.dimensions())) {}\
};
+TENSORCUSTOMUNARYOP(const)
+TENSORCUSTOMUNARYOP()
+#undef TENSORCUSTOMUNARYOP
+
/// specialisation of the \ref ExprConstructor struct when the node type is TensorReductionOp
#define SYCLREDUCTIONEXPR(CVQual)\
template <typename OP, typename Dim, typename OrigExpr, typename DevExpr, size_t N, typename... Params>\
struct ExprConstructor<CVQual TensorReductionOp<OP, Dim, OrigExpr, MakeGlobalPointer>,\
CVQual PlaceHolder<CVQual TensorReductionOp<OP, Dim, DevExpr>, N>, Params...> {\
- static const size_t NumIndices= ValueCondition< TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>::NumDimensions==0, 1, TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>::NumDimensions >::Res;\
+ static const auto NumIndices= ValueCondition< TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>::NumDimensions==0, 1, TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>::NumDimensions >::Res;\
typedef CVQual TensorMap<Tensor<typename TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>::Scalar,\
NumIndices, Eigen::internal::traits<TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>>::Layout, typename TensorReductionOp<OP, Dim, DevExpr>::Index>, Eigen::internal::traits<TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>>::Layout, MakeGlobalPointer> Type;\
Type expr;\
@@ -273,32 +287,67 @@ SYCLREDUCTIONEXPR(const)
SYCLREDUCTIONEXPR()
#undef SYCLREDUCTIONEXPR
+/// specialisation of the \ref ExprConstructor struct when the node type is TensorTupleReducerOp
+/// the TensorReductionOp is used instead of the TensorTupleReducerOp to build the TensorMap, because the TensorMap is the output of the TensorReductionOp.
+#define SYCLTUPLEREDUCTIONEXPR(CVQual)\
+template <typename OP, typename Dim, typename OrigExpr, typename DevExpr, size_t N, typename... Params>\
+struct ExprConstructor<CVQual TensorTupleReducerOp<OP, Dim, OrigExpr>,\
+CVQual PlaceHolder<CVQual TensorTupleReducerOp<OP, Dim, DevExpr>, N>, Params...> {\
+ static const auto NumRedDims= TensorReductionOp<OP, Dim, const TensorIndexTupleOp<OrigExpr> , MakeGlobalPointer>::NumDimensions;\
+ static const auto NumIndices= ValueCondition<NumRedDims==0, 1, NumRedDims>::Res;\
+  static const int Layout = static_cast<int>(Eigen::internal::traits<TensorReductionOp<OP, Dim, const TensorIndexTupleOp<OrigExpr>, MakeGlobalPointer>>::Layout);\
+ typedef CVQual TensorMap<\
+ Tensor<typename TensorIndexTupleOp<OrigExpr>::CoeffReturnType,NumIndices, Layout, typename TensorTupleReducerOp<OP, Dim, OrigExpr>::Index>,\
+ Layout,\
+ MakeGlobalPointer\
+ > XprType;\
+ typedef typename TensorEvaluator<const TensorIndexTupleOp<OrigExpr> , SyclKernelDevice>::Dimensions InputDimensions;\
+ static const int NumDims = Eigen::internal::array_size<InputDimensions>::value;\
+ typedef array<Index, NumDims> StrideDims;\
+ typedef const TensorTupleReducerDeviceOp<StrideDims, XprType> Type;\
+ Type expr;\
+ template <typename FuncDetector>\
+ ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple<Params...> &t)\
+ :expr(Type(XprType(ConvertToActualTypeSycl(typename XprType::CoeffReturnType, utility::tuple::get<N>(t)), fd.dimensions()),\
+ fd.return_dim(), fd.strides(), fd.stride_mod(), fd.stride_div())) {\
+ }\
+};
+
+SYCLTUPLEREDUCTIONEXPR(const)
+SYCLTUPLEREDUCTIONEXPR()
+#undef SYCLTUPLEREDUCTIONEXPR
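This is the path taken by argmax/argmin on SYCL: the device rebuilds the node as a TensorTupleReducerDeviceOp wrapping a TensorMap over the tuple-reduction output, together with the stride bookkeeping needed to recover the index along the requested dimension. A hedged host-side sketch (device setup omitted, shapes illustrative):

    // assumes #include <unsupported/Eigen/CXX11/Tensor>
    void argmax_sketch() {
      Eigen::Tensor<float, 2> in(128, 64);
      in.setRandom();
      // argmax(1) builds a TensorTupleReducerOp; run through the SYCL executor it is
      // reconstructed on the device by the specialisation above.
      Eigen::Tensor<Eigen::DenseIndex, 1> out = in.argmax(1);
    }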
/// specialisation of the \ref ExprConstructor struct when the node type is
-/// TensorContractionOp
-#define SYCLCONTRACTIONCONVOLUTION(CVQual, ExprNode)\
+/// TensorContractionOp, TensorConvolutionOp, or TensorCustomBinaryOp
+#define SYCLCONTRACTCONVCUSBIOPS(CVQual, ExprNode)\
template <typename Indices, typename OrigLhsXprType, typename OrigRhsXprType, typename LhsXprType, typename RhsXprType, size_t N, typename... Params>\
struct ExprConstructor<CVQual ExprNode<Indices, OrigLhsXprType, OrigRhsXprType>,\
CVQual PlaceHolder<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, N>, Params...> {\
- static const size_t NumIndices= Eigen::internal::traits<ExprNode<Indices, OrigLhsXprType, OrigRhsXprType> >::NumDimensions;\
- typedef CVQual TensorMap<Tensor<typename ExprNode<Indices, OrigLhsXprType, OrigRhsXprType>::Scalar,\
- NumIndices, Eigen::internal::traits<ExprNode<Indices, OrigRhsXprType, OrigRhsXprType> >::Layout,\
- typename ExprNode<Indices, OrigRhsXprType, OrigRhsXprType>::Index>,\
- Eigen::internal::traits<ExprNode<Indices, OrigRhsXprType, OrigRhsXprType>>::Layout, MakeGlobalPointer> Type;\
+ typedef ExprNode<Indices, OrigLhsXprType, OrigRhsXprType> XprTyp;\
+ static const auto NumIndices= Eigen::internal::traits<XprTyp>::NumDimensions;\
+ typedef CVQual TensorMap<\
+ Tensor<typename XprTyp::Scalar,NumIndices, Eigen::internal::traits<XprTyp>::Layout, typename XprTyp::Index>,\
+ Eigen::internal::traits<XprTyp>::Layout, \
+ MakeGlobalPointer\
+ > Type;\
Type expr;\
template <typename FuncDetector>\
ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple<Params...> &t)\
:expr(Type(ConvertToActualTypeSycl(typename Type::Scalar, utility::tuple::get<N>(t)), fd.dimensions())) {}\
};
-SYCLCONTRACTIONCONVOLUTION(const, TensorContractionOp)
-SYCLCONTRACTIONCONVOLUTION(, TensorContractionOp)
-SYCLCONTRACTIONCONVOLUTION(const, TensorConvolutionOp)
-SYCLCONTRACTIONCONVOLUTION(, TensorConvolutionOp)
-#undef SYCLCONTRACTIONCONVOLUTION
-
-
-
+//TensorContractionOp
+SYCLCONTRACTCONVCUSBIOPS(const, TensorContractionOp)
+SYCLCONTRACTCONVCUSBIOPS(, TensorContractionOp)
+//TensorConvolutionOp
+SYCLCONTRACTCONVCUSBIOPS(const, TensorConvolutionOp)
+SYCLCONTRACTCONVCUSBIOPS(, TensorConvolutionOp)
+//TensorCustomBinaryOp
+SYCLCONTRACTCONVCUSBIOPS(const, TensorCustomBinaryOp)
+SYCLCONTRACTCONVCUSBIOPS(, TensorCustomBinaryOp)
+#undef SYCLCONTRACTCONVCUSBIOPS
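Contractions, convolutions and custom binary ops are all evaluated before the kernel launches, so on the device each of them appears as a TensorMap over the buffer holding its result; the kernel never re-runs the operation itself. A hedged host-side sketch of one such node (shapes and contraction dimensions are illustrative):

    // assumes #include <unsupported/Eigen/CXX11/Tensor>
    void contraction_sketch() {
      Eigen::Tensor<float, 2> a(32, 16), b(16, 8);
      a.setRandom(); b.setRandom();
      Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1> dims =
          {{Eigen::IndexPair<Eigen::DenseIndex>(1, 0)}};
      // Evaluated ahead of any enclosing SYCL kernel; inside the kernel this node is
      // just a TensorMap leaf over c's buffer, as generated by SYCLCONTRACTCONVCUSBIOPS.
      Eigen::Tensor<float, 2> c = a.contract(b, dims);
    }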
+
+//TensorSlicingOp
#define SYCLSLICEOPEXPR(CVQual)\
template<typename StartIndices, typename Sizes, typename OrigXprType, typename XprType, typename... Params>\
struct ExprConstructor<CVQual TensorSlicingOp <StartIndices, Sizes, OrigXprType> , CVQual TensorSlicingOp<StartIndices, Sizes, XprType>, Params... >{\
@@ -315,7 +364,7 @@ SYCLSLICEOPEXPR(const)
SYCLSLICEOPEXPR()
#undef SYCLSLICEOPEXPR
-
+//TensorStridingSlicingOp
#define SYCLSLICESTRIDEOPEXPR(CVQual)\
template<typename StartIndices, typename StopIndices, typename Strides, typename OrigXprType, typename XprType, typename... Params>\
struct ExprConstructor<CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, OrigXprType>, CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>, Params... >{\
@@ -332,6 +381,7 @@ SYCLSLICESTRIDEOPEXPR(const)
SYCLSLICESTRIDEOPEXPR()
#undef SYCLSLICESTRIDEOPEXPR
+//TensorReshapingOp and TensorShufflingOp
#define SYCLRESHAPEANDSHUFFLEOPEXPRCONST(OPEXPR, CVQual)\
template<typename Param, typename OrigXprType, typename XprType, typename... Params>\
struct ExprConstructor<CVQual OPEXPR <Param, OrigXprType> , CVQual OPEXPR <Param, XprType>, Params... >{\
@@ -344,13 +394,15 @@ struct ExprConstructor<CVQual OPEXPR <Param, OrigXprType> , CVQual OPEXPR <Param
: xprExpr(funcD.xprExpr, t), expr(xprExpr.expr, funcD.param()) {}\
};
+// TensorReshapingOp
SYCLRESHAPEANDSHUFFLEOPEXPRCONST(TensorReshapingOp, const)
SYCLRESHAPEANDSHUFFLEOPEXPRCONST(TensorReshapingOp, )
-
+// TensorShufflingOp
SYCLRESHAPEANDSHUFFLEOPEXPRCONST(TensorShufflingOp, const)
SYCLRESHAPEANDSHUFFLEOPEXPRCONST(TensorShufflingOp, )
#undef SYCLRESHAPEANDSHUFFLEOPEXPRCONST
+//TensorPaddingOp
#define SYCLPADDINGOPEXPRCONST(OPEXPR, CVQual)\
template<typename Param, typename OrigXprType, typename XprType, typename... Params>\
struct ExprConstructor<CVQual OPEXPR <Param, OrigXprType> , CVQual OPEXPR <Param, XprType>, Params... >{\
@@ -363,11 +415,11 @@ struct ExprConstructor<CVQual OPEXPR <Param, OrigXprType> , CVQual OPEXPR <Param
: xprExpr(funcD.xprExpr, t), expr(xprExpr.expr, funcD.param() , funcD.scalar_param()) {}\
};
+//TensorPaddingOp
SYCLPADDINGOPEXPRCONST(TensorPaddingOp, const)
SYCLPADDINGOPEXPRCONST(TensorPaddingOp, )
#undef SYCLPADDINGOPEXPRCONST
-
// TensorChippingOp
#define SYCLTENSORCHIPPINGOPEXPR(CVQual)\
template<DenseIndex DimId, typename OrigXprType, typename XprType, typename... Params>\
@@ -385,6 +437,67 @@ SYCLTENSORCHIPPINGOPEXPR(const)
SYCLTENSORCHIPPINGOPEXPR()
#undef SYCLTENSORCHIPPINGOPEXPR
+// TensorImagePatchOp
+#define SYCLTENSORIMAGEPATCHOPEXPR(CVQual)\
+template<DenseIndex Rows, DenseIndex Cols, typename OrigXprType, typename XprType, typename... Params>\
+struct ExprConstructor<CVQual TensorImagePatchOp<Rows, Cols, OrigXprType>, CVQual TensorImagePatchOp<Rows, Cols, XprType>, Params... > {\
+ typedef ExprConstructor<OrigXprType, XprType, Params...> my_xpr_type;\
+ typedef CVQual TensorImagePatchOp<Rows, Cols, typename my_xpr_type::Type> Type;\
+ my_xpr_type xprExpr;\
+ Type expr;\
+ template <typename FuncDetector>\
+ ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\
+ : xprExpr(funcD.xprExpr, t), expr(xprExpr.expr, funcD.m_patch_rows, funcD.m_patch_cols, funcD.m_row_strides, funcD.m_col_strides,\
+ funcD.m_in_row_strides, funcD.m_in_col_strides, funcD.m_row_inflate_strides, funcD.m_col_inflate_strides, funcD.m_padding_explicit, \
+ funcD.m_padding_top, funcD.m_padding_bottom, funcD.m_padding_left, funcD.m_padding_right, funcD.m_padding_type, funcD.m_padding_value){}\
+};
+
+SYCLTENSORIMAGEPATCHOPEXPR(const)
+SYCLTENSORIMAGEPATCHOPEXPR()
+#undef SYCLTENSORIMAGEPATCHOPEXPR
+
+// TensorVolumePatchOp
+#define SYCLTENSORVOLUMEPATCHOPEXPR(CVQual)\
+template<DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename OrigXprType, typename XprType, typename... Params>\
+struct ExprConstructor<CVQual TensorVolumePatchOp<Planes, Rows, Cols, OrigXprType>, CVQual TensorVolumePatchOp<Planes, Rows, Cols, XprType>, Params... > {\
+ typedef ExprConstructor<OrigXprType, XprType, Params...> my_xpr_type;\
+ typedef CVQual TensorVolumePatchOp<Planes, Rows, Cols, typename my_xpr_type::Type> Type;\
+ my_xpr_type xprExpr;\
+ Type expr;\
+ template <typename FuncDetector>\
+ ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\
+ : xprExpr(funcD.xprExpr, t), expr(xprExpr.expr, funcD.m_patch_planes, funcD.m_patch_rows, funcD.m_patch_cols, funcD.m_plane_strides, funcD.m_row_strides, funcD.m_col_strides,\
+ funcD.m_in_plane_strides, funcD.m_in_row_strides, funcD.m_in_col_strides,funcD.m_plane_inflate_strides, funcD.m_row_inflate_strides, funcD.m_col_inflate_strides, \
+ funcD.m_padding_explicit, funcD.m_padding_top_z, funcD.m_padding_bottom_z, funcD.m_padding_top, funcD.m_padding_bottom, funcD.m_padding_left, funcD.m_padding_right, \
+ funcD.m_padding_type, funcD.m_padding_value ){\
+ }\
+};
+
+SYCLTENSORVOLUMEPATCHOPEXPR(const)
+SYCLTENSORVOLUMEPATCHOPEXPR()
+#undef SYCLTENSORVOLUMEPATCHOPEXPR
+
+// TensorLayoutSwapOp and TensorIndexTupleOp
+#define SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXPR(CVQual, ExprNode)\
+template<typename OrigXprType, typename XprType, typename... Params>\
+struct ExprConstructor<CVQual ExprNode <OrigXprType> , CVQual ExprNode<XprType>, Params... >{\
+ typedef ExprConstructor<OrigXprType, XprType, Params...> my_xpr_type;\
+ typedef CVQual ExprNode<typename my_xpr_type::Type> Type;\
+ my_xpr_type xprExpr;\
+ Type expr;\
+ template <typename FuncDetector>\
+ ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\
+ : xprExpr(funcD.xprExpr, t), expr(xprExpr.expr) {}\
+};
+
+//TensorLayoutSwapOp
+SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXPR(const, TensorLayoutSwapOp)
+SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXPR(, TensorLayoutSwapOp)
+//TensorIndexTupleOp
+SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXPR(const, TensorIndexTupleOp)
+SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXPR(, TensorIndexTupleOp)
+
+#undef SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXPR
/// template deduction for \ref ExprConstructor struct
template <typename OrigExpr, typename IndexExpr, typename FuncD, typename... Params>
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h
index b512d43f6..fb95af59e 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h
@@ -147,6 +147,30 @@ SYCLFORCEDEVALEXTACC(const)
SYCLFORCEDEVALEXTACC()
#undef SYCLFORCEDEVALEXTACC
+//TensorCustomUnaryOp
+#define SYCLCUSTOMUNARYOPEXTACC(CVQual)\
+template <typename CustomUnaryFunc, typename XprType, typename Dev >\
+struct ExtractAccessor<TensorEvaluator<CVQual TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Dev> > {\
+ static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Dev>& eval)\
+ RETURN_CPP11(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval))\
+};
+
+
+SYCLCUSTOMUNARYOPEXTACC(const)
+SYCLCUSTOMUNARYOPEXTACC()
+#undef SYCLCUSTOMUNARYOPEXTACC
+
+//TensorCustomBinaryOp
+#define SYCLCUSTOMBINARYOPEXTACC(CVQual)\
+template <typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType , typename Dev>\
+struct ExtractAccessor<TensorEvaluator<CVQual TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType>, Dev> > {\
+ static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType>, Dev>& eval)\
+ RETURN_CPP11(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval))\
+};
+
+SYCLCUSTOMBINARYOPEXTACC(const)
+SYCLCUSTOMBINARYOPEXTACC()
+#undef SYCLCUSTOMBINARYOPEXTACC
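Because both custom ops are evaluated before the kernel launch, accessor extraction for them only needs a read accessor on the buffer holding the result; nothing from the user functor crosses into the kernel. In effect (a paraphrase, not the literal expansion):

    // For a custom unary or binary evaluator `eval`, the generated getTuple(cgh, eval)
    // forwards to AccessorConstructor::getAccessor<cl::sycl::access::mode::read>, which
    // presumably yields a one-element tuple holding a read accessor over eval.data(),
    // i.e. the op's pre-computed output buffer.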
/// specialisation of the \ref ExtractAccessor struct when the node type is TensorEvalToOp
#define SYCLEVALTOEXTACC(CVQual)\
@@ -161,15 +185,19 @@ SYCLEVALTOEXTACC()
#undef SYCLEVALTOEXTACC
/// specialisation of the \ref ExtractAccessor struct when the node type is TensorReductionOp
-#define SYCLREDUCTIONEXTACC(CVQual)\
+#define SYCLREDUCTIONEXTACC(CVQual, ExprNode)\
template <typename OP, typename Dim, typename Expr, typename Dev>\
-struct ExtractAccessor<TensorEvaluator<CVQual TensorReductionOp<OP, Dim, Expr>, Dev> > {\
- static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorReductionOp<OP, Dim, Expr>, Dev>& eval)\
+struct ExtractAccessor<TensorEvaluator<CVQual ExprNode<OP, Dim, Expr>, Dev> > {\
+ static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual ExprNode<OP, Dim, Expr>, Dev>& eval)\
RETURN_CPP11(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval))\
};
+// TensorReductionOp
+SYCLREDUCTIONEXTACC(const,TensorReductionOp)
+SYCLREDUCTIONEXTACC(,TensorReductionOp)
-SYCLREDUCTIONEXTACC(const)
-SYCLREDUCTIONEXTACC()
+// TensorTupleReducerOp
+SYCLREDUCTIONEXTACC(const,TensorTupleReducerOp)
+SYCLREDUCTIONEXTACC(,TensorTupleReducerOp)
#undef SYCLREDUCTIONEXTACC
/// specialisation of the \ref ExtractAccessor struct when the node type is TensorContractionOp and TensorConvolutionOp
@@ -179,14 +207,14 @@ template<typename Indices, typename LhsXprType, typename RhsXprType, typename De
static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, Dev>& eval)\
RETURN_CPP11(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval))\
};
-
+//TensorContractionOp
SYCLCONTRACTIONCONVOLUTIONEXTACC(const,TensorContractionOp)
SYCLCONTRACTIONCONVOLUTIONEXTACC(,TensorContractionOp)
+//TensorConvolutionOp
SYCLCONTRACTIONCONVOLUTIONEXTACC(const,TensorConvolutionOp)
SYCLCONTRACTIONCONVOLUTIONEXTACC(,TensorConvolutionOp)
#undef SYCLCONTRACTIONCONVOLUTIONEXTACC
-
/// specialisation of the \ref ExtractAccessor struct when the node type is
/// const TensorSlicingOp.
#define SYCLSLICEOPEXTACC(CVQual)\
@@ -225,6 +253,49 @@ SYCLTENSORCHIPPINGOPEXTACC(const)
SYCLTENSORCHIPPINGOPEXTACC()
#undef SYCLTENSORCHIPPINGOPEXTACC
+/// specialisation of the \ref ExtractAccessor struct when the node type is
+/// TensorImagePatchOp.
+#define SYCLTENSORIMAGEPATCHOPEXTACC(CVQual)\
+template<DenseIndex Rows, DenseIndex Cols, typename XprType, typename Dev>\
+struct ExtractAccessor<TensorEvaluator<CVQual TensorImagePatchOp<Rows, Cols, XprType>, Dev> >{\
+ static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorImagePatchOp<Rows, Cols, XprType>, Dev>& eval)\
+ RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.impl()))\
+};
+
+SYCLTENSORIMAGEPATCHOPEXTACC(const)
+SYCLTENSORIMAGEPATCHOPEXTACC()
+#undef SYCLTENSORIMAGEPATCHOPEXTACC
+
+/// specialisation of the \ref ExtractAccessor struct when the node type is
+/// TensorVolumePatchOp.
+#define SYCLTENSORVOLUMEPATCHOPEXTACC(CVQual)\
+template<DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename XprType, typename Dev>\
+struct ExtractAccessor<TensorEvaluator<CVQual TensorVolumePatchOp<Planes, Rows, Cols, XprType>, Dev> >{\
+ static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorVolumePatchOp<Planes, Rows, Cols, XprType>, Dev>& eval)\
+ RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.impl()))\
+};
+
+SYCLTENSORVOLUMEPATCHOPEXTACC(const)
+SYCLTENSORVOLUMEPATCHOPEXTACC()
+#undef SYCLTENSORVOLUMEPATCHOPEXTACC
+
+/// specialisation of the \ref ExtractAccessor struct when the node type is
+/// TensorLayoutSwapOp or TensorIndexTupleOp.
+#define SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXTACC(CVQual, ExprNode)\
+template<typename XprType, typename Dev>\
+struct ExtractAccessor<TensorEvaluator<CVQual ExprNode<XprType>, Dev> >{\
+ static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual ExprNode<XprType>, Dev>& eval)\
+ RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.impl()))\
+};
+
+// TensorLayoutSwapOp
+SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXTACC(const,TensorLayoutSwapOp)
+SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXTACC(,TensorLayoutSwapOp)
+//TensorIndexTupleOp
+SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXTACC(const,TensorIndexTupleOp)
+SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXTACC(,TensorIndexTupleOp)
+
+#undef SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXTACC
/// template deduction for \ref ExtractAccessor
template <typename Evaluator>
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h
index ee020184b..a7905706d 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h
@@ -33,15 +33,17 @@ namespace internal {
/// re-instantiate them on the device.
/// We have to pass instantiated functors to the device.
// This struct is used for leafNode (TensorMap) and nodes behaving like leafNode (TensorForcedEval).
-template <typename Evaluator> struct FunctorExtractor{
- typedef typename Evaluator::Dimensions Dimensions;
- const Dimensions m_dimensions;
- EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
- FunctorExtractor(const Evaluator& expr)
- : m_dimensions(expr.dimensions()) {}
+#define DEFALTACTION(Evaluator)\
+typedef typename Evaluator::Dimensions Dimensions;\
+const Dimensions m_dimensions;\
+EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }\
+FunctorExtractor(const Evaluator& expr): m_dimensions(expr.dimensions()) {}
+template <typename Evaluator> struct FunctorExtractor{
+ DEFALTACTION(Evaluator)
};
+
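The DEFALTACTION body is the behaviour every leaf-like extractor shares: snapshot the evaluator's dimensions on the host and expose them through dimensions(). Roughly, the default FunctorExtractor expands to the following (EIGEN_STRONG_INLINE omitted, struct renamed for illustration):

    template <typename Evaluator> struct FunctorExtractorExpanded {
      typedef typename Evaluator::Dimensions Dimensions;
      const Dimensions m_dimensions;                       // host-side copy of the shape
      const Dimensions& dimensions() const { return m_dimensions; }
      FunctorExtractorExpanded(const Evaluator& expr) : m_dimensions(expr.dimensions()) {}
    };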
/// specialisation of the \ref FunctorExtractor struct when the node type does not require anything
///TensorConversionOp
#define SYCLEXTRFUNCCONVERSION(ExprNode, CVQual)\
@@ -113,6 +115,36 @@ SYCLEXTRFUNCTERNARY(const)
SYCLEXTRFUNCTERNARY()
#undef SYCLEXTRFUNCTERNARY
+
+
+// TensorCustomOp must be specialised, otherwise it would be captured by UnaryCategory even though its action differs
+// from the UnaryCategory case; its extractor instead matches the general FunctorExtractor.
+/// specialisation of TensorCustomOp
+#define SYCLEXTRFUNCCUSTOMUNARYOP(CVQual)\
+template <typename CustomUnaryFunc, typename ArgType, typename Dev >\
+struct FunctorExtractor<TensorEvaluator<CVQual TensorCustomUnaryOp<CustomUnaryFunc, ArgType>, Dev> > {\
+ typedef TensorEvaluator<CVQual TensorCustomUnaryOp<CustomUnaryFunc, ArgType>, Dev> Evaluator;\
+ DEFALTACTION(Evaluator)\
+};
+//TensorCustomUnaryOp
+SYCLEXTRFUNCCUSTOMUNARYOP(const)
+SYCLEXTRFUNCCUSTOMUNARYOP()
+#undef SYCLEXTRFUNCCUSTOMUNARYOP
+
+//TensorCustomBinaryOp
+#define SYCLEXTRFUNCCUSTOMBINARYOP(CVQual)\
+template <typename CustomBinaryFunc, typename ArgType1, typename ArgType2, typename Dev >\
+struct FunctorExtractor<TensorEvaluator<CVQual TensorCustomBinaryOp<CustomBinaryFunc, ArgType1, ArgType2>, Dev> > {\
+ typedef TensorEvaluator<CVQual TensorCustomBinaryOp<CustomBinaryFunc, ArgType1, ArgType2>, Dev> Evaluator;\
+ DEFALTACTION(Evaluator)\
+};
+//TensorCustomBinaryOp
+SYCLEXTRFUNCCUSTOMBINARYOP(const)
+SYCLEXTRFUNCCUSTOMBINARYOP()
+#undef SYCLEXTRFUNCCUSTOMBINARYOP
+
+
+
/// specialisation of the \ref FunctorExtractor struct when the node type is
/// TensorCwiseSelectOp. This is an specialisation without OP so it has to be separated.
#define SYCLEXTRFUNCSELECTOP(CVQual)\
@@ -143,19 +175,26 @@ SYCLEXTRFUNCASSIGNOP(const)
SYCLEXTRFUNCASSIGNOP()
#undef SYCLEXTRFUNCASSIGNOP
-/// specialisation of the \ref FunctorExtractor struct when the node type is
-/// TensorEvalToOp, This is an specialisation without OP so it has to be separated.
-#define SYCLEXTRFUNCEVALTOOP(CVQual)\
-template <typename RHSExpr, typename Dev>\
-struct FunctorExtractor<TensorEvaluator<CVQual TensorEvalToOp<RHSExpr>, Dev> > {\
- FunctorExtractor<TensorEvaluator<RHSExpr, Dev> > rhsExpr;\
- FunctorExtractor(const TensorEvaluator<CVQual TensorEvalToOp<RHSExpr>, Dev>& expr)\
- : rhsExpr(expr.impl()) {}\
+/// specialisation of the \ref FunctorExtractor struct when the node type is
+/// TensorEvalToOp, TensorLayoutSwapOp, or TensorIndexTupleOp. This is a specialisation without an OP, so it has to be kept separate.
+#define SYCLEXTRFUNCEVALTOOPSWAPLAYOUTINDEXTUPLE(CVQual, ExprNode)\
+template <typename Expr, typename Dev>\
+struct FunctorExtractor<TensorEvaluator<CVQual ExprNode<Expr>, Dev> > {\
+ FunctorExtractor<TensorEvaluator<Expr, Dev> > xprExpr;\
+ FunctorExtractor(const TensorEvaluator<CVQual ExprNode<Expr>, Dev>& expr)\
+ : xprExpr(expr.impl()) {}\
};
-
-SYCLEXTRFUNCEVALTOOP(const)
-SYCLEXTRFUNCEVALTOOP()
-#undef SYCLEXTRFUNCEVALTOOP
+//TensorEvalToOp
+SYCLEXTRFUNCEVALTOOPSWAPLAYOUTINDEXTUPLE(const, TensorEvalToOp)
+SYCLEXTRFUNCEVALTOOPSWAPLAYOUTINDEXTUPLE(, TensorEvalToOp)
+// TensorLayoutSwapOp
+SYCLEXTRFUNCEVALTOOPSWAPLAYOUTINDEXTUPLE(const, TensorLayoutSwapOp)
+SYCLEXTRFUNCEVALTOOPSWAPLAYOUTINDEXTUPLE(, TensorLayoutSwapOp)
+// TensorIndexTupleOp
+SYCLEXTRFUNCEVALTOOPSWAPLAYOUTINDEXTUPLE(const, TensorIndexTupleOp)
+SYCLEXTRFUNCEVALTOOPSWAPLAYOUTINDEXTUPLE(, TensorIndexTupleOp)
+
+#undef SYCLEXTRFUNCEVALTOOPSWAPLAYOUTINDEXTUPLE
template<typename Dim, size_t NumOutputDim> struct DimConstr {
template<typename InDim>
@@ -166,10 +205,10 @@ template<typename Dim> struct DimConstr<Dim, 0> {
template<typename InDim>
static EIGEN_STRONG_INLINE Dim getDim(InDim dims ) {return Dim(static_cast<Dim>(dims.TotalSize()));}
};
-
+//TensorReductionOp
#define SYCLEXTRFUNCREDUCTIONOP(CVQual)\
template<typename Op, typename Dims, typename ArgType, template <class> class MakePointer_, typename Device>\
-struct FunctorExtractor<TensorEvaluator<CVQual TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>>{\
+struct FunctorExtractor<TensorEvaluator<CVQual TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> >{\
typedef TensorEvaluator<CVQual TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> Evaluator;\
typedef typename Eigen::internal::conditional<Evaluator::NumOutputDims==0, DSizes<typename Evaluator::Index, 1>, typename Evaluator::Dimensions >::type Dimensions;\
const Dimensions m_dimensions;\
@@ -177,12 +216,39 @@ struct FunctorExtractor<TensorEvaluator<CVQual TensorReductionOp<Op, Dims, ArgTy
FunctorExtractor(const TensorEvaluator<CVQual TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>& expr)\
: m_dimensions(DimConstr<Dimensions, Evaluator::NumOutputDims>::getDim(expr.dimensions())) {}\
};
-
-
SYCLEXTRFUNCREDUCTIONOP(const)
SYCLEXTRFUNCREDUCTIONOP()
#undef SYCLEXTRFUNCREDUCTIONOP
+//TensorTupleReducerOp
+#define SYCLEXTRFUNCTUPLEREDUCTIONOP(CVQual)\
+template<typename ReduceOp, typename Dims, typename ArgType, typename Device>\
+ struct FunctorExtractor<TensorEvaluator<CVQual TensorTupleReducerOp<ReduceOp, Dims, ArgType>, Device> >{\
+ typedef TensorEvaluator<CVQual TensorTupleReducerOp<ReduceOp, Dims, ArgType>, Device> Evaluator;\
+ static const int NumOutputDims= Eigen::internal::traits<TensorTupleReducerOp<ReduceOp, Dims, ArgType> >::NumDimensions;\
+ typedef typename Evaluator::StrideDims StrideDims;\
+ typedef typename Evaluator::Index Index;\
+ typedef typename Eigen::internal::conditional<NumOutputDims==0, DSizes<Index, 1>, typename Evaluator::Dimensions >::type Dimensions;\
+ const Dimensions m_dimensions;\
+ const Index m_return_dim;\
+ const StrideDims m_strides;\
+ const Index m_stride_mod;\
+ const Index m_stride_div;\
+ EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }\
+ EIGEN_STRONG_INLINE Index return_dim() const {return m_return_dim;}\
+ EIGEN_STRONG_INLINE const StrideDims strides() const {return m_strides;}\
+ EIGEN_STRONG_INLINE const Index stride_mod() const {return m_stride_mod;}\
+ EIGEN_STRONG_INLINE const Index stride_div() const {return m_stride_div;}\
+ FunctorExtractor(const TensorEvaluator<CVQual TensorTupleReducerOp<ReduceOp, Dims, ArgType>, Device>& expr)\
+ : m_dimensions(DimConstr<Dimensions, NumOutputDims>::getDim(expr.dimensions())), m_return_dim(expr.return_dim()),\
+ m_strides(expr.strides()), m_stride_mod(expr.stride_mod()), m_stride_div(expr.stride_div()){}\
+};
+
+SYCLEXTRFUNCTUPLEREDUCTIONOP(const)
+SYCLEXTRFUNCTUPLEREDUCTIONOP()
+#undef SYCLEXTRFUNCTUPLEREDUCTIONOP
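For argmax/argmin the extractor carries more than a shape: the return dimension and the stride bookkeeping are copied from the host evaluator so that the device-side TensorTupleReducerDeviceOp built in TensorSyclExprConstructor.h can turn a flat tuple index back into an index along the requested dimension. A hedged worked example for a column-major 2-D argmax over dimension 1:

    // Tensor<float, 2> in(128, 64);  in.argmax(/*return_dim=*/1);
    //   m_return_dim == 1
    //   m_strides    == {1, 128}     // column-major input strides
    //   m_stride_div == 128          // stride of the returned dimension
    //   m_stride_mod == 128 * 64     // wrap-around modulus (whole tensor here)
    // so the reported index is (flat_argmax % m_stride_mod) / m_stride_div.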
+
+//TensorContractionOp and TensorConvolutionOp
#define SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(CVQual, ExprNode)\
template<typename Indices, typename LhsXprType, typename RhsXprType, typename Device>\
struct FunctorExtractor<TensorEvaluator<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, Device>>{\
@@ -194,9 +260,10 @@ struct FunctorExtractor<TensorEvaluator<CVQual ExprNode<Indices, LhsXprType, Rhs
: m_dimensions(expr.dimensions()) {}\
};
-
+//TensorContractionOp
SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(const,TensorContractionOp)
SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(,TensorContractionOp)
+//TensorConvolutionOp
SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(const,TensorConvolutionOp)
SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(,TensorConvolutionOp)
#undef SYCLEXTRFUNCCONTRACTCONVOLUTIONOP
@@ -219,6 +286,7 @@ SYCLEXTRFUNCTSLICEOP(const)
SYCLEXTRFUNCTSLICEOP()
#undef SYCLEXTRFUNCTSLICEOP
+//TensorStridingSlicingOp
#define SYCLEXTRFUNCTSLICESTRIDEOP(CVQual)\
template<typename StartIndices, typename StopIndices, typename Strides, typename XprType, typename Dev>\
struct FunctorExtractor<TensorEvaluator<CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>, Dev> >{\
@@ -237,7 +305,7 @@ SYCLEXTRFUNCTSLICESTRIDEOP(const)
SYCLEXTRFUNCTSLICESTRIDEOP()
#undef SYCLEXTRFUNCTSLICESTRIDEOP
-// Had to separate reshapeOP otherwise it will be mistaken by UnaryCategory
+// TensorReshapingOp and TensorShufflingOp have to be handled separately; otherwise they would be mistaken for UnaryCategory nodes
#define SYCLRESHAPEANDSHUFFLEOPFUNCEXT(OPEXPR, FUNCCALL, CVQual)\
template<typename Param, typename XprType, typename Dev>\
struct FunctorExtractor<Eigen::TensorEvaluator<CVQual Eigen::OPEXPR<Param, XprType>, Dev> > {\
@@ -248,9 +316,11 @@ struct FunctorExtractor<Eigen::TensorEvaluator<CVQual Eigen::OPEXPR<Param, XprTy
: xprExpr(expr.impl()), m_param(expr.FUNCCALL) {}\
};
+//TensorReshapingOp
SYCLRESHAPEANDSHUFFLEOPFUNCEXT(TensorReshapingOp, dimensions(), const)
SYCLRESHAPEANDSHUFFLEOPFUNCEXT(TensorReshapingOp, dimensions(), )
+//TensorShufflingOp
SYCLRESHAPEANDSHUFFLEOPFUNCEXT(TensorShufflingOp, shufflePermutation(), const)
SYCLRESHAPEANDSHUFFLEOPFUNCEXT(TensorShufflingOp, shufflePermutation(), )
#undef SYCLRESHAPEANDSHUFFLEOPFUNCEXT
@@ -293,7 +363,7 @@ SYCLEXTRFUNCCONTRACTCONCAT(TensorConcatenationOp, axis(),)
//TensorChippingOp
#define SYCLEXTRFUNCCHIPPINGOP(CVQual)\
template<DenseIndex DimId, typename XprType, typename Device>\
-struct FunctorExtractor<TensorEvaluator<CVQual TensorChippingOp<DimId, XprType>, Device>>{\
+struct FunctorExtractor<TensorEvaluator<CVQual TensorChippingOp<DimId, XprType>, Device> >{\
FunctorExtractor<Eigen::TensorEvaluator<XprType, Device> > xprExpr;\
const DenseIndex m_dim;\
const DenseIndex m_offset;\
@@ -307,6 +377,84 @@ SYCLEXTRFUNCCHIPPINGOP(const)
SYCLEXTRFUNCCHIPPINGOP()
#undef SYCLEXTRFUNCCHIPPINGOP
+//TensorImagePatchOp
+#define SYCLEXTRFUNCIMAGEPATCHOP(CVQual)\
+template<DenseIndex Rows, DenseIndex Cols, typename XprType, typename Device>\
+struct FunctorExtractor<TensorEvaluator<CVQual TensorImagePatchOp<Rows, Cols, XprType>, Device> >{\
+typedef CVQual TensorImagePatchOp<Rows, Cols, XprType> Self;\
+FunctorExtractor<Eigen::TensorEvaluator<XprType, Device> > xprExpr;\
+const DenseIndex m_patch_rows;\
+const DenseIndex m_patch_cols;\
+const DenseIndex m_row_strides;\
+const DenseIndex m_col_strides;\
+const DenseIndex m_in_row_strides;\
+const DenseIndex m_in_col_strides;\
+const DenseIndex m_row_inflate_strides;\
+const DenseIndex m_col_inflate_strides;\
+const bool m_padding_explicit;\
+const DenseIndex m_padding_top;\
+const DenseIndex m_padding_bottom;\
+const DenseIndex m_padding_left;\
+const DenseIndex m_padding_right;\
+const PaddingType m_padding_type;\
+const typename Self::Scalar m_padding_value;\
+FunctorExtractor(const TensorEvaluator<Self, Device>& expr)\
+: xprExpr(expr.impl()), m_patch_rows(expr.xpr().patch_rows()), m_patch_cols(expr.xpr().patch_cols()),\
+ m_row_strides(expr.xpr().row_strides()), m_col_strides(expr.xpr().col_strides()),\
+ m_in_row_strides(expr.xpr().in_row_strides()), m_in_col_strides(expr.xpr().in_col_strides()),\
+ m_row_inflate_strides(expr.xpr().row_inflate_strides()), m_col_inflate_strides(expr.xpr().col_inflate_strides()),\
+ m_padding_explicit(expr.xpr().padding_explicit()),m_padding_top(expr.xpr().padding_top()),\
+ m_padding_bottom(expr.xpr().padding_bottom()), m_padding_left(expr.xpr().padding_left()),\
+ m_padding_right(expr.xpr().padding_right()), m_padding_type(expr.xpr().padding_type()),\
+ m_padding_value(expr.xpr().padding_value()){}\
+};
+
+SYCLEXTRFUNCIMAGEPATCHOP(const)
+SYCLEXTRFUNCIMAGEPATCHOP()
+#undef SYCLEXTRFUNCIMAGEPATCHOP
+
+/// TensorVolumePatchOp
+#define SYCLEXTRFUNCVOLUMEPATCHOP(CVQual)\
+template<DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename XprType, typename Device>\
+struct FunctorExtractor<TensorEvaluator<CVQual TensorVolumePatchOp<Planes, Rows, Cols, XprType>, Device> >{\
+typedef CVQual TensorVolumePatchOp<Planes, Rows, Cols, XprType> Self;\
+FunctorExtractor<Eigen::TensorEvaluator<XprType, Device> > xprExpr;\
+const DenseIndex m_patch_planes;\
+const DenseIndex m_patch_rows;\
+const DenseIndex m_patch_cols;\
+const DenseIndex m_plane_strides;\
+const DenseIndex m_row_strides;\
+const DenseIndex m_col_strides;\
+const DenseIndex m_in_plane_strides;\
+const DenseIndex m_in_row_strides;\
+const DenseIndex m_in_col_strides;\
+const DenseIndex m_plane_inflate_strides;\
+const DenseIndex m_row_inflate_strides;\
+const DenseIndex m_col_inflate_strides;\
+const bool m_padding_explicit;\
+const DenseIndex m_padding_top_z;\
+const DenseIndex m_padding_bottom_z;\
+const DenseIndex m_padding_top;\
+const DenseIndex m_padding_bottom;\
+const DenseIndex m_padding_left;\
+const DenseIndex m_padding_right;\
+const PaddingType m_padding_type;\
+const typename Self::Scalar m_padding_value;\
+FunctorExtractor(const TensorEvaluator<Self, Device>& expr)\
+: xprExpr(expr.impl()), m_patch_planes(expr.xpr().patch_planes()), m_patch_rows(expr.xpr().patch_rows()), m_patch_cols(expr.xpr().patch_cols()),\
+ m_plane_strides(expr.xpr().plane_strides()), m_row_strides(expr.xpr().row_strides()), m_col_strides(expr.xpr().col_strides()),\
+ m_in_plane_strides(expr.xpr().in_plane_strides()), m_in_row_strides(expr.xpr().in_row_strides()), m_in_col_strides(expr.xpr().in_col_strides()),\
+ m_plane_inflate_strides(expr.xpr().plane_inflate_strides()),m_row_inflate_strides(expr.xpr().row_inflate_strides()),\
+ m_col_inflate_strides(expr.xpr().col_inflate_strides()), m_padding_explicit(expr.xpr().padding_explicit()),\
+ m_padding_top_z(expr.xpr().padding_top_z()), m_padding_bottom_z(expr.xpr().padding_bottom_z()), \
+ m_padding_top(expr.xpr().padding_top()), m_padding_bottom(expr.xpr().padding_bottom()), m_padding_left(expr.xpr().padding_left()),\
+ m_padding_right(expr.xpr().padding_right()), m_padding_type(expr.xpr().padding_type()),m_padding_value(expr.xpr().padding_value()){}\
+};
+SYCLEXTRFUNCVOLUMEPATCHOP(const)
+SYCLEXTRFUNCVOLUMEPATCHOP()
+#undef SYCLEXTRFUNCVOLUMEPATCHOP
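Both patch extractors copy every constructor argument of the op (patch sizes, strides, inflation strides and padding) from the host expression so that the node can be rebuilt verbatim inside the kernel. A hedged sketch of the host-side call that supplies those arguments, using default strides and padding:

    // assumes #include <unsupported/Eigen/CXX11/Tensor>
    void image_patch_sketch() {
      Eigen::Tensor<float, 4> input(3, 128, 128, 10);  // channels, rows, cols, batch (ColMajor)
      input.setRandom();
      // extract_image_patches(5, 5) builds a TensorImagePatchOp; its patch, stride and
      // padding parameters are exactly what SYCLEXTRFUNCIMAGEPATCHOP captures above.
      auto patches = input.extract_image_patches(5, 5);
    }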
+
+
/// template deduction function for FunctorExtractor
template <typename Evaluator>
auto inline extractFunctors(const Evaluator& evaluator)-> FunctorExtractor<Evaluator> {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h
index 2f7779036..e5b892f2e 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h
@@ -21,11 +21,12 @@ namespace internal {
template<typename CoeffReturnType, typename OP, typename OutputAccessor, typename InputAccessor, typename LocalAccessor> struct GenericKernelReducer{
OP op;
OutputAccessor aOut;
+ ptrdiff_t out_offset;
InputAccessor aI;
LocalAccessor scratch;
size_t length, local;
- GenericKernelReducer(OP op_, OutputAccessor aOut_, InputAccessor aI_, LocalAccessor scratch_, size_t length_, size_t local_)
- : op(op_), aOut(aOut_), aI(aI_), scratch(scratch_), length(length_), local(local_){}
+ GenericKernelReducer(OP op_, OutputAccessor aOut_, ptrdiff_t out_offset_, InputAccessor aI_, LocalAccessor scratch_, size_t length_, size_t local_)
+ : op(op_), aOut(aOut_), out_offset(out_offset_), aI(aI_), scratch(scratch_), length(length_), local(local_){}
void operator()(cl::sycl::nd_item<1> itemID) {
size_t globalid = itemID.get_global(0);
size_t localid = itemID.get_local(0);
@@ -59,7 +60,7 @@ namespace internal {
aI[itemID.get_group(0)] = scratch[localid];
if((length<=local) && globalid ==0){
auto aOutPtr = ConvertToActualTypeSycl(CoeffReturnType, aOut);
- aOutPtr[0]=scratch[0];
+ aOutPtr[0 + ConvertToActualSyclOffset(CoeffReturnType, out_offset)]=scratch[0];
}
}
}
@@ -71,9 +72,9 @@ namespace internal {
template < typename HostExpr, typename FunctorExpr, typename Tuple_of_Acc, typename Dims, typename Op, typename Index> class ReductionFunctor {
public:
typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
- typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer> write_accessor;
- ReductionFunctor(write_accessor output_accessor_, FunctorExpr functors_, Tuple_of_Acc tuple_of_accessors_,Dims dims_, Op functor_, Index range_, Index)
- :output_accessor(output_accessor_), functors(functors_), tuple_of_accessors(tuple_of_accessors_), dims(dims_), functor(functor_), range(range_) {}
+ typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer> write_accessor;
+ ReductionFunctor(write_accessor output_accessor_, ptrdiff_t out_offset_, FunctorExpr functors_, Tuple_of_Acc tuple_of_accessors_,Dims dims_, Op functor_, Index range_, Index)
+ :output_accessor(output_accessor_), out_offset(out_offset_), functors(functors_), tuple_of_accessors(tuple_of_accessors_), dims(dims_), functor(functor_), range(range_) {}
void operator()(cl::sycl::nd_item<1> itemID) {
typedef typename ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
@@ -84,8 +85,8 @@ template < typename HostExpr, typename FunctorExpr, typename Tuple_of_Acc, typen
const auto device_self_expr= Eigen::TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, functor);
/// This is the evaluator for device_self_expr. This is exactly similar to the self which has been passed to run function. The difference is
/// the device_evaluator is detectable and recognisable on the device.
- typedef Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice> DeviceSelf;
- auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice());
+ typedef Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::SyclKernelDevice> DeviceSelf;
+ auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::SyclKernelDevice>(device_self_expr, Eigen::SyclKernelDevice());
auto output_accessor_ptr =ConvertToActualTypeSycl(typename DeviceSelf::CoeffReturnType, output_accessor);
/// const cast added as a naive solution to solve the qualifier drop error
auto globalid=static_cast<Index>(itemID.get_global_linear_id());
@@ -93,11 +94,12 @@ template < typename HostExpr, typename FunctorExpr, typename Tuple_of_Acc, typen
typename DeviceSelf::CoeffReturnType accum = functor.initialize();
Eigen::internal::GenericDimReducer<DeviceSelf::NumReducedDims-1, DeviceSelf, Op>::reduce(device_self_evaluator, device_self_evaluator.firstInput(static_cast<typename DevExpr::Index>(globalid)),const_cast<Op&>(functor), &accum);
functor.finalize(accum);
- output_accessor_ptr[globalid]= accum;
+ output_accessor_ptr[globalid + ConvertToActualSyclOffset(typename DeviceSelf::CoeffReturnType, out_offset)]= accum;
}
}
private:
write_accessor output_accessor;
+ ptrdiff_t out_offset;
FunctorExpr functors;
Tuple_of_Acc tuple_of_accessors;
Dims dims;
@@ -109,11 +111,11 @@ template < typename HostExpr, typename FunctorExpr, typename Tuple_of_Acc, typen
class ReductionFunctor<HostExpr, FunctorExpr, Tuple_of_Acc, Dims, Eigen::internal::MeanReducer<typename HostExpr::CoeffReturnType>, Index> {
public:
typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
- typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer> write_accessor;
+ typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer> write_accessor;
typedef Eigen::internal::SumReducer<typename HostExpr::CoeffReturnType> Op;
- ReductionFunctor(write_accessor output_accessor_, FunctorExpr functors_, Tuple_of_Acc tuple_of_accessors_,Dims dims_,
+ ReductionFunctor(write_accessor output_accessor_, ptrdiff_t out_offset_, FunctorExpr functors_, Tuple_of_Acc tuple_of_accessors_,Dims dims_,
Eigen::internal::MeanReducer<typename HostExpr::CoeffReturnType>, Index range_, Index num_values_to_reduce_)
- :output_accessor(output_accessor_), functors(functors_), tuple_of_accessors(tuple_of_accessors_), dims(dims_), functor(Op()), range(range_), num_values_to_reduce(num_values_to_reduce_) {}
+ :output_accessor(output_accessor_), out_offset(out_offset_), functors(functors_), tuple_of_accessors(tuple_of_accessors_), dims(dims_), functor(Op()), range(range_), num_values_to_reduce(num_values_to_reduce_) {}
void operator()(cl::sycl::nd_item<1> itemID) {
typedef typename ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
@@ -124,8 +126,8 @@ class ReductionFunctor<HostExpr, FunctorExpr, Tuple_of_Acc, Dims, Eigen::interna
const auto device_self_expr= Eigen::TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, functor);
/// This is the evaluator for device_self_expr. This is exactly similar to the self which has been passed to run function. The difference is
/// the device_evaluator is detectable and recognisable on the device.
- typedef Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice> DeviceSelf;
- auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice());
+ typedef Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::SyclKernelDevice> DeviceSelf;
+ auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::SyclKernelDevice>(device_self_expr, Eigen::SyclKernelDevice());
auto output_accessor_ptr =ConvertToActualTypeSycl(typename DeviceSelf::CoeffReturnType, output_accessor);
/// const cast added as a naive solution to solve the qualifier drop error
auto globalid=static_cast<Index>(itemID.get_global_linear_id());
@@ -133,11 +135,12 @@ class ReductionFunctor<HostExpr, FunctorExpr, Tuple_of_Acc, Dims, Eigen::interna
typename DeviceSelf::CoeffReturnType accum = functor.initialize();
Eigen::internal::GenericDimReducer<DeviceSelf::NumReducedDims-1, DeviceSelf, Op>::reduce(device_self_evaluator, device_self_evaluator.firstInput(static_cast<typename DevExpr::Index>(globalid)),const_cast<Op&>(functor), &accum);
functor.finalize(accum);
- output_accessor_ptr[globalid]= accum/num_values_to_reduce;
+ output_accessor_ptr[globalid+ ConvertToActualSyclOffset(typename DeviceSelf::CoeffReturnType, out_offset)]= accum/num_values_to_reduce;
}
}
private:
write_accessor output_accessor;
+ ptrdiff_t out_offset;
FunctorExpr functors;
Tuple_of_Acc tuple_of_accessors;
Dims dims;
@@ -170,7 +173,7 @@ public:
const auto device_self_expr= Eigen::TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, op);
/// This is the evaluator for device_self_expr. This is exactly similar to the self which has been passed to run function. The difference is
/// the device_evaluator is detectable and recognisable on the device.
- auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice());
+ auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::SyclKernelDevice>(device_self_expr, Eigen::SyclKernelDevice());
/// const cast added as a naive solution to solve the qualifier drop error
auto globalid=itemID.get_global_linear_id();
@@ -217,7 +220,7 @@ public:
const auto device_self_expr= Eigen::TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, op);
/// This is the evaluator for device_self_expr. This is exactly similar to the self which has been passed to run function. The difference is
/// the device_evaluator is detectable and recognisable on the device.
- auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice());
+ auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::SyclKernelDevice>(device_self_expr, Eigen::SyclKernelDevice());
/// const cast added as a naive solution to solve the qualifier drop error
auto globalid=itemID.get_global_linear_id();
auto scale = (rng*red_factor) + remaining;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h
index a1c112f4d..234580c7c 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h
@@ -93,26 +93,58 @@ SYCLFORCEDEVALLEAFCOUNT(const)
SYCLFORCEDEVALLEAFCOUNT()
#undef SYCLFORCEDEVALLEAFCOUNT
+#define SYCLCUSTOMUNARYOPLEAFCOUNT(CVQual)\
+template <typename CustomUnaryFunc, typename XprType>\
+struct LeafCount<CVQual TensorCustomUnaryOp<CustomUnaryFunc, XprType> > {\
+static const size_t Count =1;\
+};
+
+SYCLCUSTOMUNARYOPLEAFCOUNT(const)
+SYCLCUSTOMUNARYOPLEAFCOUNT()
+#undef SYCLCUSTOMUNARYOPLEAFCOUNT
+
+
+#define SYCLCUSTOMBINARYOPLEAFCOUNT(CVQual)\
+template <typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType>\
+struct LeafCount<CVQual TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType> > {\
+static const size_t Count =1;\
+};
+SYCLCUSTOMBINARYOPLEAFCOUNT( const)
+SYCLCUSTOMBINARYOPLEAFCOUNT()
+#undef SYCLCUSTOMBINARYOPLEAFCOUNT
+
/// specialisation of the \ref LeafCount struct when the node type is TensorEvalToOp
-#define EVALTOLEAFCOUNT(CVQual)\
+#define EVALTOLAYOUTSWAPINDEXTUPLELEAFCOUNT(CVQual , ExprNode, Num)\
template <typename Expr>\
-struct LeafCount<CVQual TensorEvalToOp<Expr> > {\
- static const size_t Count = 1 + CategoryCount<Expr>::Count;\
+struct LeafCount<CVQual ExprNode<Expr> > {\
+ static const size_t Count = Num + CategoryCount<Expr>::Count;\
};
-EVALTOLEAFCOUNT(const)
-EVALTOLEAFCOUNT()
-#undef EVALTOLEAFCOUNT
+EVALTOLAYOUTSWAPINDEXTUPLELEAFCOUNT(const, TensorEvalToOp, 1)
+EVALTOLAYOUTSWAPINDEXTUPLELEAFCOUNT(, TensorEvalToOp, 1)
+EVALTOLAYOUTSWAPINDEXTUPLELEAFCOUNT(const, TensorLayoutSwapOp, 0)
+EVALTOLAYOUTSWAPINDEXTUPLELEAFCOUNT(, TensorLayoutSwapOp, 0)
+
+EVALTOLAYOUTSWAPINDEXTUPLELEAFCOUNT(const, TensorIndexTupleOp, 0)
+EVALTOLAYOUTSWAPINDEXTUPLELEAFCOUNT(, TensorIndexTupleOp, 0)
+
+#undef EVALTOLAYOUTSWAPINDEXTUPLELEAFCOUNT
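These counts decide how many accessors the kernel receives: TensorEvalToOp contributes one extra leaf for its output buffer, while TensorLayoutSwapOp and TensorIndexTupleOp are pure wrappers contributing none of their own. For example, with a plain TensorMap counting as one leaf:

    // LeafCount< TensorEvalToOp< TensorLayoutSwapOp< TensorMap<...> > > >::Count
    //   == 1 (EvalTo output) + 0 (LayoutSwap) + 1 (TensorMap leaf) == 2
    // LeafCount< TensorIndexTupleOp< TensorMap<...> > >::Count
    //   == 0 + 1 == 1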
/// specialisation of the \ref LeafCount struct when the node type is const TensorReductionOp
-#define REDUCTIONLEAFCOUNT(CVQual)\
+#define REDUCTIONLEAFCOUNT(CVQual, ExprNode)\
template <typename OP, typename Dim, typename Expr>\
-struct LeafCount<CVQual TensorReductionOp<OP, Dim, Expr> > {\
+struct LeafCount<CVQual ExprNode<OP, Dim, Expr> > {\
static const size_t Count =1;\
};
-REDUCTIONLEAFCOUNT(const)
-REDUCTIONLEAFCOUNT()
+// TensorReductionOp
+REDUCTIONLEAFCOUNT(const,TensorReductionOp)
+REDUCTIONLEAFCOUNT(,TensorReductionOp)
+
+// TensorTupleReducerOp (argmax/argmin)
+REDUCTIONLEAFCOUNT(const, TensorTupleReducerOp)
+REDUCTIONLEAFCOUNT(, TensorTupleReducerOp)
+
#undef REDUCTIONLEAFCOUNT
/// specialisation of the \ref LeafCount struct when the node type is const TensorContractionOp
@@ -128,8 +160,6 @@ CONTRACTIONCONVOLUTIONLEAFCOUNT(const,TensorConvolutionOp)
CONTRACTIONCONVOLUTIONLEAFCOUNT(,TensorConvolutionOp)
#undef CONTRACTIONCONVOLUTIONLEAFCOUNT
-
-
/// specialisation of the \ref LeafCount struct when the node type is TensorSlicingOp
#define SLICEOPLEAFCOUNT(CVQual)\
template <typename StartIndices, typename Sizes, typename XprType>\
@@ -139,7 +169,6 @@ SLICEOPLEAFCOUNT(const)
SLICEOPLEAFCOUNT()
#undef SLICEOPLEAFCOUNT
-
/// specialisation of the \ref LeafCount struct when the node type is TensorChippingOp
#define CHIPPINGOPLEAFCOUNT(CVQual)\
template <DenseIndex DimId, typename XprType>\
@@ -149,7 +178,7 @@ CHIPPINGOPLEAFCOUNT(const)
CHIPPINGOPLEAFCOUNT()
#undef CHIPPINGOPLEAFCOUNT
-
+///TensorStridingSlicingOp
#define SLICESTRIDEOPLEAFCOUNT(CVQual)\
template<typename StartIndices, typename StopIndices, typename Strides, typename XprType>\
struct LeafCount<CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType> >:CategoryCount<XprType>{};
@@ -158,6 +187,24 @@ SLICESTRIDEOPLEAFCOUNT(const)
SLICESTRIDEOPLEAFCOUNT()
#undef SLICESTRIDEOPLEAFCOUNT
+//TensorImagePatchOp
+#define TENSORIMAGEPATCHOPLEAFCOUNT(CVQual)\
+template<DenseIndex Rows, DenseIndex Cols, typename XprType>\
+struct LeafCount<CVQual TensorImagePatchOp<Rows, Cols, XprType> >:CategoryCount<XprType>{};
+
+
+TENSORIMAGEPATCHOPLEAFCOUNT(const)
+TENSORIMAGEPATCHOPLEAFCOUNT()
+#undef TENSORIMAGEPATCHOPLEAFCOUNT
+
+// TensorVolumePatchOp
+#define TENSORVOLUMEPATCHOPLEAFCOUNT(CVQual)\
+template<DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename XprType>\
+struct LeafCount<CVQual TensorVolumePatchOp<Planes, Rows, Cols, XprType> >:CategoryCount<XprType>{};
+
+TENSORVOLUMEPATCHOPLEAFCOUNT(const)
+TENSORVOLUMEPATCHOPLEAFCOUNT()
+#undef TENSORVOLUMEPATCHOPLEAFCOUNT
} /// namespace TensorSycl
} /// namespace internal
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h
index 74566dcee..9d5708fc5 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h
@@ -143,17 +143,52 @@ FORCEDEVAL(const)
FORCEDEVAL()
#undef FORCEDEVAL
+
+/// specialisation of the \ref PlaceHolderExpression when the node is
+/// TensorCustomUnaryOp
+#define CUSTOMUNARYOPEVAL(CVQual)\
+template <typename CustomUnaryFunc, typename XprType, size_t N>\
+struct PlaceHolderExpression<CVQual TensorCustomUnaryOp<CustomUnaryFunc, XprType>, N> {\
+ typedef CVQual PlaceHolder<CVQual TensorCustomUnaryOp<CustomUnaryFunc, XprType>, N> Type;\
+};
+
+CUSTOMUNARYOPEVAL(const)
+CUSTOMUNARYOPEVAL()
+#undef CUSTOMUNARYOPEVAL
+
+
/// specialisation of the \ref PlaceHolderExpression when the node is
-/// TensorEvalToOp
-#define EVALTO(CVQual)\
+/// TensorCustomBinaryOp
+#define CUSTOMBINARYOPEVAL(CVQual)\
+template <typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType, size_t N>\
+struct PlaceHolderExpression<CVQual TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType>, N> {\
+ typedef CVQual PlaceHolder<CVQual TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType>, N> Type;\
+};
+
+CUSTOMBINARYOPEVAL(const)
+CUSTOMBINARYOPEVAL()
+#undef CUSTOMBINARYOPEVAL
+
+
+/// specialisation of the \ref PlaceHolderExpression when the node is
+/// TensorEvalToOp, TensorLayoutSwapOp, and TensorIndexTupleOp
+#define EVALTOLAYOUTSWAPINDEXTUPLE(CVQual, ExprNode)\
template <typename Expr, size_t N>\
-struct PlaceHolderExpression<CVQual TensorEvalToOp<Expr>, N> {\
- typedef CVQual TensorEvalToOp<typename CalculateIndex <N, Expr>::ArgType> Type;\
+struct PlaceHolderExpression<CVQual ExprNode<Expr>, N> {\
+ typedef CVQual ExprNode<typename CalculateIndex <N, Expr>::ArgType> Type;\
};
-EVALTO(const)
-EVALTO()
-#undef EVALTO
+// TensorEvalToOp
+EVALTOLAYOUTSWAPINDEXTUPLE(const, TensorEvalToOp)
+EVALTOLAYOUTSWAPINDEXTUPLE(, TensorEvalToOp)
+//TensorLayoutSwapOp
+EVALTOLAYOUTSWAPINDEXTUPLE(const, TensorLayoutSwapOp)
+EVALTOLAYOUTSWAPINDEXTUPLE(, TensorLayoutSwapOp)
+//TensorIndexTupleOp
+EVALTOLAYOUTSWAPINDEXTUPLE(const, TensorIndexTupleOp)
+EVALTOLAYOUTSWAPINDEXTUPLE(, TensorIndexTupleOp)
+
+#undef EVALTOLAYOUTSWAPINDEXTUPLE
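The placeholder rewrite replaces each leaf of the host expression with a numbered PlaceHolder while wrapper nodes such as these keep their shape; the number identifies the leaf's slot in the accessor tuple built on the host. A rough sketch of the transformation for a single-leaf expression:

    // PlaceHolderExpression<const TensorEvalToOp<const TensorLayoutSwapOp<const TensorMap<T> > >, N>::Type
    //   ~= const TensorEvalToOp<const TensorLayoutSwapOp<
    //          const PlaceHolder<const TensorMap<T>, /*slot derived from N*/> > >
    // Only the TensorMap leaf is replaced; EvalTo and LayoutSwap are reproduced as-is.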
/// specialisation of the \ref PlaceHolderExpression when the node is
@@ -169,17 +204,24 @@ CHIPPINGOP()
#undef CHIPPINGOP
/// specialisation of the \ref PlaceHolderExpression when the node is
-/// TensorReductionOp
-#define SYCLREDUCTION(CVQual)\
+/// TensorReductionOp and TensorTupleReducerOp (Argmax)
+#define SYCLREDUCTION(CVQual, ExprNode)\
template <typename OP, typename Dims, typename Expr, size_t N>\
-struct PlaceHolderExpression<CVQual TensorReductionOp<OP, Dims, Expr>, N>{\
- typedef CVQual PlaceHolder<CVQual TensorReductionOp<OP, Dims,Expr>, N> Type;\
+struct PlaceHolderExpression<CVQual ExprNode<OP, Dims, Expr>, N>{\
+ typedef CVQual PlaceHolder<CVQual ExprNode<OP, Dims,Expr>, N> Type;\
};
-SYCLREDUCTION(const)
-SYCLREDUCTION()
+
+// TensorReductionOp
+SYCLREDUCTION(const, TensorReductionOp)
+SYCLREDUCTION(, TensorReductionOp)
+
+// TensorTupleReducerOp (argmax/argmin)
+SYCLREDUCTION(const, TensorTupleReducerOp)
+SYCLREDUCTION(, TensorTupleReducerOp)
#undef SYCLREDUCTION
+
/// specialisation of the \ref PlaceHolderExpression when the node is
/// TensorReductionOp
#define SYCLCONTRACTIONCONVOLUTIONPLH(CVQual, ExprNode)\
@@ -218,6 +260,34 @@ SYCLSLICESTRIDEOPPLH()
#undef SYCLSLICESTRIDEOPPLH
+
+/// specialisation of the \ref PlaceHolderExpression when the node is
+/// TensorImagePatchOp
+#define SYCLTENSORIMAGEPATCHOP(CVQual)\
+template<DenseIndex Rows, DenseIndex Cols, typename XprType, size_t N>\
+struct PlaceHolderExpression<CVQual TensorImagePatchOp<Rows, Cols, XprType>, N> {\
+ typedef CVQual TensorImagePatchOp<Rows, Cols, typename CalculateIndex <N, XprType>::ArgType> Type;\
+};
+
+SYCLTENSORIMAGEPATCHOP(const)
+SYCLTENSORIMAGEPATCHOP()
+#undef SYCLTENSORIMAGEPATCHOP
+
+
+
+/// specialisation of the \ref PlaceHolderExpression when the node is
+/// TensorVolumePatchOp
+#define SYCLTENSORVOLUMEPATCHOP(CVQual)\
+template<DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename XprType, size_t N>\
+struct PlaceHolderExpression<CVQual TensorVolumePatchOp<Planes,Rows, Cols, XprType>, N> {\
+ typedef CVQual TensorVolumePatchOp<Planes,Rows, Cols, typename CalculateIndex <N, XprType>::ArgType> Type;\
+};
+
+SYCLTENSORVOLUMEPATCHOP(const)
+SYCLTENSORVOLUMEPATCHOP()
+#undef SYCLTENSORVOLUMEPATCHOP
+
+
/// template deduction for \ref PlaceHolderExpression struct
template <typename Expr>
struct createPlaceHolderExpression {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h
index cac785540..29c78184d 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h
@@ -25,7 +25,6 @@
namespace Eigen {
namespace TensorSycl {
-
template<typename Expr, typename FunctorExpr, typename TupleType > struct ExecExprFunctorKernel{
typedef typename internal::createPlaceHolderExpression<Expr>::Type PlaceHolderExpr;
@@ -38,7 +37,7 @@ template<typename Expr, typename FunctorExpr, typename TupleType > struct ExecEx
void operator()(cl::sycl::nd_item<1> itemID) {
typedef typename internal::ConvertToDeviceExpression<Expr>::Type DevExpr;
auto device_expr =internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
- auto device_evaluator = Eigen::TensorEvaluator<decltype(device_expr.expr), Eigen::DefaultDevice>(device_expr.expr, Eigen::DefaultDevice());
+ auto device_evaluator = Eigen::TensorEvaluator<decltype(device_expr.expr), Eigen::SyclKernelDevice>(device_expr.expr, Eigen::SyclKernelDevice());
typename DevExpr::Index gId = static_cast<typename DevExpr::Index>(itemID.get_global_linear_id());
if (gId < range)
device_evaluator.evalScalar(gId);
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h b/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h
new file mode 100644
index 000000000..2b1968de1
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h
@@ -0,0 +1,288 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2017 Gagan Goel <gagan.nith@gmail.com>
+// Copyright (C) 2017 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_TRACE_H
+#define EIGEN_CXX11_TENSOR_TENSOR_TRACE_H
+
+namespace Eigen {
+
+/** \class TensorTrace
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Tensor Trace class.
+ *
+ * Computes the generalized trace: the sum of coefficients whose indices coincide
+ * along a user-supplied set of equally-sized dimensions, which are removed from the result.
+ */
+
+namespace internal {
+template<typename Dims, typename XprType>
+struct traits<TensorTraceOp<Dims, XprType> > : public traits<XprType>
+{
+ typedef typename XprType::Scalar Scalar;
+ typedef traits<XprType> XprTraits;
+ typedef typename XprTraits::StorageKind StorageKind;
+ typedef typename XprTraits::Index Index;
+ typedef typename XprType::Nested Nested;
+ typedef typename remove_reference<Nested>::type _Nested;
+ static const int NumDimensions = XprTraits::NumDimensions - array_size<Dims>::value;
+ static const int Layout = XprTraits::Layout;
+};
+
+template<typename Dims, typename XprType>
+struct eval<TensorTraceOp<Dims, XprType>, Eigen::Dense>
+{
+ typedef const TensorTraceOp<Dims, XprType>& type;
+};
+
+template<typename Dims, typename XprType>
+struct nested<TensorTraceOp<Dims, XprType>, 1, typename eval<TensorTraceOp<Dims, XprType> >::type>
+{
+ typedef TensorTraceOp<Dims, XprType> type;
+};
+
+} // end namespace internal
+
+
+template<typename Dims, typename XprType>
+class TensorTraceOp : public TensorBase<TensorTraceOp<Dims, XprType> >
+{
+ public:
+ typedef typename Eigen::internal::traits<TensorTraceOp>::Scalar Scalar;
+ typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename Eigen::internal::nested<TensorTraceOp>::type Nested;
+ typedef typename Eigen::internal::traits<TensorTraceOp>::StorageKind StorageKind;
+ typedef typename Eigen::internal::traits<TensorTraceOp>::Index Index;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorTraceOp(const XprType& expr, const Dims& dims)
+ : m_xpr(expr), m_dims(dims) {
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const Dims& dims() const { return m_dims; }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const typename internal::remove_all<typename XprType::Nested>::type& expression() const { return m_xpr; }
+
+ protected:
+ typename XprType::Nested m_xpr;
+ const Dims m_dims;
+};
+
+
+// Eval as rvalue
+template<typename Dims, typename ArgType, typename Device>
+struct TensorEvaluator<const TensorTraceOp<Dims, ArgType>, Device>
+{
+ typedef TensorTraceOp<Dims, ArgType> XprType;
+ static const int NumInputDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
+ static const int NumReducedDims = internal::array_size<Dims>::value;
+ static const int NumOutputDims = NumInputDims - NumReducedDims;
+ typedef typename XprType::Index Index;
+ typedef DSizes<Index, NumOutputDims> Dimensions;
+ typedef typename XprType::Scalar Scalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+
+ enum {
+ IsAligned = false,
+ PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+ Layout = TensorEvaluator<ArgType, Device>::Layout,
+ CoordAccess = false,
+ RawAccess = false
+ };
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+ : m_impl(op.expression(), device), m_traceDim(1), m_device(device)
+ {
+
+ EIGEN_STATIC_ASSERT((NumOutputDims >= 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
+ EIGEN_STATIC_ASSERT((NumReducedDims >= 2) || ((NumReducedDims == 0) && (NumInputDims == 0)), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+ for (int i = 0; i < NumInputDims; ++i) {
+ m_reduced[i] = false;
+ }
+
+ const Dims& op_dims = op.dims();
+ for (int i = 0; i < NumReducedDims; ++i) {
+ eigen_assert(op_dims[i] >= 0);
+ eigen_assert(op_dims[i] < NumInputDims);
+ m_reduced[op_dims[i]] = true;
+ }
+
+ // All the dimensions should be distinct to compute the trace
+ int num_distinct_reduce_dims = 0;
+ for (int i = 0; i < NumInputDims; ++i) {
+ if (m_reduced[i]) {
+ ++num_distinct_reduce_dims;
+ }
+ }
+
+ eigen_assert(num_distinct_reduce_dims == NumReducedDims);
+
+ // Compute the dimensions of the result.
+ const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
+
+ int output_index = 0;
+ int reduced_index = 0;
+ for (int i = 0; i < NumInputDims; ++i) {
+ if (m_reduced[i]) {
+ m_reducedDims[reduced_index] = input_dims[i];
+ if (reduced_index > 0) {
+ // All the trace dimensions must have the same size
+ eigen_assert(m_reducedDims[0] == m_reducedDims[reduced_index]);
+ }
+ ++reduced_index;
+ }
+ else {
+ m_dimensions[output_index] = input_dims[i];
+ ++output_index;
+ }
+ }
+
+ if (NumReducedDims != 0) {
+ m_traceDim = m_reducedDims[0];
+ }
+
+ // Compute the output strides
+ if (NumOutputDims > 0) {
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ m_outputStrides[0] = 1;
+ for (int i = 1; i < NumOutputDims; ++i) {
+ m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1];
+ }
+ }
+ else {
+ m_outputStrides.back() = 1;
+ for (int i = NumOutputDims - 2; i >= 0; --i) {
+ m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1];
+ }
+ }
+ }
+
+ // Compute the input strides
+ if (NumInputDims > 0) {
+ array<Index, NumInputDims> input_strides;
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ input_strides[0] = 1;
+ for (int i = 1; i < NumInputDims; ++i) {
+ input_strides[i] = input_strides[i - 1] * input_dims[i - 1];
+ }
+ }
+ else {
+ input_strides.back() = 1;
+ for (int i = NumInputDims - 2; i >= 0; --i) {
+ input_strides[i] = input_strides[i + 1] * input_dims[i + 1];
+ }
+ }
+
+ output_index = 0;
+ reduced_index = 0;
+ for (int i = 0; i < NumInputDims; ++i) {
+ if(m_reduced[i]) {
+ m_reducedStrides[reduced_index] = input_strides[i];
+ ++reduced_index;
+ }
+ else {
+ m_preservedStrides[output_index] = input_strides[i];
+ ++output_index;
+ }
+ }
+ }
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const {
+ return m_dimensions;
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
+ m_impl.evalSubExprsIfNeeded(NULL);
+ return true;
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+ m_impl.cleanup();
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
+ {
+ // Initialize the result
+ CoeffReturnType result = internal::cast<int, CoeffReturnType>(0);
+ Index index_stride = 0;
+ for (int i = 0; i < NumReducedDims; ++i) {
+ index_stride += m_reducedStrides[i];
+ }
+
+ // If trace is requested along all dimensions, starting index would be 0
+ Index cur_index = 0;
+ if (NumOutputDims != 0)
+ cur_index = firstInput(index);
+ for (Index i = 0; i < m_traceDim; ++i) {
+ result += m_impl.coeff(cur_index);
+ cur_index += index_stride;
+ }
+
+ return result;
+ }
+
+ template<int LoadMode>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const {
+
+ EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE);
+ eigen_assert(index + PacketSize - 1 < dimensions().TotalSize());
+
+ EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+ for (int i = 0; i < PacketSize; ++i) {
+ values[i] = coeff(index + i);
+ }
+ PacketReturnType result = internal::ploadt<PacketReturnType, LoadMode>(values);
+ return result;
+ }
+
+ protected:
+ // Given the output index, finds the first index in the input tensor used to compute the trace
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
+ Index startInput = 0;
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ for (int i = NumOutputDims - 1; i > 0; --i) {
+ const Index idx = index / m_outputStrides[i];
+ startInput += idx * m_preservedStrides[i];
+ index -= idx * m_outputStrides[i];
+ }
+ startInput += index * m_preservedStrides[0];
+ }
+ else {
+ for (int i = 0; i < NumOutputDims - 1; ++i) {
+ const Index idx = index / m_outputStrides[i];
+ startInput += idx * m_preservedStrides[i];
+ index -= idx * m_outputStrides[i];
+ }
+ startInput += index * m_preservedStrides[NumOutputDims - 1];
+ }
+ return startInput;
+ }
+
+ Dimensions m_dimensions;
+ TensorEvaluator<ArgType, Device> m_impl;
+ const Device& m_device;
+ array<bool, NumInputDims> m_reduced;
+ array<Index, NumReducedDims> m_reducedDims;
+ // Initialize the size of the trace dimension
+ Index m_traceDim;
+ array<Index, NumOutputDims> m_outputStrides;
+ array<Index, NumReducedDims> m_reducedStrides;
+ array<Index, NumOutputDims> m_preservedStrides;
+};
+
+
+} // End namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_TRACE_H
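Note: coeff() above walks a generalized diagonal; it starts at firstInput(index) and then accumulates m_traceDim coefficients separated by the sum of the reduced strides. A minimal host-side sketch of the same arithmetic for a plain square, column-major rank-2 tensor (illustrative only, not part of the patch):

    #include <unsupported/Eigen/CXX11/Tensor>

    // Hand-rolled equivalent of TensorEvaluator<TensorTraceOp>::coeff(0) when both
    // dimensions of a square rank-2 tensor are reduced: the combined index stride
    // is stride(dim0) + stride(dim1) = 1 + rows for ColMajor storage.
    float trace2d(const Eigen::Tensor<float, 2>& t) {
      eigen_assert(t.dimension(0) == t.dimension(1));   // trace dimensions must match
      const Eigen::Index step = 1 + t.dimension(0);     // step from one diagonal entry to the next
      float result = 0.0f;
      for (Eigen::Index i = 0; i < t.dimension(0); ++i) {
        result += t.data()[i * step];
      }
      return result;
    }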
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h b/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h
index a1e944e59..006b37921 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h
@@ -61,6 +61,7 @@ struct traits<Tensor<Scalar_, NumIndices_, Options_, IndexType_> >
typedef T& RefType;
};
+ typedef typename MakePointer<Scalar>::Type PointerType;
};
@@ -81,6 +82,7 @@ struct traits<TensorFixedSize<Scalar_, Dimensions, Options_, IndexType_> >
typedef T& RefType;
};
+ typedef typename MakePointer<Scalar>::Type PointerType;
};
@@ -105,6 +107,7 @@ struct traits<TensorMap<PlainObjectType, Options_, MakePointer_> >
typedef typename MakePointerT::RefType RefType;
};
+ typedef typename MakePointer<Scalar>::Type PointerType;
};
template<typename PlainObjectType>
@@ -121,6 +124,7 @@ struct traits<TensorRef<PlainObjectType> >
Options = BaseTraits::Options,
Flags = BaseTraits::Flags
};
+ typedef typename BaseTraits::PointerType PointerType;
};
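The PointerType typedefs added to these traits give every tensor expression a single place to look up what data() should return; the evaluator changes elsewhere in this patch (for example TensorVolumePatch below) switch from a hard-coded Scalar* to this typedef. A hedged sketch of the pattern, with placeholder names:

    #include <unsupported/Eigen/CXX11/Tensor>

    // Sketch only: an evaluator-like class that derives its data() return type
    // from the expression's traits instead of assuming a raw Scalar*.
    template <typename XprType>
    struct EvaluatorSketch {
      typedef typename Eigen::internal::traits<XprType>::PointerType PointerType;
      EIGEN_DEVICE_FUNC PointerType data() const { return NULL; }
    };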
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h
index 0ca2cac84..51c099591 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h
@@ -22,6 +22,7 @@ namespace Eigen {
* dimensions.
*/
namespace internal {
+
template<DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename XprType>
struct traits<TensorVolumePatchOp<Planes, Rows, Cols, XprType> > : public traits<XprType>
{
@@ -33,6 +34,8 @@ struct traits<TensorVolumePatchOp<Planes, Rows, Cols, XprType> > : public traits
typedef typename remove_reference<Nested>::type _Nested;
static const int NumDimensions = XprTraits::NumDimensions + 1;
static const int Layout = XprTraits::Layout;
+ typedef typename XprTraits::PointerType PointerType;
+
};
template<DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename XprType>
@@ -65,12 +68,12 @@ class TensorVolumePatchOp : public TensorBase<TensorVolumePatchOp<Planes, Rows,
DenseIndex in_plane_strides, DenseIndex in_row_strides, DenseIndex in_col_strides,
DenseIndex plane_inflate_strides, DenseIndex row_inflate_strides, DenseIndex col_inflate_strides,
PaddingType padding_type, Scalar padding_value)
- : m_xpr(expr), m_patch_planes(patch_planes), m_patch_rows(patch_rows), m_patch_cols(patch_cols),
- m_plane_strides(plane_strides), m_row_strides(row_strides), m_col_strides(col_strides),
- m_in_plane_strides(in_plane_strides), m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides),
- m_plane_inflate_strides(plane_inflate_strides), m_row_inflate_strides(row_inflate_strides), m_col_inflate_strides(col_inflate_strides),
- m_padding_explicit(false), m_padding_top_z(0), m_padding_bottom_z(0), m_padding_top(0), m_padding_bottom(0), m_padding_left(0), m_padding_right(0),
- m_padding_type(padding_type), m_padding_value(padding_value) {}
+ : m_xpr(expr), m_patch_planes(patch_planes), m_patch_rows(patch_rows), m_patch_cols(patch_cols),
+ m_plane_strides(plane_strides), m_row_strides(row_strides), m_col_strides(col_strides),
+ m_in_plane_strides(in_plane_strides), m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides),
+ m_plane_inflate_strides(plane_inflate_strides), m_row_inflate_strides(row_inflate_strides), m_col_inflate_strides(col_inflate_strides),
+ m_padding_explicit(false), m_padding_top_z(0), m_padding_bottom_z(0), m_padding_top(0), m_padding_bottom(0), m_padding_left(0), m_padding_right(0),
+ m_padding_type(padding_type), m_padding_value(padding_value) {}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorVolumePatchOp(const XprType& expr, DenseIndex patch_planes, DenseIndex patch_rows, DenseIndex patch_cols,
DenseIndex plane_strides, DenseIndex row_strides, DenseIndex col_strides,
@@ -80,13 +83,31 @@ class TensorVolumePatchOp : public TensorBase<TensorVolumePatchOp<Planes, Rows,
DenseIndex padding_top, DenseIndex padding_bottom,
DenseIndex padding_left, DenseIndex padding_right,
Scalar padding_value)
- : m_xpr(expr), m_patch_planes(patch_planes), m_patch_rows(patch_rows), m_patch_cols(patch_cols),
- m_plane_strides(plane_strides), m_row_strides(row_strides), m_col_strides(col_strides),
- m_in_plane_strides(in_plane_strides), m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides),
- m_plane_inflate_strides(plane_inflate_strides), m_row_inflate_strides(row_inflate_strides), m_col_inflate_strides(col_inflate_strides),
- m_padding_explicit(true), m_padding_top_z(padding_top_z), m_padding_bottom_z(padding_bottom_z), m_padding_top(padding_top), m_padding_bottom(padding_bottom),
- m_padding_left(padding_left), m_padding_right(padding_right),
- m_padding_type(PADDING_VALID), m_padding_value(padding_value) {}
+ : m_xpr(expr), m_patch_planes(patch_planes), m_patch_rows(patch_rows), m_patch_cols(patch_cols),
+ m_plane_strides(plane_strides), m_row_strides(row_strides), m_col_strides(col_strides),
+ m_in_plane_strides(in_plane_strides), m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides),
+ m_plane_inflate_strides(plane_inflate_strides), m_row_inflate_strides(row_inflate_strides), m_col_inflate_strides(col_inflate_strides),
+ m_padding_explicit(true), m_padding_top_z(padding_top_z), m_padding_bottom_z(padding_bottom_z), m_padding_top(padding_top), m_padding_bottom(padding_bottom),
+ m_padding_left(padding_left), m_padding_right(padding_right),
+ m_padding_type(PADDING_VALID), m_padding_value(padding_value) {}
+
+#ifdef EIGEN_USE_SYCL // this is a workaround for SYCL, as Eigen cannot use the C++11 delegating constructor feature here
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorVolumePatchOp(const XprType& expr, DenseIndex patch_planes, DenseIndex patch_rows, DenseIndex patch_cols,
+ DenseIndex plane_strides, DenseIndex row_strides, DenseIndex col_strides,
+ DenseIndex in_plane_strides, DenseIndex in_row_strides, DenseIndex in_col_strides,
+ DenseIndex plane_inflate_strides, DenseIndex row_inflate_strides, DenseIndex col_inflate_strides,
+ bool padding_explicit, DenseIndex padding_top_z, DenseIndex padding_bottom_z,
+ DenseIndex padding_top, DenseIndex padding_bottom, DenseIndex padding_left,
+ DenseIndex padding_right, PaddingType padding_type, Scalar padding_value)
+ : m_xpr(expr), m_patch_planes(patch_planes), m_patch_rows(patch_rows), m_patch_cols(patch_cols),
+ m_plane_strides(plane_strides), m_row_strides(row_strides), m_col_strides(col_strides),
+ m_in_plane_strides(in_plane_strides), m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides),
+ m_plane_inflate_strides(plane_inflate_strides), m_row_inflate_strides(row_inflate_strides),
+ m_col_inflate_strides(col_inflate_strides), m_padding_explicit(padding_explicit), m_padding_top_z(padding_top_z),
+ m_padding_bottom_z(padding_bottom_z), m_padding_top(padding_top), m_padding_bottom(padding_bottom), m_padding_left(padding_left),
+ m_padding_right(padding_right), m_padding_type(padding_type), m_padding_value(padding_value) {}
+
+#endif
EIGEN_DEVICE_FUNC
DenseIndex patch_planes() const { return m_patch_planes; }
@@ -183,9 +204,16 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D
CoordAccess = false,
RawAccess = false
};
+#ifdef __SYCL_DEVICE_ONLY__
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator( const XprType op, const Device& device)
+#else
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator( const XprType& op, const Device& device)
+#endif
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_impl(op.expression(), device)
+#ifdef EIGEN_USE_SYCL
+ , m_op(op)
+#endif
{
EIGEN_STATIC_ASSERT((NumDims >= 5), YOU_MADE_A_PROGRAMMING_MISTAKE);
@@ -322,6 +350,7 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D
// Fast representations of different variables.
m_fastOtherStride = internal::TensorIntDivisor<Index>(m_otherStride);
+
m_fastPatchStride = internal::TensorIntDivisor<Index>(m_patchStride);
m_fastColStride = internal::TensorIntDivisor<Index>(m_colStride);
m_fastRowStride = internal::TensorIntDivisor<Index>(m_rowStride);
@@ -338,7 +367,6 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D
m_fastOutputDepth = internal::TensorIntDivisor<Index>(m_dimensions[NumDims-1]);
}
}
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
@@ -502,10 +530,15 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D
return TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
}
- EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+ EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; }
const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
+#ifdef EIGEN_USE_SYCL
+ // Required by SYCL in order to construct the expression on the device
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const XprType& xpr() const { return m_op; }
+#endif
+
Index planePaddingTop() const { return m_planePaddingTop; }
Index rowPaddingTop() const { return m_rowPaddingTop; }
Index colPaddingLeft() const { return m_colPaddingLeft; }
@@ -600,6 +633,12 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D
Scalar m_paddingValue;
TensorEvaluator<ArgType, Device> m_impl;
+
+#ifdef EIGEN_USE_SYCL
+// Required by SYCL in order to construct the expression on the device
+ XprType m_op;
+#endif
+
};
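The m_op copy and the xpr() accessor follow the pattern used throughout the SYCL backend: the host-side evaluator keeps the expression object so the device-side expression tree can be reconstructed from it. A simplified sketch with placeholder names:

    // Sketch of the SYCL pattern: store the expression by value and expose it via
    // xpr() so the device-side expression-tree builder can recreate the op.
    template <typename XprType>
    struct SyclEvaluatorSketch {
      explicit SyclEvaluatorSketch(const XprType& op) : m_op(op) {}
      const XprType& xpr() const { return m_op; }
      XprType m_op;  // copied, not referenced, so it survives kernel marshalling
    };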
diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h b/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h
index 9dcc9dab7..1264a0270 100644
--- a/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h
@@ -24,9 +24,9 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
NonBlockingThreadPoolTempl(int num_threads, bool allow_spinning,
Environment env = Environment())
- : num_threads_(num_threads),
+ : env_(env),
+ num_threads_(num_threads),
allow_spinning_(allow_spinning),
- env_(env),
threads_(num_threads),
queues_(num_threads),
coprimes_(num_threads),
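The reorder above presumably makes the mem-initializer list match the members' declaration order: members are always initialized in declaration order regardless of how the list is written, and compilers warn (-Wreorder) when the two disagree. A tiny standalone illustration of the rule:

    // Members initialize in declaration order, not mem-initializer-list order.
    struct Reordered {
      int first;
      int second;
      // Written second-then-first, but 'first' is still constructed first;
      // GCC/Clang emit -Wreorder for the mismatch.
      Reordered() : second(2), first(1) {}
    };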
diff --git a/unsupported/Eigen/CXX11/src/util/EmulateArray.h b/unsupported/Eigen/CXX11/src/util/EmulateArray.h
index 573ca435a..96b3a8261 100644
--- a/unsupported/Eigen/CXX11/src/util/EmulateArray.h
+++ b/unsupported/Eigen/CXX11/src/util/EmulateArray.h
@@ -15,7 +15,7 @@
// The array class is only available starting with cxx11. Emulate our own here
// if needed. Beware, msvc still doesn't advertise itself as a c++11 compiler!
// Moreover, CUDA doesn't support the STL containers, so we use our own instead.
-#if (__cplusplus <= 199711L && EIGEN_COMP_MSVC < 1900) || defined(__CUDACC__) || defined(EIGEN_AVOID_STL_ARRAY)
+#if (__cplusplus <= 199711L && EIGEN_COMP_MSVC < 1900) || defined(EIGEN_CUDACC) || defined(EIGEN_AVOID_STL_ARRAY)
namespace Eigen {
template <typename T, size_t n> class array {
diff --git a/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h b/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h
index d2808860c..279fe5cd3 100755
--- a/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h
+++ b/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h
@@ -683,4 +683,11 @@ template<typename DerType> struct NumTraits<AutoDiffScalar<DerType> >
}
+namespace std {
+template <typename T>
+class numeric_limits<Eigen::AutoDiffScalar<T> >
+ : public numeric_limits<typename T::Scalar> {};
+
+} // namespace std
+
#endif // EIGEN_AUTODIFF_SCALAR_H
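The std::numeric_limits specialization above lets generic code query limits of an AutoDiffScalar and receive the underlying scalar's values (this is what the new check_limits_specialization test further down verifies via std::is_base_of). A hedged usage sketch:

    #include <limits>
    #include <unsupported/Eigen/AutoDiff>

    typedef Eigen::AutoDiffScalar<Eigen::VectorXd> AD;

    // Forwarded from numeric_limits<double> thanks to the specialization above.
    static const bool specialized = std::numeric_limits<AD>::is_specialized;
    static const double ad_epsilon = std::numeric_limits<AD>::epsilon();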
diff --git a/unsupported/Eigen/src/EulerAngles/EulerAngles.h b/unsupported/Eigen/src/EulerAngles/EulerAngles.h
index a5d034d71..e43cdb7fb 100644
--- a/unsupported/Eigen/src/EulerAngles/EulerAngles.h
+++ b/unsupported/Eigen/src/EulerAngles/EulerAngles.h
@@ -341,7 +341,7 @@ EIGEN_EULER_ANGLES_TYPEDEFS(double, d)
// set from a vector of Euler angles
template<class System, class Other>
- struct eulerangles_assign_impl<System,Other,4,1>
+ struct eulerangles_assign_impl<System,Other,3,1>
{
typedef typename Other::Scalar Scalar;
static void run(EulerAngles<Scalar, System>& e, const Other& vec)
diff --git a/unsupported/Eigen/src/IterativeSolvers/Scaling.h b/unsupported/Eigen/src/IterativeSolvers/Scaling.h
index d113e6e90..9b3eb53e0 100644
--- a/unsupported/Eigen/src/IterativeSolvers/Scaling.h
+++ b/unsupported/Eigen/src/IterativeSolvers/Scaling.h
@@ -104,12 +104,18 @@ class IterScaling
for (int i = 0; i < m; ++i)
{
Dr(i) = std::sqrt(Dr(i));
+ }
+ for (int i = 0; i < n; ++i)
+ {
Dc(i) = std::sqrt(Dc(i));
}
// Save the scaling factors
for (int i = 0; i < m; ++i)
{
m_left(i) /= Dr(i);
+ }
+ for (int i = 0; i < n; ++i)
+ {
m_right(i) /= Dc(i);
}
// Scale the rows and the columns of the matrix
diff --git a/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h b/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h
index bb6d9e1fe..85ab3d97c 100644
--- a/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h
+++ b/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h
@@ -326,6 +326,7 @@ struct matrix_exp_computeUV<MatrixType, long double>
} else if (l1norm < 1.125358383453143065081397882891878e+000L) {
matrix_exp_pade13(arg, U, V);
} else {
+ const long double maxnorm = 2.884233277829519311757165057717815L;
frexp(l1norm / maxnorm, &squarings);
if (squarings < 0) squarings = 0;
MatrixType A = arg.unaryExpr(MatrixExponentialScalingOp<long double>(squarings));
@@ -342,6 +343,27 @@ struct matrix_exp_computeUV<MatrixType, long double>
}
};
+template<typename T> struct is_exp_known_type : false_type {};
+template<> struct is_exp_known_type<float> : true_type {};
+template<> struct is_exp_known_type<double> : true_type {};
+#if LDBL_MANT_DIG <= 112
+template<> struct is_exp_known_type<long double> : true_type {};
+#endif
+
+template <typename ArgType, typename ResultType>
+void matrix_exp_compute(const ArgType& arg, ResultType &result, true_type) // natively supported scalar type
+{
+ typedef typename ArgType::PlainObject MatrixType;
+ MatrixType U, V;
+ int squarings;
+ matrix_exp_computeUV<MatrixType>::run(arg, U, V, squarings); // Pade approximant is (U+V) / (-U+V)
+ MatrixType numer = U + V;
+ MatrixType denom = -U + V;
+ result = denom.partialPivLu().solve(numer);
+ for (int i=0; i<squarings; i++)
+ result *= result; // undo scaling by repeated squaring
+}
+
/* Computes the matrix exponential
*
@@ -349,26 +371,13 @@ struct matrix_exp_computeUV<MatrixType, long double>
* \param result variable in which result will be stored
*/
template <typename ArgType, typename ResultType>
-void matrix_exp_compute(const ArgType& arg, ResultType &result)
+void matrix_exp_compute(const ArgType& arg, ResultType &result, false_type) // default
{
typedef typename ArgType::PlainObject MatrixType;
-#if LDBL_MANT_DIG > 112 // rarely happens
typedef typename traits<MatrixType>::Scalar Scalar;
typedef typename NumTraits<Scalar>::Real RealScalar;
typedef typename std::complex<RealScalar> ComplexScalar;
- if (sizeof(RealScalar) > 14) {
- result = arg.matrixFunction(internal::stem_function_exp<ComplexScalar>);
- return;
- }
-#endif
- MatrixType U, V;
- int squarings;
- matrix_exp_computeUV<MatrixType>::run(arg, U, V, squarings); // Pade approximant is (U+V) / (-U+V)
- MatrixType numer = U + V;
- MatrixType denom = -U + V;
- result = denom.partialPivLu().solve(numer);
- for (int i=0; i<squarings; i++)
- result *= result; // undo scaling by repeated squaring
+ result = arg.matrixFunction(internal::stem_function_exp<ComplexScalar>);
}
} // end namespace Eigen::internal
@@ -402,7 +411,7 @@ template<typename Derived> struct MatrixExponentialReturnValue
inline void evalTo(ResultType& result) const
{
const typename internal::nested_eval<Derived, 10>::type tmp(m_src);
- internal::matrix_exp_compute(tmp, result);
+ internal::matrix_exp_compute(tmp, result, internal::is_exp_known_type<typename Derived::Scalar>());
}
Index rows() const { return m_src.rows(); }
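evalTo() now selects between the two matrix_exp_compute overloads at compile time by passing an is_exp_known_type tag: float, double, and long double with at most 112 mantissa bits take the Pade-plus-squaring path, everything else falls back to the generic matrixFunction() route. A stripped-down sketch of the dispatch, with hypothetical names:

    #include <Eigen/Core>

    // Hypothetical names; mirrors the true_type/false_type dispatch used above.
    template <typename T> struct has_fast_path : Eigen::internal::false_type {};
    template <> struct has_fast_path<double>   : Eigen::internal::true_type {};

    template <typename Arg>
    void compute(const Arg&, Eigen::internal::true_type)  { /* Pade approximant + repeated squaring */ }
    template <typename Arg>
    void compute(const Arg&, Eigen::internal::false_type) { /* generic stem-function fallback */ }

    template <typename Arg>
    void compute(const Arg& a) {
      compute(a, has_fast_path<typename Arg::Scalar>());  // overload picked at compile time
    }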
diff --git a/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h b/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h
index db2449d02..ef50c46a9 100644
--- a/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h
+++ b/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h
@@ -398,8 +398,8 @@ struct matrix_function_compute
template <typename MatrixType>
struct matrix_function_compute<MatrixType, 0>
{
- template <typename AtomicType, typename ResultType>
- static void run(const MatrixType& A, AtomicType& atomic, ResultType &result)
+ template <typename MatA, typename AtomicType, typename ResultType>
+ static void run(const MatA& A, AtomicType& atomic, ResultType &result)
{
typedef internal::traits<MatrixType> Traits;
typedef typename Traits::Scalar Scalar;
@@ -422,14 +422,14 @@ struct matrix_function_compute<MatrixType, 0>
template <typename MatrixType>
struct matrix_function_compute<MatrixType, 1>
{
- template <typename AtomicType, typename ResultType>
- static void run(const MatrixType& A, AtomicType& atomic, ResultType &result)
+ template <typename MatA, typename AtomicType, typename ResultType>
+ static void run(const MatA& A, AtomicType& atomic, ResultType &result)
{
typedef internal::traits<MatrixType> Traits;
- typedef typename MatrixType::Index Index;
// compute Schur decomposition of A
- const ComplexSchur<MatrixType> schurOfA(A);
+ const ComplexSchur<MatrixType> schurOfA(A);
+ eigen_assert(schurOfA.info()==Success);
MatrixType T = schurOfA.matrixT();
MatrixType U = schurOfA.matrixU();
@@ -514,7 +514,7 @@ template<typename Derived> class MatrixFunctionReturnValue
typedef internal::MatrixFunctionAtomic<DynMatrixType> AtomicType;
AtomicType atomic(m_f);
- internal::matrix_function_compute<NestedEvalTypeClean>::run(m_A, atomic, result);
+ internal::matrix_function_compute<typename NestedEvalTypeClean::PlainObject>::run(m_A, atomic, result);
}
Index rows() const { return m_A.rows(); }
diff --git a/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h b/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h
index 1acfbed9e..ff8f6e732 100644
--- a/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h
+++ b/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h
@@ -339,7 +339,7 @@ public:
typedef internal::MatrixLogarithmAtomic<DynMatrixType> AtomicType;
AtomicType atomic;
- internal::matrix_function_compute<DerivedEvalTypeClean>::run(m_A, atomic, result);
+ internal::matrix_function_compute<typename DerivedEvalTypeClean::PlainObject>::run(m_A, atomic, result);
}
Index rows() const { return m_A.rows(); }
diff --git a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h
index 369ad97b4..5d1b8fcc2 100644
--- a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h
+++ b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h
@@ -121,7 +121,7 @@ template <>
struct lgamma_impl<float> {
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE float run(float x) {
-#if !defined(__CUDA_ARCH__) && (defined(_BSD_SOURCE) || defined(_SVID_SOURCE)) && !defined(__APPLE__)
+#if !defined(EIGEN_CUDA_ARCH) && (defined(_BSD_SOURCE) || defined(_SVID_SOURCE)) && !defined(__APPLE__)
int dummy;
return ::lgammaf_r(x, &dummy);
#else
@@ -134,7 +134,7 @@ template <>
struct lgamma_impl<double> {
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE double run(double x) {
-#if !defined(__CUDA_ARCH__) && (defined(_BSD_SOURCE) || defined(_SVID_SOURCE)) && !defined(__APPLE__)
+#if !defined(EIGEN_CUDA_ARCH) && (defined(_BSD_SOURCE) || defined(_SVID_SOURCE)) && !defined(__APPLE__)
int dummy;
return ::lgamma_r(x, &dummy);
#else
diff --git a/unsupported/Eigen/src/SpecialFunctions/arch/CUDA/CudaSpecialFunctions.h b/unsupported/Eigen/src/SpecialFunctions/arch/CUDA/CudaSpecialFunctions.h
index ec4fa8448..e0e3a8be6 100644
--- a/unsupported/Eigen/src/SpecialFunctions/arch/CUDA/CudaSpecialFunctions.h
+++ b/unsupported/Eigen/src/SpecialFunctions/arch/CUDA/CudaSpecialFunctions.h
@@ -17,7 +17,7 @@ namespace internal {
// Make sure this is only available when targeting a GPU: we don't want to
// introduce conflicts between these packet_traits definitions and the ones
// we'll use on the host side (SSE, AVX, ...)
-#if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
+#if defined(EIGEN_CUDACC) && defined(EIGEN_USE_GPU)
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
float4 plgamma<float4>(const float4& a)
diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt
index 003c9de0b..22647cadd 100644
--- a/unsupported/test/CMakeLists.txt
+++ b/unsupported/test/CMakeLists.txt
@@ -152,25 +152,40 @@ endif()
if(EIGEN_TEST_CXX11)
if(EIGEN_TEST_SYCL)
- ei_add_test_sycl(cxx11_tensor_sycl "-std=c++11")
- ei_add_test_sycl(cxx11_tensor_forced_eval_sycl "-std=c++11")
- ei_add_test_sycl(cxx11_tensor_broadcast_sycl "-std=c++11")
- ei_add_test_sycl(cxx11_tensor_device_sycl "-std=c++11")
- ei_add_test_sycl(cxx11_tensor_reduction_sycl "-std=c++11")
- ei_add_test_sycl(cxx11_tensor_morphing_sycl "-std=c++11")
- ei_add_test_sycl(cxx11_tensor_shuffling_sycl "-std=c++11")
- ei_add_test_sycl(cxx11_tensor_padding_sycl "-std=c++11")
- ei_add_test_sycl(cxx11_tensor_builtins_sycl "-std=c++11")
- ei_add_test_sycl(cxx11_tensor_contract_sycl "-std=c++11")
- ei_add_test_sycl(cxx11_tensor_concatenation_sycl "-std=c++11")
- ei_add_test_sycl(cxx11_tensor_reverse_sycl "-std=c++11")
- ei_add_test_sycl(cxx11_tensor_convolution_sycl "-std=c++11")
- ei_add_test_sycl(cxx11_tensor_striding_sycl "-std=c++11")
- ei_add_test_sycl(cxx11_tensor_chipping_sycl "-std=c++11")
+ if(EIGEN_SYCL_TRISYCL)
+ set(CMAKE_CXX_STANDARD 14)
+ set(STD_CXX_FLAG "-std=c++1z")
+ else(EIGEN_SYCL_TRISYCL)
+ # It should be safe to always run these tests as there is some fallback code for
+    # older compilers that don't support C++11.
+ set(CMAKE_CXX_STANDARD 11)
+ set(STD_CXX_FLAG "-std=c++11")
+ endif(EIGEN_SYCL_TRISYCL)
+
+ ei_add_test_sycl(cxx11_tensor_sycl ${STD_CXX_FLAG})
+ ei_add_test_sycl(cxx11_tensor_forced_eval_sycl ${STD_CXX_FLAG})
+ ei_add_test_sycl(cxx11_tensor_broadcast_sycl ${STD_CXX_FLAG})
+ ei_add_test_sycl(cxx11_tensor_device_sycl ${STD_CXX_FLAG})
+ ei_add_test_sycl(cxx11_tensor_reduction_sycl ${STD_CXX_FLAG})
+ ei_add_test_sycl(cxx11_tensor_morphing_sycl ${STD_CXX_FLAG})
+ ei_add_test_sycl(cxx11_tensor_shuffling_sycl ${STD_CXX_FLAG})
+ ei_add_test_sycl(cxx11_tensor_padding_sycl ${STD_CXX_FLAG})
+ ei_add_test_sycl(cxx11_tensor_builtins_sycl ${STD_CXX_FLAG})
+ ei_add_test_sycl(cxx11_tensor_contract_sycl ${STD_CXX_FLAG})
+ ei_add_test_sycl(cxx11_tensor_concatenation_sycl ${STD_CXX_FLAG})
+ ei_add_test_sycl(cxx11_tensor_reverse_sycl ${STD_CXX_FLAG})
+ ei_add_test_sycl(cxx11_tensor_convolution_sycl ${STD_CXX_FLAG})
+ ei_add_test_sycl(cxx11_tensor_striding_sycl ${STD_CXX_FLAG})
+ ei_add_test_sycl(cxx11_tensor_chipping_sycl ${STD_CXX_FLAG})
+ ei_add_test_sycl(cxx11_tensor_layout_swap_sycl ${STD_CXX_FLAG})
+ ei_add_test_sycl(cxx11_tensor_inflation_sycl ${STD_CXX_FLAG})
+ ei_add_test_sycl(cxx11_tensor_generator_sycl ${STD_CXX_FLAG})
+ ei_add_test_sycl(cxx11_tensor_patch_sycl ${STD_CXX_FLAG})
+ ei_add_test_sycl(cxx11_tensor_image_patch_sycl ${STD_CXX_FLAG})
+ ei_add_test_sycl(cxx11_tensor_volume_patch_sycl ${STD_CXX_FLAG})
+ ei_add_test_sycl(cxx11_tensor_argmax_sycl ${STD_CXX_FLAG})
+ ei_add_test_sycl(cxx11_tensor_custom_op_sycl ${STD_CXX_FLAG})
endif(EIGEN_TEST_SYCL)
- # It should be safe to always run these tests as there is some fallback code for
- # older compiler that don't support cxx11.
- set(CMAKE_CXX_STANDARD 11)
ei_add_test(cxx11_eventcount "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
ei_add_test(cxx11_runqueue "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
@@ -212,6 +227,7 @@ if(EIGEN_TEST_CXX11)
ei_add_test(cxx11_tensor_fft)
ei_add_test(cxx11_tensor_ifft)
ei_add_test(cxx11_tensor_scan)
+ ei_add_test(cxx11_tensor_trace)
endif()
diff --git a/unsupported/test/EulerAngles.cpp b/unsupported/test/EulerAngles.cpp
index 79ee72847..500fb2d17 100644
--- a/unsupported/test/EulerAngles.cpp
+++ b/unsupported/test/EulerAngles.cpp
@@ -278,6 +278,9 @@ void test_EulerAngles()
EulerAnglesXYZd onesEd(1, 1, 1);
EulerAnglesXYZf onesEf = onesEd.cast<float>();
VERIFY_IS_APPROX(onesEd, onesEf.cast<double>());
+
+ // Simple Construction from Vector3 test
+ VERIFY_IS_APPROX(onesEd, EulerAnglesXYZd(Vector3d::Ones()));
CALL_SUBTEST_1( eulerangles_manual<float>() );
CALL_SUBTEST_2( eulerangles_manual<double>() );
diff --git a/unsupported/test/autodiff_scalar.cpp b/unsupported/test/autodiff_scalar.cpp
index 4df2f5c57..9cf11280c 100644
--- a/unsupported/test/autodiff_scalar.cpp
+++ b/unsupported/test/autodiff_scalar.cpp
@@ -72,6 +72,20 @@ template<typename Scalar> void check_hyperbolic_functions()
VERIFY_IS_APPROX(res3.derivatives().x(), Scalar(0.339540557256150));
}
+template <typename Scalar>
+void check_limits_specialization()
+{
+ typedef Eigen::Matrix<Scalar, 1, 1> Deriv;
+ typedef Eigen::AutoDiffScalar<Deriv> AD;
+
+ typedef std::numeric_limits<AD> A;
+ typedef std::numeric_limits<Scalar> B;
+
+#if EIGEN_HAS_CXX11
+ VERIFY(bool(std::is_base_of<B, A>::value));
+#endif
+}
+
void test_autodiff_scalar()
{
for(int i = 0; i < g_repeat; i++) {
@@ -79,5 +93,6 @@ void test_autodiff_scalar()
CALL_SUBTEST_2( check_atan2<double>() );
CALL_SUBTEST_3( check_hyperbolic_functions<float>() );
CALL_SUBTEST_4( check_hyperbolic_functions<double>() );
+ CALL_SUBTEST_5( check_limits_specialization<double>());
}
}
diff --git a/unsupported/test/cxx11_tensor_argmax_cuda.cu b/unsupported/test/cxx11_tensor_argmax_cuda.cu
index 653443dc5..3d73d491a 100644
--- a/unsupported/test/cxx11_tensor_argmax_cuda.cu
+++ b/unsupported/test/cxx11_tensor_argmax_cuda.cu
@@ -12,9 +12,6 @@
#define EIGEN_TEST_FUNC cxx11_tensor_cuda
#define EIGEN_USE_GPU
-#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
-#include <cuda_fp16.h>
-#endif
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>
diff --git a/unsupported/test/cxx11_tensor_argmax_sycl.cpp b/unsupported/test/cxx11_tensor_argmax_sycl.cpp
new file mode 100644
index 000000000..521a7f82c
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_argmax_sycl.cpp
@@ -0,0 +1,245 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli Codeplay Software Ltd.
+// Ralph Potter Codeplay Software Ltd.
+// Luke Iwanski Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+#define EIGEN_TEST_FUNC cxx11_tensor_argmax_sycl
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::array;
+using Eigen::SyclDevice;
+using Eigen::Tensor;
+using Eigen::TensorMap;
+
+template <typename DataType, int Layout, typename DenseIndex>
+static void test_sycl_simple_argmax(const Eigen::SyclDevice &sycl_device){
+
+ Tensor<DataType, 3, Layout, DenseIndex> in(Eigen::array<DenseIndex, 3>{{2,2,2}});
+ Tensor<DenseIndex, 0, Layout, DenseIndex> out_max;
+ Tensor<DenseIndex, 0, Layout, DenseIndex> out_min;
+ in.setRandom();
+ in *= in.constant(100.0);
+ in(0, 0, 0) = -1000.0;
+ in(1, 1, 1) = 1000.0;
+
+ std::size_t in_bytes = in.size() * sizeof(DataType);
+ std::size_t out_bytes = out_max.size() * sizeof(DenseIndex);
+
+ DataType * d_in = static_cast<DataType*>(sycl_device.allocate(in_bytes));
+ DenseIndex* d_out_max = static_cast<DenseIndex*>(sycl_device.allocate(out_bytes));
+ DenseIndex* d_out_min = static_cast<DenseIndex*>(sycl_device.allocate(out_bytes));
+
+ Eigen::TensorMap<Eigen::Tensor<DataType, 3, Layout, DenseIndex> > gpu_in(d_in, Eigen::array<DenseIndex, 3>{{2,2,2}});
+ Eigen::TensorMap<Eigen::Tensor<DenseIndex, 0, Layout, DenseIndex> > gpu_out_max(d_out_max);
+ Eigen::TensorMap<Eigen::Tensor<DenseIndex, 0, Layout, DenseIndex> > gpu_out_min(d_out_min);
+ sycl_device.memcpyHostToDevice(d_in, in.data(),in_bytes);
+
+ gpu_out_max.device(sycl_device) = gpu_in.argmax();
+ gpu_out_min.device(sycl_device) = gpu_in.argmin();
+
+ sycl_device.memcpyDeviceToHost(out_max.data(), d_out_max, out_bytes);
+ sycl_device.memcpyDeviceToHost(out_min.data(), d_out_min, out_bytes);
+
+ VERIFY_IS_EQUAL(out_max(), 2*2*2 - 1);
+ VERIFY_IS_EQUAL(out_min(), 0);
+
+ sycl_device.deallocate(d_in);
+ sycl_device.deallocate(d_out_max);
+ sycl_device.deallocate(d_out_min);
+}
+
+
+template <typename DataType, int DataLayout, typename DenseIndex>
+static void test_sycl_argmax_dim(const Eigen::SyclDevice &sycl_device)
+{
+ DenseIndex sizeDim0=9;
+ DenseIndex sizeDim1=3;
+ DenseIndex sizeDim2=5;
+ DenseIndex sizeDim3=7;
+ Tensor<DataType, 4, DataLayout, DenseIndex> tensor(sizeDim0,sizeDim1,sizeDim2,sizeDim3);
+
+ std::vector<DenseIndex> dims;
+ dims.push_back(sizeDim0); dims.push_back(sizeDim1); dims.push_back(sizeDim2); dims.push_back(sizeDim3);
+ for (DenseIndex dim = 0; dim < 4; ++dim) {
+
+ array<DenseIndex, 3> out_shape;
+ for (DenseIndex d = 0; d < 3; ++d) out_shape[d] = (d < dim) ? dims[d] : dims[d+1];
+
+ Tensor<DenseIndex, 3, DataLayout, DenseIndex> tensor_arg(out_shape);
+
+ array<DenseIndex, 4> ix;
+ for (DenseIndex i = 0; i < sizeDim0; ++i) {
+ for (DenseIndex j = 0; j < sizeDim1; ++j) {
+ for (DenseIndex k = 0; k < sizeDim2; ++k) {
+ for (DenseIndex l = 0; l < sizeDim3; ++l) {
+ ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l;
+ // suppose dim == 1, then for all i, k, l, set tensor(i, 0, k, l) = 10.0
+ tensor(ix)=(ix[dim] != 0)?-1.0:10.0;
+ }
+ }
+ }
+ }
+
+ std::size_t in_bytes = tensor.size() * sizeof(DataType);
+ std::size_t out_bytes = tensor_arg.size() * sizeof(DenseIndex);
+
+
+ DataType * d_in = static_cast<DataType*>(sycl_device.allocate(in_bytes));
+ DenseIndex* d_out= static_cast<DenseIndex*>(sycl_device.allocate(out_bytes));
+
+ Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, DenseIndex> > gpu_in(d_in, Eigen::array<DenseIndex, 4>{{sizeDim0,sizeDim1,sizeDim2,sizeDim3}});
+ Eigen::TensorMap<Eigen::Tensor<DenseIndex, 3, DataLayout, DenseIndex> > gpu_out(d_out, out_shape);
+
+ sycl_device.memcpyHostToDevice(d_in, tensor.data(),in_bytes);
+ gpu_out.device(sycl_device) = gpu_in.argmax(dim);
+ sycl_device.memcpyDeviceToHost(tensor_arg.data(), d_out, out_bytes);
+
+ VERIFY_IS_EQUAL(static_cast<size_t>(tensor_arg.size()),
+ size_t(sizeDim0*sizeDim1*sizeDim2*sizeDim3 / tensor.dimension(dim)));
+
+ for (DenseIndex n = 0; n < tensor_arg.size(); ++n) {
+ // Expect max to be in the first index of the reduced dimension
+ VERIFY_IS_EQUAL(tensor_arg.data()[n], 0);
+ }
+
+ sycl_device.synchronize();
+
+ for (DenseIndex i = 0; i < sizeDim0; ++i) {
+ for (DenseIndex j = 0; j < sizeDim1; ++j) {
+ for (DenseIndex k = 0; k < sizeDim2; ++k) {
+ for (DenseIndex l = 0; l < sizeDim3; ++l) {
+ ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l;
+ // suppose dim == 1, then for all i, k, l, set tensor(i, 2, k, l) = 20.0
+ tensor(ix)=(ix[dim] != tensor.dimension(dim) - 1)?-1.0:20.0;
+ }
+ }
+ }
+ }
+
+ sycl_device.memcpyHostToDevice(d_in, tensor.data(),in_bytes);
+ gpu_out.device(sycl_device) = gpu_in.argmax(dim);
+ sycl_device.memcpyDeviceToHost(tensor_arg.data(), d_out, out_bytes);
+
+ for (DenseIndex n = 0; n < tensor_arg.size(); ++n) {
+ // Expect max to be in the last index of the reduced dimension
+ VERIFY_IS_EQUAL(tensor_arg.data()[n], tensor.dimension(dim) - 1);
+ }
+ sycl_device.deallocate(d_in);
+ sycl_device.deallocate(d_out);
+ }
+}
+
+template <typename DataType, int DataLayout, typename DenseIndex>
+static void test_sycl_argmin_dim(const Eigen::SyclDevice &sycl_device)
+{
+ DenseIndex sizeDim0=9;
+ DenseIndex sizeDim1=3;
+ DenseIndex sizeDim2=5;
+ DenseIndex sizeDim3=7;
+ Tensor<DataType, 4, DataLayout, DenseIndex> tensor(sizeDim0,sizeDim1,sizeDim2,sizeDim3);
+
+ std::vector<DenseIndex> dims;
+ dims.push_back(sizeDim0); dims.push_back(sizeDim1); dims.push_back(sizeDim2); dims.push_back(sizeDim3);
+ for (DenseIndex dim = 0; dim < 4; ++dim) {
+
+ array<DenseIndex, 3> out_shape;
+ for (DenseIndex d = 0; d < 3; ++d) out_shape[d] = (d < dim) ? dims[d] : dims[d+1];
+
+ Tensor<DenseIndex, 3, DataLayout, DenseIndex> tensor_arg(out_shape);
+
+ array<DenseIndex, 4> ix;
+ for (DenseIndex i = 0; i < sizeDim0; ++i) {
+ for (DenseIndex j = 0; j < sizeDim1; ++j) {
+ for (DenseIndex k = 0; k < sizeDim2; ++k) {
+ for (DenseIndex l = 0; l < sizeDim3; ++l) {
+ ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l;
+          // suppose dim == 1, then for all i, k, l, set tensor(i, 0, k, l) = -10.0
+ tensor(ix)=(ix[dim] != 0)?1.0:-10.0;
+ }
+ }
+ }
+ }
+
+ std::size_t in_bytes = tensor.size() * sizeof(DataType);
+ std::size_t out_bytes = tensor_arg.size() * sizeof(DenseIndex);
+
+
+ DataType * d_in = static_cast<DataType*>(sycl_device.allocate(in_bytes));
+ DenseIndex* d_out= static_cast<DenseIndex*>(sycl_device.allocate(out_bytes));
+
+ Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, DenseIndex> > gpu_in(d_in, Eigen::array<DenseIndex, 4>{{sizeDim0,sizeDim1,sizeDim2,sizeDim3}});
+ Eigen::TensorMap<Eigen::Tensor<DenseIndex, 3, DataLayout, DenseIndex> > gpu_out(d_out, out_shape);
+
+ sycl_device.memcpyHostToDevice(d_in, tensor.data(),in_bytes);
+ gpu_out.device(sycl_device) = gpu_in.argmin(dim);
+ sycl_device.memcpyDeviceToHost(tensor_arg.data(), d_out, out_bytes);
+
+ VERIFY_IS_EQUAL(static_cast<size_t>(tensor_arg.size()),
+ size_t(sizeDim0*sizeDim1*sizeDim2*sizeDim3 / tensor.dimension(dim)));
+
+ for (DenseIndex n = 0; n < tensor_arg.size(); ++n) {
+    // Expect min to be in the first index of the reduced dimension
+ VERIFY_IS_EQUAL(tensor_arg.data()[n], 0);
+ }
+
+ sycl_device.synchronize();
+
+ for (DenseIndex i = 0; i < sizeDim0; ++i) {
+ for (DenseIndex j = 0; j < sizeDim1; ++j) {
+ for (DenseIndex k = 0; k < sizeDim2; ++k) {
+ for (DenseIndex l = 0; l < sizeDim3; ++l) {
+ ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l;
+          // suppose dim == 1, then for all i, k, l, set tensor(i, 2, k, l) = -20.0
+ tensor(ix)=(ix[dim] != tensor.dimension(dim) - 1)?1.0:-20.0;
+ }
+ }
+ }
+ }
+
+ sycl_device.memcpyHostToDevice(d_in, tensor.data(),in_bytes);
+ gpu_out.device(sycl_device) = gpu_in.argmin(dim);
+ sycl_device.memcpyDeviceToHost(tensor_arg.data(), d_out, out_bytes);
+
+ for (DenseIndex n = 0; n < tensor_arg.size(); ++n) {
+    // Expect min to be in the last index of the reduced dimension
+ VERIFY_IS_EQUAL(tensor_arg.data()[n], tensor.dimension(dim) - 1);
+ }
+ sycl_device.deallocate(d_in);
+ sycl_device.deallocate(d_out);
+ }
+}
+
+
+
+
+template<typename DataType, typename Device_Selector> void sycl_argmax_test_per_device(const Device_Selector& d){
+ QueueInterface queueInterface(d);
+ auto sycl_device = Eigen::SyclDevice(&queueInterface);
+ test_sycl_simple_argmax<DataType, RowMajor, int64_t>(sycl_device);
+ test_sycl_simple_argmax<DataType, ColMajor, int64_t>(sycl_device);
+ test_sycl_argmax_dim<DataType, ColMajor, int64_t>(sycl_device);
+ test_sycl_argmax_dim<DataType, RowMajor, int64_t>(sycl_device);
+ test_sycl_argmin_dim<DataType, ColMajor, int64_t>(sycl_device);
+ test_sycl_argmin_dim<DataType, RowMajor, int64_t>(sycl_device);
+}
+
+void test_cxx11_tensor_argmax_sycl() {
+ for (const auto& device :Eigen::get_sycl_supported_devices()) {
+ CALL_SUBTEST(sycl_argmax_test_per_device<double>(device));
+ }
+
+}
diff --git a/unsupported/test/cxx11_tensor_cast_float16_cuda.cu b/unsupported/test/cxx11_tensor_cast_float16_cuda.cu
index 88c233994..816e03220 100644
--- a/unsupported/test/cxx11_tensor_cast_float16_cuda.cu
+++ b/unsupported/test/cxx11_tensor_cast_float16_cuda.cu
@@ -13,9 +13,6 @@
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
#define EIGEN_USE_GPU
-#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
-#include <cuda_fp16.h>
-#endif
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>
diff --git a/unsupported/test/cxx11_tensor_complex_cuda.cu b/unsupported/test/cxx11_tensor_complex_cuda.cu
index d4e111f5d..a52350f85 100644
--- a/unsupported/test/cxx11_tensor_complex_cuda.cu
+++ b/unsupported/test/cxx11_tensor_complex_cuda.cu
@@ -11,9 +11,6 @@
#define EIGEN_TEST_FUNC cxx11_tensor_complex
#define EIGEN_USE_GPU
-#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
-#include <cuda_fp16.h>
-#endif
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>
@@ -107,6 +104,41 @@ static void test_cuda_sum_reductions() {
gpu_device.deallocate(gpu_out_ptr);
}
+static void test_cuda_mean_reductions() {
+
+ Eigen::CudaStreamDevice stream;
+ Eigen::GpuDevice gpu_device(&stream);
+
+ const int num_rows = internal::random<int>(1024, 5*1024);
+ const int num_cols = internal::random<int>(1024, 5*1024);
+
+ Tensor<std::complex<float>, 2> in(num_rows, num_cols);
+ in.setRandom();
+
+ Tensor<std::complex<float>, 0> full_redux;
+ full_redux = in.mean();
+
+ std::size_t in_bytes = in.size() * sizeof(std::complex<float>);
+ std::size_t out_bytes = full_redux.size() * sizeof(std::complex<float>);
+ std::complex<float>* gpu_in_ptr = static_cast<std::complex<float>*>(gpu_device.allocate(in_bytes));
+ std::complex<float>* gpu_out_ptr = static_cast<std::complex<float>*>(gpu_device.allocate(out_bytes));
+ gpu_device.memcpyHostToDevice(gpu_in_ptr, in.data(), in_bytes);
+
+ TensorMap<Tensor<std::complex<float>, 2> > in_gpu(gpu_in_ptr, num_rows, num_cols);
+ TensorMap<Tensor<std::complex<float>, 0> > out_gpu(gpu_out_ptr);
+
+ out_gpu.device(gpu_device) = in_gpu.mean();
+
+ Tensor<std::complex<float>, 0> full_redux_gpu;
+ gpu_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_ptr, out_bytes);
+ gpu_device.synchronize();
+
+ // Check that the CPU and GPU reductions return the same result.
+ VERIFY_IS_APPROX(full_redux(), full_redux_gpu());
+
+ gpu_device.deallocate(gpu_in_ptr);
+ gpu_device.deallocate(gpu_out_ptr);
+}
static void test_cuda_product_reductions() {
@@ -149,5 +181,6 @@ void test_cxx11_tensor_complex()
{
CALL_SUBTEST(test_cuda_nullary());
CALL_SUBTEST(test_cuda_sum_reductions());
+ CALL_SUBTEST(test_cuda_mean_reductions());
CALL_SUBTEST(test_cuda_product_reductions());
}
diff --git a/unsupported/test/cxx11_tensor_complex_cwise_ops_cuda.cu b/unsupported/test/cxx11_tensor_complex_cwise_ops_cuda.cu
index 2baf5eaad..aac780905 100644
--- a/unsupported/test/cxx11_tensor_complex_cwise_ops_cuda.cu
+++ b/unsupported/test/cxx11_tensor_complex_cwise_ops_cuda.cu
@@ -11,9 +11,6 @@
#define EIGEN_TEST_FUNC cxx11_tensor_complex_cwise_ops
#define EIGEN_USE_GPU
-#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
-#include <cuda_fp16.h>
-#endif
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>
diff --git a/unsupported/test/cxx11_tensor_contract_cuda.cu b/unsupported/test/cxx11_tensor_contract_cuda.cu
index dd68430ce..3621e2aa6 100644
--- a/unsupported/test/cxx11_tensor_contract_cuda.cu
+++ b/unsupported/test/cxx11_tensor_contract_cuda.cu
@@ -14,12 +14,10 @@
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
#define EIGEN_USE_GPU
-#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
-#include <cuda_fp16.h>
-#endif
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>
+
using Eigen::Tensor;
typedef Tensor<float, 1>::DimensionPair DimPair;
diff --git a/unsupported/test/cxx11_tensor_cuda.cu b/unsupported/test/cxx11_tensor_cuda.cu
index 0ba9d52e9..9584a539f 100644
--- a/unsupported/test/cxx11_tensor_cuda.cu
+++ b/unsupported/test/cxx11_tensor_cuda.cu
@@ -12,9 +12,6 @@
#define EIGEN_TEST_FUNC cxx11_tensor_cuda
#define EIGEN_USE_GPU
-#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
-#include <cuda_fp16.h>
-#endif
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>
diff --git a/unsupported/test/cxx11_tensor_custom_op_sycl.cpp b/unsupported/test/cxx11_tensor_custom_op_sycl.cpp
new file mode 100644
index 000000000..9ff287fff
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_custom_op_sycl.cpp
@@ -0,0 +1,165 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli Codeplay Software Ltd.
+// Ralph Potter Codeplay Software Ltd.
+// Luke Iwanski Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+#define EIGEN_TEST_FUNC cxx11_tensor_custom_op_sycl
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+template<typename TensorType>
+struct InsertZeros {
+ DSizes<DenseIndex, 2> dimensions(const TensorType& input) const {
+ DSizes<DenseIndex, 2> result;
+ result[0] = input.dimension(0) * 2;
+ result[1] = input.dimension(1) * 2;
+ return result;
+ }
+
+ template <typename Output, typename Device>
+ void eval(const TensorType& input, Output& output, const Device& device) const
+ {
+ array<DenseIndex, 2> strides;
+ strides[0] = 2;
+ strides[1] = 2;
+ output.stride(strides).device(device) = input;
+
+ Eigen::DSizes<DenseIndex, 2> offsets(1,1);
+ Eigen::DSizes<DenseIndex, 2> extents(output.dimension(0)-1, output.dimension(1)-1);
+ output.slice(offsets, extents).stride(strides).device(device) = input.constant(0.0f);
+ }
+};
+
+template<typename DataType, int DataLayout, typename IndexType>
+static void test_custom_unary_op_sycl(const Eigen::SyclDevice &sycl_device)
+{
+ IndexType sizeDim1 = 3;
+ IndexType sizeDim2 = 5;
+ Eigen::array<IndexType, 2> tensorRange = {{sizeDim1, sizeDim2}};
+ Eigen::array<IndexType, 2> tensorResultRange = {{6, 10}};
+
+ Eigen::Tensor<DataType, 2, DataLayout, IndexType> in1(tensorRange);
+ Eigen::Tensor<DataType, 2, DataLayout, IndexType> out(tensorResultRange);
+
+ DataType * gpu_in1_data = static_cast<DataType*>(sycl_device.allocate(in1.dimensions().TotalSize()*sizeof(DataType)));
+ DataType * gpu_out_data = static_cast<DataType*>(sycl_device.allocate(out.dimensions().TotalSize()*sizeof(DataType)));
+
+ typedef Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > TensorType;
+ TensorType gpu_in1(gpu_in1_data, tensorRange);
+ TensorType gpu_out(gpu_out_data, tensorResultRange);
+
+ in1.setRandom();
+ sycl_device.memcpyHostToDevice(gpu_in1_data, in1.data(),(in1.dimensions().TotalSize())*sizeof(DataType));
+ gpu_out.device(sycl_device) = gpu_in1.customOp(InsertZeros<TensorType>());
+ sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(DataType));
+
+ VERIFY_IS_EQUAL(out.dimension(0), 6);
+ VERIFY_IS_EQUAL(out.dimension(1), 10);
+
+ for (int i = 0; i < 6; i+=2) {
+ for (int j = 0; j < 10; j+=2) {
+ VERIFY_IS_EQUAL(out(i, j), in1(i/2, j/2));
+ }
+ }
+ for (int i = 1; i < 6; i+=2) {
+ for (int j = 1; j < 10; j+=2) {
+ VERIFY_IS_EQUAL(out(i, j), 0);
+ }
+ }
+}
+
+template<typename TensorType>
+struct BatchMatMul {
+ DSizes<DenseIndex, 3> dimensions(const TensorType& input1, const TensorType& input2) const {
+ DSizes<DenseIndex, 3> result;
+ result[0] = input1.dimension(0);
+ result[1] = input2.dimension(1);
+ result[2] = input2.dimension(2);
+ return result;
+ }
+
+ template <typename Output, typename Device>
+ void eval(const TensorType& input1, const TensorType& input2,
+ Output& output, const Device& device) const
+ {
+ typedef typename TensorType::DimensionPair DimPair;
+ array<DimPair, 1> dims;
+ dims[0] = DimPair(1, 0);
+ for (int64_t i = 0; i < output.dimension(2); ++i) {
+ output.template chip<2>(i).device(device) = input1.template chip<2>(i).contract(input2.template chip<2>(i), dims);
+ }
+ }
+};
+
+template<typename DataType, int DataLayout, typename IndexType>
+static void test_custom_binary_op_sycl(const Eigen::SyclDevice &sycl_device)
+{
+
+ Eigen::array<IndexType, 3> tensorRange1 = {{2, 3, 5}};
+ Eigen::array<IndexType, 3> tensorRange2 = {{3,7,5}};
+ Eigen::array<IndexType, 3> tensorResultRange = {{2, 7, 5}};
+
+ Eigen::Tensor<DataType, 3, DataLayout, IndexType> in1(tensorRange1);
+ Eigen::Tensor<DataType, 3, DataLayout, IndexType> in2(tensorRange2);
+ Eigen::Tensor<DataType, 3, DataLayout, IndexType> out(tensorResultRange);
+
+ DataType * gpu_in1_data = static_cast<DataType*>(sycl_device.allocate(in1.dimensions().TotalSize()*sizeof(DataType)));
+ DataType * gpu_in2_data = static_cast<DataType*>(sycl_device.allocate(in2.dimensions().TotalSize()*sizeof(DataType)));
+ DataType * gpu_out_data = static_cast<DataType*>(sycl_device.allocate(out.dimensions().TotalSize()*sizeof(DataType)));
+
+ typedef Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > TensorType;
+ TensorType gpu_in1(gpu_in1_data, tensorRange1);
+ TensorType gpu_in2(gpu_in2_data, tensorRange2);
+ TensorType gpu_out(gpu_out_data, tensorResultRange);
+
+ in1.setRandom();
+ in2.setRandom();
+
+ sycl_device.memcpyHostToDevice(gpu_in1_data, in1.data(),(in1.dimensions().TotalSize())*sizeof(DataType));
+ sycl_device.memcpyHostToDevice(gpu_in2_data, in2.data(),(in2.dimensions().TotalSize())*sizeof(DataType));
+
+ gpu_out.device(sycl_device) = gpu_in1.customOp(gpu_in2, BatchMatMul<TensorType>());
+ sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(DataType));
+
+ for (IndexType i = 0; i < 5; ++i) {
+ typedef typename Eigen::Tensor<DataType, 3, DataLayout, IndexType>::DimensionPair DimPair;
+ array<DimPair, 1> dims;
+ dims[0] = DimPair(1, 0);
+ Eigen::Tensor<DataType, 2, DataLayout, IndexType> reference = in1.template chip<2>(i).contract(in2.template chip<2>(i), dims);
+ TensorRef<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > val = out.template chip<2>(i);
+ for (IndexType j = 0; j < 2; ++j) {
+ for (IndexType k = 0; k < 7; ++k) {
+ VERIFY_IS_APPROX(val(j, k), reference(j, k));
+ }
+ }
+ }
+}
+
+template <typename DataType, typename Dev_selector> void custom_op_perDevice(Dev_selector s){
+ QueueInterface queueInterface(s);
+ auto sycl_device = Eigen::SyclDevice(&queueInterface);
+ test_custom_unary_op_sycl<DataType, RowMajor, int64_t>(sycl_device);
+ test_custom_unary_op_sycl<DataType, ColMajor, int64_t>(sycl_device);
+ test_custom_binary_op_sycl<DataType, ColMajor, int64_t>(sycl_device);
+ test_custom_binary_op_sycl<DataType, RowMajor, int64_t>(sycl_device);
+
+}
+void test_cxx11_tensor_custom_op_sycl() {
+ for (const auto& device :Eigen::get_sycl_supported_devices()) {
+ CALL_SUBTEST(custom_op_perDevice<float>(device));
+ }
+}
diff --git a/unsupported/test/cxx11_tensor_device.cu b/unsupported/test/cxx11_tensor_device.cu
index fde20ddf2..7c14bc187 100644
--- a/unsupported/test/cxx11_tensor_device.cu
+++ b/unsupported/test/cxx11_tensor_device.cu
@@ -13,12 +13,10 @@
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
#define EIGEN_USE_GPU
-#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
-#include <cuda_fp16.h>
-#endif
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>
+
using Eigen::Tensor;
using Eigen::RowMajor;
diff --git a/unsupported/test/cxx11_tensor_forced_eval_sycl.cpp b/unsupported/test/cxx11_tensor_forced_eval_sycl.cpp
index aca036cde..a21514d56 100644
--- a/unsupported/test/cxx11_tensor_forced_eval_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_forced_eval_sycl.cpp
@@ -44,7 +44,7 @@ void test_forced_eval_sycl(const Eigen::SyclDevice &sycl_device) {
Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_in2(gpu_in2_data, tensorRange);
Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_out(gpu_out_data, tensorRange);
sycl_device.memcpyHostToDevice(gpu_in1_data, in1.data(),(in1.dimensions().TotalSize())*sizeof(DataType));
- sycl_device.memcpyHostToDevice(gpu_in2_data, in2.data(),(in1.dimensions().TotalSize())*sizeof(DataType));
+ sycl_device.memcpyHostToDevice(gpu_in2_data, in2.data(),(in2.dimensions().TotalSize())*sizeof(DataType));
/// c=(a+b)*b
gpu_out.device(sycl_device) =(gpu_in1 + gpu_in2).eval() * gpu_in2;
sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(DataType));
diff --git a/unsupported/test/cxx11_tensor_generator_sycl.cpp b/unsupported/test/cxx11_tensor_generator_sycl.cpp
new file mode 100644
index 000000000..f551c8d0c
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_generator_sycl.cpp
@@ -0,0 +1,147 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli Codeplay Software Ltd.
+// Ralph Potter Codeplay Software Ltd.
+// Luke Iwanski Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+#define EIGEN_TEST_FUNC cxx11_tensor_generator_sycl
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+static const float error_threshold =1e-8f;
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+struct Generator1D {
+ Generator1D() { }
+
+ float operator()(const array<Eigen::DenseIndex, 1>& coordinates) const {
+ return coordinates[0];
+ }
+};
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_1D_sycl(const Eigen::SyclDevice& sycl_device)
+{
+
+ IndexType sizeDim1 = 6;
+ array<IndexType, 1> tensorRange = {{sizeDim1}};
+ Tensor<DataType, 1, DataLayout,IndexType> vec(tensorRange);
+ Tensor<DataType, 1, DataLayout,IndexType> result(tensorRange);
+
+ const size_t tensorBuffSize =vec.size()*sizeof(DataType);
+ DataType* gpu_data_vec = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
+ DataType* gpu_data_result = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
+
+ TensorMap<Tensor<DataType, 1, DataLayout,IndexType>> gpu_vec(gpu_data_vec, tensorRange);
+ TensorMap<Tensor<DataType, 1, DataLayout,IndexType>> gpu_result(gpu_data_result, tensorRange);
+
+ sycl_device.memcpyHostToDevice(gpu_data_vec, vec.data(), tensorBuffSize);
+ gpu_result.device(sycl_device)=gpu_vec.generate(Generator1D());
+ sycl_device.memcpyDeviceToHost(result.data(), gpu_data_result, tensorBuffSize);
+
+ for (IndexType i = 0; i < 6; ++i) {
+ VERIFY_IS_EQUAL(result(i), i);
+ }
+}
+
+
+struct Generator2D {
+ Generator2D() { }
+
+ float operator()(const array<Eigen::DenseIndex, 2>& coordinates) const {
+ return 3 * coordinates[0] + 11 * coordinates[1];
+ }
+};
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_2D_sycl(const Eigen::SyclDevice& sycl_device)
+{
+ IndexType sizeDim1 = 5;
+ IndexType sizeDim2 = 7;
+ array<IndexType, 2> tensorRange = {{sizeDim1, sizeDim2}};
+ Tensor<DataType, 2, DataLayout,IndexType> matrix(tensorRange);
+ Tensor<DataType, 2, DataLayout,IndexType> result(tensorRange);
+
+ const size_t tensorBuffSize =matrix.size()*sizeof(DataType);
+ DataType* gpu_data_matrix = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
+ DataType* gpu_data_result = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
+
+ TensorMap<Tensor<DataType, 2, DataLayout,IndexType>> gpu_matrix(gpu_data_matrix, tensorRange);
+ TensorMap<Tensor<DataType, 2, DataLayout,IndexType>> gpu_result(gpu_data_result, tensorRange);
+
+ sycl_device.memcpyHostToDevice(gpu_data_matrix, matrix.data(), tensorBuffSize);
+ gpu_result.device(sycl_device)=gpu_matrix.generate(Generator2D());
+ sycl_device.memcpyDeviceToHost(result.data(), gpu_data_result, tensorBuffSize);
+
+ for (IndexType i = 0; i < 5; ++i) {
+  for (IndexType j = 0; j < 7; ++j) {
+ VERIFY_IS_EQUAL(result(i, j), 3*i + 11*j);
+ }
+ }
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_gaussian_sycl(const Eigen::SyclDevice& sycl_device)
+{
+ IndexType rows = 32;
+ IndexType cols = 48;
+ array<DataType, 2> means;
+ means[0] = rows / 2.0f;
+ means[1] = cols / 2.0f;
+ array<DataType, 2> std_devs;
+ std_devs[0] = 3.14f;
+ std_devs[1] = 2.7f;
+ internal::GaussianGenerator<DataType, Eigen::DenseIndex, 2> gaussian_gen(means, std_devs);
+
+ array<IndexType, 2> tensorRange = {{rows, cols}};
+ Tensor<DataType, 2, DataLayout,IndexType> matrix(tensorRange);
+ Tensor<DataType, 2, DataLayout,IndexType> result(tensorRange);
+
+ const size_t tensorBuffSize =matrix.size()*sizeof(DataType);
+ DataType* gpu_data_matrix = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
+ DataType* gpu_data_result = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
+
+ TensorMap<Tensor<DataType, 2, DataLayout,IndexType>> gpu_matrix(gpu_data_matrix, tensorRange);
+ TensorMap<Tensor<DataType, 2, DataLayout,IndexType>> gpu_result(gpu_data_result, tensorRange);
+
+ sycl_device.memcpyHostToDevice(gpu_data_matrix, matrix.data(), tensorBuffSize);
+ gpu_result.device(sycl_device)=gpu_matrix.generate(gaussian_gen);
+ sycl_device.memcpyDeviceToHost(result.data(), gpu_data_result, tensorBuffSize);
+
+ for (IndexType i = 0; i < rows; ++i) {
+ for (IndexType j = 0; j < cols; ++j) {
+ DataType g_rows = powf(rows/2.0f - i, 2) / (3.14f * 3.14f) * 0.5f;
+ DataType g_cols = powf(cols/2.0f - j, 2) / (2.7f * 2.7f) * 0.5f;
+ DataType gaussian = expf(-g_rows - g_cols);
+      VERIFY(Eigen::internal::isApprox(result(i, j), gaussian, error_threshold));
+ }
+ }
+}
+
+template<typename DataType, typename dev_Selector> void sycl_generator_test_per_device(dev_Selector s){
+ QueueInterface queueInterface(s);
+ auto sycl_device = Eigen::SyclDevice(&queueInterface);
+ test_1D_sycl<DataType, RowMajor, int64_t>(sycl_device);
+ test_1D_sycl<DataType, ColMajor, int64_t>(sycl_device);
+ test_2D_sycl<DataType, RowMajor, int64_t>(sycl_device);
+ test_2D_sycl<DataType, ColMajor, int64_t>(sycl_device);
+ test_gaussian_sycl<DataType, RowMajor, int64_t>(sycl_device);
+ test_gaussian_sycl<DataType, ColMajor, int64_t>(sycl_device);
+}
+void test_cxx11_tensor_generator_sycl()
+{
+ for (const auto& device :Eigen::get_sycl_supported_devices()) {
+ CALL_SUBTEST(sycl_generator_test_per_device<float>(device));
+ }
+}
diff --git a/unsupported/test/cxx11_tensor_image_patch_sycl.cpp b/unsupported/test/cxx11_tensor_image_patch_sycl.cpp
new file mode 100644
index 000000000..eea18ec70
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_image_patch_sycl.cpp
@@ -0,0 +1,1092 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli Codeplay Software Ltd.
+// Ralph Potter Codeplay Software Ltd.
+// Luke Iwanski Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+#define EIGEN_TEST_FUNC cxx11_tensor_image_patch_sycl
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+static const int DataLayout = ColMajor;
+
+template <typename DataType, typename IndexType>
+static void test_simple_image_patch_sycl(const Eigen::SyclDevice& sycl_device)
+{
+ IndexType sizeDim1 = 2;
+ IndexType sizeDim2 = 3;
+ IndexType sizeDim3 = 5;
+ IndexType sizeDim4 = 7;
+ array<IndexType, 4> tensorColMajorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}};
+ array<IndexType, 4> tensorRowMajorRange = {{sizeDim4, sizeDim3, sizeDim2, sizeDim1}};
+ Tensor<DataType, 4, DataLayout,IndexType> tensor_col_major(tensorColMajorRange);
+ Tensor<DataType, 4, RowMajor,IndexType> tensor_row_major(tensorRowMajorRange);
+ tensor_col_major.setRandom();
+
+ DataType* gpu_data_col_major = static_cast<DataType*>(sycl_device.allocate(tensor_col_major.size()*sizeof(DataType)));
+ DataType* gpu_data_row_major = static_cast<DataType*>(sycl_device.allocate(tensor_row_major.size()*sizeof(DataType)));
+ TensorMap<Tensor<DataType, 4, ColMajor, IndexType>> gpu_col_major(gpu_data_col_major, tensorColMajorRange);
+ TensorMap<Tensor<DataType, 4, RowMajor, IndexType>> gpu_row_major(gpu_data_row_major, tensorRowMajorRange);
+
+ sycl_device.memcpyHostToDevice(gpu_data_col_major, tensor_col_major.data(),(tensor_col_major.size())*sizeof(DataType));
+ gpu_row_major.device(sycl_device)=gpu_col_major.swap_layout();
+ sycl_device.memcpyDeviceToHost(tensor_row_major.data(), gpu_data_row_major, (tensor_col_major.size())*sizeof(DataType));
+
+ VERIFY_IS_EQUAL(tensor_col_major.dimension(0), tensor_row_major.dimension(3));
+ VERIFY_IS_EQUAL(tensor_col_major.dimension(1), tensor_row_major.dimension(2));
+ VERIFY_IS_EQUAL(tensor_col_major.dimension(2), tensor_row_major.dimension(1));
+ VERIFY_IS_EQUAL(tensor_col_major.dimension(3), tensor_row_major.dimension(0));
+
+ // Single pixel patch: ColMajor
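+ // extract_image_patches(1, 1) with the default stride of 1 and SAME padding turns every
+ // pixel into its own 1x1 patch: 3*5 = 15 patches per image, with depth and batch unchanged.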
+ array<IndexType, 5> patchColMajorTensorRange={{sizeDim1, 1, 1, sizeDim2*sizeDim3, sizeDim4}};
+ Tensor<DataType, 5, DataLayout,IndexType> single_patch_col_major(patchColMajorTensorRange);
+ size_t patchTensorBuffSize =single_patch_col_major.size()*sizeof(DataType);
+ DataType* gpu_data_single_patch_col_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+ TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_single_patch_col_major(gpu_data_single_patch_col_major, patchColMajorTensorRange);
+ gpu_single_patch_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(1, 1);
+ sycl_device.memcpyDeviceToHost(single_patch_col_major.data(), gpu_data_single_patch_col_major, patchTensorBuffSize);
+
+ VERIFY_IS_EQUAL(single_patch_col_major.dimension(0), 2);
+ VERIFY_IS_EQUAL(single_patch_col_major.dimension(1), 1);
+ VERIFY_IS_EQUAL(single_patch_col_major.dimension(2), 1);
+ VERIFY_IS_EQUAL(single_patch_col_major.dimension(3), 3*5);
+ VERIFY_IS_EQUAL(single_patch_col_major.dimension(4), 7);
+
+ // Single pixel patch: RowMajor
+ array<IndexType, 5> patchRowMajorTensorRange={{sizeDim4, sizeDim2*sizeDim3, 1, 1, sizeDim1}};
+ Tensor<DataType, 5, RowMajor,IndexType> single_patch_row_major(patchRowMajorTensorRange);
+ patchTensorBuffSize =single_patch_row_major.size()*sizeof(DataType);
+ DataType* gpu_data_single_patch_row_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+ TensorMap<Tensor<DataType, 5, RowMajor,IndexType>> gpu_single_patch_row_major(gpu_data_single_patch_row_major, patchRowMajorTensorRange);
+ gpu_single_patch_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(1, 1);
+ sycl_device.memcpyDeviceToHost(single_patch_row_major.data(), gpu_data_single_patch_row_major, patchTensorBuffSize);
+
+ VERIFY_IS_EQUAL(single_patch_row_major.dimension(0), 7);
+ VERIFY_IS_EQUAL(single_patch_row_major.dimension(1), 3*5);
+ VERIFY_IS_EQUAL(single_patch_row_major.dimension(2), 1);
+ VERIFY_IS_EQUAL(single_patch_row_major.dimension(3), 1);
+ VERIFY_IS_EQUAL(single_patch_row_major.dimension(4), 2);
+
+ for (IndexType i = 0; i < tensor_col_major.size(); ++i) {
+ // ColMajor
+ if (tensor_col_major.data()[i] != single_patch_col_major.data()[i]) {
+ std::cout << "Mismatch detected at index colmajor " << i << " : "
+ << tensor_col_major.data()[i] << " vs " << single_patch_col_major.data()[i]
+ << std::endl;
+ }
+ VERIFY_IS_EQUAL(single_patch_col_major.data()[i], tensor_col_major.data()[i]);
+ // RowMajor
+ if (tensor_row_major.data()[i] != single_patch_row_major.data()[i]) {
+ std::cout << "Mismatch detected at index row major" << i << " : "
+ << tensor_row_major.data()[i] << " vs "
+ << single_patch_row_major.data()[i] << std::endl;
+ }
+ VERIFY_IS_EQUAL(single_patch_row_major.data()[i],
+ tensor_row_major.data()[i]);
+ VERIFY_IS_EQUAL(tensor_col_major.data()[i], tensor_row_major.data()[i]);
+ VERIFY_IS_EQUAL(single_patch_col_major.data()[i],
+ single_patch_row_major.data()[i]);
+ }
+
+
+ // Entire image patch: ColMajor
+ patchColMajorTensorRange={{sizeDim1, sizeDim2, sizeDim3, sizeDim2*sizeDim3, sizeDim4}};
+ Tensor<DataType, 5, DataLayout,IndexType> entire_image_patch_col_major(patchColMajorTensorRange);
+ patchTensorBuffSize =entire_image_patch_col_major.size()*sizeof(DataType);
+ DataType* gpu_data_entire_image_patch_col_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+ TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_entire_image_patch_col_major(gpu_data_entire_image_patch_col_major, patchColMajorTensorRange);
+ gpu_entire_image_patch_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(3, 5);
+ sycl_device.memcpyDeviceToHost(entire_image_patch_col_major.data(), gpu_data_entire_image_patch_col_major, patchTensorBuffSize);
+
+ VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(0), 2);
+ VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(1), 3);
+ VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(2), 5);
+ VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(3), 3*5);
+ VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(4), 7);
+
+ // Entire image patch: RowMajor
+ patchRowMajorTensorRange={{sizeDim4, sizeDim2*sizeDim3, sizeDim3, sizeDim2, sizeDim1}};
+ Tensor<DataType, 5, RowMajor,IndexType> entire_image_patch_row_major(patchRowMajorTensorRange);
+ patchTensorBuffSize =entire_image_patch_row_major.size()*sizeof(DataType);
+ DataType* gpu_data_entire_image_patch_row_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+ TensorMap<Tensor<DataType, 5, RowMajor,IndexType>> gpu_entire_image_patch_row_major(gpu_data_entire_image_patch_row_major, patchRowMajorTensorRange);
+ gpu_entire_image_patch_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(3, 5);
+ sycl_device.memcpyDeviceToHost(entire_image_patch_row_major.data(), gpu_data_entire_image_patch_row_major, patchTensorBuffSize);
+
+ VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(0), 7);
+ VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(1), 3*5);
+ VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(2), 5);
+ VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(3), 3);
+ VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(4), 2);
+
+ for (IndexType i = 0; i < 3; ++i) {
+ for (IndexType j = 0; j < 5; ++j) {
+ IndexType patchId = i+3*j;
+ for (IndexType r = 0; r < 3; ++r) {
+ for (IndexType c = 0; c < 5; ++c) {
+ for (IndexType d = 0; d < 2; ++d) {
+ for (IndexType b = 0; b < 7; ++b) {
+ DataType expected_col_major = 0.0f;
+ DataType expected_row_major = 0.0f;
+ if (r-1+i >= 0 && c-2+j >= 0 && r-1+i < 3 && c-2+j < 5) {
+ expected_col_major = tensor_col_major(d, r-1+i, c-2+j, b);
+ expected_row_major = tensor_row_major(b, c-2+j, r-1+i, d);
+ }
+ // ColMajor
+ if (entire_image_patch_col_major(d, r, c, patchId, b) != expected_col_major) {
+ std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+ }
+ VERIFY_IS_EQUAL(entire_image_patch_col_major(d, r, c, patchId, b), expected_col_major);
+ // RowMajor
+ if (entire_image_patch_row_major(b, patchId, c, r, d) !=
+ expected_row_major) {
+ std::cout << "Mismatch detected at index i=" << i << " j=" << j
+ << " r=" << r << " c=" << c << " d=" << d << " b=" << b
+ << std::endl;
+ }
+ VERIFY_IS_EQUAL(entire_image_patch_row_major(b, patchId, c, r, d),
+ expected_row_major);
+ // Check that ColMajor and RowMajor agree.
+ VERIFY_IS_EQUAL(expected_col_major, expected_row_major);
+ }
+ }
+ }
+ }
+ }
+ }
+
+ // 2D patch: ColMajor
+ patchColMajorTensorRange={{sizeDim1, 2, 2, sizeDim2*sizeDim3, sizeDim4}};
+ Tensor<DataType, 5, DataLayout,IndexType> twod_patch_col_major(patchColMajorTensorRange);
+ patchTensorBuffSize =twod_patch_col_major.size()*sizeof(DataType);
+ DataType* gpu_data_twod_patch_col_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+ TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_twod_patch_col_major(gpu_data_twod_patch_col_major, patchColMajorTensorRange);
+ gpu_twod_patch_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(2, 2);
+ sycl_device.memcpyDeviceToHost(twod_patch_col_major.data(), gpu_data_twod_patch_col_major, patchTensorBuffSize);
+
+ VERIFY_IS_EQUAL(twod_patch_col_major.dimension(0), 2);
+ VERIFY_IS_EQUAL(twod_patch_col_major.dimension(1), 2);
+ VERIFY_IS_EQUAL(twod_patch_col_major.dimension(2), 2);
+ VERIFY_IS_EQUAL(twod_patch_col_major.dimension(3), 3*5);
+ VERIFY_IS_EQUAL(twod_patch_col_major.dimension(4), 7);
+
+ // 2D patch: RowMajor
+ patchRowMajorTensorRange={{sizeDim4, sizeDim2*sizeDim3, 2, 2, sizeDim1}};
+ Tensor<DataType, 5, RowMajor,IndexType> twod_patch_row_major(patchRowMajorTensorRange);
+ patchTensorBuffSize =twod_patch_row_major.size()*sizeof(DataType);
+ DataType* gpu_data_twod_patch_row_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+ TensorMap<Tensor<DataType, 5, RowMajor,IndexType>> gpu_twod_patch_row_major(gpu_data_twod_patch_row_major, patchRowMajorTensorRange);
+ gpu_twod_patch_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(2, 2);
+ sycl_device.memcpyDeviceToHost(twod_patch_row_major.data(), gpu_data_twod_patch_row_major, patchTensorBuffSize);
+
+ VERIFY_IS_EQUAL(twod_patch_row_major.dimension(0), 7);
+ VERIFY_IS_EQUAL(twod_patch_row_major.dimension(1), 3*5);
+ VERIFY_IS_EQUAL(twod_patch_row_major.dimension(2), 2);
+ VERIFY_IS_EQUAL(twod_patch_row_major.dimension(3), 2);
+ VERIFY_IS_EQUAL(twod_patch_row_major.dimension(4), 2);
+
+
+ // Based on the calculation described in TensorTraits.h, padding happens to be 0.
+ IndexType row_padding = 0;
+ IndexType col_padding = 0;
+ IndexType stride = 1;
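+ // Sketch of the SAME-padding arithmetic assumed here: total padding per dimension is
+ // (out - 1)*stride + ksize - in = (3-1)*1 + 2 - 3 = 1 for rows (and 1 for cols), and the
+ // top/left share rounds down to 0, hence the zero offsets above.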
+
+ for (IndexType i = 0; i < 3; ++i) {
+ for (IndexType j = 0; j < 5; ++j) {
+ IndexType patchId = i+3*j;
+ for (IndexType r = 0; r < 2; ++r) {
+ for (IndexType c = 0; c < 2; ++c) {
+ for (IndexType d = 0; d < 2; ++d) {
+ for (IndexType b = 0; b < 7; ++b) {
+ DataType expected_col_major = 0.0f;
+ DataType expected_row_major = 0.0f;
+ IndexType row_offset = r*stride + i - row_padding;
+ IndexType col_offset = c*stride + j - col_padding;
+ // ColMajor
+ if (row_offset >= 0 && col_offset >= 0 && row_offset < tensor_col_major.dimension(1) && col_offset < tensor_col_major.dimension(2)) {
+ expected_col_major = tensor_col_major(d, row_offset, col_offset, b);
+ }
+ if (twod_patch_col_major(d, r, c, patchId, b) != expected_col_major) {
+ std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+ }
+ VERIFY_IS_EQUAL(twod_patch_col_major(d, r, c, patchId, b), expected_col_major);
+
+ // RowMajor
+ if (row_offset >= 0 && col_offset >= 0 && row_offset < tensor_row_major.dimension(2) && col_offset < tensor_row_major.dimension(1)) {
+ expected_row_major = tensor_row_major(b, col_offset, row_offset, d);
+
+ }
+ if (twod_patch_row_major(b, patchId, c, r, d) != expected_row_major) {
+ std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+ }
+ VERIFY_IS_EQUAL(twod_patch_row_major(b, patchId, c, r, d), expected_row_major);
+ // Check that ColMajor and RowMajor agree.
+ VERIFY_IS_EQUAL(expected_col_major, expected_row_major);
+ }
+ }
+ }
+ }
+ }
+ }
+
+ sycl_device.deallocate(gpu_data_col_major);
+ sycl_device.deallocate(gpu_data_row_major);
+ sycl_device.deallocate(gpu_data_single_patch_col_major);
+ sycl_device.deallocate(gpu_data_single_patch_row_major);
+ sycl_device.deallocate(gpu_data_entire_image_patch_col_major);
+ sycl_device.deallocate(gpu_data_entire_image_patch_row_major);
+ sycl_device.deallocate(gpu_data_twod_patch_col_major);
+ sycl_device.deallocate(gpu_data_twod_patch_row_major);
+
+}
+
+
+// Verifies VALID padding (no padding) with incrementing values.
+template <typename DataType, typename IndexType>
+static void test_patch_padding_valid_sycl(const Eigen::SyclDevice& sycl_device){
+ IndexType input_depth = 3;
+ IndexType input_rows = 3;
+ IndexType input_cols = 3;
+ IndexType input_batches = 1;
+ IndexType ksize = 2; // Corresponds to the Rows and Cols for tensor.extract_image_patches<>.
+ IndexType stride = 2; // Only a single (equal row/col) stride is supported.
+
+ array<IndexType, 4> tensorColMajorRange = {{input_depth, input_rows, input_cols, input_batches}};
+ array<IndexType, 4> tensorRowMajorRange = {{input_batches, input_cols, input_rows, input_depth}};
+ Tensor<DataType, 4, DataLayout,IndexType> tensor_col_major(tensorColMajorRange);
+ Tensor<DataType, 4, RowMajor,IndexType> tensor_row_major(tensorRowMajorRange);
+
+ DataType* gpu_data_col_major = static_cast<DataType*>(sycl_device.allocate(tensor_col_major.size()*sizeof(DataType)));
+ DataType* gpu_data_row_major = static_cast<DataType*>(sycl_device.allocate(tensor_row_major.size()*sizeof(DataType)));
+ TensorMap<Tensor<DataType, 4, ColMajor, IndexType>> gpu_col_major(gpu_data_col_major, tensorColMajorRange);
+ TensorMap<Tensor<DataType, 4, RowMajor, IndexType>> gpu_row_major(gpu_data_row_major, tensorRowMajorRange);
+
+ sycl_device.memcpyHostToDevice(gpu_data_col_major, tensor_col_major.data(),(tensor_col_major.size())*sizeof(DataType));
+ gpu_row_major.device(sycl_device)=gpu_col_major.swap_layout();
+ sycl_device.memcpyDeviceToHost(tensor_row_major.data(), gpu_data_row_major, (tensor_col_major.size())*sizeof(DataType));
+
+ VERIFY_IS_EQUAL(tensor_col_major.dimension(0), tensor_row_major.dimension(3));
+ VERIFY_IS_EQUAL(tensor_col_major.dimension(1), tensor_row_major.dimension(2));
+ VERIFY_IS_EQUAL(tensor_col_major.dimension(2), tensor_row_major.dimension(1));
+ VERIFY_IS_EQUAL(tensor_col_major.dimension(3), tensor_row_major.dimension(0));
+
+ // Initializes tensor with incrementing numbers.
+ for (IndexType i = 0; i < tensor_col_major.size(); ++i) {
+ tensor_col_major.data()[i] = i + 1;
+ }
+ // ColMajor
+ array<IndexType, 5> patchColMajorTensorRange={{input_depth, ksize, ksize, 1, input_batches}};
+ Tensor<DataType, 5, DataLayout,IndexType> result_col_major(patchColMajorTensorRange);
+ size_t patchTensorBuffSize =result_col_major.size()*sizeof(DataType);
+ DataType* gpu_data_result_col_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+ TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_result_col_major(gpu_data_result_col_major, patchColMajorTensorRange);
+ gpu_result_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(ksize, ksize, stride, stride, 1, 1, PADDING_VALID);
+ sycl_device.memcpyDeviceToHost(result_col_major.data(), gpu_data_result_col_major, patchTensorBuffSize);
+
+ VERIFY_IS_EQUAL(result_col_major.dimension(0), input_depth); // depth
+ VERIFY_IS_EQUAL(result_col_major.dimension(1), ksize); // kernel rows
+ VERIFY_IS_EQUAL(result_col_major.dimension(2), ksize); // kernel cols
+ VERIFY_IS_EQUAL(result_col_major.dimension(3), 1); // number of patches
+ VERIFY_IS_EQUAL(result_col_major.dimension(4), input_batches); // number of batches
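+ // With VALID padding the patch count per dimension is (input - ksize)/stride + 1,
+ // i.e. (3 - 2)/2 + 1 = 1 patch row and 1 patch column, hence the single patch checked above.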
+
+ // RowMajor
+ array<IndexType, 5> patchRowMajorTensorRange={{input_batches, 1, ksize, ksize, input_depth }};
+ Tensor<DataType, 5, RowMajor,IndexType> result_row_major(patchRowMajorTensorRange);
+ patchTensorBuffSize =result_row_major.size()*sizeof(DataType);
+ DataType* gpu_data_result_row_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+ TensorMap<Tensor<DataType, 5, RowMajor,IndexType>> gpu_result_row_major(gpu_data_result_row_major, patchRowMajorTensorRange);
+ gpu_result_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(ksize, ksize, stride, stride, 1, 1, PADDING_VALID);
+ sycl_device.memcpyDeviceToHost(result_row_major.data(), gpu_data_result_row_major, patchTensorBuffSize);
+
+ VERIFY_IS_EQUAL(result_col_major.dimension(0), result_row_major.dimension(4));
+ VERIFY_IS_EQUAL(result_col_major.dimension(1), result_row_major.dimension(3));
+ VERIFY_IS_EQUAL(result_col_major.dimension(2), result_row_major.dimension(2));
+ VERIFY_IS_EQUAL(result_col_major.dimension(3), result_row_major.dimension(1));
+ VERIFY_IS_EQUAL(result_col_major.dimension(4), result_row_major.dimension(0));
+
+ // No padding is carried out.
+ IndexType row_padding = 0;
+ IndexType col_padding = 0;
+
+ for (IndexType i = 0; (i+stride+ksize-1) <= input_rows; i += stride) { // input rows
+ for (IndexType j = 0; (j+stride+ksize-1) <= input_cols; j += stride) { // input cols
+ IndexType patchId = i+input_rows*j;
+ for (IndexType r = 0; r < ksize; ++r) { // patch rows
+ for (IndexType c = 0; c < ksize; ++c) { // patch cols
+ for (IndexType d = 0; d < input_depth; ++d) { // depth
+ for (IndexType b = 0; b < input_batches; ++b) { // batch
+ DataType expected_col_major = 0.0f;
+ DataType expected_row_major = 0.0f;
+ IndexType row_offset = r + i - row_padding;
+ IndexType col_offset = c + j - col_padding;
+ if (row_offset >= 0 && col_offset >= 0 && row_offset < input_rows && col_offset < input_cols) {
+ expected_col_major = tensor_col_major(d, row_offset, col_offset, b);
+ expected_row_major = tensor_row_major(b, col_offset, row_offset, d);
+ }
+ // ColMajor
+ if (result_col_major(d, r, c, patchId, b) != expected_col_major) {
+ std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+ }
+ VERIFY_IS_EQUAL(result_col_major(d, r, c, patchId, b), expected_col_major);
+ // RowMajor
+ if (result_row_major(b, patchId, c, r, d) != expected_row_major) {
+ std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+ }
+ VERIFY_IS_EQUAL(result_row_major(b, patchId, c, r, d), expected_row_major);
+ // Check that ColMajor and RowMajor agree.
+ VERIFY_IS_EQUAL(expected_col_major, expected_row_major);
+ }
+ }
+ }
+ }
+ }
+ }
+ sycl_device.deallocate(gpu_data_col_major);
+ sycl_device.deallocate(gpu_data_row_major);
+ sycl_device.deallocate(gpu_data_result_col_major);
+ sycl_device.deallocate(gpu_data_result_row_major);
+}
+
+// Verifies VALID padding (no padding) with the same value.
+template <typename DataType, typename IndexType>
+static void test_patch_padding_valid_same_value_sycl(const Eigen::SyclDevice& sycl_device){
+ IndexType input_depth = 1;
+ IndexType input_rows = 5;
+ IndexType input_cols = 5;
+ IndexType input_batches = 2;
+ IndexType ksize = 3; // Corresponds to the Rows and Cols for tensor.extract_image_patches<>.
+ IndexType stride = 2; // Only a single (equal row/col) stride is supported.
+ // ColMajor
+
+ array<IndexType, 4> tensorColMajorRange = {{input_depth, input_rows, input_cols, input_batches}};
+ array<IndexType, 4> tensorRowMajorRange = {{input_batches, input_cols, input_rows, input_depth}};
+ Tensor<DataType, 4, DataLayout,IndexType> tensor_col_major(tensorColMajorRange);
+ Tensor<DataType, 4, RowMajor,IndexType> tensor_row_major(tensorRowMajorRange);
+
+ DataType* gpu_data_col_major = static_cast<DataType*>(sycl_device.allocate(tensor_col_major.size()*sizeof(DataType)));
+ DataType* gpu_data_row_major = static_cast<DataType*>(sycl_device.allocate(tensor_row_major.size()*sizeof(DataType)));
+ TensorMap<Tensor<DataType, 4, ColMajor, IndexType>> gpu_col_major(gpu_data_col_major, tensorColMajorRange);
+ TensorMap<Tensor<DataType, 4, RowMajor, IndexType>> gpu_row_major(gpu_data_row_major, tensorRowMajorRange);
+ gpu_col_major.device(sycl_device)=gpu_col_major.constant(11.0f);
+ gpu_row_major.device(sycl_device)=gpu_col_major.swap_layout();
+ sycl_device.memcpyDeviceToHost(tensor_col_major.data(), gpu_data_col_major, (tensor_col_major.size())*sizeof(DataType));
+ sycl_device.memcpyDeviceToHost(tensor_row_major.data(), gpu_data_row_major, (tensor_row_major.size())*sizeof(DataType));
+ VERIFY_IS_EQUAL(tensor_col_major.dimension(0), tensor_row_major.dimension(3));
+ VERIFY_IS_EQUAL(tensor_col_major.dimension(1), tensor_row_major.dimension(2));
+ VERIFY_IS_EQUAL(tensor_col_major.dimension(2), tensor_row_major.dimension(1));
+ VERIFY_IS_EQUAL(tensor_col_major.dimension(3), tensor_row_major.dimension(0));
+
+ array<IndexType, 5> patchColMajorTensorRange={{input_depth, ksize, ksize, 4, input_batches}};
+ Tensor<DataType, 5, DataLayout,IndexType> result_col_major(patchColMajorTensorRange);
+ size_t patchTensorBuffSize =result_col_major.size()*sizeof(DataType);
+ DataType* gpu_data_result_col_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+ TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_result_col_major(gpu_data_result_col_major, patchColMajorTensorRange);
+ gpu_result_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(ksize, ksize, stride, stride, 1, 1, PADDING_VALID);
+ sycl_device.memcpyDeviceToHost(result_col_major.data(), gpu_data_result_col_major, patchTensorBuffSize);
+
+ VERIFY_IS_EQUAL(result_col_major.dimension(0), input_depth); // depth
+ VERIFY_IS_EQUAL(result_col_major.dimension(1), ksize); // kernel rows
+ VERIFY_IS_EQUAL(result_col_major.dimension(2), ksize); // kernel cols
+ VERIFY_IS_EQUAL(result_col_major.dimension(3), 4); // number of patches
+ VERIFY_IS_EQUAL(result_col_major.dimension(4), input_batches); // number of batches
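+ // With VALID padding each dimension yields (5 - 3)/2 + 1 = 2 patch positions,
+ // giving the 2*2 = 4 patches verified above.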
+
+ // RowMajor
+ array<IndexType, 5> patchRowMajorTensorRange={{input_batches, 4, ksize, ksize, input_depth }};
+ Tensor<DataType, 5, RowMajor,IndexType> result_row_major(patchRowMajorTensorRange);
+ patchTensorBuffSize =result_row_major.size()*sizeof(DataType);
+ DataType* gpu_data_result_row_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+ TensorMap<Tensor<DataType, 5, RowMajor,IndexType>> gpu_result_row_major(gpu_data_result_row_major, patchRowMajorTensorRange);
+ gpu_result_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(ksize, ksize, stride, stride, 1, 1, PADDING_VALID);
+ sycl_device.memcpyDeviceToHost(result_row_major.data(), gpu_data_result_row_major, patchTensorBuffSize);
+
+ VERIFY_IS_EQUAL(result_col_major.dimension(0), result_row_major.dimension(4));
+ VERIFY_IS_EQUAL(result_col_major.dimension(1), result_row_major.dimension(3));
+ VERIFY_IS_EQUAL(result_col_major.dimension(2), result_row_major.dimension(2));
+ VERIFY_IS_EQUAL(result_col_major.dimension(3), result_row_major.dimension(1));
+ VERIFY_IS_EQUAL(result_col_major.dimension(4), result_row_major.dimension(0));
+
+ // No padding is carried out.
+ IndexType row_padding = 0;
+ IndexType col_padding = 0;
+
+ for (IndexType i = 0; (i+stride+ksize-1) <= input_rows; i += stride) { // input rows
+ for (IndexType j = 0; (j+stride+ksize-1) <= input_cols; j += stride) { // input cols
+ IndexType patchId = i+input_rows*j;
+ for (IndexType r = 0; r < ksize; ++r) { // patch rows
+ for (IndexType c = 0; c < ksize; ++c) { // patch cols
+ for (IndexType d = 0; d < input_depth; ++d) { // depth
+ for (IndexType b = 0; b < input_batches; ++b) { // batch
+ DataType expected_col_major = 0.0f;
+ DataType expected_row_major = 0.0f;
+ IndexType row_offset = r + i - row_padding;
+ IndexType col_offset = c + j - col_padding;
+ if (row_offset >= 0 && col_offset >= 0 && row_offset < input_rows && col_offset < input_cols) {
+ expected_col_major = tensor_col_major(d, row_offset, col_offset, b);
+ expected_row_major = tensor_row_major(b, col_offset, row_offset, d);
+ }
+ // ColMajor
+ if (result_col_major(d, r, c, patchId, b) != expected_col_major) {
+ std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+ }
+ VERIFY_IS_EQUAL(result_col_major(d, r, c, patchId, b), expected_col_major);
+ // RowMajor
+ if (result_row_major(b, patchId, c, r, d) != expected_row_major) {
+ std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+ }
+ VERIFY_IS_EQUAL(result_row_major(b, patchId, c, r, d), expected_row_major);
+ // Check that ColMajor and RowMajor agree.
+ VERIFY_IS_EQUAL(expected_col_major, expected_row_major);
+ }
+ }
+ }
+ }
+ }
+ }
+ sycl_device.deallocate(gpu_data_col_major);
+ sycl_device.deallocate(gpu_data_row_major);
+ sycl_device.deallocate(gpu_data_result_col_major);
+ sycl_device.deallocate(gpu_data_result_row_major);
+}
+
+// Verifies SAME padding.
+template <typename DataType, typename IndexType>
+static void test_patch_padding_same_sycl(const Eigen::SyclDevice& sycl_device){
+ IndexType input_depth = 3;
+ IndexType input_rows = 4;
+ IndexType input_cols = 2;
+ IndexType input_batches = 1;
+ IndexType ksize = 2; // Corresponds to the Rows and Cols for tensor.extract_image_patches<>.
+ IndexType stride = 2; // Only a single (equal row/col) stride is supported.
+
+ // ColMajor
+ array<IndexType, 4> tensorColMajorRange = {{input_depth, input_rows, input_cols, input_batches}};
+ array<IndexType, 4> tensorRowMajorRange = {{input_batches, input_cols, input_rows, input_depth}};
+ Tensor<DataType, 4, DataLayout,IndexType> tensor_col_major(tensorColMajorRange);
+ Tensor<DataType, 4, RowMajor,IndexType> tensor_row_major(tensorRowMajorRange);
+
+ DataType* gpu_data_col_major = static_cast<DataType*>(sycl_device.allocate(tensor_col_major.size()*sizeof(DataType)));
+ DataType* gpu_data_row_major = static_cast<DataType*>(sycl_device.allocate(tensor_row_major.size()*sizeof(DataType)));
+ TensorMap<Tensor<DataType, 4, ColMajor, IndexType>> gpu_col_major(gpu_data_col_major, tensorColMajorRange);
+ TensorMap<Tensor<DataType, 4, RowMajor, IndexType>> gpu_row_major(gpu_data_row_major, tensorRowMajorRange);
+
+ sycl_device.memcpyHostToDevice(gpu_data_col_major, tensor_col_major.data(),(tensor_col_major.size())*sizeof(DataType));
+ gpu_row_major.device(sycl_device)=gpu_col_major.swap_layout();
+ sycl_device.memcpyDeviceToHost(tensor_row_major.data(), gpu_data_row_major, (tensor_col_major.size())*sizeof(DataType));
+
+ VERIFY_IS_EQUAL(tensor_col_major.dimension(0), tensor_row_major.dimension(3));
+ VERIFY_IS_EQUAL(tensor_col_major.dimension(1), tensor_row_major.dimension(2));
+ VERIFY_IS_EQUAL(tensor_col_major.dimension(2), tensor_row_major.dimension(1));
+ VERIFY_IS_EQUAL(tensor_col_major.dimension(3), tensor_row_major.dimension(0));
+
+ // Initializes tensor with incrementing numbers.
+ for (IndexType i = 0; i < tensor_col_major.size(); ++i) {
+ tensor_col_major.data()[i] = i + 1;
+ }
+
+array<IndexType, 5> patchColMajorTensorRange={{input_depth, ksize, ksize, 2, input_batches}};
+Tensor<DataType, 5, DataLayout,IndexType> result_col_major(patchColMajorTensorRange);
+size_t patchTensorBuffSize =result_col_major.size()*sizeof(DataType);
+DataType* gpu_data_result_col_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_result_col_major(gpu_data_result_col_major, patchColMajorTensorRange);
+gpu_result_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(ksize, ksize, stride, stride, PADDING_SAME);
+sycl_device.memcpyDeviceToHost(result_col_major.data(), gpu_data_result_col_major, patchTensorBuffSize);
+
+
+ VERIFY_IS_EQUAL(result_col_major.dimension(0), input_depth); // depth
+ VERIFY_IS_EQUAL(result_col_major.dimension(1), ksize); // kernel rows
+ VERIFY_IS_EQUAL(result_col_major.dimension(2), ksize); // kernel cols
+ VERIFY_IS_EQUAL(result_col_major.dimension(3), 2); // number of patches
+ VERIFY_IS_EQUAL(result_col_major.dimension(4), input_batches); // number of batches
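+ // With SAME padding and stride 2 the output keeps ceil(input/stride) positions per
+ // dimension: ceil(4/2) * ceil(2/2) = 2*1 = 2 patches, as verified above.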
+
+ // RowMajor
+
+ array<IndexType, 5> patchRowMajorTensorRange={{input_batches, 2, ksize, ksize, input_depth }};
+ Tensor<DataType, 5, RowMajor,IndexType> result_row_major(patchRowMajorTensorRange);
+ patchTensorBuffSize =result_row_major.size()*sizeof(DataType);
+ DataType* gpu_data_result_row_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+ TensorMap<Tensor<DataType, 5, RowMajor,IndexType>> gpu_result_row_major(gpu_data_result_row_major, patchRowMajorTensorRange);
+ gpu_result_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(ksize, ksize, stride, stride, PADDING_SAME);
+ sycl_device.memcpyDeviceToHost(result_row_major.data(), gpu_data_result_row_major, patchTensorBuffSize);
+
+ VERIFY_IS_EQUAL(result_col_major.dimension(0), result_row_major.dimension(4));
+ VERIFY_IS_EQUAL(result_col_major.dimension(1), result_row_major.dimension(3));
+ VERIFY_IS_EQUAL(result_col_major.dimension(2), result_row_major.dimension(2));
+ VERIFY_IS_EQUAL(result_col_major.dimension(3), result_row_major.dimension(1));
+ VERIFY_IS_EQUAL(result_col_major.dimension(4), result_row_major.dimension(0));
+
+ // Based on the calculation described in TensorTraits.h, padding happens to be 0.
+ IndexType row_padding = 0;
+ IndexType col_padding = 0;
+
+ for (IndexType i = 0; (i+stride+ksize-1) <= input_rows; i += stride) { // input rows
+ for (IndexType j = 0; (j+stride+ksize-1) <= input_cols; j += stride) { // input cols
+ IndexType patchId = i+input_rows*j;
+ for (IndexType r = 0; r < ksize; ++r) { // patch rows
+ for (IndexType c = 0; c < ksize; ++c) { // patch cols
+ for (IndexType d = 0; d < input_depth; ++d) { // depth
+ for (IndexType b = 0; b < input_batches; ++b) { // batch
+ DataType expected_col_major = 0.0f;
+ DataType expected_row_major = 0.0f;
+ IndexType row_offset = r*stride + i - row_padding;
+ IndexType col_offset = c*stride + j - col_padding;
+ if (row_offset >= 0 && col_offset >= 0 && row_offset < input_rows && col_offset < input_cols) {
+ expected_col_major = tensor_col_major(d, row_offset, col_offset, b);
+ expected_row_major = tensor_row_major(b, col_offset, row_offset, d);
+ }
+ // ColMajor
+ if (result_col_major(d, r, c, patchId, b) != expected_col_major) {
+ std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+ }
+ VERIFY_IS_EQUAL(result_col_major(d, r, c, patchId, b), expected_col_major);
+ // RowMajor
+ if (result_row_major(b, patchId, c, r, d) != expected_row_major) {
+ std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+ }
+ VERIFY_IS_EQUAL(result_row_major(b, patchId, c, r, d), expected_row_major);
+ // Check that ColMajor and RowMajor agree.
+ VERIFY_IS_EQUAL(expected_col_major, expected_row_major);
+ }
+ }
+ }
+ }
+ }
+ }
+ sycl_device.deallocate(gpu_data_col_major);
+ sycl_device.deallocate(gpu_data_row_major);
+ sycl_device.deallocate(gpu_data_result_col_major);
+ sycl_device.deallocate(gpu_data_result_row_major);
+}
+
+
+template <typename DataType, typename IndexType>
+static void test_patch_no_extra_dim_sycl(const Eigen::SyclDevice& sycl_device){
+
+ IndexType sizeDim1 = 2;
+ IndexType sizeDim2 = 3;
+ IndexType sizeDim3 = 5;
+
+ // ColMajor
+ array<IndexType, 3> tensorColMajorRange = {{sizeDim1, sizeDim2, sizeDim3}};
+ array<IndexType, 3> tensorRowMajorRange = {{sizeDim3, sizeDim2, sizeDim1}};
+ Tensor<DataType, 3, DataLayout,IndexType> tensor_col_major(tensorColMajorRange);
+ tensor_col_major.setRandom();
+ Tensor<DataType, 3, RowMajor,IndexType> tensor_row_major(tensorRowMajorRange);
+
+ DataType* gpu_data_col_major = static_cast<DataType*>(sycl_device.allocate(tensor_col_major.size()*sizeof(DataType)));
+ DataType* gpu_data_row_major = static_cast<DataType*>(sycl_device.allocate(tensor_row_major.size()*sizeof(DataType)));
+ TensorMap<Tensor<DataType, 3, ColMajor, IndexType>> gpu_col_major(gpu_data_col_major, tensorColMajorRange);
+ TensorMap<Tensor<DataType, 3, RowMajor, IndexType>> gpu_row_major(gpu_data_row_major, tensorRowMajorRange);
+
+ sycl_device.memcpyHostToDevice(gpu_data_col_major, tensor_col_major.data(),(tensor_col_major.size())*sizeof(DataType));
+ gpu_row_major.device(sycl_device)=gpu_col_major.swap_layout();
+ sycl_device.memcpyDeviceToHost(tensor_row_major.data(), gpu_data_row_major, (tensor_row_major.size())*sizeof(DataType));
+
+ VERIFY_IS_EQUAL(tensor_col_major.dimension(0), tensor_row_major.dimension(2));
+ VERIFY_IS_EQUAL(tensor_col_major.dimension(1), tensor_row_major.dimension(1));
+ VERIFY_IS_EQUAL(tensor_col_major.dimension(2), tensor_row_major.dimension(0));
+
+
+ // Single pixel patch: ColMajor
+ array<IndexType, 4> patchColMajorTensorRange={{sizeDim1, 1, 1, sizeDim2*sizeDim3}};
+ Tensor<DataType, 4, DataLayout,IndexType> single_patch_col_major(patchColMajorTensorRange);
+ size_t patchTensorBuffSize =single_patch_col_major.size()*sizeof(DataType);
+ DataType* gpu_data_single_patch_col_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+ TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_single_patch_col_major(gpu_data_single_patch_col_major, patchColMajorTensorRange);
+ gpu_single_patch_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(1, 1);
+ sycl_device.memcpyDeviceToHost(single_patch_col_major.data(), gpu_data_single_patch_col_major, patchTensorBuffSize);
+
+ VERIFY_IS_EQUAL(single_patch_col_major.dimension(0), sizeDim1);
+ VERIFY_IS_EQUAL(single_patch_col_major.dimension(1), 1);
+ VERIFY_IS_EQUAL(single_patch_col_major.dimension(2), 1);
+ VERIFY_IS_EQUAL(single_patch_col_major.dimension(3), sizeDim2*sizeDim3);
+
+ // Single pixel patch: RowMajor
+ array<IndexType, 4> patchRowMajorTensorRange={{sizeDim2*sizeDim3, 1, 1, sizeDim1}};
+ Tensor<DataType, 4, RowMajor,IndexType> single_patch_row_major(patchRowMajorTensorRange);
+ patchTensorBuffSize =single_patch_row_major.size()*sizeof(DataType);
+ DataType* gpu_data_single_patch_row_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+ TensorMap<Tensor<DataType, 4, RowMajor,IndexType>> gpu_single_patch_row_major(gpu_data_single_patch_row_major, patchRowMajorTensorRange);
+ gpu_single_patch_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(1, 1);
+ sycl_device.memcpyDeviceToHost(single_patch_row_major.data(), gpu_data_single_patch_row_major, patchTensorBuffSize);
+
+ VERIFY_IS_EQUAL(single_patch_row_major.dimension(0), sizeDim2*sizeDim3);
+ VERIFY_IS_EQUAL(single_patch_row_major.dimension(1), 1);
+ VERIFY_IS_EQUAL(single_patch_row_major.dimension(2), 1);
+ VERIFY_IS_EQUAL(single_patch_row_major.dimension(3), sizeDim1);
+
+ for (IndexType i = 0; i < tensor_col_major.size(); ++i) {
+ // ColMajor
+ if (tensor_col_major.data()[i] != single_patch_col_major.data()[i]) {
+ std::cout << "Mismatch detected at index " << i << " : " << tensor_col_major.data()[i] << " vs " << single_patch_col_major.data()[i] << std::endl;
+ }
+ VERIFY_IS_EQUAL(single_patch_col_major.data()[i], tensor_col_major.data()[i]);
+ // RowMajor
+ if (tensor_row_major.data()[i] != single_patch_row_major.data()[i]) {
+ std::cout << "Mismatch detected at index " << i << " : "
+ << tensor_col_major.data()[i] << " vs "
+ << single_patch_row_major.data()[i] << std::endl;
+ }
+ VERIFY_IS_EQUAL(single_patch_row_major.data()[i],
+ tensor_row_major.data()[i]);
+ VERIFY_IS_EQUAL(tensor_col_major.data()[i], tensor_row_major.data()[i]);
+ VERIFY_IS_EQUAL(single_patch_col_major.data()[i],
+ single_patch_row_major.data()[i]);
+ }
+
+ // Entire image patch: ColMajor
+ patchColMajorTensorRange={{sizeDim1, sizeDim2, sizeDim3, sizeDim2*sizeDim3}};
+ Tensor<DataType, 4, DataLayout,IndexType> entire_image_patch_col_major(patchColMajorTensorRange);
+ patchTensorBuffSize =entire_image_patch_col_major.size()*sizeof(DataType);
+ DataType* gpu_data_entire_image_patch_col_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+ TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_entire_image_patch_col_major(gpu_data_entire_image_patch_col_major, patchColMajorTensorRange);
+ gpu_entire_image_patch_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(3, 5);
+ sycl_device.memcpyDeviceToHost(entire_image_patch_col_major.data(), gpu_data_entire_image_patch_col_major, patchTensorBuffSize);
+
+ VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(0), 2);
+ VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(1), 3);
+ VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(2), 5);
+ VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(3), 3*5);
+
+ // Entire image patch: RowMajor
+ patchRowMajorTensorRange={{sizeDim2*sizeDim3, sizeDim3, sizeDim2, sizeDim1}};
+ Tensor<DataType, 4, RowMajor,IndexType> entire_image_patch_row_major(patchRowMajorTensorRange);
+ patchTensorBuffSize =entire_image_patch_row_major.size()*sizeof(DataType);
+ DataType* gpu_data_entire_image_patch_row_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+ TensorMap<Tensor<DataType, 4, RowMajor,IndexType>> gpu_entire_image_patch_row_major(gpu_data_entire_image_patch_row_major, patchRowMajorTensorRange);
+ gpu_entire_image_patch_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(3, 5);
+ sycl_device.memcpyDeviceToHost(entire_image_patch_row_major.data(), gpu_data_entire_image_patch_row_major, patchTensorBuffSize);
+ VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(0), 3*5);
+ VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(1), 5);
+ VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(2), 3);
+ VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(3), 2);
+
+ for (IndexType i = 0; i < 3; ++i) {
+ for (IndexType j = 0; j < 5; ++j) {
+ IndexType patchId = i+3*j;
+ for (IndexType r = 0; r < 3; ++r) {
+ for (IndexType c = 0; c < 5; ++c) {
+ for (IndexType d = 0; d < 2; ++d) {
+ DataType expected_col_major = 0.0f;
+ DataType expected_row_major = 0.0f;
+ if (r-1+i >= 0 && c-2+j >= 0 && r-1+i < 3 && c-2+j < 5) {
+ expected_col_major = tensor_col_major(d, r-1+i, c-2+j);
+ expected_row_major = tensor_row_major(c-2+j, r-1+i, d);
+ }
+ // ColMajor
+ if (entire_image_patch_col_major(d, r, c, patchId) != expected_col_major) {
+ std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << std::endl;
+ }
+ VERIFY_IS_EQUAL(entire_image_patch_col_major(d, r, c, patchId), expected_col_major);
+ // RowMajor
+ if (entire_image_patch_row_major(patchId, c, r, d) !=
+ expected_row_major) {
+ std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << std::endl;
+ }
+ VERIFY_IS_EQUAL(entire_image_patch_row_major(patchId, c, r, d),
+ expected_row_major);
+ // Check that ColMajor and RowMajor agree.
+ VERIFY_IS_EQUAL(expected_col_major, expected_row_major);
+ }
+ }
+ }
+ }
+ }
+
+ // 2D patch: ColMajor
+ patchColMajorTensorRange={{sizeDim1, 2, 2, sizeDim2*sizeDim3}};
+ Tensor<DataType, 4, DataLayout,IndexType> twod_patch_col_major(patchColMajorTensorRange);
+ patchTensorBuffSize =twod_patch_col_major.size()*sizeof(DataType);
+ DataType* gpu_data_twod_patch_col_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+ TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_twod_patch_col_major(gpu_data_twod_patch_col_major, patchColMajorTensorRange);
+ gpu_twod_patch_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(2, 2);
+ sycl_device.memcpyDeviceToHost(twod_patch_col_major.data(), gpu_data_twod_patch_col_major, patchTensorBuffSize);
+
+ VERIFY_IS_EQUAL(twod_patch_col_major.dimension(0), 2);
+ VERIFY_IS_EQUAL(twod_patch_col_major.dimension(1), 2);
+ VERIFY_IS_EQUAL(twod_patch_col_major.dimension(2), 2);
+ VERIFY_IS_EQUAL(twod_patch_col_major.dimension(3), 3*5);
+
+ // 2D patch: RowMajor
+ patchRowMajorTensorRange={{sizeDim2*sizeDim3, 2, 2, sizeDim1}};
+ Tensor<DataType, 4, RowMajor,IndexType> twod_patch_row_major(patchRowMajorTensorRange);
+ patchTensorBuffSize =twod_patch_row_major.size()*sizeof(DataType);
+ DataType* gpu_data_twod_patch_row_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+ TensorMap<Tensor<DataType, 4, RowMajor,IndexType>> gpu_twod_patch_row_major(gpu_data_twod_patch_row_major, patchRowMajorTensorRange);
+ gpu_twod_patch_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(2, 2);
+ sycl_device.memcpyDeviceToHost(twod_patch_row_major.data(), gpu_data_twod_patch_row_major, patchTensorBuffSize);
+ VERIFY_IS_EQUAL(twod_patch_row_major.dimension(0), 3*5);
+ VERIFY_IS_EQUAL(twod_patch_row_major.dimension(1), 2);
+ VERIFY_IS_EQUAL(twod_patch_row_major.dimension(2), 2);
+ VERIFY_IS_EQUAL(twod_patch_row_major.dimension(3), 2);
+
+ // Based on the calculation described in TensorTraits.h, padding happens to be 0.
+ IndexType row_padding = 0;
+ IndexType col_padding = 0;
+ IndexType stride = 1;
+
+ for (IndexType i = 0; i < 3; ++i) {
+ for (IndexType j = 0; j < 5; ++j) {
+ IndexType patchId = i+3*j;
+ for (IndexType r = 0; r < 2; ++r) {
+ for (IndexType c = 0; c < 2; ++c) {
+ for (IndexType d = 0; d < 2; ++d) {
+ DataType expected_col_major = 0.0f;
+ DataType expected_row_major = 0.0f;
+ IndexType row_offset = r*stride + i - row_padding;
+ IndexType col_offset = c*stride + j - col_padding;
+ // ColMajor
+ if (row_offset >= 0 && col_offset >= 0 && row_offset < tensor_col_major.dimension(1) && col_offset < tensor_col_major.dimension(2)) {
+ expected_col_major = tensor_col_major(d, row_offset, col_offset);
+ }
+ if (twod_patch_col_major(d, r, c, patchId) != expected_col_major) {
+ std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << std::endl;
+ }
+ VERIFY_IS_EQUAL(twod_patch_col_major(d, r, c, patchId), expected_col_major);
+ // RowMajor
+ if (row_offset >= 0 && col_offset >= 0 && row_offset < tensor_row_major.dimension(1) && col_offset < tensor_row_major.dimension(0)) {
+ expected_row_major = tensor_row_major(col_offset, row_offset, d);
+ }
+ if (twod_patch_row_major(patchId, c, r, d) != expected_row_major) {
+ std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << std::endl;
+ }
+ VERIFY_IS_EQUAL(twod_patch_row_major(patchId, c, r, d), expected_row_major);
+ // Check that ColMajor and RowMajor agree.
+ VERIFY_IS_EQUAL(expected_col_major, expected_row_major);
+ }
+ }
+ }
+ }
+ }
+
+ sycl_device.deallocate(gpu_data_col_major);
+ sycl_device.deallocate(gpu_data_row_major);
+ sycl_device.deallocate(gpu_data_single_patch_col_major);
+ sycl_device.deallocate(gpu_data_single_patch_row_major);
+ sycl_device.deallocate(gpu_data_entire_image_patch_col_major);
+ sycl_device.deallocate(gpu_data_entire_image_patch_row_major);
+ sycl_device.deallocate(gpu_data_twod_patch_col_major);
+ sycl_device.deallocate(gpu_data_twod_patch_row_major);
+}
+
+template <typename DataType, typename IndexType>
+static void test_imagenet_patches_sycl(const Eigen::SyclDevice& sycl_device)
+{
+ // Test the code on typical configurations used by the 'imagenet' benchmarks at
+ // https://github.com/soumith/convnet-benchmarks
+ // ColMajor
+ IndexType sizeDim1 = 3;
+ IndexType sizeDim2 = 128;
+ IndexType sizeDim3 = 128;
+ IndexType sizeDim4 = 16;
+ array<IndexType, 4> tensorColMajorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}};
+ Tensor<DataType, 4, DataLayout,IndexType> l_in_col_major(tensorColMajorRange);
+ l_in_col_major.setRandom();
+
+ DataType* gpu_data_l_in_col_major = static_cast<DataType*>(sycl_device.allocate(l_in_col_major.size()*sizeof(DataType)));
+ TensorMap<Tensor<DataType, 4, ColMajor, IndexType>> gpu_l_in_col_major(gpu_data_l_in_col_major, tensorColMajorRange);
+
+ sycl_device.memcpyHostToDevice(gpu_data_l_in_col_major, l_in_col_major.data(),(l_in_col_major.size())*sizeof(DataType));
+
+ array<IndexType, 5> patchTensorRange={{sizeDim1, 11, 11, sizeDim2*sizeDim3, sizeDim4}};
+ Tensor<DataType, 5, DataLayout,IndexType> l_out_col_major(patchTensorRange);
+ size_t patchTensorBuffSize =l_out_col_major.size()*sizeof(DataType);
+ DataType* gpu_data_l_out_col_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+ TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_l_out_col_major(gpu_data_l_out_col_major, patchTensorRange);
+ gpu_l_out_col_major.device(sycl_device)=gpu_l_in_col_major.extract_image_patches(11, 11);
+ sycl_device.memcpyDeviceToHost(l_out_col_major.data(), gpu_data_l_out_col_major, patchTensorBuffSize);
+
+ VERIFY_IS_EQUAL(l_out_col_major.dimension(0), sizeDim1);
+ VERIFY_IS_EQUAL(l_out_col_major.dimension(1), 11);
+ VERIFY_IS_EQUAL(l_out_col_major.dimension(2), 11);
+ VERIFY_IS_EQUAL(l_out_col_major.dimension(3), sizeDim2*sizeDim3);
+ VERIFY_IS_EQUAL(l_out_col_major.dimension(4), sizeDim4);
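+ // 11x11 patches with the default stride-1 SAME padding give one patch per input pixel
+ // (128*128 per image); the reference check below subtracts (11-1)/2 = 5 as the implied
+ // top/left padding.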
+
+ // RowMajor
+ patchTensorRange={{sizeDim4, sizeDim2*sizeDim3, 11, 11, sizeDim1}};
+ Tensor<DataType, 5, RowMajor,IndexType> l_out_row_major(patchTensorRange);
+ patchTensorBuffSize =l_out_row_major.size()*sizeof(DataType);
+ DataType* gpu_data_l_out_row_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+ TensorMap<Tensor<DataType, 5, RowMajor,IndexType>> gpu_l_out_row_major(gpu_data_l_out_row_major, patchTensorRange);
+ gpu_l_out_row_major.device(sycl_device)=gpu_l_in_col_major.swap_layout().extract_image_patches(11, 11);
+ sycl_device.memcpyDeviceToHost(l_out_row_major.data(), gpu_data_l_out_row_major, patchTensorBuffSize);
+
+ VERIFY_IS_EQUAL(l_out_row_major.dimension(0), sizeDim4);
+ VERIFY_IS_EQUAL(l_out_row_major.dimension(1), sizeDim2*sizeDim3);
+ VERIFY_IS_EQUAL(l_out_row_major.dimension(2), 11);
+ VERIFY_IS_EQUAL(l_out_row_major.dimension(3), 11);
+ VERIFY_IS_EQUAL(l_out_row_major.dimension(4), sizeDim1);
+
+ for (IndexType b = 0; b < 16; ++b) {
+ for (IndexType i = 0; i < 128; ++i) {
+ for (IndexType j = 0; j < 128; ++j) {
+ IndexType patchId = i+128*j;
+ for (IndexType c = 0; c < 11; ++c) {
+ for (IndexType r = 0; r < 11; ++r) {
+ for (IndexType d = 0; d < 3; ++d) {
+ DataType expected = 0.0f;
+ if (r-5+i >= 0 && c-5+j >= 0 && r-5+i < 128 && c-5+j < 128) {
+ expected = l_in_col_major(d, r-5+i, c-5+j, b);
+ }
+ // ColMajor
+ if (l_out_col_major(d, r, c, patchId, b) != expected) {
+ std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+ }
+ VERIFY_IS_EQUAL(l_out_col_major(d, r, c, patchId, b), expected);
+ // RowMajor
+ if (l_out_row_major(b, patchId, c, r, d) !=
+ expected) {
+ std::cout << "Mismatch detected at index i=" << i << " j=" << j
+ << " r=" << r << " c=" << c << " d=" << d << " b=" << b
+ << std::endl;
+ }
+ VERIFY_IS_EQUAL(l_out_row_major(b, patchId, c, r, d),
+ expected);
+ }
+ }
+ }
+ }
+ }
+ }
+
+ // ColMajor
+ sycl_device.deallocate(gpu_data_l_in_col_major);
+ sycl_device.deallocate(gpu_data_l_out_col_major);
+ sizeDim1 = 16;
+ sizeDim2 = 64;
+ sizeDim3 = 64;
+ sizeDim4 = 32;
+ tensorColMajorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}};
+ l_in_col_major.resize(tensorColMajorRange);
+ l_in_col_major.setRandom();
+ gpu_data_l_in_col_major = static_cast<DataType*>(sycl_device.allocate(l_in_col_major.size()*sizeof(DataType)));
+ TensorMap<Tensor<DataType, 4, ColMajor, IndexType>>gpu_l_in_col_major_resize1(gpu_data_l_in_col_major, tensorColMajorRange);
+
+ patchTensorRange={{sizeDim1, 9, 9, sizeDim2*sizeDim3, sizeDim4}};
+ l_out_col_major.resize(patchTensorRange);
+ patchTensorBuffSize =l_out_col_major.size()*sizeof(DataType);
+ gpu_data_l_out_col_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+ TensorMap<Tensor<DataType, 5, DataLayout,IndexType>>gpu_l_out_col_major_resize1(gpu_data_l_out_col_major, patchTensorRange);
+ sycl_device.memcpyHostToDevice(gpu_data_l_in_col_major, l_in_col_major.data(),(l_in_col_major.size())*sizeof(DataType));
+ gpu_l_out_col_major_resize1.device(sycl_device)=gpu_l_in_col_major_resize1.extract_image_patches(9, 9);
+ sycl_device.memcpyDeviceToHost(l_out_col_major.data(), gpu_data_l_out_col_major, patchTensorBuffSize);
+ VERIFY_IS_EQUAL(l_out_col_major.dimension(0), 16);
+ VERIFY_IS_EQUAL(l_out_col_major.dimension(1), 9);
+ VERIFY_IS_EQUAL(l_out_col_major.dimension(2), 9);
+ VERIFY_IS_EQUAL(l_out_col_major.dimension(3), 64*64);
+ VERIFY_IS_EQUAL(l_out_col_major.dimension(4), 32);
+
+// RowMajor
+ sycl_device.deallocate(gpu_data_l_out_row_major);
+ patchTensorRange={{sizeDim4, sizeDim2*sizeDim3, 9, 9 ,sizeDim1}};
+ l_out_row_major.resize(patchTensorRange);
+ patchTensorBuffSize =l_out_row_major.size()*sizeof(DataType);
+ gpu_data_l_out_row_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+ TensorMap<Tensor<DataType, 5, RowMajor,IndexType>>gpu_l_out_row_major_resize1(gpu_data_l_out_row_major, patchTensorRange);
+ gpu_l_out_row_major_resize1.device(sycl_device)=gpu_l_in_col_major_resize1.swap_layout().extract_image_patches(9, 9);
+ sycl_device.memcpyDeviceToHost(l_out_row_major.data(), gpu_data_l_out_row_major, patchTensorBuffSize);
+
+ VERIFY_IS_EQUAL(l_out_row_major.dimension(0), 32);
+ VERIFY_IS_EQUAL(l_out_row_major.dimension(1), 64*64);
+ VERIFY_IS_EQUAL(l_out_row_major.dimension(2), 9);
+ VERIFY_IS_EQUAL(l_out_row_major.dimension(3), 9);
+ VERIFY_IS_EQUAL(l_out_row_major.dimension(4), 16);
+
+ for (IndexType b = 0; b < 32; ++b) {
+ for (IndexType i = 0; i < 64; ++i) {
+ for (IndexType j = 0; j < 64; ++j) {
+ IndexType patchId = i+64*j;
+ for (IndexType c = 0; c < 9; ++c) {
+ for (IndexType r = 0; r < 9; ++r) {
+ for (IndexType d = 0; d < 16; ++d) {
+ DataType expected = 0.0f;
+ if (r-4+i >= 0 && c-4+j >= 0 && r-4+i < 64 && c-4+j < 64) {
+ expected = l_in_col_major(d, r-4+i, c-4+j, b);
+ }
+ // ColMajor
+ if (l_out_col_major(d, r, c, patchId, b) != expected) {
+ std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+ }
+ VERIFY_IS_EQUAL(l_out_col_major(d, r, c, patchId, b), expected);
+ // RowMajor
+ if (l_out_row_major(b, patchId, c, r, d) != expected) {
+ std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+ }
+ VERIFY_IS_EQUAL(l_out_row_major(b, patchId, c, r, d), expected);
+ }
+ }
+ }
+ }
+ }
+ }
+
+ // ColMajor
+
+ sycl_device.deallocate(gpu_data_l_in_col_major);
+ sycl_device.deallocate(gpu_data_l_out_col_major);
+ sizeDim1 = 32;
+ sizeDim2 = 16;
+ sizeDim3 = 16;
+ sizeDim4 = 32;
+ tensorColMajorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}};
+ l_in_col_major.resize(tensorColMajorRange);
+ l_in_col_major.setRandom();
+ gpu_data_l_in_col_major = static_cast<DataType*>(sycl_device.allocate(l_in_col_major.size()*sizeof(DataType)));
+ TensorMap<Tensor<DataType, 4, ColMajor, IndexType>>gpu_l_in_col_major_resize2(gpu_data_l_in_col_major, tensorColMajorRange);
+
+ patchTensorRange={{sizeDim1, 7, 7, sizeDim2*sizeDim3, sizeDim4}};
+ l_out_col_major.resize(patchTensorRange);
+ patchTensorBuffSize =l_out_col_major.size()*sizeof(DataType);
+ gpu_data_l_out_col_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+ TensorMap<Tensor<DataType, 5, DataLayout,IndexType>>gpu_l_out_col_major_resize2(gpu_data_l_out_col_major, patchTensorRange);
+ sycl_device.memcpyHostToDevice(gpu_data_l_in_col_major, l_in_col_major.data(),(l_in_col_major.size())*sizeof(DataType));
+ gpu_l_out_col_major_resize2.device(sycl_device)=gpu_l_in_col_major_resize2.extract_image_patches(7, 7);
+ sycl_device.memcpyDeviceToHost(l_out_col_major.data(), gpu_data_l_out_col_major, patchTensorBuffSize);
+
+ VERIFY_IS_EQUAL(l_out_col_major.dimension(0), 32);
+ VERIFY_IS_EQUAL(l_out_col_major.dimension(1), 7);
+ VERIFY_IS_EQUAL(l_out_col_major.dimension(2), 7);
+ VERIFY_IS_EQUAL(l_out_col_major.dimension(3), 16*16);
+ VERIFY_IS_EQUAL(l_out_col_major.dimension(4), 32);
+
+ // RowMajor
+ sycl_device.deallocate(gpu_data_l_out_row_major);
+ patchTensorRange={{sizeDim4, sizeDim2*sizeDim3, 7, 7 ,sizeDim1}};
+ l_out_row_major.resize(patchTensorRange);
+ patchTensorBuffSize =l_out_row_major.size()*sizeof(DataType);
+ gpu_data_l_out_row_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+ TensorMap<Tensor<DataType, 5, RowMajor,IndexType>>gpu_l_out_row_major_resize2(gpu_data_l_out_row_major, patchTensorRange);
+ gpu_l_out_row_major_resize2.device(sycl_device)=gpu_l_in_col_major_resize2.swap_layout().extract_image_patches(7, 7);
+ sycl_device.memcpyDeviceToHost(l_out_row_major.data(), gpu_data_l_out_row_major, patchTensorBuffSize);
+
+ VERIFY_IS_EQUAL(l_out_row_major.dimension(0), 32);
+ VERIFY_IS_EQUAL(l_out_row_major.dimension(1), 16*16);
+ VERIFY_IS_EQUAL(l_out_row_major.dimension(2), 7);
+ VERIFY_IS_EQUAL(l_out_row_major.dimension(3), 7);
+ VERIFY_IS_EQUAL(l_out_row_major.dimension(4), 32);
+
+ for (IndexType b = 0; b < 32; ++b) {
+ for (IndexType i = 0; i < 16; ++i) {
+ for (IndexType j = 0; j < 16; ++j) {
+ IndexType patchId = i+16*j;
+ for (IndexType c = 0; c < 7; ++c) {
+ for (IndexType r = 0; r < 7; ++r) {
+ for (IndexType d = 0; d < 32; ++d) {
+ DataType expected = 0.0f;
+ if (r-3+i >= 0 && c-3+j >= 0 && r-3+i < 16 && c-3+j < 16) {
+ expected = l_in_col_major(d, r-3+i, c-3+j, b);
+ }
+ // ColMajor
+ if (l_out_col_major(d, r, c, patchId, b) != expected) {
+ std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+ }
+ VERIFY_IS_EQUAL(l_out_col_major(d, r, c, patchId, b), expected);
+ // RowMajor
+ if (l_out_row_major(b, patchId, c, r, d) != expected) {
+ std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+ }
+ VERIFY_IS_EQUAL(l_out_row_major(b, patchId, c, r, d), expected);
+ }
+ }
+ }
+ }
+ }
+ }
+
+ // ColMajor
+ sycl_device.deallocate(gpu_data_l_in_col_major);
+ sycl_device.deallocate(gpu_data_l_out_col_major);
+ sizeDim1 = 64;
+ sizeDim2 = 13;
+ sizeDim3 = 13;
+ sizeDim4 = 32;
+ tensorColMajorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}};
+ l_in_col_major.resize(tensorColMajorRange);
+ l_in_col_major.setRandom();
+ gpu_data_l_in_col_major = static_cast<DataType*>(sycl_device.allocate(l_in_col_major.size()*sizeof(DataType)));
+ TensorMap<Tensor<DataType, 4, ColMajor, IndexType>>gpu_l_in_col_major_resize3(gpu_data_l_in_col_major, tensorColMajorRange);
+
+ patchTensorRange={{sizeDim1, 3, 3, sizeDim2*sizeDim3, sizeDim4}};
+ l_out_col_major.resize(patchTensorRange);
+ patchTensorBuffSize =l_out_col_major.size()*sizeof(DataType);
+ gpu_data_l_out_col_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+ TensorMap<Tensor<DataType, 5, DataLayout,IndexType>>gpu_l_out_col_major_resize3(gpu_data_l_out_col_major, patchTensorRange);
+ sycl_device.memcpyHostToDevice(gpu_data_l_in_col_major, l_in_col_major.data(),(l_in_col_major.size())*sizeof(DataType));
+ gpu_l_out_col_major_resize3.device(sycl_device)=gpu_l_in_col_major_resize3.extract_image_patches(3, 3);
+ sycl_device.memcpyDeviceToHost(l_out_col_major.data(), gpu_data_l_out_col_major, patchTensorBuffSize);
+
+ VERIFY_IS_EQUAL(l_out_col_major.dimension(0), 64);
+ VERIFY_IS_EQUAL(l_out_col_major.dimension(1), 3);
+ VERIFY_IS_EQUAL(l_out_col_major.dimension(2), 3);
+ VERIFY_IS_EQUAL(l_out_col_major.dimension(3), 13*13);
+ VERIFY_IS_EQUAL(l_out_col_major.dimension(4), 32);
+
+ // RowMajor
+ sycl_device.deallocate(gpu_data_l_out_row_major);
+ patchTensorRange={{sizeDim4, sizeDim2*sizeDim3, 3, 3 ,sizeDim1}};
+ l_out_row_major.resize(patchTensorRange);
+ patchTensorBuffSize =l_out_row_major.size()*sizeof(DataType);
+ gpu_data_l_out_row_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+ TensorMap<Tensor<DataType, 5, RowMajor,IndexType>>gpu_l_out_row_major_resize3(gpu_data_l_out_row_major, patchTensorRange);
+ gpu_l_out_row_major_resize3.device(sycl_device)=gpu_l_in_col_major_resize3.swap_layout().extract_image_patches(3, 3);
+ sycl_device.memcpyDeviceToHost(l_out_row_major.data(), gpu_data_l_out_row_major, patchTensorBuffSize);
+
+ VERIFY_IS_EQUAL(l_out_row_major.dimension(0), 32);
+ VERIFY_IS_EQUAL(l_out_row_major.dimension(1), 13*13);
+ VERIFY_IS_EQUAL(l_out_row_major.dimension(2), 3);
+ VERIFY_IS_EQUAL(l_out_row_major.dimension(3), 3);
+ VERIFY_IS_EQUAL(l_out_row_major.dimension(4), 64);
+
+ for (IndexType b = 0; b < 32; ++b) {
+ for (IndexType i = 0; i < 13; ++i) {
+ for (IndexType j = 0; j < 13; ++j) {
+ IndexType patchId = i+13*j;
+ for (IndexType c = 0; c < 3; ++c) {
+ for (IndexType r = 0; r < 3; ++r) {
+ for (IndexType d = 0; d < 64; ++d) {
+ DataType expected = 0.0f;
+ if (r-1+i >= 0 && c-1+j >= 0 && r-1+i < 13 && c-1+j < 13) {
+ expected = l_in_col_major(d, r-1+i, c-1+j, b);
+ }
+ // ColMajor
+ if (l_out_col_major(d, r, c, patchId, b) != expected) {
+ std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+ }
+ VERIFY_IS_EQUAL(l_out_col_major(d, r, c, patchId, b), expected);
+ // RowMajor
+ if (l_out_row_major(b, patchId, c, r, d) != expected) {
+ std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+ }
+ VERIFY_IS_EQUAL(l_out_row_major(b, patchId, c, r, d), expected);
+ }
+ }
+ }
+ }
+ }
+ }
+ sycl_device.deallocate(gpu_data_l_in_col_major);
+ sycl_device.deallocate(gpu_data_l_out_col_major);
+ sycl_device.deallocate(gpu_data_l_out_row_major);
+}
+
+
+template<typename DataType, typename dev_Selector> void sycl_tensor_image_patch_test_per_device(dev_Selector s){
+  QueueInterface queueInterface(s);
+  auto sycl_device = Eigen::SyclDevice(&queueInterface);
+  test_simple_image_patch_sycl<DataType, int64_t>(sycl_device);
+  test_patch_padding_valid_sycl<DataType, int64_t>(sycl_device);
+  test_patch_padding_valid_same_value_sycl<DataType, int64_t>(sycl_device);
+  test_patch_padding_same_sycl<DataType, int64_t>(sycl_device);
+  test_patch_no_extra_dim_sycl<DataType, int64_t>(sycl_device);
+  test_imagenet_patches_sycl<DataType, int64_t>(sycl_device);
+}
+void test_cxx11_tensor_image_patch_sycl()
+{
+  for (const auto& device : Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_tensor_image_patch_test_per_device<float>(device));
+  }
+}
diff --git a/unsupported/test/cxx11_tensor_inflation_sycl.cpp b/unsupported/test/cxx11_tensor_inflation_sycl.cpp
new file mode 100644
index 000000000..f2f87f7ed
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_inflation_sycl.cpp
@@ -0,0 +1,136 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli Codeplay Software Ltd.
+// Ralph Potter Codeplay Software Ltd.
+// Luke Iwanski Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+#define EIGEN_TEST_FUNC cxx11_tensor_inflation_sycl
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+// Inflation definition: for each dimension, the inflated size is
+// (dim - 1) * stride[dim] + 1.
+
+// For example, a 1-dimensional vector of size 3 with values (4, 4, 4) and an
+// inflation stride of 3 becomes a tensor of size (3 - 1) * 3 + 1 = 7 with
+// values (4, 0, 0, 4, 0, 0, 4).
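+//
+// A minimal host-side sketch of the same operation (illustration only; not
+// part of the SYCL test below):
+//
+//   Eigen::Tensor<float, 1> vec(3);
+//   vec.setConstant(4.0f);
+//   Eigen::array<Eigen::DenseIndex, 1> strides = {{3}};
+//   Eigen::Tensor<float, 1> inflated = vec.inflate(strides);
+//   // inflated.dimension(0) == (3 - 1) * 3 + 1 == 7
+//   // values: (4, 0, 0, 4, 0, 0, 4)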
+
+template <typename DataType, int DataLayout, typename IndexType>
+void test_simple_inflation_sycl(const Eigen::SyclDevice &sycl_device) {
+
+
+ IndexType sizeDim1 = 2;
+ IndexType sizeDim2 = 3;
+ IndexType sizeDim3 = 5;
+ IndexType sizeDim4 = 7;
+ array<IndexType, 4> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}};
+ Tensor<DataType, 4, DataLayout,IndexType> tensor(tensorRange);
+ Tensor<DataType, 4, DataLayout,IndexType> no_stride(tensorRange);
+ tensor.setRandom();
+
+ array<IndexType, 4> strides;
+ strides[0] = 1;
+ strides[1] = 1;
+ strides[2] = 1;
+ strides[3] = 1;
+
+
+ const size_t tensorBuffSize =tensor.size()*sizeof(DataType);
+ DataType* gpu_data_tensor = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
+ DataType* gpu_data_no_stride = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
+
+ TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_tensor(gpu_data_tensor, tensorRange);
+ TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_no_stride(gpu_data_no_stride, tensorRange);
+
+ sycl_device.memcpyHostToDevice(gpu_data_tensor, tensor.data(), tensorBuffSize);
+ gpu_no_stride.device(sycl_device)=gpu_tensor.inflate(strides);
+ sycl_device.memcpyDeviceToHost(no_stride.data(), gpu_data_no_stride, tensorBuffSize);
+
+ VERIFY_IS_EQUAL(no_stride.dimension(0), sizeDim1);
+ VERIFY_IS_EQUAL(no_stride.dimension(1), sizeDim2);
+ VERIFY_IS_EQUAL(no_stride.dimension(2), sizeDim3);
+ VERIFY_IS_EQUAL(no_stride.dimension(3), sizeDim4);
+
+ for (IndexType i = 0; i < 2; ++i) {
+ for (IndexType j = 0; j < 3; ++j) {
+ for (IndexType k = 0; k < 5; ++k) {
+ for (IndexType l = 0; l < 7; ++l) {
+ VERIFY_IS_EQUAL(tensor(i,j,k,l), no_stride(i,j,k,l));
+ }
+ }
+ }
+ }
+
+
+ strides[0] = 2;
+ strides[1] = 4;
+ strides[2] = 2;
+ strides[3] = 3;
+
+ IndexType inflatedSizeDim1 = 3;
+ IndexType inflatedSizeDim2 = 9;
+ IndexType inflatedSizeDim3 = 9;
+ IndexType inflatedSizeDim4 = 19;
+ array<IndexType, 4> inflatedTensorRange = {{inflatedSizeDim1, inflatedSizeDim2, inflatedSizeDim3, inflatedSizeDim4}};
+
+ Tensor<DataType, 4, DataLayout, IndexType> inflated(inflatedTensorRange);
+
+ const size_t inflatedTensorBuffSize =inflated.size()*sizeof(DataType);
+ DataType* gpu_data_inflated = static_cast<DataType*>(sycl_device.allocate(inflatedTensorBuffSize));
+ TensorMap<Tensor<DataType, 4, DataLayout, IndexType>> gpu_inflated(gpu_data_inflated, inflatedTensorRange);
+ gpu_inflated.device(sycl_device)=gpu_tensor.inflate(strides);
+ sycl_device.memcpyDeviceToHost(inflated.data(), gpu_data_inflated, inflatedTensorBuffSize);
+
+ VERIFY_IS_EQUAL(inflated.dimension(0), inflatedSizeDim1);
+ VERIFY_IS_EQUAL(inflated.dimension(1), inflatedSizeDim2);
+ VERIFY_IS_EQUAL(inflated.dimension(2), inflatedSizeDim3);
+ VERIFY_IS_EQUAL(inflated.dimension(3), inflatedSizeDim4);
+
+ for (IndexType i = 0; i < inflatedSizeDim1; ++i) {
+ for (IndexType j = 0; j < inflatedSizeDim2; ++j) {
+ for (IndexType k = 0; k < inflatedSizeDim3; ++k) {
+ for (IndexType l = 0; l < inflatedSizeDim4; ++l) {
+ if (i % strides[0] == 0 &&
+ j % strides[1] == 0 &&
+ k % strides[2] == 0 &&
+ l % strides[3] == 0) {
+ VERIFY_IS_EQUAL(inflated(i,j,k,l),
+ tensor(i/strides[0], j/strides[1], k/strides[2], l/strides[3]));
+ } else {
+ VERIFY_IS_EQUAL(0, inflated(i,j,k,l));
+ }
+ }
+ }
+ }
+ }
+ sycl_device.deallocate(gpu_data_tensor);
+ sycl_device.deallocate(gpu_data_no_stride);
+ sycl_device.deallocate(gpu_data_inflated);
+}
+
+template<typename DataType, typename dev_Selector> void sycl_inflation_test_per_device(dev_Selector s){
+ QueueInterface queueInterface(s);
+ auto sycl_device = Eigen::SyclDevice(&queueInterface);
+ test_simple_inflation_sycl<DataType, RowMajor, int64_t>(sycl_device);
+ test_simple_inflation_sycl<DataType, ColMajor, int64_t>(sycl_device);
+}
+void test_cxx11_tensor_inflation_sycl()
+{
+ for (const auto& device :Eigen::get_sycl_supported_devices()) {
+ CALL_SUBTEST(sycl_inflation_test_per_device<float>(device));
+ }
+}
diff --git a/unsupported/test/cxx11_tensor_layout_swap_sycl.cpp b/unsupported/test/cxx11_tensor_layout_swap_sycl.cpp
new file mode 100644
index 000000000..9e8db8b4b
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_layout_swap_sycl.cpp
@@ -0,0 +1,126 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli Codeplay Software Ltd.
+// Ralph Potter Codeplay Software Ltd.
+// Luke Iwanski Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+// Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+#define EIGEN_TEST_FUNC cxx11_tensor_layout_swap_sycl
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+template <typename DataType, typename IndexType>
+static void test_simple_swap_sycl(const Eigen::SyclDevice& sycl_device)
+{
+ IndexType sizeDim1 = 2;
+ IndexType sizeDim2 = 3;
+ IndexType sizeDim3 = 7;
+ array<IndexType, 3> tensorColRange = {{sizeDim1, sizeDim2, sizeDim3}};
+ array<IndexType, 3> tensorRowRange = {{sizeDim3, sizeDim2, sizeDim1}};
+
+
+ Tensor<DataType, 3, ColMajor, IndexType> tensor1(tensorColRange);
+ Tensor<DataType, 3, RowMajor, IndexType> tensor2(tensorRowRange);
+ tensor1.setRandom();
+
+ DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(tensor1.size()*sizeof(DataType)));
+ DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(tensor2.size()*sizeof(DataType)));
+ TensorMap<Tensor<DataType, 3, ColMajor, IndexType>> gpu1(gpu_data1, tensorColRange);
+ TensorMap<Tensor<DataType, 3, RowMajor, IndexType>> gpu2(gpu_data2, tensorRowRange);
+
+ sycl_device.memcpyHostToDevice(gpu_data1, tensor1.data(),(tensor1.size())*sizeof(DataType));
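+  // swap_layout() reinterprets the same data with the opposite storage order,
+  // so the dimensions appear reversed and element (i, j, k) of the ColMajor
+  // tensor corresponds to element (k, j, i) of the RowMajor result.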
+ gpu2.device(sycl_device)=gpu1.swap_layout();
+ sycl_device.memcpyDeviceToHost(tensor2.data(), gpu_data2,(tensor2.size())*sizeof(DataType));
+
+
+ VERIFY_IS_EQUAL(tensor1.dimension(0), tensor2.dimension(2));
+ VERIFY_IS_EQUAL(tensor1.dimension(1), tensor2.dimension(1));
+ VERIFY_IS_EQUAL(tensor1.dimension(2), tensor2.dimension(0));
+
+ for (IndexType i = 0; i < 2; ++i) {
+ for (IndexType j = 0; j < 3; ++j) {
+ for (IndexType k = 0; k < 7; ++k) {
+ VERIFY_IS_EQUAL(tensor1(i,j,k), tensor2(k,j,i));
+ }
+ }
+ }
+ sycl_device.deallocate(gpu_data1);
+ sycl_device.deallocate(gpu_data2);
+}
+
+template <typename DataType, typename IndexType>
+static void test_swap_as_lvalue_sycl(const Eigen::SyclDevice& sycl_device)
+{
+
+ IndexType sizeDim1 = 2;
+ IndexType sizeDim2 = 3;
+ IndexType sizeDim3 = 7;
+ array<IndexType, 3> tensorColRange = {{sizeDim1, sizeDim2, sizeDim3}};
+ array<IndexType, 3> tensorRowRange = {{sizeDim3, sizeDim2, sizeDim1}};
+
+ Tensor<DataType, 3, ColMajor, IndexType> tensor1(tensorColRange);
+ Tensor<DataType, 3, RowMajor, IndexType> tensor2(tensorRowRange);
+ tensor1.setRandom();
+
+ DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(tensor1.size()*sizeof(DataType)));
+ DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(tensor2.size()*sizeof(DataType)));
+ TensorMap<Tensor<DataType, 3, ColMajor, IndexType>> gpu1(gpu_data1, tensorColRange);
+ TensorMap<Tensor<DataType, 3, RowMajor, IndexType>> gpu2(gpu_data2, tensorRowRange);
+
+ sycl_device.memcpyHostToDevice(gpu_data1, tensor1.data(),(tensor1.size())*sizeof(DataType));
+ gpu2.swap_layout().device(sycl_device)=gpu1;
+ sycl_device.memcpyDeviceToHost(tensor2.data(), gpu_data2,(tensor2.size())*sizeof(DataType));
+
+
+ VERIFY_IS_EQUAL(tensor1.dimension(0), tensor2.dimension(2));
+ VERIFY_IS_EQUAL(tensor1.dimension(1), tensor2.dimension(1));
+ VERIFY_IS_EQUAL(tensor1.dimension(2), tensor2.dimension(0));
+
+ for (IndexType i = 0; i < 2; ++i) {
+ for (IndexType j = 0; j < 3; ++j) {
+ for (IndexType k = 0; k < 7; ++k) {
+ VERIFY_IS_EQUAL(tensor1(i,j,k), tensor2(k,j,i));
+ }
+ }
+ }
+ sycl_device.deallocate(gpu_data1);
+ sycl_device.deallocate(gpu_data2);
+}
+
+
+template<typename DataType, typename dev_Selector> void sycl_tensor_layout_swap_test_per_device(dev_Selector s){
+ QueueInterface queueInterface(s);
+ auto sycl_device = Eigen::SyclDevice(&queueInterface);
+ test_simple_swap_sycl<DataType, int64_t>(sycl_device);
+ test_swap_as_lvalue_sycl<DataType, int64_t>(sycl_device);
+}
+void test_cxx11_tensor_layout_swap_sycl()
+{
+ for (const auto& device :Eigen::get_sycl_supported_devices()) {
+ CALL_SUBTEST(sycl_tensor_layout_swap_test_per_device<float>(device));
+ }
+}
diff --git a/unsupported/test/cxx11_tensor_of_float16_cuda.cu b/unsupported/test/cxx11_tensor_of_float16_cuda.cu
index 908a5e5a9..167b75d25 100644
--- a/unsupported/test/cxx11_tensor_of_float16_cuda.cu
+++ b/unsupported/test/cxx11_tensor_of_float16_cuda.cu
@@ -13,12 +13,10 @@
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
#define EIGEN_USE_GPU
-#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
-#include <cuda_fp16.h>
-#endif
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>
+
using Eigen::Tensor;
template<typename>
diff --git a/unsupported/test/cxx11_tensor_patch_sycl.cpp b/unsupported/test/cxx11_tensor_patch_sycl.cpp
new file mode 100644
index 000000000..88a29cb31
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_patch_sycl.cpp
@@ -0,0 +1,249 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli Codeplay Software Ltd.
+// Ralph Potter Codeplay Software Ltd.
+// Luke Iwanski Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+// Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+#define EIGEN_TEST_FUNC cxx11_tensor_patch_sycl
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_simple_patch_sycl(const Eigen::SyclDevice& sycl_device){
+
+ IndexType sizeDim1 = 2;
+ IndexType sizeDim2 = 3;
+ IndexType sizeDim3 = 5;
+ IndexType sizeDim4 = 7;
+ array<IndexType, 4> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}};
+ array<IndexType, 5> patchTensorRange;
+ if (DataLayout == ColMajor) {
+ patchTensorRange = {{1, 1, 1, 1, sizeDim1*sizeDim2*sizeDim3*sizeDim4}};
+ }else{
+ patchTensorRange = {{sizeDim1*sizeDim2*sizeDim3*sizeDim4,1, 1, 1, 1}};
+ }
+
+ Tensor<DataType, 4, DataLayout,IndexType> tensor(tensorRange);
+ Tensor<DataType, 5, DataLayout,IndexType> no_patch(patchTensorRange);
+
+ tensor.setRandom();
+
+ array<ptrdiff_t, 4> patch_dims;
+ patch_dims[0] = 1;
+ patch_dims[1] = 1;
+ patch_dims[2] = 1;
+ patch_dims[3] = 1;
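+  // With a 1x1x1x1 patch every element becomes its own patch, so the output
+  // holds tensor.size() patches and carries the input values through unchanged.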
+
+ const size_t tensorBuffSize =tensor.size()*sizeof(DataType);
+ size_t patchTensorBuffSize =no_patch.size()*sizeof(DataType);
+ DataType* gpu_data_tensor = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
+ DataType* gpu_data_no_patch = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+
+ TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_tensor(gpu_data_tensor, tensorRange);
+ TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_no_patch(gpu_data_no_patch, patchTensorRange);
+
+ sycl_device.memcpyHostToDevice(gpu_data_tensor, tensor.data(), tensorBuffSize);
+ gpu_no_patch.device(sycl_device)=gpu_tensor.extract_patches(patch_dims);
+ sycl_device.memcpyDeviceToHost(no_patch.data(), gpu_data_no_patch, patchTensorBuffSize);
+
+ if (DataLayout == ColMajor) {
+ VERIFY_IS_EQUAL(no_patch.dimension(0), 1);
+ VERIFY_IS_EQUAL(no_patch.dimension(1), 1);
+ VERIFY_IS_EQUAL(no_patch.dimension(2), 1);
+ VERIFY_IS_EQUAL(no_patch.dimension(3), 1);
+ VERIFY_IS_EQUAL(no_patch.dimension(4), tensor.size());
+ } else {
+ VERIFY_IS_EQUAL(no_patch.dimension(0), tensor.size());
+ VERIFY_IS_EQUAL(no_patch.dimension(1), 1);
+ VERIFY_IS_EQUAL(no_patch.dimension(2), 1);
+ VERIFY_IS_EQUAL(no_patch.dimension(3), 1);
+ VERIFY_IS_EQUAL(no_patch.dimension(4), 1);
+ }
+
+ for (int i = 0; i < tensor.size(); ++i) {
+ VERIFY_IS_EQUAL(tensor.data()[i], no_patch.data()[i]);
+ }
+
+ patch_dims[0] = 2;
+ patch_dims[1] = 3;
+ patch_dims[2] = 5;
+ patch_dims[3] = 7;
+
+ if (DataLayout == ColMajor) {
+ patchTensorRange = {{sizeDim1,sizeDim2,sizeDim3,sizeDim4,1}};
+ }else{
+ patchTensorRange = {{1,sizeDim1,sizeDim2,sizeDim3,sizeDim4}};
+ }
+ Tensor<DataType, 5, DataLayout,IndexType> single_patch(patchTensorRange);
+ patchTensorBuffSize =single_patch.size()*sizeof(DataType);
+ DataType* gpu_data_single_patch = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+ TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_single_patch(gpu_data_single_patch, patchTensorRange);
+
+ gpu_single_patch.device(sycl_device)=gpu_tensor.extract_patches(patch_dims);
+ sycl_device.memcpyDeviceToHost(single_patch.data(), gpu_data_single_patch, patchTensorBuffSize);
+
+ if (DataLayout == ColMajor) {
+ VERIFY_IS_EQUAL(single_patch.dimension(0), 2);
+ VERIFY_IS_EQUAL(single_patch.dimension(1), 3);
+ VERIFY_IS_EQUAL(single_patch.dimension(2), 5);
+ VERIFY_IS_EQUAL(single_patch.dimension(3), 7);
+ VERIFY_IS_EQUAL(single_patch.dimension(4), 1);
+ } else {
+ VERIFY_IS_EQUAL(single_patch.dimension(0), 1);
+ VERIFY_IS_EQUAL(single_patch.dimension(1), 2);
+ VERIFY_IS_EQUAL(single_patch.dimension(2), 3);
+ VERIFY_IS_EQUAL(single_patch.dimension(3), 5);
+ VERIFY_IS_EQUAL(single_patch.dimension(4), 7);
+ }
+
+ for (int i = 0; i < tensor.size(); ++i) {
+ VERIFY_IS_EQUAL(tensor.data()[i], single_patch.data()[i]);
+ }
+ patch_dims[0] = 1;
+ patch_dims[1] = 2;
+ patch_dims[2] = 2;
+ patch_dims[3] = 1;
+
+ if (DataLayout == ColMajor) {
+ patchTensorRange = {{1,2,2,1,2*2*4*7}};
+ }else{
+ patchTensorRange = {{2*2*4*7, 1, 2,2,1}};
+ }
+ Tensor<DataType, 5, DataLayout,IndexType> twod_patch(patchTensorRange);
+ patchTensorBuffSize =twod_patch.size()*sizeof(DataType);
+ DataType* gpu_data_twod_patch = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+ TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_twod_patch(gpu_data_twod_patch, patchTensorRange);
+
+ gpu_twod_patch.device(sycl_device)=gpu_tensor.extract_patches(patch_dims);
+ sycl_device.memcpyDeviceToHost(twod_patch.data(), gpu_data_twod_patch, patchTensorBuffSize);
+
+ if (DataLayout == ColMajor) {
+ VERIFY_IS_EQUAL(twod_patch.dimension(0), 1);
+ VERIFY_IS_EQUAL(twod_patch.dimension(1), 2);
+ VERIFY_IS_EQUAL(twod_patch.dimension(2), 2);
+ VERIFY_IS_EQUAL(twod_patch.dimension(3), 1);
+ VERIFY_IS_EQUAL(twod_patch.dimension(4), 2*2*4*7);
+ } else {
+ VERIFY_IS_EQUAL(twod_patch.dimension(0), 2*2*4*7);
+ VERIFY_IS_EQUAL(twod_patch.dimension(1), 1);
+ VERIFY_IS_EQUAL(twod_patch.dimension(2), 2);
+ VERIFY_IS_EQUAL(twod_patch.dimension(3), 2);
+ VERIFY_IS_EQUAL(twod_patch.dimension(4), 1);
+ }
+
+ for (int i = 0; i < 2; ++i) {
+ for (int j = 0; j < 2; ++j) {
+ for (int k = 0; k < 4; ++k) {
+ for (int l = 0; l < 7; ++l) {
+ int patch_loc;
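+          // Linearise the patch origin (i, j, k, l) over the 2x2x4x7 grid of
+          // extracted patches; the linearisation order follows the storage
+          // order used for the patch-index dimension of each layout.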
+ if (DataLayout == ColMajor) {
+ patch_loc = i + 2 * (j + 2 * (k + 4 * l));
+ } else {
+ patch_loc = l + 7 * (k + 4 * (j + 2 * i));
+ }
+ for (int x = 0; x < 2; ++x) {
+ for (int y = 0; y < 2; ++y) {
+ if (DataLayout == ColMajor) {
+ VERIFY_IS_EQUAL(tensor(i,j+x,k+y,l), twod_patch(0,x,y,0,patch_loc));
+ } else {
+ VERIFY_IS_EQUAL(tensor(i,j+x,k+y,l), twod_patch(patch_loc,0,x,y,0));
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+
+ patch_dims[0] = 1;
+ patch_dims[1] = 2;
+ patch_dims[2] = 3;
+ patch_dims[3] = 5;
+
+ if (DataLayout == ColMajor) {
+ patchTensorRange = {{1,2,3,5,2*2*3*3}};
+ }else{
+ patchTensorRange = {{2*2*3*3, 1, 2,3,5}};
+ }
+ Tensor<DataType, 5, DataLayout,IndexType> threed_patch(patchTensorRange);
+ patchTensorBuffSize =threed_patch.size()*sizeof(DataType);
+ DataType* gpu_data_threed_patch = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+ TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_threed_patch(gpu_data_threed_patch, patchTensorRange);
+
+ gpu_threed_patch.device(sycl_device)=gpu_tensor.extract_patches(patch_dims);
+ sycl_device.memcpyDeviceToHost(threed_patch.data(), gpu_data_threed_patch, patchTensorBuffSize);
+
+ if (DataLayout == ColMajor) {
+ VERIFY_IS_EQUAL(threed_patch.dimension(0), 1);
+ VERIFY_IS_EQUAL(threed_patch.dimension(1), 2);
+ VERIFY_IS_EQUAL(threed_patch.dimension(2), 3);
+ VERIFY_IS_EQUAL(threed_patch.dimension(3), 5);
+ VERIFY_IS_EQUAL(threed_patch.dimension(4), 2*2*3*3);
+ } else {
+ VERIFY_IS_EQUAL(threed_patch.dimension(0), 2*2*3*3);
+ VERIFY_IS_EQUAL(threed_patch.dimension(1), 1);
+ VERIFY_IS_EQUAL(threed_patch.dimension(2), 2);
+ VERIFY_IS_EQUAL(threed_patch.dimension(3), 3);
+ VERIFY_IS_EQUAL(threed_patch.dimension(4), 5);
+ }
+
+ for (int i = 0; i < 2; ++i) {
+ for (int j = 0; j < 2; ++j) {
+ for (int k = 0; k < 3; ++k) {
+ for (int l = 0; l < 3; ++l) {
+ int patch_loc;
+ if (DataLayout == ColMajor) {
+ patch_loc = i + 2 * (j + 2 * (k + 3 * l));
+ } else {
+ patch_loc = l + 3 * (k + 3 * (j + 2 * i));
+ }
+ for (int x = 0; x < 2; ++x) {
+ for (int y = 0; y < 3; ++y) {
+ for (int z = 0; z < 5; ++z) {
+ if (DataLayout == ColMajor) {
+ VERIFY_IS_EQUAL(tensor(i,j+x,k+y,l+z), threed_patch(0,x,y,z,patch_loc));
+ } else {
+ VERIFY_IS_EQUAL(tensor(i,j+x,k+y,l+z), threed_patch(patch_loc,0,x,y,z));
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ sycl_device.deallocate(gpu_data_tensor);
+ sycl_device.deallocate(gpu_data_no_patch);
+ sycl_device.deallocate(gpu_data_single_patch);
+ sycl_device.deallocate(gpu_data_twod_patch);
+ sycl_device.deallocate(gpu_data_threed_patch);
+}
+
+template<typename DataType, typename dev_Selector> void sycl_tensor_patch_test_per_device(dev_Selector s){
+ QueueInterface queueInterface(s);
+ auto sycl_device = Eigen::SyclDevice(&queueInterface);
+ test_simple_patch_sycl<DataType, RowMajor, int64_t>(sycl_device);
+ test_simple_patch_sycl<DataType, ColMajor, int64_t>(sycl_device);
+}
+void test_cxx11_tensor_patch_sycl()
+{
+ for (const auto& device :Eigen::get_sycl_supported_devices()) {
+ CALL_SUBTEST(sycl_tensor_patch_test_per_device<float>(device));
+ }
+}
diff --git a/unsupported/test/cxx11_tensor_random_cuda.cu b/unsupported/test/cxx11_tensor_random_cuda.cu
index b3be199e1..fa1a46732 100644
--- a/unsupported/test/cxx11_tensor_random_cuda.cu
+++ b/unsupported/test/cxx11_tensor_random_cuda.cu
@@ -13,9 +13,6 @@
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
#define EIGEN_USE_GPU
-#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
-#include <cuda_fp16.h>
-#endif
#include "main.h"
#include <Eigen/CXX11/Tensor>
diff --git a/unsupported/test/cxx11_tensor_reduction_cuda.cu b/unsupported/test/cxx11_tensor_reduction_cuda.cu
index 6858b43a7..ec0669704 100644
--- a/unsupported/test/cxx11_tensor_reduction_cuda.cu
+++ b/unsupported/test/cxx11_tensor_reduction_cuda.cu
@@ -12,9 +12,6 @@
#define EIGEN_TEST_FUNC cxx11_tensor_reduction_cuda
#define EIGEN_USE_GPU
-#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
-#include <cuda_fp16.h>
-#endif
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>
diff --git a/unsupported/test/cxx11_tensor_scan_cuda.cu b/unsupported/test/cxx11_tensor_scan_cuda.cu
index 5f146f3c9..1d4edef11 100644
--- a/unsupported/test/cxx11_tensor_scan_cuda.cu
+++ b/unsupported/test/cxx11_tensor_scan_cuda.cu
@@ -13,12 +13,10 @@
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
#define EIGEN_USE_GPU
-#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
-#include <cuda_fp16.h>
-#endif
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>
+
using Eigen::Tensor;
typedef Tensor<float, 1>::DimensionPair DimPair;
diff --git a/unsupported/test/cxx11_tensor_trace.cpp b/unsupported/test/cxx11_tensor_trace.cpp
new file mode 100644
index 000000000..340d1211c
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_trace.cpp
@@ -0,0 +1,171 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2017 Gagan Goel <gagan.nith@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+using Eigen::array;
+
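+// Tensor::trace(dims) sums the entries whose indices coincide along all of the
+// dimensions listed in `dims` (which must have equal sizes) and removes those
+// dimensions from the result. For example, tracing a (3, 5, 3) tensor over
+// dimensions {0, 2} gives a rank-1 tensor t with t(i) = sum_j T(j, i, j), which
+// is what test_simple_trace verifies below.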
+template <int DataLayout>
+static void test_0D_trace() {
+ Tensor<float, 0, DataLayout> tensor;
+ tensor.setRandom();
+ array<ptrdiff_t, 0> dims;
+ Tensor<float, 0, DataLayout> result = tensor.trace(dims);
+ VERIFY_IS_EQUAL(result(), tensor());
+}
+
+
+template <int DataLayout>
+static void test_all_dimensions_trace() {
+ Tensor<float, 3, DataLayout> tensor1(5, 5, 5);
+ tensor1.setRandom();
+ Tensor<float, 0, DataLayout> result1 = tensor1.trace();
+ VERIFY_IS_EQUAL(result1.rank(), 0);
+ float sum = 0.0f;
+ for (int i = 0; i < 5; ++i) {
+ sum += tensor1(i, i, i);
+ }
+ VERIFY_IS_EQUAL(result1(), sum);
+
+  Tensor<float, 5, DataLayout> tensor2(7, 7, 7, 7, 7);
+  tensor2.setRandom();
+ array<ptrdiff_t, 5> dims({{2, 1, 0, 3, 4}});
+ Tensor<float, 0, DataLayout> result2 = tensor2.trace(dims);
+ VERIFY_IS_EQUAL(result2.rank(), 0);
+ sum = 0.0f;
+ for (int i = 0; i < 7; ++i) {
+ sum += tensor2(i, i, i, i, i);
+ }
+ VERIFY_IS_EQUAL(result2(), sum);
+}
+
+
+template <int DataLayout>
+static void test_simple_trace() {
+ Tensor<float, 3, DataLayout> tensor1(3, 5, 3);
+ tensor1.setRandom();
+ array<ptrdiff_t, 2> dims1({{0, 2}});
+ Tensor<float, 1, DataLayout> result1 = tensor1.trace(dims1);
+ VERIFY_IS_EQUAL(result1.rank(), 1);
+ VERIFY_IS_EQUAL(result1.dimension(0), 5);
+ float sum = 0.0f;
+ for (int i = 0; i < 5; ++i) {
+ sum = 0.0f;
+ for (int j = 0; j < 3; ++j) {
+ sum += tensor1(j, i, j);
+ }
+ VERIFY_IS_EQUAL(result1(i), sum);
+ }
+
+ Tensor<float, 4, DataLayout> tensor2(5, 5, 7, 7);
+ tensor2.setRandom();
+ array<ptrdiff_t, 2> dims2({{2, 3}});
+ Tensor<float, 2, DataLayout> result2 = tensor2.trace(dims2);
+ VERIFY_IS_EQUAL(result2.rank(), 2);
+ VERIFY_IS_EQUAL(result2.dimension(0), 5);
+ VERIFY_IS_EQUAL(result2.dimension(1), 5);
+ for (int i = 0; i < 5; ++i) {
+ for (int j = 0; j < 5; ++j) {
+ sum = 0.0f;
+ for (int k = 0; k < 7; ++k) {
+ sum += tensor2(i, j, k, k);
+ }
+ VERIFY_IS_EQUAL(result2(i, j), sum);
+ }
+ }
+
+ array<ptrdiff_t, 2> dims3({{1, 0}});
+ Tensor<float, 2, DataLayout> result3 = tensor2.trace(dims3);
+ VERIFY_IS_EQUAL(result3.rank(), 2);
+ VERIFY_IS_EQUAL(result3.dimension(0), 7);
+ VERIFY_IS_EQUAL(result3.dimension(1), 7);
+ for (int i = 0; i < 7; ++i) {
+ for (int j = 0; j < 7; ++j) {
+ sum = 0.0f;
+ for (int k = 0; k < 5; ++k) {
+ sum += tensor2(k, k, i, j);
+ }
+ VERIFY_IS_EQUAL(result3(i, j), sum);
+ }
+ }
+
+ Tensor<float, 5, DataLayout> tensor3(3, 7, 3, 7, 3);
+ tensor3.setRandom();
+ array<ptrdiff_t, 3> dims4({{0, 2, 4}});
+ Tensor<float, 2, DataLayout> result4 = tensor3.trace(dims4);
+ VERIFY_IS_EQUAL(result4.rank(), 2);
+ VERIFY_IS_EQUAL(result4.dimension(0), 7);
+ VERIFY_IS_EQUAL(result4.dimension(1), 7);
+ for (int i = 0; i < 7; ++i) {
+ for (int j = 0; j < 7; ++j) {
+ sum = 0.0f;
+ for (int k = 0; k < 3; ++k) {
+ sum += tensor3(k, i, k, j, k);
+ }
+ VERIFY_IS_EQUAL(result4(i, j), sum);
+ }
+ }
+
+ Tensor<float, 5, DataLayout> tensor4(3, 7, 4, 7, 5);
+ tensor4.setRandom();
+ array<ptrdiff_t, 2> dims5({{1, 3}});
+ Tensor<float, 3, DataLayout> result5 = tensor4.trace(dims5);
+ VERIFY_IS_EQUAL(result5.rank(), 3);
+ VERIFY_IS_EQUAL(result5.dimension(0), 3);
+ VERIFY_IS_EQUAL(result5.dimension(1), 4);
+ VERIFY_IS_EQUAL(result5.dimension(2), 5);
+ for (int i = 0; i < 3; ++i) {
+ for (int j = 0; j < 4; ++j) {
+ for (int k = 0; k < 5; ++k) {
+ sum = 0.0f;
+ for (int l = 0; l < 7; ++l) {
+ sum += tensor4(i, l, j, l, k);
+ }
+ VERIFY_IS_EQUAL(result5(i, j, k), sum);
+ }
+ }
+ }
+}
+
+
+template<int DataLayout>
+static void test_trace_in_expr() {
+ Tensor<float, 4, DataLayout> tensor(2, 3, 5, 3);
+ tensor.setRandom();
+ array<ptrdiff_t, 2> dims({{1, 3}});
+ Tensor<float, 2, DataLayout> result(2, 5);
+ result = result.constant(1.0f) - tensor.trace(dims);
+ VERIFY_IS_EQUAL(result.rank(), 2);
+ VERIFY_IS_EQUAL(result.dimension(0), 2);
+ VERIFY_IS_EQUAL(result.dimension(1), 5);
+ float sum = 0.0f;
+ for (int i = 0; i < 2; ++i) {
+ for (int j = 0; j < 5; ++j) {
+ sum = 0.0f;
+ for (int k = 0; k < 3; ++k) {
+ sum += tensor(i, k, j, k);
+ }
+ VERIFY_IS_EQUAL(result(i, j), 1.0f - sum);
+ }
+ }
+}
+
+
+void test_cxx11_tensor_trace() {
+ CALL_SUBTEST(test_0D_trace<ColMajor>());
+ CALL_SUBTEST(test_0D_trace<RowMajor>());
+ CALL_SUBTEST(test_all_dimensions_trace<ColMajor>());
+ CALL_SUBTEST(test_all_dimensions_trace<RowMajor>());
+ CALL_SUBTEST(test_simple_trace<ColMajor>());
+ CALL_SUBTEST(test_simple_trace<RowMajor>());
+ CALL_SUBTEST(test_trace_in_expr<ColMajor>());
+ CALL_SUBTEST(test_trace_in_expr<RowMajor>());
+}
diff --git a/unsupported/test/cxx11_tensor_volume_patch_sycl.cpp b/unsupported/test/cxx11_tensor_volume_patch_sycl.cpp
new file mode 100644
index 000000000..039715abc
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_volume_patch_sycl.cpp
@@ -0,0 +1,222 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli Codeplay Software Ltd.
+// Ralph Potter Codeplay Software Ltd.
+// Luke Iwanski Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+#define EIGEN_TEST_FUNC cxx11_tensor_volume_patch_sycl
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+static const int DataLayout = ColMajor;
+
+template <typename DataType, typename IndexType>
+static void test_single_voxel_patch_sycl(const Eigen::SyclDevice& sycl_device)
+{
+
+  IndexType sizeDim0 = 4;
+  IndexType sizeDim1 = 2;
+  IndexType sizeDim2 = 3;
+  IndexType sizeDim3 = 5;
+  IndexType sizeDim4 = 7;
+  array<IndexType, 5> tensorColMajorRange = {{sizeDim0, sizeDim1, sizeDim2, sizeDim3, sizeDim4}};
+  array<IndexType, 5> tensorRowMajorRange = {{sizeDim4, sizeDim3, sizeDim2, sizeDim1, sizeDim0}};
+  Tensor<DataType, 5, DataLayout,IndexType> tensor_col_major(tensorColMajorRange);
+  Tensor<DataType, 5, RowMajor,IndexType> tensor_row_major(tensorRowMajorRange);
+  tensor_col_major.setRandom();
+
+
+ DataType* gpu_data_col_major = static_cast<DataType*>(sycl_device.allocate(tensor_col_major.size()*sizeof(DataType)));
+ DataType* gpu_data_row_major = static_cast<DataType*>(sycl_device.allocate(tensor_row_major.size()*sizeof(DataType)));
+ TensorMap<Tensor<DataType, 5, ColMajor, IndexType>> gpu_col_major(gpu_data_col_major, tensorColMajorRange);
+ TensorMap<Tensor<DataType, 5, RowMajor, IndexType>> gpu_row_major(gpu_data_row_major, tensorRowMajorRange);
+
+ sycl_device.memcpyHostToDevice(gpu_data_col_major, tensor_col_major.data(),(tensor_col_major.size())*sizeof(DataType));
+ gpu_row_major.device(sycl_device)=gpu_col_major.swap_layout();
+
+
+  // single voxel patch: ColMajor
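+  // With a 1x1x1 patch size every voxel becomes its own patch, so the result
+  // contains the same values as the input, in the same order.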
+ array<IndexType, 6> patchColMajorTensorRange={{sizeDim0,1, 1, 1, sizeDim1*sizeDim2*sizeDim3, sizeDim4}};
+ Tensor<DataType, 6, DataLayout,IndexType> single_voxel_patch_col_major(patchColMajorTensorRange);
+ size_t patchTensorBuffSize =single_voxel_patch_col_major.size()*sizeof(DataType);
+ DataType* gpu_data_single_voxel_patch_col_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+ TensorMap<Tensor<DataType, 6, DataLayout,IndexType>> gpu_single_voxel_patch_col_major(gpu_data_single_voxel_patch_col_major, patchColMajorTensorRange);
+ gpu_single_voxel_patch_col_major.device(sycl_device)=gpu_col_major.extract_volume_patches(1, 1, 1);
+ sycl_device.memcpyDeviceToHost(single_voxel_patch_col_major.data(), gpu_data_single_voxel_patch_col_major, patchTensorBuffSize);
+
+
+ VERIFY_IS_EQUAL(single_voxel_patch_col_major.dimension(0), 4);
+ VERIFY_IS_EQUAL(single_voxel_patch_col_major.dimension(1), 1);
+ VERIFY_IS_EQUAL(single_voxel_patch_col_major.dimension(2), 1);
+ VERIFY_IS_EQUAL(single_voxel_patch_col_major.dimension(3), 1);
+ VERIFY_IS_EQUAL(single_voxel_patch_col_major.dimension(4), 2 * 3 * 5);
+ VERIFY_IS_EQUAL(single_voxel_patch_col_major.dimension(5), 7);
+
+ array<IndexType, 6> patchRowMajorTensorRange={{sizeDim4, sizeDim1*sizeDim2*sizeDim3, 1, 1, 1, sizeDim0}};
+ Tensor<DataType, 6, RowMajor,IndexType> single_voxel_patch_row_major(patchRowMajorTensorRange);
+ patchTensorBuffSize =single_voxel_patch_row_major.size()*sizeof(DataType);
+ DataType* gpu_data_single_voxel_patch_row_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+ TensorMap<Tensor<DataType, 6, RowMajor,IndexType>> gpu_single_voxel_patch_row_major(gpu_data_single_voxel_patch_row_major, patchRowMajorTensorRange);
+ gpu_single_voxel_patch_row_major.device(sycl_device)=gpu_row_major.extract_volume_patches(1, 1, 1);
+ sycl_device.memcpyDeviceToHost(single_voxel_patch_row_major.data(), gpu_data_single_voxel_patch_row_major, patchTensorBuffSize);
+
+ VERIFY_IS_EQUAL(single_voxel_patch_row_major.dimension(0), 7);
+ VERIFY_IS_EQUAL(single_voxel_patch_row_major.dimension(1), 2 * 3 * 5);
+ VERIFY_IS_EQUAL(single_voxel_patch_row_major.dimension(2), 1);
+ VERIFY_IS_EQUAL(single_voxel_patch_row_major.dimension(3), 1);
+ VERIFY_IS_EQUAL(single_voxel_patch_row_major.dimension(4), 1);
+ VERIFY_IS_EQUAL(single_voxel_patch_row_major.dimension(5), 4);
+
+ sycl_device.memcpyDeviceToHost(tensor_row_major.data(), gpu_data_row_major, (tensor_col_major.size())*sizeof(DataType));
+ for (IndexType i = 0; i < tensor_col_major.size(); ++i) {
+ VERIFY_IS_EQUAL(tensor_col_major.data()[i], single_voxel_patch_col_major.data()[i]);
+ VERIFY_IS_EQUAL(tensor_row_major.data()[i], single_voxel_patch_row_major.data()[i]);
+ VERIFY_IS_EQUAL(tensor_col_major.data()[i], tensor_row_major.data()[i]);
+ }
+
+
+ sycl_device.deallocate(gpu_data_col_major);
+ sycl_device.deallocate(gpu_data_row_major);
+ sycl_device.deallocate(gpu_data_single_voxel_patch_col_major);
+ sycl_device.deallocate(gpu_data_single_voxel_patch_row_major);
+}
+
+template <typename DataType, typename IndexType>
+static void test_entire_volume_patch_sycl(const Eigen::SyclDevice& sycl_device)
+{
+ const int depth = 4;
+ const int patch_z = 2;
+ const int patch_y = 3;
+ const int patch_x = 5;
+ const int batch = 7;
+
+ array<IndexType, 5> tensorColMajorRange = {{depth, patch_z, patch_y, patch_x, batch}};
+ array<IndexType, 5> tensorRowMajorRange = {{batch, patch_x, patch_y, patch_z, depth}};
+ Tensor<DataType, 5, DataLayout,IndexType> tensor_col_major(tensorColMajorRange);
+ Tensor<DataType, 5, RowMajor,IndexType> tensor_row_major(tensorRowMajorRange);
+ tensor_col_major.setRandom();
+
+
+ DataType* gpu_data_col_major = static_cast<DataType*>(sycl_device.allocate(tensor_col_major.size()*sizeof(DataType)));
+ DataType* gpu_data_row_major = static_cast<DataType*>(sycl_device.allocate(tensor_row_major.size()*sizeof(DataType)));
+ TensorMap<Tensor<DataType, 5, ColMajor, IndexType>> gpu_col_major(gpu_data_col_major, tensorColMajorRange);
+ TensorMap<Tensor<DataType, 5, RowMajor, IndexType>> gpu_row_major(gpu_data_row_major, tensorRowMajorRange);
+
+ sycl_device.memcpyHostToDevice(gpu_data_col_major, tensor_col_major.data(),(tensor_col_major.size())*sizeof(DataType));
+ gpu_row_major.device(sycl_device)=gpu_col_major.swap_layout();
+ sycl_device.memcpyDeviceToHost(tensor_row_major.data(), gpu_data_row_major, (tensor_col_major.size())*sizeof(DataType));
+
+
+  // entire volume patch: ColMajor
+ array<IndexType, 6> patchColMajorTensorRange={{depth,patch_z, patch_y, patch_x, patch_z*patch_y*patch_x, batch}};
+ Tensor<DataType, 6, DataLayout,IndexType> entire_volume_patch_col_major(patchColMajorTensorRange);
+ size_t patchTensorBuffSize =entire_volume_patch_col_major.size()*sizeof(DataType);
+ DataType* gpu_data_entire_volume_patch_col_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+ TensorMap<Tensor<DataType, 6, DataLayout,IndexType>> gpu_entire_volume_patch_col_major(gpu_data_entire_volume_patch_col_major, patchColMajorTensorRange);
+ gpu_entire_volume_patch_col_major.device(sycl_device)=gpu_col_major.extract_volume_patches(patch_z, patch_y, patch_x);
+ sycl_device.memcpyDeviceToHost(entire_volume_patch_col_major.data(), gpu_data_entire_volume_patch_col_major, patchTensorBuffSize);
+
+
+ VERIFY_IS_EQUAL(entire_volume_patch_col_major.dimension(0), depth);
+ VERIFY_IS_EQUAL(entire_volume_patch_col_major.dimension(1), patch_z);
+ VERIFY_IS_EQUAL(entire_volume_patch_col_major.dimension(2), patch_y);
+ VERIFY_IS_EQUAL(entire_volume_patch_col_major.dimension(3), patch_x);
+ VERIFY_IS_EQUAL(entire_volume_patch_col_major.dimension(4), patch_z * patch_y * patch_x);
+ VERIFY_IS_EQUAL(entire_volume_patch_col_major.dimension(5), batch);
+
+ array<IndexType, 6> patchRowMajorTensorRange={{batch,patch_z*patch_y*patch_x, patch_x, patch_y, patch_z, depth}};
+ Tensor<DataType, 6, RowMajor,IndexType> entire_volume_patch_row_major(patchRowMajorTensorRange);
+ patchTensorBuffSize =entire_volume_patch_row_major.size()*sizeof(DataType);
+ DataType* gpu_data_entire_volume_patch_row_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+ TensorMap<Tensor<DataType, 6, RowMajor,IndexType>> gpu_entire_volume_patch_row_major(gpu_data_entire_volume_patch_row_major, patchRowMajorTensorRange);
+ gpu_entire_volume_patch_row_major.device(sycl_device)=gpu_row_major.extract_volume_patches(patch_z, patch_y, patch_x);
+ sycl_device.memcpyDeviceToHost(entire_volume_patch_row_major.data(), gpu_data_entire_volume_patch_row_major, patchTensorBuffSize);
+
+
+ VERIFY_IS_EQUAL(entire_volume_patch_row_major.dimension(0), batch);
+ VERIFY_IS_EQUAL(entire_volume_patch_row_major.dimension(1), patch_z * patch_y * patch_x);
+ VERIFY_IS_EQUAL(entire_volume_patch_row_major.dimension(2), patch_x);
+ VERIFY_IS_EQUAL(entire_volume_patch_row_major.dimension(3), patch_y);
+ VERIFY_IS_EQUAL(entire_volume_patch_row_major.dimension(4), patch_z);
+ VERIFY_IS_EQUAL(entire_volume_patch_row_major.dimension(5), depth);
+
+ const int dz = patch_z - 1;
+ const int dy = patch_y - 1;
+ const int dx = patch_x - 1;
+
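+  // With SAME padding (the default here), the total padding along each spatial
+  // dimension is (patch size - 1); the forward (leading) side takes the larger
+  // half when that total is odd, which is what the verification below assumes.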
+ const int forward_pad_z = dz - dz / 2;
+ const int forward_pad_y = dy - dy / 2;
+ const int forward_pad_x = dx - dx / 2;
+
+ for (int pz = 0; pz < patch_z; pz++) {
+ for (int py = 0; py < patch_y; py++) {
+ for (int px = 0; px < patch_x; px++) {
+ const int patchId = pz + patch_z * (py + px * patch_y);
+ for (int z = 0; z < patch_z; z++) {
+ for (int y = 0; y < patch_y; y++) {
+ for (int x = 0; x < patch_x; x++) {
+ for (int b = 0; b < batch; b++) {
+ for (int d = 0; d < depth; d++) {
+ float expected = 0.0f;
+ float expected_row_major = 0.0f;
+ const int eff_z = z - forward_pad_z + pz;
+ const int eff_y = y - forward_pad_y + py;
+ const int eff_x = x - forward_pad_x + px;
+ if (eff_z >= 0 && eff_y >= 0 && eff_x >= 0 &&
+ eff_z < patch_z && eff_y < patch_y && eff_x < patch_x) {
+ expected = tensor_col_major(d, eff_z, eff_y, eff_x, b);
+ expected_row_major = tensor_row_major(b, eff_x, eff_y, eff_z, d);
+ }
+ VERIFY_IS_EQUAL(entire_volume_patch_col_major(d, z, y, x, patchId, b), expected);
+ VERIFY_IS_EQUAL(entire_volume_patch_row_major(b, patchId, x, y, z, d), expected_row_major);
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ sycl_device.deallocate(gpu_data_col_major);
+ sycl_device.deallocate(gpu_data_row_major);
+ sycl_device.deallocate(gpu_data_entire_volume_patch_col_major);
+ sycl_device.deallocate(gpu_data_entire_volume_patch_row_major);
+}
+
+
+
+template<typename DataType, typename dev_Selector> void sycl_tensor_volume_patch_test_per_device(dev_Selector s){
+  QueueInterface queueInterface(s);
+  auto sycl_device = Eigen::SyclDevice(&queueInterface);
+  std::cout << "Running on " << s.template get_info<cl::sycl::info::device::name>() << std::endl;
+  test_single_voxel_patch_sycl<DataType, int64_t>(sycl_device);
+  test_entire_volume_patch_sycl<DataType, int64_t>(sycl_device);
+}
+void test_cxx11_tensor_volume_patch_sycl()
+{
+  for (const auto& device : Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_tensor_volume_patch_test_per_device<float>(device));
+  }
+}