From b1e312edd607bcfa99192d53f55b2ac974644c44 Mon Sep 17 00:00:00 2001 From: Mehdi Goli Date: Wed, 15 Feb 2017 10:13:01 +0000 Subject: Adding TensorPatch.h for sycl backend. --- unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h | 11 +- unsupported/test/CMakeLists.txt | 1 + unsupported/test/cxx11_tensor_patch_sycl.cpp | 252 +++++++++++++++++++++++ 3 files changed, 262 insertions(+), 2 deletions(-) create mode 100644 unsupported/test/cxx11_tensor_patch_sycl.cpp diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h index 886a254f6..cead2eac8 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h @@ -99,11 +99,11 @@ struct TensorEvaluator, Device> }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device) + : m_impl(op.expression(), device), patch_dims(op.patch_dims()) { Index num_patches = 1; const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); - const PatchDim& patch_dims = op.patch_dims(); + // const PatchDim& patch_dims = op.patch_dims(); if (static_cast(Layout) == static_cast(ColMajor)) { for (int i = 0; i < NumDims-1; ++i) { m_dimensions[i] = patch_dims[i]; @@ -255,6 +255,11 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + /// required by sycl in order to extract the accessor + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator& impl() const { return m_impl; } + /// required by sycl in order to extract the accessor + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const PatchDim& functor() const { return patch_dims; } + protected: Dimensions m_dimensions; array m_outputStrides; @@ -262,6 +267,8 @@ struct TensorEvaluator, Device> array m_patchStrides; TensorEvaluator m_impl; + // required by sycl + const PatchDim& patch_dims; }; } // end namespace Eigen diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index 003c9de0b..d01233fb2 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -167,6 +167,7 @@ if(EIGEN_TEST_CXX11) ei_add_test_sycl(cxx11_tensor_convolution_sycl "-std=c++11") ei_add_test_sycl(cxx11_tensor_striding_sycl "-std=c++11") ei_add_test_sycl(cxx11_tensor_chipping_sycl "-std=c++11") + ei_add_test_sycl(cxx11_tensor_patch_sycl "-std=c++11") endif(EIGEN_TEST_SYCL) # It should be safe to always run these tests as there is some fallback code for # older compiler that don't support cxx11. diff --git a/unsupported/test/cxx11_tensor_patch_sycl.cpp b/unsupported/test/cxx11_tensor_patch_sycl.cpp new file mode 100644 index 000000000..b75219a5b --- /dev/null +++ b/unsupported/test/cxx11_tensor_patch_sycl.cpp @@ -0,0 +1,252 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: +// Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
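+
+// The test below maps host tensors onto device buffers obtained from
+// SyclDevice::allocate via TensorMap, evaluates tensor.extract_patches(patch_dims)
+// on the device through .device(sycl_device), copies the result back with
+// memcpyDeviceToHost, and checks the patch dimensions and values for both
+// ColMajor and RowMajor layouts.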
+ + +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX +#define EIGEN_TEST_FUNC cxx11_tensor_patch_sycl +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t +#define EIGEN_USE_SYCL + +#include "main.h" + +#include + +using Eigen::Tensor; + +template +static void test_simple_patch_sycl(const Eigen::SyclDevice& sycl_device){ + + IndexType sizeDim1 = 2; + IndexType sizeDim2 = 3; + IndexType sizeDim3 = 5; + IndexType sizeDim4 = 7; + array tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}}; + array patchTensorRange; + if (DataLayout == ColMajor) { + patchTensorRange = {{1, 1, 1, 1, sizeDim1*sizeDim2*sizeDim3*sizeDim4}}; + }else{ + patchTensorRange = {{sizeDim1*sizeDim2*sizeDim3*sizeDim4,1, 1, 1, 1}}; + } + + Tensor tensor(tensorRange); + Tensor no_patch(patchTensorRange); + + tensor.setRandom(); + + array patch_dims; + patch_dims[0] = 1; + patch_dims[1] = 1; + patch_dims[2] = 1; + patch_dims[3] = 1; + + const size_t tensorBuffSize =tensor.size()*sizeof(DataType); + size_t patchTensorBuffSize =no_patch.size()*sizeof(DataType); + DataType* gpu_data_tensor = static_cast(sycl_device.allocate(tensorBuffSize)); + DataType* gpu_data_no_patch = static_cast(sycl_device.allocate(patchTensorBuffSize)); + + TensorMap> gpu_tensor(gpu_data_tensor, tensorRange); + TensorMap> gpu_no_patch(gpu_data_no_patch, patchTensorRange); + + sycl_device.memcpyHostToDevice(gpu_data_tensor, tensor.data(), tensorBuffSize); + gpu_no_patch.device(sycl_device)=gpu_tensor.extract_patches(patch_dims); + sycl_device.memcpyDeviceToHost(no_patch.data(), gpu_data_no_patch, patchTensorBuffSize); + + if (DataLayout == ColMajor) { + VERIFY_IS_EQUAL(no_patch.dimension(0), 1); + VERIFY_IS_EQUAL(no_patch.dimension(1), 1); + VERIFY_IS_EQUAL(no_patch.dimension(2), 1); + VERIFY_IS_EQUAL(no_patch.dimension(3), 1); + VERIFY_IS_EQUAL(no_patch.dimension(4), tensor.size()); + } else { + VERIFY_IS_EQUAL(no_patch.dimension(0), tensor.size()); + VERIFY_IS_EQUAL(no_patch.dimension(1), 1); + VERIFY_IS_EQUAL(no_patch.dimension(2), 1); + VERIFY_IS_EQUAL(no_patch.dimension(3), 1); + VERIFY_IS_EQUAL(no_patch.dimension(4), 1); + } + + for (int i = 0; i < tensor.size(); ++i) { + VERIFY_IS_EQUAL(tensor.data()[i], no_patch.data()[i]); + } + patch_dims[0] = 2; + patch_dims[1] = 3; + patch_dims[2] = 5; + patch_dims[3] = 7; + if (DataLayout == ColMajor) { + patchTensorRange = {{sizeDim1,sizeDim2,sizeDim3,sizeDim4,1}}; + }else{ + patchTensorRange = {{1,sizeDim1,sizeDim2,sizeDim3,sizeDim4}}; + } + Tensor single_patch(patchTensorRange); + patchTensorBuffSize =single_patch.size()*sizeof(DataType); + DataType* gpu_data_single_patch = static_cast(sycl_device.allocate(patchTensorBuffSize)); + TensorMap> gpu_single_patch(gpu_data_single_patch, patchTensorRange); + + gpu_single_patch.device(sycl_device)=gpu_tensor.extract_patches(patch_dims); + sycl_device.memcpyDeviceToHost(single_patch.data(), gpu_data_single_patch, patchTensorBuffSize); + + if (DataLayout == ColMajor) { + VERIFY_IS_EQUAL(single_patch.dimension(0), 2); + VERIFY_IS_EQUAL(single_patch.dimension(1), 3); + VERIFY_IS_EQUAL(single_patch.dimension(2), 5); + VERIFY_IS_EQUAL(single_patch.dimension(3), 7); + VERIFY_IS_EQUAL(single_patch.dimension(4), 1); + } else { + VERIFY_IS_EQUAL(single_patch.dimension(0), 1); + VERIFY_IS_EQUAL(single_patch.dimension(1), 2); + VERIFY_IS_EQUAL(single_patch.dimension(2), 3); + VERIFY_IS_EQUAL(single_patch.dimension(3), 5); + VERIFY_IS_EQUAL(single_patch.dimension(4), 7); + } + + for (int i = 0; i < tensor.size(); ++i) { + VERIFY_IS_EQUAL(tensor.data()[i], 
single_patch.data()[i]); + } + + + + + + patch_dims[0] = 1; + patch_dims[1] = 2; + patch_dims[2] = 2; + patch_dims[3] = 1; + if (DataLayout == ColMajor) { + patchTensorRange = {{1,2,2,1,2*2*4*7}}; + }else{ + patchTensorRange = {{2*2*4*7, 1, 2,2,1}}; + } + Tensor twod_patch(patchTensorRange); + patchTensorBuffSize =twod_patch.size()*sizeof(DataType); + DataType* gpu_data_twod_patch = static_cast(sycl_device.allocate(patchTensorBuffSize)); + TensorMap> gpu_twod_patch(gpu_data_twod_patch, patchTensorRange); + + gpu_twod_patch.device(sycl_device)=gpu_tensor.extract_patches(patch_dims); + sycl_device.memcpyDeviceToHost(twod_patch.data(), gpu_data_twod_patch, patchTensorBuffSize); + + if (DataLayout == ColMajor) { + VERIFY_IS_EQUAL(twod_patch.dimension(0), 1); + VERIFY_IS_EQUAL(twod_patch.dimension(1), 2); + VERIFY_IS_EQUAL(twod_patch.dimension(2), 2); + VERIFY_IS_EQUAL(twod_patch.dimension(3), 1); + VERIFY_IS_EQUAL(twod_patch.dimension(4), 2*2*4*7); + } else { + VERIFY_IS_EQUAL(twod_patch.dimension(0), 2*2*4*7); + VERIFY_IS_EQUAL(twod_patch.dimension(1), 1); + VERIFY_IS_EQUAL(twod_patch.dimension(2), 2); + VERIFY_IS_EQUAL(twod_patch.dimension(3), 2); + VERIFY_IS_EQUAL(twod_patch.dimension(4), 1); + } + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 2; ++j) { + for (int k = 0; k < 4; ++k) { + for (int l = 0; l < 7; ++l) { + int patch_loc; + if (DataLayout == ColMajor) { + patch_loc = i + 2 * (j + 2 * (k + 4 * l)); + } else { + patch_loc = l + 7 * (k + 4 * (j + 2 * i)); + } + for (int x = 0; x < 2; ++x) { + for (int y = 0; y < 2; ++y) { + if (DataLayout == ColMajor) { + VERIFY_IS_EQUAL(tensor(i,j+x,k+y,l), twod_patch(0,x,y,0,patch_loc)); + } else { + VERIFY_IS_EQUAL(tensor(i,j+x,k+y,l), twod_patch(patch_loc,0,x,y,0)); + } + } + } + } + } + } + } + + patch_dims[0] = 1; + patch_dims[1] = 2; + patch_dims[2] = 3; + patch_dims[3] = 5; + + if (DataLayout == ColMajor) { + patchTensorRange = {{1,2,3,5,2*2*3*3}}; + }else{ + patchTensorRange = {{2*2*3*3, 1, 2,3,5}}; + } + Tensor threed_patch(patchTensorRange); + patchTensorBuffSize =threed_patch.size()*sizeof(DataType); + DataType* gpu_data_threed_patch = static_cast(sycl_device.allocate(patchTensorBuffSize)); + TensorMap> gpu_threed_patch(gpu_data_threed_patch, patchTensorRange); + + gpu_threed_patch.device(sycl_device)=gpu_tensor.extract_patches(patch_dims); + sycl_device.memcpyDeviceToHost(threed_patch.data(), gpu_data_threed_patch, patchTensorBuffSize); + + if (DataLayout == ColMajor) { + VERIFY_IS_EQUAL(threed_patch.dimension(0), 1); + VERIFY_IS_EQUAL(threed_patch.dimension(1), 2); + VERIFY_IS_EQUAL(threed_patch.dimension(2), 3); + VERIFY_IS_EQUAL(threed_patch.dimension(3), 5); + VERIFY_IS_EQUAL(threed_patch.dimension(4), 2*2*3*3); + } else { + VERIFY_IS_EQUAL(threed_patch.dimension(0), 2*2*3*3); + VERIFY_IS_EQUAL(threed_patch.dimension(1), 1); + VERIFY_IS_EQUAL(threed_patch.dimension(2), 2); + VERIFY_IS_EQUAL(threed_patch.dimension(3), 3); + VERIFY_IS_EQUAL(threed_patch.dimension(4), 5); + } + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 2; ++j) { + for (int k = 0; k < 3; ++k) { + for (int l = 0; l < 3; ++l) { + int patch_loc; + if (DataLayout == ColMajor) { + patch_loc = i + 2 * (j + 2 * (k + 3 * l)); + } else { + patch_loc = l + 3 * (k + 3 * (j + 2 * i)); + } + for (int x = 0; x < 2; ++x) { + for (int y = 0; y < 3; ++y) { + for (int z = 0; z < 5; ++z) { + if (DataLayout == ColMajor) { + VERIFY_IS_EQUAL(tensor(i,j+x,k+y,l+z), threed_patch(0,x,y,z,patch_loc)); + } else { + VERIFY_IS_EQUAL(tensor(i,j+x,k+y,l+z), 
threed_patch(patch_loc,0,x,y,z)); + } + } + } + } + } + } + } + } + sycl_device.deallocate(gpu_data_tensor); + sycl_device.deallocate(gpu_data_no_patch); + sycl_device.deallocate(gpu_data_single_patch); + sycl_device.deallocate(gpu_data_twod_patch); + sycl_device.deallocate(gpu_data_threed_patch); +} + +template void sycl_tensor_patch_test_per_device(dev_Selector s){ + QueueInterface queueInterface(s); + auto sycl_device = Eigen::SyclDevice(&queueInterface); + test_simple_patch_sycl(sycl_device); + test_simple_patch_sycl(sycl_device); +} +void test_cxx11_tensor_patch_sycl() +{ + for (const auto& device :Eigen::get_sycl_supported_devices()) { + CALL_SUBTEST(sycl_tensor_patch_test_per_device(device)); + } +} -- cgit v1.2.3 From 91982b91c02deb5e1ce557bbc5c96fee19c636ed Mon Sep 17 00:00:00 2001 From: Mehdi Goli Date: Wed, 15 Feb 2017 16:28:12 +0000 Subject: Adding TensorLayoutSwapOp for sycl. --- .../Tensor/TensorSyclConvertToDeviceExpression.h | 29 +++-- .../CXX11/src/Tensor/TensorSyclExprConstructor.h | 21 +++- .../CXX11/src/Tensor/TensorSyclExtractAccessor.h | 15 +++ .../CXX11/src/Tensor/TensorSyclExtractFunctors.h | 29 ++--- .../Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h | 14 ++- .../CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h | 17 +-- unsupported/test/CMakeLists.txt | 1 + unsupported/test/cxx11_tensor_layout_swap_sycl.cpp | 126 +++++++++++++++++++++ unsupported/test/cxx11_tensor_patch_sycl.cpp | 9 +- 9 files changed, 217 insertions(+), 44 deletions(-) create mode 100644 unsupported/test/cxx11_tensor_layout_swap_sycl.cpp diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h index ee8f3c9c2..ff5097141 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h @@ -91,27 +91,34 @@ ASSIGNCONVERT(, false) #undef ASSIGNCONVERT /// specialisation of the \ref ConvertToDeviceExpression struct when the node -/// type is either TensorForcedEvalOp or TensorEvalToOp +/// type is TensorEvalToOp #define KERNELBROKERCONVERT(CVQual, Res, ExprNode)\ template \ struct ConvertToDeviceExpression > \ : DeviceConvertor{}; -/// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorForcedEvalOp -#define KERNELBROKERCONVERTFORCEDEVAL(CVQual)\ + +KERNELBROKERCONVERT(const, true, TensorEvalToOp) +KERNELBROKERCONVERT(, false, TensorEvalToOp) +#undef KERNELBROKERCONVERT + +/// specialisation of the \ref ConvertToDeviceExpression struct when the node types are TensorForcedEvalOp and TensorLayoutSwapOp +#define KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAP(CVQual, ExprNode)\ template \ -struct ConvertToDeviceExpression > {\ - typedef CVQual TensorForcedEvalOp< typename ConvertToDeviceExpression::Type> Type;\ +struct ConvertToDeviceExpression > {\ + typedef CVQual ExprNode< typename ConvertToDeviceExpression::Type> Type;\ }; -KERNELBROKERCONVERTFORCEDEVAL(const) -KERNELBROKERCONVERTFORCEDEVAL() -#undef KERNELBROKERCONVERTFORCEDEVAL +// TensorForcedEvalOp +KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAP(const,TensorForcedEvalOp) +KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAP(,TensorForcedEvalOp) + +// TensorLayoutSwapOp +KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAP(const,TensorLayoutSwapOp) +KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAP(,TensorLayoutSwapOp) +#undef KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAP -KERNELBROKERCONVERT(const, true, TensorEvalToOp) -KERNELBROKERCONVERT(, false, TensorEvalToOp) -#undef 
KERNELBROKERCONVERT /// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorReductionOp #define KERNELBROKERCONVERTREDUCTION(CVQual)\ diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h index 3b83b1d2c..6b6093fa3 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h @@ -223,7 +223,7 @@ struct ExprConstructor, CVQua Type expr;\ template \ ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple &t)\ - : nestedExpression(funcD.rhsExpr, t), buffer(t), expr(buffer.expr, nestedExpression.expr) {}\ + : nestedExpression(funcD.xprExpr, t), buffer(t), expr(buffer.expr, nestedExpression.expr) {}\ }; EVALTO(const) @@ -386,6 +386,25 @@ SYCLTENSORCHIPPINGOPEXPR() #undef SYCLTENSORCHIPPINGOPEXPR + +// TensorLayoutSwapOp +#define SYCLTENSORLAYOUTSWAPOPEXPR(CVQual)\ +template\ +struct ExprConstructor , CVQual TensorLayoutSwapOp, Params... >{\ + typedef ExprConstructor my_xpr_type;\ + typedef CVQual TensorLayoutSwapOp Type;\ + my_xpr_type xprExpr;\ + Type expr;\ + template \ + ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple &t)\ + : xprExpr(funcD.xprExpr, t), expr(xprExpr.expr) {}\ +}; + +SYCLTENSORLAYOUTSWAPOPEXPR(const) +SYCLTENSORLAYOUTSWAPOPEXPR() +#undef SYCLTENSORLAYOUTSWAPOPEXPR + + /// template deduction for \ref ExprConstructor struct template auto createDeviceExpression(FuncD &funcD, const utility::tuple::Tuple &t) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h index b512d43f6..213dd25ea 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h @@ -226,6 +226,21 @@ SYCLTENSORCHIPPINGOPEXTACC() #undef SYCLTENSORCHIPPINGOPEXTACC +// specialisation of the \ref ExtractAccessor struct when the node type is +/// TensorLayoutSwapOp. +#define SYCLTENSORLAYOUTSWAPOPEXTACC(CVQual)\ +template\ +struct ExtractAccessor, Dev> >{\ + static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator, Dev>& eval)\ + RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.impl()))\ +}; + +SYCLTENSORLAYOUTSWAPOPEXTACC(const) +SYCLTENSORLAYOUTSWAPOPEXTACC() +#undef SYCLTENSORLAYOUTSWAPOPEXTACC + + + /// template deduction for \ref ExtractAccessor template auto createTupleOfAccessors(cl::sycl::handler& cgh, const Evaluator& eval) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h index ee020184b..1506e8189 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h @@ -39,7 +39,6 @@ template struct FunctorExtractor{ EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } FunctorExtractor(const Evaluator& expr) : m_dimensions(expr.dimensions()) {} - }; /// specialisation of the \ref FunctorExtractor struct when the node type does not require anything @@ -143,19 +142,23 @@ SYCLEXTRFUNCASSIGNOP(const) SYCLEXTRFUNCASSIGNOP() #undef SYCLEXTRFUNCASSIGNOP -/// specialisation of the \ref FunctorExtractor struct when the node type is -/// TensorEvalToOp, This is an specialisation without OP so it has to be separated. 
-#define SYCLEXTRFUNCEVALTOOP(CVQual)\ -template \ -struct FunctorExtractor, Dev> > {\ - FunctorExtractor > rhsExpr;\ - FunctorExtractor(const TensorEvaluator, Dev>& expr)\ - : rhsExpr(expr.impl()) {}\ +/// specialisation of the \ref FunctorExtractor struct when the node types are +/// TensorEvalToOp, TensorLayoutSwapOp. This is an specialisation without OP so it has to be separated. +#define SYCLEXTRFUNCEVALTOOPSWAPLAYOUT(CVQual, ExprNode)\ +template \ +struct FunctorExtractor, Dev> > {\ + FunctorExtractor > xprExpr;\ + FunctorExtractor(const TensorEvaluator, Dev>& expr)\ + : xprExpr(expr.impl()) {}\ }; - -SYCLEXTRFUNCEVALTOOP(const) -SYCLEXTRFUNCEVALTOOP() -#undef SYCLEXTRFUNCEVALTOOP +//TensorEvalToOp +SYCLEXTRFUNCEVALTOOPSWAPLAYOUT(const, TensorEvalToOp) +SYCLEXTRFUNCEVALTOOPSWAPLAYOUT(, TensorEvalToOp) +// TensorLayoutSwapOp +SYCLEXTRFUNCEVALTOOPSWAPLAYOUT(const, TensorLayoutSwapOp) +SYCLEXTRFUNCEVALTOOPSWAPLAYOUT(, TensorLayoutSwapOp) + +#undef SYCLEXTRFUNCEVALTOOPSWAPLAYOUT template struct DimConstr { template diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h index a1c112f4d..15729310d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h @@ -94,15 +94,17 @@ SYCLFORCEDEVALLEAFCOUNT() #undef SYCLFORCEDEVALLEAFCOUNT /// specialisation of the \ref LeafCount struct when the node type is TensorEvalToOp -#define EVALTOLEAFCOUNT(CVQual)\ +#define EVALTOLAYOUTSWAPLEAFCOUNT(CVQual , ExprNode, Num)\ template \ -struct LeafCount > {\ - static const size_t Count = 1 + CategoryCount::Count;\ +struct LeafCount > {\ + static const size_t Count = Num + CategoryCount::Count;\ }; -EVALTOLEAFCOUNT(const) -EVALTOLEAFCOUNT() -#undef EVALTOLEAFCOUNT +EVALTOLAYOUTSWAPLEAFCOUNT(const, TensorEvalToOp, 1) +EVALTOLAYOUTSWAPLEAFCOUNT(, TensorEvalToOp, 1) +EVALTOLAYOUTSWAPLEAFCOUNT(const, TensorLayoutSwapOp, 0) +EVALTOLAYOUTSWAPLEAFCOUNT(, TensorLayoutSwapOp, 0) +#undef EVALTOLAYOUTSWAPLEAFCOUNT /// specialisation of the \ref LeafCount struct when the node type is const TensorReductionOp #define REDUCTIONLEAFCOUNT(CVQual)\ diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h index 74566dcee..ba0d17e0c 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h @@ -144,16 +144,19 @@ FORCEDEVAL() #undef FORCEDEVAL /// specialisation of the \ref PlaceHolderExpression when the node is -/// TensorEvalToOp -#define EVALTO(CVQual)\ +/// TensorEvalToOp, TensorLayoutSwapOp +#define EVALTOLAYOUTSWAP(CVQual, ExprNode)\ template \ -struct PlaceHolderExpression, N> {\ - typedef CVQual TensorEvalToOp::ArgType> Type;\ +struct PlaceHolderExpression, N> {\ + typedef CVQual ExprNode::ArgType> Type;\ }; -EVALTO(const) -EVALTO() -#undef EVALTO +EVALTOLAYOUTSWAP(const, TensorEvalToOp) +EVALTOLAYOUTSWAP(, TensorEvalToOp) +EVALTOLAYOUTSWAP(const, TensorLayoutSwapOp) +EVALTOLAYOUTSWAP(, TensorLayoutSwapOp) + +#undef EVALTOLAYOUTSWAP /// specialisation of the \ref PlaceHolderExpression when the node is diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index d01233fb2..57580f805 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -168,6 +168,7 @@ if(EIGEN_TEST_CXX11) ei_add_test_sycl(cxx11_tensor_striding_sycl "-std=c++11") 
ei_add_test_sycl(cxx11_tensor_chipping_sycl "-std=c++11") ei_add_test_sycl(cxx11_tensor_patch_sycl "-std=c++11") + ei_add_test_sycl(cxx11_tensor_layout_swap_sycl "-std=c++11") endif(EIGEN_TEST_SYCL) # It should be safe to always run these tests as there is some fallback code for # older compiler that don't support cxx11. diff --git a/unsupported/test/cxx11_tensor_layout_swap_sycl.cpp b/unsupported/test/cxx11_tensor_layout_swap_sycl.cpp new file mode 100644 index 000000000..9e8db8b4b --- /dev/null +++ b/unsupported/test/cxx11_tensor_layout_swap_sycl.cpp @@ -0,0 +1,126 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: +// Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX +#define EIGEN_TEST_FUNC cxx11_tensor_layout_swap_sycl +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t +#define EIGEN_USE_SYCL + +#include "main.h" + +#include + +using Eigen::Tensor; + +template +static void test_simple_swap_sycl(const Eigen::SyclDevice& sycl_device) +{ + IndexType sizeDim1 = 2; + IndexType sizeDim2 = 3; + IndexType sizeDim3 = 7; + array tensorColRange = {{sizeDim1, sizeDim2, sizeDim3}}; + array tensorRowRange = {{sizeDim3, sizeDim2, sizeDim1}}; + + + Tensor tensor1(tensorColRange); + Tensor tensor2(tensorRowRange); + tensor1.setRandom(); + + DataType* gpu_data1 = static_cast(sycl_device.allocate(tensor1.size()*sizeof(DataType))); + DataType* gpu_data2 = static_cast(sycl_device.allocate(tensor2.size()*sizeof(DataType))); + TensorMap> gpu1(gpu_data1, tensorColRange); + TensorMap> gpu2(gpu_data2, tensorRowRange); + + sycl_device.memcpyHostToDevice(gpu_data1, tensor1.data(),(tensor1.size())*sizeof(DataType)); + gpu2.device(sycl_device)=gpu1.swap_layout(); + sycl_device.memcpyDeviceToHost(tensor2.data(), gpu_data2,(tensor2.size())*sizeof(DataType)); + + +// Tensor tensor(2,3,7); + //tensor.setRandom(); + +// Tensor tensor2 = tensor.swap_layout(); + VERIFY_IS_EQUAL(tensor1.dimension(0), tensor2.dimension(2)); + VERIFY_IS_EQUAL(tensor1.dimension(1), tensor2.dimension(1)); + VERIFY_IS_EQUAL(tensor1.dimension(2), tensor2.dimension(0)); + + for (IndexType i = 0; i < 2; ++i) { + for (IndexType j = 0; j < 3; ++j) { + for (IndexType k = 0; k < 7; ++k) { + VERIFY_IS_EQUAL(tensor1(i,j,k), tensor2(k,j,i)); + } + } + } + sycl_device.deallocate(gpu_data1); + sycl_device.deallocate(gpu_data2); +} + +template +static void test_swap_as_lvalue_sycl(const Eigen::SyclDevice& sycl_device) +{ + + IndexType sizeDim1 = 2; + IndexType sizeDim2 = 3; + IndexType sizeDim3 = 7; + array tensorColRange = {{sizeDim1, sizeDim2, sizeDim3}}; + array tensorRowRange = {{sizeDim3, sizeDim2, sizeDim1}}; + + Tensor tensor1(tensorColRange); + Tensor tensor2(tensorRowRange); + tensor1.setRandom(); + + DataType* gpu_data1 = static_cast(sycl_device.allocate(tensor1.size()*sizeof(DataType))); + DataType* gpu_data2 = static_cast(sycl_device.allocate(tensor2.size()*sizeof(DataType))); + TensorMap> gpu1(gpu_data1, tensorColRange); + TensorMap> gpu2(gpu_data2, tensorRowRange); + + sycl_device.memcpyHostToDevice(gpu_data1, tensor1.data(),(tensor1.size())*sizeof(DataType)); + gpu2.swap_layout().device(sycl_device)=gpu1; + 
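+  // Writing through swap_layout() on the left-hand side stores gpu1 into gpu2 with
+  // the dimension order reversed (the RowMajor view of the ColMajor data), so the
+  // host-side loop below expects tensor1(i,j,k) == tensor2(k,j,i).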
sycl_device.memcpyDeviceToHost(tensor2.data(), gpu_data2,(tensor2.size())*sizeof(DataType)); + + +// Tensor tensor(2,3,7); +// tensor.setRandom(); + + //Tensor tensor2(7,3,2); +// tensor2.swap_layout() = tensor; + VERIFY_IS_EQUAL(tensor1.dimension(0), tensor2.dimension(2)); + VERIFY_IS_EQUAL(tensor1.dimension(1), tensor2.dimension(1)); + VERIFY_IS_EQUAL(tensor1.dimension(2), tensor2.dimension(0)); + + for (IndexType i = 0; i < 2; ++i) { + for (IndexType j = 0; j < 3; ++j) { + for (IndexType k = 0; k < 7; ++k) { + VERIFY_IS_EQUAL(tensor1(i,j,k), tensor2(k,j,i)); + } + } + } + sycl_device.deallocate(gpu_data1); + sycl_device.deallocate(gpu_data2); +} + + +template void sycl_tensor_layout_swap_test_per_device(dev_Selector s){ + QueueInterface queueInterface(s); + auto sycl_device = Eigen::SyclDevice(&queueInterface); + test_simple_swap_sycl(sycl_device); + test_swap_as_lvalue_sycl(sycl_device); +} +void test_cxx11_tensor_layout_swap_sycl() +{ + for (const auto& device :Eigen::get_sycl_supported_devices()) { + CALL_SUBTEST(sycl_tensor_layout_swap_test_per_device(device)); + } +} diff --git a/unsupported/test/cxx11_tensor_patch_sycl.cpp b/unsupported/test/cxx11_tensor_patch_sycl.cpp index b75219a5b..88a29cb31 100644 --- a/unsupported/test/cxx11_tensor_patch_sycl.cpp +++ b/unsupported/test/cxx11_tensor_patch_sycl.cpp @@ -12,7 +12,6 @@ // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - #define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_COMPLEX #define EIGEN_TEST_FUNC cxx11_tensor_patch_sycl @@ -80,10 +79,12 @@ static void test_simple_patch_sycl(const Eigen::SyclDevice& sycl_device){ for (int i = 0; i < tensor.size(); ++i) { VERIFY_IS_EQUAL(tensor.data()[i], no_patch.data()[i]); } + patch_dims[0] = 2; patch_dims[1] = 3; patch_dims[2] = 5; patch_dims[3] = 7; + if (DataLayout == ColMajor) { patchTensorRange = {{sizeDim1,sizeDim2,sizeDim3,sizeDim4,1}}; }else{ @@ -114,15 +115,11 @@ static void test_simple_patch_sycl(const Eigen::SyclDevice& sycl_device){ for (int i = 0; i < tensor.size(); ++i) { VERIFY_IS_EQUAL(tensor.data()[i], single_patch.data()[i]); } - - - - - patch_dims[0] = 1; patch_dims[1] = 2; patch_dims[2] = 2; patch_dims[3] = 1; + if (DataLayout == ColMajor) { patchTensorRange = {{1,2,2,1,2*2*4*7}}; }else{ -- cgit v1.2.3 From 79ebc8f76137f151c78b4f61cd99fae62bf6c34f Mon Sep 17 00:00:00 2001 From: Mehdi Goli Date: Mon, 20 Feb 2017 12:11:05 +0000 Subject: Adding Sycl backend for TensorImagePatchOP.h; adding Sycl backend for TensorInflation.h. 
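
This change threads TensorImagePatchOp and TensorInflation through the SYCL
expression machinery: the device-expression conversion, expression construction,
accessor extraction, functor extraction, leaf count and placeholder-expression
traits gain TensorImagePatchOp specialisations, the image-patch evaluator keeps a
reference to its originating expression (exposed via xpr()) so the functor
extractor can rebuild the op on the device with the same patch, stride and padding
parameters, and the inflation evaluator exposes impl()/functor() so its accessor
and strides can be extracted.

For reference, the sketch below shows the kind of host-side expression this
enables. It is only an illustration modelled on the new tests: it assumes a
SYCL-enabled build (EIGEN_USE_SYCL) with at least one supported device, and the
function and variable names (run_image_patch_on_sycl, gpu_in, gpu_out, ...) are
made up for the example.

    #define EIGEN_USE_SYCL
    #include <unsupported/Eigen/CXX11/Tensor>
    using namespace Eigen;

    // Evaluate a 1x1 image-patch extraction on every supported SYCL device,
    // following the allocate / TensorMap / memcpy pattern used by the new tests.
    void run_image_patch_on_sycl() {
      typedef float DataType;
      typedef int64_t IndexType;
      for (const auto& device : get_sycl_supported_devices()) {
        QueueInterface queueInterface(device);
        SyclDevice sycl_device(&queueInterface);

        array<IndexType, 4> in_range  = {{2, 3, 5, 7}};         // depth, rows, cols, batch
        array<IndexType, 5> out_range = {{2, 1, 1, 3 * 5, 7}};  // 1x1 patches, ColMajor
        Tensor<DataType, 4, ColMajor, IndexType> in(in_range);
        Tensor<DataType, 5, ColMajor, IndexType> out(out_range);
        in.setRandom();

        DataType* gpu_in  = static_cast<DataType*>(sycl_device.allocate(in.size()  * sizeof(DataType)));
        DataType* gpu_out = static_cast<DataType*>(sycl_device.allocate(out.size() * sizeof(DataType)));
        TensorMap<Tensor<DataType, 4, ColMajor, IndexType> > gpu_in_map(gpu_in, in_range);
        TensorMap<Tensor<DataType, 5, ColMajor, IndexType> > gpu_out_map(gpu_out, out_range);

        sycl_device.memcpyHostToDevice(gpu_in, in.data(), in.size() * sizeof(DataType));
        // The image-patch expression is now convertible to a device expression,
        // so it can be assigned through .device(sycl_device).
        gpu_out_map.device(sycl_device) = gpu_in_map.extract_image_patches(1, 1);
        sycl_device.memcpyDeviceToHost(out.data(), gpu_out, out.size() * sizeof(DataType));

        sycl_device.deallocate(gpu_in);
        sycl_device.deallocate(gpu_out);
      }
    }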
--- .../Eigen/CXX11/src/Tensor/TensorImagePatch.h | 26 +- .../Eigen/CXX11/src/Tensor/TensorInflation.h | 6 + .../Tensor/TensorSyclConvertToDeviceExpression.h | 14 + .../CXX11/src/Tensor/TensorSyclExprConstructor.h | 18 + .../CXX11/src/Tensor/TensorSyclExtractAccessor.h | 14 + .../CXX11/src/Tensor/TensorSyclExtractFunctors.h | 36 +- .../Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h | 10 + .../CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h | 14 + unsupported/test/CMakeLists.txt | 2 + .../test/cxx11_tensor_image_patchOP_sycl.cpp | 1092 ++++++++++++++++++++ unsupported/test/cxx11_tensor_inflation_sycl.cpp | 136 +++ 11 files changed, 1356 insertions(+), 12 deletions(-) create mode 100644 unsupported/test/cxx11_tensor_image_patchOP_sycl.cpp create mode 100644 unsupported/test/cxx11_tensor_inflation_sycl.cpp diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h index 566856ed2..2fb6b84b9 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h @@ -70,12 +70,8 @@ class TensorImagePatchOp : public TensorBase, Device> }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device) + : m_impl(op.expression(), device), m_op(op) { EIGEN_STATIC_ASSERT((NumDims >= 4), YOU_MADE_A_PROGRAMMING_MISTAKE); @@ -241,6 +238,8 @@ struct TensorEvaluator, Device> break; default: eigen_assert(false && "unexpected padding"); + m_outputCols=0; // silence the uninitialised warnig; + m_outputRows=0; //// silence the uninitialised warnig; } } eigen_assert(m_outputRows > 0); @@ -420,7 +419,10 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } - const TensorEvaluator& impl() const { return m_impl; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator& impl() const { return m_impl; } + // required by sycl + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const XprType& xpr() const { return m_op; } + Index rowPaddingTop() const { return m_rowPaddingTop; } Index colPaddingLeft() const { return m_colPaddingLeft; } @@ -501,6 +503,8 @@ struct TensorEvaluator, Device> Scalar m_paddingValue; TensorEvaluator m_impl; + // required for sycl + const XprType& m_op; }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h index f391fb9ee..b6bf05fed 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h @@ -215,6 +215,12 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + /// required by sycl in order to extract the accessor + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator& impl() const { return m_impl; } + /// required by sycl in order to extract the accessor + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Strides& functor() const { return m_strides; } + + protected: Dimensions m_dimensions; array m_outputStrides; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h index ff5097141..5b4a9af9f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h @@ -165,6 +165,20 @@ KERNELBROKERCONVERTCHIPPINGOP() + +/// specialisation of the \ref ConvertToDeviceExpression struct when the node type is 
TensorImagePatchOp +#define KERNELBROKERCONVERTIMAGEPATCHOP(CVQual)\ +template\ +struct ConvertToDeviceExpression >{\ + typedef CVQual TensorImagePatchOp::Type> Type;\ +}; +KERNELBROKERCONVERTIMAGEPATCHOP(const) +KERNELBROKERCONVERTIMAGEPATCHOP() +#undef KERNELBROKERCONVERTIMAGEPATCHOP + + + + } // namespace internal } // namespace TensorSycl } // namespace Eigen diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h index 6b6093fa3..57a10d06b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h @@ -385,6 +385,24 @@ SYCLTENSORCHIPPINGOPEXPR(const) SYCLTENSORCHIPPINGOPEXPR() #undef SYCLTENSORCHIPPINGOPEXPR +// TensorImagePatchOp +#define SYCLTENSORIMAGEPATCHOPEXPR(CVQual)\ +template\ +struct ExprConstructor, CVQual TensorImagePatchOp, Params... > {\ + typedef ExprConstructor my_xpr_type;\ + typedef CVQual TensorImagePatchOp Type;\ + my_xpr_type xprExpr;\ + Type expr;\ + template \ + ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple &t)\ + : xprExpr(funcD.xprExpr, t), expr(xprExpr.expr, funcD.m_patch_rows, funcD.m_patch_cols, funcD.m_row_strides, funcD.m_col_strides,\ + funcD.m_in_row_strides, funcD.m_in_col_strides, funcD.m_row_inflate_strides, funcD.m_col_inflate_strides, \ + funcD.m_padding_top, funcD.m_padding_bottom, funcD.m_padding_left, funcD.m_padding_right, funcD.m_padding_value, funcD.m_padding_type, funcD.m_padding_explicit){}\ +}; + +SYCLTENSORIMAGEPATCHOPEXPR(const) +SYCLTENSORIMAGEPATCHOPEXPR() +#undef SYCLTENSORIMAGEPATCHOPEXPR // TensorLayoutSwapOp diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h index 213dd25ea..2be6f3710 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h @@ -226,6 +226,20 @@ SYCLTENSORCHIPPINGOPEXTACC() #undef SYCLTENSORCHIPPINGOPEXTACC +// specialisation of the \ref ExtractAccessor struct when the node type is +/// TensorImagePatchOp. +#define SYCLTENSORIMAGEPATCHOPEXTACC(CVQual)\ +template\ +struct ExtractAccessor, Dev> >{\ + static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator, Dev>& eval)\ + RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.impl()))\ +}; + +SYCLTENSORIMAGEPATCHOPEXTACC(const) +SYCLTENSORIMAGEPATCHOPEXTACC() +#undef SYCLTENSORIMAGEPATCHOPEXTACC + + // specialisation of the \ref ExtractAccessor struct when the node type is /// TensorLayoutSwapOp. 
#define SYCLTENSORLAYOUTSWAPOPEXTACC(CVQual)\ diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h index 1506e8189..dbac01138 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h @@ -296,7 +296,7 @@ SYCLEXTRFUNCCONTRACTCONCAT(TensorConcatenationOp, axis(),) //TensorChippingOp #define SYCLEXTRFUNCCHIPPINGOP(CVQual)\ template\ -struct FunctorExtractor, Device>>{\ +struct FunctorExtractor, Device> >{\ FunctorExtractor > xprExpr;\ const DenseIndex m_dim;\ const DenseIndex m_offset;\ @@ -310,6 +310,40 @@ SYCLEXTRFUNCCHIPPINGOP(const) SYCLEXTRFUNCCHIPPINGOP() #undef SYCLEXTRFUNCCHIPPINGOP +#define SYCLEXTRFUNCIMAGEPATCHOP(CVQual)\ +template\ +struct FunctorExtractor, Device> >{\ +typedef CVQual TensorImagePatchOp Self;\ +FunctorExtractor > xprExpr;\ +const DenseIndex m_patch_rows;\ +const DenseIndex m_patch_cols;\ +const DenseIndex m_row_strides;\ +const DenseIndex m_col_strides;\ +const DenseIndex m_in_row_strides;\ +const DenseIndex m_in_col_strides;\ +const DenseIndex m_row_inflate_strides;\ +const DenseIndex m_col_inflate_strides;\ +const bool m_padding_explicit;\ +const DenseIndex m_padding_top;\ +const DenseIndex m_padding_bottom;\ +const DenseIndex m_padding_left;\ +const DenseIndex m_padding_right;\ +const PaddingType m_padding_type;\ +const typename Self::Scalar m_padding_value;\ +FunctorExtractor(const TensorEvaluator& expr)\ +: xprExpr(expr.impl()), m_patch_rows(expr.xpr().patch_rows()), m_patch_cols(expr.xpr().patch_cols()),\ + m_row_strides(expr.xpr().row_strides()), m_col_strides(expr.xpr().col_strides()),\ + m_in_row_strides(expr.xpr().in_row_strides()), m_in_col_strides(expr.xpr().in_col_strides()),\ + m_row_inflate_strides(expr.xpr().row_inflate_strides()), m_col_inflate_strides(expr.xpr().col_inflate_strides()),\ + m_padding_explicit(expr.xpr().padding_explicit()),m_padding_top(expr.xpr().padding_top()),\ + m_padding_bottom(expr.xpr().padding_bottom()), m_padding_left(expr.xpr().padding_left()),\ + m_padding_right(expr.xpr().padding_right()), m_padding_type(expr.xpr().padding_type()),\ + m_padding_value(expr.xpr().padding_value()){}\ +}; + +SYCLEXTRFUNCIMAGEPATCHOP(const) +SYCLEXTRFUNCIMAGEPATCHOP() +#undef SYCLEXTRFUNCIMAGEPATCHOP /// template deduction function for FunctorExtractor template auto inline extractFunctors(const Evaluator& evaluator)-> FunctorExtractor { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h index 15729310d..b8e658824 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h @@ -161,6 +161,16 @@ SLICESTRIDEOPLEAFCOUNT() #undef SLICESTRIDEOPLEAFCOUNT +#define TENSORIMAGEPATCHOPLEAFCOUNT(CVQual)\ +template\ +struct LeafCount >:CategoryCount{}; + + +TENSORIMAGEPATCHOPLEAFCOUNT(const) +TENSORIMAGEPATCHOPLEAFCOUNT() +#undef TENSORIMAGEPATCHOPLEAFCOUNT + + } /// namespace TensorSycl } /// namespace internal } /// namespace Eigen diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h index ba0d17e0c..ab97235ae 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h @@ -221,6 +221,20 @@ SYCLSLICESTRIDEOPPLH() #undef SYCLSLICESTRIDEOPPLH + +/// specialisation of 
the \ref PlaceHolderExpression when the node is +/// TensorImagePatchOp +#define SYCLTENSORIMAGEPATCHOP(CVQual)\ +template\ +struct PlaceHolderExpression, N> {\ + typedef CVQual TensorImagePatchOp::ArgType> Type;\ +}; + +SYCLTENSORIMAGEPATCHOP(const) +SYCLTENSORIMAGEPATCHOP() +#undef SYCLTENSORIMAGEPATCHOP + + /// template deduction for \ref PlaceHolderExpression struct template struct createPlaceHolderExpression { diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index 57580f805..282f9eb55 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -169,6 +169,8 @@ if(EIGEN_TEST_CXX11) ei_add_test_sycl(cxx11_tensor_chipping_sycl "-std=c++11") ei_add_test_sycl(cxx11_tensor_patch_sycl "-std=c++11") ei_add_test_sycl(cxx11_tensor_layout_swap_sycl "-std=c++11") + ei_add_test_sycl(cxx11_tensor_image_patchOP_sycl "-std=c++11") + ei_add_test_sycl(cxx11_tensor_inflation_sycl "-std=c++11") endif(EIGEN_TEST_SYCL) # It should be safe to always run these tests as there is some fallback code for # older compiler that don't support cxx11. diff --git a/unsupported/test/cxx11_tensor_image_patchOP_sycl.cpp b/unsupported/test/cxx11_tensor_image_patchOP_sycl.cpp new file mode 100644 index 000000000..ba6b2f15a --- /dev/null +++ b/unsupported/test/cxx11_tensor_image_patchOP_sycl.cpp @@ -0,0 +1,1092 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
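+
+// The tests below stage the input in device memory, build the RowMajor version of
+// each tensor on the device with swap_layout(), evaluate extract_image_patches(...)
+// through .device(sycl_device) for both layouts, copy the results back and compare
+// them element-wise against reference values computed on the host, including the
+// PADDING_VALID and PADDING_SAME cases.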
+ +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX +#define EIGEN_TEST_FUNC cxx11_tensor_image_patchOP_sycl +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t +#define EIGEN_USE_SYCL + +#include "main.h" +#include + +using Eigen::Tensor; +static const int DataLayout = ColMajor; + +template +static void test_simple_image_patch_sycl(const Eigen::SyclDevice& sycl_device) +{ + IndexType sizeDim1 = 2; + IndexType sizeDim2 = 3; + IndexType sizeDim3 = 5; + IndexType sizeDim4 = 7; + array tensorColMajorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}}; + array tensorRowMajorRange = {{sizeDim4, sizeDim3, sizeDim2, sizeDim1}}; + Tensor tensor_col_major(tensorColMajorRange); + Tensor tensor_row_major(tensorRowMajorRange); + tensor_col_major.setRandom(); + + DataType* gpu_data_col_major = static_cast(sycl_device.allocate(tensor_col_major.size()*sizeof(DataType))); + DataType* gpu_data_row_major = static_cast(sycl_device.allocate(tensor_row_major.size()*sizeof(DataType))); + TensorMap> gpu_col_major(gpu_data_col_major, tensorColMajorRange); + TensorMap> gpu_row_major(gpu_data_row_major, tensorRowMajorRange); + + sycl_device.memcpyHostToDevice(gpu_data_col_major, tensor_col_major.data(),(tensor_col_major.size())*sizeof(DataType)); + gpu_row_major.device(sycl_device)=gpu_col_major.swap_layout(); + sycl_device.memcpyDeviceToHost(tensor_row_major.data(), gpu_data_row_major, (tensor_col_major.size())*sizeof(DataType)); + + VERIFY_IS_EQUAL(tensor_col_major.dimension(0), tensor_row_major.dimension(3)); + VERIFY_IS_EQUAL(tensor_col_major.dimension(1), tensor_row_major.dimension(2)); + VERIFY_IS_EQUAL(tensor_col_major.dimension(2), tensor_row_major.dimension(1)); + VERIFY_IS_EQUAL(tensor_col_major.dimension(3), tensor_row_major.dimension(0)); + + // Single pixel patch: ColMajor + array patchColMajorTensorRange={{sizeDim1, 1, 1, sizeDim2*sizeDim3, sizeDim4}}; + Tensor single_patch_col_major(patchColMajorTensorRange); + size_t patchTensorBuffSize =single_patch_col_major.size()*sizeof(DataType); + DataType* gpu_data_single_patch_col_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); + TensorMap> gpu_single_patch_col_major(gpu_data_single_patch_col_major, patchColMajorTensorRange); + gpu_single_patch_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(1, 1); + sycl_device.memcpyDeviceToHost(single_patch_col_major.data(), gpu_data_single_patch_col_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(single_patch_col_major.dimension(0), 2); + VERIFY_IS_EQUAL(single_patch_col_major.dimension(1), 1); + VERIFY_IS_EQUAL(single_patch_col_major.dimension(2), 1); + VERIFY_IS_EQUAL(single_patch_col_major.dimension(3), 3*5); + VERIFY_IS_EQUAL(single_patch_col_major.dimension(4), 7); + + // Single pixel patch: RowMajor + array patchRowMajorTensorRange={{sizeDim4, sizeDim2*sizeDim3, 1, 1, sizeDim1}}; + Tensor single_patch_row_major(patchRowMajorTensorRange); + patchTensorBuffSize =single_patch_row_major.size()*sizeof(DataType); + DataType* gpu_data_single_patch_row_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); + TensorMap> gpu_single_patch_row_major(gpu_data_single_patch_row_major, patchRowMajorTensorRange); + gpu_single_patch_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(1, 1); + sycl_device.memcpyDeviceToHost(single_patch_row_major.data(), gpu_data_single_patch_row_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(single_patch_row_major.dimension(0), 7); + VERIFY_IS_EQUAL(single_patch_row_major.dimension(1), 3*5); + 
VERIFY_IS_EQUAL(single_patch_row_major.dimension(2), 1); + VERIFY_IS_EQUAL(single_patch_row_major.dimension(3), 1); + VERIFY_IS_EQUAL(single_patch_row_major.dimension(4), 2); + + for (IndexType i = 0; i < tensor_col_major.size(); ++i) { + // ColMajor + if (tensor_col_major.data()[i] != single_patch_col_major.data()[i]) { + std::cout << "Mismatch detected at index colmajor " << i << " : " + << tensor_col_major.data()[i] << " vs " << single_patch_col_major.data()[i] + << std::endl; + } + VERIFY_IS_EQUAL(single_patch_col_major.data()[i], tensor_col_major.data()[i]); + // RowMajor + if (tensor_row_major.data()[i] != single_patch_row_major.data()[i]) { + std::cout << "Mismatch detected at index row major" << i << " : " + << tensor_row_major.data()[i] << " vs " + << single_patch_row_major.data()[i] << std::endl; + } + VERIFY_IS_EQUAL(single_patch_row_major.data()[i], + tensor_row_major.data()[i]); + VERIFY_IS_EQUAL(tensor_col_major.data()[i], tensor_row_major.data()[i]); + VERIFY_IS_EQUAL(single_patch_col_major.data()[i], + single_patch_row_major.data()[i]); + } + + + // Entire image patch: ColMajor + patchColMajorTensorRange={{sizeDim1, sizeDim2, sizeDim3, sizeDim2*sizeDim3, sizeDim4}}; + Tensor entire_image_patch_col_major(patchColMajorTensorRange); + patchTensorBuffSize =entire_image_patch_col_major.size()*sizeof(DataType); + DataType* gpu_data_entire_image_patch_col_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); + TensorMap> gpu_entire_image_patch_col_major(gpu_data_entire_image_patch_col_major, patchColMajorTensorRange); + gpu_entire_image_patch_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(3, 5); + sycl_device.memcpyDeviceToHost(entire_image_patch_col_major.data(), gpu_data_entire_image_patch_col_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(0), 2); + VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(1), 3); + VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(2), 5); + VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(3), 3*5); + VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(4), 7); + + // Entire image patch: RowMajor + patchRowMajorTensorRange={{sizeDim4, sizeDim2*sizeDim3, sizeDim3, sizeDim2, sizeDim1}}; + Tensor entire_image_patch_row_major(patchRowMajorTensorRange); + patchTensorBuffSize =entire_image_patch_row_major.size()*sizeof(DataType); + DataType* gpu_data_entire_image_patch_row_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); + TensorMap> gpu_entire_image_patch_row_major(gpu_data_entire_image_patch_row_major, patchRowMajorTensorRange); + gpu_entire_image_patch_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(3, 5); + sycl_device.memcpyDeviceToHost(entire_image_patch_row_major.data(), gpu_data_entire_image_patch_row_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(0), 7); + VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(1), 3*5); + VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(2), 5); + VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(3), 3); + VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(4), 2); + + for (IndexType i = 0; i < 3; ++i) { + for (IndexType j = 0; j < 5; ++j) { + int patchId = i+3*j; + for (IndexType r = 0; r < 3; ++r) { + for (IndexType c = 0; c < 5; ++c) { + for (IndexType d = 0; d < 2; ++d) { + for (IndexType b = 0; b < 7; ++b) { + DataType expected_col_major = 0.0f; + DataType expected_row_major = 0.0f; + if (r-1+i >= 0 && c-2+j >= 0 && r-1+i < 3 && c-2+j < 5) 
{ + expected_col_major = tensor_col_major(d, r-1+i, c-2+j, b); + expected_row_major = tensor_row_major(b, c-2+j, r-1+i, d); + } + // ColMajor + if (entire_image_patch_col_major(d, r, c, patchId, b) != expected_col_major) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(entire_image_patch_col_major(d, r, c, patchId, b), expected_col_major); + // RowMajor + if (entire_image_patch_row_major(b, patchId, c, r, d) != + expected_row_major) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j + << " r=" << r << " c=" << c << " d=" << d << " b=" << b + << std::endl; + } + VERIFY_IS_EQUAL(entire_image_patch_row_major(b, patchId, c, r, d), + expected_row_major); + // Check that ColMajor and RowMajor agree. + VERIFY_IS_EQUAL(expected_col_major, expected_row_major); + } + } + } + } + } + } + + // 2D patch: ColMajor + patchColMajorTensorRange={{sizeDim1, 2, 2, sizeDim2*sizeDim3, sizeDim4}}; + Tensor twod_patch_col_major(patchColMajorTensorRange); + patchTensorBuffSize =twod_patch_col_major.size()*sizeof(DataType); + DataType* gpu_data_twod_patch_col_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); + TensorMap> gpu_twod_patch_col_major(gpu_data_twod_patch_col_major, patchColMajorTensorRange); + gpu_twod_patch_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(2, 2); + sycl_device.memcpyDeviceToHost(twod_patch_col_major.data(), gpu_data_twod_patch_col_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(twod_patch_col_major.dimension(0), 2); + VERIFY_IS_EQUAL(twod_patch_col_major.dimension(1), 2); + VERIFY_IS_EQUAL(twod_patch_col_major.dimension(2), 2); + VERIFY_IS_EQUAL(twod_patch_col_major.dimension(3), 3*5); + VERIFY_IS_EQUAL(twod_patch_col_major.dimension(4), 7); + + // 2D patch: RowMajor + patchRowMajorTensorRange={{sizeDim4, sizeDim2*sizeDim3, 2, 2, sizeDim1}}; + Tensor twod_patch_row_major(patchRowMajorTensorRange); + patchTensorBuffSize =twod_patch_row_major.size()*sizeof(DataType); + DataType* gpu_data_twod_patch_row_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); + TensorMap> gpu_twod_patch_row_major(gpu_data_twod_patch_row_major, patchRowMajorTensorRange); + gpu_twod_patch_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(2, 2); + sycl_device.memcpyDeviceToHost(twod_patch_row_major.data(), gpu_data_twod_patch_row_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(twod_patch_row_major.dimension(0), 7); + VERIFY_IS_EQUAL(twod_patch_row_major.dimension(1), 3*5); + VERIFY_IS_EQUAL(twod_patch_row_major.dimension(2), 2); + VERIFY_IS_EQUAL(twod_patch_row_major.dimension(3), 2); + VERIFY_IS_EQUAL(twod_patch_row_major.dimension(4), 2); + + + // Based on the calculation described in TensorTraits.h, padding happens to be 0. 
+ IndexType row_padding = 0; + IndexType col_padding = 0; + IndexType stride = 1; + + for (IndexType i = 0; i < 3; ++i) { + for (IndexType j = 0; j < 5; ++j) { + int patchId = i+3*j; + for (IndexType r = 0; r < 2; ++r) { + for (IndexType c = 0; c < 2; ++c) { + for (IndexType d = 0; d < 2; ++d) { + for (IndexType b = 0; b < 7; ++b) { + DataType expected_col_major = 0.0f; + DataType expected_row_major = 0.0f; + IndexType row_offset = r*stride + i - row_padding; + IndexType col_offset = c*stride + j - col_padding; + // ColMajor + if (row_offset >= 0 && col_offset >= 0 && row_offset < tensor_col_major.dimension(1) && col_offset < tensor_col_major.dimension(2)) { + expected_col_major = tensor_col_major(d, row_offset, col_offset, b); + } + if (twod_patch_col_major(d, r, c, patchId, b) != expected_col_major) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(twod_patch_col_major(d, r, c, patchId, b), expected_col_major); + + // RowMajor + if (row_offset >= 0 && col_offset >= 0 && row_offset < tensor_row_major.dimension(2) && col_offset < tensor_row_major.dimension(1)) { + expected_row_major = tensor_row_major(b, col_offset, row_offset, d); + + } + if (twod_patch_row_major(b, patchId, c, r, d) != expected_row_major) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(twod_patch_row_major(b, patchId, c, r, d), expected_row_major); + // Check that ColMajor and RowMajor agree. + VERIFY_IS_EQUAL(expected_col_major, expected_row_major); + } + } + } + } + } + } + + sycl_device.deallocate(gpu_data_col_major); + sycl_device.deallocate(gpu_data_row_major); + sycl_device.deallocate(gpu_data_single_patch_col_major); + sycl_device.deallocate(gpu_data_single_patch_row_major); + sycl_device.deallocate(gpu_data_entire_image_patch_col_major); + sycl_device.deallocate(gpu_data_entire_image_patch_row_major); + sycl_device.deallocate(gpu_data_twod_patch_col_major); + sycl_device.deallocate(gpu_data_twod_patch_row_major); + +} + + +// Verifies VALID padding (no padding) with incrementing values. +template +static void test_patch_padding_valid_sycl(const Eigen::SyclDevice& sycl_device){ + IndexType input_depth = 3; + IndexType input_rows = 3; + IndexType input_cols = 3; + IndexType input_batches = 1; + IndexType ksize = 2; // Corresponds to the Rows and Cols for tensor.extract_image_patches<>. + IndexType stride = 2; // Only same stride is supported. 
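+
+  // PADDING_VALID: patches are taken only where the ksize x ksize window lies
+  // entirely inside the input, so no zero padding enters the reference values
+  // computed below.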
+ + array tensorColMajorRange = {{input_depth, input_rows, input_cols, input_batches}}; + array tensorRowMajorRange = {{input_batches, input_cols, input_rows, input_depth}}; + Tensor tensor_col_major(tensorColMajorRange); + Tensor tensor_row_major(tensorRowMajorRange); + + DataType* gpu_data_col_major = static_cast(sycl_device.allocate(tensor_col_major.size()*sizeof(DataType))); + DataType* gpu_data_row_major = static_cast(sycl_device.allocate(tensor_row_major.size()*sizeof(DataType))); + TensorMap> gpu_col_major(gpu_data_col_major, tensorColMajorRange); + TensorMap> gpu_row_major(gpu_data_row_major, tensorRowMajorRange); + + sycl_device.memcpyHostToDevice(gpu_data_col_major, tensor_col_major.data(),(tensor_col_major.size())*sizeof(DataType)); + gpu_row_major.device(sycl_device)=gpu_col_major.swap_layout(); + sycl_device.memcpyDeviceToHost(tensor_row_major.data(), gpu_data_row_major, (tensor_col_major.size())*sizeof(DataType)); + + VERIFY_IS_EQUAL(tensor_col_major.dimension(0), tensor_row_major.dimension(3)); + VERIFY_IS_EQUAL(tensor_col_major.dimension(1), tensor_row_major.dimension(2)); + VERIFY_IS_EQUAL(tensor_col_major.dimension(2), tensor_row_major.dimension(1)); + VERIFY_IS_EQUAL(tensor_col_major.dimension(3), tensor_row_major.dimension(0)); + + // Initializes tensor with incrementing numbers. + for (IndexType i = 0; i < tensor_col_major.size(); ++i) { + tensor_col_major.data()[i] = i + 1; + } + // ColMajor + array patchColMajorTensorRange={{input_depth, ksize, ksize, 1, input_batches}}; + Tensor result_col_major(patchColMajorTensorRange); + size_t patchTensorBuffSize =result_col_major.size()*sizeof(DataType); + DataType* gpu_data_result_col_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); + TensorMap> gpu_result_col_major(gpu_data_result_col_major, patchColMajorTensorRange); + gpu_result_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(ksize, ksize, stride, stride, 1, 1, PADDING_VALID); + sycl_device.memcpyDeviceToHost(result_col_major.data(), gpu_data_result_col_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(result_col_major.dimension(0), input_depth); // depth + VERIFY_IS_EQUAL(result_col_major.dimension(1), ksize); // kernel rows + VERIFY_IS_EQUAL(result_col_major.dimension(2), ksize); // kernel cols + VERIFY_IS_EQUAL(result_col_major.dimension(3), 1); // number of patches + VERIFY_IS_EQUAL(result_col_major.dimension(4), input_batches); // number of batches + + // RowMajor + array patchRowMajorTensorRange={{input_batches, 1, ksize, ksize, input_depth }}; + Tensor result_row_major(patchRowMajorTensorRange); + patchTensorBuffSize =result_row_major.size()*sizeof(DataType); + DataType* gpu_data_result_row_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); + TensorMap> gpu_result_row_major(gpu_data_result_row_major, patchRowMajorTensorRange); + gpu_result_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(ksize, ksize, stride, stride, 1, 1, PADDING_VALID); + sycl_device.memcpyDeviceToHost(result_row_major.data(), gpu_data_result_row_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(result_col_major.dimension(0), result_row_major.dimension(4)); + VERIFY_IS_EQUAL(result_col_major.dimension(1), result_row_major.dimension(3)); + VERIFY_IS_EQUAL(result_col_major.dimension(2), result_row_major.dimension(2)); + VERIFY_IS_EQUAL(result_col_major.dimension(3), result_row_major.dimension(1)); + VERIFY_IS_EQUAL(result_col_major.dimension(4), result_row_major.dimension(0)); + + // No padding is carried out. 
+ IndexType row_padding = 0; + IndexType col_padding = 0; + + for (IndexType i = 0; (i+stride+ksize-1) < input_rows; i += stride) { // input rows + for (IndexType j = 0; (j+stride+ksize-1) < input_cols; j += stride) { // input cols + int patchId = i+input_rows*j; + for (IndexType r = 0; r < ksize; ++r) { // patch rows + for (IndexType c = 0; c < ksize; ++c) { // patch cols + for (IndexType d = 0; d < input_depth; ++d) { // depth + for (IndexType b = 0; b < input_batches; ++b) { // batch + DataType expected_col_major = 0.0f; + DataType expected_row_major = 0.0f; + IndexType row_offset = r + i - row_padding; + IndexType col_offset = c + j - col_padding; + if (row_offset >= 0 && col_offset >= 0 && row_offset < input_rows && col_offset < input_cols) { + expected_col_major = tensor_col_major(d, row_offset, col_offset, b); + expected_row_major = tensor_row_major(b, col_offset, row_offset, d); + } + // ColMajor + if (result_col_major(d, r, c, patchId, b) != expected_col_major) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(result_col_major(d, r, c, patchId, b), expected_col_major); + // RowMajor + if (result_row_major(b, patchId, c, r, d) != expected_row_major) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(result_row_major(b, patchId, c, r, d), expected_row_major); + // Check that ColMajor and RowMajor agree. + VERIFY_IS_EQUAL(expected_col_major, expected_row_major); + } + } + } + } + } + } + sycl_device.deallocate(gpu_data_col_major); + sycl_device.deallocate(gpu_data_row_major); + sycl_device.deallocate(gpu_data_result_col_major); + sycl_device.deallocate(gpu_data_result_row_major); +} + +// Verifies VALID padding (no padding) with the same value. +template +static void test_patch_padding_valid_same_value_sycl(const Eigen::SyclDevice& sycl_device){ + IndexType input_depth = 1; + IndexType input_rows = 5; + IndexType input_cols = 5; + IndexType input_batches = 2; + IndexType ksize = 3; // Corresponds to the Rows and Cols for tensor.extract_image_patches<>. + IndexType stride = 2; // Only same stride is supported. 
+ // ColMajor + + array tensorColMajorRange = {{input_depth, input_rows, input_cols, input_batches}}; + array tensorRowMajorRange = {{input_batches, input_cols, input_rows, input_depth}}; + Tensor tensor_col_major(tensorColMajorRange); + Tensor tensor_row_major(tensorRowMajorRange); + + DataType* gpu_data_col_major = static_cast(sycl_device.allocate(tensor_col_major.size()*sizeof(DataType))); + DataType* gpu_data_row_major = static_cast(sycl_device.allocate(tensor_row_major.size()*sizeof(DataType))); + TensorMap> gpu_col_major(gpu_data_col_major, tensorColMajorRange); + TensorMap> gpu_row_major(gpu_data_row_major, tensorRowMajorRange); + gpu_col_major.device(sycl_device)=gpu_col_major.constant(11.0f); + gpu_row_major.device(sycl_device)=gpu_col_major.swap_layout(); + sycl_device.memcpyDeviceToHost(tensor_col_major.data(), gpu_data_col_major, (tensor_col_major.size())*sizeof(DataType)); + sycl_device.memcpyDeviceToHost(tensor_row_major.data(), gpu_data_row_major, (tensor_row_major.size())*sizeof(DataType)); + VERIFY_IS_EQUAL(tensor_col_major.dimension(0), tensor_row_major.dimension(3)); + VERIFY_IS_EQUAL(tensor_col_major.dimension(1), tensor_row_major.dimension(2)); + VERIFY_IS_EQUAL(tensor_col_major.dimension(2), tensor_row_major.dimension(1)); + VERIFY_IS_EQUAL(tensor_col_major.dimension(3), tensor_row_major.dimension(0)); + + array patchColMajorTensorRange={{input_depth, ksize, ksize, 4, input_batches}}; + Tensor result_col_major(patchColMajorTensorRange); + size_t patchTensorBuffSize =result_col_major.size()*sizeof(DataType); + DataType* gpu_data_result_col_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); + TensorMap> gpu_result_col_major(gpu_data_result_col_major, patchColMajorTensorRange); + gpu_result_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(ksize, ksize, stride, stride, 1, 1, PADDING_VALID); + sycl_device.memcpyDeviceToHost(result_col_major.data(), gpu_data_result_col_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(result_col_major.dimension(0), input_depth); // depth + VERIFY_IS_EQUAL(result_col_major.dimension(1), ksize); // kernel rows + VERIFY_IS_EQUAL(result_col_major.dimension(2), ksize); // kernel cols + VERIFY_IS_EQUAL(result_col_major.dimension(3), 4); // number of patches + VERIFY_IS_EQUAL(result_col_major.dimension(4), input_batches); // number of batches + + // RowMajor + array patchRowMajorTensorRange={{input_batches, 4, ksize, ksize, input_depth }}; + Tensor result_row_major(patchRowMajorTensorRange); + patchTensorBuffSize =result_row_major.size()*sizeof(DataType); + DataType* gpu_data_result_row_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); + TensorMap> gpu_result_row_major(gpu_data_result_row_major, patchRowMajorTensorRange); + gpu_result_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(ksize, ksize, stride, stride, 1, 1, PADDING_VALID); + sycl_device.memcpyDeviceToHost(result_row_major.data(), gpu_data_result_row_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(result_col_major.dimension(0), result_row_major.dimension(4)); + VERIFY_IS_EQUAL(result_col_major.dimension(1), result_row_major.dimension(3)); + VERIFY_IS_EQUAL(result_col_major.dimension(2), result_row_major.dimension(2)); + VERIFY_IS_EQUAL(result_col_major.dimension(3), result_row_major.dimension(1)); + VERIFY_IS_EQUAL(result_col_major.dimension(4), result_row_major.dimension(0)); + + // No padding is carried out. 
+ IndexType row_padding = 0; + IndexType col_padding = 0; + + for (IndexType i = 0; (i+stride+ksize-1) <= input_rows; i += stride) { // input rows + for (IndexType j = 0; (j+stride+ksize-1) <= input_cols; j += stride) { // input cols + IndexType patchId = i+input_rows*j; + for (IndexType r = 0; r < ksize; ++r) { // patch rows + for (IndexType c = 0; c < ksize; ++c) { // patch cols + for (IndexType d = 0; d < input_depth; ++d) { // depth + for (IndexType b = 0; b < input_batches; ++b) { // batch + DataType expected_col_major = 0.0f; + DataType expected_row_major = 0.0f; + IndexType row_offset = r + i - row_padding; + IndexType col_offset = c + j - col_padding; + if (row_offset >= 0 && col_offset >= 0 && row_offset < input_rows && col_offset < input_cols) { + expected_col_major = tensor_col_major(d, row_offset, col_offset, b); + expected_row_major = tensor_row_major(b, col_offset, row_offset, d); + } + // ColMajor + if (result_col_major(d, r, c, patchId, b) != expected_col_major) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(result_col_major(d, r, c, patchId, b), expected_col_major); + // RowMajor + if (result_row_major(b, patchId, c, r, d) != expected_row_major) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(result_row_major(b, patchId, c, r, d), expected_row_major); + // Check that ColMajor and RowMajor agree. + VERIFY_IS_EQUAL(expected_col_major, expected_row_major); + } + } + } + } + } + } +} + +// Verifies SAME padding. +template +static void test_patch_padding_same_sycl(const Eigen::SyclDevice& sycl_device){ + IndexType input_depth = 3; + IndexType input_rows = 4; + IndexType input_cols = 2; + IndexType input_batches = 1; + IndexType ksize = 2; // Corresponds to the Rows and Cols for tensor.extract_image_patches<>. + IndexType stride = 2; // Only same stride is supported. + + // ColMajor + array tensorColMajorRange = {{input_depth, input_rows, input_cols, input_batches}}; + array tensorRowMajorRange = {{input_batches, input_cols, input_rows, input_depth}}; + Tensor tensor_col_major(tensorColMajorRange); + Tensor tensor_row_major(tensorRowMajorRange); + + DataType* gpu_data_col_major = static_cast(sycl_device.allocate(tensor_col_major.size()*sizeof(DataType))); + DataType* gpu_data_row_major = static_cast(sycl_device.allocate(tensor_row_major.size()*sizeof(DataType))); + TensorMap> gpu_col_major(gpu_data_col_major, tensorColMajorRange); + TensorMap> gpu_row_major(gpu_data_row_major, tensorRowMajorRange); + + sycl_device.memcpyHostToDevice(gpu_data_col_major, tensor_col_major.data(),(tensor_col_major.size())*sizeof(DataType)); + gpu_row_major.device(sycl_device)=gpu_col_major.swap_layout(); + sycl_device.memcpyDeviceToHost(tensor_row_major.data(), gpu_data_row_major, (tensor_col_major.size())*sizeof(DataType)); + + VERIFY_IS_EQUAL(tensor_col_major.dimension(0), tensor_row_major.dimension(3)); + VERIFY_IS_EQUAL(tensor_col_major.dimension(1), tensor_row_major.dimension(2)); + VERIFY_IS_EQUAL(tensor_col_major.dimension(2), tensor_row_major.dimension(1)); + VERIFY_IS_EQUAL(tensor_col_major.dimension(3), tensor_row_major.dimension(0)); + + // Initializes tensor with incrementing numbers. 
+ for (IndexType i = 0; i < tensor_col_major.size(); ++i) { + tensor_col_major.data()[i] = i + 1; + } + +array patchColMajorTensorRange={{input_depth, ksize, ksize, 2, input_batches}}; +Tensor result_col_major(patchColMajorTensorRange); +size_t patchTensorBuffSize =result_col_major.size()*sizeof(DataType); +DataType* gpu_data_result_col_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); +TensorMap> gpu_result_col_major(gpu_data_result_col_major, patchColMajorTensorRange); +gpu_result_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(ksize, ksize, stride, stride, PADDING_SAME); +sycl_device.memcpyDeviceToHost(result_col_major.data(), gpu_data_result_col_major, patchTensorBuffSize); + + + VERIFY_IS_EQUAL(result_col_major.dimension(0), input_depth); // depth + VERIFY_IS_EQUAL(result_col_major.dimension(1), ksize); // kernel rows + VERIFY_IS_EQUAL(result_col_major.dimension(2), ksize); // kernel cols + VERIFY_IS_EQUAL(result_col_major.dimension(3), 2); // number of patches + VERIFY_IS_EQUAL(result_col_major.dimension(4), input_batches); // number of batches + + // RowMajor + + array patchRowMajorTensorRange={{input_batches, 2, ksize, ksize, input_depth }}; + Tensor result_row_major(patchRowMajorTensorRange); + patchTensorBuffSize =result_row_major.size()*sizeof(DataType); + DataType* gpu_data_result_row_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); + TensorMap> gpu_result_row_major(gpu_data_result_row_major, patchRowMajorTensorRange); + gpu_result_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(ksize, ksize, stride, stride, PADDING_SAME); + sycl_device.memcpyDeviceToHost(result_row_major.data(), gpu_data_result_row_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(result_col_major.dimension(0), result_row_major.dimension(4)); + VERIFY_IS_EQUAL(result_col_major.dimension(1), result_row_major.dimension(3)); + VERIFY_IS_EQUAL(result_col_major.dimension(2), result_row_major.dimension(2)); + VERIFY_IS_EQUAL(result_col_major.dimension(3), result_row_major.dimension(1)); + VERIFY_IS_EQUAL(result_col_major.dimension(4), result_row_major.dimension(0)); + + // Based on the calculation described in TensorTraits.h, padding happens to be 0. 
+ IndexType row_padding = 0; + IndexType col_padding = 0; + + for (IndexType i = 0; (i+stride+ksize-1) <= input_rows; i += stride) { // input rows + for (IndexType j = 0; (j+stride+ksize-1) <= input_cols; j += stride) { // input cols + int patchId = i+input_rows*j; + for (IndexType r = 0; r < ksize; ++r) { // patch rows + for (IndexType c = 0; c < ksize; ++c) { // patch cols + for (IndexType d = 0; d < input_depth; ++d) { // depth + for (IndexType b = 0; b < input_batches; ++b) { // batch + DataType expected_col_major = 0.0f; + DataType expected_row_major = 0.0f; + IndexType row_offset = r*stride + i - row_padding; + IndexType col_offset = c*stride + j - col_padding; + if (row_offset >= 0 && col_offset >= 0 && row_offset < input_rows && col_offset < input_cols) { + expected_col_major = tensor_col_major(d, row_offset, col_offset, b); + expected_row_major = tensor_row_major(b, col_offset, row_offset, d); + } + // ColMajor + if (result_col_major(d, r, c, patchId, b) != expected_col_major) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(result_col_major(d, r, c, patchId, b), expected_col_major); + // RowMajor + if (result_row_major(b, patchId, c, r, d) != expected_row_major) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(result_row_major(b, patchId, c, r, d), expected_row_major); + // Check that ColMajor and RowMajor agree. + VERIFY_IS_EQUAL(expected_col_major, expected_row_major); + } + } + } + } + } + } +} + + +template +static void test_patch_no_extra_dim_sycl(const Eigen::SyclDevice& sycl_device){ + + IndexType sizeDim1 = 2; + IndexType sizeDim2 = 3; + IndexType sizeDim3 = 5; + + // ColMajor + array tensorColMajorRange = {{sizeDim1, sizeDim2, sizeDim3}}; + array tensorRowMajorRange = {{sizeDim3, sizeDim2, sizeDim1}}; + Tensor tensor_col_major(tensorColMajorRange); + tensor_col_major.setRandom(); + Tensor tensor_row_major(tensorRowMajorRange); + + DataType* gpu_data_col_major = static_cast(sycl_device.allocate(tensor_col_major.size()*sizeof(DataType))); + DataType* gpu_data_row_major = static_cast(sycl_device.allocate(tensor_row_major.size()*sizeof(DataType))); + TensorMap> gpu_col_major(gpu_data_col_major, tensorColMajorRange); + TensorMap> gpu_row_major(gpu_data_row_major, tensorRowMajorRange); + + sycl_device.memcpyHostToDevice(gpu_data_col_major, tensor_col_major.data(),(tensor_col_major.size())*sizeof(DataType)); + gpu_row_major.device(sycl_device)=gpu_col_major.swap_layout(); + sycl_device.memcpyDeviceToHost(tensor_row_major.data(), gpu_data_row_major, (tensor_row_major.size())*sizeof(DataType)); + + VERIFY_IS_EQUAL(tensor_col_major.dimension(0), tensor_row_major.dimension(2)); + VERIFY_IS_EQUAL(tensor_col_major.dimension(1), tensor_row_major.dimension(1)); + VERIFY_IS_EQUAL(tensor_col_major.dimension(2), tensor_row_major.dimension(0)); + + + // Single pixel patch: ColMajor + array patchColMajorTensorRange={{sizeDim1, 1, 1, sizeDim2*sizeDim3}}; + Tensor single_patch_col_major(patchColMajorTensorRange); + size_t patchTensorBuffSize =single_patch_col_major.size()*sizeof(DataType); + DataType* gpu_data_single_patch_col_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); + TensorMap> gpu_single_patch_col_major(gpu_data_single_patch_col_major, patchColMajorTensorRange); + gpu_single_patch_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(1, 
1); + sycl_device.memcpyDeviceToHost(single_patch_col_major.data(), gpu_data_single_patch_col_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(single_patch_col_major.dimension(0), sizeDim1); + VERIFY_IS_EQUAL(single_patch_col_major.dimension(1), 1); + VERIFY_IS_EQUAL(single_patch_col_major.dimension(2), 1); + VERIFY_IS_EQUAL(single_patch_col_major.dimension(3), sizeDim2*sizeDim3); + + // Single pixel patch: RowMajor + array patchRowMajorTensorRange={{sizeDim2*sizeDim3, 1, 1, sizeDim1}}; + Tensor single_patch_row_major(patchRowMajorTensorRange); + patchTensorBuffSize =single_patch_row_major.size()*sizeof(DataType); + DataType* gpu_data_single_patch_row_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); + TensorMap> gpu_single_patch_row_major(gpu_data_single_patch_row_major, patchRowMajorTensorRange); + gpu_single_patch_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(1, 1); + sycl_device.memcpyDeviceToHost(single_patch_row_major.data(), gpu_data_single_patch_row_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(single_patch_row_major.dimension(0), sizeDim2*sizeDim3); + VERIFY_IS_EQUAL(single_patch_row_major.dimension(1), 1); + VERIFY_IS_EQUAL(single_patch_row_major.dimension(2), 1); + VERIFY_IS_EQUAL(single_patch_row_major.dimension(3), sizeDim1); + + for (IndexType i = 0; i < tensor_col_major.size(); ++i) { + // ColMajor + if (tensor_col_major.data()[i] != single_patch_col_major.data()[i]) { + std::cout << "Mismatch detected at index " << i << " : " << tensor_col_major.data()[i] << " vs " << single_patch_col_major.data()[i] << std::endl; + } + VERIFY_IS_EQUAL(single_patch_col_major.data()[i], tensor_col_major.data()[i]); + // RowMajor + if (tensor_row_major.data()[i] != single_patch_row_major.data()[i]) { + std::cout << "Mismatch detected at index " << i << " : " + << tensor_col_major.data()[i] << " vs " + << single_patch_row_major.data()[i] << std::endl; + } + VERIFY_IS_EQUAL(single_patch_row_major.data()[i], + tensor_row_major.data()[i]); + VERIFY_IS_EQUAL(tensor_col_major.data()[i], tensor_row_major.data()[i]); + VERIFY_IS_EQUAL(single_patch_col_major.data()[i], + single_patch_row_major.data()[i]); + } + + // Entire image patch: ColMajor + patchColMajorTensorRange={{sizeDim1, sizeDim2, sizeDim3, sizeDim2*sizeDim3}}; + Tensor entire_image_patch_col_major(patchColMajorTensorRange); + patchTensorBuffSize =entire_image_patch_col_major.size()*sizeof(DataType); + DataType* gpu_data_entire_image_patch_col_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); + TensorMap> gpu_entire_image_patch_col_major(gpu_data_entire_image_patch_col_major, patchColMajorTensorRange); + gpu_entire_image_patch_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(3, 5); + sycl_device.memcpyDeviceToHost(entire_image_patch_col_major.data(), gpu_data_entire_image_patch_col_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(0), 2); + VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(1), 3); + VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(2), 5); + VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(3), 3*5); + + // Entire image patch: RowMajor +patchRowMajorTensorRange={{sizeDim2*sizeDim3, sizeDim3, sizeDim2, sizeDim1}}; +Tensor entire_image_patch_row_major(patchRowMajorTensorRange); +patchTensorBuffSize =entire_image_patch_row_major.size()*sizeof(DataType); +DataType* gpu_data_entire_image_patch_row_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); +TensorMap> 
gpu_entire_image_patch_row_major(gpu_data_entire_image_patch_row_major, patchRowMajorTensorRange); +gpu_entire_image_patch_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(3, 5); +sycl_device.memcpyDeviceToHost(entire_image_patch_row_major.data(), gpu_data_entire_image_patch_row_major, patchTensorBuffSize); + VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(0), 3*5); + VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(1), 5); + VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(2), 3); + VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(3), 2); + + for (IndexType i = 0; i < 3; ++i) { + for (IndexType j = 0; j < 5; ++j) { + int patchId = i+3*j; + for (IndexType r = 0; r < 3; ++r) { + for (IndexType c = 0; c < 5; ++c) { + for (IndexType d = 0; d < 2; ++d) { + DataType expected_col_major = 0.0f; + DataType expected_row_major = 0.0f; + if (r-1+i >= 0 && c-2+j >= 0 && r-1+i < 3 && c-2+j < 5) { + expected_col_major = tensor_col_major(d, r-1+i, c-2+j); + expected_row_major = tensor_row_major(c-2+j, r-1+i, d); + } + // ColMajor + if (entire_image_patch_col_major(d, r, c, patchId) != expected_col_major) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << std::endl; + } + VERIFY_IS_EQUAL(entire_image_patch_col_major(d, r, c, patchId), expected_col_major); + // RowMajor + if (entire_image_patch_row_major(patchId, c, r, d) != + expected_row_major) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << std::endl; + } + VERIFY_IS_EQUAL(entire_image_patch_row_major(patchId, c, r, d), + expected_row_major); + // Check that ColMajor and RowMajor agree. + VERIFY_IS_EQUAL(expected_col_major, expected_row_major); + } + } + } + } + } + + // 2D patch: ColMajor + patchColMajorTensorRange={{sizeDim1, 2, 2, sizeDim2*sizeDim3}}; + Tensor twod_patch_col_major(patchColMajorTensorRange); + patchTensorBuffSize =twod_patch_col_major.size()*sizeof(DataType); + DataType* gpu_data_twod_patch_col_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); + TensorMap> gpu_twod_patch_col_major(gpu_data_twod_patch_col_major, patchColMajorTensorRange); + gpu_twod_patch_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(2, 2); + sycl_device.memcpyDeviceToHost(twod_patch_col_major.data(), gpu_data_twod_patch_col_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(twod_patch_col_major.dimension(0), 2); + VERIFY_IS_EQUAL(twod_patch_col_major.dimension(1), 2); + VERIFY_IS_EQUAL(twod_patch_col_major.dimension(2), 2); + VERIFY_IS_EQUAL(twod_patch_col_major.dimension(3), 3*5); + + // 2D patch: RowMajor + patchRowMajorTensorRange={{sizeDim2*sizeDim3, 2, 2, sizeDim1}}; + Tensor twod_patch_row_major(patchRowMajorTensorRange); + patchTensorBuffSize =twod_patch_row_major.size()*sizeof(DataType); + DataType* gpu_data_twod_patch_row_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); + TensorMap> gpu_twod_patch_row_major(gpu_data_twod_patch_row_major, patchRowMajorTensorRange); + gpu_twod_patch_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(2, 2); + sycl_device.memcpyDeviceToHost(twod_patch_row_major.data(), gpu_data_twod_patch_row_major, patchTensorBuffSize); + VERIFY_IS_EQUAL(twod_patch_row_major.dimension(0), 3*5); + VERIFY_IS_EQUAL(twod_patch_row_major.dimension(1), 2); + VERIFY_IS_EQUAL(twod_patch_row_major.dimension(2), 2); + VERIFY_IS_EQUAL(twod_patch_row_major.dimension(3), 2); + + // Based on the calculation described in 
TensorTraits.h, padding happens to be 0. + IndexType row_padding = 0; + IndexType col_padding = 0; + IndexType stride = 1; + + for (IndexType i = 0; i < 3; ++i) { + for (IndexType j = 0; j < 5; ++j) { + int patchId = i+3*j; + for (IndexType r = 0; r < 2; ++r) { + for (IndexType c = 0; c < 2; ++c) { + for (IndexType d = 0; d < 2; ++d) { + DataType expected_col_major = 0.0f; + DataType expected_row_major = 0.0f; + IndexType row_offset = r*stride + i - row_padding; + IndexType col_offset = c*stride + j - col_padding; + // ColMajor + if (row_offset >= 0 && col_offset >= 0 && row_offset < tensor_col_major.dimension(1) && col_offset < tensor_col_major.dimension(2)) { + expected_col_major = tensor_col_major(d, row_offset, col_offset); + } + if (twod_patch_col_major(d, r, c, patchId) != expected_col_major) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << std::endl; + } + VERIFY_IS_EQUAL(twod_patch_col_major(d, r, c, patchId), expected_col_major); + // RowMajor + if (row_offset >= 0 && col_offset >= 0 && row_offset < tensor_row_major.dimension(1) && col_offset < tensor_row_major.dimension(0)) { + expected_row_major = tensor_row_major(col_offset, row_offset, d); + } + if (twod_patch_row_major(patchId, c, r, d) != expected_row_major) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << std::endl; + } + VERIFY_IS_EQUAL(twod_patch_row_major(patchId, c, r, d), expected_row_major); + // Check that ColMajor and RowMajor agree. + VERIFY_IS_EQUAL(expected_col_major, expected_row_major); + } + } + } + } + } + + sycl_device.deallocate(gpu_data_col_major); + sycl_device.deallocate(gpu_data_row_major); + sycl_device.deallocate(gpu_data_single_patch_col_major); + sycl_device.deallocate(gpu_data_single_patch_row_major); + sycl_device.deallocate(gpu_data_entire_image_patch_col_major); + sycl_device.deallocate(gpu_data_entire_image_patch_row_major); + sycl_device.deallocate(gpu_data_twod_patch_col_major); + sycl_device.deallocate(gpu_data_twod_patch_row_major); +} + +template +static void test_imagenet_patches_sycl(const Eigen::SyclDevice& sycl_device) +{ + // Test the code on typical configurations used by the 'imagenet' benchmarks at + // https://github.com/soumith/convnet-benchmarks + // ColMajor + IndexType sizeDim1 = 3; + IndexType sizeDim2 = 128; + IndexType sizeDim3 = 128; + IndexType sizeDim4 = 16; + array tensorColMajorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}}; + Tensor l_in_col_major(tensorColMajorRange); + l_in_col_major.setRandom(); + + DataType* gpu_data_l_in_col_major = static_cast(sycl_device.allocate(l_in_col_major.size()*sizeof(DataType))); + TensorMap> gpu_l_in_col_major(gpu_data_l_in_col_major, tensorColMajorRange); + + sycl_device.memcpyHostToDevice(gpu_data_l_in_col_major, l_in_col_major.data(),(l_in_col_major.size())*sizeof(DataType)); + + array patchTensorRange={{sizeDim1, 11, 11, sizeDim2*sizeDim3, sizeDim4}}; + Tensor l_out_col_major(patchTensorRange); + size_t patchTensorBuffSize =l_out_col_major.size()*sizeof(DataType); + DataType* gpu_data_l_out_col_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); + TensorMap> gpu_l_out_col_major(gpu_data_l_out_col_major, patchTensorRange); + gpu_l_out_col_major.device(sycl_device)=gpu_l_in_col_major.extract_image_patches(11, 11); + sycl_device.memcpyDeviceToHost(l_out_col_major.data(), gpu_data_l_out_col_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(l_out_col_major.dimension(0), sizeDim1); + 
VERIFY_IS_EQUAL(l_out_col_major.dimension(1), 11); + VERIFY_IS_EQUAL(l_out_col_major.dimension(2), 11); + VERIFY_IS_EQUAL(l_out_col_major.dimension(3), sizeDim2*sizeDim3); + VERIFY_IS_EQUAL(l_out_col_major.dimension(4), sizeDim4); + + // RowMajor + patchTensorRange={{sizeDim4, sizeDim2*sizeDim3, 11, 11, sizeDim1}}; + Tensor l_out_row_major(patchTensorRange); + patchTensorBuffSize =l_out_row_major.size()*sizeof(DataType); + DataType* gpu_data_l_out_row_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); + TensorMap> gpu_l_out_row_major(gpu_data_l_out_row_major, patchTensorRange); + gpu_l_out_row_major.device(sycl_device)=gpu_l_in_col_major.swap_layout().extract_image_patches(11, 11); + sycl_device.memcpyDeviceToHost(l_out_row_major.data(), gpu_data_l_out_row_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(l_out_row_major.dimension(0), sizeDim4); + VERIFY_IS_EQUAL(l_out_row_major.dimension(1), sizeDim2*sizeDim3); + VERIFY_IS_EQUAL(l_out_row_major.dimension(2), 11); + VERIFY_IS_EQUAL(l_out_row_major.dimension(3), 11); + VERIFY_IS_EQUAL(l_out_row_major.dimension(4), sizeDim1); + + for (IndexType b = 0; b < 16; ++b) { + for (IndexType i = 0; i < 128; ++i) { + for (IndexType j = 0; j < 128; ++j) { + int patchId = i+128*j; + for (IndexType c = 0; c < 11; ++c) { + for (IndexType r = 0; r < 11; ++r) { + for (IndexType d = 0; d < 3; ++d) { + DataType expected = 0.0f; + if (r-5+i >= 0 && c-5+j >= 0 && r-5+i < 128 && c-5+j < 128) { + expected = l_in_col_major(d, r-5+i, c-5+j, b); + } + // ColMajor + if (l_out_col_major(d, r, c, patchId, b) != expected) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(l_out_col_major(d, r, c, patchId, b), expected); + // RowMajor + if (l_out_row_major(b, patchId, c, r, d) != + expected) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j + << " r=" << r << " c=" << c << " d=" << d << " b=" << b + << std::endl; + } + VERIFY_IS_EQUAL(l_out_row_major(b, patchId, c, r, d), + expected); + } + } + } + } + } + } + + // ColMajor + sycl_device.deallocate(gpu_data_l_in_col_major); + sycl_device.deallocate(gpu_data_l_out_col_major); + sizeDim1 = 16; + sizeDim2 = 64; + sizeDim3 = 64; + sizeDim4 = 32; + tensorColMajorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}}; + l_in_col_major.resize(tensorColMajorRange); + l_in_col_major.setRandom(); + gpu_data_l_in_col_major = static_cast(sycl_device.allocate(l_in_col_major.size()*sizeof(DataType))); + TensorMap>gpu_l_in_col_major_resize1(gpu_data_l_in_col_major, tensorColMajorRange); + + patchTensorRange={{sizeDim1, 9, 9, sizeDim2*sizeDim3, sizeDim4}}; + l_out_col_major.resize(patchTensorRange); + patchTensorBuffSize =l_out_col_major.size()*sizeof(DataType); + gpu_data_l_out_col_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); + TensorMap>gpu_l_out_col_major_resize1(gpu_data_l_out_col_major, patchTensorRange); + sycl_device.memcpyHostToDevice(gpu_data_l_in_col_major, l_in_col_major.data(),(l_in_col_major.size())*sizeof(DataType)); + gpu_l_out_col_major_resize1.device(sycl_device)=gpu_l_in_col_major_resize1.extract_image_patches(9, 9); + sycl_device.memcpyDeviceToHost(l_out_col_major.data(), gpu_data_l_out_col_major, patchTensorBuffSize); + VERIFY_IS_EQUAL(l_out_col_major.dimension(0), 16); + VERIFY_IS_EQUAL(l_out_col_major.dimension(1), 9); + VERIFY_IS_EQUAL(l_out_col_major.dimension(2), 9); + VERIFY_IS_EQUAL(l_out_col_major.dimension(3), 64*64); + 
VERIFY_IS_EQUAL(l_out_col_major.dimension(4), 32); + +// RowMajor + sycl_device.deallocate(gpu_data_l_out_row_major); + patchTensorRange={{sizeDim4, sizeDim2*sizeDim3, 9, 9 ,sizeDim1}}; + l_out_row_major.resize(patchTensorRange); + patchTensorBuffSize =l_out_row_major.size()*sizeof(DataType); + gpu_data_l_out_row_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); + TensorMap>gpu_l_out_row_major_resize1(gpu_data_l_out_row_major, patchTensorRange); + gpu_l_out_row_major_resize1.device(sycl_device)=gpu_l_in_col_major_resize1.swap_layout().extract_image_patches(9, 9); + sycl_device.memcpyDeviceToHost(l_out_row_major.data(), gpu_data_l_out_row_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(l_out_row_major.dimension(0), 32); + VERIFY_IS_EQUAL(l_out_row_major.dimension(1), 64*64); + VERIFY_IS_EQUAL(l_out_row_major.dimension(2), 9); + VERIFY_IS_EQUAL(l_out_row_major.dimension(3), 9); + VERIFY_IS_EQUAL(l_out_row_major.dimension(4), 16); + + for (IndexType b = 0; b < 32; ++b) { + for (IndexType i = 0; i < 64; ++i) { + for (IndexType j = 0; j < 64; ++j) { + int patchId = i+64*j; + for (IndexType c = 0; c < 9; ++c) { + for (IndexType r = 0; r < 9; ++r) { + for (IndexType d = 0; d < 16; ++d) { + DataType expected = 0.0f; + if (r-4+i >= 0 && c-4+j >= 0 && r-4+i < 64 && c-4+j < 64) { + expected = l_in_col_major(d, r-4+i, c-4+j, b); + } + // ColMajor + if (l_out_col_major(d, r, c, patchId, b) != expected) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(l_out_col_major(d, r, c, patchId, b), expected); + // RowMajor + if (l_out_row_major(b, patchId, c, r, d) != expected) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(l_out_row_major(b, patchId, c, r, d), expected); + } + } + } + } + } + } + + // ColMajor + + sycl_device.deallocate(gpu_data_l_in_col_major); + sycl_device.deallocate(gpu_data_l_out_col_major); + sizeDim1 = 32; + sizeDim2 = 16; + sizeDim3 = 16; + sizeDim4 = 32; + tensorColMajorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}}; + l_in_col_major.resize(tensorColMajorRange); + l_in_col_major.setRandom(); + gpu_data_l_in_col_major = static_cast(sycl_device.allocate(l_in_col_major.size()*sizeof(DataType))); + TensorMap>gpu_l_in_col_major_resize2(gpu_data_l_in_col_major, tensorColMajorRange); + + patchTensorRange={{sizeDim1, 7, 7, sizeDim2*sizeDim3, sizeDim4}}; + l_out_col_major.resize(patchTensorRange); + patchTensorBuffSize =l_out_col_major.size()*sizeof(DataType); + gpu_data_l_out_col_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); + TensorMap>gpu_l_out_col_major_resize2(gpu_data_l_out_col_major, patchTensorRange); + sycl_device.memcpyHostToDevice(gpu_data_l_in_col_major, l_in_col_major.data(),(l_in_col_major.size())*sizeof(DataType)); + gpu_l_out_col_major_resize2.device(sycl_device)=gpu_l_in_col_major_resize2.extract_image_patches(7, 7); + sycl_device.memcpyDeviceToHost(l_out_col_major.data(), gpu_data_l_out_col_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(l_out_col_major.dimension(0), 32); + VERIFY_IS_EQUAL(l_out_col_major.dimension(1), 7); + VERIFY_IS_EQUAL(l_out_col_major.dimension(2), 7); + VERIFY_IS_EQUAL(l_out_col_major.dimension(3), 16*16); + VERIFY_IS_EQUAL(l_out_col_major.dimension(4), 32); + + // RowMajor + sycl_device.deallocate(gpu_data_l_out_row_major); + patchTensorRange={{sizeDim4, sizeDim2*sizeDim3, 7, 7 ,sizeDim1}}; + 
l_out_row_major.resize(patchTensorRange); + patchTensorBuffSize =l_out_row_major.size()*sizeof(DataType); + gpu_data_l_out_row_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); + TensorMap>gpu_l_out_row_major_resize2(gpu_data_l_out_row_major, patchTensorRange); + gpu_l_out_row_major_resize2.device(sycl_device)=gpu_l_in_col_major_resize2.swap_layout().extract_image_patches(7, 7); + sycl_device.memcpyDeviceToHost(l_out_row_major.data(), gpu_data_l_out_row_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(l_out_row_major.dimension(0), 32); + VERIFY_IS_EQUAL(l_out_row_major.dimension(1), 16*16); + VERIFY_IS_EQUAL(l_out_row_major.dimension(2), 7); + VERIFY_IS_EQUAL(l_out_row_major.dimension(3), 7); + VERIFY_IS_EQUAL(l_out_row_major.dimension(4), 32); + + for (IndexType b = 0; b < 32; ++b) { + for (IndexType i = 0; i < 16; ++i) { + for (IndexType j = 0; j < 16; ++j) { + int patchId = i+16*j; + for (IndexType c = 0; c < 7; ++c) { + for (IndexType r = 0; r < 7; ++r) { + for (IndexType d = 0; d < 32; ++d) { + DataType expected = 0.0f; + if (r-3+i >= 0 && c-3+j >= 0 && r-3+i < 16 && c-3+j < 16) { + expected = l_in_col_major(d, r-3+i, c-3+j, b); + } + // ColMajor + if (l_out_col_major(d, r, c, patchId, b) != expected) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(l_out_col_major(d, r, c, patchId, b), expected); + // RowMajor + if (l_out_row_major(b, patchId, c, r, d) != expected) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(l_out_row_major(b, patchId, c, r, d), expected); + } + } + } + } + } + } + + // ColMajor + sycl_device.deallocate(gpu_data_l_in_col_major); + sycl_device.deallocate(gpu_data_l_out_col_major); + sizeDim1 = 64; + sizeDim2 = 13; + sizeDim3 = 13; + sizeDim4 = 32; + tensorColMajorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}}; + l_in_col_major.resize(tensorColMajorRange); + l_in_col_major.setRandom(); + gpu_data_l_in_col_major = static_cast(sycl_device.allocate(l_in_col_major.size()*sizeof(DataType))); + TensorMap>gpu_l_in_col_major_resize3(gpu_data_l_in_col_major, tensorColMajorRange); + + patchTensorRange={{sizeDim1, 3, 3, sizeDim2*sizeDim3, sizeDim4}}; + l_out_col_major.resize(patchTensorRange); + patchTensorBuffSize =l_out_col_major.size()*sizeof(DataType); + gpu_data_l_out_col_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); + TensorMap>gpu_l_out_col_major_resize3(gpu_data_l_out_col_major, patchTensorRange); + sycl_device.memcpyHostToDevice(gpu_data_l_in_col_major, l_in_col_major.data(),(l_in_col_major.size())*sizeof(DataType)); + gpu_l_out_col_major_resize3.device(sycl_device)=gpu_l_in_col_major_resize3.extract_image_patches(3, 3); + sycl_device.memcpyDeviceToHost(l_out_col_major.data(), gpu_data_l_out_col_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(l_out_col_major.dimension(0), 64); + VERIFY_IS_EQUAL(l_out_col_major.dimension(1), 3); + VERIFY_IS_EQUAL(l_out_col_major.dimension(2), 3); + VERIFY_IS_EQUAL(l_out_col_major.dimension(3), 13*13); + VERIFY_IS_EQUAL(l_out_col_major.dimension(4), 32); + + // RowMajor + sycl_device.deallocate(gpu_data_l_out_row_major); + patchTensorRange={{sizeDim4, sizeDim2*sizeDim3, 3, 3 ,sizeDim1}}; + l_out_row_major.resize(patchTensorRange); + patchTensorBuffSize =l_out_row_major.size()*sizeof(DataType); + gpu_data_l_out_row_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); + 
TensorMap>gpu_l_out_row_major_resize3(gpu_data_l_out_row_major, patchTensorRange); + gpu_l_out_row_major_resize3.device(sycl_device)=gpu_l_in_col_major_resize3.swap_layout().extract_image_patches(3, 3); + sycl_device.memcpyDeviceToHost(l_out_row_major.data(), gpu_data_l_out_row_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(l_out_row_major.dimension(0), 32); + VERIFY_IS_EQUAL(l_out_row_major.dimension(1), 13*13); + VERIFY_IS_EQUAL(l_out_row_major.dimension(2), 3); + VERIFY_IS_EQUAL(l_out_row_major.dimension(3), 3); + VERIFY_IS_EQUAL(l_out_row_major.dimension(4), 64); + + for (IndexType b = 0; b < 32; ++b) { + for (IndexType i = 0; i < 13; ++i) { + for (IndexType j = 0; j < 13; ++j) { + int patchId = i+13*j; + for (IndexType c = 0; c < 3; ++c) { + for (IndexType r = 0; r < 3; ++r) { + for (IndexType d = 0; d < 64; ++d) { + DataType expected = 0.0f; + if (r-1+i >= 0 && c-1+j >= 0 && r-1+i < 13 && c-1+j < 13) { + expected = l_in_col_major(d, r-1+i, c-1+j, b); + } + // ColMajor + if (l_out_col_major(d, r, c, patchId, b) != expected) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(l_out_col_major(d, r, c, patchId, b), expected); + // RowMajor + if (l_out_row_major(b, patchId, c, r, d) != expected) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(l_out_row_major(b, patchId, c, r, d), expected); + } + } + } + } + } + } + sycl_device.deallocate(gpu_data_l_in_col_major); + sycl_device.deallocate(gpu_data_l_out_col_major); + sycl_device.deallocate(gpu_data_l_out_row_major); +} + + +template void sycl_tensor_image_patch_test_per_device(dev_Selector s){ +QueueInterface queueInterface(s); +auto sycl_device = Eigen::SyclDevice(&queueInterface); +test_simple_image_patch_sycl(sycl_device); +test_patch_padding_valid_sycl(sycl_device); +test_patch_padding_valid_same_value_sycl(sycl_device); +test_patch_padding_same_sycl(sycl_device); +test_patch_no_extra_dim_sycl(sycl_device); +test_imagenet_patches_sycl(sycl_device); +} +void test_cxx11_tensor_image_patchOP_sycl() +{ +for (const auto& device :Eigen::get_sycl_supported_devices()) { + CALL_SUBTEST(sycl_tensor_image_patch_test_per_device(device)); +} +} diff --git a/unsupported/test/cxx11_tensor_inflation_sycl.cpp b/unsupported/test/cxx11_tensor_inflation_sycl.cpp new file mode 100644 index 000000000..f2f87f7ed --- /dev/null +++ b/unsupported/test/cxx11_tensor_inflation_sycl.cpp @@ -0,0 +1,136 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+#define EIGEN_TEST_FUNC cxx11_tensor_inflation_sycl
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+#include "main.h"
+#include
+
+using Eigen::Tensor;
+
+// Inflation definition: for each dimension, the inflated size is
+// ((dim-1)*stride[dim] + 1).
+
+// For a 1-dimensional vector of size 3 with values (4,4,4), an inflation stride of 3 changes it to
+// a tensor of size (2*3) + 1 = 7 with the values
+// (4, 0, 0, 4, 0, 0, 4).
+
+template
+void test_simple_inflation_sycl(const Eigen::SyclDevice &sycl_device) {
+
+
+  IndexType sizeDim1 = 2;
+  IndexType sizeDim2 = 3;
+  IndexType sizeDim3 = 5;
+  IndexType sizeDim4 = 7;
+  array tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}};
+  Tensor tensor(tensorRange);
+  Tensor no_stride(tensorRange);
+  tensor.setRandom();
+
+  array strides;
+  strides[0] = 1;
+  strides[1] = 1;
+  strides[2] = 1;
+  strides[3] = 1;
+
+
+  const size_t tensorBuffSize =tensor.size()*sizeof(DataType);
+  DataType* gpu_data_tensor = static_cast(sycl_device.allocate(tensorBuffSize));
+  DataType* gpu_data_no_stride = static_cast(sycl_device.allocate(tensorBuffSize));
+
+  TensorMap> gpu_tensor(gpu_data_tensor, tensorRange);
+  TensorMap> gpu_no_stride(gpu_data_no_stride, tensorRange);
+
+  sycl_device.memcpyHostToDevice(gpu_data_tensor, tensor.data(), tensorBuffSize);
+  gpu_no_stride.device(sycl_device)=gpu_tensor.inflate(strides);
+  sycl_device.memcpyDeviceToHost(no_stride.data(), gpu_data_no_stride, tensorBuffSize);
+
+  VERIFY_IS_EQUAL(no_stride.dimension(0), sizeDim1);
+  VERIFY_IS_EQUAL(no_stride.dimension(1), sizeDim2);
+  VERIFY_IS_EQUAL(no_stride.dimension(2), sizeDim3);
+  VERIFY_IS_EQUAL(no_stride.dimension(3), sizeDim4);
+
+  for (IndexType i = 0; i < 2; ++i) {
+    for (IndexType j = 0; j < 3; ++j) {
+      for (IndexType k = 0; k < 5; ++k) {
+        for (IndexType l = 0; l < 7; ++l) {
+          VERIFY_IS_EQUAL(tensor(i,j,k,l), no_stride(i,j,k,l));
+        }
+      }
+    }
+  }
+
+
+  strides[0] = 2;
+  strides[1] = 4;
+  strides[2] = 2;
+  strides[3] = 3;
+
+  IndexType inflatedSizeDim1 = 3;
+  IndexType inflatedSizeDim2 = 9;
+  IndexType inflatedSizeDim3 = 9;
+  IndexType inflatedSizeDim4 = 19;
+  array inflatedTensorRange = {{inflatedSizeDim1, inflatedSizeDim2, inflatedSizeDim3, inflatedSizeDim4}};
+
+  Tensor inflated(inflatedTensorRange);
+
+  const size_t inflatedTensorBuffSize =inflated.size()*sizeof(DataType);
+  DataType* gpu_data_inflated = static_cast(sycl_device.allocate(inflatedTensorBuffSize));
+  TensorMap> gpu_inflated(gpu_data_inflated, inflatedTensorRange);
+  gpu_inflated.device(sycl_device)=gpu_tensor.inflate(strides);
+  sycl_device.memcpyDeviceToHost(inflated.data(), gpu_data_inflated, inflatedTensorBuffSize);
+
+  VERIFY_IS_EQUAL(inflated.dimension(0), inflatedSizeDim1);
+  VERIFY_IS_EQUAL(inflated.dimension(1), inflatedSizeDim2);
+  VERIFY_IS_EQUAL(inflated.dimension(2), inflatedSizeDim3);
+  VERIFY_IS_EQUAL(inflated.dimension(3), inflatedSizeDim4);
+
+  for (IndexType i = 0; i < inflatedSizeDim1; ++i) {
+    for (IndexType j = 0; j < inflatedSizeDim2; ++j) {
+      for (IndexType k = 0; k < inflatedSizeDim3; ++k) {
+        for (IndexType l = 0; l < inflatedSizeDim4; ++l) {
+          if (i % strides[0] == 0 &&
+              j % strides[1] == 0 &&
+              k % strides[2] == 0 &&
+              l % strides[3] == 0) {
+            VERIFY_IS_EQUAL(inflated(i,j,k,l),
+                            tensor(i/strides[0], j/strides[1], k/strides[2], l/strides[3]));
+          } else {
+            VERIFY_IS_EQUAL(0, inflated(i,j,k,l));
+          }
+        }
+      }
+    }
+  }
+  sycl_device.deallocate(gpu_data_tensor);
+ 
sycl_device.deallocate(gpu_data_no_stride); + sycl_device.deallocate(gpu_data_inflated); +} + +template void sycl_inflation_test_per_device(dev_Selector s){ + QueueInterface queueInterface(s); + auto sycl_device = Eigen::SyclDevice(&queueInterface); + test_simple_inflation_sycl(sycl_device); + test_simple_inflation_sycl(sycl_device); +} +void test_cxx11_tensor_inflation_sycl() +{ + for (const auto& device :Eigen::get_sycl_supported_devices()) { + CALL_SUBTEST(sycl_inflation_test_per_device(device)); + } +} -- cgit v1.2.3 From 4f07ac16b0722597c55e2783cee33606a1f5e390 Mon Sep 17 00:00:00 2001 From: Mehdi Goli Date: Tue, 21 Feb 2017 10:09:47 +0000 Subject: Reducing the number of warnings. --- unsupported/test/cxx11_tensor_image_patchOP_sycl.cpp | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/unsupported/test/cxx11_tensor_image_patchOP_sycl.cpp b/unsupported/test/cxx11_tensor_image_patchOP_sycl.cpp index ba6b2f15a..e5ca4e388 100644 --- a/unsupported/test/cxx11_tensor_image_patchOP_sycl.cpp +++ b/unsupported/test/cxx11_tensor_image_patchOP_sycl.cpp @@ -134,7 +134,7 @@ static void test_simple_image_patch_sycl(const Eigen::SyclDevice& sycl_device) for (IndexType i = 0; i < 3; ++i) { for (IndexType j = 0; j < 5; ++j) { - int patchId = i+3*j; + IndexType patchId = i+3*j; for (IndexType r = 0; r < 3; ++r) { for (IndexType c = 0; c < 5; ++c) { for (IndexType d = 0; d < 2; ++d) { @@ -206,7 +206,7 @@ static void test_simple_image_patch_sycl(const Eigen::SyclDevice& sycl_device) for (IndexType i = 0; i < 3; ++i) { for (IndexType j = 0; j < 5; ++j) { - int patchId = i+3*j; + IndexType patchId = i+3*j; for (IndexType r = 0; r < 2; ++r) { for (IndexType c = 0; c < 2; ++c) { for (IndexType d = 0; d < 2; ++d) { @@ -323,7 +323,7 @@ static void test_patch_padding_valid_sycl(const Eigen::SyclDevice& sycl_device){ for (IndexType i = 0; (i+stride+ksize-1) < input_rows; i += stride) { // input rows for (IndexType j = 0; (j+stride+ksize-1) < input_cols; j += stride) { // input cols - int patchId = i+input_rows*j; + IndexType patchId = i+input_rows*j; for (IndexType r = 0; r < ksize; ++r) { // patch rows for (IndexType c = 0; c < ksize; ++c) { // patch cols for (IndexType d = 0; d < input_depth; ++d) { // depth @@ -529,7 +529,7 @@ sycl_device.memcpyDeviceToHost(result_col_major.data(), gpu_data_result_col_majo for (IndexType i = 0; (i+stride+ksize-1) <= input_rows; i += stride) { // input rows for (IndexType j = 0; (j+stride+ksize-1) <= input_cols; j += stride) { // input cols - int patchId = i+input_rows*j; + IndexType patchId = i+input_rows*j; for (IndexType r = 0; r < ksize; ++r) { // patch rows for (IndexType c = 0; c < ksize; ++c) { // patch cols for (IndexType d = 0; d < input_depth; ++d) { // depth @@ -667,7 +667,7 @@ sycl_device.memcpyDeviceToHost(entire_image_patch_row_major.data(), gpu_data_ent for (IndexType i = 0; i < 3; ++i) { for (IndexType j = 0; j < 5; ++j) { - int patchId = i+3*j; + IndexType patchId = i+3*j; for (IndexType r = 0; r < 3; ++r) { for (IndexType c = 0; c < 5; ++c) { for (IndexType d = 0; d < 2; ++d) { @@ -731,7 +731,7 @@ sycl_device.memcpyDeviceToHost(entire_image_patch_row_major.data(), gpu_data_ent for (IndexType i = 0; i < 3; ++i) { for (IndexType j = 0; j < 5; ++j) { - int patchId = i+3*j; + IndexType patchId = i+3*j; for (IndexType r = 0; r < 2; ++r) { for (IndexType c = 0; c < 2; ++c) { for (IndexType d = 0; d < 2; ++d) { @@ -824,7 +824,7 @@ static void test_imagenet_patches_sycl(const Eigen::SyclDevice& sycl_device) for (IndexType b = 0; 
b < 16; ++b) { for (IndexType i = 0; i < 128; ++i) { for (IndexType j = 0; j < 128; ++j) { - int patchId = i+128*j; + IndexType patchId = i+128*j; for (IndexType c = 0; c < 11; ++c) { for (IndexType r = 0; r < 11; ++r) { for (IndexType d = 0; d < 3; ++d) { @@ -899,7 +899,7 @@ static void test_imagenet_patches_sycl(const Eigen::SyclDevice& sycl_device) for (IndexType b = 0; b < 32; ++b) { for (IndexType i = 0; i < 64; ++i) { for (IndexType j = 0; j < 64; ++j) { - int patchId = i+64*j; + IndexType patchId = i+64*j; for (IndexType c = 0; c < 9; ++c) { for (IndexType r = 0; r < 9; ++r) { for (IndexType d = 0; d < 16; ++d) { @@ -972,7 +972,7 @@ static void test_imagenet_patches_sycl(const Eigen::SyclDevice& sycl_device) for (IndexType b = 0; b < 32; ++b) { for (IndexType i = 0; i < 16; ++i) { for (IndexType j = 0; j < 16; ++j) { - int patchId = i+16*j; + IndexType patchId = i+16*j; for (IndexType c = 0; c < 7; ++c) { for (IndexType r = 0; r < 7; ++r) { for (IndexType d = 0; d < 32; ++d) { @@ -1044,7 +1044,7 @@ static void test_imagenet_patches_sycl(const Eigen::SyclDevice& sycl_device) for (IndexType b = 0; b < 32; ++b) { for (IndexType i = 0; i < 13; ++i) { for (IndexType j = 0; j < 13; ++j) { - int patchId = i+13*j; + IndexType patchId = i+13*j; for (IndexType c = 0; c < 3; ++c) { for (IndexType r = 0; r < 3; ++r) { for (IndexType d = 0; d < 64; ++d) { -- cgit v1.2.3 From 89dfd51fae868393b66b1949638e03de2ba17c1f Mon Sep 17 00:00:00 2001 From: Mehdi Goli Date: Wed, 22 Feb 2017 16:36:24 +0000 Subject: Adding Sycl Backend for TensorGenerator.h. --- .../Eigen/CXX11/src/Tensor/TensorGenerator.h | 13 +- unsupported/test/CMakeLists.txt | 1 + unsupported/test/cxx11_tensor_generator_sycl.cpp | 147 +++++++++++++++++++++ 3 files changed, 158 insertions(+), 3 deletions(-) create mode 100644 unsupported/test/cxx11_tensor_generator_sycl.cpp diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h index eb1d4934e..ca87f493a 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h @@ -97,10 +97,9 @@ struct TensorEvaluator, Device> }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_generator(op.generator()) + : m_generator(op.generator()), m_argImpl(op.expression(), device) { - TensorEvaluator impl(op.expression(), device); - m_dimensions = impl.dimensions(); + m_dimensions = m_argImpl.dimensions(); if (static_cast(Layout) == static_cast(ColMajor)) { m_strides[0] = 1; @@ -155,6 +154,12 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + /// required by sycl in order to extract the accessor + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator& impl() const { return m_argImpl; } + /// required by sycl in order to extract the accessor + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Generator& functor() const { return m_generator; } + + protected: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void extract_coordinates(Index index, array& coords) const { @@ -178,6 +183,8 @@ struct TensorEvaluator, Device> Dimensions m_dimensions; array m_strides; Generator m_generator; + // required by sycl + TensorEvaluator m_argImpl; }; } // end namespace Eigen diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index 282f9eb55..69c892362 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -171,6 +171,7 @@ if(EIGEN_TEST_CXX11) 
ei_add_test_sycl(cxx11_tensor_layout_swap_sycl "-std=c++11") ei_add_test_sycl(cxx11_tensor_image_patchOP_sycl "-std=c++11") ei_add_test_sycl(cxx11_tensor_inflation_sycl "-std=c++11") + ei_add_test_sycl(cxx11_tensor_generator_sycl "-std=c++11") endif(EIGEN_TEST_SYCL) # It should be safe to always run these tests as there is some fallback code for # older compiler that don't support cxx11. diff --git a/unsupported/test/cxx11_tensor_generator_sycl.cpp b/unsupported/test/cxx11_tensor_generator_sycl.cpp new file mode 100644 index 000000000..f551c8d0c --- /dev/null +++ b/unsupported/test/cxx11_tensor_generator_sycl.cpp @@ -0,0 +1,147 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX +#define EIGEN_TEST_FUNC cxx11_tensor_generator_sycl +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t +#define EIGEN_USE_SYCL +static const float error_threshold =1e-8f; + +#include "main.h" +#include + +using Eigen::Tensor; +struct Generator1D { + Generator1D() { } + + float operator()(const array& coordinates) const { + return coordinates[0]; + } +}; + +template +static void test_1D_sycl(const Eigen::SyclDevice& sycl_device) +{ + + IndexType sizeDim1 = 6; + array tensorRange = {{sizeDim1}}; + Tensor vec(tensorRange); + Tensor result(tensorRange); + + const size_t tensorBuffSize =vec.size()*sizeof(DataType); + DataType* gpu_data_vec = static_cast(sycl_device.allocate(tensorBuffSize)); + DataType* gpu_data_result = static_cast(sycl_device.allocate(tensorBuffSize)); + + TensorMap> gpu_vec(gpu_data_vec, tensorRange); + TensorMap> gpu_result(gpu_data_result, tensorRange); + + sycl_device.memcpyHostToDevice(gpu_data_vec, vec.data(), tensorBuffSize); + gpu_result.device(sycl_device)=gpu_vec.generate(Generator1D()); + sycl_device.memcpyDeviceToHost(result.data(), gpu_data_result, tensorBuffSize); + + for (IndexType i = 0; i < 6; ++i) { + VERIFY_IS_EQUAL(result(i), i); + } +} + + +struct Generator2D { + Generator2D() { } + + float operator()(const array& coordinates) const { + return 3 * coordinates[0] + 11 * coordinates[1]; + } +}; + +template +static void test_2D_sycl(const Eigen::SyclDevice& sycl_device) +{ + IndexType sizeDim1 = 5; + IndexType sizeDim2 = 7; + array tensorRange = {{sizeDim1, sizeDim2}}; + Tensor matrix(tensorRange); + Tensor result(tensorRange); + + const size_t tensorBuffSize =matrix.size()*sizeof(DataType); + DataType* gpu_data_matrix = static_cast(sycl_device.allocate(tensorBuffSize)); + DataType* gpu_data_result = static_cast(sycl_device.allocate(tensorBuffSize)); + + TensorMap> gpu_matrix(gpu_data_matrix, tensorRange); + TensorMap> gpu_result(gpu_data_result, tensorRange); + + sycl_device.memcpyHostToDevice(gpu_data_matrix, matrix.data(), tensorBuffSize); + gpu_result.device(sycl_device)=gpu_matrix.generate(Generator2D()); + sycl_device.memcpyDeviceToHost(result.data(), gpu_data_result, tensorBuffSize); + + for (IndexType i = 0; i < 5; ++i) { + for (IndexType j = 0; j < 5; ++j) { + VERIFY_IS_EQUAL(result(i, j), 3*i + 11*j); + } + } +} + +template +static void test_gaussian_sycl(const Eigen::SyclDevice& sycl_device) +{ + IndexType 
rows = 32;
+  IndexType cols = 48;
+  array means;
+  means[0] = rows / 2.0f;
+  means[1] = cols / 2.0f;
+  array std_devs;
+  std_devs[0] = 3.14f;
+  std_devs[1] = 2.7f;
+  internal::GaussianGenerator gaussian_gen(means, std_devs);
+
+  array tensorRange = {{rows, cols}};
+  Tensor matrix(tensorRange);
+  Tensor result(tensorRange);
+
+  const size_t tensorBuffSize =matrix.size()*sizeof(DataType);
+  DataType* gpu_data_matrix = static_cast(sycl_device.allocate(tensorBuffSize));
+  DataType* gpu_data_result = static_cast(sycl_device.allocate(tensorBuffSize));
+
+  TensorMap> gpu_matrix(gpu_data_matrix, tensorRange);
+  TensorMap> gpu_result(gpu_data_result, tensorRange);
+
+  sycl_device.memcpyHostToDevice(gpu_data_matrix, matrix.data(), tensorBuffSize);
+  gpu_result.device(sycl_device)=gpu_matrix.generate(gaussian_gen);
+  sycl_device.memcpyDeviceToHost(result.data(), gpu_data_result, tensorBuffSize);
+
+  for (IndexType i = 0; i < rows; ++i) {
+    for (IndexType j = 0; j < cols; ++j) {
+      DataType g_rows = powf(rows/2.0f - i, 2) / (3.14f * 3.14f) * 0.5f;
+      DataType g_cols = powf(cols/2.0f - j, 2) / (2.7f * 2.7f) * 0.5f;
+      DataType gaussian = expf(-g_rows - g_cols);
+      // Assert the comparison instead of discarding the result of isApprox.
+      VERIFY(Eigen::internal::isApprox(result(i, j), gaussian, error_threshold));
+    }
+  }
+}
+
+template void sycl_generator_test_per_device(dev_Selector s){
+  QueueInterface queueInterface(s);
+  auto sycl_device = Eigen::SyclDevice(&queueInterface);
+  test_1D_sycl(sycl_device);
+  test_1D_sycl(sycl_device);
+  test_2D_sycl(sycl_device);
+  test_2D_sycl(sycl_device);
+  test_gaussian_sycl(sycl_device);
+  test_gaussian_sycl(sycl_device);
+}
+void test_cxx11_tensor_generator_sycl()
+{
+  for (const auto& device :Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_generator_test_per_device(device));
+  }
+}
--
cgit v1.2.3


From 0b7875f1376a0f3f22754837712ddd885ca3f4dd Mon Sep 17 00:00:00 2001
From: Mehdi Goli
Date: Fri, 24 Feb 2017 18:13:30 +0000
Subject: Converting fixed float type into template type for TensorContraction.
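
Hard-coding the per-thread register tile as float silently narrows the
accumulation whenever the contraction is instantiated for double, half, or
mixed scalar types; the tile now uses the expression's own LhsScalar,
RhsScalar and OutScalar. The snippet below is a minimal standalone sketch of
that idea, for illustration only: it is not the Eigen kernel, and the names
micro_kernel, depth, the tile-size template parameters and the flat tile
layout are assumptions made for this example.

    #include <cstddef>

    // Register-tile accumulation typed on the expression's scalars rather
    // than on hard-coded float, mirroring privateLhs/privateRhs/privateRes.
    template <typename LhsScalar, typename RhsScalar, typename OutScalar,
              int WorkLoadPerThreadM, int WorkLoadPerThreadN>
    void micro_kernel(const LhsScalar* lhs_tile, const RhsScalar* rhs_tile,
                      OutScalar* out, std::size_t depth) {
      // Per-thread register block: one partial result per (m, n) pair.
      OutScalar privateRes[WorkLoadPerThreadM][WorkLoadPerThreadN];
      for (int m = 0; m < WorkLoadPerThreadM; ++m)
        for (int n = 0; n < WorkLoadPerThreadN; ++n)
          privateRes[m][n] = static_cast<OutScalar>(0);  // was 0.0f

      // Accumulate along the contracted dimension in the output precision.
      for (std::size_t k = 0; k < depth; ++k) {
        for (int m = 0; m < WorkLoadPerThreadM; ++m) {
          const LhsScalar privateLhs = lhs_tile[k * WorkLoadPerThreadM + m];
          for (int n = 0; n < WorkLoadPerThreadN; ++n) {
            const RhsScalar privateRhs = rhs_tile[k * WorkLoadPerThreadN + n];
            privateRes[m][n] += static_cast<OutScalar>(privateLhs * privateRhs);
          }
        }
      }

      // Write the finished tile back to the output block.
      for (int m = 0; m < WorkLoadPerThreadM; ++m)
        for (int n = 0; n < WorkLoadPerThreadN; ++n)
          out[m * WorkLoadPerThreadN + n] = privateRes[m][n];
    }

Keeping the accumulator in OutScalar rather than float preserves precision for
double contractions and keeps the kernel type-correct for non-float scalars.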
--- unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h index e87de0c57..abc7ba551 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h @@ -230,13 +230,13 @@ typename HostExpr::Index LocalThreadSizeM, typename HostExpr::Index LocalThreadS const Index nGroupId = itemID.get_group(1); // Work-group ID localCol const Index linearLocalThreadId = nLocalThreadId*LocalThreadSizeM + mLocalThreadId; // linear local thread ID // Allocate register space - float privateLhs; - float privateRhs[WorkLoadPerThreadN]; - float privateRes[WorkLoadPerThreadM][WorkLoadPerThreadN]; + LhsScalar privateLhs; + RhsScalar privateRhs[WorkLoadPerThreadN]; + OutScalar privateRes[WorkLoadPerThreadM][WorkLoadPerThreadN]; // Initialise the privateResumulation registers for (Index wLPTM=0; wLPTM(0); } } -- cgit v1.2.3 From 2fa2b617a97ba254343c7c1635a9b6d617a100e8 Mon Sep 17 00:00:00 2001 From: Mehdi Goli Date: Fri, 24 Feb 2017 19:16:24 +0000 Subject: Adding TensorVolumePatchOP.h for sycl --- .../Tensor/TensorSyclConvertToDeviceExpression.h | 10 + .../CXX11/src/Tensor/TensorSyclExprConstructor.h | 22 ++ .../CXX11/src/Tensor/TensorSyclExtractAccessor.h | 15 ++ .../CXX11/src/Tensor/TensorSyclExtractFunctors.h | 44 ++++ .../Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h | 13 +- .../CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h | 14 ++ .../Eigen/CXX11/src/Tensor/TensorVolumePatch.h | 30 +-- unsupported/test/CMakeLists.txt | 5 +- .../test/cxx11_tensor_volume_patchOP_sycl.cpp | 222 +++++++++++++++++++++ 9 files changed, 359 insertions(+), 16 deletions(-) create mode 100644 unsupported/test/cxx11_tensor_volume_patchOP_sycl.cpp diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h index 5b4a9af9f..dd63a2e2f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h @@ -177,6 +177,16 @@ KERNELBROKERCONVERTIMAGEPATCHOP() #undef KERNELBROKERCONVERTIMAGEPATCHOP +/// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorVolumePatchOp +#define KERNELBROKERCONVERTVOLUMEPATCHOP(CVQual)\ +template\ +struct ConvertToDeviceExpression >{\ + typedef CVQual TensorVolumePatchOp::Type> Type;\ +}; +KERNELBROKERCONVERTVOLUMEPATCHOP(const) +KERNELBROKERCONVERTVOLUMEPATCHOP() +#undef KERNELBROKERCONVERTVOLUMEPATCHOP + } // namespace internal diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h index 57a10d06b..117b368ec 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h @@ -404,6 +404,28 @@ SYCLTENSORIMAGEPATCHOPEXPR(const) SYCLTENSORIMAGEPATCHOPEXPR() #undef SYCLTENSORIMAGEPATCHOPEXPR +// TensorVolumePatchOp +#define SYCLTENSORVOLUMEPATCHOPEXPR(CVQual)\ +template\ +struct ExprConstructor, CVQual TensorVolumePatchOp, Params... 
> {\ + typedef ExprConstructor my_xpr_type;\ + typedef CVQual TensorVolumePatchOp Type;\ + my_xpr_type xprExpr;\ + Type expr;\ + template \ + ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple &t)\ + : xprExpr(funcD.xprExpr, t), expr(xprExpr.expr, funcD.m_patch_planes, funcD.m_patch_rows, funcD.m_patch_cols, funcD.m_plane_strides, funcD.m_row_strides, funcD.m_col_strides,\ + funcD.m_in_plane_strides, funcD.m_in_row_strides, funcD.m_in_col_strides,funcD.m_plane_inflate_strides, funcD.m_row_inflate_strides, funcD.m_col_inflate_strides, \ + funcD.m_padding_top_z, funcD.m_padding_bottom_z, funcD.m_padding_top, funcD.m_padding_bottom, funcD.m_padding_left, funcD.m_padding_right, funcD.m_padding_value,\ + funcD.m_padding_type, funcD.m_padding_explicit){\ + }\ +}; + +SYCLTENSORVOLUMEPATCHOPEXPR(const) +SYCLTENSORVOLUMEPATCHOPEXPR() +#undef SYCLTENSORVOLUMEPATCHOPEXPR + + // TensorLayoutSwapOp #define SYCLTENSORLAYOUTSWAPOPEXPR(CVQual)\ diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h index 2be6f3710..4a6322d44 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h @@ -240,6 +240,21 @@ SYCLTENSORIMAGEPATCHOPEXTACC() #undef SYCLTENSORIMAGEPATCHOPEXTACC + +// specialisation of the \ref ExtractAccessor struct when the node type is +/// TensorVolumePatchOp. +#define SYCLTENSORVOLUMEPATCHOPEXTACC(CVQual)\ +template\ +struct ExtractAccessor, Dev> >{\ + static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator, Dev>& eval)\ + RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.impl()))\ +}; + +SYCLTENSORVOLUMEPATCHOPEXTACC(const) +SYCLTENSORVOLUMEPATCHOPEXTACC() +#undef SYCLTENSORVOLUMEPATCHOPEXTACC + + // specialisation of the \ref ExtractAccessor struct when the node type is /// TensorLayoutSwapOp. 
#define SYCLTENSORLAYOUTSWAPOPEXTACC(CVQual)\ diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h index dbac01138..8828a0495 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h @@ -344,6 +344,50 @@ FunctorExtractor(const TensorEvaluator& expr)\ SYCLEXTRFUNCIMAGEPATCHOP(const) SYCLEXTRFUNCIMAGEPATCHOP() #undef SYCLEXTRFUNCIMAGEPATCHOP + +/// TensorVolumePatchOp +#define SYCLEXTRFUNCVOLUMEPATCHOP(CVQual)\ +template\ +struct FunctorExtractor, Device> >{\ +typedef CVQual TensorVolumePatchOp Self;\ +FunctorExtractor > xprExpr;\ +const DenseIndex m_patch_planes;\ +const DenseIndex m_patch_rows;\ +const DenseIndex m_patch_cols;\ +const DenseIndex m_plane_strides;\ +const DenseIndex m_row_strides;\ +const DenseIndex m_col_strides;\ +const DenseIndex m_in_plane_strides;\ +const DenseIndex m_in_row_strides;\ +const DenseIndex m_in_col_strides;\ +const DenseIndex m_plane_inflate_strides;\ +const DenseIndex m_row_inflate_strides;\ +const DenseIndex m_col_inflate_strides;\ +const bool m_padding_explicit;\ +const DenseIndex m_padding_top_z;\ +const DenseIndex m_padding_bottom_z;\ +const DenseIndex m_padding_top;\ +const DenseIndex m_padding_bottom;\ +const DenseIndex m_padding_left;\ +const DenseIndex m_padding_right;\ +const PaddingType m_padding_type;\ +const typename Self::Scalar m_padding_value;\ +FunctorExtractor(const TensorEvaluator& expr)\ +: xprExpr(expr.impl()), m_patch_planes(expr.xpr().patch_planes()), m_patch_rows(expr.xpr().patch_rows()), m_patch_cols(expr.xpr().patch_cols()),\ + m_plane_strides(expr.xpr().plane_strides()), m_row_strides(expr.xpr().row_strides()), m_col_strides(expr.xpr().col_strides()),\ + m_in_plane_strides(expr.xpr().in_plane_strides()), m_in_row_strides(expr.xpr().in_row_strides()), m_in_col_strides(expr.xpr().in_col_strides()),\ + m_plane_inflate_strides(expr.xpr().plane_inflate_strides()),m_row_inflate_strides(expr.xpr().row_inflate_strides()),\ + m_col_inflate_strides(expr.xpr().col_inflate_strides()), m_padding_explicit(expr.xpr().padding_explicit()),\ + m_padding_top_z(expr.xpr().padding_top_z()), m_padding_bottom_z(expr.xpr().padding_bottom_z()), \ + m_padding_top(expr.xpr().padding_top()), m_padding_bottom(expr.xpr().padding_bottom()), m_padding_left(expr.xpr().padding_left()),\ + m_padding_right(expr.xpr().padding_right()), m_padding_type(expr.xpr().padding_type()),m_padding_value(expr.xpr().padding_value()){}\ +}; +SYCLEXTRFUNCVOLUMEPATCHOP(const) +SYCLEXTRFUNCVOLUMEPATCHOP() +#undef SYCLEXTRFUNCVOLUMEPATCHOP + + + /// template deduction function for FunctorExtractor template auto inline extractFunctors(const Evaluator& evaluator)-> FunctorExtractor { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h index b8e658824..50f4595fc 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h @@ -151,7 +151,7 @@ CHIPPINGOPLEAFCOUNT(const) CHIPPINGOPLEAFCOUNT() #undef CHIPPINGOPLEAFCOUNT - +///TensorStridingSlicingOp #define SLICESTRIDEOPLEAFCOUNT(CVQual)\ template\ struct LeafCount >:CategoryCount{}; @@ -160,7 +160,7 @@ SLICESTRIDEOPLEAFCOUNT(const) SLICESTRIDEOPLEAFCOUNT() #undef SLICESTRIDEOPLEAFCOUNT - +//TensorImagePatchOp #define TENSORIMAGEPATCHOPLEAFCOUNT(CVQual)\ template\ struct LeafCount >:CategoryCount{}; @@ -170,6 +170,15 @@ 
TENSORIMAGEPATCHOPLEAFCOUNT(const) TENSORIMAGEPATCHOPLEAFCOUNT() #undef TENSORIMAGEPATCHOPLEAFCOUNT +// TensorVolumePatchOp +#define TENSORVOLUMEPATCHOPLEAFCOUNT(CVQual)\ +template\ +struct LeafCount >:CategoryCount{}; + + +TENSORVOLUMEPATCHOPLEAFCOUNT(const) +TENSORVOLUMEPATCHOPLEAFCOUNT() +#undef TENSORVOLUMEPATCHOPLEAFCOUNT } /// namespace TensorSycl } /// namespace internal diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h index ab97235ae..fcef0be04 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h @@ -235,6 +235,20 @@ SYCLTENSORIMAGEPATCHOP() #undef SYCLTENSORIMAGEPATCHOP + +/// specialisation of the \ref PlaceHolderExpression when the node is +/// TensorVolumePatchOp +#define SYCLTENSORVOLUMEPATCHOP(CVQual)\ +template\ +struct PlaceHolderExpression, N> {\ + typedef CVQual TensorVolumePatchOp::ArgType> Type;\ +}; + +SYCLTENSORVOLUMEPATCHOP(const) +SYCLTENSORVOLUMEPATCHOP() +#undef SYCLTENSORVOLUMEPATCHOP + + /// template deduction for \ref PlaceHolderExpression struct template struct createPlaceHolderExpression { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h index 0ca2cac84..64474ee80 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h @@ -65,12 +65,8 @@ class TensorVolumePatchOp : public TensorBase, D CoordAccess = false, RawAccess = false }; +#ifdef __SYCL_DEVICE_ONLY__ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator( const XprType op, const Device& device) +#else + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator( const XprType& op, const Device& device) +#endif - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device) + : m_impl(op.expression(), device), m_op(op) { EIGEN_STATIC_ASSERT((NumDims >= 5), YOU_MADE_A_PROGRAMMING_MISTAKE); @@ -321,7 +321,9 @@ struct TensorEvaluator, D m_outputPlanesRows = m_outputPlanes * m_outputRows; // Fast representations of different variables. + // printf("THis is m_otherStride: %lu\n", m_otherStride ); m_fastOtherStride = internal::TensorIntDivisor(m_otherStride); + m_fastPatchStride = internal::TensorIntDivisor(m_patchStride); m_fastColStride = internal::TensorIntDivisor(m_colStride); m_fastRowStride = internal::TensorIntDivisor(m_rowStride); @@ -338,7 +340,6 @@ struct TensorEvaluator, D m_fastOutputDepth = internal::TensorIntDivisor(m_dimensions[NumDims-1]); } } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { @@ -352,6 +353,7 @@ struct TensorEvaluator, D EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + // Patch index corresponding to the passed in index. 
const Index patchIndex = index / m_fastPatchStride; @@ -505,6 +507,8 @@ struct TensorEvaluator, D EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } const TensorEvaluator& impl() const { return m_impl; } + // required by sycl + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const XprType& xpr() const { return m_op; } Index planePaddingTop() const { return m_planePaddingTop; } Index rowPaddingTop() const { return m_rowPaddingTop; } @@ -600,6 +604,8 @@ struct TensorEvaluator, D Scalar m_paddingValue; TensorEvaluator m_impl; +// required by sycl + XprType m_op; }; diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index 69c892362..508f29446 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -167,11 +167,12 @@ if(EIGEN_TEST_CXX11) ei_add_test_sycl(cxx11_tensor_convolution_sycl "-std=c++11") ei_add_test_sycl(cxx11_tensor_striding_sycl "-std=c++11") ei_add_test_sycl(cxx11_tensor_chipping_sycl "-std=c++11") - ei_add_test_sycl(cxx11_tensor_patch_sycl "-std=c++11") ei_add_test_sycl(cxx11_tensor_layout_swap_sycl "-std=c++11") - ei_add_test_sycl(cxx11_tensor_image_patchOP_sycl "-std=c++11") ei_add_test_sycl(cxx11_tensor_inflation_sycl "-std=c++11") ei_add_test_sycl(cxx11_tensor_generator_sycl "-std=c++11") + ei_add_test_sycl(cxx11_tensor_patch_sycl "-std=c++11") + ei_add_test_sycl(cxx11_tensor_image_patchOP_sycl "-std=c++11") + ei_add_test_sycl(cxx11_tensor_volume_patchOP_sycl "-std=c++11") endif(EIGEN_TEST_SYCL) # It should be safe to always run these tests as there is some fallback code for # older compiler that don't support cxx11. diff --git a/unsupported/test/cxx11_tensor_volume_patchOP_sycl.cpp b/unsupported/test/cxx11_tensor_volume_patchOP_sycl.cpp new file mode 100644 index 000000000..ddc9e0d46 --- /dev/null +++ b/unsupported/test/cxx11_tensor_volume_patchOP_sycl.cpp @@ -0,0 +1,222 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX +#define EIGEN_TEST_FUNC cxx11_tensor_volume_patchOP_sycl +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t +#define EIGEN_USE_SYCL + +#include "main.h" +#include + +using Eigen::Tensor; +static const int DataLayout = ColMajor; + +template +static void test_single_voxel_patch_sycl(const Eigen::SyclDevice& sycl_device) +{ + +IndexType sizeDim0 = 4; +IndexType sizeDim1 = 2; +IndexType sizeDim2 = 3; +IndexType sizeDim3 = 5; +IndexType sizeDim4 = 7; +array tensorColMajorRange = {{sizeDim0, sizeDim1, sizeDim2, sizeDim3, sizeDim4}}; +array tensorRowMajorRange = {{sizeDim4, sizeDim3, sizeDim2, sizeDim1, sizeDim0}}; +Tensor tensor_col_major(tensorColMajorRange); +Tensor tensor_row_major(tensorRowMajorRange); +tensor_col_major.setRandom(); + + + DataType* gpu_data_col_major = static_cast(sycl_device.allocate(tensor_col_major.size()*sizeof(DataType))); + DataType* gpu_data_row_major = static_cast(sycl_device.allocate(tensor_row_major.size()*sizeof(DataType))); + TensorMap> gpu_col_major(gpu_data_col_major, tensorColMajorRange); + TensorMap> gpu_row_major(gpu_data_row_major, tensorRowMajorRange); + + sycl_device.memcpyHostToDevice(gpu_data_col_major, tensor_col_major.data(),(tensor_col_major.size())*sizeof(DataType)); + gpu_row_major.device(sycl_device)=gpu_col_major.swap_layout(); + + + // single volume patch: ColMajor + array patchColMajorTensorRange={{sizeDim0,1, 1, 1, sizeDim1*sizeDim2*sizeDim3, sizeDim4}}; + Tensor single_voxel_patch_col_major(patchColMajorTensorRange); + size_t patchTensorBuffSize =single_voxel_patch_col_major.size()*sizeof(DataType); + DataType* gpu_data_single_voxel_patch_col_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); + TensorMap> gpu_single_voxel_patch_col_major(gpu_data_single_voxel_patch_col_major, patchColMajorTensorRange); + gpu_single_voxel_patch_col_major.device(sycl_device)=gpu_col_major.extract_volume_patches(1, 1, 1); + sycl_device.memcpyDeviceToHost(single_voxel_patch_col_major.data(), gpu_data_single_voxel_patch_col_major, patchTensorBuffSize); + + + VERIFY_IS_EQUAL(single_voxel_patch_col_major.dimension(0), 4); + VERIFY_IS_EQUAL(single_voxel_patch_col_major.dimension(1), 1); + VERIFY_IS_EQUAL(single_voxel_patch_col_major.dimension(2), 1); + VERIFY_IS_EQUAL(single_voxel_patch_col_major.dimension(3), 1); + VERIFY_IS_EQUAL(single_voxel_patch_col_major.dimension(4), 2 * 3 * 5); + VERIFY_IS_EQUAL(single_voxel_patch_col_major.dimension(5), 7); + + array patchRowMajorTensorRange={{sizeDim4, sizeDim1*sizeDim2*sizeDim3, 1, 1, 1, sizeDim0}}; + Tensor single_voxel_patch_row_major(patchRowMajorTensorRange); + patchTensorBuffSize =single_voxel_patch_row_major.size()*sizeof(DataType); + DataType* gpu_data_single_voxel_patch_row_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); + TensorMap> gpu_single_voxel_patch_row_major(gpu_data_single_voxel_patch_row_major, patchRowMajorTensorRange); + gpu_single_voxel_patch_row_major.device(sycl_device)=gpu_row_major.extract_volume_patches(1, 1, 1); + sycl_device.memcpyDeviceToHost(single_voxel_patch_row_major.data(), gpu_data_single_voxel_patch_row_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(single_voxel_patch_row_major.dimension(0), 7); + VERIFY_IS_EQUAL(single_voxel_patch_row_major.dimension(1), 2 * 3 * 5); + VERIFY_IS_EQUAL(single_voxel_patch_row_major.dimension(2), 1); + VERIFY_IS_EQUAL(single_voxel_patch_row_major.dimension(3), 1); + VERIFY_IS_EQUAL(single_voxel_patch_row_major.dimension(4), 1); + 
VERIFY_IS_EQUAL(single_voxel_patch_row_major.dimension(5), 4); + + sycl_device.memcpyDeviceToHost(tensor_row_major.data(), gpu_data_row_major, (tensor_col_major.size())*sizeof(DataType)); + for (IndexType i = 0; i < tensor_col_major.size(); ++i) { + VERIFY_IS_EQUAL(tensor_col_major.data()[i], single_voxel_patch_col_major.data()[i]); + VERIFY_IS_EQUAL(tensor_row_major.data()[i], single_voxel_patch_row_major.data()[i]); + VERIFY_IS_EQUAL(tensor_col_major.data()[i], tensor_row_major.data()[i]); + } + + + sycl_device.deallocate(gpu_data_col_major); + sycl_device.deallocate(gpu_data_row_major); + sycl_device.deallocate(gpu_data_single_voxel_patch_col_major); + sycl_device.deallocate(gpu_data_single_voxel_patch_row_major); +} + +template +static void test_entire_volume_patch_sycl(const Eigen::SyclDevice& sycl_device) +{ + const int depth = 4; + const int patch_z = 2; + const int patch_y = 3; + const int patch_x = 5; + const int batch = 7; + + array tensorColMajorRange = {{depth, patch_z, patch_y, patch_x, batch}}; + array tensorRowMajorRange = {{batch, patch_x, patch_y, patch_z, depth}}; + Tensor tensor_col_major(tensorColMajorRange); + Tensor tensor_row_major(tensorRowMajorRange); + tensor_col_major.setRandom(); + + + DataType* gpu_data_col_major = static_cast(sycl_device.allocate(tensor_col_major.size()*sizeof(DataType))); + DataType* gpu_data_row_major = static_cast(sycl_device.allocate(tensor_row_major.size()*sizeof(DataType))); + TensorMap> gpu_col_major(gpu_data_col_major, tensorColMajorRange); + TensorMap> gpu_row_major(gpu_data_row_major, tensorRowMajorRange); + + sycl_device.memcpyHostToDevice(gpu_data_col_major, tensor_col_major.data(),(tensor_col_major.size())*sizeof(DataType)); + gpu_row_major.device(sycl_device)=gpu_col_major.swap_layout(); + sycl_device.memcpyDeviceToHost(tensor_row_major.data(), gpu_data_row_major, (tensor_col_major.size())*sizeof(DataType)); + + + // single volume patch: ColMajor + array patchColMajorTensorRange={{depth,patch_z, patch_y, patch_x, patch_z*patch_y*patch_x, batch}}; + Tensor entire_volume_patch_col_major(patchColMajorTensorRange); + size_t patchTensorBuffSize =entire_volume_patch_col_major.size()*sizeof(DataType); + DataType* gpu_data_entire_volume_patch_col_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); + TensorMap> gpu_entire_volume_patch_col_major(gpu_data_entire_volume_patch_col_major, patchColMajorTensorRange); + gpu_entire_volume_patch_col_major.device(sycl_device)=gpu_col_major.extract_volume_patches(patch_z, patch_y, patch_x); + sycl_device.memcpyDeviceToHost(entire_volume_patch_col_major.data(), gpu_data_entire_volume_patch_col_major, patchTensorBuffSize); + + +// Tensor tensor(depth, patch_z, patch_y, patch_x, batch); +// tensor.setRandom(); +// Tensor tensor_row_major = tensor.swap_layout(); + + //Tensor entire_volume_patch; + //entire_volume_patch = tensor.extract_volume_patches(patch_z, patch_y, patch_x); + VERIFY_IS_EQUAL(entire_volume_patch_col_major.dimension(0), depth); + VERIFY_IS_EQUAL(entire_volume_patch_col_major.dimension(1), patch_z); + VERIFY_IS_EQUAL(entire_volume_patch_col_major.dimension(2), patch_y); + VERIFY_IS_EQUAL(entire_volume_patch_col_major.dimension(3), patch_x); + VERIFY_IS_EQUAL(entire_volume_patch_col_major.dimension(4), patch_z * patch_y * patch_x); + VERIFY_IS_EQUAL(entire_volume_patch_col_major.dimension(5), batch); + +// Tensor entire_volume_patch_row_major; + //entire_volume_patch_row_major = tensor_row_major.extract_volume_patches(patch_z, patch_y, patch_x); + + array 
patchRowMajorTensorRange={{batch,patch_z*patch_y*patch_x, patch_x, patch_y, patch_z, depth}}; + Tensor entire_volume_patch_row_major(patchRowMajorTensorRange); + patchTensorBuffSize =entire_volume_patch_row_major.size()*sizeof(DataType); + DataType* gpu_data_entire_volume_patch_row_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); + TensorMap> gpu_entire_volume_patch_row_major(gpu_data_entire_volume_patch_row_major, patchRowMajorTensorRange); + gpu_entire_volume_patch_row_major.device(sycl_device)=gpu_row_major.extract_volume_patches(patch_z, patch_y, patch_x); + sycl_device.memcpyDeviceToHost(entire_volume_patch_row_major.data(), gpu_data_entire_volume_patch_row_major, patchTensorBuffSize); + + + VERIFY_IS_EQUAL(entire_volume_patch_row_major.dimension(0), batch); + VERIFY_IS_EQUAL(entire_volume_patch_row_major.dimension(1), patch_z * patch_y * patch_x); + VERIFY_IS_EQUAL(entire_volume_patch_row_major.dimension(2), patch_x); + VERIFY_IS_EQUAL(entire_volume_patch_row_major.dimension(3), patch_y); + VERIFY_IS_EQUAL(entire_volume_patch_row_major.dimension(4), patch_z); + VERIFY_IS_EQUAL(entire_volume_patch_row_major.dimension(5), depth); + + const int dz = patch_z - 1; + const int dy = patch_y - 1; + const int dx = patch_x - 1; + + const int forward_pad_z = dz - dz / 2; + const int forward_pad_y = dy - dy / 2; + const int forward_pad_x = dx - dx / 2; + + for (int pz = 0; pz < patch_z; pz++) { + for (int py = 0; py < patch_y; py++) { + for (int px = 0; px < patch_x; px++) { + const int patchId = pz + patch_z * (py + px * patch_y); + for (int z = 0; z < patch_z; z++) { + for (int y = 0; y < patch_y; y++) { + for (int x = 0; x < patch_x; x++) { + for (int b = 0; b < batch; b++) { + for (int d = 0; d < depth; d++) { + float expected = 0.0f; + float expected_row_major = 0.0f; + const int eff_z = z - forward_pad_z + pz; + const int eff_y = y - forward_pad_y + py; + const int eff_x = x - forward_pad_x + px; + if (eff_z >= 0 && eff_y >= 0 && eff_x >= 0 && + eff_z < patch_z && eff_y < patch_y && eff_x < patch_x) { + expected = tensor_col_major(d, eff_z, eff_y, eff_x, b); + expected_row_major = tensor_row_major(b, eff_x, eff_y, eff_z, d); + } + VERIFY_IS_EQUAL(entire_volume_patch_col_major(d, z, y, x, patchId, b), expected); + VERIFY_IS_EQUAL(entire_volume_patch_row_major(b, patchId, x, y, z, d), expected_row_major); + } + } + } + } + } + } + } + } + sycl_device.deallocate(gpu_data_col_major); + sycl_device.deallocate(gpu_data_row_major); + sycl_device.deallocate(gpu_data_entire_volume_patch_col_major); + sycl_device.deallocate(gpu_data_entire_volume_patch_row_major); +} + + + +template void sycl_tensor_volume_patch_test_per_device(dev_Selector s){ +QueueInterface queueInterface(s); +auto sycl_device = Eigen::SyclDevice(&queueInterface); +std::cout << "Running on " << s.template get_info() << std::endl; +test_single_voxel_patch_sycl(sycl_device); +test_entire_volume_patch_sycl(sycl_device); +} +void test_cxx11_tensor_volume_patchOP_sycl() +{ +for (const auto& device :Eigen::get_sycl_supported_devices()) { + CALL_SUBTEST(sycl_tensor_volume_patch_test_per_device(device)); +} +} -- cgit v1.2.3 From 8296b87d7bd98c19c6064241880691f164790ede Mon Sep 17 00:00:00 2001 From: Mehdi Goli Date: Tue, 28 Feb 2017 17:16:14 +0000 Subject: Adding sycl backend for TensorCustomOp; fixing the partial lhs modification issue on sycl when the rhs is TensorContraction, reduction or convolution; Fixing the partial modification for memset when sycl backend is used. 
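The partial-modification fixes below all follow the same pattern: a raw device pointer may point into the middle of a SYCL buffer, so a kernel has to be handed the backing buffer plus an offset instead of assuming the pointer is the start of an allocation. A minimal sketch of that idea, with hypothetical names (the patch itself does this through SyclDevice::get_offset() and m_queue_stream->find_buffer()):

    #include <CL/sycl.hpp>
    #include <cstddef>
    #include <cstdint>
    #include <map>
    #include <utility>

    // Map from the start address of each device allocation to its backing buffer.
    using BufferMap = std::map<const std::uint8_t*, cl::sycl::buffer<std::uint8_t, 1>>;

    // Resolve an arbitrary device pointer to (backing buffer, byte offset into it).
    // Assumes ptr lies inside one of the registered allocations.
    inline std::pair<cl::sycl::buffer<std::uint8_t, 1>, std::ptrdiff_t>
    resolve_pointer(const void* ptr, BufferMap& buffers) {
      auto raw = static_cast<const std::uint8_t*>(ptr);
      auto it = buffers.upper_bound(raw);    // first allocation starting after ptr
      --it;                                  // step back to the allocation containing ptr
      return {it->second, raw - it->first};  // kernel then indexes acc[i + offset/sizeof(Scalar)]
    }

With the offset threaded through, a kernel such as memset writes to acc[globalid + buff_offset] rather than acc[globalid], which is what keeps a memset, reduction, contraction or convolution targeting a sub-tensor from clobbering the rest of the allocation.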
--- .../Eigen/CXX11/src/Tensor/TensorContractionSycl.h | 21 +-- .../Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h | 30 ++-- .../Eigen/CXX11/src/Tensor/TensorCustomOp.h | 6 + .../Eigen/CXX11/src/Tensor/TensorDeviceSycl.h | 23 ++- .../Eigen/CXX11/src/Tensor/TensorReductionSycl.h | 16 +- .../Tensor/TensorSyclConvertToDeviceExpression.h | 1 + .../CXX11/src/Tensor/TensorSyclExprConstructor.h | 61 ++++++-- .../CXX11/src/Tensor/TensorSyclExtractAccessor.h | 27 ++++ .../CXX11/src/Tensor/TensorSyclExtractFunctors.h | 43 +++++- .../Eigen/CXX11/src/Tensor/TensorSyclFunctors.h | 21 +-- .../Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h | 20 +++ .../CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h | 27 ++++ unsupported/test/CMakeLists.txt | 1 + unsupported/test/cxx11_tensor_custom_op_sycl.cpp | 165 +++++++++++++++++++++ unsupported/test/cxx11_tensor_forced_eval_sycl.cpp | 2 +- 15 files changed, 397 insertions(+), 67 deletions(-) create mode 100644 unsupported/test/cxx11_tensor_custom_op_sycl.cpp diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h index abc7ba551..fcd7d4d00 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h @@ -84,7 +84,7 @@ struct TensorEvaluatorm_leftImpl.evalSubExprsIfNeeded(NULL); this->m_rightImpl.evalSubExprsIfNeeded(NULL); - if (data) { + if (data) { evalTo(data); return false; } else { @@ -173,6 +173,7 @@ typename HostExpr::Index LocalThreadSizeM, typename HostExpr::Index LocalThreadS LhsLocalAcc localLhs; RhsLocalAcc localRhs; OutAccessor out_res; + size_t out_offset; Index roundUpK, M, N, K; ContractT m_k_strides, m_left_contracting_strides, m_right_contracting_strides; LeftNocontractT m_i_strides, m_left_nocontract_strides; @@ -182,11 +183,12 @@ typename HostExpr::Index LocalThreadSizeM, typename HostExpr::Index LocalThreadS Device dev; - KernelConstructor(LHSFunctorExpr lhs_functors_, RHSFunctorExpr rhs_functors_, LhsLocalAcc localLhs_, RhsLocalAcc localRhs_, OutAccessor out_res_, + KernelConstructor(LHSFunctorExpr lhs_functors_, RHSFunctorExpr rhs_functors_, LhsLocalAcc localLhs_, RhsLocalAcc localRhs_, OutAccessor out_res_, size_t out_offset_, Index roundUpK_, Index M_, Index N_, Index K_, ContractT m_k_strides_, ContractT m_left_contracting_strides_, ContractT m_right_contracting_strides_, LeftNocontractT m_i_strides_, RightNocontractT m_j_strides_, LeftNocontractT m_left_nocontract_strides_, RightNocontractT m_right_nocontract_strides_, LHSTupleType left_tuple_of_accessors_, RHSTupleType right_tuple_of_accessors_, Device dev_) - :lhs_functors(lhs_functors_), rhs_functors(rhs_functors_), localLhs(localLhs_), localRhs(localRhs_), out_res(out_res_), roundUpK(roundUpK_), M(M_), N(N_), K(K_), + :lhs_functors(lhs_functors_), rhs_functors(rhs_functors_), localLhs(localLhs_), localRhs(localRhs_), out_res(out_res_), + out_offset(out_offset_), roundUpK(roundUpK_), M(M_), N(N_), K(K_), m_k_strides(m_k_strides_), m_left_contracting_strides(m_left_contracting_strides_), m_right_contracting_strides(m_right_contracting_strides_), m_i_strides(m_i_strides_), m_left_nocontract_strides(m_left_nocontract_strides_), @@ -316,7 +318,7 @@ typename HostExpr::Index LocalThreadSizeM, typename HostExpr::Index LocalThreadS for (Index wLPTN=0; wLPTN(cgh, self.left_impl())) LHSTupleType; @@ -379,17 +381,16 @@ template< typename Self, typename OutScalar, typename ContractT, typename LeftNo typedef cl::sycl::accessor RhsLocalAcc; RhsLocalAcc 
localRhs(cl::sycl::range<1>(2* TileSizeDimK * TileSizeDimN), cgh); - typedef cl::sycl::accessor OutAccessor; + typedef cl::sycl::accessor OutAccessor; //OutScalar memory - OutAccessor out_res= self.device(). template get_sycl_accessor(cgh, buffer); - + OutAccessor out_res= self.device(). template get_sycl_accessor(cgh, buffer); // sycl parallel for cgh.parallel_for(cl::sycl::nd_range<2>(cl::sycl::range<2>(roundUpM/WorkLoadPerThreadM, roundUpN/WorkLoadPerThreadN), cl::sycl::range<2>(LocalThreadSizeM, LocalThreadSizeN)), KernelConstructor(lhs_functors, rhs_functors, - localLhs, localRhs, out_res, roundUpK, M, N, K, m_k_strides, m_left_contracting_strides, m_right_contracting_strides,m_i_strides, m_j_strides, + localLhs, localRhs, out_res, out_offset, roundUpK, M, N, K, m_k_strides, m_left_contracting_strides, m_right_contracting_strides,m_i_strides, m_j_strides, m_left_nocontract_strides,m_right_nocontract_strides, left_tuple_of_accessors, right_tuple_of_accessors, Eigen::DefaultDevice())); }); self.device().asynchronousExec(); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h index 4247c1c4a..66ffd819f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h @@ -32,14 +32,15 @@ internal::IndexMapper::La Kernel_accessor kernel_filter; const size_t kernelSize, range_x, range_y; Buffer_accessor buffer_acc; +ptrdiff_t out_offset; Local_accessor local_acc; FunctorExpr functors; TupleType tuple_of_accessors; EigenConvolutionKernel1D(internal::IndexMapper::Layout> indexMapper_, Kernel_accessor kernel_filter_, const size_t kernelSize_, const size_t range_x_, const size_t range_y_, - Buffer_accessor buffer_acc_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_) + Buffer_accessor buffer_acc_, ptrdiff_t out_offset_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_) :indexMapper(indexMapper_), kernel_filter(kernel_filter_), kernelSize(kernelSize_), range_x(range_x_), range_y(range_y_), - buffer_acc(buffer_acc_), local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {} + buffer_acc(buffer_acc_), out_offset(out_offset_),local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {} void operator()(cl::sycl::nd_item<2> itemID) { typedef typename TensorSycl::internal::ConvertToDeviceExpression::Type DevExpr; @@ -75,7 +76,7 @@ EigenConvolutionKernel1D(internal::IndexMapper::La Kernel_accessor kernel_filter; const size_t kernelSize_x, kernelSize_y, range_x, range_y , range_z; Buffer_accessor buffer_acc; +ptrdiff_t out_offset; Local_accessor local_acc; FunctorExpr functors; TupleType tuple_of_accessors; EigenConvolutionKernel2D(internal::IndexMapper::Layout> indexMapper_, Kernel_accessor kernel_filter_, const size_t kernelSize_x_, const size_t kernelSize_y_ ,const size_t range_x_, const size_t range_y_, const size_t range_z_, - Buffer_accessor buffer_acc_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_) + Buffer_accessor buffer_acc_, ptrdiff_t out_offset_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_) :indexMapper(indexMapper_), kernel_filter(kernel_filter_), kernelSize_x(kernelSize_x_), kernelSize_y(kernelSize_y_), range_x(range_x_), range_y(range_y_), range_z(range_z_), - buffer_acc(buffer_acc_), local_acc(local_acc_), functors(functors_), 
tuple_of_accessors(tuple_of_accessors_) {} + buffer_acc(buffer_acc_), out_offset(out_offset_), local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {} void operator()(cl::sycl::nd_item<3> itemID) { typedef typename TensorSycl::internal::ConvertToDeviceExpression::Type DevExpr; @@ -141,7 +143,7 @@ EigenConvolutionKernel2D(internal::IndexMapper::La Kernel_accessor kernel_filter; const size_t kernelSize_x, kernelSize_y, kernelSize_z, range_x, range_y , range_z, numP; Buffer_accessor buffer_acc; +ptrdiff_t out_offset; Local_accessor local_acc; FunctorExpr functors; TupleType tuple_of_accessors; EigenConvolutionKernel3D(internal::IndexMapper::Layout> indexMapper_, Kernel_accessor kernel_filter_, const size_t kernelSize_x_, const size_t kernelSize_y_ , const size_t kernelSize_z_ , const size_t range_x_, const size_t range_y_, const size_t range_z_, const size_t numP_, - Buffer_accessor buffer_acc_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_) + Buffer_accessor buffer_acc_, ptrdiff_t out_offset_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_) :indexMapper(indexMapper_), kernel_filter(kernel_filter_), kernelSize_x(kernelSize_x_), kernelSize_y(kernelSize_y_), kernelSize_z(kernelSize_z_), range_x(range_x_), range_y(range_y_), range_z(range_z_), numP(numP_), - buffer_acc(buffer_acc_), local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {} + buffer_acc(buffer_acc_), out_offset(out_offset_), local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {} void operator()(cl::sycl::nd_item<3> itemID) { typedef typename TensorSycl::internal::ConvertToDeviceExpression::Type DevExpr; @@ -215,7 +218,7 @@ EigenConvolutionKernel3D(internal::IndexMapper EvalTo; EvalTo evalToTmp(local, m_kernelArg); @@ -325,6 +328,7 @@ struct TensorEvaluator InputFunctorExpr; // extract input functor list InputFunctorExpr input_functors = Eigen::TensorSycl::internal::extractFunctors(m_inputImpl); + ptrdiff_t out_offset = m_device.get_offset(data); m_device.sycl_queue().submit([&](cl::sycl::handler &cgh) { @@ -358,7 +362,7 @@ struct TensorEvaluator(global_range, local_range), EigenConvolutionKernel1D( - indexMapper,kernel_acc, kernel_size, numX, numP, out_res, local_acc, input_functors, tuple_of_accessors)); + indexMapper,kernel_acc, kernel_size, numX, numP, out_res, out_offset, local_acc, input_functors, tuple_of_accessors)); break; } @@ -383,7 +387,7 @@ struct TensorEvaluator(global_range, local_range), EigenConvolutionKernel2D( - indexMapper,kernel_acc, kernel_size_x, kernel_size_y, numX, numY, numP, out_res, local_acc, input_functors, tuple_of_accessors)); + indexMapper,kernel_acc, kernel_size_x, kernel_size_y, numX, numY, numP, out_res, out_offset, local_acc, input_functors, tuple_of_accessors)); break; } @@ -412,7 +416,7 @@ struct TensorEvaluator( indexMapper,kernel_acc, kernel_size_x, kernel_size_y, kernel_size_z, numX, numY, - numZ, numP, out_res, local_acc, input_functors, tuple_of_accessors)); + numZ, numP, out_res, out_offset, local_acc, input_functors, tuple_of_accessors)); break; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h b/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h index e020d076f..c72d79435 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h @@ -140,6 +140,9 @@ struct TensorEvaluator, Devi EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_result; } + 
/// used by sycl in order to build the sycl buffer + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const{return m_device;} + protected: EIGEN_DEVICE_FUNC void evalTo(Scalar* data) { TensorMap > result( @@ -295,6 +298,9 @@ struct TensorEvaluator > result(data, m_dimensions); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h index e209799bb..964222a15 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h @@ -18,6 +18,8 @@ namespace Eigen { #define ConvertToActualTypeSycl(Scalar, buf_acc) reinterpret_cast::pointer_t>((&(*buf_acc.get_pointer()))) + #define ConvertToActualSyclOffset(Scalar, offset) offset/sizeof(Scalar) + template class MemCopyFunctor { public: @@ -43,11 +45,12 @@ namespace Eigen { struct memsetkernelFunctor{ typedef cl::sycl::accessor AccType; AccType m_acc; + const ptrdiff_t buff_offset; const size_t m_rng, m_c; - memsetkernelFunctor(AccType acc, const size_t rng, const size_t c):m_acc(acc), m_rng(rng), m_c(c){} + memsetkernelFunctor(AccType acc, const ptrdiff_t buff_offset_, const size_t rng, const size_t c):m_acc(acc), buff_offset(buff_offset_), m_rng(rng), m_c(c){} void operator()(cl::sycl::nd_item<1> itemID) { auto globalid=itemID.get_global_linear_id(); - if (globalid< m_rng) m_acc[globalid] = m_c; + if (globalid< m_rng) m_acc[globalid + buff_offset] = m_c; } }; @@ -305,6 +308,11 @@ struct SyclDevice { synchronize(); } + EIGEN_STRONG_INLINE ptrdiff_t get_offset(const void *ptr) const { + auto it = m_queue_stream->find_buffer(ptr); + return (static_cast(ptr))-it->first; + + } /// The memcpyHostToDevice is used to copy the device only pointer to a host pointer. Using the device /// pointer created as a key we find the sycl buffer and get the host accessor with discard_write mode /// on it. 
Using a discard_write accessor guarantees that we do not bring back the current value of the @@ -343,20 +351,23 @@ struct SyclDevice { EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const { size_t rng, GRange, tileSize; parallel_for_setup(n, tileSize, rng, GRange); - sycl_queue().submit(memsetCghFunctor(get_sycl_buffer(static_cast(static_cast(data))),rng, GRange, tileSize, c )); + auto it1 = m_queue_stream->find_buffer(static_cast(data)); + ptrdiff_t buff_offset= (static_cast(data)) - it1->first; + sycl_queue().submit(memsetCghFunctor(it1->second, buff_offset, rng, GRange, tileSize, c )); synchronize(); } struct memsetCghFunctor{ cl::sycl::buffer& m_buf; + const ptrdiff_t& buff_offset; const size_t& rng , GRange, tileSize; const int &c; - memsetCghFunctor(cl::sycl::buffer& buff, const size_t& rng_, const size_t& GRange_, const size_t& tileSize_, const int& c_) - :m_buf(buff), rng(rng_), GRange(GRange_), tileSize(tileSize_), c(c_){} + memsetCghFunctor(cl::sycl::buffer& buff, const ptrdiff_t& buff_offset_, const size_t& rng_, const size_t& GRange_, const size_t& tileSize_, const int& c_) + :m_buf(buff), buff_offset(buff_offset_), rng(rng_), GRange(GRange_), tileSize(tileSize_), c(c_){} void operator()(cl::sycl::handler &cgh) const { auto buf_acc = m_buf.template get_access(cgh); - cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), memsetkernelFunctor(buf_acc, rng, c)); + cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), memsetkernelFunctor(buf_acc, buff_offset, rng, c)); } }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h index c3ca129e2..c9c7acfdc 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h @@ -27,9 +27,9 @@ namespace internal { template struct syclGenericBufferReducer{ template -static void run(OP op, BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){ +static void run(OP op, BufferTOut& bufOut, ptrdiff_t out_offset, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){ do { - auto f = [length, local, op, &bufOut, &bufI](cl::sycl::handler& h) mutable { + auto f = [length, local, op, out_offset, &bufOut, &bufI](cl::sycl::handler& h) mutable { cl::sycl::nd_range<1> r{cl::sycl::range<1>{std::max(length, local)}, cl::sycl::range<1>{std::min(length, local)}}; /* Two accessors are used: one to the buffer that is being reduced, @@ -43,7 +43,7 @@ static void run(OP op, BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDev /* The parallel_for invocation chosen is the variant with an nd_item * parameter, since the code requires barriers for correctness. 
*/ - h.parallel_for(r, TensorSycl::internal::GenericKernelReducer(op, aOut, aI, scratch, length, local)); + h.parallel_for(r, TensorSycl::internal::GenericKernelReducer(op, aOut, out_offset, aI, scratch, length, local)); }; dev.sycl_queue().submit(f); dev.asynchronousExec(); @@ -60,9 +60,9 @@ static void run(OP op, BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDev template struct syclGenericBufferReducer, CoeffReturnType>{ template -static void run(Eigen::internal::MeanReducer, BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){ +static void run(Eigen::internal::MeanReducer, BufferTOut& bufOut,ptrdiff_t out_offset, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){ syclGenericBufferReducer, CoeffReturnType>::run(Eigen::internal::SumReducer(), - bufOut, bufI, dev, length, local); + bufOut, out_offset, bufI, dev, length, local); } }; @@ -127,8 +127,9 @@ struct FullReducer { // getting final out buffer at the moment the created buffer is true because there is no need for assign auto out_buffer =dev.get_sycl_buffer(output); + ptrdiff_t out_offset = dev.get_offset(output); /// This is used to recursively reduce the tmp value to an element of 1; - syclGenericBufferReducer::run(reducer, out_buffer, temp_global_buffer,dev, GRange, outTileSize); + syclGenericBufferReducer::run(reducer, out_buffer, out_offset, temp_global_buffer,dev, GRange, outTileSize); } }; @@ -158,10 +159,11 @@ struct InnerReducer { // create a tuple of accessors from Evaluator Tuple_of_Acc tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl()); auto output_accessor = dev.template get_sycl_accessor(cgh, output); + ptrdiff_t out_offset = dev.get_offset(output); Index red_size = (num_values_to_reduce!=0)? 
num_values_to_reduce : static_cast(1); cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), TensorSycl::internal::ReductionFunctor - (output_accessor, functors, tuple_of_accessors, self.xprDims(), reducer, range, red_size)); + (output_accessor, out_offset, functors, tuple_of_accessors, self.xprDims(), reducer, range, red_size)); }); dev.asynchronousExec(); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h index dd63a2e2f..9476c0ea8 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h @@ -109,6 +109,7 @@ struct ConvertToDeviceExpression > {\ typedef CVQual ExprNode< typename ConvertToDeviceExpression::Type> Type;\ }; + // TensorForcedEvalOp KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAP(const,TensorForcedEvalOp) KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAP(,TensorForcedEvalOp) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h index 117b368ec..af4eb5f13 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h @@ -236,8 +236,12 @@ EVALTO() template \ struct ExprConstructor,\ CVQual PlaceHolder, N>, Params...> {\ - typedef CVQual TensorMap::Scalar,\ - TensorForcedEvalOp::NumDimensions, Eigen::internal::traits>::Layout, typename TensorForcedEvalOp::Index>, Eigen::internal::traits>::Layout, MakeGlobalPointer> Type;\ + typedef TensorForcedEvalOp XprType;\ + typedef CVQual TensorMap<\ + Tensor::Layout,typename XprType::Index>,\ + Eigen::internal::traits::Layout, \ + MakeGlobalPointer\ + > Type;\ Type expr;\ template \ ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple &t)\ @@ -248,6 +252,28 @@ FORCEDEVAL(const) FORCEDEVAL() #undef FORCEDEVAL + + +#define TENSORCUSTOMUNARYOP(CVQual)\ +template \ +struct ExprConstructor,\ +CVQual PlaceHolder, N>, Params...> {\ + typedef TensorCustomUnaryOp XprType;\ + typedef CVQual TensorMap<\ + Tensor::Layout,typename XprType::Index>,\ + Eigen::internal::traits::Layout, \ + MakeGlobalPointer\ + > Type;\ + Type expr;\ + template \ + ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple &t)\ + : expr(Type(ConvertToActualTypeSycl(typename Type::Scalar, utility::tuple::get(t)), fd.dimensions())) {}\ +}; + +TENSORCUSTOMUNARYOP(const) +TENSORCUSTOMUNARYOP() +#undef TENSORCUSTOMUNARYOP + template struct ValueCondition { static const size_t Res =X; }; @@ -260,7 +286,7 @@ template struct ValueCondition { template \ struct ExprConstructor,\ CVQual PlaceHolder, N>, Params...> {\ - static const size_t NumIndices= ValueCondition< TensorReductionOp::NumDimensions==0, 1, TensorReductionOp::NumDimensions >::Res;\ + static const auto NumIndices= ValueCondition< TensorReductionOp::NumDimensions==0, 1, TensorReductionOp::NumDimensions >::Res;\ typedef CVQual TensorMap::Scalar,\ NumIndices, Eigen::internal::traits>::Layout, typename TensorReductionOp::Index>, Eigen::internal::traits>::Layout, MakeGlobalPointer> Type;\ Type expr;\ @@ -275,28 +301,31 @@ SYCLREDUCTIONEXPR() /// specialisation of the \ref ExprConstructor struct when the node type is -/// TensorContractionOp -#define SYCLCONTRACTIONCONVOLUTION(CVQual, ExprNode)\ +/// TensorContractionOp, TensorConvolutionOp TensorCustomBinaryOp +#define SYCLCONTRACTCONVCUSBIOPS(CVQual, 
ExprNode)\ template \ struct ExprConstructor,\ CVQual PlaceHolder, N>, Params...> {\ - static const size_t NumIndices= Eigen::internal::traits >::NumDimensions;\ - typedef CVQual TensorMap::Scalar,\ - NumIndices, Eigen::internal::traits >::Layout,\ - typename ExprNode::Index>,\ - Eigen::internal::traits>::Layout, MakeGlobalPointer> Type;\ + typedef ExprNode XprTyp;\ + static const auto NumIndices= Eigen::internal::traits::NumDimensions;\ + typedef CVQual TensorMap<\ + Tensor::Layout, typename XprTyp::Index>,\ + Eigen::internal::traits::Layout, \ + MakeGlobalPointer\ + > Type;\ Type expr;\ template \ ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple &t)\ :expr(Type(ConvertToActualTypeSycl(typename Type::Scalar, utility::tuple::get(t)), fd.dimensions())) {}\ }; -SYCLCONTRACTIONCONVOLUTION(const, TensorContractionOp) -SYCLCONTRACTIONCONVOLUTION(, TensorContractionOp) -SYCLCONTRACTIONCONVOLUTION(const, TensorConvolutionOp) -SYCLCONTRACTIONCONVOLUTION(, TensorConvolutionOp) -#undef SYCLCONTRACTIONCONVOLUTION - +SYCLCONTRACTCONVCUSBIOPS(const, TensorContractionOp) +SYCLCONTRACTCONVCUSBIOPS(, TensorContractionOp) +SYCLCONTRACTCONVCUSBIOPS(const, TensorConvolutionOp) +SYCLCONTRACTCONVCUSBIOPS(, TensorConvolutionOp) +SYCLCONTRACTCONVCUSBIOPS(const, TensorCustomBinaryOp) +SYCLCONTRACTCONVCUSBIOPS(, TensorCustomBinaryOp) +#undef SYCLCONTRACTCONVCUSBIOPS #define SYCLSLICEOPEXPR(CVQual)\ diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h index 4a6322d44..5a6a8f4c5 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h @@ -148,6 +148,33 @@ SYCLFORCEDEVALEXTACC() #undef SYCLFORCEDEVALEXTACC +#define SYCLCUSTOMUNARYOPEXTACC(CVQual)\ +template \ +struct ExtractAccessor, Dev> > {\ + static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator, Dev>& eval)\ + RETURN_CPP11(AccessorConstructor::template getAccessor(cgh, eval))\ +}; + + +SYCLCUSTOMUNARYOPEXTACC(const) +SYCLCUSTOMUNARYOPEXTACC() +#undef SYCLCUSTOMUNARYOPEXTACC + + +#define SYCLCUSTOMBINARYOPEXTACC(CVQual)\ +template \ +struct ExtractAccessor, Dev> > {\ + static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator, Dev>& eval)\ + RETURN_CPP11(AccessorConstructor::template getAccessor(cgh, eval))\ +}; + +SYCLCUSTOMBINARYOPEXTACC(const) +SYCLCUSTOMBINARYOPEXTACC() +#undef SYCLCUSTOMBIBARYOPEXTACC + + + + /// specialisation of the \ref ExtractAccessor struct when the node type is TensorEvalToOp #define SYCLEVALTOEXTACC(CVQual)\ template \ diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h index 8828a0495..9fcac5ecb 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h @@ -33,14 +33,17 @@ namespace internal { /// re-instantiate them on the device. /// We have to pass instantiated functors to the device. // This struct is used for leafNode (TensorMap) and nodes behaving like leafNode (TensorForcedEval). 
+#define DEFALTACTION(Evaluator)\ +typedef typename Evaluator::Dimensions Dimensions;\ +const Dimensions m_dimensions;\ +EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }\ +FunctorExtractor(const Evaluator& expr): m_dimensions(expr.dimensions()) {} + template struct FunctorExtractor{ - typedef typename Evaluator::Dimensions Dimensions; - const Dimensions m_dimensions; - EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - FunctorExtractor(const Evaluator& expr) - : m_dimensions(expr.dimensions()) {} + DEFALTACTION(Evaluator) }; + /// specialisation of the \ref FunctorExtractor struct when the node type does not require anything ///TensorConversionOp #define SYCLEXTRFUNCCONVERSION(ExprNode, CVQual)\ @@ -112,6 +115,36 @@ SYCLEXTRFUNCTERNARY(const) SYCLEXTRFUNCTERNARY() #undef SYCLEXTRFUNCTERNARY + + +//TensorCustomOp must be specialised otherewise it will be captured by UnaryCategory while its action is different +//from the UnaryCategory and it is similar to the general FunctorExtractor. +/// specialisation of TensorCustomOp +#define SYCLEXTRFUNCCUSTOMUNARYOP(CVQual)\ +template \ +struct FunctorExtractor, Dev> > {\ + typedef TensorEvaluator, Dev> Evaluator;\ + DEFALTACTION(Evaluator)\ +}; + +SYCLEXTRFUNCCUSTOMUNARYOP(const) +SYCLEXTRFUNCCUSTOMUNARYOP() +#undef SYCLEXTRFUNCCUSTOMUNARYOP + + +#define SYCLEXTRFUNCCUSTOMBIBARYOP(CVQual)\ +template \ +struct FunctorExtractor, Dev> > {\ + typedef TensorEvaluator, Dev> Evaluator;\ + DEFALTACTION(Evaluator)\ +}; + +SYCLEXTRFUNCCUSTOMBIBARYOP(const) +SYCLEXTRFUNCCUSTOMBIBARYOP() +#undef SYCLEXTRFUNCCUSTOMBIBARYOP + + + /// specialisation of the \ref FunctorExtractor struct when the node type is /// TensorCwiseSelectOp. This is an specialisation without OP so it has to be separated. 
#define SYCLEXTRFUNCSELECTOP(CVQual)\ diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h index 2f7779036..12237bfab 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h @@ -21,11 +21,12 @@ namespace internal { template struct GenericKernelReducer{ OP op; OutputAccessor aOut; + ptrdiff_t out_offset; InputAccessor aI; LocalAccessor scratch; size_t length, local; - GenericKernelReducer(OP op_, OutputAccessor aOut_, InputAccessor aI_, LocalAccessor scratch_, size_t length_, size_t local_) - : op(op_), aOut(aOut_), aI(aI_), scratch(scratch_), length(length_), local(local_){} + GenericKernelReducer(OP op_, OutputAccessor aOut_, ptrdiff_t out_offset_, InputAccessor aI_, LocalAccessor scratch_, size_t length_, size_t local_) + : op(op_), aOut(aOut_), out_offset(out_offset_), aI(aI_), scratch(scratch_), length(length_), local(local_){} void operator()(cl::sycl::nd_item<1> itemID) { size_t globalid = itemID.get_global(0); size_t localid = itemID.get_local(0); @@ -59,7 +60,7 @@ namespace internal { aI[itemID.get_group(0)] = scratch[localid]; if((length<=local) && globalid ==0){ auto aOutPtr = ConvertToActualTypeSycl(CoeffReturnType, aOut); - aOutPtr[0]=scratch[0]; + aOutPtr[0 + ConvertToActualSyclOffset(CoeffReturnType, out_offset)]=scratch[0]; } } } @@ -72,8 +73,8 @@ template < typename HostExpr, typename FunctorExpr, typename Tuple_of_Acc, typen public: typedef typename TensorSycl::internal::createPlaceHolderExpression::Type PlaceHolderExpr; typedef cl::sycl::accessor write_accessor; - ReductionFunctor(write_accessor output_accessor_, FunctorExpr functors_, Tuple_of_Acc tuple_of_accessors_,Dims dims_, Op functor_, Index range_, Index) - :output_accessor(output_accessor_), functors(functors_), tuple_of_accessors(tuple_of_accessors_), dims(dims_), functor(functor_), range(range_) {} + ReductionFunctor(write_accessor output_accessor_, ptrdiff_t out_offset_, FunctorExpr functors_, Tuple_of_Acc tuple_of_accessors_,Dims dims_, Op functor_, Index range_, Index) + :output_accessor(output_accessor_), out_offset(out_offset_), functors(functors_), tuple_of_accessors(tuple_of_accessors_), dims(dims_), functor(functor_), range(range_) {} void operator()(cl::sycl::nd_item<1> itemID) { typedef typename ConvertToDeviceExpression::Type DevExpr; @@ -93,11 +94,12 @@ template < typename HostExpr, typename FunctorExpr, typename Tuple_of_Acc, typen typename DeviceSelf::CoeffReturnType accum = functor.initialize(); Eigen::internal::GenericDimReducer::reduce(device_self_evaluator, device_self_evaluator.firstInput(static_cast(globalid)),const_cast(functor), &accum); functor.finalize(accum); - output_accessor_ptr[globalid]= accum; + output_accessor_ptr[globalid + ConvertToActualSyclOffset(typename DeviceSelf::CoeffReturnType, out_offset)]= accum; } } private: write_accessor output_accessor; + ptrdiff_t out_offset; FunctorExpr functors; Tuple_of_Acc tuple_of_accessors; Dims dims; @@ -111,9 +113,9 @@ class ReductionFunctor::Type PlaceHolderExpr; typedef cl::sycl::accessor write_accessor; typedef Eigen::internal::SumReducer Op; - ReductionFunctor(write_accessor output_accessor_, FunctorExpr functors_, Tuple_of_Acc tuple_of_accessors_,Dims dims_, + ReductionFunctor(write_accessor output_accessor_, ptrdiff_t out_offset_, FunctorExpr functors_, Tuple_of_Acc tuple_of_accessors_,Dims dims_, Eigen::internal::MeanReducer, Index range_, Index num_values_to_reduce_) - 
:output_accessor(output_accessor_), functors(functors_), tuple_of_accessors(tuple_of_accessors_), dims(dims_), functor(Op()), range(range_), num_values_to_reduce(num_values_to_reduce_) {} + :output_accessor(output_accessor_), out_offset(out_offset_), functors(functors_), tuple_of_accessors(tuple_of_accessors_), dims(dims_), functor(Op()), range(range_), num_values_to_reduce(num_values_to_reduce_) {} void operator()(cl::sycl::nd_item<1> itemID) { typedef typename ConvertToDeviceExpression::Type DevExpr; @@ -133,11 +135,12 @@ class ReductionFunctor::reduce(device_self_evaluator, device_self_evaluator.firstInput(static_cast(globalid)),const_cast(functor), &accum); functor.finalize(accum); - output_accessor_ptr[globalid]= accum/num_values_to_reduce; + output_accessor_ptr[globalid+ ConvertToActualSyclOffset(typename DeviceSelf::CoeffReturnType, out_offset)]= accum/num_values_to_reduce; } } private: write_accessor output_accessor; + ptrdiff_t out_offset; FunctorExpr functors; Tuple_of_Acc tuple_of_accessors; Dims dims; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h index 50f4595fc..330283b39 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h @@ -93,6 +93,26 @@ SYCLFORCEDEVALLEAFCOUNT(const) SYCLFORCEDEVALLEAFCOUNT() #undef SYCLFORCEDEVALLEAFCOUNT +#define SYCLCUSTOMUNARYOPLEAFCOUNT(CVQual)\ +template \ +struct LeafCount > {\ +static const size_t Count =1;\ +}; + +SYCLCUSTOMUNARYOPLEAFCOUNT(const) +SYCLCUSTOMUNARYOPLEAFCOUNT() +#undef SYCLCUSTOMUNARYOPLEAFCOUNT + + +#define SYCLCUSTOMBINARYOPLEAFCOUNT(CVQual)\ +template \ +struct LeafCount > {\ +static const size_t Count =1;\ +}; +SYCLCUSTOMBINARYOPLEAFCOUNT( const) +SYCLCUSTOMBINARYOPLEAFCOUNT() +#undef SYCLCUSTOMBINARYOPLEAFCOUNT + /// specialisation of the \ref LeafCount struct when the node type is TensorEvalToOp #define EVALTOLAYOUTSWAPLEAFCOUNT(CVQual , ExprNode, Num)\ template \ diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h index fcef0be04..99d528963 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h @@ -143,6 +143,33 @@ FORCEDEVAL(const) FORCEDEVAL() #undef FORCEDEVAL + +/// specialisation of the \ref PlaceHolderExpression when the node is +/// TensorForcedEvalOp +#define CUSTOMUNARYOPEVAL(CVQual)\ +template \ +struct PlaceHolderExpression, N> {\ + typedef CVQual PlaceHolder, N> Type;\ +}; + +CUSTOMUNARYOPEVAL(const) +CUSTOMUNARYOPEVAL() +#undef CUSTOMUNARYOPEVAL + + +/// specialisation of the \ref PlaceHolderExpression when the node is +/// TensorForcedEvalOp +#define CUSTOMBINARYOPEVAL(CVQual)\ +template \ +struct PlaceHolderExpression, N> {\ + typedef CVQual PlaceHolder, N> Type;\ +}; + +CUSTOMBINARYOPEVAL(const) +CUSTOMBINARYOPEVAL() +#undef CUSTOMBINARYOPEVAL + + /// specialisation of the \ref PlaceHolderExpression when the node is /// TensorEvalToOp, TensorLayoutSwapOp #define EVALTOLAYOUTSWAP(CVQual, ExprNode)\ diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index 508f29446..996178292 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -173,6 +173,7 @@ if(EIGEN_TEST_CXX11) ei_add_test_sycl(cxx11_tensor_patch_sycl "-std=c++11") ei_add_test_sycl(cxx11_tensor_image_patchOP_sycl "-std=c++11") 
ei_add_test_sycl(cxx11_tensor_volume_patchOP_sycl "-std=c++11") + ei_add_test_sycl(cxx11_tensor_custom_op_sycl "-std=c++11") endif(EIGEN_TEST_SYCL) # It should be safe to always run these tests as there is some fallback code for # older compiler that don't support cxx11. diff --git a/unsupported/test/cxx11_tensor_custom_op_sycl.cpp b/unsupported/test/cxx11_tensor_custom_op_sycl.cpp new file mode 100644 index 000000000..9ff287fff --- /dev/null +++ b/unsupported/test/cxx11_tensor_custom_op_sycl.cpp @@ -0,0 +1,165 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX +#define EIGEN_TEST_FUNC cxx11_tensor_custom_op_sycl +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t +#define EIGEN_USE_SYCL + +#include "main.h" +#include + +using Eigen::Tensor; +template +struct InsertZeros { + DSizes dimensions(const TensorType& input) const { + DSizes result; + result[0] = input.dimension(0) * 2; + result[1] = input.dimension(1) * 2; + return result; + } + + template + void eval(const TensorType& input, Output& output, const Device& device) const + { + array strides; + strides[0] = 2; + strides[1] = 2; + output.stride(strides).device(device) = input; + + Eigen::DSizes offsets(1,1); + Eigen::DSizes extents(output.dimension(0)-1, output.dimension(1)-1); + output.slice(offsets, extents).stride(strides).device(device) = input.constant(0.0f); + } +}; + +template +static void test_custom_unary_op_sycl(const Eigen::SyclDevice &sycl_device) +{ + IndexType sizeDim1 = 3; + IndexType sizeDim2 = 5; + Eigen::array tensorRange = {{sizeDim1, sizeDim2}}; + Eigen::array tensorResultRange = {{6, 10}}; + + Eigen::Tensor in1(tensorRange); + Eigen::Tensor out(tensorResultRange); + + DataType * gpu_in1_data = static_cast(sycl_device.allocate(in1.dimensions().TotalSize()*sizeof(DataType))); + DataType * gpu_out_data = static_cast(sycl_device.allocate(out.dimensions().TotalSize()*sizeof(DataType))); + + typedef Eigen::TensorMap > TensorType; + TensorType gpu_in1(gpu_in1_data, tensorRange); + TensorType gpu_out(gpu_out_data, tensorResultRange); + + in1.setRandom(); + sycl_device.memcpyHostToDevice(gpu_in1_data, in1.data(),(in1.dimensions().TotalSize())*sizeof(DataType)); + gpu_out.device(sycl_device) = gpu_in1.customOp(InsertZeros()); + sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(DataType)); + + VERIFY_IS_EQUAL(out.dimension(0), 6); + VERIFY_IS_EQUAL(out.dimension(1), 10); + + for (int i = 0; i < 6; i+=2) { + for (int j = 0; j < 10; j+=2) { + VERIFY_IS_EQUAL(out(i, j), in1(i/2, j/2)); + } + } + for (int i = 1; i < 6; i+=2) { + for (int j = 1; j < 10; j+=2) { + VERIFY_IS_EQUAL(out(i, j), 0); + } + } +} + +template +struct BatchMatMul { + DSizes dimensions(const TensorType& input1, const TensorType& input2) const { + DSizes result; + result[0] = input1.dimension(0); + result[1] = input2.dimension(1); + result[2] = input2.dimension(2); + return result; + } + + template + void eval(const TensorType& input1, const TensorType& input2, + Output& output, const Device& device) const + { + typedef typename 
TensorType::DimensionPair DimPair; + array dims; + dims[0] = DimPair(1, 0); + for (int64_t i = 0; i < output.dimension(2); ++i) { + output.template chip<2>(i).device(device) = input1.template chip<2>(i).contract(input2.template chip<2>(i), dims); + } + } +}; + +template +static void test_custom_binary_op_sycl(const Eigen::SyclDevice &sycl_device) +{ + + Eigen::array tensorRange1 = {{2, 3, 5}}; + Eigen::array tensorRange2 = {{3,7,5}}; + Eigen::array tensorResultRange = {{2, 7, 5}}; + + Eigen::Tensor in1(tensorRange1); + Eigen::Tensor in2(tensorRange2); + Eigen::Tensor out(tensorResultRange); + + DataType * gpu_in1_data = static_cast(sycl_device.allocate(in1.dimensions().TotalSize()*sizeof(DataType))); + DataType * gpu_in2_data = static_cast(sycl_device.allocate(in2.dimensions().TotalSize()*sizeof(DataType))); + DataType * gpu_out_data = static_cast(sycl_device.allocate(out.dimensions().TotalSize()*sizeof(DataType))); + + typedef Eigen::TensorMap > TensorType; + TensorType gpu_in1(gpu_in1_data, tensorRange1); + TensorType gpu_in2(gpu_in2_data, tensorRange2); + TensorType gpu_out(gpu_out_data, tensorResultRange); + + in1.setRandom(); + in2.setRandom(); + + sycl_device.memcpyHostToDevice(gpu_in1_data, in1.data(),(in1.dimensions().TotalSize())*sizeof(DataType)); + sycl_device.memcpyHostToDevice(gpu_in2_data, in2.data(),(in2.dimensions().TotalSize())*sizeof(DataType)); + + gpu_out.device(sycl_device) = gpu_in1.customOp(gpu_in2, BatchMatMul()); + sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(DataType)); + + for (IndexType i = 0; i < 5; ++i) { + typedef typename Eigen::Tensor::DimensionPair DimPair; + array dims; + dims[0] = DimPair(1, 0); + Eigen::Tensor reference = in1.template chip<2>(i).contract(in2.template chip<2>(i), dims); + TensorRef > val = out.template chip<2>(i); + for (IndexType j = 0; j < 2; ++j) { + for (IndexType k = 0; k < 7; ++k) { + VERIFY_IS_APPROX(val(j, k), reference(j, k)); + } + } + } +} + +template void custom_op_perDevice(Dev_selector s){ + QueueInterface queueInterface(s); + auto sycl_device = Eigen::SyclDevice(&queueInterface); + test_custom_unary_op_sycl(sycl_device); + test_custom_unary_op_sycl(sycl_device); + test_custom_binary_op_sycl(sycl_device); + test_custom_binary_op_sycl(sycl_device); + +} +void test_cxx11_tensor_custom_op_sycl() { + for (const auto& device :Eigen::get_sycl_supported_devices()) { + CALL_SUBTEST(custom_op_perDevice(device)); + } +} diff --git a/unsupported/test/cxx11_tensor_forced_eval_sycl.cpp b/unsupported/test/cxx11_tensor_forced_eval_sycl.cpp index aca036cde..a21514d56 100644 --- a/unsupported/test/cxx11_tensor_forced_eval_sycl.cpp +++ b/unsupported/test/cxx11_tensor_forced_eval_sycl.cpp @@ -44,7 +44,7 @@ void test_forced_eval_sycl(const Eigen::SyclDevice &sycl_device) { Eigen::TensorMap> gpu_in2(gpu_in2_data, tensorRange); Eigen::TensorMap> gpu_out(gpu_out_data, tensorRange); sycl_device.memcpyHostToDevice(gpu_in1_data, in1.data(),(in1.dimensions().TotalSize())*sizeof(DataType)); - sycl_device.memcpyHostToDevice(gpu_in2_data, in2.data(),(in1.dimensions().TotalSize())*sizeof(DataType)); + sycl_device.memcpyHostToDevice(gpu_in2_data, in2.data(),(in2.dimensions().TotalSize())*sizeof(DataType)); /// c=(a+b)*b gpu_out.device(sycl_device) =(gpu_in1 + gpu_in2).eval() * gpu_in2; sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(DataType)); -- cgit v1.2.3 From f84963ed95ff277bf3abb2e2517b3017a25ccf3f Mon Sep 17 00:00:00 2001 From: Mehdi Goli Date: Tue, 7 Mar 
2017 14:27:10 +0000 Subject: Adding TensorIndexTuple and TensorTupleReduceOP backend (ArgMax/Min) for sycl; fixing the address space issue for const TensorMap; converting all discard_write to write due to data missmatch. --- unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h | 19 +- .../Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h | 146 +++++++++++ .../Eigen/CXX11/src/Tensor/TensorContractionSycl.h | 6 +- .../Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h | 10 +- .../Eigen/CXX11/src/Tensor/TensorDeviceSycl.h | 268 +++++++++++++-------- .../Eigen/CXX11/src/Tensor/TensorEvaluator.h | 5 + unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h | 2 + .../Eigen/CXX11/src/Tensor/TensorReductionSycl.h | 4 +- unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h | 12 + .../Tensor/TensorSyclConvertToDeviceExpression.h | 36 +-- .../CXX11/src/Tensor/TensorSyclExprConstructor.h | 77 ++++-- .../CXX11/src/Tensor/TensorSyclExtractAccessor.h | 48 ++-- .../CXX11/src/Tensor/TensorSyclExtractFunctors.h | 66 +++-- .../Eigen/CXX11/src/Tensor/TensorSyclFunctors.h | 16 +- .../Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h | 34 +-- .../CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h | 38 ++- unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h | 3 +- unsupported/test/CMakeLists.txt | 1 + unsupported/test/cxx11_tensor_argmax_sycl.cpp | 248 +++++++++++++++++++ 19 files changed, 813 insertions(+), 226 deletions(-) create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h create mode 100644 unsupported/test/cxx11_tensor_argmax_sycl.cpp diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h index d06f40cd8..e81001c6e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h @@ -119,6 +119,12 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + // required by sycl + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator& impl() const { + return m_impl; + } + + protected: TensorEvaluator m_impl; }; @@ -222,7 +228,7 @@ struct TensorEvaluator, Devi EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_orig_impl(op.expression(), device), m_impl(op.expression().index_tuples().reduce(op.reduce_dims(), op.reduce_op()), device), - m_return_dim(op.return_dim()) { + m_return_dim(op.return_dim()), m_device(device) { gen_strides(m_orig_impl.dimensions(), m_strides); if (Layout == static_cast(ColMajor)) { @@ -252,7 +258,16 @@ struct TensorEvaluator, Devi return (m_return_dim < 0) ? 
v.first : (v.first % m_stride_mod) / m_stride_div; } + #ifndef EIGEN_USE_SYCL EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + #else // following functions are required by sycl + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TupleType* data() const { return m_impl.data(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int return_dim() const {return m_return_dim;} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const StrideDims& strides() const {return m_strides;} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& stride_mod() const {return m_stride_mod;} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& stride_div() const {return m_stride_div;} + const Device& device() const{return m_device;} + #endif EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { @@ -292,6 +307,8 @@ struct TensorEvaluator, Devi StrideDims m_strides; Index m_stride_mod; Index m_stride_div; + // required by sycl + const Device& m_device; }; } // end namespace Eigen diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h new file mode 100644 index 000000000..90cbe004f --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h @@ -0,0 +1,146 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +/***************************************************************** + * TensorArgMaxSycl.h + * \brief: + * TensorArgMaxSycl + * +*****************************************************************/ + +#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_ARGMAX_SYCL_HPP +#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_ARGMAX_SYCL_HPP +namespace Eigen { +namespace internal { + template + struct eval, Eigen::Dense> + { + typedef const TensorTupleReducerDeviceOp& type; + }; + + template + struct nested, 1, + typename eval >::type> + { + typedef TensorTupleReducerDeviceOp type; + }; + +template +struct traits > : public traits +{ + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::Nested Nested; + typedef typename remove_reference::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; +}; + + +}// end namespace internal +template +class TensorTupleReducerDeviceOp : public TensorBase, ReadOnlyAccessors> +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorTupleReducerDeviceOp(XprType expr, + const int return_dim, + const StrideDims& strides, + const Index& stride_mod, const Index& stride_div) + :m_xpr(expr), m_return_dim(return_dim), m_strides(strides), m_stride_mod(stride_mod), m_stride_div(stride_div) {} + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + 
expression() const { return m_xpr; } + + EIGEN_DEVICE_FUNC + int return_dim() const { return m_return_dim; } + + EIGEN_DEVICE_FUNC + const StrideDims& strides() const { return m_strides; } + + EIGEN_DEVICE_FUNC + const Index& stride_mod() const { return m_stride_mod; } + + EIGEN_DEVICE_FUNC + const Index& stride_div() const { return m_stride_div; } + + protected: + typename Eigen::internal::remove_all::type m_xpr; + const int m_return_dim; + const StrideDims& m_strides; + const Index m_stride_mod; + const Index m_stride_div; +}; + + +// Eval as rvalue +template +struct TensorEvaluator, SyclKernelDevice> +{ + typedef TensorTupleReducerDeviceOp XprType; + typedef typename XprType::Index Index; + typedef typename XprType::Index Scalar; + typedef Index CoeffReturnType; + typedef typename XprType::CoeffReturnType TupleType; + typedef typename TensorEvaluator::Dimensions Dimensions; + + enum { + IsAligned = false, + PacketAccess = false, + BlockAccess = false, + Layout = TensorEvaluator::Layout, + CoordAccess = false, + RawAccess = false + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const SyclKernelDevice& device) + : m_impl(op.expression(), device), m_return_dim(op.return_dim()), m_strides(op.strides()), m_stride_mod(op.stride_mod()), + m_stride_div(op.stride_div()){} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { + return m_impl.dimensions(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + const TupleType v = m_impl.coeff(index); + return (m_return_dim < 0) ? v.first : (v.first % m_stride_mod) / m_stride_div; + } +typedef typename MakeGlobalPointer::CoeffReturnType >::Type ptr_Dev_type; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ptr_Dev_type data() const { return const_cast(m_impl.data()); } + +protected: + TensorEvaluator m_impl; + const int m_return_dim; + const StrideDims& m_strides; + const Index& m_stride_mod; + const Index& m_stride_div; +}; +} // end namespace Eigen +#endif //UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_ARGMAX_SYCL_HPP diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h index fcd7d4d00..5b4c3c5bd 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h @@ -11,7 +11,7 @@ // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
/***************************************************************** - * TensorSyclConvertToDeviceExpression.h + * TensorTensorContractionsycl.h * * \brief: * TensorContractionsycl @@ -389,9 +389,9 @@ template< typename Self, typename OutScalar, typename ContractT, typename LeftNo cl::sycl::range<2>(LocalThreadSizeM, LocalThreadSizeN)), KernelConstructor(lhs_functors, rhs_functors, + WorkLoadPerThreadM, WorkLoadPerThreadN, LocalThreadSizeM, LocalThreadSizeN, LoadPerThreadLhs, LoadPerThreadRhs, LHSTupleType, RHSTupleType, Eigen::SyclKernelDevice>(lhs_functors, rhs_functors, localLhs, localRhs, out_res, out_offset, roundUpK, M, N, K, m_k_strides, m_left_contracting_strides, m_right_contracting_strides,m_i_strides, m_j_strides, - m_left_nocontract_strides,m_right_nocontract_strides, left_tuple_of_accessors, right_tuple_of_accessors, Eigen::DefaultDevice())); + m_left_nocontract_strides,m_right_nocontract_strides, left_tuple_of_accessors, right_tuple_of_accessors, Eigen::SyclKernelDevice())); }); self.device().asynchronousExec(); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h index 66ffd819f..5db16d559 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h @@ -45,7 +45,7 @@ EigenConvolutionKernel1D(internal::IndexMapper itemID) { typedef typename TensorSycl::internal::ConvertToDeviceExpression::Type DevExpr; auto device_expr =TensorSycl::internal::createDeviceExpression(functors, tuple_of_accessors); - auto device_evaluator = Eigen::TensorEvaluator(device_expr.expr, Eigen::DefaultDevice()); + auto device_evaluator = Eigen::TensorEvaluator(device_expr.expr, Eigen::SyclKernelDevice()); auto buffer_ptr = ConvertToActualTypeSycl(CoeffReturnType, buffer_acc); auto kernel_ptr = ConvertToActualTypeSycl(KernelType, kernel_filter); @@ -103,7 +103,7 @@ EigenConvolutionKernel2D(internal::IndexMapper itemID) { typedef typename TensorSycl::internal::ConvertToDeviceExpression::Type DevExpr; auto device_expr =TensorSycl::internal::createDeviceExpression(functors, tuple_of_accessors); - auto device_evaluator = Eigen::TensorEvaluator(device_expr.expr, Eigen::DefaultDevice()); + auto device_evaluator = Eigen::TensorEvaluator(device_expr.expr, Eigen::SyclKernelDevice()); auto buffer_ptr = ConvertToActualTypeSycl(CoeffReturnType, buffer_acc); auto kernel_ptr = ConvertToActualTypeSycl(KernelType, kernel_filter); @@ -173,7 +173,7 @@ EigenConvolutionKernel3D(internal::IndexMapper itemID) { typedef typename TensorSycl::internal::ConvertToDeviceExpression::Type DevExpr; auto device_expr =TensorSycl::internal::createDeviceExpression(functors, tuple_of_accessors); - auto device_evaluator = Eigen::TensorEvaluator(device_expr.expr, Eigen::DefaultDevice()); + auto device_evaluator = Eigen::TensorEvaluator(device_expr.expr, Eigen::SyclKernelDevice()); auto buffer_ptr = ConvertToActualTypeSycl(CoeffReturnType, buffer_acc); auto kernel_ptr = ConvertToActualTypeSycl(KernelType, kernel_filter); @@ -339,8 +339,8 @@ struct TensorEvaluator(cgh, m_inputImpl); - typedef cl::sycl::accessor OutputAccessorType; - OutputAccessorType out_res= m_device. template get_sycl_accessor(cgh, data); + typedef cl::sycl::accessor OutputAccessorType; + OutputAccessorType out_res= m_device. template get_sycl_accessor(cgh, data); typedef cl::sycl::accessor KernelAccessorType; KernelAccessorType kernel_acc= m_device. 
template get_sycl_accessor(cgh, m_kernel); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h index 964222a15..258218463 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h @@ -41,9 +41,8 @@ namespace Eigen { size_t m_i; size_t m_offset; }; - +template struct memsetkernelFunctor{ - typedef cl::sycl::accessor AccType; AccType m_acc; const ptrdiff_t buff_offset; const size_t m_rng, m_c; @@ -55,15 +54,19 @@ namespace Eigen { }; + //get_devices returns all the available opencl devices. Either use device_selector or exclude devices that computecpp does not support (AMD OpenCL for CPU and intel GPU) EIGEN_STRONG_INLINE auto get_sycl_supported_devices()->decltype(cl::sycl::device::get_devices()){ auto devices = cl::sycl::device::get_devices(); std::vector::iterator it =devices.begin(); while(it!=devices.end()) { - /// get_devices returns all the available opencl devices. Either use device_selector or exclude devices that computecpp does not support (AMD OpenCL for CPU ) + ///FIXME: Currently there is a bug in amd cpu OpenCL auto s= (*it).template get_info(); std::transform(s.begin(), s.end(), s.begin(), ::tolower); if((*it).is_cpu() && s.find("amd")!=std::string::npos && s.find("apu") == std::string::npos){ // remove amd cpu as it is not supported by computecpp allow APUs it=devices.erase(it); + //FIXME: currently there is a bug in intel gpu driver regarding memory allignment issue. + }else if((*it).is_gpu() && s.find("intel")!=std::string::npos){ + it=devices.erase(it); } else{ ++it; @@ -112,6 +115,154 @@ m_queue(cl::sycl::queue(s, [&](cl::sycl::exception_list l) { })) #endif {} +//FIXME: currently we have to switch back to write as discard_write doesnot work in forloop +template EIGEN_STRONG_INLINE void memcpyHostToDevice(Index *dst, const Index *src, size_t n) const { + std::lock_guard lock(mutex_); + auto host_acc= find_buffer(dst)->second. template get_access(); + ::memcpy(host_acc.get_pointer(), src, n); +} + +template EIGEN_STRONG_INLINE void memcpyDeviceToHost(void *dst, const Index *src, size_t n) const { + std::lock_guard lock(mutex_); + // Assuming that the dst is the start of the destination pointer +auto it =find_buffer(src); +auto offset =static_cast(static_cast(src))- it->first; +offset/=sizeof(Index); +size_t rng, GRange, tileSize; +parallel_for_setup(n/sizeof(Index), tileSize, rng, GRange); + auto dest_buf = cl::sycl::buffer >(static_cast(dst), cl::sycl::range<1>(n)); + m_queue.submit([&](cl::sycl::handler &cgh) { + auto src_acc= it->second.template get_access(cgh); + auto dst_acc =dest_buf.template get_access(cgh); + typedef decltype(src_acc) read_accessor; + typedef decltype(dst_acc) write_accessor; + cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor(src_acc, dst_acc, rng, 0, offset)); + }); + synchronize(); + +} + +EIGEN_STRONG_INLINE void synchronize() const { + std::lock_guard lock(mutex_); + m_queue.wait_and_throw(); //pass +} +EIGEN_STRONG_INLINE void asynchronousExec() const { + ///FIXEDME:: currently there is a race condition regarding the asynch scheduler. + //sycl_queue().throw_asynchronous();// does not pass. Temporarily disabled + std::lock_guard lock(mutex_); + m_queue.wait_and_throw(); //pass + +} + +template +EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize, Index &rng, Index &GRange) const { + tileSize =static_cast(m_queue.get_device(). 
template get_info()); + auto s= m_queue.get_device().template get_info(); + std::transform(s.begin(), s.end(), s.begin(), ::tolower); + if(m_queue.get_device().is_cpu()){ // intel doesnot allow to use max workgroup size + tileSize=std::min(static_cast(256), static_cast(tileSize)); + } + rng = n; + if (rng==0) rng=static_cast(1); + GRange=rng; + if (tileSize>GRange) tileSize=GRange; + else if(GRange>tileSize){ + Index xMode = static_cast(GRange % tileSize); + if (xMode != 0) GRange += static_cast(tileSize - xMode); + } +} + +/// This is used to prepare the number of threads and also the number of threads per block for sycl kernels +template +EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1, Index &tileSize0, Index &tileSize1, Index &rng0, Index &rng1, Index &GRange0, Index &GRange1) const { + Index max_workgroup_Size = static_cast(maxSyclThreadsPerBlock()); + if(m_queue.get_device().is_cpu()){ // intel doesnot allow to use max workgroup size + max_workgroup_Size=std::min(static_cast(256), static_cast(max_workgroup_Size)); + } + Index pow_of_2 = static_cast(std::log2(max_workgroup_Size)); + tileSize1 =static_cast(std::pow(2, static_cast(pow_of_2/2))); + rng1=dim1; + if (rng1==0 ) rng1=static_cast(1); + GRange1=rng1; + if (tileSize1>GRange1) tileSize1=GRange1; + else if(GRange1>tileSize1){ + Index xMode = static_cast(GRange1 % tileSize1); + if (xMode != 0) GRange1 += static_cast(tileSize1 - xMode); + } + tileSize0 = static_cast(max_workgroup_Size/tileSize1); + rng0 = dim0; + if (rng0==0 ) rng0=static_cast(1); + GRange0=rng0; + if (tileSize0>GRange0) tileSize0=GRange0; + else if(GRange0>tileSize0){ + Index xMode = static_cast(GRange0 % tileSize0); + if (xMode != 0) GRange0 += static_cast(tileSize0 - xMode); + } +} + + + +/// This is used to prepare the number of threads and also the number of threads per block for sycl kernels +template +EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1,Index dim2, Index &tileSize0, Index &tileSize1, Index &tileSize2, Index &rng0, Index &rng1, Index &rng2, Index &GRange0, Index &GRange1, Index &GRange2) const { + Index max_workgroup_Size = static_cast(maxSyclThreadsPerBlock()); + if(m_queue.get_device().is_cpu()){ // intel doesnot allow to use max workgroup size + max_workgroup_Size=std::min(static_cast(256), static_cast(max_workgroup_Size)); + } + Index pow_of_2 = static_cast(std::log2(max_workgroup_Size)); + tileSize2 =static_cast(std::pow(2, static_cast(pow_of_2/3))); + rng2=dim2; + if (rng2==0 ) rng1=static_cast(1); + GRange2=rng2; + if (tileSize2>GRange2) tileSize2=GRange2; + else if(GRange2>tileSize2){ + Index xMode = static_cast(GRange2 % tileSize2); + if (xMode != 0) GRange2 += static_cast(tileSize2 - xMode); + } + pow_of_2 = static_cast(std::log2(static_cast(max_workgroup_Size/tileSize2))); + tileSize1 =static_cast(std::pow(2, static_cast(pow_of_2/2))); + rng1=dim1; + if (rng1==0 ) rng1=static_cast(1); + GRange1=rng1; + if (tileSize1>GRange1) tileSize1=GRange1; + else if(GRange1>tileSize1){ + Index xMode = static_cast(GRange1 % tileSize1); + if (xMode != 0) GRange1 += static_cast(tileSize1 - xMode); + } + tileSize0 = static_cast(max_workgroup_Size/(tileSize1*tileSize2)); + rng0 = dim0; + if (rng0==0 ) rng0=static_cast(1); + GRange0=rng0; + if (tileSize0>GRange0) tileSize0=GRange0; + else if(GRange0>tileSize0){ + Index xMode = static_cast(GRange0 % tileSize0); + if (xMode != 0) GRange0 += static_cast(tileSize0 - xMode); + } +} + + +EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const { + std::lock_guard 
lock(mutex_); + return m_queue.get_device(). template get_info(); +// return stream_->deviceProperties().multiProcessorCount; +} +EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const { + std::lock_guard lock(mutex_); + return m_queue.get_device(). template get_info(); + +// return stream_->deviceProperties().maxThreadsPerBlock; +} +EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const { + std::lock_guard lock(mutex_); + // OpenCL doesnot have such concept + return 2;//sycl_queue().get_device(). template get_info(); +// return stream_->deviceProperties().maxThreadsPerMultiProcessor; +} +EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const { + std::lock_guard lock(mutex_); + return m_queue.get_device(). template get_info(); +// return stream_->deviceProperties().sharedMemPerBlock; +} /// Allocating device pointer. This pointer is actually an 8 bytes host pointer used as key to access the sycl device buffer. /// The reason is that we cannot use device buffer as a pointer as a m_data in Eigen leafNode expressions. So we create a key @@ -119,10 +270,10 @@ m_queue(cl::sycl::queue(s, [&](cl::sycl::exception_list l) { /// use this pointer as a key in our buffer_map and we make sure that we dedicate only one buffer only for this pointer. /// The device pointer would be deleted by calling deallocate function. EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { + std::lock_guard lock(mutex_); auto buf = cl::sycl::buffer(cl::sycl::range<1>(num_bytes)); auto ptr =buf.get_access().get_pointer(); buf.set_final_data(nullptr); - std::lock_guard lock(mutex_); buffer_map.insert(std::pair>(static_cast(ptr),buf)); return static_cast(ptr); } @@ -193,48 +344,13 @@ struct SyclDevice { /// This is used to prepare the number of threads and also the number of threads per block for sycl kernels template EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize, Index &rng, Index &GRange) const { - tileSize =static_cast(sycl_queue().get_device(). 
template get_info()); - auto s= sycl_queue().get_device().template get_info(); - std::transform(s.begin(), s.end(), s.begin(), ::tolower); - if(sycl_queue().get_device().is_cpu()){ // intel doesnot allow to use max workgroup size - tileSize=std::min(static_cast(256), static_cast(tileSize)); - } - rng = n; - if (rng==0) rng=static_cast(1); - GRange=rng; - if (tileSize>GRange) tileSize=GRange; - else if(GRange>tileSize){ - Index xMode = static_cast(GRange % tileSize); - if (xMode != 0) GRange += static_cast(tileSize - xMode); - } + m_queue_stream->parallel_for_setup(n, tileSize, rng, GRange); } /// This is used to prepare the number of threads and also the number of threads per block for sycl kernels template EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1, Index &tileSize0, Index &tileSize1, Index &rng0, Index &rng1, Index &GRange0, Index &GRange1) const { - Index max_workgroup_Size = static_cast(maxSyclThreadsPerBlock()); - if(sycl_queue().get_device().is_cpu()){ // intel doesnot allow to use max workgroup size - max_workgroup_Size=std::min(static_cast(256), static_cast(max_workgroup_Size)); - } - Index pow_of_2 = static_cast(std::log2(max_workgroup_Size)); - tileSize1 =static_cast(std::pow(2, static_cast(pow_of_2/2))); - rng1=dim1; - if (rng1==0 ) rng1=static_cast(1); - GRange1=rng1; - if (tileSize1>GRange1) tileSize1=GRange1; - else if(GRange1>tileSize1){ - Index xMode = static_cast(GRange1 % tileSize1); - if (xMode != 0) GRange1 += static_cast(tileSize1 - xMode); - } - tileSize0 = static_cast(max_workgroup_Size/tileSize1); - rng0 = dim0; - if (rng0==0 ) rng0=static_cast(1); - GRange0=rng0; - if (tileSize0>GRange0) tileSize0=GRange0; - else if(GRange0>tileSize0){ - Index xMode = static_cast(GRange0 % tileSize0); - if (xMode != 0) GRange0 += static_cast(tileSize0 - xMode); - } + m_queue_stream->parallel_for_setup(dim0, dim1, tileSize0, tileSize1, rng0, rng1, GRange0, GRange1); } @@ -242,39 +358,8 @@ struct SyclDevice { /// This is used to prepare the number of threads and also the number of threads per block for sycl kernels template EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1,Index dim2, Index &tileSize0, Index &tileSize1, Index &tileSize2, Index &rng0, Index &rng1, Index &rng2, Index &GRange0, Index &GRange1, Index &GRange2) const { - Index max_workgroup_Size = static_cast(maxSyclThreadsPerBlock()); - if(sycl_queue().get_device().is_cpu()){ // intel doesnot allow to use max workgroup size - max_workgroup_Size=std::min(static_cast(256), static_cast(max_workgroup_Size)); - } - Index pow_of_2 = static_cast(std::log2(max_workgroup_Size)); - tileSize2 =static_cast(std::pow(2, static_cast(pow_of_2/3))); - rng2=dim2; - if (rng2==0 ) rng1=static_cast(1); - GRange2=rng2; - if (tileSize2>GRange2) tileSize2=GRange2; - else if(GRange2>tileSize2){ - Index xMode = static_cast(GRange2 % tileSize2); - if (xMode != 0) GRange2 += static_cast(tileSize2 - xMode); - } - pow_of_2 = static_cast(std::log2(static_cast(max_workgroup_Size/tileSize2))); - tileSize1 =static_cast(std::pow(2, static_cast(pow_of_2/2))); - rng1=dim1; - if (rng1==0 ) rng1=static_cast(1); - GRange1=rng1; - if (tileSize1>GRange1) tileSize1=GRange1; - else if(GRange1>tileSize1){ - Index xMode = static_cast(GRange1 % tileSize1); - if (xMode != 0) GRange1 += static_cast(tileSize1 - xMode); - } - tileSize0 = static_cast(max_workgroup_Size/(tileSize1*tileSize2)); - rng0 = dim0; - if (rng0==0 ) rng0=static_cast(1); - GRange0=rng0; - if (tileSize0>GRange0) tileSize0=GRange0; - else if(GRange0>tileSize0){ 
- Index xMode = static_cast(GRange0 % tileSize0); - if (xMode != 0) GRange0 += static_cast(tileSize0 - xMode); - } + m_queue_stream->parallel_for_setup(dim0, dim1, dim2, tileSize0, tileSize1, tileSize2, rng0, rng1, rng2, GRange0, GRange1, GRange2); + } /// allocate device memory EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const { @@ -319,8 +404,7 @@ struct SyclDevice { /// buffer to host. Then we use the memcpy to copy the data to the host accessor. The first time that /// this buffer is accessed, the data will be copied to the device. template EIGEN_STRONG_INLINE void memcpyHostToDevice(Index *dst, const Index *src, size_t n) const { - auto host_acc= get_sycl_buffer(dst). template get_access(); - ::memcpy(host_acc.get_pointer(), src, n); + m_queue_stream->memcpyHostToDevice(dst,src,n); } /// The memcpyDeviceToHost is used to copy the data from host to device. Here, in order to avoid double copying the data. We create a sycl /// buffer with map_allocator for the destination pointer with a discard_write accessor on it. The lifespan of the buffer is bound to the @@ -329,21 +413,7 @@ struct SyclDevice { /// would be available on the dst pointer using fast copy technique (map_allocator). In this case we can make sure that we copy the data back /// to the cpu only once per function call. template EIGEN_STRONG_INLINE void memcpyDeviceToHost(void *dst, const Index *src, size_t n) const { - auto it = m_queue_stream->find_buffer(src); - auto offset =static_cast(static_cast(src))- it->first; - offset/=sizeof(Index); - size_t rng, GRange, tileSize; - parallel_for_setup(n/sizeof(Index), tileSize, rng, GRange); - // Assuming that the dst is the start of the destination pointer - auto dest_buf = cl::sycl::buffer >(static_cast(dst), cl::sycl::range<1>(n)); - sycl_queue().submit([&](cl::sycl::handler &cgh) { - auto src_acc= it->second.template get_access(cgh); - auto dst_acc =dest_buf.template get_access(cgh); - typedef decltype(src_acc) read_accessor; - typedef decltype(dst_acc) write_accessor; - cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor(src_acc, dst_acc, rng, 0, offset)); - }); - synchronize(); + m_queue_stream->memcpyDeviceToHost(dst,src,n); } /// returning the sycl queue EIGEN_STRONG_INLINE cl::sycl::queue& sycl_queue() const { return m_queue_stream->m_queue;} @@ -366,8 +436,9 @@ struct SyclDevice { :m_buf(buff), buff_offset(buff_offset_), rng(rng_), GRange(GRange_), tileSize(tileSize_), c(c_){} void operator()(cl::sycl::handler &cgh) const { - auto buf_acc = m_buf.template get_access(cgh); - cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), memsetkernelFunctor(buf_acc, buff_offset, rng, c)); + auto buf_acc = m_buf.template get_access(cgh); + typedef decltype(buf_acc) AccType; + cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), memsetkernelFunctor(buf_acc, buff_offset, rng, c)); } }; @@ -403,14 +474,13 @@ struct SyclDevice { EIGEN_STRONG_INLINE int majorDeviceVersion() const { return 1; } EIGEN_STRONG_INLINE void synchronize() const { - sycl_queue().wait_and_throw(); //pass + m_queue_stream->synchronize(); //pass } EIGEN_STRONG_INLINE void asynchronousExec() const { ///FIXEDME:: currently there is a race condition regarding the asynch scheduler. //sycl_queue().throw_asynchronous();// does not pass. 
Temporarily disabled - sycl_queue().wait_and_throw(); //pass - + m_queue_stream->asynchronousExec(); } // This function checks if the runtime recorded an error for the // underlying stream device. @@ -418,8 +488,10 @@ struct SyclDevice { return m_queue_stream->ok(); } }; - - +// This is used as a distingushable device inside the kernel as the sycl device class is not Standard layout. +// This is internal and must not be used by user. This dummy device allow us to specialise the tensor evaluator +// inside the kenrel. So we can have two types of eval for host and device. This is required for TensorArgMax operation +struct SyclKernelDevice:DefaultDevice{}; } // end namespace Eigen diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index d6415817b..8516b37b3 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -193,7 +193,12 @@ struct TensorEvaluator EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { eigen_assert(m_data); +#ifndef __SYCL_DEVICE_ONLY__ return loadConstant(m_data+index); +#else + CoeffReturnType tmp = m_data[index]; + return tmp; +#endif } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h index b5ef31d55..77c9c6c6e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h @@ -124,7 +124,9 @@ template struct Tuple { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tuple& operator= (const Tuple& rhs) { + #ifndef __SYCL_DEVICE_ONLY__ if (&rhs == this) return *this; + #endif first = rhs.first; second = rhs.second; return *this; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h index c9c7acfdc..94899252b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h @@ -35,7 +35,7 @@ static void run(OP op, BufferTOut& bufOut, ptrdiff_t out_offset, BufferTIn& bufI /* Two accessors are used: one to the buffer that is being reduced, * and a second to local memory, used to store intermediate data. */ auto aI =bufI.template get_access(h); - auto aOut =bufOut.template get_access(h); + auto aOut =bufOut.template get_access(h); typedef decltype(aI) InputAccessor; typedef decltype(aOut) OutputAccessor; typedef cl::sycl::accessor LocalAccessor; @@ -158,7 +158,7 @@ struct InnerReducer { typedef decltype(TensorSycl::internal::createTupleOfAccessors(cgh, self.impl())) Tuple_of_Acc; // create a tuple of accessors from Evaluator Tuple_of_Acc tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl()); - auto output_accessor = dev.template get_sycl_accessor(cgh, output); + auto output_accessor = dev.template get_sycl_accessor(cgh, output); ptrdiff_t out_offset = dev.get_offset(output); Index red_size = (num_values_to_reduce!=0)? 
num_values_to_reduce : static_cast(1); cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h index 9d5a6d4c1..3d6270614 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h @@ -32,6 +32,8 @@ struct MakeLocalPointer { namespace Eigen { + template class TensorTupleReducerDeviceOp; + template struct TensorEvaluator, SyclKernelDevice>; namespace TensorSycl { namespace internal { @@ -48,6 +50,13 @@ template struct GetType{ typedef T Type; }; +template struct ValueCondition { + static const size_t Res =X; +}; +template struct ValueCondition { + static const size_t Res =Y; +}; + } } } @@ -80,6 +89,9 @@ template struct GetType{ /// this is used for extracting tensor reduction #include "TensorReductionSycl.h" +// TensorArgMaxSycl.h +#include "TensorArgMaxSycl.h" + /// this is used for extracting tensor convolution #include "TensorConvolutionSycl.h" diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h index 9476c0ea8..d6ac7b91f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h @@ -103,7 +103,7 @@ KERNELBROKERCONVERT(, false, TensorEvalToOp) #undef KERNELBROKERCONVERT /// specialisation of the \ref ConvertToDeviceExpression struct when the node types are TensorForcedEvalOp and TensorLayoutSwapOp -#define KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAP(CVQual, ExprNode)\ +#define KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP(CVQual, ExprNode)\ template \ struct ConvertToDeviceExpression > {\ typedef CVQual ExprNode< typename ConvertToDeviceExpression::Type> Type;\ @@ -111,15 +111,17 @@ struct ConvertToDeviceExpression > {\ // TensorForcedEvalOp -KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAP(const,TensorForcedEvalOp) -KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAP(,TensorForcedEvalOp) +KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP(const,TensorForcedEvalOp) +KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP(,TensorForcedEvalOp) // TensorLayoutSwapOp -KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAP(const,TensorLayoutSwapOp) -KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAP(,TensorLayoutSwapOp) -#undef KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAP - +KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP(const,TensorLayoutSwapOp) +KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP(,TensorLayoutSwapOp) +//TensorIndexTupleOp +KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP(const,TensorIndexTupleOp) +KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP(,TensorIndexTupleOp) +#undef KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP /// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorReductionOp #define KERNELBROKERCONVERTREDUCTION(CVQual)\ @@ -132,6 +134,18 @@ KERNELBROKERCONVERTREDUCTION(const) KERNELBROKERCONVERTREDUCTION() #undef KERNELBROKERCONVERTREDUCTION +/// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorReductionOp +#define KERNELBROKERCONVERTTUPLEREDUCTION(CVQual)\ +template \ +struct ConvertToDeviceExpression > {\ + typedef CVQual TensorTupleReducerOp::Type> Type;\ +}; + +KERNELBROKERCONVERTTUPLEREDUCTION(const) +KERNELBROKERCONVERTTUPLEREDUCTION() +#undef KERNELBROKERCONVERTTUPLEREDUCTION + +//TensorSlicingOp 
#define KERNELBROKERCONVERTSLICEOP(CVQual)\ template\ struct ConvertToDeviceExpression >{\ @@ -142,7 +156,7 @@ KERNELBROKERCONVERTSLICEOP(const) KERNELBROKERCONVERTSLICEOP() #undef KERNELBROKERCONVERTSLICEOP - +//TensorStridingSlicingOp #define KERNELBROKERCONVERTERSLICESTRIDEOP(CVQual)\ template\ struct ConvertToDeviceExpression >{\ @@ -153,7 +167,6 @@ KERNELBROKERCONVERTERSLICESTRIDEOP(const) KERNELBROKERCONVERTERSLICESTRIDEOP() #undef KERNELBROKERCONVERTERSLICESTRIDEOP - /// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorChippingOp #define KERNELBROKERCONVERTCHIPPINGOP(CVQual)\ template \ @@ -164,9 +177,6 @@ KERNELBROKERCONVERTCHIPPINGOP(const) KERNELBROKERCONVERTCHIPPINGOP() #undef KERNELBROKERCONVERTCHIPPINGOP - - - /// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorImagePatchOp #define KERNELBROKERCONVERTIMAGEPATCHOP(CVQual)\ template\ @@ -188,8 +198,6 @@ KERNELBROKERCONVERTVOLUMEPATCHOP(const) KERNELBROKERCONVERTVOLUMEPATCHOP() #undef KERNELBROKERCONVERTVOLUMEPATCHOP - - } // namespace internal } // namespace TensorSycl } // namespace Eigen diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h index af4eb5f13..24cc23f45 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h @@ -65,7 +65,6 @@ CVQual PlaceHolder, N>, Params...>{\ : expr(Type(ConvertToActualTypeSycl(typename Type::Scalar, utility::tuple::get(t)), fd.dimensions())){}\ }; - TENSORMAP(const) TENSORMAP() #undef TENSORMAP @@ -83,6 +82,7 @@ CVQual PlaceHolder &t)\ : expr(DeviceFixedSizeTensor::instantiate(utility::tuple::get(t))){}\ }; + TENSORMAPFIXEDSIZE(const) TENSORMAPFIXEDSIZE() #undef TENSORMAPFIXEDSIZE @@ -189,9 +189,6 @@ struct ExprConstructor, CVQual ASSIGN() #undef ASSIGN - - - /// specialisation of the \ref ExprConstructor struct when the node type is /// const TensorAssignOp #define CONVERSIONEXPRCONST(CVQual)\ @@ -252,8 +249,6 @@ FORCEDEVAL(const) FORCEDEVAL() #undef FORCEDEVAL - - #define TENSORCUSTOMUNARYOP(CVQual)\ template \ struct ExprConstructor,\ @@ -274,13 +269,6 @@ TENSORCUSTOMUNARYOP(const) TENSORCUSTOMUNARYOP() #undef TENSORCUSTOMUNARYOP -template struct ValueCondition { - static const size_t Res =X; -}; -template struct ValueCondition { - static const size_t Res =Y; -}; - /// specialisation of the \ref ExprConstructor struct when the node type is TensorReductionOp #define SYCLREDUCTIONEXPR(CVQual)\ template \ @@ -299,6 +287,35 @@ SYCLREDUCTIONEXPR(const) SYCLREDUCTIONEXPR() #undef SYCLREDUCTIONEXPR +/// specialisation of the \ref ExprConstructor struct when the node type is TensorTupleReducerOp +/// use reductionOp instead of the TensorTupleReducerOp in order to build the tensor map. Because the tensorMap is the output of Tensor ReductionOP. 
+#define SYCLTUPLEREDUCTIONEXPR(CVQual)\ +template \ +struct ExprConstructor,\ +CVQual PlaceHolder, N>, Params...> {\ + static const auto NumRedDims= TensorReductionOp , MakeGlobalPointer>::NumDimensions;\ + static const auto NumIndices= ValueCondition::Res;\ +static const int Layout =static_cast(Eigen::internal::traits, MakeGlobalPointer>>::Layout);\ + typedef CVQual TensorMap<\ + Tensor::CoeffReturnType,NumIndices, Layout, typename TensorTupleReducerOp::Index>,\ + Layout,\ + MakeGlobalPointer\ + > XprType;\ + typedef typename TensorEvaluator , SyclKernelDevice>::Dimensions InputDimensions;\ + static const int NumDims = Eigen::internal::array_size::value;\ + typedef array StrideDims;\ + typedef const TensorTupleReducerDeviceOp Type;\ + Type expr;\ + template \ + ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple &t)\ + :expr(Type(XprType(ConvertToActualTypeSycl(typename XprType::CoeffReturnType, utility::tuple::get(t)), fd.dimensions()),\ + fd.return_dim(), fd.strides(), fd.stride_mod(), fd.stride_div())) {\ + }\ +}; + +SYCLTUPLEREDUCTIONEXPR(const) +SYCLTUPLEREDUCTIONEXPR() +#undef SYCLTUPLEREDUCTIONEXPR /// specialisation of the \ref ExprConstructor struct when the node type is /// TensorContractionOp, TensorConvolutionOp TensorCustomBinaryOp @@ -319,15 +336,18 @@ CVQual PlaceHolder, N>, Params :expr(Type(ConvertToActualTypeSycl(typename Type::Scalar, utility::tuple::get(t)), fd.dimensions())) {}\ }; +//TensorContractionOp SYCLCONTRACTCONVCUSBIOPS(const, TensorContractionOp) SYCLCONTRACTCONVCUSBIOPS(, TensorContractionOp) +//TensorConvolutionOp SYCLCONTRACTCONVCUSBIOPS(const, TensorConvolutionOp) SYCLCONTRACTCONVCUSBIOPS(, TensorConvolutionOp) +//TensorCustomBinaryOp SYCLCONTRACTCONVCUSBIOPS(const, TensorCustomBinaryOp) SYCLCONTRACTCONVCUSBIOPS(, TensorCustomBinaryOp) #undef SYCLCONTRACTCONVCUSBIOPS - +//TensorSlicingOp #define SYCLSLICEOPEXPR(CVQual)\ template\ struct ExprConstructor , CVQual TensorSlicingOp, Params... >{\ @@ -344,7 +364,7 @@ SYCLSLICEOPEXPR(const) SYCLSLICEOPEXPR() #undef SYCLSLICEOPEXPR - +//TensorStridingSlicingOp #define SYCLSLICESTRIDEOPEXPR(CVQual)\ template\ struct ExprConstructor, CVQual TensorStridingSlicingOp, Params... >{\ @@ -361,6 +381,7 @@ SYCLSLICESTRIDEOPEXPR(const) SYCLSLICESTRIDEOPEXPR() #undef SYCLSLICESTRIDEOPEXPR +//TensorReshapingOp and TensorShufflingOp #define SYCLRESHAPEANDSHUFFLEOPEXPRCONST(OPEXPR, CVQual)\ template\ struct ExprConstructor , CVQual OPEXPR , Params... >{\ @@ -373,13 +394,15 @@ struct ExprConstructor , CVQual OPEXPR \ struct ExprConstructor , CVQual OPEXPR , Params... >{\ @@ -392,11 +415,11 @@ struct ExprConstructor , CVQual OPEXPR \ @@ -454,14 +477,12 @@ SYCLTENSORVOLUMEPATCHOPEXPR(const) SYCLTENSORVOLUMEPATCHOPEXPR() #undef SYCLTENSORVOLUMEPATCHOPEXPR - - -// TensorLayoutSwapOp -#define SYCLTENSORLAYOUTSWAPOPEXPR(CVQual)\ +// TensorLayoutSwapOp and TensorIndexTupleOp +#define SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXPR(CVQual, ExprNode)\ template\ -struct ExprConstructor , CVQual TensorLayoutSwapOp, Params... >{\ +struct ExprConstructor , CVQual ExprNode, Params... 
>{\ typedef ExprConstructor my_xpr_type;\ - typedef CVQual TensorLayoutSwapOp Type;\ + typedef CVQual ExprNode Type;\ my_xpr_type xprExpr;\ Type expr;\ template \ @@ -469,10 +490,14 @@ struct ExprConstructor , CVQual TensorLa : xprExpr(funcD.xprExpr, t), expr(xprExpr.expr) {}\ }; -SYCLTENSORLAYOUTSWAPOPEXPR(const) -SYCLTENSORLAYOUTSWAPOPEXPR() -#undef SYCLTENSORLAYOUTSWAPOPEXPR +//TensorLayoutSwapOp +SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXPR(const, TensorLayoutSwapOp) +SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXPR(, TensorLayoutSwapOp) +//TensorIndexTupleOp +SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXPR(const, TensorIndexTupleOp) +SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXPR(, TensorIndexTupleOp) +#undef SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXPR /// template deduction for \ref ExprConstructor struct template diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h index 5a6a8f4c5..fb95af59e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h @@ -147,7 +147,7 @@ SYCLFORCEDEVALEXTACC(const) SYCLFORCEDEVALEXTACC() #undef SYCLFORCEDEVALEXTACC - +//TensorCustomUnaryOp #define SYCLCUSTOMUNARYOPEXTACC(CVQual)\ template \ struct ExtractAccessor, Dev> > {\ @@ -160,7 +160,7 @@ SYCLCUSTOMUNARYOPEXTACC(const) SYCLCUSTOMUNARYOPEXTACC() #undef SYCLCUSTOMUNARYOPEXTACC - +//TensorCustomBinaryOp #define SYCLCUSTOMBINARYOPEXTACC(CVQual)\ template \ struct ExtractAccessor, Dev> > {\ @@ -172,9 +172,6 @@ SYCLCUSTOMBINARYOPEXTACC(const) SYCLCUSTOMBINARYOPEXTACC() #undef SYCLCUSTOMBIBARYOPEXTACC - - - /// specialisation of the \ref ExtractAccessor struct when the node type is TensorEvalToOp #define SYCLEVALTOEXTACC(CVQual)\ template \ @@ -188,15 +185,19 @@ SYCLEVALTOEXTACC() #undef SYCLEVALTOEXTACC /// specialisation of the \ref ExtractAccessor struct when the node type is TensorReductionOp -#define SYCLREDUCTIONEXTACC(CVQual)\ +#define SYCLREDUCTIONEXTACC(CVQual, ExprNode)\ template \ -struct ExtractAccessor, Dev> > {\ - static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator, Dev>& eval)\ +struct ExtractAccessor, Dev> > {\ + static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator, Dev>& eval)\ RETURN_CPP11(AccessorConstructor::template getAccessor(cgh, eval))\ }; +// TensorReductionOp +SYCLREDUCTIONEXTACC(const,TensorReductionOp) +SYCLREDUCTIONEXTACC(,TensorReductionOp) -SYCLREDUCTIONEXTACC(const) -SYCLREDUCTIONEXTACC() +// TensorTupleReducerOp +SYCLREDUCTIONEXTACC(const,TensorTupleReducerOp) +SYCLREDUCTIONEXTACC(,TensorTupleReducerOp) #undef SYCLREDUCTIONEXTACC /// specialisation of the \ref ExtractAccessor struct when the node type is TensorContractionOp and TensorConvolutionOp @@ -206,14 +207,14 @@ template, Dev>& eval)\ RETURN_CPP11(AccessorConstructor::template getAccessor(cgh, eval))\ }; - +//TensorContractionOp SYCLCONTRACTIONCONVOLUTIONEXTACC(const,TensorContractionOp) SYCLCONTRACTIONCONVOLUTIONEXTACC(,TensorContractionOp) +//TensorConvolutionOp SYCLCONTRACTIONCONVOLUTIONEXTACC(const,TensorConvolutionOp) SYCLCONTRACTIONCONVOLUTIONEXTACC(,TensorConvolutionOp) #undef SYCLCONTRACTIONCONVOLUTIONEXTACC - /// specialisation of the \ref ExtractAccessor struct when the node type is /// const TensorSlicingOp. 
#define SYCLSLICEOPEXTACC(CVQual)\ @@ -252,7 +253,6 @@ SYCLTENSORCHIPPINGOPEXTACC(const) SYCLTENSORCHIPPINGOPEXTACC() #undef SYCLTENSORCHIPPINGOPEXTACC - // specialisation of the \ref ExtractAccessor struct when the node type is /// TensorImagePatchOp. #define SYCLTENSORIMAGEPATCHOPEXTACC(CVQual)\ @@ -266,8 +266,6 @@ SYCLTENSORIMAGEPATCHOPEXTACC(const) SYCLTENSORIMAGEPATCHOPEXTACC() #undef SYCLTENSORIMAGEPATCHOPEXTACC - - // specialisation of the \ref ExtractAccessor struct when the node type is /// TensorVolumePatchOp. #define SYCLTENSORVOLUMEPATCHOPEXTACC(CVQual)\ @@ -281,21 +279,23 @@ SYCLTENSORVOLUMEPATCHOPEXTACC(const) SYCLTENSORVOLUMEPATCHOPEXTACC() #undef SYCLTENSORVOLUMEPATCHOPEXTACC - // specialisation of the \ref ExtractAccessor struct when the node type is -/// TensorLayoutSwapOp. -#define SYCLTENSORLAYOUTSWAPOPEXTACC(CVQual)\ +/// TensorLayoutSwapOp, TensorIndexTupleOp +#define SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXTACC(CVQual, ExprNode)\ template\ -struct ExtractAccessor, Dev> >{\ - static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator, Dev>& eval)\ +struct ExtractAccessor, Dev> >{\ + static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator, Dev>& eval)\ RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.impl()))\ }; -SYCLTENSORLAYOUTSWAPOPEXTACC(const) -SYCLTENSORLAYOUTSWAPOPEXTACC() -#undef SYCLTENSORLAYOUTSWAPOPEXTACC - +// TensorLayoutSwapOp +SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXTACC(const,TensorLayoutSwapOp) +SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXTACC(,TensorLayoutSwapOp) +//TensorIndexTupleOp +SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXTACC(const,TensorIndexTupleOp) +SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXTACC(,TensorIndexTupleOp) +#undef SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXTACC /// template deduction for \ref ExtractAccessor template diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h index 9fcac5ecb..942e9d307 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h @@ -126,19 +126,19 @@ struct FunctorExtractor, Dev> Evaluator;\ DEFALTACTION(Evaluator)\ }; - +//TensorCustomUnaryOp SYCLEXTRFUNCCUSTOMUNARYOP(const) SYCLEXTRFUNCCUSTOMUNARYOP() #undef SYCLEXTRFUNCCUSTOMUNARYOP - +//TensorCustomBinaryOp #define SYCLEXTRFUNCCUSTOMBIBARYOP(CVQual)\ template \ struct FunctorExtractor, Dev> > {\ typedef TensorEvaluator, Dev> Evaluator;\ DEFALTACTION(Evaluator)\ }; - +//TensorCustomBinaryOp SYCLEXTRFUNCCUSTOMBIBARYOP(const) SYCLEXTRFUNCCUSTOMBIBARYOP() #undef SYCLEXTRFUNCCUSTOMBIBARYOP @@ -177,7 +177,7 @@ SYCLEXTRFUNCASSIGNOP() /// specialisation of the \ref FunctorExtractor struct when the node types are /// TensorEvalToOp, TensorLayoutSwapOp. This is an specialisation without OP so it has to be separated. 
-#define SYCLEXTRFUNCEVALTOOPSWAPLAYOUT(CVQual, ExprNode)\ +#define SYCLEXTRFUNCEVALTOOPSWAPLAYOUTINDEXTUPLE(CVQual, ExprNode)\ template \ struct FunctorExtractor, Dev> > {\ FunctorExtractor > xprExpr;\ @@ -185,13 +185,16 @@ struct FunctorExtractor, Dev> > {\ : xprExpr(expr.impl()) {}\ }; //TensorEvalToOp -SYCLEXTRFUNCEVALTOOPSWAPLAYOUT(const, TensorEvalToOp) -SYCLEXTRFUNCEVALTOOPSWAPLAYOUT(, TensorEvalToOp) +SYCLEXTRFUNCEVALTOOPSWAPLAYOUTINDEXTUPLE(const, TensorEvalToOp) +SYCLEXTRFUNCEVALTOOPSWAPLAYOUTINDEXTUPLE(, TensorEvalToOp) // TensorLayoutSwapOp -SYCLEXTRFUNCEVALTOOPSWAPLAYOUT(const, TensorLayoutSwapOp) -SYCLEXTRFUNCEVALTOOPSWAPLAYOUT(, TensorLayoutSwapOp) +SYCLEXTRFUNCEVALTOOPSWAPLAYOUTINDEXTUPLE(const, TensorLayoutSwapOp) +SYCLEXTRFUNCEVALTOOPSWAPLAYOUTINDEXTUPLE(, TensorLayoutSwapOp) +// TensorIndexTupleOp +SYCLEXTRFUNCEVALTOOPSWAPLAYOUTINDEXTUPLE(const, TensorIndexTupleOp) +SYCLEXTRFUNCEVALTOOPSWAPLAYOUTINDEXTUPLE(, TensorIndexTupleOp) -#undef SYCLEXTRFUNCEVALTOOPSWAPLAYOUT +#undef SYCLEXTRFUNCEVALTOOPSWAPLAYOUTINDEXTUPLE template struct DimConstr { template @@ -202,10 +205,10 @@ template struct DimConstr { template static EIGEN_STRONG_INLINE Dim getDim(InDim dims ) {return Dim(static_cast(dims.TotalSize()));} }; - +//TensorReductionOp #define SYCLEXTRFUNCREDUCTIONOP(CVQual)\ template class MakePointer_, typename Device>\ -struct FunctorExtractor, Device>>{\ +struct FunctorExtractor, Device> >{\ typedef TensorEvaluator, Device> Evaluator;\ typedef typename Eigen::internal::conditional, typename Evaluator::Dimensions >::type Dimensions;\ const Dimensions m_dimensions;\ @@ -213,12 +216,39 @@ struct FunctorExtractor, Device>& expr)\ : m_dimensions(DimConstr::getDim(expr.dimensions())) {}\ }; - - SYCLEXTRFUNCREDUCTIONOP(const) SYCLEXTRFUNCREDUCTIONOP() #undef SYCLEXTRFUNCREDUCTIONOP +//TensorTupleReducerOp +#define SYCLEXTRFUNCTUPLEREDUCTIONOP(CVQual)\ +template\ + struct FunctorExtractor, Device> >{\ + typedef TensorEvaluator, Device> Evaluator;\ + static const int NumOutputDims= Eigen::internal::traits >::NumDimensions;\ + typedef typename Evaluator::StrideDims StrideDims;\ + typedef typename Evaluator::Index Index;\ + typedef typename Eigen::internal::conditional, typename Evaluator::Dimensions >::type Dimensions;\ + const Dimensions m_dimensions;\ + const int m_return_dim;\ + const StrideDims m_strides;\ + const Index m_stride_mod;\ + const Index m_stride_div;\ + EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }\ + EIGEN_STRONG_INLINE int return_dim() const {return m_return_dim;}\ + EIGEN_STRONG_INLINE const StrideDims& strides() const {return m_strides;}\ + EIGEN_STRONG_INLINE const Index& stride_mod() const {return m_stride_mod;}\ + EIGEN_STRONG_INLINE const Index& stride_div() const {return m_stride_div;}\ + FunctorExtractor(const TensorEvaluator, Device>& expr)\ + : m_dimensions(DimConstr::getDim(expr.dimensions())), m_return_dim(expr.return_dim()),\ + m_strides(expr.strides()), m_stride_mod(expr.stride_mod()), m_stride_div(expr.stride_div()){}\ +}; + +SYCLEXTRFUNCTUPLEREDUCTIONOP(const) +SYCLEXTRFUNCTUPLEREDUCTIONOP() +#undef SYCLEXTRFUNCTUPLEREDUCTIONOP + +//TensorContractionOp and TensorConvolutionOp #define SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(CVQual, ExprNode)\ template\ struct FunctorExtractor, Device>>{\ @@ -230,9 +260,10 @@ struct FunctorExtractor\ struct FunctorExtractor, Dev> >{\ @@ -273,7 +305,7 @@ SYCLEXTRFUNCTSLICESTRIDEOP(const) SYCLEXTRFUNCTSLICESTRIDEOP() #undef SYCLEXTRFUNCTSLICESTRIDEOP -// Had to separate reshapeOP otherwise it 
will be mistaken by UnaryCategory +// Had to separate TensorReshapingOp and TensorShufflingOp. Otherwise it will be mistaken by UnaryCategory #define SYCLRESHAPEANDSHUFFLEOPFUNCEXT(OPEXPR, FUNCCALL, CVQual)\ template\ struct FunctorExtractor, Dev> > {\ @@ -284,9 +316,11 @@ struct FunctorExtractor\ struct FunctorExtractor, Device> >{\ @@ -420,7 +455,6 @@ SYCLEXTRFUNCVOLUMEPATCHOP() #undef SYCLEXTRFUNCVOLUMEPATCHOP - /// template deduction function for FunctorExtractor template auto inline extractFunctors(const Evaluator& evaluator)-> FunctorExtractor { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h index 12237bfab..e5b892f2e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h @@ -72,7 +72,7 @@ namespace internal { template < typename HostExpr, typename FunctorExpr, typename Tuple_of_Acc, typename Dims, typename Op, typename Index> class ReductionFunctor { public: typedef typename TensorSycl::internal::createPlaceHolderExpression::Type PlaceHolderExpr; - typedef cl::sycl::accessor write_accessor; + typedef cl::sycl::accessor write_accessor; ReductionFunctor(write_accessor output_accessor_, ptrdiff_t out_offset_, FunctorExpr functors_, Tuple_of_Acc tuple_of_accessors_,Dims dims_, Op functor_, Index range_, Index) :output_accessor(output_accessor_), out_offset(out_offset_), functors(functors_), tuple_of_accessors(tuple_of_accessors_), dims(dims_), functor(functor_), range(range_) {} void operator()(cl::sycl::nd_item<1> itemID) { @@ -85,8 +85,8 @@ template < typename HostExpr, typename FunctorExpr, typename Tuple_of_Acc, typen const auto device_self_expr= Eigen::TensorReductionOp(device_expr.expr, dims, functor); /// This is the evaluator for device_self_expr. This is exactly similar to the self which has been passed to run function. The difference is /// the device_evaluator is detectable and recognisable on the device. - typedef Eigen::TensorEvaluator DeviceSelf; - auto device_self_evaluator = Eigen::TensorEvaluator(device_self_expr, Eigen::DefaultDevice()); + typedef Eigen::TensorEvaluator DeviceSelf; + auto device_self_evaluator = Eigen::TensorEvaluator(device_self_expr, Eigen::SyclKernelDevice()); auto output_accessor_ptr =ConvertToActualTypeSycl(typename DeviceSelf::CoeffReturnType, output_accessor); /// const cast added as a naive solution to solve the qualifier drop error auto globalid=static_cast(itemID.get_global_linear_id()); @@ -111,7 +111,7 @@ template < typename HostExpr, typename FunctorExpr, typename Tuple_of_Acc, typen class ReductionFunctor, Index> { public: typedef typename TensorSycl::internal::createPlaceHolderExpression::Type PlaceHolderExpr; - typedef cl::sycl::accessor write_accessor; + typedef cl::sycl::accessor write_accessor; typedef Eigen::internal::SumReducer Op; ReductionFunctor(write_accessor output_accessor_, ptrdiff_t out_offset_, FunctorExpr functors_, Tuple_of_Acc tuple_of_accessors_,Dims dims_, Eigen::internal::MeanReducer, Index range_, Index num_values_to_reduce_) @@ -126,8 +126,8 @@ class ReductionFunctor(device_expr.expr, dims, functor); /// This is the evaluator for device_self_expr. This is exactly similar to the self which has been passed to run function. The difference is /// the device_evaluator is detectable and recognisable on the device. 
- typedef Eigen::TensorEvaluator DeviceSelf; - auto device_self_evaluator = Eigen::TensorEvaluator(device_self_expr, Eigen::DefaultDevice()); + typedef Eigen::TensorEvaluator DeviceSelf; + auto device_self_evaluator = Eigen::TensorEvaluator(device_self_expr, Eigen::SyclKernelDevice()); auto output_accessor_ptr =ConvertToActualTypeSycl(typename DeviceSelf::CoeffReturnType, output_accessor); /// const cast added as a naive solution to solve the qualifier drop error auto globalid=static_cast(itemID.get_global_linear_id()); @@ -173,7 +173,7 @@ public: const auto device_self_expr= Eigen::TensorReductionOp(device_expr.expr, dims, op); /// This is the evaluator for device_self_expr. This is exactly similar to the self which has been passed to run function. The difference is /// the device_evaluator is detectable and recognisable on the device. - auto device_self_evaluator = Eigen::TensorEvaluator(device_self_expr, Eigen::DefaultDevice()); + auto device_self_evaluator = Eigen::TensorEvaluator(device_self_expr, Eigen::SyclKernelDevice()); /// const cast added as a naive solution to solve the qualifier drop error auto globalid=itemID.get_global_linear_id(); @@ -220,7 +220,7 @@ public: const auto device_self_expr= Eigen::TensorReductionOp(device_expr.expr, dims, op); /// This is the evaluator for device_self_expr. This is exactly similar to the self which has been passed to run function. The difference is /// the device_evaluator is detectable and recognisable on the device. - auto device_self_evaluator = Eigen::TensorEvaluator(device_self_expr, Eigen::DefaultDevice()); + auto device_self_evaluator = Eigen::TensorEvaluator(device_self_expr, Eigen::SyclKernelDevice()); /// const cast added as a naive solution to solve the qualifier drop error auto globalid=itemID.get_global_linear_id(); auto scale = (rng*red_factor) + remaining; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h index 330283b39..234580c7c 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h @@ -114,27 +114,37 @@ SYCLCUSTOMBINARYOPLEAFCOUNT() #undef SYCLCUSTOMBINARYOPLEAFCOUNT /// specialisation of the \ref LeafCount struct when the node type is TensorEvalToOp -#define EVALTOLAYOUTSWAPLEAFCOUNT(CVQual , ExprNode, Num)\ +#define EVALTOLAYOUTSWAPINDEXTUPLELEAFCOUNT(CVQual , ExprNode, Num)\ template \ struct LeafCount > {\ static const size_t Count = Num + CategoryCount::Count;\ }; -EVALTOLAYOUTSWAPLEAFCOUNT(const, TensorEvalToOp, 1) -EVALTOLAYOUTSWAPLEAFCOUNT(, TensorEvalToOp, 1) -EVALTOLAYOUTSWAPLEAFCOUNT(const, TensorLayoutSwapOp, 0) -EVALTOLAYOUTSWAPLEAFCOUNT(, TensorLayoutSwapOp, 0) -#undef EVALTOLAYOUTSWAPLEAFCOUNT +EVALTOLAYOUTSWAPINDEXTUPLELEAFCOUNT(const, TensorEvalToOp, 1) +EVALTOLAYOUTSWAPINDEXTUPLELEAFCOUNT(, TensorEvalToOp, 1) +EVALTOLAYOUTSWAPINDEXTUPLELEAFCOUNT(const, TensorLayoutSwapOp, 0) +EVALTOLAYOUTSWAPINDEXTUPLELEAFCOUNT(, TensorLayoutSwapOp, 0) + +EVALTOLAYOUTSWAPINDEXTUPLELEAFCOUNT(const, TensorIndexTupleOp, 0) +EVALTOLAYOUTSWAPINDEXTUPLELEAFCOUNT(, TensorIndexTupleOp, 0) + +#undef EVALTOLAYOUTSWAPINDEXTUPLELEAFCOUNT /// specialisation of the \ref LeafCount struct when the node type is const TensorReductionOp -#define REDUCTIONLEAFCOUNT(CVQual)\ +#define REDUCTIONLEAFCOUNT(CVQual, ExprNode)\ template \ -struct LeafCount > {\ +struct LeafCount > {\ static const size_t Count =1;\ }; -REDUCTIONLEAFCOUNT(const) -REDUCTIONLEAFCOUNT() +// TensorReductionOp 
+REDUCTIONLEAFCOUNT(const,TensorReductionOp) +REDUCTIONLEAFCOUNT(,TensorReductionOp) + +// tensor Argmax -TensorTupleReducerOp +REDUCTIONLEAFCOUNT(const, TensorTupleReducerOp) +REDUCTIONLEAFCOUNT(, TensorTupleReducerOp) + #undef REDUCTIONLEAFCOUNT /// specialisation of the \ref LeafCount struct when the node type is const TensorContractionOp @@ -150,8 +160,6 @@ CONTRACTIONCONVOLUTIONLEAFCOUNT(const,TensorConvolutionOp) CONTRACTIONCONVOLUTIONLEAFCOUNT(,TensorConvolutionOp) #undef CONTRACTIONCONVOLUTIONLEAFCOUNT - - /// specialisation of the \ref LeafCount struct when the node type is TensorSlicingOp #define SLICEOPLEAFCOUNT(CVQual)\ template \ @@ -161,7 +169,6 @@ SLICEOPLEAFCOUNT(const) SLICEOPLEAFCOUNT() #undef SLICEOPLEAFCOUNT - /// specialisation of the \ref LeafCount struct when the node type is TensorChippingOp #define CHIPPINGOPLEAFCOUNT(CVQual)\ template \ @@ -195,7 +202,6 @@ TENSORIMAGEPATCHOPLEAFCOUNT() template\ struct LeafCount >:CategoryCount{}; - TENSORVOLUMEPATCHOPLEAFCOUNT(const) TENSORVOLUMEPATCHOPLEAFCOUNT() #undef TENSORVOLUMEPATCHOPLEAFCOUNT diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h index 99d528963..9d5708fc5 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h @@ -171,19 +171,24 @@ CUSTOMBINARYOPEVAL() /// specialisation of the \ref PlaceHolderExpression when the node is -/// TensorEvalToOp, TensorLayoutSwapOp -#define EVALTOLAYOUTSWAP(CVQual, ExprNode)\ +/// TensoroOp, TensorLayoutSwapOp, and TensorIndexTupleOp +#define EVALTOLAYOUTSWAPINDEXTUPLE(CVQual, ExprNode)\ template \ struct PlaceHolderExpression, N> {\ typedef CVQual ExprNode::ArgType> Type;\ }; -EVALTOLAYOUTSWAP(const, TensorEvalToOp) -EVALTOLAYOUTSWAP(, TensorEvalToOp) -EVALTOLAYOUTSWAP(const, TensorLayoutSwapOp) -EVALTOLAYOUTSWAP(, TensorLayoutSwapOp) +// TensorEvalToOp +EVALTOLAYOUTSWAPINDEXTUPLE(const, TensorEvalToOp) +EVALTOLAYOUTSWAPINDEXTUPLE(, TensorEvalToOp) +//TensorLayoutSwapOp +EVALTOLAYOUTSWAPINDEXTUPLE(const, TensorLayoutSwapOp) +EVALTOLAYOUTSWAPINDEXTUPLE(, TensorLayoutSwapOp) +//TensorIndexTupleOp +EVALTOLAYOUTSWAPINDEXTUPLE(const, TensorIndexTupleOp) +EVALTOLAYOUTSWAPINDEXTUPLE(, TensorIndexTupleOp) -#undef EVALTOLAYOUTSWAP +#undef EVALTOLAYOUTSWAPINDEXTUPLE /// specialisation of the \ref PlaceHolderExpression when the node is @@ -199,17 +204,24 @@ CHIPPINGOP() #undef CHIPPINGOP /// specialisation of the \ref PlaceHolderExpression when the node is -/// TensorReductionOp -#define SYCLREDUCTION(CVQual)\ +/// TensorReductionOp and TensorTupleReducerOp (Argmax) +#define SYCLREDUCTION(CVQual, ExprNode)\ template \ -struct PlaceHolderExpression, N>{\ - typedef CVQual PlaceHolder, N> Type;\ +struct PlaceHolderExpression, N>{\ + typedef CVQual PlaceHolder, N> Type;\ }; -SYCLREDUCTION(const) -SYCLREDUCTION() + +// tensor reduction +SYCLREDUCTION(const, TensorReductionOp) +SYCLREDUCTION(, TensorReductionOp) + +// tensor Argmax -TensorTupleReducerOp +SYCLREDUCTION(const, TensorTupleReducerOp) +SYCLREDUCTION(, TensorTupleReducerOp) #undef SYCLREDUCTION + /// specialisation of the \ref PlaceHolderExpression when the node is /// TensorReductionOp #define SYCLCONTRACTIONCONVOLUTIONPLH(CVQual, ExprNode)\ diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h index cac785540..29c78184d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h +++ 
b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h @@ -25,7 +25,6 @@ namespace Eigen { namespace TensorSycl { - template struct ExecExprFunctorKernel{ typedef typename internal::createPlaceHolderExpression::Type PlaceHolderExpr; @@ -38,7 +37,7 @@ template struct ExecEx void operator()(cl::sycl::nd_item<1> itemID) { typedef typename internal::ConvertToDeviceExpression::Type DevExpr; auto device_expr =internal::createDeviceExpression(functors, tuple_of_accessors); - auto device_evaluator = Eigen::TensorEvaluator(device_expr.expr, Eigen::DefaultDevice()); + auto device_evaluator = Eigen::TensorEvaluator(device_expr.expr, Eigen::SyclKernelDevice()); typename DevExpr::Index gId = static_cast(itemID.get_global_linear_id()); if (gId < range) device_evaluator.evalScalar(gId); diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index 996178292..4a558f856 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -173,6 +173,7 @@ if(EIGEN_TEST_CXX11) ei_add_test_sycl(cxx11_tensor_patch_sycl "-std=c++11") ei_add_test_sycl(cxx11_tensor_image_patchOP_sycl "-std=c++11") ei_add_test_sycl(cxx11_tensor_volume_patchOP_sycl "-std=c++11") + ei_add_test_sycl(cxx11_tensor_argmax_sycl "-std=c++11") ei_add_test_sycl(cxx11_tensor_custom_op_sycl "-std=c++11") endif(EIGEN_TEST_SYCL) # It should be safe to always run these tests as there is some fallback code for diff --git a/unsupported/test/cxx11_tensor_argmax_sycl.cpp b/unsupported/test/cxx11_tensor_argmax_sycl.cpp new file mode 100644 index 000000000..9b22f1eca --- /dev/null +++ b/unsupported/test/cxx11_tensor_argmax_sycl.cpp @@ -0,0 +1,248 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
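+
+// A rough host-side sketch of the argmax/argmin semantics exercised below
+// (illustrative only; these names are not used by the tests). argmax() with no
+// argument yields the flat index of the maximum as a rank-0 tensor of
+// DenseIndex, while argmax(d)/argmin(d) reduce dimension d and return, for each
+// remaining coordinate, the position of the extremum along d:
+//
+//   Eigen::Tensor<float, 2> t(3, 4);
+//   t.setRandom();
+//   Eigen::Tensor<Eigen::DenseIndex, 0> flat = t.argmax();   // flat index of the max
+//   Eigen::Tensor<Eigen::DenseIndex, 1> cols = t.argmax(0);  // size 4, values in [0, 3)
+//
+// The SYCL variants below follow the same pattern, staging the data through
+// device buffers with memcpyHostToDevice/memcpyDeviceToHost around the
+// evaluation on the SyclDevice.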
+ +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX +#define EIGEN_TEST_FUNC cxx11_tensor_argmax_sycl +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t +#define EIGEN_USE_SYCL + +#include "main.h" +#include + +using Eigen::array; +using Eigen::SyclDevice; +using Eigen::Tensor; +using Eigen::TensorMap; + +template +static void test_sycl_simple_argmax(const Eigen::SyclDevice &sycl_device){ + + Tensor in(Eigen::array{{2,2,2}}); + Tensor out_max; + Tensor out_min; + in.setRandom(); + in *= in.constant(100.0); + in(0, 0, 0) = -1000.0; + in(1, 1, 1) = 1000.0; + + std::size_t in_bytes = in.size() * sizeof(DataType); + std::size_t out_bytes = out_max.size() * sizeof(DenseIndex); + + DataType * d_in = static_cast(sycl_device.allocate(in_bytes)); + DenseIndex* d_out_max = static_cast(sycl_device.allocate(out_bytes)); + DenseIndex* d_out_min = static_cast(sycl_device.allocate(out_bytes)); + + Eigen::TensorMap > gpu_in(d_in, Eigen::array{{2,2,2}}); + Eigen::TensorMap > gpu_out_max(d_out_max); + Eigen::TensorMap > gpu_out_min(d_out_min); + sycl_device.memcpyHostToDevice(d_in, in.data(),in_bytes); + + gpu_out_max.device(sycl_device) = gpu_in.argmax(); + gpu_out_min.device(sycl_device) = gpu_in.argmin(); + + sycl_device.memcpyDeviceToHost(out_max.data(), d_out_max, out_bytes); + sycl_device.memcpyDeviceToHost(out_min.data(), d_out_min, out_bytes); + + VERIFY_IS_EQUAL(out_max(), 2*2*2 - 1); + VERIFY_IS_EQUAL(out_min(), 0); + + sycl_device.deallocate(d_in); + sycl_device.deallocate(d_out_max); + sycl_device.deallocate(d_out_min); +} + + +template +static void test_sycl_argmax_dim(const Eigen::SyclDevice &sycl_device) +{ + DenseIndex sizeDim0=9; + DenseIndex sizeDim1=3; + DenseIndex sizeDim2=5; + DenseIndex sizeDim3=7; + Tensor tensor(sizeDim0,sizeDim1,sizeDim2,sizeDim3); + + std::vector dims; + dims.push_back(sizeDim0); dims.push_back(sizeDim1); dims.push_back(sizeDim2); dims.push_back(sizeDim3); + for (DenseIndex dim = 0; dim < 4; ++dim) { + + array out_shape; + for (DenseIndex d = 0; d < 3; ++d) out_shape[d] = (d < dim) ? 
dims[d] : dims[d+1]; + + Tensor tensor_arg(out_shape); + + array ix; + for (DenseIndex i = 0; i < sizeDim0; ++i) { + for (DenseIndex j = 0; j < sizeDim1; ++j) { + for (DenseIndex k = 0; k < sizeDim2; ++k) { + for (DenseIndex l = 0; l < sizeDim3; ++l) { + ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l; + // suppose dim == 1, then for all i, k, l, set tensor(i, 0, k, l) = 10.0 + tensor(ix)=(ix[dim] != 0)?-1.0:10.0; + } + } + } + } + + std::size_t in_bytes = tensor.size() * sizeof(DataType); + std::size_t out_bytes = tensor_arg.size() * sizeof(DenseIndex); + + + DataType * d_in = static_cast(sycl_device.allocate(in_bytes)); + DenseIndex* d_out= static_cast(sycl_device.allocate(out_bytes)); + + Eigen::TensorMap > gpu_in(d_in, Eigen::array{{sizeDim0,sizeDim1,sizeDim2,sizeDim3}}); + Eigen::TensorMap > gpu_out(d_out, out_shape); + + sycl_device.memcpyHostToDevice(d_in, tensor.data(),in_bytes); + gpu_out.device(sycl_device) = gpu_in.argmax(dim); + sycl_device.memcpyDeviceToHost(tensor_arg.data(), d_out, out_bytes); + + VERIFY_IS_EQUAL(static_cast(tensor_arg.size()), + size_t(sizeDim0*sizeDim1*sizeDim2*sizeDim3 / tensor.dimension(dim))); + + for (DenseIndex n = 0; n < tensor_arg.size(); ++n) { + // Expect max to be in the first index of the reduced dimension + VERIFY_IS_EQUAL(tensor_arg.data()[n], 0); + } + + sycl_device.synchronize(); + + for (DenseIndex i = 0; i < sizeDim0; ++i) { + for (DenseIndex j = 0; j < sizeDim1; ++j) { + for (DenseIndex k = 0; k < sizeDim2; ++k) { + for (DenseIndex l = 0; l < sizeDim3; ++l) { + ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l; + // suppose dim == 1, then for all i, k, l, set tensor(i, 2, k, l) = 20.0 + tensor(ix)=(ix[dim] != tensor.dimension(dim) - 1)?-1.0:20.0; + } + } + } + } + + sycl_device.memcpyHostToDevice(d_in, tensor.data(),in_bytes); + gpu_out.device(sycl_device) = gpu_in.argmax(dim); + sycl_device.memcpyDeviceToHost(tensor_arg.data(), d_out, out_bytes); + + for (DenseIndex n = 0; n < tensor_arg.size(); ++n) { + // Expect max to be in the last index of the reduced dimension + VERIFY_IS_EQUAL(tensor_arg.data()[n], tensor.dimension(dim) - 1); + } + sycl_device.deallocate(d_in); + sycl_device.deallocate(d_out); + } +} + + + + + +template +static void test_sycl_argmin_dim(const Eigen::SyclDevice &sycl_device) +{ + DenseIndex sizeDim0=9; + DenseIndex sizeDim1=3; + DenseIndex sizeDim2=5; + DenseIndex sizeDim3=7; + Tensor tensor(sizeDim0,sizeDim1,sizeDim2,sizeDim3); + + std::vector dims; + dims.push_back(sizeDim0); dims.push_back(sizeDim1); dims.push_back(sizeDim2); dims.push_back(sizeDim3); + for (DenseIndex dim = 0; dim < 4; ++dim) { + + array out_shape; + for (DenseIndex d = 0; d < 3; ++d) out_shape[d] = (d < dim) ? 
dims[d] : dims[d+1]; + + Tensor tensor_arg(out_shape); + + array ix; + for (DenseIndex i = 0; i < sizeDim0; ++i) { + for (DenseIndex j = 0; j < sizeDim1; ++j) { + for (DenseIndex k = 0; k < sizeDim2; ++k) { + for (DenseIndex l = 0; l < sizeDim3; ++l) { + ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l; + // suppose dim == 1, then for all i, k, l, set tensor(i, 0, k, l) = 10.0 + tensor(ix)=(ix[dim] != 0)?1.0:-10.0; + } + } + } + } + + std::size_t in_bytes = tensor.size() * sizeof(DataType); + std::size_t out_bytes = tensor_arg.size() * sizeof(DenseIndex); + + + DataType * d_in = static_cast(sycl_device.allocate(in_bytes)); + DenseIndex* d_out= static_cast(sycl_device.allocate(out_bytes)); + + Eigen::TensorMap > gpu_in(d_in, Eigen::array{{sizeDim0,sizeDim1,sizeDim2,sizeDim3}}); + Eigen::TensorMap > gpu_out(d_out, out_shape); + + sycl_device.memcpyHostToDevice(d_in, tensor.data(),in_bytes); + gpu_out.device(sycl_device) = gpu_in.argmin(dim); + sycl_device.memcpyDeviceToHost(tensor_arg.data(), d_out, out_bytes); + + VERIFY_IS_EQUAL(static_cast(tensor_arg.size()), + size_t(sizeDim0*sizeDim1*sizeDim2*sizeDim3 / tensor.dimension(dim))); + + for (DenseIndex n = 0; n < tensor_arg.size(); ++n) { + // Expect max to be in the first index of the reduced dimension + VERIFY_IS_EQUAL(tensor_arg.data()[n], 0); + } + + sycl_device.synchronize(); + + for (DenseIndex i = 0; i < sizeDim0; ++i) { + for (DenseIndex j = 0; j < sizeDim1; ++j) { + for (DenseIndex k = 0; k < sizeDim2; ++k) { + for (DenseIndex l = 0; l < sizeDim3; ++l) { + ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l; + // suppose dim == 1, then for all i, k, l, set tensor(i, 2, k, l) = 20.0 + tensor(ix)=(ix[dim] != tensor.dimension(dim) - 1)?1.0:-20.0; + } + } + } + } + + sycl_device.memcpyHostToDevice(d_in, tensor.data(),in_bytes); + gpu_out.device(sycl_device) = gpu_in.argmin(dim); + sycl_device.memcpyDeviceToHost(tensor_arg.data(), d_out, out_bytes); + + for (DenseIndex n = 0; n < tensor_arg.size(); ++n) { + // Expect max to be in the last index of the reduced dimension + VERIFY_IS_EQUAL(tensor_arg.data()[n], tensor.dimension(dim) - 1); + } + sycl_device.deallocate(d_in); + sycl_device.deallocate(d_out); + } +} + + + + +template void sycl_argmax_test_per_device(const Device_Selector& d){ + QueueInterface queueInterface(d); + auto sycl_device = Eigen::SyclDevice(&queueInterface); + test_sycl_simple_argmax(sycl_device); + test_sycl_simple_argmax(sycl_device); + test_sycl_argmax_dim(sycl_device); + test_sycl_argmax_dim(sycl_device); + test_sycl_argmin_dim(sycl_device); + test_sycl_argmin_dim(sycl_device); +} + +void test_cxx11_tensor_argmax_sycl() { + for (const auto& device :Eigen::get_sycl_supported_devices()) { + CALL_SUBTEST(sycl_argmax_test_per_device(device)); + } +} -- cgit v1.2.3 From e2e3f785331cb90ae07b7ca7829be0ffecf6811b Mon Sep 17 00:00:00 2001 From: Mehdi Goli Date: Tue, 7 Mar 2017 17:48:15 +0000 Subject: Fixing potential race condition on sycl device. 
--- .../Eigen/CXX11/src/Tensor/TensorDeviceSycl.h | 502 +++++++++++---------- 1 file changed, 259 insertions(+), 243 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h index 258218463..23297a0a7 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h @@ -41,6 +41,7 @@ namespace Eigen { size_t m_i; size_t m_offset; }; + template struct memsetkernelFunctor{ AccType m_acc; @@ -54,6 +55,21 @@ template }; +struct memsetCghFunctor{ + cl::sycl::buffer& m_buf; + const ptrdiff_t& buff_offset; + const size_t& rng , GRange, tileSize; + const int &c; + memsetCghFunctor(cl::sycl::buffer& buff, const ptrdiff_t& buff_offset_, const size_t& rng_, const size_t& GRange_, const size_t& tileSize_, const int& c_) + :m_buf(buff), buff_offset(buff_offset_), rng(rng_), GRange(GRange_), tileSize(tileSize_), c(c_){} + + void operator()(cl::sycl::handler &cgh) const { + auto buf_acc = m_buf.template get_access(cgh); + typedef decltype(buf_acc) AccType; + cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), memsetkernelFunctor(buf_acc, buff_offset, rng, c)); + } +}; + //get_devices returns all the available opencl devices. Either use device_selector or exclude devices that computecpp does not support (AMD OpenCL for CPU and intel GPU) EIGEN_STRONG_INLINE auto get_sycl_supported_devices()->decltype(cl::sycl::device::get_devices()){ auto devices = cl::sycl::device::get_devices(); @@ -75,18 +91,8 @@ EIGEN_STRONG_INLINE auto get_sycl_supported_devices()->decltype(cl::sycl::device return devices; } -struct QueueInterface { - /// class members: - bool exception_caught_ = false; - - mutable std::mutex mutex_; - - /// std::map is the container used to make sure that we create only one buffer - /// per pointer. The lifespan of the buffer now depends on the lifespan of SyclDevice. - /// If a non-read-only pointer is needed to be accessed on the host we should manually deallocate it. - mutable std::map> buffer_map; - /// sycl queue - mutable cl::sycl::queue m_queue; +class QueueInterface { +public: /// creating device by using cl::sycl::selector or cl::sycl::device both are the same and can be captured through dev_Selector typename /// SyclStreamDevice is not owned. it is the caller's responsibility to destroy it. template explicit QueueInterface(const dev_Selector& s): @@ -115,155 +121,6 @@ m_queue(cl::sycl::queue(s, [&](cl::sycl::exception_list l) { })) #endif {} -//FIXME: currently we have to switch back to write as discard_write doesnot work in forloop -template EIGEN_STRONG_INLINE void memcpyHostToDevice(Index *dst, const Index *src, size_t n) const { - std::lock_guard lock(mutex_); - auto host_acc= find_buffer(dst)->second. 
template get_access(); - ::memcpy(host_acc.get_pointer(), src, n); -} - -template EIGEN_STRONG_INLINE void memcpyDeviceToHost(void *dst, const Index *src, size_t n) const { - std::lock_guard lock(mutex_); - // Assuming that the dst is the start of the destination pointer -auto it =find_buffer(src); -auto offset =static_cast(static_cast(src))- it->first; -offset/=sizeof(Index); -size_t rng, GRange, tileSize; -parallel_for_setup(n/sizeof(Index), tileSize, rng, GRange); - auto dest_buf = cl::sycl::buffer >(static_cast(dst), cl::sycl::range<1>(n)); - m_queue.submit([&](cl::sycl::handler &cgh) { - auto src_acc= it->second.template get_access(cgh); - auto dst_acc =dest_buf.template get_access(cgh); - typedef decltype(src_acc) read_accessor; - typedef decltype(dst_acc) write_accessor; - cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor(src_acc, dst_acc, rng, 0, offset)); - }); - synchronize(); - -} - -EIGEN_STRONG_INLINE void synchronize() const { - std::lock_guard lock(mutex_); - m_queue.wait_and_throw(); //pass -} -EIGEN_STRONG_INLINE void asynchronousExec() const { - ///FIXEDME:: currently there is a race condition regarding the asynch scheduler. - //sycl_queue().throw_asynchronous();// does not pass. Temporarily disabled - std::lock_guard lock(mutex_); - m_queue.wait_and_throw(); //pass - -} - -template -EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize, Index &rng, Index &GRange) const { - tileSize =static_cast(m_queue.get_device(). template get_info()); - auto s= m_queue.get_device().template get_info(); - std::transform(s.begin(), s.end(), s.begin(), ::tolower); - if(m_queue.get_device().is_cpu()){ // intel doesnot allow to use max workgroup size - tileSize=std::min(static_cast(256), static_cast(tileSize)); - } - rng = n; - if (rng==0) rng=static_cast(1); - GRange=rng; - if (tileSize>GRange) tileSize=GRange; - else if(GRange>tileSize){ - Index xMode = static_cast(GRange % tileSize); - if (xMode != 0) GRange += static_cast(tileSize - xMode); - } -} - -/// This is used to prepare the number of threads and also the number of threads per block for sycl kernels -template -EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1, Index &tileSize0, Index &tileSize1, Index &rng0, Index &rng1, Index &GRange0, Index &GRange1) const { - Index max_workgroup_Size = static_cast(maxSyclThreadsPerBlock()); - if(m_queue.get_device().is_cpu()){ // intel doesnot allow to use max workgroup size - max_workgroup_Size=std::min(static_cast(256), static_cast(max_workgroup_Size)); - } - Index pow_of_2 = static_cast(std::log2(max_workgroup_Size)); - tileSize1 =static_cast(std::pow(2, static_cast(pow_of_2/2))); - rng1=dim1; - if (rng1==0 ) rng1=static_cast(1); - GRange1=rng1; - if (tileSize1>GRange1) tileSize1=GRange1; - else if(GRange1>tileSize1){ - Index xMode = static_cast(GRange1 % tileSize1); - if (xMode != 0) GRange1 += static_cast(tileSize1 - xMode); - } - tileSize0 = static_cast(max_workgroup_Size/tileSize1); - rng0 = dim0; - if (rng0==0 ) rng0=static_cast(1); - GRange0=rng0; - if (tileSize0>GRange0) tileSize0=GRange0; - else if(GRange0>tileSize0){ - Index xMode = static_cast(GRange0 % tileSize0); - if (xMode != 0) GRange0 += static_cast(tileSize0 - xMode); - } -} - - - -/// This is used to prepare the number of threads and also the number of threads per block for sycl kernels -template -EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1,Index dim2, Index &tileSize0, Index &tileSize1, Index &tileSize2, 
Index &rng0, Index &rng1, Index &rng2, Index &GRange0, Index &GRange1, Index &GRange2) const { - Index max_workgroup_Size = static_cast(maxSyclThreadsPerBlock()); - if(m_queue.get_device().is_cpu()){ // intel doesnot allow to use max workgroup size - max_workgroup_Size=std::min(static_cast(256), static_cast(max_workgroup_Size)); - } - Index pow_of_2 = static_cast(std::log2(max_workgroup_Size)); - tileSize2 =static_cast(std::pow(2, static_cast(pow_of_2/3))); - rng2=dim2; - if (rng2==0 ) rng1=static_cast(1); - GRange2=rng2; - if (tileSize2>GRange2) tileSize2=GRange2; - else if(GRange2>tileSize2){ - Index xMode = static_cast(GRange2 % tileSize2); - if (xMode != 0) GRange2 += static_cast(tileSize2 - xMode); - } - pow_of_2 = static_cast(std::log2(static_cast(max_workgroup_Size/tileSize2))); - tileSize1 =static_cast(std::pow(2, static_cast(pow_of_2/2))); - rng1=dim1; - if (rng1==0 ) rng1=static_cast(1); - GRange1=rng1; - if (tileSize1>GRange1) tileSize1=GRange1; - else if(GRange1>tileSize1){ - Index xMode = static_cast(GRange1 % tileSize1); - if (xMode != 0) GRange1 += static_cast(tileSize1 - xMode); - } - tileSize0 = static_cast(max_workgroup_Size/(tileSize1*tileSize2)); - rng0 = dim0; - if (rng0==0 ) rng0=static_cast(1); - GRange0=rng0; - if (tileSize0>GRange0) tileSize0=GRange0; - else if(GRange0>tileSize0){ - Index xMode = static_cast(GRange0 % tileSize0); - if (xMode != 0) GRange0 += static_cast(tileSize0 - xMode); - } -} - - -EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const { - std::lock_guard lock(mutex_); - return m_queue.get_device(). template get_info(); -// return stream_->deviceProperties().multiProcessorCount; -} -EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const { - std::lock_guard lock(mutex_); - return m_queue.get_device(). template get_info(); - -// return stream_->deviceProperties().maxThreadsPerBlock; -} -EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const { - std::lock_guard lock(mutex_); - // OpenCL doesnot have such concept - return 2;//sycl_queue().get_device(). template get_info(); -// return stream_->deviceProperties().maxThreadsPerMultiProcessor; -} -EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const { - std::lock_guard lock(mutex_); - return m_queue.get_device(). template get_info(); -// return stream_->deviceProperties().sharedMemPerBlock; -} - /// Allocating device pointer. This pointer is actually an 8 bytes host pointer used as key to access the sycl device buffer. /// The reason is that we cannot use device buffer as a pointer as a m_data in Eigen leafNode expressions. So we create a key /// pointer to be used in Eigen expression construction. When we convert the Eigen construction into the sycl construction we @@ -292,23 +149,208 @@ EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const { std::lock_guard lock(mutex_); buffer_map.clear(); } + //FIXME: currently we have to switch back to write as discard_write doesnot work in forloop + /// The memcpyHostToDevice is used to copy the device only pointer to a host pointer. Using the device + /// pointer created as a key we find the sycl buffer and get the host accessor with discard_write mode + /// on it. Using a discard_write accessor guarantees that we do not bring back the current value of the + /// buffer to host. Then we use the memcpy to copy the data to the host accessor. The first time that + /// this buffer is accessed, the data will be copied to the device. 
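+ /// A typical host-side call sequence looks roughly like this (illustrative
+ /// names, mirroring the sycl unit tests; gpu_in/gpu_out are TensorMaps built on
+ /// top of pointers returned by allocate()):
+ ///   sycl_device.memcpyHostToDevice(gpu_in_ptr, host_in.data(), bytes);
+ ///   gpu_out.device(sycl_device) = gpu_in.exp();   // any tensor expression
+ ///   sycl_device.memcpyDeviceToHost(host_out.data(), gpu_out_ptr, bytes);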
+ template EIGEN_STRONG_INLINE void memcpyHostToDevice(Index *dst, const Index *src, size_t n) const { + std::lock_guard lock(mutex_); + auto host_acc= find_buffer(dst)->second. template get_access(); + ::memcpy(host_acc.get_pointer(), src, n); + } + /// The memcpyDeviceToHost is used to copy the data from host to device. Here, in order to avoid double copying the data. We create a sycl + /// buffer with map_allocator for the destination pointer with a discard_write accessor on it. The lifespan of the buffer is bound to the + /// lifespan of the memcpyDeviceToHost function. We create a kernel to copy the data, from the device- only source buffer to the destination + /// buffer with map_allocator on the gpu in parallel. At the end of the function call the destination buffer would be destroyed and the data + /// would be available on the dst pointer using fast copy technique (map_allocator). In this case we can make sure that we copy the data back + /// to the cpu only once per function call. + template EIGEN_STRONG_INLINE void memcpyDeviceToHost(void *dst, const Index *src, size_t n) const { + std::lock_guard lock(mutex_); + auto it =find_buffer(src); + auto offset =static_cast(static_cast(src))- it->first; + offset/=sizeof(Index); + size_t rng, GRange, tileSize; + parallel_for_setup(n/sizeof(Index), tileSize, rng, GRange); + auto dest_buf = cl::sycl::buffer >(static_cast(dst), cl::sycl::range<1>(n)); + m_queue.submit([&](cl::sycl::handler &cgh) { + auto src_acc= it->second.template get_access(cgh); + auto dst_acc =dest_buf.template get_access(cgh); + typedef decltype(src_acc) read_accessor; + typedef decltype(dst_acc) write_accessor; + cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor(src_acc, dst_acc, rng, 0, offset)); + }); + synchronize(); + } - EIGEN_STRONG_INLINE std::map>::iterator find_buffer(const void* ptr) const { + /// the memcpy function + template EIGEN_STRONG_INLINE void memcpy(void *dst, const Index *src, size_t n) const { std::lock_guard lock(mutex_); - auto it1 = buffer_map.find(static_cast(ptr)); - if (it1 != buffer_map.end()){ - return it1; + auto it1 = find_buffer(static_cast(src)); + auto it2 = find_buffer(dst); + auto offset= (static_cast(static_cast(src))) - it1->first; + auto i= (static_cast(dst)) - it2->first; + offset/=sizeof(Index); + i/=sizeof(Index); + size_t rng, GRange, tileSize; + parallel_for_setup(n/sizeof(Index), tileSize, rng, GRange); + m_queue.submit([&](cl::sycl::handler &cgh) { + auto src_acc =it1->second.template get_access(cgh); + auto dst_acc =it2->second.template get_access(cgh); + typedef decltype(src_acc) read_accessor; + typedef decltype(dst_acc) write_accessor; + cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor(src_acc, dst_acc, rng, i, offset)); + }); + synchronize(); + } + + EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const { + std::lock_guard lock(mutex_); + size_t rng, GRange, tileSize; + parallel_for_setup(n, tileSize, rng, GRange); + auto it1 = find_buffer(static_cast(data)); + ptrdiff_t buff_offset= (static_cast(data)) - it1->first; + m_queue.submit(memsetCghFunctor(it1->second, buff_offset, rng, GRange, tileSize, c )); + synchronize(); + } + + /// Creation of sycl accessor for a buffer. This function first tries to find + /// the buffer in the buffer_map. If found it gets the accessor from it, if not, + /// the function then adds an entry by creating a sycl buffer for that particular pointer. 
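+ /// Kernel functors typically capture the accessor returned here in their tuple
+ /// of accessors and, inside operator(), reinterpret it to the actual scalar type
+ /// (see the ConvertToActualTypeSycl usage in the reduction and executor kernels)
+ /// before building the device-side evaluator.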
+ template EIGEN_STRONG_INLINE cl::sycl::accessor + get_sycl_accessor(cl::sycl::handler &cgh, const void* ptr) const { + std::lock_guard lock(mutex_); + return (find_buffer(ptr)->second.template get_access(cgh)); + } + + /// Accessing the created sycl device buffer for the device pointer + EIGEN_STRONG_INLINE cl::sycl::buffer& get_sycl_buffer(const void * ptr) const { + std::lock_guard lock(mutex_); + return find_buffer(ptr)->second; + } + + EIGEN_STRONG_INLINE ptrdiff_t get_offset(const void *ptr) const { + std::lock_guard lock(mutex_); + return (static_cast(ptr))-(find_buffer(ptr)->first); + } + + EIGEN_STRONG_INLINE void synchronize() const { + m_queue.wait_and_throw(); //pass + } + + EIGEN_STRONG_INLINE void asynchronousExec() const { + ///FIXEDME:: currently there is a race condition regarding the asynch scheduler. + //sycl_queue().throw_asynchronous();// FIXME::does not pass. Temporarily disabled + m_queue.wait_and_throw(); //pass + } + + template + EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize, Index &rng, Index &GRange) const { + tileSize =static_cast(m_queue.get_device(). template get_info()); + auto s= m_queue.get_device().template get_info(); + std::transform(s.begin(), s.end(), s.begin(), ::tolower); + if(m_queue.get_device().is_cpu()){ // intel doesnot allow to use max workgroup size + tileSize=std::min(static_cast(256), static_cast(tileSize)); } - else{ - for(std::map>::iterator it=buffer_map.begin(); it!=buffer_map.end(); ++it){ - auto size = it->second.get_size(); - if((it->first < (static_cast(ptr))) && ((static_cast(ptr)) < (it->first + size)) ) return it; - } + rng = n; + if (rng==0) rng=static_cast(1); + GRange=rng; + if (tileSize>GRange) tileSize=GRange; + else if(GRange>tileSize){ + Index xMode = static_cast(GRange % tileSize); + if (xMode != 0) GRange += static_cast(tileSize - xMode); } - std::cerr << "No sycl buffer found. 
Make sure that you have allocated memory for your buffer by calling malloc-ed function."<< std::endl; - abort(); } + /// This is used to prepare the number of threads and also the number of threads per block for sycl kernels + template + EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1, Index &tileSize0, Index &tileSize1, Index &rng0, Index &rng1, Index &GRange0, Index &GRange1) const { + Index max_workgroup_Size = static_cast(maxSyclThreadsPerBlock()); + if(m_queue.get_device().is_cpu()){ // intel doesnot allow to use max workgroup size + max_workgroup_Size=std::min(static_cast(256), static_cast(max_workgroup_Size)); + } + Index pow_of_2 = static_cast(std::log2(max_workgroup_Size)); + tileSize1 =static_cast(std::pow(2, static_cast(pow_of_2/2))); + rng1=dim1; + if (rng1==0 ) rng1=static_cast(1); + GRange1=rng1; + if (tileSize1>GRange1) tileSize1=GRange1; + else if(GRange1>tileSize1){ + Index xMode = static_cast(GRange1 % tileSize1); + if (xMode != 0) GRange1 += static_cast(tileSize1 - xMode); + } + tileSize0 = static_cast(max_workgroup_Size/tileSize1); + rng0 = dim0; + if (rng0==0 ) rng0=static_cast(1); + GRange0=rng0; + if (tileSize0>GRange0) tileSize0=GRange0; + else if(GRange0>tileSize0){ + Index xMode = static_cast(GRange0 % tileSize0); + if (xMode != 0) GRange0 += static_cast(tileSize0 - xMode); + } + } + + /// This is used to prepare the number of threads and also the number of threads per block for sycl kernels + template + EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1,Index dim2, Index &tileSize0, Index &tileSize1, Index &tileSize2, Index &rng0, Index &rng1, Index &rng2, Index &GRange0, Index &GRange1, Index &GRange2) const { + Index max_workgroup_Size = static_cast(maxSyclThreadsPerBlock()); + if(m_queue.get_device().is_cpu()){ // intel doesnot allow to use max workgroup size + max_workgroup_Size=std::min(static_cast(256), static_cast(max_workgroup_Size)); + } + Index pow_of_2 = static_cast(std::log2(max_workgroup_Size)); + tileSize2 =static_cast(std::pow(2, static_cast(pow_of_2/3))); + rng2=dim2; + if (rng2==0 ) rng1=static_cast(1); + GRange2=rng2; + if (tileSize2>GRange2) tileSize2=GRange2; + else if(GRange2>tileSize2){ + Index xMode = static_cast(GRange2 % tileSize2); + if (xMode != 0) GRange2 += static_cast(tileSize2 - xMode); + } + pow_of_2 = static_cast(std::log2(static_cast(max_workgroup_Size/tileSize2))); + tileSize1 =static_cast(std::pow(2, static_cast(pow_of_2/2))); + rng1=dim1; + if (rng1==0 ) rng1=static_cast(1); + GRange1=rng1; + if (tileSize1>GRange1) tileSize1=GRange1; + else if(GRange1>tileSize1){ + Index xMode = static_cast(GRange1 % tileSize1); + if (xMode != 0) GRange1 += static_cast(tileSize1 - xMode); + } + tileSize0 = static_cast(max_workgroup_Size/(tileSize1*tileSize2)); + rng0 = dim0; + if (rng0==0 ) rng0=static_cast(1); + GRange0=rng0; + if (tileSize0>GRange0) tileSize0=GRange0; + else if(GRange0>tileSize0){ + Index xMode = static_cast(GRange0 % tileSize0); + if (xMode != 0) GRange0 += static_cast(tileSize0 - xMode); + } + } + + EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const { + return m_queue.get_device(). template get_info(); + } + + EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const { + return m_queue.get_device(). 
template get_info(); + } + + /// No need for sycl it should act the same as CPU version + EIGEN_STRONG_INLINE int majorDeviceVersion() const { return 1; } + + EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const { + // OpenCL doesnot have such concept + return 2; + } + + EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const { + return m_queue.get_device(). template get_info(); + } + + EIGEN_STRONG_INLINE cl::sycl::queue& sycl_queue() const { return m_queue;} + // This function checks if the runtime recorded an error for the // underlying stream device. EIGEN_STRONG_INLINE bool ok() const { @@ -320,25 +362,52 @@ EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const { // destructor ~QueueInterface() { buffer_map.clear(); } + +private: + /// class members: + bool exception_caught_ = false; + + mutable std::mutex mutex_; + + /// std::map is the container used to make sure that we create only one buffer + /// per pointer. The lifespan of the buffer now depends on the lifespan of SyclDevice. + /// If a non-read-only pointer is needed to be accessed on the host we should manually deallocate it. + mutable std::map> buffer_map; + /// sycl queue + mutable cl::sycl::queue m_queue; + EIGEN_STRONG_INLINE std::map>::iterator find_buffer(const void* ptr) const { + auto it1 = buffer_map.find(static_cast(ptr)); + if (it1 != buffer_map.end()){ + return it1; + } + else{ + for(std::map>::iterator it=buffer_map.begin(); it!=buffer_map.end(); ++it){ + auto size = it->second.get_size(); + if((it->first < (static_cast(ptr))) && ((static_cast(ptr)) < (it->first + size)) ) return it; + } + } + std::cerr << "No sycl buffer found. Make sure that you have allocated memory for your buffer by calling malloc-ed function."<< std::endl; + abort(); + } }; +// Here is a sycl deviuce struct which accept the sycl queue interface +// as an input struct SyclDevice { // class member. QueueInterface* m_queue_stream; /// QueueInterface is not owned. it is the caller's responsibility to destroy it. explicit SyclDevice(QueueInterface* queue_stream) : m_queue_stream(queue_stream){} - /// Creation of sycl accessor for a buffer. This function first tries to find - /// the buffer in the buffer_map. If found it gets the accessor from it, if not, - /// the function then adds an entry by creating a sycl buffer for that particular pointer. 
+ // get sycl accessor template EIGEN_STRONG_INLINE cl::sycl::accessor get_sycl_accessor(cl::sycl::handler &cgh, const void* ptr) const { - return (get_sycl_buffer(ptr).template get_access(cgh)); + return m_queue_stream->template get_sycl_accessor(cgh, ptr); } /// Accessing the created sycl device buffer for the device pointer EIGEN_STRONG_INLINE cl::sycl::buffer& get_sycl_buffer(const void * ptr) const { - return m_queue_stream->find_buffer(ptr)->second; + return m_queue_stream->get_sycl_buffer(ptr); } /// This is used to prepare the number of threads and also the number of threads per block for sycl kernels @@ -353,8 +422,6 @@ struct SyclDevice { m_queue_stream->parallel_for_setup(dim0, dim1, tileSize0, tileSize1, rng0, rng1, GRange0, GRange1); } - - /// This is used to prepare the number of threads and also the number of threads per block for sycl kernels template EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1,Index dim2, Index &tileSize0, Index &tileSize1, Index &tileSize2, Index &rng0, Index &rng1, Index &rng2, Index &GRange0, Index &GRange1, Index &GRange2) const { @@ -375,72 +442,27 @@ struct SyclDevice { /// the memcpy function template EIGEN_STRONG_INLINE void memcpy(void *dst, const Index *src, size_t n) const { - auto it1 = m_queue_stream->find_buffer(static_cast(src)); - auto it2 = m_queue_stream->find_buffer(dst); - auto offset= (static_cast(static_cast(src))) - it1->first; - auto i= (static_cast(dst)) - it2->first; - offset/=sizeof(Index); - i/=sizeof(Index); - size_t rng, GRange, tileSize; - parallel_for_setup(n/sizeof(Index), tileSize, rng, GRange); - sycl_queue().submit([&](cl::sycl::handler &cgh) { - auto src_acc =it1->second.template get_access(cgh); - auto dst_acc =it2->second.template get_access(cgh); - typedef decltype(src_acc) read_accessor; - typedef decltype(dst_acc) write_accessor; - cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor(src_acc, dst_acc, rng, i, offset)); - }); - synchronize(); + m_queue_stream->memcpy(dst,src,n); } EIGEN_STRONG_INLINE ptrdiff_t get_offset(const void *ptr) const { - auto it = m_queue_stream->find_buffer(ptr); - return (static_cast(ptr))-it->first; + return m_queue_stream->get_offset(ptr); } - /// The memcpyHostToDevice is used to copy the device only pointer to a host pointer. Using the device - /// pointer created as a key we find the sycl buffer and get the host accessor with discard_write mode - /// on it. Using a discard_write accessor guarantees that we do not bring back the current value of the - /// buffer to host. Then we use the memcpy to copy the data to the host accessor. The first time that - /// this buffer is accessed, the data will be copied to the device. +// memcpyHostToDevice template EIGEN_STRONG_INLINE void memcpyHostToDevice(Index *dst, const Index *src, size_t n) const { m_queue_stream->memcpyHostToDevice(dst,src,n); } - /// The memcpyDeviceToHost is used to copy the data from host to device. Here, in order to avoid double copying the data. We create a sycl - /// buffer with map_allocator for the destination pointer with a discard_write accessor on it. The lifespan of the buffer is bound to the - /// lifespan of the memcpyDeviceToHost function. We create a kernel to copy the data, from the device- only source buffer to the destination - /// buffer with map_allocator on the gpu in parallel. 
At the end of the function call the destination buffer would be destroyed and the data - /// would be available on the dst pointer using fast copy technique (map_allocator). In this case we can make sure that we copy the data back - /// to the cpu only once per function call. +/// here is the memcpyDeviceToHost template EIGEN_STRONG_INLINE void memcpyDeviceToHost(void *dst, const Index *src, size_t n) const { m_queue_stream->memcpyDeviceToHost(dst,src,n); } - /// returning the sycl queue - EIGEN_STRONG_INLINE cl::sycl::queue& sycl_queue() const { return m_queue_stream->m_queue;} /// Here is the implementation of memset function on sycl. EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const { - size_t rng, GRange, tileSize; - parallel_for_setup(n, tileSize, rng, GRange); - auto it1 = m_queue_stream->find_buffer(static_cast(data)); - ptrdiff_t buff_offset= (static_cast(data)) - it1->first; - sycl_queue().submit(memsetCghFunctor(it1->second, buff_offset, rng, GRange, tileSize, c )); - synchronize(); + m_queue_stream->memset(data,c,n); } - - struct memsetCghFunctor{ - cl::sycl::buffer& m_buf; - const ptrdiff_t& buff_offset; - const size_t& rng , GRange, tileSize; - const int &c; - memsetCghFunctor(cl::sycl::buffer& buff, const ptrdiff_t& buff_offset_, const size_t& rng_, const size_t& GRange_, const size_t& tileSize_, const int& c_) - :m_buf(buff), buff_offset(buff_offset_), rng(rng_), GRange(GRange_), tileSize(tileSize_), c(c_){} - - void operator()(cl::sycl::handler &cgh) const { - auto buf_acc = m_buf.template get_access(cgh); - typedef decltype(buf_acc) AccType; - cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), memsetkernelFunctor(buf_acc, buff_offset, rng, c)); - } - }; + /// returning the sycl queue + EIGEN_STRONG_INLINE cl::sycl::queue& sycl_queue() const { return m_queue_stream->sycl_queue();} EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { // FIXME @@ -449,37 +471,31 @@ struct SyclDevice { EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const { // We won't try to take advantage of the l2 cache for the time being, and - // there is no l3 cache on cuda devices. + // there is no l3 cache on sycl devices. return firstLevelCacheSize(); } EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const { - return sycl_queue().get_device(). template get_info(); - // return stream_->deviceProperties().multiProcessorCount; + return m_queue_stream->getNumSyclMultiProcessors(); } EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const { - return sycl_queue().get_device(). template get_info(); - - // return stream_->deviceProperties().maxThreadsPerBlock; + return m_queue_stream->maxSyclThreadsPerBlock(); } EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const { // OpenCL doesnot have such concept - return 2;//sycl_queue().get_device(). template get_info(); + return m_queue_stream->maxSyclThreadsPerMultiProcessor(); // return stream_->deviceProperties().maxThreadsPerMultiProcessor; } EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const { - return sycl_queue().get_device(). 
template get_info(); - // return stream_->deviceProperties().sharedMemPerBlock; + return m_queue_stream->sharedMemPerBlock(); } /// No need for sycl it should act the same as CPU version - EIGEN_STRONG_INLINE int majorDeviceVersion() const { return 1; } + EIGEN_STRONG_INLINE int majorDeviceVersion() const { return m_queue_stream->majorDeviceVersion(); } EIGEN_STRONG_INLINE void synchronize() const { m_queue_stream->synchronize(); //pass } EIGEN_STRONG_INLINE void asynchronousExec() const { - ///FIXEDME:: currently there is a race condition regarding the asynch scheduler. - //sycl_queue().throw_asynchronous();// does not pass. Temporarily disabled m_queue_stream->asynchronousExec(); } // This function checks if the runtime recorded an error for the -- cgit v1.2.3 From 5e9a1e7a7a7eccbb20a2c4eb44141727b0943f11 Mon Sep 17 00:00:00 2001 From: Mehdi Goli Date: Wed, 8 Mar 2017 14:17:48 +0000 Subject: Adding sycl Benchmarks. --- bench/tensors/README | 10 +- bench/tensors/tensor_benchmarks.h | 106 ++++++++++++++++++--- bench/tensors/tensor_benchmarks_sycl.cc | 73 ++++++++++++-- .../tensor_benchmarks_sycl_include_headers.cc | 2 + .../Eigen/CXX11/src/Tensor/TensorDeviceSycl.h | 33 ++++--- 5 files changed, 184 insertions(+), 40 deletions(-) create mode 100644 bench/tensors/tensor_benchmarks_sycl_include_headers.cc diff --git a/bench/tensors/README b/bench/tensors/README index 3a5fdbe17..c4b742749 100644 --- a/bench/tensors/README +++ b/bench/tensors/README @@ -14,8 +14,12 @@ nvcc tensor_benchmarks_fp16_gpu.cu benchmark_main.cc -I ../../ -std=c++11 -O2 -D last but not least, we also provide a suite of benchmarks to measure the scalability of the contraction code on CPU. To compile these benchmarks, call g++ contraction_benchmarks_cpu.cc benchmark_main.cc -I ../../ -std=c++11 -O3 -DNDEBUG -pthread -mavx -o benchmarks_cpu -To compile the benchmark for SYCL, using ComputeCpp you currently need 2 passes (only for translation units containing device code): +To compile and run the benchmark for SYCL, using ComputeCpp you currently need following passes (only for translation units containing device code): 1. The device compilation pass that generates the device code (SYCL kernels and referenced device functions) and glue code needed by the host compiler to reference the device code from host code. -{ComputeCpp_ROOT}/bin/compute++ -I ../../ -I {ComputeCpp_ROOT}/include/ -std=c++11 -mllvm -inline-threshold=1000 -Wno-ignored-attributes -sycl -intelspirmetadata -emit-llvm -no-serial-memop -sycl-compress-name -DBUILD_PLATFORM_SPIR -DNDBUG -O3 -c tensor_benchmarks_sycl.cc +{ComputeCpp_ROOT}/bin/compute++ -I ../../ -I {ComputeCpp_ROOT}/include/ -std=c++11 -mllvm -inline-threshold=1000 -Wno-ignored-attributes -sycl -intelspirmetadata -emit-llvm -no-serial-memop -sycl-compress-name -DBUILD_PLATFORM_SPIR -DNDBUG -O3 -c tensor_benchmarks_sycl.cc -DEIGEN_USE_SYCL=1 2. The host compilation pass that generates the final host binary. 
-clang++-3.7 -include tensor_benchmarks_sycl.sycl benchmark_main.cc tensor_benchmarks_sycl.cc -pthread -I ../../ -I {ComputeCpp_ROOT}/include/ -L {ComputeCpp_ROOT}/lib/ -lComputeCpp -lOpenCL -D_GLIBCXX_USE_CXX11_ABI=0 -std=c++11 -o tensor_benchmark_sycl +clang++ -c benchmark_main.cc -pthread -I ../../ -D_GLIBCXX_USE_CXX11_ABI=0 -DEIGEN_USE_SYCL=1 -std=c++11 -o benchmark_main.o +clang++ tensor_benchmarks_sycl_include_headers.cc -pthread -I ../../ -I {ComputeCpp_ROOT}/include/ -L {ComputeCpp_ROOT}/lib/ -lComputeCpp -lOpenCL -D_GLIBCXX_USE_CXX11_ABI=0 -DEIGEN_USE_SYCL=1 -std=c++11 benchmark_main.o -o tensor_benchmark_sycl +export LD_LIBRARY_PATH={ComputeCpp_ROOT}/lib +3. Run the benchmark +./tensor_benchmark_sycl diff --git a/bench/tensors/tensor_benchmarks.h b/bench/tensors/tensor_benchmarks.h index c2fb3dede..325026113 100644 --- a/bench/tensors/tensor_benchmarks.h +++ b/bench/tensors/tensor_benchmarks.h @@ -35,6 +35,11 @@ template class BenchmarkSuite { void memcpy(int num_iters) { eigen_assert(m_ == k_ && k_ == n_); +#ifdef EIGEN_USE_SYCL // warmup for sycl + for (int iter = 0; iter < 10; ++iter) { + device_.memcpy(c_, a_, m_ * m_ * sizeof(T)); + } +#endif StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { device_.memcpy(c_, a_, m_ * m_ * sizeof(T)); @@ -55,7 +60,11 @@ template class BenchmarkSuite { } const TensorMap, Eigen::Aligned> A((int*)a_, sizes); TensorMap, Eigen::Aligned> B(b_, sizes); - +#ifdef EIGEN_USE_SYCL // warmup for sycl + for (int iter = 0; iter < 10; ++iter) { + B.device(device_) = A.template cast(); + } +#endif StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { B.device(device_) = A.template cast(); @@ -70,7 +79,6 @@ template class BenchmarkSuite { sizes[0] = m_; sizes[1] = m_; TensorMap, Eigen::Aligned> C(c_, sizes); - StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { C.device(device_) = C.random(); @@ -93,7 +101,18 @@ template class BenchmarkSuite { const Eigen::DSizes second_quadrant(0, m_/2); const Eigen::DSizes third_quadrant(m_/2, 0); const Eigen::DSizes fourth_quadrant(m_/2, m_/2); - +#ifdef EIGEN_USE_SYCL // warmup for sycl + for (int iter = 0; iter < 10; ++iter) { + C.slice(first_quadrant, quarter_sizes).device(device_) = + A.slice(first_quadrant, quarter_sizes); + C.slice(second_quadrant, quarter_sizes).device(device_) = + B.slice(second_quadrant, quarter_sizes); + C.slice(third_quadrant, quarter_sizes).device(device_) = + A.slice(third_quadrant, quarter_sizes); + C.slice(fourth_quadrant, quarter_sizes).device(device_) = + B.slice(fourth_quadrant, quarter_sizes); + } +#endif StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { C.slice(first_quadrant, quarter_sizes).device(device_) = @@ -118,7 +137,11 @@ template class BenchmarkSuite { Eigen::array output_size; output_size[0] = n_; TensorMap, Eigen::Aligned> C(c_, output_size); - +#ifdef EIGEN_USE_SYCL // warmup for sycl + for (int iter = 0; iter < 10; ++iter) { + C.device(device_) = B.chip(iter % k_, 0); + } +#endif StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { C.device(device_) = B.chip(iter % k_, 0); @@ -135,7 +158,11 @@ template class BenchmarkSuite { Eigen::array output_size; output_size[0] = n_; TensorMap, Eigen::Aligned> C(c_, output_size); - +#ifdef EIGEN_USE_SYCL // warmup for sycl + for (int iter = 0; iter < 10; ++iter) { + C.device(device_) = B.chip(iter % n_, 1); + } +#endif StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { C.device(device_) = B.chip(iter % n_, 1); @@ -158,7 
+185,11 @@ template class BenchmarkSuite { Eigen::array shuffle; shuffle[0] = 1; shuffle[1] = 0; - +#ifdef EIGEN_USE_SYCL // warmup for sycl + for (int iter = 0; iter < 10; ++iter) { + B.device(device_) = A.shuffle(shuffle); + } +#endif StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { B.device(device_) = A.shuffle(shuffle); @@ -186,7 +217,11 @@ template class BenchmarkSuite { paddings[0] = Eigen::IndexPair(0, 0); paddings[1] = Eigen::IndexPair(2, 1); #endif - +#ifdef EIGEN_USE_SYCL // warmup for sycl + for (int iter = 0; iter < 10; ++iter) { + B.device(device_) = A.pad(paddings); + } +#endif StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { B.device(device_) = A.pad(paddings); @@ -216,6 +251,11 @@ template class BenchmarkSuite { Eigen::IndexList, Eigen::type2index<2> > strides; #endif +#ifdef EIGEN_USE_SYCL // warmup for sycl + for (int iter = 0; iter < 10; ++iter) { + B.device(device_) = A.stride(strides); + } +#endif StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { B.device(device_) = A.stride(strides); @@ -245,6 +285,11 @@ template class BenchmarkSuite { broadcast.set(1, n_); #endif +#ifdef EIGEN_USE_SYCL // warmup for sycl + for (int iter = 0; iter < 10; ++iter) { + C.device(device_) = A.broadcast(broadcast); + } +#endif StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { C.device(device_) = A.broadcast(broadcast); @@ -261,7 +306,11 @@ template class BenchmarkSuite { const TensorMap, Eigen::Aligned> A(a_, sizes); const TensorMap, Eigen::Aligned> B(b_, sizes); TensorMap, Eigen::Aligned> C(c_, sizes); - +#ifdef EIGEN_USE_SYCL // warmup for sycl + for (int iter = 0; iter < 10; ++iter) { + C.device(device_) = A * A.constant(static_cast(3.14)) + B * B.constant(static_cast(2.7)); + } +#endif StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { C.device(device_) = A * A.constant(static_cast(3.14)) + B * B.constant(static_cast(2.7)); @@ -280,6 +329,11 @@ template class BenchmarkSuite { const TensorMap, Eigen::Aligned> B(b_, sizes); TensorMap, Eigen::Aligned> C(c_, sizes); +#ifdef EIGEN_USE_SYCL // warmup for sycl +for (int iter = 0; iter < 10; ++iter) { + C.device(device_) = A.rsqrt() + B.sqrt() * B.square(); +} +#endif StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { C.device(device_) = A.rsqrt() + B.sqrt() * B.square(); @@ -297,7 +351,11 @@ template class BenchmarkSuite { const TensorMap, Eigen::Aligned> A(a_, sizes); const TensorMap, Eigen::Aligned> B(b_, sizes); TensorMap, Eigen::Aligned> C(c_, sizes); - +#ifdef EIGEN_USE_SYCL // warmup for sycl + for (int iter = 0; iter < 10; ++iter) { + C.device(device_) = A.exp() + B.log(); + } +#endif StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { C.device(device_) = A.exp() + B.log(); @@ -325,7 +383,11 @@ template class BenchmarkSuite { // optimize the code. Eigen::IndexList> sum_along_dim; #endif - +#ifdef EIGEN_USE_SYCL // warmup for sycl + for (int iter = 0; iter < 10; ++iter) { + C.device(device_) = B.sum(sum_along_dim); + } +#endif StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { C.device(device_) = B.sum(sum_along_dim); @@ -355,7 +417,11 @@ template class BenchmarkSuite { // optimize the code. 
Eigen::IndexList> sum_along_dim; #endif - +#ifdef EIGEN_USE_SYCL // warmup for sycl + for (int iter = 0; iter < 10; ++iter) { + C.device(device_) = B.sum(sum_along_dim); + } +#endif StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { C.device(device_) = B.sum(sum_along_dim); @@ -375,7 +441,11 @@ template class BenchmarkSuite { Eigen::array output_size; TensorMap, Eigen::Aligned> C( c_, output_size); - +#ifdef EIGEN_USE_SYCL // warmup for sycl + for (int iter = 0; iter < 10; ++iter) { + C.device(device_) = B.sum(); + } +#endif StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { C.device(device_) = B.sum(); @@ -404,7 +474,11 @@ template class BenchmarkSuite { typedef typename Tensor::DimensionPair DimPair; Eigen::array dims; dims[0] = DimPair(1, 0); - +#ifdef EIGEN_USE_SYCL // warmup for sycl + for (int iter = 0; iter < 10; ++iter) { + C.device(device_) = A.contract(B, dims); + } +#endif StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { C.device(device_) = A.contract(B, dims); @@ -430,7 +504,11 @@ template class BenchmarkSuite { Eigen::array dims; dims[0] = 0; dims[1] = 1; - +#ifdef EIGEN_USE_SYCL // warmup for sycl + for (int iter = 0; iter < 10; ++iter) { + C.device(device_) = A.convolve(B, dims); + } +#endif StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { C.device(device_) = A.convolve(B, dims); diff --git a/bench/tensors/tensor_benchmarks_sycl.cc b/bench/tensors/tensor_benchmarks_sycl.cc index 6df190869..cb6daac15 100644 --- a/bench/tensors/tensor_benchmarks_sycl.cc +++ b/bench/tensors/tensor_benchmarks_sycl.cc @@ -1,20 +1,73 @@ -#define EIGEN_USE_SYCL +#ifdef EIGEN_USE_SYCL #include #include #include "tensor_benchmarks.h" -#define BM_FuncGPU(FUNC) \ - static void BM_##FUNC(int iters, int N) { \ - StopBenchmarkTiming(); \ - cl::sycl::gpu_selector selector; \ - Eigen::QueueInterface queue(selector); \ - Eigen::SyclDevice device(&queue); \ - BenchmarkSuite suite(device, N); \ - suite.FUNC(iters); \ - } \ +#define BM_FuncGPU(FUNC) \ + static void BM_##FUNC(int iters, int N) { \ + StopBenchmarkTiming(); \ + cl::sycl::gpu_selector selector; \ + Eigen::QueueInterface queue(selector); \ + Eigen::SyclDevice device(&queue); \ + BenchmarkSuite suite(device, N); \ + suite.FUNC(iters); \ + } \ BENCHMARK_RANGE(BM_##FUNC, 10, 5000); +BM_FuncGPU(memcpy); +BM_FuncGPU(typeCasting); +BM_FuncGPU(slicing); +BM_FuncGPU(rowChip); +BM_FuncGPU(colChip); +BM_FuncGPU(shuffling); +BM_FuncGPU(padding); +BM_FuncGPU(striding); BM_FuncGPU(broadcasting); BM_FuncGPU(coeffWiseOp); +BM_FuncGPU(algebraicFunc); +BM_FuncGPU(transcendentalFunc); +BM_FuncGPU(rowReduction); +BM_FuncGPU(colReduction); +BM_FuncGPU(fullReduction); + + +// Contractions +#define BM_FuncWithInputDimsGPU(FUNC, D1, D2, D3) \ + static void BM_##FUNC##_##D1##x##D2##x##D3(int iters, int N) { \ + StopBenchmarkTiming(); \ + cl::sycl::gpu_selector selector; \ + Eigen::QueueInterface queue(selector); \ + Eigen::SyclDevice device(&queue); \ + BenchmarkSuite suite(device, D1, D2, D3); \ + suite.FUNC(iters); \ + } \ + BENCHMARK_RANGE(BM_##FUNC##_##D1##x##D2##x##D3, 10, 5000); + + +BM_FuncWithInputDimsGPU(contraction, N, N, N); +BM_FuncWithInputDimsGPU(contraction, 64, N, N); +BM_FuncWithInputDimsGPU(contraction, N, 64, N); +BM_FuncWithInputDimsGPU(contraction, N, N, 64); + + +// Convolutions +#define BM_FuncWithKernelDimsGPU(FUNC, DIM1, DIM2) \ + static void BM_##FUNC##_##DIM1##x##DIM2(int iters, int N) { \ + StopBenchmarkTiming(); \ + cl::sycl::gpu_selector selector; \ + 
Eigen::QueueInterface queue(selector); \ + Eigen::SyclDevice device(&queue); \ + BenchmarkSuite suite(device, N); \ + suite.FUNC(iters, DIM1, DIM2); \ + } \ + BENCHMARK_RANGE(BM_##FUNC##_##DIM1##x##DIM2, 128, 5000); + +BM_FuncWithKernelDimsGPU(convolution, 7, 1); +BM_FuncWithKernelDimsGPU(convolution, 1, 7); +BM_FuncWithKernelDimsGPU(convolution, 7, 4); +BM_FuncWithKernelDimsGPU(convolution, 4, 7); +BM_FuncWithKernelDimsGPU(convolution, 7, 64); +BM_FuncWithKernelDimsGPU(convolution, 64, 7); +#endif diff --git a/bench/tensors/tensor_benchmarks_sycl_include_headers.cc b/bench/tensors/tensor_benchmarks_sycl_include_headers.cc new file mode 100644 index 000000000..4b3110b85 --- /dev/null +++ b/bench/tensors/tensor_benchmarks_sycl_include_headers.cc @@ -0,0 +1,2 @@ +#include "/home/mehdi/Projects/upstr_benoit/upstr_7MAR17/bench/tensors/tensor_benchmarks_sycl.cc" +#include "/home/mehdi/Projects/upstr_benoit/upstr_7MAR17/bench/tensors/tensor_benchmarks_sycl.sycl" diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h index 23297a0a7..5fa7423d0 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h @@ -149,16 +149,27 @@ m_queue(cl::sycl::queue(s, [&](cl::sycl::exception_list l) { std::lock_guard lock(mutex_); buffer_map.clear(); } - //FIXME: currently we have to switch back to write as discard_write doesnot work in forloop /// The memcpyHostToDevice is used to copy the device only pointer to a host pointer. Using the device - /// pointer created as a key we find the sycl buffer and get the host accessor with discard_write mode - /// on it. Using a discard_write accessor guarantees that we do not bring back the current value of the - /// buffer to host. Then we use the memcpy to copy the data to the host accessor. The first time that + /// pointer created as a key we find the sycl buffer and get the host accessor with write mode + /// on it. Then we use the memcpy to copy the data to the host accessor. The first time that /// this buffer is accessed, the data will be copied to the device. + /// In this case we can separate the kernel actual execution from data transfer which is required for benchmark + /// Also, this is faster as it uses the map_allocator instead of memcpy template EIGEN_STRONG_INLINE void memcpyHostToDevice(Index *dst, const Index *src, size_t n) const { - std::lock_guard lock(mutex_); - auto host_acc= find_buffer(dst)->second. template get_access(); - ::memcpy(host_acc.get_pointer(), src, n); + auto it =find_buffer(dst); + auto offset =static_cast(static_cast(dst))- it->first; + offset/=sizeof(Index); + size_t rng, GRange, tileSize; + parallel_for_setup(n/sizeof(Index), tileSize, rng, GRange); + auto src_buf = cl::sycl::buffer >(static_cast(static_cast(const_cast(src))), cl::sycl::range<1>(n)); + m_queue.submit([&](cl::sycl::handler &cgh) { + auto dst_acc= it->second.template get_access(cgh); + auto src_acc =src_buf.template get_access(cgh); + typedef decltype(src_acc) read_accessor; + typedef decltype(dst_acc) write_accessor; + cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor(src_acc, dst_acc, rng, offset, 0)); + }); + synchronize(); } /// The memcpyDeviceToHost is used to copy the data from host to device. Here, in order to avoid double copying the data. We create a sycl /// buffer with map_allocator for the destination pointer with a discard_write accessor on it. 
The lifespan of the buffer is bound to the @@ -167,7 +178,6 @@ m_queue(cl::sycl::queue(s, [&](cl::sycl::exception_list l) { /// would be available on the dst pointer using fast copy technique (map_allocator). In this case we can make sure that we copy the data back /// to the cpu only once per function call. template EIGEN_STRONG_INLINE void memcpyDeviceToHost(void *dst, const Index *src, size_t n) const { - std::lock_guard lock(mutex_); auto it =find_buffer(src); auto offset =static_cast(static_cast(src))- it->first; offset/=sizeof(Index); @@ -186,7 +196,6 @@ m_queue(cl::sycl::queue(s, [&](cl::sycl::exception_list l) { /// the memcpy function template EIGEN_STRONG_INLINE void memcpy(void *dst, const Index *src, size_t n) const { - std::lock_guard lock(mutex_); auto it1 = find_buffer(static_cast(src)); auto it2 = find_buffer(dst); auto offset= (static_cast(static_cast(src))) - it1->first; @@ -206,7 +215,6 @@ m_queue(cl::sycl::queue(s, [&](cl::sycl::exception_list l) { } EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const { - std::lock_guard lock(mutex_); size_t rng, GRange, tileSize; parallel_for_setup(n, tileSize, rng, GRange); auto it1 = find_buffer(static_cast(data)); @@ -220,18 +228,15 @@ m_queue(cl::sycl::queue(s, [&](cl::sycl::exception_list l) { /// the function then adds an entry by creating a sycl buffer for that particular pointer. template EIGEN_STRONG_INLINE cl::sycl::accessor get_sycl_accessor(cl::sycl::handler &cgh, const void* ptr) const { - std::lock_guard lock(mutex_); return (find_buffer(ptr)->second.template get_access(cgh)); } /// Accessing the created sycl device buffer for the device pointer EIGEN_STRONG_INLINE cl::sycl::buffer& get_sycl_buffer(const void * ptr) const { - std::lock_guard lock(mutex_); return find_buffer(ptr)->second; } EIGEN_STRONG_INLINE ptrdiff_t get_offset(const void *ptr) const { - std::lock_guard lock(mutex_); return (static_cast(ptr))-(find_buffer(ptr)->first); } @@ -375,7 +380,9 @@ private: mutable std::map> buffer_map; /// sycl queue mutable cl::sycl::queue m_queue; + EIGEN_STRONG_INLINE std::map>::iterator find_buffer(const void* ptr) const { + std::lock_guard lock(mutex_); auto it1 = buffer_map.find(static_cast(ptr)); if (it1 != buffer_map.end()){ return it1; -- cgit v1.2.3 From aadb7405a7362ce0160d8ecb3843dc33a59e809a Mon Sep 17 00:00:00 2001 From: Mehdi Goli Date: Wed, 8 Mar 2017 18:20:06 +0000 Subject: Fixing typo in sycl Benchmark. --- bench/tensors/README | 4 ++-- bench/tensors/tensor_benchmarks_sycl_include_headers.cc | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/bench/tensors/README b/bench/tensors/README index c4b742749..69342cc9c 100644 --- a/bench/tensors/README +++ b/bench/tensors/README @@ -18,8 +18,8 @@ To compile and run the benchmark for SYCL, using ComputeCpp you currently need f 1. The device compilation pass that generates the device code (SYCL kernels and referenced device functions) and glue code needed by the host compiler to reference the device code from host code. {ComputeCpp_ROOT}/bin/compute++ -I ../../ -I {ComputeCpp_ROOT}/include/ -std=c++11 -mllvm -inline-threshold=1000 -Wno-ignored-attributes -sycl -intelspirmetadata -emit-llvm -no-serial-memop -sycl-compress-name -DBUILD_PLATFORM_SPIR -DNDBUG -O3 -c tensor_benchmarks_sycl.cc -DEIGEN_USE_SYCL=1 2. The host compilation pass that generates the final host binary. 
-clang++ -c benchmark_main.cc -pthread -I ../../ -D_GLIBCXX_USE_CXX11_ABI=0 -DEIGEN_USE_SYCL=1 -std=c++11 -o benchmark_main.o -clang++ tensor_benchmarks_sycl_include_headers.cc -pthread -I ../../ -I {ComputeCpp_ROOT}/include/ -L {ComputeCpp_ROOT}/lib/ -lComputeCpp -lOpenCL -D_GLIBCXX_USE_CXX11_ABI=0 -DEIGEN_USE_SYCL=1 -std=c++11 benchmark_main.o -o tensor_benchmark_sycl +clang++ -O3 -c benchmark_main.cc -pthread -I ../../ -D_GLIBCXX_USE_CXX11_ABI=0 -DEIGEN_USE_SYCL=1 -std=c++11 -o benchmark_main.o +clang++ -O3 tensor_benchmarks_sycl_include_headers.cc -pthread -I ../../ -I {ComputeCpp_ROOT}/include/ -L {ComputeCpp_ROOT}/lib/ -lComputeCpp -lOpenCL -D_GLIBCXX_USE_CXX11_ABI=0 -DEIGEN_USE_SYCL=1 -std=c++11 benchmark_main.o -o tensor_benchmark_sycl export LD_LIBRARY_PATH={ComputeCpp_ROOT}/lib 3. Run the benchmark ./tensor_benchmark_sycl diff --git a/bench/tensors/tensor_benchmarks_sycl_include_headers.cc b/bench/tensors/tensor_benchmarks_sycl_include_headers.cc index 4b3110b85..bcc3c4c79 100644 --- a/bench/tensors/tensor_benchmarks_sycl_include_headers.cc +++ b/bench/tensors/tensor_benchmarks_sycl_include_headers.cc @@ -1,2 +1,2 @@ -#include "/home/mehdi/Projects/upstr_benoit/upstr_7MAR17/bench/tensors/tensor_benchmarks_sycl.cc" -#include "/home/mehdi/Projects/upstr_benoit/upstr_7MAR17/bench/tensors/tensor_benchmarks_sycl.sycl" +#include "tensor_benchmarks_sycl.cc" +#include "tensor_benchmarks_sycl.sycl" -- cgit v1.2.3 From 1b32a10053a942b1c6010afd719b44393b115d42 Mon Sep 17 00:00:00 2001 From: Luke Iwanski Date: Wed, 8 Mar 2017 18:26:34 +0000 Subject: Use name to distinguish name instead of the vendor --- unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h index 5fa7423d0..ed21d7b56 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h @@ -76,7 +76,7 @@ EIGEN_STRONG_INLINE auto get_sycl_supported_devices()->decltype(cl::sycl::device std::vector::iterator it =devices.begin(); while(it!=devices.end()) { ///FIXME: Currently there is a bug in amd cpu OpenCL - auto s= (*it).template get_info(); + auto s= (*it).template get_info(); std::transform(s.begin(), s.end(), s.begin(), ::tolower); if((*it).is_cpu() && s.find("amd")!=std::string::npos && s.find("apu") == std::string::npos){ // remove amd cpu as it is not supported by computecpp allow APUs it=devices.erase(it); -- cgit v1.2.3 From f499fe9496e7c5e6f70d4bdcfb6ed9088795431a Mon Sep 17 00:00:00 2001 From: Mehdi Goli Date: Mon, 13 Mar 2017 09:18:37 +0000 Subject: Adding synchronisation to convolution kernel for sycl backend. 
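SYCL queue submission returns before the kernel has finished, so a benchmark that stops its timer immediately after submitting mostly measures launch overhead; that is why the diff below calls device_.synchronize() before StopBenchmarkTiming(), and why the convolution launcher gains a synchronisation point as well. The standalone sketch below illustrates the effect using only standard C++; FakeAsyncDevice, submit() and the 50 ms workload are invented stand-ins, not Eigen or SYCL API.

#include <chrono>
#include <future>
#include <iostream>
#include <thread>

// FakeAsyncDevice mimics a device whose submit() returns immediately,
// like a SYCL queue; synchronize() blocks until the submitted work is done.
struct FakeAsyncDevice {
  std::future<void> pending;

  void submit(std::chrono::milliseconds work) {
    pending = std::async(std::launch::async,
                         [work] { std::this_thread::sleep_for(work); });
  }

  void synchronize() {
    if (pending.valid()) pending.wait();
  }
};

int main() {
  using clock = std::chrono::steady_clock;
  using ms = std::chrono::milliseconds;

  FakeAsyncDevice device;
  auto t0 = clock::now();
  device.submit(ms(50));
  // Without a wait, only the (tiny) submission cost is observed.
  auto submit_only = std::chrono::duration_cast<ms>(clock::now() - t0);

  device.synchronize();
  // After synchronizing, the measurement covers the actual work.
  auto with_sync = std::chrono::duration_cast<ms>(clock::now() - t0);

  std::cout << "submit only: " << submit_only.count() << " ms, "
            << "after synchronize: " << with_sync.count() << " ms\n";
}
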
--- bench/tensors/tensor_benchmarks.h | 5 +++++ unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h | 1 + 2 files changed, 6 insertions(+) diff --git a/bench/tensors/tensor_benchmarks.h b/bench/tensors/tensor_benchmarks.h index 325026113..3a640ede4 100644 --- a/bench/tensors/tensor_benchmarks.h +++ b/bench/tensors/tensor_benchmarks.h @@ -539,6 +539,11 @@ for (int iter = 0; iter < 10; ++iter) { if (Eigen::internal::is_same::value) { device_.synchronize(); } +#elif defined(EIGEN_USE_SYCL) + if (Eigen::internal::is_same::value) { + device_.synchronize(); + } + #endif StopBenchmarkTiming(); SetBenchmarkFlopsProcessed(num_items); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h index 5db16d559..2e6021b1e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h @@ -425,6 +425,7 @@ struct TensorEvaluator Date: Wed, 15 Mar 2017 19:26:08 +0000 Subject: Fixes bug in get_sycl_supported_devices() that was reporting unsupported Intel CPU on AMD platform - causing timeouts in that configuration --- unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h index ed21d7b56..e9c3dc0a0 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h @@ -76,13 +76,16 @@ EIGEN_STRONG_INLINE auto get_sycl_supported_devices()->decltype(cl::sycl::device std::vector::iterator it =devices.begin(); while(it!=devices.end()) { ///FIXME: Currently there is a bug in amd cpu OpenCL - auto s= (*it).template get_info(); - std::transform(s.begin(), s.end(), s.begin(), ::tolower); - if((*it).is_cpu() && s.find("amd")!=std::string::npos && s.find("apu") == std::string::npos){ // remove amd cpu as it is not supported by computecpp allow APUs - it=devices.erase(it); + auto name = (*it).template get_info(); + std::transform(name.begin(), name.end(), name.begin(), ::tolower); + auto vendor = (*it).template get_info(); + std::transform(vendor.begin(), vendor.end(), vendor.begin(), ::tolower); + + if((*it).is_cpu() && vendor.find("amd")!=std::string::npos && vendor.find("apu") == std::string::npos){ // remove amd cpu as it is not supported by computecpp allow APUs + it = devices.erase(it); //FIXME: currently there is a bug in intel gpu driver regarding memory allignment issue. 
- }else if((*it).is_gpu() && s.find("intel")!=std::string::npos){ - it=devices.erase(it); + }else if((*it).is_gpu() && name.find("intel")!=std::string::npos){ + it = devices.erase(it); } else{ ++it; -- cgit v1.2.3 From 9597d6f6aba6091f986fbe2348106dc96a24e34e Mon Sep 17 00:00:00 2001 From: Luke Iwanski Date: Wed, 15 Mar 2017 19:28:09 +0000 Subject: Temporary: Disables cxx11_tensor_argmax_sycl test since it is causing zombie thread --- unsupported/test/cxx11_tensor_argmax_sycl.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/unsupported/test/cxx11_tensor_argmax_sycl.cpp b/unsupported/test/cxx11_tensor_argmax_sycl.cpp index 9b22f1eca..b80cb8f06 100644 --- a/unsupported/test/cxx11_tensor_argmax_sycl.cpp +++ b/unsupported/test/cxx11_tensor_argmax_sycl.cpp @@ -242,7 +242,9 @@ template void sycl_argmax_test_per_ } void test_cxx11_tensor_argmax_sycl() { - for (const auto& device :Eigen::get_sycl_supported_devices()) { +// TODO {lukei}: re-enable once fixed +/* for (const auto& device :Eigen::get_sycl_supported_devices()) { CALL_SUBTEST(sycl_argmax_test_per_device(device)); } +*/ } -- cgit v1.2.3 From a91417a7a5a210f424b8cfec4b2bc1e00aa340be Mon Sep 17 00:00:00 2001 From: Luke Iwanski Date: Mon, 20 Mar 2017 14:48:54 +0000 Subject: Introduces align allocator for SYCL buffer --- .../Eigen/CXX11/src/Tensor/TensorDeviceSycl.h | 30 +++++++++++++++------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h index e9c3dc0a0..c5142b7c9 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h @@ -15,6 +15,17 @@ #if defined(EIGEN_USE_SYCL) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H) #define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H +template > +struct SyclAllocator { + typedef Scalar value_type; + typedef typename std::allocator_traits::pointer pointer; + typedef typename std::allocator_traits::size_type size_type; + + SyclAllocator( ){}; + Scalar* allocate(std::size_t elements) { return static_cast(aligned_alloc(Align, elements)); } + void deallocate(Scalar * p, std::size_t size) { EIGEN_UNUSED_VARIABLE(size); free(p); } +}; + namespace Eigen { #define ConvertToActualTypeSycl(Scalar, buf_acc) reinterpret_cast::pointer_t>((&(*buf_acc.get_pointer()))) @@ -56,11 +67,11 @@ template }; struct memsetCghFunctor{ - cl::sycl::buffer& m_buf; + cl::sycl::buffer >& m_buf; const ptrdiff_t& buff_offset; const size_t& rng , GRange, tileSize; const int &c; - memsetCghFunctor(cl::sycl::buffer& buff, const ptrdiff_t& buff_offset_, const size_t& rng_, const size_t& GRange_, const size_t& tileSize_, const int& c_) + memsetCghFunctor(cl::sycl::buffer >& buff, const ptrdiff_t& buff_offset_, const size_t& rng_, const size_t& GRange_, const size_t& tileSize_, const int& c_) :m_buf(buff), buff_offset(buff_offset_), rng(rng_), GRange(GRange_), tileSize(tileSize_), c(c_){} void operator()(cl::sycl::handler &cgh) const { @@ -124,6 +135,7 @@ m_queue(cl::sycl::queue(s, [&](cl::sycl::exception_list l) { })) #endif {} + /// Allocating device pointer. This pointer is actually an 8 bytes host pointer used as key to access the sycl device buffer. /// The reason is that we cannot use device buffer as a pointer as a m_data in Eigen leafNode expressions. So we create a key /// pointer to be used in Eigen expression construction. 
When we convert the Eigen construction into the sycl construction we @@ -131,10 +143,10 @@ m_queue(cl::sycl::queue(s, [&](cl::sycl::exception_list l) { /// The device pointer would be deleted by calling deallocate function. EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { std::lock_guard lock(mutex_); - auto buf = cl::sycl::buffer(cl::sycl::range<1>(num_bytes)); + auto buf = cl::sycl::buffer >(cl::sycl::range<1>(num_bytes)); auto ptr =buf.get_access().get_pointer(); buf.set_final_data(nullptr); - buffer_map.insert(std::pair>(static_cast(ptr),buf)); + buffer_map.insert(std::pair > >(static_cast(ptr),buf)); return static_cast(ptr); } @@ -235,7 +247,7 @@ m_queue(cl::sycl::queue(s, [&](cl::sycl::exception_list l) { } /// Accessing the created sycl device buffer for the device pointer - EIGEN_STRONG_INLINE cl::sycl::buffer& get_sycl_buffer(const void * ptr) const { + EIGEN_STRONG_INLINE cl::sycl::buffer >& get_sycl_buffer(const void * ptr) const { return find_buffer(ptr)->second; } @@ -380,18 +392,18 @@ private: /// std::map is the container used to make sure that we create only one buffer /// per pointer. The lifespan of the buffer now depends on the lifespan of SyclDevice. /// If a non-read-only pointer is needed to be accessed on the host we should manually deallocate it. - mutable std::map> buffer_map; + mutable std::map > > buffer_map; /// sycl queue mutable cl::sycl::queue m_queue; - EIGEN_STRONG_INLINE std::map>::iterator find_buffer(const void* ptr) const { + EIGEN_STRONG_INLINE std::map > >::iterator find_buffer(const void* ptr) const { std::lock_guard lock(mutex_); auto it1 = buffer_map.find(static_cast(ptr)); if (it1 != buffer_map.end()){ return it1; } else{ - for(std::map>::iterator it=buffer_map.begin(); it!=buffer_map.end(); ++it){ + for(std::map > >::iterator it=buffer_map.begin(); it!=buffer_map.end(); ++it){ auto size = it->second.get_size(); if((it->first < (static_cast(ptr))) && ((static_cast(ptr)) < (it->first + size)) ) return it; } @@ -416,7 +428,7 @@ struct SyclDevice { } /// Accessing the created sycl device buffer for the device pointer - EIGEN_STRONG_INLINE cl::sycl::buffer& get_sycl_buffer(const void * ptr) const { + EIGEN_STRONG_INLINE cl::sycl::buffer >& get_sycl_buffer(const void * ptr) const { return m_queue_stream->get_sycl_buffer(ptr); } -- cgit v1.2.3 From bd64ee8555559ee13f02f2921594b4bd224f9d00 Mon Sep 17 00:00:00 2001 From: Mehdi Goli Date: Tue, 28 Mar 2017 16:50:34 +0100 Subject: Fixing TensorArgMaxSycl.h; Removing warning related to the hardcoded type of dims to be int in Argmax. 
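Two things happen in the diff that follows: the hard-coded int used for the ArgMax return dimension is widened to the tensor's Index type, and several members (m_strides, m_stride_mod, m_stride_div, patch_dims, m_xpr_dims) change from const references to values. The second change matters because a reference member initialised from a temporary returned by value is left dangling once construction completes. The snippet below is a minimal standalone sketch of that hazard; Op, Dims and the two evaluators are invented names, not Eigen types.

#include <array>
#include <iostream>

typedef std::array<long, 4> Dims;

// Op returns its dimensions by value, i.e. as a temporary.
struct Op {
  Dims dims() const { return Dims{{2, 3, 5, 7}}; }
};

// Binding a reference member to that temporary leaves it dangling as soon
// as the constructor finishes: reading dims_ afterwards is undefined behaviour.
struct BadEvaluator {
  const Dims& dims_;
  explicit BadEvaluator(const Op& op) : dims_(op.dims()) {}
};

// Storing a copy keeps the data alive for the evaluator's whole lifetime.
struct GoodEvaluator {
  const Dims dims_;
  explicit GoodEvaluator(const Op& op) : dims_(op.dims()) {}
};

int main() {
  Op op;
  GoodEvaluator good(op);
  std::cout << good.dims_[0] << "\n";                         // prints 2
  // BadEvaluator bad(op); std::cout << bad.dims_[0] << "\n"; // dangling read
}
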
--- unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h | 10 +++++----- .../Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h | 20 ++++++++++---------- unsupported/Eigen/CXX11/src/Tensor/TensorBase.h | 8 ++++---- unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h | 2 +- unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h | 2 +- .../CXX11/src/Tensor/TensorSyclExtractFunctors.h | 10 +++++----- unsupported/test/cxx11_tensor_argmax_sycl.cpp | 9 ++------- 7 files changed, 28 insertions(+), 33 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h index e81001c6e..44b79e725 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h @@ -178,7 +178,7 @@ class TensorTupleReducerOp : public TensorBase, Devi EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } #else // following functions are required by sycl EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TupleType* data() const { return m_impl.data(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int return_dim() const {return m_return_dim;} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index return_dim() const {return m_return_dim;} EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const StrideDims& strides() const {return m_strides;} EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& stride_mod() const {return m_stride_mod;} EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& stride_div() const {return m_stride_div;} @@ -303,7 +303,7 @@ struct TensorEvaluator, Devi protected: TensorEvaluator, Device> m_orig_impl; TensorEvaluator >, Device> m_impl; - const int m_return_dim; + const Index m_return_dim; StrideDims m_strides; Index m_stride_mod; Index m_stride_div; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h index 90cbe004f..8f76b8254 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h @@ -61,9 +61,9 @@ class TensorTupleReducerDeviceOp : public TensorBase::type m_xpr; - const int m_return_dim; - const StrideDims& m_strides; + const Index m_return_dim; + const StrideDims m_strides; const Index m_stride_mod; const Index m_stride_div; }; @@ -137,10 +137,10 @@ typedef typename MakeGlobalPointer m_impl; - const int m_return_dim; - const StrideDims& m_strides; - const Index& m_stride_mod; - const Index& m_stride_div; + const Index m_return_dim; + const StrideDims m_strides; + const Index m_stride_mod; + const Index m_stride_div; }; } // end namespace Eigen #endif //UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_ARGMAX_SYCL_HPP diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h index fbe340820..5b1235826 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -619,7 +619,7 @@ class TensorBase const array, const Derived> argmax() const { array in_dims; - for (int d = 0; d < NumDimensions; ++d) in_dims[d] = d; + for (Index d = 0; d < NumDimensions; ++d) in_dims[d] = d; return TensorTupleReducerOp< internal::ArgMaxTupleReducer >, const array, @@ -632,7 +632,7 @@ class TensorBase const array, const Derived> argmin() const { array in_dims; - for (int d = 0; d < NumDimensions; ++d) in_dims[d] = d; + for (Index d = 0; d < NumDimensions; ++d) in_dims[d] = d; return TensorTupleReducerOp< internal::ArgMinTupleReducer >, const array, @@ -643,7 +643,7 @@ class TensorBase const TensorTupleReducerOp< 
internal::ArgMaxTupleReducer >, const array, const Derived> - argmax(const int return_dim) const { + argmax(const Index return_dim) const { array in_dims; in_dims[0] = return_dim; return TensorTupleReducerOp< @@ -656,7 +656,7 @@ class TensorBase const TensorTupleReducerOp< internal::ArgMinTupleReducer >, const array, const Derived> - argmin(const int return_dim) const { + argmin(const Index return_dim) const { array in_dims; in_dims[0] = return_dim; return TensorTupleReducerOp< diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h index cead2eac8..1a105165d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h @@ -268,7 +268,7 @@ struct TensorEvaluator, Device> TensorEvaluator m_impl; // required by sycl - const PatchDim& patch_dims; + const PatchDim patch_dims; }; } // end namespace Eigen diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index e341e2e9b..597f3f9ae 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -791,7 +791,7 @@ static const bool RunningOnGPU = false; typename MakePointer_::Type m_result; const Device& m_device; - const Dims& m_xpr_dims; + const Dims m_xpr_dims; }; } // end namespace Eigen diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h index 942e9d307..a7905706d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h @@ -230,15 +230,15 @@ template\ typedef typename Evaluator::Index Index;\ typedef typename Eigen::internal::conditional, typename Evaluator::Dimensions >::type Dimensions;\ const Dimensions m_dimensions;\ - const int m_return_dim;\ + const Index m_return_dim;\ const StrideDims m_strides;\ const Index m_stride_mod;\ const Index m_stride_div;\ EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }\ - EIGEN_STRONG_INLINE int return_dim() const {return m_return_dim;}\ - EIGEN_STRONG_INLINE const StrideDims& strides() const {return m_strides;}\ - EIGEN_STRONG_INLINE const Index& stride_mod() const {return m_stride_mod;}\ - EIGEN_STRONG_INLINE const Index& stride_div() const {return m_stride_div;}\ + EIGEN_STRONG_INLINE Index return_dim() const {return m_return_dim;}\ + EIGEN_STRONG_INLINE const StrideDims strides() const {return m_strides;}\ + EIGEN_STRONG_INLINE const Index stride_mod() const {return m_stride_mod;}\ + EIGEN_STRONG_INLINE const Index stride_div() const {return m_stride_div;}\ FunctorExtractor(const TensorEvaluator, Device>& expr)\ : m_dimensions(DimConstr::getDim(expr.dimensions())), m_return_dim(expr.return_dim()),\ m_strides(expr.strides()), m_stride_mod(expr.stride_mod()), m_stride_div(expr.stride_div()){}\ diff --git a/unsupported/test/cxx11_tensor_argmax_sycl.cpp b/unsupported/test/cxx11_tensor_argmax_sycl.cpp index b80cb8f06..521a7f82c 100644 --- a/unsupported/test/cxx11_tensor_argmax_sycl.cpp +++ b/unsupported/test/cxx11_tensor_argmax_sycl.cpp @@ -143,10 +143,6 @@ static void test_sycl_argmax_dim(const Eigen::SyclDevice &sycl_device) } } - - - - template static void test_sycl_argmin_dim(const Eigen::SyclDevice &sycl_device) { @@ -242,9 +238,8 @@ template void sycl_argmax_test_per_ } void test_cxx11_tensor_argmax_sycl() { -// TODO {lukei}: re-enable once fixed -/* for 
(const auto& device :Eigen::get_sycl_supported_devices()) { + for (const auto& device :Eigen::get_sycl_supported_devices()) { CALL_SUBTEST(sycl_argmax_test_per_device(device)); } -*/ + } -- cgit v1.2.3 From 73fcaa319fcd12328e0577042862db471488fd5c Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 31 Mar 2017 08:22:25 -0700 Subject: Gate the sycl specific code under #ifdef sycl --- unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h | 9 +++++++-- unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h | 10 ++++++---- unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h | 8 ++++---- 3 files changed, 17 insertions(+), 10 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h index 44b79e725..85facfb64 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h @@ -228,7 +228,11 @@ struct TensorEvaluator, Devi EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_orig_impl(op.expression(), device), m_impl(op.expression().index_tuples().reduce(op.reduce_dims(), op.reduce_op()), device), - m_return_dim(op.return_dim()), m_device(device) { + m_return_dim(op.return_dim()) +#ifdef EIGEN_USE_SYCL + ,m_device(device) +#endif + { gen_strides(m_orig_impl.dimensions(), m_strides); if (Layout == static_cast(ColMajor)) { @@ -307,8 +311,9 @@ struct TensorEvaluator, Devi StrideDims m_strides; Index m_stride_mod; Index m_stride_div; - // required by sycl +#ifdef EIGEN_USE_SYCL const Device& m_device; +#endif }; } // end namespace Eigen diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h b/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h index c72d79435..7e4c129bb 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h @@ -140,8 +140,9 @@ struct TensorEvaluator, Devi EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_result; } - /// used by sycl in order to build the sycl buffer - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const{return m_device;} +#ifdef EIGEN_USE_SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const { return m_device; } +#endif protected: EIGEN_DEVICE_FUNC void evalTo(Scalar* data) { @@ -298,8 +299,9 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } - /// required by sycl in order to extract the accessor +#ifdef EIGEN_USE_SYCL EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator& impl() const { return m_argImpl; } - /// required by sycl in order to extract the accessor EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Generator& functor() const { return m_generator; } - +#endif protected: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE @@ -183,8 +182,9 @@ struct TensorEvaluator, Device> Dimensions m_dimensions; array m_strides; Generator m_generator; - // required by sycl +#ifdef EIGEN_USE_SYCL TensorEvaluator m_argImpl; +#endif }; } // end namespace Eigen -- cgit v1.2.3 From e2d5d4e7b3ac51152e26de2d00eabb26ee2a4454 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 31 Mar 2017 08:26:13 -0700 Subject: Restore the old constructors to retain compatibility with non c++11 compilers. 
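The hunk below shows only fragments of the restored declarations, so the following is a generic sketch of the idea rather than the actual Eigen constructors: plain overloads, one per calling convention, compile with pre-C++11 compilers, whereas conveniences such as delegating constructors or default member initialisers do not. PatchOp, PaddingType and all members here are hypothetical.

#include <iostream>

enum PaddingType { PADDING_VALID, PADDING_SAME };

class PatchOp {
 public:
  // Overload used when a symbolic padding mode is requested.
  PatchOp(int rows, int cols, PaddingType padding)
      : rows_(rows), cols_(cols), explicit_padding_(false),
        pad_top_(0), pad_left_(0), padding_(padding) {}

  // Overload used when explicit padding amounts are supplied.
  PatchOp(int rows, int cols, int pad_top, int pad_left)
      : rows_(rows), cols_(cols), explicit_padding_(true),
        pad_top_(pad_top), pad_left_(pad_left), padding_(PADDING_VALID) {}

  bool explicit_padding() const { return explicit_padding_; }

 private:
  int rows_, cols_;
  bool explicit_padding_;
  int pad_top_, pad_left_;
  PaddingType padding_;
};

int main() {
  PatchOp a(3, 3, PADDING_SAME);  // symbolic padding
  PatchOp b(3, 3, 1, 2);          // explicit padding
  std::cout << a.explicit_padding() << " " << b.explicit_padding() << std::endl;
}
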
--- unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h index 2fb6b84b9..0ac697087 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h @@ -70,8 +70,12 @@ class TensorImagePatchOp : public TensorBase Date: Fri, 31 Mar 2017 08:31:28 -0700 Subject: Restored code compatibility with compilers that dont support c++11 Gated more sycl code under #ifdef sycl --- unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h | 9 ++++----- unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h | 4 ++-- unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h | 16 ++++++++++------ 3 files changed, 16 insertions(+), 13 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h index b6bf05fed..af6ecf5f4 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h @@ -215,11 +215,10 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } - /// required by sycl in order to extract the accessor - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator& impl() const { return m_impl; } - /// required by sycl in order to extract the accessor - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Strides& functor() const { return m_strides; } - +#ifdef EIGEN_USE_SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator& impl() const { return m_impl; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Strides& functor() const { return m_strides; } +#endif protected: Dimensions m_dimensions; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h index 1a105165d..415021510 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h @@ -255,10 +255,10 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } - /// required by sycl in order to extract the accessor +#ifdef EIGEN_USE_SYCL EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator& impl() const { return m_impl; } - /// required by sycl in order to extract the accessor EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const PatchDim& functor() const { return patch_dims; } +#endif protected: Dimensions m_dimensions; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h index 64474ee80..33eb1b297 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h @@ -64,9 +64,13 @@ class TensorVolumePatchOp : public TensorBase Date: Tue, 4 Apr 2017 09:47:04 -0700 Subject: Fixed compilation error when sycl is enabled. 
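This and the next few commits share one pattern: state and accessors that only the SYCL kernel-extraction path needs (m_argImpl, m_op, m_xpr_dims, m_patch_dims and their impl()/xpr()/functor() accessors) are wrapped in #ifdef EIGEN_USE_SYCL so that other backends neither store nor expose them. Below is a minimal standalone sketch of the pattern with invented names (Expression, Evaluator, xpr()), not Eigen's actual evaluators. Compiling with -DEIGEN_USE_SYCL keeps the extra member and accessor; without the macro they disappear entirely.

#include <iostream>

struct Expression { int rows; };

class Evaluator {
 public:
  explicit Evaluator(const Expression& expr)
      : rows_(expr.rows)
#ifdef EIGEN_USE_SYCL
      , expr_(expr)   // retained only so the SYCL backend can rebuild the expression on the device
#endif
  {}

  int rows() const { return rows_; }

#ifdef EIGEN_USE_SYCL
  const Expression& xpr() const { return expr_; }   // accessor needed by the SYCL backend only
#endif

 private:
  int rows_;
#ifdef EIGEN_USE_SYCL
  Expression expr_;   // stored by value; absent in non-SYCL builds
#endif
};

int main() {
  Expression e = {4};
  Evaluator ev(e);
  std::cout << ev.rows() << std::endl;   // prints 4 with or without EIGEN_USE_SYCL
}
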
--- unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h index 220bd7ad0..11ae21be9 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h @@ -97,9 +97,13 @@ struct TensorEvaluator, Device> }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_generator(op.generator()), m_argImpl(op.expression(), device) + : m_generator(op.generator()) +#ifdef EIGEN_USE_SYCL + , m_argImpl(op.expression(), device) +#endif { - m_dimensions = m_argImpl.dimensions(); + TensorEvaluator argImpl(op.expression(), device); + m_dimensions = argImpl.dimensions(); if (static_cast(Layout) == static_cast(ColMajor)) { m_strides[0] = 1; -- cgit v1.2.3 From 63840d4666f5f92fd235ef30862db06706e928b4 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 4 Apr 2017 09:54:31 -0700 Subject: iGate the sycl specific code under a EIGEN_USE_SYCL define --- unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h | 4 ++-- unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h | 16 +++++++++++----- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h index 85facfb64..c0f33ba2d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h @@ -119,11 +119,11 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } - // required by sycl +#ifdef EIGEN_USE_SYCL EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator& impl() const { return m_impl; } - +#endif protected: TensorEvaluator m_impl; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h index 0ac697087..e1d5541bc 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h @@ -70,7 +70,7 @@ class TensorImagePatchOp : public TensorBase, Device> }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device), m_op(op) + : m_impl(op.expression(), device) +#ifdef EIGEN_USE_SYCL + , m_op(op) +#endif { EIGEN_STATIC_ASSERT((NumDims >= 4), YOU_MADE_A_PROGRAMMING_MISTAKE); @@ -423,9 +426,10 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator& impl() const { return m_impl; } - // required by sycl - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const XprType& xpr() const { return m_op; } +#ifdef EIGEN_USE_SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const XprType& xpr() const { return m_op; } +#endif Index rowPaddingTop() const { return m_rowPaddingTop; } Index colPaddingLeft() const { return m_colPaddingLeft; } @@ -506,8 +510,10 @@ struct TensorEvaluator, Device> Scalar m_paddingValue; TensorEvaluator m_impl; - // required for sycl + +#ifdef EIGEN_USE_SYCL const XprType& m_op; +#endif }; -- cgit v1.2.3 From e3e343390adf05e091b605bf4975c6e9f495e8a1 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 4 Apr 2017 09:56:33 -0700 Subject: Guard the sycl specific code with a #ifdef EIGEN_USE_SYCL --- unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 
deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h index 415021510..6e8e7885b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h @@ -99,11 +99,14 @@ struct TensorEvaluator, Device> }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device), patch_dims(op.patch_dims()) + : m_impl(op.expression(), device) +#ifdef EIGEN_USE_SYCL + , m_patch_dims(op.patch_dims()) +#endif { Index num_patches = 1; const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); - // const PatchDim& patch_dims = op.patch_dims(); + const PatchDim& patch_dims = op.patch_dims(); if (static_cast(Layout) == static_cast(ColMajor)) { for (int i = 0; i < NumDims-1; ++i) { m_dimensions[i] = patch_dims[i]; @@ -257,7 +260,7 @@ struct TensorEvaluator, Device> #ifdef EIGEN_USE_SYCL EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator& impl() const { return m_impl; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const PatchDim& functor() const { return patch_dims; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const PatchDim& functor() const { return m_patch_dims; } #endif protected: @@ -267,8 +270,10 @@ struct TensorEvaluator, Device> array m_patchStrides; TensorEvaluator m_impl; - // required by sycl - const PatchDim patch_dims; + +#ifdef EIGEN_USE_SYCL + const PatchDim m_patch_dims; +#endif }; } // end namespace Eigen -- cgit v1.2.3 From 66c63826bdd550da08e86efd31c7cf0b793b1526 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 4 Apr 2017 09:59:09 -0700 Subject: Guard the sycl specific code with EIGEN_USE_SYCL --- unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index 597f3f9ae..7356334e1 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -421,7 +421,10 @@ struct TensorEvaluator, static const bool RunningFullReduction = (NumOutputDims==0); EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device), m_reducer(op.reducer()), m_result(NULL), m_device(device), m_xpr_dims(op.dims()) + : m_impl(op.expression(), device), m_reducer(op.reducer()), m_result(NULL), m_device(device) +#if defined(EIGEN_USE_SYCL) + , m_xpr_dims(op.dims()) +#endif { EIGEN_STATIC_ASSERT((NumInputDims >= NumReducedDims), YOU_MADE_A_PROGRAMMING_MISTAKE); EIGEN_STATIC_ASSERT((!ReducingInnerMostDims | !PreservingInnerMostDims | (NumReducedDims == NumInputDims)), @@ -675,13 +678,12 @@ struct TensorEvaluator, } EIGEN_DEVICE_FUNC typename MakePointer_::Type data() const { return m_result; } - /// required by sycl in order to extract the accessor - const TensorEvaluator& impl() const { return m_impl; } - /// added for sycl in order to construct the buffer from the sycl device - const Device& device() const{return m_device;} - /// added for sycl in order to re-construct the reduction eval on the device for the sub-kernel - const Dims& xprDims() const {return m_xpr_dims;} +#if defined(EIGEN_USE_SYCL) + const TensorEvaluator& impl() const { return m_impl; } + const Device& device() const { return m_device; } + const Dims& xprDims() const { return m_xpr_dims; } +#endif private: template friend struct 
internal::GenericDimReducer; @@ -791,7 +793,10 @@ static const bool RunningOnGPU = false; typename MakePointer_::Type m_result; const Device& m_device; + +#if defined(EIGEN_USE_SYCL) const Dims m_xpr_dims; +#endif }; } // end namespace Eigen -- cgit v1.2.3 From a1304b95b7b3d667a7f04f1a75bd9427b2a59474 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 4 Apr 2017 10:00:46 -0700 Subject: Code cleanup --- unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h | 1 - 1 file changed, 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h index 33eb1b297..ea404242d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h @@ -325,7 +325,6 @@ struct TensorEvaluator, D m_outputPlanesRows = m_outputPlanes * m_outputRows; // Fast representations of different variables. - // printf("THis is m_otherStride: %lu\n", m_otherStride ); m_fastOtherStride = internal::TensorIntDivisor(m_otherStride); m_fastPatchStride = internal::TensorIntDivisor(m_patchStride); -- cgit v1.2.3 From a5a0c8fac1d86b61501199c79d4cde36f9e22305 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 4 Apr 2017 10:03:21 -0700 Subject: Guard sycl specific code under a EIGEN_USE_SYCL ifdef --- unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h index ea404242d..f7b28cf66 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h @@ -64,8 +64,8 @@ class TensorVolumePatchOp : public TensorBase, D EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator( const XprType& op, const Device& device) #endif - : m_impl(op.expression(), device), m_op(op) + : m_impl(op.expression(), device) +#ifdef EIGEN_USE_SYCL + , m_op(op) +#endif { EIGEN_STATIC_ASSERT((NumDims >= 5), YOU_MADE_A_PROGRAMMING_MISTAKE); @@ -510,8 +513,10 @@ struct TensorEvaluator, D EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } const TensorEvaluator& impl() const { return m_impl; } - // required by sycl + +#ifdef EIGEN_USE_SYCL EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const XprType& xpr() const { return m_op; } +#endif Index planePaddingTop() const { return m_planePaddingTop; } Index rowPaddingTop() const { return m_rowPaddingTop; } @@ -607,8 +612,10 @@ struct TensorEvaluator, D Scalar m_paddingValue; TensorEvaluator m_impl; -// required by sycl + +#ifdef EIGEN_USE_SYCL XprType m_op; +#endif }; -- cgit v1.2.3 From c302ea7bc417ef479626266e15bff59a805e305f Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 4 Apr 2017 10:05:16 -0700 Subject: Deleted empty line of code --- unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h | 1 - 1 file changed, 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h index f7b28cf66..b0f970ae6 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h @@ -359,7 +359,6 @@ struct TensorEvaluator, D EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { - // Patch index corresponding to the passed in index. 
const Index patchIndex = index / m_fastPatchStride; -- cgit v1.2.3 From 068cc0970890b534d65dbc99e6b5795acbaaa801 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 4 Apr 2017 10:09:10 -0700 Subject: Preserve file naming conventions --- unsupported/test/CMakeLists.txt | 4 +- .../test/cxx11_tensor_image_patchOP_sycl.cpp | 1092 -------------------- unsupported/test/cxx11_tensor_image_patch_sycl.cpp | 1092 ++++++++++++++++++++ .../test/cxx11_tensor_volume_patchOP_sycl.cpp | 222 ---- .../test/cxx11_tensor_volume_patch_sycl.cpp | 222 ++++ 5 files changed, 1316 insertions(+), 1316 deletions(-) delete mode 100644 unsupported/test/cxx11_tensor_image_patchOP_sycl.cpp create mode 100644 unsupported/test/cxx11_tensor_image_patch_sycl.cpp delete mode 100644 unsupported/test/cxx11_tensor_volume_patchOP_sycl.cpp create mode 100644 unsupported/test/cxx11_tensor_volume_patch_sycl.cpp diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index 4a558f856..cdf151f15 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -171,8 +171,8 @@ if(EIGEN_TEST_CXX11) ei_add_test_sycl(cxx11_tensor_inflation_sycl "-std=c++11") ei_add_test_sycl(cxx11_tensor_generator_sycl "-std=c++11") ei_add_test_sycl(cxx11_tensor_patch_sycl "-std=c++11") - ei_add_test_sycl(cxx11_tensor_image_patchOP_sycl "-std=c++11") - ei_add_test_sycl(cxx11_tensor_volume_patchOP_sycl "-std=c++11") + ei_add_test_sycl(cxx11_tensor_image_patch_sycl "-std=c++11") + ei_add_test_sycl(cxx11_tensor_volume_patcP_sycl "-std=c++11") ei_add_test_sycl(cxx11_tensor_argmax_sycl "-std=c++11") ei_add_test_sycl(cxx11_tensor_custom_op_sycl "-std=c++11") endif(EIGEN_TEST_SYCL) diff --git a/unsupported/test/cxx11_tensor_image_patchOP_sycl.cpp b/unsupported/test/cxx11_tensor_image_patchOP_sycl.cpp deleted file mode 100644 index e5ca4e388..000000000 --- a/unsupported/test/cxx11_tensor_image_patchOP_sycl.cpp +++ /dev/null @@ -1,1092 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2016 -// Mehdi Goli Codeplay Software Ltd. -// Ralph Potter Codeplay Software Ltd. -// Luke Iwanski Codeplay Software Ltd. -// Contact: -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
- -#define EIGEN_TEST_NO_LONGDOUBLE -#define EIGEN_TEST_NO_COMPLEX -#define EIGEN_TEST_FUNC cxx11_tensor_image_patchOP_sycl -#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t -#define EIGEN_USE_SYCL - -#include "main.h" -#include - -using Eigen::Tensor; -static const int DataLayout = ColMajor; - -template -static void test_simple_image_patch_sycl(const Eigen::SyclDevice& sycl_device) -{ - IndexType sizeDim1 = 2; - IndexType sizeDim2 = 3; - IndexType sizeDim3 = 5; - IndexType sizeDim4 = 7; - array tensorColMajorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}}; - array tensorRowMajorRange = {{sizeDim4, sizeDim3, sizeDim2, sizeDim1}}; - Tensor tensor_col_major(tensorColMajorRange); - Tensor tensor_row_major(tensorRowMajorRange); - tensor_col_major.setRandom(); - - DataType* gpu_data_col_major = static_cast(sycl_device.allocate(tensor_col_major.size()*sizeof(DataType))); - DataType* gpu_data_row_major = static_cast(sycl_device.allocate(tensor_row_major.size()*sizeof(DataType))); - TensorMap> gpu_col_major(gpu_data_col_major, tensorColMajorRange); - TensorMap> gpu_row_major(gpu_data_row_major, tensorRowMajorRange); - - sycl_device.memcpyHostToDevice(gpu_data_col_major, tensor_col_major.data(),(tensor_col_major.size())*sizeof(DataType)); - gpu_row_major.device(sycl_device)=gpu_col_major.swap_layout(); - sycl_device.memcpyDeviceToHost(tensor_row_major.data(), gpu_data_row_major, (tensor_col_major.size())*sizeof(DataType)); - - VERIFY_IS_EQUAL(tensor_col_major.dimension(0), tensor_row_major.dimension(3)); - VERIFY_IS_EQUAL(tensor_col_major.dimension(1), tensor_row_major.dimension(2)); - VERIFY_IS_EQUAL(tensor_col_major.dimension(2), tensor_row_major.dimension(1)); - VERIFY_IS_EQUAL(tensor_col_major.dimension(3), tensor_row_major.dimension(0)); - - // Single pixel patch: ColMajor - array patchColMajorTensorRange={{sizeDim1, 1, 1, sizeDim2*sizeDim3, sizeDim4}}; - Tensor single_patch_col_major(patchColMajorTensorRange); - size_t patchTensorBuffSize =single_patch_col_major.size()*sizeof(DataType); - DataType* gpu_data_single_patch_col_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); - TensorMap> gpu_single_patch_col_major(gpu_data_single_patch_col_major, patchColMajorTensorRange); - gpu_single_patch_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(1, 1); - sycl_device.memcpyDeviceToHost(single_patch_col_major.data(), gpu_data_single_patch_col_major, patchTensorBuffSize); - - VERIFY_IS_EQUAL(single_patch_col_major.dimension(0), 2); - VERIFY_IS_EQUAL(single_patch_col_major.dimension(1), 1); - VERIFY_IS_EQUAL(single_patch_col_major.dimension(2), 1); - VERIFY_IS_EQUAL(single_patch_col_major.dimension(3), 3*5); - VERIFY_IS_EQUAL(single_patch_col_major.dimension(4), 7); - - // Single pixel patch: RowMajor - array patchRowMajorTensorRange={{sizeDim4, sizeDim2*sizeDim3, 1, 1, sizeDim1}}; - Tensor single_patch_row_major(patchRowMajorTensorRange); - patchTensorBuffSize =single_patch_row_major.size()*sizeof(DataType); - DataType* gpu_data_single_patch_row_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); - TensorMap> gpu_single_patch_row_major(gpu_data_single_patch_row_major, patchRowMajorTensorRange); - gpu_single_patch_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(1, 1); - sycl_device.memcpyDeviceToHost(single_patch_row_major.data(), gpu_data_single_patch_row_major, patchTensorBuffSize); - - VERIFY_IS_EQUAL(single_patch_row_major.dimension(0), 7); - VERIFY_IS_EQUAL(single_patch_row_major.dimension(1), 3*5); - 
VERIFY_IS_EQUAL(single_patch_row_major.dimension(2), 1); - VERIFY_IS_EQUAL(single_patch_row_major.dimension(3), 1); - VERIFY_IS_EQUAL(single_patch_row_major.dimension(4), 2); - - for (IndexType i = 0; i < tensor_col_major.size(); ++i) { - // ColMajor - if (tensor_col_major.data()[i] != single_patch_col_major.data()[i]) { - std::cout << "Mismatch detected at index colmajor " << i << " : " - << tensor_col_major.data()[i] << " vs " << single_patch_col_major.data()[i] - << std::endl; - } - VERIFY_IS_EQUAL(single_patch_col_major.data()[i], tensor_col_major.data()[i]); - // RowMajor - if (tensor_row_major.data()[i] != single_patch_row_major.data()[i]) { - std::cout << "Mismatch detected at index row major" << i << " : " - << tensor_row_major.data()[i] << " vs " - << single_patch_row_major.data()[i] << std::endl; - } - VERIFY_IS_EQUAL(single_patch_row_major.data()[i], - tensor_row_major.data()[i]); - VERIFY_IS_EQUAL(tensor_col_major.data()[i], tensor_row_major.data()[i]); - VERIFY_IS_EQUAL(single_patch_col_major.data()[i], - single_patch_row_major.data()[i]); - } - - - // Entire image patch: ColMajor - patchColMajorTensorRange={{sizeDim1, sizeDim2, sizeDim3, sizeDim2*sizeDim3, sizeDim4}}; - Tensor entire_image_patch_col_major(patchColMajorTensorRange); - patchTensorBuffSize =entire_image_patch_col_major.size()*sizeof(DataType); - DataType* gpu_data_entire_image_patch_col_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); - TensorMap> gpu_entire_image_patch_col_major(gpu_data_entire_image_patch_col_major, patchColMajorTensorRange); - gpu_entire_image_patch_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(3, 5); - sycl_device.memcpyDeviceToHost(entire_image_patch_col_major.data(), gpu_data_entire_image_patch_col_major, patchTensorBuffSize); - - VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(0), 2); - VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(1), 3); - VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(2), 5); - VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(3), 3*5); - VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(4), 7); - - // Entire image patch: RowMajor - patchRowMajorTensorRange={{sizeDim4, sizeDim2*sizeDim3, sizeDim3, sizeDim2, sizeDim1}}; - Tensor entire_image_patch_row_major(patchRowMajorTensorRange); - patchTensorBuffSize =entire_image_patch_row_major.size()*sizeof(DataType); - DataType* gpu_data_entire_image_patch_row_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); - TensorMap> gpu_entire_image_patch_row_major(gpu_data_entire_image_patch_row_major, patchRowMajorTensorRange); - gpu_entire_image_patch_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(3, 5); - sycl_device.memcpyDeviceToHost(entire_image_patch_row_major.data(), gpu_data_entire_image_patch_row_major, patchTensorBuffSize); - - VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(0), 7); - VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(1), 3*5); - VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(2), 5); - VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(3), 3); - VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(4), 2); - - for (IndexType i = 0; i < 3; ++i) { - for (IndexType j = 0; j < 5; ++j) { - IndexType patchId = i+3*j; - for (IndexType r = 0; r < 3; ++r) { - for (IndexType c = 0; c < 5; ++c) { - for (IndexType d = 0; d < 2; ++d) { - for (IndexType b = 0; b < 7; ++b) { - DataType expected_col_major = 0.0f; - DataType expected_row_major = 0.0f; - if (r-1+i >= 0 && c-2+j >= 0 && r-1+i < 3 && c-2+j 
< 5) { - expected_col_major = tensor_col_major(d, r-1+i, c-2+j, b); - expected_row_major = tensor_row_major(b, c-2+j, r-1+i, d); - } - // ColMajor - if (entire_image_patch_col_major(d, r, c, patchId, b) != expected_col_major) { - std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; - } - VERIFY_IS_EQUAL(entire_image_patch_col_major(d, r, c, patchId, b), expected_col_major); - // RowMajor - if (entire_image_patch_row_major(b, patchId, c, r, d) != - expected_row_major) { - std::cout << "Mismatch detected at index i=" << i << " j=" << j - << " r=" << r << " c=" << c << " d=" << d << " b=" << b - << std::endl; - } - VERIFY_IS_EQUAL(entire_image_patch_row_major(b, patchId, c, r, d), - expected_row_major); - // Check that ColMajor and RowMajor agree. - VERIFY_IS_EQUAL(expected_col_major, expected_row_major); - } - } - } - } - } - } - - // 2D patch: ColMajor - patchColMajorTensorRange={{sizeDim1, 2, 2, sizeDim2*sizeDim3, sizeDim4}}; - Tensor twod_patch_col_major(patchColMajorTensorRange); - patchTensorBuffSize =twod_patch_col_major.size()*sizeof(DataType); - DataType* gpu_data_twod_patch_col_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); - TensorMap> gpu_twod_patch_col_major(gpu_data_twod_patch_col_major, patchColMajorTensorRange); - gpu_twod_patch_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(2, 2); - sycl_device.memcpyDeviceToHost(twod_patch_col_major.data(), gpu_data_twod_patch_col_major, patchTensorBuffSize); - - VERIFY_IS_EQUAL(twod_patch_col_major.dimension(0), 2); - VERIFY_IS_EQUAL(twod_patch_col_major.dimension(1), 2); - VERIFY_IS_EQUAL(twod_patch_col_major.dimension(2), 2); - VERIFY_IS_EQUAL(twod_patch_col_major.dimension(3), 3*5); - VERIFY_IS_EQUAL(twod_patch_col_major.dimension(4), 7); - - // 2D patch: RowMajor - patchRowMajorTensorRange={{sizeDim4, sizeDim2*sizeDim3, 2, 2, sizeDim1}}; - Tensor twod_patch_row_major(patchRowMajorTensorRange); - patchTensorBuffSize =twod_patch_row_major.size()*sizeof(DataType); - DataType* gpu_data_twod_patch_row_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); - TensorMap> gpu_twod_patch_row_major(gpu_data_twod_patch_row_major, patchRowMajorTensorRange); - gpu_twod_patch_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(2, 2); - sycl_device.memcpyDeviceToHost(twod_patch_row_major.data(), gpu_data_twod_patch_row_major, patchTensorBuffSize); - - VERIFY_IS_EQUAL(twod_patch_row_major.dimension(0), 7); - VERIFY_IS_EQUAL(twod_patch_row_major.dimension(1), 3*5); - VERIFY_IS_EQUAL(twod_patch_row_major.dimension(2), 2); - VERIFY_IS_EQUAL(twod_patch_row_major.dimension(3), 2); - VERIFY_IS_EQUAL(twod_patch_row_major.dimension(4), 2); - - - // Based on the calculation described in TensorTraits.h, padding happens to be 0. 
- IndexType row_padding = 0; - IndexType col_padding = 0; - IndexType stride = 1; - - for (IndexType i = 0; i < 3; ++i) { - for (IndexType j = 0; j < 5; ++j) { - IndexType patchId = i+3*j; - for (IndexType r = 0; r < 2; ++r) { - for (IndexType c = 0; c < 2; ++c) { - for (IndexType d = 0; d < 2; ++d) { - for (IndexType b = 0; b < 7; ++b) { - DataType expected_col_major = 0.0f; - DataType expected_row_major = 0.0f; - IndexType row_offset = r*stride + i - row_padding; - IndexType col_offset = c*stride + j - col_padding; - // ColMajor - if (row_offset >= 0 && col_offset >= 0 && row_offset < tensor_col_major.dimension(1) && col_offset < tensor_col_major.dimension(2)) { - expected_col_major = tensor_col_major(d, row_offset, col_offset, b); - } - if (twod_patch_col_major(d, r, c, patchId, b) != expected_col_major) { - std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; - } - VERIFY_IS_EQUAL(twod_patch_col_major(d, r, c, patchId, b), expected_col_major); - - // RowMajor - if (row_offset >= 0 && col_offset >= 0 && row_offset < tensor_row_major.dimension(2) && col_offset < tensor_row_major.dimension(1)) { - expected_row_major = tensor_row_major(b, col_offset, row_offset, d); - - } - if (twod_patch_row_major(b, patchId, c, r, d) != expected_row_major) { - std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; - } - VERIFY_IS_EQUAL(twod_patch_row_major(b, patchId, c, r, d), expected_row_major); - // Check that ColMajor and RowMajor agree. - VERIFY_IS_EQUAL(expected_col_major, expected_row_major); - } - } - } - } - } - } - - sycl_device.deallocate(gpu_data_col_major); - sycl_device.deallocate(gpu_data_row_major); - sycl_device.deallocate(gpu_data_single_patch_col_major); - sycl_device.deallocate(gpu_data_single_patch_row_major); - sycl_device.deallocate(gpu_data_entire_image_patch_col_major); - sycl_device.deallocate(gpu_data_entire_image_patch_row_major); - sycl_device.deallocate(gpu_data_twod_patch_col_major); - sycl_device.deallocate(gpu_data_twod_patch_row_major); - -} - - -// Verifies VALID padding (no padding) with incrementing values. -template -static void test_patch_padding_valid_sycl(const Eigen::SyclDevice& sycl_device){ - IndexType input_depth = 3; - IndexType input_rows = 3; - IndexType input_cols = 3; - IndexType input_batches = 1; - IndexType ksize = 2; // Corresponds to the Rows and Cols for tensor.extract_image_patches<>. - IndexType stride = 2; // Only same stride is supported. 
- - array tensorColMajorRange = {{input_depth, input_rows, input_cols, input_batches}}; - array tensorRowMajorRange = {{input_batches, input_cols, input_rows, input_depth}}; - Tensor tensor_col_major(tensorColMajorRange); - Tensor tensor_row_major(tensorRowMajorRange); - - DataType* gpu_data_col_major = static_cast(sycl_device.allocate(tensor_col_major.size()*sizeof(DataType))); - DataType* gpu_data_row_major = static_cast(sycl_device.allocate(tensor_row_major.size()*sizeof(DataType))); - TensorMap> gpu_col_major(gpu_data_col_major, tensorColMajorRange); - TensorMap> gpu_row_major(gpu_data_row_major, tensorRowMajorRange); - - sycl_device.memcpyHostToDevice(gpu_data_col_major, tensor_col_major.data(),(tensor_col_major.size())*sizeof(DataType)); - gpu_row_major.device(sycl_device)=gpu_col_major.swap_layout(); - sycl_device.memcpyDeviceToHost(tensor_row_major.data(), gpu_data_row_major, (tensor_col_major.size())*sizeof(DataType)); - - VERIFY_IS_EQUAL(tensor_col_major.dimension(0), tensor_row_major.dimension(3)); - VERIFY_IS_EQUAL(tensor_col_major.dimension(1), tensor_row_major.dimension(2)); - VERIFY_IS_EQUAL(tensor_col_major.dimension(2), tensor_row_major.dimension(1)); - VERIFY_IS_EQUAL(tensor_col_major.dimension(3), tensor_row_major.dimension(0)); - - // Initializes tensor with incrementing numbers. - for (IndexType i = 0; i < tensor_col_major.size(); ++i) { - tensor_col_major.data()[i] = i + 1; - } - // ColMajor - array patchColMajorTensorRange={{input_depth, ksize, ksize, 1, input_batches}}; - Tensor result_col_major(patchColMajorTensorRange); - size_t patchTensorBuffSize =result_col_major.size()*sizeof(DataType); - DataType* gpu_data_result_col_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); - TensorMap> gpu_result_col_major(gpu_data_result_col_major, patchColMajorTensorRange); - gpu_result_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(ksize, ksize, stride, stride, 1, 1, PADDING_VALID); - sycl_device.memcpyDeviceToHost(result_col_major.data(), gpu_data_result_col_major, patchTensorBuffSize); - - VERIFY_IS_EQUAL(result_col_major.dimension(0), input_depth); // depth - VERIFY_IS_EQUAL(result_col_major.dimension(1), ksize); // kernel rows - VERIFY_IS_EQUAL(result_col_major.dimension(2), ksize); // kernel cols - VERIFY_IS_EQUAL(result_col_major.dimension(3), 1); // number of patches - VERIFY_IS_EQUAL(result_col_major.dimension(4), input_batches); // number of batches - - // RowMajor - array patchRowMajorTensorRange={{input_batches, 1, ksize, ksize, input_depth }}; - Tensor result_row_major(patchRowMajorTensorRange); - patchTensorBuffSize =result_row_major.size()*sizeof(DataType); - DataType* gpu_data_result_row_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); - TensorMap> gpu_result_row_major(gpu_data_result_row_major, patchRowMajorTensorRange); - gpu_result_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(ksize, ksize, stride, stride, 1, 1, PADDING_VALID); - sycl_device.memcpyDeviceToHost(result_row_major.data(), gpu_data_result_row_major, patchTensorBuffSize); - - VERIFY_IS_EQUAL(result_col_major.dimension(0), result_row_major.dimension(4)); - VERIFY_IS_EQUAL(result_col_major.dimension(1), result_row_major.dimension(3)); - VERIFY_IS_EQUAL(result_col_major.dimension(2), result_row_major.dimension(2)); - VERIFY_IS_EQUAL(result_col_major.dimension(3), result_row_major.dimension(1)); - VERIFY_IS_EQUAL(result_col_major.dimension(4), result_row_major.dimension(0)); - - // No padding is carried out. 
- IndexType row_padding = 0; - IndexType col_padding = 0; - - for (IndexType i = 0; (i+stride+ksize-1) < input_rows; i += stride) { // input rows - for (IndexType j = 0; (j+stride+ksize-1) < input_cols; j += stride) { // input cols - IndexType patchId = i+input_rows*j; - for (IndexType r = 0; r < ksize; ++r) { // patch rows - for (IndexType c = 0; c < ksize; ++c) { // patch cols - for (IndexType d = 0; d < input_depth; ++d) { // depth - for (IndexType b = 0; b < input_batches; ++b) { // batch - DataType expected_col_major = 0.0f; - DataType expected_row_major = 0.0f; - IndexType row_offset = r + i - row_padding; - IndexType col_offset = c + j - col_padding; - if (row_offset >= 0 && col_offset >= 0 && row_offset < input_rows && col_offset < input_cols) { - expected_col_major = tensor_col_major(d, row_offset, col_offset, b); - expected_row_major = tensor_row_major(b, col_offset, row_offset, d); - } - // ColMajor - if (result_col_major(d, r, c, patchId, b) != expected_col_major) { - std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; - } - VERIFY_IS_EQUAL(result_col_major(d, r, c, patchId, b), expected_col_major); - // RowMajor - if (result_row_major(b, patchId, c, r, d) != expected_row_major) { - std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; - } - VERIFY_IS_EQUAL(result_row_major(b, patchId, c, r, d), expected_row_major); - // Check that ColMajor and RowMajor agree. - VERIFY_IS_EQUAL(expected_col_major, expected_row_major); - } - } - } - } - } - } - sycl_device.deallocate(gpu_data_col_major); - sycl_device.deallocate(gpu_data_row_major); - sycl_device.deallocate(gpu_data_result_col_major); - sycl_device.deallocate(gpu_data_result_row_major); -} - -// Verifies VALID padding (no padding) with the same value. -template -static void test_patch_padding_valid_same_value_sycl(const Eigen::SyclDevice& sycl_device){ - IndexType input_depth = 1; - IndexType input_rows = 5; - IndexType input_cols = 5; - IndexType input_batches = 2; - IndexType ksize = 3; // Corresponds to the Rows and Cols for tensor.extract_image_patches<>. - IndexType stride = 2; // Only same stride is supported. 
- // ColMajor - - array tensorColMajorRange = {{input_depth, input_rows, input_cols, input_batches}}; - array tensorRowMajorRange = {{input_batches, input_cols, input_rows, input_depth}}; - Tensor tensor_col_major(tensorColMajorRange); - Tensor tensor_row_major(tensorRowMajorRange); - - DataType* gpu_data_col_major = static_cast(sycl_device.allocate(tensor_col_major.size()*sizeof(DataType))); - DataType* gpu_data_row_major = static_cast(sycl_device.allocate(tensor_row_major.size()*sizeof(DataType))); - TensorMap> gpu_col_major(gpu_data_col_major, tensorColMajorRange); - TensorMap> gpu_row_major(gpu_data_row_major, tensorRowMajorRange); - gpu_col_major.device(sycl_device)=gpu_col_major.constant(11.0f); - gpu_row_major.device(sycl_device)=gpu_col_major.swap_layout(); - sycl_device.memcpyDeviceToHost(tensor_col_major.data(), gpu_data_col_major, (tensor_col_major.size())*sizeof(DataType)); - sycl_device.memcpyDeviceToHost(tensor_row_major.data(), gpu_data_row_major, (tensor_row_major.size())*sizeof(DataType)); - VERIFY_IS_EQUAL(tensor_col_major.dimension(0), tensor_row_major.dimension(3)); - VERIFY_IS_EQUAL(tensor_col_major.dimension(1), tensor_row_major.dimension(2)); - VERIFY_IS_EQUAL(tensor_col_major.dimension(2), tensor_row_major.dimension(1)); - VERIFY_IS_EQUAL(tensor_col_major.dimension(3), tensor_row_major.dimension(0)); - - array patchColMajorTensorRange={{input_depth, ksize, ksize, 4, input_batches}}; - Tensor result_col_major(patchColMajorTensorRange); - size_t patchTensorBuffSize =result_col_major.size()*sizeof(DataType); - DataType* gpu_data_result_col_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); - TensorMap> gpu_result_col_major(gpu_data_result_col_major, patchColMajorTensorRange); - gpu_result_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(ksize, ksize, stride, stride, 1, 1, PADDING_VALID); - sycl_device.memcpyDeviceToHost(result_col_major.data(), gpu_data_result_col_major, patchTensorBuffSize); - - VERIFY_IS_EQUAL(result_col_major.dimension(0), input_depth); // depth - VERIFY_IS_EQUAL(result_col_major.dimension(1), ksize); // kernel rows - VERIFY_IS_EQUAL(result_col_major.dimension(2), ksize); // kernel cols - VERIFY_IS_EQUAL(result_col_major.dimension(3), 4); // number of patches - VERIFY_IS_EQUAL(result_col_major.dimension(4), input_batches); // number of batches - - // RowMajor - array patchRowMajorTensorRange={{input_batches, 4, ksize, ksize, input_depth }}; - Tensor result_row_major(patchRowMajorTensorRange); - patchTensorBuffSize =result_row_major.size()*sizeof(DataType); - DataType* gpu_data_result_row_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); - TensorMap> gpu_result_row_major(gpu_data_result_row_major, patchRowMajorTensorRange); - gpu_result_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(ksize, ksize, stride, stride, 1, 1, PADDING_VALID); - sycl_device.memcpyDeviceToHost(result_row_major.data(), gpu_data_result_row_major, patchTensorBuffSize); - - VERIFY_IS_EQUAL(result_col_major.dimension(0), result_row_major.dimension(4)); - VERIFY_IS_EQUAL(result_col_major.dimension(1), result_row_major.dimension(3)); - VERIFY_IS_EQUAL(result_col_major.dimension(2), result_row_major.dimension(2)); - VERIFY_IS_EQUAL(result_col_major.dimension(3), result_row_major.dimension(1)); - VERIFY_IS_EQUAL(result_col_major.dimension(4), result_row_major.dimension(0)); - - // No padding is carried out. 
- IndexType row_padding = 0; - IndexType col_padding = 0; - - for (IndexType i = 0; (i+stride+ksize-1) <= input_rows; i += stride) { // input rows - for (IndexType j = 0; (j+stride+ksize-1) <= input_cols; j += stride) { // input cols - IndexType patchId = i+input_rows*j; - for (IndexType r = 0; r < ksize; ++r) { // patch rows - for (IndexType c = 0; c < ksize; ++c) { // patch cols - for (IndexType d = 0; d < input_depth; ++d) { // depth - for (IndexType b = 0; b < input_batches; ++b) { // batch - DataType expected_col_major = 0.0f; - DataType expected_row_major = 0.0f; - IndexType row_offset = r + i - row_padding; - IndexType col_offset = c + j - col_padding; - if (row_offset >= 0 && col_offset >= 0 && row_offset < input_rows && col_offset < input_cols) { - expected_col_major = tensor_col_major(d, row_offset, col_offset, b); - expected_row_major = tensor_row_major(b, col_offset, row_offset, d); - } - // ColMajor - if (result_col_major(d, r, c, patchId, b) != expected_col_major) { - std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; - } - VERIFY_IS_EQUAL(result_col_major(d, r, c, patchId, b), expected_col_major); - // RowMajor - if (result_row_major(b, patchId, c, r, d) != expected_row_major) { - std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; - } - VERIFY_IS_EQUAL(result_row_major(b, patchId, c, r, d), expected_row_major); - // Check that ColMajor and RowMajor agree. - VERIFY_IS_EQUAL(expected_col_major, expected_row_major); - } - } - } - } - } - } -} - -// Verifies SAME padding. -template -static void test_patch_padding_same_sycl(const Eigen::SyclDevice& sycl_device){ - IndexType input_depth = 3; - IndexType input_rows = 4; - IndexType input_cols = 2; - IndexType input_batches = 1; - IndexType ksize = 2; // Corresponds to the Rows and Cols for tensor.extract_image_patches<>. - IndexType stride = 2; // Only same stride is supported. - - // ColMajor - array tensorColMajorRange = {{input_depth, input_rows, input_cols, input_batches}}; - array tensorRowMajorRange = {{input_batches, input_cols, input_rows, input_depth}}; - Tensor tensor_col_major(tensorColMajorRange); - Tensor tensor_row_major(tensorRowMajorRange); - - DataType* gpu_data_col_major = static_cast(sycl_device.allocate(tensor_col_major.size()*sizeof(DataType))); - DataType* gpu_data_row_major = static_cast(sycl_device.allocate(tensor_row_major.size()*sizeof(DataType))); - TensorMap> gpu_col_major(gpu_data_col_major, tensorColMajorRange); - TensorMap> gpu_row_major(gpu_data_row_major, tensorRowMajorRange); - - sycl_device.memcpyHostToDevice(gpu_data_col_major, tensor_col_major.data(),(tensor_col_major.size())*sizeof(DataType)); - gpu_row_major.device(sycl_device)=gpu_col_major.swap_layout(); - sycl_device.memcpyDeviceToHost(tensor_row_major.data(), gpu_data_row_major, (tensor_col_major.size())*sizeof(DataType)); - - VERIFY_IS_EQUAL(tensor_col_major.dimension(0), tensor_row_major.dimension(3)); - VERIFY_IS_EQUAL(tensor_col_major.dimension(1), tensor_row_major.dimension(2)); - VERIFY_IS_EQUAL(tensor_col_major.dimension(2), tensor_row_major.dimension(1)); - VERIFY_IS_EQUAL(tensor_col_major.dimension(3), tensor_row_major.dimension(0)); - - // Initializes tensor with incrementing numbers. 
- for (IndexType i = 0; i < tensor_col_major.size(); ++i) { - tensor_col_major.data()[i] = i + 1; - } - -array patchColMajorTensorRange={{input_depth, ksize, ksize, 2, input_batches}}; -Tensor result_col_major(patchColMajorTensorRange); -size_t patchTensorBuffSize =result_col_major.size()*sizeof(DataType); -DataType* gpu_data_result_col_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); -TensorMap> gpu_result_col_major(gpu_data_result_col_major, patchColMajorTensorRange); -gpu_result_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(ksize, ksize, stride, stride, PADDING_SAME); -sycl_device.memcpyDeviceToHost(result_col_major.data(), gpu_data_result_col_major, patchTensorBuffSize); - - - VERIFY_IS_EQUAL(result_col_major.dimension(0), input_depth); // depth - VERIFY_IS_EQUAL(result_col_major.dimension(1), ksize); // kernel rows - VERIFY_IS_EQUAL(result_col_major.dimension(2), ksize); // kernel cols - VERIFY_IS_EQUAL(result_col_major.dimension(3), 2); // number of patches - VERIFY_IS_EQUAL(result_col_major.dimension(4), input_batches); // number of batches - - // RowMajor - - array patchRowMajorTensorRange={{input_batches, 2, ksize, ksize, input_depth }}; - Tensor result_row_major(patchRowMajorTensorRange); - patchTensorBuffSize =result_row_major.size()*sizeof(DataType); - DataType* gpu_data_result_row_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); - TensorMap> gpu_result_row_major(gpu_data_result_row_major, patchRowMajorTensorRange); - gpu_result_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(ksize, ksize, stride, stride, PADDING_SAME); - sycl_device.memcpyDeviceToHost(result_row_major.data(), gpu_data_result_row_major, patchTensorBuffSize); - - VERIFY_IS_EQUAL(result_col_major.dimension(0), result_row_major.dimension(4)); - VERIFY_IS_EQUAL(result_col_major.dimension(1), result_row_major.dimension(3)); - VERIFY_IS_EQUAL(result_col_major.dimension(2), result_row_major.dimension(2)); - VERIFY_IS_EQUAL(result_col_major.dimension(3), result_row_major.dimension(1)); - VERIFY_IS_EQUAL(result_col_major.dimension(4), result_row_major.dimension(0)); - - // Based on the calculation described in TensorTraits.h, padding happens to be 0. 
- IndexType row_padding = 0; - IndexType col_padding = 0; - - for (IndexType i = 0; (i+stride+ksize-1) <= input_rows; i += stride) { // input rows - for (IndexType j = 0; (j+stride+ksize-1) <= input_cols; j += stride) { // input cols - IndexType patchId = i+input_rows*j; - for (IndexType r = 0; r < ksize; ++r) { // patch rows - for (IndexType c = 0; c < ksize; ++c) { // patch cols - for (IndexType d = 0; d < input_depth; ++d) { // depth - for (IndexType b = 0; b < input_batches; ++b) { // batch - DataType expected_col_major = 0.0f; - DataType expected_row_major = 0.0f; - IndexType row_offset = r*stride + i - row_padding; - IndexType col_offset = c*stride + j - col_padding; - if (row_offset >= 0 && col_offset >= 0 && row_offset < input_rows && col_offset < input_cols) { - expected_col_major = tensor_col_major(d, row_offset, col_offset, b); - expected_row_major = tensor_row_major(b, col_offset, row_offset, d); - } - // ColMajor - if (result_col_major(d, r, c, patchId, b) != expected_col_major) { - std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; - } - VERIFY_IS_EQUAL(result_col_major(d, r, c, patchId, b), expected_col_major); - // RowMajor - if (result_row_major(b, patchId, c, r, d) != expected_row_major) { - std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; - } - VERIFY_IS_EQUAL(result_row_major(b, patchId, c, r, d), expected_row_major); - // Check that ColMajor and RowMajor agree. - VERIFY_IS_EQUAL(expected_col_major, expected_row_major); - } - } - } - } - } - } -} - - -template -static void test_patch_no_extra_dim_sycl(const Eigen::SyclDevice& sycl_device){ - - IndexType sizeDim1 = 2; - IndexType sizeDim2 = 3; - IndexType sizeDim3 = 5; - - // ColMajor - array tensorColMajorRange = {{sizeDim1, sizeDim2, sizeDim3}}; - array tensorRowMajorRange = {{sizeDim3, sizeDim2, sizeDim1}}; - Tensor tensor_col_major(tensorColMajorRange); - tensor_col_major.setRandom(); - Tensor tensor_row_major(tensorRowMajorRange); - - DataType* gpu_data_col_major = static_cast(sycl_device.allocate(tensor_col_major.size()*sizeof(DataType))); - DataType* gpu_data_row_major = static_cast(sycl_device.allocate(tensor_row_major.size()*sizeof(DataType))); - TensorMap> gpu_col_major(gpu_data_col_major, tensorColMajorRange); - TensorMap> gpu_row_major(gpu_data_row_major, tensorRowMajorRange); - - sycl_device.memcpyHostToDevice(gpu_data_col_major, tensor_col_major.data(),(tensor_col_major.size())*sizeof(DataType)); - gpu_row_major.device(sycl_device)=gpu_col_major.swap_layout(); - sycl_device.memcpyDeviceToHost(tensor_row_major.data(), gpu_data_row_major, (tensor_row_major.size())*sizeof(DataType)); - - VERIFY_IS_EQUAL(tensor_col_major.dimension(0), tensor_row_major.dimension(2)); - VERIFY_IS_EQUAL(tensor_col_major.dimension(1), tensor_row_major.dimension(1)); - VERIFY_IS_EQUAL(tensor_col_major.dimension(2), tensor_row_major.dimension(0)); - - - // Single pixel patch: ColMajor - array patchColMajorTensorRange={{sizeDim1, 1, 1, sizeDim2*sizeDim3}}; - Tensor single_patch_col_major(patchColMajorTensorRange); - size_t patchTensorBuffSize =single_patch_col_major.size()*sizeof(DataType); - DataType* gpu_data_single_patch_col_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); - TensorMap> gpu_single_patch_col_major(gpu_data_single_patch_col_major, patchColMajorTensorRange); - 
gpu_single_patch_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(1, 1); - sycl_device.memcpyDeviceToHost(single_patch_col_major.data(), gpu_data_single_patch_col_major, patchTensorBuffSize); - - VERIFY_IS_EQUAL(single_patch_col_major.dimension(0), sizeDim1); - VERIFY_IS_EQUAL(single_patch_col_major.dimension(1), 1); - VERIFY_IS_EQUAL(single_patch_col_major.dimension(2), 1); - VERIFY_IS_EQUAL(single_patch_col_major.dimension(3), sizeDim2*sizeDim3); - - // Single pixel patch: RowMajor - array patchRowMajorTensorRange={{sizeDim2*sizeDim3, 1, 1, sizeDim1}}; - Tensor single_patch_row_major(patchRowMajorTensorRange); - patchTensorBuffSize =single_patch_row_major.size()*sizeof(DataType); - DataType* gpu_data_single_patch_row_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); - TensorMap> gpu_single_patch_row_major(gpu_data_single_patch_row_major, patchRowMajorTensorRange); - gpu_single_patch_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(1, 1); - sycl_device.memcpyDeviceToHost(single_patch_row_major.data(), gpu_data_single_patch_row_major, patchTensorBuffSize); - - VERIFY_IS_EQUAL(single_patch_row_major.dimension(0), sizeDim2*sizeDim3); - VERIFY_IS_EQUAL(single_patch_row_major.dimension(1), 1); - VERIFY_IS_EQUAL(single_patch_row_major.dimension(2), 1); - VERIFY_IS_EQUAL(single_patch_row_major.dimension(3), sizeDim1); - - for (IndexType i = 0; i < tensor_col_major.size(); ++i) { - // ColMajor - if (tensor_col_major.data()[i] != single_patch_col_major.data()[i]) { - std::cout << "Mismatch detected at index " << i << " : " << tensor_col_major.data()[i] << " vs " << single_patch_col_major.data()[i] << std::endl; - } - VERIFY_IS_EQUAL(single_patch_col_major.data()[i], tensor_col_major.data()[i]); - // RowMajor - if (tensor_row_major.data()[i] != single_patch_row_major.data()[i]) { - std::cout << "Mismatch detected at index " << i << " : " - << tensor_col_major.data()[i] << " vs " - << single_patch_row_major.data()[i] << std::endl; - } - VERIFY_IS_EQUAL(single_patch_row_major.data()[i], - tensor_row_major.data()[i]); - VERIFY_IS_EQUAL(tensor_col_major.data()[i], tensor_row_major.data()[i]); - VERIFY_IS_EQUAL(single_patch_col_major.data()[i], - single_patch_row_major.data()[i]); - } - - // Entire image patch: ColMajor - patchColMajorTensorRange={{sizeDim1, sizeDim2, sizeDim3, sizeDim2*sizeDim3}}; - Tensor entire_image_patch_col_major(patchColMajorTensorRange); - patchTensorBuffSize =entire_image_patch_col_major.size()*sizeof(DataType); - DataType* gpu_data_entire_image_patch_col_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); - TensorMap> gpu_entire_image_patch_col_major(gpu_data_entire_image_patch_col_major, patchColMajorTensorRange); - gpu_entire_image_patch_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(3, 5); - sycl_device.memcpyDeviceToHost(entire_image_patch_col_major.data(), gpu_data_entire_image_patch_col_major, patchTensorBuffSize); - - VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(0), 2); - VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(1), 3); - VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(2), 5); - VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(3), 3*5); - - // Entire image patch: RowMajor -patchRowMajorTensorRange={{sizeDim2*sizeDim3, sizeDim3, sizeDim2, sizeDim1}}; -Tensor entire_image_patch_row_major(patchRowMajorTensorRange); -patchTensorBuffSize =entire_image_patch_row_major.size()*sizeof(DataType); -DataType* gpu_data_entire_image_patch_row_major = 
static_cast(sycl_device.allocate(patchTensorBuffSize)); -TensorMap> gpu_entire_image_patch_row_major(gpu_data_entire_image_patch_row_major, patchRowMajorTensorRange); -gpu_entire_image_patch_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(3, 5); -sycl_device.memcpyDeviceToHost(entire_image_patch_row_major.data(), gpu_data_entire_image_patch_row_major, patchTensorBuffSize); - VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(0), 3*5); - VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(1), 5); - VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(2), 3); - VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(3), 2); - - for (IndexType i = 0; i < 3; ++i) { - for (IndexType j = 0; j < 5; ++j) { - IndexType patchId = i+3*j; - for (IndexType r = 0; r < 3; ++r) { - for (IndexType c = 0; c < 5; ++c) { - for (IndexType d = 0; d < 2; ++d) { - DataType expected_col_major = 0.0f; - DataType expected_row_major = 0.0f; - if (r-1+i >= 0 && c-2+j >= 0 && r-1+i < 3 && c-2+j < 5) { - expected_col_major = tensor_col_major(d, r-1+i, c-2+j); - expected_row_major = tensor_row_major(c-2+j, r-1+i, d); - } - // ColMajor - if (entire_image_patch_col_major(d, r, c, patchId) != expected_col_major) { - std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << std::endl; - } - VERIFY_IS_EQUAL(entire_image_patch_col_major(d, r, c, patchId), expected_col_major); - // RowMajor - if (entire_image_patch_row_major(patchId, c, r, d) != - expected_row_major) { - std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << std::endl; - } - VERIFY_IS_EQUAL(entire_image_patch_row_major(patchId, c, r, d), - expected_row_major); - // Check that ColMajor and RowMajor agree. 
- VERIFY_IS_EQUAL(expected_col_major, expected_row_major); - } - } - } - } - } - - // 2D patch: ColMajor - patchColMajorTensorRange={{sizeDim1, 2, 2, sizeDim2*sizeDim3}}; - Tensor twod_patch_col_major(patchColMajorTensorRange); - patchTensorBuffSize =twod_patch_col_major.size()*sizeof(DataType); - DataType* gpu_data_twod_patch_col_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); - TensorMap> gpu_twod_patch_col_major(gpu_data_twod_patch_col_major, patchColMajorTensorRange); - gpu_twod_patch_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(2, 2); - sycl_device.memcpyDeviceToHost(twod_patch_col_major.data(), gpu_data_twod_patch_col_major, patchTensorBuffSize); - - VERIFY_IS_EQUAL(twod_patch_col_major.dimension(0), 2); - VERIFY_IS_EQUAL(twod_patch_col_major.dimension(1), 2); - VERIFY_IS_EQUAL(twod_patch_col_major.dimension(2), 2); - VERIFY_IS_EQUAL(twod_patch_col_major.dimension(3), 3*5); - - // 2D patch: RowMajor - patchRowMajorTensorRange={{sizeDim2*sizeDim3, 2, 2, sizeDim1}}; - Tensor twod_patch_row_major(patchRowMajorTensorRange); - patchTensorBuffSize =twod_patch_row_major.size()*sizeof(DataType); - DataType* gpu_data_twod_patch_row_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); - TensorMap> gpu_twod_patch_row_major(gpu_data_twod_patch_row_major, patchRowMajorTensorRange); - gpu_twod_patch_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(2, 2); - sycl_device.memcpyDeviceToHost(twod_patch_row_major.data(), gpu_data_twod_patch_row_major, patchTensorBuffSize); - VERIFY_IS_EQUAL(twod_patch_row_major.dimension(0), 3*5); - VERIFY_IS_EQUAL(twod_patch_row_major.dimension(1), 2); - VERIFY_IS_EQUAL(twod_patch_row_major.dimension(2), 2); - VERIFY_IS_EQUAL(twod_patch_row_major.dimension(3), 2); - - // Based on the calculation described in TensorTraits.h, padding happens to be 0. - IndexType row_padding = 0; - IndexType col_padding = 0; - IndexType stride = 1; - - for (IndexType i = 0; i < 3; ++i) { - for (IndexType j = 0; j < 5; ++j) { - IndexType patchId = i+3*j; - for (IndexType r = 0; r < 2; ++r) { - for (IndexType c = 0; c < 2; ++c) { - for (IndexType d = 0; d < 2; ++d) { - DataType expected_col_major = 0.0f; - DataType expected_row_major = 0.0f; - IndexType row_offset = r*stride + i - row_padding; - IndexType col_offset = c*stride + j - col_padding; - // ColMajor - if (row_offset >= 0 && col_offset >= 0 && row_offset < tensor_col_major.dimension(1) && col_offset < tensor_col_major.dimension(2)) { - expected_col_major = tensor_col_major(d, row_offset, col_offset); - } - if (twod_patch_col_major(d, r, c, patchId) != expected_col_major) { - std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << std::endl; - } - VERIFY_IS_EQUAL(twod_patch_col_major(d, r, c, patchId), expected_col_major); - // RowMajor - if (row_offset >= 0 && col_offset >= 0 && row_offset < tensor_row_major.dimension(1) && col_offset < tensor_row_major.dimension(0)) { - expected_row_major = tensor_row_major(col_offset, row_offset, d); - } - if (twod_patch_row_major(patchId, c, r, d) != expected_row_major) { - std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << std::endl; - } - VERIFY_IS_EQUAL(twod_patch_row_major(patchId, c, r, d), expected_row_major); - // Check that ColMajor and RowMajor agree. 
- VERIFY_IS_EQUAL(expected_col_major, expected_row_major); - } - } - } - } - } - - sycl_device.deallocate(gpu_data_col_major); - sycl_device.deallocate(gpu_data_row_major); - sycl_device.deallocate(gpu_data_single_patch_col_major); - sycl_device.deallocate(gpu_data_single_patch_row_major); - sycl_device.deallocate(gpu_data_entire_image_patch_col_major); - sycl_device.deallocate(gpu_data_entire_image_patch_row_major); - sycl_device.deallocate(gpu_data_twod_patch_col_major); - sycl_device.deallocate(gpu_data_twod_patch_row_major); -} - -template -static void test_imagenet_patches_sycl(const Eigen::SyclDevice& sycl_device) -{ - // Test the code on typical configurations used by the 'imagenet' benchmarks at - // https://github.com/soumith/convnet-benchmarks - // ColMajor - IndexType sizeDim1 = 3; - IndexType sizeDim2 = 128; - IndexType sizeDim3 = 128; - IndexType sizeDim4 = 16; - array tensorColMajorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}}; - Tensor l_in_col_major(tensorColMajorRange); - l_in_col_major.setRandom(); - - DataType* gpu_data_l_in_col_major = static_cast(sycl_device.allocate(l_in_col_major.size()*sizeof(DataType))); - TensorMap> gpu_l_in_col_major(gpu_data_l_in_col_major, tensorColMajorRange); - - sycl_device.memcpyHostToDevice(gpu_data_l_in_col_major, l_in_col_major.data(),(l_in_col_major.size())*sizeof(DataType)); - - array patchTensorRange={{sizeDim1, 11, 11, sizeDim2*sizeDim3, sizeDim4}}; - Tensor l_out_col_major(patchTensorRange); - size_t patchTensorBuffSize =l_out_col_major.size()*sizeof(DataType); - DataType* gpu_data_l_out_col_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); - TensorMap> gpu_l_out_col_major(gpu_data_l_out_col_major, patchTensorRange); - gpu_l_out_col_major.device(sycl_device)=gpu_l_in_col_major.extract_image_patches(11, 11); - sycl_device.memcpyDeviceToHost(l_out_col_major.data(), gpu_data_l_out_col_major, patchTensorBuffSize); - - VERIFY_IS_EQUAL(l_out_col_major.dimension(0), sizeDim1); - VERIFY_IS_EQUAL(l_out_col_major.dimension(1), 11); - VERIFY_IS_EQUAL(l_out_col_major.dimension(2), 11); - VERIFY_IS_EQUAL(l_out_col_major.dimension(3), sizeDim2*sizeDim3); - VERIFY_IS_EQUAL(l_out_col_major.dimension(4), sizeDim4); - - // RowMajor - patchTensorRange={{sizeDim4, sizeDim2*sizeDim3, 11, 11, sizeDim1}}; - Tensor l_out_row_major(patchTensorRange); - patchTensorBuffSize =l_out_row_major.size()*sizeof(DataType); - DataType* gpu_data_l_out_row_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); - TensorMap> gpu_l_out_row_major(gpu_data_l_out_row_major, patchTensorRange); - gpu_l_out_row_major.device(sycl_device)=gpu_l_in_col_major.swap_layout().extract_image_patches(11, 11); - sycl_device.memcpyDeviceToHost(l_out_row_major.data(), gpu_data_l_out_row_major, patchTensorBuffSize); - - VERIFY_IS_EQUAL(l_out_row_major.dimension(0), sizeDim4); - VERIFY_IS_EQUAL(l_out_row_major.dimension(1), sizeDim2*sizeDim3); - VERIFY_IS_EQUAL(l_out_row_major.dimension(2), 11); - VERIFY_IS_EQUAL(l_out_row_major.dimension(3), 11); - VERIFY_IS_EQUAL(l_out_row_major.dimension(4), sizeDim1); - - for (IndexType b = 0; b < 16; ++b) { - for (IndexType i = 0; i < 128; ++i) { - for (IndexType j = 0; j < 128; ++j) { - IndexType patchId = i+128*j; - for (IndexType c = 0; c < 11; ++c) { - for (IndexType r = 0; r < 11; ++r) { - for (IndexType d = 0; d < 3; ++d) { - DataType expected = 0.0f; - if (r-5+i >= 0 && c-5+j >= 0 && r-5+i < 128 && c-5+j < 128) { - expected = l_in_col_major(d, r-5+i, c-5+j, b); - } - // ColMajor - if (l_out_col_major(d, r, c, patchId, b) 
!= expected) { - std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; - } - VERIFY_IS_EQUAL(l_out_col_major(d, r, c, patchId, b), expected); - // RowMajor - if (l_out_row_major(b, patchId, c, r, d) != - expected) { - std::cout << "Mismatch detected at index i=" << i << " j=" << j - << " r=" << r << " c=" << c << " d=" << d << " b=" << b - << std::endl; - } - VERIFY_IS_EQUAL(l_out_row_major(b, patchId, c, r, d), - expected); - } - } - } - } - } - } - - // ColMajor - sycl_device.deallocate(gpu_data_l_in_col_major); - sycl_device.deallocate(gpu_data_l_out_col_major); - sizeDim1 = 16; - sizeDim2 = 64; - sizeDim3 = 64; - sizeDim4 = 32; - tensorColMajorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}}; - l_in_col_major.resize(tensorColMajorRange); - l_in_col_major.setRandom(); - gpu_data_l_in_col_major = static_cast(sycl_device.allocate(l_in_col_major.size()*sizeof(DataType))); - TensorMap>gpu_l_in_col_major_resize1(gpu_data_l_in_col_major, tensorColMajorRange); - - patchTensorRange={{sizeDim1, 9, 9, sizeDim2*sizeDim3, sizeDim4}}; - l_out_col_major.resize(patchTensorRange); - patchTensorBuffSize =l_out_col_major.size()*sizeof(DataType); - gpu_data_l_out_col_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); - TensorMap>gpu_l_out_col_major_resize1(gpu_data_l_out_col_major, patchTensorRange); - sycl_device.memcpyHostToDevice(gpu_data_l_in_col_major, l_in_col_major.data(),(l_in_col_major.size())*sizeof(DataType)); - gpu_l_out_col_major_resize1.device(sycl_device)=gpu_l_in_col_major_resize1.extract_image_patches(9, 9); - sycl_device.memcpyDeviceToHost(l_out_col_major.data(), gpu_data_l_out_col_major, patchTensorBuffSize); - VERIFY_IS_EQUAL(l_out_col_major.dimension(0), 16); - VERIFY_IS_EQUAL(l_out_col_major.dimension(1), 9); - VERIFY_IS_EQUAL(l_out_col_major.dimension(2), 9); - VERIFY_IS_EQUAL(l_out_col_major.dimension(3), 64*64); - VERIFY_IS_EQUAL(l_out_col_major.dimension(4), 32); - -// RowMajor - sycl_device.deallocate(gpu_data_l_out_row_major); - patchTensorRange={{sizeDim4, sizeDim2*sizeDim3, 9, 9 ,sizeDim1}}; - l_out_row_major.resize(patchTensorRange); - patchTensorBuffSize =l_out_row_major.size()*sizeof(DataType); - gpu_data_l_out_row_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); - TensorMap>gpu_l_out_row_major_resize1(gpu_data_l_out_row_major, patchTensorRange); - gpu_l_out_row_major_resize1.device(sycl_device)=gpu_l_in_col_major_resize1.swap_layout().extract_image_patches(9, 9); - sycl_device.memcpyDeviceToHost(l_out_row_major.data(), gpu_data_l_out_row_major, patchTensorBuffSize); - - VERIFY_IS_EQUAL(l_out_row_major.dimension(0), 32); - VERIFY_IS_EQUAL(l_out_row_major.dimension(1), 64*64); - VERIFY_IS_EQUAL(l_out_row_major.dimension(2), 9); - VERIFY_IS_EQUAL(l_out_row_major.dimension(3), 9); - VERIFY_IS_EQUAL(l_out_row_major.dimension(4), 16); - - for (IndexType b = 0; b < 32; ++b) { - for (IndexType i = 0; i < 64; ++i) { - for (IndexType j = 0; j < 64; ++j) { - IndexType patchId = i+64*j; - for (IndexType c = 0; c < 9; ++c) { - for (IndexType r = 0; r < 9; ++r) { - for (IndexType d = 0; d < 16; ++d) { - DataType expected = 0.0f; - if (r-4+i >= 0 && c-4+j >= 0 && r-4+i < 64 && c-4+j < 64) { - expected = l_in_col_major(d, r-4+i, c-4+j, b); - } - // ColMajor - if (l_out_col_major(d, r, c, patchId, b) != expected) { - std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; - } - 
VERIFY_IS_EQUAL(l_out_col_major(d, r, c, patchId, b), expected); - // RowMajor - if (l_out_row_major(b, patchId, c, r, d) != expected) { - std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; - } - VERIFY_IS_EQUAL(l_out_row_major(b, patchId, c, r, d), expected); - } - } - } - } - } - } - - // ColMajor - - sycl_device.deallocate(gpu_data_l_in_col_major); - sycl_device.deallocate(gpu_data_l_out_col_major); - sizeDim1 = 32; - sizeDim2 = 16; - sizeDim3 = 16; - sizeDim4 = 32; - tensorColMajorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}}; - l_in_col_major.resize(tensorColMajorRange); - l_in_col_major.setRandom(); - gpu_data_l_in_col_major = static_cast(sycl_device.allocate(l_in_col_major.size()*sizeof(DataType))); - TensorMap>gpu_l_in_col_major_resize2(gpu_data_l_in_col_major, tensorColMajorRange); - - patchTensorRange={{sizeDim1, 7, 7, sizeDim2*sizeDim3, sizeDim4}}; - l_out_col_major.resize(patchTensorRange); - patchTensorBuffSize =l_out_col_major.size()*sizeof(DataType); - gpu_data_l_out_col_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); - TensorMap>gpu_l_out_col_major_resize2(gpu_data_l_out_col_major, patchTensorRange); - sycl_device.memcpyHostToDevice(gpu_data_l_in_col_major, l_in_col_major.data(),(l_in_col_major.size())*sizeof(DataType)); - gpu_l_out_col_major_resize2.device(sycl_device)=gpu_l_in_col_major_resize2.extract_image_patches(7, 7); - sycl_device.memcpyDeviceToHost(l_out_col_major.data(), gpu_data_l_out_col_major, patchTensorBuffSize); - - VERIFY_IS_EQUAL(l_out_col_major.dimension(0), 32); - VERIFY_IS_EQUAL(l_out_col_major.dimension(1), 7); - VERIFY_IS_EQUAL(l_out_col_major.dimension(2), 7); - VERIFY_IS_EQUAL(l_out_col_major.dimension(3), 16*16); - VERIFY_IS_EQUAL(l_out_col_major.dimension(4), 32); - - // RowMajor - sycl_device.deallocate(gpu_data_l_out_row_major); - patchTensorRange={{sizeDim4, sizeDim2*sizeDim3, 7, 7 ,sizeDim1}}; - l_out_row_major.resize(patchTensorRange); - patchTensorBuffSize =l_out_row_major.size()*sizeof(DataType); - gpu_data_l_out_row_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); - TensorMap>gpu_l_out_row_major_resize2(gpu_data_l_out_row_major, patchTensorRange); - gpu_l_out_row_major_resize2.device(sycl_device)=gpu_l_in_col_major_resize2.swap_layout().extract_image_patches(7, 7); - sycl_device.memcpyDeviceToHost(l_out_row_major.data(), gpu_data_l_out_row_major, patchTensorBuffSize); - - VERIFY_IS_EQUAL(l_out_row_major.dimension(0), 32); - VERIFY_IS_EQUAL(l_out_row_major.dimension(1), 16*16); - VERIFY_IS_EQUAL(l_out_row_major.dimension(2), 7); - VERIFY_IS_EQUAL(l_out_row_major.dimension(3), 7); - VERIFY_IS_EQUAL(l_out_row_major.dimension(4), 32); - - for (IndexType b = 0; b < 32; ++b) { - for (IndexType i = 0; i < 16; ++i) { - for (IndexType j = 0; j < 16; ++j) { - IndexType patchId = i+16*j; - for (IndexType c = 0; c < 7; ++c) { - for (IndexType r = 0; r < 7; ++r) { - for (IndexType d = 0; d < 32; ++d) { - DataType expected = 0.0f; - if (r-3+i >= 0 && c-3+j >= 0 && r-3+i < 16 && c-3+j < 16) { - expected = l_in_col_major(d, r-3+i, c-3+j, b); - } - // ColMajor - if (l_out_col_major(d, r, c, patchId, b) != expected) { - std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; - } - VERIFY_IS_EQUAL(l_out_col_major(d, r, c, patchId, b), expected); - // RowMajor - if (l_out_row_major(b, patchId, c, r, d) != expected) { - std::cout << "Mismatch detected at index i=" << 
i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; - } - VERIFY_IS_EQUAL(l_out_row_major(b, patchId, c, r, d), expected); - } - } - } - } - } - } - - // ColMajor - sycl_device.deallocate(gpu_data_l_in_col_major); - sycl_device.deallocate(gpu_data_l_out_col_major); - sizeDim1 = 64; - sizeDim2 = 13; - sizeDim3 = 13; - sizeDim4 = 32; - tensorColMajorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}}; - l_in_col_major.resize(tensorColMajorRange); - l_in_col_major.setRandom(); - gpu_data_l_in_col_major = static_cast(sycl_device.allocate(l_in_col_major.size()*sizeof(DataType))); - TensorMap>gpu_l_in_col_major_resize3(gpu_data_l_in_col_major, tensorColMajorRange); - - patchTensorRange={{sizeDim1, 3, 3, sizeDim2*sizeDim3, sizeDim4}}; - l_out_col_major.resize(patchTensorRange); - patchTensorBuffSize =l_out_col_major.size()*sizeof(DataType); - gpu_data_l_out_col_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); - TensorMap>gpu_l_out_col_major_resize3(gpu_data_l_out_col_major, patchTensorRange); - sycl_device.memcpyHostToDevice(gpu_data_l_in_col_major, l_in_col_major.data(),(l_in_col_major.size())*sizeof(DataType)); - gpu_l_out_col_major_resize3.device(sycl_device)=gpu_l_in_col_major_resize3.extract_image_patches(3, 3); - sycl_device.memcpyDeviceToHost(l_out_col_major.data(), gpu_data_l_out_col_major, patchTensorBuffSize); - - VERIFY_IS_EQUAL(l_out_col_major.dimension(0), 64); - VERIFY_IS_EQUAL(l_out_col_major.dimension(1), 3); - VERIFY_IS_EQUAL(l_out_col_major.dimension(2), 3); - VERIFY_IS_EQUAL(l_out_col_major.dimension(3), 13*13); - VERIFY_IS_EQUAL(l_out_col_major.dimension(4), 32); - - // RowMajor - sycl_device.deallocate(gpu_data_l_out_row_major); - patchTensorRange={{sizeDim4, sizeDim2*sizeDim3, 3, 3 ,sizeDim1}}; - l_out_row_major.resize(patchTensorRange); - patchTensorBuffSize =l_out_row_major.size()*sizeof(DataType); - gpu_data_l_out_row_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); - TensorMap>gpu_l_out_row_major_resize3(gpu_data_l_out_row_major, patchTensorRange); - gpu_l_out_row_major_resize3.device(sycl_device)=gpu_l_in_col_major_resize3.swap_layout().extract_image_patches(3, 3); - sycl_device.memcpyDeviceToHost(l_out_row_major.data(), gpu_data_l_out_row_major, patchTensorBuffSize); - - VERIFY_IS_EQUAL(l_out_row_major.dimension(0), 32); - VERIFY_IS_EQUAL(l_out_row_major.dimension(1), 13*13); - VERIFY_IS_EQUAL(l_out_row_major.dimension(2), 3); - VERIFY_IS_EQUAL(l_out_row_major.dimension(3), 3); - VERIFY_IS_EQUAL(l_out_row_major.dimension(4), 64); - - for (IndexType b = 0; b < 32; ++b) { - for (IndexType i = 0; i < 13; ++i) { - for (IndexType j = 0; j < 13; ++j) { - IndexType patchId = i+13*j; - for (IndexType c = 0; c < 3; ++c) { - for (IndexType r = 0; r < 3; ++r) { - for (IndexType d = 0; d < 64; ++d) { - DataType expected = 0.0f; - if (r-1+i >= 0 && c-1+j >= 0 && r-1+i < 13 && c-1+j < 13) { - expected = l_in_col_major(d, r-1+i, c-1+j, b); - } - // ColMajor - if (l_out_col_major(d, r, c, patchId, b) != expected) { - std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; - } - VERIFY_IS_EQUAL(l_out_col_major(d, r, c, patchId, b), expected); - // RowMajor - if (l_out_row_major(b, patchId, c, r, d) != expected) { - std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; - } - VERIFY_IS_EQUAL(l_out_row_major(b, patchId, c, r, d), expected); - } - } - } - } - } - } - 
sycl_device.deallocate(gpu_data_l_in_col_major); - sycl_device.deallocate(gpu_data_l_out_col_major); - sycl_device.deallocate(gpu_data_l_out_row_major); -} - - -template void sycl_tensor_image_patch_test_per_device(dev_Selector s){ -QueueInterface queueInterface(s); -auto sycl_device = Eigen::SyclDevice(&queueInterface); -test_simple_image_patch_sycl(sycl_device); -test_patch_padding_valid_sycl(sycl_device); -test_patch_padding_valid_same_value_sycl(sycl_device); -test_patch_padding_same_sycl(sycl_device); -test_patch_no_extra_dim_sycl(sycl_device); -test_imagenet_patches_sycl(sycl_device); -} -void test_cxx11_tensor_image_patchOP_sycl() -{ -for (const auto& device :Eigen::get_sycl_supported_devices()) { - CALL_SUBTEST(sycl_tensor_image_patch_test_per_device(device)); -} -} diff --git a/unsupported/test/cxx11_tensor_image_patch_sycl.cpp b/unsupported/test/cxx11_tensor_image_patch_sycl.cpp new file mode 100644 index 000000000..e5ca4e388 --- /dev/null +++ b/unsupported/test/cxx11_tensor_image_patch_sycl.cpp @@ -0,0 +1,1092 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX +#define EIGEN_TEST_FUNC cxx11_tensor_image_patchOP_sycl +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t +#define EIGEN_USE_SYCL + +#include "main.h" +#include + +using Eigen::Tensor; +static const int DataLayout = ColMajor; + +template +static void test_simple_image_patch_sycl(const Eigen::SyclDevice& sycl_device) +{ + IndexType sizeDim1 = 2; + IndexType sizeDim2 = 3; + IndexType sizeDim3 = 5; + IndexType sizeDim4 = 7; + array tensorColMajorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}}; + array tensorRowMajorRange = {{sizeDim4, sizeDim3, sizeDim2, sizeDim1}}; + Tensor tensor_col_major(tensorColMajorRange); + Tensor tensor_row_major(tensorRowMajorRange); + tensor_col_major.setRandom(); + + DataType* gpu_data_col_major = static_cast(sycl_device.allocate(tensor_col_major.size()*sizeof(DataType))); + DataType* gpu_data_row_major = static_cast(sycl_device.allocate(tensor_row_major.size()*sizeof(DataType))); + TensorMap> gpu_col_major(gpu_data_col_major, tensorColMajorRange); + TensorMap> gpu_row_major(gpu_data_row_major, tensorRowMajorRange); + + sycl_device.memcpyHostToDevice(gpu_data_col_major, tensor_col_major.data(),(tensor_col_major.size())*sizeof(DataType)); + gpu_row_major.device(sycl_device)=gpu_col_major.swap_layout(); + sycl_device.memcpyDeviceToHost(tensor_row_major.data(), gpu_data_row_major, (tensor_col_major.size())*sizeof(DataType)); + + VERIFY_IS_EQUAL(tensor_col_major.dimension(0), tensor_row_major.dimension(3)); + VERIFY_IS_EQUAL(tensor_col_major.dimension(1), tensor_row_major.dimension(2)); + VERIFY_IS_EQUAL(tensor_col_major.dimension(2), tensor_row_major.dimension(1)); + VERIFY_IS_EQUAL(tensor_col_major.dimension(3), tensor_row_major.dimension(0)); + + // Single pixel patch: ColMajor + array patchColMajorTensorRange={{sizeDim1, 1, 1, sizeDim2*sizeDim3, sizeDim4}}; + Tensor single_patch_col_major(patchColMajorTensorRange); + size_t patchTensorBuffSize =single_patch_col_major.size()*sizeof(DataType); + DataType* 
gpu_data_single_patch_col_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); + TensorMap> gpu_single_patch_col_major(gpu_data_single_patch_col_major, patchColMajorTensorRange); + gpu_single_patch_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(1, 1); + sycl_device.memcpyDeviceToHost(single_patch_col_major.data(), gpu_data_single_patch_col_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(single_patch_col_major.dimension(0), 2); + VERIFY_IS_EQUAL(single_patch_col_major.dimension(1), 1); + VERIFY_IS_EQUAL(single_patch_col_major.dimension(2), 1); + VERIFY_IS_EQUAL(single_patch_col_major.dimension(3), 3*5); + VERIFY_IS_EQUAL(single_patch_col_major.dimension(4), 7); + + // Single pixel patch: RowMajor + array patchRowMajorTensorRange={{sizeDim4, sizeDim2*sizeDim3, 1, 1, sizeDim1}}; + Tensor single_patch_row_major(patchRowMajorTensorRange); + patchTensorBuffSize =single_patch_row_major.size()*sizeof(DataType); + DataType* gpu_data_single_patch_row_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); + TensorMap> gpu_single_patch_row_major(gpu_data_single_patch_row_major, patchRowMajorTensorRange); + gpu_single_patch_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(1, 1); + sycl_device.memcpyDeviceToHost(single_patch_row_major.data(), gpu_data_single_patch_row_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(single_patch_row_major.dimension(0), 7); + VERIFY_IS_EQUAL(single_patch_row_major.dimension(1), 3*5); + VERIFY_IS_EQUAL(single_patch_row_major.dimension(2), 1); + VERIFY_IS_EQUAL(single_patch_row_major.dimension(3), 1); + VERIFY_IS_EQUAL(single_patch_row_major.dimension(4), 2); + + for (IndexType i = 0; i < tensor_col_major.size(); ++i) { + // ColMajor + if (tensor_col_major.data()[i] != single_patch_col_major.data()[i]) { + std::cout << "Mismatch detected at index colmajor " << i << " : " + << tensor_col_major.data()[i] << " vs " << single_patch_col_major.data()[i] + << std::endl; + } + VERIFY_IS_EQUAL(single_patch_col_major.data()[i], tensor_col_major.data()[i]); + // RowMajor + if (tensor_row_major.data()[i] != single_patch_row_major.data()[i]) { + std::cout << "Mismatch detected at index row major" << i << " : " + << tensor_row_major.data()[i] << " vs " + << single_patch_row_major.data()[i] << std::endl; + } + VERIFY_IS_EQUAL(single_patch_row_major.data()[i], + tensor_row_major.data()[i]); + VERIFY_IS_EQUAL(tensor_col_major.data()[i], tensor_row_major.data()[i]); + VERIFY_IS_EQUAL(single_patch_col_major.data()[i], + single_patch_row_major.data()[i]); + } + + + // Entire image patch: ColMajor + patchColMajorTensorRange={{sizeDim1, sizeDim2, sizeDim3, sizeDim2*sizeDim3, sizeDim4}}; + Tensor entire_image_patch_col_major(patchColMajorTensorRange); + patchTensorBuffSize =entire_image_patch_col_major.size()*sizeof(DataType); + DataType* gpu_data_entire_image_patch_col_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); + TensorMap> gpu_entire_image_patch_col_major(gpu_data_entire_image_patch_col_major, patchColMajorTensorRange); + gpu_entire_image_patch_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(3, 5); + sycl_device.memcpyDeviceToHost(entire_image_patch_col_major.data(), gpu_data_entire_image_patch_col_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(0), 2); + VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(1), 3); + VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(2), 5); + VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(3), 3*5); + 
VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(4), 7); + + // Entire image patch: RowMajor + patchRowMajorTensorRange={{sizeDim4, sizeDim2*sizeDim3, sizeDim3, sizeDim2, sizeDim1}}; + Tensor entire_image_patch_row_major(patchRowMajorTensorRange); + patchTensorBuffSize =entire_image_patch_row_major.size()*sizeof(DataType); + DataType* gpu_data_entire_image_patch_row_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); + TensorMap> gpu_entire_image_patch_row_major(gpu_data_entire_image_patch_row_major, patchRowMajorTensorRange); + gpu_entire_image_patch_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(3, 5); + sycl_device.memcpyDeviceToHost(entire_image_patch_row_major.data(), gpu_data_entire_image_patch_row_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(0), 7); + VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(1), 3*5); + VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(2), 5); + VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(3), 3); + VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(4), 2); + + for (IndexType i = 0; i < 3; ++i) { + for (IndexType j = 0; j < 5; ++j) { + IndexType patchId = i+3*j; + for (IndexType r = 0; r < 3; ++r) { + for (IndexType c = 0; c < 5; ++c) { + for (IndexType d = 0; d < 2; ++d) { + for (IndexType b = 0; b < 7; ++b) { + DataType expected_col_major = 0.0f; + DataType expected_row_major = 0.0f; + if (r-1+i >= 0 && c-2+j >= 0 && r-1+i < 3 && c-2+j < 5) { + expected_col_major = tensor_col_major(d, r-1+i, c-2+j, b); + expected_row_major = tensor_row_major(b, c-2+j, r-1+i, d); + } + // ColMajor + if (entire_image_patch_col_major(d, r, c, patchId, b) != expected_col_major) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(entire_image_patch_col_major(d, r, c, patchId, b), expected_col_major); + // RowMajor + if (entire_image_patch_row_major(b, patchId, c, r, d) != + expected_row_major) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j + << " r=" << r << " c=" << c << " d=" << d << " b=" << b + << std::endl; + } + VERIFY_IS_EQUAL(entire_image_patch_row_major(b, patchId, c, r, d), + expected_row_major); + // Check that ColMajor and RowMajor agree. 
+              VERIFY_IS_EQUAL(expected_col_major, expected_row_major);
+            }
+          }
+        }
+      }
+    }
+  }
+
+  // 2D patch: ColMajor
+  patchColMajorTensorRange={{sizeDim1, 2, 2, sizeDim2*sizeDim3, sizeDim4}};
+  Tensor<DataType, 5, DataLayout, IndexType> twod_patch_col_major(patchColMajorTensorRange);
+  patchTensorBuffSize =twod_patch_col_major.size()*sizeof(DataType);
+  DataType* gpu_data_twod_patch_col_major  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+  TensorMap<Tensor<DataType, 5, DataLayout, IndexType>> gpu_twod_patch_col_major(gpu_data_twod_patch_col_major, patchColMajorTensorRange);
+  gpu_twod_patch_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(2, 2);
+  sycl_device.memcpyDeviceToHost(twod_patch_col_major.data(), gpu_data_twod_patch_col_major, patchTensorBuffSize);
+
+  VERIFY_IS_EQUAL(twod_patch_col_major.dimension(0), 2);
+  VERIFY_IS_EQUAL(twod_patch_col_major.dimension(1), 2);
+  VERIFY_IS_EQUAL(twod_patch_col_major.dimension(2), 2);
+  VERIFY_IS_EQUAL(twod_patch_col_major.dimension(3), 3*5);
+  VERIFY_IS_EQUAL(twod_patch_col_major.dimension(4), 7);
+
+  // 2D patch: RowMajor
+  patchRowMajorTensorRange={{sizeDim4, sizeDim2*sizeDim3, 2, 2, sizeDim1}};
+  Tensor<DataType, 5, RowMajor, IndexType> twod_patch_row_major(patchRowMajorTensorRange);
+  patchTensorBuffSize =twod_patch_row_major.size()*sizeof(DataType);
+  DataType* gpu_data_twod_patch_row_major  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+  TensorMap<Tensor<DataType, 5, RowMajor, IndexType>> gpu_twod_patch_row_major(gpu_data_twod_patch_row_major, patchRowMajorTensorRange);
+  gpu_twod_patch_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(2, 2);
+  sycl_device.memcpyDeviceToHost(twod_patch_row_major.data(), gpu_data_twod_patch_row_major, patchTensorBuffSize);
+
+  VERIFY_IS_EQUAL(twod_patch_row_major.dimension(0), 7);
+  VERIFY_IS_EQUAL(twod_patch_row_major.dimension(1), 3*5);
+  VERIFY_IS_EQUAL(twod_patch_row_major.dimension(2), 2);
+  VERIFY_IS_EQUAL(twod_patch_row_major.dimension(3), 2);
+  VERIFY_IS_EQUAL(twod_patch_row_major.dimension(4), 2);
+
+
+  // Based on the calculation described in TensorTraits.h, padding happens to be 0.
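Aside: a host-only sketch (plain CPU evaluator, no SYCL; shapes mirror this test) of what the 2x2 SAME-padded patches above contain. With the default stride of 1 the top/left padding works out to zero, so each pixel anchors the top-left corner of its patch, and only patches touching the bottom/right edge read the padding value (0) rather than image data:

#include <unsupported/Eigen/CXX11/Tensor>
#include <cassert>

int main() {
  Eigen::Tensor<float, 4> in(2, 3, 5, 7);    // depth, rows, cols, batch (ColMajor)
  in.setConstant(1.0f);
  // Defaults: stride 1, SAME padding, padding value 0 -> one 2x2 patch per pixel.
  Eigen::Tensor<float, 5> p = in.extract_image_patches(2, 2);
  assert(p.dimension(3) == 3 * 5);
  // The last patch is anchored at the bottom-right pixel: its (0,0) element is
  // real data, while its (1,1) element falls outside the image and is zero.
  assert(p(0, 0, 0, 3 * 5 - 1, 0) == 1.0f);
  assert(p(0, 1, 1, 3 * 5 - 1, 0) == 0.0f);
  return 0;
}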
+  IndexType row_padding = 0;
+  IndexType col_padding = 0;
+  IndexType stride = 1;
+
+  for (IndexType i = 0; i < 3; ++i) {
+    for (IndexType j = 0; j < 5; ++j) {
+      IndexType patchId = i+3*j;
+      for (IndexType r = 0; r < 2; ++r) {
+        for (IndexType c = 0; c < 2; ++c) {
+          for (IndexType d = 0; d < 2; ++d) {
+            for (IndexType b = 0; b < 7; ++b) {
+              DataType expected_col_major = 0.0f;
+              DataType expected_row_major = 0.0f;
+              IndexType row_offset = r*stride + i - row_padding;
+              IndexType col_offset = c*stride + j - col_padding;
+              // ColMajor
+              if (row_offset >= 0 && col_offset >= 0 && row_offset < tensor_col_major.dimension(1) && col_offset < tensor_col_major.dimension(2)) {
+                expected_col_major = tensor_col_major(d, row_offset, col_offset, b);
+              }
+              if (twod_patch_col_major(d, r, c, patchId, b) != expected_col_major) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
+              VERIFY_IS_EQUAL(twod_patch_col_major(d, r, c, patchId, b), expected_col_major);
+
+              // RowMajor
+              if (row_offset >= 0 && col_offset >= 0 && row_offset < tensor_row_major.dimension(2) && col_offset < tensor_row_major.dimension(1)) {
+                expected_row_major = tensor_row_major(b, col_offset, row_offset, d);
+
+              }
+              if (twod_patch_row_major(b, patchId, c, r, d) != expected_row_major) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
+              VERIFY_IS_EQUAL(twod_patch_row_major(b, patchId, c, r, d), expected_row_major);
+              // Check that ColMajor and RowMajor agree.
+              VERIFY_IS_EQUAL(expected_col_major, expected_row_major);
+            }
+          }
+        }
+      }
+    }
+  }
+
+  sycl_device.deallocate(gpu_data_col_major);
+  sycl_device.deallocate(gpu_data_row_major);
+  sycl_device.deallocate(gpu_data_single_patch_col_major);
+  sycl_device.deallocate(gpu_data_single_patch_row_major);
+  sycl_device.deallocate(gpu_data_entire_image_patch_col_major);
+  sycl_device.deallocate(gpu_data_entire_image_patch_row_major);
+  sycl_device.deallocate(gpu_data_twod_patch_col_major);
+  sycl_device.deallocate(gpu_data_twod_patch_row_major);
+
+}
+
+
+// Verifies VALID padding (no padding) with incrementing values.
+template <typename DataType, typename IndexType>
+static void test_patch_padding_valid_sycl(const Eigen::SyclDevice& sycl_device){
+  IndexType input_depth = 3;
+  IndexType input_rows = 3;
+  IndexType input_cols = 3;
+  IndexType input_batches = 1;
+  IndexType ksize = 2;  // Corresponds to the Rows and Cols for tensor.extract_image_patches<>.
+  IndexType stride = 2;  // Only same stride is supported.
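For reference, the number of PADDING_VALID patch positions per axis follows, as far as I can tell from the image-patch evaluator, ceil((in - k + 1) / stride); this is an assumption about the implementation, not something stated in this patch. With the 3x3 input, 2x2 kernel and stride 2 configured above it gives a single position per axis, i.e. exactly one patch, which is what the dimension checks in this test expect. A tiny sketch of that arithmetic:

#include <cassert>

// Sketch of the assumed PADDING_VALID rule: positions = ceil((in - k + 1) / stride).
static long valid_positions(long in, long k, long stride) {
  return (in - k + 1 + stride - 1) / stride;   // integer ceil, assumes in >= k
}

int main() {
  // This test: 3x3 spatial input, 2x2 kernel, stride 2 -> a single patch.
  assert(valid_positions(3, 2, 2) * valid_positions(3, 2, 2) == 1);
  return 0;
}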
+ + array tensorColMajorRange = {{input_depth, input_rows, input_cols, input_batches}}; + array tensorRowMajorRange = {{input_batches, input_cols, input_rows, input_depth}}; + Tensor tensor_col_major(tensorColMajorRange); + Tensor tensor_row_major(tensorRowMajorRange); + + DataType* gpu_data_col_major = static_cast(sycl_device.allocate(tensor_col_major.size()*sizeof(DataType))); + DataType* gpu_data_row_major = static_cast(sycl_device.allocate(tensor_row_major.size()*sizeof(DataType))); + TensorMap> gpu_col_major(gpu_data_col_major, tensorColMajorRange); + TensorMap> gpu_row_major(gpu_data_row_major, tensorRowMajorRange); + + sycl_device.memcpyHostToDevice(gpu_data_col_major, tensor_col_major.data(),(tensor_col_major.size())*sizeof(DataType)); + gpu_row_major.device(sycl_device)=gpu_col_major.swap_layout(); + sycl_device.memcpyDeviceToHost(tensor_row_major.data(), gpu_data_row_major, (tensor_col_major.size())*sizeof(DataType)); + + VERIFY_IS_EQUAL(tensor_col_major.dimension(0), tensor_row_major.dimension(3)); + VERIFY_IS_EQUAL(tensor_col_major.dimension(1), tensor_row_major.dimension(2)); + VERIFY_IS_EQUAL(tensor_col_major.dimension(2), tensor_row_major.dimension(1)); + VERIFY_IS_EQUAL(tensor_col_major.dimension(3), tensor_row_major.dimension(0)); + + // Initializes tensor with incrementing numbers. + for (IndexType i = 0; i < tensor_col_major.size(); ++i) { + tensor_col_major.data()[i] = i + 1; + } + // ColMajor + array patchColMajorTensorRange={{input_depth, ksize, ksize, 1, input_batches}}; + Tensor result_col_major(patchColMajorTensorRange); + size_t patchTensorBuffSize =result_col_major.size()*sizeof(DataType); + DataType* gpu_data_result_col_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); + TensorMap> gpu_result_col_major(gpu_data_result_col_major, patchColMajorTensorRange); + gpu_result_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(ksize, ksize, stride, stride, 1, 1, PADDING_VALID); + sycl_device.memcpyDeviceToHost(result_col_major.data(), gpu_data_result_col_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(result_col_major.dimension(0), input_depth); // depth + VERIFY_IS_EQUAL(result_col_major.dimension(1), ksize); // kernel rows + VERIFY_IS_EQUAL(result_col_major.dimension(2), ksize); // kernel cols + VERIFY_IS_EQUAL(result_col_major.dimension(3), 1); // number of patches + VERIFY_IS_EQUAL(result_col_major.dimension(4), input_batches); // number of batches + + // RowMajor + array patchRowMajorTensorRange={{input_batches, 1, ksize, ksize, input_depth }}; + Tensor result_row_major(patchRowMajorTensorRange); + patchTensorBuffSize =result_row_major.size()*sizeof(DataType); + DataType* gpu_data_result_row_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); + TensorMap> gpu_result_row_major(gpu_data_result_row_major, patchRowMajorTensorRange); + gpu_result_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(ksize, ksize, stride, stride, 1, 1, PADDING_VALID); + sycl_device.memcpyDeviceToHost(result_row_major.data(), gpu_data_result_row_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(result_col_major.dimension(0), result_row_major.dimension(4)); + VERIFY_IS_EQUAL(result_col_major.dimension(1), result_row_major.dimension(3)); + VERIFY_IS_EQUAL(result_col_major.dimension(2), result_row_major.dimension(2)); + VERIFY_IS_EQUAL(result_col_major.dimension(3), result_row_major.dimension(1)); + VERIFY_IS_EQUAL(result_col_major.dimension(4), result_row_major.dimension(0)); + + // No padding is carried out. 
+ IndexType row_padding = 0; + IndexType col_padding = 0; + + for (IndexType i = 0; (i+stride+ksize-1) < input_rows; i += stride) { // input rows + for (IndexType j = 0; (j+stride+ksize-1) < input_cols; j += stride) { // input cols + IndexType patchId = i+input_rows*j; + for (IndexType r = 0; r < ksize; ++r) { // patch rows + for (IndexType c = 0; c < ksize; ++c) { // patch cols + for (IndexType d = 0; d < input_depth; ++d) { // depth + for (IndexType b = 0; b < input_batches; ++b) { // batch + DataType expected_col_major = 0.0f; + DataType expected_row_major = 0.0f; + IndexType row_offset = r + i - row_padding; + IndexType col_offset = c + j - col_padding; + if (row_offset >= 0 && col_offset >= 0 && row_offset < input_rows && col_offset < input_cols) { + expected_col_major = tensor_col_major(d, row_offset, col_offset, b); + expected_row_major = tensor_row_major(b, col_offset, row_offset, d); + } + // ColMajor + if (result_col_major(d, r, c, patchId, b) != expected_col_major) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(result_col_major(d, r, c, patchId, b), expected_col_major); + // RowMajor + if (result_row_major(b, patchId, c, r, d) != expected_row_major) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(result_row_major(b, patchId, c, r, d), expected_row_major); + // Check that ColMajor and RowMajor agree. + VERIFY_IS_EQUAL(expected_col_major, expected_row_major); + } + } + } + } + } + } + sycl_device.deallocate(gpu_data_col_major); + sycl_device.deallocate(gpu_data_row_major); + sycl_device.deallocate(gpu_data_result_col_major); + sycl_device.deallocate(gpu_data_result_row_major); +} + +// Verifies VALID padding (no padding) with the same value. +template +static void test_patch_padding_valid_same_value_sycl(const Eigen::SyclDevice& sycl_device){ + IndexType input_depth = 1; + IndexType input_rows = 5; + IndexType input_cols = 5; + IndexType input_batches = 2; + IndexType ksize = 3; // Corresponds to the Rows and Cols for tensor.extract_image_patches<>. + IndexType stride = 2; // Only same stride is supported. 
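This configuration can also be exercised without a SYCL device. A minimal host-only sketch (plain CPU evaluator) using the same extract_image_patches call shape as the device expression in this test, with a constant-valued input as the test name suggests; it checks the four expected VALID patches and that every patch element carries the constant:

#include <unsupported/Eigen/CXX11/Tensor>
#include <cassert>

int main() {
  Eigen::Tensor<float, 4> in(1, 5, 5, 2);   // depth, rows, cols, batch (ColMajor)
  in.setConstant(11.0f);
  Eigen::Tensor<float, 5> out =
      in.extract_image_patches(3, 3, 2, 2, 1, 1, Eigen::PADDING_VALID);
  assert(out.dimension(3) == 4);            // 2 x 2 valid positions
  for (Eigen::Index i = 0; i < out.size(); ++i)
    assert(out.data()[i] == 11.0f);         // VALID never reads padding
  return 0;
}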
+ // ColMajor + + array tensorColMajorRange = {{input_depth, input_rows, input_cols, input_batches}}; + array tensorRowMajorRange = {{input_batches, input_cols, input_rows, input_depth}}; + Tensor tensor_col_major(tensorColMajorRange); + Tensor tensor_row_major(tensorRowMajorRange); + + DataType* gpu_data_col_major = static_cast(sycl_device.allocate(tensor_col_major.size()*sizeof(DataType))); + DataType* gpu_data_row_major = static_cast(sycl_device.allocate(tensor_row_major.size()*sizeof(DataType))); + TensorMap> gpu_col_major(gpu_data_col_major, tensorColMajorRange); + TensorMap> gpu_row_major(gpu_data_row_major, tensorRowMajorRange); + gpu_col_major.device(sycl_device)=gpu_col_major.constant(11.0f); + gpu_row_major.device(sycl_device)=gpu_col_major.swap_layout(); + sycl_device.memcpyDeviceToHost(tensor_col_major.data(), gpu_data_col_major, (tensor_col_major.size())*sizeof(DataType)); + sycl_device.memcpyDeviceToHost(tensor_row_major.data(), gpu_data_row_major, (tensor_row_major.size())*sizeof(DataType)); + VERIFY_IS_EQUAL(tensor_col_major.dimension(0), tensor_row_major.dimension(3)); + VERIFY_IS_EQUAL(tensor_col_major.dimension(1), tensor_row_major.dimension(2)); + VERIFY_IS_EQUAL(tensor_col_major.dimension(2), tensor_row_major.dimension(1)); + VERIFY_IS_EQUAL(tensor_col_major.dimension(3), tensor_row_major.dimension(0)); + + array patchColMajorTensorRange={{input_depth, ksize, ksize, 4, input_batches}}; + Tensor result_col_major(patchColMajorTensorRange); + size_t patchTensorBuffSize =result_col_major.size()*sizeof(DataType); + DataType* gpu_data_result_col_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); + TensorMap> gpu_result_col_major(gpu_data_result_col_major, patchColMajorTensorRange); + gpu_result_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(ksize, ksize, stride, stride, 1, 1, PADDING_VALID); + sycl_device.memcpyDeviceToHost(result_col_major.data(), gpu_data_result_col_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(result_col_major.dimension(0), input_depth); // depth + VERIFY_IS_EQUAL(result_col_major.dimension(1), ksize); // kernel rows + VERIFY_IS_EQUAL(result_col_major.dimension(2), ksize); // kernel cols + VERIFY_IS_EQUAL(result_col_major.dimension(3), 4); // number of patches + VERIFY_IS_EQUAL(result_col_major.dimension(4), input_batches); // number of batches + + // RowMajor + array patchRowMajorTensorRange={{input_batches, 4, ksize, ksize, input_depth }}; + Tensor result_row_major(patchRowMajorTensorRange); + patchTensorBuffSize =result_row_major.size()*sizeof(DataType); + DataType* gpu_data_result_row_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); + TensorMap> gpu_result_row_major(gpu_data_result_row_major, patchRowMajorTensorRange); + gpu_result_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(ksize, ksize, stride, stride, 1, 1, PADDING_VALID); + sycl_device.memcpyDeviceToHost(result_row_major.data(), gpu_data_result_row_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(result_col_major.dimension(0), result_row_major.dimension(4)); + VERIFY_IS_EQUAL(result_col_major.dimension(1), result_row_major.dimension(3)); + VERIFY_IS_EQUAL(result_col_major.dimension(2), result_row_major.dimension(2)); + VERIFY_IS_EQUAL(result_col_major.dimension(3), result_row_major.dimension(1)); + VERIFY_IS_EQUAL(result_col_major.dimension(4), result_row_major.dimension(0)); + + // No padding is carried out. 
+ IndexType row_padding = 0; + IndexType col_padding = 0; + + for (IndexType i = 0; (i+stride+ksize-1) <= input_rows; i += stride) { // input rows + for (IndexType j = 0; (j+stride+ksize-1) <= input_cols; j += stride) { // input cols + IndexType patchId = i+input_rows*j; + for (IndexType r = 0; r < ksize; ++r) { // patch rows + for (IndexType c = 0; c < ksize; ++c) { // patch cols + for (IndexType d = 0; d < input_depth; ++d) { // depth + for (IndexType b = 0; b < input_batches; ++b) { // batch + DataType expected_col_major = 0.0f; + DataType expected_row_major = 0.0f; + IndexType row_offset = r + i - row_padding; + IndexType col_offset = c + j - col_padding; + if (row_offset >= 0 && col_offset >= 0 && row_offset < input_rows && col_offset < input_cols) { + expected_col_major = tensor_col_major(d, row_offset, col_offset, b); + expected_row_major = tensor_row_major(b, col_offset, row_offset, d); + } + // ColMajor + if (result_col_major(d, r, c, patchId, b) != expected_col_major) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(result_col_major(d, r, c, patchId, b), expected_col_major); + // RowMajor + if (result_row_major(b, patchId, c, r, d) != expected_row_major) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(result_row_major(b, patchId, c, r, d), expected_row_major); + // Check that ColMajor and RowMajor agree. + VERIFY_IS_EQUAL(expected_col_major, expected_row_major); + } + } + } + } + } + } +} + +// Verifies SAME padding. +template +static void test_patch_padding_same_sycl(const Eigen::SyclDevice& sycl_device){ + IndexType input_depth = 3; + IndexType input_rows = 4; + IndexType input_cols = 2; + IndexType input_batches = 1; + IndexType ksize = 2; // Corresponds to the Rows and Cols for tensor.extract_image_patches<>. + IndexType stride = 2; // Only same stride is supported. + + // ColMajor + array tensorColMajorRange = {{input_depth, input_rows, input_cols, input_batches}}; + array tensorRowMajorRange = {{input_batches, input_cols, input_rows, input_depth}}; + Tensor tensor_col_major(tensorColMajorRange); + Tensor tensor_row_major(tensorRowMajorRange); + + DataType* gpu_data_col_major = static_cast(sycl_device.allocate(tensor_col_major.size()*sizeof(DataType))); + DataType* gpu_data_row_major = static_cast(sycl_device.allocate(tensor_row_major.size()*sizeof(DataType))); + TensorMap> gpu_col_major(gpu_data_col_major, tensorColMajorRange); + TensorMap> gpu_row_major(gpu_data_row_major, tensorRowMajorRange); + + sycl_device.memcpyHostToDevice(gpu_data_col_major, tensor_col_major.data(),(tensor_col_major.size())*sizeof(DataType)); + gpu_row_major.device(sycl_device)=gpu_col_major.swap_layout(); + sycl_device.memcpyDeviceToHost(tensor_row_major.data(), gpu_data_row_major, (tensor_col_major.size())*sizeof(DataType)); + + VERIFY_IS_EQUAL(tensor_col_major.dimension(0), tensor_row_major.dimension(3)); + VERIFY_IS_EQUAL(tensor_col_major.dimension(1), tensor_row_major.dimension(2)); + VERIFY_IS_EQUAL(tensor_col_major.dimension(2), tensor_row_major.dimension(1)); + VERIFY_IS_EQUAL(tensor_col_major.dimension(3), tensor_row_major.dimension(0)); + + // Initializes tensor with incrementing numbers. 
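The RowMajor side of each of these tests is generated on the device by swap_layout(), as above; the relationship it relies on can be sanity-checked with the CPU evaluator alone. A small sketch (assumes only the unsupported Tensor module) showing that the dimensions come out reversed and that element (d, r, c, b) of the ColMajor view is element (b, c, r, d) of the RowMajor one:

#include <unsupported/Eigen/CXX11/Tensor>
#include <cassert>

int main() {
  Eigen::Tensor<float, 4, Eigen::ColMajor> cm(3, 4, 2, 1);
  cm.setRandom();
  // Same underlying data viewed with the opposite layout: dimension order reverses.
  Eigen::Tensor<float, 4, Eigen::RowMajor> rm = cm.swap_layout();
  assert(rm.dimension(0) == 1 && rm.dimension(1) == 2 &&
         rm.dimension(2) == 4 && rm.dimension(3) == 3);
  // Indices reverse as well: cm(a, b, c, d) == rm(d, c, b, a).
  assert(rm(0, 1, 2, 1) == cm(1, 2, 1, 0));
  return 0;
}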
+ for (IndexType i = 0; i < tensor_col_major.size(); ++i) { + tensor_col_major.data()[i] = i + 1; + } + +array patchColMajorTensorRange={{input_depth, ksize, ksize, 2, input_batches}}; +Tensor result_col_major(patchColMajorTensorRange); +size_t patchTensorBuffSize =result_col_major.size()*sizeof(DataType); +DataType* gpu_data_result_col_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); +TensorMap> gpu_result_col_major(gpu_data_result_col_major, patchColMajorTensorRange); +gpu_result_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(ksize, ksize, stride, stride, PADDING_SAME); +sycl_device.memcpyDeviceToHost(result_col_major.data(), gpu_data_result_col_major, patchTensorBuffSize); + + + VERIFY_IS_EQUAL(result_col_major.dimension(0), input_depth); // depth + VERIFY_IS_EQUAL(result_col_major.dimension(1), ksize); // kernel rows + VERIFY_IS_EQUAL(result_col_major.dimension(2), ksize); // kernel cols + VERIFY_IS_EQUAL(result_col_major.dimension(3), 2); // number of patches + VERIFY_IS_EQUAL(result_col_major.dimension(4), input_batches); // number of batches + + // RowMajor + + array patchRowMajorTensorRange={{input_batches, 2, ksize, ksize, input_depth }}; + Tensor result_row_major(patchRowMajorTensorRange); + patchTensorBuffSize =result_row_major.size()*sizeof(DataType); + DataType* gpu_data_result_row_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); + TensorMap> gpu_result_row_major(gpu_data_result_row_major, patchRowMajorTensorRange); + gpu_result_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(ksize, ksize, stride, stride, PADDING_SAME); + sycl_device.memcpyDeviceToHost(result_row_major.data(), gpu_data_result_row_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(result_col_major.dimension(0), result_row_major.dimension(4)); + VERIFY_IS_EQUAL(result_col_major.dimension(1), result_row_major.dimension(3)); + VERIFY_IS_EQUAL(result_col_major.dimension(2), result_row_major.dimension(2)); + VERIFY_IS_EQUAL(result_col_major.dimension(3), result_row_major.dimension(1)); + VERIFY_IS_EQUAL(result_col_major.dimension(4), result_row_major.dimension(0)); + + // Based on the calculation described in TensorTraits.h, padding happens to be 0. 
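+  // Hedged worked example added for illustration (not from the original sources): for
+  // PADDING_SAME the output grid is ceil(input / stride) positions per dimension and the
+  // total padding is (out - 1) * stride + ksize - input. With input_rows = 4, input_cols = 2,
+  // ksize = 2 and stride = 2 this gives a 2x1 patch grid and zero total padding in both
+  // dimensions, which is why row_padding and col_padding below can be set to 0.
+  {
+    IndexType same_rows = (input_rows + stride - 1) / stride;  // ceil(4 / 2) = 2
+    IndexType same_cols = (input_cols + stride - 1) / stride;  // ceil(2 / 2) = 1
+    VERIFY_IS_EQUAL(same_rows * same_cols, result_col_major.dimension(3));
+    VERIFY_IS_EQUAL((same_rows - 1) * stride + ksize - input_rows, static_cast<IndexType>(0));
+    VERIFY_IS_EQUAL((same_cols - 1) * stride + ksize - input_cols, static_cast<IndexType>(0));
+  }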
+ IndexType row_padding = 0; + IndexType col_padding = 0; + + for (IndexType i = 0; (i+stride+ksize-1) <= input_rows; i += stride) { // input rows + for (IndexType j = 0; (j+stride+ksize-1) <= input_cols; j += stride) { // input cols + IndexType patchId = i+input_rows*j; + for (IndexType r = 0; r < ksize; ++r) { // patch rows + for (IndexType c = 0; c < ksize; ++c) { // patch cols + for (IndexType d = 0; d < input_depth; ++d) { // depth + for (IndexType b = 0; b < input_batches; ++b) { // batch + DataType expected_col_major = 0.0f; + DataType expected_row_major = 0.0f; + IndexType row_offset = r*stride + i - row_padding; + IndexType col_offset = c*stride + j - col_padding; + if (row_offset >= 0 && col_offset >= 0 && row_offset < input_rows && col_offset < input_cols) { + expected_col_major = tensor_col_major(d, row_offset, col_offset, b); + expected_row_major = tensor_row_major(b, col_offset, row_offset, d); + } + // ColMajor + if (result_col_major(d, r, c, patchId, b) != expected_col_major) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(result_col_major(d, r, c, patchId, b), expected_col_major); + // RowMajor + if (result_row_major(b, patchId, c, r, d) != expected_row_major) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(result_row_major(b, patchId, c, r, d), expected_row_major); + // Check that ColMajor and RowMajor agree. + VERIFY_IS_EQUAL(expected_col_major, expected_row_major); + } + } + } + } + } + } +} + + +template +static void test_patch_no_extra_dim_sycl(const Eigen::SyclDevice& sycl_device){ + + IndexType sizeDim1 = 2; + IndexType sizeDim2 = 3; + IndexType sizeDim3 = 5; + + // ColMajor + array tensorColMajorRange = {{sizeDim1, sizeDim2, sizeDim3}}; + array tensorRowMajorRange = {{sizeDim3, sizeDim2, sizeDim1}}; + Tensor tensor_col_major(tensorColMajorRange); + tensor_col_major.setRandom(); + Tensor tensor_row_major(tensorRowMajorRange); + + DataType* gpu_data_col_major = static_cast(sycl_device.allocate(tensor_col_major.size()*sizeof(DataType))); + DataType* gpu_data_row_major = static_cast(sycl_device.allocate(tensor_row_major.size()*sizeof(DataType))); + TensorMap> gpu_col_major(gpu_data_col_major, tensorColMajorRange); + TensorMap> gpu_row_major(gpu_data_row_major, tensorRowMajorRange); + + sycl_device.memcpyHostToDevice(gpu_data_col_major, tensor_col_major.data(),(tensor_col_major.size())*sizeof(DataType)); + gpu_row_major.device(sycl_device)=gpu_col_major.swap_layout(); + sycl_device.memcpyDeviceToHost(tensor_row_major.data(), gpu_data_row_major, (tensor_row_major.size())*sizeof(DataType)); + + VERIFY_IS_EQUAL(tensor_col_major.dimension(0), tensor_row_major.dimension(2)); + VERIFY_IS_EQUAL(tensor_col_major.dimension(1), tensor_row_major.dimension(1)); + VERIFY_IS_EQUAL(tensor_col_major.dimension(2), tensor_row_major.dimension(0)); + + + // Single pixel patch: ColMajor + array patchColMajorTensorRange={{sizeDim1, 1, 1, sizeDim2*sizeDim3}}; + Tensor single_patch_col_major(patchColMajorTensorRange); + size_t patchTensorBuffSize =single_patch_col_major.size()*sizeof(DataType); + DataType* gpu_data_single_patch_col_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); + TensorMap> gpu_single_patch_col_major(gpu_data_single_patch_col_major, patchColMajorTensorRange); + 
gpu_single_patch_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(1, 1); + sycl_device.memcpyDeviceToHost(single_patch_col_major.data(), gpu_data_single_patch_col_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(single_patch_col_major.dimension(0), sizeDim1); + VERIFY_IS_EQUAL(single_patch_col_major.dimension(1), 1); + VERIFY_IS_EQUAL(single_patch_col_major.dimension(2), 1); + VERIFY_IS_EQUAL(single_patch_col_major.dimension(3), sizeDim2*sizeDim3); + + // Single pixel patch: RowMajor + array patchRowMajorTensorRange={{sizeDim2*sizeDim3, 1, 1, sizeDim1}}; + Tensor single_patch_row_major(patchRowMajorTensorRange); + patchTensorBuffSize =single_patch_row_major.size()*sizeof(DataType); + DataType* gpu_data_single_patch_row_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); + TensorMap> gpu_single_patch_row_major(gpu_data_single_patch_row_major, patchRowMajorTensorRange); + gpu_single_patch_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(1, 1); + sycl_device.memcpyDeviceToHost(single_patch_row_major.data(), gpu_data_single_patch_row_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(single_patch_row_major.dimension(0), sizeDim2*sizeDim3); + VERIFY_IS_EQUAL(single_patch_row_major.dimension(1), 1); + VERIFY_IS_EQUAL(single_patch_row_major.dimension(2), 1); + VERIFY_IS_EQUAL(single_patch_row_major.dimension(3), sizeDim1); + + for (IndexType i = 0; i < tensor_col_major.size(); ++i) { + // ColMajor + if (tensor_col_major.data()[i] != single_patch_col_major.data()[i]) { + std::cout << "Mismatch detected at index " << i << " : " << tensor_col_major.data()[i] << " vs " << single_patch_col_major.data()[i] << std::endl; + } + VERIFY_IS_EQUAL(single_patch_col_major.data()[i], tensor_col_major.data()[i]); + // RowMajor + if (tensor_row_major.data()[i] != single_patch_row_major.data()[i]) { + std::cout << "Mismatch detected at index " << i << " : " + << tensor_col_major.data()[i] << " vs " + << single_patch_row_major.data()[i] << std::endl; + } + VERIFY_IS_EQUAL(single_patch_row_major.data()[i], + tensor_row_major.data()[i]); + VERIFY_IS_EQUAL(tensor_col_major.data()[i], tensor_row_major.data()[i]); + VERIFY_IS_EQUAL(single_patch_col_major.data()[i], + single_patch_row_major.data()[i]); + } + + // Entire image patch: ColMajor + patchColMajorTensorRange={{sizeDim1, sizeDim2, sizeDim3, sizeDim2*sizeDim3}}; + Tensor entire_image_patch_col_major(patchColMajorTensorRange); + patchTensorBuffSize =entire_image_patch_col_major.size()*sizeof(DataType); + DataType* gpu_data_entire_image_patch_col_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); + TensorMap> gpu_entire_image_patch_col_major(gpu_data_entire_image_patch_col_major, patchColMajorTensorRange); + gpu_entire_image_patch_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(3, 5); + sycl_device.memcpyDeviceToHost(entire_image_patch_col_major.data(), gpu_data_entire_image_patch_col_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(0), 2); + VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(1), 3); + VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(2), 5); + VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(3), 3*5); + + // Entire image patch: RowMajor +patchRowMajorTensorRange={{sizeDim2*sizeDim3, sizeDim3, sizeDim2, sizeDim1}}; +Tensor entire_image_patch_row_major(patchRowMajorTensorRange); +patchTensorBuffSize =entire_image_patch_row_major.size()*sizeof(DataType); +DataType* gpu_data_entire_image_patch_row_major = 
static_cast(sycl_device.allocate(patchTensorBuffSize)); +TensorMap> gpu_entire_image_patch_row_major(gpu_data_entire_image_patch_row_major, patchRowMajorTensorRange); +gpu_entire_image_patch_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(3, 5); +sycl_device.memcpyDeviceToHost(entire_image_patch_row_major.data(), gpu_data_entire_image_patch_row_major, patchTensorBuffSize); + VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(0), 3*5); + VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(1), 5); + VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(2), 3); + VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(3), 2); + + for (IndexType i = 0; i < 3; ++i) { + for (IndexType j = 0; j < 5; ++j) { + IndexType patchId = i+3*j; + for (IndexType r = 0; r < 3; ++r) { + for (IndexType c = 0; c < 5; ++c) { + for (IndexType d = 0; d < 2; ++d) { + DataType expected_col_major = 0.0f; + DataType expected_row_major = 0.0f; + if (r-1+i >= 0 && c-2+j >= 0 && r-1+i < 3 && c-2+j < 5) { + expected_col_major = tensor_col_major(d, r-1+i, c-2+j); + expected_row_major = tensor_row_major(c-2+j, r-1+i, d); + } + // ColMajor + if (entire_image_patch_col_major(d, r, c, patchId) != expected_col_major) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << std::endl; + } + VERIFY_IS_EQUAL(entire_image_patch_col_major(d, r, c, patchId), expected_col_major); + // RowMajor + if (entire_image_patch_row_major(patchId, c, r, d) != + expected_row_major) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << std::endl; + } + VERIFY_IS_EQUAL(entire_image_patch_row_major(patchId, c, r, d), + expected_row_major); + // Check that ColMajor and RowMajor agree. 
+ VERIFY_IS_EQUAL(expected_col_major, expected_row_major); + } + } + } + } + } + + // 2D patch: ColMajor + patchColMajorTensorRange={{sizeDim1, 2, 2, sizeDim2*sizeDim3}}; + Tensor twod_patch_col_major(patchColMajorTensorRange); + patchTensorBuffSize =twod_patch_col_major.size()*sizeof(DataType); + DataType* gpu_data_twod_patch_col_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); + TensorMap> gpu_twod_patch_col_major(gpu_data_twod_patch_col_major, patchColMajorTensorRange); + gpu_twod_patch_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(2, 2); + sycl_device.memcpyDeviceToHost(twod_patch_col_major.data(), gpu_data_twod_patch_col_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(twod_patch_col_major.dimension(0), 2); + VERIFY_IS_EQUAL(twod_patch_col_major.dimension(1), 2); + VERIFY_IS_EQUAL(twod_patch_col_major.dimension(2), 2); + VERIFY_IS_EQUAL(twod_patch_col_major.dimension(3), 3*5); + + // 2D patch: RowMajor + patchRowMajorTensorRange={{sizeDim2*sizeDim3, 2, 2, sizeDim1}}; + Tensor twod_patch_row_major(patchRowMajorTensorRange); + patchTensorBuffSize =twod_patch_row_major.size()*sizeof(DataType); + DataType* gpu_data_twod_patch_row_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); + TensorMap> gpu_twod_patch_row_major(gpu_data_twod_patch_row_major, patchRowMajorTensorRange); + gpu_twod_patch_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(2, 2); + sycl_device.memcpyDeviceToHost(twod_patch_row_major.data(), gpu_data_twod_patch_row_major, patchTensorBuffSize); + VERIFY_IS_EQUAL(twod_patch_row_major.dimension(0), 3*5); + VERIFY_IS_EQUAL(twod_patch_row_major.dimension(1), 2); + VERIFY_IS_EQUAL(twod_patch_row_major.dimension(2), 2); + VERIFY_IS_EQUAL(twod_patch_row_major.dimension(3), 2); + + // Based on the calculation described in TensorTraits.h, padding happens to be 0. + IndexType row_padding = 0; + IndexType col_padding = 0; + IndexType stride = 1; + + for (IndexType i = 0; i < 3; ++i) { + for (IndexType j = 0; j < 5; ++j) { + IndexType patchId = i+3*j; + for (IndexType r = 0; r < 2; ++r) { + for (IndexType c = 0; c < 2; ++c) { + for (IndexType d = 0; d < 2; ++d) { + DataType expected_col_major = 0.0f; + DataType expected_row_major = 0.0f; + IndexType row_offset = r*stride + i - row_padding; + IndexType col_offset = c*stride + j - col_padding; + // ColMajor + if (row_offset >= 0 && col_offset >= 0 && row_offset < tensor_col_major.dimension(1) && col_offset < tensor_col_major.dimension(2)) { + expected_col_major = tensor_col_major(d, row_offset, col_offset); + } + if (twod_patch_col_major(d, r, c, patchId) != expected_col_major) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << std::endl; + } + VERIFY_IS_EQUAL(twod_patch_col_major(d, r, c, patchId), expected_col_major); + // RowMajor + if (row_offset >= 0 && col_offset >= 0 && row_offset < tensor_row_major.dimension(1) && col_offset < tensor_row_major.dimension(0)) { + expected_row_major = tensor_row_major(col_offset, row_offset, d); + } + if (twod_patch_row_major(patchId, c, r, d) != expected_row_major) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << std::endl; + } + VERIFY_IS_EQUAL(twod_patch_row_major(patchId, c, r, d), expected_row_major); + // Check that ColMajor and RowMajor agree. 
+ VERIFY_IS_EQUAL(expected_col_major, expected_row_major); + } + } + } + } + } + + sycl_device.deallocate(gpu_data_col_major); + sycl_device.deallocate(gpu_data_row_major); + sycl_device.deallocate(gpu_data_single_patch_col_major); + sycl_device.deallocate(gpu_data_single_patch_row_major); + sycl_device.deallocate(gpu_data_entire_image_patch_col_major); + sycl_device.deallocate(gpu_data_entire_image_patch_row_major); + sycl_device.deallocate(gpu_data_twod_patch_col_major); + sycl_device.deallocate(gpu_data_twod_patch_row_major); +} + +template +static void test_imagenet_patches_sycl(const Eigen::SyclDevice& sycl_device) +{ + // Test the code on typical configurations used by the 'imagenet' benchmarks at + // https://github.com/soumith/convnet-benchmarks + // ColMajor + IndexType sizeDim1 = 3; + IndexType sizeDim2 = 128; + IndexType sizeDim3 = 128; + IndexType sizeDim4 = 16; + array tensorColMajorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}}; + Tensor l_in_col_major(tensorColMajorRange); + l_in_col_major.setRandom(); + + DataType* gpu_data_l_in_col_major = static_cast(sycl_device.allocate(l_in_col_major.size()*sizeof(DataType))); + TensorMap> gpu_l_in_col_major(gpu_data_l_in_col_major, tensorColMajorRange); + + sycl_device.memcpyHostToDevice(gpu_data_l_in_col_major, l_in_col_major.data(),(l_in_col_major.size())*sizeof(DataType)); + + array patchTensorRange={{sizeDim1, 11, 11, sizeDim2*sizeDim3, sizeDim4}}; + Tensor l_out_col_major(patchTensorRange); + size_t patchTensorBuffSize =l_out_col_major.size()*sizeof(DataType); + DataType* gpu_data_l_out_col_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); + TensorMap> gpu_l_out_col_major(gpu_data_l_out_col_major, patchTensorRange); + gpu_l_out_col_major.device(sycl_device)=gpu_l_in_col_major.extract_image_patches(11, 11); + sycl_device.memcpyDeviceToHost(l_out_col_major.data(), gpu_data_l_out_col_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(l_out_col_major.dimension(0), sizeDim1); + VERIFY_IS_EQUAL(l_out_col_major.dimension(1), 11); + VERIFY_IS_EQUAL(l_out_col_major.dimension(2), 11); + VERIFY_IS_EQUAL(l_out_col_major.dimension(3), sizeDim2*sizeDim3); + VERIFY_IS_EQUAL(l_out_col_major.dimension(4), sizeDim4); + + // RowMajor + patchTensorRange={{sizeDim4, sizeDim2*sizeDim3, 11, 11, sizeDim1}}; + Tensor l_out_row_major(patchTensorRange); + patchTensorBuffSize =l_out_row_major.size()*sizeof(DataType); + DataType* gpu_data_l_out_row_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); + TensorMap> gpu_l_out_row_major(gpu_data_l_out_row_major, patchTensorRange); + gpu_l_out_row_major.device(sycl_device)=gpu_l_in_col_major.swap_layout().extract_image_patches(11, 11); + sycl_device.memcpyDeviceToHost(l_out_row_major.data(), gpu_data_l_out_row_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(l_out_row_major.dimension(0), sizeDim4); + VERIFY_IS_EQUAL(l_out_row_major.dimension(1), sizeDim2*sizeDim3); + VERIFY_IS_EQUAL(l_out_row_major.dimension(2), 11); + VERIFY_IS_EQUAL(l_out_row_major.dimension(3), 11); + VERIFY_IS_EQUAL(l_out_row_major.dimension(4), sizeDim1); + + for (IndexType b = 0; b < 16; ++b) { + for (IndexType i = 0; i < 128; ++i) { + for (IndexType j = 0; j < 128; ++j) { + IndexType patchId = i+128*j; + for (IndexType c = 0; c < 11; ++c) { + for (IndexType r = 0; r < 11; ++r) { + for (IndexType d = 0; d < 3; ++d) { + DataType expected = 0.0f; + if (r-5+i >= 0 && c-5+j >= 0 && r-5+i < 128 && c-5+j < 128) { + expected = l_in_col_major(d, r-5+i, c-5+j, b); + } + // ColMajor + if (l_out_col_major(d, r, c, patchId, b) 
!= expected) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(l_out_col_major(d, r, c, patchId, b), expected); + // RowMajor + if (l_out_row_major(b, patchId, c, r, d) != + expected) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j + << " r=" << r << " c=" << c << " d=" << d << " b=" << b + << std::endl; + } + VERIFY_IS_EQUAL(l_out_row_major(b, patchId, c, r, d), + expected); + } + } + } + } + } + } + + // ColMajor + sycl_device.deallocate(gpu_data_l_in_col_major); + sycl_device.deallocate(gpu_data_l_out_col_major); + sizeDim1 = 16; + sizeDim2 = 64; + sizeDim3 = 64; + sizeDim4 = 32; + tensorColMajorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}}; + l_in_col_major.resize(tensorColMajorRange); + l_in_col_major.setRandom(); + gpu_data_l_in_col_major = static_cast(sycl_device.allocate(l_in_col_major.size()*sizeof(DataType))); + TensorMap>gpu_l_in_col_major_resize1(gpu_data_l_in_col_major, tensorColMajorRange); + + patchTensorRange={{sizeDim1, 9, 9, sizeDim2*sizeDim3, sizeDim4}}; + l_out_col_major.resize(patchTensorRange); + patchTensorBuffSize =l_out_col_major.size()*sizeof(DataType); + gpu_data_l_out_col_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); + TensorMap>gpu_l_out_col_major_resize1(gpu_data_l_out_col_major, patchTensorRange); + sycl_device.memcpyHostToDevice(gpu_data_l_in_col_major, l_in_col_major.data(),(l_in_col_major.size())*sizeof(DataType)); + gpu_l_out_col_major_resize1.device(sycl_device)=gpu_l_in_col_major_resize1.extract_image_patches(9, 9); + sycl_device.memcpyDeviceToHost(l_out_col_major.data(), gpu_data_l_out_col_major, patchTensorBuffSize); + VERIFY_IS_EQUAL(l_out_col_major.dimension(0), 16); + VERIFY_IS_EQUAL(l_out_col_major.dimension(1), 9); + VERIFY_IS_EQUAL(l_out_col_major.dimension(2), 9); + VERIFY_IS_EQUAL(l_out_col_major.dimension(3), 64*64); + VERIFY_IS_EQUAL(l_out_col_major.dimension(4), 32); + +// RowMajor + sycl_device.deallocate(gpu_data_l_out_row_major); + patchTensorRange={{sizeDim4, sizeDim2*sizeDim3, 9, 9 ,sizeDim1}}; + l_out_row_major.resize(patchTensorRange); + patchTensorBuffSize =l_out_row_major.size()*sizeof(DataType); + gpu_data_l_out_row_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); + TensorMap>gpu_l_out_row_major_resize1(gpu_data_l_out_row_major, patchTensorRange); + gpu_l_out_row_major_resize1.device(sycl_device)=gpu_l_in_col_major_resize1.swap_layout().extract_image_patches(9, 9); + sycl_device.memcpyDeviceToHost(l_out_row_major.data(), gpu_data_l_out_row_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(l_out_row_major.dimension(0), 32); + VERIFY_IS_EQUAL(l_out_row_major.dimension(1), 64*64); + VERIFY_IS_EQUAL(l_out_row_major.dimension(2), 9); + VERIFY_IS_EQUAL(l_out_row_major.dimension(3), 9); + VERIFY_IS_EQUAL(l_out_row_major.dimension(4), 16); + + for (IndexType b = 0; b < 32; ++b) { + for (IndexType i = 0; i < 64; ++i) { + for (IndexType j = 0; j < 64; ++j) { + IndexType patchId = i+64*j; + for (IndexType c = 0; c < 9; ++c) { + for (IndexType r = 0; r < 9; ++r) { + for (IndexType d = 0; d < 16; ++d) { + DataType expected = 0.0f; + if (r-4+i >= 0 && c-4+j >= 0 && r-4+i < 64 && c-4+j < 64) { + expected = l_in_col_major(d, r-4+i, c-4+j, b); + } + // ColMajor + if (l_out_col_major(d, r, c, patchId, b) != expected) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + 
VERIFY_IS_EQUAL(l_out_col_major(d, r, c, patchId, b), expected); + // RowMajor + if (l_out_row_major(b, patchId, c, r, d) != expected) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(l_out_row_major(b, patchId, c, r, d), expected); + } + } + } + } + } + } + + // ColMajor + + sycl_device.deallocate(gpu_data_l_in_col_major); + sycl_device.deallocate(gpu_data_l_out_col_major); + sizeDim1 = 32; + sizeDim2 = 16; + sizeDim3 = 16; + sizeDim4 = 32; + tensorColMajorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}}; + l_in_col_major.resize(tensorColMajorRange); + l_in_col_major.setRandom(); + gpu_data_l_in_col_major = static_cast(sycl_device.allocate(l_in_col_major.size()*sizeof(DataType))); + TensorMap>gpu_l_in_col_major_resize2(gpu_data_l_in_col_major, tensorColMajorRange); + + patchTensorRange={{sizeDim1, 7, 7, sizeDim2*sizeDim3, sizeDim4}}; + l_out_col_major.resize(patchTensorRange); + patchTensorBuffSize =l_out_col_major.size()*sizeof(DataType); + gpu_data_l_out_col_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); + TensorMap>gpu_l_out_col_major_resize2(gpu_data_l_out_col_major, patchTensorRange); + sycl_device.memcpyHostToDevice(gpu_data_l_in_col_major, l_in_col_major.data(),(l_in_col_major.size())*sizeof(DataType)); + gpu_l_out_col_major_resize2.device(sycl_device)=gpu_l_in_col_major_resize2.extract_image_patches(7, 7); + sycl_device.memcpyDeviceToHost(l_out_col_major.data(), gpu_data_l_out_col_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(l_out_col_major.dimension(0), 32); + VERIFY_IS_EQUAL(l_out_col_major.dimension(1), 7); + VERIFY_IS_EQUAL(l_out_col_major.dimension(2), 7); + VERIFY_IS_EQUAL(l_out_col_major.dimension(3), 16*16); + VERIFY_IS_EQUAL(l_out_col_major.dimension(4), 32); + + // RowMajor + sycl_device.deallocate(gpu_data_l_out_row_major); + patchTensorRange={{sizeDim4, sizeDim2*sizeDim3, 7, 7 ,sizeDim1}}; + l_out_row_major.resize(patchTensorRange); + patchTensorBuffSize =l_out_row_major.size()*sizeof(DataType); + gpu_data_l_out_row_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); + TensorMap>gpu_l_out_row_major_resize2(gpu_data_l_out_row_major, patchTensorRange); + gpu_l_out_row_major_resize2.device(sycl_device)=gpu_l_in_col_major_resize2.swap_layout().extract_image_patches(7, 7); + sycl_device.memcpyDeviceToHost(l_out_row_major.data(), gpu_data_l_out_row_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(l_out_row_major.dimension(0), 32); + VERIFY_IS_EQUAL(l_out_row_major.dimension(1), 16*16); + VERIFY_IS_EQUAL(l_out_row_major.dimension(2), 7); + VERIFY_IS_EQUAL(l_out_row_major.dimension(3), 7); + VERIFY_IS_EQUAL(l_out_row_major.dimension(4), 32); + + for (IndexType b = 0; b < 32; ++b) { + for (IndexType i = 0; i < 16; ++i) { + for (IndexType j = 0; j < 16; ++j) { + IndexType patchId = i+16*j; + for (IndexType c = 0; c < 7; ++c) { + for (IndexType r = 0; r < 7; ++r) { + for (IndexType d = 0; d < 32; ++d) { + DataType expected = 0.0f; + if (r-3+i >= 0 && c-3+j >= 0 && r-3+i < 16 && c-3+j < 16) { + expected = l_in_col_major(d, r-3+i, c-3+j, b); + } + // ColMajor + if (l_out_col_major(d, r, c, patchId, b) != expected) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(l_out_col_major(d, r, c, patchId, b), expected); + // RowMajor + if (l_out_row_major(b, patchId, c, r, d) != expected) { + std::cout << "Mismatch detected at index i=" << 
i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(l_out_row_major(b, patchId, c, r, d), expected); + } + } + } + } + } + } + + // ColMajor + sycl_device.deallocate(gpu_data_l_in_col_major); + sycl_device.deallocate(gpu_data_l_out_col_major); + sizeDim1 = 64; + sizeDim2 = 13; + sizeDim3 = 13; + sizeDim4 = 32; + tensorColMajorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}}; + l_in_col_major.resize(tensorColMajorRange); + l_in_col_major.setRandom(); + gpu_data_l_in_col_major = static_cast(sycl_device.allocate(l_in_col_major.size()*sizeof(DataType))); + TensorMap>gpu_l_in_col_major_resize3(gpu_data_l_in_col_major, tensorColMajorRange); + + patchTensorRange={{sizeDim1, 3, 3, sizeDim2*sizeDim3, sizeDim4}}; + l_out_col_major.resize(patchTensorRange); + patchTensorBuffSize =l_out_col_major.size()*sizeof(DataType); + gpu_data_l_out_col_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); + TensorMap>gpu_l_out_col_major_resize3(gpu_data_l_out_col_major, patchTensorRange); + sycl_device.memcpyHostToDevice(gpu_data_l_in_col_major, l_in_col_major.data(),(l_in_col_major.size())*sizeof(DataType)); + gpu_l_out_col_major_resize3.device(sycl_device)=gpu_l_in_col_major_resize3.extract_image_patches(3, 3); + sycl_device.memcpyDeviceToHost(l_out_col_major.data(), gpu_data_l_out_col_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(l_out_col_major.dimension(0), 64); + VERIFY_IS_EQUAL(l_out_col_major.dimension(1), 3); + VERIFY_IS_EQUAL(l_out_col_major.dimension(2), 3); + VERIFY_IS_EQUAL(l_out_col_major.dimension(3), 13*13); + VERIFY_IS_EQUAL(l_out_col_major.dimension(4), 32); + + // RowMajor + sycl_device.deallocate(gpu_data_l_out_row_major); + patchTensorRange={{sizeDim4, sizeDim2*sizeDim3, 3, 3 ,sizeDim1}}; + l_out_row_major.resize(patchTensorRange); + patchTensorBuffSize =l_out_row_major.size()*sizeof(DataType); + gpu_data_l_out_row_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); + TensorMap>gpu_l_out_row_major_resize3(gpu_data_l_out_row_major, patchTensorRange); + gpu_l_out_row_major_resize3.device(sycl_device)=gpu_l_in_col_major_resize3.swap_layout().extract_image_patches(3, 3); + sycl_device.memcpyDeviceToHost(l_out_row_major.data(), gpu_data_l_out_row_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(l_out_row_major.dimension(0), 32); + VERIFY_IS_EQUAL(l_out_row_major.dimension(1), 13*13); + VERIFY_IS_EQUAL(l_out_row_major.dimension(2), 3); + VERIFY_IS_EQUAL(l_out_row_major.dimension(3), 3); + VERIFY_IS_EQUAL(l_out_row_major.dimension(4), 64); + + for (IndexType b = 0; b < 32; ++b) { + for (IndexType i = 0; i < 13; ++i) { + for (IndexType j = 0; j < 13; ++j) { + IndexType patchId = i+13*j; + for (IndexType c = 0; c < 3; ++c) { + for (IndexType r = 0; r < 3; ++r) { + for (IndexType d = 0; d < 64; ++d) { + DataType expected = 0.0f; + if (r-1+i >= 0 && c-1+j >= 0 && r-1+i < 13 && c-1+j < 13) { + expected = l_in_col_major(d, r-1+i, c-1+j, b); + } + // ColMajor + if (l_out_col_major(d, r, c, patchId, b) != expected) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(l_out_col_major(d, r, c, patchId, b), expected); + // RowMajor + if (l_out_row_major(b, patchId, c, r, d) != expected) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(l_out_row_major(b, patchId, c, r, d), expected); + } + } + } + } + } + } + 
sycl_device.deallocate(gpu_data_l_in_col_major); + sycl_device.deallocate(gpu_data_l_out_col_major); + sycl_device.deallocate(gpu_data_l_out_row_major); +} + + +template void sycl_tensor_image_patch_test_per_device(dev_Selector s){ +QueueInterface queueInterface(s); +auto sycl_device = Eigen::SyclDevice(&queueInterface); +test_simple_image_patch_sycl(sycl_device); +test_patch_padding_valid_sycl(sycl_device); +test_patch_padding_valid_same_value_sycl(sycl_device); +test_patch_padding_same_sycl(sycl_device); +test_patch_no_extra_dim_sycl(sycl_device); +test_imagenet_patches_sycl(sycl_device); +} +void test_cxx11_tensor_image_patchOP_sycl() +{ +for (const auto& device :Eigen::get_sycl_supported_devices()) { + CALL_SUBTEST(sycl_tensor_image_patch_test_per_device(device)); +} +} diff --git a/unsupported/test/cxx11_tensor_volume_patchOP_sycl.cpp b/unsupported/test/cxx11_tensor_volume_patchOP_sycl.cpp deleted file mode 100644 index ddc9e0d46..000000000 --- a/unsupported/test/cxx11_tensor_volume_patchOP_sycl.cpp +++ /dev/null @@ -1,222 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2016 -// Mehdi Goli Codeplay Software Ltd. -// Ralph Potter Codeplay Software Ltd. -// Luke Iwanski Codeplay Software Ltd. -// Contact: -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#define EIGEN_TEST_NO_LONGDOUBLE -#define EIGEN_TEST_NO_COMPLEX -#define EIGEN_TEST_FUNC cxx11_tensor_volume_patchOP_sycl -#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t -#define EIGEN_USE_SYCL - -#include "main.h" -#include - -using Eigen::Tensor; -static const int DataLayout = ColMajor; - -template -static void test_single_voxel_patch_sycl(const Eigen::SyclDevice& sycl_device) -{ - -IndexType sizeDim0 = 4; -IndexType sizeDim1 = 2; -IndexType sizeDim2 = 3; -IndexType sizeDim3 = 5; -IndexType sizeDim4 = 7; -array tensorColMajorRange = {{sizeDim0, sizeDim1, sizeDim2, sizeDim3, sizeDim4}}; -array tensorRowMajorRange = {{sizeDim4, sizeDim3, sizeDim2, sizeDim1, sizeDim0}}; -Tensor tensor_col_major(tensorColMajorRange); -Tensor tensor_row_major(tensorRowMajorRange); -tensor_col_major.setRandom(); - - - DataType* gpu_data_col_major = static_cast(sycl_device.allocate(tensor_col_major.size()*sizeof(DataType))); - DataType* gpu_data_row_major = static_cast(sycl_device.allocate(tensor_row_major.size()*sizeof(DataType))); - TensorMap> gpu_col_major(gpu_data_col_major, tensorColMajorRange); - TensorMap> gpu_row_major(gpu_data_row_major, tensorRowMajorRange); - - sycl_device.memcpyHostToDevice(gpu_data_col_major, tensor_col_major.data(),(tensor_col_major.size())*sizeof(DataType)); - gpu_row_major.device(sycl_device)=gpu_col_major.swap_layout(); - - - // single volume patch: ColMajor - array patchColMajorTensorRange={{sizeDim0,1, 1, 1, sizeDim1*sizeDim2*sizeDim3, sizeDim4}}; - Tensor single_voxel_patch_col_major(patchColMajorTensorRange); - size_t patchTensorBuffSize =single_voxel_patch_col_major.size()*sizeof(DataType); - DataType* gpu_data_single_voxel_patch_col_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); - TensorMap> gpu_single_voxel_patch_col_major(gpu_data_single_voxel_patch_col_major, patchColMajorTensorRange); - gpu_single_voxel_patch_col_major.device(sycl_device)=gpu_col_major.extract_volume_patches(1, 1, 1); - sycl_device.memcpyDeviceToHost(single_voxel_patch_col_major.data(), 
gpu_data_single_voxel_patch_col_major, patchTensorBuffSize); - - - VERIFY_IS_EQUAL(single_voxel_patch_col_major.dimension(0), 4); - VERIFY_IS_EQUAL(single_voxel_patch_col_major.dimension(1), 1); - VERIFY_IS_EQUAL(single_voxel_patch_col_major.dimension(2), 1); - VERIFY_IS_EQUAL(single_voxel_patch_col_major.dimension(3), 1); - VERIFY_IS_EQUAL(single_voxel_patch_col_major.dimension(4), 2 * 3 * 5); - VERIFY_IS_EQUAL(single_voxel_patch_col_major.dimension(5), 7); - - array patchRowMajorTensorRange={{sizeDim4, sizeDim1*sizeDim2*sizeDim3, 1, 1, 1, sizeDim0}}; - Tensor single_voxel_patch_row_major(patchRowMajorTensorRange); - patchTensorBuffSize =single_voxel_patch_row_major.size()*sizeof(DataType); - DataType* gpu_data_single_voxel_patch_row_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); - TensorMap> gpu_single_voxel_patch_row_major(gpu_data_single_voxel_patch_row_major, patchRowMajorTensorRange); - gpu_single_voxel_patch_row_major.device(sycl_device)=gpu_row_major.extract_volume_patches(1, 1, 1); - sycl_device.memcpyDeviceToHost(single_voxel_patch_row_major.data(), gpu_data_single_voxel_patch_row_major, patchTensorBuffSize); - - VERIFY_IS_EQUAL(single_voxel_patch_row_major.dimension(0), 7); - VERIFY_IS_EQUAL(single_voxel_patch_row_major.dimension(1), 2 * 3 * 5); - VERIFY_IS_EQUAL(single_voxel_patch_row_major.dimension(2), 1); - VERIFY_IS_EQUAL(single_voxel_patch_row_major.dimension(3), 1); - VERIFY_IS_EQUAL(single_voxel_patch_row_major.dimension(4), 1); - VERIFY_IS_EQUAL(single_voxel_patch_row_major.dimension(5), 4); - - sycl_device.memcpyDeviceToHost(tensor_row_major.data(), gpu_data_row_major, (tensor_col_major.size())*sizeof(DataType)); - for (IndexType i = 0; i < tensor_col_major.size(); ++i) { - VERIFY_IS_EQUAL(tensor_col_major.data()[i], single_voxel_patch_col_major.data()[i]); - VERIFY_IS_EQUAL(tensor_row_major.data()[i], single_voxel_patch_row_major.data()[i]); - VERIFY_IS_EQUAL(tensor_col_major.data()[i], tensor_row_major.data()[i]); - } - - - sycl_device.deallocate(gpu_data_col_major); - sycl_device.deallocate(gpu_data_row_major); - sycl_device.deallocate(gpu_data_single_voxel_patch_col_major); - sycl_device.deallocate(gpu_data_single_voxel_patch_row_major); -} - -template -static void test_entire_volume_patch_sycl(const Eigen::SyclDevice& sycl_device) -{ - const int depth = 4; - const int patch_z = 2; - const int patch_y = 3; - const int patch_x = 5; - const int batch = 7; - - array tensorColMajorRange = {{depth, patch_z, patch_y, patch_x, batch}}; - array tensorRowMajorRange = {{batch, patch_x, patch_y, patch_z, depth}}; - Tensor tensor_col_major(tensorColMajorRange); - Tensor tensor_row_major(tensorRowMajorRange); - tensor_col_major.setRandom(); - - - DataType* gpu_data_col_major = static_cast(sycl_device.allocate(tensor_col_major.size()*sizeof(DataType))); - DataType* gpu_data_row_major = static_cast(sycl_device.allocate(tensor_row_major.size()*sizeof(DataType))); - TensorMap> gpu_col_major(gpu_data_col_major, tensorColMajorRange); - TensorMap> gpu_row_major(gpu_data_row_major, tensorRowMajorRange); - - sycl_device.memcpyHostToDevice(gpu_data_col_major, tensor_col_major.data(),(tensor_col_major.size())*sizeof(DataType)); - gpu_row_major.device(sycl_device)=gpu_col_major.swap_layout(); - sycl_device.memcpyDeviceToHost(tensor_row_major.data(), gpu_data_row_major, (tensor_col_major.size())*sizeof(DataType)); - - - // single volume patch: ColMajor - array patchColMajorTensorRange={{depth,patch_z, patch_y, patch_x, patch_z*patch_y*patch_x, batch}}; - Tensor 
entire_volume_patch_col_major(patchColMajorTensorRange); - size_t patchTensorBuffSize =entire_volume_patch_col_major.size()*sizeof(DataType); - DataType* gpu_data_entire_volume_patch_col_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); - TensorMap> gpu_entire_volume_patch_col_major(gpu_data_entire_volume_patch_col_major, patchColMajorTensorRange); - gpu_entire_volume_patch_col_major.device(sycl_device)=gpu_col_major.extract_volume_patches(patch_z, patch_y, patch_x); - sycl_device.memcpyDeviceToHost(entire_volume_patch_col_major.data(), gpu_data_entire_volume_patch_col_major, patchTensorBuffSize); - - -// Tensor tensor(depth, patch_z, patch_y, patch_x, batch); -// tensor.setRandom(); -// Tensor tensor_row_major = tensor.swap_layout(); - - //Tensor entire_volume_patch; - //entire_volume_patch = tensor.extract_volume_patches(patch_z, patch_y, patch_x); - VERIFY_IS_EQUAL(entire_volume_patch_col_major.dimension(0), depth); - VERIFY_IS_EQUAL(entire_volume_patch_col_major.dimension(1), patch_z); - VERIFY_IS_EQUAL(entire_volume_patch_col_major.dimension(2), patch_y); - VERIFY_IS_EQUAL(entire_volume_patch_col_major.dimension(3), patch_x); - VERIFY_IS_EQUAL(entire_volume_patch_col_major.dimension(4), patch_z * patch_y * patch_x); - VERIFY_IS_EQUAL(entire_volume_patch_col_major.dimension(5), batch); - -// Tensor entire_volume_patch_row_major; - //entire_volume_patch_row_major = tensor_row_major.extract_volume_patches(patch_z, patch_y, patch_x); - - array patchRowMajorTensorRange={{batch,patch_z*patch_y*patch_x, patch_x, patch_y, patch_z, depth}}; - Tensor entire_volume_patch_row_major(patchRowMajorTensorRange); - patchTensorBuffSize =entire_volume_patch_row_major.size()*sizeof(DataType); - DataType* gpu_data_entire_volume_patch_row_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); - TensorMap> gpu_entire_volume_patch_row_major(gpu_data_entire_volume_patch_row_major, patchRowMajorTensorRange); - gpu_entire_volume_patch_row_major.device(sycl_device)=gpu_row_major.extract_volume_patches(patch_z, patch_y, patch_x); - sycl_device.memcpyDeviceToHost(entire_volume_patch_row_major.data(), gpu_data_entire_volume_patch_row_major, patchTensorBuffSize); - - - VERIFY_IS_EQUAL(entire_volume_patch_row_major.dimension(0), batch); - VERIFY_IS_EQUAL(entire_volume_patch_row_major.dimension(1), patch_z * patch_y * patch_x); - VERIFY_IS_EQUAL(entire_volume_patch_row_major.dimension(2), patch_x); - VERIFY_IS_EQUAL(entire_volume_patch_row_major.dimension(3), patch_y); - VERIFY_IS_EQUAL(entire_volume_patch_row_major.dimension(4), patch_z); - VERIFY_IS_EQUAL(entire_volume_patch_row_major.dimension(5), depth); - - const int dz = patch_z - 1; - const int dy = patch_y - 1; - const int dx = patch_x - 1; - - const int forward_pad_z = dz - dz / 2; - const int forward_pad_y = dy - dy / 2; - const int forward_pad_x = dx - dx / 2; - - for (int pz = 0; pz < patch_z; pz++) { - for (int py = 0; py < patch_y; py++) { - for (int px = 0; px < patch_x; px++) { - const int patchId = pz + patch_z * (py + px * patch_y); - for (int z = 0; z < patch_z; z++) { - for (int y = 0; y < patch_y; y++) { - for (int x = 0; x < patch_x; x++) { - for (int b = 0; b < batch; b++) { - for (int d = 0; d < depth; d++) { - float expected = 0.0f; - float expected_row_major = 0.0f; - const int eff_z = z - forward_pad_z + pz; - const int eff_y = y - forward_pad_y + py; - const int eff_x = x - forward_pad_x + px; - if (eff_z >= 0 && eff_y >= 0 && eff_x >= 0 && - eff_z < patch_z && eff_y < patch_y && eff_x < patch_x) { - expected = 
tensor_col_major(d, eff_z, eff_y, eff_x, b); - expected_row_major = tensor_row_major(b, eff_x, eff_y, eff_z, d); - } - VERIFY_IS_EQUAL(entire_volume_patch_col_major(d, z, y, x, patchId, b), expected); - VERIFY_IS_EQUAL(entire_volume_patch_row_major(b, patchId, x, y, z, d), expected_row_major); - } - } - } - } - } - } - } - } - sycl_device.deallocate(gpu_data_col_major); - sycl_device.deallocate(gpu_data_row_major); - sycl_device.deallocate(gpu_data_entire_volume_patch_col_major); - sycl_device.deallocate(gpu_data_entire_volume_patch_row_major); -} - - - -template void sycl_tensor_volume_patch_test_per_device(dev_Selector s){ -QueueInterface queueInterface(s); -auto sycl_device = Eigen::SyclDevice(&queueInterface); -std::cout << "Running on " << s.template get_info() << std::endl; -test_single_voxel_patch_sycl(sycl_device); -test_entire_volume_patch_sycl(sycl_device); -} -void test_cxx11_tensor_volume_patchOP_sycl() -{ -for (const auto& device :Eigen::get_sycl_supported_devices()) { - CALL_SUBTEST(sycl_tensor_volume_patch_test_per_device(device)); -} -} diff --git a/unsupported/test/cxx11_tensor_volume_patch_sycl.cpp b/unsupported/test/cxx11_tensor_volume_patch_sycl.cpp new file mode 100644 index 000000000..ddc9e0d46 --- /dev/null +++ b/unsupported/test/cxx11_tensor_volume_patch_sycl.cpp @@ -0,0 +1,222 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
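+
+// Added doc comment (an inference from the dimension checks in the tests below, not from
+// any other source): for a ColMajor input shaped {depth, z, y, x, batch}, a call such as
+//   tensor.extract_volume_patches(patch_z, patch_y, patch_x)
+// is expected to yield a 6-D tensor shaped
+//   {depth, patch_z, patch_y, patch_x, number_of_patches, batch},
+// with the dimension order reversed for RowMajor inputs.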
+ +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX +#define EIGEN_TEST_FUNC cxx11_tensor_volume_patchOP_sycl +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t +#define EIGEN_USE_SYCL + +#include "main.h" +#include + +using Eigen::Tensor; +static const int DataLayout = ColMajor; + +template +static void test_single_voxel_patch_sycl(const Eigen::SyclDevice& sycl_device) +{ + +IndexType sizeDim0 = 4; +IndexType sizeDim1 = 2; +IndexType sizeDim2 = 3; +IndexType sizeDim3 = 5; +IndexType sizeDim4 = 7; +array tensorColMajorRange = {{sizeDim0, sizeDim1, sizeDim2, sizeDim3, sizeDim4}}; +array tensorRowMajorRange = {{sizeDim4, sizeDim3, sizeDim2, sizeDim1, sizeDim0}}; +Tensor tensor_col_major(tensorColMajorRange); +Tensor tensor_row_major(tensorRowMajorRange); +tensor_col_major.setRandom(); + + + DataType* gpu_data_col_major = static_cast(sycl_device.allocate(tensor_col_major.size()*sizeof(DataType))); + DataType* gpu_data_row_major = static_cast(sycl_device.allocate(tensor_row_major.size()*sizeof(DataType))); + TensorMap> gpu_col_major(gpu_data_col_major, tensorColMajorRange); + TensorMap> gpu_row_major(gpu_data_row_major, tensorRowMajorRange); + + sycl_device.memcpyHostToDevice(gpu_data_col_major, tensor_col_major.data(),(tensor_col_major.size())*sizeof(DataType)); + gpu_row_major.device(sycl_device)=gpu_col_major.swap_layout(); + + + // single volume patch: ColMajor + array patchColMajorTensorRange={{sizeDim0,1, 1, 1, sizeDim1*sizeDim2*sizeDim3, sizeDim4}}; + Tensor single_voxel_patch_col_major(patchColMajorTensorRange); + size_t patchTensorBuffSize =single_voxel_patch_col_major.size()*sizeof(DataType); + DataType* gpu_data_single_voxel_patch_col_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); + TensorMap> gpu_single_voxel_patch_col_major(gpu_data_single_voxel_patch_col_major, patchColMajorTensorRange); + gpu_single_voxel_patch_col_major.device(sycl_device)=gpu_col_major.extract_volume_patches(1, 1, 1); + sycl_device.memcpyDeviceToHost(single_voxel_patch_col_major.data(), gpu_data_single_voxel_patch_col_major, patchTensorBuffSize); + + + VERIFY_IS_EQUAL(single_voxel_patch_col_major.dimension(0), 4); + VERIFY_IS_EQUAL(single_voxel_patch_col_major.dimension(1), 1); + VERIFY_IS_EQUAL(single_voxel_patch_col_major.dimension(2), 1); + VERIFY_IS_EQUAL(single_voxel_patch_col_major.dimension(3), 1); + VERIFY_IS_EQUAL(single_voxel_patch_col_major.dimension(4), 2 * 3 * 5); + VERIFY_IS_EQUAL(single_voxel_patch_col_major.dimension(5), 7); + + array patchRowMajorTensorRange={{sizeDim4, sizeDim1*sizeDim2*sizeDim3, 1, 1, 1, sizeDim0}}; + Tensor single_voxel_patch_row_major(patchRowMajorTensorRange); + patchTensorBuffSize =single_voxel_patch_row_major.size()*sizeof(DataType); + DataType* gpu_data_single_voxel_patch_row_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); + TensorMap> gpu_single_voxel_patch_row_major(gpu_data_single_voxel_patch_row_major, patchRowMajorTensorRange); + gpu_single_voxel_patch_row_major.device(sycl_device)=gpu_row_major.extract_volume_patches(1, 1, 1); + sycl_device.memcpyDeviceToHost(single_voxel_patch_row_major.data(), gpu_data_single_voxel_patch_row_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(single_voxel_patch_row_major.dimension(0), 7); + VERIFY_IS_EQUAL(single_voxel_patch_row_major.dimension(1), 2 * 3 * 5); + VERIFY_IS_EQUAL(single_voxel_patch_row_major.dimension(2), 1); + VERIFY_IS_EQUAL(single_voxel_patch_row_major.dimension(3), 1); + VERIFY_IS_EQUAL(single_voxel_patch_row_major.dimension(4), 1); + 
VERIFY_IS_EQUAL(single_voxel_patch_row_major.dimension(5), 4); + + sycl_device.memcpyDeviceToHost(tensor_row_major.data(), gpu_data_row_major, (tensor_col_major.size())*sizeof(DataType)); + for (IndexType i = 0; i < tensor_col_major.size(); ++i) { + VERIFY_IS_EQUAL(tensor_col_major.data()[i], single_voxel_patch_col_major.data()[i]); + VERIFY_IS_EQUAL(tensor_row_major.data()[i], single_voxel_patch_row_major.data()[i]); + VERIFY_IS_EQUAL(tensor_col_major.data()[i], tensor_row_major.data()[i]); + } + + + sycl_device.deallocate(gpu_data_col_major); + sycl_device.deallocate(gpu_data_row_major); + sycl_device.deallocate(gpu_data_single_voxel_patch_col_major); + sycl_device.deallocate(gpu_data_single_voxel_patch_row_major); +} + +template +static void test_entire_volume_patch_sycl(const Eigen::SyclDevice& sycl_device) +{ + const int depth = 4; + const int patch_z = 2; + const int patch_y = 3; + const int patch_x = 5; + const int batch = 7; + + array tensorColMajorRange = {{depth, patch_z, patch_y, patch_x, batch}}; + array tensorRowMajorRange = {{batch, patch_x, patch_y, patch_z, depth}}; + Tensor tensor_col_major(tensorColMajorRange); + Tensor tensor_row_major(tensorRowMajorRange); + tensor_col_major.setRandom(); + + + DataType* gpu_data_col_major = static_cast(sycl_device.allocate(tensor_col_major.size()*sizeof(DataType))); + DataType* gpu_data_row_major = static_cast(sycl_device.allocate(tensor_row_major.size()*sizeof(DataType))); + TensorMap> gpu_col_major(gpu_data_col_major, tensorColMajorRange); + TensorMap> gpu_row_major(gpu_data_row_major, tensorRowMajorRange); + + sycl_device.memcpyHostToDevice(gpu_data_col_major, tensor_col_major.data(),(tensor_col_major.size())*sizeof(DataType)); + gpu_row_major.device(sycl_device)=gpu_col_major.swap_layout(); + sycl_device.memcpyDeviceToHost(tensor_row_major.data(), gpu_data_row_major, (tensor_col_major.size())*sizeof(DataType)); + + + // single volume patch: ColMajor + array patchColMajorTensorRange={{depth,patch_z, patch_y, patch_x, patch_z*patch_y*patch_x, batch}}; + Tensor entire_volume_patch_col_major(patchColMajorTensorRange); + size_t patchTensorBuffSize =entire_volume_patch_col_major.size()*sizeof(DataType); + DataType* gpu_data_entire_volume_patch_col_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); + TensorMap> gpu_entire_volume_patch_col_major(gpu_data_entire_volume_patch_col_major, patchColMajorTensorRange); + gpu_entire_volume_patch_col_major.device(sycl_device)=gpu_col_major.extract_volume_patches(patch_z, patch_y, patch_x); + sycl_device.memcpyDeviceToHost(entire_volume_patch_col_major.data(), gpu_data_entire_volume_patch_col_major, patchTensorBuffSize); + + +// Tensor tensor(depth, patch_z, patch_y, patch_x, batch); +// tensor.setRandom(); +// Tensor tensor_row_major = tensor.swap_layout(); + + //Tensor entire_volume_patch; + //entire_volume_patch = tensor.extract_volume_patches(patch_z, patch_y, patch_x); + VERIFY_IS_EQUAL(entire_volume_patch_col_major.dimension(0), depth); + VERIFY_IS_EQUAL(entire_volume_patch_col_major.dimension(1), patch_z); + VERIFY_IS_EQUAL(entire_volume_patch_col_major.dimension(2), patch_y); + VERIFY_IS_EQUAL(entire_volume_patch_col_major.dimension(3), patch_x); + VERIFY_IS_EQUAL(entire_volume_patch_col_major.dimension(4), patch_z * patch_y * patch_x); + VERIFY_IS_EQUAL(entire_volume_patch_col_major.dimension(5), batch); + +// Tensor entire_volume_patch_row_major; + //entire_volume_patch_row_major = tensor_row_major.extract_volume_patches(patch_z, patch_y, patch_x); + + array 
patchRowMajorTensorRange={{batch,patch_z*patch_y*patch_x, patch_x, patch_y, patch_z, depth}}; + Tensor entire_volume_patch_row_major(patchRowMajorTensorRange); + patchTensorBuffSize =entire_volume_patch_row_major.size()*sizeof(DataType); + DataType* gpu_data_entire_volume_patch_row_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); + TensorMap> gpu_entire_volume_patch_row_major(gpu_data_entire_volume_patch_row_major, patchRowMajorTensorRange); + gpu_entire_volume_patch_row_major.device(sycl_device)=gpu_row_major.extract_volume_patches(patch_z, patch_y, patch_x); + sycl_device.memcpyDeviceToHost(entire_volume_patch_row_major.data(), gpu_data_entire_volume_patch_row_major, patchTensorBuffSize); + + + VERIFY_IS_EQUAL(entire_volume_patch_row_major.dimension(0), batch); + VERIFY_IS_EQUAL(entire_volume_patch_row_major.dimension(1), patch_z * patch_y * patch_x); + VERIFY_IS_EQUAL(entire_volume_patch_row_major.dimension(2), patch_x); + VERIFY_IS_EQUAL(entire_volume_patch_row_major.dimension(3), patch_y); + VERIFY_IS_EQUAL(entire_volume_patch_row_major.dimension(4), patch_z); + VERIFY_IS_EQUAL(entire_volume_patch_row_major.dimension(5), depth); + + const int dz = patch_z - 1; + const int dy = patch_y - 1; + const int dx = patch_x - 1; + + const int forward_pad_z = dz - dz / 2; + const int forward_pad_y = dy - dy / 2; + const int forward_pad_x = dx - dx / 2; + + for (int pz = 0; pz < patch_z; pz++) { + for (int py = 0; py < patch_y; py++) { + for (int px = 0; px < patch_x; px++) { + const int patchId = pz + patch_z * (py + px * patch_y); + for (int z = 0; z < patch_z; z++) { + for (int y = 0; y < patch_y; y++) { + for (int x = 0; x < patch_x; x++) { + for (int b = 0; b < batch; b++) { + for (int d = 0; d < depth; d++) { + float expected = 0.0f; + float expected_row_major = 0.0f; + const int eff_z = z - forward_pad_z + pz; + const int eff_y = y - forward_pad_y + py; + const int eff_x = x - forward_pad_x + px; + if (eff_z >= 0 && eff_y >= 0 && eff_x >= 0 && + eff_z < patch_z && eff_y < patch_y && eff_x < patch_x) { + expected = tensor_col_major(d, eff_z, eff_y, eff_x, b); + expected_row_major = tensor_row_major(b, eff_x, eff_y, eff_z, d); + } + VERIFY_IS_EQUAL(entire_volume_patch_col_major(d, z, y, x, patchId, b), expected); + VERIFY_IS_EQUAL(entire_volume_patch_row_major(b, patchId, x, y, z, d), expected_row_major); + } + } + } + } + } + } + } + } + sycl_device.deallocate(gpu_data_col_major); + sycl_device.deallocate(gpu_data_row_major); + sycl_device.deallocate(gpu_data_entire_volume_patch_col_major); + sycl_device.deallocate(gpu_data_entire_volume_patch_row_major); +} + + + +template void sycl_tensor_volume_patch_test_per_device(dev_Selector s){ +QueueInterface queueInterface(s); +auto sycl_device = Eigen::SyclDevice(&queueInterface); +std::cout << "Running on " << s.template get_info() << std::endl; +test_single_voxel_patch_sycl(sycl_device); +test_entire_volume_patch_sycl(sycl_device); +} +void test_cxx11_tensor_volume_patchOP_sycl() +{ +for (const auto& device :Eigen::get_sycl_supported_devices()) { + CALL_SUBTEST(sycl_tensor_volume_patch_test_per_device(device)); +} +} -- cgit v1.2.3