diff options
author | Mehdi Goli <mehdi.goli@codeplay.com> | 2017-02-01 15:29:53 +0000 |
---|---|---|
committer | Mehdi Goli <mehdi.goli@codeplay.com> | 2017-02-01 15:29:53 +0000 |
commit | bab29936a1cf0a68ffe4ccb1fd9b4807a3ec87ae (patch) | |
tree | c750b36227a31ddb2a1e0d5fd11f0036fda775db | |
parent | 48a20b7d956433713a39e04d39cba443b7a763de (diff) |
Reducing warnings in Sycl backend.
19 files changed, 670 insertions, 676 deletions
diff --git a/cmake/FindComputeCpp.cmake b/cmake/FindComputeCpp.cmake index 07ebed61b..27e5c9b1f 100644 --- a/cmake/FindComputeCpp.cmake +++ b/cmake/FindComputeCpp.cmake @@ -138,7 +138,7 @@ else() message(STATUS "compute++ flags - ${COMPUTECPP_DEVICE_COMPILER_FLAGS}") endif() -set(COMPUTECPP_DEVICE_COMPILER_FLAGS ${COMPUTECPP_DEVICE_COMPILER_FLAGS} -sycl-compress-name -no-serial-memop -DEIGEN_NO_ASSERTION_CHECKING=1) +set(COMPUTECPP_DEVICE_COMPILER_FLAGS ${COMPUTECPP_DEVICE_COMPILER_FLAGS} -sycl-compress-name -Wall -no-serial-memop -DEIGEN_NO_ASSERTION_CHECKING=1) # Check if the platform is supported execute_process(COMMAND ${COMPUTECPP_INFO_TOOL} "--dump-is-supported" diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h index dc16f89e0..e87de0c57 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h @@ -22,7 +22,7 @@ #define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H namespace Eigen { -template <typename LhsScalar, typename RhsScalar,bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered> struct LaunchSyclKernels; +template <typename Index, typename LhsScalar, typename RhsScalar,bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered> struct LaunchSyclKernels; template<typename Indices, typename LeftArgType, typename RightArgType> struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, const Eigen::SyclDevice> : public TensorContractionEvaluatorBase<TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, const Eigen::SyclDevice> > { @@ -146,7 +146,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar) this->m_device.memset(buffer, 0, m * n * sizeof(Scalar)); - LaunchSyclKernels<LhsScalar, RhsScalar,lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered>::Run(*this, buffer, m, n, k, + LaunchSyclKernels<Index, LhsScalar, RhsScalar,lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered>::Run(*this, buffer, m, n, k, this->m_k_strides, this->m_left_contracting_strides, this->m_right_contracting_strides, this->m_i_strides, this->m_j_strides, this->m_left_nocontract_strides, this->m_right_nocontract_strides); } @@ -162,8 +162,8 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT template <typename HostExpr, typename OutScalar, typename LhsScalar, typename RhsScalar, typename LHSFunctorExpr, typename RHSFunctorExpr, typename LhsLocalAcc, typename RhsLocalAcc, typename OutAccessor, typename Index, typename ContractT, typename LeftNocontractT, typename RightNocontractT, bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, -int TileSizeDimM, int TileSizeDimN,int TileSizeDimK, int WorkLoadPerThreadM,int WorkLoadPerThreadN, -int LocalThreadSizeM, int LocalThreadSizeN, int LoadPerThreadLhs, int LoadPerThreadRhs, typename LHSTupleType, typename RHSTupleType, typename Device> struct KernelConstructor{ +typename HostExpr::Index TileSizeDimM, typename HostExpr::Index TileSizeDimN,typename HostExpr::Index TileSizeDimK, typename HostExpr::Index WorkLoadPerThreadM,typename HostExpr::Index WorkLoadPerThreadN, +typename HostExpr::Index LocalThreadSizeM, typename HostExpr::Index LocalThreadSizeN, typename HostExpr::Index LoadPerThreadLhs, typename HostExpr::Index LoadPerThreadRhs, typename LHSTupleType, typename RHSTupleType, typename Device> struct KernelConstructor{ typedef typename Eigen::internal::traits<HostExpr>::_LhsNested LHSHostExpr; typedef typename Eigen::internal::traits<HostExpr>::_RhsNested RHSHostExpr; typedef typename Eigen::TensorSycl::internal::createPlaceHolderExpression<LHSHostExpr>::Type LHSPlaceHolderExpr; @@ -224,84 +224,83 @@ int LocalThreadSizeM, int LocalThreadSizeN, int LoadPerThreadLhs, int LoadPerThr auto out_ptr = ConvertToActualTypeSycl(OutScalar, out_res); // Matmul Kernel // Thread identifiers - const int mLocalThreadId = itemID.get_local(0); // Local ID row - const int nLocalThreadId = itemID.get_local(1); // Local ID col - const int mGroupId = itemID.get_group(0); // Work-group ID row - const int nGroupId = itemID.get_group(1); // Work-group ID localCol - const int linearLocalThreadId = nLocalThreadId*LocalThreadSizeM + mLocalThreadId; // linear local thread ID + const Index mLocalThreadId = itemID.get_local(0); // Local ID row + const Index nLocalThreadId = itemID.get_local(1); // Local ID col + const Index mGroupId = itemID.get_group(0); // Work-group ID row + const Index nGroupId = itemID.get_group(1); // Work-group ID localCol + const Index linearLocalThreadId = nLocalThreadId*LocalThreadSizeM + mLocalThreadId; // linear local thread ID // Allocate register space float privateLhs; float privateRhs[WorkLoadPerThreadN]; float privateRes[WorkLoadPerThreadM][WorkLoadPerThreadN]; // Initialise the privateResumulation registers - for (int wLPTM=0; wLPTM<WorkLoadPerThreadM; wLPTM++) { - for (int wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) { + for (Index wLPTM=0; wLPTM<WorkLoadPerThreadM; wLPTM++) { + for (Index wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) { privateRes[wLPTM][wLPTN] = 0.0f; } } // Tile Lhs - for (int lPTL=0; lPTL<LoadPerThreadLhs; lPTL++) { - int - localLhsLinearId = lPTL*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId; - int localLhsRow = localLhsLinearId% TileSizeDimM; - int localLhsCol = localLhsLinearId/TileSizeDimM; + for (Index lPTL=0; lPTL<LoadPerThreadLhs; lPTL++) { + Index localLhsLinearId = lPTL*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId; + Index localLhsRow = localLhsLinearId% TileSizeDimM; + Index localLhsCol = localLhsLinearId/TileSizeDimM; // Load the value (wide vector load) - int GlobalLhsColId = TileSizeDimK*0 + localLhsCol; + Index GlobalLhsColId = TileSizeDimK*0 + localLhsCol; localLhs[0 + ((localLhsCol*TileSizeDimM + localLhsRow)*2)] =((GlobalLhsColId < K)&& (mGroupId*(TileSizeDimM)+ localLhsRow <M))? lhs(mGroupId*(TileSizeDimM) + localLhsRow, GlobalLhsColId):static_cast<OutScalar>(0); } // Tile Rhs - for (int lPTR=0; lPTR<LoadPerThreadRhs; lPTR++) { - int localRhsLinearId = lPTR*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId; - int localRhsRow = localRhsLinearId% TileSizeDimN; - int localRhsCol = localRhsLinearId/TileSizeDimN; + for (Index lPTR=0; lPTR<LoadPerThreadRhs; lPTR++) { + Index localRhsLinearId = lPTR*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId; + Index localRhsRow = localRhsLinearId% TileSizeDimN; + Index localRhsCol = localRhsLinearId/TileSizeDimN; // Load the value (wide vector load) - int GlobalRhsRowId = TileSizeDimK*0 + localRhsCol; + Index GlobalRhsRowId = TileSizeDimK*0 + localRhsCol; localRhs[0 + ((localRhsCol*TileSizeDimN + localRhsRow) *2)] = ((GlobalRhsRowId < K)&& ((nGroupId*(TileSizeDimN) + localRhsRow)< N))? rhs(GlobalRhsRowId, nGroupId*(TileSizeDimN) + localRhsRow): static_cast<OutScalar>(0); } // Loop over all tiles - const int numTiles = roundUpK/TileSizeDimK; - int firstHalf=0; + const Index numTiles = roundUpK/TileSizeDimK; + Index firstHalf=0; do { // Synchronise itemID.barrier(cl::sycl::access::fence_space::local_space); // Load the next tile of Lhs and Rhs into local memory - int nextHalf = firstHalf + 1; + Index nextHalf = firstHalf + 1; if (nextHalf < numTiles) { // Tile A - for (int lPTL=0; lPTL<LoadPerThreadLhs; lPTL++) { - int localLhsLinearId = lPTL*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId; - int localLhsRow = localLhsLinearId% TileSizeDimM; - int localLhsCol = localLhsLinearId/TileSizeDimM; + for (Index lPTL=0; lPTL<LoadPerThreadLhs; lPTL++) { + Index localLhsLinearId = lPTL*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId; + Index localLhsRow = localLhsLinearId% TileSizeDimM; + Index localLhsCol = localLhsLinearId/TileSizeDimM; // global K id - int GlobalLhsColId = TileSizeDimK*nextHalf + localLhsCol; + Index GlobalLhsColId = TileSizeDimK*nextHalf + localLhsCol; // Store the loaded value into local memory localLhs[(nextHalf%2) + ((localLhsCol*TileSizeDimM + localLhsRow) *2)] = ((GlobalLhsColId < K)&& (mGroupId*(TileSizeDimM)+ localLhsRow <M))? lhs(mGroupId*(TileSizeDimM) + localLhsRow, GlobalLhsColId): static_cast<OutScalar>(0); } // Tile B - for (int lPTR=0; lPTR<LoadPerThreadRhs; lPTR++) { - int localRhsLinearId = lPTR*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId; - int localRhsRow = localRhsLinearId% TileSizeDimN; - int localRhsCol = localRhsLinearId/TileSizeDimN; + for (Index lPTR=0; lPTR<LoadPerThreadRhs; lPTR++) { + Index localRhsLinearId = lPTR*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId; + Index localRhsRow = localRhsLinearId% TileSizeDimN; + Index localRhsCol = localRhsLinearId/TileSizeDimN; // Load the value (wide vector load) - int GlobalRhsRowId = TileSizeDimK*nextHalf + localRhsCol; + Index GlobalRhsRowId = TileSizeDimK*nextHalf + localRhsCol; // Store the loaded vector into local memory localRhs[(nextHalf%2) +((localRhsCol*TileSizeDimN + localRhsRow)*2)] = ((GlobalRhsRowId < K)&& ((nGroupId*(TileSizeDimN) + localRhsRow)< N))? rhs(GlobalRhsRowId, nGroupId*(TileSizeDimN) + localRhsRow):static_cast<OutScalar>(0); } } // Loop over the values of a single tile - for (int k=0; k<TileSizeDimK; k++) { + for (Index k=0; k<TileSizeDimK; k++) { // Cache the values of localRhs in registers - for (int wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) { - int localRhsCol = nLocalThreadId + wLPTN*LocalThreadSizeN; + for (Index wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) { + Index localRhsCol = nLocalThreadId + wLPTN*LocalThreadSizeN; privateRhs[wLPTN] = localRhs[(firstHalf%2) +((k*TileSizeDimN + localRhsCol)*2)]; } // Perform the computation - for (int wLPTM=0; wLPTM<WorkLoadPerThreadM; wLPTM++) { - int localLhsRow = mLocalThreadId + wLPTM*LocalThreadSizeM; + for (Index wLPTM=0; wLPTM<WorkLoadPerThreadM; wLPTM++) { + Index localLhsRow = mLocalThreadId + wLPTM*LocalThreadSizeM; privateLhs = localLhs[(firstHalf%2)+ ((k*TileSizeDimM + localLhsRow)*2)]; - for (int wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) { + for (Index wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) { privateRes[wLPTM][wLPTN] += privateLhs * privateRhs[wLPTN]; } } @@ -311,11 +310,11 @@ int LocalThreadSizeM, int LocalThreadSizeN, int LoadPerThreadLhs, int LoadPerThr } while (firstHalf<numTiles); // Store the final results in C - for (int wLPTM=0; wLPTM<WorkLoadPerThreadM; wLPTM++) { - int globalRow = mGroupId*TileSizeDimM + mLocalThreadId + wLPTM*LocalThreadSizeM; + for (Index wLPTM=0; wLPTM<WorkLoadPerThreadM; wLPTM++) { + Index globalRow = mGroupId*TileSizeDimM + mLocalThreadId + wLPTM*LocalThreadSizeM; if (globalRow< M){ - for (int wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) { - int globalCol = nGroupId*TileSizeDimN + nLocalThreadId + wLPTN*LocalThreadSizeN; + for (Index wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) { + Index globalCol = nGroupId*TileSizeDimN + nLocalThreadId + wLPTN*LocalThreadSizeN; if(globalCol<N) out_ptr[globalCol*M + globalRow] = privateRes[wLPTM][wLPTN]; } @@ -325,24 +324,24 @@ int LocalThreadSizeM, int LocalThreadSizeN, int LoadPerThreadLhs, int LoadPerThr } }; -template <typename LhsScalar, typename RhsScalar, bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered> struct LaunchSyclKernels { - -static const int TileSizeDimM = 32; // Tile size for dimension M -static const int TileSizeDimN = 32; // Tile size for dimension N -static const int TileSizeDimK = 16; // Tile size for dimension K -static const int WorkLoadPerThreadM = 4; // Work load per thread in dimension M -static const int WorkLoadPerThreadN = 4; // work load per thread in dimension N -static const int LocalThreadSizeM = (TileSizeDimM/WorkLoadPerThreadM); // Local thread size for the first dimension (M here) -static const int LocalThreadSizeN = (TileSizeDimN/WorkLoadPerThreadN); // Local thread size for the second dimension (N here) -static const int LoadPerThreadLhs = ((TileSizeDimK*WorkLoadPerThreadM*WorkLoadPerThreadN)/(TileSizeDimN)); // workload per thread for Lhs expression -static const int LoadPerThreadRhs = ((TileSizeDimK*WorkLoadPerThreadM*WorkLoadPerThreadN)/(TileSizeDimM)); // workload per thread for Rhs expression +template <typename Index, typename LhsScalar, typename RhsScalar, bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered> struct LaunchSyclKernels { + +static const Index TileSizeDimM = 32ul; // Tile size for dimension M +static const Index TileSizeDimN = 32ul; // Tile size for dimension N +static const Index TileSizeDimK = 16ul; // Tile size for dimension K +static const Index WorkLoadPerThreadM = 4ul; // Work load per thread in dimension M +static const Index WorkLoadPerThreadN = 4ul; // work load per thread in dimension N +static const Index LocalThreadSizeM = (TileSizeDimM/WorkLoadPerThreadM); // Local thread size for the first dimension (M here) +static const Index LocalThreadSizeN = (TileSizeDimN/WorkLoadPerThreadN); // Local thread size for the second dimension (N here) +static const Index LoadPerThreadLhs = ((TileSizeDimK*WorkLoadPerThreadM*WorkLoadPerThreadN)/(TileSizeDimN)); // workload per thread for Lhs expression +static const Index LoadPerThreadRhs = ((TileSizeDimK*WorkLoadPerThreadM*WorkLoadPerThreadN)/(TileSizeDimM)); // workload per thread for Rhs expression // RoundUp function to make sure that the global threadId is divisable by local threadId -static int RoundUp(int x, int y) { +static Index RoundUp(Index x, Index y) { return ((((x) + (y) - 1) / (y))*(y)); } -template< typename Self, typename OutScalar, typename Index, typename ContractT, typename LeftNocontractT, typename RightNocontractT> +template< typename Self, typename OutScalar, typename ContractT, typename LeftNocontractT, typename RightNocontractT> static void Run(const Self& self, OutScalar* buffer, Index M, Index N, Index K, ContractT m_k_strides, ContractT m_left_contracting_strides, ContractT m_right_contracting_strides, LeftNocontractT m_i_strides, RightNocontractT m_j_strides, LeftNocontractT m_left_nocontract_strides, RightNocontractT m_right_nocontract_strides){ diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h index e2569e1bf..c3e095b8a 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h @@ -352,7 +352,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr auto global_range=cl::sycl::range<2>(GRange_x, GRange_y); // global range auto local_range=cl::sycl::range<2>(tileSize_x, tileSize_y); // local range InputLocalAcc local_acc(cl::sycl::range<1>(shared_mem), cgh); - const array<Index, 1> indices{m_indices[0]}; + const array<Index, 1> indices{{m_indices[0]}}; const array<Index, 1> kernel_dims{{m_kernelImpl.dimensions()[0]}}; internal::IndexMapper<Index, InputDims, 1, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices); cgh.parallel_for(cl::sycl::nd_range<2>(global_range, local_range), diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h index 9858d0560..e209799bb 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h @@ -194,7 +194,7 @@ struct SyclDevice { auto s= sycl_queue().get_device().template get_info<cl::sycl::info::device::vendor>(); std::transform(s.begin(), s.end(), s.begin(), ::tolower); if(sycl_queue().get_device().is_cpu()){ // intel doesnot allow to use max workgroup size - tileSize=std::min(static_cast<size_t>(256), static_cast<size_t>(tileSize)); + tileSize=std::min(static_cast<Index>(256), static_cast<Index>(tileSize)); } rng = n; if (rng==0) rng=static_cast<Index>(1); @@ -211,10 +211,10 @@ struct SyclDevice { EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1, Index &tileSize0, Index &tileSize1, Index &rng0, Index &rng1, Index &GRange0, Index &GRange1) const { Index max_workgroup_Size = static_cast<Index>(maxSyclThreadsPerBlock()); if(sycl_queue().get_device().is_cpu()){ // intel doesnot allow to use max workgroup size - max_workgroup_Size=std::min(static_cast<size_t>(256), static_cast<size_t>(max_workgroup_Size)); + max_workgroup_Size=std::min(static_cast<Index>(256), static_cast<Index>(max_workgroup_Size)); } - size_t pow_of_2 = static_cast<size_t>(std::log2(max_workgroup_Size)); - tileSize1 =static_cast<Index>(std::pow(2, static_cast<size_t>(pow_of_2/2))); + Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size)); + tileSize1 =static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2/2))); rng1=dim1; if (rng1==0 ) rng1=static_cast<Index>(1); GRange1=rng1; @@ -241,10 +241,10 @@ struct SyclDevice { EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1,Index dim2, Index &tileSize0, Index &tileSize1, Index &tileSize2, Index &rng0, Index &rng1, Index &rng2, Index &GRange0, Index &GRange1, Index &GRange2) const { Index max_workgroup_Size = static_cast<Index>(maxSyclThreadsPerBlock()); if(sycl_queue().get_device().is_cpu()){ // intel doesnot allow to use max workgroup size - max_workgroup_Size=std::min(static_cast<size_t>(256), static_cast<size_t>(max_workgroup_Size)); + max_workgroup_Size=std::min(static_cast<Index>(256), static_cast<Index>(max_workgroup_Size)); } - size_t pow_of_2 = static_cast<size_t>(std::log2(max_workgroup_Size)); - tileSize2 =static_cast<Index>(std::pow(2, static_cast<size_t>(pow_of_2/3))); + Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size)); + tileSize2 =static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2/3))); rng2=dim2; if (rng2==0 ) rng1=static_cast<Index>(1); GRange2=rng2; @@ -253,8 +253,8 @@ struct SyclDevice { Index xMode = static_cast<Index>(GRange2 % tileSize2); if (xMode != 0) GRange2 += static_cast<Index>(tileSize2 - xMode); } - pow_of_2 = static_cast<size_t>(std::log2(static_cast<Index>(max_workgroup_Size/tileSize2))); - tileSize1 =static_cast<Index>(std::pow(2, static_cast<size_t>(pow_of_2/2))); + pow_of_2 = static_cast<Index>(std::log2(static_cast<Index>(max_workgroup_Size/tileSize2))); + tileSize1 =static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2/2))); rng1=dim1; if (rng1==0 ) rng1=static_cast<Index>(1); GRange1=rng1; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h index 94692be56..cac785540 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h @@ -50,10 +50,9 @@ template<typename Expr, typename FunctorExpr, typename TupleType > struct ExecEx /// creates the expression tree for the device with accessor to buffers; /// construct the kernel and submit it to the sycl queue. /// std::array does not have TotalSize. So I have to get the size through template specialisation. -template<typename Index, typename Dimensions> struct DimensionSize{ - static Index getDimSize(const Dimensions& dim){ +template<typename , typename Dimensions> struct DimensionSize{ + static auto getDimSize(const Dimensions& dim)->decltype(dim.TotalSize()){ return dim.TotalSize(); - } }; #define DIMSIZEMACRO(CVQual)\ diff --git a/unsupported/test/cxx11_tensor_broadcast_sycl.cpp b/unsupported/test/cxx11_tensor_broadcast_sycl.cpp index c426549f1..21fdfca22 100644 --- a/unsupported/test/cxx11_tensor_broadcast_sycl.cpp +++ b/unsupported/test/cxx11_tensor_broadcast_sycl.cpp @@ -131,11 +131,6 @@ template<typename DataType> void sycl_broadcast_test_per_device(const cl::sycl:: std::cout << "Running on " << d.template get_info<cl::sycl::info::device::name>() << std::endl; QueueInterface queueInterface(d); auto sycl_device = Eigen::SyclDevice(&queueInterface); - - test_broadcast_sycl_fixed<DataType, RowMajor, int>(sycl_device); - test_broadcast_sycl<DataType, RowMajor, int>(sycl_device); - test_broadcast_sycl_fixed<DataType, ColMajor, int>(sycl_device); - test_broadcast_sycl<DataType, ColMajor, int>(sycl_device); test_broadcast_sycl<DataType, RowMajor, int64_t>(sycl_device); test_broadcast_sycl<DataType, ColMajor, int64_t>(sycl_device); test_broadcast_sycl_fixed<DataType, RowMajor, int64_t>(sycl_device); diff --git a/unsupported/test/cxx11_tensor_builtins_sycl.cpp b/unsupported/test/cxx11_tensor_builtins_sycl.cpp index d5193d1ea..400a31d09 100644 --- a/unsupported/test/cxx11_tensor_builtins_sycl.cpp +++ b/unsupported/test/cxx11_tensor_builtins_sycl.cpp @@ -14,7 +14,7 @@ #define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_COMPLEX #define EIGEN_TEST_FUNC cxx11_tensor_builtins_sycl -#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t #define EIGEN_USE_SYCL #include "main.h" @@ -32,20 +32,20 @@ template <typename T> T cube(T x) { return x * x * x; } template <typename T> T inverse(T x) { return 1 / x; } } -#define TEST_UNARY_BUILTINS_FOR_SCALAR(FUNC, SCALAR, OPERATOR) \ +#define TEST_UNARY_BUILTINS_FOR_SCALAR(FUNC, SCALAR, OPERATOR, Layout) \ { \ /* out OPERATOR in.FUNC() */ \ - Tensor<SCALAR, 3> in(tensorRange); \ - Tensor<SCALAR, 3> out(tensorRange); \ + Tensor<SCALAR, 3, Layout, int64_t> in(tensorRange); \ + Tensor<SCALAR, 3, Layout, int64_t> out(tensorRange); \ in = in.random() + static_cast<SCALAR>(0.01); \ out = out.random() + static_cast<SCALAR>(0.01); \ - Tensor<SCALAR, 3> reference(out); \ + Tensor<SCALAR, 3, Layout, int64_t> reference(out); \ SCALAR *gpu_data = static_cast<SCALAR *>( \ sycl_device.allocate(in.size() * sizeof(SCALAR))); \ SCALAR *gpu_data_out = static_cast<SCALAR *>( \ sycl_device.allocate(out.size() * sizeof(SCALAR))); \ - TensorMap<Tensor<SCALAR, 3>> gpu(gpu_data, tensorRange); \ - TensorMap<Tensor<SCALAR, 3>> gpu_out(gpu_data_out, tensorRange); \ + TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu(gpu_data, tensorRange); \ + TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_out(gpu_data_out, tensorRange); \ sycl_device.memcpyHostToDevice(gpu_data, in.data(), \ (in.size()) * sizeof(SCALAR)); \ sycl_device.memcpyHostToDevice(gpu_data_out, out.data(), \ @@ -53,7 +53,7 @@ template <typename T> T inverse(T x) { return 1 / x; } gpu_out.device(sycl_device) OPERATOR gpu.FUNC(); \ sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, \ (out.size()) * sizeof(SCALAR)); \ - for (int i = 0; i < out.size(); ++i) { \ + for (int64_t i = 0; i < out.size(); ++i) { \ SCALAR ver = reference(i); \ ver OPERATOR std::FUNC(in(i)); \ VERIFY_IS_APPROX(out(i), ver); \ @@ -63,18 +63,18 @@ template <typename T> T inverse(T x) { return 1 / x; } } \ { \ /* out OPERATOR out.FUNC() */ \ - Tensor<SCALAR, 3> out(tensorRange); \ + Tensor<SCALAR, 3, Layout, int64_t> out(tensorRange); \ out = out.random() + static_cast<SCALAR>(0.01); \ - Tensor<SCALAR, 3> reference(out); \ + Tensor<SCALAR, 3, Layout, int64_t> reference(out); \ SCALAR *gpu_data_out = static_cast<SCALAR *>( \ sycl_device.allocate(out.size() * sizeof(SCALAR))); \ - TensorMap<Tensor<SCALAR, 3>> gpu_out(gpu_data_out, tensorRange); \ + TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_out(gpu_data_out, tensorRange); \ sycl_device.memcpyHostToDevice(gpu_data_out, out.data(), \ (out.size()) * sizeof(SCALAR)); \ gpu_out.device(sycl_device) OPERATOR gpu_out.FUNC(); \ sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, \ (out.size()) * sizeof(SCALAR)); \ - for (int i = 0; i < out.size(); ++i) { \ + for (int64_t i = 0; i < out.size(); ++i) { \ SCALAR ver = reference(i); \ ver OPERATOR std::FUNC(reference(i)); \ VERIFY_IS_APPROX(out(i), ver); \ @@ -82,61 +82,62 @@ template <typename T> T inverse(T x) { return 1 / x; } sycl_device.deallocate(gpu_data_out); \ } -#define TEST_UNARY_BUILTINS_OPERATOR(SCALAR, OPERATOR) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(abs, SCALAR, OPERATOR) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(sqrt, SCALAR, OPERATOR) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(rsqrt, SCALAR, OPERATOR) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(square, SCALAR, OPERATOR) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(cube, SCALAR, OPERATOR) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(inverse, SCALAR, OPERATOR) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(tanh, SCALAR, OPERATOR) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(exp, SCALAR, OPERATOR) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(expm1, SCALAR, OPERATOR) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(log, SCALAR, OPERATOR) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(abs, SCALAR, OPERATOR) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(ceil, SCALAR, OPERATOR) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(floor, SCALAR, OPERATOR) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(round, SCALAR, OPERATOR) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(log1p, SCALAR, OPERATOR) +#define TEST_UNARY_BUILTINS_OPERATOR(SCALAR, OPERATOR , Layout) \ + TEST_UNARY_BUILTINS_FOR_SCALAR(abs, SCALAR, OPERATOR , Layout) \ + TEST_UNARY_BUILTINS_FOR_SCALAR(sqrt, SCALAR, OPERATOR , Layout) \ + TEST_UNARY_BUILTINS_FOR_SCALAR(rsqrt, SCALAR, OPERATOR , Layout) \ + TEST_UNARY_BUILTINS_FOR_SCALAR(square, SCALAR, OPERATOR , Layout) \ + TEST_UNARY_BUILTINS_FOR_SCALAR(cube, SCALAR, OPERATOR , Layout) \ + TEST_UNARY_BUILTINS_FOR_SCALAR(inverse, SCALAR, OPERATOR , Layout) \ + TEST_UNARY_BUILTINS_FOR_SCALAR(tanh, SCALAR, OPERATOR , Layout) \ + TEST_UNARY_BUILTINS_FOR_SCALAR(exp, SCALAR, OPERATOR , Layout) \ + TEST_UNARY_BUILTINS_FOR_SCALAR(expm1, SCALAR, OPERATOR , Layout) \ + TEST_UNARY_BUILTINS_FOR_SCALAR(log, SCALAR, OPERATOR , Layout) \ + TEST_UNARY_BUILTINS_FOR_SCALAR(abs, SCALAR, OPERATOR , Layout) \ + TEST_UNARY_BUILTINS_FOR_SCALAR(ceil, SCALAR, OPERATOR , Layout) \ + TEST_UNARY_BUILTINS_FOR_SCALAR(floor, SCALAR, OPERATOR , Layout) \ + TEST_UNARY_BUILTINS_FOR_SCALAR(round, SCALAR, OPERATOR , Layout) \ + TEST_UNARY_BUILTINS_FOR_SCALAR(log1p, SCALAR, OPERATOR , Layout) -#define TEST_IS_THAT_RETURNS_BOOL(SCALAR, FUNC) \ +#define TEST_IS_THAT_RETURNS_BOOL(SCALAR, FUNC, Layout) \ { \ /* out = in.FUNC() */ \ - Tensor<SCALAR, 3> in(tensorRange); \ - Tensor<bool, 3> out(tensorRange); \ + Tensor<SCALAR, 3, Layout, int64_t> in(tensorRange); \ + Tensor<bool, 3, Layout, int64_t> out(tensorRange); \ in = in.random() + static_cast<SCALAR>(0.01); \ SCALAR *gpu_data = static_cast<SCALAR *>( \ sycl_device.allocate(in.size() * sizeof(SCALAR))); \ bool *gpu_data_out = \ static_cast<bool *>(sycl_device.allocate(out.size() * sizeof(bool))); \ - TensorMap<Tensor<SCALAR, 3>> gpu(gpu_data, tensorRange); \ - TensorMap<Tensor<bool, 3>> gpu_out(gpu_data_out, tensorRange); \ + TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu(gpu_data, tensorRange); \ + TensorMap<Tensor<bool, 3, Layout, int64_t>> gpu_out(gpu_data_out, tensorRange); \ sycl_device.memcpyHostToDevice(gpu_data, in.data(), \ (in.size()) * sizeof(SCALAR)); \ gpu_out.device(sycl_device) = gpu.FUNC(); \ sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, \ (out.size()) * sizeof(bool)); \ - for (int i = 0; i < out.size(); ++i) { \ + for (int64_t i = 0; i < out.size(); ++i) { \ VERIFY_IS_EQUAL(out(i), std::FUNC(in(i))); \ } \ sycl_device.deallocate(gpu_data); \ sycl_device.deallocate(gpu_data_out); \ } -#define TEST_UNARY_BUILTINS(SCALAR) \ - TEST_UNARY_BUILTINS_OPERATOR(SCALAR, +=) \ - TEST_UNARY_BUILTINS_OPERATOR(SCALAR, =) \ - TEST_IS_THAT_RETURNS_BOOL(SCALAR, isnan) \ - TEST_IS_THAT_RETURNS_BOOL(SCALAR, isfinite) \ - TEST_IS_THAT_RETURNS_BOOL(SCALAR, isinf) +#define TEST_UNARY_BUILTINS(SCALAR, Layout) \ + TEST_UNARY_BUILTINS_OPERATOR(SCALAR, +=, Layout) \ + TEST_UNARY_BUILTINS_OPERATOR(SCALAR, =, Layout) \ + TEST_IS_THAT_RETURNS_BOOL(SCALAR, isnan, Layout) \ + TEST_IS_THAT_RETURNS_BOOL(SCALAR, isfinite, Layout) \ + TEST_IS_THAT_RETURNS_BOOL(SCALAR, isinf, Layout) static void test_builtin_unary_sycl(const Eigen::SyclDevice &sycl_device) { - int sizeDim1 = 10; - int sizeDim2 = 10; - int sizeDim3 = 10; - array<int, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}}; + int64_t sizeDim1 = 10; + int64_t sizeDim2 = 10; + int64_t sizeDim3 = 10; + array<int64_t, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}}; - TEST_UNARY_BUILTINS(float) + TEST_UNARY_BUILTINS(float, RowMajor) + TEST_UNARY_BUILTINS(float, ColMajor) } namespace std { @@ -144,24 +145,24 @@ template <typename T> T cwiseMax(T x, T y) { return std::max(x, y); } template <typename T> T cwiseMin(T x, T y) { return std::min(x, y); } } -#define TEST_BINARY_BUILTINS_FUNC(SCALAR, FUNC) \ +#define TEST_BINARY_BUILTINS_FUNC(SCALAR, FUNC, Layout) \ { \ /* out = in_1.FUNC(in_2) */ \ - Tensor<SCALAR, 3> in_1(tensorRange); \ - Tensor<SCALAR, 3> in_2(tensorRange); \ - Tensor<SCALAR, 3> out(tensorRange); \ + Tensor<SCALAR, 3, Layout, int64_t> in_1(tensorRange); \ + Tensor<SCALAR, 3, Layout, int64_t> in_2(tensorRange); \ + Tensor<SCALAR, 3, Layout, int64_t> out(tensorRange); \ in_1 = in_1.random() + static_cast<SCALAR>(0.01); \ in_2 = in_2.random() + static_cast<SCALAR>(0.01); \ - Tensor<SCALAR, 3> reference(out); \ + Tensor<SCALAR, 3, Layout, int64_t> reference(out); \ SCALAR *gpu_data_1 = static_cast<SCALAR *>( \ sycl_device.allocate(in_1.size() * sizeof(SCALAR))); \ SCALAR *gpu_data_2 = static_cast<SCALAR *>( \ sycl_device.allocate(in_2.size() * sizeof(SCALAR))); \ SCALAR *gpu_data_out = static_cast<SCALAR *>( \ sycl_device.allocate(out.size() * sizeof(SCALAR))); \ - TensorMap<Tensor<SCALAR, 3>> gpu_1(gpu_data_1, tensorRange); \ - TensorMap<Tensor<SCALAR, 3>> gpu_2(gpu_data_2, tensorRange); \ - TensorMap<Tensor<SCALAR, 3>> gpu_out(gpu_data_out, tensorRange); \ + TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_1(gpu_data_1, tensorRange); \ + TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_2(gpu_data_2, tensorRange); \ + TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_out(gpu_data_out, tensorRange); \ sycl_device.memcpyHostToDevice(gpu_data_1, in_1.data(), \ (in_1.size()) * sizeof(SCALAR)); \ sycl_device.memcpyHostToDevice(gpu_data_2, in_2.data(), \ @@ -169,7 +170,7 @@ template <typename T> T cwiseMin(T x, T y) { return std::min(x, y); } gpu_out.device(sycl_device) = gpu_1.FUNC(gpu_2); \ sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, \ (out.size()) * sizeof(SCALAR)); \ - for (int i = 0; i < out.size(); ++i) { \ + for (int64_t i = 0; i < out.size(); ++i) { \ SCALAR ver = reference(i); \ ver = std::FUNC(in_1(i), in_2(i)); \ VERIFY_IS_APPROX(out(i), ver); \ @@ -179,24 +180,24 @@ template <typename T> T cwiseMin(T x, T y) { return std::min(x, y); } sycl_device.deallocate(gpu_data_out); \ } -#define TEST_BINARY_BUILTINS_OPERATORS(SCALAR, OPERATOR) \ +#define TEST_BINARY_BUILTINS_OPERATORS(SCALAR, OPERATOR, Layout) \ { \ /* out = in_1 OPERATOR in_2 */ \ - Tensor<SCALAR, 3> in_1(tensorRange); \ - Tensor<SCALAR, 3> in_2(tensorRange); \ - Tensor<SCALAR, 3> out(tensorRange); \ + Tensor<SCALAR, 3, Layout, int64_t> in_1(tensorRange); \ + Tensor<SCALAR, 3, Layout, int64_t> in_2(tensorRange); \ + Tensor<SCALAR, 3, Layout, int64_t> out(tensorRange); \ in_1 = in_1.random() + static_cast<SCALAR>(0.01); \ in_2 = in_2.random() + static_cast<SCALAR>(0.01); \ - Tensor<SCALAR, 3> reference(out); \ + Tensor<SCALAR, 3, Layout, int64_t> reference(out); \ SCALAR *gpu_data_1 = static_cast<SCALAR *>( \ sycl_device.allocate(in_1.size() * sizeof(SCALAR))); \ SCALAR *gpu_data_2 = static_cast<SCALAR *>( \ sycl_device.allocate(in_2.size() * sizeof(SCALAR))); \ SCALAR *gpu_data_out = static_cast<SCALAR *>( \ sycl_device.allocate(out.size() * sizeof(SCALAR))); \ - TensorMap<Tensor<SCALAR, 3>> gpu_1(gpu_data_1, tensorRange); \ - TensorMap<Tensor<SCALAR, 3>> gpu_2(gpu_data_2, tensorRange); \ - TensorMap<Tensor<SCALAR, 3>> gpu_out(gpu_data_out, tensorRange); \ + TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_1(gpu_data_1, tensorRange); \ + TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_2(gpu_data_2, tensorRange); \ + TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_out(gpu_data_out, tensorRange); \ sycl_device.memcpyHostToDevice(gpu_data_1, in_1.data(), \ (in_1.size()) * sizeof(SCALAR)); \ sycl_device.memcpyHostToDevice(gpu_data_2, in_2.data(), \ @@ -204,7 +205,7 @@ template <typename T> T cwiseMin(T x, T y) { return std::min(x, y); } gpu_out.device(sycl_device) = gpu_1 OPERATOR gpu_2; \ sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, \ (out.size()) * sizeof(SCALAR)); \ - for (int i = 0; i < out.size(); ++i) { \ + for (int64_t i = 0; i < out.size(); ++i) { \ VERIFY_IS_APPROX(out(i), in_1(i) OPERATOR in_2(i)); \ } \ sycl_device.deallocate(gpu_data_1); \ @@ -212,46 +213,48 @@ template <typename T> T cwiseMin(T x, T y) { return std::min(x, y); } sycl_device.deallocate(gpu_data_out); \ } -#define TEST_BINARY_BUILTINS_OPERATORS_THAT_TAKES_SCALAR(SCALAR, OPERATOR) \ +#define TEST_BINARY_BUILTINS_OPERATORS_THAT_TAKES_SCALAR(SCALAR, OPERATOR, Layout) \ { \ /* out = in_1 OPERATOR 2 */ \ - Tensor<SCALAR, 3> in_1(tensorRange); \ - Tensor<SCALAR, 3> out(tensorRange); \ + Tensor<SCALAR, 3, Layout, int64_t> in_1(tensorRange); \ + Tensor<SCALAR, 3, Layout, int64_t> out(tensorRange); \ in_1 = in_1.random() + static_cast<SCALAR>(0.01); \ - Tensor<SCALAR, 3> reference(out); \ + Tensor<SCALAR, 3, Layout, int64_t> reference(out); \ SCALAR *gpu_data_1 = static_cast<SCALAR *>( \ sycl_device.allocate(in_1.size() * sizeof(SCALAR))); \ SCALAR *gpu_data_out = static_cast<SCALAR *>( \ sycl_device.allocate(out.size() * sizeof(SCALAR))); \ - TensorMap<Tensor<SCALAR, 3>> gpu_1(gpu_data_1, tensorRange); \ - TensorMap<Tensor<SCALAR, 3>> gpu_out(gpu_data_out, tensorRange); \ + TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_1(gpu_data_1, tensorRange); \ + TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_out(gpu_data_out, tensorRange); \ sycl_device.memcpyHostToDevice(gpu_data_1, in_1.data(), \ (in_1.size()) * sizeof(SCALAR)); \ gpu_out.device(sycl_device) = gpu_1 OPERATOR 2; \ sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, \ (out.size()) * sizeof(SCALAR)); \ - for (int i = 0; i < out.size(); ++i) { \ + for (int64_t i = 0; i < out.size(); ++i) { \ VERIFY_IS_APPROX(out(i), in_1(i) OPERATOR 2); \ } \ sycl_device.deallocate(gpu_data_1); \ sycl_device.deallocate(gpu_data_out); \ } -#define TEST_BINARY_BUILTINS(SCALAR) \ - TEST_BINARY_BUILTINS_FUNC(SCALAR, cwiseMax) \ - TEST_BINARY_BUILTINS_FUNC(SCALAR, cwiseMin) \ - TEST_BINARY_BUILTINS_OPERATORS(SCALAR, +) \ - TEST_BINARY_BUILTINS_OPERATORS(SCALAR, -) \ - TEST_BINARY_BUILTINS_OPERATORS(SCALAR, *) \ - TEST_BINARY_BUILTINS_OPERATORS(SCALAR, /) +#define TEST_BINARY_BUILTINS(SCALAR, Layout) \ + TEST_BINARY_BUILTINS_FUNC(SCALAR, cwiseMax , Layout) \ + TEST_BINARY_BUILTINS_FUNC(SCALAR, cwiseMin , Layout) \ + TEST_BINARY_BUILTINS_OPERATORS(SCALAR, + , Layout) \ + TEST_BINARY_BUILTINS_OPERATORS(SCALAR, - , Layout) \ + TEST_BINARY_BUILTINS_OPERATORS(SCALAR, * , Layout) \ + TEST_BINARY_BUILTINS_OPERATORS(SCALAR, / , Layout) static void test_builtin_binary_sycl(const Eigen::SyclDevice &sycl_device) { - int sizeDim1 = 10; - int sizeDim2 = 10; - int sizeDim3 = 10; - array<int, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}}; - TEST_BINARY_BUILTINS(float) - TEST_BINARY_BUILTINS_OPERATORS_THAT_TAKES_SCALAR(int, %) + int64_t sizeDim1 = 10; + int64_t sizeDim2 = 10; + int64_t sizeDim3 = 10; + array<int64_t, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}}; + TEST_BINARY_BUILTINS(float, RowMajor) + TEST_BINARY_BUILTINS_OPERATORS_THAT_TAKES_SCALAR(int, %, RowMajor) + TEST_BINARY_BUILTINS(float, ColMajor) + TEST_BINARY_BUILTINS_OPERATORS_THAT_TAKES_SCALAR(int, %, ColMajor) } void test_cxx11_tensor_builtins_sycl() { diff --git a/unsupported/test/cxx11_tensor_concatenation_sycl.cpp b/unsupported/test/cxx11_tensor_concatenation_sycl.cpp index 5a324b44c..e3023a368 100644 --- a/unsupported/test/cxx11_tensor_concatenation_sycl.cpp +++ b/unsupported/test/cxx11_tensor_concatenation_sycl.cpp @@ -14,7 +14,7 @@ #define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_COMPLEX #define EIGEN_TEST_FUNC cxx11_tensor_concatenation_sycl -#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t #define EIGEN_USE_SYCL #include "main.h" @@ -22,39 +22,39 @@ using Eigen::Tensor; -template<typename DataType, int DataLayout, typename Index> +template<typename DataType, int DataLayout, typename IndexType> static void test_simple_concatenation(const Eigen::SyclDevice& sycl_device) { - Index leftDim1 = 2; - Index leftDim2 = 3; - Index leftDim3 = 1; - Eigen::array<Index, 3> leftRange = {{leftDim1, leftDim2, leftDim3}}; - Index rightDim1 = 2; - Index rightDim2 = 3; - Index rightDim3 = 1; - Eigen::array<Index, 3> rightRange = {{rightDim1, rightDim2, rightDim3}}; - - //Index concatDim1 = 3; -// Index concatDim2 = 3; -// Index concatDim3 = 1; - //Eigen::array<Index, 3> concatRange = {{concatDim1, concatDim2, concatDim3}}; - - Tensor<DataType, 3, DataLayout, Index> left(leftRange); - Tensor<DataType, 3, DataLayout, Index> right(rightRange); + IndexType leftDim1 = 2; + IndexType leftDim2 = 3; + IndexType leftDim3 = 1; + Eigen::array<IndexType, 3> leftRange = {{leftDim1, leftDim2, leftDim3}}; + IndexType rightDim1 = 2; + IndexType rightDim2 = 3; + IndexType rightDim3 = 1; + Eigen::array<IndexType, 3> rightRange = {{rightDim1, rightDim2, rightDim3}}; + + //IndexType concatDim1 = 3; +// IndexType concatDim2 = 3; +// IndexType concatDim3 = 1; + //Eigen::array<IndexType, 3> concatRange = {{concatDim1, concatDim2, concatDim3}}; + + Tensor<DataType, 3, DataLayout, IndexType> left(leftRange); + Tensor<DataType, 3, DataLayout, IndexType> right(rightRange); left.setRandom(); right.setRandom(); DataType * gpu_in1_data = static_cast<DataType*>(sycl_device.allocate(left.dimensions().TotalSize()*sizeof(DataType))); DataType * gpu_in2_data = static_cast<DataType*>(sycl_device.allocate(right.dimensions().TotalSize()*sizeof(DataType))); - Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, Index>> gpu_in1(gpu_in1_data, leftRange); - Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, Index>> gpu_in2(gpu_in2_data, rightRange); + Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_in1(gpu_in1_data, leftRange); + Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_in2(gpu_in2_data, rightRange); sycl_device.memcpyHostToDevice(gpu_in1_data, left.data(),(left.dimensions().TotalSize())*sizeof(DataType)); sycl_device.memcpyHostToDevice(gpu_in2_data, right.data(),(right.dimensions().TotalSize())*sizeof(DataType)); /// - Tensor<DataType, 3, DataLayout, Index> concatenation1(leftDim1+rightDim1, leftDim2, leftDim3); + Tensor<DataType, 3, DataLayout, IndexType> concatenation1(leftDim1+rightDim1, leftDim2, leftDim3); DataType * gpu_out_data1 = static_cast<DataType*>(sycl_device.allocate(concatenation1.dimensions().TotalSize()*sizeof(DataType))); - Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, Index>> gpu_out1(gpu_out_data1, concatenation1.dimensions()); + Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_out1(gpu_out_data1, concatenation1.dimensions()); //concatenation = left.concatenate(right, 0); gpu_out1.device(sycl_device) =gpu_in1.concatenate(gpu_in2, 0); @@ -63,19 +63,19 @@ static void test_simple_concatenation(const Eigen::SyclDevice& sycl_device) VERIFY_IS_EQUAL(concatenation1.dimension(0), 4); VERIFY_IS_EQUAL(concatenation1.dimension(1), 3); VERIFY_IS_EQUAL(concatenation1.dimension(2), 1); - for (int j = 0; j < 3; ++j) { - for (int i = 0; i < 2; ++i) { + for (IndexType j = 0; j < 3; ++j) { + for (IndexType i = 0; i < 2; ++i) { VERIFY_IS_EQUAL(concatenation1(i, j, 0), left(i, j, 0)); } - for (int i = 2; i < 4; ++i) { + for (IndexType i = 2; i < 4; ++i) { VERIFY_IS_EQUAL(concatenation1(i, j, 0), right(i - 2, j, 0)); } } sycl_device.deallocate(gpu_out_data1); - Tensor<DataType, 3, DataLayout, Index> concatenation2(leftDim1, leftDim2 +rightDim2, leftDim3); + Tensor<DataType, 3, DataLayout, IndexType> concatenation2(leftDim1, leftDim2 +rightDim2, leftDim3); DataType * gpu_out_data2 = static_cast<DataType*>(sycl_device.allocate(concatenation2.dimensions().TotalSize()*sizeof(DataType))); - Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, Index>> gpu_out2(gpu_out_data2, concatenation2.dimensions()); + Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_out2(gpu_out_data2, concatenation2.dimensions()); gpu_out2.device(sycl_device) =gpu_in1.concatenate(gpu_in2, 1); sycl_device.memcpyDeviceToHost(concatenation2.data(), gpu_out_data2,(concatenation2.dimensions().TotalSize())*sizeof(DataType)); @@ -83,18 +83,18 @@ static void test_simple_concatenation(const Eigen::SyclDevice& sycl_device) VERIFY_IS_EQUAL(concatenation2.dimension(0), 2); VERIFY_IS_EQUAL(concatenation2.dimension(1), 6); VERIFY_IS_EQUAL(concatenation2.dimension(2), 1); - for (int i = 0; i < 2; ++i) { - for (int j = 0; j < 3; ++j) { + for (IndexType i = 0; i < 2; ++i) { + for (IndexType j = 0; j < 3; ++j) { VERIFY_IS_EQUAL(concatenation2(i, j, 0), left(i, j, 0)); } - for (int j = 3; j < 6; ++j) { + for (IndexType j = 3; j < 6; ++j) { VERIFY_IS_EQUAL(concatenation2(i, j, 0), right(i, j - 3, 0)); } } sycl_device.deallocate(gpu_out_data2); - Tensor<DataType, 3, DataLayout, Index> concatenation3(leftDim1, leftDim2, leftDim3+rightDim3); + Tensor<DataType, 3, DataLayout, IndexType> concatenation3(leftDim1, leftDim2, leftDim3+rightDim3); DataType * gpu_out_data3 = static_cast<DataType*>(sycl_device.allocate(concatenation3.dimensions().TotalSize()*sizeof(DataType))); - Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, Index>> gpu_out3(gpu_out_data3, concatenation3.dimensions()); + Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_out3(gpu_out_data3, concatenation3.dimensions()); gpu_out3.device(sycl_device) =gpu_in1.concatenate(gpu_in2, 2); sycl_device.memcpyDeviceToHost(concatenation3.data(), gpu_out_data3,(concatenation3.dimensions().TotalSize())*sizeof(DataType)); @@ -102,8 +102,8 @@ static void test_simple_concatenation(const Eigen::SyclDevice& sycl_device) VERIFY_IS_EQUAL(concatenation3.dimension(0), 2); VERIFY_IS_EQUAL(concatenation3.dimension(1), 3); VERIFY_IS_EQUAL(concatenation3.dimension(2), 2); - for (int i = 0; i < 2; ++i) { - for (int j = 0; j < 3; ++j) { + for (IndexType i = 0; i < 2; ++i) { + for (IndexType j = 0; j < 3; ++j) { VERIFY_IS_EQUAL(concatenation3(i, j, 0), left(i, j, 0)); VERIFY_IS_EQUAL(concatenation3(i, j, 1), right(i, j, 0)); } @@ -112,25 +112,25 @@ static void test_simple_concatenation(const Eigen::SyclDevice& sycl_device) sycl_device.deallocate(gpu_in1_data); sycl_device.deallocate(gpu_in2_data); } -template<typename DataType, int DataLayout, typename Index> +template<typename DataType, int DataLayout, typename IndexType> static void test_concatenation_as_lvalue(const Eigen::SyclDevice& sycl_device) { - Index leftDim1 = 2; - Index leftDim2 = 3; - Eigen::array<Index, 2> leftRange = {{leftDim1, leftDim2}}; + IndexType leftDim1 = 2; + IndexType leftDim2 = 3; + Eigen::array<IndexType, 2> leftRange = {{leftDim1, leftDim2}}; - Index rightDim1 = 2; - Index rightDim2 = 3; - Eigen::array<Index, 2> rightRange = {{rightDim1, rightDim2}}; + IndexType rightDim1 = 2; + IndexType rightDim2 = 3; + Eigen::array<IndexType, 2> rightRange = {{rightDim1, rightDim2}}; - Index concatDim1 = 4; - Index concatDim2 = 3; - Eigen::array<Index, 2> resRange = {{concatDim1, concatDim2}}; + IndexType concatDim1 = 4; + IndexType concatDim2 = 3; + Eigen::array<IndexType, 2> resRange = {{concatDim1, concatDim2}}; - Tensor<DataType, 2, DataLayout, Index> left(leftRange); - Tensor<DataType, 2, DataLayout, Index> right(rightRange); - Tensor<DataType, 2, DataLayout, Index> result(resRange); + Tensor<DataType, 2, DataLayout, IndexType> left(leftRange); + Tensor<DataType, 2, DataLayout, IndexType> right(rightRange); + Tensor<DataType, 2, DataLayout, IndexType> result(resRange); left.setRandom(); right.setRandom(); @@ -141,9 +141,9 @@ static void test_concatenation_as_lvalue(const Eigen::SyclDevice& sycl_device) DataType * gpu_out_data = static_cast<DataType*>(sycl_device.allocate(result.dimensions().TotalSize()*sizeof(DataType))); - Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, Index>> gpu_in1(gpu_in1_data, leftRange); - Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, Index>> gpu_in2(gpu_in2_data, rightRange); - Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, Index>> gpu_out(gpu_out_data, resRange); + Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> gpu_in1(gpu_in1_data, leftRange); + Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> gpu_in2(gpu_in2_data, rightRange); + Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> gpu_out(gpu_out_data, resRange); sycl_device.memcpyHostToDevice(gpu_in1_data, left.data(),(left.dimensions().TotalSize())*sizeof(DataType)); sycl_device.memcpyHostToDevice(gpu_in2_data, right.data(),(right.dimensions().TotalSize())*sizeof(DataType)); @@ -154,8 +154,8 @@ static void test_concatenation_as_lvalue(const Eigen::SyclDevice& sycl_device) sycl_device.memcpyDeviceToHost(left.data(), gpu_in1_data,(left.dimensions().TotalSize())*sizeof(DataType)); sycl_device.memcpyDeviceToHost(right.data(), gpu_in2_data,(right.dimensions().TotalSize())*sizeof(DataType)); - for (int i = 0; i < 2; ++i) { - for (int j = 0; j < 3; ++j) { + for (IndexType i = 0; i < 2; ++i) { + for (IndexType j = 0; j < 3; ++j) { VERIFY_IS_EQUAL(left(i, j), result(i, j)); VERIFY_IS_EQUAL(right(i, j), result(i+2, j)); } @@ -169,9 +169,9 @@ static void test_concatenation_as_lvalue(const Eigen::SyclDevice& sycl_device) template <typename DataType, typename Dev_selector> void tensorConcat_perDevice(Dev_selector s){ QueueInterface queueInterface(s); auto sycl_device = Eigen::SyclDevice(&queueInterface); - test_simple_concatenation<DataType, RowMajor, int>(sycl_device); - test_simple_concatenation<DataType, ColMajor, int>(sycl_device); - test_concatenation_as_lvalue<DataType, ColMajor, int>(sycl_device); + test_simple_concatenation<DataType, RowMajor, int64_t>(sycl_device); + test_simple_concatenation<DataType, ColMajor, int64_t>(sycl_device); + test_concatenation_as_lvalue<DataType, ColMajor, int64_t>(sycl_device); } void test_cxx11_tensor_concatenation_sycl() { for (const auto& device :Eigen::get_sycl_supported_devices()) { diff --git a/unsupported/test/cxx11_tensor_contract_sycl.cpp b/unsupported/test/cxx11_tensor_contract_sycl.cpp index cb8fcb74c..41acd5579 100644 --- a/unsupported/test/cxx11_tensor_contract_sycl.cpp +++ b/unsupported/test/cxx11_tensor_contract_sycl.cpp @@ -14,7 +14,7 @@ #define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_COMPLEX #define EIGEN_TEST_FUNC cxx11_tensor_contract_sycl -#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t #define EIGEN_USE_SYCL #include <iostream> @@ -28,39 +28,39 @@ using Eigen::array; using Eigen::SyclDevice; using Eigen::Tensor; using Eigen::TensorMap; -static const float error_threshold =1e-4f; -typedef Tensor<float, 1>::DimensionPair DimPair; -template<int DataLayout, typename Device> -void test_sycl_contraction(const Device& sycl_device, int m_size, int k_size, int n_size) +template<int DataLayout, typename DataType, typename IndexType, typename Device> +void static test_sycl_contraction(const Device& sycl_device, IndexType m_size, IndexType k_size, IndexType n_size) { + typedef typename Tensor<DataType, 1, DataLayout, IndexType>::DimensionPair DimPair; + static const DataType error_threshold =1e-4f; // std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl; // with these dimensions, the output has 300 * 140 elements, which is // more than 30 * 1024, which is the number of threads in blocks on // a 15 SM GK110 GPU - Tensor<float, 2, DataLayout> t_left(m_size, k_size); - Tensor<float, 2, DataLayout> t_right(k_size, n_size); - Tensor<float, 2, DataLayout> t_result(m_size, n_size); - Tensor<float, 2, DataLayout> t_result_gpu(m_size, n_size); + Tensor<DataType, 2, DataLayout, IndexType> t_left(m_size, k_size); + Tensor<DataType, 2, DataLayout, IndexType> t_right(k_size, n_size); + Tensor<DataType, 2, DataLayout, IndexType> t_result(m_size, n_size); + Tensor<DataType, 2, DataLayout, IndexType> t_result_gpu(m_size, n_size); // Eigen::array<DimPair, 1> dims(DimPair(1, 0)); Eigen::array<DimPair, 1> dims = {{DimPair(1, 0)}}; - Eigen::array<int, 2> left_dims = {{m_size, k_size}}; - Eigen::array<int, 2> right_dims = {{k_size, n_size}}; - Eigen::array<int, 2> result_dims = {{m_size, n_size}}; + Eigen::array<IndexType, 2> left_dims = {{m_size, k_size}}; + Eigen::array<IndexType, 2> right_dims = {{k_size, n_size}}; + Eigen::array<IndexType, 2> result_dims = {{m_size, n_size}}; t_left.setRandom(); t_right.setRandom(); - std::size_t t_left_bytes = t_left.size() * sizeof(float); - std::size_t t_right_bytes = t_right.size() * sizeof(float); - std::size_t t_result_bytes = t_result.size() * sizeof(float); + std::size_t t_left_bytes = t_left.size() * sizeof(DataType); + std::size_t t_right_bytes = t_right.size() * sizeof(DataType); + std::size_t t_result_bytes = t_result.size() * sizeof(DataType); - float * d_t_left = static_cast<float*>(sycl_device.allocate(t_left_bytes)); - float * d_t_right = static_cast<float*>(sycl_device.allocate(t_right_bytes)); - float * d_t_result = static_cast<float*>(sycl_device.allocate(t_result_bytes)); + DataType * d_t_left = static_cast<DataType*>(sycl_device.allocate(t_left_bytes)); + DataType * d_t_right = static_cast<DataType*>(sycl_device.allocate(t_right_bytes)); + DataType * d_t_result = static_cast<DataType*>(sycl_device.allocate(t_result_bytes)); - Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > gpu_t_left(d_t_left, left_dims); - Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > gpu_t_right(d_t_right, right_dims); - Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > gpu_t_result(d_t_result, result_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_t_left(d_t_left, left_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_t_right(d_t_right, right_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_t_result(d_t_result, result_dims); sycl_device.memcpyHostToDevice(d_t_left, t_left.data(),t_left_bytes); sycl_device.memcpyHostToDevice(d_t_right, t_right.data(),t_right_bytes); @@ -70,14 +70,14 @@ void test_sycl_contraction(const Device& sycl_device, int m_size, int k_size, in t_result = t_left.contract(t_right, dims); - for (DenseIndex i = 0; i < t_result.size(); i++) { - if (static_cast<float>(fabs(t_result(i) - t_result_gpu(i))) < error_threshold) { + for (IndexType i = 0; i < t_result.size(); i++) { + if (static_cast<DataType>(fabs(t_result(i) - t_result_gpu(i))) < error_threshold) { continue; } if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), error_threshold)) { continue; } - std::cout << "mismatch detected at index " << i << ": " << t_result(i) + std::cout << "mismatch detected at IndexType " << i << ": " << t_result(i) << " vs " << t_result_gpu(i) << std::endl; assert(false); } @@ -86,19 +86,21 @@ void test_sycl_contraction(const Device& sycl_device, int m_size, int k_size, in sycl_device.deallocate(d_t_result); } -template<int DataLayout, typename Device> +template<int DataLayout, typename DataType, typename IndexType, typename Device> void test_TF(const Device& sycl_device) { - Eigen::array<long, 2> left_dims = {{2, 3}}; - Eigen::array<long, 2> right_dims = {{3, 1}}; - Eigen::array<long, 2> res_dims = {{2, 1}}; + typedef typename Tensor<DataType, 1, DataLayout, IndexType>::DimensionPair DimPair; + static const DataType error_threshold =1e-4f; + Eigen::array<IndexType, 2> left_dims = {{2, 3}}; + Eigen::array<IndexType, 2> right_dims = {{3, 1}}; + Eigen::array<IndexType, 2> res_dims = {{2, 1}}; Eigen::array<DimPair, 1> dims = {{DimPair(1, 0)}}; - Tensor<float, 2, DataLayout, long> t_left(left_dims); - Tensor<float, 2, DataLayout, long> t_right(right_dims); - Tensor<float, 2, DataLayout, long> t_result_gpu(res_dims); - Tensor<float, 2, DataLayout, long> t_result(res_dims); + Tensor<DataType, 2, DataLayout, IndexType> t_left(left_dims); + Tensor<DataType, 2, DataLayout, IndexType> t_right(right_dims); + Tensor<DataType, 2, DataLayout, IndexType> t_result_gpu(res_dims); + Tensor<DataType, 2, DataLayout, IndexType> t_result(res_dims); t_left.data()[0] = 1.0f; t_left.data()[1] = 2.0f; @@ -111,18 +113,18 @@ void test_TF(const Device& sycl_device) t_right.data()[1] = 0.5f; t_right.data()[2] = 2.0f; - std::size_t t_left_bytes = t_left.size() * sizeof(float); - std::size_t t_right_bytes = t_right.size() * sizeof(float); - std::size_t t_result_bytes = t_result.size()*sizeof(float); + std::size_t t_left_bytes = t_left.size() * sizeof(DataType); + std::size_t t_right_bytes = t_right.size() * sizeof(DataType); + std::size_t t_result_bytes = t_result.size()*sizeof(DataType); - float * d_t_left = static_cast<float*>(sycl_device.allocate(t_left_bytes)); - float * d_t_right = static_cast<float*>(sycl_device.allocate(t_right_bytes)); - float * d_t_result = static_cast<float*>(sycl_device.allocate(t_result_bytes)); + DataType * d_t_left = static_cast<DataType*>(sycl_device.allocate(t_left_bytes)); + DataType * d_t_right = static_cast<DataType*>(sycl_device.allocate(t_right_bytes)); + DataType * d_t_result = static_cast<DataType*>(sycl_device.allocate(t_result_bytes)); - Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout, long> > gpu_t_left(d_t_left, left_dims); - Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout, long> > gpu_t_right(d_t_right, right_dims); - Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout, long> > gpu_t_result(d_t_result, res_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_t_left(d_t_left, left_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_t_right(d_t_right, right_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_t_result(d_t_result, res_dims); sycl_device.memcpyHostToDevice(d_t_left, t_left.data(),t_left_bytes); sycl_device.memcpyHostToDevice(d_t_right, t_right.data(),t_right_bytes); @@ -132,14 +134,14 @@ void test_TF(const Device& sycl_device) t_result = t_left.contract(t_right, dims); - for (DenseIndex i = 0; i < t_result.size(); i++) { - if (static_cast<float>(fabs(t_result(i) - t_result_gpu(i))) < error_threshold) { + for (IndexType i = 0; i < t_result.size(); i++) { + if (static_cast<DataType>(fabs(t_result(i) - t_result_gpu(i))) < error_threshold) { continue; } if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), error_threshold)) { continue; } - std::cout << "mismatch detected at index " << i << ": " << t_result(i) + std::cout << "mismatch detected at IndexType " << i << ": " << t_result(i) << " vs " << t_result_gpu(i) << std::endl; assert(false); } @@ -150,35 +152,37 @@ void test_TF(const Device& sycl_device) } -template<int DataLayout, typename Device> -void test_scalar(const Device& sycl_device, int m_size, int k_size, int n_size) +template<int DataLayout, typename DataType, typename IndexType, typename Device> +void test_scalar(const Device& sycl_device, IndexType m_size, IndexType k_size, IndexType n_size) { //std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl; // with these dimensions, the output has 300 * 140 elements, which is // more than 30 * 1024, which is the number of threads in blocks on // a 15 SM GK110 GPU - Tensor<float, 2, DataLayout> t_left(m_size, k_size); - Tensor<float, 2, DataLayout> t_right(k_size, n_size); - Tensor<float, 0, DataLayout> t_result; - Tensor<float, 0, DataLayout> t_result_gpu; + typedef typename Tensor<DataType, 1, DataLayout, IndexType>::DimensionPair DimPair; + static const DataType error_threshold =1e-4f; + Tensor<DataType, 2, DataLayout, IndexType> t_left(m_size, k_size); + Tensor<DataType, 2, DataLayout, IndexType> t_right(k_size, n_size); + Tensor<DataType, 0, DataLayout, IndexType> t_result; + Tensor<DataType, 0, DataLayout, IndexType> t_result_gpu; Eigen::array<DimPair, 2> dims = {{DimPair(0, 0), DimPair(1, 1)}}; - Eigen::array<int, 2> left_dims = {{m_size, k_size}}; - Eigen::array<int, 2> right_dims = {{k_size, n_size}}; + Eigen::array<IndexType, 2> left_dims = {{m_size, k_size}}; + Eigen::array<IndexType, 2> right_dims = {{k_size, n_size}}; t_left.setRandom(); t_right.setRandom(); - std::size_t t_left_bytes = t_left.size() * sizeof(float); - std::size_t t_right_bytes = t_right.size() * sizeof(float); - std::size_t t_result_bytes = sizeof(float); + std::size_t t_left_bytes = t_left.size() * sizeof(DataType); + std::size_t t_right_bytes = t_right.size() * sizeof(DataType); + std::size_t t_result_bytes = sizeof(DataType); - float * d_t_left = static_cast<float*>(sycl_device.allocate(t_left_bytes)); - float * d_t_right = static_cast<float*>(sycl_device.allocate(t_right_bytes)); - float * d_t_result = static_cast<float*>(sycl_device.allocate(t_result_bytes)); + DataType * d_t_left = static_cast<DataType*>(sycl_device.allocate(t_left_bytes)); + DataType * d_t_right = static_cast<DataType*>(sycl_device.allocate(t_right_bytes)); + DataType * d_t_result = static_cast<DataType*>(sycl_device.allocate(t_result_bytes)); - Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > gpu_t_left(d_t_left, left_dims); - Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > gpu_t_right(d_t_right, right_dims); - Eigen::TensorMap<Eigen::Tensor<float, 0, DataLayout> > gpu_t_result(d_t_result); + Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_t_left(d_t_left, left_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_t_right(d_t_right, right_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 0, DataLayout, IndexType> > gpu_t_result(d_t_result); sycl_device.memcpyHostToDevice(d_t_left, t_left.data(),t_left_bytes); sycl_device.memcpyHostToDevice(d_t_right, t_right.data(),t_right_bytes); @@ -188,7 +192,7 @@ void test_scalar(const Device& sycl_device, int m_size, int k_size, int n_size) t_result = t_left.contract(t_right, dims); - if (static_cast<float>(fabs(t_result() - t_result_gpu())) > error_threshold && + if (static_cast<DataType>(fabs(t_result() - t_result_gpu())) > error_threshold && !Eigen::internal::isApprox(t_result(), t_result_gpu(), error_threshold)) { std::cout << "mismatch detected: " << t_result() << " vs " << t_result_gpu() << std::endl; @@ -201,47 +205,47 @@ void test_scalar(const Device& sycl_device, int m_size, int k_size, int n_size) } -template<int DataLayout, typename Device> +template<int DataLayout, typename DataType, typename IndexType, typename Device> void test_sycl_contraction_m(const Device& sycl_device) { - for (int k = 32; k < 256; k++) { - test_sycl_contraction<DataLayout>(sycl_device, k, 128, 128); + for (IndexType k = 32; k < 256; k++) { + test_sycl_contraction<DataLayout, DataType, IndexType>(sycl_device, k, 128, 128); } } -template<int DataLayout, typename Device> +template<int DataLayout, typename DataType, typename IndexType, typename Device> void test_sycl_contraction_k(const Device& sycl_device) { - for (int k = 32; k < 256; k++) { - test_sycl_contraction<DataLayout>(sycl_device, 128, k, 128); + for (IndexType k = 32; k < 256; k++) { + test_sycl_contraction<DataLayout, DataType, IndexType>(sycl_device, 128, k, 128); } } -template<int DataLayout, typename Device> +template<int DataLayout, typename DataType, typename IndexType, typename Device> void test_sycl_contraction_n(const Device& sycl_device) { - for (int k = 32; k < 256; k++) { - test_sycl_contraction<DataLayout>(sycl_device, 128, 128, k); + for (IndexType k = 32; k < 256; k++) { + test_sycl_contraction<DataLayout, DataType, IndexType>(sycl_device, 128, 128, k); } } -template<int DataLayout, typename Device> +template<int DataLayout, typename DataType, typename IndexType, typename Device> void test_sycl_contraction_sizes(const Device& sycl_device) { - int m_sizes[] = { 31, 39, 63, 64, 65, + IndexType m_sizes[] = { 31, 39, 63, 64, 65, 127, 129, 255, 257 , 511, 512, 513, 1023, 1024, 1025}; - int n_sizes[] = { 31, 39, 63, 64, 65, + IndexType n_sizes[] = { 31, 39, 63, 64, 65, 127, 129, 255, 257, 511, 512, 513, 1023, 1024, 1025}; - int k_sizes[] = { 31, 39, 63, 64, 65, + IndexType k_sizes[] = { 31, 39, 63, 64, 65, 95, 96, 127, 129, 255, 257, 511, 512, 513, 1023, 1024, 1025}; - for (int i = 0; i < 15; i++) { - for (int j = 0; j < 15; j++) { - for (int k = 0; k < 17; k++) { - test_sycl_contraction<DataLayout>(sycl_device, m_sizes[i], n_sizes[j], k_sizes[k]); + for (IndexType i = 0; i < 15; i++) { + for (IndexType j = 0; j < 15; j++) { + for (IndexType k = 0; k < 17; k++) { + test_sycl_contraction<DataLayout, DataType,IndexType>(sycl_device, m_sizes[i], n_sizes[j], k_sizes[k]); } } } @@ -250,26 +254,26 @@ void test_sycl_contraction_sizes(const Device& sycl_device) { template <typename Dev_selector> void tensorContractionPerDevice(Dev_selector& s){ QueueInterface queueInterface(s); auto sycl_device=Eigen::SyclDevice(&queueInterface); - test_sycl_contraction<ColMajor>(sycl_device, 32, 32, 32); - test_sycl_contraction<RowMajor>(sycl_device, 32, 32, 32); - test_scalar<ColMajor>(sycl_device, 32, 32, 32); - test_scalar<RowMajor>(sycl_device, 32, 32, 32); + test_sycl_contraction<ColMajor, float,ptrdiff_t>(sycl_device, 32, 32, 32); + test_sycl_contraction<RowMajor,float,ptrdiff_t>(sycl_device, 32, 32, 32); + test_scalar<ColMajor,float,ptrdiff_t>(sycl_device, 32, 32, 32); + test_scalar<RowMajor,float,ptrdiff_t>(sycl_device, 32, 32, 32); std::chrono::time_point<std::chrono::system_clock> start, end; start = std::chrono::system_clock::now(); - test_sycl_contraction<ColMajor>(sycl_device, 128, 128, 128); - test_sycl_contraction<RowMajor>(sycl_device, 128, 128, 128); - test_scalar<ColMajor>(sycl_device, 128, 128, 128); - test_scalar<RowMajor>(sycl_device, 128, 128, 128); - test_sycl_contraction_m<ColMajor>(sycl_device); - test_sycl_contraction_m<RowMajor>(sycl_device); - test_sycl_contraction_n<ColMajor>(sycl_device); - test_sycl_contraction_n<RowMajor>(sycl_device); - test_sycl_contraction_k<ColMajor>(sycl_device); - test_sycl_contraction_k<RowMajor>(sycl_device); - test_sycl_contraction_sizes<ColMajor>(sycl_device); - test_sycl_contraction_sizes<RowMajor>(sycl_device); - test_TF<RowMajor>(sycl_device); - test_TF<ColMajor>(sycl_device); + test_sycl_contraction<ColMajor,float,ptrdiff_t>(sycl_device, 128, 128, 128); + test_sycl_contraction<RowMajor,float,ptrdiff_t>(sycl_device, 128, 128, 128); + test_scalar<ColMajor,float,ptrdiff_t>(sycl_device, 128, 128, 128); + test_scalar<RowMajor,float,ptrdiff_t>(sycl_device, 128, 128, 128); + test_sycl_contraction_m<ColMajor, float, ptrdiff_t>(sycl_device); + test_sycl_contraction_m<RowMajor, float, ptrdiff_t>(sycl_device); + test_sycl_contraction_n<ColMajor, float, ptrdiff_t>(sycl_device); + test_sycl_contraction_n<RowMajor, float, ptrdiff_t>(sycl_device); + test_sycl_contraction_k<ColMajor, float, ptrdiff_t>(sycl_device); + test_sycl_contraction_k<RowMajor, float, ptrdiff_t>(sycl_device); + test_sycl_contraction_sizes<ColMajor, float, ptrdiff_t>(sycl_device); + test_sycl_contraction_sizes<RowMajor, float, ptrdiff_t>(sycl_device); + test_TF<RowMajor, float, ptrdiff_t>(sycl_device); + test_TF<ColMajor, float, ptrdiff_t>(sycl_device); end = std::chrono::system_clock::now(); std::chrono::duration<double> elapsed_seconds = end-start; diff --git a/unsupported/test/cxx11_tensor_convolution_sycl.cpp b/unsupported/test/cxx11_tensor_convolution_sycl.cpp index f7e0a2742..a4226a63a 100644 --- a/unsupported/test/cxx11_tensor_convolution_sycl.cpp +++ b/unsupported/test/cxx11_tensor_convolution_sycl.cpp @@ -14,7 +14,7 @@ #define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_COMPLEX #define EIGEN_TEST_FUNC cxx11_tensor_convolution_sycl -#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t #define EIGEN_USE_SYCL #include <iostream> @@ -35,12 +35,12 @@ static const float error_threshold =1e-4f; template <typename DataType, int DataLayout, typename IndexType> static void test_larg_expr1D(const Eigen::SyclDevice& sycl_device) { - int indim0 =53; - int indim1= 55; - int indim2= 51; - int outdim0=50; - int outdim1=55; - int outdim2=51; + IndexType indim0 =53; + IndexType indim1= 55; + IndexType indim2= 51; + IndexType outdim0=50; + IndexType outdim1=55; + IndexType outdim2=51; Eigen::array<IndexType, 3> input_dims = {{indim0, indim1, indim2}}; Eigen::array<IndexType, 1> kernel_dims = {{4}}; Eigen::array<IndexType, 3> result_dims = {{outdim0, outdim1, outdim2}}; @@ -76,9 +76,9 @@ static void test_larg_expr1D(const Eigen::SyclDevice& sycl_device) result_host=input.convolve(kernel, dims3); -for(int i=0; i< outdim0; i++ ){ - for(int j=0; j< outdim1; j++ ){ - for(int k=0; k< outdim2; k++ ){ +for(IndexType i=0; i< outdim0; i++ ){ + for(IndexType j=0; j< outdim1; j++ ){ + for(IndexType k=0; k< outdim2; k++ ){ if (!(Eigen::internal::isApprox(result(i,j,k), result_host(i,j,k), error_threshold))) { std::cout <<std::setprecision(16)<< "mismatch detected at index ( "<< i << " , " << j << ", " << k << " ) " << " \t " << result(i,j,k) << " vs "<< result_host(i,j,k) << std::endl; assert(false); @@ -96,12 +96,12 @@ for(int i=0; i< outdim0; i++ ){ template <typename DataType, int DataLayout, typename IndexType> static void test_larg_expr2D(const Eigen::SyclDevice& sycl_device) { - int indim0 =53; - int indim1= 55; - int indim2= 51; - int outdim0=50; - int outdim1=51; - int outdim2=51; + IndexType indim0 =53; + IndexType indim1= 55; + IndexType indim2= 51; + IndexType outdim0=50; + IndexType outdim1=51; + IndexType outdim2=51; Eigen::array<IndexType, 3> input_dims = {{indim0, indim1, indim2}}; Eigen::array<IndexType, 2> kernel_dims = {{4,5}}; Eigen::array<IndexType, 3> result_dims = {{outdim0, outdim1, outdim2}}; @@ -137,9 +137,9 @@ static void test_larg_expr2D(const Eigen::SyclDevice& sycl_device) result_host=input.convolve(kernel, dims3); -for(int i=0; i< outdim0; i++ ){ - for(int j=0; j< outdim1; j++ ){ - for(int k=0; k< outdim2; k++ ){ +for(IndexType i=0; i< outdim0; i++ ){ + for(IndexType j=0; j< outdim1; j++ ){ + for(IndexType k=0; k< outdim2; k++ ){ if (!(Eigen::internal::isApprox(result(i,j,k), result_host(i,j,k), error_threshold))) { std::cout <<std::setprecision(16)<< "mismatch detected at index ( "<< i << " , " << j << ", " << k << " ) " << " \t " << result(i,j,k) << " vs "<< result_host(i,j,k) << std::endl; assert(false); @@ -157,12 +157,12 @@ for(int i=0; i< outdim0; i++ ){ template <typename DataType, int DataLayout, typename IndexType> static void test_larg_expr3D(const Eigen::SyclDevice& sycl_device) { - int indim0 =53; - int indim1= 55; - int indim2= 51; - int outdim0=50; - int outdim1=51; - int outdim2=49; + IndexType indim0 =53; + IndexType indim1= 55; + IndexType indim2= 51; + IndexType outdim0=50; + IndexType outdim1=51; + IndexType outdim2=49; Eigen::array<IndexType, 3> input_dims = {{indim0, indim1, indim2}}; Eigen::array<IndexType, 3> kernel_dims = {{4,5,3}}; Eigen::array<IndexType, 3> result_dims = {{outdim0, outdim1, outdim2}}; @@ -198,9 +198,9 @@ static void test_larg_expr3D(const Eigen::SyclDevice& sycl_device) result_host=input.convolve(kernel, dims3); -for(int i=0; i< outdim0; i++ ){ - for(int j=0; j< outdim1; j++ ){ - for(int k=0; k< outdim2; k++ ){ +for(IndexType i=0; i< outdim0; i++ ){ + for(IndexType j=0; j< outdim1; j++ ){ + for(IndexType k=0; k< outdim2; k++ ){ if (!(Eigen::internal::isApprox(result(i,j,k), result_host(i,j,k), error_threshold))) { std::cout <<std::setprecision(16)<< "mismatch detected at index ( "<< i << " , " << j << ", " << k << " ) " << " \t " << result(i,j,k) << " vs "<< result_host(i,j,k) << std::endl; assert(false); @@ -446,20 +446,20 @@ static void test_strides(const Eigen::SyclDevice& sycl_device){ template <typename Dev_selector> void tensorConvolutionPerDevice(Dev_selector& s){ QueueInterface queueInterface(s); auto sycl_device=Eigen::SyclDevice(&queueInterface); - test_larg_expr1D<float, RowMajor, ptrdiff_t>(sycl_device); - test_larg_expr1D<float, ColMajor, ptrdiff_t>(sycl_device); - test_larg_expr2D<float, RowMajor, ptrdiff_t>(sycl_device); - test_larg_expr2D<float, ColMajor, ptrdiff_t>(sycl_device); - test_larg_expr3D<float, RowMajor, ptrdiff_t>(sycl_device); - test_larg_expr3D<float, ColMajor, ptrdiff_t>(sycl_device); - test_evals<float, ColMajor, ptrdiff_t>(sycl_device); - test_evals<float, RowMajor, ptrdiff_t>(sycl_device); - test_expr<float, ColMajor, ptrdiff_t>(sycl_device); - test_expr<float, RowMajor, ptrdiff_t>(sycl_device); - test_modes<float, ColMajor, ptrdiff_t>(sycl_device); - test_modes<float, RowMajor, ptrdiff_t>(sycl_device); - test_strides<float, ColMajor, ptrdiff_t>(sycl_device); - test_strides<float, RowMajor, ptrdiff_t>(sycl_device); + test_larg_expr1D<float, RowMajor, int64_t>(sycl_device); + test_larg_expr1D<float, ColMajor, int64_t>(sycl_device); + test_larg_expr2D<float, RowMajor, int64_t>(sycl_device); + test_larg_expr2D<float, ColMajor, int64_t>(sycl_device); + test_larg_expr3D<float, RowMajor, int64_t>(sycl_device); + test_larg_expr3D<float, ColMajor, int64_t>(sycl_device); + test_evals<float, ColMajor, int64_t>(sycl_device); + test_evals<float, RowMajor, int64_t>(sycl_device); + test_expr<float, ColMajor, int64_t>(sycl_device); + test_expr<float, RowMajor, int64_t>(sycl_device); + test_modes<float, ColMajor, int64_t>(sycl_device); + test_modes<float, RowMajor, int64_t>(sycl_device); + test_strides<float, ColMajor, int64_t>(sycl_device); + test_strides<float, RowMajor, int64_t>(sycl_device); } void test_cxx11_tensor_convolution_sycl() { diff --git a/unsupported/test/cxx11_tensor_device_sycl.cpp b/unsupported/test/cxx11_tensor_device_sycl.cpp index 190dba862..3ecc68df0 100644 --- a/unsupported/test/cxx11_tensor_device_sycl.cpp +++ b/unsupported/test/cxx11_tensor_device_sycl.cpp @@ -14,7 +14,7 @@ #define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_COMPLEX #define EIGEN_TEST_FUNC cxx11_tensor_device_sycl -#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t #define EIGEN_USE_SYCL #include "main.h" @@ -22,35 +22,35 @@ #include <stdint.h> #include <iostream> -template <typename DataType, int DataLayout> +template <typename DataType, int DataLayout, typename IndexType> void test_device_memory(const Eigen::SyclDevice &sycl_device) { std::cout << "Running on : " << sycl_device.sycl_queue().get_device(). template get_info<cl::sycl::info::device::name>() <<std::endl; - int sizeDim1 = 100; - array<int, 1> tensorRange = {{sizeDim1}}; - Tensor<DataType, 1, DataLayout> in(tensorRange); - Tensor<DataType, 1, DataLayout> in1(tensorRange); + IndexType sizeDim1 = 100; + array<IndexType, 1> tensorRange = {{sizeDim1}}; + Tensor<DataType, 1, DataLayout,IndexType> in(tensorRange); + Tensor<DataType, 1, DataLayout,IndexType> in1(tensorRange); memset(in1.data(), 1, in1.size() * sizeof(DataType)); DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(in.size()*sizeof(DataType))); sycl_device.memset(gpu_in_data, 1, in.size()*sizeof(DataType)); sycl_device.memcpyDeviceToHost(in.data(), gpu_in_data, in.size()*sizeof(DataType)); - for (int i=0; i<in.size(); i++) { + for (IndexType i=0; i<in.size(); i++) { VERIFY_IS_EQUAL(in(i), in1(i)); } sycl_device.deallocate(gpu_in_data); } -template <typename DataType, int DataLayout> +template <typename DataType, int DataLayout, typename IndexType> void test_device_exceptions(const Eigen::SyclDevice &sycl_device) { VERIFY(sycl_device.ok()); - int sizeDim1 = 100; - array<int, 1> tensorDims = {{sizeDim1}}; + IndexType sizeDim1 = 100; + array<IndexType, 1> tensorDims = {{sizeDim1}}; DataType* gpu_data = static_cast<DataType*>(sycl_device.allocate(sizeDim1*sizeof(DataType))); sycl_device.memset(gpu_data, 1, sizeDim1*sizeof(DataType)); - TensorMap<Tensor<DataType, 1, DataLayout>> in(gpu_data, tensorDims); - TensorMap<Tensor<DataType, 1, DataLayout>> out(gpu_data, tensorDims); + TensorMap<Tensor<DataType, 1, DataLayout,IndexType>> in(gpu_data, tensorDims); + TensorMap<Tensor<DataType, 1, DataLayout,IndexType>> out(gpu_data, tensorDims); out.device(sycl_device) = in / in.constant(0); sycl_device.synchronize(); @@ -62,8 +62,8 @@ template<typename DataType> void sycl_device_test_per_device(const cl::sycl::dev std::cout << "Running on " << d.template get_info<cl::sycl::info::device::name>() << std::endl; QueueInterface queueInterface(d); auto sycl_device = Eigen::SyclDevice(&queueInterface); - test_device_memory<DataType, RowMajor>(sycl_device); - test_device_memory<DataType, ColMajor>(sycl_device); + test_device_memory<DataType, RowMajor, int64_t>(sycl_device); + test_device_memory<DataType, ColMajor, int64_t>(sycl_device); /// this test throw an exception. enable it if you want to see the exception //test_device_exceptions<DataType, RowMajor>(sycl_device); /// this test throw an exception. enable it if you want to see the exception diff --git a/unsupported/test/cxx11_tensor_forced_eval_sycl.cpp b/unsupported/test/cxx11_tensor_forced_eval_sycl.cpp index 4d19a3b2a..aca036cde 100644 --- a/unsupported/test/cxx11_tensor_forced_eval_sycl.cpp +++ b/unsupported/test/cxx11_tensor_forced_eval_sycl.cpp @@ -14,23 +14,23 @@ #define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_COMPLEX #define EIGEN_TEST_FUNC cxx11_tensor_forced_eval_sycl -#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t #define EIGEN_USE_SYCL #include "main.h" #include <unsupported/Eigen/CXX11/Tensor> using Eigen::Tensor; -template <typename DataType, int DataLayout> +template <typename DataType, int DataLayout, typename IndexType> void test_forced_eval_sycl(const Eigen::SyclDevice &sycl_device) { - int sizeDim1 = 100; - int sizeDim2 = 20; - int sizeDim3 = 20; - Eigen::array<int, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}}; - Eigen::Tensor<DataType, 3, DataLayout> in1(tensorRange); - Eigen::Tensor<DataType, 3, DataLayout> in2(tensorRange); - Eigen::Tensor<DataType, 3, DataLayout> out(tensorRange); + IndexType sizeDim1 = 100; + IndexType sizeDim2 = 20; + IndexType sizeDim3 = 20; + Eigen::array<IndexType, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}}; + Eigen::Tensor<DataType, 3, DataLayout, IndexType> in1(tensorRange); + Eigen::Tensor<DataType, 3, DataLayout, IndexType> in2(tensorRange); + Eigen::Tensor<DataType, 3, DataLayout, IndexType> out(tensorRange); DataType * gpu_in1_data = static_cast<DataType*>(sycl_device.allocate(in1.dimensions().TotalSize()*sizeof(DataType))); DataType * gpu_in2_data = static_cast<DataType*>(sycl_device.allocate(in2.dimensions().TotalSize()*sizeof(DataType))); @@ -40,17 +40,17 @@ void test_forced_eval_sycl(const Eigen::SyclDevice &sycl_device) { in2 = in2.random() + in2.constant(10.0f); // creating TensorMap from tensor - Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout>> gpu_in1(gpu_in1_data, tensorRange); - Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout>> gpu_in2(gpu_in2_data, tensorRange); - Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout>> gpu_out(gpu_out_data, tensorRange); + Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_in1(gpu_in1_data, tensorRange); + Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_in2(gpu_in2_data, tensorRange); + Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_out(gpu_out_data, tensorRange); sycl_device.memcpyHostToDevice(gpu_in1_data, in1.data(),(in1.dimensions().TotalSize())*sizeof(DataType)); sycl_device.memcpyHostToDevice(gpu_in2_data, in2.data(),(in1.dimensions().TotalSize())*sizeof(DataType)); /// c=(a+b)*b gpu_out.device(sycl_device) =(gpu_in1 + gpu_in2).eval() * gpu_in2; sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(DataType)); - for (int i = 0; i < sizeDim1; ++i) { - for (int j = 0; j < sizeDim2; ++j) { - for (int k = 0; k < sizeDim3; ++k) { + for (IndexType i = 0; i < sizeDim1; ++i) { + for (IndexType j = 0; j < sizeDim2; ++j) { + for (IndexType k = 0; k < sizeDim3; ++k) { VERIFY_IS_APPROX(out(i, j, k), (in1(i, j, k) + in2(i, j, k)) * in2(i, j, k)); } @@ -66,8 +66,8 @@ void test_forced_eval_sycl(const Eigen::SyclDevice &sycl_device) { template <typename DataType, typename Dev_selector> void tensorForced_evalperDevice(Dev_selector s){ QueueInterface queueInterface(s); auto sycl_device = Eigen::SyclDevice(&queueInterface); - test_forced_eval_sycl<DataType, RowMajor>(sycl_device); - test_forced_eval_sycl<DataType, ColMajor>(sycl_device); + test_forced_eval_sycl<DataType, RowMajor, int64_t>(sycl_device); + test_forced_eval_sycl<DataType, ColMajor, int64_t>(sycl_device); } void test_cxx11_tensor_forced_eval_sycl() { for (const auto& device :Eigen::get_sycl_supported_devices()) { diff --git a/unsupported/test/cxx11_tensor_morphing_sycl.cpp b/unsupported/test/cxx11_tensor_morphing_sycl.cpp index 91353b81a..9b521bc6b 100644 --- a/unsupported/test/cxx11_tensor_morphing_sycl.cpp +++ b/unsupported/test/cxx11_tensor_morphing_sycl.cpp @@ -16,7 +16,7 @@ #define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_COMPLEX #define EIGEN_TEST_FUNC cxx11_tensor_morphing_sycl -#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t #define EIGEN_USE_SYCL @@ -28,18 +28,18 @@ using Eigen::SyclDevice; using Eigen::Tensor; using Eigen::TensorMap; -template <typename DataType, int DataLayout> +template <typename DataType, int DataLayout, typename IndexType> static void test_simple_reshape(const Eigen::SyclDevice& sycl_device) { - typename Tensor<DataType, 5 ,DataLayout>::Dimensions dim1(2,3,1,7,1); - typename Tensor<DataType, 3 ,DataLayout>::Dimensions dim2(2,3,7); - typename Tensor<DataType, 2 ,DataLayout>::Dimensions dim3(6,7); - typename Tensor<DataType, 2 ,DataLayout>::Dimensions dim4(2,21); + typename Tensor<DataType, 5 ,DataLayout, IndexType>::Dimensions dim1(2,3,1,7,1); + typename Tensor<DataType, 3 ,DataLayout, IndexType>::Dimensions dim2(2,3,7); + typename Tensor<DataType, 2 ,DataLayout, IndexType>::Dimensions dim3(6,7); + typename Tensor<DataType, 2 ,DataLayout, IndexType>::Dimensions dim4(2,21); - Tensor<DataType, 5, DataLayout> tensor1(dim1); - Tensor<DataType, 3, DataLayout> tensor2(dim2); - Tensor<DataType, 2, DataLayout> tensor3(dim3); - Tensor<DataType, 2, DataLayout> tensor4(dim4); + Tensor<DataType, 5, DataLayout, IndexType> tensor1(dim1); + Tensor<DataType, 3, DataLayout, IndexType> tensor2(dim2); + Tensor<DataType, 2, DataLayout, IndexType> tensor3(dim3); + Tensor<DataType, 2, DataLayout, IndexType> tensor4(dim4); tensor1.setRandom(); @@ -48,10 +48,10 @@ static void test_simple_reshape(const Eigen::SyclDevice& sycl_device) DataType* gpu_data3 = static_cast<DataType*>(sycl_device.allocate(tensor3.size()*sizeof(DataType))); DataType* gpu_data4 = static_cast<DataType*>(sycl_device.allocate(tensor4.size()*sizeof(DataType))); - TensorMap<Tensor<DataType, 5,DataLayout>> gpu1(gpu_data1, dim1); - TensorMap<Tensor<DataType, 3,DataLayout>> gpu2(gpu_data2, dim2); - TensorMap<Tensor<DataType, 2,DataLayout>> gpu3(gpu_data3, dim3); - TensorMap<Tensor<DataType, 2,DataLayout>> gpu4(gpu_data4, dim4); + TensorMap<Tensor<DataType, 5,DataLayout, IndexType>> gpu1(gpu_data1, dim1); + TensorMap<Tensor<DataType, 3,DataLayout, IndexType>> gpu2(gpu_data2, dim2); + TensorMap<Tensor<DataType, 2,DataLayout, IndexType>> gpu3(gpu_data3, dim3); + TensorMap<Tensor<DataType, 2,DataLayout, IndexType>> gpu4(gpu_data4, dim4); sycl_device.memcpyHostToDevice(gpu_data1, tensor1.data(),(tensor1.size())*sizeof(DataType)); @@ -63,9 +63,9 @@ static void test_simple_reshape(const Eigen::SyclDevice& sycl_device) gpu4.device(sycl_device)=gpu1.reshape(dim2).reshape(dim4); sycl_device.memcpyDeviceToHost(tensor4.data(), gpu_data4,(tensor4.size())*sizeof(DataType)); - for (int i = 0; i < 2; ++i){ - for (int j = 0; j < 3; ++j){ - for (int k = 0; k < 7; ++k){ + for (IndexType i = 0; i < 2; ++i){ + for (IndexType j = 0; j < 3; ++j){ + for (IndexType k = 0; k < 7; ++k){ VERIFY_IS_EQUAL(tensor1(i,j,0,k,0), tensor2(i,j,k)); ///ColMajor if (static_cast<int>(DataLayout) == static_cast<int>(ColMajor)) { VERIFY_IS_EQUAL(tensor1(i,j,0,k,0), tensor3(i+2*j,k)); ///ColMajor @@ -86,15 +86,15 @@ static void test_simple_reshape(const Eigen::SyclDevice& sycl_device) } -template<typename DataType, int DataLayout> +template<typename DataType, int DataLayout, typename IndexType> static void test_reshape_as_lvalue(const Eigen::SyclDevice& sycl_device) { - typename Tensor<DataType, 3, DataLayout>::Dimensions dim1(2,3,7); - typename Tensor<DataType, 2, DataLayout>::Dimensions dim2(6,7); - typename Tensor<DataType, 5, DataLayout>::Dimensions dim3(2,3,1,7,1); - Tensor<DataType, 3, DataLayout> tensor(dim1); - Tensor<DataType, 2, DataLayout> tensor2d(dim2); - Tensor<DataType, 5, DataLayout> tensor5d(dim3); + typename Tensor<DataType, 3, DataLayout, IndexType>::Dimensions dim1(2,3,7); + typename Tensor<DataType, 2, DataLayout, IndexType>::Dimensions dim2(6,7); + typename Tensor<DataType, 5, DataLayout, IndexType>::Dimensions dim3(2,3,1,7,1); + Tensor<DataType, 3, DataLayout, IndexType> tensor(dim1); + Tensor<DataType, 2, DataLayout, IndexType> tensor2d(dim2); + Tensor<DataType, 5, DataLayout, IndexType> tensor5d(dim3); tensor.setRandom(); @@ -102,9 +102,9 @@ static void test_reshape_as_lvalue(const Eigen::SyclDevice& sycl_device) DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(tensor2d.size()*sizeof(DataType))); DataType* gpu_data3 = static_cast<DataType*>(sycl_device.allocate(tensor5d.size()*sizeof(DataType))); - TensorMap< Tensor<DataType, 3, DataLayout> > gpu1(gpu_data1, dim1); - TensorMap< Tensor<DataType, 2, DataLayout> > gpu2(gpu_data2, dim2); - TensorMap< Tensor<DataType, 5, DataLayout> > gpu3(gpu_data3, dim3); + TensorMap< Tensor<DataType, 3, DataLayout, IndexType> > gpu1(gpu_data1, dim1); + TensorMap< Tensor<DataType, 2, DataLayout, IndexType> > gpu2(gpu_data2, dim2); + TensorMap< Tensor<DataType, 5, DataLayout, IndexType> > gpu3(gpu_data3, dim3); sycl_device.memcpyHostToDevice(gpu_data1, tensor.data(),(tensor.size())*sizeof(DataType)); @@ -115,9 +115,9 @@ static void test_reshape_as_lvalue(const Eigen::SyclDevice& sycl_device) sycl_device.memcpyDeviceToHost(tensor5d.data(), gpu_data3,(tensor5d.size())*sizeof(DataType)); - for (int i = 0; i < 2; ++i){ - for (int j = 0; j < 3; ++j){ - for (int k = 0; k < 7; ++k){ + for (IndexType i = 0; i < 2; ++i){ + for (IndexType j = 0; j < 3; ++j){ + for (IndexType k = 0; k < 7; ++k){ VERIFY_IS_EQUAL(tensor5d(i,j,0,k,0), tensor(i,j,k)); if (static_cast<int>(DataLayout) == static_cast<int>(ColMajor)) { VERIFY_IS_EQUAL(tensor2d(i+2*j,k), tensor(i,j,k)); ///ColMajor @@ -134,43 +134,43 @@ static void test_reshape_as_lvalue(const Eigen::SyclDevice& sycl_device) } -template <typename DataType, int DataLayout> +template <typename DataType, int DataLayout, typename IndexType> static void test_simple_slice(const Eigen::SyclDevice &sycl_device) { - int sizeDim1 = 2; - int sizeDim2 = 3; - int sizeDim3 = 5; - int sizeDim4 = 7; - int sizeDim5 = 11; - array<int, 5> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4, sizeDim5}}; - Tensor<DataType, 5,DataLayout> tensor(tensorRange); + IndexType sizeDim1 = 2; + IndexType sizeDim2 = 3; + IndexType sizeDim3 = 5; + IndexType sizeDim4 = 7; + IndexType sizeDim5 = 11; + array<IndexType, 5> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4, sizeDim5}}; + Tensor<DataType, 5,DataLayout, IndexType> tensor(tensorRange); tensor.setRandom(); - array<int, 5> slice1_range ={{1, 1, 1, 1, 1}}; - Tensor<DataType, 5,DataLayout> slice1(slice1_range); + array<IndexType, 5> slice1_range ={{1, 1, 1, 1, 1}}; + Tensor<DataType, 5,DataLayout, IndexType> slice1(slice1_range); DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(tensor.size()*sizeof(DataType))); DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(slice1.size()*sizeof(DataType))); - TensorMap<Tensor<DataType, 5,DataLayout>> gpu1(gpu_data1, tensorRange); - TensorMap<Tensor<DataType, 5,DataLayout>> gpu2(gpu_data2, slice1_range); - Eigen::DSizes<ptrdiff_t, 5> indices(1,2,3,4,5); - Eigen::DSizes<ptrdiff_t, 5> sizes(1,1,1,1,1); + TensorMap<Tensor<DataType, 5,DataLayout, IndexType>> gpu1(gpu_data1, tensorRange); + TensorMap<Tensor<DataType, 5,DataLayout, IndexType>> gpu2(gpu_data2, slice1_range); + Eigen::DSizes<IndexType, 5> indices(1,2,3,4,5); + Eigen::DSizes<IndexType, 5> sizes(1,1,1,1,1); sycl_device.memcpyHostToDevice(gpu_data1, tensor.data(),(tensor.size())*sizeof(DataType)); gpu2.device(sycl_device)=gpu1.slice(indices, sizes); sycl_device.memcpyDeviceToHost(slice1.data(), gpu_data2,(slice1.size())*sizeof(DataType)); VERIFY_IS_EQUAL(slice1(0,0,0,0,0), tensor(1,2,3,4,5)); - array<int, 5> slice2_range ={{1,1,2,2,3}}; - Tensor<DataType, 5,DataLayout> slice2(slice2_range); + array<IndexType, 5> slice2_range ={{1,1,2,2,3}}; + Tensor<DataType, 5,DataLayout, IndexType> slice2(slice2_range); DataType* gpu_data3 = static_cast<DataType*>(sycl_device.allocate(slice2.size()*sizeof(DataType))); - TensorMap<Tensor<DataType, 5,DataLayout>> gpu3(gpu_data3, slice2_range); - Eigen::DSizes<ptrdiff_t, 5> indices2(1,1,3,4,5); - Eigen::DSizes<ptrdiff_t, 5> sizes2(1,1,2,2,3); + TensorMap<Tensor<DataType, 5,DataLayout, IndexType>> gpu3(gpu_data3, slice2_range); + Eigen::DSizes<IndexType, 5> indices2(1,1,3,4,5); + Eigen::DSizes<IndexType, 5> sizes2(1,1,2,2,3); gpu3.device(sycl_device)=gpu1.slice(indices2, sizes2); sycl_device.memcpyDeviceToHost(slice2.data(), gpu_data3,(slice2.size())*sizeof(DataType)); - for (int i = 0; i < 2; ++i) { - for (int j = 0; j < 2; ++j) { - for (int k = 0; k < 3; ++k) { + for (IndexType i = 0; i < 2; ++i) { + for (IndexType j = 0; j < 2; ++j) { + for (IndexType k = 0; k < 3; ++k) { VERIFY_IS_EQUAL(slice2(0,0,i,j,k), tensor(1,1,3+i,4+j,5+k)); } } @@ -219,7 +219,8 @@ static void test_strided_slice_write_sycl(const Eigen::SyclDevice& sycl_device) sycl_device.memcpyDeviceToHost(tensor.data(), gpu_data1,(tensor.size())*sizeof(DataType)); sycl_device.memcpyDeviceToHost(tensor2.data(), gpu_data2,(tensor2.size())*sizeof(DataType)); - for(int i=0;i<sizeDim1;i++) for(int j=0;j<sizeDim2;j++){ + for(IndexType i=0;i<sizeDim1;i++) + for(IndexType j=0;j<sizeDim2;j++){ VERIFY_IS_EQUAL(tensor(i,j), tensor2(i,j)); } sycl_device.deallocate(gpu_data1); @@ -230,12 +231,12 @@ static void test_strided_slice_write_sycl(const Eigen::SyclDevice& sycl_device) template<typename DataType, typename dev_Selector> void sycl_morphing_test_per_device(dev_Selector s){ QueueInterface queueInterface(s); auto sycl_device = Eigen::SyclDevice(&queueInterface); - test_simple_slice<DataType, RowMajor>(sycl_device); - test_simple_slice<DataType, ColMajor>(sycl_device); - test_simple_reshape<DataType, RowMajor>(sycl_device); - test_simple_reshape<DataType, ColMajor>(sycl_device); - test_reshape_as_lvalue<DataType, RowMajor>(sycl_device); - test_reshape_as_lvalue<DataType, ColMajor>(sycl_device); + test_simple_slice<DataType, RowMajor, int64_t>(sycl_device); + test_simple_slice<DataType, ColMajor, int64_t>(sycl_device); + test_simple_reshape<DataType, RowMajor, int64_t>(sycl_device); + test_simple_reshape<DataType, ColMajor, int64_t>(sycl_device); + test_reshape_as_lvalue<DataType, RowMajor, int64_t>(sycl_device); + test_reshape_as_lvalue<DataType, ColMajor, int64_t>(sycl_device); test_strided_slice_write_sycl<DataType, ColMajor, int64_t>(sycl_device); test_strided_slice_write_sycl<DataType, RowMajor, int64_t>(sycl_device); } diff --git a/unsupported/test/cxx11_tensor_padding_sycl.cpp b/unsupported/test/cxx11_tensor_padding_sycl.cpp index 9e86e4b52..dc748b73e 100644 --- a/unsupported/test/cxx11_tensor_padding_sycl.cpp +++ b/unsupported/test/cxx11_tensor_padding_sycl.cpp @@ -16,7 +16,7 @@ #define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_COMPLEX #define EIGEN_TEST_FUNC cxx11_tensor_padding_sycl -#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t #define EIGEN_USE_SYCL @@ -69,10 +69,10 @@ static void test_simple_padding(const Eigen::SyclDevice& sycl_device) sycl_device.memcpyHostToDevice(gpu_data1, tensor.data(),(tensor.size())*sizeof(DataType)); gpu2.device(sycl_device)=gpu1.pad(paddings); sycl_device.memcpyDeviceToHost(padded.data(), gpu_data2,(padded.size())*sizeof(DataType)); - for (int i = 0; i < padedSizeDim1; ++i) { - for (int j = 0; j < padedSizeDim2; ++j) { - for (int k = 0; k < padedSizeDim3; ++k) { - for (int l = 0; l < padedSizeDim4; ++l) { + for (IndexType i = 0; i < padedSizeDim1; ++i) { + for (IndexType j = 0; j < padedSizeDim2; ++j) { + for (IndexType k = 0; k < padedSizeDim3; ++k) { + for (IndexType l = 0; l < padedSizeDim4; ++l) { if (j >= 2 && j < 5 && k >= 3 && k < 8) { VERIFY_IS_EQUAL(padded(i,j,k,l), tensor(i,j-2,k-3,l)); } else { @@ -121,10 +121,10 @@ static void test_padded_expr(const Eigen::SyclDevice& sycl_device) gpu2.device(sycl_device)=gpu1.pad(paddings).reshape(reshape_dims); sycl_device.memcpyDeviceToHost(result.data(), gpu_data2,(result.size())*sizeof(DataType)); - for (int i = 0; i < 2; ++i) { - for (int j = 0; j < 6; ++j) { - for (int k = 0; k < 12; ++k) { - for (int l = 0; l < 7; ++l) { + for (IndexType i = 0; i < 2; ++i) { + for (IndexType j = 0; j < 6; ++j) { + for (IndexType k = 0; k < 12; ++k) { + for (IndexType l = 0; l < 7; ++l) { const float result_value = DataLayout == ColMajor ? result(i+2*j,k+12*l) : result(j+6*i,l+7*k); if (j >= 2 && j < 5 && k >= 3 && k < 8) { @@ -143,10 +143,6 @@ static void test_padded_expr(const Eigen::SyclDevice& sycl_device) template<typename DataType, typename dev_Selector> void sycl_padding_test_per_device(dev_Selector s){ QueueInterface queueInterface(s); auto sycl_device = Eigen::SyclDevice(&queueInterface); - test_simple_padding<DataType, RowMajor, int>(sycl_device); - test_simple_padding<DataType, ColMajor, int>(sycl_device); - test_padded_expr<DataType, RowMajor, int>(sycl_device); - test_padded_expr<DataType, ColMajor, int>(sycl_device); test_simple_padding<DataType, RowMajor, int64_t>(sycl_device); test_simple_padding<DataType, ColMajor, int64_t>(sycl_device); test_padded_expr<DataType, RowMajor, int64_t>(sycl_device); diff --git a/unsupported/test/cxx11_tensor_reduction_sycl.cpp b/unsupported/test/cxx11_tensor_reduction_sycl.cpp index 941469029..98a59a14c 100644 --- a/unsupported/test/cxx11_tensor_reduction_sycl.cpp +++ b/unsupported/test/cxx11_tensor_reduction_sycl.cpp @@ -14,23 +14,23 @@ #define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_COMPLEX #define EIGEN_TEST_FUNC cxx11_tensor_reduction_sycl -#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t #define EIGEN_USE_SYCL #include "main.h" #include <unsupported/Eigen/CXX11/Tensor> -template <typename DataType, int DataLayout> +template <typename DataType, int DataLayout, typename IndexType> static void test_full_reductions_sycl(const Eigen::SyclDevice& sycl_device) { - const int num_rows = 452; - const int num_cols = 765; - array<int, 2> tensorRange = {{num_rows, num_cols}}; + const IndexType num_rows = 452; + const IndexType num_cols = 765; + array<IndexType, 2> tensorRange = {{num_rows, num_cols}}; - Tensor<DataType, 2, DataLayout> in(tensorRange); - Tensor<DataType, 0, DataLayout> full_redux; - Tensor<DataType, 0, DataLayout> full_redux_gpu; + Tensor<DataType, 2, DataLayout, IndexType> in(tensorRange); + Tensor<DataType, 0, DataLayout, IndexType> full_redux; + Tensor<DataType, 0, DataLayout, IndexType> full_redux_gpu; in.setRandom(); @@ -39,8 +39,8 @@ static void test_full_reductions_sycl(const Eigen::SyclDevice& sycl_device) { DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(in.dimensions().TotalSize()*sizeof(DataType))); DataType* gpu_out_data =(DataType*)sycl_device.allocate(sizeof(DataType)); - TensorMap<Tensor<DataType, 2, DataLayout> > in_gpu(gpu_in_data, tensorRange); - TensorMap<Tensor<DataType, 0, DataLayout> > out_gpu(gpu_out_data); + TensorMap<Tensor<DataType, 2, DataLayout, IndexType> > in_gpu(gpu_in_data, tensorRange); + TensorMap<Tensor<DataType, 0, DataLayout, IndexType> > out_gpu(gpu_out_data); sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.dimensions().TotalSize())*sizeof(DataType)); out_gpu.device(sycl_device) = in_gpu.sum(); @@ -51,21 +51,21 @@ static void test_full_reductions_sycl(const Eigen::SyclDevice& sycl_device) { sycl_device.deallocate(gpu_in_data); sycl_device.deallocate(gpu_out_data); } -template <typename DataType, int DataLayout> +template <typename DataType, int DataLayout, typename IndexType> static void test_first_dim_reductions_sycl(const Eigen::SyclDevice& sycl_device) { - int dim_x = 145; - int dim_y = 1; - int dim_z = 67; + IndexType dim_x = 145; + IndexType dim_y = 1; + IndexType dim_z = 67; - array<int, 3> tensorRange = {{dim_x, dim_y, dim_z}}; - Eigen::array<int, 1> red_axis; + array<IndexType, 3> tensorRange = {{dim_x, dim_y, dim_z}}; + Eigen::array<IndexType, 1> red_axis; red_axis[0] = 0; - array<int, 2> reduced_tensorRange = {{dim_y, dim_z}}; + array<IndexType, 2> reduced_tensorRange = {{dim_y, dim_z}}; - Tensor<DataType, 3, DataLayout> in(tensorRange); - Tensor<DataType, 2, DataLayout> redux(reduced_tensorRange); - Tensor<DataType, 2, DataLayout> redux_gpu(reduced_tensorRange); + Tensor<DataType, 3, DataLayout, IndexType> in(tensorRange); + Tensor<DataType, 2, DataLayout, IndexType> redux(reduced_tensorRange); + Tensor<DataType, 2, DataLayout, IndexType> redux_gpu(reduced_tensorRange); in.setRandom(); @@ -74,37 +74,37 @@ static void test_first_dim_reductions_sycl(const Eigen::SyclDevice& sycl_device) DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(in.dimensions().TotalSize()*sizeof(DataType))); DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(redux_gpu.dimensions().TotalSize()*sizeof(DataType))); - TensorMap<Tensor<DataType, 3, DataLayout> > in_gpu(gpu_in_data, tensorRange); - TensorMap<Tensor<DataType, 2, DataLayout> > out_gpu(gpu_out_data, reduced_tensorRange); + TensorMap<Tensor<DataType, 3, DataLayout, IndexType> > in_gpu(gpu_in_data, tensorRange); + TensorMap<Tensor<DataType, 2, DataLayout, IndexType> > out_gpu(gpu_out_data, reduced_tensorRange); sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.dimensions().TotalSize())*sizeof(DataType)); out_gpu.device(sycl_device) = in_gpu.sum(red_axis); sycl_device.memcpyDeviceToHost(redux_gpu.data(), gpu_out_data, redux_gpu.dimensions().TotalSize()*sizeof(DataType)); // Check that the CPU and GPU reductions return the same result. - for(int j=0; j<reduced_tensorRange[0]; j++ ) - for(int k=0; k<reduced_tensorRange[1]; k++ ) + for(IndexType j=0; j<reduced_tensorRange[0]; j++ ) + for(IndexType k=0; k<reduced_tensorRange[1]; k++ ) VERIFY_IS_APPROX(redux_gpu(j,k), redux(j,k)); sycl_device.deallocate(gpu_in_data); sycl_device.deallocate(gpu_out_data); } -template <typename DataType, int DataLayout> +template <typename DataType, int DataLayout, typename IndexType> static void test_last_dim_reductions_sycl(const Eigen::SyclDevice &sycl_device) { - int dim_x = 567; - int dim_y = 1; - int dim_z = 47; + IndexType dim_x = 567; + IndexType dim_y = 1; + IndexType dim_z = 47; - array<int, 3> tensorRange = {{dim_x, dim_y, dim_z}}; - Eigen::array<int, 1> red_axis; + array<IndexType, 3> tensorRange = {{dim_x, dim_y, dim_z}}; + Eigen::array<IndexType, 1> red_axis; red_axis[0] = 2; - array<int, 2> reduced_tensorRange = {{dim_x, dim_y}}; + array<IndexType, 2> reduced_tensorRange = {{dim_x, dim_y}}; - Tensor<DataType, 3, DataLayout> in(tensorRange); - Tensor<DataType, 2, DataLayout> redux(reduced_tensorRange); - Tensor<DataType, 2, DataLayout> redux_gpu(reduced_tensorRange); + Tensor<DataType, 3, DataLayout, IndexType> in(tensorRange); + Tensor<DataType, 2, DataLayout, IndexType> redux(reduced_tensorRange); + Tensor<DataType, 2, DataLayout, IndexType> redux_gpu(reduced_tensorRange); in.setRandom(); @@ -113,15 +113,15 @@ static void test_last_dim_reductions_sycl(const Eigen::SyclDevice &sycl_device) DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(in.dimensions().TotalSize()*sizeof(DataType))); DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(redux_gpu.dimensions().TotalSize()*sizeof(DataType))); - TensorMap<Tensor<DataType, 3, DataLayout> > in_gpu(gpu_in_data, tensorRange); - TensorMap<Tensor<DataType, 2, DataLayout> > out_gpu(gpu_out_data, reduced_tensorRange); + TensorMap<Tensor<DataType, 3, DataLayout, IndexType> > in_gpu(gpu_in_data, tensorRange); + TensorMap<Tensor<DataType, 2, DataLayout, IndexType> > out_gpu(gpu_out_data, reduced_tensorRange); sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.dimensions().TotalSize())*sizeof(DataType)); out_gpu.device(sycl_device) = in_gpu.sum(red_axis); sycl_device.memcpyDeviceToHost(redux_gpu.data(), gpu_out_data, redux_gpu.dimensions().TotalSize()*sizeof(DataType)); // Check that the CPU and GPU reductions return the same result. - for(int j=0; j<reduced_tensorRange[0]; j++ ) - for(int k=0; k<reduced_tensorRange[1]; k++ ) + for(IndexType j=0; j<reduced_tensorRange[0]; j++ ) + for(IndexType k=0; k<reduced_tensorRange[1]; k++ ) VERIFY_IS_APPROX(redux_gpu(j,k), redux(j,k)); sycl_device.deallocate(gpu_in_data); @@ -133,12 +133,12 @@ template<typename DataType> void sycl_reduction_test_per_device(const cl::sycl:: QueueInterface queueInterface(d); auto sycl_device = Eigen::SyclDevice(&queueInterface); - test_full_reductions_sycl<DataType, RowMajor>(sycl_device); - test_first_dim_reductions_sycl<DataType, RowMajor>(sycl_device); - test_last_dim_reductions_sycl<DataType, RowMajor>(sycl_device); - test_full_reductions_sycl<DataType, ColMajor>(sycl_device); - test_first_dim_reductions_sycl<DataType, ColMajor>(sycl_device); - test_last_dim_reductions_sycl<DataType, ColMajor>(sycl_device); + test_full_reductions_sycl<DataType, RowMajor, int64_t>(sycl_device); + test_first_dim_reductions_sycl<DataType, RowMajor, int64_t>(sycl_device); + test_last_dim_reductions_sycl<DataType, RowMajor, int64_t>(sycl_device); + test_full_reductions_sycl<DataType, ColMajor, int64_t>(sycl_device); + test_first_dim_reductions_sycl<DataType, ColMajor, int64_t>(sycl_device); + test_last_dim_reductions_sycl<DataType, ColMajor, int64_t>(sycl_device); } void test_cxx11_tensor_reduction_sycl() { for (const auto& device :Eigen::get_sycl_supported_devices()) { diff --git a/unsupported/test/cxx11_tensor_reverse_sycl.cpp b/unsupported/test/cxx11_tensor_reverse_sycl.cpp index 73b394c18..2f5484484 100644 --- a/unsupported/test/cxx11_tensor_reverse_sycl.cpp +++ b/unsupported/test/cxx11_tensor_reverse_sycl.cpp @@ -14,24 +14,24 @@ #define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_COMPLEX #define EIGEN_TEST_FUNC cxx11_tensor_reverse_sycl -#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t #define EIGEN_USE_SYCL #include "main.h" #include <unsupported/Eigen/CXX11/Tensor> -template <typename DataType, int DataLayout> +template <typename DataType, int DataLayout, typename IndexType> static void test_simple_reverse(const Eigen::SyclDevice& sycl_device) { - int dim1 = 2; - int dim2 = 3; - int dim3 = 5; - int dim4 = 7; + IndexType dim1 = 2; + IndexType dim2 = 3; + IndexType dim3 = 5; + IndexType dim4 = 7; - array<int, 4> tensorRange = {{dim1, dim2, dim3, dim4}}; - Tensor<DataType, 4, DataLayout> tensor(tensorRange); - Tensor<DataType, 4, DataLayout> reversed_tensor(tensorRange); + array<IndexType, 4> tensorRange = {{dim1, dim2, dim3, dim4}}; + Tensor<DataType, 4, DataLayout, IndexType> tensor(tensorRange); + Tensor<DataType, 4, DataLayout, IndexType> reversed_tensor(tensorRange); tensor.setRandom(); array<bool, 4> dim_rev; @@ -43,17 +43,17 @@ static void test_simple_reverse(const Eigen::SyclDevice& sycl_device) { DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(tensor.dimensions().TotalSize()*sizeof(DataType))); DataType* gpu_out_data =static_cast<DataType*>(sycl_device.allocate(reversed_tensor.dimensions().TotalSize()*sizeof(DataType))); - TensorMap<Tensor<DataType, 4, DataLayout> > in_gpu(gpu_in_data, tensorRange); - TensorMap<Tensor<DataType, 4, DataLayout> > out_gpu(gpu_out_data, tensorRange); + TensorMap<Tensor<DataType, 4, DataLayout, IndexType> > in_gpu(gpu_in_data, tensorRange); + TensorMap<Tensor<DataType, 4, DataLayout, IndexType> > out_gpu(gpu_out_data, tensorRange); sycl_device.memcpyHostToDevice(gpu_in_data, tensor.data(),(tensor.dimensions().TotalSize())*sizeof(DataType)); out_gpu.device(sycl_device) = in_gpu.reverse(dim_rev); sycl_device.memcpyDeviceToHost(reversed_tensor.data(), gpu_out_data, reversed_tensor.dimensions().TotalSize()*sizeof(DataType)); // Check that the CPU and GPU reductions return the same result. - for (int i = 0; i < 2; ++i) { - for (int j = 0; j < 3; ++j) { - for (int k = 0; k < 5; ++k) { - for (int l = 0; l < 7; ++l) { + for (IndexType i = 0; i < 2; ++i) { + for (IndexType j = 0; j < 3; ++j) { + for (IndexType k = 0; k < 5; ++k) { + for (IndexType l = 0; l < 7; ++l) { VERIFY_IS_EQUAL(tensor(i,j,k,l), reversed_tensor(i,2-j,4-k,l)); } } @@ -67,10 +67,10 @@ static void test_simple_reverse(const Eigen::SyclDevice& sycl_device) { out_gpu.device(sycl_device) = in_gpu.reverse(dim_rev); sycl_device.memcpyDeviceToHost(reversed_tensor.data(), gpu_out_data, reversed_tensor.dimensions().TotalSize()*sizeof(DataType)); - for (int i = 0; i < 2; ++i) { - for (int j = 0; j < 3; ++j) { - for (int k = 0; k < 5; ++k) { - for (int l = 0; l < 7; ++l) { + for (IndexType i = 0; i < 2; ++i) { + for (IndexType j = 0; j < 3; ++j) { + for (IndexType k = 0; k < 5; ++k) { + for (IndexType l = 0; l < 7; ++l) { VERIFY_IS_EQUAL(tensor(i,j,k,l), reversed_tensor(1-i,j,k,l)); } } @@ -84,10 +84,10 @@ static void test_simple_reverse(const Eigen::SyclDevice& sycl_device) { out_gpu.device(sycl_device) = in_gpu.reverse(dim_rev); sycl_device.memcpyDeviceToHost(reversed_tensor.data(), gpu_out_data, reversed_tensor.dimensions().TotalSize()*sizeof(DataType)); - for (int i = 0; i < 2; ++i) { - for (int j = 0; j < 3; ++j) { - for (int k = 0; k < 5; ++k) { - for (int l = 0; l < 7; ++l) { + for (IndexType i = 0; i < 2; ++i) { + for (IndexType j = 0; j < 3; ++j) { + for (IndexType k = 0; k < 5; ++k) { + for (IndexType l = 0; l < 7; ++l) { VERIFY_IS_EQUAL(tensor(i,j,k,l), reversed_tensor(1-i,j,k,6-l)); } } @@ -100,18 +100,18 @@ static void test_simple_reverse(const Eigen::SyclDevice& sycl_device) { -template <typename DataType, int DataLayout> +template <typename DataType, int DataLayout, typename IndexType> static void test_expr_reverse(const Eigen::SyclDevice& sycl_device, bool LValue) { - int dim1 = 2; - int dim2 = 3; - int dim3 = 5; - int dim4 = 7; - - array<int, 4> tensorRange = {{dim1, dim2, dim3, dim4}}; - Tensor<DataType, 4, DataLayout> tensor(tensorRange); - Tensor<DataType, 4, DataLayout> expected(tensorRange); - Tensor<DataType, 4, DataLayout> result(tensorRange); + IndexType dim1 = 2; + IndexType dim2 = 3; + IndexType dim3 = 5; + IndexType dim4 = 7; + + array<IndexType, 4> tensorRange = {{dim1, dim2, dim3, dim4}}; + Tensor<DataType, 4, DataLayout, IndexType> tensor(tensorRange); + Tensor<DataType, 4, DataLayout, IndexType> expected(tensorRange); + Tensor<DataType, 4, DataLayout, IndexType> result(tensorRange); tensor.setRandom(); array<bool, 4> dim_rev; @@ -124,9 +124,9 @@ static void test_expr_reverse(const Eigen::SyclDevice& sycl_device, bool LValue DataType* gpu_out_data_expected =static_cast<DataType*>(sycl_device.allocate(expected.dimensions().TotalSize()*sizeof(DataType))); DataType* gpu_out_data_result =static_cast<DataType*>(sycl_device.allocate(result.dimensions().TotalSize()*sizeof(DataType))); - TensorMap<Tensor<DataType, 4, DataLayout> > in_gpu(gpu_in_data, tensorRange); - TensorMap<Tensor<DataType, 4, DataLayout> > out_gpu_expected(gpu_out_data_expected, tensorRange); - TensorMap<Tensor<DataType, 4, DataLayout> > out_gpu_result(gpu_out_data_result, tensorRange); + TensorMap<Tensor<DataType, 4, DataLayout, IndexType> > in_gpu(gpu_in_data, tensorRange); + TensorMap<Tensor<DataType, 4, DataLayout, IndexType> > out_gpu_expected(gpu_out_data_expected, tensorRange); + TensorMap<Tensor<DataType, 4, DataLayout, IndexType> > out_gpu_result(gpu_out_data_result, tensorRange); sycl_device.memcpyHostToDevice(gpu_in_data, tensor.data(),(tensor.dimensions().TotalSize())*sizeof(DataType)); @@ -139,20 +139,20 @@ static void test_expr_reverse(const Eigen::SyclDevice& sycl_device, bool LValue sycl_device.memcpyDeviceToHost(expected.data(), gpu_out_data_expected, expected.dimensions().TotalSize()*sizeof(DataType)); - array<int, 4> src_slice_dim; + array<IndexType, 4> src_slice_dim; src_slice_dim[0] = 2; src_slice_dim[1] = 3; src_slice_dim[2] = 1; src_slice_dim[3] = 7; - array<int, 4> src_slice_start; + array<IndexType, 4> src_slice_start; src_slice_start[0] = 0; src_slice_start[1] = 0; src_slice_start[2] = 0; src_slice_start[3] = 0; - array<int, 4> dst_slice_dim = src_slice_dim; - array<int, 4> dst_slice_start = src_slice_start; + array<IndexType, 4> dst_slice_dim = src_slice_dim; + array<IndexType, 4> dst_slice_start = src_slice_start; - for (int i = 0; i < 5; ++i) { + for (IndexType i = 0; i < 5; ++i) { if (LValue) { out_gpu_result.slice(dst_slice_start, dst_slice_dim).reverse(dim_rev).device(sycl_device) = in_gpu.slice(src_slice_start, src_slice_dim); @@ -165,10 +165,10 @@ static void test_expr_reverse(const Eigen::SyclDevice& sycl_device, bool LValue } sycl_device.memcpyDeviceToHost(result.data(), gpu_out_data_result, result.dimensions().TotalSize()*sizeof(DataType)); - for (int i = 0; i < expected.dimension(0); ++i) { - for (int j = 0; j < expected.dimension(1); ++j) { - for (int k = 0; k < expected.dimension(2); ++k) { - for (int l = 0; l < expected.dimension(3); ++l) { + for (IndexType i = 0; i < expected.dimension(0); ++i) { + for (IndexType j = 0; j < expected.dimension(1); ++j) { + for (IndexType k = 0; k < expected.dimension(2); ++k) { + for (IndexType l = 0; l < expected.dimension(3); ++l) { VERIFY_IS_EQUAL(result(i,j,k,l), expected(i,j,k,l)); } } @@ -178,7 +178,7 @@ static void test_expr_reverse(const Eigen::SyclDevice& sycl_device, bool LValue dst_slice_start[2] = 0; result.setRandom(); sycl_device.memcpyHostToDevice(gpu_out_data_result, result.data(),(result.dimensions().TotalSize())*sizeof(DataType)); - for (int i = 0; i < 5; ++i) { + for (IndexType i = 0; i < 5; ++i) { if (LValue) { out_gpu_result.slice(dst_slice_start, dst_slice_dim).reverse(dim_rev).device(sycl_device) = in_gpu.slice(dst_slice_start, dst_slice_dim); @@ -190,10 +190,10 @@ static void test_expr_reverse(const Eigen::SyclDevice& sycl_device, bool LValue } sycl_device.memcpyDeviceToHost(result.data(), gpu_out_data_result, result.dimensions().TotalSize()*sizeof(DataType)); - for (int i = 0; i < expected.dimension(0); ++i) { - for (int j = 0; j < expected.dimension(1); ++j) { - for (int k = 0; k < expected.dimension(2); ++k) { - for (int l = 0; l < expected.dimension(3); ++l) { + for (IndexType i = 0; i < expected.dimension(0); ++i) { + for (IndexType j = 0; j < expected.dimension(1); ++j) { + for (IndexType k = 0; k < expected.dimension(2); ++k) { + for (IndexType l = 0; l < expected.dimension(3); ++l) { VERIFY_IS_EQUAL(result(i,j,k,l), expected(i,j,k,l)); } } @@ -207,12 +207,12 @@ template<typename DataType> void sycl_reverse_test_per_device(const cl::sycl::de std::cout << "Running on " << d.template get_info<cl::sycl::info::device::name>() << std::endl; QueueInterface queueInterface(d); auto sycl_device = Eigen::SyclDevice(&queueInterface); - test_simple_reverse<DataType, RowMajor>(sycl_device); - test_simple_reverse<DataType, ColMajor>(sycl_device); - test_expr_reverse<DataType, RowMajor>(sycl_device, false); - test_expr_reverse<DataType, ColMajor>(sycl_device, false); - test_expr_reverse<DataType, RowMajor>(sycl_device, true); - test_expr_reverse<DataType, ColMajor>(sycl_device, true); + test_simple_reverse<DataType, RowMajor, int64_t>(sycl_device); + test_simple_reverse<DataType, ColMajor, int64_t>(sycl_device); + test_expr_reverse<DataType, RowMajor, int64_t>(sycl_device, false); + test_expr_reverse<DataType, ColMajor, int64_t>(sycl_device, false); + test_expr_reverse<DataType, RowMajor, int64_t>(sycl_device, true); + test_expr_reverse<DataType, ColMajor, int64_t>(sycl_device, true); } void test_cxx11_tensor_reverse_sycl() { for (const auto& device :Eigen::get_sycl_supported_devices()) { diff --git a/unsupported/test/cxx11_tensor_shuffling_sycl.cpp b/unsupported/test/cxx11_tensor_shuffling_sycl.cpp index c4521aac8..c88db7c72 100644 --- a/unsupported/test/cxx11_tensor_shuffling_sycl.cpp +++ b/unsupported/test/cxx11_tensor_shuffling_sycl.cpp @@ -16,7 +16,7 @@ #define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_COMPLEX #define EIGEN_TEST_FUNC cxx11_tensor_shuffling_sycl -#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t #define EIGEN_USE_SYCL @@ -28,20 +28,20 @@ using Eigen::SyclDevice; using Eigen::Tensor; using Eigen::TensorMap; -template <typename DataType, int DataLayout, typename IndexTypes> +template <typename DataType, int DataLayout, typename IndexType> static void test_simple_shuffling_sycl(const Eigen::SyclDevice& sycl_device) { - IndexTypes sizeDim1 = 2; - IndexTypes sizeDim2 = 3; - IndexTypes sizeDim3 = 5; - IndexTypes sizeDim4 = 7; - array<IndexTypes, 4> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}}; - Tensor<DataType, 4, DataLayout,IndexTypes> tensor(tensorRange); - Tensor<DataType, 4, DataLayout,IndexTypes> no_shuffle(tensorRange); + IndexType sizeDim1 = 2; + IndexType sizeDim2 = 3; + IndexType sizeDim3 = 5; + IndexType sizeDim4 = 7; + array<IndexType, 4> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}}; + Tensor<DataType, 4, DataLayout,IndexType> tensor(tensorRange); + Tensor<DataType, 4, DataLayout,IndexType> no_shuffle(tensorRange); tensor.setRandom(); const size_t buffSize =tensor.size()*sizeof(DataType); - array<IndexTypes, 4> shuffles; + array<IndexType, 4> shuffles; shuffles[0] = 0; shuffles[1] = 1; shuffles[2] = 2; @@ -50,8 +50,8 @@ static void test_simple_shuffling_sycl(const Eigen::SyclDevice& sycl_device) DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(buffSize)); - TensorMap<Tensor<DataType, 4, DataLayout,IndexTypes>> gpu1(gpu_data1, tensorRange); - TensorMap<Tensor<DataType, 4, DataLayout,IndexTypes>> gpu2(gpu_data2, tensorRange); + TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu1(gpu_data1, tensorRange); + TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu2(gpu_data2, tensorRange); sycl_device.memcpyHostToDevice(gpu_data1, tensor.data(), buffSize); @@ -64,10 +64,10 @@ static void test_simple_shuffling_sycl(const Eigen::SyclDevice& sycl_device) VERIFY_IS_EQUAL(no_shuffle.dimension(2), sizeDim3); VERIFY_IS_EQUAL(no_shuffle.dimension(3), sizeDim4); - for (int i = 0; i < sizeDim1; ++i) { - for (int j = 0; j < sizeDim2; ++j) { - for (int k = 0; k < sizeDim3; ++k) { - for (int l = 0; l < sizeDim4; ++l) { + for (IndexType i = 0; i < sizeDim1; ++i) { + for (IndexType j = 0; j < sizeDim2; ++j) { + for (IndexType k = 0; k < sizeDim3; ++k) { + for (IndexType l = 0; l < sizeDim4; ++l) { VERIFY_IS_EQUAL(tensor(i,j,k,l), no_shuffle(i,j,k,l)); } } @@ -78,10 +78,10 @@ static void test_simple_shuffling_sycl(const Eigen::SyclDevice& sycl_device) shuffles[1] = 3; shuffles[2] = 1; shuffles[3] = 0; - array<IndexTypes, 4> tensorrangeShuffle = {{sizeDim3, sizeDim4, sizeDim2, sizeDim1}}; - Tensor<DataType, 4, DataLayout,IndexTypes> shuffle(tensorrangeShuffle); + array<IndexType, 4> tensorrangeShuffle = {{sizeDim3, sizeDim4, sizeDim2, sizeDim1}}; + Tensor<DataType, 4, DataLayout,IndexType> shuffle(tensorrangeShuffle); DataType* gpu_data3 = static_cast<DataType*>(sycl_device.allocate(buffSize)); - TensorMap<Tensor<DataType, 4,DataLayout,IndexTypes>> gpu3(gpu_data3, tensorrangeShuffle); + TensorMap<Tensor<DataType, 4,DataLayout,IndexType>> gpu3(gpu_data3, tensorrangeShuffle); gpu3.device(sycl_device)=gpu1.shuffle(shuffles); sycl_device.memcpyDeviceToHost(shuffle.data(), gpu_data3, buffSize); @@ -92,10 +92,10 @@ static void test_simple_shuffling_sycl(const Eigen::SyclDevice& sycl_device) VERIFY_IS_EQUAL(shuffle.dimension(2), sizeDim2); VERIFY_IS_EQUAL(shuffle.dimension(3), sizeDim1); - for (int i = 0; i < sizeDim1; ++i) { - for (int j = 0; j < sizeDim2; ++j) { - for (int k = 0; k < sizeDim3; ++k) { - for (int l = 0; l < sizeDim4; ++l) { + for (IndexType i = 0; i < sizeDim1; ++i) { + for (IndexType j = 0; j < sizeDim2; ++j) { + for (IndexType k = 0; k < sizeDim3; ++k) { + for (IndexType l = 0; l < sizeDim4; ++l) { VERIFY_IS_EQUAL(tensor(i,j,k,l), shuffle(k,l,j,i)); } } @@ -107,9 +107,6 @@ static void test_simple_shuffling_sycl(const Eigen::SyclDevice& sycl_device) template<typename DataType, typename dev_Selector> void sycl_shuffling_test_per_device(dev_Selector s){ QueueInterface queueInterface(s); auto sycl_device = Eigen::SyclDevice(&queueInterface); - test_simple_shuffling_sycl<DataType, RowMajor, int>(sycl_device); - test_simple_shuffling_sycl<DataType, ColMajor, int>(sycl_device); - test_simple_shuffling_sycl<DataType, RowMajor, int64_t>(sycl_device); test_simple_shuffling_sycl<DataType, ColMajor, int64_t>(sycl_device); diff --git a/unsupported/test/cxx11_tensor_striding_sycl.cpp b/unsupported/test/cxx11_tensor_striding_sycl.cpp index 2cbb18f1c..603c3746f 100644 --- a/unsupported/test/cxx11_tensor_striding_sycl.cpp +++ b/unsupported/test/cxx11_tensor_striding_sycl.cpp @@ -14,7 +14,7 @@ #define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_COMPLEX #define EIGEN_TEST_FUNC cxx11_tensor_striding_sycl -#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t #define EIGEN_USE_SYCL #include <iostream> @@ -72,10 +72,10 @@ static void test_simple_striding(const Eigen::SyclDevice& sycl_device) VERIFY_IS_EQUAL(no_stride.dimension(2), 5); VERIFY_IS_EQUAL(no_stride.dimension(3), 7); - for (int i = 0; i < 2; ++i) { - for (int j = 0; j < 3; ++j) { - for (int k = 0; k < 5; ++k) { - for (int l = 0; l < 7; ++l) { + for (IndexType i = 0; i < 2; ++i) { + for (IndexType j = 0; j < 3; ++j) { + for (IndexType k = 0; k < 5; ++k) { + for (IndexType l = 0; l < 7; ++l) { VERIFY_IS_EQUAL(tensor(i,j,k,l), no_stride(i,j,k,l)); } } @@ -97,10 +97,10 @@ static void test_simple_striding(const Eigen::SyclDevice& sycl_device) VERIFY_IS_EQUAL(stride.dimension(2), 3); VERIFY_IS_EQUAL(stride.dimension(3), 3); - for (int i = 0; i < 1; ++i) { - for (int j = 0; j < 1; ++j) { - for (int k = 0; k < 3; ++k) { - for (int l = 0; l < 3; ++l) { + for (IndexType i = 0; i < 1; ++i) { + for (IndexType j = 0; j < 1; ++j) { + for (IndexType k = 0; k < 3; ++k) { + for (IndexType l = 0; l < 3; ++l) { VERIFY_IS_EQUAL(tensor(2*i,4*j,2*k,3*l), stride(i,j,k,l)); } } @@ -151,10 +151,10 @@ static void test_striding_as_lvalue(const Eigen::SyclDevice& sycl_device) gpu_stride.stride(strides).device(sycl_device)=gpu_tensor; sycl_device.memcpyDeviceToHost(stride.data(), d_stride, stride_bytes); - for (int i = 0; i < 2; ++i) { - for (int j = 0; j < 3; ++j) { - for (int k = 0; k < 5; ++k) { - for (int l = 0; l < 7; ++l) { + for (IndexType i = 0; i < 2; ++i) { + for (IndexType j = 0; j < 3; ++j) { + for (IndexType k = 0; k < 5; ++k) { + for (IndexType l = 0; l < 7; ++l) { VERIFY_IS_EQUAL(tensor(i,j,k,l), stride(2*i,4*j,2*k,3*l)); } } @@ -172,10 +172,10 @@ static void test_striding_as_lvalue(const Eigen::SyclDevice& sycl_device) gpu_no_stride.stride(strides).device(sycl_device)=gpu_tensor.stride(no_strides); sycl_device.memcpyDeviceToHost(no_stride.data(), d_no_stride, no_stride_bytes); - for (int i = 0; i < 2; ++i) { - for (int j = 0; j < 3; ++j) { - for (int k = 0; k < 5; ++k) { - for (int l = 0; l < 7; ++l) { + for (IndexType i = 0; i < 2; ++i) { + for (IndexType j = 0; j < 3; ++j) { + for (IndexType k = 0; k < 5; ++k) { + for (IndexType l = 0; l < 7; ++l) { VERIFY_IS_EQUAL(tensor(i,j,k,l), no_stride(2*i,4*j,2*k,3*l)); } } @@ -190,10 +190,10 @@ static void test_striding_as_lvalue(const Eigen::SyclDevice& sycl_device) template <typename Dev_selector> void tensorStridingPerDevice(Dev_selector& s){ QueueInterface queueInterface(s); auto sycl_device=Eigen::SyclDevice(&queueInterface); - test_simple_striding<float, ColMajor, ptrdiff_t>(sycl_device); - test_simple_striding<float, RowMajor, ptrdiff_t>(sycl_device); - test_striding_as_lvalue<float, ColMajor, ptrdiff_t>(sycl_device); - test_striding_as_lvalue<float, RowMajor, ptrdiff_t>(sycl_device); + test_simple_striding<float, ColMajor, int64_t>(sycl_device); + test_simple_striding<float, RowMajor, int64_t>(sycl_device); + test_striding_as_lvalue<float, ColMajor, int64_t>(sycl_device); + test_striding_as_lvalue<float, RowMajor, int64_t>(sycl_device); } void test_cxx11_tensor_striding_sycl() { diff --git a/unsupported/test/cxx11_tensor_sycl.cpp b/unsupported/test/cxx11_tensor_sycl.cpp index 6f7e29890..5cd0f4c71 100644 --- a/unsupported/test/cxx11_tensor_sycl.cpp +++ b/unsupported/test/cxx11_tensor_sycl.cpp @@ -16,7 +16,7 @@ #define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_COMPLEX #define EIGEN_TEST_FUNC cxx11_tensor_sycl -#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t #define EIGEN_USE_SYCL #include "main.h" @@ -27,24 +27,24 @@ using Eigen::SyclDevice; using Eigen::Tensor; using Eigen::TensorMap; -template <typename DataType, int DataLayout> +template <typename DataType, int DataLayout, typename IndexType> void test_sycl_mem_transfers(const Eigen::SyclDevice &sycl_device) { - int sizeDim1 = 100; - int sizeDim2 = 10; - int sizeDim3 = 20; - array<int, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}}; - Tensor<DataType, 3, DataLayout> in1(tensorRange); - Tensor<DataType, 3, DataLayout> out1(tensorRange); - Tensor<DataType, 3, DataLayout> out2(tensorRange); - Tensor<DataType, 3, DataLayout> out3(tensorRange); + IndexType sizeDim1 = 100; + IndexType sizeDim2 = 10; + IndexType sizeDim3 = 20; + array<IndexType, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}}; + Tensor<DataType, 3, DataLayout, IndexType> in1(tensorRange); + Tensor<DataType, 3, DataLayout, IndexType> out1(tensorRange); + Tensor<DataType, 3, DataLayout, IndexType> out2(tensorRange); + Tensor<DataType, 3, DataLayout, IndexType> out3(tensorRange); in1 = in1.random(); DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(in1.size()*sizeof(DataType))); DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(out1.size()*sizeof(DataType))); - TensorMap<Tensor<DataType, 3, DataLayout>> gpu1(gpu_data1, tensorRange); - TensorMap<Tensor<DataType, 3, DataLayout>> gpu2(gpu_data2, tensorRange); + TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu1(gpu_data1, tensorRange); + TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu2(gpu_data2, tensorRange); sycl_device.memcpyHostToDevice(gpu_data1, in1.data(),(in1.size())*sizeof(DataType)); sycl_device.memcpyHostToDevice(gpu_data2, in1.data(),(in1.size())*sizeof(DataType)); @@ -55,7 +55,7 @@ void test_sycl_mem_transfers(const Eigen::SyclDevice &sycl_device) { sycl_device.memcpyDeviceToHost(out3.data(), gpu_data2,(out3.size())*sizeof(DataType)); sycl_device.synchronize(); - for (int i = 0; i < in1.size(); ++i) { + for (IndexType i = 0; i < in1.size(); ++i) { VERIFY_IS_APPROX(out1(i), in1(i) * 3.14f); VERIFY_IS_APPROX(out2(i), in1(i) * 3.14f); VERIFY_IS_APPROX(out3(i), in1(i) * 2.7f); @@ -65,20 +65,20 @@ void test_sycl_mem_transfers(const Eigen::SyclDevice &sycl_device) { sycl_device.deallocate(gpu_data2); } -template <typename DataType, int DataLayout> +template <typename DataType, int DataLayout, typename IndexType> void test_sycl_mem_sync(const Eigen::SyclDevice &sycl_device) { - int size = 20; - array<int, 1> tensorRange = {{size}}; - Tensor<DataType, 1, DataLayout> in1(tensorRange); - Tensor<DataType, 1, DataLayout> in2(tensorRange); - Tensor<DataType, 1, DataLayout> out(tensorRange); + IndexType size = 20; + array<IndexType, 1> tensorRange = {{size}}; + Tensor<DataType, 1, DataLayout, IndexType> in1(tensorRange); + Tensor<DataType, 1, DataLayout, IndexType> in2(tensorRange); + Tensor<DataType, 1, DataLayout, IndexType> out(tensorRange); in1 = in1.random(); in2 = in1; DataType* gpu_data = static_cast<DataType*>(sycl_device.allocate(in1.size()*sizeof(DataType))); - TensorMap<Tensor<DataType, 1, DataLayout>> gpu1(gpu_data, tensorRange); + TensorMap<Tensor<DataType, 1, DataLayout, IndexType>> gpu1(gpu_data, tensorRange); sycl_device.memcpyHostToDevice(gpu_data, in1.data(),(in1.size())*sizeof(DataType)); sycl_device.synchronize(); in1.setZero(); @@ -86,24 +86,24 @@ void test_sycl_mem_sync(const Eigen::SyclDevice &sycl_device) { sycl_device.memcpyDeviceToHost(out.data(), gpu_data, out.size()*sizeof(DataType)); sycl_device.synchronize(); - for (int i = 0; i < in1.size(); ++i) { + for (IndexType i = 0; i < in1.size(); ++i) { VERIFY_IS_APPROX(out(i), in2(i)); } sycl_device.deallocate(gpu_data); } -template <typename DataType, int DataLayout> +template <typename DataType, int DataLayout, typename IndexType> void test_sycl_computations(const Eigen::SyclDevice &sycl_device) { - int sizeDim1 = 100; - int sizeDim2 = 10; - int sizeDim3 = 20; - array<int, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}}; - Tensor<DataType, 3,DataLayout> in1(tensorRange); - Tensor<DataType, 3,DataLayout> in2(tensorRange); - Tensor<DataType, 3,DataLayout> in3(tensorRange); - Tensor<DataType, 3,DataLayout> out(tensorRange); + IndexType sizeDim1 = 100; + IndexType sizeDim2 = 10; + IndexType sizeDim3 = 20; + array<IndexType, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}}; + Tensor<DataType, 3,DataLayout, IndexType> in1(tensorRange); + Tensor<DataType, 3,DataLayout, IndexType> in2(tensorRange); + Tensor<DataType, 3,DataLayout, IndexType> in3(tensorRange); + Tensor<DataType, 3,DataLayout, IndexType> out(tensorRange); in2 = in2.random(); in3 = in3.random(); @@ -113,19 +113,19 @@ void test_sycl_computations(const Eigen::SyclDevice &sycl_device) { DataType * gpu_in3_data = static_cast<DataType*>(sycl_device.allocate(in3.size()*sizeof(DataType))); DataType * gpu_out_data = static_cast<DataType*>(sycl_device.allocate(out.size()*sizeof(DataType))); - TensorMap<Tensor<DataType, 3, DataLayout>> gpu_in1(gpu_in1_data, tensorRange); - TensorMap<Tensor<DataType, 3, DataLayout>> gpu_in2(gpu_in2_data, tensorRange); - TensorMap<Tensor<DataType, 3, DataLayout>> gpu_in3(gpu_in3_data, tensorRange); - TensorMap<Tensor<DataType, 3, DataLayout>> gpu_out(gpu_out_data, tensorRange); + TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu_in1(gpu_in1_data, tensorRange); + TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu_in2(gpu_in2_data, tensorRange); + TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu_in3(gpu_in3_data, tensorRange); + TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu_out(gpu_out_data, tensorRange); /// a=1.2f gpu_in1.device(sycl_device) = gpu_in1.constant(1.2f); sycl_device.memcpyDeviceToHost(in1.data(), gpu_in1_data ,(in1.size())*sizeof(DataType)); sycl_device.synchronize(); - for (int i = 0; i < sizeDim1; ++i) { - for (int j = 0; j < sizeDim2; ++j) { - for (int k = 0; k < sizeDim3; ++k) { + for (IndexType i = 0; i < sizeDim1; ++i) { + for (IndexType j = 0; j < sizeDim2; ++j) { + for (IndexType k = 0; k < sizeDim3; ++k) { VERIFY_IS_APPROX(in1(i,j,k), 1.2f); } } @@ -137,9 +137,9 @@ void test_sycl_computations(const Eigen::SyclDevice &sycl_device) { sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data ,(out.size())*sizeof(DataType)); sycl_device.synchronize(); - for (int i = 0; i < sizeDim1; ++i) { - for (int j = 0; j < sizeDim2; ++j) { - for (int k = 0; k < sizeDim3; ++k) { + for (IndexType i = 0; i < sizeDim1; ++i) { + for (IndexType j = 0; j < sizeDim2; ++j) { + for (IndexType k = 0; k < sizeDim3; ++k) { VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) * 1.2f); } @@ -153,9 +153,9 @@ void test_sycl_computations(const Eigen::SyclDevice &sycl_device) { sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.size())*sizeof(DataType)); sycl_device.synchronize(); - for (int i = 0; i < sizeDim1; ++i) { - for (int j = 0; j < sizeDim2; ++j) { - for (int k = 0; k < sizeDim3; ++k) { + for (IndexType i = 0; i < sizeDim1; ++i) { + for (IndexType j = 0; j < sizeDim2; ++j) { + for (IndexType k = 0; k < sizeDim3; ++k) { VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) * in2(i,j,k)); @@ -168,9 +168,9 @@ void test_sycl_computations(const Eigen::SyclDevice &sycl_device) { gpu_out.device(sycl_device) = gpu_in1 + gpu_in2; sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.size())*sizeof(DataType)); sycl_device.synchronize(); - for (int i = 0; i < sizeDim1; ++i) { - for (int j = 0; j < sizeDim2; ++j) { - for (int k = 0; k < sizeDim3; ++k) { + for (IndexType i = 0; i < sizeDim1; ++i) { + for (IndexType j = 0; j < sizeDim2; ++j) { + for (IndexType k = 0; k < sizeDim3; ++k) { VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k)); @@ -183,9 +183,9 @@ void test_sycl_computations(const Eigen::SyclDevice &sycl_device) { gpu_out.device(sycl_device) = gpu_in1 * gpu_in1; sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.size())*sizeof(DataType)); sycl_device.synchronize(); - for (int i = 0; i < sizeDim1; ++i) { - for (int j = 0; j < sizeDim2; ++j) { - for (int k = 0; k < sizeDim3; ++k) { + for (IndexType i = 0; i < sizeDim1; ++i) { + for (IndexType j = 0; j < sizeDim2; ++j) { + for (IndexType k = 0; k < sizeDim3; ++k) { VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) * in1(i,j,k)); @@ -198,9 +198,9 @@ void test_sycl_computations(const Eigen::SyclDevice &sycl_device) { gpu_out.device(sycl_device) = gpu_in1 * gpu_in1.constant(3.14f) + gpu_in2 * gpu_in2.constant(2.7f); sycl_device.memcpyDeviceToHost(out.data(),gpu_out_data,(out.size())*sizeof(DataType)); sycl_device.synchronize(); - for (int i = 0; i < sizeDim1; ++i) { - for (int j = 0; j < sizeDim2; ++j) { - for (int k = 0; k < sizeDim3; ++k) { + for (IndexType i = 0; i < sizeDim1; ++i) { + for (IndexType j = 0; j < sizeDim2; ++j) { + for (IndexType k = 0; k < sizeDim3; ++k) { VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) * 3.14f + in2(i,j,k) * 2.7f); @@ -214,9 +214,9 @@ void test_sycl_computations(const Eigen::SyclDevice &sycl_device) { gpu_out.device(sycl_device) =(gpu_in1 > gpu_in1.constant(0.5f)).select(gpu_in2, gpu_in3); sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.size())*sizeof(DataType)); sycl_device.synchronize(); - for (int i = 0; i < sizeDim1; ++i) { - for (int j = 0; j < sizeDim2; ++j) { - for (int k = 0; k < sizeDim3; ++k) { + for (IndexType i = 0; i < sizeDim1; ++i) { + for (IndexType j = 0; j < sizeDim2; ++j) { + for (IndexType k = 0; k < sizeDim3; ++k) { VERIFY_IS_APPROX(out(i, j, k), (in1(i, j, k) > 0.5f) ? in2(i, j, k) : in3(i, j, k)); @@ -229,26 +229,26 @@ void test_sycl_computations(const Eigen::SyclDevice &sycl_device) { sycl_device.deallocate(gpu_in3_data); sycl_device.deallocate(gpu_out_data); } -template<typename Scalar1, typename Scalar2, int DataLayout> +template<typename Scalar1, typename Scalar2, int DataLayout, typename IndexType> static void test_sycl_cast(const Eigen::SyclDevice& sycl_device){ - int size = 20; - array<int, 1> tensorRange = {{size}}; - Tensor<Scalar1, 1, DataLayout> in(tensorRange); - Tensor<Scalar2, 1, DataLayout> out(tensorRange); - Tensor<Scalar2, 1, DataLayout> out_host(tensorRange); + IndexType size = 20; + array<IndexType, 1> tensorRange = {{size}}; + Tensor<Scalar1, 1, DataLayout, IndexType> in(tensorRange); + Tensor<Scalar2, 1, DataLayout, IndexType> out(tensorRange); + Tensor<Scalar2, 1, DataLayout, IndexType> out_host(tensorRange); in = in.random(); Scalar1* gpu_in_data = static_cast<Scalar1*>(sycl_device.allocate(in.size()*sizeof(Scalar1))); Scalar2 * gpu_out_data = static_cast<Scalar2*>(sycl_device.allocate(out.size()*sizeof(Scalar2))); - TensorMap<Tensor<Scalar1, 1, DataLayout>> gpu_in(gpu_in_data, tensorRange); - TensorMap<Tensor<Scalar2, 1, DataLayout>> gpu_out(gpu_out_data, tensorRange); + TensorMap<Tensor<Scalar1, 1, DataLayout, IndexType>> gpu_in(gpu_in_data, tensorRange); + TensorMap<Tensor<Scalar2, 1, DataLayout, IndexType>> gpu_out(gpu_out_data, tensorRange); sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.size())*sizeof(Scalar1)); gpu_out.device(sycl_device) = gpu_in. template cast<Scalar2>(); sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data, out.size()*sizeof(Scalar2)); out_host = in. template cast<Scalar2>(); - for(int i=0; i< size; i++) + for(IndexType i=0; i< size; i++) { VERIFY_IS_APPROX(out(i), out_host(i)); } @@ -259,14 +259,14 @@ static void test_sycl_cast(const Eigen::SyclDevice& sycl_device){ template<typename DataType, typename dev_Selector> void sycl_computing_test_per_device(dev_Selector s){ QueueInterface queueInterface(s); auto sycl_device = Eigen::SyclDevice(&queueInterface); - test_sycl_mem_transfers<DataType, RowMajor>(sycl_device); - test_sycl_computations<DataType, RowMajor>(sycl_device); - test_sycl_mem_sync<DataType, RowMajor>(sycl_device); - test_sycl_mem_transfers<DataType, ColMajor>(sycl_device); - test_sycl_computations<DataType, ColMajor>(sycl_device); - test_sycl_mem_sync<DataType, ColMajor>(sycl_device); - test_sycl_cast<DataType, int, RowMajor>(sycl_device); - test_sycl_cast<DataType, int, ColMajor>(sycl_device); + test_sycl_mem_transfers<DataType, RowMajor, int64_t>(sycl_device); + test_sycl_computations<DataType, RowMajor, int64_t>(sycl_device); + test_sycl_mem_sync<DataType, RowMajor, int64_t>(sycl_device); + test_sycl_mem_transfers<DataType, ColMajor, int64_t>(sycl_device); + test_sycl_computations<DataType, ColMajor, int64_t>(sycl_device); + test_sycl_mem_sync<DataType, ColMajor, int64_t>(sycl_device); + test_sycl_cast<DataType, int, RowMajor, int64_t>(sycl_device); + test_sycl_cast<DataType, int, ColMajor, int64_t>(sycl_device); } void test_cxx11_tensor_sycl() { |