| Field | Value | Date |
|---|---|---|
| author | Mehdi Goli <mehdi.goli@codeplay.com> | 2017-02-01 15:29:53 +0000 |
| committer | Mehdi Goli <mehdi.goli@codeplay.com> | 2017-02-01 15:29:53 +0000 |
| commit | bab29936a1cf0a68ffe4ccb1fd9b4807a3ec87ae (patch) | |
| tree | c750b36227a31ddb2a1e0d5fd11f0036fda775db /unsupported/Eigen/CXX11/src | |
| parent | 48a20b7d956433713a39e04d39cba443b7a763de (diff) | |
Reducing warnings in Sycl backend.
Diffstat (limited to 'unsupported/Eigen/CXX11/src')
4 files changed, 69 insertions, 71 deletions
```diff
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h
index dc16f89e0..e87de0c57 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h
@@ -22,7 +22,7 @@
 #define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H
 
 namespace Eigen {
-template <typename LhsScalar, typename RhsScalar,bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered> struct LaunchSyclKernels;
+template <typename Index, typename LhsScalar, typename RhsScalar,bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered> struct LaunchSyclKernels;
 template<typename Indices, typename LeftArgType, typename RightArgType>
 struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, const Eigen::SyclDevice> :
     public TensorContractionEvaluatorBase<TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, const Eigen::SyclDevice> > {
@@ -146,7 +146,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
     // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar)
     this->m_device.memset(buffer, 0, m * n * sizeof(Scalar));
-    LaunchSyclKernels<LhsScalar, RhsScalar,lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered>::Run(*this, buffer, m, n, k,
+    LaunchSyclKernels<Index, LhsScalar, RhsScalar,lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered>::Run(*this, buffer, m, n, k,
     this->m_k_strides, this->m_left_contracting_strides, this->m_right_contracting_strides,
     this->m_i_strides, this->m_j_strides, this->m_left_nocontract_strides, this->m_right_nocontract_strides);
   }
```
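The pattern in these first two hunks recurs throughout the commit: `LaunchSyclKernels` gains the evaluator's `Index` type as a template parameter instead of hard-coding `int`, so the launcher's constants and counters can match the width and signedness of the extents they are compared against. A minimal sketch of the idea, with hypothetical names rather than Eigen's actual interface:

```cpp
#include <cstddef>

// Launcher templated on the expression's index type: with Index =
// std::ptrdiff_t (Eigen's usual DenseIndex), the loop below compares
// like-typed values; a plain `int` counter mixed with a wider or
// unsigned bound is what -Wsign-compare / -Wconversion style warnings flag.
template <typename Index>
struct LauncherSketch {
  static void run(Index m, Index n) {
    for (Index i = 0; i < m * n; ++i) {
      // ... enqueue work for element i ...
    }
  }
};

int main() {
  LauncherSketch<std::ptrdiff_t>::run(32, 32);
  return 0;
}
```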
```diff
@@ -162,8 +162,8 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
 template <typename HostExpr, typename OutScalar, typename LhsScalar, typename RhsScalar, typename LHSFunctorExpr, typename RHSFunctorExpr, typename LhsLocalAcc, typename RhsLocalAcc, typename OutAccessor, typename Index, typename ContractT, typename LeftNocontractT,
 typename RightNocontractT, bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered,
-int TileSizeDimM, int TileSizeDimN,int TileSizeDimK, int WorkLoadPerThreadM,int WorkLoadPerThreadN,
-int LocalThreadSizeM, int LocalThreadSizeN, int LoadPerThreadLhs, int LoadPerThreadRhs, typename LHSTupleType, typename RHSTupleType, typename Device> struct KernelConstructor{
+typename HostExpr::Index TileSizeDimM, typename HostExpr::Index TileSizeDimN,typename HostExpr::Index TileSizeDimK, typename HostExpr::Index WorkLoadPerThreadM,typename HostExpr::Index WorkLoadPerThreadN,
+typename HostExpr::Index LocalThreadSizeM, typename HostExpr::Index LocalThreadSizeN, typename HostExpr::Index LoadPerThreadLhs, typename HostExpr::Index LoadPerThreadRhs, typename LHSTupleType, typename RHSTupleType, typename Device> struct KernelConstructor{
   typedef typename Eigen::internal::traits<HostExpr>::_LhsNested LHSHostExpr;
   typedef typename Eigen::internal::traits<HostExpr>::_RhsNested RHSHostExpr;
   typedef typename Eigen::TensorSycl::internal::createPlaceHolderExpression<LHSHostExpr>::Type LHSPlaceHolderExpr;
```
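This hunk changes the tile-size template parameters from `int` non-type parameters to non-type parameters of the dependent type `typename HostExpr::Index`, so the compile-time tile constants carry the same type as the runtime indices they are multiplied with. A reduced sketch of that C++ feature, with illustrative names only:

```cpp
#include <cstddef>

struct HostExprLike {
  typedef std::ptrdiff_t Index;  // stand-in for the host expression's Index
};

// Non-type template parameters whose *type* is taken from another
// template parameter: these were `int TileM, int TileN` before the patch.
template <typename HostExpr,
          typename HostExpr::Index TileM,
          typename HostExpr::Index TileN>
struct KernelSketch {
  static typename HostExpr::Index tileElems() { return TileM * TileN; }
};

int main() {
  return KernelSketch<HostExprLike, 32, 32>::tileElems() == 1024 ? 0 : 1;
}
```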
```diff
@@ -224,84 +224,83 @@ int LocalThreadSizeM, int LocalThreadSizeN, int LoadPerThreadLhs, int LoadPerThr
     auto out_ptr = ConvertToActualTypeSycl(OutScalar, out_res);
     // Matmul Kernel
     // Thread identifiers
-    const int mLocalThreadId = itemID.get_local(0); // Local ID row
-    const int nLocalThreadId = itemID.get_local(1); // Local ID col
-    const int mGroupId = itemID.get_group(0); // Work-group ID row
-    const int nGroupId = itemID.get_group(1); // Work-group ID localCol
-    const int linearLocalThreadId = nLocalThreadId*LocalThreadSizeM + mLocalThreadId; // linear local thread ID
+    const Index mLocalThreadId = itemID.get_local(0); // Local ID row
+    const Index nLocalThreadId = itemID.get_local(1); // Local ID col
+    const Index mGroupId = itemID.get_group(0); // Work-group ID row
+    const Index nGroupId = itemID.get_group(1); // Work-group ID localCol
+    const Index linearLocalThreadId = nLocalThreadId*LocalThreadSizeM + mLocalThreadId; // linear local thread ID
     // Allocate register space
     float privateLhs;
     float privateRhs[WorkLoadPerThreadN];
     float privateRes[WorkLoadPerThreadM][WorkLoadPerThreadN];
     // Initialise the privateResumulation registers
-    for (int wLPTM=0; wLPTM<WorkLoadPerThreadM; wLPTM++) {
-      for (int wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) {
+    for (Index wLPTM=0; wLPTM<WorkLoadPerThreadM; wLPTM++) {
+      for (Index wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) {
         privateRes[wLPTM][wLPTN] = 0.0f;
       }
     }
     // Tile Lhs
-    for (int lPTL=0; lPTL<LoadPerThreadLhs; lPTL++) {
-      int localLhsLinearId = lPTL*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId;
-      int localLhsRow = localLhsLinearId% TileSizeDimM;
-      int localLhsCol = localLhsLinearId/TileSizeDimM;
+    for (Index lPTL=0; lPTL<LoadPerThreadLhs; lPTL++) {
+      Index localLhsLinearId = lPTL*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId;
+      Index localLhsRow = localLhsLinearId% TileSizeDimM;
+      Index localLhsCol = localLhsLinearId/TileSizeDimM;
       // Load the value (wide vector load)
-      int GlobalLhsColId = TileSizeDimK*0 + localLhsCol;
+      Index GlobalLhsColId = TileSizeDimK*0 + localLhsCol;
       localLhs[0 + ((localLhsCol*TileSizeDimM + localLhsRow)*2)] = ((GlobalLhsColId < K)&& (mGroupId*(TileSizeDimM)+ localLhsRow <M))? lhs(mGroupId*(TileSizeDimM) + localLhsRow, GlobalLhsColId):static_cast<OutScalar>(0);
     }
     // Tile Rhs
-    for (int lPTR=0; lPTR<LoadPerThreadRhs; lPTR++) {
-      int localRhsLinearId = lPTR*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId;
-      int localRhsRow = localRhsLinearId% TileSizeDimN;
-      int localRhsCol = localRhsLinearId/TileSizeDimN;
+    for (Index lPTR=0; lPTR<LoadPerThreadRhs; lPTR++) {
+      Index localRhsLinearId = lPTR*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId;
+      Index localRhsRow = localRhsLinearId% TileSizeDimN;
+      Index localRhsCol = localRhsLinearId/TileSizeDimN;
       // Load the value (wide vector load)
-      int GlobalRhsRowId = TileSizeDimK*0 + localRhsCol;
+      Index GlobalRhsRowId = TileSizeDimK*0 + localRhsCol;
       localRhs[0 + ((localRhsCol*TileSizeDimN + localRhsRow) *2)] = ((GlobalRhsRowId < K)&& ((nGroupId*(TileSizeDimN) + localRhsRow)< N))? rhs(GlobalRhsRowId, nGroupId*(TileSizeDimN) + localRhsRow): static_cast<OutScalar>(0);
     }
     // Loop over all tiles
-    const int numTiles = roundUpK/TileSizeDimK;
-    int firstHalf=0;
+    const Index numTiles = roundUpK/TileSizeDimK;
+    Index firstHalf=0;
     do {
       // Synchronise
       itemID.barrier(cl::sycl::access::fence_space::local_space);
       // Load the next tile of Lhs and Rhs into local memory
-      int nextHalf = firstHalf + 1;
+      Index nextHalf = firstHalf + 1;
       if (nextHalf < numTiles) {
         // Tile A
-        for (int lPTL=0; lPTL<LoadPerThreadLhs; lPTL++) {
-          int localLhsLinearId = lPTL*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId;
-          int localLhsRow = localLhsLinearId% TileSizeDimM;
-          int localLhsCol = localLhsLinearId/TileSizeDimM;
+        for (Index lPTL=0; lPTL<LoadPerThreadLhs; lPTL++) {
+          Index localLhsLinearId = lPTL*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId;
+          Index localLhsRow = localLhsLinearId% TileSizeDimM;
+          Index localLhsCol = localLhsLinearId/TileSizeDimM;
           // global K id
-          int GlobalLhsColId = TileSizeDimK*nextHalf + localLhsCol;
+          Index GlobalLhsColId = TileSizeDimK*nextHalf + localLhsCol;
           // Store the loaded value into local memory
           localLhs[(nextHalf%2) + ((localLhsCol*TileSizeDimM + localLhsRow) *2)] = ((GlobalLhsColId < K)&& (mGroupId*(TileSizeDimM)+ localLhsRow <M))? lhs(mGroupId*(TileSizeDimM) + localLhsRow, GlobalLhsColId): static_cast<OutScalar>(0);
         }
         // Tile B
-        for (int lPTR=0; lPTR<LoadPerThreadRhs; lPTR++) {
-          int localRhsLinearId = lPTR*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId;
-          int localRhsRow = localRhsLinearId% TileSizeDimN;
-          int localRhsCol = localRhsLinearId/TileSizeDimN;
+        for (Index lPTR=0; lPTR<LoadPerThreadRhs; lPTR++) {
+          Index localRhsLinearId = lPTR*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId;
+          Index localRhsRow = localRhsLinearId% TileSizeDimN;
+          Index localRhsCol = localRhsLinearId/TileSizeDimN;
           // Load the value (wide vector load)
-          int GlobalRhsRowId = TileSizeDimK*nextHalf + localRhsCol;
+          Index GlobalRhsRowId = TileSizeDimK*nextHalf + localRhsCol;
           // Store the loaded vector into local memory
           localRhs[(nextHalf%2) +((localRhsCol*TileSizeDimN + localRhsRow)*2)] = ((GlobalRhsRowId < K)&& ((nGroupId*(TileSizeDimN) + localRhsRow)< N))? rhs(GlobalRhsRowId, nGroupId*(TileSizeDimN) + localRhsRow):static_cast<OutScalar>(0);
         }
       }
       // Loop over the values of a single tile
-      for (int k=0; k<TileSizeDimK; k++) {
+      for (Index k=0; k<TileSizeDimK; k++) {
         // Cache the values of localRhs in registers
-        for (int wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) {
-          int localRhsCol = nLocalThreadId + wLPTN*LocalThreadSizeN;
+        for (Index wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) {
+          Index localRhsCol = nLocalThreadId + wLPTN*LocalThreadSizeN;
           privateRhs[wLPTN] = localRhs[(firstHalf%2) +((k*TileSizeDimN + localRhsCol)*2)];
         }
         // Perform the computation
-        for (int wLPTM=0; wLPTM<WorkLoadPerThreadM; wLPTM++) {
-          int localLhsRow = mLocalThreadId + wLPTM*LocalThreadSizeM;
+        for (Index wLPTM=0; wLPTM<WorkLoadPerThreadM; wLPTM++) {
+          Index localLhsRow = mLocalThreadId + wLPTM*LocalThreadSizeM;
           privateLhs = localLhs[(firstHalf%2)+ ((k*TileSizeDimM + localLhsRow)*2)];
-          for (int wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) {
+          for (Index wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) {
             privateRes[wLPTM][wLPTN] += privateLhs * privateRhs[wLPTN];
           }
         }
@@ -311,11 +310,11 @@ int LocalThreadSizeM, int LocalThreadSizeN, int LoadPerThreadLhs, int LoadPerThr
     } while (firstHalf<numTiles);
     // Store the final results in C
-    for (int wLPTM=0; wLPTM<WorkLoadPerThreadM; wLPTM++) {
-      int globalRow = mGroupId*TileSizeDimM + mLocalThreadId + wLPTM*LocalThreadSizeM;
+    for (Index wLPTM=0; wLPTM<WorkLoadPerThreadM; wLPTM++) {
+      Index globalRow = mGroupId*TileSizeDimM + mLocalThreadId + wLPTM*LocalThreadSizeM;
       if (globalRow< M){
-        for (int wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) {
-          int globalCol = nGroupId*TileSizeDimN + nLocalThreadId + wLPTN*LocalThreadSizeN;
+        for (Index wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) {
+          Index globalCol = nGroupId*TileSizeDimN + nLocalThreadId + wLPTN*LocalThreadSizeN;
           if(globalCol<N)
             out_ptr[globalCol*M + globalRow] = privateRes[wLPTM][wLPTN];
         }
```
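The long kernel hunk above is mechanical: every loop counter and local index switches from `int` to `Index`, so that comparisons like `wLPTM < WorkLoadPerThreadM` are between like types once the workload constants are `Index` too. The class of warning this silences can be reproduced outside Eigen:

```cpp
#include <cstddef>
#include <vector>

// With `int i`, the comparison `i < v.size()` mixes a signed int with an
// unsigned std::size_t and draws -Wsign-compare; typing the counter to
// match the bound is the same fix the hunk applies with Index.
float sum(const std::vector<float>& v) {
  float s = 0.0f;
  for (std::size_t i = 0; i < v.size(); ++i)
    s += v[i];
  return s;
}

int main() {
  std::vector<float> v(4, 1.0f);
  return sum(v) == 4.0f ? 0 : 1;
}
```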
```diff
@@ -325,24 +324,24 @@ int LocalThreadSizeM, int LocalThreadSizeN, int LoadPerThreadLhs, int LoadPerThr
   }
 };
 
-template <typename LhsScalar, typename RhsScalar, bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered> struct LaunchSyclKernels {
-
-static const int TileSizeDimM = 32; // Tile size for dimension M
-static const int TileSizeDimN = 32; // Tile size for dimension N
-static const int TileSizeDimK = 16; // Tile size for dimension K
-static const int WorkLoadPerThreadM = 4; // Work load per thread in dimension M
-static const int WorkLoadPerThreadN = 4; // work load per thread in dimension N
-static const int LocalThreadSizeM = (TileSizeDimM/WorkLoadPerThreadM); // Local thread size for the first dimension (M here)
-static const int LocalThreadSizeN = (TileSizeDimN/WorkLoadPerThreadN); // Local thread size for the second dimension (N here)
-static const int LoadPerThreadLhs = ((TileSizeDimK*WorkLoadPerThreadM*WorkLoadPerThreadN)/(TileSizeDimN)); // workload per thread for Lhs expression
-static const int LoadPerThreadRhs = ((TileSizeDimK*WorkLoadPerThreadM*WorkLoadPerThreadN)/(TileSizeDimM)); // workload per thread for Rhs expression
+template <typename Index, typename LhsScalar, typename RhsScalar, bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered> struct LaunchSyclKernels {
+
+static const Index TileSizeDimM = 32ul; // Tile size for dimension M
+static const Index TileSizeDimN = 32ul; // Tile size for dimension N
+static const Index TileSizeDimK = 16ul; // Tile size for dimension K
+static const Index WorkLoadPerThreadM = 4ul; // Work load per thread in dimension M
+static const Index WorkLoadPerThreadN = 4ul; // work load per thread in dimension N
+static const Index LocalThreadSizeM = (TileSizeDimM/WorkLoadPerThreadM); // Local thread size for the first dimension (M here)
+static const Index LocalThreadSizeN = (TileSizeDimN/WorkLoadPerThreadN); // Local thread size for the second dimension (N here)
+static const Index LoadPerThreadLhs = ((TileSizeDimK*WorkLoadPerThreadM*WorkLoadPerThreadN)/(TileSizeDimN)); // workload per thread for Lhs expression
+static const Index LoadPerThreadRhs = ((TileSizeDimK*WorkLoadPerThreadM*WorkLoadPerThreadN)/(TileSizeDimM)); // workload per thread for Rhs expression
 
 // RoundUp function to make sure that the global threadId is divisable by local threadId
-static int RoundUp(int x, int y) {
+static Index RoundUp(Index x, Index y) {
   return ((((x) + (y) - 1) / (y))*(y));
 }
 
-template< typename Self, typename OutScalar, typename Index, typename ContractT, typename LeftNocontractT, typename RightNocontractT>
+template< typename Self, typename OutScalar, typename ContractT, typename LeftNocontractT, typename RightNocontractT>
   static void Run(const Self& self, OutScalar* buffer, Index M, Index N, Index K,
     ContractT m_k_strides, ContractT m_left_contracting_strides, ContractT m_right_contracting_strides,
     LeftNocontractT m_i_strides, RightNocontractT m_j_strides, LeftNocontractT m_left_nocontract_strides, RightNocontractT m_right_nocontract_strides){
```
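`RoundUp` above is the usual round-up-to-a-multiple idiom; per the source comment, it pads the global range so it is divisible by the work-group size. A standalone version, assuming `Index` is a signed integer type such as `std::ptrdiff_t`:

```cpp
#include <cassert>
#include <cstddef>

typedef std::ptrdiff_t Index;

// Smallest multiple of y that is >= x, for y > 0: pads a global ND-range
// so it divides evenly by the local work-group size.
static Index RoundUp(Index x, Index y) {
  return ((x + y - 1) / y) * y;
}

int main() {
  assert(RoundUp(100, 32) == 128);  // e.g. M = 100 padded to 32-wide tiles
  assert(RoundUp(128, 32) == 128);  // already a multiple: unchanged
  return 0;
}
```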
```diff
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h
index e2569e1bf..c3e095b8a 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h
@@ -352,7 +352,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
         auto global_range=cl::sycl::range<2>(GRange_x, GRange_y); // global range
         auto local_range=cl::sycl::range<2>(tileSize_x, tileSize_y); // local range
         InputLocalAcc local_acc(cl::sycl::range<1>(shared_mem), cgh);
-        const array<Index, 1> indices{m_indices[0]};
+        const array<Index, 1> indices{{m_indices[0]}};
         const array<Index, 1> kernel_dims{{m_kernelImpl.dimensions()[0]}};
         internal::IndexMapper<Index, InputDims, 1, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
         cgh.parallel_for(cl::sycl::nd_range<2>(global_range, local_range),
```
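The convolution fix adds the second pair of braces to `indices{{m_indices[0]}}`, matching `kernel_dims` on the next line. Eigen's `array` mirrors `std::array` here: an aggregate wrapping a raw C array, so a single brace pair relies on brace elision and clang warns under `-Wmissing-braces`. A reduced example with `std::array`:

```cpp
#include <array>

int main() {
  std::array<long, 1> a{{42}};  // inner braces initialise the nested raw array
  std::array<long, 1> b{42};    // legal via brace elision, but may warn
  return a[0] == b[0] ? 0 : 1;
}
```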
```diff
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
index 9858d0560..e209799bb 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
@@ -194,7 +194,7 @@ struct SyclDevice {
     auto s= sycl_queue().get_device().template get_info<cl::sycl::info::device::vendor>();
     std::transform(s.begin(), s.end(), s.begin(), ::tolower);
     if(sycl_queue().get_device().is_cpu()){ // intel doesnot allow to use max workgroup size
-      tileSize=std::min(static_cast<size_t>(256), static_cast<size_t>(tileSize));
+      tileSize=std::min(static_cast<Index>(256), static_cast<Index>(tileSize));
     }
     rng = n;
     if (rng==0) rng=static_cast<Index>(1);
@@ -211,10 +211,10 @@ struct SyclDevice {
   EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1, Index &tileSize0, Index &tileSize1, Index &rng0, Index &rng1, Index &GRange0, Index &GRange1) const {
     Index max_workgroup_Size = static_cast<Index>(maxSyclThreadsPerBlock());
     if(sycl_queue().get_device().is_cpu()){ // intel doesnot allow to use max workgroup size
-      max_workgroup_Size=std::min(static_cast<size_t>(256), static_cast<size_t>(max_workgroup_Size));
+      max_workgroup_Size=std::min(static_cast<Index>(256), static_cast<Index>(max_workgroup_Size));
     }
-    size_t pow_of_2 = static_cast<size_t>(std::log2(max_workgroup_Size));
-    tileSize1 =static_cast<Index>(std::pow(2, static_cast<size_t>(pow_of_2/2)));
+    Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
+    tileSize1 =static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2/2)));
     rng1=dim1;
     if (rng1==0 ) rng1=static_cast<Index>(1);
     GRange1=rng1;
@@ -241,10 +241,10 @@ struct SyclDevice {
   EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1,Index dim2, Index &tileSize0, Index &tileSize1, Index &tileSize2, Index &rng0, Index &rng1, Index &rng2, Index &GRange0, Index &GRange1, Index &GRange2) const {
     Index max_workgroup_Size = static_cast<Index>(maxSyclThreadsPerBlock());
     if(sycl_queue().get_device().is_cpu()){ // intel doesnot allow to use max workgroup size
-      max_workgroup_Size=std::min(static_cast<size_t>(256), static_cast<size_t>(max_workgroup_Size));
+      max_workgroup_Size=std::min(static_cast<Index>(256), static_cast<Index>(max_workgroup_Size));
     }
-    size_t pow_of_2 = static_cast<size_t>(std::log2(max_workgroup_Size));
-    tileSize2 =static_cast<Index>(std::pow(2, static_cast<size_t>(pow_of_2/3)));
+    Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
+    tileSize2 =static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2/3)));
     rng2=dim2;
     if (rng2==0 ) rng1=static_cast<Index>(1);
     GRange2=rng2;
@@ -253,8 +253,8 @@ struct SyclDevice {
       Index xMode = static_cast<Index>(GRange2 % tileSize2);
       if (xMode != 0) GRange2 += static_cast<Index>(tileSize2 - xMode);
     }
-    pow_of_2 = static_cast<size_t>(std::log2(static_cast<Index>(max_workgroup_Size/tileSize2)));
-    tileSize1 =static_cast<Index>(std::pow(2, static_cast<size_t>(pow_of_2/2)));
+    pow_of_2 = static_cast<Index>(std::log2(static_cast<Index>(max_workgroup_Size/tileSize2)));
+    tileSize1 =static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2/2)));
     rng1=dim1;
     if (rng1==0 ) rng1=static_cast<Index>(1);
     GRange1=rng1;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h
index 94692be56..cac785540 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h
@@ -50,10 +50,9 @@ template<typename Expr, typename FunctorExpr, typename TupleType > struct ExecEx
 /// creates the expression tree for the device with accessor to buffers;
 /// construct the kernel and submit it to the sycl queue.
 /// std::array does not have TotalSize. So I have to get the size through template specialisation.
-template<typename Index, typename Dimensions> struct DimensionSize{
-  static Index getDimSize(const Dimensions& dim){
+template<typename , typename Dimensions> struct DimensionSize{
+  static auto getDimSize(const Dimensions& dim)->decltype(dim.TotalSize()){
     return dim.TotalSize();
-  }
 };
 
 #define DIMSIZEMACRO(CVQual)\
```
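The `DimensionSize` change drops the now-unused `Index` template parameter (left unnamed so existing two-argument instantiations keep compiling) and deduces the return type of `getDimSize` from `dim.TotalSize()` itself, so no conversion can arise when `TotalSize()` does not return `Index`. A sketch with a hypothetical `DSizes`-like type:

```cpp
#include <cstddef>

struct DSizesLike {                 // stand-in for an Eigen dimensions type
  std::ptrdiff_t TotalSize() const { return 64; }
};

// First parameter is unnamed: kept only for arity, as in the patch.
template <typename, typename Dimensions>
struct DimensionSizeSketch {
  static auto getDimSize(const Dimensions& dim) -> decltype(dim.TotalSize()) {
    return dim.TotalSize();  // exact return type, no implicit conversion
  }
};

int main() {
  DSizesLike d;
  return DimensionSizeSketch<void, DSizesLike>::getDimSize(d) == 64 ? 0 : 1;
}
```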