Applying Benoit's comment. Embedding synchronisation inside device memcpy so there is no need to externally call synchronise() for device memcopy.

author: Mehdi Goli <mehdi.goli@codeplay.com> 2017-01-18 10:45:28 +0000
committer: Mehdi Goli <mehdi.goli@codeplay.com> 2017-01-18 10:45:28 +0000
commit: c6f7b338343ead9617558857c91fd3e03e347c3f (patch)
tree: c9c918874106a910eca40518608b27e42d8fa57e /unsupported/Eigen/CXX11
parent: e46e7223817cfd982edec6d8e25c77e8e2493d78 (diff)
4 files changed, 13 insertions, 27 deletions
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
index 268d9d148..8f8d1caad 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
@@ -234,7 +234,7 @@ struct SyclDevice {
       auto dst_acc =it2->second.template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer>(cgh);
       cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor<T>(src_acc, dst_acc, rng, i, offset));
     });
-    asynchronousExec();
+    synchronize();
   }
 
   /// The memcpyHostToDevice is used to copy the device only pointer to a host pointer. Using the device
@@ -265,7 +265,7 @@ struct SyclDevice {
       auto dst_acc =dest_buf.template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer>(cgh);
       cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor<T>(src_acc, dst_acc, rng, 0, offset));
     });
-    asynchronousExec();
+    synchronize();
   }
   /// returning the sycl queue
   EIGEN_STRONG_INLINE cl::sycl::queue& sycl_queue() const { return m_queue_stream->m_queue;}
@@ -308,7 +308,7 @@ struct SyclDevice {
   }
 
   EIGEN_STRONG_INLINE void asynchronousExec() const {
-    ///FIXEDME:: currently there is a race condition regarding the asynch scheduler.    
+    ///FIXEDME:: currently there is a race condition regarding the asynch scheduler.
     //sycl_queue().throw_asynchronous();// does not pass. Temporarily disabled
     sycl_queue().wait_and_throw(); //pass
 
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
index 822e22c2d..abe85c860 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
@@ -143,12 +143,12 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device>
     return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
   }
 
-  CoeffReturnType* data() const { return m_buffer; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const { return m_buffer; }
 
   /// required by sycl in order to extract the sycl accessor
-  const TensorEvaluator<ArgType, Device>& impl() { return m_impl; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() { return m_impl; }
   /// used by sycl in order to build the sycl buffer
-  const Device& device() const{return m_device;}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const{return m_device;}
  private:
   TensorEvaluator<ArgType, Device> m_impl;
   const ArgType m_op;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
index dbe11c7af..6ddd2ca18 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
@@ -736,22 +736,12 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices,
     for (size_t i = 0; i < internal::array_size<Dimensions>::value; ++i) {
       eigen_assert(m_strides[i] != 0 && "0 stride is invalid");
       if(m_strides[i]>0){
-        #ifndef __SYCL_DEVICE_ONLY__
         startIndicesClamped[i] = clamp(op.startIndices()[i], 0, m_impl.dimensions()[i]);
         stopIndicesClamped[i] = clamp(op.stopIndices()[i], 0, m_impl.dimensions()[i]);
-        #else
-        startIndicesClamped[i] = cl::sycl::clamp(static_cast<Index>(op.startIndices()[i]), static_cast<Index>(0), static_cast<Index>(m_impl.dimensions()[i]));
-        stopIndicesClamped[i] = cl::sycl::clamp(static_cast<Index>(op.stopIndices()[i]), static_cast<Index>(0), static_cast<Index>(m_impl.dimensions()[i]));
-        #endif
       }else{
       /* implies m_strides[i]<0 by assert */
-      #ifndef __SYCL_DEVICE_ONLY__
         startIndicesClamped[i] = clamp(op.startIndices()[i], -1, m_impl.dimensions()[i] - 1);
         stopIndicesClamped[i] = clamp(op.stopIndices()[i], -1, m_impl.dimensions()[i] - 1);
-        #else
-        startIndicesClamped[i] = cl::sycl::clamp(static_cast<Index>(op.startIndices()[i]), static_cast<Index>(-1), static_cast<Index>(m_impl.dimensions()[i] - 1));
-        stopIndicesClamped[i] = cl::sycl::clamp(static_cast<Index>(op.stopIndices()[i]), static_cast<Index>(-1), static_cast<Index>(m_impl.dimensions()[i] - 1));
-        #endif
       }
       m_startIndices[i] = startIndicesClamped[i];
     }
@@ -867,7 +857,11 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices,
   }
 
   static EIGEN_STRONG_INLINE Index clamp(Index value, Index min, Index max) {
+#ifndef __SYCL_DEVICE_ONLY__
     return numext::maxi(min, numext::mini(max,value));
+#else
+    return cl::sycl::clamp(value, min, max);
+#endif
   }
 
   array<Index, NumDims> m_outputStrides;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
index 93615e5c2..2237140e7 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
@@ -121,11 +121,7 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
   {
     m_dimensions = m_impl.dimensions();
     for (int i = 0; i < NumDims; ++i) {
-#ifndef __SYCL_DEVICE_ONLY__
-      m_dimensions[i] = ceilf(static_cast<float>(m_dimensions[i]) / op.strides()[i]);
-#else
-      m_dimensions[i] = cl::sycl::ceil(static_cast<float>(m_dimensions[i]) / op.strides()[i]);
-#endif
+      m_dimensions[i] =Eigen::numext::ceil(static_cast<float>(m_dimensions[i]) / op.strides()[i]);
     }
 
     const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
@@ -233,8 +229,6 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
   /// required by sycl in order to extract the accessor
   Strides functor() const { return m_strides; }
 
-
-
  protected:
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const
   {
@@ -264,7 +258,6 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
   const Strides m_strides;
 };
 
-
 // Eval as lvalue
 template<typename Strides, typename ArgType, typename Device>
 struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device>
@@ -299,10 +292,9 @@ struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device>
   }
 
   /// required by sycl in order to extract the accessor
-  const TensorEvaluator<ArgType, Device>& impl() const { return this->m_impl; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const { return this->m_impl; }
   /// required by sycl in order to extract the accessor
-  Strides functor() const { return this->m_strides; }
-
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Strides functor() const { return this->m_strides; }
 
   template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   void writePacket(Index index, const PacketReturnType& x)
author	Mehdi Goli <mehdi.goli@codeplay.com>	2017-01-18 10:45:28 +0000
committer	Mehdi Goli <mehdi.goli@codeplay.com>	2017-01-18 10:45:28 +0000
commit	c6f7b338343ead9617558857c91fd3e03e347c3f (patch)
tree	c9c918874106a910eca40518608b27e42d8fa57e /unsupported/Eigen/CXX11
parent	e46e7223817cfd982edec6d8e25c77e8e2493d78 (diff)