From 7d08fa805a38f9ebb9e0e487c4e2d23d32a0fcde Mon Sep 17 00:00:00 2001
From: Mehdi Goli <mehdi.goli@codeplay.com>
Date: Fri, 28 Jun 2019 10:08:23 +0100
Subject: [SYCL] This PR adds the minimum modifications to the Eigen
 unsupported module required to run it on devices supporting SYCL. *
 Abstracting the pointer type so that both SYCL memory and pointer can be
 captured. * Converting SYCL virtual pointer to SYCL device memory in Eigen
 evaluator class. * Binding SYCL placeholder accessor to command group handler
 by using bind method in Eigen evaluator node. * Adding SYCL macro for
 controlling loop unrolling. * Modifying the TensorDeviceSycl.h and SYCL
 executor method to adopt the above changes.

---
 unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h | 74 +++++++++++++++++++++++
 1 file changed, 74 insertions(+)

(limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h')
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h b/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h
index 787cbd031..2be4f9cc5 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h
@@ -2,6 +2,7 @@
 // for linear algebra.
 //
 // Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+// Copyright (C) 2018 Mehdi Goli <eigen@codeplay.com> Codeplay Software Ltd.
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -44,6 +45,7 @@ EIGEN_DEVICE_FUNC uint64_t get_random_seed() {
   uint64_t rnd = ::random() ^ mach_absolute_time();
   return rnd;
 
+
 #else
   // Augment the current time with pseudo random number generation
   // to ensure that we get different seeds if we try to generate seeds
@@ -147,14 +149,41 @@ template <typename T> class UniformRandomGenerator {
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator(
       uint64_t seed = 0) {
     m_state = PCG_XSH_RS_state(seed);
+    #ifdef EIGEN_USE_SYCL
+    // In SYCL it is not possible to build PCG_XSH_RS_state in one step. 
+    // Therefor, we need two step to initializate the m_state.
+    // IN SYCL, the constructor of the functor is s called on the CPU
+    // and we get the clock seed here from the CPU. However, This seed is 
+    //the same for all the thread. As unlike CUDA, the thread.ID, BlockID, etc is not a global function.
+    // and only  available on the Operator() function (which is called on the GPU).
+    // Thus for CUDA (((CLOCK  + global_thread_id)* 6364136223846793005ULL) + 0xda3e39cb94b95bdbULL) is passed to each thread 
+    // but for SYCL ((CLOCK * 6364136223846793005ULL) + 0xda3e39cb94b95bdbULL) is passed to each thread and each thread adds  
+    // the  (global_thread_id* 6364136223846793005ULL) for itself only once, in order to complete the construction 
+    // similar to CUDA Therefore, the thread Id injection is not available at this stage. 
+    //However when the operator() is called the thread ID will be avilable. So inside the opeator, 
+    // we add the thrreadID, BlockId,... (which is equivalent of i) 
+    //to the seed and construct the unique m_state per thead similar to cuda.  
+    m_exec_once =false;
+   #endif
   }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator(
       const UniformRandomGenerator& other) {
     m_state = other.m_state;
+    #ifdef EIGEN_USE_SYCL
+     m_exec_once =other.m_exec_once;
+    #endif
   }
 
   template<typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   T operator()(Index i) const {
+    #ifdef EIGEN_USE_SYCL
+      if(!m_exec_once) {
+      // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread
+      // The (i * 6364136223846793005ULL) is the remaining part of the PCG_XSH_RS_state on the GPU side
+       m_state += (i * 6364136223846793005ULL);
+       m_exec_once =true;
+      }
+    #endif
     T result = RandomToTypeUniform<T>(&m_state, i);
     return result;
   }
@@ -163,6 +192,14 @@ template <typename T> class UniformRandomGenerator {
   Packet packetOp(Index i) const {
     const int packetSize = internal::unpacket_traits<Packet>::size;
     EIGEN_ALIGN_MAX T values[packetSize];
+      #ifdef EIGEN_USE_SYCL
+      if(!m_exec_once) {
+      // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread
+       m_state += (i * 6364136223846793005ULL);
+       m_exec_once =true;
+      }
+    #endif
+    EIGEN_UNROLL_LOOP
     for (int j = 0; j < packetSize; ++j) {
       values[j] = RandomToTypeUniform<T>(&m_state, i);
     }
@@ -171,6 +208,9 @@ template <typename T> class UniformRandomGenerator {
 
  private:
   mutable uint64_t m_state;
+  #ifdef EIGEN_USE_SYCL
+  mutable bool m_exec_once;
+  #endif
 };
 
 template <typename Scalar>
@@ -222,14 +262,37 @@ template <typename T> class NormalRandomGenerator {
   // Uses the given "seed" if non-zero, otherwise uses a random seed.
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator(uint64_t seed = 0) {
     m_state = PCG_XSH_RS_state(seed);
+    #ifdef EIGEN_USE_SYCL
+    // In SYCL it is not possible to build PCG_XSH_RS_state in one step. 
+    // Therefor, we need two steps to initializate the m_state.
+    // IN SYCL, the constructor of the functor is s called on the CPU
+    // and we get the clock seed here from the CPU. However, This seed is 
+    //the same for all the thread. As unlike CUDA, the thread.ID, BlockID, etc is not a global function.
+    // and only  available on the Operator() function (which is called on the GPU).
+    // Therefore, the thread Id injection is not available at this stage. However when the operator() 
+    //is called the thread ID will be avilable. So inside the opeator, 
+    // we add the thrreadID, BlockId,... (which is equivalent of i) 
+    //to the seed and construct the unique m_state per thead similar to cuda.  
+    m_exec_once =false;
+   #endif
   }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator(
       const NormalRandomGenerator& other) {
     m_state = other.m_state;
+#ifdef EIGEN_USE_SYCL
+    m_exec_once=other.m_exec_once;
+#endif
   }
 
  template<typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   T operator()(Index i) const {
+    #ifdef EIGEN_USE_SYCL
+    if(!m_exec_once) {
+      // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread
+      m_state += (i * 6364136223846793005ULL);
+      m_exec_once =true;
+    }
+    #endif
     T result = RandomToTypeNormal<T>(&m_state, i);
     return result;
   }
@@ -238,6 +301,14 @@ template <typename T> class NormalRandomGenerator {
   Packet packetOp(Index i) const {
     const int packetSize = internal::unpacket_traits<Packet>::size;
     EIGEN_ALIGN_MAX T values[packetSize];
+    #ifdef EIGEN_USE_SYCL
+    if(!m_exec_once) {
+      // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread
+      m_state += (i * 6364136223846793005ULL);
+      m_exec_once =true;
+    }
+    #endif
+    EIGEN_UNROLL_LOOP
     for (int j = 0; j < packetSize; ++j) {
       values[j] = RandomToTypeNormal<T>(&m_state, i);
     }
@@ -246,6 +317,9 @@ template <typename T> class NormalRandomGenerator {
 
  private:
   mutable uint64_t m_state;
+   #ifdef EIGEN_USE_SYCL
+  mutable bool m_exec_once;
+  #endif
 };
 
 
-- 
cgit v1.2.3