// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
// Copyright (C) 2018 Mehdi Goli <eigen@codeplay.com> Codeplay Software Ltd.
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H
#define EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H

namespace Eigen {
namespace internal {

namespace {

EIGEN_DEVICE_FUNC uint64_t get_random_seed() {
#if defined(EIGEN_GPU_COMPILE_PHASE)
  // We don't support 3d kernels since we currently only use 1d and
  // 2d kernels.
  gpu_assert(threadIdx.z == 0);
  return blockIdx.x * blockDim.x + threadIdx.x 
         + gridDim.x * blockDim.x * (blockIdx.y * blockDim.y + threadIdx.y);
#else
  // Rely on Eigen's random implementation.
  return random<uint64_t>();
#endif
}

static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE unsigned PCG_XSH_RS_generator(uint64_t* state, uint64_t stream) {
  // TODO: Unify with the implementation in the non-blocking thread pool.
  uint64_t current = *state;
  // Update the internal state
  *state = current * 6364136223846793005ULL + (stream << 1 | 1);
  // Generate the random output (using the PCG-XSH-RS scheme)
  return static_cast<unsigned>((current ^ (current >> 22)) >> (22 + (current >> 61)));
}

static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE uint64_t PCG_XSH_RS_state(uint64_t seed) {
  seed = seed ? seed : get_random_seed();
  return seed * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL;
}

}  // namespace


template <typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
T RandomToTypeUniform(uint64_t* state, uint64_t stream) {
  unsigned rnd = PCG_XSH_RS_generator(state, stream);
  return static_cast<T>(rnd);
}


template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
Eigen::half RandomToTypeUniform<Eigen::half>(uint64_t* state, uint64_t stream) {
  // Generate 10 random bits for the mantissa, merge with exponent.
  unsigned rnd = PCG_XSH_RS_generator(state, stream);
  const uint16_t half_bits = static_cast<uint16_t>(rnd & 0x3ffu) | (static_cast<uint16_t>(15) << 10);
  Eigen::half result = Eigen::numext::bit_cast<Eigen::half>(half_bits);
  // Return the final result
  return result - Eigen::half(1.0f);
}

template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
Eigen::bfloat16 RandomToTypeUniform<Eigen::bfloat16>(uint64_t* state, uint64_t stream) {

  // Generate 7 random bits for the mantissa, merge with exponent.
  unsigned rnd = PCG_XSH_RS_generator(state, stream);
  const uint16_t half_bits = static_cast<uint16_t>(rnd & 0x7fu) | (static_cast<uint16_t>(127) << 7);
  Eigen::bfloat16 result = Eigen::numext::bit_cast<Eigen::bfloat16>(half_bits);
  // Return the final result
  return result - Eigen::bfloat16(1.0f);
}

template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
float RandomToTypeUniform<float>(uint64_t* state, uint64_t stream) {
  typedef union {
    uint32_t raw;
    float fp;
  } internal;
  internal result;
  // Generate 23 random bits for the mantissa
  const unsigned rnd = PCG_XSH_RS_generator(state, stream);
  result.raw = rnd & 0x7fffffu;
  // Set the exponent
  result.raw |= (static_cast<uint32_t>(127) << 23);
  // Return the final result
  return result.fp - 1.0f;
}

template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
double RandomToTypeUniform<double>(uint64_t* state, uint64_t stream) {
  typedef union {
    uint64_t raw;
    double dp;
  } internal;
  internal result;
  result.raw = 0;
  // Generate 52 random bits for the mantissa
  // First generate the upper 20 bits
  unsigned rnd1 = PCG_XSH_RS_generator(state, stream) & 0xfffffu;
  // Then generate the lower 32 bits
  unsigned rnd2 = PCG_XSH_RS_generator(state, stream);
  result.raw = (static_cast<uint64_t>(rnd1) << 32) | rnd2;
  // Set the exponent
  result.raw |= (static_cast<uint64_t>(1023) << 52);
  // Return the final result
  return result.dp - 1.0;
}
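
// Note on the bit trick used in the half/bfloat16/float/double specializations
// above: with the exponent field fixed to the bias (15 for half, 127 for float
// and bfloat16, 1023 for double) and a random mantissa, the assembled value is
// uniformly distributed in [1, 2); subtracting 1 then yields a uniform value
// in [0, 1). For example, float mantissa bits 0x400000 assemble to 1.5f, which
// maps to 0.5f.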

template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
std::complex<float> RandomToTypeUniform<std::complex<float> >(uint64_t* state, uint64_t stream) {
  return std::complex<float>(RandomToTypeUniform<float>(state, stream),
                             RandomToTypeUniform<float>(state, stream));
}
template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
std::complex<double> RandomToTypeUniform<std::complex<double> >(uint64_t* state, uint64_t stream) {
  return std::complex<double>(RandomToTypeUniform<double>(state, stream),
                              RandomToTypeUniform<double>(state, stream));
}

template <typename T> class UniformRandomGenerator {
 public:
  static const bool PacketAccess = true;

  // Uses the given "seed" if non-zero, otherwise uses a random seed.
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator(
      uint64_t seed = 0) {
    m_state = PCG_XSH_RS_state(seed);
    #ifdef EIGEN_USE_SYCL
    // In SYCL the PCG_XSH_RS state cannot be built in one step, so m_state is
    // initialized in two stages. The functor's constructor runs on the CPU,
    // where only the clock seed is available: unlike CUDA, the thread id,
    // block id, etc. are not globally accessible here and only become
    // available inside operator(), which runs on the device.
    // Thus for CUDA, ((CLOCK + global_thread_id) * 6364136223846793005ULL) + 0xda3e39cb94b95bdbULL
    // is passed to each thread, whereas for SYCL,
    // (CLOCK * 6364136223846793005ULL) + 0xda3e39cb94b95bdbULL is passed to
    // each thread, and each thread adds (global_thread_id * 6364136223846793005ULL)
    // once, inside operator() (where the thread id is equivalent to the index i),
    // to complete the construction and obtain a unique m_state per thread, as
    // in CUDA.
    m_exec_once = false;
    #endif
  }
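  // Note that the two-stage construction is exact:
  //   (CLOCK + i) * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL
  //     == (CLOCK * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL)
  //        + i * 6364136223846793005ULL   (mod 2^64),
  // so adding i * 6364136223846793005ULL inside operator() yields the same
  // state as the one-step (CLOCK + i) construction.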
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator(
      const UniformRandomGenerator& other) {
    m_state = other.m_state;
    #ifdef EIGEN_USE_SYCL
    m_exec_once = other.m_exec_once;
    #endif
  }

  template<typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  T operator()(Index i) const {
    #ifdef EIGEN_USE_SYCL
    if (!m_exec_once) {
      // Second stage: add the thread id to the CPU clock seed to build a
      // unique seed per thread. (i * 6364136223846793005ULL) is the remaining
      // part of the PCG_XSH_RS_state computation on the device side.
      m_state += (i * 6364136223846793005ULL);
      m_exec_once = true;
    }
    #endif
    T result = RandomToTypeUniform<T>(&m_state, i);
    return result;
  }

  template<typename Packet, typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  Packet packetOp(Index i) const {
    const int packetSize = internal::unpacket_traits<Packet>::size;
    EIGEN_ALIGN_MAX T values[packetSize];
    #ifdef EIGEN_USE_SYCL
    if (!m_exec_once) {
      // Second stage: add the thread id to the CPU clock seed to build a
      // unique seed per thread.
      m_state += (i * 6364136223846793005ULL);
      m_exec_once = true;
    }
    #endif
    EIGEN_UNROLL_LOOP
    for (int j = 0; j < packetSize; ++j) {
      values[j] = RandomToTypeUniform<T>(&m_state, i);
    }
    return internal::pload<Packet>(values);
  }

 private:
  mutable uint64_t m_state;
  #ifdef EIGEN_USE_SYCL
  mutable bool m_exec_once;
  #endif
};

template <typename Scalar>
struct functor_traits<UniformRandomGenerator<Scalar> > {
  enum {
    // Rough estimate for floating point, multiplied by ceil(sizeof(T) / sizeof(float)).
    Cost = 12 * NumTraits<Scalar>::AddCost *
           ((sizeof(Scalar) + sizeof(float) - 1) / sizeof(float)),
    PacketAccess = UniformRandomGenerator<Scalar>::PacketAccess
  };
};
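
// Minimal usage sketch (assumes the Tensor module's setRandom<Generator>()
// entry point; shown for illustration only):
//
//   Eigen::Tensor<float, 2> t(64, 64);
//   t.setRandom<Eigen::internal::UniformRandomGenerator<float> >();
//   // Each entry of t is now uniformly distributed in [0, 1).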



template <typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
T RandomToTypeNormal(uint64_t* state, uint64_t stream) {
  // Use the ratio-of-uniforms method to generate numbers following a normal
  // distribution. See for example Numerical Recipes chapter 7.3.9 for the
  // details.
  T u, v, q;
  do {
    u = RandomToTypeUniform<T>(state, stream);
    v = T(1.7156) * (RandomToTypeUniform<T>(state, stream) - T(0.5));
    const T x = u - T(0.449871);
    const T y = numext::abs(v) + T(0.386595);
    q = x*x + y * (T(0.196)*y - T(0.25472)*x);
  } while (q > T(0.27597) &&
           (q > T(0.27846) || v*v > T(-4) * numext::log(u) * u*u));

  return v/u;
}
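
// The constants in RandomToTypeNormal implement the quick accept/reject bounds
// of Leva's ratio-of-uniforms method (see Numerical Recipes 7.3.9): candidates
// with q <= 0.27597 are accepted and those with q > 0.27846 rejected without
// evaluating the log; only the thin shell in between requires the exact test
// v*v <= -4 * u*u * log(u).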

template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
std::complex<float> RandomToTypeNormal<std::complex<float> >(uint64_t* state, uint64_t stream) {
  return std::complex<float>(RandomToTypeNormal<float>(state, stream),
                             RandomToTypeNormal<float>(state, stream));
}
template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
std::complex<double> RandomToTypeNormal<std::complex<double> >(uint64_t* state, uint64_t stream) {
  return std::complex<double>(RandomToTypeNormal<double>(state, stream),
                              RandomToTypeNormal<double>(state, stream));
}


template <typename T> class NormalRandomGenerator {
 public:
  static const bool PacketAccess = true;

  // Uses the given "seed" if non-zero, otherwise uses a random seed.
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator(uint64_t seed = 0) {
    m_state = PCG_XSH_RS_state(seed);
    #ifdef EIGEN_USE_SYCL
    // In SYCL the PCG_XSH_RS state cannot be built in one step, so m_state is
    // initialized in two stages. The functor's constructor runs on the CPU,
    // where only the clock seed is available: unlike CUDA, the thread id,
    // block id, etc. are not globally accessible here and only become
    // available inside operator(), which runs on the device. There we add the
    // thread id (equivalent to the index i) to the seed and construct the
    // unique m_state per thread, as in CUDA.
    m_exec_once = false;
    #endif
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator(
      const NormalRandomGenerator& other) {
    m_state = other.m_state;
#ifdef EIGEN_USE_SYCL
    m_exec_once = other.m_exec_once;
#endif
  }

  template<typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  T operator()(Index i) const {
    #ifdef EIGEN_USE_SYCL
    if (!m_exec_once) {
      // Second stage: add the thread id to the CPU clock seed to build a
      // unique seed per thread.
      m_state += (i * 6364136223846793005ULL);
      m_exec_once = true;
    }
    #endif
    T result = RandomToTypeNormal<T>(&m_state, i);
    return result;
  }

  template<typename Packet, typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  Packet packetOp(Index i) const {
    const int packetSize = internal::unpacket_traits<Packet>::size;
    EIGEN_ALIGN_MAX T values[packetSize];
    #ifdef EIGEN_USE_SYCL
    if (!m_exec_once) {
      // Second stage: add the thread id to the CPU clock seed to build a
      // unique seed per thread.
      m_state += (i * 6364136223846793005ULL);
      m_exec_once = true;
    }
    #endif
    EIGEN_UNROLL_LOOP
    for (int j = 0; j < packetSize; ++j) {
      values[j] = RandomToTypeNormal<T>(&m_state, i);
    }
    return internal::pload<Packet>(values);
  }

 private:
  mutable uint64_t m_state;
  #ifdef EIGEN_USE_SYCL
  mutable bool m_exec_once;
  #endif
};


template <typename Scalar>
struct functor_traits<NormalRandomGenerator<Scalar> > {
  enum {
    // On average we need to generate about 3 uniform random numbers,
    // plus roughly 15 multiplies, 8 additions and 1.5 log evaluations.
    Cost = 3 * functor_traits<UniformRandomGenerator<Scalar> >::Cost +
           15 * NumTraits<Scalar>::AddCost + 8 * NumTraits<Scalar>::AddCost +
           3 * functor_traits<scalar_log_op<Scalar> >::Cost / 2,
    PacketAccess = NormalRandomGenerator<Scalar>::PacketAccess
  };
};
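
// As with UniformRandomGenerator above, a minimal usage sketch (assuming the
// Tensor module's setRandom<Generator>() entry point):
//
//   Eigen::Tensor<float, 2> t(64, 64);
//   t.setRandom<Eigen::internal::NormalRandomGenerator<float> >();
//   // Entries of t are now drawn from a standard normal distribution.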


} // end namespace internal
} // end namespace Eigen

#endif // EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H