aboutsummaryrefslogtreecommitdiffhomepage
path: root/unsupported/Eigen
diff options
context:
space:
mode:
authorGravatar Antonio Sanchez <cantonios@google.com>2020-11-17 15:32:44 -0800
committerGravatar Antonio Sánchez <cantonios@google.com>2020-11-18 20:32:35 +0000
commit17268b155d54422f1294130c0fb8c178757d911a (patch)
tree2be3d541729f3e9be6a180a58270bae10156df4f /unsupported/Eigen
parent41d5d5334b8a4e364dfd88dcd91f6cd38834b8ed (diff)
Add bit_cast for half/bfloat to/from uint16_t, fix TensorRandom
The existing `TensorRandom.h` implementation makes the assumption that `half` (`bfloat16`) has a `uint16_t` member `x` (`value`), which is not always true. This currently fails on arm64, where `x` has type `__fp16`. Added `bit_cast` specializations to allow casting to/from `uint16_t` for both `half` and `bfloat16`. Also added tests in `half_float`, `bfloat16_float`, and `cxx11_tensor_random` to catch these errors in the future.
Diffstat (limited to 'unsupported/Eigen')
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h47
1 files changed, 22 insertions, 25 deletions
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h b/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h
index ea286fee1..13450e1a7 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h
@@ -91,24 +91,21 @@ T RandomToTypeUniform(uint64_t* state, uint64_t stream) {
template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
Eigen::half RandomToTypeUniform<Eigen::half>(uint64_t* state, uint64_t stream) {
- Eigen::half result;
- // Generate 10 random bits for the mantissa
+ // Generate 10 random bits for the mantissa, merge with exponent.
unsigned rnd = PCG_XSH_RS_generator(state, stream);
- result.x = static_cast<uint16_t>(rnd & 0x3ffu);
- // Set the exponent
- result.x |= (static_cast<uint16_t>(15) << 10);
+ const uint16_t half_bits = static_cast<uint16_t>(rnd & 0x3ffu) | (static_cast<uint16_t>(15) << 10);
+ Eigen::half result = Eigen::numext::bit_cast<Eigen::half>(half_bits);
// Return the final result
return result - Eigen::half(1.0f);
}
template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
Eigen::bfloat16 RandomToTypeUniform<Eigen::bfloat16>(uint64_t* state, uint64_t stream) {
- Eigen::bfloat16 result;
- // Generate 7 random bits for the mantissa
+
+ // Generate 7 random bits for the mantissa, merge with exponent.
unsigned rnd = PCG_XSH_RS_generator(state, stream);
- result.value = static_cast<uint16_t>(rnd & 0x7fu);
- // Set the exponent
- result.value |= (static_cast<uint16_t>(127) << 7);
+ const uint16_t half_bits = static_cast<uint16_t>(rnd & 0x7fu) | (static_cast<uint16_t>(127) << 7);
+ Eigen::bfloat16 result = Eigen::numext::bit_cast<Eigen::bfloat16>(half_bits);
// Return the final result
return result - Eigen::bfloat16(1.0f);
}
@@ -169,19 +166,19 @@ template <typename T> class UniformRandomGenerator {
uint64_t seed = 0) {
m_state = PCG_XSH_RS_state(seed);
#ifdef EIGEN_USE_SYCL
- // In SYCL it is not possible to build PCG_XSH_RS_state in one step.
+ // In SYCL it is not possible to build PCG_XSH_RS_state in one step.
// Therefor, we need two step to initializate the m_state.
// IN SYCL, the constructor of the functor is s called on the CPU
- // and we get the clock seed here from the CPU. However, This seed is
+ // and we get the clock seed here from the CPU. However, This seed is
//the same for all the thread. As unlike CUDA, the thread.ID, BlockID, etc is not a global function.
// and only available on the Operator() function (which is called on the GPU).
- // Thus for CUDA (((CLOCK + global_thread_id)* 6364136223846793005ULL) + 0xda3e39cb94b95bdbULL) is passed to each thread
- // but for SYCL ((CLOCK * 6364136223846793005ULL) + 0xda3e39cb94b95bdbULL) is passed to each thread and each thread adds
- // the (global_thread_id* 6364136223846793005ULL) for itself only once, in order to complete the construction
- // similar to CUDA Therefore, the thread Id injection is not available at this stage.
- //However when the operator() is called the thread ID will be avilable. So inside the opeator,
- // we add the thrreadID, BlockId,... (which is equivalent of i)
- //to the seed and construct the unique m_state per thead similar to cuda.
+ // Thus for CUDA (((CLOCK + global_thread_id)* 6364136223846793005ULL) + 0xda3e39cb94b95bdbULL) is passed to each thread
+ // but for SYCL ((CLOCK * 6364136223846793005ULL) + 0xda3e39cb94b95bdbULL) is passed to each thread and each thread adds
+ // the (global_thread_id* 6364136223846793005ULL) for itself only once, in order to complete the construction
+ // similar to CUDA Therefore, the thread Id injection is not available at this stage.
+ //However when the operator() is called the thread ID will be avilable. So inside the opeator,
+ // we add the thrreadID, BlockId,... (which is equivalent of i)
+ //to the seed and construct the unique m_state per thead similar to cuda.
m_exec_once =false;
#endif
}
@@ -282,16 +279,16 @@ template <typename T> class NormalRandomGenerator {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator(uint64_t seed = 0) {
m_state = PCG_XSH_RS_state(seed);
#ifdef EIGEN_USE_SYCL
- // In SYCL it is not possible to build PCG_XSH_RS_state in one step.
+ // In SYCL it is not possible to build PCG_XSH_RS_state in one step.
// Therefor, we need two steps to initializate the m_state.
// IN SYCL, the constructor of the functor is s called on the CPU
- // and we get the clock seed here from the CPU. However, This seed is
+ // and we get the clock seed here from the CPU. However, This seed is
//the same for all the thread. As unlike CUDA, the thread.ID, BlockID, etc is not a global function.
// and only available on the Operator() function (which is called on the GPU).
- // Therefore, the thread Id injection is not available at this stage. However when the operator()
- //is called the thread ID will be avilable. So inside the opeator,
- // we add the thrreadID, BlockId,... (which is equivalent of i)
- //to the seed and construct the unique m_state per thead similar to cuda.
+ // Therefore, the thread Id injection is not available at this stage. However when the operator()
+ //is called the thread ID will be avilable. So inside the opeator,
+ // we add the thrreadID, BlockId,... (which is equivalent of i)
+ //to the seed and construct the unique m_state per thead similar to cuda.
m_exec_once =false;
#endif
}