From 17268b155d54422f1294130c0fb8c178757d911a Mon Sep 17 00:00:00 2001
From: Antonio Sanchez <cantonios@google.com>
Date: Tue, 17 Nov 2020 15:32:44 -0800
Subject: Add bit_cast for half/bfloat to/from uint16_t, fix TensorRandom

The existing `TensorRandom.h` implementation makes the assumption that
`half` (`bfloat16`) has a `uint16_t` member `x` (`value`), which is not
always true. This currently fails on arm64, where `x` has type `__fp16`.
Added `bit_cast` specializations to allow casting to/from `uint16_t`
for both `half` and `bfloat16`.  Also added tests in
`half_float`, `bfloat16_float`, and `cxx11_tensor_random` to catch
these errors in the future.
---
 unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h | 47 +++++++++++------------
 unsupported/test/cxx11_tensor_random.cpp          | 18 ++++++---
 2 files changed, 35 insertions(+), 30 deletions(-)

(limited to 'unsupported')
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h b/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h
index ea286fee1..13450e1a7 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h
@@ -91,24 +91,21 @@ T RandomToTypeUniform(uint64_t* state, uint64_t stream) {
 
 template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 Eigen::half RandomToTypeUniform<Eigen::half>(uint64_t* state, uint64_t stream) {
-  Eigen::half result;
-  // Generate 10 random bits for the mantissa
+  // Generate 10 random bits for the mantissa, merge with exponent field 15 so the bits encode a half in [1, 2).
   unsigned rnd = PCG_XSH_RS_generator(state, stream);
-  result.x = static_cast<uint16_t>(rnd & 0x3ffu);
-  // Set the exponent
-  result.x |= (static_cast<uint16_t>(15) << 10);
+  const uint16_t half_bits = static_cast<uint16_t>(rnd & 0x3ffu) | (static_cast<uint16_t>(15) << 10);
+  Eigen::half result = Eigen::numext::bit_cast<Eigen::half>(half_bits);
   // Return the final result
   return result - Eigen::half(1.0f);
 }
 
 template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 Eigen::bfloat16 RandomToTypeUniform<Eigen::bfloat16>(uint64_t* state, uint64_t stream) {
-  Eigen::bfloat16 result;
-  // Generate 7 random bits for the mantissa
+  // Assemble the raw bits of a bfloat16 in [1, 2) (exponent field 127), then shift it down by 1 below.
+  // Generate 7 random bits for the mantissa, merge with exponent.
   unsigned rnd = PCG_XSH_RS_generator(state, stream);
-  result.value = static_cast<uint16_t>(rnd & 0x7fu);
-  // Set the exponent
-  result.value |= (static_cast<uint16_t>(127) << 7);
+  const uint16_t bfloat16_bits = static_cast<uint16_t>(rnd & 0x7fu) | (static_cast<uint16_t>(127) << 7);
+  Eigen::bfloat16 result = Eigen::numext::bit_cast<Eigen::bfloat16>(bfloat16_bits);
   // Return the final result
   return result - Eigen::bfloat16(1.0f);
 }
@@ -169,19 +166,19 @@ template <typename T> class UniformRandomGenerator {
       uint64_t seed = 0) {
     m_state = PCG_XSH_RS_state(seed);
     #ifdef EIGEN_USE_SYCL
-    // In SYCL it is not possible to build PCG_XSH_RS_state in one step. 
+    // In SYCL it is not possible to build PCG_XSH_RS_state in one step.
     // Therefor, we need two step to initializate the m_state.
     // IN SYCL, the constructor of the functor is s called on the CPU
-    // and we get the clock seed here from the CPU. However, This seed is 
+    // and we get the clock seed here from the CPU. However, This seed is
     //the same for all the thread. As unlike CUDA, the thread.ID, BlockID, etc is not a global function.
     // and only  available on the Operator() function (which is called on the GPU).
-    // Thus for CUDA (((CLOCK  + global_thread_id)* 6364136223846793005ULL) + 0xda3e39cb94b95bdbULL) is passed to each thread 
-    // but for SYCL ((CLOCK * 6364136223846793005ULL) + 0xda3e39cb94b95bdbULL) is passed to each thread and each thread adds  
-    // the  (global_thread_id* 6364136223846793005ULL) for itself only once, in order to complete the construction 
-    // similar to CUDA Therefore, the thread Id injection is not available at this stage. 
-    //However when the operator() is called the thread ID will be avilable. So inside the opeator, 
-    // we add the thrreadID, BlockId,... (which is equivalent of i) 
-    //to the seed and construct the unique m_state per thead similar to cuda.  
+    // Thus for CUDA (((CLOCK  + global_thread_id)* 6364136223846793005ULL) + 0xda3e39cb94b95bdbULL) is passed to each thread
+    // but for SYCL ((CLOCK * 6364136223846793005ULL) + 0xda3e39cb94b95bdbULL) is passed to each thread and each thread adds
+    // the  (global_thread_id* 6364136223846793005ULL) for itself only once, in order to complete the construction
+    // similar to CUDA. Therefore, the thread ID injection is not available at this stage.
+    // However, when operator() is called the thread ID will be available. So inside the operator,
+    // we add the thread ID, block ID, ... (which is the equivalent of i)
+    // to the seed and construct the unique m_state per thread, similar to CUDA.
     m_exec_once =false;
    #endif
   }
@@ -282,16 +279,16 @@ template <typename T> class NormalRandomGenerator {
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator(uint64_t seed = 0) {
     m_state = PCG_XSH_RS_state(seed);
     #ifdef EIGEN_USE_SYCL
-    // In SYCL it is not possible to build PCG_XSH_RS_state in one step. 
+    // In SYCL it is not possible to build PCG_XSH_RS_state in one step.
     // Therefor, we need two steps to initializate the m_state.
     // IN SYCL, the constructor of the functor is s called on the CPU
-    // and we get the clock seed here from the CPU. However, This seed is 
+    // and we get the clock seed here from the CPU. However, This seed is
     //the same for all the thread. As unlike CUDA, the thread.ID, BlockID, etc is not a global function.
     // and only  available on the Operator() function (which is called on the GPU).
-    // Therefore, the thread Id injection is not available at this stage. However when the operator() 
-    //is called the thread ID will be avilable. So inside the opeator, 
-    // we add the thrreadID, BlockId,... (which is equivalent of i) 
-    //to the seed and construct the unique m_state per thead similar to cuda.  
+    // Therefore, the thread Id injection is not available at this stage. However when the operator()
+    // is called the thread ID will be available. So inside the operator,
+    // we add the thread ID, block ID, ... (which is the equivalent of i)
+    // to the seed and construct the unique m_state per thread, similar to CUDA.
     m_exec_once =false;
    #endif
   }
diff --git a/unsupported/test/cxx11_tensor_random.cpp b/unsupported/test/cxx11_tensor_random.cpp
index 4740d5811..b9d4c5584 100644
--- a/unsupported/test/cxx11_tensor_random.cpp
+++ b/unsupported/test/cxx11_tensor_random.cpp
@@ -11,9 +11,10 @@
 
 #include <Eigen/CXX11/Tensor>
 
+template<typename Scalar>
 static void test_default()
 {
-  Tensor<float, 1> vec(6);
+  Tensor<Scalar, 1> vec(6);
   vec.setRandom();
 
   // Fixme: we should check that the generated numbers follow a uniform
@@ -23,10 +24,11 @@ static void test_default()
   }
 }
 
+template<typename Scalar>
 static void test_normal()
 {
-  Tensor<float, 1> vec(6);
-  vec.setRandom<Eigen::internal::NormalRandomGenerator<float>>();
+  Tensor<Scalar, 1> vec(6);
+  vec.template setRandom<Eigen::internal::NormalRandomGenerator<Scalar>>();
 
   // Fixme: we should check that the generated numbers follow a gaussian
   // distribution instead.
@@ -72,7 +74,13 @@ static void test_custom()
 
 EIGEN_DECLARE_TEST(cxx11_tensor_random)
 {
-  CALL_SUBTEST(test_default());
-  CALL_SUBTEST(test_normal());
+  CALL_SUBTEST((test_default<float>()));
+  CALL_SUBTEST((test_normal<float>()));
+  CALL_SUBTEST((test_default<double>()));
+  CALL_SUBTEST((test_normal<double>()));
+  CALL_SUBTEST((test_default<Eigen::half>()));
+  CALL_SUBTEST((test_normal<Eigen::half>()));
+  CALL_SUBTEST((test_default<Eigen::bfloat16>()));
+  CALL_SUBTEST((test_normal<Eigen::bfloat16>()));
   CALL_SUBTEST(test_custom());
 }
-- 
cgit v1.2.3