diff options
author | Antonio Sanchez <cantonios@google.com> | 2020-11-17 15:32:44 -0800 |
---|---|---|
committer | Antonio Sánchez <cantonios@google.com> | 2020-11-18 20:32:35 +0000 |
commit | 17268b155d54422f1294130c0fb8c178757d911a (patch) | |
tree | 2be3d541729f3e9be6a180a58270bae10156df4f /unsupported | |
parent | 41d5d5334b8a4e364dfd88dcd91f6cd38834b8ed (diff) |
Add bit_cast for half/bfloat to/from uint16_t, fix TensorRandom
The existing `TensorRandom.h` implementation assumes that
`half` (`bfloat16`) has a `uint16_t` member `x` (`value`), which is not
always true. This currently fails on arm64, where `x` has type `__fp16`.
Added `bit_cast` specializations to allow casting to/from `uint16_t`
for both `half` and `bfloat16`. Also added tests in
`half_float`, `bfloat16_float`, and `cxx11_tensor_random` to catch
these errors in the future.
Diffstat (limited to 'unsupported')
-rw-r--r-- | unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h | 47 | ||||
-rw-r--r-- | unsupported/test/cxx11_tensor_random.cpp | 18 |
2 files changed, 35 insertions, 30 deletions
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h b/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h index ea286fee1..13450e1a7 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h @@ -91,24 +91,21 @@ T RandomToTypeUniform(uint64_t* state, uint64_t stream) { template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half RandomToTypeUniform<Eigen::half>(uint64_t* state, uint64_t stream) { - Eigen::half result; - // Generate 10 random bits for the mantissa + // Generate 10 random bits for the mantissa, merge with exponent. unsigned rnd = PCG_XSH_RS_generator(state, stream); - result.x = static_cast<uint16_t>(rnd & 0x3ffu); - // Set the exponent - result.x |= (static_cast<uint16_t>(15) << 10); + const uint16_t half_bits = static_cast<uint16_t>(rnd & 0x3ffu) | (static_cast<uint16_t>(15) << 10); + Eigen::half result = Eigen::numext::bit_cast<Eigen::half>(half_bits); // Return the final result return result - Eigen::half(1.0f); } template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::bfloat16 RandomToTypeUniform<Eigen::bfloat16>(uint64_t* state, uint64_t stream) { - Eigen::bfloat16 result; - // Generate 7 random bits for the mantissa + + // Generate 7 random bits for the mantissa, merge with exponent. unsigned rnd = PCG_XSH_RS_generator(state, stream); - result.value = static_cast<uint16_t>(rnd & 0x7fu); - // Set the exponent - result.value |= (static_cast<uint16_t>(127) << 7); + const uint16_t half_bits = static_cast<uint16_t>(rnd & 0x7fu) | (static_cast<uint16_t>(127) << 7); + Eigen::bfloat16 result = Eigen::numext::bit_cast<Eigen::bfloat16>(half_bits); // Return the final result return result - Eigen::bfloat16(1.0f); } @@ -169,19 +166,19 @@ template <typename T> class UniformRandomGenerator { uint64_t seed = 0) { m_state = PCG_XSH_RS_state(seed); #ifdef EIGEN_USE_SYCL - // In SYCL it is not possible to build PCG_XSH_RS_state in one step. 
+ // In SYCL it is not possible to build PCG_XSH_RS_state in one step. // Therefor, we need two step to initializate the m_state. // IN SYCL, the constructor of the functor is s called on the CPU - // and we get the clock seed here from the CPU. However, This seed is + // and we get the clock seed here from the CPU. However, This seed is //the same for all the thread. As unlike CUDA, the thread.ID, BlockID, etc is not a global function. // and only available on the Operator() function (which is called on the GPU). - // Thus for CUDA (((CLOCK + global_thread_id)* 6364136223846793005ULL) + 0xda3e39cb94b95bdbULL) is passed to each thread - // but for SYCL ((CLOCK * 6364136223846793005ULL) + 0xda3e39cb94b95bdbULL) is passed to each thread and each thread adds - // the (global_thread_id* 6364136223846793005ULL) for itself only once, in order to complete the construction - // similar to CUDA Therefore, the thread Id injection is not available at this stage. - //However when the operator() is called the thread ID will be avilable. So inside the opeator, - // we add the thrreadID, BlockId,... (which is equivalent of i) - //to the seed and construct the unique m_state per thead similar to cuda. + // Thus for CUDA (((CLOCK + global_thread_id)* 6364136223846793005ULL) + 0xda3e39cb94b95bdbULL) is passed to each thread + // but for SYCL ((CLOCK * 6364136223846793005ULL) + 0xda3e39cb94b95bdbULL) is passed to each thread and each thread adds + // the (global_thread_id* 6364136223846793005ULL) for itself only once, in order to complete the construction + // similar to CUDA Therefore, the thread Id injection is not available at this stage. + //However when the operator() is called the thread ID will be avilable. So inside the opeator, + // we add the thrreadID, BlockId,... (which is equivalent of i) + //to the seed and construct the unique m_state per thead similar to cuda. 
m_exec_once =false; #endif } @@ -282,16 +279,16 @@ template <typename T> class NormalRandomGenerator { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator(uint64_t seed = 0) { m_state = PCG_XSH_RS_state(seed); #ifdef EIGEN_USE_SYCL - // In SYCL it is not possible to build PCG_XSH_RS_state in one step. + // In SYCL it is not possible to build PCG_XSH_RS_state in one step. // Therefor, we need two steps to initializate the m_state. // IN SYCL, the constructor of the functor is s called on the CPU - // and we get the clock seed here from the CPU. However, This seed is + // and we get the clock seed here from the CPU. However, This seed is //the same for all the thread. As unlike CUDA, the thread.ID, BlockID, etc is not a global function. // and only available on the Operator() function (which is called on the GPU). - // Therefore, the thread Id injection is not available at this stage. However when the operator() - //is called the thread ID will be avilable. So inside the opeator, - // we add the thrreadID, BlockId,... (which is equivalent of i) - //to the seed and construct the unique m_state per thead similar to cuda. + // Therefore, the thread Id injection is not available at this stage. However when the operator() + //is called the thread ID will be avilable. So inside the opeator, + // we add the thrreadID, BlockId,... (which is equivalent of i) + //to the seed and construct the unique m_state per thead similar to cuda. 
m_exec_once =false; #endif } diff --git a/unsupported/test/cxx11_tensor_random.cpp b/unsupported/test/cxx11_tensor_random.cpp index 4740d5811..b9d4c5584 100644 --- a/unsupported/test/cxx11_tensor_random.cpp +++ b/unsupported/test/cxx11_tensor_random.cpp @@ -11,9 +11,10 @@ #include <Eigen/CXX11/Tensor> +template<typename Scalar> static void test_default() { - Tensor<float, 1> vec(6); + Tensor<Scalar, 1> vec(6); vec.setRandom(); // Fixme: we should check that the generated numbers follow a uniform @@ -23,10 +24,11 @@ static void test_default() } } +template<typename Scalar> static void test_normal() { - Tensor<float, 1> vec(6); - vec.setRandom<Eigen::internal::NormalRandomGenerator<float>>(); + Tensor<Scalar, 1> vec(6); + vec.template setRandom<Eigen::internal::NormalRandomGenerator<Scalar>>(); // Fixme: we should check that the generated numbers follow a gaussian // distribution instead. @@ -72,7 +74,13 @@ static void test_custom() EIGEN_DECLARE_TEST(cxx11_tensor_random) { - CALL_SUBTEST(test_default()); - CALL_SUBTEST(test_normal()); + CALL_SUBTEST((test_default<float>())); + CALL_SUBTEST((test_normal<float>())); + CALL_SUBTEST((test_default<double>())); + CALL_SUBTEST((test_normal<double>())); + CALL_SUBTEST((test_default<Eigen::half>())); + CALL_SUBTEST((test_normal<Eigen::half>())); + CALL_SUBTEST((test_default<Eigen::bfloat16>())); + CALL_SUBTEST((test_normal<Eigen::bfloat16>())); CALL_SUBTEST(test_custom()); } |