diff options
Diffstat (limited to 'Eigen/src/Core/arch')
-rw-r--r-- | Eigen/src/Core/arch/Default/Half.h | 42 |
1 files changed, 40 insertions, 2 deletions
diff --git a/Eigen/src/Core/arch/Default/Half.h b/Eigen/src/Core/arch/Default/Half.h index 3779ebc98..12ed8f676 100644 --- a/Eigen/src/Core/arch/Default/Half.h +++ b/Eigen/src/Core/arch/Default/Half.h @@ -63,10 +63,37 @@ struct half; namespace half_impl { -#if !defined(EIGEN_HAS_GPU_FP16) +// We want to use the __half_raw struct from the HIP header file only during the device compile phase. +// This is required because of a quirk in the way TensorFlow GPU builds are done. +// When compiling TensorFlow source code with GPU support, files that +// * contain GPU kernels (i.e. *.cu.cc files) are compiled via hipcc +// * do not contain GPU kernels ( i.e. *.cc files) are compiled via gcc (typically) +// +// Tensorflow uses the Eigen::half type as its FP16 type, and there are functions that +// * are defined in a file that gets compiled via hipcc AND +// * have Eigen::half as a pass-by-value argument AND +// * are called in a file that gets compiled via gcc +// +// In the scenario described above the caller and callee will see different versions +// of the Eigen::half base class __half_raw, and they will be compiled by different compilers +// +// There appears to be an ABI mismatch between gcc and clang (which is called by hipcc) that results in +// the callee getting corrupted values for the Eigen::half argument. +// +// Making the host side compile phase of hipcc use the same Eigen::half impl, as the gcc compile, resolves +// this error, and hence the following convoluted #if condition +#if !defined(EIGEN_HAS_GPU_FP16) || !defined(EIGEN_GPU_COMPILE_PHASE) // Make our own __half_raw definition that is similar to CUDA's. struct __half_raw { +#if (defined(EIGEN_HAS_HIP_FP16) && !defined(EIGEN_HIP_DEVICE_COMPILE)) + // Eigen::half can be used as the datatype for some shared memory declarations (in Eigen and TF) + // (In HIP) The element type for shared memory declaration cannot have non-trivial constructors + // and hence the following special casing (which skips the zero-initilization) + // Note that this check gets done even in the host compilation phase, and hence the need for this + EIGEN_DEVICE_FUNC __half_raw() {} +#else EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __half_raw() : x(0) {} +#endif #if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) explicit EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __half_raw(numext::uint16_t raw) : x(numext::bit_cast<__fp16>(raw)) { } @@ -115,7 +142,10 @@ struct half : public half_impl::half_base { // Writing this out as separate #if-else blocks to make the code easier to follow // The same applies to most #if-else blocks in this file -#if !defined(EIGEN_HAS_GPU_FP16) +#if !defined(EIGEN_HAS_GPU_FP16) || !defined(EIGEN_GPU_COMPILE_PHASE) + // Use the same base class for the following two scenarios + // * when compiling without GPU support enabled + // * during host compile phase when compiling with GPU support enabled typedef half_impl::__half_raw __half_raw; #elif defined(EIGEN_HAS_HIP_FP16) // Nothing to do here @@ -161,6 +191,14 @@ struct half : public half_impl::half_base { EIGEN_DEVICE_FUNC operator float() const { // NOLINT: Allow implicit conversion to float, because it is lossless. return half_impl::half_to_float(*this); } + +#if defined(EIGEN_HAS_GPU_FP16) && !defined(EIGEN_GPU_COMPILE_PHASE) + EIGEN_DEVICE_FUNC operator __half() const { + ::__half_raw hr; + hr.x = x; + return __half(hr); + } +#endif }; } // end namespace Eigen |