diff options
Diffstat (limited to 'tensorflow/stream_executor/lib/casts.h')
-rw-r--r-- | tensorflow/stream_executor/lib/casts.h | 85 |
1 files changed, 85 insertions, 0 deletions
diff --git a/tensorflow/stream_executor/lib/casts.h b/tensorflow/stream_executor/lib/casts.h new file mode 100644 index 0000000000..61ff2ab00e --- /dev/null +++ b/tensorflow/stream_executor/lib/casts.h @@ -0,0 +1,85 @@ +#ifndef TENSORFLOW_STREAM_EXECUTOR_LIB_CASTS_H_ +#define TENSORFLOW_STREAM_EXECUTOR_LIB_CASTS_H_ + +#include <stdlib.h> + +namespace perftools { +namespace gputools { +namespace port { + +// port::bit_cast<Dest,Source> is a template function that implements the +// equivalent of "*reinterpret_cast<Dest*>(&source)". We need this in +// very low-level functions like the protobuf library and fast math +// support. +// +// float f = 3.14159265358979; +// int i = port::bit_cast<int32>(f); +// // i = 0x40490fdb +// +// The classical address-casting method is: +// +// // WRONG +// float f = 3.14159265358979; // WRONG +// int i = * reinterpret_cast<int*>(&f); // WRONG +// +// The address-casting method actually produces undefined behavior +// according to ISO C++ specification section 3.10 -15 -. Roughly, this +// section says: if an object in memory has one type, and a program +// accesses it with a different type, then the result is undefined +// behavior for most values of "different type". +// +// This is true for any cast syntax, either *(int*)&f or +// *reinterpret_cast<int*>(&f). And it is particularly true for +// conversions between integral lvalues and floating-point lvalues. +// +// The purpose of 3.10 -15- is to allow optimizing compilers to assume +// that expressions with different types refer to different memory. gcc +// 4.0.1 has an optimizer that takes advantage of this. So a +// non-conforming program quietly produces wildly incorrect output. +// +// The problem is not the use of reinterpret_cast. The problem is type +// punning: holding an object in memory of one type and reading its bits +// back using a different type. +// +// The C++ standard is more subtle and complex than this, but that +// is the basic idea. +// +// Anyways ... +// +// port::bit_cast<> calls memcpy() which is blessed by the standard, +// especially by the example in section 3.9 . Also, of course, +// port::bit_cast<> wraps up the nasty logic in one place. +// +// Fortunately memcpy() is very fast. In optimized mode, with a +// constant size, gcc 2.95.3, gcc 4.0.1, and msvc 7.1 produce inline +// code with the minimal amount of data movement. On a 32-bit system, +// memcpy(d,s,4) compiles to one load and one store, and memcpy(d,s,8) +// compiles to two loads and two stores. +// +// I tested this code with gcc 2.95.3, gcc 4.0.1, icc 8.1, and msvc 7.1. +// +// WARNING: if Dest or Source is a non-POD type, the result of the memcpy +// is likely to surprise you. +// +// Props to Bill Gibbons for the compile time assertion technique and +// Art Komninos and Igor Tandetnik for the msvc experiments. +// +// -- mec 2005-10-17 + +template <class Dest, class Source> +inline Dest bit_cast(const Source& source) { + // Compile time assertion: sizeof(Dest) == sizeof(Source) + // A compile error here means your Dest and Source have different sizes. + static_assert(sizeof(Dest) == sizeof(Source), + "src and dst types must have equal sizes"); + + Dest dest; + memcpy(&dest, &source, sizeof(dest)); + return dest; +} + +} // namespace port +} // namespace gputools +} // namespace perftools + +#endif // TENSORFLOW_STREAM_EXECUTOR_LIB_CASTS_H_ |