diff options
author | Pete Warden <petewarden@google.com> | 2016-12-15 15:04:48 -0800 |
---|---|---|
committer | TensorFlower Gardener <gardener@tensorflow.org> | 2016-12-15 15:25:57 -0800 |
commit | 6e2a233cae68adad9da01a233661517ef2acfebb (patch) | |
tree | ee4fc4def785a565e978f2242323b1b9ced5bd7e /tensorflow/core | |
parent | 1e6f5b6d269716116a1ce3239d0cd8a6c56b4d74 (diff) |
Test for CPU features on startup, to replace SIGILLs with useful errors and suggest optimizations
Change: 142193596
Diffstat (limited to 'tensorflow/core')
-rw-r--r-- | tensorflow/core/platform/cpu_feature_guard.cc | 106 |
-rw-r--r-- | tensorflow/core/platform/cpu_info.cc | 289 |
-rw-r--r-- | tensorflow/core/platform/cpu_info.h | 60 |
-rw-r--r-- | tensorflow/core/platform/port_test.cc | 10 |
-rw-r--r-- | tensorflow/core/platform/posix/port.cc | 26 |
5 files changed, 479 insertions(+), 12 deletions(-)
diff --git a/tensorflow/core/platform/cpu_feature_guard.cc b/tensorflow/core/platform/cpu_feature_guard.cc new file mode 100644 index 0000000000..fd38b614c3 --- /dev/null +++ b/tensorflow/core/platform/cpu_feature_guard.cc @@ -0,0 +1,106 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/platform/cpu_info.h" +#include "tensorflow/core/platform/logging.h" + +namespace tensorflow { +namespace port { +namespace { + +// Raises an error if the binary has been compiled for a CPU feature (like AVX) +// that isn't available on the current machine. It also warns of performance +// loss if there's a feature available that's not being used. +// Depending on the compiler and initialization order, a SIGILL exception may +// occur before this code is reached, but this at least offers a chance to give +// a more meaningful error message. 
+class CPUFeatureGuard { + public: + CPUFeatureGuard() { +#ifdef __SSE__ + CheckFeatureOrDie(CPUFeature::SSE, "SSE"); +#else + WarnIfFeatureUnused(CPUFeature::SSE, "SSE"); +#endif // __SSE__ +#ifdef __SSE2__ + CheckFeatureOrDie(CPUFeature::SSE2, "SSE2"); +#else + WarnIfFeatureUnused(CPUFeature::SSE2, "SSE2"); +#endif // __SSE2__ +#ifdef __SSE3__ + CheckFeatureOrDie(CPUFeature::SSE3, "SSE3"); +#else + WarnIfFeatureUnused(CPUFeature::SSE3, "SSE3"); +#endif // __SSE3__ +#ifdef __SSE4_1__ + CheckFeatureOrDie(CPUFeature::SSE4_1, "SSE4.1"); +#else + WarnIfFeatureUnused(CPUFeature::SSE4_1, "SSE4.1"); +#endif // __SSE4_1__ +#ifdef __SSE4_2__ + CheckFeatureOrDie(CPUFeature::SSE4_2, "SSE4.2"); +#else + WarnIfFeatureUnused(CPUFeature::SSE4_2, "SSE4.2"); +#endif // __SSE4_2__ +#ifdef __AVX__ + CheckFeatureOrDie(CPUFeature::AVX, "AVX"); +#else + WarnIfFeatureUnused(CPUFeature::AVX, "AVX"); +#endif // __AVX__ +#ifdef __AVX2__ + CheckFeatureOrDie(CPUFeature::AVX2, "AVX2"); +#else + WarnIfFeatureUnused(CPUFeature::AVX2, "AVX2"); +#endif // __AVX2__ +#ifdef __AVX512F__ + CheckFeatureOrDie(CPUFeature::AVX512F, "AVX512F"); +#else + WarnIfFeatureUnused(CPUFeature::AVX512F, "AVX512F"); +#endif // __AVX512F__ +#ifdef __FMA__ + CheckFeatureOrDie(CPUFeature::FMA, "FMA"); +#else + WarnIfFeatureUnused(CPUFeature::FMA, "FMA"); +#endif // __FMA__ + } + + void CheckFeatureOrDie(CPUFeature feature, const string& feature_name) { + if (!TestCPUFeature(feature)) { +#ifdef __ANDROID__ + // Some Android emulators seem to indicate they don't support SSE, so to + // avoid crashes when testing, switch this to a warning. 
+ LOG(WARNING) +#else + LOG(FATAL) +#endif + << "The TensorFlow library was compiled to use " << feature_name + << " instructions, but these aren't available on your machine."; + } + } + + void WarnIfFeatureUnused(CPUFeature feature, const string& feature_name) { + if (TestCPUFeature(feature)) { + LOG(WARNING) << "The TensorFlow library wasn't compiled to use " + << feature_name + << " instructions, but these are available on your machine " + "and could speed up CPU computations."; + } + } +}; + +CPUFeatureGuard g_cpu_feature_guard_singleton; +} // namespace +} // namespace port +} // namespace tensorflow diff --git a/tensorflow/core/platform/cpu_info.cc b/tensorflow/core/platform/cpu_info.cc new file mode 100644 index 0000000000..7bef9d9b4b --- /dev/null +++ b/tensorflow/core/platform/cpu_info.cc @@ -0,0 +1,289 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// SIMD extension querying is only available on x86. 
+#if defined(__x86_64__) || defined(__amd64__) +#define PORT_IS_X86 +#endif + +#include "tensorflow/core/platform/cpu_info.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/types.h" +#if defined(PORT_IS_X86) +#include <mutex> // NOLINT +#endif + +#ifdef PORT_IS_X86 +#define GETCPUID(a, b, c, d, a_inp, c_inp) \ + asm("mov %%rbx, %%rdi\n" \ + "cpuid\n" \ + "xchg %%rdi, %%rbx\n" \ + : "=a"(a), "=D"(b), "=c"(c), "=d"(d) \ + : "a"(a_inp), "2"(c_inp)) +#endif + +namespace tensorflow { +namespace port { +namespace { + +#ifdef PORT_IS_X86 +class CPUIDInfo; +void InitCPUIDInfo(); + +CPUIDInfo *cpuid = nullptr; + +int GetXCR0EAX() { + int eax, edx; + asm("XGETBV" : "=a"(eax), "=d"(edx) : "c"(0)); + return eax; +} + +// Structure for basic CPUID info +struct CPUIDInfo { + CPUIDInfo() + : have_adx_(0), + have_aes_(0), + have_avx_(0), + have_avx2_(0), + have_avx512f_(0), + have_avx512cd_(0), + have_avx512er_(0), + have_avx512pf_(0), + have_avx512vl_(0), + have_avx512bw_(0), + have_avx512dq_(0), + have_avx512vbmi_(0), + have_avx512ifma_(0), + have_avx512_4vnniw_(0), + have_avx512_4fmaps_(0), + have_bmi1_(0), + have_bmi2_(0), + have_cmov_(0), + have_cmpxchg16b_(0), + have_cmpxchg8b_(0), + have_f16c_(0), + have_fma_(0), + have_mmx_(0), + have_pclmulqdq_(0), + have_popcnt_(0), + have_prefetchw_(0), + have_prefetchwt1_(0), + have_rdrand_(0), + have_rdseed_(0), + have_smap_(0), + have_sse_(0), + have_sse2_(0), + have_sse3_(0), + have_sse4_1_(0), + have_sse4_2_(0), + have_ssse3_(0), + have_hypervisor_(0) {} + + static void Initialize() { + // Initialize cpuid struct + CHECK(cpuid == NULL) << __func__ << " ran more than once"; + cpuid = new CPUIDInfo; + + uint32 eax, ebx, ecx, edx; + + // To get general information and extended features we send eax = 1 and + // ecx = 0 to cpuid. The response is returned in eax, ebx, ecx and edx. 
+ // (See Intel 64 and IA-32 Architectures Software Developer's Manual + // Volume 2A: Instruction Set Reference, A-M CPUID). + GETCPUID(eax, ebx, ecx, edx, 1, 0); + + cpuid->have_aes_ = (ecx >> 25) & 0x1; + cpuid->have_cmov_ = (edx >> 15) & 0x1; + cpuid->have_cmpxchg16b_ = (ecx >> 13) & 0x1; + cpuid->have_cmpxchg8b_ = (edx >> 8) & 0x1; + cpuid->have_mmx_ = (edx >> 23) & 0x1; + cpuid->have_pclmulqdq_ = (ecx >> 1) & 0x1; + cpuid->have_popcnt_ = (ecx >> 23) & 0x1; + cpuid->have_rdrand_ = (ecx >> 30) & 0x1; + cpuid->have_sse2_ = (edx >> 26) & 0x1; + cpuid->have_sse3_ = ecx & 0x1; + cpuid->have_sse4_1_ = (ecx >> 19) & 0x1; + cpuid->have_sse4_2_ = (ecx >> 20) & 0x1; + cpuid->have_sse_ = (edx >> 25) & 0x1; + cpuid->have_ssse3_ = (ecx >> 9) & 0x1; + cpuid->have_hypervisor_ = (ecx >> 31) & 1; + + const uint64 xcr0_xmm_mask = 0x2; + const uint64 xcr0_ymm_mask = 0x4; + const uint64 xcr0_maskreg_mask = 0x20; + const uint64 xcr0_zmm0_15_mask = 0x40; + const uint64 xcr0_zmm16_31_mask = 0x80; + + const uint64 xcr0_avx_mask = xcr0_xmm_mask | xcr0_ymm_mask; + const uint64 xcr0_avx512_mask = xcr0_avx_mask | xcr0_maskreg_mask | + xcr0_zmm0_15_mask | xcr0_zmm16_31_mask; + + const bool have_avx = + // Does the OS support XGETBV instruction use by applications? + ((ecx >> 27) & 0x1) && + // Does the OS save/restore XMM and YMM state? + ((GetXCR0EAX() & xcr0_avx_mask) == xcr0_avx_mask) && + // Is AVX supported in hardware? + ((ecx >> 28) & 0x1); + + const bool have_avx512 = + // Does the OS support XGETBV instruction use by applications? + ((ecx >> 27) & 0x1) && + // Does the OS save/restore ZMM state? 
+ ((GetXCR0EAX() & xcr0_avx512_mask) == xcr0_avx512_mask); + + cpuid->have_avx_ = have_avx; + cpuid->have_fma_ = have_avx && ((ecx >> 12) & 0x1); + cpuid->have_f16c_ = have_avx && ((ecx >> 29) & 0x1); + + // Get standard level 7 structured extension features (issue CPUID with + // eax = 7 and ecx= 0), which is required to check for AVX2 support as + // well as other Haswell (and beyond) features. (See Intel 64 and IA-32 + // Architectures Software Developer's Manual Volume 2A: Instruction Set + // Reference, A-M CPUID). + GETCPUID(eax, ebx, ecx, edx, 7, 0); + + cpuid->have_adx_ = (ebx >> 19) & 0x1; + cpuid->have_avx2_ = have_avx && ((ebx >> 5) & 0x1); + cpuid->have_bmi1_ = (ebx >> 3) & 0x1; + cpuid->have_bmi2_ = (ebx >> 8) & 0x1; + cpuid->have_prefetchwt1_ = ecx & 0x1; + cpuid->have_rdseed_ = (ebx >> 18) & 0x1; + cpuid->have_smap_ = (ebx >> 20) & 0x1; + + cpuid->have_avx512f_ = have_avx512 && ((ebx >> 16) & 0x1); + cpuid->have_avx512cd_ = have_avx512 && ((ebx >> 28) & 0x1); + cpuid->have_avx512er_ = have_avx512 && ((ebx >> 27) & 0x1); + cpuid->have_avx512pf_ = have_avx512 && ((ebx >> 26) & 0x1); + cpuid->have_avx512vl_ = have_avx512 && ((ebx >> 31) & 0x1); + cpuid->have_avx512bw_ = have_avx512 && ((ebx >> 30) & 0x1); + cpuid->have_avx512dq_ = have_avx512 && ((ebx >> 17) & 0x1); + cpuid->have_avx512vbmi_ = have_avx512 && ((ecx >> 1) & 0x1); + cpuid->have_avx512ifma_ = have_avx512 && ((ebx >> 21) & 0x1); + cpuid->have_avx512_4vnniw_ = have_avx512 && ((edx >> 2) & 0x1); + cpuid->have_avx512_4fmaps_ = have_avx512 && ((edx >> 3) & 0x1); + } + + static bool TestFeature(CPUFeature feature) { + InitCPUIDInfo(); + // clang-format off + switch (feature) { + case ADX: return cpuid->have_adx_; + case AES: return cpuid->have_aes_; + case AVX2: return cpuid->have_avx2_; + case AVX: return cpuid->have_avx_; + case AVX512F: return cpuid->have_avx512f_; + case AVX512CD: return cpuid->have_avx512cd_; + case AVX512PF: return cpuid->have_avx512pf_; + case AVX512ER: return 
cpuid->have_avx512er_; + case AVX512VL: return cpuid->have_avx512vl_; + case AVX512BW: return cpuid->have_avx512bw_; + case AVX512DQ: return cpuid->have_avx512dq_; + case AVX512VBMI: return cpuid->have_avx512vbmi_; + case AVX512IFMA: return cpuid->have_avx512ifma_; + case AVX512_4VNNIW: return cpuid->have_avx512_4vnniw_; + case AVX512_4FMAPS: return cpuid->have_avx512_4fmaps_; + case BMI1: return cpuid->have_bmi1_; + case BMI2: return cpuid->have_bmi2_; + case CMOV: return cpuid->have_cmov_; + case CMPXCHG16B: return cpuid->have_cmpxchg16b_; + case CMPXCHG8B: return cpuid->have_cmpxchg8b_; + case F16C: return cpuid->have_f16c_; + case FMA: return cpuid->have_fma_; + case MMX: return cpuid->have_mmx_; + case PCLMULQDQ: return cpuid->have_pclmulqdq_; + case POPCNT: return cpuid->have_popcnt_; + case PREFETCHW: return cpuid->have_prefetchw_; + case PREFETCHWT1: return cpuid->have_prefetchwt1_; + case RDRAND: return cpuid->have_rdrand_; + case RDSEED: return cpuid->have_rdseed_; + case SMAP: return cpuid->have_smap_; + case SSE2: return cpuid->have_sse2_; + case SSE3: return cpuid->have_sse3_; + case SSE4_1: return cpuid->have_sse4_1_; + case SSE4_2: return cpuid->have_sse4_2_; + case SSE: return cpuid->have_sse_; + case SSSE3: return cpuid->have_ssse3_; + case HYPERVISOR: return cpuid->have_hypervisor_; + default: + break; + } + // clang-format on + return false; + } + + private: + int highest_eax_; + int have_adx_ : 1; + int have_aes_ : 1; + int have_avx_ : 1; + int have_avx2_ : 1; + int have_avx512f_ : 1; + int have_avx512cd_ : 1; + int have_avx512er_ : 1; + int have_avx512pf_ : 1; + int have_avx512vl_ : 1; + int have_avx512bw_ : 1; + int have_avx512dq_ : 1; + int have_avx512vbmi_ : 1; + int have_avx512ifma_ : 1; + int have_avx512_4vnniw_ : 1; + int have_avx512_4fmaps_ : 1; + int have_bmi1_ : 1; + int have_bmi2_ : 1; + int have_cmov_ : 1; + int have_cmpxchg16b_ : 1; + int have_cmpxchg8b_ : 1; + int have_f16c_ : 1; + int have_fma_ : 1; + int have_mmx_ : 1; + int 
have_pclmulqdq_ : 1; + int have_popcnt_ : 1; + int have_prefetchw_ : 1; + int have_prefetchwt1_ : 1; + int have_rdrand_ : 1; + int have_rdseed_ : 1; + int have_smap_ : 1; + int have_sse_ : 1; + int have_sse2_ : 1; + int have_sse3_ : 1; + int have_sse4_1_ : 1; + int have_sse4_2_ : 1; + int have_ssse3_ : 1; + int have_hypervisor_ : 1; +}; + +std::once_flag cpuid_once_flag; + +void InitCPUIDInfo() { + // This ensures that CPUIDInfo::Initialize() is called exactly + // once regardless of how many threads concurrently call us + std::call_once(cpuid_once_flag, CPUIDInfo::Initialize); +} + +#endif // PORT_IS_X86 + +} // namespace + +bool TestCPUFeature(CPUFeature feature) { +#ifdef PORT_IS_X86 + return CPUIDInfo::TestFeature(feature); +#else + return false; +#endif +} + +} // namespace port +} // namespace tensorflow diff --git a/tensorflow/core/platform/cpu_info.h b/tensorflow/core/platform/cpu_info.h index 13e89e7b7b..5fa58e3759 100644 --- a/tensorflow/core/platform/cpu_info.h +++ b/tensorflow/core/platform/cpu_info.h @@ -32,6 +32,66 @@ static const bool kLittleEndian = true; // software can change it dynamically. int NumSchedulableCPUs(); +// Mostly ISA related features that we care about +enum CPUFeature { + // Do not change numeric assignments. + MMX = 0, + SSE = 1, + SSE2 = 2, + SSE3 = 3, + SSSE3 = 4, + SSE4_1 = 5, + SSE4_2 = 6, + CMOV = 7, + CMPXCHG8B = 8, + CMPXCHG16B = 9, + POPCNT = 10, + AES = 11, + AVX = 12, + RDRAND = 13, + AVX2 = 14, + FMA = 15, + F16C = 16, + PCLMULQDQ = 17, + RDSEED = 18, + ADX = 19, + SMAP = 20, + + // Prefetch Vector Data Into Caches with Intent to Write and T1 Hint + // http://www.felixcloutier.com/x86/PREFETCHWT1.html. + // You probably want PREFETCHW instead. + PREFETCHWT1 = 21, + + BMI1 = 22, + BMI2 = 23, + HYPERVISOR = 25, // 0 when on a real CPU, 1 on (well-behaved) hypervisor. + + // Prefetch Data into Caches in Anticipation of a Write (3D Now!). 
+ // http://www.felixcloutier.com/x86/PREFETCHW.html + PREFETCHW = 26, + + // AVX-512: 512-bit vectors (plus masking, etc.) in Knights Landing, + // Skylake + // Xeon, etc.; each of these entries is a different subset of + // instructions, + // various combinations of which occur on various CPU types. + AVX512F = 27, // Foundation + AVX512CD = 28, // Conflict detection + AVX512ER = 29, // Exponential and reciprocal + AVX512PF = 30, // Prefetching + AVX512VL = 31, // Shorter vector lengths + AVX512BW = 32, // Byte and word + AVX512DQ = 33, // Dword and qword + AVX512VBMI = 34, // Bit manipulation + AVX512IFMA = 35, // Integer multiply-add + AVX512_4VNNIW = 36, // Integer neural network + AVX512_4FMAPS = 37, // Floating point neural network +}; + +// Checks whether the current processor supports one of the features above. +// Checks CPU registers to return hardware capabilities. +bool TestCPUFeature(CPUFeature feature); + } // namespace port } // namespace tensorflow diff --git a/tensorflow/core/platform/port_test.cc b/tensorflow/core/platform/port_test.cc index 402c718e4f..8d98eb25a2 100644 --- a/tensorflow/core/platform/port_test.cc +++ b/tensorflow/core/platform/port_test.cc @@ -15,6 +15,7 @@ limitations under the License. #include <condition_variable> #include "tensorflow/core/lib/core/threadpool.h" +#include "tensorflow/core/platform/cpu_info.h" #include "tensorflow/core/platform/mem.h" #include "tensorflow/core/platform/mutex.h" #include "tensorflow/core/platform/test.h" @@ -66,5 +67,14 @@ TEST(ConditionVariable, WaitForMilliseconds_Signalled) { EXPECT_LT(finish - start, 3); } +TEST(TestCPUFeature, TestFeature) { + // We don't know what the result should be on this platform, so just make + // sure it's callable. 
+ const bool has_avx = TestCPUFeature(CPUFeature::AVX); + LOG(INFO) << "has_avx = " << has_avx; + const bool has_avx2 = TestCPUFeature(CPUFeature::AVX2); + LOG(INFO) << "has_avx2 = " << has_avx2; +} + } // namespace port } // namespace tensorflow diff --git a/tensorflow/core/platform/posix/port.cc b/tensorflow/core/platform/posix/port.cc index a7c797ebf2..84bc9492b5 100644 --- a/tensorflow/core/platform/posix/port.cc +++ b/tensorflow/core/platform/posix/port.cc @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include "tensorflow/core/platform/cpu_info.h" +#include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/types.h" #if defined(__linux__) && !defined(__ANDROID__) #include <sched.h> @@ -31,7 +33,7 @@ limitations under the License. namespace tensorflow { namespace port { -void InitMain(const char* usage, int* argc, char*** argv) {} +void InitMain(const char *usage, int *argc, char ***argv) {} string Hostname() { char hostname[1024]; @@ -58,15 +60,15 @@ int NumSchedulableCPUs() { return kDefaultCores; } -void* aligned_malloc(size_t size, int minimum_alignment) { +void *aligned_malloc(size_t size, int minimum_alignment) { #if defined(__ANDROID__) return memalign(minimum_alignment, size); #else // !defined(__ANDROID__) - void* ptr = NULL; + void *ptr = NULL; // posix_memalign requires that the requested alignment be at least // sizeof(void*). In this case, fall back on malloc which should return // memory aligned to at least the size of a pointer. 
- const int required_alignment = sizeof(void*); + const int required_alignment = sizeof(void *); if (minimum_alignment < required_alignment) return malloc(size); if (posix_memalign(&ptr, minimum_alignment, size) != 0) return NULL; @@ -75,19 +77,19 @@ void* aligned_malloc(size_t size, int minimum_alignment) { #endif } -void aligned_free(void* aligned_memory) { free(aligned_memory); } +void aligned_free(void *aligned_memory) { free(aligned_memory); } void MallocExtension_ReleaseToSystem(std::size_t num_bytes) { // No-op. } -std::size_t MallocExtension_GetAllocatedSize(const void* p) { return 0; } +std::size_t MallocExtension_GetAllocatedSize(const void *p) { return 0; } -void AdjustFilenameForLogging(string* filename) { +void AdjustFilenameForLogging(string *filename) { // Nothing to do } -bool Snappy_Compress(const char* input, size_t length, string* output) { +bool Snappy_Compress(const char *input, size_t length, string *output) { #ifdef SNAPPY output->resize(snappy::MaxCompressedLength(length)); size_t outlen; @@ -99,8 +101,8 @@ bool Snappy_Compress(const char* input, size_t length, string* output) { #endif } -bool Snappy_GetUncompressedLength(const char* input, size_t length, - size_t* result) { +bool Snappy_GetUncompressedLength(const char *input, size_t length, + size_t *result) { #ifdef SNAPPY return snappy::GetUncompressedLength(input, length, result); #else @@ -108,7 +110,7 @@ bool Snappy_GetUncompressedLength(const char* input, size_t length, #endif } -bool Snappy_Uncompress(const char* input, size_t length, char* output) { +bool Snappy_Uncompress(const char *input, size_t length, char *output) { #ifdef SNAPPY return snappy::RawUncompress(input, length, output); #else @@ -116,7 +118,7 @@ bool Snappy_Uncompress(const char* input, size_t length, char* output) { #endif } -string Demangle(const char* mangled) { return mangled; } +string Demangle(const char *mangled) { return mangled; } } // namespace port } // namespace tensorflow |