diff options
author | 2016-04-15 08:40:22 -0700 | |
---|---|---|
committer | 2016-04-15 08:40:23 -0700 | |
commit | 86498fbfcb93a9048bbe1c28cc0df40d8d0c96e9 (patch) | |
tree | a579fdee9a3cf330050dbe7fd0a3a25d49783693 /src/core | |
parent | 2c7f24093a394ccbe54a7db60ba79af14682e7fa (diff) |
Revert of Move CPU feature detection to its own file. (patchset #7 id:120001 of https://codereview.chromium.org/1890483002/ )
Reason for revert:
Many unexpected GM diffs across GPU+CPU configs on Windows (hopefully just text masks on GPU?). It seems like we pick a different srcover variant in some places.
Original issue's description:
> Move CPU feature detection to its own file.
>
> - Moves CPU feature detection to its own file.
> - Cleans up some redundant feature detection scattered around core/ and opts/.
> - Can now detect a few new CPU features:
> * F16C -> Intel f16<->f32 instructions, added between AVX and AVX2
> * FMA -> Intel FMA instructions, added at the same time as AVX2
> * VFP_FP16 -> ARM f16<->f32 instructions, quite common
> * NEON_FMA -> ARM FMA instructions, also quite common
> * SSE and SSE3... why not?
>
> This new internal API makes it very cheap to do fine-grained runtime CPU
> feature detection. Redundant calls to SkCpu::Supports() should be eliminated
> and it's hoistable out of loops. It compiles away entirely when we have the
> appropriate instructions available at compile time.
>
> This means we can call it to guard even a little snippet of 1 or 2 instructions
> right where needed and let inlining hoist the check (if any at all) up to
> somewhere that doesn't hurt performance. I've explained how I made this work
> in the private section of the new header.
>
> Once this lands and bakes a bit, I'll start following up with CLs to use it more
> and to add a bunch of those little 1-2 instruction snippets we've been wanting,
> e.g. cvtps2ph, cvtph2ps, ptest, pmulld, pmovzxbd, blendvps, pshufb, roundps
> (for floor) on x86, and vcvt.f32.f16, vcvt.f16.f32 on ARM.
>
> BUG=skia:
> GOLD_TRYBOT_URL= https://gold.skia.org/search2?unt=true&query=source_type%3Dgm&master=false&issue=1890483002
> CQ_EXTRA_TRYBOTS=client.skia:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD-Trybot
>
> Committed: https://skia.googlesource.com/skia/+/872ea29357439f05b1f6995dd300fc054733e607
TBR=fmalita@chromium.org,herb@google.com,reed@google.com,mtklein@chromium.org
# Skipping CQ checks because original CL landed less than 1 day ago.
NOPRESUBMIT=true
NOTREECHECKS=true
NOTRY=true
BUG=skia:
Review URL: https://codereview.chromium.org/1892643003
Diffstat (limited to 'src/core')
-rw-r--r-- | src/core/SkCpu.cpp | 90 | ||||
-rw-r--r-- | src/core/SkCpu.h | 123 | ||||
-rw-r--r-- | src/core/SkOpts.cpp | 62 | ||||
-rw-r--r-- | src/core/SkUtilsArm.cpp | 139 | ||||
-rw-r--r-- | src/core/SkUtilsArm.h | 14 |
5 files changed, 198 insertions, 230 deletions
diff --git a/src/core/SkCpu.cpp b/src/core/SkCpu.cpp deleted file mode 100644 index 4030fce2e1..0000000000 --- a/src/core/SkCpu.cpp +++ /dev/null @@ -1,90 +0,0 @@ -/* - * Copyright 2016 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can be - * found in the LICENSE file. - */ - -#include "SkCpu.h" -#include "SkOncePtr.h" - -#if defined(SK_CPU_X86) - #if defined(SK_BUILD_FOR_WIN32) - #include <intrin.h> - static void cpuid (uint32_t abcd[4]) { __cpuid ((int*)abcd, 1); } - static void cpuid7(uint32_t abcd[4]) { __cpuidex((int*)abcd, 7, 0); } - static uint64_t xgetbv(uint32_t xcr) { return _xgetbv(xcr); } - #else - #include <cpuid.h> - #if !defined(__cpuid_count) // Old Mac Clang doesn't have this defined. - #define __cpuid_count(eax, ecx, a, b, c, d) \ - __asm__("cpuid" : "=a"(a), "=b"(b), "=c"(c), "=d"(d) : "0"(eax), "2"(ecx)) - #endif - static void cpuid (uint32_t abcd[4]) { __get_cpuid(1, abcd+0, abcd+1, abcd+2, abcd+3); } - static void cpuid7(uint32_t abcd[4]) { - __cpuid_count(7, 0, abcd[0], abcd[1], abcd[2], abcd[3]); - } - static uint64_t xgetbv(uint32_t xcr) { - uint32_t eax, edx; - __asm__ __volatile__ ( "xgetbv" : "=a"(eax), "=d"(edx) : "c"(xcr)); - return (uint64_t)(edx) << 32 | eax; - } - #endif - - static uint32_t read_cpu_features() { - uint32_t features = 0; - uint32_t abcd[4] = {0,0,0,0}; - - // You might want to refer to http://www.sandpile.org/x86/cpuid.htm - - cpuid(abcd); - if (abcd[3] & (1<<25)) { features |= SkCpu:: SSE1; } - if (abcd[3] & (1<<26)) { features |= SkCpu:: SSE2; } - if (abcd[2] & (1<< 0)) { features |= SkCpu:: SSE3; } - if (abcd[2] & (1<< 9)) { features |= SkCpu::SSSE3; } - if (abcd[2] & (1<<19)) { features |= SkCpu::SSE41; } - if (abcd[2] & (1<<20)) { features |= SkCpu::SSE42; } - - if ((abcd[2] & (3<<26)) == (3<<26) && (xgetbv(0) & 6) == 6) { // XSAVE + OSXSAVE - if (abcd[2] & (1<<28)) { features |= SkCpu:: AVX; } - if (abcd[2] & (1<<29)) { features |= SkCpu::F16C; } - if (abcd[2] & (1<<12)) { 
features |= SkCpu:: FMA; } - - cpuid7(abcd); - if (abcd[1] & (1<<5)) { features |= SkCpu::AVX2; } - } - return features; - } - -#elif defined(SK_CPU_ARM32) && \ - defined(SK_BUILD_FOR_ANDROID) && \ - !defined(SK_BUILD_FOR_ANDROID_FRAMEWORK) - #include <cpu-features.h> - - static uint32_t read_cpu_features() { - uint32_t features = 0; - - uint64_t android_features = android_getCpuFeatures(); - if (android_features & ANDROID_CPU_ARM_FEATURE_NEON ) { features |= SkCpu::NEON ; } - if (android_features & ANDROID_CPU_ARM_FEATURE_NEON_FMA) { features |= SkCpu::NEON_FMA; } - if (android_features & ANDROID_CPU_ARM_FEATURE_VFP_FP16) { features |= SkCpu::VFP_FP16; } - return features; - } - -#else - static uint32_t read_cpu_features() { - return 0; - } - -#endif - -#if defined(__GNUC__) || defined(__clang__) - SK_DECLARE_STATIC_ONCE_PTR(uint32_t, gCachedCpuFeatures); - uint32_t SkCpu::RuntimeCpuFeatures() { - return *gCachedCpuFeatures.get([]{ return new uint32_t{read_cpu_features()}; }); - } - -#else - const uint32_t SkCpu::gCachedCpuFeatures = read_cpu_features(); - -#endif diff --git a/src/core/SkCpu.h b/src/core/SkCpu.h deleted file mode 100644 index 2a41d37b16..0000000000 --- a/src/core/SkCpu.h +++ /dev/null @@ -1,123 +0,0 @@ -/* - * Copyright 2016 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can be - * found in the LICENSE file. 
- */ - -#ifndef SkCpu_DEFINED -#define SkCpu_DEFINED - -#include "SkTypes.h" - -struct SkCpu { - enum { - SSE1 = 1 << 0, - SSE2 = 1 << 1, - SSE3 = 1 << 2, - SSSE3 = 1 << 3, - SSE41 = 1 << 4, - SSE42 = 1 << 5, - AVX = 1 << 6, - F16C = 1 << 7, - FMA = 1 << 8, - AVX2 = 1 << 9, - }; - enum { - NEON = 1 << 0, - NEON_FMA = 1 << 1, - VFP_FP16 = 1 << 2, - }; - - static bool Supports(uint32_t); - -private: - // Consider a loop like this that expands 16-bit floats out to 32-bit, does math, and repacks: - // for (int i = 0; i < N; i++) { - // if (SkCpu::Supports(SkCpu::F16C)) { - // f32s = SkCpu::F16C_cvtph_ps(f16s); - // } else { - // f32s = some_slower_f16_to_f32_routine(f16s); - // } - // - // ... do some math with f32s ... - // - // if (SkCpu::Supports(SkCpu::F16C)) { - // f16s = SkCpu::F16C_cvtps_ph(f32s); - // } else { - // f16s = some_slower_f32_to_f16_routine(f32s); - // } - // } - // - // We would like SkCpu::Supports() to participate in common sub-expression elimination, - // so that it's called exactly 1 time, rather than N or 2N times. This is especially - // important when the if-else blocks you see above are really inline functions. - // - // The key to this is to make sure to implement RuntimeCpuFeatures() with the same - // capacity for common sub-expression elimination. - // - // __attribute__((const)) works perfectly when available. - // - // When it's not (MSVC), we fall back to a static initializer. - // (Static intializers would work fine everywhere, but Chrome really dislikes them.) - -#if defined(__GNUC__) || defined(__clang__) // i.e. 
GCC, Clang, or clang-cl - __attribute__((const)) - static uint32_t RuntimeCpuFeatures(); -#else - static const uint32_t gCachedCpuFeatures; - static uint32_t RuntimeCpuFeatures() { - return gCachedCpuFeatures; - } -#endif -}; - -inline bool SkCpu::Supports(uint32_t mask) { - uint32_t features = RuntimeCpuFeatures(); - - // If we mask in compile-time known lower limits, the compiler can completely - // drop many calls to RuntimeCpuFeatures(). -#if SK_CPU_X86 - #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE1 - features |= SSE1; - #endif - #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 - features |= SSE2; - #endif - #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE3 - features |= SSE3; - #endif - #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 - features |= SSSE3; - #endif - #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41 - features |= SSE41; - #endif - #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE42 - features |= SSE42; - #endif - #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX - features |= AVX; - #endif - // F16C goes here if we add SK_CPU_SSE_LEVEL_F16C - #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2 - features |= AVX2; - #endif - // FMA doesn't fit neatly into this total ordering. - // It's available on Haswell+ just like AVX2, but it's technically a different bit. - // TODO: circle back on this if we find ourselves limited by lack of compile-time FMA - -#else - #if defined(SK_ARM_HAS_NEON) - features |= NEON; - #endif - - #if defined(SK_CPU_ARM64) - features |= NEON|NEON_FMA|VFP_FP16; - #endif - -#endif - return (features & mask) == mask; -} - -#endif//SkCpu_DEFINED diff --git a/src/core/SkOpts.cpp b/src/core/SkOpts.cpp index d97a680846..570e329094 100644 --- a/src/core/SkOpts.cpp +++ b/src/core/SkOpts.cpp @@ -5,7 +5,6 @@ * found in the LICENSE file. 
*/ -#include "SkCpu.h" #include "SkHalf.h" #include "SkOnce.h" #include "SkOpts.h" @@ -33,6 +32,35 @@ namespace SK_OPTS_NS { } } +#if defined(SK_CPU_X86) && !defined(SK_BUILD_FOR_IOS) + #if defined(SK_BUILD_FOR_WIN32) + #include <intrin.h> + static void cpuid (uint32_t abcd[4]) { __cpuid ((int*)abcd, 1); } + static void cpuid7(uint32_t abcd[4]) { __cpuidex((int*)abcd, 7, 0); } + static uint64_t xgetbv(uint32_t xcr) { return _xgetbv(xcr); } + #else + #include <cpuid.h> + #if !defined(__cpuid_count) // Old Mac Clang doesn't have this defined. + #define __cpuid_count(eax, ecx, a, b, c, d) \ + __asm__("cpuid" : "=a"(a), "=b"(b), "=c"(c), "=d"(d) : "0"(eax), "2"(ecx)) + #endif + static void cpuid (uint32_t abcd[4]) { __get_cpuid(1, abcd+0, abcd+1, abcd+2, abcd+3); } + static void cpuid7(uint32_t abcd[4]) { + __cpuid_count(7, 0, abcd[0], abcd[1], abcd[2], abcd[3]); + } + static uint64_t xgetbv(uint32_t xcr) { + uint32_t eax, edx; + __asm__ __volatile__ ( "xgetbv" : "=a"(eax), "=d"(edx) : "c"(xcr)); + return (uint64_t)(edx) << 32 | eax; + } + #endif +#elif !defined(SK_ARM_HAS_NEON) && \ + defined(SK_CPU_ARM32) && \ + defined(SK_BUILD_FOR_ANDROID) && \ + !defined(SK_BUILD_FOR_ANDROID_FRAMEWORK) + #include <cpu-features.h> +#endif + namespace SkOpts { // Define default function pointer values here... @@ -84,16 +112,28 @@ namespace SkOpts { static void init() { // TODO: Chrome's not linking _sse* opts on iOS simulator builds. Bug or feature? 
#if defined(SK_CPU_X86) && !defined(SK_BUILD_FOR_IOS) - if (SkCpu::Supports(SkCpu::SSSE3)) { Init_ssse3(); } - if (SkCpu::Supports(SkCpu::SSE41)) { Init_sse41(); } - if (SkCpu::Supports(SkCpu::SSE42)) { Init_sse42(); } - if (SkCpu::Supports(SkCpu::AVX )) { Init_avx(); } - if (SkCpu::Supports(SkCpu::AVX2 )) { Init_avx2(); } - - #elif defined(SK_CPU_ARM32) && \ - defined(SK_BUILD_FOR_ANDROID) && \ - !defined(SK_BUILD_FOR_ANDROID_FRAMEWORK) - if (SkCpu::Supports(SkCpu::NEON)) { Init_neon(); } + uint32_t abcd[] = {0,0,0,0}; + cpuid(abcd); + if (abcd[2] & (1<< 9)) { Init_ssse3(); } + if (abcd[2] & (1<<19)) { Init_sse41(); } + if (abcd[2] & (1<<20)) { Init_sse42(); } + + // AVX detection's kind of a pain. This is cribbed from Chromium. + if ( ( abcd[2] & (7<<26)) == (7<<26) && // Check bits 26-28 of ecx are all set, + (xgetbv(0) & 6 ) == 6 ){ // and check the OS supports XSAVE. + Init_avx(); + + // AVX2 additionally needs bit 5 set on ebx after calling cpuid(7). + uint32_t abcd7[] = {0,0,0,0}; + cpuid7(abcd7); + if (abcd7[1] & (1<<5)) { Init_avx2(); } + } + + #elif !defined(SK_ARM_HAS_NEON) && \ + defined(SK_CPU_ARM32) && \ + defined(SK_BUILD_FOR_ANDROID) && \ + !defined(SK_BUILD_FOR_ANDROID_FRAMEWORK) + if (android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_NEON) { Init_neon(); } #endif } diff --git a/src/core/SkUtilsArm.cpp b/src/core/SkUtilsArm.cpp index c29938fdfc..bf98fed476 100644 --- a/src/core/SkUtilsArm.cpp +++ b/src/core/SkUtilsArm.cpp @@ -5,4 +5,141 @@ * found in the LICENSE file. */ -// This file no longer needs to exist, but it's still referenced by Chrome's GYP / GN builds. +#include "SkUtilsArm.h" + +#if SK_ARM_NEON_IS_DYNAMIC + +#include <unistd.h> +#include <fcntl.h> +#include <errno.h> +#include <string.h> +#include <pthread.h> + +#if defined(SK_BUILD_FOR_ANDROID) +# include <cpu-features.h> +#endif + +// A function used to determine at runtime if the target CPU supports +// the ARM NEON instruction set. This implementation is Linux-specific. 
+static bool sk_cpu_arm_check_neon(void) { + // If we fail any of the following, assume we don't have NEON instructions + // This allows us to return immediately in case of error. + bool result = false; + +// Use the Android NDK's cpu-features helper library to detect NEON at runtime. +// See http://crbug.com/164154 to see why this is needed in Chromium for Android. +#ifdef SK_BUILD_FOR_ANDROID + + result = (android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_NEON) != 0; + +#else // SK_BUILD_FOR_ANDROID + + // There is no user-accessible CPUID instruction on ARM that we can use. + // Instead, we must parse /proc/cpuinfo and look for the 'neon' feature. + // For example, here's a typical output (Nexus S running ICS 4.0.3): + /* + Processor : ARMv7 Processor rev 2 (v7l) + BogoMIPS : 994.65 + Features : swp half thumb fastmult vfp edsp thumbee neon vfpv3 + CPU implementer : 0x41 + CPU architecture: 7 + CPU variant : 0x2 + CPU part : 0xc08 + CPU revision : 2 + + Hardware : herring + Revision : 000b + Serial : 3833c77d6dc000ec + */ + char buffer[4096]; + + do { + // open /proc/cpuinfo + int fd = TEMP_FAILURE_RETRY(open("/proc/cpuinfo", O_RDONLY)); + if (fd < 0) { + SkDebugf("Could not open /proc/cpuinfo: %s\n", strerror(errno)); + break; + } + + // Read the file. To simplify our search, we're going to place two + // sentinel '\n' characters: one at the start of the buffer, and one at + // the end. This means we reserve the first and last buffer bytes. + buffer[0] = '\n'; + int size = TEMP_FAILURE_RETRY(read(fd, buffer+1, sizeof(buffer)-2)); + close(fd); + + if (size < 0) { // should not happen + SkDebugf("Could not read /proc/cpuinfo: %s\n", strerror(errno)); + break; + } + + SkDebugf("START /proc/cpuinfo:\n%.*s\nEND /proc/cpuinfo\n", + size, buffer+1); + + // Compute buffer limit, and place final sentinel + char* buffer_end = buffer + 1 + size; + buffer_end[0] = '\n'; + + // Now, find a line that starts with "Features", i.e. look for + // '\nFeatures ' in our buffer. 
+ const char features[] = "\nFeatures\t"; + const size_t features_len = sizeof(features)-1; + + char* line = (char*) memmem(buffer, buffer_end - buffer, + features, features_len); + if (line == nullptr) { // Weird, no Features line, bad kernel? + SkDebugf("Could not find a line starting with 'Features'" + "in /proc/cpuinfo ?\n"); + break; + } + + line += features_len; // Skip the "\nFeatures\t" prefix + + // Find the end of the current line + char* line_end = (char*) memchr(line, '\n', buffer_end - line); + if (line_end == nullptr) + line_end = buffer_end; + + // Now find an instance of 'neon' in the flags list. We want to + // ensure it's only 'neon' and not something fancy like 'noneon' + // so check that it follows a space. + const char neon[] = " neon"; + const size_t neon_len = sizeof(neon)-1; + const char* flag = (const char*) memmem(line, line_end - line, + neon, neon_len); + if (flag == nullptr) + break; + + // Ensure it is followed by a space or a newline. + if (flag[neon_len] != ' ' && flag[neon_len] != '\n') + break; + + // Fine, we support Arm NEON ! 
+ result = true; + + } while (0); + +#endif // SK_BUILD_FOR_ANDROID + + if (result) { + SkDEBUGF(("Device supports ARM NEON instructions!\n")); + } else { + SkDEBUGF(("Device does NOT support ARM NEON instructions!\n")); + } + return result; +} + +static pthread_once_t sOnce; +static bool sHasArmNeon; + +// called through pthread_once() +void sk_cpu_arm_probe_features(void) { + sHasArmNeon = sk_cpu_arm_check_neon(); +} + +bool sk_cpu_arm_has_neon(void) { + pthread_once(&sOnce, sk_cpu_arm_probe_features); + return sHasArmNeon; +} + +#endif // SK_ARM_NEON_IS_DYNAMIC diff --git a/src/core/SkUtilsArm.h b/src/core/SkUtilsArm.h index dde933bafa..317677115c 100644 --- a/src/core/SkUtilsArm.h +++ b/src/core/SkUtilsArm.h @@ -8,7 +8,6 @@ #ifndef SkUtilsArm_DEFINED #define SkUtilsArm_DEFINED -#include "SkCpu.h" #include "SkUtils.h" // Define SK_ARM_NEON_MODE to one of the following values @@ -38,13 +37,18 @@ // is ARMv7-A and supports Neon instructions. In DYNAMIC mode, this actually // probes the CPU at runtime (and caches the result). -static inline bool sk_cpu_arm_has_neon(void) { #if SK_ARM_NEON_IS_NONE +static inline bool sk_cpu_arm_has_neon(void) { return false; -#else - return SkCpu::Supports(SkCpu::NEON); -#endif } +#elif SK_ARM_NEON_IS_ALWAYS +static inline bool sk_cpu_arm_has_neon(void) { + return true; +} +#else // SK_ARM_NEON_IS_DYNAMIC + +extern bool sk_cpu_arm_has_neon(void) SK_PURE_FUNC; +#endif // Use SK_ARM_NEON_WRAP(symbol) to map 'symbol' to a NEON-specific symbol // when applicable. This will transform 'symbol' differently depending on |