diff options
author | 2015-04-10 06:24:58 -0700 | |
---|---|---|
committer | 2015-04-10 06:24:58 -0700 | |
commit | c5e0891029bed0f9619d67281a81f13983a9687b (patch) | |
tree | 400fad28de7f4d3f6f4c330016b84e5f3b663141 /src/opts/SkUtils_opts_arm_neon.cpp | |
parent | d0b5c33fda86a5fe91007b7875a28e8fa8ff70c8 (diff) |
Replace NEON assembly memset16 and memset32 with intrinsic versions.
According to bench/MemsetBench.cpp, I've got them somewhere between 10% slower
and a percent or two faster than the old assembly.
BUG=skia:
CQ_EXTRA_TRYBOTS=client.skia.android:Test-Android-GCC-Nexus5-CPU-NEON-Arm7-Debug-Trybot
Review URL: https://codereview.chromium.org/1075003002
Diffstat (limited to 'src/opts/SkUtils_opts_arm_neon.cpp')
-rw-r--r-- | src/opts/SkUtils_opts_arm_neon.cpp | 66 |
1 files changed, 66 insertions, 0 deletions
diff --git a/src/opts/SkUtils_opts_arm_neon.cpp b/src/opts/SkUtils_opts_arm_neon.cpp new file mode 100644 index 0000000000..b7d0504617 --- /dev/null +++ b/src/opts/SkUtils_opts_arm_neon.cpp @@ -0,0 +1,66 @@ +/* + * Copyright 2015 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can be + * found in the LICENSE file. + */ + +#include "SkTypes.h" +#include <arm_neon.h> + +void sk_memset32_neon(uint32_t dst[], uint32_t value, int count) { + uint32x4_t v4 = vdupq_n_u32(value); + uint32x4x4_t v16 = { v4, v4, v4, v4 }; + + while (count >= 16) { + vst4q_u32(dst, v16); // This swizzles, but we don't care: all lanes are the same, value. + dst += 16; + count -= 16; + } + SkASSERT(count < 16); + switch (count / 4) { + case 3: vst1q_u32(dst, v4); dst += 4; count -= 4; + case 2: vst1q_u32(dst, v4); dst += 4; count -= 4; + case 1: vst1q_u32(dst, v4); dst += 4; count -= 4; + } + SkASSERT(count < 4); + if (count >= 2) { + vst1_u32(dst, vget_low_u32(v4)); + dst += 2; + count -= 2; + } + SkASSERT(count < 2); + if (count > 0) { + *dst = value; + } +} + +void sk_memset16_neon(uint16_t dst[], uint16_t value, int count) { + uint16x8_t v8 = vdupq_n_u16(value); + uint16x8x4_t v32 = { v8, v8, v8, v8 }; + + while (count >= 32) { + vst4q_u16(dst, v32); // This swizzles, but we don't care: all lanes are the same, value. + dst += 32; + count -= 32; + } + SkASSERT(count < 32); + switch (count / 8) { + case 3: vst1q_u16(dst, v8); dst += 8; count -= 8; + case 2: vst1q_u16(dst, v8); dst += 8; count -= 8; + case 1: vst1q_u16(dst, v8); dst += 8; count -= 8; + } + SkASSERT(count < 8); + if (count >= 4) { + vst1_u16(dst, vget_low_u16(v8)); + dst += 4; + count -= 4; + } + SkASSERT(count < 4); + switch (count) { + case 3: *dst++ = value; + case 2: *dst++ = value; + case 1: *dst = value; + } +} + |