Replace NEON assembly memset16 and memset32 with intrinsic versions.

According to bench/MemsetBench.cpp, I've got them somewhere between 10% slower and a percent or two faster than the old assembly. BUG=skia: CQ_EXTRA_TRYBOTS=client.skia.android:Test-Android-GCC-Nexus5-CPU-NEON-Arm7-Debug-Trybot Review URL: https://codereview.chromium.org/1075003002
author: mtklein <mtklein@chromium.org> 2015-04-10 06:24:58 -0700
committer: Commit bot <commit-bot@chromium.org> 2015-04-10 06:24:58 -0700
commit: c5e0891029bed0f9619d67281a81f13983a9687b (patch)
tree: 400fad28de7f4d3f6f4c330016b84e5f3b663141 /src/opts/SkUtils_opts_arm_neon.cpp
parent: d0b5c33fda86a5fe91007b7875a28e8fa8ff70c8 (diff)
1 files changed, 66 insertions, 0 deletions
diff --git a/src/opts/SkUtils_opts_arm_neon.cpp b/src/opts/SkUtils_opts_arm_neon.cpp
new file mode 100644
index 0000000000..b7d0504617
--- /dev/null
+++ b/src/opts/SkUtils_opts_arm_neon.cpp
@@ -0,0 +1,66 @@
+/*
+ * Copyright 2015 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#include "SkTypes.h"
+#include <arm_neon.h>
+
+void sk_memset32_neon(uint32_t dst[], uint32_t value, int count) {
+    uint32x4_t   v4  = vdupq_n_u32(value);
+    uint32x4x4_t v16 = { v4, v4, v4, v4 };
+
+    while (count >= 16) {
+        vst4q_u32(dst, v16);  // This swizzles, but we don't care: all lanes are the same, value.
+        dst   += 16;
+        count -= 16;
+    }
+    SkASSERT(count < 16);
+    switch (count / 4) {
+        case 3: vst1q_u32(dst, v4); dst += 4; count -= 4;
+        case 2: vst1q_u32(dst, v4); dst += 4; count -= 4;
+        case 1: vst1q_u32(dst, v4); dst += 4; count -= 4;
+    }
+    SkASSERT(count < 4);
+    if (count >= 2) {
+        vst1_u32(dst, vget_low_u32(v4));
+        dst   += 2;
+        count -= 2;
+    }
+    SkASSERT(count < 2);
+    if (count > 0) {
+        *dst = value;
+    }
+}
+
+void sk_memset16_neon(uint16_t dst[], uint16_t value, int count) {
+    uint16x8_t   v8  = vdupq_n_u16(value);
+    uint16x8x4_t v32 = { v8, v8, v8, v8 };
+
+    while (count >= 32) {
+        vst4q_u16(dst, v32);  // This swizzles, but we don't care: all lanes are the same, value.
+        dst   += 32;
+        count -= 32;
+    }
+    SkASSERT(count < 32);
+    switch (count / 8) {
+        case 3: vst1q_u16(dst, v8); dst += 8; count -= 8;
+        case 2: vst1q_u16(dst, v8); dst += 8; count -= 8;
+        case 1: vst1q_u16(dst, v8); dst += 8; count -= 8;
+    }
+    SkASSERT(count < 8);
+    if (count >= 4) {
+        vst1_u16(dst, vget_low_u16(v8));
+        dst   += 4;
+        count -= 4;
+    }
+    SkASSERT(count < 4);
+    switch (count) {
+        case 3: *dst++ = value;
+        case 2: *dst++ = value;
+        case 1: *dst   = value;
+    }
+}
+
author	mtklein <mtklein@chromium.org>	2015-04-10 06:24:58 -0700
committer	Commit bot <commit-bot@chromium.org>	2015-04-10 06:24:58 -0700
commit	c5e0891029bed0f9619d67281a81f13983a9687b (patch)
tree	400fad28de7f4d3f6f4c330016b84e5f3b663141 /src/opts/SkUtils_opts_arm_neon.cpp
parent	d0b5c33fda86a5fe91007b7875a28e8fa8ff70c8 (diff)