aboutsummaryrefslogtreecommitdiffhomepage
path: root/src/opts/memset32_neon.S
diff options
context:
space:
mode:
authorGravatar agl@chromium.org <agl@chromium.org@2bbb7eff-a529-9590-31e7-b0007b416f81>2010-06-04 14:47:38 +0000
committerGravatar agl@chromium.org <agl@chromium.org@2bbb7eff-a529-9590-31e7-b0007b416f81>2010-06-04 14:47:38 +0000
commitaab4090b57ded8ad9dce95f92ad678c9829ab8bc (patch)
tree445610dc459347f3108b08bf335560924b73ccae /src/opts/memset32_neon.S
parent268013bfa6d5fbc2c6bf1452b467862e2a779120 (diff)
Add Neon versions of memset32 and memset16
Patch by pgalizia (of codeaurora.org) (Note: I don't read ARM and I didn't manage to find a reviewer for the ARM assembly code so this is landing somewhat unreviewed.) http://codereview.appspot.com/1157045/show git-svn-id: http://skia.googlecode.com/svn/trunk@573 2bbb7eff-a529-9590-31e7-b0007b416f81
Diffstat (limited to 'src/opts/memset32_neon.S')
-rw-r--r--src/opts/memset32_neon.S122
1 files changed, 122 insertions, 0 deletions
diff --git a/src/opts/memset32_neon.S b/src/opts/memset32_neon.S
new file mode 100644
index 0000000000..9052c4f7d7
--- /dev/null
+++ b/src/opts/memset32_neon.S
@@ -0,0 +1,122 @@
+/***************************************************************************
+ Copyright (c) 2009,2010, Code Aurora Forum. All rights reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License"); you
+ may not use this file except in compliance with the License. You may
+ obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied. See the License for the specific language governing
+ permissions and limitations under the License.
+ ***************************************************************************/
+
+ .code 32
+ .fpu neon
+ .align 4
+ .globl memset32_neon
+ .func
+
+ /* r0 = buffer, r1 = value, r2 = times to write */
+memset32_neon:
+ cmp r2, #1
+ streq r1, [r0], #4
+ bxeq lr
+
+ cmp r2, #4
+ bgt memset32_neon_start
+ cmp r2, #0
+ bxeq lr
+memset32_neon_small:
+ str r1, [r0], #4
+ subs r2, r2, #1
+ bne memset32_neon_small
+ bx lr
+memset32_neon_start:
+ cmp r2, #16
+ blt memset32_dropthru
+ vdup.32 q0, r1
+ vmov q1, q0
+ cmp r2, #32
+ blt memset32_16
+ cmp r2, #64
+ blt memset32_32
+ cmp r2, #128
+ blt memset32_64
+memset32_128:
+ movs r12, r2, lsr #7
+memset32_loop128:
+ subs r12, r12, #1
+ vst1.64 {q0, q1}, [r0]!
+ vst1.64 {q0, q1}, [r0]!
+ vst1.64 {q0, q1}, [r0]!
+ vst1.64 {q0, q1}, [r0]!
+ vst1.64 {q0, q1}, [r0]!
+ vst1.64 {q0, q1}, [r0]!
+ vst1.64 {q0, q1}, [r0]!
+ vst1.64 {q0, q1}, [r0]!
+ vst1.64 {q0, q1}, [r0]!
+ vst1.64 {q0, q1}, [r0]!
+ vst1.64 {q0, q1}, [r0]!
+ vst1.64 {q0, q1}, [r0]!
+ vst1.64 {q0, q1}, [r0]!
+ vst1.64 {q0, q1}, [r0]!
+ vst1.64 {q0, q1}, [r0]!
+ vst1.64 {q0, q1}, [r0]!
+ bne memset32_loop128
+ ands r2, r2, #0x7f
+ bxeq lr
+memset32_64:
+ movs r12, r2, lsr #6
+ beq memset32_32
+ vst1.64 {q0, q1}, [r0]!
+ vst1.64 {q0, q1}, [r0]!
+ vst1.64 {q0, q1}, [r0]!
+ vst1.64 {q0, q1}, [r0]!
+ vst1.64 {q0, q1}, [r0]!
+ vst1.64 {q0, q1}, [r0]!
+ vst1.64 {q0, q1}, [r0]!
+ vst1.64 {q0, q1}, [r0]!
+ ands r2, r2, #0x3f
+ bxeq lr
+memset32_32:
+ movs r12, r2, lsr #5
+ beq memset32_16
+ vst1.64 {q0, q1}, [r0]!
+ vst1.64 {q0, q1}, [r0]!
+ vst1.64 {q0, q1}, [r0]!
+ vst1.64 {q0, q1}, [r0]!
+ ands r2, r2, #0x1f
+ bxeq lr
+memset32_16:
+ movs r12, r2, lsr #4
+ beq memset32_dropthru
+ and r2, r2, #0xf
+ vst1.64 {q0, q1}, [r0]!
+ vst1.64 {q0, q1}, [r0]!
+memset32_dropthru:
+ rsb r2, r2, #15
+ add pc, pc, r2, lsl #2
+ nop
+ str r1, [r0, #56]
+ str r1, [r0, #52]
+ str r1, [r0, #48]
+ str r1, [r0, #44]
+ str r1, [r0, #40]
+ str r1, [r0, #36]
+ str r1, [r0, #32]
+ str r1, [r0, #28]
+ str r1, [r0, #24]
+ str r1, [r0, #20]
+ str r1, [r0, #16]
+ str r1, [r0, #12]
+ str r1, [r0, #8]
+ str r1, [r0, #4]
+ str r1, [r0, #0]
+ bx lr
+
+ .endfunc
+ .end