diff options
Diffstat (limited to 'src/opts/memset32_neon.S')
-rw-r--r-- | src/opts/memset32_neon.S | 122 |
1 files changed, 122 insertions, 0 deletions
/* File: src/opts/memset32_neon.S */
/***************************************************************************
 Copyright (c) 2009,2010, Code Aurora Forum. All rights reserved.

 Licensed under the Apache License, Version 2.0 (the "License"); you
 may not use this file except in compliance with the License. You may
 obtain a copy of the License at

 http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 implied. See the License for the specific language governing
 permissions and limitations under the License.
 ***************************************************************************/

/*
 * memset32_neon -- store a 32-bit value into consecutive words.
 *
 * Syntax/target: GNU as, ARM (A32) state, NEON.
 *
 * In:
 *   r0 = buffer (destination of the first word)
 *   r1 = value  (32-bit pattern stored into every word)
 *   r2 = times to write (word count, treated as a signed integer;
 *        a count <= 0 returns without writing anything)
 * Out:
 *   nothing is returned in a register.
 * Clobbers:
 *   r0, r2, r12, q0, q1, condition flags.
 *
 * NOTE(review): presumably called from C as
 *   void memset32_neon(uint32_t *dst, uint32_t value, int count);
 * confirm against the caller's prototype.
 *
 * Strategy:
 *   count <= 0   return
 *   count == 1   single str
 *   count 2..4   word-at-a-time loop
 *   count 5..15  computed jump into a run of 15 str instructions
 *   count >= 16  NEON stores of 128/64/32/16 words, then the same
 *                computed jump for the final 0..15 words
 */

        .code 32
        .fpu neon
        .align 4
        .globl memset32_neon
        .func

        /* r0 = buffer, r1 = value, r2 = times to write */
memset32_neon:
        cmp         r2, #1
        bxlt        lr                      /* count <= 0: nothing to write; without
                                               this a negative count would reach the
                                               word loop below and run until r2
                                               wrapped around to zero */
        streq       r1, [r0], #4            /* count == 1: one store and done */
        bxeq        lr

        cmp         r2, #4
        bgt         memset32_neon_start

        /* 2 <= count <= 4: one word per iteration. */
memset32_neon_small:
        str         r1, [r0], #4
        subs        r2, r2, #1
        bne         memset32_neon_small
        bx          lr

        /* count >= 5 */
memset32_neon_start:
        cmp         r2, #16
        blt         memset32_dropthru       /* 5..15 words: scalar tail only */

        /* q0 and q1 each hold four copies of the value, so one
           vst1.64 {q0, q1} writes 8 words (32 bytes). */
        vdup.32     q0, r1
        vmov        q1, q0

        /* Enter the cascade at the largest chunk size that fits. */
        cmp         r2, #32
        blt         memset32_16
        cmp         r2, #64
        blt         memset32_32
        cmp         r2, #128
        blt         memset32_64

        /* count >= 128: r12 = number of whole 128-word chunks (>= 1). */
memset32_128:
        movs        r12, r2, lsr #7
memset32_loop128:
        subs        r12, r12, #1            /* flags consumed by bne below; the
                                               NEON stores leave them untouched */
        .rept       16                      /* 16 stores x 8 words = 128 words */
        vst1.64     {q0, q1}, [r0]!
        .endr
        bne         memset32_loop128
        ands        r2, r2, #0x7f           /* r2 = words still to write (0..127) */
        bxeq        lr

        /* Here r2 < 128. Write one 64-word chunk if bit 6 is set. */
memset32_64:
        movs        r12, r2, lsr #6
        beq         memset32_32
        .rept       8                       /* 8 stores x 8 words = 64 words */
        vst1.64     {q0, q1}, [r0]!
        .endr
        ands        r2, r2, #0x3f
        bxeq        lr

        /* Here r2 < 64. Write one 32-word chunk if bit 5 is set. */
memset32_32:
        movs        r12, r2, lsr #5
        beq         memset32_16
        .rept       4                       /* 4 stores x 8 words = 32 words */
        vst1.64     {q0, q1}, [r0]!
        .endr
        ands        r2, r2, #0x1f
        bxeq        lr

        /* Here r2 < 32. Write one 16-word chunk if bit 4 is set. */
memset32_16:
        movs        r12, r2, lsr #4
        beq         memset32_dropthru
        and         r2, r2, #0xf            /* a zero remainder is handled by the
                                               computed jump below */
        vst1.64     {q0, q1}, [r0]!
        vst1.64     {q0, q1}, [r0]!

        /*
         * Tail: 0 <= r2 <= 15 words remain.
         * In ARM state pc reads as the address of the current instruction
         * plus 8, which is the first str below (the nop occupies the slot
         * at +4). Adding (15 - r2) * 4 skips the stores that are not
         * needed, so exactly r2 stores execute; r2 == 0 lands on bx lr.
         * The nop and the 15 str instructions must stay contiguous and in
         * this order for the jump arithmetic to hold.
         */
memset32_dropthru:
        rsb         r2, r2, #15
        add         pc, pc, r2, lsl #2
        nop
        str         r1, [r0, #56]
        str         r1, [r0, #52]
        str         r1, [r0, #48]
        str         r1, [r0, #44]
        str         r1, [r0, #40]
        str         r1, [r0, #36]
        str         r1, [r0, #32]
        str         r1, [r0, #28]
        str         r1, [r0, #24]
        str         r1, [r0, #20]
        str         r1, [r0, #16]
        str         r1, [r0, #12]
        str         r1, [r0, #8]
        str         r1, [r0, #4]
        str         r1, [r0, #0]
        bx          lr

        .endfunc
        .end