1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
|
/***************************************************************************
Copyright (c) 2009,2010, Code Aurora Forum. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you
may not use this file except in compliance with the License. You may
obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied. See the License for the specific language governing
permissions and limitations under the License.
***************************************************************************/
.code 32
.fpu neon
.align 4
.globl memset32_neon
.func
/* r0 = buffer, r1 = value, r2 = times to write */
memset32_neon:
cmp r2, #1
streq r1, [r0], #4
bxeq lr
cmp r2, #4
bgt memset32_neon_start
cmp r2, #0
bxeq lr
memset32_neon_small:
str r1, [r0], #4
subs r2, r2, #1
bne memset32_neon_small
bx lr
memset32_neon_start:
cmp r2, #16
blt memset32_dropthru
vdup.32 q0, r1
vmov q1, q0
cmp r2, #32
blt memset32_16
cmp r2, #64
blt memset32_32
cmp r2, #128
blt memset32_64
memset32_128:
movs r12, r2, lsr #7
memset32_loop128:
subs r12, r12, #1
vst1.64 {q0, q1}, [r0]!
vst1.64 {q0, q1}, [r0]!
vst1.64 {q0, q1}, [r0]!
vst1.64 {q0, q1}, [r0]!
vst1.64 {q0, q1}, [r0]!
vst1.64 {q0, q1}, [r0]!
vst1.64 {q0, q1}, [r0]!
vst1.64 {q0, q1}, [r0]!
vst1.64 {q0, q1}, [r0]!
vst1.64 {q0, q1}, [r0]!
vst1.64 {q0, q1}, [r0]!
vst1.64 {q0, q1}, [r0]!
vst1.64 {q0, q1}, [r0]!
vst1.64 {q0, q1}, [r0]!
vst1.64 {q0, q1}, [r0]!
vst1.64 {q0, q1}, [r0]!
bne memset32_loop128
ands r2, r2, #0x7f
bxeq lr
memset32_64:
movs r12, r2, lsr #6
beq memset32_32
vst1.64 {q0, q1}, [r0]!
vst1.64 {q0, q1}, [r0]!
vst1.64 {q0, q1}, [r0]!
vst1.64 {q0, q1}, [r0]!
vst1.64 {q0, q1}, [r0]!
vst1.64 {q0, q1}, [r0]!
vst1.64 {q0, q1}, [r0]!
vst1.64 {q0, q1}, [r0]!
ands r2, r2, #0x3f
bxeq lr
memset32_32:
movs r12, r2, lsr #5
beq memset32_16
vst1.64 {q0, q1}, [r0]!
vst1.64 {q0, q1}, [r0]!
vst1.64 {q0, q1}, [r0]!
vst1.64 {q0, q1}, [r0]!
ands r2, r2, #0x1f
bxeq lr
memset32_16:
movs r12, r2, lsr #4
beq memset32_dropthru
and r2, r2, #0xf
vst1.64 {q0, q1}, [r0]!
vst1.64 {q0, q1}, [r0]!
memset32_dropthru:
rsb r2, r2, #15
add pc, pc, r2, lsl #2
nop
str r1, [r0, #56]
str r1, [r0, #52]
str r1, [r0, #48]
str r1, [r0, #44]
str r1, [r0, #40]
str r1, [r0, #36]
str r1, [r0, #32]
str r1, [r0, #28]
str r1, [r0, #24]
str r1, [r0, #20]
str r1, [r0, #16]
str r1, [r0, #12]
str r1, [r0, #8]
str r1, [r0, #4]
str r1, [r0, #0]
bx lr
.endfunc
.end
|