/* Source: src/opts/memset32_neon.S (blob 9052c4f7d7b685cd559e3825538436e7fdb4393b) */
/***************************************************************************
 Copyright (c) 2009,2010, Code Aurora Forum. All rights reserved.

 Licensed under the Apache License, Version 2.0 (the "License"); you
 may not use this file except in compliance with the License.  You may
 obtain a copy of the License at

 http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 implied.  See the License for the specific language governing
 permissions and limitations under the License.
 ***************************************************************************/

	.code 32
	.fpu neon
	.align 4
	.globl	memset32_neon
	.func	memset32_neon

	/*
	 * memset32_neon: store a 32-bit value into consecutive words.
	 *
	 * In:    r0 = buffer (destination pointer)
	 *        r1 = value  (32-bit pattern to store)
	 *        r2 = times to write (count of 32-bit words, treated as a
	 *             signed value: the size dispatch below uses signed
	 *             branches, and a count <= 0 stores nothing)
	 * Out:   nothing; r0 is NOT preserved (it is advanced past the
	 *        words written by the post-incrementing stores).
	 * Clobb: r0, r2, r12, q0, q1, flags.
	 *
	 * Presumably called from C as
	 *   void memset32_neon(uint32_t *dst, uint32_t value, int count);
	 * TODO confirm against the prototype used by the caller.
	 *
	 * NOTE(review): no ".type memset32_neon, %function" is emitted for
	 * this symbol; confirm that any Thumb-state caller reaches it through
	 * an interworking-safe call (for example via a function pointer).
	 *
	 * Strategy:
	 *   count == 1      single str
	 *   count 2..4      simple word loop
	 *   count 5..15     computed jump into a ladder of str instructions
	 *   count >= 16     32-byte NEON stores in chunks of 128, 64, 32 and
	 *                   16 words, then the str ladder for the 0..15 tail
	 */
memset32_neon:
	/* Fast path: exactly one word. */
	cmp		r2, #1
	streq		r1, [r0], #4
	bxeq		lr

	cmp		r2, #4
	bgt		memset32_neon_start
	/*
	 * count <= 4 and count != 1 here.  Return for zero AND for negative
	 * counts: a negative count entering the loop below would only stop
	 * after the counter wrapped all the way round to zero.
	 */
	cmp		r2, #0
	bxle		lr
memset32_neon_small:
	/* count is 2..4: one word per iteration. */
	str		r1, [r0], #4
	subs		r2, r2, #1
	bne		memset32_neon_small
	bx		lr
memset32_neon_start:
	/* count >= 5.  Fewer than 16 words: go straight to the str ladder. */
	cmp		r2, #16
	blt		memset32_dropthru
	/* Replicate the value across q0 and q1: one vst1 of {q0, q1} = 8 words. */
	vdup.32		q0, r1
	vmov		q1, q0
	/* Enter the chunk chain at the largest chunk size that fits. */
	cmp		r2, #32
	blt		memset32_16
	cmp		r2, #64
	blt		memset32_32
	cmp		r2, #128
	blt		memset32_64
memset32_128:
	/* r12 = number of whole 128-word chunks (non-zero, since count >= 128). */
	movs		r12, r2, lsr #7
memset32_loop128:
	/*
	 * The flags set by this subs are consumed by the bne at the bottom;
	 * the vst1 instructions in between do not modify the flags.
	 * 16 stores x 32 bytes = 512 bytes = 128 words per iteration.
	 */
	subs		r12, r12, #1
	vst1.64		{q0, q1}, [r0]!
	vst1.64		{q0, q1}, [r0]!
	vst1.64		{q0, q1}, [r0]!
	vst1.64		{q0, q1}, [r0]!
	vst1.64		{q0, q1}, [r0]!
	vst1.64		{q0, q1}, [r0]!
	vst1.64		{q0, q1}, [r0]!
	vst1.64		{q0, q1}, [r0]!
	vst1.64		{q0, q1}, [r0]!
	vst1.64		{q0, q1}, [r0]!
	vst1.64		{q0, q1}, [r0]!
	vst1.64		{q0, q1}, [r0]!
	vst1.64		{q0, q1}, [r0]!
	vst1.64		{q0, q1}, [r0]!
	vst1.64		{q0, q1}, [r0]!
	vst1.64		{q0, q1}, [r0]!
	bne		memset32_loop128
	/* Keep the remainder (count mod 128); finished if it is zero. */
	ands		r2, r2, #0x7f
	bxeq		lr
memset32_64:
	/* r2 < 128 here: at most one 64-word chunk (8 stores x 8 words). */
	movs		r12, r2, lsr #6
	beq		memset32_32
	vst1.64		{q0, q1}, [r0]!
	vst1.64		{q0, q1}, [r0]!
	vst1.64		{q0, q1}, [r0]!
	vst1.64		{q0, q1}, [r0]!
	vst1.64		{q0, q1}, [r0]!
	vst1.64		{q0, q1}, [r0]!
	vst1.64		{q0, q1}, [r0]!
	vst1.64		{q0, q1}, [r0]!
	ands		r2, r2, #0x3f
	bxeq		lr
memset32_32:
	/* r2 < 64 here: at most one 32-word chunk (4 stores x 8 words). */
	movs		r12, r2, lsr #5
	beq		memset32_16
	vst1.64		{q0, q1}, [r0]!
	vst1.64		{q0, q1}, [r0]!
	vst1.64		{q0, q1}, [r0]!
	vst1.64		{q0, q1}, [r0]!
	ands		r2, r2, #0x1f
	bxeq		lr
memset32_16:
	/* r2 < 32 here: at most one 16-word chunk (2 stores x 8 words). */
	movs		r12, r2, lsr #4
	beq		memset32_dropthru
	and		r2, r2, #0xf
	vst1.64		{q0, q1}, [r0]!
	vst1.64		{q0, q1}, [r0]!
memset32_dropthru:
	/*
	 * Tail of 0..15 words, written by jumping into the str ladder below.
	 * In ARM state pc reads as the address of the current instruction
	 * plus 8, i.e. the address of the first str (the nop fills the slot
	 * in between).  Skipping (15 - r2) instructions leaves exactly r2
	 * stores to execute; with r2 == 0 the jump lands on the bx lr.
	 * Do not insert or remove instructions between the add and the bx.
	 */
	rsb		r2, r2, #15
	add		pc, pc, r2, lsl #2
	nop
	str		r1, [r0, #56]
	str		r1, [r0, #52]
	str		r1, [r0, #48]
	str		r1, [r0, #44]
	str		r1, [r0, #40]
	str		r1, [r0, #36]
	str		r1, [r0, #32]
	str		r1, [r0, #28]
	str		r1, [r0, #24]
	str		r1, [r0, #20]
	str		r1, [r0, #16]
	str		r1, [r0, #12]
	str		r1, [r0, #8]
	str		r1, [r0, #4]
	str		r1, [r0, #0]
	bx		lr

	.endfunc
	.end