aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
author: agl@chromium.org <agl@chromium.org@2bbb7eff-a529-9590-31e7-b0007b416f81> 2010-08-13 17:05:28 +0000
committer: agl@chromium.org <agl@chromium.org@2bbb7eff-a529-9590-31e7-b0007b416f81> 2010-08-13 17:05:28 +0000
commit 21417a7b4b48297d5cfb58a31ceaa6d3a800329b (patch)
tree 60c6ce1abb6f8c4875341cc27bc9741317d2435d
parent 61a678a28d6ff9cd066d342e6641c40470ed822d (diff)
Add versions of memset16() and memset32() in ARM assembly.
In benchmarks here on Cortex A9 processors, this code runs 25-30% faster than the C equivalent. Patch by: Steve McIntyre (ARM) http://codereview.appspot.com/1973042 git-svn-id: http://skia.googlecode.com/svn/trunk@594 2bbb7eff-a529-9590-31e7-b0007b416f81
-rw-r--r-- src/opts/memset.arm.S 110
-rw-r--r-- src/opts/opts_check_arm.cpp 31
2 files changed, 141 insertions, 0 deletions
diff --git a/src/opts/memset.arm.S b/src/opts/memset.arm.S
new file mode 100644
index 0000000000..04a7027e23
--- /dev/null
+++ b/src/opts/memset.arm.S
@@ -0,0 +1,110 @@
+/*
+ * Copyright (C) 2010 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Changes:
+ * 2010-08-11 Steve McIntyre <steve.mcintyre@arm.com>
+ * Added small changes to the two functions to make them work on the
+ * specified number of 16- or 32-bit values rather than the original
+ * code which was specified as a count of bytes. More verbose comments
+ * to aid future maintenance.
+ */
+
+ .text /* everything below is emitted into the code section */
+ .align
+
+ .global arm_memset32 /* exported: declared extern "C" on the C++ side */
+ .type arm_memset32, %function
+ .global arm_memset16 /* exported: declared extern "C" on the C++ side */
+ .type arm_memset16, %function
+
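+/*
+ * Optimized memset functions for ARM.
+ *
+ * void arm_memset16(uint16_t* dst, uint16_t value, int count);
+ * void arm_memset32(uint32_t* dst, uint32_t value, int count);
+ *
+ * In both functions count is the number of 16- or 32-bit elements to
+ * store, not a number of bytes.
+ */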
+/*
+ * void arm_memset16(uint16_t* dst, uint16_t value, int count);
+ *
+ * Stores count copies of the 16-bit value starting at dst.
+ * In: r0 = dst, r1 = value, r2 = count (16-bit elements, not bytes).
+ * A count that is zero or negative stores nothing.
+ * Clobbers r0-r3, ip and the flags; lr is saved on the stack and
+ * restored by the shared exit path at the end of arm_memset32.
+ */
+arm_memset16:
+ .fnstart
+ /* Nothing to do for an empty (or negative) count. Without this
+ * check the alignment store below could write one element into
+ * an empty buffer and the remaining count would wrap around. */
+ cmp r2, #0
+ bxle lr
+
+ push {lr}
+
+ /* expand the data to 32 bits: copy the low halfword into the
+ * high halfword so word stores write two elements at a time.
+ * (Assumes bits 31:16 of r1 are zero on entry - TODO confirm
+ * that all callers pass a zero-extended value.) */
+ orr r1, r1, r1, lsl #16
+
+ /* align to 32 bits: if dst is only halfword aligned, store one
+ * element now so that the word stores that follow are aligned.
+ * r2 still counts elements at this point, so exactly one element
+ * (not two) is consumed. */
+ tst r0, #2
+ strneh r1, [r0], #2
+ subne r2, r2, #1
+
+ /* Multiply count by 2 - go from the number of 16-bit shorts
+ * to the number of bytes desired. */
+ mov r2, r2, lsl #1
+
+ /* Now jump into the main loop below, with r0 = destination,
+ * r1 = 32-bit fill pattern, r2 = bytes left and lr pushed. */
+ b .Lwork_32
+ .fnend
+
+arm_memset32: /* In: r0 = dst, r1 = 32-bit value, r2 = count of 32-bit words */
+ .fnstart
+ push {lr} /* lr is reused below as a fill register; the final pop returns through it */
+
+ /* Multiply count by 4 - go from the number of 32-bit words to
+ * the number of bytes desired. From here on r2 is a byte count.
+ * NOTE(review): count is not validated here; a negative count
+ * would be treated as a very large unsigned byte count by the
+ * unsigned comparisons below. */
+ mov r2, r2, lsl #2
+
+.Lwork_32: /* shared entry: arm_memset16 branches here with r0 = dst, r1 = fill word, r2 = bytes, lr pushed */
+ /* Set up registers ready for writing them out: ip and lr (and
+ * later r3) hold copies of the fill word so that one
+ * store-multiple can write up to four words. */
+ mov ip, r1
+ mov lr, r1
+
+ /* Try to align the destination to a cache line. Assume 32
+ * byte (8 word) cache lines, it's the common case.
+ * r3 = (-dst) & 0x1C is the number of bytes, in whole words,
+ * up to the next 32-byte boundary. If that is more than the
+ * bytes left in r2, it is replaced by r2 rounded down to a
+ * whole number of words. r2 is then reduced by r3. */
+ rsb r3, r0, #0
+ ands r3, r3, #0x1C
+ beq .Laligned32 /* bits 4:2 of dst are zero: no leading words needed */
+ cmp r3, r2
+ andhi r3, r2, #0x1C /* unsigned r3 > r2: only write the whole words that exist */
+ sub r2, r2, r3
+
+ /* (Optionally) write any unaligned leading bytes.
+ * (0-28 bytes, length in r3)
+ * The shift by 28 moves bit 4 of r3 into the C flag and bit 3
+ * into the N flag, selecting the 16-byte and 8-byte stores; the
+ * further shift by 2 moves bit 2 into C for the 4-byte store. */
+ movs r3, r3, lsl #28
+ stmcsia r0!, {r1, lr} /* C set: two 8-byte stores = 16 bytes */
+ stmcsia r0!, {r1, lr}
+ stmmiia r0!, {r1, lr} /* N set: 8 bytes */
+ movs r3, r3, lsl #2
+ strcs r1, [r0], #4 /* C set: 4 bytes */
+
+ /* Now quickly loop through the cache-aligned data, writing
+ * 32 bytes per iteration with two 4-register stores. The loop
+ * runs while at least 32 bytes remain (unsigned compare). */
+.Laligned32:
+ mov r3, r1 /* r3 is free again: fourth copy of the fill word */
+1: subs r2, r2, #32
+ stmhsia r0!, {r1,r3,ip,lr}
+ stmhsia r0!, {r1,r3,ip,lr}
+ bhs 1b
+ add r2, r2, #32 /* undo the last subtraction: r2 = bytes still to write (< 32) */
+
+ /* (Optionally) store any remaining trailing bytes.
+ * (0-30 bytes, length in r2)
+ * After the shift by 28, C = bit 4 and N = bit 3 of r2 select
+ * the 16-byte and 8-byte stores; after the further shift by 2,
+ * C = bit 2 and N = bit 1 select the 4-byte and 2-byte stores. */
+ movs r2, r2, lsl #28
+ stmcsia r0!, {r1,r3,ip,lr}
+ stmmiia r0!, {r1,lr}
+ movs r2, r2, lsl #2
+ strcs r1, [r0], #4
+ strmih lr, [r0], #2 /* trailing halfword: only possible when entered from arm_memset16 */
+
+ pop {pc} /* return by popping the saved lr straight into pc */
+ .fnend
diff --git a/src/opts/opts_check_arm.cpp b/src/opts/opts_check_arm.cpp
new file mode 100644
index 0000000000..079e80f272
--- /dev/null
+++ b/src/opts/opts_check_arm.cpp
@@ -0,0 +1,31 @@
+/*
+ **
+ ** Copyright 2006-2010, The Android Open Source Project
+ **
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ **
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ **
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ */
+
+#include "SkUtils.h"
+
+// Fill routines implemented in ARM assembly (memset.arm.S). They are
+// declared with C linkage so the unmangled assembly symbols resolve.
+// count is a number of elements, not bytes.
+extern "C" void arm_memset16(uint16_t* dst, uint16_t value, int count);
+extern "C" void arm_memset32(uint32_t* dst, uint32_t value, int count);
+
+// Returns the ARM assembly routine as the platform's 16-bit memset.
+SkMemset16Proc SkMemset16GetPlatformProc() {
+ SkMemset16Proc armProc = arm_memset16;
+ return armProc;
+}
+
+// Returns the ARM assembly routine as the platform's 32-bit memset.
+SkMemset32Proc SkMemset32GetPlatformProc() {
+ SkMemset32Proc armProc = arm_memset32;
+ return armProc;
+}