diff options
author | agl@chromium.org <agl@chromium.org@2bbb7eff-a529-9590-31e7-b0007b416f81> | 2010-08-13 17:05:28 +0000 |
---|---|---|
committer | agl@chromium.org <agl@chromium.org@2bbb7eff-a529-9590-31e7-b0007b416f81> | 2010-08-13 17:05:28 +0000 |
commit | 21417a7b4b48297d5cfb58a31ceaa6d3a800329b (patch) | |
tree | 60c6ce1abb6f8c4875341cc27bc9741317d2435d | |
parent | 61a678a28d6ff9cd066d342e6641c40470ed822d (diff) |
Add versions of memset16() and memset32() in ARM assembly.
In benchmarks here on Cortex A9 processors, this code runs 25-30% faster
than the C equivalent.
Patch by: Steve McIntyre (ARM)
http://codereview.appspot.com/1973042
git-svn-id: http://skia.googlecode.com/svn/trunk@594 2bbb7eff-a529-9590-31e7-b0007b416f81
-rw-r--r-- | src/opts/memset.arm.S | 126 | ||||
-rw-r--r-- | src/opts/opts_check_arm.cpp | 31 |
2 files changed, 157 insertions, 0 deletions
diff --git a/src/opts/memset.arm.S b/src/opts/memset.arm.S
new file mode 100644
index 0000000000..04a7027e23
--- /dev/null
+++ b/src/opts/memset.arm.S
@@ -0,0 +1,126 @@
+/*
+ * Copyright (C) 2010 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Changes:
+ * 2010-08-11 Steve McIntyre <steve.mcintyre@arm.com>
+ *    Added small changes to the two functions to make them work on the
+ *    specified number of 16- or 32-bit values rather than the original
+ *    code which was specified as a count of bytes. More verbose comments
+ *    to aid future maintenance.
+ */
+
+    .text
+    .align
+
+    .global arm_memset32
+    .type   arm_memset32, %function
+    .global arm_memset16
+    .type   arm_memset16, %function
+
+/*
+ * Optimized memset functions for ARM.
+ *
+ * void arm_memset16(uint16_t* dst, uint16_t value, int count);
+ * void arm_memset32(uint32_t* dst, uint32_t value, int count);
+ *
+ * In:    r0 = dst, r1 = value, r2 = count (in elements, not bytes)
+ * Clobb: r0-r3, ip, flags (lr is saved on entry and restored on return)
+ *
+ * A count of zero or less stores nothing.
+ */
+arm_memset16:
+        .fnstart
+        push        {lr}
+
+        /* Nothing to store for a zero or negative count. Checking first
+         * also guarantees the byte count cannot go negative below. */
+        cmp         r2, #0
+        ble         .Lfinish
+
+        /* Multiply count by 2 - go from the number of 16-bit shorts
+         * to the number of bytes desired. This must happen before the
+         * alignment fix-up so that r2 is already in bytes there. */
+        mov         r2, r2, lsl #1
+
+        /* expand the data to 32 bits */
+        orr         r1, r1, r1, lsl #16
+
+        /* align to 32 bits: store one leading halfword and take its
+         * two bytes off the remaining byte count */
+        tst         r0, #2
+        strneh      r1, [r0], #2
+        subne       r2, r2, #2
+
+        /* Now jump into the main loop below. */
+        b           .Lwork_32
+        .fnend
+
+arm_memset32:
+        .fnstart
+        push        {lr}
+
+        /* Nothing to store for a zero or negative count. */
+        cmp         r2, #0
+        ble         .Lfinish
+
+        /* Multiply count by 4 - go from the number of 32-bit words to
+         * the number of bytes desired. */
+        mov         r2, r2, lsl #2
+
+.Lwork_32:
+        /* Set up registers ready for writing them out. */
+        mov         ip, r1
+        mov         lr, r1
+
+        /* Try to align the destination to a cache line. Assume 32
+         * byte (8 word) cache lines, it's the common case. */
+        rsb         r3, r0, #0
+        ands        r3, r3, #0x1C
+        beq         .Laligned32
+        cmp         r3, r2
+        andhi       r3, r2, #0x1C
+        sub         r2, r2, r3
+
+        /* (Optionally) write any unaligned leading bytes.
+         * (0-28 bytes, length in r3) */
+        movs        r3, r3, lsl #28
+        stmcsia     r0!, {r1, lr}
+        stmcsia     r0!, {r1, lr}
+        stmmiia     r0!, {r1, lr}
+        movs        r3, r3, lsl #2
+        strcs       r1, [r0], #4
+
+        /* Now quickly loop through the cache-aligned data. */
+.Laligned32:
+        mov         r3, r1
+1:      subs        r2, r2, #32
+        stmhsia     r0!, {r1,r3,ip,lr}
+        stmhsia     r0!, {r1,r3,ip,lr}
+        bhs         1b
+        add         r2, r2, #32
+
+        /* (Optionally) store any remaining trailing bytes.
+         * (0-30 bytes, length in r2) */
+        movs        r2, r2, lsl #28
+        stmcsia     r0!, {r1,r3,ip,lr}
+        stmmiia     r0!, {r1,lr}
+        movs        r2, r2, lsl #2
+        strcs       r1, [r0], #4
+        strmih      lr, [r0], #2
+
+.Lfinish:
+        pop         {pc}
+        .fnend
diff --git a/src/opts/opts_check_arm.cpp b/src/opts/opts_check_arm.cpp
new file mode 100644
index 0000000000..079e80f272
--- /dev/null
+++ b/src/opts/opts_check_arm.cpp
@@ -0,0 +1,31 @@
+/*
+ **
+ ** Copyright 2006-2010, The Android Open Source Project
+ **
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ **
+ **     http://www.apache.org/licenses/LICENSE-2.0
+ **
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ */
+
+#include "SkUtils.h"
+
+extern "C" {
+    void arm_memset16(uint16_t* dst, uint16_t value, int count);
+    void arm_memset32(uint32_t* dst, uint32_t value, int count);
+}
+
+SkMemset16Proc SkMemset16GetPlatformProc() {
+    return arm_memset16;
+}
+
+SkMemset32Proc SkMemset32GetPlatformProc() {
+    return arm_memset32;
+}