/***************************************************************************
 * Copyright (c) 2009,2010, Code Aurora Forum. All rights reserved.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 ***************************************************************************/

/***************************************************************************
  Neon memset: Attempts to do a memset with Neon registers if possible,
     Inputs:
        s: The buffer to write to
        c: The integer data to write to the buffer
        n: The size_t count.
     Outputs:

***************************************************************************/

        .code 32
        .fpu neon
        .align 4
        .globl memset16_neon
        .func

memset16_neon:
        cmp             r2, #0
        bxeq            lr

        /* Keep in mind that r2 -- the count argument -- is for the
         * number of 16-bit items to copy.
         */
        lsl             r2, r2, #1

        push            {r0}

        /* If we have < 8 bytes, just do a quick loop to handle that */
        cmp             r2, #8
        bgt             memset_gt4
memset_smallcopy_loop:
        strh            r1, [r0], #2
        subs            r2, r2, #2
        bne             memset_smallcopy_loop
memset_smallcopy_done:
        pop             {r0}
        bx              lr

memset_gt4:
        /*
         * Duplicate the r1 lowest 16-bits across r1. The idea is to have
         * a register with two 16-bit-values we can copy. We do this by
         * duplicating lowest 16-bits of r1 to upper 16-bits.
         */
        orr             r1, r1, r1, lsl #16
        /*
         * If we're copying > 64 bytes, then we may want to get
         * onto a 16-byte boundary to improve speed even more.
         */
        cmp             r2, #64
        blt             memset_route
        ands            r12, r0, #0xf
        beq             memset_route
        /*
         * Determine the number of bytes to move forward to get to the 16-byte
         * boundary.  Note that this will be a multiple of 4, since we
         * already are word-aligned.
         */
        rsb             r12, r12, #16
        sub             r2, r2, r12
        lsls            r12, r12, #29
        strmi           r1, [r0], #4
        strcs           r1, [r0], #4
        strcs           r1, [r0], #4
        lsls            r12, r12, #2
        strcsh          r1, [r0], #2
memset_route:
        /*
         * Decide where to route for the maximum copy sizes.  Note that we
         * build q0 and q1 depending on if we'll need it, so that's
         * interwoven here as well.
         */
        vdup.u32        d0, r1
        cmp             r2, #16
        blt             memset_8
        vmov            d1, d0
        cmp             r2, #64
        blt             memset_16
        vmov            q1, q0
        cmp             r2, #128
        blt             memset_32
memset_128:
        mov             r12, r2, lsr #7
memset_128_loop:
        vst1.64         {q0, q1}, [r0]!
        vst1.64         {q0, q1}, [r0]!
        vst1.64         {q0, q1}, [r0]!
        vst1.64         {q0, q1}, [r0]!
        subs            r12, r12, #1
        bne             memset_128_loop
        ands            r2, r2, #0x7f
        beq             memset_end
memset_32:
        movs            r12, r2, lsr #5
        beq             memset_16
memset_32_loop:
        subs            r12, r12, #1
        vst1.64         {q0, q1}, [r0]!
        bne             memset_32_loop
        ands            r2, r2, #0x1f
        beq             memset_end
memset_16:
        movs            r12, r2, lsr #4
        beq             memset_8
memset_16_loop:
        subs            r12, r12, #1
        vst1.32         {q0}, [r0]!
        bne             memset_16_loop
        ands            r2, r2, #0xf
        beq             memset_end
        /*
         * memset_8 isn't a loop, since we try to do our loops at 16
         * bytes and above.  We should loop there, then drop down here
         * to finish the <16-byte versions.  Same for memset_4 and
         * memset_1.
         */
memset_8:
        cmp             r2, #8
        blt             memset_4
        subs            r2, r2, #8
        vst1.32         {d0}, [r0]!
memset_4:
        cmp             r2, #4
        blt             memset_2
        subs            r2, r2, #4
        str             r1, [r0], #4
memset_2:
        cmp             r2, #0
        ble             memset_end
        strh            r1, [r0], #2
memset_end:
        pop             {r0}
        bx              lr

        .endfunc
        .end