From 14ae5ba40c3217f7410c377bf36e21509b01eb8f Mon Sep 17 00:00:00 2001 From: xleroy Date: Wed, 3 Jul 2013 11:28:17 +0000 Subject: powerpc: faster implementation of long division modeled on that for IA32 test: add one test (2^64-1) / (2^32+3) to exercise a special case of this long division. git-svn-id: https://yquem.inria.fr/compcert/svn/compcert/trunk@2288 fca1b0fc-160b-0410-b1d3-a4f43f01ea2e --- runtime/powerpc/i64_sdiv.s | 20 ++-- runtime/powerpc/i64_smod.s | 29 +++--- runtime/powerpc/i64_udiv.s | 10 +- runtime/powerpc/i64_udivmod.s | 227 ++++++++++++++++++++++++++++++++++-------- runtime/test/test_int64.c | 4 +- 5 files changed, 220 insertions(+), 70 deletions(-) (limited to 'runtime') diff --git a/runtime/powerpc/i64_sdiv.s b/runtime/powerpc/i64_sdiv.s index f522506..411ad50 100644 --- a/runtime/powerpc/i64_sdiv.s +++ b/runtime/powerpc/i64_sdiv.s @@ -41,8 +41,10 @@ .balign 16 .globl __i64_sdiv __i64_sdiv: - mflr r11 # save return address - xor r12, r3, r5 # save sign of result in r12 (top bit) + mflr r0 + stw r0, 4(r1) # save return address in caller's frame + xor r0, r3, r5 # compute sign of result (top bit) + mtctr r0 # save it in CTR (why not?) srawi r0, r3, 31 # take absolute value of N xor r4, r4, r0 # (i.e. N = N ^ r0 - r0, xor r3, r3, r0 # where r0 = 0 if N >= 0 and r0 = -1 if N < 0) @@ -54,12 +56,14 @@ __i64_sdiv: subfc r6, r0, r6 subfe r5, r0, r5 bl __i64_udivmod # do unsigned division - mtlr r11 # restore return address - srawi r0, r12, 31 # apply expected sign to quotient - xor r8, r8, r0 # RES = Q if r12 >= 0, -Q if r12 < 0 - xor r7, r7, r0 - subfc r4, r0, r8 - subfe r3, r0, r7 + lwz r0, 4(r1) + mtlr r0 # restore return address + mfctr r0 + srawi r0, r0, 31 # apply expected sign to quotient + xor r6, r6, r0 # RES = Q if CTR >= 0, -Q if CTR < 0 + xor r5, r5, r0 + subfc r4, r0, r6 + subfe r3, r0, r5 blr .type __i64_sdiv, @function .size __i64_sdiv, .-__i64_sdiv diff --git a/runtime/powerpc/i64_smod.s b/runtime/powerpc/i64_smod.s index 320571d..df6bfd8 100644 --- a/runtime/powerpc/i64_smod.s +++ b/runtime/powerpc/i64_smod.s @@ -41,23 +41,28 @@ .balign 16 .globl __i64_smod __i64_smod: - mflr r11 # save return address - srawi r12, r3, 31 # save sign of result in r12 (sign of N) - xor r4, r4, r12 # and take absolute value of N - xor r3, r3, r12 - subfc r4, r12, r4 - subfe r3, r12, r3 + mflr r0 + stw r0, 4(r1) # save return address in caller's frame + mtctr r3 # save sign of result in CTR (sign of N) + srawi r0, r3, 31 # take absolute value of N + xor r4, r4, r0 # (i.e. N = N ^ r0 - r0, + xor r3, r3, r0 # where r0 = 0 if N >= 0 and r0 = -1 if N < 0) + subfc r4, r0, r4 + subfe r3, r0, r3 srawi r0, r5, 31 # take absolute value of D - xor r6, r6, r0 + xor r6, r6, r0 # (same trick) xor r5, r5, r0 subfc r6, r0, r6 subfe r5, r0, r5 bl __i64_udivmod # do unsigned division - mtlr r11 # restore return address - xor r4, r4, r12 # apply expected sign to remainder - xor r3, r3, r12 # RES = R if r12 == 0, -R if r12 == -1 - subfc r4, r12, r4 - subfe r3, r12, r3 + lwz r0, 4(r1) + mtlr r0 # restore return address + mfctr r0 + srawi r0, r0, 31 # apply expected sign to remainder + xor r4, r4, r0 # RES = R if CTR >= 0, -Q if CTR < 0 + xor r3, r3, r0 + subfc r4, r0, r4 + subfe r3, r0, r3 blr .type __i64_smod, @function .size __i64_smod, .-__i64_smod diff --git a/runtime/powerpc/i64_udiv.s b/runtime/powerpc/i64_udiv.s index 7bca760..9443d59 100644 --- a/runtime/powerpc/i64_udiv.s +++ b/runtime/powerpc/i64_udiv.s @@ -41,11 +41,13 @@ .balign 16 .globl __i64_udiv __i64_udiv: - mflr r11 # save return address in r11 + mflr r0 + stw r0, 4(r1) # save return address in caller's frame bl __i64_udivmod # unsigned divide - mtlr r11 # restore return address - mr r3, r7 # R = quotient - mr r4, r8 + lwz r0, 4(r1) + mtlr r0 # restore return address + mr r3, r5 # result = quotient + mr r4, r6 blr .type __i64_udiv, @function .size __i64_udiv, .-__i64_udiv diff --git a/runtime/powerpc/i64_udivmod.s b/runtime/powerpc/i64_udivmod.s index 892e9a0..826d989 100644 --- a/runtime/powerpc/i64_udivmod.s +++ b/runtime/powerpc/i64_udivmod.s @@ -42,54 +42,193 @@ # unsigned 64-bit integers. # Input: numerator N in (r3,r4), divisor D in (r5,r6) -# Output: quotient Q in (r7,r8), remainder R in (r3,r4) -# Locals: mask M in (r9,r10) +# Output: quotient Q in (r5,r6), remainder R in (r3,r4) +# Destroys: all integer caller-save registers .globl __i64_udivmod .balign 16 __i64_udivmod: - # Set up quotient and mask - li r8, 0 # Q = 0 - li r7, 0 - li r10, 1 # M = 1 - li r9, 0 - # Check for zero divisor - or. r0, r6, r5 - beqlr # return with unspecified quotient & remainder - # Scale divisor and mask -1: cmpwi r5, 0 # while top bit of D is zero... - blt 2f - subfc r0, r6, r4 # compute borrow out of N - D - subfe r0, r5, r3 - subfe. r0, r0, r0 # EQ iff no borrow iff N >= D - bne 2f # ... and while N >= D ... - addc r6, r6, r6 # scale divisor: D = D << 1 - adde r5, r5, r5 - addc r10, r10, r10 # scale mask: M = M << 1 - adde r9, r9, r9 - b 1b # end while - # Long division -2: subfc r4, r6, r4 # Q = Q | M, N = N - D, and compute borrow - or r8, r8, r10 - subfe r3, r5, r3 - or r7, r7, r9 - subfe. r0, r0, r0 # test borrow - beq 3f # no borrow: N >= D, continue - addc r4, r4, r6 # borrow: undo what we just did to N and Q - andc r8, r8, r10 - adde r3, r3, r5 - andc r7, r7, r9 -3: slwi r0, r9, 31 # unscale mask: M = M >> 1 - srwi r10, r10, 1 - or r10, r10, r0 - srwi r9, r9, 1 - slwi r0, r5, 31 # unscale divisor: D = D >> 1 - srwi r6, r6, 1 - or r6, r6, r0 - srwi r5, r5, 1 - or. r0, r10, r9 # iterate while M != 0 - bne 2b + cmplwi r5, 0 # DH == 0 ? + stwu r1, -32(r1) + mflr r0 + stw r0, 8(r1) + stw r31, 12(r1) + beq 1f +# The general case + stw r30, 16(r1) + stw r29, 20(r1) + stw r28, 24(r1) + mr r28, r3 # Save N in (r28, r29) + mr r29, r4 + mr r30, r5 # Save D in (r30, r31) + mr r31, r6 + # Scale N and D down, giving N' and D', such that 2^31 <= D' < 2^32 + cntlzw r7, r5 # r7 = leading zeros in DH = 32 - shift amount + subfic r8, r7, 32 # r8 = shift amount + slw r0, r3, r7 # N' = N >> shift amount + srw r3, r3, r8 + srw r4, r4, r8 + or r4, r4, r0 + slw r0, r5, r7 # D' = D >> shift amount + srw r6, r6, r8 + or r5, r6, r0 + # Divide N' by D' to get an approximate quotient Q + bl __i64_udiv6432 # r3 = quotient, r4 = remainder + mr r6, r3 # low half of quotient Q + li r5, 0 # high half of quotient is 0 + # Tentative quotient is either correct or one too high + # Compute Q * D in (r7, r8) +4: mullw r7, r6, r30 # r7 = Q * DH + mullw r8, r6, r31 # r8 = low 32 bits of Q * DL + mulhwu r0, r6, r31 # r0 = high 32 bits of Q * DL + addc r7, r7, r0 + subfe. r0, r0, r0 # test carry: EQ iff carry + beq 2f # handle overflow case + # Compute R = N - Q * D, with borrow + subfc r4, r8, r29 + subfe r3, r7, r28 + subfe. r0, r0, r0 # test borrow: EQ iff no borrow + beq 3f # no borrow: N >= Q * D, we are good + addi r6, r6, -1 # borrow: adjust Q down by 1 + addc r4, r4, r31 # and R up by D + adde r3, r3, r30 + # Finished +3: lwz r0, 8(r1) + mtlr r0 + lwz r31, 12(r1) + lwz r30, 16(r1) + lwz r29, 20(r1) + lwz r28, 24(r1) + addi r1, r1, 32 blr - + # Special case when Q * D overflows +2: addi r6, r6, -1 # adjust Q down by 1 + b 4b # and redo computation and check of remainder + .balign 16 +# Special case 64 bits divided by 32 bits +1: cmplwi r3, 0 # NH == 0? + beq 4f + divwu r31, r3, r6 # Divide NH by DL, quotient QH in r31 + mullw r0, r31, r6 + subf r3, r0, r3 # NH is remainder of this division + mr r5, r6 + bl __i64_udiv6432 # divide NH : NL by DL + mr r5, r31 # high word of quotient + mr r6, r3 # low word of quotient + # r4 contains low word of remainder + li r3, 0 # high word of remainder = 0 + lwz r0, 8(r1) + mtlr r0 + lwz r31, 12(r1) + addi r1, r1, 32 + blr + .balign 16 +# Special case 32 bits divided by 32 bits +4: mr r0, r6 + divwu r6, r4, r6 # low word of quotient + li r5, 0 # high word of quotient is 0 + mullw r0, r6, r0 + subf r4, r0, r4 # low word of remainder + li r3, 0 # high word of remainder is 0 + addi r1, r1, 32 + blr + .type __i64_udivmod, @function .size __i64_udivmod, .-__i64_udivmod + +# Auxiliary division function: 64 bit integer divided by 32 bit integer +# Not exported +# Input: numerator N in (r3,r4), divisor D in r5 +# Output: quotient Q in r3, remainder R in r4 +# Destroys: all integer caller-save registers +# Assumes: high word of N is less than D + + .balign 16 +__i64_udiv6432: +# Algorithm 9.3 from Hacker's Delight, section 9.4 +# Initially: u1 in r3, u0 in r4, v in r5 +# s = __builtin_clz(v); + cntlzw r6, r5 # s in r6 +# v = v << s; + slw r5, r5, r6 +# vn1 = v >> 16; # vn1 in r7 + srwi r7, r5, 16 +# vn0 = v & 0xFFFF; # vn0 in r8 + rlwinm r8, r5, 0, 16, 31 +# un32 = (u1 << s) | (u0 >> 32 - s); + subfic r0, r6, 32 + srw r0, r4, r0 + slw r3, r3, r6 # u1 dies, un32 in r3 + or r3, r3, r0 +# un10 = u0 << s; + slw r4, r4, r6 # u0 dies, un10 in r4 +# un1 = un10 >> 16; + srwi r9, r4, 16 # un1 in r9 +# un0 = un10 & 0xFFFF; + rlwinm r4, r4, 0, 16, 31 # un10 dies, un0 in r4 +# q1 = un32/vn1; + divwu r10, r3, r7 # q in r10 +# rhat = un32 - q1*vn1; + mullw r0, r10, r7 + subf r11, r0, r3 # rhat in r11 +# again1: +1: +# if (q1 >= b || q1*vn0 > b*rhat + un1) { + cmplwi r10, 0xFFFF + bgt 2f + mullw r0, r10, r8 + slwi r12, r11, 16 + add r12, r12, r9 + cmplw r0, r12 + ble 3f +2: +# q1 = q1 - 1; + addi r10, r10, -1 +# rhat = rhat + vn1; + add r11, r11, r7 +# if (rhat < b) goto again1;} + cmplwi r11, 0xFFFF + ble 1b +3: +# un21 = un32*b + un1 - q1*v; + slwi r0, r3, 16 # un32 dies + add r9, r0, r9 # un1 dies + mullw r0, r10, r5 + subf r9, r0, r9 # un21 in r9 +# q0 = un21/vn1; + divwu r3, r9, r7 # q0 in r3 +# rhat = un21 - q0*vn1; + mullw r0, r3, r7 + subf r11, r0, r9 # rhat in r11 +# again2: +4: +# if (q0 >= b || q0*vn0 > b*rhat + un0) { + cmplwi r3, 0xFFFF + bgt 5f + mullw r0, r3, r8 + slwi r12, r11, 16 + add r12, r12, r4 + cmplw r0, r12 + ble 6f +5: +# q0 = q0 - 1; + addi r3, r3, -1 +# rhat = rhat + vn1; + add r11, r11, r7 +# if (rhat < b) goto again2;} + cmplwi r11, 0xFFFF + ble 4b +6: +# remainder = (un21*b + un0 - q0*v) >> s; + slwi r0, r9, 16 + add r4, r0, r4 # un0 dies, remainder in r4 + mullw r0, r3, r5 + subf r4, r0, r4 + srw r4, r4, r6 +# quotient = q1*b + q0; + slwi r0, r10, 16 + add r3, r0, r3 + blr + + .type __i64_udiv6432, @function + .size __i64_udiv6432,.-__i64_udiv6432 diff --git a/runtime/test/test_int64.c b/runtime/test/test_int64.c index 2aa2111..ab7a231 100644 --- a/runtime/test/test_int64.c +++ b/runtime/test/test_int64.c @@ -181,11 +181,11 @@ static void test1(u64 x, u64 y) error++, printf("(s64) %a = %lld, expected %lld\n", f, z, (s64) f); } -#define NSPECIFIC 8 +#define NSPECIFIC 9 unsigned long long specific[NSPECIFIC] = { 0, 1, -1, 0x7FFFFFFFULL, 0x80000000ULL, 0xFFFFFFFFULL, - 0x7FFFFFFFFFFFULL, 0x8000000000000000ULL + 0x7FFFFFFFFFFFULL, 0x8000000000000000ULL, 0x100000003ULL }; int main() -- cgit v1.2.3