powerpc: faster implementation of long division modeled on that for IA32

test: add one test (2^64-1) / (2^32+3) to exercise a special case of this long division. git-svn-id: https://yquem.inria.fr/compcert/svn/compcert/trunk@2288 fca1b0fc-160b-0410-b1d3-a4f43f01ea2e
author: xleroy <xleroy@fca1b0fc-160b-0410-b1d3-a4f43f01ea2e> 2013-07-03 11:28:17 +0000
committer: xleroy <xleroy@fca1b0fc-160b-0410-b1d3-a4f43f01ea2e> 2013-07-03 11:28:17 +0000
commit: 14ae5ba40c3217f7410c377bf36e21509b01eb8f (patch)
tree: f1d57929fb4310b6e8bdf8bf2edb3718c1169be9 /runtime
parent: 67976ff28a03fc57690ad792fd5e515010f803a5 (diff)
5 files changed, 220 insertions, 70 deletions
diff --git a/runtime/powerpc/i64_sdiv.s b/runtime/powerpc/i64_sdiv.s
index f522506..411ad50 100644
--- a/runtime/powerpc/i64_sdiv.s
+++ b/runtime/powerpc/i64_sdiv.s
@@ -41,8 +41,10 @@
         .balign 16
         .globl __i64_sdiv
 __i64_sdiv:
-        mflr r11                # save return address
-        xor r12, r3, r5         # save sign of result in r12 (top bit)
+	mflr r0
+        stw r0, 4(r1)           # save return address in caller's frame
+        xor r0, r3, r5          # compute sign of result (top bit)
+	mtctr r0                # save it in CTR (why not?)
         srawi r0, r3, 31        # take absolute value of N
         xor r4, r4, r0          # (i.e.  N = N ^ r0 - r0,
         xor r3, r3, r0          #  where r0 = 0 if N >= 0 and r0 = -1 if N < 0)
@@ -54,12 +56,14 @@ __i64_sdiv:
         subfc r6, r0, r6
         subfe r5, r0, r5
         bl __i64_udivmod        # do unsigned division
-        mtlr r11                # restore return address
-        srawi r0, r12, 31       # apply expected sign to quotient
-        xor r8, r8, r0          # RES = Q if r12 >= 0, -Q if r12 < 0
-        xor r7, r7, r0
-        subfc r4, r0, r8
-        subfe r3, r0, r7
+        lwz r0, 4(r1)
+        mtlr r0                 # restore return address
+        mfctr r0
+        srawi r0, r0, 31        # apply expected sign to quotient
+        xor r6, r6, r0          # RES = Q if CTR >= 0, -Q if CTR < 0
+        xor r5, r5, r0
+        subfc r4, r0, r6
+        subfe r3, r0, r5
         blr
         .type __i64_sdiv, @function
         .size __i64_sdiv, .-__i64_sdiv
diff --git a/runtime/powerpc/i64_smod.s b/runtime/powerpc/i64_smod.s
index 320571d..df6bfd8 100644
--- a/runtime/powerpc/i64_smod.s
+++ b/runtime/powerpc/i64_smod.s
@@ -41,23 +41,28 @@
         .balign 16
         .globl __i64_smod
 __i64_smod:
-        mflr r11                # save return address
-        srawi r12, r3, 31       # save sign of result in r12 (sign of N)
-        xor r4, r4, r12         # and take absolute value of N
-        xor r3, r3, r12
-        subfc r4, r12, r4
-        subfe r3, r12, r3
+	mflr r0
+        stw r0, 4(r1)           # save return address in caller's frame
+	mtctr r3                # save sign of result in CTR (sign of N)
+        srawi r0, r3, 31        # take absolute value of N
+        xor r4, r4, r0          # (i.e.  N = N ^ r0 - r0,
+        xor r3, r3, r0          #  where r0 = 0 if N >= 0 and r0 = -1 if N < 0)
+        subfc r4, r0, r4
+        subfe r3, r0, r3
         srawi r0, r5, 31        # take absolute value of D
-        xor r6, r6, r0
+        xor r6, r6, r0          # (same trick)
         xor r5, r5, r0
         subfc r6, r0, r6
         subfe r5, r0, r5
         bl __i64_udivmod        # do unsigned division
-        mtlr r11                # restore return address
-        xor r4, r4, r12         # apply expected sign to remainder
-        xor r3, r3, r12         # RES = R if r12 == 0, -R if r12 == -1
-        subfc r4, r12, r4
-        subfe r3, r12, r3
+        lwz r0, 4(r1)
+        mtlr r0                 # restore return address
+        mfctr r0
+        srawi r0, r0, 31        # apply expected sign to remainder
+        xor r4, r4, r0          # RES = R if CTR >= 0, -Q if CTR < 0
+        xor r3, r3, r0
+        subfc r4, r0, r4
+        subfe r3, r0, r3
         blr
         .type __i64_smod, @function
         .size __i64_smod, .-__i64_smod
diff --git a/runtime/powerpc/i64_udiv.s b/runtime/powerpc/i64_udiv.s
index 7bca760..9443d59 100644
--- a/runtime/powerpc/i64_udiv.s
+++ b/runtime/powerpc/i64_udiv.s
@@ -41,11 +41,13 @@
         .balign 16
         .globl __i64_udiv
 __i64_udiv:
-        mflr r11                # save return address in r11
+	mflr r0
+        stw r0, 4(r1)           # save return address in caller's frame
         bl __i64_udivmod        # unsigned divide
-        mtlr r11                # restore return address
-        mr r3, r7               # R = quotient
-        mr r4, r8
+        lwz r0, 4(r1)
+        mtlr r0                 # restore return address
+        mr r3, r5               # result = quotient
+        mr r4, r6
         blr
         .type __i64_udiv, @function
         .size __i64_udiv, .-__i64_udiv
diff --git a/runtime/powerpc/i64_udivmod.s b/runtime/powerpc/i64_udivmod.s
index 892e9a0..826d989 100644
--- a/runtime/powerpc/i64_udivmod.s
+++ b/runtime/powerpc/i64_udivmod.s
@@ -42,54 +42,193 @@
 # unsigned 64-bit integers.
 
 # Input:  numerator N in (r3,r4), divisor D in (r5,r6)
-# Output: quotient Q in (r7,r8),  remainder R in (r3,r4)
-# Locals: mask M in (r9,r10)	
+# Output: quotient Q in (r5,r6),  remainder R in (r3,r4)
+# Destroys: all integer caller-save registers
 	
 	.globl  __i64_udivmod
 	.balign 16
 __i64_udivmod:
-   # Set up quotient and mask
-        li r8, 0                # Q = 0
-        li r7, 0
-        li r10, 1               # M = 1
-        li r9, 0
-   # Check for zero divisor
-        or. r0, r6, r5
-        beqlr                   # return with unspecified quotient & remainder
-   # Scale divisor and mask
-1:      cmpwi r5, 0             # while top bit of D is zero...
-        blt 2f
-        subfc r0, r6, r4        # compute borrow out of N - D
-        subfe r0, r5, r3
-        subfe. r0, r0, r0       # EQ iff no borrow iff N >= D
-        bne 2f                  # ... and while N >= D ...
-        addc r6, r6, r6         # scale divisor: D = D << 1
-        adde r5, r5, r5
-        addc r10, r10, r10      # scale mask: M = M << 1
-        adde r9, r9, r9
-        b 1b                    # end while
-   # Long division
-2:      subfc r4, r6, r4        # Q = Q | M, N = N - D, and compute borrow
- 	or r8, r8, r10
-        subfe r3, r5, r3
-        or r7, r7, r9
-        subfe. r0, r0, r0       # test borrow
-        beq 3f                  # no borrow: N >= D, continue
-        addc r4, r4, r6         # borrow: undo what we just did to N and Q
-        andc r8, r8, r10
-        adde r3, r3, r5
-        andc r7, r7, r9
-3:      slwi r0, r9, 31         # unscale mask: M = M >> 1
-        srwi r10, r10, 1
-        or r10, r10, r0
-        srwi r9, r9, 1
-        slwi r0, r5, 31         # unscale divisor: D = D >> 1
-        srwi r6, r6, 1
-        or r6, r6, r0
-        srwi r5, r5, 1
-        or. r0, r10, r9         # iterate while M != 0
-        bne 2b
+        cmplwi  r5, 0           # DH == 0 ?
+	stwu    r1, -32(r1)
+        mflr    r0
+        stw     r0, 8(r1)
+        stw     r31, 12(r1)
+        beq     1f
+# The general case	
+        stw     r30, 16(r1)
+        stw     r29, 20(r1)
+        stw     r28, 24(r1)
+        mr      r28, r3         # Save N in (r28, r29)
+        mr      r29, r4
+        mr      r30, r5         # Save D in (r30, r31)
+        mr      r31, r6 
+  # Scale N and D down, giving N' and D', such that 2^31 <= D' < 2^32
+        cntlzw  r7, r5          # r7 = leading zeros in DH = 32 - shift amount
+        subfic  r8, r7, 32      # r8 = shift amount
+	slw     r0, r3, r7      # N' = N >> shift amount
+        srw     r3, r3, r8
+        srw     r4, r4, r8
+        or      r4, r4, r0
+	slw     r0, r5, r7      # D' = D >> shift amount
+        srw     r6, r6, r8
+        or      r5, r6, r0
+  # Divide N' by D' to get an approximate quotient Q
+        bl      __i64_udiv6432  # r3 = quotient, r4 = remainder
+        mr      r6, r3          # low half of quotient Q
+        li      r5, 0           # high half of quotient is 0
+  # Tentative quotient is either correct or one too high
+  # Compute Q * D in (r7, r8)
+4:      mullw   r7, r6, r30     # r7 = Q * DH
+        mullw   r8, r6, r31     # r8 = low 32 bits of Q * DL
+        mulhwu  r0, r6, r31     # r0 = high 32 bits of Q * DL
+        addc    r7, r7, r0
+        subfe.  r0, r0, r0      # test carry: EQ iff carry
+        beq     2f              # handle overflow case
+   # Compute R = N - Q * D, with borrow
+        subfc   r4, r8, r29
+        subfe   r3, r7, r28
+        subfe.  r0, r0, r0      # test borrow: EQ iff no borrow
+        beq     3f              # no borrow: N >= Q * D, we are good
+        addi    r6, r6, -1      # borrow: adjust Q down by 1
+        addc    r4, r4, r31     # and R up by D
+        adde    r3, r3, r30
+   # Finished	
+3:      lwz     r0, 8(r1)
+        mtlr    r0
+        lwz     r31, 12(r1)
+        lwz     r30, 16(r1)
+        lwz     r29, 20(r1)
+        lwz     r28, 24(r1)
+        addi    r1, r1, 32
         blr
-
+   # Special case when Q * D overflows
+2:      addi    r6, r6, -1      # adjust Q down by 1
+        b       4b              # and redo computation and check of remainder
+	.balign 16
+# Special case 64 bits divided by 32 bits
+1:      cmplwi  r3, 0           # NH == 0?
+        beq     4f
+        divwu   r31, r3, r6     # Divide NH by DL, quotient QH in r31
+        mullw   r0, r31, r6
+        subf    r3, r0, r3      # NH is remainder of this division
+        mr      r5, r6
+        bl      __i64_udiv6432  # divide NH : NL by DL
+        mr      r5, r31         # high word of quotient
+        mr      r6, r3          # low word of quotient
+                                # r4 contains low word of remainder
+        li      r3, 0           # high word of remainder = 0
+	lwz     r0, 8(r1)
+        mtlr    r0
+        lwz     r31, 12(r1)
+        addi    r1, r1, 32
+        blr
+	.balign 16
+# Special case 32 bits divided by 32 bits
+4:      mr      r0, r6
+	divwu   r6, r4, r6      # low word of quotient
+	li      r5, 0           # high word of quotient is 0
+        mullw   r0, r6, r0
+        subf    r4, r0, r4      # low word of remainder
+        li      r3, 0           # high word of remainder is 0
+        addi    r1, r1, 32
+        blr
+        
         .type __i64_udivmod, @function
         .size __i64_udivmod, .-__i64_udivmod
+
+# Auxiliary division function: 64 bit integer divided by 32 bit integer	
+# Not exported
+# Input:  numerator N in (r3,r4), divisor D in r5
+# Output: quotient Q in r3, remainder R in r4
+# Destroys: all integer caller-save registers
+# Assumes: high word of N is less than D	
+        
+	.balign 16
+__i64_udiv6432: 
+# Algorithm 9.3 from Hacker's Delight, section 9.4	
+# Initially: u1 in r3, u0 in r4, v in r5
+#   s = __builtin_clz(v);
+	cntlzw  r6, r5          # s in r6
+#   v = v << s;
+        slw     r5, r5, r6
+#   vn1 = v >> 16;              # vn1 in r7
+        srwi    r7, r5, 16
+#   vn0 = v & 0xFFFF;           # vn0 in r8
+        rlwinm  r8, r5, 0, 16, 31
+#   un32 = (u1 << s) | (u0 >> 32 - s);
+	subfic  r0, r6, 32
+        srw     r0, r4, r0
+        slw     r3, r3, r6     # u1 dies, un32 in r3
+        or      r3, r3, r0
+#   un10 = u0 << s;
+        slw     r4, r4, r6     # u0 dies, un10 in r4
+#   un1 = un10 >> 16;
+        srwi    r9, r4, 16     # un1 in r9
+#   un0 = un10 & 0xFFFF;
+        rlwinm  r4, r4, 0, 16, 31 # un10 dies, un0 in r4
+#   q1 = un32/vn1;
+        divwu   r10, r3, r7    # q in r10
+#   rhat = un32 - q1*vn1;
+        mullw   r0, r10, r7
+        subf    r11, r0, r3    # rhat in r11
+#  again1:
+1:      
+#   if (q1 >= b || q1*vn0 > b*rhat + un1) {
+        cmplwi  r10, 0xFFFF
+        bgt     2f
+        mullw   r0, r10, r8
+        slwi    r12, r11, 16
+        add     r12, r12, r9
+        cmplw   r0, r12
+        ble     3f
+2:      
+#     q1 = q1 - 1;
+        addi    r10, r10, -1
+#     rhat = rhat + vn1;
+        add     r11, r11, r7
+#     if (rhat < b) goto again1;}
+        cmplwi  r11, 0xFFFF
+        ble     1b
+3:      
+#   un21 = un32*b + un1 - q1*v;
+        slwi    r0, r3, 16     # un32 dies
+        add     r9, r0, r9     # un1 dies
+        mullw   r0, r10, r5
+        subf    r9, r0, r9     # un21 in r9
+#   q0 = un21/vn1;
+	divwu   r3, r9, r7     # q0 in r3
+#   rhat = un21 - q0*vn1;
+        mullw   r0, r3, r7
+        subf    r11, r0, r9    # rhat in r11
+# again2:
+4:      
+#   if (q0 >= b || q0*vn0 > b*rhat + un0) {
+        cmplwi  r3, 0xFFFF
+        bgt     5f
+        mullw   r0, r3, r8
+        slwi    r12, r11, 16
+        add     r12, r12, r4
+        cmplw   r0, r12
+        ble     6f
+5:      
+#     q0 = q0 - 1;
+	addi    r3, r3, -1
+#     rhat = rhat + vn1;
+        add     r11, r11, r7
+#     if (rhat < b) goto again2;}
+	cmplwi  r11, 0xFFFF
+        ble     4b
+6:      
+#   remainder = (un21*b + un0 - q0*v) >> s;
+        slwi    r0, r9, 16
+        add     r4, r0, r4  # un0 dies, remainder in r4
+        mullw   r0, r3, r5
+        subf    r4, r0, r4
+        srw     r4, r4, r6
+#   quotient = q1*b + q0;
+        slwi    r0, r10, 16
+        add     r3, r0, r3
+        blr
+
+	.type	__i64_udiv6432, @function
+	.size	__i64_udiv6432,.-__i64_udiv6432
diff --git a/runtime/test/test_int64.c b/runtime/test/test_int64.c
index 2aa2111..ab7a231 100644
--- a/runtime/test/test_int64.c
+++ b/runtime/test/test_int64.c
@@ -181,11 +181,11 @@ static void test1(u64 x, u64 y)
     error++, printf("(s64) %a = %lld, expected %lld\n", f, z, (s64) f);
 }
 
-#define NSPECIFIC 8
+#define NSPECIFIC 9
 
 unsigned long long specific[NSPECIFIC] = {
   0, 1, -1, 0x7FFFFFFFULL, 0x80000000ULL, 0xFFFFFFFFULL,
-  0x7FFFFFFFFFFFULL, 0x8000000000000000ULL
+  0x7FFFFFFFFFFFULL, 0x8000000000000000ULL, 0x100000003ULL
 };
 
 int main()
author	xleroy <xleroy@fca1b0fc-160b-0410-b1d3-a4f43f01ea2e>	2013-07-03 11:28:17 +0000
committer	xleroy <xleroy@fca1b0fc-160b-0410-b1d3-a4f43f01ea2e>	2013-07-03 11:28:17 +0000
commit	14ae5ba40c3217f7410c377bf36e21509b01eb8f (patch)
tree	f1d57929fb4310b6e8bdf8bf2edb3718c1169be9 /runtime
parent	67976ff28a03fc57690ad792fd5e515010f803a5 (diff)