summaryrefslogtreecommitdiff
path: root/runtime
diff options
context:
space:
mode:
authorGravatar xleroy <xleroy@fca1b0fc-160b-0410-b1d3-a4f43f01ea2e>2013-07-03 11:28:17 +0000
committerGravatar xleroy <xleroy@fca1b0fc-160b-0410-b1d3-a4f43f01ea2e>2013-07-03 11:28:17 +0000
commit14ae5ba40c3217f7410c377bf36e21509b01eb8f (patch)
treef1d57929fb4310b6e8bdf8bf2edb3718c1169be9 /runtime
parent67976ff28a03fc57690ad792fd5e515010f803a5 (diff)
powerpc: faster implementation of long division modeled on that for IA32
test: add one test (2^64-1) / (2^32+3) to exercise a special case of this long division. git-svn-id: https://yquem.inria.fr/compcert/svn/compcert/trunk@2288 fca1b0fc-160b-0410-b1d3-a4f43f01ea2e
Diffstat (limited to 'runtime')
-rw-r--r--runtime/powerpc/i64_sdiv.s20
-rw-r--r--runtime/powerpc/i64_smod.s29
-rw-r--r--runtime/powerpc/i64_udiv.s10
-rw-r--r--runtime/powerpc/i64_udivmod.s227
-rw-r--r--runtime/test/test_int64.c4
5 files changed, 220 insertions, 70 deletions
diff --git a/runtime/powerpc/i64_sdiv.s b/runtime/powerpc/i64_sdiv.s
index f522506..411ad50 100644
--- a/runtime/powerpc/i64_sdiv.s
+++ b/runtime/powerpc/i64_sdiv.s
@@ -41,8 +41,10 @@
.balign 16
.globl __i64_sdiv
__i64_sdiv:
- mflr r11 # save return address
- xor r12, r3, r5 # save sign of result in r12 (top bit)
+ mflr r0
+ stw r0, 4(r1) # save return address in caller's frame
+ xor r0, r3, r5 # compute sign of result (top bit)
+ mtctr r0 # save it in CTR (why not?)
srawi r0, r3, 31 # take absolute value of N
xor r4, r4, r0 # (i.e. N = N ^ r0 - r0,
xor r3, r3, r0 # where r0 = 0 if N >= 0 and r0 = -1 if N < 0)
@@ -54,12 +56,14 @@ __i64_sdiv:
subfc r6, r0, r6
subfe r5, r0, r5
bl __i64_udivmod # do unsigned division
- mtlr r11 # restore return address
- srawi r0, r12, 31 # apply expected sign to quotient
- xor r8, r8, r0 # RES = Q if r12 >= 0, -Q if r12 < 0
- xor r7, r7, r0
- subfc r4, r0, r8
- subfe r3, r0, r7
+ lwz r0, 4(r1)
+ mtlr r0 # restore return address
+ mfctr r0
+ srawi r0, r0, 31 # apply expected sign to quotient
+ xor r6, r6, r0 # RES = Q if CTR >= 0, -Q if CTR < 0
+ xor r5, r5, r0
+ subfc r4, r0, r6
+ subfe r3, r0, r5
blr
.type __i64_sdiv, @function
.size __i64_sdiv, .-__i64_sdiv
diff --git a/runtime/powerpc/i64_smod.s b/runtime/powerpc/i64_smod.s
index 320571d..df6bfd8 100644
--- a/runtime/powerpc/i64_smod.s
+++ b/runtime/powerpc/i64_smod.s
@@ -41,23 +41,28 @@
.balign 16
.globl __i64_smod
__i64_smod:
- mflr r11 # save return address
- srawi r12, r3, 31 # save sign of result in r12 (sign of N)
- xor r4, r4, r12 # and take absolute value of N
- xor r3, r3, r12
- subfc r4, r12, r4
- subfe r3, r12, r3
+ mflr r0
+ stw r0, 4(r1) # save return address in caller's frame
+ mtctr r3 # save sign of result in CTR (sign of N)
+ srawi r0, r3, 31 # take absolute value of N
+ xor r4, r4, r0 # (i.e. N = N ^ r0 - r0,
+ xor r3, r3, r0 # where r0 = 0 if N >= 0 and r0 = -1 if N < 0)
+ subfc r4, r0, r4
+ subfe r3, r0, r3
srawi r0, r5, 31 # take absolute value of D
- xor r6, r6, r0
+ xor r6, r6, r0 # (same trick)
xor r5, r5, r0
subfc r6, r0, r6
subfe r5, r0, r5
bl __i64_udivmod # do unsigned division
- mtlr r11 # restore return address
- xor r4, r4, r12 # apply expected sign to remainder
- xor r3, r3, r12 # RES = R if r12 == 0, -R if r12 == -1
- subfc r4, r12, r4
- subfe r3, r12, r3
+ lwz r0, 4(r1)
+ mtlr r0 # restore return address
+ mfctr r0
+ srawi r0, r0, 31 # apply expected sign to remainder
+ xor r4, r4, r0 # RES = R if CTR >= 0, -Q if CTR < 0
+ xor r3, r3, r0
+ subfc r4, r0, r4
+ subfe r3, r0, r3
blr
.type __i64_smod, @function
.size __i64_smod, .-__i64_smod
diff --git a/runtime/powerpc/i64_udiv.s b/runtime/powerpc/i64_udiv.s
index 7bca760..9443d59 100644
--- a/runtime/powerpc/i64_udiv.s
+++ b/runtime/powerpc/i64_udiv.s
@@ -41,11 +41,13 @@
.balign 16
.globl __i64_udiv
__i64_udiv:
- mflr r11 # save return address in r11
+ mflr r0
+ stw r0, 4(r1) # save return address in caller's frame
bl __i64_udivmod # unsigned divide
- mtlr r11 # restore return address
- mr r3, r7 # R = quotient
- mr r4, r8
+ lwz r0, 4(r1)
+ mtlr r0 # restore return address
+ mr r3, r5 # result = quotient
+ mr r4, r6
blr
.type __i64_udiv, @function
.size __i64_udiv, .-__i64_udiv
diff --git a/runtime/powerpc/i64_udivmod.s b/runtime/powerpc/i64_udivmod.s
index 892e9a0..826d989 100644
--- a/runtime/powerpc/i64_udivmod.s
+++ b/runtime/powerpc/i64_udivmod.s
@@ -42,54 +42,193 @@
# unsigned 64-bit integers.
# Input: numerator N in (r3,r4), divisor D in (r5,r6)
-# Output: quotient Q in (r7,r8), remainder R in (r3,r4)
-# Locals: mask M in (r9,r10)
+# Output: quotient Q in (r5,r6), remainder R in (r3,r4)
+# Destroys: all integer caller-save registers
.globl __i64_udivmod
.balign 16
__i64_udivmod:
- # Set up quotient and mask
- li r8, 0 # Q = 0
- li r7, 0
- li r10, 1 # M = 1
- li r9, 0
- # Check for zero divisor
- or. r0, r6, r5
- beqlr # return with unspecified quotient & remainder
- # Scale divisor and mask
-1: cmpwi r5, 0 # while top bit of D is zero...
- blt 2f
- subfc r0, r6, r4 # compute borrow out of N - D
- subfe r0, r5, r3
- subfe. r0, r0, r0 # EQ iff no borrow iff N >= D
- bne 2f # ... and while N >= D ...
- addc r6, r6, r6 # scale divisor: D = D << 1
- adde r5, r5, r5
- addc r10, r10, r10 # scale mask: M = M << 1
- adde r9, r9, r9
- b 1b # end while
- # Long division
-2: subfc r4, r6, r4 # Q = Q | M, N = N - D, and compute borrow
- or r8, r8, r10
- subfe r3, r5, r3
- or r7, r7, r9
- subfe. r0, r0, r0 # test borrow
- beq 3f # no borrow: N >= D, continue
- addc r4, r4, r6 # borrow: undo what we just did to N and Q
- andc r8, r8, r10
- adde r3, r3, r5
- andc r7, r7, r9
-3: slwi r0, r9, 31 # unscale mask: M = M >> 1
- srwi r10, r10, 1
- or r10, r10, r0
- srwi r9, r9, 1
- slwi r0, r5, 31 # unscale divisor: D = D >> 1
- srwi r6, r6, 1
- or r6, r6, r0
- srwi r5, r5, 1
- or. r0, r10, r9 # iterate while M != 0
- bne 2b
+ cmplwi r5, 0 # DH == 0 ?
+ stwu r1, -32(r1)
+ mflr r0
+ stw r0, 8(r1)
+ stw r31, 12(r1)
+ beq 1f
+# The general case
+ stw r30, 16(r1)
+ stw r29, 20(r1)
+ stw r28, 24(r1)
+ mr r28, r3 # Save N in (r28, r29)
+ mr r29, r4
+ mr r30, r5 # Save D in (r30, r31)
+ mr r31, r6
+ # Scale N and D down, giving N' and D', such that 2^31 <= D' < 2^32
+ cntlzw r7, r5 # r7 = leading zeros in DH = 32 - shift amount
+ subfic r8, r7, 32 # r8 = shift amount
+ slw r0, r3, r7 # N' = N >> shift amount
+ srw r3, r3, r8
+ srw r4, r4, r8
+ or r4, r4, r0
+ slw r0, r5, r7 # D' = D >> shift amount
+ srw r6, r6, r8
+ or r5, r6, r0
+ # Divide N' by D' to get an approximate quotient Q
+ bl __i64_udiv6432 # r3 = quotient, r4 = remainder
+ mr r6, r3 # low half of quotient Q
+ li r5, 0 # high half of quotient is 0
+ # Tentative quotient is either correct or one too high
+ # Compute Q * D in (r7, r8)
+4: mullw r7, r6, r30 # r7 = Q * DH
+ mullw r8, r6, r31 # r8 = low 32 bits of Q * DL
+ mulhwu r0, r6, r31 # r0 = high 32 bits of Q * DL
+ addc r7, r7, r0
+ subfe. r0, r0, r0 # test carry: EQ iff carry
+ beq 2f # handle overflow case
+ # Compute R = N - Q * D, with borrow
+ subfc r4, r8, r29
+ subfe r3, r7, r28
+ subfe. r0, r0, r0 # test borrow: EQ iff no borrow
+ beq 3f # no borrow: N >= Q * D, we are good
+ addi r6, r6, -1 # borrow: adjust Q down by 1
+ addc r4, r4, r31 # and R up by D
+ adde r3, r3, r30
+ # Finished
+3: lwz r0, 8(r1)
+ mtlr r0
+ lwz r31, 12(r1)
+ lwz r30, 16(r1)
+ lwz r29, 20(r1)
+ lwz r28, 24(r1)
+ addi r1, r1, 32
blr
-
+ # Special case when Q * D overflows
+2: addi r6, r6, -1 # adjust Q down by 1
+ b 4b # and redo computation and check of remainder
+ .balign 16
+# Special case 64 bits divided by 32 bits
+1: cmplwi r3, 0 # NH == 0?
+ beq 4f
+ divwu r31, r3, r6 # Divide NH by DL, quotient QH in r31
+ mullw r0, r31, r6
+ subf r3, r0, r3 # NH is remainder of this division
+ mr r5, r6
+ bl __i64_udiv6432 # divide NH : NL by DL
+ mr r5, r31 # high word of quotient
+ mr r6, r3 # low word of quotient
+ # r4 contains low word of remainder
+ li r3, 0 # high word of remainder = 0
+ lwz r0, 8(r1)
+ mtlr r0
+ lwz r31, 12(r1)
+ addi r1, r1, 32
+ blr
+ .balign 16
+# Special case 32 bits divided by 32 bits
+4: mr r0, r6
+ divwu r6, r4, r6 # low word of quotient
+ li r5, 0 # high word of quotient is 0
+ mullw r0, r6, r0
+ subf r4, r0, r4 # low word of remainder
+ li r3, 0 # high word of remainder is 0
+ addi r1, r1, 32
+ blr
+
.type __i64_udivmod, @function
.size __i64_udivmod, .-__i64_udivmod
+
+# Auxiliary division function: 64 bit integer divided by 32 bit integer
+# Not exported
+# Input: numerator N in (r3,r4), divisor D in r5
+# Output: quotient Q in r3, remainder R in r4
+# Destroys: all integer caller-save registers
+# Assumes: high word of N is less than D
+
+ .balign 16
+__i64_udiv6432:
+# Algorithm 9.3 from Hacker's Delight, section 9.4
+# Initially: u1 in r3, u0 in r4, v in r5
+# s = __builtin_clz(v);
+ cntlzw r6, r5 # s in r6
+# v = v << s;
+ slw r5, r5, r6
+# vn1 = v >> 16; # vn1 in r7
+ srwi r7, r5, 16
+# vn0 = v & 0xFFFF; # vn0 in r8
+ rlwinm r8, r5, 0, 16, 31
+# un32 = (u1 << s) | (u0 >> 32 - s);
+ subfic r0, r6, 32
+ srw r0, r4, r0
+ slw r3, r3, r6 # u1 dies, un32 in r3
+ or r3, r3, r0
+# un10 = u0 << s;
+ slw r4, r4, r6 # u0 dies, un10 in r4
+# un1 = un10 >> 16;
+ srwi r9, r4, 16 # un1 in r9
+# un0 = un10 & 0xFFFF;
+ rlwinm r4, r4, 0, 16, 31 # un10 dies, un0 in r4
+# q1 = un32/vn1;
+ divwu r10, r3, r7 # q in r10
+# rhat = un32 - q1*vn1;
+ mullw r0, r10, r7
+ subf r11, r0, r3 # rhat in r11
+# again1:
+1:
+# if (q1 >= b || q1*vn0 > b*rhat + un1) {
+ cmplwi r10, 0xFFFF
+ bgt 2f
+ mullw r0, r10, r8
+ slwi r12, r11, 16
+ add r12, r12, r9
+ cmplw r0, r12
+ ble 3f
+2:
+# q1 = q1 - 1;
+ addi r10, r10, -1
+# rhat = rhat + vn1;
+ add r11, r11, r7
+# if (rhat < b) goto again1;}
+ cmplwi r11, 0xFFFF
+ ble 1b
+3:
+# un21 = un32*b + un1 - q1*v;
+ slwi r0, r3, 16 # un32 dies
+ add r9, r0, r9 # un1 dies
+ mullw r0, r10, r5
+ subf r9, r0, r9 # un21 in r9
+# q0 = un21/vn1;
+ divwu r3, r9, r7 # q0 in r3
+# rhat = un21 - q0*vn1;
+ mullw r0, r3, r7
+ subf r11, r0, r9 # rhat in r11
+# again2:
+4:
+# if (q0 >= b || q0*vn0 > b*rhat + un0) {
+ cmplwi r3, 0xFFFF
+ bgt 5f
+ mullw r0, r3, r8
+ slwi r12, r11, 16
+ add r12, r12, r4
+ cmplw r0, r12
+ ble 6f
+5:
+# q0 = q0 - 1;
+ addi r3, r3, -1
+# rhat = rhat + vn1;
+ add r11, r11, r7
+# if (rhat < b) goto again2;}
+ cmplwi r11, 0xFFFF
+ ble 4b
+6:
+# remainder = (un21*b + un0 - q0*v) >> s;
+ slwi r0, r9, 16
+ add r4, r0, r4 # un0 dies, remainder in r4
+ mullw r0, r3, r5
+ subf r4, r0, r4
+ srw r4, r4, r6
+# quotient = q1*b + q0;
+ slwi r0, r10, 16
+ add r3, r0, r3
+ blr
+
+ .type __i64_udiv6432, @function
+ .size __i64_udiv6432,.-__i64_udiv6432
diff --git a/runtime/test/test_int64.c b/runtime/test/test_int64.c
index 2aa2111..ab7a231 100644
--- a/runtime/test/test_int64.c
+++ b/runtime/test/test_int64.c
@@ -181,11 +181,11 @@ static void test1(u64 x, u64 y)
error++, printf("(s64) %a = %lld, expected %lld\n", f, z, (s64) f);
}
-#define NSPECIFIC 8
+#define NSPECIFIC 9
unsigned long long specific[NSPECIFIC] = {
0, 1, -1, 0x7FFFFFFFULL, 0x80000000ULL, 0xFFFFFFFFULL,
- 0x7FFFFFFFFFFFULL, 0x8000000000000000ULL
+ 0x7FFFFFFFFFFFULL, 0x8000000000000000ULL, 0x100000003ULL
};
int main()