From 90c59282be5ba1f2cd7c2561d17f0559c140c597 Mon Sep 17 00:00:00 2001
From: Jason Gross <jgross@mit.edu>
Date: Fri, 25 Jan 2019 16:19:49 -0500
Subject: Reduce more in mulmod and squaremod

After some debugging with Andres, the issue seems to be that when we
have multiple TAPs, the first reduction will leave some of the partial
products past the end of the table, because it "folds over" the partial
products table at the location of each TAP.  In the worst case, if we
have a TAP at 1 and a TAP at the second-highest limb, we will have to
reduce as many times as there are limbs.  To prevent quadratic blowup,
we short-circuit the repeat_reduce calculation whenever the list of high
terms is nil.  We deliberately do not support primes with a TAP in the
highest limb.

After     | File Name                                                   | Before    || Change    | % Change
-----------------------------------------------------------------------------------------------------------
17m49.68s | Total                                                       | 17m34.83s || +0m14.84s | +1.40%
-----------------------------------------------------------------------------------------------------------
2m22.88s  | PushButtonSynthesis/WordByWordMontgomeryReificationCache.vo | 2m21.60s  || +0m01.28s | +0.90%
2m19.62s  | Arithmetic.vo                                               | 2m17.66s  || +0m01.96s | +1.42%
0m34.02s  | PushButtonSynthesis/UnsaturatedSolinasReificationCache.vo   | 0m32.24s  || +0m01.78s | +5.52%
0m32.06s  | ExtractionHaskell/unsaturated_solinas                       | 0m30.99s  || +0m01.07s | +3.45%
0m19.95s  | SlowPrimeSynthesisExamples.vo                               | 0m18.40s  || +0m01.55s | +8.42%
3m12.02s  | p384_32.c                                                   | 3m12.03s  || -0m00.00s | -0.00%
1m02.14s  | Fancy/Montgomery256.vo                                      | 1m02.09s  || +0m00.04s | +0.08%
0m51.24s  | Fancy/Barrett256.vo                                         | 0m51.28s  || -0m00.03s | -0.07%
0m45.72s  | ExtractionHaskell/word_by_word_montgomery                   | 0m45.05s  || +0m00.67s | +1.48%
0m38.67s  | p521_32.c                                                   | 0m38.24s  || +0m00.42s | +1.12%
0m31.94s  | p521_64.c                                                   | 0m31.38s  || +0m00.56s | +1.78%
0m24.85s  | ExtractionHaskell/saturated_solinas                         | 0m24.45s  || +0m00.40s | +1.63%
0m23.53s  | PushButtonSynthesis/MontgomeryReductionReificationCache.vo  | 0m22.79s  || +0m00.74s | +3.24%
0m21.96s  | PushButtonSynthesis/WordByWordMontgomery.vo                 | 0m21.88s  || +0m00.08s | +0.36%
0m19.60s  | PushButtonSynthesis/UnsaturatedSolinas.vo                   | 0m19.56s  || +0m00.04s | +0.20%
0m17.76s  | ExtractionOCaml/word_by_word_montgomery                     | 0m17.26s  || +0m00.50s | +2.89%
0m14.90s  | p448_solinas_64.c                                           | 0m14.91s  || -0m00.00s | -0.06%
0m13.55s  | secp256k1_32.c                                              | 0m13.39s  || +0m00.16s | +1.19%
0m13.31s  | p256_32.c                                                   | 0m13.06s  || +0m00.25s | +1.91%
0m13.30s  | PushButtonSynthesis/BarrettReductionReificationCache.vo     | 0m12.44s  || +0m00.86s | +6.91%
0m11.47s  | p484_64.c                                                   | 0m11.31s  || +0m00.16s | +1.41%
0m11.43s  | ExtractionOCaml/unsaturated_solinas                         | 0m11.15s  || +0m00.27s | +2.51%
0m10.55s  | ExtractionOCaml/word_by_word_montgomery.ml                  | 0m10.66s  || -0m00.10s | -1.03%
0m08.42s  | ExtractionOCaml/saturated_solinas                           | 0m08.24s  || +0m00.17s | +2.18%
0m07.07s  | ExtractionOCaml/unsaturated_solinas.ml                      | 0m07.04s  || +0m00.03s | +0.42%
0m06.66s  | COperationSpecifications.vo                                 | 0m06.31s  || +0m00.35s | +5.54%
0m06.50s  | ExtractionHaskell/word_by_word_montgomery.hs                | 0m06.39s  || +0m00.11s | +1.72%
0m06.19s  | p224_32.c                                                   | 0m05.99s  || +0m00.20s | +3.33%
0m05.35s  | p384_64.c                                                   | 0m05.24s  || +0m00.10s | +2.09%
0m05.26s  | ExtractionOCaml/saturated_solinas.ml                        | 0m05.24s  || +0m00.01s | +0.38%
0m05.24s  | ExtractionHaskell/unsaturated_solinas.hs                    | 0m05.17s  || +0m00.07s | +1.35%
0m04.23s  | ExtractionHaskell/saturated_solinas.hs                      | 0m04.12s  || +0m00.11s | +2.66%
0m04.14s  | PushButtonSynthesis/SaturatedSolinasReificationCache.vo     | 0m03.89s  || +0m00.24s | +6.42%
0m03.68s  | PushButtonSynthesis/Primitives.vo                           | 0m03.62s  || +0m00.06s | +1.65%
0m03.58s  | PushButtonSynthesis/SmallExamples.vo                        | 0m03.30s  || +0m00.28s | +8.48%
0m03.49s  | PushButtonSynthesis/SaturatedSolinas.vo                     | 0m03.25s  || +0m00.24s | +7.38%
0m02.38s  | curve25519_32.c                                             | 0m02.28s  || +0m00.10s | +4.38%
0m01.66s  | PushButtonSynthesis/BarrettReduction.vo                     | 0m01.39s  || +0m00.27s | +19.42%
0m01.47s  | curve25519_64.c                                             | 0m01.42s  || +0m00.05s | +3.52%
0m01.34s  | CLI.vo                                                      | 0m01.45s  || -0m00.10s | -7.58%
0m01.32s  | PushButtonSynthesis/MontgomeryReduction.vo                  | 0m01.30s  || +0m00.02s | +1.53%
0m01.08s  | secp256k1_64.c                                              | 0m01.04s  || +0m00.04s | +3.84%
0m01.07s  | StandaloneOCamlMain.vo                                      | 0m01.10s  || -0m00.03s | -2.72%
0m01.04s  | StandaloneHaskellMain.vo                                    | 0m01.12s  || -0m00.08s | -7.14%
0m01.04s  | p224_64.c                                                   | 0m01.14s  || -0m00.09s | -8.77%
0m01.00s  | p256_64.c                                                   | 0m00.98s  || +0m00.02s | +2.04%
---
 src/Arithmetic.v | 48 +++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 45 insertions(+), 3 deletions(-)

(limited to 'src')

diff --git a/src/Arithmetic.v b/src/Arithmetic.v
index 4436bdf0e..dc4c5031b 100644
--- a/src/Arithmetic.v
+++ b/src/Arithmetic.v
@@ -178,6 +178,47 @@ Module Associational.
     autorewrite with zsimplify_const; rewrite eval_split; trivial.
   Qed.
 
+  (* reduce at most [n] times, stopping early if the high list is nil at any point *)
+  Definition repeat_reduce (n : nat) (s:Z) (c:list _) (p:list _) : list (Z * Z)
+    := nat_rect
+         _
+         (fun p => p)
+         (fun n' repeat_reduce_n' p
+          => let lo_hi := split s p in
+             if (length (snd lo_hi) =? 0)%nat
+             then p
+             else let p := fst lo_hi ++ mul c (snd lo_hi) in
+                  repeat_reduce_n' p)
+         n
+         p.
+
+  Lemma repeat_reduce_S_step n s c p
+    : repeat_reduce (S n) s c p
+      = if (length (snd (split s p)) =? 0)%nat
+        then p
+        else repeat_reduce n s c (reduce s c p).
+  Proof using Type. cbv [repeat_reduce]; cbn [nat_rect]; break_innermost_match; auto. Qed.
+
+  Lemma eval_repeat_reduce n s c p (s_nz:s<>0) (modulus_nz:s-eval c<>0) :
+    eval (repeat_reduce n s c p) mod (s - eval c) = eval p mod (s - eval c).
+  Proof using Type.
+    revert p; induction n as [|n IHn]; intro p; [ reflexivity | ];
+      rewrite repeat_reduce_S_step; break_innermost_match;
+        [ reflexivity | rewrite IHn ].
+    now rewrite eval_reduce.
+  Qed.
+  Hint Rewrite eval_repeat_reduce : push_eval.
+
+  Lemma eval_repeat_reduce_adjusted n s c p w c' (s_nz:s<>0) (modulus_nz:s-eval c<>0)
+        (w_mod:w mod s = 0) (w_nz:w <> 0) (Hc' : eval c' = (w / s) * eval c) :
+    eval (repeat_reduce n w c' p) mod (s - eval c) = eval p mod (s - eval c).
+  Proof using Type.
+    revert p; induction n as [|n IHn]; intro p; [ reflexivity | ];
+      rewrite repeat_reduce_S_step; break_innermost_match;
+        [ reflexivity | rewrite IHn ].
+    now rewrite eval_reduce_adjusted.
+  Qed.
+
   (*
   Definition splitQ (s:Q) (p:list (Z*Z)) : list (Z*Z) * list (Z*Z)
     := let hi_lo := partition (fun t => (fst t * Zpos (Qden s)) mod (Qnum s) =? 0) p in
@@ -789,7 +830,7 @@ Module Positional.
       := let a_a := to_associational n a in
          let b_a := to_associational n b in
          let ab_a := Associational.mul a_a b_a in
-         let abm_a := Associational.reduce s c ab_a in
+         let abm_a := Associational.repeat_reduce n s c ab_a in
          from_associational n abm_a.
     Lemma eval_mulmod n (f g:list Z)
           (Hf : length f = n) (Hg : length g = n) :
@@ -797,13 +838,14 @@ Module Positional.
       = (eval n f * eval n g) mod (s - Associational.eval c).
     Proof using m_nz s_nz weight_0 weight_nz. cbv [mulmod]; push; trivial.
     destruct f, g; simpl in *; [ right; subst n | left; try omega.. ].
-    clear; cbv -[Associational.reduce].
+    clear; cbv -[Associational.repeat_reduce].
     induction c as [|?? IHc]; simpl; trivial.                 Qed.
 
     Definition squaremod (n:nat) (a:list Z) : list Z
       := let a_a := to_associational n a in
          let aa_a := Associational.reduce_square s c a_a in
-         from_associational n aa_a.
+         let aam_a := Associational.repeat_reduce (pred n) s c aa_a in
+         from_associational n aam_a.
     Lemma eval_squaremod n (f:list Z)
           (Hf : length f = n) :
       eval n (squaremod n f) mod (s - Associational.eval c)
-- 
cgit v1.2.3