tweak binders so that shifts from base conversion get inlined

author: Jade Philipoom <jadep@google.com> 2018-05-17 01:23:27 +0200
committer: Jade Philipoom <jadep@google.com> 2018-05-31 13:46:48 +0200
commit: 9affad3455894694ccaa62f311c2f56a2c708b61 (patch)
tree: 7e4ee44c2a59f290f3be283ffca8792802de5fdd
parent: de9687da6f873de2f481524d252a238655a6e9f9 (diff)
1 files changed, 212 insertions, 192 deletions
diff --git a/src/Experiments/SimplyTypedArithmetic.v b/src/Experiments/SimplyTypedArithmetic.v
index 3baaccfd1..416938689 100644
--- a/src/Experiments/SimplyTypedArithmetic.v
+++ b/src/Experiments/SimplyTypedArithmetic.v
@@ -117,6 +117,15 @@ Module Associational.
          rewrite <-reduction_rule, eval_split; trivial.      Qed.
   Hint Rewrite eval_reduce : push_eval.
 
+  Definition bind_snd (p : list (Z*Z)) :=
+    map (fun t => dlet_nd t2 := snd t in (fst t, t2)) p.
+
+  Lemma bind_snd_correct p : bind_snd p = p.
+  Proof.
+    cbv [bind_snd]; induction p as [| [? ?] ];
+      push; [|rewrite IHp]; reflexivity.
+  Qed.
+
   Section Carries.
     Definition carryterm (w fw:Z) (t:Z * Z) :=
       if (Z.eqb (fst t) w)
@@ -1754,7 +1763,7 @@ Module BaseConversion.
     (* carry at specified indices in dw, then use Rows.flatten to convert to Positional with sw *)
     Definition from_associational idxs n (p : list (Z * Z)) : list Z :=
       (* important not to use Positional.carry here; we don't want to accumulate yet *)
-      let p' := fold_right (fun i acc => Associational.carry (dw i) (dw (S i) / dw i) acc) p (rev idxs) in
+      let p' := fold_right (fun i acc => Associational.carry (dw i) (dw (S i) / dw i) acc) (Associational.bind_snd p) (rev idxs) in
       fst (Rows.flatten sw n (Rows.from_associational sw n p')).
 
     Lemma eval_carries p idxs :
@@ -1775,6 +1784,7 @@ Module BaseConversion.
     Proof.
       cbv [from_associational]; intros.
       rewrite Rows.flatten_mod by eauto using Rows.length_from_associational.
+      rewrite Associational.bind_snd_correct.
       push_eval.
     Qed.
     Hint Rewrite eval_from_associational using solve [push_eval; distr_length] : push_eval.
@@ -1785,6 +1795,7 @@ Module BaseConversion.
     Proof.
       intros; cbv [from_associational].
       rewrite Rows.flatten_partitions with (n:=n) by (eauto using Rows.length_from_associational; omega).
+      rewrite Associational.bind_snd_correct.
       push_eval.
     Qed.
 
@@ -1793,21 +1804,41 @@ Module BaseConversion.
     Proof.
       intros. cbv [from_associational].
       rewrite Rows.flatten_partitions' with (n:=n) by eauto using Rows.length_from_associational.
+      rewrite Associational.bind_snd_correct.
       push_eval.
     Qed.
 
-    (* bind the input, but *not* the carrying operations *)
     Derive from_associational_inlined
-           SuchThat (forall n idxs p,
-                        from_associational_inlined n idxs p = from_associational n idxs p)
+           SuchThat (forall idxs n p,
+                        from_associational_inlined idxs n p = from_associational idxs n p)
            As from_associational_inlined_correct.
     Proof.
       intros.
       cbv beta iota delta [from_associational Associational.carry Associational.carryterm].
-      unfold Let_In at 2 3.
+      cbv beta iota delta [Let_In]. (* inlines all shifts/lands from carryterm *)
+      cbv beta iota delta [from_associational Rows.from_associational Columns.from_associational].
+      cbv beta iota delta [Let_In]. (* inlines the shifts from place *)
       subst from_associational_inlined; reflexivity.
     Qed.
 
+    Derive to_associational_inlined
+           SuchThat (forall n m p,
+                        to_associational_inlined n m p = to_associational n m p)
+           As to_associational_inlined_correct.
+    Proof.
+      intros.
+      cbv beta iota delta [ to_associational convert_bases
+                                             Positional.to_associational
+                                             Positional.from_associational
+                                             chained_carries_no_reduce
+                                             carry
+                                             Associational.carry
+                                             Associational.carryterm
+                          ].
+      cbv beta iota delta [Let_In].
+      subst to_associational_inlined; reflexivity.
+    Qed.
+
     (* carry chain that aligns terms in the intermediate weight with the final weight *)
     Definition aligned_carries (log_dw_sw nout : nat)
       := (map (fun i => ((log_dw_sw * (i + 1)) - 1))%nat (seq 0 nout)).
@@ -1870,6 +1901,9 @@ Module BaseConversion.
       Z.rewrite_mod_small. reflexivity.
     Qed.
 
+    (* For some reason, this is a universe inconsistency if not factored out *)
+    Lemma nout_nonzero : nout <> 0%nat. Proof. omega. Qed.
+
     Derive widemul_inlined
            SuchThat (forall a b,
                         0 <= a * b < 2^log2base * 2^log2base ->
@@ -1878,12 +1912,10 @@ Module BaseConversion.
     Proof.
       intros.
       rewrite <-widemul_correct by auto.
-      cbv beta iota delta [widemul mul_converted to_associational convert_bases
-                                   chained_carries_no_reduce carry
-                                   Associational.carry Associational.carryterm
-                                   Positional.from_associational].
-      cbv beta iota delta [Let_In].
-      rewrite <-from_associational_inlined_correct.
+      cbv beta iota delta [widemul mul_converted].
+      rewrite <-to_associational_inlined_correct with (p:=[a]).
+      rewrite <-to_associational_inlined_correct with (p:=[b]).
+      rewrite <-from_associational_inlined_correct by (apply nout_nonzero || assumption).
       subst widemul_inlined; reflexivity.
     Qed.
   End widemul.
@@ -10055,43 +10087,50 @@ Module Barrett256.
      This is why their results are not cast (because the carry has range [-1~>0]). *)
   (*
 barrett_red256 = fun var : type -> Type => λ x : var (type.type_primitive type.Z * type.type_primitive type.Z)%ctype,
-                 expr_let x0 := SELM (x₂, 0, 26959946667150639793205513449348445388433292963828203772348655992835) in
-                 expr_let x1 := RSHI (0, x₂, 255) in
-                 expr_let x2 := RSHI (x₂, x₁, 255) in
-                 expr_let x3 := 79228162514264337589248983038 *₂₅₆ ((uint128)(x2) & 340282366920938463463374607431768211455) in
-                 expr_let x4 := 340282366841710300930663525764514709507 *₂₅₆ (uint128)(x2 >> 128) in
-                 expr_let x5 := 79228162514264337589248983038 *₂₅₆ (uint128)(x2 >> 128) in
-                 expr_let x6 := (uint256)(((uint128)(x3) & 340282366920938463463374607431768211455) << 128) in
-                 expr_let x7 := (uint128)(x3 >> 128) in
-                 expr_let x8 := (uint256)(((uint128)(x4) & 340282366920938463463374607431768211455) << 128) in
-                 expr_let x9 := (uint128)(x4 >> 128) in
-                 expr_let x10 := 340282366841710300930663525764514709507 *₂₅₆ ((uint128)(x2) & 340282366920938463463374607431768211455) in
-                 expr_let x11 := ADD_256 (x8, x10) in
-                 expr_let x12 := ADDC_128 (x11₂, x7, x9) in
-                 expr_let x13 := ADD_256 (x6, x11₁) in
-                 expr_let x14 := ADDC_256 (x13₂, x5, x12₁) in
-                 expr_let x15 := ADD_256 (x2, x14₁) in
-                 expr_let x16 := ADDC_128 (x15₂, 0, x1) in
-                 expr_let x17 := ADD_256 (x0, x15₁) in
-                 expr_let x18 := ADDC_128 (x17₂, 0, x16₁) in
-                 expr_let x19 := RSHI (x18₁, x17₁, 1) in
-                 expr_let x20 := 79228162514264337593543950335 *₂₅₆ (uint128)(x19 >> 128) in
-                 expr_let x21 := 340282366841710300967557013911933812736 *₂₅₆ ((uint128)(x19) & 340282366920938463463374607431768211455) in
-                 expr_let x22 := 340282366841710300967557013911933812736 *₂₅₆ (uint128)(x19 >> 128) in
-                 expr_let x23 := (uint256)(((uint128)(x20) & 340282366920938463463374607431768211455) << 128) in
-                 expr_let x24 := (uint128)(x20 >> 128) in
-                 expr_let x25 := (uint256)(((uint128)(x21) & 340282366920938463463374607431768211455) << 128) in
-                 expr_let x26 := (uint128)(x21 >> 128) in
-                 expr_let x27 := 79228162514264337593543950335 *₂₅₆ ((uint128)(x19) & 340282366920938463463374607431768211455) in
-                 expr_let x28 := ADD_256 (x25, x27) in
-                 expr_let x29 := ADDC_256 (x28₂, x24, x26) in
-                 expr_let x30 := ADD_256 (x23, x28₁) in
-                 expr_let x31 := ADDC_256 (x30₂, x22, x29₁) in
-                 expr_let x32 := Z.add_get_carry_concrete 115792089237316195423570985008687907853269984665640564039457584007913129639936 @@ (Z.opp @@ (fst @@ x30), x₁) in
-                 expr_let x33 := Z.add_with_get_carry_concrete 115792089237316195423570985008687907853269984665640564039457584007913129639936 @@ (x32₂, Z.opp @@ (fst @@ x31), x₂) in
-                 expr_let x34 := SELL (x33₁, 0, 115792089210356248762697446949407573530086143415290314195533631308867097853951) in
-                 expr_let x35 := Z.cast uint256 @@ (fst @@ SUB_256 (x32₁, x34)) in
-                 ADDM (x35, 0, 115792089210356248762697446949407573530086143415290314195533631308867097853951)
+                expr_let x0 := SELM (x₂, 0,
+                            26959946667150639793205513449348445388433292963828203772348655992835) in
+                expr_let x1 := RSHI (0, x₂, 255) in
+                expr_let x2 := RSHI (x₂, x₁, 255) in
+                expr_let x3 := 79228162514264337589248983038 *₂₅₆ (uint128)(x2 >> 128) in
+                expr_let x4 := 79228162514264337589248983038 *₂₅₆
+                            ((uint128)(x2) & 340282366920938463463374607431768211455) in
+                expr_let x5 := 340282366841710300930663525764514709507 *₂₅₆ (uint128)(x2 >> 128) in
+                expr_let x6 := 340282366841710300930663525764514709507 *₂₅₆
+                            ((uint128)(x2) & 340282366920938463463374607431768211455) in
+                expr_let x7 := ADD_256 ((uint256)(((uint128)(x5) & 340282366920938463463374607431768211455) << 128),
+                            x6) in
+                expr_let x8 := ADDC_128 (x7₂, (uint128)(x4 >> 128), (uint128)(x5 >> 128)) in
+                expr_let x9 := ADD_256 ((uint256)(((uint128)(x4) & 340282366920938463463374607431768211455) << 128),
+                            x7₁) in
+                expr_let x10 := ADDC_256 (x9₂, x3, x8₁) in
+                expr_let x11 := ADD_256 (x2, x10₁) in
+                expr_let x12 := ADDC_128 (x11₂, 0, x1) in
+                expr_let x13 := ADD_256 (x0, x11₁) in
+                expr_let x14 := ADDC_128 (x13₂, 0, x12₁) in
+                expr_let x15 := RSHI (x14₁, x13₁, 1) in
+                expr_let x16 := 340282366841710300967557013911933812736 *₂₅₆ (uint128)(x15 >> 128) in
+                expr_let x17 := 79228162514264337593543950335 *₂₅₆ (uint128)(x15 >> 128) in
+                expr_let x18 := 340282366841710300967557013911933812736 *₂₅₆
+                                ((uint128)(x15) & 340282366920938463463374607431768211455) in
+                expr_let x19 := 79228162514264337593543950335 *₂₅₆
+                                ((uint128)(x15) & 340282366920938463463374607431768211455) in
+                expr_let x20 := ADD_256 ((uint256)(((uint128)(x18) & 340282366920938463463374607431768211455) << 128),
+                                x19) in
+                expr_let x21 := ADDC_256 (x20₂, (uint128)(x17 >> 128), (uint128)(x18 >> 128)) in
+                expr_let x22 := ADD_256 ((uint256)(((uint128)(x17) & 340282366920938463463374607431768211455) << 128),
+                                x20₁) in
+                expr_let x23 := ADDC_256 (x22₂, x16, x21₁) in
+                expr_let x24 := Z.add_get_carry_concrete
+                                115792089237316195423570985008687907853269984665640564039457584007913129639936 @@
+                                (Z.opp @@ (fst @@ x22), x₁) in
+                expr_let x25 := Z.add_with_get_carry_concrete
+                                115792089237316195423570985008687907853269984665640564039457584007913129639936 @@
+                                (x24₂, Z.opp @@ (fst @@ x23), x₂) in
+                expr_let x26 := SELL (x25₁, 0,
+                                115792089210356248762697446949407573530086143415290314195533631308867097853951) in
+                expr_let x27 := Z.cast uint256 @@ (fst @@ SUB_256 (x24₁, x26)) in
+                ADDM (x27, 0,
+                115792089210356248762697446949407573530086143415290314195533631308867097853951)
      : Expr (type.uncurry (type.type_primitive type.Z -> type.type_primitive type.Z -> type.type_primitive type.Z))
   *)
 
@@ -10104,42 +10143,39 @@ barrett_red256 = fun var : type -> Type => λ x : var (type.type_primitive type.
   Local Notation "'RegMod'" := (Straightline.expr.Primitive (t:=type.Z) 115792089210356248762697446949407573530086143415290314195533631308867097853951).
   Local Notation "'RegMuLow'" := (Straightline.expr.Primitive (t:=type.Z) 26959946667150639793205513449348445388433292963828203772348655992835).
   Print barrett_red256_prefancy.
-  (* TODO : make prefancy cast uint128s to uint256s *)
   (* Note : currently (correctly) fails to convert the adds that should be subs *)
   (*
     selm@(y, $x₂, RegZero, RegMuLow);
     rshi@(y0, RegZero, $x₂,255);
     rshi@(y1, $x₂, $x₁,255);
-    mulhl@(y2, RegMuLow, $y1);
-    mullh@(y3, RegMuLow, $y1);
-    mulhh@(y4, RegMuLow, $y1);
-    shiftL@(y5, $y2, 128);
-    shiftR@(y6, $y2, 128);
-    shiftL@(y7, $y3, 128);
-    shiftR@(y8, $y3, 128);
-    mulll@(y9, RegMuLow, $y1);
-    add@(y10, $y7, $y9);
-    addc@(y11, carry{$y10}, $y6, $y8);
-    add@(y12, $y5, $y10);
-    addc@(y13, carry{$y12}, $y4, $y11);
-    add@(y14, $y1, $y13);
-    addc@(y15, carry{$y14}, RegZero, $y0);
-    add@(y16, $y, $y14);
-    addc@(y17, carry{$y16}, RegZero, $y15);
-    rshi@(y18, $y17, $y16,1);
-    mullh@(y19, RegMod, $y18);
-    mulhl@(y20, RegMod, $y18);
-    mulhh@(y21, RegMod, $y18);
-    shiftL@(y22, $y19, 128);
-    shiftR@(y23, $y19, 128);
-    shiftL@(y24, $y20, 128);
-    shiftR@(y25, $y20, 128);
-    mulll@(y26, RegMod, $y18);
-    add@(y27, $y24, $y26);
-    addc@(y28, carry{$y27}, $y23, $y25);
-    add@(y29, $y22, $y27);
-    addc@(_, carry{$y29}, $y21, $y28);
-    Straightline.expr.Scalar (Straightline.expr.Primitive (-1))
+    mulhh@(y2, RegMuLow, $y1);
+    mulhl@(y3, RegMuLow, $y1);
+    mullh@(y4, RegMuLow, $y1);
+    mulll@(y5, RegMuLow, $y1);
+    add@(y6, Straightline.expr.Cast uint256 (Straightline.expr.Shiftl 128 (Lower{$y4})), $y5);
+    addc@(y7, carry{$y6}, Straightline.expr.Cast uint128
+                            (Straightline.expr.Shiftr 128 ($y3)), 
+    Straightline.expr.Cast uint128 (Straightline.expr.Shiftr 128 ($y4)));
+    add@(y8, Straightline.expr.Cast uint256 (Straightline.expr.Shiftl 128 (Lower{$y3})), $y6);
+    addc@(y9, carry{$y8}, $y2, $y7);
+    add@(y10, $y1, $y9);
+    addc@(y11, carry{$y10}, RegZero, $y0);
+    add@(y12, $y, $y10);
+    addc@(y13, carry{$y12}, RegZero, $y11);
+    rshi@(y14, $y13, $y12,1);
+    mulhh@(y15, RegMod, $y14);
+    mullh@(y16, RegMod, $y14);
+    mulhl@(y17, RegMod, $y14);
+    mulll@(y18, RegMod, $y14);
+    add@(y19, Straightline.expr.Cast uint256
+                (Straightline.expr.Shiftl 128 (Lower{$y17})), $y18);
+    addc@(y20, carry{$y19}, Straightline.expr.Cast uint128
+                            (Straightline.expr.Shiftr 128 ($y16)), 
+    Straightline.expr.Cast uint128 (Straightline.expr.Shiftr 128 ($y17)));
+    add@(y21, Straightline.expr.Cast uint256
+                (Straightline.expr.Shiftl 128 (Lower{$y16})), $y19);
+    addc@(_, carry{$y21}, $y15, $y20);
+    Straightline.expr.Scalar (Straightline.expr.Primitive (-1)) 
   *)
 End Barrett256.
 
@@ -10592,7 +10628,7 @@ Module MontgomeryReduction.
       assert (0 <= T mod R * N' < w 2) by (solve_range).
 
       rewrite !BaseConversion.widemul_inlined_correct
-        by (rewrite ?BaseConversion.widemul_correct; autorewrite with push_nth_default; solve_range).
+        by (rewrite ?BaseConversion.widemul_inlined_correct; autorewrite with push_nth_default; solve_range).
       rewrite Rows.add_partitions, Rows.add_div by (distr_length; apply wprops; omega).
       rewrite R_two_pow.
       cbv [Rows.partition seq]. rewrite !eval2.
@@ -10697,6 +10733,13 @@ Module Montgomery256.
          As montred256_correct.
   Proof. Time solve_rmontred machine_wordsize. Time Qed.
 
+  Definition montred256_prefancy' := PreFancy.of_Expr machine_wordsize [N;N'] montred256.
+
+  Derive montred256_prefancy
+         SuchThat (montred256_prefancy = montred256_prefancy' type.interp)
+         As montred256_prefancy_eq.
+  Proof. lazy - [type.interp]; reflexivity. Qed.
+
   Lemma montred'_correct_specialized R' (R'_correct : Z.equiv_modulo N (R * R') 1) :
     forall (lo hi : Z),
       0 <= lo < R -> 0 <= hi < R -> 0 <= lo + R * hi < R * N ->
@@ -10718,8 +10761,16 @@ Module Montgomery256.
         (t:=type.prod type.Z type.Z)
         (Some r[0 ~> 2 ^ machine_wordsize - 1]%zrange, Some r[0 ~> 2 ^ machine_wordsize - 1]%zrange)
         xy = true ->
-       expr.Interp (@ident.interp) montred256 xy = MontgomeryReduction.montred' N R N' (Z.log2 R) 2 2 xy.
+       expr.Interp (@ident.interp) montred256 xy = app_curried (t:=type.arrow (type.prod type.Z type.Z) type.Z) (MontgomeryReduction.montred' N R N' (Z.log2 R) 2 2) xy.
   Proof. intros; destruct (montred256_correct xy); assumption. Qed.
+  Lemma montred256_correct_proj2' :
+    forall xy : type.interp (type.prod type.Z type.Z),
+      ZRange.type.option.is_bounded_by
+        (t:=type.prod type.Z type.Z)
+        (Some r[0 ~> 2 ^ machine_wordsize - 1]%zrange, Some r[0 ~> 2 ^ machine_wordsize - 1]%zrange)
+        xy = true ->
+       expr.Interp (@ident.interp) montred256 xy = MontgomeryReduction.montred' N R N' (Z.log2 R) 2 2 xy.
+  Proof. intros; rewrite montred256_correct_proj2 by assumption; unfold app_curried; exact eq_refl. Qed.
 
   Lemma montred256_correct_full  R' (R'_correct : Z.equiv_modulo N (R * R') 1) :
     forall (lo hi : Z),
@@ -10728,7 +10779,7 @@ Module Montgomery256.
   Proof.
     intros.
     rewrite <-montred'_correct_specialized by assumption.
-    rewrite <-montred256_correct_proj2.
+    rewrite <-montred256_correct_proj2'.
     { cbv [expr.Interp type.uncurried_domain type.uncurry type.final_codomain].
       reflexivity. }
     { cbn. rewrite !andb_true_iff. cbv [R N] in *.
@@ -10766,13 +10817,6 @@ Module Montgomery256.
     | |- context [PreFancy.ok_expr] => constructor; cbn [fst snd]; repeat ok_expr_step'
     end; intros; cbn [Nat.max].
 
-  Definition montred256_prefancy' := PreFancy.of_Expr machine_wordsize [N;N'] montred256.
-
-  Derive montred256_prefancy
-         SuchThat (montred256_prefancy = montred256_prefancy' type.interp)
-         As montred256_prefancy_eq.
-  Proof. lazy - [type.interp]; reflexivity. Qed.
-
   Lemma montred256_prefancy_correct R' (R'_correct : Z.equiv_modulo N (R * R') 1) :
     forall (lo hi : Z) dummy_arrow,
       0 <= lo < R -> 0 <= hi < R -> 0 <= lo + R * hi < R * N ->
@@ -10802,28 +10846,22 @@ Module Montgomery256.
 montred256 = fun var : type -> Type => (λ x : var (type.type_primitive type.Z * type.type_primitive type.Z)%ctype,
     expr_let x0 := 79228162514264337593543950337 *₂₅₆ (uint128)(x₁ >> 128) in
     expr_let x1 := 340282366841710300986003757985643364352 *₂₅₆ ((uint128)(x₁) & 340282366920938463463374607431768211455) in
-    expr_let x2 := (uint256)(((uint128)(x0) & 340282366920938463463374607431768211455) << 128) in
-    expr_let x3 := (uint256)(((uint128)(x1) & 340282366920938463463374607431768211455) << 128) in
-    expr_let x4 := 79228162514264337593543950337 *₂₅₆ ((uint128)(x₁) & 340282366920938463463374607431768211455) in
-    expr_let x5 := ADD_256 (x3, x4) in
-    expr_let x6 := ADD_256 (x2, x5₁) in
-    expr_let x7 := 79228162514264337593543950335 *₂₅₆ (uint128)(x6₁ >> 128) in
-    expr_let x8 := 340282366841710300967557013911933812736 *₂₅₆ ((uint128)(x6₁) & 340282366920938463463374607431768211455) in
-    expr_let x9 := 340282366841710300967557013911933812736 *₂₅₆ (uint128)(x6₁ >> 128) in
-    expr_let x10 := (uint256)(((uint128)(x7) & 340282366920938463463374607431768211455) << 128) in
-    expr_let x11 := (uint128)(x7 >> 128) in
-    expr_let x12 := (uint256)(((uint128)(x8) & 340282366920938463463374607431768211455) << 128) in
-    expr_let x13 := (uint128)(x8 >> 128) in
-    expr_let x14 := 79228162514264337593543950335 *₂₅₆ ((uint128)(x6₁) & 340282366920938463463374607431768211455) in
-    expr_let x15 := ADD_256 (x12, x14) in
-    expr_let x16 := ADDC_256 (x15₂, x11, x13) in
-    expr_let x17 := ADD_256 (x10, x15₁) in
-    expr_let x18 := ADDC_256 (x17₂, x9, x16₁) in
-    expr_let x19 := ADD_256 (x17₁, x₁) in
-    expr_let x20 := ADDC_256 (x19₂, x18₁, x₂) in
-    expr_let x21 := SELC (x20₂, 0, 115792089210356248762697446949407573530086143415290314195533631308867097853951) in
-    expr_let x22 := SUB_256 (x20₁, x21) in
-    ADDM (x22₁, 0, 115792089210356248762697446949407573530086143415290314195533631308867097853951))%expr
+    expr_let x2 := 79228162514264337593543950337 *₂₅₆ ((uint128)(x₁) & 340282366920938463463374607431768211455) in
+    expr_let x3 := ADD_256 ((uint256)(((uint128)(x1) & 340282366920938463463374607431768211455) << 128), x2) in
+    expr_let x4 := ADD_256 ((uint256)(((uint128)(x0) & 340282366920938463463374607431768211455) << 128), x3₁) in
+    expr_let x5 := 340282366841710300967557013911933812736 *₂₅₆ (uint128)(x4₁ >> 128) in
+    expr_let x6 := 79228162514264337593543950335 *₂₅₆ (uint128)(x4₁ >> 128) in
+    expr_let x7 := 340282366841710300967557013911933812736 *₂₅₆ ((uint128)(x4₁) & 340282366920938463463374607431768211455) in
+    expr_let x8 := 79228162514264337593543950335 *₂₅₆ ((uint128)(x4₁) & 340282366920938463463374607431768211455) in
+    expr_let x9 := ADD_256 ((uint256)(((uint128)(x7) & 340282366920938463463374607431768211455) << 128), x8) in
+    expr_let x10 := ADDC_256 (x9₂, (uint128)(x6 >> 128), (uint128)(x7 >> 128)) in
+    expr_let x11 := ADD_256 ((uint256)(((uint128)(x6) & 340282366920938463463374607431768211455) << 128), x9₁) in
+    expr_let x12 := ADDC_256 (x11₂, x5, x10₁) in
+    expr_let x13 := ADD_256 (x11₁, x₁) in
+    expr_let x14 := ADDC_256 (x13₂, x12₁, x₂) in
+    expr_let x15 := SELC (x14₂, 0, 115792089210356248762697446949407573530086143415290314195533631308867097853951) in
+    expr_let x16 := SUB_256 (x14₁, x15) in
+    ADDM (x16₁, 0, 115792089210356248762697446949407573530086143415290314195533631308867097853951))%expr
      : Expr (type.uncurry (type.type_primitive type.Z * type.type_primitive type.Z -> type.type_primitive type.Z))
 *)
 
@@ -10834,29 +10872,23 @@ montred256 = fun var : type -> Type => (λ x : var (type.type_primitive type.Z *
   Print montred256_prefancy.
   (*
    mulhl@(y0, RegPInv, $x₁);
-   shiftL@(y1, $y, 128);
-   shiftL@(y2, $y0, 128);
-   mulll@(y3, RegPInv, $x₁);
-   add@(y4, $y2, $y3);
-   add@(y5, $y1, $y4);
-   mullh@(y6, RegMod, $y5);
-   mulhl@(y7, RegMod, $y5);
-   mulhh@(y8, RegMod, $y5);
-   shiftL@(y9, $y6, 128);
-   shiftR@(y10, $y6, 128);
-   shiftL@(y11, $y7, 128);
-   shiftR@(y12, $y7, 128);
-   mulll@(y13, RegMod, $y5);
-   add@(y14, $y11, $y13);
-   addc@(y15, carry{$y14}, $y10, $y12);
-   add@(y16, $y9, $y14);
-   addc@(y17, carry{$y16}, $y8, $y15);
-   add@(y18, $y16, $x₁);
-   addc@(y19, carry{$y18}, $y17, $x₂);
-   selc@(y20, carry{$y19}, RegZero, RegMod);
-   sub@(y21, $y19, $y20);
-   addm@(y22, $y21, RegZero, RegMod);
-   Ret $y22
+   mulll@(y1, RegPInv, $x₁);
+   add@(y2, Straightline.expr.Cast uint256 (Straightline.expr.Shiftl 128 (Lower{$y0})), $y1);
+   add@(y3, Straightline.expr.Cast uint256 (Straightline.expr.Shiftl 128 (Lower{$y})), $y2);
+   mulhh@(y4, RegMod, $y3);
+   mullh@(y5, RegMod, $y3);
+   mulhl@(y6, RegMod, $y3);
+   mulll@(y7, RegMod, $y3);
+   add@(y8, Straightline.expr.Cast uint256 (Straightline.expr.Shiftl 128 (Lower{$y6})), $y7);
+   addc@(y9, carry{$y8}, Straightline.expr.Cast uint128 (Straightline.expr.Shiftr 128 ($y5)), Straightline.expr.Cast uint128 (Straightline.expr.Shiftr 128 ($y6)));
+   add@(y10, Straightline.expr.Cast uint256 (Straightline.expr.Shiftl 128 (Lower{$y5})), $y8);
+   addc@(y11, carry{$y10}, $y4, $y9);
+   add@(y12, $y10, $x₁);
+   addc@(y13, carry{$y12}, $y11, $x₂);
+   selc@(y14, carry{$y13}, RegZero, RegMod);
+   sub@(y15, $y13, $y14);
+   addm@(y16, $y15, RegZero, RegMod);
+   Ret $y16
   *)
 End Montgomery256.
 
@@ -10999,68 +11031,56 @@ Print Barrett256.barrett_red256.
 c.Selm($x0, $x_hi, RegZero, RegMuLow);
 c.Rshi($x1, RegZero, $x_hi, 255);
 c.Rshi($x2, $x_hi, $x_lo, 255);
-c.Mul128x128($x3, c.Upper(RegMuLow), c.Lower($x2));
-c.Mul128x128($x4, c.Lower(RegMuLow), c.Upper($x2));
-c.Mul128x128($x5, c.Upper(RegMuLow), c.Upper($x2));
-c.ShiftL($x6, $x3, 128);
-c.ShiftR($x7, $x3, 128);
-c.ShiftL($x8, $x4, 128);
-c.ShiftR($x9, $x4, 128);
-c.Mul128x128($x10, c.Lower(RegMuLow), c.Lower($x2));
-c.Add256($x11, $x8, $x10);
-c.Addc128($x12, $x11_hi, $x7, $x9);
-c.Add256($x13, $x6, $x11_lo);
-c.Addc256($x14, $x13_hi, $x5, $x12_lo);
-c.Add256($x15, $x2, $x14_lo);
-c.Addc128($x16, $x15_hi, RegZero, $x1);
-c.Add256($x17, $x0, $x15_lo);
-c.Addc128($x18, $x17_hi, RegZero, $x16_lo);
-c.Rshi($x19, $x18_lo, $x17_lo, 1);
-c.Mul128x128($x20, c.Lower(RegMod), c.Upper($x19));
-c.Mul128x128($x21, c.Upper(RegMod), c.Lower($x19));
-c.Mul128x128($x22, c.Upper(RegMod), c.Upper($x19));
-c.ShiftL($x23, $x20, 128);
-c.ShiftR($x24, $x20, 128);
-c.ShiftL($x25, $x21, 128);
-c.ShiftR($x26, $x21, 128);
-c.Mul128x128($x27, c.Lower(RegMod), c.Lower($x19));
-c.Add256($x28, $x25, $x27);
-c.Addc256($x29, $x28_hi, $x24, $x26);
-c.Add256($x30, $x23, $x28_lo);
-c.Addc256($x31, $x30_hi, $x22, $x29_lo);
-expr_let x32 := Z.add_get_carry_concrete 115792089237316195423570985008687907853269984665640564039457584007913129639936 @@
-                (Z.opp @@ $x30_lo, $x_lo) in
-expr_let x33 := Z.add_with_get_carry_concrete 115792089237316195423570985008687907853269984665640564039457584007913129639936 @@
-                ($x32_hi, Z.opp @@ $x31_lo, $x_hi) in
-c.Sell($x34, $x33_lo, RegZero, RegMod);
-c.Sub($x35, $x32_lo, $x34);
-c.AddM($ret, $x35, RegZero, RegMod);
+c.Mul128x128($x3, c.Upper(RegMuLow), c.Upper($x2));
+c.Mul128x128($x4, c.Upper(RegMuLow), c.Lower($x2));
+c.Mul128x128($x5, c.Lower(RegMuLow), c.Upper($x2));
+c.Mul128x128($x6, c.Lower(RegMuLow), c.Lower($x2));
+c.Add256($x7, (c.Lower($x5) << 128), $x6);
+c.Addc128($x8, $x7_hi, c.Upper($x4), c.Upper($x5));
+c.Add256($x9, (c.Lower($x4) << 128), $x7_lo);
+c.Addc256($x10, $x9_hi, $x3, $x8_lo);
+c.Add256($x11, $x2, $x10_lo);
+c.Addc128($x12, $x11_hi, RegZero, $x1);
+c.Add256($x13, $x0, $x11_lo);
+c.Addc128($x14, $x13_hi, RegZero, $x12_lo);
+c.Rshi($x15, $x14_lo, $x13_lo, 1);
+c.Mul128x128($x16, c.Upper(RegMod), c.Upper($x15));
+c.Mul128x128($x17, c.Lower(RegMod), c.Upper($x15));
+c.Mul128x128($x18, c.Upper(RegMod), c.Lower($x15));
+c.Mul128x128($x19, c.Lower(RegMod), c.Lower($x15));
+c.Add256($x20, (c.Lower($x18) << 128), $x19);
+c.Addc256($x21, $x20_hi, c.Upper($x17), c.Upper($x18));
+c.Add256($x22, (c.Lower($x17) << 128), $x20_lo);
+c.Addc256($x23, $x22_hi, $x16, $x21_lo);
+expr_let x24 := Z.add_get_carry_concrete
+                  115792089237316195423570985008687907853269984665640564039457584007913129639936 @@
+                (Z.opp @@ $x22_lo, $x_lo) in
+expr_let x25 := Z.add_with_get_carry_concrete
+                  115792089237316195423570985008687907853269984665640564039457584007913129639936 @@
+                ($x24_hi, Z.opp @@ $x23_lo, $x_hi) in
+c.Sell($x26, $x25_lo, RegZero, RegMod);
+c.Sub($x27, $x24_lo, $x26);
+c.AddM($ret, $x27, RegZero, RegMod);
 *)
 
 Print Montgomery256.montred256.
 (*
 c.Mul128x128($x0, c.Lower(RegPinv), c.Upper($x_lo));
 c.Mul128x128($x1, c.Upper(RegPinv), c.Lower($x_lo));
-c.ShiftL($x2, $x0, 128);
-c.ShiftL($x3, $x1, 128);
-c.Mul128x128($x4, c.Lower(RegPinv), c.Lower($x_lo));
-c.Add256($x5, $x3, $x4);
-c.Add256($x6, $x2, $x5_lo);
-c.Mul128x128($x7, c.Lower(RegMod), c.Upper($x6_lo));
-c.Mul128x128($x8, c.Upper(RegMod), c.Lower($x6_lo));
-c.Mul128x128($x9, c.Upper(RegMod), c.Upper($x6_lo));
-c.ShiftL($x10, $x7, 128);
-c.ShiftR($x11, $x7, 128);
-c.ShiftL($x12, $x8, 128);
-c.ShiftR($x13, $x8, 128);
-c.Mul128x128($x14, c.Lower(RegMod), c.Lower($x6_lo));
-c.Add256($x15, $x12, $x14);
-c.Addc256($x16, $x15_hi, $x11, $x13);
-c.Add256($x17, $x10, $x15_lo);
-c.Addc256($x18, $x17_hi, $x9, $x16_lo);
-c.Add256($x19, $x17_lo, $x_lo);
-c.Addc256($x20, $x19_hi, $x18_lo, $x_hi);
-c.Selc($x21, $x20_hi, RegZero, RegMod);
-c.Sub($x22, $x20_lo, $x21);
-c.AddM($ret, $x22_lo, RegZero, RegMod);
+c.Mul128x128($x2, c.Lower(RegPinv), c.Lower($x_lo));
+c.Add256($x3, (c.Lower($x1) << 128), $x2);
+c.Add256($x4, (c.Lower($x0) << 128), $x3_lo);
+c.Mul128x128($x5, c.Upper(RegMod), c.Upper($x4_lo));
+c.Mul128x128($x6, c.Lower(RegMod), c.Upper($x4_lo));
+c.Mul128x128($x7, c.Upper(RegMod), c.Lower($x4_lo));
+c.Mul128x128($x8, c.Lower(RegMod), c.Lower($x4_lo));
+c.Add256($x9, (c.Lower($x7) << 128), $x8);
+c.Addc256($x10, $x9_hi, c.Upper($x6), c.Upper($x7));
+c.Add256($x11, (c.Lower($x6) << 128), $x9_lo);
+c.Addc256($x12, $x11_hi, $x5, $x10_lo);
+c.Add256($x13, $x11_lo, $x_lo);
+c.Addc256($x14, $x13_hi, $x12_lo, $x_hi);
+c.Selc($x15, $x14_hi, RegZero, RegMod);
+c.Sub($x16, $x14_lo, $x15);
+c.AddM($ret, $x16_lo, RegZero, RegMod);
  *)
author	Jade Philipoom <jadep@google.com>	2018-05-17 01:23:27 +0200
committer	Jade Philipoom <jadep@google.com>	2018-05-31 13:46:48 +0200
commit	9affad3455894694ccaa62f311c2f56a2c708b61 (patch)
tree	7e4ee44c2a59f290f3be283ffca8792802de5fdd
parent	de9687da6f873de2f481524d252a238655a6e9f9 (diff)