From 0dd2e5f84b91f3af22a37f87eeea476f53445535 Mon Sep 17 00:00:00 2001
From: Jade Philipoom <jadep@google.com>
Date: Thu, 17 May 2018 19:24:41 +0200
Subject: change order of additions so that they a) make more sense and b) are
 more suited to shifting adds

---
 src/Experiments/SimplyTypedArithmetic.v | 445 ++++++++++++++++----------------
 1 file changed, 226 insertions(+), 219 deletions(-)

(limited to 'src/Experiments/SimplyTypedArithmetic.v')

diff --git a/src/Experiments/SimplyTypedArithmetic.v b/src/Experiments/SimplyTypedArithmetic.v
index 416938689..40cc90157 100644
--- a/src/Experiments/SimplyTypedArithmetic.v
+++ b/src/Experiments/SimplyTypedArithmetic.v
@@ -1069,6 +1069,9 @@ Module Rows.
              | H: forall x, In x (?y :: ?ls) -> ?P |- _ =>
                unique pose proof (H y ltac:(apply in_eq));
                unique assert (forall x, In x ls -> P) by auto
+             | H: forall x, In x (?ls ++ ?y :: nil) -> ?P |- _ =>
+               unique pose proof (H y ltac:(auto using in_or_app, in_eq));
+               unique assert (forall x, In x ls -> P) by eauto using in_or_app
              end.
 
     Section FromAssociational.
@@ -1442,7 +1445,7 @@ Module Rows.
       Definition flatten' (start_state : list Z * Z) (inp : rows) : list Z * Z :=
         fold_right (fun next_row (state : list Z * Z)=>
                       let out_carry := sum_rows next_row (fst state) in
-                      (fst out_carry, snd state + snd out_carry)) start_state (rev inp).
+                      (fst out_carry, snd state + snd out_carry)) start_state inp.
 
       (* In order for the output to have the right length and bounds,
          we insert rows of zeroes if there are fewer than two rows. *)
@@ -1451,10 +1454,13 @@ Module Rows.
         flatten' (hd default inp, 0) (hd default (tl inp) :: tl (tl inp)).
 
       Lemma flatten'_cons state r inp :
-        flatten' state (r :: inp) = flatten' (fst (sum_rows r (fst state)), snd state + snd (sum_rows r (fst state))) inp.
+        flatten' state (r :: inp) = (fst (sum_rows r (fst (flatten' state inp))), snd (flatten' state inp) + snd (sum_rows r (fst (flatten' state inp)))).
+      Proof. cbv [flatten']; autorewrite with list push_fold_right. reflexivity. Qed.
+      Lemma flatten'_snoc state r inp :
+        flatten' state (inp ++ r :: nil) = flatten' (fst (sum_rows r (fst state)), snd state + snd (sum_rows r (fst state))) inp.
       Proof. cbv [flatten']; autorewrite with list push_fold_right. reflexivity. Qed.
       Lemma flatten'_nil state : flatten' state [] = state. Proof. reflexivity. Qed.
-      Hint Rewrite flatten'_cons flatten'_nil : push_flatten.
+      Hint Rewrite flatten'_cons flatten'_snoc flatten'_nil : push_flatten.
 
       Ltac push :=
         repeat match goal with
@@ -1466,6 +1472,7 @@ Module Rows.
                | _ => progress In_cases
                | |- _ /\ _ => split
                | |- context [?x mod ?y] => unique pose proof (Z.mul_div_eq_full x y ltac:(auto)); lia
+                                                                                                    | _ => apply length_sum_rows
                | _ => solve [repeat (ring_simplify; f_equal; try ring)]
                | _ => congruence
                | _ => solve [eauto]
@@ -1480,7 +1487,7 @@ Module Rows.
                        (Positional.eval weight n (fst start_state) + eval n inp + weight n * snd start_state)
                        (weight n)).
       Proof.
-        induction inp; push; [apply IHinp; push|].
+        induction inp using rev_ind; push; [apply IHinp; push|].
         destruct (dec (inp = nil)); [subst inp; cbv [is_div_mod]
                                     | eapply is_div_mod_result_equal; try apply IHinp]; push.
         { autorewrite with zsimplify; push. }
@@ -1537,7 +1544,7 @@ Module Rows.
                   nth_default 0 (fst (flatten' start_state inp)) i
                   = ((Positional.eval weight n (fst start_state) + eval n inp) mod weight (S i)) / (weight i).
       Proof.
-        induction inp; push.
+        induction inp using rev_ind; push.
         destruct (dec (inp = nil)).
         { subst inp; push. rewrite sum_rows_partitions with (n:=n) by eauto. push. }
         { erewrite IHinp; push.
@@ -10086,23 +10093,23 @@ Module Barrett256.
   (* Note: the aforementioned ADD/ADDC instructions currently *do* fail to bounds-check, although the pipeline doesn't give an error.
      This is why their results are not cast (because the carry has range [-1~>0]). *)
   (*
-barrett_red256 = fun var : type -> Type => λ x : var (type.type_primitive type.Z * type.type_primitive type.Z)%ctype,
+barrett_red256 =  fun var : type -> Type => λ x : var (type.type_primitive type.Z * type.type_primitive type.Z)%ctype,
                 expr_let x0 := SELM (x₂, 0,
-                            26959946667150639793205513449348445388433292963828203772348655992835) in
+                               26959946667150639793205513449348445388433292963828203772348655992835) in
                 expr_let x1 := RSHI (0, x₂, 255) in
                 expr_let x2 := RSHI (x₂, x₁, 255) in
                 expr_let x3 := 79228162514264337589248983038 *₂₅₆ (uint128)(x2 >> 128) in
                 expr_let x4 := 79228162514264337589248983038 *₂₅₆
-                            ((uint128)(x2) & 340282366920938463463374607431768211455) in
+                               ((uint128)(x2) & 340282366920938463463374607431768211455) in
                 expr_let x5 := 340282366841710300930663525764514709507 *₂₅₆ (uint128)(x2 >> 128) in
                 expr_let x6 := 340282366841710300930663525764514709507 *₂₅₆
-                            ((uint128)(x2) & 340282366920938463463374607431768211455) in
-                expr_let x7 := ADD_256 ((uint256)(((uint128)(x5) & 340282366920938463463374607431768211455) << 128),
-                            x6) in
-                expr_let x8 := ADDC_128 (x7₂, (uint128)(x4 >> 128), (uint128)(x5 >> 128)) in
-                expr_let x9 := ADD_256 ((uint256)(((uint128)(x4) & 340282366920938463463374607431768211455) << 128),
-                            x7₁) in
-                expr_let x10 := ADDC_256 (x9₂, x3, x8₁) in
+                               ((uint128)(x2) & 340282366920938463463374607431768211455) in
+                expr_let x7 := ADD_256 ((uint256)(((uint128)(x4) & 340282366920938463463374607431768211455) << 128),
+                               x6) in
+                expr_let x8 := ADDC_256 (x7₂, x3, (uint128)(x5 >> 128)) in
+                expr_let x9 := ADD_256 ((uint256)(((uint128)(x5) & 340282366920938463463374607431768211455) << 128),
+                               x7₁) in
+                expr_let x10 := ADDC_256 (x9₂, (uint128)(x4 >> 128), x8₁) in
                 expr_let x11 := ADD_256 (x2, x10₁) in
                 expr_let x12 := ADDC_128 (x11₂, 0, x1) in
                 expr_let x13 := ADD_256 (x0, x11₁) in
@@ -10114,17 +10121,17 @@ barrett_red256 = fun var : type -> Type => λ x : var (type.type_primitive type.
                                 ((uint128)(x15) & 340282366920938463463374607431768211455) in
                 expr_let x19 := 79228162514264337593543950335 *₂₅₆
                                 ((uint128)(x15) & 340282366920938463463374607431768211455) in
-                expr_let x20 := ADD_256 ((uint256)(((uint128)(x18) & 340282366920938463463374607431768211455) << 128),
+                expr_let x20 := ADD_256 ((uint256)(((uint128)(x17) & 340282366920938463463374607431768211455) << 128),
                                 x19) in
-                expr_let x21 := ADDC_256 (x20₂, (uint128)(x17 >> 128), (uint128)(x18 >> 128)) in
-                expr_let x22 := ADD_256 ((uint256)(((uint128)(x17) & 340282366920938463463374607431768211455) << 128),
+                expr_let x21 := ADDC_256 (x20₂, x16, (uint128)(x18 >> 128)) in
+                expr_let x22 := ADD_256 ((uint256)(((uint128)(x18) & 340282366920938463463374607431768211455) << 128),
                                 x20₁) in
-                expr_let x23 := ADDC_256 (x22₂, x16, x21₁) in
+                expr_let x23 := ADDC_256 (x22₂, (uint128)(x17 >> 128), x21₁) in
                 expr_let x24 := Z.add_get_carry_concrete
-                                115792089237316195423570985008687907853269984665640564039457584007913129639936 @@
+                                  115792089237316195423570985008687907853269984665640564039457584007913129639936 @@
                                 (Z.opp @@ (fst @@ x22), x₁) in
                 expr_let x25 := Z.add_with_get_carry_concrete
-                                115792089237316195423570985008687907853269984665640564039457584007913129639936 @@
+                                  115792089237316195423570985008687907853269984665640564039457584007913129639936 @@
                                 (x24₂, Z.opp @@ (fst @@ x23), x₂) in
                 expr_let x26 := SELL (x25₁, 0,
                                 115792089210356248762697446949407573530086143415290314195533631308867097853951) in
@@ -10152,12 +10159,12 @@ barrett_red256 = fun var : type -> Type => λ x : var (type.type_primitive type.
     mulhl@(y3, RegMuLow, $y1);
     mullh@(y4, RegMuLow, $y1);
     mulll@(y5, RegMuLow, $y1);
-    add@(y6, Straightline.expr.Cast uint256 (Straightline.expr.Shiftl 128 (Lower{$y4})), $y5);
-    addc@(y7, carry{$y6}, Straightline.expr.Cast uint128
-                            (Straightline.expr.Shiftr 128 ($y3)), 
-    Straightline.expr.Cast uint128 (Straightline.expr.Shiftr 128 ($y4)));
-    add@(y8, Straightline.expr.Cast uint256 (Straightline.expr.Shiftl 128 (Lower{$y3})), $y6);
-    addc@(y9, carry{$y8}, $y2, $y7);
+    add@(y6, Straightline.expr.Cast uint256 (Straightline.expr.Shiftl 128 (Lower{$y3})), $y5);
+    addc@(y7, carry{$y6}, $y2, Straightline.expr.Cast uint128
+                                (Straightline.expr.Shiftr 128 ($y4)));
+    add@(y8, Straightline.expr.Cast uint256 (Straightline.expr.Shiftl 128 (Lower{$y4})), $y6);
+    addc@(y9, carry{$y8}, Straightline.expr.Cast uint128
+                            (Straightline.expr.Shiftr 128 ($y3)), $y7);
     add@(y10, $y1, $y9);
     addc@(y11, carry{$y10}, RegZero, $y0);
     add@(y12, $y, $y10);
@@ -10168,14 +10175,14 @@ barrett_red256 = fun var : type -> Type => λ x : var (type.type_primitive type.
     mulhl@(y17, RegMod, $y14);
     mulll@(y18, RegMod, $y14);
     add@(y19, Straightline.expr.Cast uint256
-                (Straightline.expr.Shiftl 128 (Lower{$y17})), $y18);
-    addc@(y20, carry{$y19}, Straightline.expr.Cast uint128
-                            (Straightline.expr.Shiftr 128 ($y16)), 
-    Straightline.expr.Cast uint128 (Straightline.expr.Shiftr 128 ($y17)));
+                (Straightline.expr.Shiftl 128 (Lower{$y16})), $y18);
+    addc@(y20, carry{$y19}, $y15, Straightline.expr.Cast uint128
+                                    (Straightline.expr.Shiftr 128 ($y17)));
     add@(y21, Straightline.expr.Cast uint256
-                (Straightline.expr.Shiftl 128 (Lower{$y16})), $y19);
-    addc@(_, carry{$y21}, $y15, $y20);
-    Straightline.expr.Scalar (Straightline.expr.Primitive (-1)) 
+                (Straightline.expr.Shiftl 128 (Lower{$y17})), $y19);
+    addc@(_, carry{$y21}, Straightline.expr.Cast uint128
+                            (Straightline.expr.Shiftr 128 ($y16)), $y20);
+    Straightline.expr.Scalar (Straightline.expr.Primitive (-1))
   *)
 End Barrett256.
 
@@ -10324,35 +10331,35 @@ mulmod = fun var : type -> Type => λ x : var (type.list (type.type_primitive ty
          expr_let x6 := mul64 ((uint64)(x₁[[0]]), (uint64)(x₂[[2]])) in
          expr_let x7 := mul64 ((uint64)(x₁[[0]]), (uint64)(x₂[[1]])) in
          expr_let x8 := mul64 ((uint64)(x₁[[0]]), (uint64)(x₂[[0]])) in
-         expr_let x9 := add64 (x6₂, x8₁) in
-         expr_let x10 := adc64 (x9₂, x7₁, x8₂) in
-         expr_let x11 := adc64 (x10₂, x6₁, x7₂) in
-         expr_let x12 := add64 (x4₂, x9₁) in
-         expr_let x13 := adc64 (x12₂, x5₁, x10₁) in
-         expr_let x14 := adc64 (x13₂, x5₂, x11₁) in
-         expr_let x15 := add64 (x3₁, x12₁) in
-         expr_let x16 := adc64 (x15₂, x6₂, x13₁) in
-         expr_let x17 := adc64 (x16₂, x4₁, x14₁) in
-         expr_let x18 := add64 (x2₂, x15₁) in
-         expr_let x19 := adc64 (x18₂, x4₂, x16₁) in
-         expr_let x20 := adc64 (x19₂, x2₁, x17₁) in
-         expr_let x21 := add64 (x1₁, x18₁) in
-         expr_let x22 := adc64 (x21₂, x3₁, x19₁) in
-         expr_let x23 := adc64 (x22₂, x3₂, x20₁) in
-         expr_let x24 := add64 (x0₂, x21₁) in
-         expr_let x25 := adc64 (x24₂, x2₂, x22₁) in
-         expr_let x26 := adc64 (x25₂, x1₂, x23₁) in
-         expr_let x27 := add64 (x1₁, x25₁) in
-         expr_let x28 := adc64 (x27₂, x0₁, x26₁) in
-         expr_let x29 := add64 (x3₂, x27₁) in
-         expr_let x30 := adc64 (x29₂, x0₂, x28₁) in
-         expr_let x31 := add64 (x1₂, x29₁) in
-         expr_let x32 := adc64 (x31₂, 0, x30₁) in
-         expr_let x33 := add64 (x0₁, x31₁) in
-         expr_let x34 := adc64 (x33₂, 0, x32₁) in
-         expr_let x35 := add64 (x0₂, x33₁) in
-         expr_let x36 := adc64 (x35₂, 0, x34₁) in
-         x24₁ :: x35₁ :: x36₁ :: []
+         expr_let x9 := add64 (x0₂, x8₂) in
+         expr_let x10 := adc64 (x9₂, 0, x7₂) in
+         expr_let x11 := add64 (x0₁, x9₁) in
+         expr_let x12 := adc64 (x11₂, 0, x10₁) in
+         expr_let x13 := add64 (x1₂, x11₁) in
+         expr_let x14 := adc64 (x13₂, 0, x12₁) in
+         expr_let x15 := add64 (x3₂, x13₁) in
+         expr_let x16 := adc64 (x15₂, x0₂, x14₁) in
+         expr_let x17 := add64 (x1₁, x15₁) in
+         expr_let x18 := adc64 (x17₂, x0₁, x16₁) in
+         expr_let x19 := add64 (x0₂, x8₁) in
+         expr_let x20 := adc64 (x19₂, x2₂, x17₁) in
+         expr_let x21 := adc64 (x20₂, x1₂, x18₁) in
+         expr_let x22 := add64 (x1₁, x19₁) in
+         expr_let x23 := adc64 (x22₂, x3₁, x20₁) in
+         expr_let x24 := adc64 (x23₂, x3₂, x21₁) in
+         expr_let x25 := add64 (x2₂, x22₁) in
+         expr_let x26 := adc64 (x25₂, x4₂, x23₁) in
+         expr_let x27 := adc64 (x26₂, x2₁, x24₁) in
+         expr_let x28 := add64 (x3₁, x25₁) in
+         expr_let x29 := adc64 (x28₂, x6₂, x26₁) in
+         expr_let x30 := adc64 (x29₂, x4₁, x27₁) in
+         expr_let x31 := add64 (x4₂, x28₁) in
+         expr_let x32 := adc64 (x31₂, x5₁, x29₁) in
+         expr_let x33 := adc64 (x32₂, x5₂, x30₁) in
+         expr_let x34 := add64 (x6₂, x31₁) in
+         expr_let x35 := adc64 (x34₂, x7₁, x32₁) in
+         expr_let x36 := adc64 (x35₂, x6₁, x33₁) in
+         x34₁ :: x35₁ :: x36₁ :: []
      : Expr (type.uncurry (type.list (type.type_primitive type.Z) -> type.list (type.type_primitive type.Z) -> type.list (type.type_primitive type.Z)))
 *)
 
@@ -10419,135 +10426,135 @@ mulmod = fun var : type -> Type => λ x : var (type.list (type.type_primitive ty
          expr_let x33 := mul32 ((uint32)(x₁[[0]]), (uint32)(x₂[[2]])) in
          expr_let x34 := mul32 ((uint32)(x₁[[0]]), (uint32)(x₂[[1]])) in
          expr_let x35 := mul32 ((uint32)(x₁[[0]]), (uint32)(x₂[[0]])) in
-         expr_let x36 := add32 (x30₂, x35₁) in
-         expr_let x37 := adc32 (x36₂, x34₁, x35₂) in
-         expr_let x38 := adc32 (x37₂, x33₁, x34₂) in
-         expr_let x39 := adc32 (x38₂, x32₁, x33₂) in
-         expr_let x40 := adc32 (x39₂, x31₁, x32₂) in
-         expr_let x41 := adc32 (x40₂, x30₁, x31₂) in
-         expr_let x42 := add32 (x25₂, x36₁) in
-         expr_let x43 := adc32 (x42₂, x29₁, x37₁) in
-         expr_let x44 := adc32 (x43₂, x29₂, x38₁) in
-         expr_let x45 := adc32 (x44₂, x28₂, x39₁) in
-         expr_let x46 := adc32 (x45₂, x27₂, x40₁) in
-         expr_let x47 := adc32 (x46₂, x26₂, x41₁) in
-         expr_let x48 := add32 (x24₁, x42₁) in
-         expr_let x49 := adc32 (x48₂, x24₂, x43₁) in
-         expr_let x50 := adc32 (x49₂, x28₁, x44₁) in
-         expr_let x51 := adc32 (x50₂, x27₁, x45₁) in
-         expr_let x52 := adc32 (x51₂, x26₁, x46₁) in
-         expr_let x53 := adc32 (x52₂, x25₁, x47₁) in
-         expr_let x54 := add32 (x20₂, x48₁) in
-         expr_let x55 := adc32 (x54₂, x19₂, x49₁) in
-         expr_let x56 := adc32 (x55₂, x23₁, x50₁) in
-         expr_let x57 := adc32 (x56₂, x23₂, x51₁) in
-         expr_let x58 := adc32 (x57₂, x22₂, x52₁) in
-         expr_let x59 := adc32 (x58₂, x21₂, x53₁) in
-         expr_let x60 := add32 (x19₁, x54₁) in
-         expr_let x61 := adc32 (x60₂, x18₁, x55₁) in
-         expr_let x62 := adc32 (x61₂, x30₂, x56₁) in
-         expr_let x63 := adc32 (x62₂, x22₁, x57₁) in
-         expr_let x64 := adc32 (x63₂, x21₁, x58₁) in
-         expr_let x65 := adc32 (x64₂, x20₁, x59₁) in
-         expr_let x66 := add32 (x15₂, x60₁) in
-         expr_let x67 := adc32 (x66₂, x14₂, x61₁) in
-         expr_let x68 := adc32 (x67₂, x25₂, x62₁) in
-         expr_let x69 := adc32 (x68₂, x17₁, x63₁) in
-         expr_let x70 := adc32 (x69₂, x17₂, x64₁) in
-         expr_let x71 := adc32 (x70₂, x16₂, x65₁) in
-         expr_let x72 := add32 (x14₁, x66₁) in
-         expr_let x73 := adc32 (x72₂, x13₁, x67₁) in
-         expr_let x74 := adc32 (x73₂, x24₁, x68₁) in
-         expr_let x75 := adc32 (x74₂, x24₂, x69₁) in
-         expr_let x76 := adc32 (x75₂, x16₁, x70₁) in
-         expr_let x77 := adc32 (x76₂, x15₁, x71₁) in
-         expr_let x78 := add32 (x10₂, x72₁) in
-         expr_let x79 := adc32 (x78₂, x9₂, x73₁) in
-         expr_let x80 := adc32 (x79₂, x20₂, x74₁) in
-         expr_let x81 := adc32 (x80₂, x19₂, x75₁) in
-         expr_let x82 := adc32 (x81₂, x11₁, x76₁) in
-         expr_let x83 := adc32 (x82₂, x11₂, x77₁) in
-         expr_let x84 := add32 (x9₁, x78₁) in
-         expr_let x85 := adc32 (x84₂, x8₁, x79₁) in
-         expr_let x86 := adc32 (x85₂, x19₁, x80₁) in
-         expr_let x87 := adc32 (x86₂, x18₁, x81₁) in
-         expr_let x88 := adc32 (x87₂, x18₂, x82₁) in
-         expr_let x89 := adc32 (x88₂, x10₁, x83₁) in
-         expr_let x90 := add32 (x5₂, x84₁) in
-         expr_let x91 := adc32 (x90₂, x4₂, x85₁) in
-         expr_let x92 := adc32 (x91₂, x15₂, x86₁) in
-         expr_let x93 := adc32 (x92₂, x14₂, x87₁) in
-         expr_let x94 := adc32 (x93₂, x13₂, x88₁) in
-         expr_let x95 := adc32 (x94₂, x5₁, x89₁) in
-         expr_let x96 := add32 (x4₁, x90₁) in
-         expr_let x97 := adc32 (x96₂, x3₁, x91₁) in
-         expr_let x98 := adc32 (x97₂, x14₁, x92₁) in
-         expr_let x99 := adc32 (x98₂, x13₁, x93₁) in
-         expr_let x100 := adc32 (x99₂, x12₁, x94₁) in
-         expr_let x101 := adc32 (x100₂, x12₂, x95₁) in
-         expr_let x102 := add32 (x6₂, x96₁) in
-         expr_let x103 := adc32 (x102₂, x0₂, x97₁) in
-         expr_let x104 := adc32 (x103₂, x10₂, x98₁) in
-         expr_let x105 := adc32 (x104₂, x9₂, x99₁) in
-         expr_let x106 := adc32 (x105₂, x8₂, x100₁) in
-         expr_let x107 := adc32 (x106₂, x7₂, x101₁) in
-         expr_let x108 := add32 (x1₂, x102₁) in
-         expr_let x109 := adc32 (x108₂, 0, x103₁) in
-         expr_let x110 := adc32 (x109₂, x9₁, x104₁) in
+         expr_let x36 := add32 (x0₁, x34₂) in
+         expr_let x37 := adc32 (x36₂, 0, x33₂) in
+         expr_let x38 := adc32 (x37₂, 0, x32₂) in
+         expr_let x39 := adc32 (x38₂, 0, x31₂) in
+         expr_let x40 := add32 (x1₂, x36₁) in
+         expr_let x41 := adc32 (x40₂, 0, x37₁) in
+         expr_let x42 := adc32 (x41₂, 0, x38₁) in
+         expr_let x43 := adc32 (x42₂, 0, x39₁) in
+         expr_let x44 := add32 (x6₂, x40₁) in
+         expr_let x45 := adc32 (x44₂, 0, x41₁) in
+         expr_let x46 := adc32 (x45₂, 0, x42₁) in
+         expr_let x47 := adc32 (x46₂, 0, x43₁) in
+         expr_let x48 := add32 (x2₁, x44₁) in
+         expr_let x49 := adc32 (x48₂, 0, x45₁) in
+         expr_let x50 := adc32 (x49₂, 0, x46₁) in
+         expr_let x51 := adc32 (x50₂, 0, x47₁) in
+         expr_let x52 := add32 (x3₂, x48₁) in
+         expr_let x53 := adc32 (x52₂, x0₂, x49₁) in
+         expr_let x54 := adc32 (x53₂, 0, x50₁) in
+         expr_let x55 := adc32 (x54₂, 0, x51₁) in
+         expr_let x56 := add32 (x7₁, x52₁) in
+         expr_let x57 := adc32 (x56₂, x1₁, x53₁) in
+         expr_let x58 := adc32 (x57₂, 0, x54₁) in
+         expr_let x59 := adc32 (x58₂, 0, x55₁) in
+         expr_let x60 := add32 (x8₂, x56₁) in
+         expr_let x61 := adc32 (x60₂, x2₂, x57₁) in
+         expr_let x62 := adc32 (x61₂, 0, x58₁) in
+         expr_let x63 := adc32 (x62₂, 0, x59₁) in
+         expr_let x64 := add32 (x12₁, x60₁) in
+         expr_let x65 := adc32 (x64₂, x6₁, x61₁) in
+         expr_let x66 := adc32 (x65₂, x0₁, x62₁) in
+         expr_let x67 := adc32 (x66₂, 0, x63₁) in
+         expr_let x68 := add32 (x13₂, x64₁) in
+         expr_let x69 := adc32 (x68₂, x7₂, x65₁) in
+         expr_let x70 := adc32 (x69₂, x1₂, x66₁) in
+         expr_let x71 := adc32 (x70₂, 0, x67₁) in
+         expr_let x72 := add32 (x18₂, x68₁) in
+         expr_let x73 := adc32 (x72₂, x12₂, x69₁) in
+         expr_let x74 := adc32 (x73₂, x6₂, x70₁) in
+         expr_let x75 := adc32 (x74₂, x0₂, x71₁) in
+         expr_let x76 := add32 (x4₁, x72₁) in
+         expr_let x77 := adc32 (x76₂, x3₁, x73₁) in
+         expr_let x78 := adc32 (x77₂, x2₁, x74₁) in
+         expr_let x79 := adc32 (x78₂, x1₁, x75₁) in
+         expr_let x80 := add32 (x0₁, x35₁) in
+         expr_let x81 := adc32 (x80₂, 0, x35₂) in
+         expr_let x82 := adc32 (x81₂, x5₂, x76₁) in
+         expr_let x83 := adc32 (x82₂, x4₂, x77₁) in
+         expr_let x84 := adc32 (x83₂, x3₂, x78₁) in
+         expr_let x85 := adc32 (x84₂, x2₂, x79₁) in
+         expr_let x86 := add32 (x1₂, x80₁) in
+         expr_let x87 := adc32 (x86₂, 0, x81₁) in
+         expr_let x88 := adc32 (x87₂, x9₁, x82₁) in
+         expr_let x89 := adc32 (x88₂, x8₁, x83₁) in
+         expr_let x90 := adc32 (x89₂, x7₁, x84₁) in
+         expr_let x91 := adc32 (x90₂, x6₁, x85₁) in
+         expr_let x92 := add32 (x6₂, x86₁) in
+         expr_let x93 := adc32 (x92₂, x0₂, x87₁) in
+         expr_let x94 := adc32 (x93₂, x10₂, x88₁) in
+         expr_let x95 := adc32 (x94₂, x9₂, x89₁) in
+         expr_let x96 := adc32 (x95₂, x8₂, x90₁) in
+         expr_let x97 := adc32 (x96₂, x7₂, x91₁) in
+         expr_let x98 := add32 (x4₁, x92₁) in
+         expr_let x99 := adc32 (x98₂, x3₁, x93₁) in
+         expr_let x100 := adc32 (x99₂, x14₁, x94₁) in
+         expr_let x101 := adc32 (x100₂, x13₁, x95₁) in
+         expr_let x102 := adc32 (x101₂, x12₁, x96₁) in
+         expr_let x103 := adc32 (x102₂, x12₂, x97₁) in
+         expr_let x104 := add32 (x5₂, x98₁) in
+         expr_let x105 := adc32 (x104₂, x4₂, x99₁) in
+         expr_let x106 := adc32 (x105₂, x15₂, x100₁) in
+         expr_let x107 := adc32 (x106₂, x14₂, x101₁) in
+         expr_let x108 := adc32 (x107₂, x13₂, x102₁) in
+         expr_let x109 := adc32 (x108₂, x5₁, x103₁) in
+         expr_let x110 := add32 (x9₁, x104₁) in
          expr_let x111 := adc32 (x110₂, x8₁, x105₁) in
-         expr_let x112 := adc32 (x111₂, x7₁, x106₁) in
-         expr_let x113 := adc32 (x112₂, x6₁, x107₁) in
-         expr_let x114 := add32 (x0₁, x108₁) in
-         expr_let x115 := adc32 (x114₂, 0, x109₁) in
-         expr_let x116 := adc32 (x115₂, x5₂, x110₁) in
-         expr_let x117 := adc32 (x116₂, x4₂, x111₁) in
-         expr_let x118 := adc32 (x117₂, x3₂, x112₁) in
-         expr_let x119 := adc32 (x118₂, x2₂, x113₁) in
-         expr_let x120 := add32 (x4₁, x116₁) in
-         expr_let x121 := adc32 (x120₂, x3₁, x117₁) in
-         expr_let x122 := adc32 (x121₂, x2₁, x118₁) in
-         expr_let x123 := adc32 (x122₂, x1₁, x119₁) in
-         expr_let x124 := add32 (x18₂, x120₁) in
-         expr_let x125 := adc32 (x124₂, x12₂, x121₁) in
-         expr_let x126 := adc32 (x125₂, x6₂, x122₁) in
-         expr_let x127 := adc32 (x126₂, x0₂, x123₁) in
-         expr_let x128 := add32 (x13₂, x124₁) in
-         expr_let x129 := adc32 (x128₂, x7₂, x125₁) in
-         expr_let x130 := adc32 (x129₂, x1₂, x126₁) in
-         expr_let x131 := adc32 (x130₂, 0, x127₁) in
-         expr_let x132 := add32 (x12₁, x128₁) in
-         expr_let x133 := adc32 (x132₂, x6₁, x129₁) in
-         expr_let x134 := adc32 (x133₂, x0₁, x130₁) in
-         expr_let x135 := adc32 (x134₂, 0, x131₁) in
-         expr_let x136 := add32 (x8₂, x132₁) in
-         expr_let x137 := adc32 (x136₂, x2₂, x133₁) in
-         expr_let x138 := adc32 (x137₂, 0, x134₁) in
-         expr_let x139 := adc32 (x138₂, 0, x135₁) in
-         expr_let x140 := add32 (x7₁, x136₁) in
-         expr_let x141 := adc32 (x140₂, x1₁, x137₁) in
-         expr_let x142 := adc32 (x141₂, 0, x138₁) in
-         expr_let x143 := adc32 (x142₂, 0, x139₁) in
-         expr_let x144 := add32 (x3₂, x140₁) in
-         expr_let x145 := adc32 (x144₂, x0₂, x141₁) in
-         expr_let x146 := adc32 (x145₂, 0, x142₁) in
-         expr_let x147 := adc32 (x146₂, 0, x143₁) in
-         expr_let x148 := add32 (x2₁, x144₁) in
-         expr_let x149 := adc32 (x148₂, 0, x145₁) in
-         expr_let x150 := adc32 (x149₂, 0, x146₁) in
-         expr_let x151 := adc32 (x150₂, 0, x147₁) in
-         expr_let x152 := add32 (x6₂, x148₁) in
-         expr_let x153 := adc32 (x152₂, 0, x149₁) in
-         expr_let x154 := adc32 (x153₂, 0, x150₁) in
-         expr_let x155 := adc32 (x154₂, 0, x151₁) in
-         expr_let x156 := add32 (x1₂, x152₁) in
-         expr_let x157 := adc32 (x156₂, 0, x153₁) in
-         expr_let x158 := adc32 (x157₂, 0, x154₁) in
-         expr_let x159 := adc32 (x158₂, 0, x155₁) in
-         expr_let x160 := add32 (x0₁, x156₁) in
-         expr_let x161 := adc32 (x160₂, 0, x157₁) in
-         expr_let x162 := adc32 (x161₂, 0, x158₁) in
-         expr_let x163 := adc32 (x162₂, 0, x159₁) in
-         x114₁ :: x115₁ :: x160₁ :: x161₁ :: x162₁ :: x163₁ :: []
+         expr_let x112 := adc32 (x111₂, x19₁, x106₁) in
+         expr_let x113 := adc32 (x112₂, x18₁, x107₁) in
+         expr_let x114 := adc32 (x113₂, x18₂, x108₁) in
+         expr_let x115 := adc32 (x114₂, x10₁, x109₁) in
+         expr_let x116 := add32 (x10₂, x110₁) in
+         expr_let x117 := adc32 (x116₂, x9₂, x111₁) in
+         expr_let x118 := adc32 (x117₂, x20₂, x112₁) in
+         expr_let x119 := adc32 (x118₂, x19₂, x113₁) in
+         expr_let x120 := adc32 (x119₂, x11₁, x114₁) in
+         expr_let x121 := adc32 (x120₂, x11₂, x115₁) in
+         expr_let x122 := add32 (x14₁, x116₁) in
+         expr_let x123 := adc32 (x122₂, x13₁, x117₁) in
+         expr_let x124 := adc32 (x123₂, x24₁, x118₁) in
+         expr_let x125 := adc32 (x124₂, x24₂, x119₁) in
+         expr_let x126 := adc32 (x125₂, x16₁, x120₁) in
+         expr_let x127 := adc32 (x126₂, x15₁, x121₁) in
+         expr_let x128 := add32 (x15₂, x122₁) in
+         expr_let x129 := adc32 (x128₂, x14₂, x123₁) in
+         expr_let x130 := adc32 (x129₂, x25₂, x124₁) in
+         expr_let x131 := adc32 (x130₂, x17₁, x125₁) in
+         expr_let x132 := adc32 (x131₂, x17₂, x126₁) in
+         expr_let x133 := adc32 (x132₂, x16₂, x127₁) in
+         expr_let x134 := add32 (x19₁, x128₁) in
+         expr_let x135 := adc32 (x134₂, x18₁, x129₁) in
+         expr_let x136 := adc32 (x135₂, x30₂, x130₁) in
+         expr_let x137 := adc32 (x136₂, x22₁, x131₁) in
+         expr_let x138 := adc32 (x137₂, x21₁, x132₁) in
+         expr_let x139 := adc32 (x138₂, x20₁, x133₁) in
+         expr_let x140 := add32 (x20₂, x134₁) in
+         expr_let x141 := adc32 (x140₂, x19₂, x135₁) in
+         expr_let x142 := adc32 (x141₂, x23₁, x136₁) in
+         expr_let x143 := adc32 (x142₂, x23₂, x137₁) in
+         expr_let x144 := adc32 (x143₂, x22₂, x138₁) in
+         expr_let x145 := adc32 (x144₂, x21₂, x139₁) in
+         expr_let x146 := add32 (x24₁, x140₁) in
+         expr_let x147 := adc32 (x146₂, x24₂, x141₁) in
+         expr_let x148 := adc32 (x147₂, x28₁, x142₁) in
+         expr_let x149 := adc32 (x148₂, x27₁, x143₁) in
+         expr_let x150 := adc32 (x149₂, x26₁, x144₁) in
+         expr_let x151 := adc32 (x150₂, x25₁, x145₁) in
+         expr_let x152 := add32 (x25₂, x146₁) in
+         expr_let x153 := adc32 (x152₂, x29₁, x147₁) in
+         expr_let x154 := adc32 (x153₂, x29₂, x148₁) in
+         expr_let x155 := adc32 (x154₂, x28₂, x149₁) in
+         expr_let x156 := adc32 (x155₂, x27₂, x150₁) in
+         expr_let x157 := adc32 (x156₂, x26₂, x151₁) in
+         expr_let x158 := add32 (x30₂, x152₁) in
+         expr_let x159 := adc32 (x158₂, x34₁, x153₁) in
+         expr_let x160 := adc32 (x159₂, x33₁, x154₁) in
+         expr_let x161 := adc32 (x160₂, x32₁, x155₁) in
+         expr_let x162 := adc32 (x161₂, x31₁, x156₁) in
+         expr_let x163 := adc32 (x162₂, x30₁, x157₁) in
+         x158₁ :: x159₁ :: x160₁ :: x161₁ :: x162₁ :: x163₁ :: []
      : Expr (type.uncurry (type.list (type.type_primitive type.Z) -> type.list (type.type_primitive type.Z) -> type.list (type.type_primitive type.Z)))
 *)
 
@@ -10847,16 +10854,16 @@ montred256 = fun var : type -> Type => (λ x : var (type.type_primitive type.Z *
     expr_let x0 := 79228162514264337593543950337 *₂₅₆ (uint128)(x₁ >> 128) in
     expr_let x1 := 340282366841710300986003757985643364352 *₂₅₆ ((uint128)(x₁) & 340282366920938463463374607431768211455) in
     expr_let x2 := 79228162514264337593543950337 *₂₅₆ ((uint128)(x₁) & 340282366920938463463374607431768211455) in
-    expr_let x3 := ADD_256 ((uint256)(((uint128)(x1) & 340282366920938463463374607431768211455) << 128), x2) in
-    expr_let x4 := ADD_256 ((uint256)(((uint128)(x0) & 340282366920938463463374607431768211455) << 128), x3₁) in
+    expr_let x3 := ADD_256 ((uint256)(((uint128)(x0) & 340282366920938463463374607431768211455) << 128), x2) in
+    expr_let x4 := ADD_256 ((uint256)(((uint128)(x1) & 340282366920938463463374607431768211455) << 128), x3₁) in
     expr_let x5 := 340282366841710300967557013911933812736 *₂₅₆ (uint128)(x4₁ >> 128) in
     expr_let x6 := 79228162514264337593543950335 *₂₅₆ (uint128)(x4₁ >> 128) in
     expr_let x7 := 340282366841710300967557013911933812736 *₂₅₆ ((uint128)(x4₁) & 340282366920938463463374607431768211455) in
     expr_let x8 := 79228162514264337593543950335 *₂₅₆ ((uint128)(x4₁) & 340282366920938463463374607431768211455) in
-    expr_let x9 := ADD_256 ((uint256)(((uint128)(x7) & 340282366920938463463374607431768211455) << 128), x8) in
-    expr_let x10 := ADDC_256 (x9₂, (uint128)(x6 >> 128), (uint128)(x7 >> 128)) in
-    expr_let x11 := ADD_256 ((uint256)(((uint128)(x6) & 340282366920938463463374607431768211455) << 128), x9₁) in
-    expr_let x12 := ADDC_256 (x11₂, x5, x10₁) in
+    expr_let x9 := ADD_256 ((uint256)(((uint128)(x6) & 340282366920938463463374607431768211455) << 128), x8) in
+    expr_let x10 := ADDC_256 (x9₂, x5, (uint128)(x7 >> 128)) in
+    expr_let x11 := ADD_256 ((uint256)(((uint128)(x7) & 340282366920938463463374607431768211455) << 128), x9₁) in
+    expr_let x12 := ADDC_256 (x11₂, (uint128)(x6 >> 128), x10₁) in
     expr_let x13 := ADD_256 (x11₁, x₁) in
     expr_let x14 := ADDC_256 (x13₂, x12₁, x₂) in
     expr_let x15 := SELC (x14₂, 0, 115792089210356248762697446949407573530086143415290314195533631308867097853951) in
@@ -10873,22 +10880,22 @@ montred256 = fun var : type -> Type => (λ x : var (type.type_primitive type.Z *
   (*
    mulhl@(y0, RegPInv, $x₁);
    mulll@(y1, RegPInv, $x₁);
-   add@(y2, Straightline.expr.Cast uint256 (Straightline.expr.Shiftl 128 (Lower{$y0})), $y1);
-   add@(y3, Straightline.expr.Cast uint256 (Straightline.expr.Shiftl 128 (Lower{$y})), $y2);
+   add@(y2, Straightline.expr.Cast uint256 (Straightline.expr.Shiftl 128 (Lower{$y})), $y1);
+   add@(y3, Straightline.expr.Cast uint256 (Straightline.expr.Shiftl 128 (Lower{$y0})), $y2);
    mulhh@(y4, RegMod, $y3);
    mullh@(y5, RegMod, $y3);
    mulhl@(y6, RegMod, $y3);
    mulll@(y7, RegMod, $y3);
-   add@(y8, Straightline.expr.Cast uint256 (Straightline.expr.Shiftl 128 (Lower{$y6})), $y7);
-   addc@(y9, carry{$y8}, Straightline.expr.Cast uint128 (Straightline.expr.Shiftr 128 ($y5)), Straightline.expr.Cast uint128 (Straightline.expr.Shiftr 128 ($y6)));
-   add@(y10, Straightline.expr.Cast uint256 (Straightline.expr.Shiftl 128 (Lower{$y5})), $y8);
-   addc@(y11, carry{$y10}, $y4, $y9);
+   add@(y8, Straightline.expr.Cast uint256 (Straightline.expr.Shiftl 128 (Lower{$y5})), $y7);
+   addc@(y9, carry{$y8}, $y4, Straightline.expr.Cast uint128 (Straightline.expr.Shiftr 128 ($y6)));
+   add@(y10, Straightline.expr.Cast uint256 (Straightline.expr.Shiftl 128 (Lower{$y6})), $y8);
+   addc@(y11, carry{$y10}, Straightline.expr.Cast uint128 (Straightline.expr.Shiftr 128 ($y5)), $y9);
    add@(y12, $y10, $x₁);
    addc@(y13, carry{$y12}, $y11, $x₂);
    selc@(y14, carry{$y13}, RegZero, RegMod);
    sub@(y15, $y13, $y14);
    addm@(y16, $y15, RegZero, RegMod);
-   Ret $y16
+   Ret $y16                  
   *)
 End Montgomery256.
 
@@ -11035,10 +11042,10 @@ c.Mul128x128($x3, c.Upper(RegMuLow), c.Upper($x2));
 c.Mul128x128($x4, c.Upper(RegMuLow), c.Lower($x2));
 c.Mul128x128($x5, c.Lower(RegMuLow), c.Upper($x2));
 c.Mul128x128($x6, c.Lower(RegMuLow), c.Lower($x2));
-c.Add256($x7, (c.Lower($x5) << 128), $x6);
-c.Addc128($x8, $x7_hi, c.Upper($x4), c.Upper($x5));
-c.Add256($x9, (c.Lower($x4) << 128), $x7_lo);
-c.Addc256($x10, $x9_hi, $x3, $x8_lo);
+c.Add256($x7, (c.Lower($x4) << 128), $x6);
+c.Addc256($x8, $x7_hi, $x3, c.Upper($x5));
+c.Add256($x9, (c.Lower($x5) << 128), $x7_lo);
+c.Addc256($x10, $x9_hi, c.Upper($x4), $x8_lo);
 c.Add256($x11, $x2, $x10_lo);
 c.Addc128($x12, $x11_hi, RegZero, $x1);
 c.Add256($x13, $x0, $x11_lo);
@@ -11048,10 +11055,10 @@ c.Mul128x128($x16, c.Upper(RegMod), c.Upper($x15));
 c.Mul128x128($x17, c.Lower(RegMod), c.Upper($x15));
 c.Mul128x128($x18, c.Upper(RegMod), c.Lower($x15));
 c.Mul128x128($x19, c.Lower(RegMod), c.Lower($x15));
-c.Add256($x20, (c.Lower($x18) << 128), $x19);
-c.Addc256($x21, $x20_hi, c.Upper($x17), c.Upper($x18));
-c.Add256($x22, (c.Lower($x17) << 128), $x20_lo);
-c.Addc256($x23, $x22_hi, $x16, $x21_lo);
+c.Add256($x20, (c.Lower($x17) << 128), $x19);
+c.Addc256($x21, $x20_hi, $x16, c.Upper($x18));
+c.Add256($x22, (c.Lower($x18) << 128), $x20_lo);
+c.Addc256($x23, $x22_hi, c.Upper($x17), $x21_lo);
 expr_let x24 := Z.add_get_carry_concrete
                   115792089237316195423570985008687907853269984665640564039457584007913129639936 @@
                 (Z.opp @@ $x22_lo, $x_lo) in
@@ -11068,19 +11075,19 @@ Print Montgomery256.montred256.
 c.Mul128x128($x0, c.Lower(RegPinv), c.Upper($x_lo));
 c.Mul128x128($x1, c.Upper(RegPinv), c.Lower($x_lo));
 c.Mul128x128($x2, c.Lower(RegPinv), c.Lower($x_lo));
-c.Add256($x3, (c.Lower($x1) << 128), $x2);
-c.Add256($x4, (c.Lower($x0) << 128), $x3_lo);
+c.Add256($x3, (c.Lower($x0) << 128), $x2);
+c.Add256($x4, (c.Lower($x1) << 128), $x3_lo);
 c.Mul128x128($x5, c.Upper(RegMod), c.Upper($x4_lo));
 c.Mul128x128($x6, c.Lower(RegMod), c.Upper($x4_lo));
 c.Mul128x128($x7, c.Upper(RegMod), c.Lower($x4_lo));
 c.Mul128x128($x8, c.Lower(RegMod), c.Lower($x4_lo));
-c.Add256($x9, (c.Lower($x7) << 128), $x8);
-c.Addc256($x10, $x9_hi, c.Upper($x6), c.Upper($x7));
-c.Add256($x11, (c.Lower($x6) << 128), $x9_lo);
-c.Addc256($x12, $x11_hi, $x5, $x10_lo);
+c.Add256($x9, (c.Lower($x6) << 128), $x8);
+c.Addc256($x10, $x9_hi, $x5, c.Upper($x7));
+c.Add256($x11, (c.Lower($x7) << 128), $x9_lo);
+c.Addc256($x12, $x11_hi, c.Upper($x6), $x10_lo);
 c.Add256($x13, $x11_lo, $x_lo);
 c.Addc256($x14, $x13_hi, $x12_lo, $x_hi);
 c.Selc($x15, $x14_hi, RegZero, RegMod);
 c.Sub($x16, $x14_lo, $x15);
 c.AddM($ret, $x16_lo, RegZero, RegMod);
- *)
+ *)
\ No newline at end of file
-- 
cgit v1.2.3