From c17c32051bc4d481e69b6f9e9b0275a9ccc46db7 Mon Sep 17 00:00:00 2001
From: Jade Philipoom <jadep@google.com>
Date: Thu, 31 May 2018 19:10:19 +0200
Subject: prove prefancy->fancy for barrett special case

---
 src/Experiments/SimplyTypedArithmetic.v | 317 ++++++++++++++++++++++++++------
 1 file changed, 256 insertions(+), 61 deletions(-)

(limited to 'src/Experiments/SimplyTypedArithmetic.v')

diff --git a/src/Experiments/SimplyTypedArithmetic.v b/src/Experiments/SimplyTypedArithmetic.v
index d8c16258d..441c111dd 100644
--- a/src/Experiments/SimplyTypedArithmetic.v
+++ b/src/Experiments/SimplyTypedArithmetic.v
@@ -8560,7 +8560,7 @@ Module PreFancy.
     Qed.
 
     Let half_bits := log2wordmax / 2.
- 
+
     Lemma half_bits_nonneg : 0 <= half_bits.
     Proof. subst half_bits; Z.zero_bounds. Qed.
 
@@ -9686,7 +9686,7 @@ Module PreFancy.
 
       Definition range_ok {t} : range_type t -> Prop :=
         match t with
-        | type.type_primitive type.Z => fun r => in_word_range r 
+        | type.type_primitive type.Z => fun r => in_word_range r
         | type.prod type.Z type.Z => fun r => in_word_range (fst r) /\ snd r = flag_range
         | _ => fun _ => False
         end.
@@ -9908,7 +9908,7 @@ Module PreFancy.
     Notation "'ret' $ x" := (Scalar (Var tZ x)) (at level 10, format "'ret'  $ x").
     Notation "( x , y )" := (Pair x y) (at level 10, left associativity).
   End Notations.
- 
+
   Module Tactics.
     Ltac ok_expr_step' :=
       match goal with
@@ -10112,12 +10112,17 @@ Module Fancy.
                    (upper128 r0) * (upper128 r1))
       |}.
 
+    (* Note : Unlike the other operations, the output of RSHI is
+    truncated in the specification. This is not strictly necessary,
+    since the interpretation function truncates the output
+    anyway. However, it is useful to make the definition line up
+    exactly with Z.rshi. *)
     Definition RSHI (imm : int) : instruction :=
       {|
         num_source_regs := 2;
         writes_conditions := [M; L; Z];
         spec := (fun '(r0, r1) cc =>
-                   (r0 + (r1 << 256)) >> imm)
+                   (((2^256 * r0) + r1) >> imm) mod (2^256))
       |}.
 
     Definition SELC : instruction :=
@@ -11472,10 +11477,197 @@ Module Barrett256.
 
   Admitted.
 
+  Import Fancy_PreFancy_Equiv.
+
+  Definition interp_equivZZ_256 {s} :=
+    @interp_equivZZ s 256 ltac:(cbv; congruence) 115792089237316195423570985008687907853269984665640564039457584007913129639935 ltac:(reflexivity).
+  Definition interp_equivZ_256 {s} :=
+    @interp_equivZ s 256 115792089237316195423570985008687907853269984665640564039457584007913129639935 ltac:(reflexivity).
+
+  Local Ltac simplify_op_equiv start_ctx :=
+    cbn - [Fancy.spec PreFancy.interp_ident Fancy.cc_spec Z.shiftl];
+    repeat match goal with H : start_ctx _ = _ |- _ => rewrite H end;
+    cbv - [
+      Z.rshi Z.cc_m Fancy.CC.cc_m
+        Z.add_with_get_carry_full Z.add_get_carry_full
+        Z.sub_get_borrow_full Z.sub_with_get_borrow_full
+        Z.le Z.lt Z.ltb Z.leb Z.geb Z.eqb Z.land Z.shiftr Z.shiftl
+        Z.add Z.mul Z.div Z.sub Z.modulo Z.testbit Z.pow Z.ones
+        fst snd]; cbn [fst snd];
+    try (replace (2 ^ (256 / 2) - 1) with (Z.ones 128) by reflexivity; rewrite !Z.land_ones by omega);
+    autorewrite with to_div_mod; rewrite ?Z.mod_mod, <-?Z.testbit_spec' by omega;
+    let r := (eval compute in (2 ^ 256)) in
+    replace (2^256) with r in * by reflexivity;
+    repeat match goal with
+           | H : 0 <= ?x < ?m |- context [?x mod ?m] => rewrite (Z.mod_small x m) by apply H
+           | |- context [?x <? 0] => rewrite (proj2 (Z.ltb_ge x 0)) by (break_match; Z.zero_bounds)
+           | _ => rewrite Z.mod_small with (b:=2) by (break_match; omega)
+           | |- context [ (if Z.testbit ?a ?n then 1 else 0) + ?b + ?c] =>
+             replace ((if Z.testbit a n then 1 else 0) + b + c) with (b + c + (if Z.testbit a n then 1 else 0)) by ring
+           end.
+
+  Local Ltac solve_nonneg ctx :=
+    match goal with x := (Fancy.spec _ _ _) |- _ => subst x end;
+    simplify_op_equiv ctx; Z.zero_bounds.
+
+  Local Ltac generalize_result :=
+    let v := fresh "v" in intro v; generalize v; clear v; intro v.
+
+  Local Ltac generalize_result_nonneg ctx :=
+    let v := fresh "v" in
+    let v_nonneg := fresh "v_nonneg" in
+    intro v; assert (0 <= v) as v_nonneg; [solve_nonneg ctx |generalize v v_nonneg; clear v v_nonneg; intros v v_nonneg].
+
+  Local Ltac step ctx :=
+    match goal with
+    | |- Fancy.interp _ _ _ (Fancy.Instr (Fancy.ADD _) _ _ (Fancy.Instr (Fancy.ADDC _) _ _ _)) _ _ = _ =>
+      apply interp_equivZZ_256; [ simplify_op_equiv ctx | simplify_op_equiv ctx | generalize_result_nonneg ctx]
+    | _ => apply interp_equivZ_256; [simplify_op_equiv ctx | generalize_result]
+    | _ => apply interp_equivZZ_256; [ simplify_op_equiv ctx | simplify_op_equiv ctx | generalize_result]
+    end.
+
+  (* TODO: move this lemma to ZUtil *)
+  Lemma testbit_neg_eq_if x n :
+    0 <= n ->
+    - (2 ^ n) <= x  < 2 ^ n ->
+    Z.b2z (if x <? 0 then true else Z.testbit x n) = - (x / 2 ^ n) mod 2.
+  Proof.
+    intros. break_match; Z.ltb_to_lt.
+    { autorewrite with zsimplify. reflexivity. }
+    { autorewrite with zsimplify.
+      rewrite Z.bits_above_pow2 by omega.
+      reflexivity. }
+  Qed.
+
+  Lemma prod_barrett_red256_correct :
+    forall (cc_start_state : Fancy.CC.state) (* starting carry flags *)
+           (start_context : register -> Z)   (* starting register values *)
+           (x xHigh RegMuLow scratchp1 scratchp2 scratchp3 scratchp4 scratchp5 extra_reg : register), (* registers to use in computation *)
+      NoDup [x; xHigh; RegMuLow; scratchp1; scratchp2; scratchp3; scratchp4; scratchp5; extra_reg; RegMod; RegZero] -> (* registers are unique *)
+             0 <= start_context x < 2^machine_wordsize ->
+             0 <= start_context xHigh < M ->
+             start_context RegMuLow = muLow ->
+             start_context RegMod = M ->
+             start_context RegZero = 0 ->
+             cc_start_state.(Fancy.CC.cc_m) = (Z.cc_m (2^256) (start_context xHigh) =? 1) ->
+             let X := start_context x + 2^machine_wordsize * start_context xHigh in
+             ProdEquiv.interp256 (Prod.MulMod x xHigh RegMuLow scratchp1 scratchp2 scratchp3 scratchp4 scratchp5) cc_start_state start_context = X mod M.
+  Proof.
+    intros. subst X.
+    assert (0 <= start_context xHigh < 2^machine_wordsize) by (cbv [M] in *; cbn; omega).
+    let r := (eval compute in (2 ^ machine_wordsize)) in
+    replace (2^machine_wordsize) with r in * by reflexivity.
+    cbv [M muLow] in *.
+
+    rewrite <-barrett_red256_prefancy_correct with (dummy_arrow := fun s d _ => DefaultValue.type.default) by auto.
+    rewrite <-barrett_red256_alloc_equivalent with (errorR := RegZero) (errorP := 1%positive) (extra_reg:=extra_reg)
+      by (cbn in *; auto with omega).
+    cbv [ProdEquiv.interp256].
+    let r := (eval compute in (2 ^ 256)) in
+    replace (2^256) with r in * by reflexivity.
+    cbv [barrett_red256_alloc barrett_red256_prefancy].
+
+    step start_context.
+    {
+      match goal with H : Fancy.CC.cc_m _ = _ |- _ => rewrite H end.
+      match goal with |- context [Z.cc_m ?s ?x] =>
+                      pose proof (Z.cc_m_small s x ltac:(reflexivity) ltac:(reflexivity) ltac:(omega));
+                        let H := fresh in
+                        assert (Z.cc_m s x = 1 \/ Z.cc_m s x = 0) as H by omega;
+                          destruct H as [H | H]; rewrite H in *
+      end; break_innermost_match; Z.ltb_to_lt; try congruence. }
+    apply interp_equivZ_256; [ simplify_op_equiv start_context | ]. (* apply manually instead of using [step] to allow a custom bounds proof *)
+    { rewrite Z.rshi_correct by omega.
+      autorewrite with zsimplify_fast.
+      rewrite Z.shiftr_div_pow2 by omega.
+      reflexivity. }
+
+    (* Special case to remember the bound for the output of RSHI *)
+    let v := fresh "v" in
+    let v_bound := fresh "v_bound" in
+    intro v; assert (0 <= v <= 1) as v_bound; [ |generalize v v_bound; clear v v_bound; intros v v_bound].
+    { solve_nonneg start_context. autorewrite with zsimplify_fast.
+      rewrite Z.shiftr_div_pow2 by omega.
+      rewrite Z.mod_pull_div by omega.
+      rewrite Z.mod_small by (cbn; omega).
+      split; [Z.zero_bounds|].
+      apply Z.lt_succ_r.
+      apply Z.div_lt_upper_bound; cbn; omega. }
+
+    step start_context.
+    { rewrite Z.rshi_correct by omega.
+      rewrite Z.shiftr_div_pow2 by omega.
+      repeat (f_equal; try ring). }
+    step start_context; [ reflexivity | ].
+    step start_context; [ reflexivity | ].
+    step start_context; [ reflexivity | ].
+    step start_context; [ reflexivity | ].
+    step start_context; [ reflexivity | reflexivity | ].
+    step start_context; [ reflexivity | reflexivity | ].
+    step start_context; [ reflexivity | reflexivity | ].
+    step start_context; [ reflexivity | reflexivity | ].
+    step start_context; [ reflexivity | reflexivity | ].
+    step start_context;
+      [ rewrite Z.mod_small with (b:=2) by (rewrite Z.mod_small by omega; omega); (* Here we make use of the bound of RSHI *)
+        reflexivity
+      | rewrite Z.mod_small with (b:=2) by (rewrite Z.mod_small by omega; omega); (* Here we make use of the bound of RSHI *)
+        reflexivity | ].
+    step start_context; [ reflexivity | reflexivity | ].
+    step start_context; [ reflexivity | reflexivity | ].
+    step start_context.
+    { rewrite Z.rshi_correct by omega.
+      rewrite Z.shiftr_div_pow2 by omega.
+      repeat (f_equal; try ring). }
+
+    step start_context; [ reflexivity | ].
+    step start_context; [ reflexivity | ].
+    step start_context; [ reflexivity | ].
+    step start_context; [ reflexivity | ].
+    step start_context; [ reflexivity | reflexivity | ].
+    step start_context; [ reflexivity | reflexivity | ].
+    step start_context; [ reflexivity | reflexivity | ].
+    step start_context; [ reflexivity | reflexivity | ].
+
+    step start_context.
+    { reflexivity. }
+    { autorewrite with zsimplify_fast.
+      match goal with |- context [?x mod ?m] => pose proof (Z.mod_pos_bound x m ltac:(omega)) end.
+      rewrite <-testbit_neg_eq_if with (n:=256) by (cbn; omega).
+      reflexivity. }
+    step start_context.
+    { reflexivity. }
+    { autorewrite with zsimplify_fast.
+      rewrite Z.mod_small with (a:=(if (if _ <? 0 then true else _) then _ else _)) (b:=2) by (break_innermost_match; omega).
+      match goal with |- context [?a - ?b - ?c] => replace (a - b - c) with (a - (b + c)) by ring end.
+      match goal with |- context [?x mod ?m] => pose proof (Z.mod_pos_bound x m ltac:(omega)) end.
+      rewrite <-testbit_neg_eq_if with (n:=256) by (break_innermost_match; cbn; omega).
+      reflexivity. }
+    step start_context.
+    { rewrite Z.bit0_eqb.
+      match goal with |- context [(?x mod ?m) &' 1] =>
+                      replace (x mod m) with (x &' Z.ones 256) by (rewrite Z.land_ones by omega; reflexivity) end.
+      rewrite <-Z.land_assoc.
+      rewrite Z.land_ones with (n:=1) by omega.
+      cbn.
+      match goal with |- context [?x mod 2] =>
+                      let H := fresh in
+                      assert (x mod 2 = 0 \/ x mod 2 = 1) as H
+                          by (pose proof (Z.mod_pos_bound x 2 ltac:(omega)); omega);
+                        destruct H as [H | H]; rewrite H
+      end; reflexivity. }
+    step start_context.
+    { reflexivity. }
+    { autorewrite with zsimplify_fast.
+      repeat match goal with |- context [?x mod ?m] => unique pose proof (Z.mod_pos_bound x m ltac:(omega)) end.
+      rewrite <-testbit_neg_eq_if with (n:=256) by (cbn; omega).
+      reflexivity. }
+    step start_context; [ break_innermost_match; Z.ltb_to_lt; omega | ].
+    reflexivity.
+  Qed.
+
   Import PrintingNotations.
   Set Printing Width 1000.
   Open Scope expr_scope.
-
   Print barrett_red256.
   (*
 barrett_red256 =  fun var : type -> Type => λ x : var (type.type_primitive type.Z * type.type_primitive type.Z)%ctype,
@@ -11546,7 +11738,7 @@ barrett_red256 =  fun var : type -> Type => λ x : var (type.type_primitive type
     sell@(y25, $y24, RegZero, RegMod);
     sub@(y26, $y23, $y25, 0);
     addm@(y27, $y26, RegZero, RegMod);
-    ret $y27 
+    ret $y27
   *)
 End Barrett256.
 
@@ -12533,71 +12725,67 @@ Eval cbv beta iota delta [Prod.MontRed256 Prod.Mul256 Prod.Mul256x256] in Prod.M
 (* Uncomment to see proof statement and remaining admitted statements,
 or search for "prod_montred256_correct" to see comments on the proof
 preconditions. *)
-(* 
+(*
 Check Montgomery256.prod_montred256_correct.
 Print Assumptions Montgomery256.prod_montred256_correct.
 *)
 
 (*** Barrett Reduction ***)
 
-(* Status : Code in "pre-fancy" (stage before final form) is proven
-correct modulo admits in compiler portions. Code in final form
-("fancy") is generated, but not yet proven equivalent to the
-"pre-fancy" version.
-
-The next step is to, using the same methodology as for Montgomery,
-prove the generated fancy code equivalent to Prod.MulMod and also
-equivalent to the "pre-fancy" code. Once these equivalences are
-proven, then the "pre-fancy" proof will apply to Prod.MulMod.
- *)
-
-Import PreFancy.
-Import PreFancy.Notations.
-Local Notation "'RegMod'" := (Straightline.expr.Primitive (t:=type.Z) 115792089210356248762697446949407573530086143415290314195533631308867097853951).
-Local Notation "'RegMuLow'" := (Straightline.expr.Primitive (t:=type.Z) 26959946667150639793205513449348445388433292963828203772348655992835).
-
-(* "Prefancy" form : *)
-Print Barrett256.barrett_red256_prefancy.
-(*
-selm@(y, $x₂, RegZero, RegMuLow);
-rshi@(y0, RegZero, $x₂,255);
-rshi@(y1, $x₂, $x₁,255);
-mulhh@(y2, RegMuLow, $y1);
-mulhl@(y3, RegMuLow, $y1);
-mullh@(y4, RegMuLow, $y1);
-mulll@(y5, RegMuLow, $y1);
-add@(y6, $y5, $y4, 128);
-addc@(y7, carry{$y6}, $y2, $y4, -128);
-add@(y8, $y6, $y3, 128);
-addc@(y9, carry{$y8}, $y7, $y3, -128);
-add@(y10, $y1, $y9, 0);
-addc@(y11, carry{$y10}, RegZero, $y0, 0);
-add@(y12, $y, $y10, 0);
-addc@(y13, carry{$y12}, RegZero, $y11, 0);
-rshi@(y14, $y13, $y12,1);
-mulhh@(y15, RegMod, $y14);
-mullh@(y16, RegMod, $y14);
-mulhl@(y17, RegMod, $y14);
-mulll@(y18, RegMod, $y14);
-add@(y19, $y18, $y17, 128);
-addc@(y20, carry{$y19}, $y15, $y17, -128);
-add@(y21, $y19, $y16, 128);
-addc@(y22, carry{$y21}, $y20, $y16, -128);
-sub@(y23, $x₁, $y21, 0);
-subb@(y24, carry{$y23}, $x₂, $y22, 0);
-sell@(y25, $y24, RegZero, RegMod);
-sub@(y26, $y23, $y25, 0);
-addm@(y27, $y26, RegZero, RegMod);
-ret $y27
- *)
+(* Status: Code is proven correct modulo admits in compiler
+portions. However, unlike for Montgomery, this code is not proven
+equivalent to the register-allocated and efficiently-scheduled
+reference (Prod.MulMod). This proof is currently admitted and would
+require either fiddling with code generation to make instructions come
+out in the right order or reasoning about which instructions
+commute. *)
 
-(* Uncomment to see proof statement and remaining admitted statements. *)
+(* Barrett reference code: *)
+Eval cbv beta iota delta [Prod.MulMod Prod.Mul256x256] in Prod.MulMod.
 (*
-Check barrett_red256_prefancy_correct.
-Print Assumptions barrett_red256_prefancy_correct.
+     = fun x xHigh RegMuLow scratchp1 scratchp2 scratchp3 scratchp4 scratchp5 : register =>
+       let q1Bottom256 := scratchp1 in
+       let muSelect := scratchp2 in
+       let q2 := scratchp3 in
+       let q2High := scratchp4 in
+       let q2High2 := scratchp5 in
+       let q3 := scratchp1 in
+       let r2 := scratchp2 in
+       let r2High := scratchp3 in
+       let maybeM := scratchp1 in
+       SELM muSelect RegMuLow RegZero;
+       RSHI 255 q1Bottom256 xHigh x;
+       MUL128LL q2 q1Bottom256 RegMuLow;
+       MUL128UU q2High q1Bottom256 RegMuLow;
+       MUL128UL scratchp5 q1Bottom256 RegMuLow;
+       ADD 128 q2 q2 scratchp5;
+       ADDC (-128) q2High q2High scratchp5;
+       MUL128LU scratchp5 q1Bottom256 RegMuLow;
+       ADD 128 q2 q2 scratchp5;
+       ADDC (-128) q2High q2High scratchp5;
+       RSHI 255 q2High2 RegZero xHigh;
+       ADD 0 q2High q2High q1Bottom256;
+       ADDC 0 q2High2 q2High2 RegZero;
+       ADD 0 q2High q2High muSelect;
+       ADDC 0 q2High2 q2High2 RegZero;
+       RSHI 1 q3 q2High2 q2High;
+       MUL128LL r2 RegMod q3;
+       MUL128UU r2High RegMod q3;
+       MUL128UL scratchp4 RegMod q3;
+       ADD 128 r2 r2 scratchp4;
+       ADDC (-128) r2High r2High scratchp4;
+       MUL128LU scratchp4 RegMod q3;
+       ADD 128 r2 r2 scratchp4;
+       ADDC (-128) r2High r2High scratchp4;
+       SUB 0 muSelect x r2;
+       SUBC 0 xHigh xHigh r2High;
+       SELL maybeM RegMod RegZero;
+       SUB 0 q3 muSelect maybeM;
+       ADDM x q3 RegZero RegMod;
+       Ret x
  *)
 
-(* "Fancy" code (NOT proven) *)
+(* Barrett generated code (equivalence with reference admitted) *)
 Eval cbv beta iota delta [barrett_red256_alloc] in barrett_red256_alloc.
 (*
      = fun (xLow xHigh RegMuLow : register) (_ : positive) (_ : register) =>
@@ -12632,3 +12820,10 @@ Eval cbv beta iota delta [barrett_red256_alloc] in barrett_red256_alloc.
        ADDM r28 r27 RegZero RegMod;
        Ret r28
  *)
+
+(* Uncomment to see proof statement and remaining admitted statements. *)
+(*
+Check prod_barrett_red256_correct.
+Print Assumptions prod_barrett_red256_correct.
+(* The equivalence with generated code is admitted as barrett_red256_alloc_equivalent. *)
+*)
\ No newline at end of file
-- 
cgit v1.2.3