(*** Word-By-Word Montgomery Multiplication Proofs *)
Require Import Coq.Arith.Arith.
Require Import Coq.ZArith.BinInt Coq.ZArith.ZArith Coq.ZArith.Zdiv Coq.micromega.Lia.
Require Import Crypto.Util.LetIn.
Require Import Crypto.Util.Prod.
Require Import Crypto.Util.NatUtil.
Require Import Crypto.Util.ZUtil.
Require Import Crypto.Arithmetic.ModularArithmeticTheorems Crypto.Spec.ModularArithmetic.
Require Import Crypto.Arithmetic.MontgomeryReduction.WordByWord.Abstract.Definition.
Require Import Crypto.Algebra.Ring.
Require Import Crypto.Util.ZUtil.MulSplit.
Require Import Crypto.Util.Sigma.
Require Import Crypto.Util.Tactics.SetEvars.
Require Import Crypto.Util.Tactics.SubstEvars.
Require Import Crypto.Util.Tactics.DestructHead.
Local Open Scope Z_scope.

Section WordByWordMontgomery.
  (* Abstract interface assumed by every proof in this section: a limb-vector
     type [T], its operations, and the specification of each operation in
     terms of [eval] and [numlimbs].  Nothing is implemented in this file;
     all of these are section hypotheses. *)
  Context
    {T : Type}
    {eval : T -> Z}
    {numlimbs : T -> nat}
    {zero : nat -> T}
    {divmod : T -> T * Z} (* returns (all-but-lowest-limb, lowest limb); see [eval_div] and [eval_mod] *)
    {r : positive}
    {r_big : r > 1}
    {R : positive}
    {R_numlimbs : nat}
    {R_correct : R = r^Z.of_nat R_numlimbs :> Z}
    {small : T -> Prop}
    {eval_zero : forall n, eval (zero n) = 0}
    {numlimbs_zero : forall n, numlimbs (zero n) = n}
    (* The specifications of [divmod] on values only hold for [small] inputs. *)
    {eval_div : forall v, small v -> eval (fst (divmod v)) = eval v / r}
    {eval_mod : forall v, small v -> snd (divmod v) = eval v mod r}
    {small_div : forall v, small v -> small (fst (divmod v))}
    {numlimbs_div : forall v, numlimbs (fst (divmod v)) = pred (numlimbs v)}
    {scmul : Z -> T -> T} (* uses double-output multiply *)
    (* [scmul] is only specified for a scalar in [0, r) and a vector value in [0, R). *)
    {eval_scmul: forall a v, 0 <= a < r -> 0 <= eval v < R -> eval (scmul a v) = a * eval v}
    {numlimbs_scmul : forall a v, 0 <= a < r -> numlimbs (scmul a v) = S (numlimbs v)}
    {add : T -> T -> T} (* joins carry *)
    (* [add] is exact, always yields a [small] result, and grows the limb count by one. *)
    {eval_add : forall a b, eval (add a b) = eval a + eval b}
    {small_add : forall a b, small (add a b)}
    {numlimbs_add : forall a b, numlimbs (add a b) = Datatypes.S (max (numlimbs a) (numlimbs b))}
    {drop_high : T -> T} (* drops things after [S R_numlimbs] *)
    {eval_drop_high : forall v, small v -> eval (drop_high v) = eval v mod (r * r^Z.of_nat R_numlimbs)}
    {numlimbs_drop_high : forall v, numlimbs (drop_high v) = min (numlimbs v) (S R_numlimbs)}
    (* The modulus [N] is strictly positive (it evaluates to [Z.pos Npos]) and below [R]. *)
    (N : T) (Npos : positive) (Npos_correct: eval N = Z.pos Npos)
    (N_lt_R : eval N < R)
    (* The fixed multiplicand [B], with value in [0, R). *)
    (B : T)
    (B_bounds : 0 <= eval B < R)
    (* [ri] acts as an inverse of [r] modulo [N]: r * ri = 1 (mod N). *)
    ri (ri_correct : r*ri mod (eval N) = 1 mod (eval N)).
  (* [k] satisfies k * N = -1 (mod r). *)
  Context (k : Z) (k_correct : k * eval N mod r = (-1) mod r).

  (* Rewrite databases used to push [numlimbs] and [eval] through the abstract
     operations; they are populated by the [Hint Rewrite] commands below. *)
  Create HintDb push_numlimbs discriminated.
  Create HintDb push_eval discriminated.
  (* Side-condition solver shared by the rewrite hints and the smallness
     lemmas.  It repeatedly applies the first alternative that succeeds:
     closing the goal from a hypothesis, applying the smallness facts of
     [add] and [divmod], applying the bound on [mod], rewriting
     [Z.mul_split_mod], arithmetic automation ([auto with zarith], [lia]),
     and finally rewriting with the two databases above.  It stops as soon
     as no alternative applies, possibly leaving goals open. *)
  Local Ltac t_small :=
    repeat first [ assumption
                 | apply small_add
                 | apply small_div
                 | apply Z_mod_lt
                 | rewrite Z.mul_split_mod
                 | solve [ auto with zarith ]
                 | lia
                 | progress autorewrite with push_eval
                 | progress autorewrite with push_numlimbs ].
  (* Register the [eval] specifications of the abstract operations as rewrite
     rules.  A rule only fires when its side conditions can be discharged by
     unfolding the [word_by_word_montgomery] definitions and running
     [t_small]. *)
  Hint Rewrite
       eval_zero
       eval_div
       eval_mod
       eval_add
       eval_scmul
       eval_drop_high
       using (repeat autounfold with word_by_word_montgomery; t_small)
    : push_eval.
  (* Same for the [numlimbs] specifications, with the same side-condition
     solver. *)
  Hint Rewrite
       numlimbs_zero
       numlimbs_div
       numlimbs_add
       numlimbs_scmul
       numlimbs_drop_high
       using (repeat autounfold with word_by_word_montgomery; t_small)
    : push_numlimbs.
  (* These three standard-library facts are added right-to-left (note the
     [<-]) to the same database. *)
  Hint Rewrite <- Max.succ_max_distr pred_Sn Min.succ_min_distr : push_numlimbs.


  (* Recurse for a as many iterations as A has limbs, varying A := A, S := 0, r, bounds *)
  Section Iteration.
    (* Inputs of a single iteration: [A] must be [small] and the value of [S]
       must be nonnegative.  Note that [S] shadows the [nat] successor
       constructor for the rest of this section. *)
    Context (A S : T)
            (small_A : small A)
            (S_nonneg : 0 <= eval S).
    (* Given A, B < R, we want to compute A * B / R mod N. R = bound 0 * ... * bound (n-1) *)

    (* Lets a [T] be written where a [Z] is expected, e.g. in [S + a*B]. *)
    Local Coercion eval : T >-> Z.

    (* Short names for the intermediate values of one iteration, specialised
       to the section variables.  The definitions themselves live in
       [WordByWord.Abstract.Definition] and are not shown in this file. *)
    Local Notation a := (@WordByWord.Abstract.Definition.a T divmod A).
    Local Notation A' := (@WordByWord.Abstract.Definition.A' T divmod A).
    Local Notation S1 := (@WordByWord.Abstract.Definition.S1 T divmod scmul add B A S).
    Local Notation S2 := (@WordByWord.Abstract.Definition.S2 T divmod r scmul add N B k A S).
    Local Notation S3 := (@WordByWord.Abstract.Definition.S3 T divmod r scmul add N B k A S).
    Local Notation S4 := (@WordByWord.Abstract.Definition.S4 T divmod r scmul add drop_high N B k A S).

    (* Loop invariant, upper half: if the accumulator [S] is below [N + B]
       before the iteration, then [S3] is below [N + B] as well.

       Proof sketch: after pushing [eval] through the definitions, the value
       is bounded by [(N+B-1 + (r-1)*B + (r-1)*N) / r], using the auxiliary
       fact [Hmod] that a remainder modulo a positive [b] is at most [b - 1].
       That quotient is then simplified and compared with [N + B].

       Linear arithmetic goals are closed with [lia]; the deprecated [omega]
       tactic is no longer used here. *)
    Lemma S3_bound
      : eval S < eval N + eval B
        -> eval S3 < eval N + eval B.
    Proof.
      assert (Hmod : forall a b, 0 < b -> a mod b <= b - 1)
        by (intros x y; pose proof (Z_mod_lt x y); lia).
      intro HS.
      unfold S3, WordByWord.Abstract.Definition.S2, WordByWord.Abstract.Definition.S1.
      (* The trailing [[]] insists that all rewrite side conditions were solved. *)
      autorewrite with push_eval; [].
      eapply Z.le_lt_trans.
      { transitivity ((N+B-1 + (r-1)*B + (r-1)*N) / r);
          [ | set_evars; ring_simplify_subterms; subst_evars; reflexivity ].
        Z.peel_le; repeat apply Z.add_le_mono; repeat apply Z.mul_le_mono_nonneg; try lia;
          repeat autounfold with word_by_word_montgomery; rewrite ?Z.mul_split_mod;
          autorewrite with push_eval;
            try Z.zero_bounds;
            auto with lia. }
      rewrite (Z.mul_comm _ r), <- Z.add_sub_assoc, <- Z.add_opp_r, !Z.div_add_l' by lia.
      autorewrite with zsimplify.
      lia.
    Qed.

    (* [A'] is [small].  The proof unfolds the definitions registered in the
       [word_by_word_montgomery] unfold database and finishes with [auto]
       (presumably through [small_div] and [small_A]; the definition of [A']
       is not visible in this file). *)
    Lemma small_A'
      : small A'.
    Proof.
      repeat autounfold with word_by_word_montgomery; auto.
    Qed.

    (* [S3] is [small]; discharged entirely by unfolding and [t_small].
       Needed as a side condition whenever [eval_drop_high] is applied to
       [S3]. *)
    Lemma small_S3
      : small S3.
    Proof. repeat autounfold with word_by_word_montgomery; t_small. Qed.

    (* Loop invariant, lower half: the value of [S3] is nonnegative.
       After unfolding and pushing [eval], [eval N] is replaced by the
       literal positive [Z.pos Npos] so that [Z.zero_bounds] can see that it
       is nonnegative. *)
    Lemma S3_nonneg : 0 <= eval S3.
    Proof.
      repeat autounfold with word_by_word_montgomery; rewrite Z.mul_split_mod;
        autorewrite with push_eval; [].
      rewrite ?Npos_correct; Z.zero_bounds; lia.
    Qed.

    (* The value of [S4] is nonnegative: by [eval_drop_high] it is a
       remainder modulo [r * r^R_numlimbs]. *)
    Lemma S4_nonneg : 0 <= eval S4.
    Proof. unfold S4; rewrite eval_drop_high by apply small_S3; Z.zero_bounds. Qed.

    (* The bound of [S3_bound] carries over to [S4].  Since [S3] lies in
       [0, N + B), the reduction modulo [r * r^R_numlimbs] performed by
       [drop_high] is shown to be the identity ([Z.mod_small], with the range
       side condition left to [nia]). *)
    Lemma S4_bound
      : eval S < eval N + eval B
        -> eval S4 < eval N + eval B.
    Proof.
      intro H; pose proof (S3_bound H); pose proof S3_nonneg.
      unfold S4.
      rewrite eval_drop_high by apply small_S3.
      rewrite Z.mod_small by nia.
      assumption.
    Qed.

    (* Limb count of [S4] in terms of the limb counts of [S], [B] and [N],
       capped at [1 + R_numlimbs] by [drop_high].  The statement writes
       [1 + n] rather than a successor because [S] is a section variable
       here; [cbn [plus]] turns those sums back into successors before
       rewriting. *)
    Lemma numlimbs_S4 : numlimbs S4 = min (max (1 + numlimbs S) (1 + max (1 + numlimbs B) (numlimbs N))) (1 + R_numlimbs).
    Proof.
      cbn [plus].
      repeat autounfold with word_by_word_montgomery; rewrite Z.mul_split_mod.
      repeat autorewrite with push_numlimbs.
      (* Normalise the spelling of [max] so the associativity rewrite matches. *)
      change Init.Nat.max with Nat.max.
      rewrite <- ?(Max.max_assoc (numlimbs S)).
      reflexivity.
    Qed.

    (* Value of the first intermediate: [S1] evaluates to [S + a*B]. *)
    Lemma S1_eq : eval S1 = S + a*B.
    Proof.
      cbv [S1 a WordByWord.Abstract.Definition.A'].
      repeat autorewrite with push_eval.
      reflexivity.
    Qed.

    (* Modulo [N], [S2] agrees with [S + a*B]: going from [S1] to [S2] does
       not change the residue modulo [N] (presumably because only a multiple
       of [N] is added; the definition of [S2] is not visible in this file). *)
    Lemma S2_mod_N : (eval S2) mod N = (S + a*B) mod N.
    Proof.
      cbv [S2 WordByWord.Abstract.Definition.q WordByWord.Abstract.Definition.s]; autorewrite with push_eval zsimplify. rewrite S1_eq. reflexivity.
    Qed.

    (* [S2] is a multiple of [r], so the division by [r] that produces [S3]
       is exact.

       The proof moves the goal into the finite field [F r] through
       [F.eq_of_Z_iff], uses [k_correct] to replace the image of [k * N] by
       [F.opp F.one], and then checks the resulting equation on the
       underlying integers.  [Hr] records that both spellings of minus one
       reduce to [r - 1] modulo [r].

       The script now opens with an explicit [Proof.] like every other lemma
       in the file, and uses [lia] instead of the deprecated [omega]. *)
    Lemma S2_mod_r : S2 mod r = 0.
    Proof.
      cbv [S2 WordByWord.Abstract.Definition.q WordByWord.Abstract.Definition.s]; autorewrite with push_eval.
      assert (r > 0) by lia.
      assert (Hr : (-(1 mod r)) mod r = r - 1 /\ (-(1)) mod r = r - 1).
      { destruct (Z.eq_dec r 1) as [H'|H'].
        { rewrite H'; split; reflexivity. }
        { rewrite !Z_mod_nz_opp_full; rewrite ?Z.mod_mod; Z.rewrite_mod_small; [ split; reflexivity | lia.. ]. } }
      autorewrite with pull_Zmod.
      replace 0 with (0 mod r) by apply Zmod_0_l.
      eapply F.eq_of_Z_iff.
      rewrite Z.mul_split_mod.
      repeat rewrite ?F.of_Z_add, ?F.of_Z_mul, <-?F.of_Z_mod.
      rewrite <-Algebra.Hierarchy.associative.
      replace ((F.of_Z r k * F.of_Z r (eval N))%F) with (F.opp (m:=r) F.one).
      { (* Main goal: check the field equation on the underlying integers. *)
        cbv [F.of_Z F.add]; simpl.
        apply path_sig_hprop; [ intro; exact HProp.allpath_hprop | ].
        simpl.
        rewrite (proj1 Hr), Z.mul_sub_distr_l.
        push_Zmod; pull_Zmod.
        autorewrite with zsimplify; reflexivity. }
      { (* Justification of the [replace]: this is [k_correct] in [F r]. *)
        rewrite <- F.of_Z_mul.
        rewrite F.of_Z_mod.
        rewrite k_correct.
        cbv [F.of_Z F.add F.opp F.one]; simpl.
        change (-(1)) with (-1) in *.
        apply path_sig_hprop; [ intro; exact HProp.allpath_hprop | ]; simpl.
        rewrite (proj1 Hr), (proj2 Hr); Z.rewrite_mod_small; reflexivity. }
    Qed.

    (* Modulo [N], [S3] equals [(S + a*B) * ri]: the division by [r] acts as
       multiplication by [ri].

       [HH] is [Z.div_to_inv_modulo] instantiated with [ri_correct]; it is
       used to trade the division by [r] for a multiplication by [ri].  The
       subtracted remainder vanishes by [S2_mod_r], and [S2_mod_N] then
       identifies the remaining factor. *)
    Lemma S3_mod_N
      : S3 mod N = (S + a*B)*ri mod N.
    Proof.
      cbv [S3]; autorewrite with push_eval cancel_pair.
      pose proof fun a => Z.div_to_inv_modulo N a r ri eq_refl ri_correct as HH;
                            cbv [Z.equiv_modulo] in HH; rewrite HH; clear HH.
      etransitivity; [rewrite (fun a => Z.mul_mod_l a ri N)|
                      rewrite (fun a => Z.mul_mod_l a ri N); reflexivity].
      rewrite <-S2_mod_N; repeat (f_equal; []); autorewrite with push_eval.
      autorewrite with push_Zmod;
        rewrite S2_mod_r;
        autorewrite with zsimplify.
      reflexivity.
    Qed.

    (* Same congruence as [S3_mod_N], for [S4].  Under the bound [Hbound] the
       value of [S3] lies in [0, N + B), so the reduction performed by
       [drop_high] is the identity and the statement reduces to
       [S3_mod_N]. *)
    Lemma S4_mod_N
          (Hbound : eval S < eval N + eval B)
      : S4 mod N = (S + a*B)*ri mod N.
    Proof.
      pose proof (S3_bound Hbound); pose proof S3_nonneg.
      unfold S4; autorewrite with push_eval.
      rewrite (Z.mod_small _ (r * _)) by nia.
      apply S3_mod_N.
    Qed.
  End Iteration.

  (* Abbreviations for the loop body, the loop, and the full reduction,
     specialised to the section variables.  The underlying definitions come
     from the imported Definition file. *)
  Local Notation redc_body := (@redc_body T divmod r scmul add drop_high N B k).
  Local Notation redc_loop := (@redc_loop T divmod r scmul add drop_high N B k).
  Local Notation redc A := (@redc T numlimbs zero divmod r scmul add drop_high N A B k).

  (* Running [redc_body] once before [count] loop iterations gives the same
     state as running it once after them.  This lets the inductive proofs
     below peel the last iteration off the loop instead of the first. *)
  Lemma redc_loop_comm_body count
    : forall A_S, redc_loop count (redc_body A_S) = redc_body (redc_loop count A_S).
  Proof.
    induction count as [|n IHn]; intro st.
    { (* Zero iterations: both sides compute to the same term. *)
      reflexivity. }
    { (* Successor: unfold one loop step, then use the induction hypothesis. *)
      simpl; rewrite IHn; reflexivity. }
  Qed.

  Section body.
    (* The loop state is a pair; [A] and [S] name its components, and [a] is
       the second component of [divmod A] (by [eval_mod], its value is
       [eval A mod r] when [A] is [small]). *)
    Context (A_S : T * T).
    Let A:=fst A_S.
    Let S:=snd A_S.
    Let A_a:=divmod A.
    Let a:=snd A_a.
    (* Invariant assumed on entry to one loop-body step. *)
    Context (small_A : small A)
            (S_bound : 0 <= eval S < eval N + eval B).

    (* The first component of the state stays [small] after one body step;
       a pair-level restatement of [small_A']. *)
    Lemma small_fst_redc_body : small (fst (redc_body A_S)).
    Proof. destruct A_S; apply small_A'; assumption. Qed.
    (* The second component of the state has a nonnegative value after one
       body step; a pair-level restatement of [S4_nonneg]. *)
    Lemma snd_redc_body_nonneg : 0 <= eval (snd (redc_body A_S)).
    Proof. destruct A_S; apply S4_nonneg; assumption. Qed.

    (* Residue of the accumulator after one body step: modulo [N] it equals
       [(eval S + a * eval B) * ri].  Pair-level restatement of [S4_mod_N];
       the remaining bound side conditions follow from [S_bound] by linear
       arithmetic ([lia], replacing the deprecated [omega]). *)
    Lemma snd_redc_body_mod_N
      : (eval (snd (redc_body A_S))) mod (eval N) = (eval S + a*eval B)*ri mod (eval N).
    Proof. destruct A_S; apply S4_mod_N; auto; lia. Qed.

    (* One body step divides the value of the first state component by [r]
       (integer division). *)
    Lemma fst_redc_body
      : (eval (fst (redc_body A_S))) = eval (fst A_S) / r.
    Proof.
      destruct A_S; simpl; unfold WordByWord.Abstract.Definition.A', WordByWord.Abstract.Definition.A_a, Let_In, a, A_a, A; simpl.
      autorewrite with push_eval.
      reflexivity.
    Qed.

    (* Modular form of [fst_redc_body]: modulo [N], dividing by [r] equals
       subtracting [a] and multiplying by [ri].  The division is traded for
       the multiplication through [Z.div_to_inv_modulo]. *)
    Lemma fst_redc_body_mod_N
      : (eval (fst (redc_body A_S))) mod (eval N) = ((eval (fst A_S) - a)*ri) mod (eval N).
    Proof.
      rewrite fst_redc_body.
      etransitivity; [ eapply Z.div_to_inv_modulo; try eassumption; lia | ].
      unfold a, A_a, A.
      autorewrite with push_eval.
      reflexivity.
    Qed.

    (* One body step preserves the upper bound [N + B] on the accumulator.
       Pair-level restatement of [S4_bound]; after exposing [S] as the second
       component of the pair, the side conditions are either hypotheses or
       linear consequences of [S_bound] ([lia], replacing the deprecated
       [omega]). *)
    Lemma redc_body_bound
      : eval S < eval N + eval B
        -> eval (snd (redc_body A_S)) < eval N + eval B.
    Proof.
      destruct A_S; apply S4_bound; unfold S in *; cbn [snd] in *; try assumption; try lia.
    Qed.

    (* Limb count of the accumulator after one body step; pair-level
       restatement of [numlimbs_S4]. *)
    Lemma numlimbs_redc_body : numlimbs (snd (redc_body A_S))
                               = min (max (1 + numlimbs (snd A_S)) (1 + max (1 + numlimbs B) (numlimbs N))) (1 + R_numlimbs).
    Proof. destruct A_S; apply numlimbs_S4; assumption. Qed.
  End body.

  (* Restrict when [simpl]/[cbn] may unfold [Z.pow] and [Z.of_nat]: the [!]
     marks require the flagged arguments to be constructor-headed. *)
  Local Arguments Z.pow !_ !_.
  Local Arguments Z.of_nat !_.
  (* Common opening of the loop proofs: induct on the iteration count,
     introduce everything, unfold one step of [redc_loop], and in the
     successor case use [redc_loop_comm_body] so that the last iteration,
     rather than the first, is the one exposed. *)
  Local Ltac induction_loop count IHcount
    := induction count as [|count IHcount]; intros; cbn [redc_loop] in *; [ | rewrite redc_loop_comm_body in * ].
  (* Loop invariant: if the first component of the state is [small] and the
     value of the second lies in [0, N + B), the same holds after any number
     of iterations.  The successor case combines the three single-step
     lemmas [small_fst_redc_body], [redc_body_bound] and
     [snd_redc_body_nonneg]. *)
  Lemma redc_loop_good A_S count
        (Hsmall : small (fst A_S))
        (Hbound : 0 <= eval (snd A_S) < eval N + eval B)
    : small (fst (redc_loop count A_S))
      /\ 0 <= eval (snd (redc_loop count A_S)) < eval N + eval B.
  Proof.
    induction_loop count IHcount; auto; [].
    change (id (0 <= eval B < R)) in B_bounds (* don't let [destruct_head'_and] loop *).
    destruct_head'_and.
    repeat first [ apply conj
                 | apply small_fst_redc_body
                 | apply redc_body_bound
                 | apply snd_redc_body_nonneg
                 | solve [ auto ] ].
  Qed.

  (* Range half of the loop invariant on its own: after [count] iterations
     the value of the accumulator still lies in [0, N + B). *)
  Lemma redc_loop_bound A_S count
        (Hsmall : small (fst A_S))
        (Hbound : 0 <= eval (snd A_S) < eval N + eval B)
    : 0 <= eval (snd (redc_loop count A_S)) < eval N + eval B.
  Proof.
    (* [redc_loop_good] proves a conjunction; keep only its second half. *)
    eapply proj2, redc_loop_good; eassumption.
  Qed.

  (* One simplification step on a goal containing [max] or [min] on [nat]:
     if [omega] can decide which argument wins, rewrite the [max]/[min] to
     that argument; otherwise normalise [Init.Nat.max]/[Init.Nat.min] to the
     [Nat.max]/[Nat.min] spelling.  Fails when none of this makes progress,
     so it is suitable for use under [repeat].  The unit argument only delays
     evaluation of the tactic.

     NOTE(review): [omega] is deprecated in recent Coq releases.  Swapping in
     [lia] here may change which rewrite fires, since [lia] may reason about
     [max]/[min] itself; recheck [numlimbs_redc_loop] before migrating. *)
  Local Ltac t_min_max_step _ :=
    match goal with
    | [ |- context[Init.Nat.max ?x ?y] ]
      => first [ rewrite (Max.max_l x y) by omega
               | rewrite (Max.max_r x y) by omega ]
    | [ |- context[Init.Nat.min ?x ?y] ]
      => first [ rewrite (Min.min_l x y) by omega
               | rewrite (Min.min_r x y) by omega ]
    | _ => progress change Init.Nat.max with Nat.max
    | _ => progress change Init.Nat.min with Nat.min
    end.

  (* Limb count of the accumulator after the loop: unchanged after zero
     iterations, and exactly [1 + R_numlimbs] after at least one, provided
     the accumulator starts with at least [R_numlimbs] limbs.

     The proof first establishes the closed form [Hgen] by induction (each
     step contributes the min/max expression of [numlimbs_redc_body], and a
     three-way case split on the relevant comparisons lets
     [t_min_max_step] collapse it), and then simplifies [Hgen] using
     [Hnumlimbs].  The closing arithmetic step uses [lia] instead of the
     deprecated [omega]. *)
  Lemma numlimbs_redc_loop A_S count
        (Hsmall : small (fst A_S))
        (Hbound : 0 <= eval (snd A_S) < eval N + eval B)
        (Hnumlimbs : (R_numlimbs <= numlimbs (snd A_S))%nat)
    : numlimbs (snd (redc_loop count A_S))
      = match count with
        | O => numlimbs (snd A_S)
        | S _ => 1 + R_numlimbs
        end%nat.
  Proof.
    assert (Hgen
            : numlimbs (snd (redc_loop count A_S))
              = match count with
                | O => numlimbs (snd A_S)
                | S _ => min (max (count + numlimbs (snd A_S)) (1 + max (1 + numlimbs B) (numlimbs N))) (1 + R_numlimbs)
                end).
    { induction_loop count IHcount; [ reflexivity | ].
      rewrite numlimbs_redc_body by (try apply redc_loop_good; auto).
      rewrite IHcount; clear IHcount.
      destruct count; [ reflexivity | ].
      destruct (Compare_dec.le_lt_dec (1 + max (1 + numlimbs B) (numlimbs N)) (S count + numlimbs (snd A_S))),
      (Compare_dec.le_lt_dec (1 + R_numlimbs) (S count + numlimbs (snd A_S))),
      (Compare_dec.le_lt_dec (1 + R_numlimbs) (1 + max (1 + numlimbs B) (numlimbs N)));
        repeat first [ reflexivity
                     | t_min_max_step ()
                     | progress autorewrite with push_numlimbs
                     | rewrite Nat.min_comm, Nat.min_max_distr ]. }
    rewrite Hgen; clear Hgen.
    destruct count; [ reflexivity | ].
    (* Case on every remaining [max]/[min]; each case is linear over [nat]. *)
    repeat apply Max.max_case_strong; apply Min.min_case_strong; lia.
  Qed.


  (* After [count] iterations the value of the first state component is the
     initial value divided by [r^count] (integer division).  Each iteration
     contributes one division by [r] ([fst_redc_body]); nested divisions are
     merged with [Zdiv_Zdiv] and the exponent is rebuilt with
     [Z.pow_add_r]. *)
  Lemma fst_redc_loop A_S count
        (Hsmall : small (fst A_S))
        (Hbound : 0 <= eval (snd A_S) < eval N + eval B)
    : eval (fst (redc_loop count A_S)) = eval (fst A_S) / r^(Z.of_nat count).
  Proof.
    induction_loop count IHcount.
    { simpl; autorewrite with zsimplify; reflexivity. }
    { rewrite fst_redc_body, IHcount
        by (apply redc_loop_good; auto).
      rewrite Zdiv_Zdiv by Z.zero_bounds.
      rewrite <- (Z.pow_1_r r) at 2.
      rewrite <- Z.pow_add_r by lia.
      replace (Z.of_nat count + 1) with (Z.of_nat (S count)) by (simpl; lia).
      reflexivity. }
  Qed.

  (* Modular form of [fst_redc_loop]: modulo [N], dividing by [r^count]
     equals subtracting the remainder modulo [r^count] and multiplying by
     [ri^count].  The middle sub-proof shows that [ri^count] acts as an
     inverse of [r^count] modulo [N], by raising [ri_correct] to the power
     [count]. *)
  Lemma fst_redc_loop_mod_N A_S count
        (Hsmall : small (fst A_S))
        (Hbound : 0 <= eval (snd A_S) < eval N + eval B)
    : eval (fst (redc_loop count A_S)) mod (eval N)
      = (eval (fst A_S) - eval (fst A_S) mod r^Z.of_nat count)
        * ri^(Z.of_nat count) mod (eval N).
  Proof.
    rewrite fst_redc_loop by assumption.
    destruct count.
    { simpl; autorewrite with zsimplify; reflexivity. }
    { etransitivity;
        [ eapply Z.div_to_inv_modulo;
          try solve [ eassumption
                    | apply Z.lt_gt, Z.pow_pos_nonneg; lia ]
        | ].
      { erewrite <- Z.pow_mul_l, <- Z.pow_1_l.
        { apply Z.pow_mod_Proper; [ eassumption | reflexivity ]. }
        { lia. } }
      reflexivity. }
  Qed.

  (* From here on [simpl] must never unfold [Z.pow]; the proof below relies
     on powers staying folded. *)
  Local Arguments Z.pow : simpl never.
  (* Residue of the accumulator after [count] iterations: modulo [N] it
     equals [(S0 + (A0 mod r^count) * B) * ri^count], where [A0] and [S0]
     are the values of the two components of the initial state.

     Proof by induction on [count].  The successor case uses
     [snd_redc_body_mod_N] for the last iteration and the induction
     hypothesis for the preceding ones, rewrites the value of the first
     component with [fst_redc_loop], trades the division by [r^count] for a
     multiplication by [ri^count], and finally reassembles
     [A0 mod r^(count+1)] from its low part [A0 mod r^count] and the next
     limb.

     The side condition of the [fst_redc_loop] rewrite is closed with [lia]
     instead of the deprecated [omega]. *)
  Lemma snd_redc_loop_mod_N A_S count
        (Hsmall : small (fst A_S))
        (Hbound : 0 <= eval (snd A_S) < eval N + eval B)
    : (eval (snd (redc_loop count A_S))) mod (eval N)
      = ((eval (snd A_S) + (eval (fst A_S) mod r^(Z.of_nat count))*eval B)*ri^(Z.of_nat count)) mod (eval N).
  Proof.
    induction_loop count IHcount.
    { simpl; autorewrite with zsimplify; reflexivity. }
    { simpl; rewrite snd_redc_body_mod_N
        by (apply redc_loop_good; auto).
      push_Zmod; rewrite IHcount; pull_Zmod.
      autorewrite with push_eval; [ | apply redc_loop_good; auto.. ]; [].
      (* Work up to congruence modulo [N] from here on. *)
      match goal with
      | [ |- ?x mod ?N = ?y mod ?N ]
        => change (Z.equiv_modulo N x y)
      end.
      destruct A_S as [A S].
      cbn [fst snd].
      change (Z.pos (Pos.of_succ_nat ?n)) with (Z.of_nat (Datatypes.S n)).
      rewrite !Z.mul_add_distr_r.
      rewrite <- !Z.mul_assoc.
      replace (ri^(Z.of_nat count) * ri) with (ri^(Z.of_nat (Datatypes.S count)))
        by (change (Datatypes.S count) with (1 + count)%nat;
            autorewrite with push_Zof_nat; rewrite Z.pow_add_r by lia; simpl Z.succ; rewrite Z.pow_1_r; nia).
      rewrite <- !Z.add_assoc.
      (* The contribution of the initial accumulator is identical on both sides. *)
      apply Z.add_mod_Proper; [ reflexivity | ].
      unfold Z.equiv_modulo; push_Zmod; rewrite (Z.mul_mod_l (_ mod r) _ (eval N)).
      rewrite fst_redc_loop by (try apply redc_loop_good; auto; lia).
      cbn [fst].
      rewrite Z.mod_pull_div by lia.
      erewrite Z.div_to_inv_modulo;
        [
        | solve [ eassumption | apply Z.lt_gt, Z.pow_pos_nonneg; lia ]
        | erewrite <- Z.pow_mul_l, <- Z.pow_1_l;
          [ apply Z.pow_mod_Proper; [ eassumption | reflexivity ]
          | lia ] ].
      pull_Zmod.
      match goal with
      | [ |- ?x mod ?N = ?y mod ?N ]
        => change (Z.equiv_modulo N x y)
      end.
      (* Collect the powers of [ri] and reorder the products. *)
      repeat first [ rewrite <- !Z.pow_succ_r, <- !Nat2Z.inj_succ by lia
                   | rewrite (Z.mul_comm _ ri)
                   | rewrite (Z.mul_assoc _ ri _)
                   | rewrite (Z.mul_comm _ (ri^_))
                   | rewrite (Z.mul_assoc _ (ri^_) _) ].
      (* Factor out the common terms and merge the two remainders. *)
      repeat first [ rewrite <- Z.mul_assoc
                   | rewrite <- Z.mul_add_distr_l
                   | rewrite (Z.mul_comm _ (eval B))
                   | rewrite !Nat2Z.inj_succ, !Z.pow_succ_r by lia;
                     rewrite <- Znumtheory.Zmod_div_mod by (apply Z.divide_factor_r || Z.zero_bounds)
                   | rewrite Zplus_minus
                   | reflexivity ]. }
  Qed.

  (* Range of the full reduction: for a [small] input [A], the value of
     [redc A] lies in [0, N + B).  Follows from the loop invariant
     [redc_loop_good]; the initial-state side conditions are closed after
     pushing [eval] and replacing [eval N] by the positive literal
     [Z.pos Npos]. *)
  Lemma redc_bound A
        (small_A : small A)
    : 0 <= eval (redc A) < eval N + eval B.
  Proof.
    unfold redc.
    apply redc_loop_good; simpl; autorewrite with push_eval;
      rewrite ?Npos_correct; auto; lia.
  Qed.

  (* Limb count of [redc A] when [B] has at least [R_numlimbs] limbs:
     [S (numlimbs B)] if [A] has no limbs, and [S R_numlimbs] otherwise.
     Direct consequence of [numlimbs_redc_loop] together with
     [numlimbs_zero]. *)
  Lemma numlimbs_redc_gen A (small_A : small A) (Hnumlimbs : (R_numlimbs <= numlimbs B)%nat)
    : numlimbs (redc A)
      = match numlimbs A with
        | O => S (numlimbs B)
        | _ => S R_numlimbs
        end.
  Proof.
    unfold redc; rewrite numlimbs_redc_loop by (cbn [fst snd]; t_small);
      cbn [snd]; rewrite ?numlimbs_zero.
    reflexivity.
  Qed.
  (* Special case of [numlimbs_redc_gen]: when [B] has exactly [R_numlimbs]
     limbs, both branches coincide and [redc A] has [S (numlimbs B)]
     limbs. *)
  Lemma numlimbs_redc A (small_A : small A) (Hnumlimbs : R_numlimbs = numlimbs B)
    : numlimbs (redc A) = S (numlimbs B).
  Proof. rewrite numlimbs_redc_gen; subst; auto; destruct (numlimbs A); reflexivity. Qed.

  (* Main correctness statement: for a [small] input [A] whose value lies in
     [0, r^(numlimbs A)), the result of [redc A] is congruent modulo [N] to
     [A * B * ri^(numlimbs A)].  Obtained from [snd_redc_loop_mod_N]; the
     bound [A_bound] makes the remainder [A mod r^(numlimbs A)] equal to [A]
     itself ([Z.rewrite_mod_small]). *)
  Lemma redc_mod_N A (small_A : small A) (A_bound : 0 <= eval A < r ^ Z.of_nat (numlimbs A))
    : (eval (redc A)) mod (eval N) = (eval A * eval B * ri^(Z.of_nat (numlimbs A))) mod (eval N).
  Proof.
    unfold redc.
    rewrite snd_redc_loop_mod_N; cbn [fst snd];
      autorewrite with push_eval zsimplify;
      [ | rewrite ?Npos_correct; auto; lia.. ].
    Z.rewrite_mod_small.
    reflexivity.
  Qed.
End WordByWordMontgomery.