(* src/Arithmetic/BarrettReduction.v *)

Require Import Coq.ZArith.ZArith Coq.micromega.Lia Crypto.Algebra.Nsatz.
Require Import Coq.derive.Derive.
Require Import Coq.Lists.List.
Require Import Crypto.Algebra.Ring.
Require Import Crypto.Arithmetic.BaseConversion.
Require Import Crypto.Arithmetic.Core.
Require Import Crypto.Arithmetic.Partition.
Require Import Crypto.Arithmetic.Saturated.
Require Import Crypto.Arithmetic.UniformWeight.
Require Import Crypto.Util.Decidable.
Require Import Crypto.Util.LetIn.
Require Import Crypto.Util.ListUtil.
Require Import Crypto.Arithmetic.BarrettReduction.Generalized.
Require Import Crypto.Util.ZUtil.Definitions.
Require Import Crypto.Util.ZUtil.AddModulo.
Require Import Crypto.Util.ZUtil.CC Crypto.Util.ZUtil.Rshi.
Require Import Crypto.Util.ZUtil.Modulo Crypto.Util.ZUtil.Div.
Require Import Crypto.Util.ZUtil.Hints.Core.
Require Import Crypto.Util.ZUtil.Modulo.PullPush.
Require Import Crypto.Util.ZUtil.Tactics.LtbToLt.
Require Import Crypto.Util.ZUtil.Tactics.RewriteModSmall.
Require Import Crypto.Util.ZUtil.Tactics.ZeroBounds.
Require Import Crypto.Util.ZUtil.Tactics.PullPush.Modulo.

Require Import Crypto.Util.Notations.
Import ListNotations. Local Open Scope Z_scope.

Section Generic.
  (* Parameters of the generic reduction: base [b], exponent [k], modulus [M],
     precomputed constant [mu], limb bit-width [width] and limb count [n].
     The hypotheses relate them:
     - [bk_eq]:   [b ^ k] equals [2 ^ (width * n)];
     - [M_range]: [M] lies strictly between [b ^ (k - 1)] and [b ^ k];
     - [mu_eq]:   [mu] is the quotient [b ^ (2 * k) / M];
     - [strong_bound]: an extra bound on [b ^ (2 * k) mod M]; presumably the
       side condition required by [q_nice_strong], which [reduce_correct]
       invokes -- confirm against BarrettReduction.Generalized. *)
  Context (b k M mu width : Z) (n : nat)
          (b_ok : 1 < b)
          (k_pos : 0 < k)
          (bk_eq : b^k = 2^(width * Z.of_nat n))
          (M_range : b ^ (k - 1) < M < b ^ k)
          (mu_eq : mu = b ^ (2 * k) / M)
          (width_pos : 0 < width)
          (strong_bound : b ^ 1 * (b ^ (2 * k) mod M) <= b ^ (k + 1) - mu).
  Local Notation weight := (uweight width).
  Local Notation partition := (partition weight).
  (* Abstract implementations of the three stages, each given only by its
     specification on limb lists produced by [partition]:
     - [q1] maps the [n*2]-limb form of [x] to the [n+1]-limb form of
       [x / b ^ (k - 1)];
     - [q3] maps that (together with [x]) to the [n+1]-limb form of
       [(mu * q1) / b ^ (k + 1)];
     - [r] maps [x] and a quotient estimate [q3] that equals [x / M] or
       [x / M - 1] to the [n]-limb form of [x mod M]. *)
  Context (q1 : list Z -> list Z)
          (q1_correct :
             forall x,
               0 <= x < b ^ (2 * k) ->
               q1 (partition (n*2)%nat x) = partition (n+1)%nat (x / b ^ (k - 1)))
          (q3 : list Z -> list Z -> list Z)
          (q3_correct :
             forall x q1,
               0 <= x < b ^ (2 * k) ->
               q1 = x / b ^ (k - 1) ->
               q3 (partition (n*2) x) (partition (n+1) q1) = partition (n+1) ((mu * q1) / b ^ (k + 1)))
          (r : list Z -> list Z -> list Z)
          (r_correct :
             forall x q3,
               0 <= x < M * b ^ k ->
               0 <= q3 ->
               (exists b : bool, q3 = x / M + (if b then -1 else 0)) ->
               r (partition (n*2) x) (partition (n+1) q3) = partition n (x mod M)).

  (* The value being reduced, [x], with [0 <= x < M * b ^ k], and its
     [n*2]-limb representation [xt]. *)
  Context (x : Z) (x_range : 0 <= x < M * b ^ k)
          (xt : list Z) (xt_correct : xt = partition (n*2) x).

  (* [M] is strictly positive: by [M_range] it exceeds [b ^ (k - 1)], and a
     power of the nonnegative base [b] is itself nonnegative. *)
  Local Lemma M_pos : 0 < M.
  Proof.
    assert (0 <= b ^ (k - 1)) as Hpow_nonneg by (apply Z.pow_nonneg; lia).
    lia.
  Qed.
  (* [M] is below [weight n].  After rewriting [weight n] with
     [uweight_eq_alt'] (presumably into the power-of-two form appearing in
     [bk_eq]), [lia] closes the goal from the context. *)
  Local Lemma M_upper : M < weight n.
  Proof. rewrite uweight_eq_alt'. lia. Qed.
  (* [x < b ^ (2 * k)], via the chain [x < M * b^k <= b^k * b^k = b^(2*k)]. *)
  Local Lemma x_upper : x < b ^ (2 * k).
  Proof.
    assert (0 < b ^ k) by Z.zero_bounds.
    (* first link: [x < M * b^k] comes from [x_range] *)
    apply Z.lt_le_trans with (m:= M * b^k); [ lia | ].
    (* second link: [M * b^k <= b^k * b^k] since [M < b^k] *)
    transitivity (b^k * b^k); [ nia | ].
    (* finally fold [b^k * b^k] into [b^(k*2)] and commute the exponent *)
    rewrite <-Z.pow_2_r, <-Z.pow_mul_r by lia.
    rewrite (Z.mul_comm k 2); reflexivity.
  Qed.
  (* The remainder of [x] modulo [b ^ (k - 1)] is at most [M].  Note that,
     despite the [_lt_] in the name, the statement is a non-strict bound. *)
  Local Lemma xmod_lt_M : x mod b ^ (k - 1) <= M.
  Proof. pose proof (Z.mod_pos_bound x (b ^ (k - 1)) ltac:(Z.zero_bounds)). lia. Qed.
  (* Make the three bounds above available to [auto]/[eauto].  The [core]
     database is named explicitly: it is the database that was used implicitly
     before, and omitting the database name is deprecated. *)
  Local Hint Resolve M_pos x_upper xmod_lt_M : core.

  (* The generic reduction pipeline on limb lists: apply [q1] to [xt], apply
     [q3] to [xt] and that result, then apply [r] to [xt] and the [q3] output.
     Intermediate results are bound with [dlet_nd]. *)
  Definition reduce :=
    dlet_nd q1t := q1 xt in
    dlet_nd q3t := q3 xt q1t in
    r xt q3t.

  (* Bounds on the first quotient: [0 <= x / b^(k-1) < b^(k+1)]. *)
  Lemma q1_range : 0 <= x / b^(k-1) < b^(k+1).
  Proof.
    split; [ solve [Z.zero_bounds] | ].
    assert (0 < b ^ (k - 1)) by Z.zero_bounds.
    assert (0 < b ^ k) by Z.zero_bounds.
    (* upper bound: reduce to [x < b^(k-1) * b^(k+1)] and go through [b^k * b^k] *)
    apply Z.div_lt_upper_bound; [ solve [Z.zero_bounds] | ].
    eapply Z.lt_le_trans with (m:=b^k * b^k);
      [ nia | autorewrite with pull_Zpow; apply Z.pow_le_mono; lia ].
  Qed.

  (* The quotient estimate [mu * (x / b^(k-1)) / b^(k+1)] is nonnegative. *)
  Lemma q3_range : 0 <= mu * (x / b ^ (k - 1)) / b ^ (k + 1).
  Proof.
    assert (0 < b ^ (k - 1)) by Z.zero_bounds.
    (* unfold [mu] via [mu_eq] so the goal only mentions products and quotients *)
    subst mu; Z.zero_bounds.
  Qed.

  (* Correctness of [reduce]: it yields the [n]-limb representation of
     [x mod M]. *)
  Lemma reduce_correct : reduce = partition n (x mod M).
  Proof.
    cbv [reduce Let_In]. pose proof q3_range.
    (* replace [xt] and the [q1]/[q3] stages by their specifications *)
    rewrite xt_correct, q1_correct, q3_correct by auto with lia.
    (* the quotient estimate is [x / M] or [x / M - 1]; this comes from
       [q_nice_strong] instantiated with [offset := 1] *)
    assert (exists cond : bool, ((mu * (x / b^(k-1))) / b^(k+1)) = x / M + (if cond then -1 else 0)) as Hq3.
    { destruct q_nice_strong with (b:=b) (k:=k) (m:=mu) (offset:=1) (a:=x) (n:=M) as [cond Hcond];
        eauto using Z.lt_gt with zarith. }
    (* which is exactly the precondition of [r_correct] *)
    eauto using r_correct with lia.
  Qed.
End Generic.

(* Non-standard implementation -- uses specialized instructions and b=2 *)
Module Fancy.
  Section Fancy.
    (* [M]: modulus; [mu]: precomputed constant (see [mu_eq] below);
       [width]: limb bit-width; [sz]: nonzero number of limbs;
       [k]: equal to [width * sz] by [k_eq]. *)
    Context (M mu width k : Z)
            (sz : nat) (sz_nz : sz <> 0%nat)
            (width_ok : 1 < width)
            (k_pos : 0 < k) (* this can be inferred from other arguments but is easier to put here for tactics *)
            (k_eq : k = width * Z.of_nat sz).
    (* sz = 1, width = k = 256 *)
    Local Notation w := (uweight width). Local Notation eval := (Positional.eval w).
    (* [mut] and [Mt] are the limb representations of [mu] ([sz+1] limbs) and
       [M] ([sz] limbs). *)
    Context (mut Mt : list Z) (mut_correct : mut = partition w (sz+1) mu) (Mt_correct : Mt = partition w sz M).
    (* [mu] is [2^(2k) / M]; its part above the low [sz] limbs is exactly 1;
       [M] lies strictly between [2^(k-1)] and [2^k]. *)
    Context (mu_eq : mu = 2 ^ (2 * k) / M) (muHigh_one : mu / w sz = 1) (M_range : 2^(k-1) < M < 2^k).

    (* The uniform weight function [w] satisfies [weight_properties]. *)
    Local Lemma wprops : @weight_properties w. Proof. apply uwprops; auto with lia. Qed.
    (* Expose [wprops] to [auto]; the [core] database (previously implicit) is
       named explicitly because omitting the database is deprecated. *)
    Local Hint Resolve wprops : core.
    (* Let [autorewrite with pull_partition] replace [mut]/[Mt] by the
       partitions they stand for. *)
    Hint Rewrite mut_correct Mt_correct : pull_partition.

    (* The weight of [sz] limbs is [2^k]. *)
    Lemma w_eq_2k : w sz = 2^k. Proof. rewrite uweight_eq_alt' by auto. congruence. Qed.
    (* [mu] lies in [[2^k, 2^(k+1))]: a consequence of [mu = 2^(2k) / M] and
       [2^(k-1) < M < 2^k]. *)
    Lemma mu_range : 2^k <= mu < 2^(k+1).
    Proof.
      rewrite mu_eq. assert (0 < 2^(k-1)) by Z.zero_bounds.
      (* [2^k = 2 * 2^(k-1) < 2 * M] *)
      assert (2^k < M * 2).
      { replace (2^k) with (2^(k-1+1)) by (f_equal; lia).
        rewrite Z.pow_add_r, Z.pow_1_r by lia.
        lia. }
      (* expose [2^(2k)] as [2^k * 2^k] so both bounds become products *)
      replace (2 ^ (2 * k)) with (2^(k+k)) by (f_equal; lia).
      rewrite !Z.pow_add_r, Z.pow_1_r by lia. split.
      { apply Z.div_le_lower_bound; nia. }
      { apply Z.div_lt_upper_bound; nia. }
    Qed.
    (* [mu_range] restated in terms of the weight function:
       [0 <= mu < 2 * w sz]. *)
    Lemma mu_range' : 0 <= mu < 2 * w sz.
    Proof.
      pose proof mu_range. assert (0 < 2^k) by auto with zarith.
      (* it suffices to identify [2^(k+1)] with [2 * w sz] *)
      assert (2^(k+1) = 2 * w sz); [ | lia].
      rewrite k_eq, uweight_eq_alt'.
      rewrite Z.pow_add_r, Z.pow_1_r by lia. lia.
    Qed.
    (* [M_range] restated against the weight function: [M] is nonnegative and
       below [w sz].  This is the shape wanted by mod_small-style rewrites. *)
    Lemma M_range' : 0 <= M < w sz. (* more convenient form, especially for mod_small *)
    Proof.
      pose proof w_eq_2k as Hw.
      assert (0 <= 2 ^ (k - 1)) as Hlow by (apply Z.pow_nonneg; lia).
      lia.
    Qed.

    (* Produce [m] output limbs; output limb [i] is
       [Z.rshi (2^width) t[i+1] t[i] n], where out-of-range entries of [t]
       read as 0.  [shiftr'_correct] shows this is a right shift by [n] bits
       when [0 <= n < width]. *)
    Definition shiftr' (m : nat) (t : list Z) (n : Z) : list Z :=
      map (fun i => Z.rshi (2^width) (nth_default 0 t (S i)) (nth_default 0 t i) n) (seq 0 m).

    (* Right shift by an arbitrary nonnegative amount [n], producing [m] limbs:
       when [n] is at least one limb wide, first drop [n / width] whole limbs
       and then shift by the remaining [n mod width] bits with [shiftr']. *)
    Definition shiftr (m : nat) (t : list Z) (n : Z) : list Z :=
      (* if width <= n, drop limbs first *)
      if dec (width <= n)
      then shiftr' m (skipn (Z.to_nat (n / width)) t) (n mod width)
      else shiftr' m t n.

    (* Double-width ([sz*2] limbs) addition and subtraction: the first
       component of [Rows.add] / [Rows.sub]; the second component is discarded. *)
    Definition wideadd t1 t2 := fst (Rows.add w (sz*2) t1 t2).
    Definition widesub t1 t2 := fst (Rows.sub w (sz*2) t1 t2).
    (* Multiplication of two [sz]-limb values; [widemul_correct] shows the
       result is the [sz*2]-limb product. *)
    Definition widemul := BaseConversion.widemul_inlined width sz 2.
    (* widemul_inlined takes the following argument order : (width of limbs in input) (# limbs in input) (# parts to split each limb into before multiplying) *)

    (* Pad [a] on the right with zero limbs up to length [n]; since the
       subtraction is on [nat], nothing is appended when [a] is already at
       least [n] long. *)
    Definition fill (n : nat) (a : list Z) := a ++ Positional.zeros (n - length a).
    (* [low] keeps the first [sz] limbs of a list; [high] drops them and keeps
       the rest. *)
    Definition low : list Z -> list Z := firstn sz. 
    Definition high : list Z -> list Z := skipn sz.
    (* Multiply the low halves of [a] and [b], add [high a0b0 ++ high b] to the
       zero-padded [low b], then add the caller-supplied term [a0b1].
       [mul_high_correct] shows that, when [a / w sz = 1] and
       [a0b1 = a mod w sz * (b / w sz)], the result represents [a * b / w sz]. *)
    Definition mul_high (a b : list Z) a0b1 : list Z :=
      dlet_nd a0b0 := widemul (low a) (low b) in
      dlet_nd ab := wideadd (high a0b0 ++ high b) (fill (sz*2) (low b)) in
      wideadd ab a0b1.

    (* select based on the most significant bit of xHigh *)
    (* [xHigh] is the last of the [sz*2] limbs of [xt]; the result is either
       [sz] zero limbs or the low [sz] limbs of [mut], chosen by
       [Z.cc_m (2 ^ width) xHigh]. *)
    Definition muSelect xt :=
      let xHigh := nth_default 0 xt (sz*2 - 1) in
      Positional.select (Z.cc_m (2 ^ width) xHigh) (Positional.zeros sz) (low mut).

    (* Subtract [y] from the low [sz] limbs of [a], or subtract zero, depending
       on [Z.cc_l] of the first limb of [high a]; only the first component of
       [Rows.sub] is returned.  See [cond_sub_correct] for the exact statement. *)
    Definition cond_sub (a y : list Z) : list Z :=
      let cond := Z.cc_l (nth_default 0 (high a) 0) in (* a[k] = least significant bit of (high a) *)
      dlet_nd maybe_y := Positional.select cond (Positional.zeros sz) y in
      dlet_nd diff := Rows.sub w sz (low a) maybe_y in (* (a mod (w sz) - y) mod (w sz)) = (a - y) mod (w sz); since we know a - y is < w sz this is okay by mod_small *)
      fst diff.

    (* Conditional subtraction of the modulus.  For a single limb this is one
       [Z.add_modulo] applied to limb 0, the constant 0 and [M]; otherwise it
       falls back to [Rows.conditional_sub] against [Mt]. *)
    Definition cond_subM x :=
      if Nat.eq_dec sz 1
      then [Z.add_modulo (nth_default 0 x 0) 0 M] (* use the special instruction if we can *)
      else Rows.conditional_sub w sz x Mt.

    (* First quotient: [xt] shifted right by [k - 1] bits, kept in [sz+1] limbs. *)
    Definition q1 (xt : list Z) := shiftr (sz+1) xt (k - 1).

    (* Quotient estimate: [mul_high] of the zero-padded [mut] and [q1t], with
       the zero-padded [muSelect xt] as the extra term, shifted right by one
       bit and kept in [sz+1] limbs. *)
    Definition q3 (xt q1t : list Z) :=
      dlet_nd muSelect := muSelect xt in (* make sure muSelect is not inlined in the output *)
      dlet_nd twoq := mul_high (fill (sz*2) mut) (fill (sz*2) q1t) (fill (sz*2) muSelect) in
      shiftr (sz+1) twoq 1.

    (* Remainder stage: multiply the low limbs of [q3t] by [Mt], subtract the
       product from [xt], then apply [cond_sub] with [Mt] followed by
       [cond_subM]. *)
    Definition r (xt q3t : list Z) :=
      dlet_nd r2 := widemul (low q3t) Mt in
      dlet_nd rt := widesub xt r2 in
      dlet_nd rt := cond_sub rt Mt in
      cond_subM rt.

    Section Proofs.
      (* [shiftr'] on the [tn]-limb representation of [t] yields the [m]-limb
         representation of [t / 2^n], provided [m <= tn], [0 <= t < w tn] and
         the shift amount satisfies [0 <= n < width].
         The [nat] side conditions are discharged with [lia]; the deprecated
         [omega] tactic (removed from recent Coq releases) is no longer used. *)
      Lemma shiftr'_correct m n :
        forall t tn,
          (m <= tn)%nat -> 0 <= t < w tn -> 0 <= n < width ->
          shiftr' m (partition w tn t) n = partition w m (t / 2 ^ n).
      Proof.
        (* induction on the number of output limbs *)
        cbv [shiftr']. induction m; intros; [ reflexivity | ].
        rewrite !partition_step, seq_snoc.
        autorewrite with distr_length natsimplify push_map push_nth_default.
        rewrite IHm, Z.rshi_correct, uweight_S by auto with zarith.
        rewrite <-Z.mod_pull_div by auto with zarith.
        (* the last output limb either reads past the end of the input
           ([S m = tn]) or combines two genuine input limbs *)
        destruct (Nat.eq_dec (S m) tn); [subst tn | ]; rewrite !nth_default_partition by lia.
        { rewrite nth_default_out_of_bounds by distr_length.
          autorewrite with zsimplify. Z.rewrite_mod_small.
          rewrite Z.div_div_comm by auto with zarith; reflexivity. }
        { repeat match goal with
                 | _ => rewrite uweight_pull_mod by auto with zarith
                 | _ => rewrite Z.mod_mod_small by auto with zarith
                 | _ => rewrite <-Znumtheory.Zmod_div_mod by (Z.zero_bounds; auto with zarith)
                 | _ => rewrite uweight_eq_alt with (n:=1%nat) by auto with zarith
                 | |- context [(t / w (S m)) mod 2^width * 2^width] =>
                   replace (t / w (S m)) with (t / w m / 2^width) by
                       (rewrite uweight_S, Z.div_div by auto with zarith; f_equal; lia);
                     rewrite Z.mod_pull_div with (b:=2^width) by auto with zarith;
                     rewrite Z.mul_div_eq' by auto with zarith
                 | _ => progress autorewrite with natsimplify zsimplify_fast zsimplify
                 end.
          replace (2^width*2^width) with (2^width*2^(width-n)*2^n) by (autorewrite with pull_Zpow; f_equal; lia).
          rewrite <-Z.mod_pull_div, <-Znumtheory.Zmod_div_mod by (Z.zero_bounds; auto with zarith).
          rewrite Z.div_div_comm by Z.zero_bounds. reflexivity. }
      Qed.
      (* [shiftr] on the [tn]-limb representation of [t] yields the [m]-limb
         representation of [t / 2^n] for any [0 <= n], provided enough limbs
         remain after dropping [n / width] of them.
         The positivity side condition is discharged with [lia]; the deprecated
         [omega] tactic (removed from recent Coq releases) is no longer used. *)
      Lemma shiftr_correct m n :
        forall t tn,
          (Z.to_nat (n / width) <= tn)%nat ->
          (m <= tn - Z.to_nat (n / width))%nat -> 0 <= t < w tn -> 0 <= n ->
          shiftr m (partition w tn t) n = partition w m (t / 2 ^ n).
      Proof.
        cbv [shiftr]; intros.
        (* the [n < width] branch is [shiftr'_correct] directly *)
        break_innermost_match; [ | solve [auto using shiftr'_correct with zarith] ].
        pose proof (Z.mod_pos_bound n width ltac:(lia)).
        (* after dropping whole limbs the remaining value still fits *)
        assert (t / 2 ^ (n - n mod width) < w (tn - Z.to_nat (n / width))).
        { apply Z.div_lt_upper_bound; [solve [Z.zero_bounds] | ].
          rewrite uweight_eq_alt' in *.
          rewrite <-Z.pow_add_r, Nat2Z.inj_sub, Z2Nat.id, <-Z.mul_div_eq by auto with zarith.
          autorewrite with push_Zmul zsimplify. auto with zarith. }
        repeat match goal with
               | _ => progress rewrite ?uweight_skipn_partition, ?uweight_eq_alt' by auto with lia
               | _ => rewrite Z2Nat.id by Z.zero_bounds
               | _ => rewrite Z.mul_div_eq_full by auto with zarith
               | _ => rewrite shiftr'_correct by auto with zarith
               | _ => progress rewrite ?Z.div_div, <-?Z.pow_add_r by auto with zarith
               end.
        autorewrite with zsimplify. reflexivity.
      Qed.
      (* let [autorewrite with pull_partition] use [shiftr_correct] whenever
         its side conditions are solved by [auto with lia] *)
      Hint Rewrite shiftr_correct using (solve [auto with lia]) : pull_partition.

      (* values below 2 ^ (k + 1) fit in sz + 1 limbs because values below 2^k fit in sz limbs and 1 <= width *)
      (* Specification of [q1]: on the [sz*2]-limb representation of [x] it
         returns the [sz+1]-limb representation of [x / 2 ^ (k - 1)]. *)
      Lemma q1_correct x :
        0 <= x < w (sz * 2) ->
        q1 (partition w (sz*2)%nat x) = partition w (sz+1)%nat (x / 2 ^ (k - 1)).
      Proof.
        cbv [q1]; intros. assert (1 <= Z.of_nat sz) by (destruct sz; lia).
        (* fewer than [sz] whole limbs are dropped by a shift of [k - 1] bits *)
        assert (Z.to_nat ((k-1) / width) < sz)%nat. {
          subst k. rewrite <-Z.add_opp_r. autorewrite with zsimplify.
          apply Nat2Z.inj_lt. rewrite Z2Nat.id by lia. lia. }
        assert (0 <= k - 1) by nia.
        (* [shiftr_correct] is applied through the [pull_partition] database *)
        autorewrite with pull_partition. reflexivity.
      Qed.

      (* [low] of an [n]-limb representation (with [sz <= n]) is the [sz]-limb
         representation of the same value. *)
      Lemma low_correct n a : (sz <= n)%nat -> low (partition w n a) = partition w sz a.
      Proof. cbv [low]; auto using uweight_firstn_partition with lia. Qed.
      (* [high] of a [sz*2]-limb representation is the [sz]-limb representation
         of the value divided by [w sz]. *)
      Lemma high_correct a : high (partition w (sz*2) a) = partition w sz (a / w sz).
      Proof. cbv [high]. rewrite uweight_skipn_partition by lia. f_equal; lia. Qed.
      (* Padding an [n]-limb representation of [a] up to [m >= n] limbs gives
         the [m]-limb representation of [a mod w n]. *)
      Lemma fill_correct n m a :
        (n <= m)%nat ->
        fill m (partition w n a) = partition w m (a mod w n).
      Proof.
        cbv [fill]; intros. distr_length.
        (* view the appended zeros as a partition of 0, then merge the two
           partitions *)
        rewrite <-partition_0 with (weight:=w).
        rewrite uweight_partition_app by lia.
        f_equal; lia.
      Qed.
      (* make the three lemmas above available to [autorewrite with pull_partition] *)
      Hint Rewrite low_correct high_correct fill_correct using lia : pull_partition.

      (* [wideadd] on two [sz*2]-limb representations gives the [sz*2]-limb
         representation of the sum. *)
      Lemma wideadd_correct a b :
        wideadd (partition w (sz*2) a) (partition w (sz*2) b) = partition w (sz*2) (a + b).
      Proof.
        cbv [wideadd]. rewrite Rows.add_partitions by (distr_length; auto).
        autorewrite with push_eval.
        apply partition_eq_mod; auto with zarith.
      Qed.
      (* [widesub] on two [sz*2]-limb representations gives the [sz*2]-limb
         representation of the difference. *)
      Lemma widesub_correct a b :
        widesub (partition w (sz*2) a) (partition w (sz*2) b) = partition w (sz*2) (a - b).
      Proof.
        cbv [widesub]. rewrite Rows.sub_partitions by (distr_length; auto).
        autorewrite with push_eval.
        apply partition_eq_mod; auto with zarith.
      Qed.
      (* [widemul] on two [sz]-limb representations gives the [sz*2]-limb
         representation of the product of the values reduced modulo [w sz]. *)
      Lemma widemul_correct a b :
        widemul (partition w sz a) (partition w sz b) = partition w (sz*2) ((a mod w sz) * (b mod w sz)).
      Proof.
        cbv [widemul]. rewrite BaseConversion.widemul_inlined_correct; (distr_length; auto).
        autorewrite with push_eval. reflexivity.
      Qed.
      (* make the wide-arithmetic lemmas available to [autorewrite with pull_partition] *)
      Hint Rewrite widemul_correct widesub_correct wideadd_correct using lia : pull_partition.

      (* Arithmetic identity behind [mul_high]: writing [a = d * a1 + a0] and
         [b = d * b1 + b0] with [d <> 0], the quotient [(a * b) / d] splits
         into [a0 * b0 / d] plus the three cross terms.
         Side conditions are discharged with [lia]; the deprecated [omega]
         tactic (removed from recent Coq releases) is no longer used. *)
      Lemma mul_high_idea d a b a0 a1 b0 b1 :
        d <> 0 ->
        a = d * a1 + a0 ->
        b = d * b1 + b0 ->
        (a * b) / d = a0 * b0 / d + d * a1 * b1 + a1 * b0 + a0 * b1.
      Proof.
        intros. subst a b. autorewrite with push_Zmul.
        ring_simplify_subterms. rewrite Z.pow_2_r.
        rewrite Z.div_add_exact by (push_Zmod; autorewrite with zsimplify; lia).
        (* move the factor [d] to the right of each product so that
           [Z.div_add] can cancel it *)
        repeat match goal with
               | |- context [d * ?a * ?b * ?c] =>
                 replace (d * a * b * c) with (a * b * c * d) by ring
               | |- context [d * ?a * ?b] =>
                 replace (d * a * b) with (a * b * d) by ring
               end.
        rewrite !Z.div_add by lia.
        autorewrite with zsimplify.
        rewrite (Z.mul_comm a0 b0).
        ring_simplify. ring.
      Qed.

      (* Specification of [mul_high]: when the high part of [a] is exactly 1
         ([Ha]) and [a0b1] is the cross term [a mod w sz * (b / w sz)], the
         result is the [sz*2]-limb representation of [a * b / w sz]. *)
      Lemma mul_high_correct a b
            (Ha : a / w sz = 1)
            a0b1 (Ha0b1 : a0b1 = a mod w sz * (b / w sz)) :
        mul_high (partition w (sz*2) a) (partition w (sz*2) b) (partition w (sz*2) a0b1) =
        partition w (sz*2) (a * b / w sz).
      Proof.
        cbv [mul_high Let_In].
        (* split [a * b / w sz] with [d := w sz], using the division theorem
           for both decompositions *)
        erewrite mul_high_idea by auto using Z.div_mod with zarith.
        repeat match goal with
               | _ => progress autorewrite with pull_partition
               | _ => progress rewrite ?Ha, ?Ha0b1
               | _ => rewrite uweight_partition_app by lia;
                        replace (sz+sz)%nat with (sz*2)%nat by lia
               | _ => rewrite Z.mod_pull_div by auto with zarith
               | _ => progress Z.rewrite_mod_small
               | _ => f_equal; ring
               end.
      Qed.

      (* Two rewrite databases built from the same lemmas: [weight_to_pow]
         uses [uweight_S] left-to-right, [pow_to_weight] uses it right-to-left. *)
      Hint Rewrite uweight_S uweight_eq_alt' using lia : weight_to_pow.
      Hint Rewrite <-uweight_S uweight_eq_alt' using lia : pow_to_weight.

      (* Bounds on the first quotient for a [sz*2]-limb input:
         [0 <= x / 2^(k-1) < 2 * w sz]. *)
      Lemma q1_range x :
        0 <= x < w (sz * 2) ->
        0 <= x / 2 ^ (k-1) < 2 * w sz.
      Proof.
        intros; split; [ solve [Z.zero_bounds] | ].
        apply Z.div_lt_upper_bound; [ solve [Z.zero_bounds] | ].
        (* it suffices to compare the weights as powers of two *)
        assert (w (sz * 2) <= 2 ^ (k-1) * (2 * w sz)); [ | lia ].
        autorewrite with weight_to_pow pull_Zpow.
        apply Z.pow_le_mono_r; lia.
      Qed.

      (* Specification of [muSelect]: on the [sz*2]-limb representation of [x]
         it returns the [sz]-limb representation of
         [mu mod (w sz) * (x / 2^(k-1) / w sz)]. *)
      Lemma muSelect_correct x :
        0 <= x < w (sz * 2) ->
        muSelect (partition w (sz*2) x) = partition w sz (mu mod (w sz) * (x / 2 ^ (k - 1) / (w sz))).
      Proof.
        cbv [muSelect]; intros;
          repeat match goal with
                 | _ => progress autorewrite with pull_partition natsimplify
                 | _ => progress rewrite ?Z.cc_m_eq by auto with zarith
                 | _ => erewrite Positional.select_eq by (distr_length; eauto)
                 | _ => rewrite nth_default_partition by lia
                 | _ => progress replace (S (sz * 2 - 1)) with (sz * 2)%nat by lia
                 | H : 0 <= ?x < ?m |- context [?x mod ?m] => rewrite (Z.mod_small x m) by auto with zarith
                 end.
        (* express the selector in terms of [x / 2^(k-1) / w sz] *)
        replace (x / (w (sz * 2 - 1)) / (2 ^ width / 2)) with (x / (2 ^ (k - 1)) / w sz) by
            (autorewrite with weight_to_pow pull_Zpow;
             rewrite !Z.div_div, <-!Z.pow_add_r by (Core.zutil_arith || Z.zero_bounds); do 2 f_equal; nia).
        (* by [q1_range] that quotient is 0 or 1; split on the two cases *)
        rewrite Z.div_between_0_if with (a:=x / 2^(k-1)) by (Z.zero_bounds; auto using q1_range).
        break_innermost_match; try lia; autorewrite with zsimplify_fast; [ | ].
        { apply partition_eq_mod; auto with zarith. }
        { rewrite partition_0; reflexivity. }
      Qed.
      Hint Rewrite muSelect_correct using lia : pull_partition.

      (* The product [mu * (x / 2^(k-1))] stays below [w sz * w (sz * 2)]. *)
      Lemma mu_q1_range x (Hx : 0 <= x < w (sz * 2)) : mu * (x / 2^(k-1)) < w sz * w (sz * 2).
      Proof.
        pose proof mu_range'. pose proof q1_range x ltac:(lia).
        replace (w (sz * 2)) with (w sz * w sz) by
            (autorewrite with weight_to_pow pull_Zpow; f_equal; lia).
        (* both factors are below [2 * w sz]; the bound then needs [4 <= w sz] *)
        apply Z.lt_le_trans with (m:= 2 * w sz * (2 * w sz)); [ nia | ].
        assert (4 <= w sz); [ | nia ]. change 4 with (Z.pow 2 2).
        autorewrite with weight_to_pow. apply Z.pow_le_mono_r; nia.
      Qed.

      (* Specification of [q3]: given the representations of [x] and of
         [q1 = x / 2^(k-1)], it returns the [sz+1]-limb representation of
         [(mu * q1) / 2^(k+1)]. *)
      Lemma q3_correct x (Hx : 0 <= x < w (sz * 2)) q1 (Hq1 : q1 = x / 2 ^ (k - 1)) :
        q3 (partition w (sz*2) x) (partition w (sz+1) q1) = partition w (sz+1) ((mu*q1) / 2 ^ (k + 1)).
      Proof.
        cbv [q3 Let_In]. intros. pose proof mu_q1_range x ltac:(lia).
        pose proof mu_range'. pose proof q1_range x ltac:(lia).
        autorewrite with pull_partition pull_Zmod.
        assert (2 * w sz < w (sz + 1)) by (autorewrite with weight_to_pow pull_Zpow; auto with zarith lia).
        Z.rewrite_mod_small. rewrite <-Hq1 in *.
        (* the multiplication step, with [muHigh_one]-style side conditions *)
        rewrite mul_high_correct by
            (try lia; rewrite Z.div_between_0_if with (a:=q1) by lia;
             break_innermost_match; autorewrite with zsimplify; reflexivity).
        (* the final one-bit shift *)
        rewrite shiftr_correct by (rewrite ?Z.div_small, ?Z2Nat.inj_0 by lia; auto with zarith lia).
        autorewrite with weight_to_pow pull_Zpow pull_Zdiv.
        rewrite !Z.div_div, <-!Z.pow_add_r by (Core.zutil_arith || Z.zero_bounds).
        congruence.
      Qed.

      (* Specification of [cond_sub]: the result is the [sz]-limb
         representation of [a] when [(a / w sz) mod 2 = 0], and of [a - b]
         otherwise. *)
      Lemma cond_sub_correct a b :
        cond_sub (partition w (sz*2) a) (partition w sz b)
        = partition w sz (if dec ((a / w sz) mod 2 = 0)
                          then a
                          else a - b).
      Proof.
        intros; cbv [cond_sub Let_In Z.cc_l]. autorewrite with pull_partition.
        rewrite nth_default_partition by lia.
        rewrite weight_0 by auto. autorewrite with zsimplify_fast.
        rewrite uweight_eq_alt' with (n:=1%nat). autorewrite with push_Zof_nat zsimplify.
        rewrite <-Znumtheory.Zmod_div_mod by auto using Zpow_facts.Zpower_divide with zarith.
        rewrite Positional.select_eq with (n:=sz) by (distr_length; apply w).
        rewrite Rows.sub_partitions by (break_innermost_match; distr_length; auto).
        (* both branches reduce to equality of values modulo [w sz] *)
        break_innermost_match; autorewrite with push_eval zsimplify_fast;
          apply partition_eq_mod; auto with zarith.
      Qed.
      Hint Rewrite cond_sub_correct : pull_partition.
      (* Specification of [cond_subM]: the result is the [sz]-limb
         representation of [a] when [a mod w sz < M], and of [a - M] otherwise.
         The proof covers both the single-limb [Z.add_modulo] branch and the
         [Rows.conditional_sub] branch of the definition. *)
      Lemma cond_subM_correct a :
        cond_subM (partition w sz a)
        = partition w sz (if dec (a mod w sz < M)
                          then a
                          else a - M).
      Proof.
        cbv [cond_subM]. autorewrite with pull_partition. pose proof M_range'.
        rewrite Rows.conditional_sub_partitions by
            (distr_length; auto; autorewrite with push_eval; try apply partition_eq_mod; auto with zarith).
        rewrite nth_default_partition, weight_0, Z.add_modulo_correct by auto with lia.
        autorewrite with zsimplify_fast push_eval. Z.rewrite_mod_small.
        pose proof Z.mod_pos_bound a (w 1) ltac:(auto).
        break_innermost_match; Z.ltb_to_lt;
          repeat match goal with
                 | _ => lia 
                 | _ => reflexivity
                 | _ => apply partition_eq_mod; solve [auto with zarith]
                 | _ => rewrite partition_step, weight_0 by auto
                 | _ => progress autorewrite with zsimplify_fast
                 | _ => progress Z.rewrite_mod_small
                 | _ => rewrite Z.sub_mod_l with (a:=a)
                 end.
      Qed.
      Hint Rewrite cond_subM_correct : pull_partition.

      (* The weight of [sz*2] limbs is [2^(2k)]. *)
      Lemma w_eq_22k : w (sz * 2) = 2 ^ (2 * k).
      Proof.
        (* split the index as [sz + sz] and use [w_eq_2k] on each half *)
        replace (sz * 2)%nat with (sz + sz)%nat by lia.
        rewrite uweight_sum_indices, w_eq_2k, <-Z.pow_add_r by lia.
        f_equal; lia.
      Qed.

      (* Arithmetic fact behind [r]: if [q3] is [x / M] or [x / M - 1]
         (selected by the boolean [b]), then [x - (q3 mod w sz) * M] is
         [x mod M], plus one extra [M] in the second case. *)
      Lemma r_idea x q3 (b:bool) :
        0 <= x < M * 2 ^ k ->
        0 <= q3 ->
        q3 = x / M + (if b then -1 else 0) ->
        x - q3 mod w sz * M = x mod M + (if b then M else 0). 
      Proof.
        intros. assert (0 < 2^(k-1)) by Z.zero_bounds.
        (* [q3] fits in [sz] limbs, so the [mod w sz] disappears *)
        assert (q3 < w sz).
        { apply Z.le_lt_trans with (m:=x/M); [ subst q3; break_innermost_match; lia | ].
          autorewrite with weight_to_pow. rewrite <-k_eq. auto with zarith. }
        Z.rewrite_mod_small.
        repeat match goal with
               | _ => progress autorewrite with push_Zmul
               | H : q3 = ?e |- _ => progress replace (q3 * M) with (e * M)  by (rewrite H; reflexivity)
               | _ => rewrite (Z.mul_div_eq' x M) by lia
               end.
        break_innermost_match; Z.ltb_to_lt; lia.
      Qed.

      (* Specification of [r]: given the representations of [x] and of a
         nonnegative quotient estimate [q3] equal to [x / M] or [x / M - 1],
         it returns the [sz]-limb representation of [x mod M]. *)
      Lemma r_correct x q3 :
        0 <= x < M * 2 ^ k ->
        0 <= q3 ->
        (exists b : bool, q3 = x / M + (if b then -1 else 0)) ->
        r (partition w (sz*2) x) (partition w (sz+1) q3) = partition w sz (x mod M).
      Proof.
        intros; cbv [r Let_In]. pose proof M_range'. assert (0 < 2^(k-1)) by Z.zero_bounds.
        autorewrite with pull_partition. Z.rewrite_mod_small.
        (* name the boolean witness and rewrite the subtraction with [r_idea] *)
        match goal with H : exists _, q3 = _ |- _ => destruct H end.
        erewrite r_idea by eassumption.
        pose proof (Z.mod_pos_bound x M ltac:(lia)).
        rewrite Z.div_between_0_if with (b:=w sz) by (break_innermost_match; auto with zarith).
        rewrite Z.mod_small with (b:=2) by (break_innermost_match; lia).
        break_innermost_match; Z.ltb_to_lt; try lia; autorewrite with zsimplify_fast;
          repeat match goal with
                 | |- exists e, _ /\ _ /\ ?f ?x = ?f e => exists x; split; [ | split ]
                 | _ => rewrite Z.mod_small in * by lia
                 | _ => progress Z.rewrite_mod_small
                 | _ => progress (push_Zmod; pull_Zmod); autorewrite with zsimplify_fast
                 | _ => lia 
                 | _ => reflexivity
                 end.
      Qed.
    End Proofs.

    Section Def.
      Context (sz_eq_1 : sz = 1%nat). (* this is needed to get rid of branches in the templates; a different definition would be needed for sizes other than 1, but would be able to use the same proofs. *)
      (* Let [eauto] use the three stage specifications; the [core] database
         (previously implicit) is named explicitly because omitting the
         database is deprecated. *)
      Local Hint Resolve q1_correct q3_correct r_correct : core.

      (* muselect relies on an initially-set flag, so pull it out of q3 *)
      (* Same computation as [q1], then [q3], then [r], except that
         [muSelect xt] is bound first and the body of [q3] is written out
         inline. *)
      Definition fancy_reduce_muSelect_first xt :=
        dlet_nd muSelect := muSelect xt in
        dlet_nd q1t := q1 xt in
        dlet_nd twoq := mul_high (fill (sz * 2) mut) (fill (sz * 2) q1t) (fill (sz * 2) muSelect) in
        dlet_nd q3t := shiftr (sz+1) twoq 1 in
        r xt q3t.

      (* Correctness of [fancy_reduce_muSelect_first]: for [0 <= x < M * 2^k]
         and under the stated bound on [2^(2k) mod M], it returns the
         [sz]-limb representation of [x mod M].  Proved by instantiating the
         generic [reduce_correct] with [b := 2].
         NOTE(review): [q3'_correct] is not defined in the visible part of
         this file; the rewrite is guarded by [try], so it may simply never
         fire -- confirm whether it is still needed. *)
      Lemma fancy_reduce_muSelect_first_correct x :
        0 <= x < M * 2^k ->
        2 * (2 ^ (2 * k) mod M) <= 2 ^ (k + 1) - mu ->
        fancy_reduce_muSelect_first (partition w (sz*2) x) = partition w sz (x mod M).
      Proof.
        intros. pose proof w_eq_22k.
        erewrite <-reduce_correct with (b:=2) (k:=k) (mu:=mu) by
            (eauto with nia; intros; try rewrite q3'_correct; try rewrite <-k_eq; eauto with nia ).
        reflexivity.
      Qed.

      (* [fancy_reduce'] is not written by hand: [Derive] introduces it as an
         unknown and the proof below fixes it to the unfolded form of
         [fancy_reduce_muSelect_first], with impossible branches removed,
         while proving the same specification. *)
      Derive fancy_reduce'
             SuchThat (
               forall x,
                 0 <= x < M * 2^k ->
                 2 * (2 ^ (2 * k) mod M) <= 2 ^ (k + 1) - mu ->
                 fancy_reduce' (partition w (sz*2) x) = partition w sz (x mod M))
             As fancy_reduce'_correct.
      Proof.
        intros. assert (k = width) as width_eq_k by nia.
        erewrite <-fancy_reduce_muSelect_first_correct by nia.
        cbv [fancy_reduce_muSelect_first q1 q3 shiftr r cond_subM].
        (* discard the branches that contradict the context *)
        break_match; try solve [exfalso; lia].
        (* abstract the right-hand side over the input and use the resulting
           function as the definition of [fancy_reduce'] *)
        match goal with |- ?g ?x = ?rhs =>
                        let f := (match (eval pattern x in rhs) with ?f _ => f end) in
                        assert (f = g); subst fancy_reduce'; reflexivity
        end.
      Qed.

      (* Two-argument entry point: run [fancy_reduce'] on the two-limb list
         [[xLow; xHigh]] and return the first limb of the result (0 if the
         result is empty). *)
      Definition fancy_reduce xLow xHigh := hd 0 (fancy_reduce' [xLow;xHigh]).

      (* The two-limb representation of [xLow + 2^k * xHigh] is exactly
         [[xLow; xHigh]] when [0 <= xLow < 2^k] and [0 <= xHigh < M]. *)
      Lemma partition_2 xLow xHigh :
        0 <= xLow < 2 ^ k ->
        0 <= xHigh < M ->
        partition w 2 (xLow + 2^k * xHigh) = [xLow;xHigh].
      Proof.
        (* with [sz = 1] we have [k = width]; state everything in terms of [width] *)
        replace k with width in M_range |- * by nia; intros. cbv [partition map seq].
        rewrite !uweight_S, !weight_0 by auto with zarith lia.
        autorewrite with zsimplify.
        rewrite <-Z.mod_pull_div by Z.zero_bounds.
        autorewrite with zsimplify. reflexivity.
      Qed.

      (* Correctness of [fancy_reduce]: for [0 <= xLow < 2^k],
         [0 <= xHigh < M] and under the stated bound on [2^(2k) mod M], the
         result equals [(xLow + 2^k * xHigh) mod M]. *)
      Lemma fancy_reduce_correct xLow xHigh :
        0 <= xLow < 2 ^ k ->
        0 <= xHigh < M ->
        2 * (2 ^ (2 * k) mod M) <= 2 ^ (k + 1) - mu ->
        fancy_reduce xLow xHigh = (xLow + 2^k * xHigh) mod M.
      Proof.
        assert (M < 2^width) by  (replace width with k by nia; lia).
        assert (0 < 2 ^ (k - 1)) by Z.zero_bounds.
        pose proof (Z.mod_pos_bound (xLow + 2^k * xHigh) M ltac:(lia)).
        (* turn the literal list back into a partition, then apply the
           specification of [fancy_reduce'] *)
        intros. cbv [fancy_reduce]. rewrite <-partition_2 by lia.
        replace 2%nat with (sz*2)%nat by lia.
        rewrite fancy_reduce'_correct by nia.
        (* with [sz = 1] the result has a single limb; compute its head *)
        rewrite sz_eq_1; cbv [partition map seq hd].
        rewrite !uweight_S, !weight_0 by auto with zarith lia.
        autorewrite with zsimplify. reflexivity.
      Qed.
    End Def.
  End Fancy.
End Fancy.