aboutsummaryrefslogtreecommitdiff
path: root/src/Specific/NISTP256/AMD64
diff options
context:
space:
mode:
authorGravatar Andres Erbsen <andreser@mit.edu>2017-06-22 17:07:16 -0400
committerGravatar Andres Erbsen <andreser@mit.edu>2017-06-22 17:07:22 -0400
commit3bc1dda17334225a2970fd31ef1c92862799b9a1 (patch)
treed29522cda123cb741602211bab2a385833fc1cc1 /src/Specific/NISTP256/AMD64
parente8eba53e187e26f28870e9070360810dbdcbe6b8 (diff)
move Specifi p256 files into their own directory
Diffstat (limited to 'src/Specific/NISTP256/AMD64')
-rw-r--r--src/Specific/NISTP256/AMD64/IntegrationTestMontgomeryP256.s240
-rw-r--r--src/Specific/NISTP256/AMD64/IntegrationTestMontgomeryP256.v108
-rw-r--r--src/Specific/NISTP256/AMD64/IntegrationTestMontgomeryP256Display.log113
-rw-r--r--src/Specific/NISTP256/AMD64/IntegrationTestMontgomeryP256Display.v4
-rw-r--r--src/Specific/NISTP256/AMD64/MontgomeryP256.v157
5 files changed, 622 insertions, 0 deletions
diff --git a/src/Specific/NISTP256/AMD64/IntegrationTestMontgomeryP256.s b/src/Specific/NISTP256/AMD64/IntegrationTestMontgomeryP256.s
new file mode 100644
index 000000000..00472462f
--- /dev/null
+++ b/src/Specific/NISTP256/AMD64/IntegrationTestMontgomeryP256.s
@@ -0,0 +1,240 @@
+.intel_syntax noprefix
+.section .text
+.globl IntegrationTestMontgomeryP256
+.type IntegrationTestMontgomeryP256, @function
+IntegrationTestMontgomeryP256:
+#icc17 -m64 -O3 -madx -march=skylake -mtune=skylake -fomit-frame-pointer -fno-stack-protector-all -opt-report
+#IntegrationTestMontgomeryP256(unsigned long*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long):
+ push r12 #13.1
+ push r13 #13.1
+ push r14 #13.1
+ push r15 #13.1
+ push rbx #13.1
+ push rbp #13.1
+ mov rbx, r9 #13.1
+ mov QWORD PTR [-32+rsp], rdx #13.1[spill]
+ mov rdx, r8 #13.33
+ mov QWORD PTR [-16+rsp], rdi #13.1[spill]
+ mov r11, -1 #21.33
+ xor edi, edi #17.32
+ mov r9, QWORD PTR [72+rsp] #13.1
+ mulx rax, rbp, QWORD PTR [64+rsp] #14.33
+ mulx r15, r10, r9 #13.33
+ adcx r10, rax #17.32
+ mov rax, 0x0ffffffff #22.33
+ mov QWORD PTR [-24+rsp], rsi #13.1[spill]
+ mulx r14, rsi, QWORD PTR [56+rsp] #15.33
+ adcx rbp, r14 #18.32
+ mulx r8, r13, rbx #16.33
+ mov rdx, r15 #21.33
+ adcx rsi, r8 #19.32
+ mov QWORD PTR [-40+rsp], rbx #13.1[spill]
+ mov ebx, 0 #19.32
+ mov r8d, ebx #20.30
+ setb dil #19.32
+ adox r8d, ebx #20.30
+ mov r8d, ebx #20.30
+ adox rdi, r13 #20.30
+ mulx r12, r11, r11 #21.33
+ seto r8b #20.30
+ clc #24.32
+ mulx r8, r13, rax #22.33
+ mov rax, 0xffffffff00000001 #23.33
+ adcx r11, r8 #24.32
+ mov r8d, 0 #25.32
+ mulx r14, rax, rax #23.33
+ mov edx, ebx #27.30
+ adcx r13, r8 #25.32
+ adcx r14, r8 #26.32
+ mov r8d, ebx #26.32
+ setb r8b #26.32
+ adox edx, ebx #27.30
+ mov rdx, rcx #33.33
+ adox r8, rax #27.30
+ mov ecx, ebx #37.32
+ clc #28.30
+ adcx r15, r12 #28.30
+ mulx r12, rax, r9 #33.33
+ adcx r10, r11 #29.32
+ adcx rbp, r13 #30.32
+ mulx r13, r15, QWORD PTR [56+rsp] #35.33
+ adcx rsi, r14 #31.32
+ mulx r14, r9, QWORD PTR [64+rsp] #34.33
+ adcx rdi, r8 #32.32
+ mov r8d, ebx #32.32
+ mulx rdx, r11, QWORD PTR [-40+rsp] #36.33[spill]
+ setb r8b #32.32
+ adox ecx, ebx #37.32
+ mov rcx, -1 #46.35
+ adox rax, r14 #37.32
+ mov r14d, ebx #39.32
+ adox r9, r13 #38.32
+ adox r15, rdx #39.32
+ seto r14b #39.32
+ clc #40.30
+ adcx r14, r11 #40.30
+ mov r11d, ebx #41.34
+ adox r11d, ebx #41.34
+ mov r11d, ebx #45.34
+ adox r10, r12 #41.34
+ mov rdx, r10 #46.35
+ adox rbp, rax #42.34
+ mov rax, 0x0ffffffff #47.35
+ mulx r12, rax, rax #47.35
+ adox rsi, r9 #43.34
+ mov r9, 0xffffffff00000001 #48.35
+ adox rdi, r15 #44.34
+ adox r8, r14 #45.34
+ mulx r14, r13, rcx #46.35
+ seto r11b #45.34
+ clc #49.34
+ mulx r15, rcx, r9 #48.35
+ mov r9d, ebx #52.31
+ adcx r13, r12 #49.34
+ mov r12d, 0 #50.34
+ mov rdx, QWORD PTR [-32+rsp] #59.35[spill]
+ adcx rax, r12 #50.34
+ adcx r15, r12 #51.34
+ mov r12d, ebx #51.34
+ setb r12b #51.34
+ adox r9d, ebx #52.31
+ adox r12, rcx #52.31
+ clc #53.31
+ adcx r10, r14 #53.31
+ mov r10d, ebx #57.34
+ adcx rbp, r13 #54.34
+ mulx r14, r13, QWORD PTR [72+rsp] #59.35
+ adcx rsi, rax #55.34
+ mulx rax, r9, QWORD PTR [56+rsp] #61.35
+ adcx rdi, r15 #56.34
+ mov r15d, ebx #63.34
+ adcx r8, r12 #57.34
+ setb r10b #57.34
+ adox r15d, ebx #63.34
+ mov DWORD PTR [-8+rsp], r10d #57.34[spill]
+ mulx r10, r12, QWORD PTR [64+rsp] #60.35
+ adox r13, r10 #63.34
+ mov r10d, ebx #67.34
+ mulx rdx, rcx, QWORD PTR [-40+rsp] #62.35[spill]
+ adox r12, rax #64.34
+ mov eax, ebx #65.34
+ adox r9, rdx #65.34
+ seto al #65.34
+ clc #66.31
+ adcx rax, rcx #66.31
+ adox r10d, ebx #67.34
+ mov ecx, DWORD PTR [-8+rsp] #71.34[spill]
+ adox rbp, r14 #67.34
+ mov r14d, ebx #70.34
+ mov rdx, rbp #72.35
+ adox rsi, r13 #68.34
+ adox rdi, r12 #69.34
+ adox r8, r9 #70.34
+ mov r9, 0xffffffff00000001 #74.35
+ mulx r10, r9, r9 #74.35
+ seto r14b #70.34
+ xor r15d, r15d #70.34
+ add ecx, r11d #71.34
+ cmp ebx, r14d #71.34
+ mov r11, 0x0ffffffff #73.35
+ mulx r11, r14, r11 #73.35
+ adcx rcx, rax #71.34
+ mov rax, -1 #72.35
+ mulx r13, r12, rax #72.35
+ mov edx, ebx #75.34
+ setb r15b #71.34
+ adox edx, ebx #75.34
+ mov rdx, QWORD PTR [-24+rsp] #85.35[spill]
+ adox r12, r11 #75.34
+ mov r11d, 0 #76.34
+ adox r14, r11 #76.34
+ adox r10, r11 #77.34
+ mov r11d, ebx #77.34
+ seto r11b #77.34
+ clc #78.31
+ adcx r11, r9 #78.31
+ mov r9d, ebx #79.31
+ adox r9d, ebx #79.31
+ adox rbp, r13 #79.31
+ mulx r13, r9, QWORD PTR [64+rsp] #86.35
+ adox rsi, r12 #80.34
+ adox rdi, r14 #81.34
+ adox r8, r10 #82.34
+ mov r10d, ebx #83.34
+ adox rcx, r11 #83.34
+ mulx r11, rbp, QWORD PTR [72+rsp] #85.35
+ seto r10b #83.34
+ clc #89.34
+ adcx rbp, r13 #89.34
+ mulx r14, r13, QWORD PTR [56+rsp] #87.35
+ adcx r9, r14 #90.34
+ mulx r14, r12, QWORD PTR [-40+rsp] #88.35[spill]
+ mov edx, ebx #92.31
+ adcx r13, r14 #91.34
+ mov r14d, ebx #91.34
+ setb r14b #91.34
+ adox edx, ebx #92.31
+ adox r14, r12 #92.31
+ mov r12d, ebx #92.31
+ seto r12b #92.31
+ clc #93.34
+ adcx rsi, r11 #93.34
+ mov r11, 0x0ffffffff #99.35
+ mov rdx, rsi #98.35
+ adcx rdi, rbp #94.34
+ mov ebp, ebx #96.34
+ adcx r8, r9 #95.34
+ mulx r9, r11, r11 #99.35
+ adcx rcx, r13 #96.34
+ mulx r12, r13, rax #98.35
+ setb bpl #96.34
+ add r10d, r15d #97.34
+ cmp ebx, ebp #97.34
+ mov rbp, 0xffffffff00000001 #100.35
+ mulx rbp, r15, rbp #100.35
+ mov edx, ebx #101.34
+ adcx r10, r14 #97.34
+ mov r14d, ebx #97.34
+ setb r14b #97.34
+ adox edx, ebx #101.34
+ mov edx, 0 #102.34
+ adox r13, r9 #101.34
+ mov r9d, ebx #103.34
+ adox r11, rdx #102.34
+ adox rbp, rdx #103.34
+ seto r9b #103.34
+ clc #104.31
+ adcx r9, r15 #104.31
+ mov r15d, ebx #105.31
+ adox r15d, ebx #105.31
+ adox rsi, r12 #105.31
+ mov esi, ebx #109.34
+ adox rdi, r13 #106.34
+ adox r8, r11 #107.34
+ adox rcx, rbp #108.34
+ adox r10, r9 #109.34
+ seto sil #109.34
+ xor ebp, ebp #109.34
+ add esi, r14d #110.26
+ mov rsi, QWORD PTR [-16+rsp] #122.1[spill]
+ cmove rax, rdx #111.40
+ sub rdi, rax #113.34
+ mov QWORD PTR [24+rsi], rdi #122.1
+ mov edi, eax #115.34
+ sbb r8, rdi #115.34
+ mov QWORD PTR [16+rsi], r8 #121.1
+ sbb rcx, rdx #116.34
+ mov rdx, 0xffffffff00000001 #118.31
+ mov QWORD PTR [8+rsi], rcx #120.1
+ setb bpl #116.34
+ and rdx, rax #118.31
+ cmp ebx, ebp #118.31
+ sbb r10, rdx #118.31
+ mov QWORD PTR [rsi], r10 #119.1
+ pop rbp #123.106
+ pop rbx #123.106
+ pop r15 #123.106
+ pop r14 #123.106
+ pop r13 #123.106
+ pop r12 #123.106
+ ret #123.106
diff --git a/src/Specific/NISTP256/AMD64/IntegrationTestMontgomeryP256.v b/src/Specific/NISTP256/AMD64/IntegrationTestMontgomeryP256.v
new file mode 100644
index 000000000..313b066da
--- /dev/null
+++ b/src/Specific/NISTP256/AMD64/IntegrationTestMontgomeryP256.v
@@ -0,0 +1,108 @@
+Require Import Coq.ZArith.ZArith.
+Require Import Coq.Lists.List.
+Local Open Scope Z_scope.
+
+Require Import Crypto.Arithmetic.Core.
+Require Import Crypto.Util.FixedWordSizes.
+Require Import Crypto.Arithmetic.MontgomeryReduction.WordByWord.Definition.
+Require Import Crypto.Arithmetic.Core. Import B.
+Require Import Crypto.Specific.NISTP256.AMD64.MontgomeryP256.
+Require Import Crypto.Arithmetic.PrimeFieldTheorems.
+Require Import Crypto.Util.Tuple Crypto.Util.Sigma Crypto.Util.Sigma.MapProjections Crypto.Util.Sigma.Lift Crypto.Util.Notations Crypto.Util.ZRange Crypto.Util.BoundedWord.
+Require Import Crypto.Util.Tactics.Head.
+Require Import Crypto.Util.Tactics.MoveLetIn.
+Require Import Crypto.Util.Tactics.DestructHead.
+Import ListNotations.
+
+Require Import Crypto.Specific.IntegrationTestTemporaryMiscCommon.
+
+Require Import Crypto.Compilers.Z.Bounds.Pipeline.
+
+Section BoundedField25p5.
+ Local Coercion Z.of_nat : nat >-> Z.
+
+ Let bounds : Tuple.tuple zrange sz
+ := Eval compute in
+ Tuple.repeat {| lower := 0 ; upper := r-1 |} sz.
+
+ Let lgbitwidth := Eval compute in (Z.to_nat (Z.log2_up (Z.log2_up r))).
+ Let bitwidth := Eval compute in (2^lgbitwidth)%nat.
+ Let feZ : Type := tuple Z sz.
+ Let feW : Type := tuple (wordT lgbitwidth) sz.
+ Let feBW : Type := BoundedWord sz bitwidth bounds.
+ Let phi : feBW -> F m :=
+ fun x => montgomery_to_F (Saturated.eval (Z.pos r) (BoundedWordToZ _ _ _ x)).
+
+ (* TODO : change this to field once field isomorphism happens *)
+ Definition mul
+ : { mul : feBW -> feBW -> feBW
+ | forall A B, phi (mul A B) = F.mul (phi A) (phi B) }.
+ Proof.
+ lazymatch goal with
+ | [ |- { f | forall a b, ?phi (f a b) = @?rhs a b } ]
+ => apply lift2_sig with (P:=fun a b f => phi f = rhs a b)
+ end.
+ intros a b.
+ eexists_sig_etransitivity. all:cbv [phi].
+ rewrite <- (proj2_sig mulmod_256)
+ by (hnf; rewrite <- (is_bounded_by_None_repeat_In_iff_lt _ _ _); destruct_head' feBW; assumption).
+ (*symmetry; rewrite <- (proj2_sig carry_sig); symmetry.
+ set (carry_mulZ := fun a b => proj1_sig carry_sig (proj1_sig mul_sig a b)).
+ change (proj1_sig carry_sig (proj1_sig mul_sig ?a ?b)) with (carry_mulZ a b).*)
+ set (mulmodZ := proj1_sig mulmod_256).
+ context_to_dlet_in_rhs mulmodZ; cbv [mulmodZ].
+ cbv beta iota delta [mulmod_256 mulmod_256'' mulmod_256' proj1_sig Saturated.T lift2_sig fst snd runtime_add runtime_and runtime_mul runtime_opp runtime_shr sz].
+ reflexivity.
+ sig_dlet_in_rhs_to_context.
+ do 2 apply (fun f => proj2_sig_map (fun THIS_NAME_MUST_NOT_BE_UNDERSCORE_TO_WORK_AROUND_CONSTR_MATCHING_ANAOMLIES___BUT_NOTE_THAT_IF_THIS_NAME_IS_LOWERCASE_A___THEN_REIFICATION_STACK_OVERFLOWS___AND_I_HAVE_NO_IDEA_WHATS_GOING_ON p => f_equal f p)).
+ (* jgross start here! *)
+ Set Ltac Profiling.
+ Time refine_reflectively_with_uint8_with anf. (* Finished transaction in 212.693 secs (212.576u,0.184s) (successful) *)
+ Show Ltac Profile.
+ (* total time: 19.632s
+
+ tactic local total calls max
+────────────────────────────────────────┴──────┴──────┴───────┴─────────┘
+─refine_reflectively ------------------- 0.0% 98.4% 1 19.320s
+─ReflectiveTactics.do_reflective_pipelin -0.0% 96.2% 1 18.884s
+─ReflectiveTactics.solve_side_conditions 1.2% 96.1% 1 18.860s
+─ReflectiveTactics.do_reify ------------ 27.7% 44.0% 1 8.640s
+─ReflectiveTactics.abstract_vm_compute_r 12.3% 13.9% 2 2.024s
+─ReflectiveTactics.abstract_vm_compute_r 8.9% 12.2% 2 1.576s
+─Reify_rhs_gen ------------------------- 0.8% 11.7% 1 2.300s
+─ReflectiveTactics.renamify_rhs -------- 10.4% 11.5% 1 2.260s
+─ReflectiveTactics.abstract_rhs -------- 4.6% 5.8% 1 1.148s
+─clear (var_list) ---------------------- 5.2% 5.2% 57 0.184s
+─eexact -------------------------------- 4.1% 4.1% 68 0.028s
+─prove_interp_compile_correct ---------- 0.0% 3.4% 1 0.664s
+─ReflectiveTactics.abstract_cbv_interp_r 2.7% 3.3% 1 0.648s
+─unify (constr) (constr) --------------- 3.2% 3.2% 6 0.248s
+─rewrite ?EtaInterp.InterpExprEta ------ 3.1% 3.1% 1 0.612s
+─ReflectiveTactics.abstract_cbv_rhs ---- 2.0% 2.7% 1 0.532s
+─Glue.refine_to_reflective_glue -------- 0.0% 2.2% 1 0.436s
+─rewrite H ----------------------------- 2.1% 2.1% 1 0.420s
+
+ tactic local total calls max
+────────────────────────────────────────┴──────┴──────┴───────┴─────────┘
+─refine_reflectively ------------------- 0.0% 98.4% 1 19.320s
+ ├─ReflectiveTactics.do_reflective_pipel -0.0% 96.2% 1 18.884s
+ │└ReflectiveTactics.solve_side_conditio 1.2% 96.1% 1 18.860s
+ │ ├─ReflectiveTactics.do_reify -------- 27.7% 44.0% 1 8.640s
+ │ │ ├─Reify_rhs_gen ------------------- 0.8% 11.7% 1 2.300s
+ │ │ │ ├─prove_interp_compile_correct -- 0.0% 3.4% 1 0.664s
+ │ │ │ │└rewrite ?EtaInterp.InterpExprEt 3.1% 3.1% 1 0.612s
+ │ │ │ └─rewrite H --------------------- 2.1% 2.1% 1 0.420s
+ │ │ └─eexact -------------------------- 4.1% 4.1% 68 0.028s
+ │ ├─ReflectiveTactics.abstract_vm_compu 12.3% 13.9% 2 2.024s
+ │ ├─ReflectiveTactics.abstract_vm_compu 8.9% 12.2% 2 1.576s
+ │ ├─ReflectiveTactics.renamify_rhs ---- 10.4% 11.5% 1 2.260s
+ │ ├─ReflectiveTactics.abstract_rhs ---- 4.6% 5.8% 1 1.148s
+ │ ├─ReflectiveTactics.abstract_cbv_inte 2.7% 3.3% 1 0.648s
+ │ └─ReflectiveTactics.abstract_cbv_rhs 2.0% 2.7% 1 0.532s
+ └─Glue.refine_to_reflective_glue ------ 0.0% 2.2% 1 0.436s
+*)
+ Time Defined. (* Finished transaction in 21.291 secs (21.231u,0.032s) (successful) *)
+
+Time End BoundedField25p5. (* Finished transaction in 14.666 secs (14.556u,0.111s) (successful) *)
+
+Print Assumptions mul.
diff --git a/src/Specific/NISTP256/AMD64/IntegrationTestMontgomeryP256Display.log b/src/Specific/NISTP256/AMD64/IntegrationTestMontgomeryP256Display.log
new file mode 100644
index 000000000..48cc4785d
--- /dev/null
+++ b/src/Specific/NISTP256/AMD64/IntegrationTestMontgomeryP256Display.log
@@ -0,0 +1,113 @@
+λ x x0 : word64 * word64 * word64 * word64,
+Interp-η
+(λ var : Syntax.base_type → Type,
+ λ '(x8, x9, x7, x5, (x14, x15, x13, x11))%core,
+ uint64_t x17, uint64_t x18 = mulx_u64(x5, x11);
+ uint64_t x20, uint64_t x21 = mulx_u64(x5, x13);
+ uint64_t x23, uint64_t x24 = mulx_u64(x5, x15);
+ uint64_t x26, uint64_t x27 = mulx_u64(x5, x14);
+ uint64_t x29, uint8_t x30 = addcarryx_u64(0x0, x18, x20);
+ uint64_t x32, uint8_t x33 = addcarryx_u64(x30, x21, x23);
+ uint64_t x35, uint8_t x36 = addcarryx_u64(x33, x24, x26);
+ uint64_t x38, uint8_t _ = addcarryx_u64(0x0, x36, x27);
+ uint64_t x41, uint64_t x42 = mulx_u64(x17, 0xffffffffffffffffL);
+ uint64_t x44, uint64_t x45 = mulx_u64(x17, 0xffffffff);
+ uint64_t x47, uint64_t x48 = mulx_u64(x17, 0xffffffff00000001L);
+ uint64_t x50, uint8_t x51 = addcarryx_u64(0x0, x42, x44);
+ uint64_t x53, uint8_t x54 = addcarryx_u64(x51, x45, 0x0);
+ uint64_t x56, uint8_t x57 = addcarryx_u64(x54, 0x0, x47);
+ uint64_t x59, uint8_t _ = addcarryx_u64(0x0, x57, x48);
+ uint64_t _, uint8_t x63 = addcarryx_u64(0x0, x17, x41);
+ uint64_t x65, uint8_t x66 = addcarryx_u64(x63, x29, x50);
+ uint64_t x68, uint8_t x69 = addcarryx_u64(x66, x32, x53);
+ uint64_t x71, uint8_t x72 = addcarryx_u64(x69, x35, x56);
+ uint64_t x74, uint8_t x75 = addcarryx_u64(x72, x38, x59);
+ uint64_t x77, uint64_t x78 = mulx_u64(x7, x11);
+ uint64_t x80, uint64_t x81 = mulx_u64(x7, x13);
+ uint64_t x83, uint64_t x84 = mulx_u64(x7, x15);
+ uint64_t x86, uint64_t x87 = mulx_u64(x7, x14);
+ uint64_t x89, uint8_t x90 = addcarryx_u64(0x0, x78, x80);
+ uint64_t x92, uint8_t x93 = addcarryx_u64(x90, x81, x83);
+ uint64_t x95, uint8_t x96 = addcarryx_u64(x93, x84, x86);
+ uint64_t x98, uint8_t _ = addcarryx_u64(0x0, x96, x87);
+ uint64_t x101, uint8_t x102 = addcarryx_u64(0x0, x65, x77);
+ uint64_t x104, uint8_t x105 = addcarryx_u64(x102, x68, x89);
+ uint64_t x107, uint8_t x108 = addcarryx_u64(x105, x71, x92);
+ uint64_t x110, uint8_t x111 = addcarryx_u64(x108, x74, x95);
+ uint64_t x113, uint8_t x114 = addcarryx_u64(x111, x75, x98);
+ uint64_t x116, uint64_t x117 = mulx_u64(x101, 0xffffffffffffffffL);
+ uint64_t x119, uint64_t x120 = mulx_u64(x101, 0xffffffff);
+ uint64_t x122, uint64_t x123 = mulx_u64(x101, 0xffffffff00000001L);
+ uint64_t x125, uint8_t x126 = addcarryx_u64(0x0, x117, x119);
+ uint64_t x128, uint8_t x129 = addcarryx_u64(x126, x120, 0x0);
+ uint64_t x131, uint8_t x132 = addcarryx_u64(x129, 0x0, x122);
+ uint64_t x134, uint8_t _ = addcarryx_u64(0x0, x132, x123);
+ uint64_t _, uint8_t x138 = addcarryx_u64(0x0, x101, x116);
+ uint64_t x140, uint8_t x141 = addcarryx_u64(x138, x104, x125);
+ uint64_t x143, uint8_t x144 = addcarryx_u64(x141, x107, x128);
+ uint64_t x146, uint8_t x147 = addcarryx_u64(x144, x110, x131);
+ uint64_t x149, uint8_t x150 = addcarryx_u64(x147, x113, x134);
+ uint8_t x151 = x150 + x114;
+ uint64_t x153, uint64_t x154 = mulx_u64(x9, x11);
+ uint64_t x156, uint64_t x157 = mulx_u64(x9, x13);
+ uint64_t x159, uint64_t x160 = mulx_u64(x9, x15);
+ uint64_t x162, uint64_t x163 = mulx_u64(x9, x14);
+ uint64_t x165, uint8_t x166 = addcarryx_u64(0x0, x154, x156);
+ uint64_t x168, uint8_t x169 = addcarryx_u64(x166, x157, x159);
+ uint64_t x171, uint8_t x172 = addcarryx_u64(x169, x160, x162);
+ uint64_t x174, uint8_t _ = addcarryx_u64(0x0, x172, x163);
+ uint64_t x177, uint8_t x178 = addcarryx_u64(0x0, x140, x153);
+ uint64_t x180, uint8_t x181 = addcarryx_u64(x178, x143, x165);
+ uint64_t x183, uint8_t x184 = addcarryx_u64(x181, x146, x168);
+ uint64_t x186, uint8_t x187 = addcarryx_u64(x184, x149, x171);
+ uint64_t x189, uint8_t x190 = addcarryx_u64(x187, x151, x174);
+ uint64_t x192, uint64_t x193 = mulx_u64(x177, 0xffffffffffffffffL);
+ uint64_t x195, uint64_t x196 = mulx_u64(x177, 0xffffffff);
+ uint64_t x198, uint64_t x199 = mulx_u64(x177, 0xffffffff00000001L);
+ uint64_t x201, uint8_t x202 = addcarryx_u64(0x0, x193, x195);
+ uint64_t x204, uint8_t x205 = addcarryx_u64(x202, x196, 0x0);
+ uint64_t x207, uint8_t x208 = addcarryx_u64(x205, 0x0, x198);
+ uint64_t x210, uint8_t _ = addcarryx_u64(0x0, x208, x199);
+ uint64_t _, uint8_t x214 = addcarryx_u64(0x0, x177, x192);
+ uint64_t x216, uint8_t x217 = addcarryx_u64(x214, x180, x201);
+ uint64_t x219, uint8_t x220 = addcarryx_u64(x217, x183, x204);
+ uint64_t x222, uint8_t x223 = addcarryx_u64(x220, x186, x207);
+ uint64_t x225, uint8_t x226 = addcarryx_u64(x223, x189, x210);
+ uint8_t x227 = x226 + x190;
+ uint64_t x229, uint64_t x230 = mulx_u64(x8, x11);
+ uint64_t x232, uint64_t x233 = mulx_u64(x8, x13);
+ uint64_t x235, uint64_t x236 = mulx_u64(x8, x15);
+ uint64_t x238, uint64_t x239 = mulx_u64(x8, x14);
+ uint64_t x241, uint8_t x242 = addcarryx_u64(0x0, x230, x232);
+ uint64_t x244, uint8_t x245 = addcarryx_u64(x242, x233, x235);
+ uint64_t x247, uint8_t x248 = addcarryx_u64(x245, x236, x238);
+ uint64_t x250, uint8_t _ = addcarryx_u64(0x0, x248, x239);
+ uint64_t x253, uint8_t x254 = addcarryx_u64(0x0, x216, x229);
+ uint64_t x256, uint8_t x257 = addcarryx_u64(x254, x219, x241);
+ uint64_t x259, uint8_t x260 = addcarryx_u64(x257, x222, x244);
+ uint64_t x262, uint8_t x263 = addcarryx_u64(x260, x225, x247);
+ uint64_t x265, uint8_t x266 = addcarryx_u64(x263, x227, x250);
+ uint64_t x268, uint64_t x269 = mulx_u64(x253, 0xffffffffffffffffL);
+ uint64_t x271, uint64_t x272 = mulx_u64(x253, 0xffffffff);
+ uint64_t x274, uint64_t x275 = mulx_u64(x253, 0xffffffff00000001L);
+ uint64_t x277, uint8_t x278 = addcarryx_u64(0x0, x269, x271);
+ uint64_t x280, uint8_t x281 = addcarryx_u64(x278, x272, 0x0);
+ uint64_t x283, uint8_t x284 = addcarryx_u64(x281, 0x0, x274);
+ uint64_t x286, uint8_t _ = addcarryx_u64(0x0, x284, x275);
+ uint64_t _, uint8_t x290 = addcarryx_u64(0x0, x253, x268);
+ uint64_t x292, uint8_t x293 = addcarryx_u64(x290, x256, x277);
+ uint64_t x295, uint8_t x296 = addcarryx_u64(x293, x259, x280);
+ uint64_t x298, uint8_t x299 = addcarryx_u64(x296, x262, x283);
+ uint64_t x301, uint8_t x302 = addcarryx_u64(x299, x265, x286);
+ uint8_t x303 = x302 + x266;
+ uint64_t x304 = (uint64_t) (x303 == 0 ? 0x0 : 0xffffffffffffffffL);
+ uint64_t x305 = x304 & 0xffffffffffffffffL;
+ uint64_t x307, uint8_t x308 = subborrow_u64(0x0, x292, x305);
+ uint64_t x309 = x304 & 0xffffffff;
+ uint64_t x311, uint8_t x312 = subborrow_u64(x308, x295, x309);
+ uint64_t x314, uint8_t x315 = subborrow_u64(x312, x298, 0x0);
+ uint64_t x316 = x304 & 0xffffffff00000001L;
+ uint64_t x318, uint8_t _ = subborrow_u64(x315, x301, x316);
+ (Return x318, Return x314, Return x311, Return x307))
+(x, x0)%core
+ : word64 * word64 * word64 * word64 → word64 * word64 * word64 * word64 → ReturnType (uint64_t * uint64_t * uint64_t * uint64_t)
diff --git a/src/Specific/NISTP256/AMD64/IntegrationTestMontgomeryP256Display.v b/src/Specific/NISTP256/AMD64/IntegrationTestMontgomeryP256Display.v
new file mode 100644
index 000000000..590879c0c
--- /dev/null
+++ b/src/Specific/NISTP256/AMD64/IntegrationTestMontgomeryP256Display.v
@@ -0,0 +1,4 @@
+Require Import Crypto.Specific.NISTP256.AMD64.IntegrationTestMontgomeryP256.
+Require Import Crypto.Specific.IntegrationTestDisplayCommon.
+
+Check display mul.
diff --git a/src/Specific/NISTP256/AMD64/MontgomeryP256.v b/src/Specific/NISTP256/AMD64/MontgomeryP256.v
new file mode 100644
index 000000000..dfc57cfdc
--- /dev/null
+++ b/src/Specific/NISTP256/AMD64/MontgomeryP256.v
@@ -0,0 +1,157 @@
+Require Import Coq.micromega.Lia.
+Require Import Crypto.Arithmetic.MontgomeryReduction.WordByWord.Definition.
+Require Import Crypto.Arithmetic.MontgomeryReduction.WordByWord.Proofs.
+Require Import Crypto.Arithmetic.Core. Import B.
+Require Import Crypto.Util.Sigma.Lift.
+Require Import Coq.ZArith.BinInt.
+Require Import Coq.PArith.BinPos.
+Require Import Crypto.Util.LetIn.
+Require Import Crypto.Util.ZUtil.ModInv.
+Require Import Crypto.Util.Tactics.DestructHead.
+
+Definition wt (i:nat) : Z := Z.shiftl 1 (64*Z.of_nat i).
+Definition r := Eval compute in (2^64)%positive.
+Definition sz := 4%nat.
+Definition m : positive := 2^256-2^224+2^192+2^96-1.
+Definition p256 :=
+ Eval vm_compute in
+ ((Positional.encode (modulo:=modulo) (div:=div) (n:=sz) wt (Z.pos m))).
+Definition r' := Eval vm_compute in Zpow_facts.Zpow_mod (Z.pos r) (Z.pos m - 2) (Z.pos m).
+Definition r'_correct := eq_refl : ((Z.pos r * r') mod (Z.pos m) = 1)%Z.
+Definition m' : Z := Eval vm_compute in Option.invert_Some (Z.modinv_fueled 10 (-Z.pos m) (Z.pos r)).
+Definition m'_correct := eq_refl : ((Z.pos m * m') mod (Z.pos r) = (-1) mod Z.pos r)%Z.
+
+Definition mulmod_256' : { f:Tuple.tuple Z sz -> Tuple.tuple Z sz -> Tuple.tuple Z sz
+ | forall (A B : Tuple.tuple Z sz),
+ f A B =
+ (redc (r:=r)(R_numlimbs:=sz) p256 A B m')
+ }.
+Proof.
+ eapply (lift2_sig (fun A B c => c = _)); eexists.
+ cbv -[Definitions.Z.add_get_carry Definitions.Z.mul_split_at_bitwidth Definitions.Z.zselect runtime_add runtime_mul runtime_and runtime_opp Let_In].
+ (*
+ cbv [
+ r wt sz p256
+ CPSUtil.Tuple.tl_cps CPSUtil.Tuple.hd_cps
+ CPSUtil.to_list_cps CPSUtil.to_list'_cps CPSUtil.to_list_cps' CPSUtil.flat_map_cps CPSUtil.fold_right_cps
+ CPSUtil.map_cps CPSUtil.Tuple.left_append_cps CPSUtil.firstn_cps CPSUtil.combine_cps CPSUtil.on_tuple_cps CPSUtil.update_nth_cps CPSUtil.from_list_default_cps CPSUtil.from_list_default'_cps
+ fst snd length List.seq List.hd List.app
+ redc redc_cps redc_loop_cps redc_body_cps
+ Positional.to_associational_cps
+ Saturated.divmod_cps
+ Saturated.scmul_cps
+ Saturated.uweight
+ Saturated.Columns.mul_cps
+ Saturated.Associational.mul_cps
+ (*Z.of_nat Pos.of_succ_nat Nat.pred
+ Z.pow Z.pow_pos Z.mul Pos.iter Pos.mul Pos.succ*)
+ Tuple.hd Tuple.append Tuple.tl Tuple.hd' Tuple.tl' CPSUtil.Tuple.left_tl_cps CPSUtil.Tuple.left_hd_cps CPSUtil.Tuple.hd_cps CPSUtil.Tuple.tl_cps
+ Saturated.Columns.from_associational_cps
+ Saturated.Associational.multerm_cps
+ Saturated.Columns.compact_cps
+ Saturated.Columns.compact_step_cps
+ Saturated.Columns.compact_digit_cps
+ Saturated.drop_high_cps
+ Saturated.add_cps
+ Saturated.Columns.add_cps
+ Saturated.Columns.cons_to_nth_cps
+ Nat.pred
+ Saturated.join0
+ Saturated.join0_cps CPSUtil.Tuple.left_append_cps
+ CPSUtil.Tuple.mapi_with_cps
+ id
+ Positional.zeros Saturated.zero Saturated.Columns.nils Tuple.repeat Tuple.append
+ Positional.place_cps
+ CPSUtil.Tuple.mapi_with'_cps Tuple.hd Tuple.hd' Tuple.tl Tuple.tl' CPSUtil.Tuple.hd_cps CPSUtil.Tuple.tl_cps fst snd
+ Z.of_nat fst snd Z.pow Z.pow_pos Pos.of_succ_nat Z.div Z.mul Pos.mul Z.modulo Z.div_eucl Z.pos_div_eucl Z.leb Z.compare Pos.compare_cont Pos.compare Z.pow_pos Pos.iter Z.mul Pos.succ Z.ltb Z.gtb Z.geb Z.div Z.sub Z.pos_sub Z.add Z.opp Z.double
+ Decidable.dec Decidable.dec_eq_Z Z.eq_dec Z_rec Z_rec Z_rect
+ Positional.zeros Saturated.zero Saturated.Columns.nils Tuple.repeat Tuple.append
+ (*
+ Saturated.Associational.multerm_cps
+ Saturated.Columns.from_associational_cps Positional.place_cps Saturated.Columns.cons_to_nth_cps Saturated.Columns.nils
+Tuple.append
+Tuple.from_list Tuple.from_list'
+Tuple.from_list_default Tuple.from_list_default'
+Tuple.hd
+Tuple.repeat
+Tuple.tl
+Tuple.tuple *)
+ (* Saturated.add_cps Saturated.divmod_cps Saturated.drop_high_cps Saturated.scmul_cps Saturated.zero Saturated.Columns.mul_cps Saturated.Columns.add_cps Saturated.uweight Saturated.T *)
+ (*
+ CPSUtil.to_list_cps CPSUtil.to_list'_cps CPSUtil.to_list_cps'
+Positional.zeros
+Tuple.to_list
+Tuple.to_list'
+List.hd
+List.tl
+Nat.max
+Positional.to_associational_cps
+Z.of_nat *)
+ ].
+ Unset Printing Notations.
+
+ (* cbv -[runtime_add runtime_mul LetIn.Let_In Definitions.Z.add_get_carry_full Definitions.Z.mul_split]. *)
+
+ (* basesystem_partial_evaluation_RHS. *)
+ *)
+ reflexivity.
+Defined.
+
+
+Definition mulmod_256'' : { f:Tuple.tuple Z sz -> Tuple.tuple Z sz -> Tuple.tuple Z sz
+ | forall (A B : Tuple.tuple Z sz),
+ Saturated.small (Z.pos r) A -> Saturated.small (Z.pos r) B ->
+ let eval := Saturated.eval (Z.pos r) in
+ (Saturated.small (Z.pos r) (f A B)
+ /\ (eval B < eval p256 -> 0 <= eval (f A B) < eval p256)
+ /\ (eval (f A B) mod Z.pos m
+ = (eval A * eval B * r'^(Z.of_nat sz)) mod Z.pos m))%Z
+ }.
+Proof.
+ exists (proj1_sig mulmod_256').
+ abstract (
+ intros; rewrite (proj2_sig mulmod_256');
+ split; [ | split ];
+ [ apply small_redc with (ri:=r') | apply redc_bound_N with (ri:=r') | apply redc_mod_N ];
+ try match goal with
+ | _ => assumption
+ | [ |- _ = _ :> Z ] => vm_compute; reflexivity
+ | _ => reflexivity
+ | [ |- Saturated.small (Z.pos r) p256 ]
+ => hnf; cbv [Tuple.to_list sz p256 r Tuple.to_list' List.In]; intros; destruct_head'_or;
+ subst; try lia
+ | [ |- Saturated.eval (Z.pos r) p256 <> 0%Z ]
+ => vm_compute; lia
+ end
+ ).
+Defined.
+
+Import ModularArithmetic.
+
+(** TODO: MOVE ME *)
+Definition montgomery_to_F (v : Z) : F m
+ := (F.of_Z m v * F.of_Z m (r'^Z.of_nat sz)%Z)%F.
+
+Definition mulmod_256 : { f:Tuple.tuple Z sz -> Tuple.tuple Z sz -> Tuple.tuple Z sz
+ | forall (A : Tuple.tuple Z sz) (_ : Saturated.small (Z.pos r) A)
+ (B : Tuple.tuple Z sz) (_ : Saturated.small (Z.pos r) B),
+ let eval := Saturated.eval (Z.pos r) in
+ montgomery_to_F (eval (f A B))
+ = (montgomery_to_F (eval A) * montgomery_to_F (eval B))%F }.
+Proof.
+ exists (proj1_sig mulmod_256'').
+ abstract (
+ intros A Asm B Bsm;
+ pose proof (proj2_sig mulmod_256'' A B Asm Bsm) as H;
+ cbv zeta in *;
+ (*split; try solve [ destruct_head'_and; assumption ];*)
+ rewrite ModularArithmeticTheorems.F.eq_of_Z_iff in H;
+ unfold montgomery_to_F;
+ destruct H as [H1 [H2 H3]];
+ rewrite H3;
+ rewrite <- !ModularArithmeticTheorems.F.of_Z_mul;
+ f_equal; nia
+ ).
+Defined.
+
+Print Assumptions mulmod_256.