diff options
author | 2017-06-22 17:07:16 -0400 | |
---|---|---|
committer | 2017-06-22 17:07:22 -0400 | |
commit | 3bc1dda17334225a2970fd31ef1c92862799b9a1 (patch) | |
tree | d29522cda123cb741602211bab2a385833fc1cc1 /src/Specific/NISTP256/AMD64 | |
parent | e8eba53e187e26f28870e9070360810dbdcbe6b8 (diff) |
move Specifi p256 files into their own directory
Diffstat (limited to 'src/Specific/NISTP256/AMD64')
5 files changed, 622 insertions, 0 deletions
diff --git a/src/Specific/NISTP256/AMD64/IntegrationTestMontgomeryP256.s b/src/Specific/NISTP256/AMD64/IntegrationTestMontgomeryP256.s new file mode 100644 index 000000000..00472462f --- /dev/null +++ b/src/Specific/NISTP256/AMD64/IntegrationTestMontgomeryP256.s @@ -0,0 +1,240 @@ +.intel_syntax noprefix +.section .text +.globl IntegrationTestMontgomeryP256 +.type IntegrationTestMontgomeryP256, @function +IntegrationTestMontgomeryP256: +#icc17 -m64 -O3 -madx -march=skylake -mtune=skylake -fomit-frame-pointer -fno-stack-protector-all -opt-report +#IntegrationTestMontgomeryP256(unsigned long*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long): + push r12 #13.1 + push r13 #13.1 + push r14 #13.1 + push r15 #13.1 + push rbx #13.1 + push rbp #13.1 + mov rbx, r9 #13.1 + mov QWORD PTR [-32+rsp], rdx #13.1[spill] + mov rdx, r8 #13.33 + mov QWORD PTR [-16+rsp], rdi #13.1[spill] + mov r11, -1 #21.33 + xor edi, edi #17.32 + mov r9, QWORD PTR [72+rsp] #13.1 + mulx rax, rbp, QWORD PTR [64+rsp] #14.33 + mulx r15, r10, r9 #13.33 + adcx r10, rax #17.32 + mov rax, 0x0ffffffff #22.33 + mov QWORD PTR [-24+rsp], rsi #13.1[spill] + mulx r14, rsi, QWORD PTR [56+rsp] #15.33 + adcx rbp, r14 #18.32 + mulx r8, r13, rbx #16.33 + mov rdx, r15 #21.33 + adcx rsi, r8 #19.32 + mov QWORD PTR [-40+rsp], rbx #13.1[spill] + mov ebx, 0 #19.32 + mov r8d, ebx #20.30 + setb dil #19.32 + adox r8d, ebx #20.30 + mov r8d, ebx #20.30 + adox rdi, r13 #20.30 + mulx r12, r11, r11 #21.33 + seto r8b #20.30 + clc #24.32 + mulx r8, r13, rax #22.33 + mov rax, 0xffffffff00000001 #23.33 + adcx r11, r8 #24.32 + mov r8d, 0 #25.32 + mulx r14, rax, rax #23.33 + mov edx, ebx #27.30 + adcx r13, r8 #25.32 + adcx r14, r8 #26.32 + mov r8d, ebx #26.32 + setb r8b #26.32 + adox edx, ebx #27.30 + mov rdx, rcx #33.33 + adox r8, rax #27.30 + mov ecx, ebx #37.32 + clc #28.30 + adcx r15, r12 #28.30 + mulx r12, rax, r9 #33.33 + adcx r10, r11 #29.32 + adcx rbp, r13 #30.32 + mulx r13, r15, QWORD PTR [56+rsp] #35.33 + adcx rsi, r14 #31.32 + mulx r14, r9, QWORD PTR [64+rsp] #34.33 + adcx rdi, r8 #32.32 + mov r8d, ebx #32.32 + mulx rdx, r11, QWORD PTR [-40+rsp] #36.33[spill] + setb r8b #32.32 + adox ecx, ebx #37.32 + mov rcx, -1 #46.35 + adox rax, r14 #37.32 + mov r14d, ebx #39.32 + adox r9, r13 #38.32 + adox r15, rdx #39.32 + seto r14b #39.32 + clc #40.30 + adcx r14, r11 #40.30 + mov r11d, ebx #41.34 + adox r11d, ebx #41.34 + mov r11d, ebx #45.34 + adox r10, r12 #41.34 + mov rdx, r10 #46.35 + adox rbp, rax #42.34 + mov rax, 0x0ffffffff #47.35 + mulx r12, rax, rax #47.35 + adox rsi, r9 #43.34 + mov r9, 0xffffffff00000001 #48.35 + adox rdi, r15 #44.34 + adox r8, r14 #45.34 + mulx r14, r13, rcx #46.35 + seto r11b #45.34 + clc #49.34 + mulx r15, rcx, r9 #48.35 + mov r9d, ebx #52.31 + adcx r13, r12 #49.34 + mov r12d, 0 #50.34 + mov rdx, QWORD PTR [-32+rsp] #59.35[spill] + adcx rax, r12 #50.34 + adcx r15, r12 #51.34 + mov r12d, ebx #51.34 + setb r12b #51.34 + adox r9d, ebx #52.31 + adox r12, rcx #52.31 + clc #53.31 + adcx r10, r14 #53.31 + mov r10d, ebx #57.34 + adcx rbp, r13 #54.34 + mulx r14, r13, QWORD PTR [72+rsp] #59.35 + adcx rsi, rax #55.34 + mulx rax, r9, QWORD PTR [56+rsp] #61.35 + adcx rdi, r15 #56.34 + mov r15d, ebx #63.34 + adcx r8, r12 #57.34 + setb r10b #57.34 + adox r15d, ebx #63.34 + mov DWORD PTR [-8+rsp], r10d #57.34[spill] + mulx r10, r12, QWORD PTR [64+rsp] #60.35 + adox r13, r10 #63.34 + mov r10d, ebx #67.34 + mulx rdx, rcx, QWORD PTR [-40+rsp] #62.35[spill] + adox r12, rax #64.34 + mov eax, ebx #65.34 + adox r9, rdx #65.34 + seto al #65.34 + clc #66.31 + adcx rax, rcx #66.31 + adox r10d, ebx #67.34 + mov ecx, DWORD PTR [-8+rsp] #71.34[spill] + adox rbp, r14 #67.34 + mov r14d, ebx #70.34 + mov rdx, rbp #72.35 + adox rsi, r13 #68.34 + adox rdi, r12 #69.34 + adox r8, r9 #70.34 + mov r9, 0xffffffff00000001 #74.35 + mulx r10, r9, r9 #74.35 + seto r14b #70.34 + xor r15d, r15d #70.34 + add ecx, r11d #71.34 + cmp ebx, r14d #71.34 + mov r11, 0x0ffffffff #73.35 + mulx r11, r14, r11 #73.35 + adcx rcx, rax #71.34 + mov rax, -1 #72.35 + mulx r13, r12, rax #72.35 + mov edx, ebx #75.34 + setb r15b #71.34 + adox edx, ebx #75.34 + mov rdx, QWORD PTR [-24+rsp] #85.35[spill] + adox r12, r11 #75.34 + mov r11d, 0 #76.34 + adox r14, r11 #76.34 + adox r10, r11 #77.34 + mov r11d, ebx #77.34 + seto r11b #77.34 + clc #78.31 + adcx r11, r9 #78.31 + mov r9d, ebx #79.31 + adox r9d, ebx #79.31 + adox rbp, r13 #79.31 + mulx r13, r9, QWORD PTR [64+rsp] #86.35 + adox rsi, r12 #80.34 + adox rdi, r14 #81.34 + adox r8, r10 #82.34 + mov r10d, ebx #83.34 + adox rcx, r11 #83.34 + mulx r11, rbp, QWORD PTR [72+rsp] #85.35 + seto r10b #83.34 + clc #89.34 + adcx rbp, r13 #89.34 + mulx r14, r13, QWORD PTR [56+rsp] #87.35 + adcx r9, r14 #90.34 + mulx r14, r12, QWORD PTR [-40+rsp] #88.35[spill] + mov edx, ebx #92.31 + adcx r13, r14 #91.34 + mov r14d, ebx #91.34 + setb r14b #91.34 + adox edx, ebx #92.31 + adox r14, r12 #92.31 + mov r12d, ebx #92.31 + seto r12b #92.31 + clc #93.34 + adcx rsi, r11 #93.34 + mov r11, 0x0ffffffff #99.35 + mov rdx, rsi #98.35 + adcx rdi, rbp #94.34 + mov ebp, ebx #96.34 + adcx r8, r9 #95.34 + mulx r9, r11, r11 #99.35 + adcx rcx, r13 #96.34 + mulx r12, r13, rax #98.35 + setb bpl #96.34 + add r10d, r15d #97.34 + cmp ebx, ebp #97.34 + mov rbp, 0xffffffff00000001 #100.35 + mulx rbp, r15, rbp #100.35 + mov edx, ebx #101.34 + adcx r10, r14 #97.34 + mov r14d, ebx #97.34 + setb r14b #97.34 + adox edx, ebx #101.34 + mov edx, 0 #102.34 + adox r13, r9 #101.34 + mov r9d, ebx #103.34 + adox r11, rdx #102.34 + adox rbp, rdx #103.34 + seto r9b #103.34 + clc #104.31 + adcx r9, r15 #104.31 + mov r15d, ebx #105.31 + adox r15d, ebx #105.31 + adox rsi, r12 #105.31 + mov esi, ebx #109.34 + adox rdi, r13 #106.34 + adox r8, r11 #107.34 + adox rcx, rbp #108.34 + adox r10, r9 #109.34 + seto sil #109.34 + xor ebp, ebp #109.34 + add esi, r14d #110.26 + mov rsi, QWORD PTR [-16+rsp] #122.1[spill] + cmove rax, rdx #111.40 + sub rdi, rax #113.34 + mov QWORD PTR [24+rsi], rdi #122.1 + mov edi, eax #115.34 + sbb r8, rdi #115.34 + mov QWORD PTR [16+rsi], r8 #121.1 + sbb rcx, rdx #116.34 + mov rdx, 0xffffffff00000001 #118.31 + mov QWORD PTR [8+rsi], rcx #120.1 + setb bpl #116.34 + and rdx, rax #118.31 + cmp ebx, ebp #118.31 + sbb r10, rdx #118.31 + mov QWORD PTR [rsi], r10 #119.1 + pop rbp #123.106 + pop rbx #123.106 + pop r15 #123.106 + pop r14 #123.106 + pop r13 #123.106 + pop r12 #123.106 + ret #123.106 diff --git a/src/Specific/NISTP256/AMD64/IntegrationTestMontgomeryP256.v b/src/Specific/NISTP256/AMD64/IntegrationTestMontgomeryP256.v new file mode 100644 index 000000000..313b066da --- /dev/null +++ b/src/Specific/NISTP256/AMD64/IntegrationTestMontgomeryP256.v @@ -0,0 +1,108 @@ +Require Import Coq.ZArith.ZArith. +Require Import Coq.Lists.List. +Local Open Scope Z_scope. + +Require Import Crypto.Arithmetic.Core. +Require Import Crypto.Util.FixedWordSizes. +Require Import Crypto.Arithmetic.MontgomeryReduction.WordByWord.Definition. +Require Import Crypto.Arithmetic.Core. Import B. +Require Import Crypto.Specific.NISTP256.AMD64.MontgomeryP256. +Require Import Crypto.Arithmetic.PrimeFieldTheorems. +Require Import Crypto.Util.Tuple Crypto.Util.Sigma Crypto.Util.Sigma.MapProjections Crypto.Util.Sigma.Lift Crypto.Util.Notations Crypto.Util.ZRange Crypto.Util.BoundedWord. +Require Import Crypto.Util.Tactics.Head. +Require Import Crypto.Util.Tactics.MoveLetIn. +Require Import Crypto.Util.Tactics.DestructHead. +Import ListNotations. + +Require Import Crypto.Specific.IntegrationTestTemporaryMiscCommon. + +Require Import Crypto.Compilers.Z.Bounds.Pipeline. + +Section BoundedField25p5. + Local Coercion Z.of_nat : nat >-> Z. + + Let bounds : Tuple.tuple zrange sz + := Eval compute in + Tuple.repeat {| lower := 0 ; upper := r-1 |} sz. + + Let lgbitwidth := Eval compute in (Z.to_nat (Z.log2_up (Z.log2_up r))). + Let bitwidth := Eval compute in (2^lgbitwidth)%nat. + Let feZ : Type := tuple Z sz. + Let feW : Type := tuple (wordT lgbitwidth) sz. + Let feBW : Type := BoundedWord sz bitwidth bounds. + Let phi : feBW -> F m := + fun x => montgomery_to_F (Saturated.eval (Z.pos r) (BoundedWordToZ _ _ _ x)). + + (* TODO : change this to field once field isomorphism happens *) + Definition mul + : { mul : feBW -> feBW -> feBW + | forall A B, phi (mul A B) = F.mul (phi A) (phi B) }. + Proof. + lazymatch goal with + | [ |- { f | forall a b, ?phi (f a b) = @?rhs a b } ] + => apply lift2_sig with (P:=fun a b f => phi f = rhs a b) + end. + intros a b. + eexists_sig_etransitivity. all:cbv [phi]. + rewrite <- (proj2_sig mulmod_256) + by (hnf; rewrite <- (is_bounded_by_None_repeat_In_iff_lt _ _ _); destruct_head' feBW; assumption). + (*symmetry; rewrite <- (proj2_sig carry_sig); symmetry. + set (carry_mulZ := fun a b => proj1_sig carry_sig (proj1_sig mul_sig a b)). + change (proj1_sig carry_sig (proj1_sig mul_sig ?a ?b)) with (carry_mulZ a b).*) + set (mulmodZ := proj1_sig mulmod_256). + context_to_dlet_in_rhs mulmodZ; cbv [mulmodZ]. + cbv beta iota delta [mulmod_256 mulmod_256'' mulmod_256' proj1_sig Saturated.T lift2_sig fst snd runtime_add runtime_and runtime_mul runtime_opp runtime_shr sz]. + reflexivity. + sig_dlet_in_rhs_to_context. + do 2 apply (fun f => proj2_sig_map (fun THIS_NAME_MUST_NOT_BE_UNDERSCORE_TO_WORK_AROUND_CONSTR_MATCHING_ANAOMLIES___BUT_NOTE_THAT_IF_THIS_NAME_IS_LOWERCASE_A___THEN_REIFICATION_STACK_OVERFLOWS___AND_I_HAVE_NO_IDEA_WHATS_GOING_ON p => f_equal f p)). + (* jgross start here! *) + Set Ltac Profiling. + Time refine_reflectively_with_uint8_with anf. (* Finished transaction in 212.693 secs (212.576u,0.184s) (successful) *) + Show Ltac Profile. + (* total time: 19.632s + + tactic local total calls max +────────────────────────────────────────┴──────┴──────┴───────┴─────────┘ +─refine_reflectively ------------------- 0.0% 98.4% 1 19.320s +─ReflectiveTactics.do_reflective_pipelin -0.0% 96.2% 1 18.884s +─ReflectiveTactics.solve_side_conditions 1.2% 96.1% 1 18.860s +─ReflectiveTactics.do_reify ------------ 27.7% 44.0% 1 8.640s +─ReflectiveTactics.abstract_vm_compute_r 12.3% 13.9% 2 2.024s +─ReflectiveTactics.abstract_vm_compute_r 8.9% 12.2% 2 1.576s +─Reify_rhs_gen ------------------------- 0.8% 11.7% 1 2.300s +─ReflectiveTactics.renamify_rhs -------- 10.4% 11.5% 1 2.260s +─ReflectiveTactics.abstract_rhs -------- 4.6% 5.8% 1 1.148s +─clear (var_list) ---------------------- 5.2% 5.2% 57 0.184s +─eexact -------------------------------- 4.1% 4.1% 68 0.028s +─prove_interp_compile_correct ---------- 0.0% 3.4% 1 0.664s +─ReflectiveTactics.abstract_cbv_interp_r 2.7% 3.3% 1 0.648s +─unify (constr) (constr) --------------- 3.2% 3.2% 6 0.248s +─rewrite ?EtaInterp.InterpExprEta ------ 3.1% 3.1% 1 0.612s +─ReflectiveTactics.abstract_cbv_rhs ---- 2.0% 2.7% 1 0.532s +─Glue.refine_to_reflective_glue -------- 0.0% 2.2% 1 0.436s +─rewrite H ----------------------------- 2.1% 2.1% 1 0.420s + + tactic local total calls max +────────────────────────────────────────┴──────┴──────┴───────┴─────────┘ +─refine_reflectively ------------------- 0.0% 98.4% 1 19.320s + ├─ReflectiveTactics.do_reflective_pipel -0.0% 96.2% 1 18.884s + │└ReflectiveTactics.solve_side_conditio 1.2% 96.1% 1 18.860s + │ ├─ReflectiveTactics.do_reify -------- 27.7% 44.0% 1 8.640s + │ │ ├─Reify_rhs_gen ------------------- 0.8% 11.7% 1 2.300s + │ │ │ ├─prove_interp_compile_correct -- 0.0% 3.4% 1 0.664s + │ │ │ │└rewrite ?EtaInterp.InterpExprEt 3.1% 3.1% 1 0.612s + │ │ │ └─rewrite H --------------------- 2.1% 2.1% 1 0.420s + │ │ └─eexact -------------------------- 4.1% 4.1% 68 0.028s + │ ├─ReflectiveTactics.abstract_vm_compu 12.3% 13.9% 2 2.024s + │ ├─ReflectiveTactics.abstract_vm_compu 8.9% 12.2% 2 1.576s + │ ├─ReflectiveTactics.renamify_rhs ---- 10.4% 11.5% 1 2.260s + │ ├─ReflectiveTactics.abstract_rhs ---- 4.6% 5.8% 1 1.148s + │ ├─ReflectiveTactics.abstract_cbv_inte 2.7% 3.3% 1 0.648s + │ └─ReflectiveTactics.abstract_cbv_rhs 2.0% 2.7% 1 0.532s + └─Glue.refine_to_reflective_glue ------ 0.0% 2.2% 1 0.436s +*) + Time Defined. (* Finished transaction in 21.291 secs (21.231u,0.032s) (successful) *) + +Time End BoundedField25p5. (* Finished transaction in 14.666 secs (14.556u,0.111s) (successful) *) + +Print Assumptions mul. diff --git a/src/Specific/NISTP256/AMD64/IntegrationTestMontgomeryP256Display.log b/src/Specific/NISTP256/AMD64/IntegrationTestMontgomeryP256Display.log new file mode 100644 index 000000000..48cc4785d --- /dev/null +++ b/src/Specific/NISTP256/AMD64/IntegrationTestMontgomeryP256Display.log @@ -0,0 +1,113 @@ +λ x x0 : word64 * word64 * word64 * word64, +Interp-η +(λ var : Syntax.base_type → Type, + λ '(x8, x9, x7, x5, (x14, x15, x13, x11))%core, + uint64_t x17, uint64_t x18 = mulx_u64(x5, x11); + uint64_t x20, uint64_t x21 = mulx_u64(x5, x13); + uint64_t x23, uint64_t x24 = mulx_u64(x5, x15); + uint64_t x26, uint64_t x27 = mulx_u64(x5, x14); + uint64_t x29, uint8_t x30 = addcarryx_u64(0x0, x18, x20); + uint64_t x32, uint8_t x33 = addcarryx_u64(x30, x21, x23); + uint64_t x35, uint8_t x36 = addcarryx_u64(x33, x24, x26); + uint64_t x38, uint8_t _ = addcarryx_u64(0x0, x36, x27); + uint64_t x41, uint64_t x42 = mulx_u64(x17, 0xffffffffffffffffL); + uint64_t x44, uint64_t x45 = mulx_u64(x17, 0xffffffff); + uint64_t x47, uint64_t x48 = mulx_u64(x17, 0xffffffff00000001L); + uint64_t x50, uint8_t x51 = addcarryx_u64(0x0, x42, x44); + uint64_t x53, uint8_t x54 = addcarryx_u64(x51, x45, 0x0); + uint64_t x56, uint8_t x57 = addcarryx_u64(x54, 0x0, x47); + uint64_t x59, uint8_t _ = addcarryx_u64(0x0, x57, x48); + uint64_t _, uint8_t x63 = addcarryx_u64(0x0, x17, x41); + uint64_t x65, uint8_t x66 = addcarryx_u64(x63, x29, x50); + uint64_t x68, uint8_t x69 = addcarryx_u64(x66, x32, x53); + uint64_t x71, uint8_t x72 = addcarryx_u64(x69, x35, x56); + uint64_t x74, uint8_t x75 = addcarryx_u64(x72, x38, x59); + uint64_t x77, uint64_t x78 = mulx_u64(x7, x11); + uint64_t x80, uint64_t x81 = mulx_u64(x7, x13); + uint64_t x83, uint64_t x84 = mulx_u64(x7, x15); + uint64_t x86, uint64_t x87 = mulx_u64(x7, x14); + uint64_t x89, uint8_t x90 = addcarryx_u64(0x0, x78, x80); + uint64_t x92, uint8_t x93 = addcarryx_u64(x90, x81, x83); + uint64_t x95, uint8_t x96 = addcarryx_u64(x93, x84, x86); + uint64_t x98, uint8_t _ = addcarryx_u64(0x0, x96, x87); + uint64_t x101, uint8_t x102 = addcarryx_u64(0x0, x65, x77); + uint64_t x104, uint8_t x105 = addcarryx_u64(x102, x68, x89); + uint64_t x107, uint8_t x108 = addcarryx_u64(x105, x71, x92); + uint64_t x110, uint8_t x111 = addcarryx_u64(x108, x74, x95); + uint64_t x113, uint8_t x114 = addcarryx_u64(x111, x75, x98); + uint64_t x116, uint64_t x117 = mulx_u64(x101, 0xffffffffffffffffL); + uint64_t x119, uint64_t x120 = mulx_u64(x101, 0xffffffff); + uint64_t x122, uint64_t x123 = mulx_u64(x101, 0xffffffff00000001L); + uint64_t x125, uint8_t x126 = addcarryx_u64(0x0, x117, x119); + uint64_t x128, uint8_t x129 = addcarryx_u64(x126, x120, 0x0); + uint64_t x131, uint8_t x132 = addcarryx_u64(x129, 0x0, x122); + uint64_t x134, uint8_t _ = addcarryx_u64(0x0, x132, x123); + uint64_t _, uint8_t x138 = addcarryx_u64(0x0, x101, x116); + uint64_t x140, uint8_t x141 = addcarryx_u64(x138, x104, x125); + uint64_t x143, uint8_t x144 = addcarryx_u64(x141, x107, x128); + uint64_t x146, uint8_t x147 = addcarryx_u64(x144, x110, x131); + uint64_t x149, uint8_t x150 = addcarryx_u64(x147, x113, x134); + uint8_t x151 = x150 + x114; + uint64_t x153, uint64_t x154 = mulx_u64(x9, x11); + uint64_t x156, uint64_t x157 = mulx_u64(x9, x13); + uint64_t x159, uint64_t x160 = mulx_u64(x9, x15); + uint64_t x162, uint64_t x163 = mulx_u64(x9, x14); + uint64_t x165, uint8_t x166 = addcarryx_u64(0x0, x154, x156); + uint64_t x168, uint8_t x169 = addcarryx_u64(x166, x157, x159); + uint64_t x171, uint8_t x172 = addcarryx_u64(x169, x160, x162); + uint64_t x174, uint8_t _ = addcarryx_u64(0x0, x172, x163); + uint64_t x177, uint8_t x178 = addcarryx_u64(0x0, x140, x153); + uint64_t x180, uint8_t x181 = addcarryx_u64(x178, x143, x165); + uint64_t x183, uint8_t x184 = addcarryx_u64(x181, x146, x168); + uint64_t x186, uint8_t x187 = addcarryx_u64(x184, x149, x171); + uint64_t x189, uint8_t x190 = addcarryx_u64(x187, x151, x174); + uint64_t x192, uint64_t x193 = mulx_u64(x177, 0xffffffffffffffffL); + uint64_t x195, uint64_t x196 = mulx_u64(x177, 0xffffffff); + uint64_t x198, uint64_t x199 = mulx_u64(x177, 0xffffffff00000001L); + uint64_t x201, uint8_t x202 = addcarryx_u64(0x0, x193, x195); + uint64_t x204, uint8_t x205 = addcarryx_u64(x202, x196, 0x0); + uint64_t x207, uint8_t x208 = addcarryx_u64(x205, 0x0, x198); + uint64_t x210, uint8_t _ = addcarryx_u64(0x0, x208, x199); + uint64_t _, uint8_t x214 = addcarryx_u64(0x0, x177, x192); + uint64_t x216, uint8_t x217 = addcarryx_u64(x214, x180, x201); + uint64_t x219, uint8_t x220 = addcarryx_u64(x217, x183, x204); + uint64_t x222, uint8_t x223 = addcarryx_u64(x220, x186, x207); + uint64_t x225, uint8_t x226 = addcarryx_u64(x223, x189, x210); + uint8_t x227 = x226 + x190; + uint64_t x229, uint64_t x230 = mulx_u64(x8, x11); + uint64_t x232, uint64_t x233 = mulx_u64(x8, x13); + uint64_t x235, uint64_t x236 = mulx_u64(x8, x15); + uint64_t x238, uint64_t x239 = mulx_u64(x8, x14); + uint64_t x241, uint8_t x242 = addcarryx_u64(0x0, x230, x232); + uint64_t x244, uint8_t x245 = addcarryx_u64(x242, x233, x235); + uint64_t x247, uint8_t x248 = addcarryx_u64(x245, x236, x238); + uint64_t x250, uint8_t _ = addcarryx_u64(0x0, x248, x239); + uint64_t x253, uint8_t x254 = addcarryx_u64(0x0, x216, x229); + uint64_t x256, uint8_t x257 = addcarryx_u64(x254, x219, x241); + uint64_t x259, uint8_t x260 = addcarryx_u64(x257, x222, x244); + uint64_t x262, uint8_t x263 = addcarryx_u64(x260, x225, x247); + uint64_t x265, uint8_t x266 = addcarryx_u64(x263, x227, x250); + uint64_t x268, uint64_t x269 = mulx_u64(x253, 0xffffffffffffffffL); + uint64_t x271, uint64_t x272 = mulx_u64(x253, 0xffffffff); + uint64_t x274, uint64_t x275 = mulx_u64(x253, 0xffffffff00000001L); + uint64_t x277, uint8_t x278 = addcarryx_u64(0x0, x269, x271); + uint64_t x280, uint8_t x281 = addcarryx_u64(x278, x272, 0x0); + uint64_t x283, uint8_t x284 = addcarryx_u64(x281, 0x0, x274); + uint64_t x286, uint8_t _ = addcarryx_u64(0x0, x284, x275); + uint64_t _, uint8_t x290 = addcarryx_u64(0x0, x253, x268); + uint64_t x292, uint8_t x293 = addcarryx_u64(x290, x256, x277); + uint64_t x295, uint8_t x296 = addcarryx_u64(x293, x259, x280); + uint64_t x298, uint8_t x299 = addcarryx_u64(x296, x262, x283); + uint64_t x301, uint8_t x302 = addcarryx_u64(x299, x265, x286); + uint8_t x303 = x302 + x266; + uint64_t x304 = (uint64_t) (x303 == 0 ? 0x0 : 0xffffffffffffffffL); + uint64_t x305 = x304 & 0xffffffffffffffffL; + uint64_t x307, uint8_t x308 = subborrow_u64(0x0, x292, x305); + uint64_t x309 = x304 & 0xffffffff; + uint64_t x311, uint8_t x312 = subborrow_u64(x308, x295, x309); + uint64_t x314, uint8_t x315 = subborrow_u64(x312, x298, 0x0); + uint64_t x316 = x304 & 0xffffffff00000001L; + uint64_t x318, uint8_t _ = subborrow_u64(x315, x301, x316); + (Return x318, Return x314, Return x311, Return x307)) +(x, x0)%core + : word64 * word64 * word64 * word64 → word64 * word64 * word64 * word64 → ReturnType (uint64_t * uint64_t * uint64_t * uint64_t) diff --git a/src/Specific/NISTP256/AMD64/IntegrationTestMontgomeryP256Display.v b/src/Specific/NISTP256/AMD64/IntegrationTestMontgomeryP256Display.v new file mode 100644 index 000000000..590879c0c --- /dev/null +++ b/src/Specific/NISTP256/AMD64/IntegrationTestMontgomeryP256Display.v @@ -0,0 +1,4 @@ +Require Import Crypto.Specific.NISTP256.AMD64.IntegrationTestMontgomeryP256. +Require Import Crypto.Specific.IntegrationTestDisplayCommon. + +Check display mul. diff --git a/src/Specific/NISTP256/AMD64/MontgomeryP256.v b/src/Specific/NISTP256/AMD64/MontgomeryP256.v new file mode 100644 index 000000000..dfc57cfdc --- /dev/null +++ b/src/Specific/NISTP256/AMD64/MontgomeryP256.v @@ -0,0 +1,157 @@ +Require Import Coq.micromega.Lia. +Require Import Crypto.Arithmetic.MontgomeryReduction.WordByWord.Definition. +Require Import Crypto.Arithmetic.MontgomeryReduction.WordByWord.Proofs. +Require Import Crypto.Arithmetic.Core. Import B. +Require Import Crypto.Util.Sigma.Lift. +Require Import Coq.ZArith.BinInt. +Require Import Coq.PArith.BinPos. +Require Import Crypto.Util.LetIn. +Require Import Crypto.Util.ZUtil.ModInv. +Require Import Crypto.Util.Tactics.DestructHead. + +Definition wt (i:nat) : Z := Z.shiftl 1 (64*Z.of_nat i). +Definition r := Eval compute in (2^64)%positive. +Definition sz := 4%nat. +Definition m : positive := 2^256-2^224+2^192+2^96-1. +Definition p256 := + Eval vm_compute in + ((Positional.encode (modulo:=modulo) (div:=div) (n:=sz) wt (Z.pos m))). +Definition r' := Eval vm_compute in Zpow_facts.Zpow_mod (Z.pos r) (Z.pos m - 2) (Z.pos m). +Definition r'_correct := eq_refl : ((Z.pos r * r') mod (Z.pos m) = 1)%Z. +Definition m' : Z := Eval vm_compute in Option.invert_Some (Z.modinv_fueled 10 (-Z.pos m) (Z.pos r)). +Definition m'_correct := eq_refl : ((Z.pos m * m') mod (Z.pos r) = (-1) mod Z.pos r)%Z. + +Definition mulmod_256' : { f:Tuple.tuple Z sz -> Tuple.tuple Z sz -> Tuple.tuple Z sz + | forall (A B : Tuple.tuple Z sz), + f A B = + (redc (r:=r)(R_numlimbs:=sz) p256 A B m') + }. +Proof. + eapply (lift2_sig (fun A B c => c = _)); eexists. + cbv -[Definitions.Z.add_get_carry Definitions.Z.mul_split_at_bitwidth Definitions.Z.zselect runtime_add runtime_mul runtime_and runtime_opp Let_In]. + (* + cbv [ + r wt sz p256 + CPSUtil.Tuple.tl_cps CPSUtil.Tuple.hd_cps + CPSUtil.to_list_cps CPSUtil.to_list'_cps CPSUtil.to_list_cps' CPSUtil.flat_map_cps CPSUtil.fold_right_cps + CPSUtil.map_cps CPSUtil.Tuple.left_append_cps CPSUtil.firstn_cps CPSUtil.combine_cps CPSUtil.on_tuple_cps CPSUtil.update_nth_cps CPSUtil.from_list_default_cps CPSUtil.from_list_default'_cps + fst snd length List.seq List.hd List.app + redc redc_cps redc_loop_cps redc_body_cps + Positional.to_associational_cps + Saturated.divmod_cps + Saturated.scmul_cps + Saturated.uweight + Saturated.Columns.mul_cps + Saturated.Associational.mul_cps + (*Z.of_nat Pos.of_succ_nat Nat.pred + Z.pow Z.pow_pos Z.mul Pos.iter Pos.mul Pos.succ*) + Tuple.hd Tuple.append Tuple.tl Tuple.hd' Tuple.tl' CPSUtil.Tuple.left_tl_cps CPSUtil.Tuple.left_hd_cps CPSUtil.Tuple.hd_cps CPSUtil.Tuple.tl_cps + Saturated.Columns.from_associational_cps + Saturated.Associational.multerm_cps + Saturated.Columns.compact_cps + Saturated.Columns.compact_step_cps + Saturated.Columns.compact_digit_cps + Saturated.drop_high_cps + Saturated.add_cps + Saturated.Columns.add_cps + Saturated.Columns.cons_to_nth_cps + Nat.pred + Saturated.join0 + Saturated.join0_cps CPSUtil.Tuple.left_append_cps + CPSUtil.Tuple.mapi_with_cps + id + Positional.zeros Saturated.zero Saturated.Columns.nils Tuple.repeat Tuple.append + Positional.place_cps + CPSUtil.Tuple.mapi_with'_cps Tuple.hd Tuple.hd' Tuple.tl Tuple.tl' CPSUtil.Tuple.hd_cps CPSUtil.Tuple.tl_cps fst snd + Z.of_nat fst snd Z.pow Z.pow_pos Pos.of_succ_nat Z.div Z.mul Pos.mul Z.modulo Z.div_eucl Z.pos_div_eucl Z.leb Z.compare Pos.compare_cont Pos.compare Z.pow_pos Pos.iter Z.mul Pos.succ Z.ltb Z.gtb Z.geb Z.div Z.sub Z.pos_sub Z.add Z.opp Z.double + Decidable.dec Decidable.dec_eq_Z Z.eq_dec Z_rec Z_rec Z_rect + Positional.zeros Saturated.zero Saturated.Columns.nils Tuple.repeat Tuple.append + (* + Saturated.Associational.multerm_cps + Saturated.Columns.from_associational_cps Positional.place_cps Saturated.Columns.cons_to_nth_cps Saturated.Columns.nils +Tuple.append +Tuple.from_list Tuple.from_list' +Tuple.from_list_default Tuple.from_list_default' +Tuple.hd +Tuple.repeat +Tuple.tl +Tuple.tuple *) + (* Saturated.add_cps Saturated.divmod_cps Saturated.drop_high_cps Saturated.scmul_cps Saturated.zero Saturated.Columns.mul_cps Saturated.Columns.add_cps Saturated.uweight Saturated.T *) + (* + CPSUtil.to_list_cps CPSUtil.to_list'_cps CPSUtil.to_list_cps' +Positional.zeros +Tuple.to_list +Tuple.to_list' +List.hd +List.tl +Nat.max +Positional.to_associational_cps +Z.of_nat *) + ]. + Unset Printing Notations. + + (* cbv -[runtime_add runtime_mul LetIn.Let_In Definitions.Z.add_get_carry_full Definitions.Z.mul_split]. *) + + (* basesystem_partial_evaluation_RHS. *) + *) + reflexivity. +Defined. + + +Definition mulmod_256'' : { f:Tuple.tuple Z sz -> Tuple.tuple Z sz -> Tuple.tuple Z sz + | forall (A B : Tuple.tuple Z sz), + Saturated.small (Z.pos r) A -> Saturated.small (Z.pos r) B -> + let eval := Saturated.eval (Z.pos r) in + (Saturated.small (Z.pos r) (f A B) + /\ (eval B < eval p256 -> 0 <= eval (f A B) < eval p256) + /\ (eval (f A B) mod Z.pos m + = (eval A * eval B * r'^(Z.of_nat sz)) mod Z.pos m))%Z + }. +Proof. + exists (proj1_sig mulmod_256'). + abstract ( + intros; rewrite (proj2_sig mulmod_256'); + split; [ | split ]; + [ apply small_redc with (ri:=r') | apply redc_bound_N with (ri:=r') | apply redc_mod_N ]; + try match goal with + | _ => assumption + | [ |- _ = _ :> Z ] => vm_compute; reflexivity + | _ => reflexivity + | [ |- Saturated.small (Z.pos r) p256 ] + => hnf; cbv [Tuple.to_list sz p256 r Tuple.to_list' List.In]; intros; destruct_head'_or; + subst; try lia + | [ |- Saturated.eval (Z.pos r) p256 <> 0%Z ] + => vm_compute; lia + end + ). +Defined. + +Import ModularArithmetic. + +(** TODO: MOVE ME *) +Definition montgomery_to_F (v : Z) : F m + := (F.of_Z m v * F.of_Z m (r'^Z.of_nat sz)%Z)%F. + +Definition mulmod_256 : { f:Tuple.tuple Z sz -> Tuple.tuple Z sz -> Tuple.tuple Z sz + | forall (A : Tuple.tuple Z sz) (_ : Saturated.small (Z.pos r) A) + (B : Tuple.tuple Z sz) (_ : Saturated.small (Z.pos r) B), + let eval := Saturated.eval (Z.pos r) in + montgomery_to_F (eval (f A B)) + = (montgomery_to_F (eval A) * montgomery_to_F (eval B))%F }. +Proof. + exists (proj1_sig mulmod_256''). + abstract ( + intros A Asm B Bsm; + pose proof (proj2_sig mulmod_256'' A B Asm Bsm) as H; + cbv zeta in *; + (*split; try solve [ destruct_head'_and; assumption ];*) + rewrite ModularArithmeticTheorems.F.eq_of_Z_iff in H; + unfold montgomery_to_F; + destruct H as [H1 [H2 H3]]; + rewrite H3; + rewrite <- !ModularArithmeticTheorems.F.of_Z_mul; + f_equal; nia + ). +Defined. + +Print Assumptions mulmod_256. |