diff options
Diffstat (limited to 'bench/btl/libs/hand_vec/hand_vec_interface.hh')
-rwxr-xr-x | bench/btl/libs/hand_vec/hand_vec_interface.hh | 515 |
1 files changed, 426 insertions, 89 deletions
diff --git a/bench/btl/libs/hand_vec/hand_vec_interface.hh b/bench/btl/libs/hand_vec/hand_vec_interface.hh index 5291aac55..538c03ba6 100755 --- a/bench/btl/libs/hand_vec/hand_vec_interface.hh +++ b/bench/btl/libs/hand_vec/hand_vec_interface.hh @@ -68,140 +68,477 @@ public : #endif } - static inline void matrix_vector_product(const gene_matrix & A, const gene_vector & B, gene_vector & X, int N) +static inline void matrix_vector_product(const gene_matrix & A, const gene_vector & B, gene_vector & X, int N) { + asm("#begin matrix_vector_product"); int AN = (N/PacketSize)*PacketSize; + int ANP = (AN/(4*PacketSize))*4*PacketSize; + int bound = (N/4)*4; for (int i=0;i<N;i++) X[i] = 0; - for (int i=0;i<N;i++) + + for (int i=0;i<bound;i+=4) { - real tmp = B[i]; - Packet ptmp = ei_pset1(tmp); - int iN = i*N; + real tmp0 = B[i]; + Packet ptmp0 = ei_pset1(tmp0); + real tmp1 = B[i+1]; + Packet ptmp1 = ei_pset1(tmp1); + real tmp2 = B[i+2]; + Packet ptmp2 = ei_pset1(tmp2); + real tmp3 = B[i+3]; + Packet ptmp3 = ei_pset1(tmp3); + int iN0 = i*N; + int iN1 = (i+1)*N; + int iN2 = (i+2)*N; + int iN3 = (i+3)*N; if (AN>0) { - bool aligned = (iN % PacketSize) == 0; - if (aligned) +// int aligned0 = (iN0 % PacketSize); + int aligned1 = (iN1 % PacketSize); + + if (aligned1==0) { - #ifdef PEELING - int ANP = (AN/(8*PacketSize))*8*PacketSize; - for (int j = 0;j<ANP;j+=PacketSize*8) + for (int j = 0;j<AN;j+=PacketSize) { - ei_pstore(&X[j], ei_padd(ei_pload(&X[j]), ei_pmul(ptmp,ei_pload(&A[j+iN])))); - ei_pstore(&X[j+PacketSize], ei_padd(ei_pload(&X[j+PacketSize]), ei_pmul(ptmp,ei_pload(&A[j+PacketSize+iN])))); - ei_pstore(&X[j+2*PacketSize], ei_padd(ei_pload(&X[j+2*PacketSize]), ei_pmul(ptmp,ei_pload(&A[j+2*PacketSize+iN])))); - ei_pstore(&X[j+3*PacketSize], ei_padd(ei_pload(&X[j+3*PacketSize]), ei_pmul(ptmp,ei_pload(&A[j+3*PacketSize+iN])))); - ei_pstore(&X[j+4*PacketSize], ei_padd(ei_pload(&X[j+4*PacketSize]), ei_pmul(ptmp,ei_pload(&A[j+4*PacketSize+iN])))); - ei_pstore(&X[j+5*PacketSize], ei_padd(ei_pload(&X[j+5*PacketSize]), ei_pmul(ptmp,ei_pload(&A[j+5*PacketSize+iN])))); - ei_pstore(&X[j+6*PacketSize], ei_padd(ei_pload(&X[j+6*PacketSize]), ei_pmul(ptmp,ei_pload(&A[j+6*PacketSize+iN])))); - ei_pstore(&X[j+7*PacketSize], ei_padd(ei_pload(&X[j+7*PacketSize]), ei_pmul(ptmp,ei_pload(&A[j+7*PacketSize+iN])))); + ei_pstore(&X[j], + ei_padd(ei_pload(&X[j]), + ei_padd( + ei_padd(ei_pmul(ptmp0,ei_pload(&A[j+iN0])),ei_pmul(ptmp1,ei_pload(&A[j+iN1]))), + ei_padd(ei_pmul(ptmp2,ei_pload(&A[j+iN2])),ei_pmul(ptmp3,ei_pload(&A[j+iN3]))) ))); } - for (int j = ANP;j<AN;j+=PacketSize) - ei_pstore(&X[j], ei_padd(ei_pload(&X[j]), ei_pmul(ptmp,ei_pload(&A[j+iN])))); - #else + } + else if (aligned1==2) + { for (int j = 0;j<AN;j+=PacketSize) - ei_pstore(&X[j], ei_padd(ei_pload(&X[j]), ei_pmul(ptmp,ei_pload(&A[j+iN])))); - #endif + { + ei_pstore(&X[j], + ei_padd(ei_pload(&X[j]), + ei_padd( + ei_padd(ei_pmul(ptmp0,ei_pload(&A[j+iN0])),ei_pmul(ptmp1,ei_ploadu(&A[j+iN1]))), + ei_padd(ei_pmul(ptmp2,ei_pload(&A[j+iN2])),ei_pmul(ptmp3,ei_ploadu(&A[j+iN3]))) ))); + } } else { - #ifdef PEELING - int ANP = (AN/(8*PacketSize))*8*PacketSize; - for (int j = 0;j<ANP;j+=PacketSize*8) + for (int j = 0;j<ANP;j+=4*PacketSize) { - ei_pstore(&X[j], ei_padd(ei_pload(&X[j]), ei_pmul(ptmp,ei_ploadu(&A[j+iN])))); - ei_pstore(&X[j+PacketSize], ei_padd(ei_pload(&X[j+PacketSize]), ei_pmul(ptmp,ei_ploadu(&A[j+PacketSize+iN])))); - ei_pstore(&X[j+2*PacketSize], ei_padd(ei_pload(&X[j+2*PacketSize]), ei_pmul(ptmp,ei_ploadu(&A[j+2*PacketSize+iN])))); - ei_pstore(&X[j+3*PacketSize], ei_padd(ei_pload(&X[j+3*PacketSize]), ei_pmul(ptmp,ei_ploadu(&A[j+3*PacketSize+iN])))); - ei_pstore(&X[j+4*PacketSize], ei_padd(ei_pload(&X[j+4*PacketSize]), ei_pmul(ptmp,ei_ploadu(&A[j+4*PacketSize+iN])))); - ei_pstore(&X[j+5*PacketSize], ei_padd(ei_pload(&X[j+5*PacketSize]), ei_pmul(ptmp,ei_ploadu(&A[j+5*PacketSize+iN])))); - ei_pstore(&X[j+6*PacketSize], ei_padd(ei_pload(&X[j+6*PacketSize]), ei_pmul(ptmp,ei_ploadu(&A[j+6*PacketSize+iN])))); - ei_pstore(&X[j+7*PacketSize], ei_padd(ei_pload(&X[j+7*PacketSize]), ei_pmul(ptmp,ei_ploadu(&A[j+7*PacketSize+iN])))); + ei_pstore(&X[j], + ei_padd(ei_pload(&X[j]), + ei_padd( + ei_padd(ei_pmul(ptmp0,ei_pload(&A[j+iN0])),ei_pmul(ptmp1,ei_ploadu(&A[j+iN1]))), + ei_padd(ei_pmul(ptmp2,ei_ploadu(&A[j+iN2])),ei_pmul(ptmp3,ei_ploadu(&A[j+iN3]))) ))); + + ei_pstore(&X[j+PacketSize], + ei_padd(ei_pload(&X[j+PacketSize]), + ei_padd( + ei_padd(ei_pmul(ptmp0,ei_pload(&A[j+PacketSize+iN0])),ei_pmul(ptmp1,ei_ploadu(&A[j+PacketSize+iN1]))), + ei_padd(ei_pmul(ptmp2,ei_ploadu(&A[j+PacketSize+iN2])),ei_pmul(ptmp3,ei_ploadu(&A[j+PacketSize+iN3]))) ))); + + ei_pstore(&X[j+2*PacketSize], + ei_padd(ei_pload(&X[j+2*PacketSize]), + ei_padd( + ei_padd(ei_pmul(ptmp0,ei_pload(&A[j+2*PacketSize+iN0])),ei_pmul(ptmp1,ei_ploadu(&A[j+2*PacketSize+iN1]))), + ei_padd(ei_pmul(ptmp2,ei_ploadu(&A[j+2*PacketSize+iN2])),ei_pmul(ptmp3,ei_ploadu(&A[j+2*PacketSize+iN3]))) ))); + + ei_pstore(&X[j+3*PacketSize], + ei_padd(ei_pload(&X[j+3*PacketSize]), + ei_padd( + ei_padd(ei_pmul(ptmp0,ei_pload(&A[j+3*PacketSize+iN0])),ei_pmul(ptmp1,ei_ploadu(&A[j+3*PacketSize+iN1]))), + ei_padd(ei_pmul(ptmp2,ei_ploadu(&A[j+3*PacketSize+iN2])),ei_pmul(ptmp3,ei_ploadu(&A[j+3*PacketSize+iN3]))) ))); + } for (int j = ANP;j<AN;j+=PacketSize) - ei_pstore(&X[j], ei_padd(ei_pload(&X[j]), ei_pmul(ptmp,ei_ploadu(&A[j+iN])))); - #else - for (int j = 0;j<AN;j+=PacketSize) - ei_pstore(&X[j], ei_padd(ei_pload(&X[j]), ei_pmul(ptmp,ei_ploadu(&A[j+iN])))); - #endif + ei_pstore(&X[j], + ei_padd(ei_pload(&X[j]), + ei_padd( + ei_padd(ei_pmul(ptmp0,ei_pload(&A[j+iN0])),ei_pmul(ptmp1,ei_ploadu(&A[j+iN1]))), + ei_padd(ei_pmul(ptmp2,ei_ploadu(&A[j+iN2])),ei_pmul(ptmp3,ei_ploadu(&A[j+iN3]))) ))); } } // process remaining scalars for (int j=AN;j<N;j++) - X[j] += tmp * A[j+iN]; + X[j] += tmp0 * A[j+iN0] + tmp1 * A[j+iN1]; } + for (int i=bound;i<N;i++) + { + real tmp0 = B[i]; + Packet ptmp0 = ei_pset1(tmp0); + int iN0 = i*N; + if (AN>0) + { + bool aligned0 = (iN0 % PacketSize) == 0; + if (aligned0) + for (int j = 0;j<AN;j+=PacketSize) + ei_pstore(&X[j], ei_padd(ei_pmul(ptmp0,ei_pload(&A[j+iN0])),ei_pload(&X[j]))); + else + for (int j = 0;j<AN;j+=PacketSize) + ei_pstore(&X[j], ei_padd(ei_pmul(ptmp0,ei_ploadu(&A[j+iN0])),ei_pload(&X[j]))); + } + // process remaining scalars + for (int j=AN;j<N;j++) + X[j] += tmp0 * A[j+iN0]; + } + asm("#end matrix_vector_product"); } - static inline void atv_product(const gene_matrix & A, const gene_vector & B, gene_vector & X, int N) +// static inline void matrix_vector_product(const gene_matrix & A, const gene_vector & B, gene_vector & X, int N) +// { +// asm("#begin matrix_vector_product"); +// int AN = (N/PacketSize)*PacketSize; +// for (int i=0;i<N;i++) +// X[i] = 0; +// +// for (int i=0;i<N;i+=2) +// { +// real tmp0 = B[i]; +// Packet ptmp0 = ei_pset1(tmp0); +// real tmp1 = B[i+1]; +// Packet ptmp1 = ei_pset1(tmp1); +// int iN0 = i*N; +// int iN1 = (i+1)*N; +// if (AN>0) +// { +// bool aligned0 = (iN0 % PacketSize) == 0; +// bool aligned1 = (iN1 % PacketSize) == 0; +// +// if (aligned0 && aligned1) +// { +// for (int j = 0;j<AN;j+=PacketSize) +// { +// ei_pstore(&X[j], +// ei_padd(ei_pmul(ptmp0,ei_pload(&A[j+iN0])), +// ei_padd(ei_pmul(ptmp1,ei_pload(&A[j+iN1])),ei_pload(&X[j])))); +// } +// } +// else if (aligned0) +// { +// for (int j = 0;j<AN;j+=PacketSize) +// { +// ei_pstore(&X[j], +// ei_padd(ei_pmul(ptmp0,ei_pload(&A[j+iN0])), +// ei_padd(ei_pmul(ptmp1,ei_ploadu(&A[j+iN1])),ei_pload(&X[j])))); +// } +// } +// else if (aligned1) +// { +// for (int j = 0;j<AN;j+=PacketSize) +// { +// ei_pstore(&X[j], +// ei_padd(ei_pmul(ptmp0,ei_ploadu(&A[j+iN0])), +// ei_padd(ei_pmul(ptmp1,ei_pload(&A[j+iN1])),ei_pload(&X[j])))); +// } +// } +// else +// { +// int ANP = (AN/(4*PacketSize))*4*PacketSize; +// for (int j = 0;j<ANP;j+=4*PacketSize) +// { +// ei_pstore(&X[j], +// ei_padd(ei_pmul(ptmp0,ei_ploadu(&A[j+iN0])), +// ei_padd(ei_pmul(ptmp1,ei_ploadu(&A[j+iN1])),ei_pload(&X[j])))); +// +// ei_pstore(&X[j+PacketSize], +// ei_padd(ei_pmul(ptmp0,ei_ploadu(&A[j+PacketSize+iN0])), +// ei_padd(ei_pmul(ptmp1,ei_ploadu(&A[j+PacketSize+iN1])),ei_pload(&X[j+PacketSize])))); +// +// ei_pstore(&X[j+2*PacketSize], +// ei_padd(ei_pmul(ptmp0,ei_ploadu(&A[j+2*PacketSize+iN0])), +// ei_padd(ei_pmul(ptmp1,ei_ploadu(&A[j+2*PacketSize+iN1])),ei_pload(&X[j+2*PacketSize])))); +// +// ei_pstore(&X[j+3*PacketSize], +// ei_padd(ei_pmul(ptmp0,ei_ploadu(&A[j+3*PacketSize+iN0])), +// ei_padd(ei_pmul(ptmp1,ei_ploadu(&A[j+3*PacketSize+iN1])),ei_pload(&X[j+3*PacketSize])))); +// } +// for (int j = ANP;j<AN;j+=PacketSize) +// ei_pstore(&X[j], +// ei_padd(ei_pmul(ptmp0,ei_ploadu(&A[j+iN0])), +// ei_padd(ei_pmul(ptmp1,ei_ploadu(&A[j+iN1])),ei_pload(&X[j])))); +// } +// } +// // process remaining scalars +// for (int j=AN;j<N;j++) +// X[j] += tmp0 * A[j+iN0] + tmp1 * A[j+iN1]; +// } +// int remaining = (N/2)*2; +// for (int i=remaining;i<N;i++) +// { +// real tmp0 = B[i]; +// Packet ptmp0 = ei_pset1(tmp0); +// int iN0 = i*N; +// if (AN>0) +// { +// bool aligned0 = (iN0 % PacketSize) == 0; +// if (aligned0) +// for (int j = 0;j<AN;j+=PacketSize) +// ei_pstore(&X[j], ei_padd(ei_pmul(ptmp0,ei_pload(&A[j+iN0])),ei_pload(&X[j]))); +// else +// for (int j = 0;j<AN;j+=PacketSize) +// ei_pstore(&X[j], ei_padd(ei_pmul(ptmp0,ei_ploadu(&A[j+iN0])),ei_pload(&X[j]))); +// } +// // process remaining scalars +// for (int j=AN;j<N;j++) +// X[j] += tmp0 * A[j+iN0]; +// } +// asm("#end matrix_vector_product"); +// } + +// static inline void matrix_vector_product(const gene_matrix & A, const gene_vector & B, gene_vector & X, int N) +// { +// asm("#begin matrix_vector_product"); +// int AN = (N/PacketSize)*PacketSize; +// for (int i=0;i<N;i++) +// X[i] = 0; +// for (int i=0;i<N;i++) +// { +// real tmp = B[i]; +// Packet ptmp = ei_pset1(tmp); +// int iN = i*N; +// if (AN>0) +// { +// bool aligned = (iN % PacketSize) == 0; +// if (aligned) +// { +// #ifdef PEELING +// Packet A0, A1, A2, X0, X1, X2; +// int ANP = (AN/(8*PacketSize))*8*PacketSize; +// for (int j = 0;j<ANP;j+=PacketSize*8) +// { +// A0 = ei_pload(&A[j+iN]); +// X0 = ei_pload(&X[j]); +// A1 = ei_pload(&A[j+PacketSize+iN]); +// X1 = ei_pload(&X[j+PacketSize]); +// A2 = ei_pload(&A[j+2*PacketSize+iN]); +// X2 = ei_pload(&X[j+2*PacketSize]); +// ei_pstore(&X[j], ei_padd(X0, ei_pmul(ptmp,A0))); +// A0 = ei_pload(&A[j+3*PacketSize+iN]); +// X0 = ei_pload(&X[j+3*PacketSize]); +// ei_pstore(&X[j+PacketSize], ei_padd(ei_pload(&X1), ei_pmul(ptmp,A1))); +// A1 = ei_pload(&A[j+4*PacketSize+iN]); +// X1 = ei_pload(&X[j+4*PacketSize]); +// ei_pstore(&X[j+2*PacketSize], ei_padd(ei_pload(&X2), ei_pmul(ptmp,A2))); +// A2 = ei_pload(&A[j+5*PacketSize+iN]); +// X2 = ei_pload(&X[j+5*PacketSize]); +// ei_pstore(&X[j+3*PacketSize], ei_padd(ei_pload(&X0), ei_pmul(ptmp,A0))); +// A0 = ei_pload(&A[j+6*PacketSize+iN]); +// X0 = ei_pload(&X[j+6*PacketSize]); +// ei_pstore(&X[j+4*PacketSize], ei_padd(ei_pload(&X1), ei_pmul(ptmp,A1))); +// A1 = ei_pload(&A[j+7*PacketSize+iN]); +// X1 = ei_pload(&X[j+7*PacketSize]); +// ei_pstore(&X[j+5*PacketSize], ei_padd(ei_pload(&X2), ei_pmul(ptmp,A2))); +// ei_pstore(&X[j+6*PacketSize], ei_padd(ei_pload(&X0), ei_pmul(ptmp,A0))); +// ei_pstore(&X[j+7*PacketSize], ei_padd(ei_pload(&X1), ei_pmul(ptmp,A1))); +// // +// // ei_pstore(&X[j], ei_padd(ei_pload(&X[j]), ei_pmul(ptmp,ei_pload(&A[j+iN])))); +// // ei_pstore(&X[j+PacketSize], ei_padd(ei_pload(&X[j+PacketSize]), ei_pmul(ptmp,ei_pload(&A[j+PacketSize+iN])))); +// // ei_pstore(&X[j+2*PacketSize], ei_padd(ei_pload(&X[j+2*PacketSize]), ei_pmul(ptmp,ei_pload(&A[j+2*PacketSize+iN])))); +// // ei_pstore(&X[j+3*PacketSize], ei_padd(ei_pload(&X[j+3*PacketSize]), ei_pmul(ptmp,ei_pload(&A[j+3*PacketSize+iN])))); +// // ei_pstore(&X[j+4*PacketSize], ei_padd(ei_pload(&X[j+4*PacketSize]), ei_pmul(ptmp,ei_pload(&A[j+4*PacketSize+iN])))); +// // ei_pstore(&X[j+5*PacketSize], ei_padd(ei_pload(&X[j+5*PacketSize]), ei_pmul(ptmp,ei_pload(&A[j+5*PacketSize+iN])))); +// // ei_pstore(&X[j+6*PacketSize], ei_padd(ei_pload(&X[j+6*PacketSize]), ei_pmul(ptmp,ei_pload(&A[j+6*PacketSize+iN])))); +// // ei_pstore(&X[j+7*PacketSize], ei_padd(ei_pload(&X[j+7*PacketSize]), ei_pmul(ptmp,ei_pload(&A[j+7*PacketSize+iN])))); +// } +// for (int j = ANP;j<AN;j+=PacketSize) +// ei_pstore(&X[j], ei_padd(ei_pload(&X[j]), ei_pmul(ptmp,ei_pload(&A[j+iN])))); +// #else +// for (int j = 0;j<AN;j+=PacketSize) +// ei_pstore(&X[j], ei_padd(ei_pload(&X[j]), ei_pmul(ptmp,ei_pload(&A[j+iN])))); +// #endif +// } +// else +// { +// #ifdef PEELING +// int ANP = (AN/(8*PacketSize))*8*PacketSize; +// for (int j = 0;j<ANP;j+=PacketSize*8) +// { +// ei_pstore(&X[j], ei_padd(ei_pload(&X[j]), ei_pmul(ptmp,ei_ploadu(&A[j+iN])))); +// ei_pstore(&X[j+PacketSize], ei_padd(ei_pload(&X[j+PacketSize]), ei_pmul(ptmp,ei_ploadu(&A[j+PacketSize+iN])))); +// ei_pstore(&X[j+2*PacketSize], ei_padd(ei_pload(&X[j+2*PacketSize]), ei_pmul(ptmp,ei_ploadu(&A[j+2*PacketSize+iN])))); +// ei_pstore(&X[j+3*PacketSize], ei_padd(ei_pload(&X[j+3*PacketSize]), ei_pmul(ptmp,ei_ploadu(&A[j+3*PacketSize+iN])))); +// ei_pstore(&X[j+4*PacketSize], ei_padd(ei_pload(&X[j+4*PacketSize]), ei_pmul(ptmp,ei_ploadu(&A[j+4*PacketSize+iN])))); +// ei_pstore(&X[j+5*PacketSize], ei_padd(ei_pload(&X[j+5*PacketSize]), ei_pmul(ptmp,ei_ploadu(&A[j+5*PacketSize+iN])))); +// ei_pstore(&X[j+6*PacketSize], ei_padd(ei_pload(&X[j+6*PacketSize]), ei_pmul(ptmp,ei_ploadu(&A[j+6*PacketSize+iN])))); +// ei_pstore(&X[j+7*PacketSize], ei_padd(ei_pload(&X[j+7*PacketSize]), ei_pmul(ptmp,ei_ploadu(&A[j+7*PacketSize+iN])))); +// } +// for (int j = ANP;j<AN;j+=PacketSize) +// ei_pstore(&X[j], ei_padd(ei_pload(&X[j]), ei_pmul(ptmp,ei_ploadu(&A[j+iN])))); +// #else +// for (int j = 0;j<AN;j+=PacketSize) +// ei_pstore(&X[j], ei_padd(ei_pload(&X[j]), ei_pmul(ptmp,ei_ploadu(&A[j+iN])))); +// #endif +// } +// } +// // process remaining scalars +// for (int j=AN;j<N;j++) +// X[j] += tmp * A[j+iN]; +// } +// asm("#end matrix_vector_product"); +// } + + static inline void atv_product(const gene_matrix & A, const gene_vector & B, gene_vector & X, int N) { int AN = (N/PacketSize)*PacketSize; - for (int i=0;i<N;i++) - X[i] = 0; - for (int i=0;i<N;i++) + int bound = (N/4)*4; + for (int i=0;i<bound;i+=4) { - real tmp = 0; - Packet ptmp = ei_pset1(real(0)); - int iN = i*N; + real tmp0 = 0; + Packet ptmp0 = ei_pset1(real(0)); + real tmp1 = 0; + Packet ptmp1 = ei_pset1(real(0)); + real tmp2 = 0; + Packet ptmp2 = ei_pset1(real(0)); + real tmp3 = 0; + Packet ptmp3 = ei_pset1(real(0)); + int iN0 = i*N; + int iN1 = (i+1)*N; + int iN2 = (i+2)*N; + int iN3 = (i+3)*N; if (AN>0) { - bool aligned = (iN % PacketSize) == 0; - if (aligned) + int align1 = (iN1 % PacketSize); + if (align1==0) { - #ifdef PEELING - int ANP = (AN/(8*PacketSize))*8*PacketSize; - for (int j = 0;j<ANP;j+=PacketSize*8) + for (int j = 0;j<AN;j+=PacketSize) { - ptmp = - ei_padd(ei_pmul(ei_pload(&B[j]), ei_pload(&A[j+iN])), - ei_padd(ei_pmul(ei_pload(&B[j+PacketSize]), ei_pload(&A[j+PacketSize+iN])), - ei_padd(ei_pmul(ei_pload(&B[j+2*PacketSize]), ei_pload(&A[j+2*PacketSize+iN])), - ei_padd(ei_pmul(ei_pload(&B[j+3*PacketSize]), ei_pload(&A[j+3*PacketSize+iN])), - ei_padd(ei_pmul(ei_pload(&B[j+4*PacketSize]), ei_pload(&A[j+4*PacketSize+iN])), - ei_padd(ei_pmul(ei_pload(&B[j+5*PacketSize]), ei_pload(&A[j+5*PacketSize+iN])), - ei_padd(ei_pmul(ei_pload(&B[j+6*PacketSize]), ei_pload(&A[j+6*PacketSize+iN])), - ei_padd(ei_pmul(ei_pload(&B[j+7*PacketSize]), ei_pload(&A[j+7*PacketSize+iN])), - ptmp)))))))); + Packet b = ei_pload(&B[j]); + ptmp0 = ei_padd(ptmp0, ei_pmul(b, ei_pload(&A[j+iN0]))); + ptmp1 = ei_padd(ptmp1, ei_pmul(b, ei_pload(&A[j+iN1]))); + ptmp2 = ei_padd(ptmp2, ei_pmul(b, ei_pload(&A[j+iN2]))); + ptmp3 = ei_padd(ptmp3, ei_pmul(b, ei_pload(&A[j+iN3]))); } - for (int j = ANP;j<AN;j+=PacketSize) - ptmp = ei_padd(ptmp, ei_pmul(ei_pload(&B[j]), ei_pload(&A[j+iN]))); - #else + } + else if (align1==2) + { for (int j = 0;j<AN;j+=PacketSize) - ptmp = ei_padd(ptmp, ei_pmul(ei_pload(&B[j]), ei_pload(&A[j+iN]))); - #endif + { + Packet b = ei_pload(&B[j]); + ptmp0 = ei_padd(ptmp0, ei_pmul(b, ei_pload(&A[j+iN0]))); + ptmp1 = ei_padd(ptmp1, ei_pmul(b, ei_ploadu(&A[j+iN1]))); + ptmp2 = ei_padd(ptmp2, ei_pmul(b, ei_pload(&A[j+iN2]))); + ptmp3 = ei_padd(ptmp3, ei_pmul(b, ei_ploadu(&A[j+iN3]))); + } } else { - #ifdef PEELING - int ANP = (AN/(8*PacketSize))*8*PacketSize; - for (int j = 0;j<ANP;j+=PacketSize*8) + for (int j = 0;j<AN;j+=PacketSize) { - ptmp = - ei_padd(ei_pmul(ei_pload(&B[j]), ei_ploadu(&A[j+iN])), - ei_padd(ei_pmul(ei_pload(&B[j+PacketSize]), ei_ploadu(&A[j+PacketSize+iN])), - ei_padd(ei_pmul(ei_pload(&B[j+2*PacketSize]), ei_ploadu(&A[j+2*PacketSize+iN])), - ei_padd(ei_pmul(ei_pload(&B[j+3*PacketSize]), ei_ploadu(&A[j+3*PacketSize+iN])), - ei_padd(ei_pmul(ei_pload(&B[j+4*PacketSize]), ei_ploadu(&A[j+4*PacketSize+iN])), - ei_padd(ei_pmul(ei_pload(&B[j+5*PacketSize]), ei_ploadu(&A[j+5*PacketSize+iN])), - ei_padd(ei_pmul(ei_pload(&B[j+6*PacketSize]), ei_ploadu(&A[j+6*PacketSize+iN])), - ei_padd(ei_pmul(ei_pload(&B[j+7*PacketSize]), ei_ploadu(&A[j+7*PacketSize+iN])), - ptmp)))))))); + Packet b = ei_pload(&B[j]); + ptmp0 = ei_padd(ptmp0, ei_pmul(b, ei_pload(&A[j+iN0]))); + ptmp1 = ei_padd(ptmp1, ei_pmul(b, ei_ploadu(&A[j+iN1]))); + ptmp2 = ei_padd(ptmp2, ei_pmul(b, ei_ploadu(&A[j+iN2]))); + ptmp3 = ei_padd(ptmp3, ei_pmul(b, ei_ploadu(&A[j+iN3]))); } - for (int j = ANP;j<AN;j+=PacketSize) - ptmp = ei_padd(ptmp, ei_pmul(ei_pload(&B[j]), ei_ploadu(&A[j+iN]))); - #else - for (int j = 0;j<AN;j+=PacketSize) - ptmp = ei_padd(ptmp, ei_pmul(ei_pload(&B[j]), ei_ploadu(&A[j+iN]))); - #endif } - tmp = ei_predux(ptmp); + tmp0 = ei_predux(ptmp0); + tmp1 = ei_predux(ptmp1); + tmp2 = ei_predux(ptmp2); + tmp3 = ei_predux(ptmp3); } // process remaining scalars for (int j=AN;j<N;j++) - tmp += B[j] * A[j+iN]; - X[i] = tmp; + { + tmp0 += B[j] * A[j+iN0]; + tmp1 += B[j] * A[j+iN1]; + tmp2 += B[j] * A[j+iN2]; + tmp3 += B[j] * A[j+iN3]; + } + X[i+0] = tmp0; + X[i+1] = tmp1; + X[i+2] = tmp2; + X[i+3] = tmp3; + } + + for (int i=bound;i<N;i++) + { + real tmp0 = 0; + Packet ptmp0 = ei_pset1(real(0)); + int iN0 = i*N; + if (AN>0) + { + if (iN0 % PacketSize==0) + for (int j = 0;j<AN;j+=PacketSize) + ptmp0 = ei_padd(ptmp0, ei_pmul(ei_pload(&B[j]), ei_pload(&A[j+iN0]))); + else + for (int j = 0;j<AN;j+=PacketSize) + ptmp0 = ei_padd(ptmp0, ei_pmul(ei_pload(&B[j]), ei_ploadu(&A[j+iN0]))); + tmp0 = ei_predux(ptmp0); + } + // process remaining scalars + for (int j=AN;j<N;j++) + tmp0 += B[j] * A[j+iN0]; + X[i+0] = tmp0; } } +// static inline void atv_product(const gene_matrix & A, const gene_vector & B, gene_vector & X, int N) +// { +// int AN = (N/PacketSize)*PacketSize; +// for (int i=0;i<N;i++) +// X[i] = 0; +// for (int i=0;i<N;i++) +// { +// real tmp = 0; +// Packet ptmp = ei_pset1(real(0)); +// int iN = i*N; +// if (AN>0) +// { +// bool aligned = (iN % PacketSize) == 0; +// if (aligned) +// { +// #ifdef PEELING +// int ANP = (AN/(8*PacketSize))*8*PacketSize; +// for (int j = 0;j<ANP;j+=PacketSize*8) +// { +// ptmp = +// ei_padd(ei_pmul(ei_pload(&B[j]), ei_pload(&A[j+iN])), +// ei_padd(ei_pmul(ei_pload(&B[j+PacketSize]), ei_pload(&A[j+PacketSize+iN])), +// ei_padd(ei_pmul(ei_pload(&B[j+2*PacketSize]), ei_pload(&A[j+2*PacketSize+iN])), +// ei_padd(ei_pmul(ei_pload(&B[j+3*PacketSize]), ei_pload(&A[j+3*PacketSize+iN])), +// ei_padd(ei_pmul(ei_pload(&B[j+4*PacketSize]), ei_pload(&A[j+4*PacketSize+iN])), +// ei_padd(ei_pmul(ei_pload(&B[j+5*PacketSize]), ei_pload(&A[j+5*PacketSize+iN])), +// ei_padd(ei_pmul(ei_pload(&B[j+6*PacketSize]), ei_pload(&A[j+6*PacketSize+iN])), +// ei_padd(ei_pmul(ei_pload(&B[j+7*PacketSize]), ei_pload(&A[j+7*PacketSize+iN])), +// ptmp)))))))); +// } +// for (int j = ANP;j<AN;j+=PacketSize) +// ptmp = ei_padd(ptmp, ei_pmul(ei_pload(&B[j]), ei_pload(&A[j+iN]))); +// #else +// for (int j = 0;j<AN;j+=PacketSize) +// ptmp = ei_padd(ptmp, ei_pmul(ei_pload(&B[j]), ei_pload(&A[j+iN]))); +// #endif +// } +// else +// { +// #ifdef PEELING +// int ANP = (AN/(8*PacketSize))*8*PacketSize; +// for (int j = 0;j<ANP;j+=PacketSize*8) +// { +// ptmp = +// ei_padd(ei_pmul(ei_pload(&B[j]), ei_ploadu(&A[j+iN])), +// ei_padd(ei_pmul(ei_pload(&B[j+PacketSize]), ei_ploadu(&A[j+PacketSize+iN])), +// ei_padd(ei_pmul(ei_pload(&B[j+2*PacketSize]), ei_ploadu(&A[j+2*PacketSize+iN])), +// ei_padd(ei_pmul(ei_pload(&B[j+3*PacketSize]), ei_ploadu(&A[j+3*PacketSize+iN])), +// ei_padd(ei_pmul(ei_pload(&B[j+4*PacketSize]), ei_ploadu(&A[j+4*PacketSize+iN])), +// ei_padd(ei_pmul(ei_pload(&B[j+5*PacketSize]), ei_ploadu(&A[j+5*PacketSize+iN])), +// ei_padd(ei_pmul(ei_pload(&B[j+6*PacketSize]), ei_ploadu(&A[j+6*PacketSize+iN])), +// ei_padd(ei_pmul(ei_pload(&B[j+7*PacketSize]), ei_ploadu(&A[j+7*PacketSize+iN])), +// ptmp)))))))); +// } +// for (int j = ANP;j<AN;j+=PacketSize) +// ptmp = ei_padd(ptmp, ei_pmul(ei_pload(&B[j]), ei_ploadu(&A[j+iN]))); +// #else +// for (int j = 0;j<AN;j+=PacketSize) +// ptmp = ei_padd(ptmp, ei_pmul(ei_pload(&B[j]), ei_ploadu(&A[j+iN]))); +// #endif +// } +// tmp = ei_predux(ptmp); +// } +// // process remaining scalars +// for (int j=AN;j<N;j++) +// tmp += B[j] * A[j+iN]; +// X[i] = tmp; +// } +// } + static inline void axpy(real coef, const gene_vector & X, gene_vector & Y, int N){ int AN = (N/PacketSize)*PacketSize; if (AN>0) |