Eigen/src/Core/arch/AltiVec/PacketMath.h


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433

// This file is part of Eigen, a lightweight C++ template library
// for linear algebra. Eigen itself is part of the KDE project.
//
// Copyright (C) 2008 Konstantinos Margaritis <markos@codex.gr>
//
// Eigen is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 3 of the License, or (at your option) any later version.
//
// Alternatively, you can redistribute it and/or
// modify it under the terms of the GNU General Public License as
// published by the Free Software Foundation; either version 2 of
// the License, or (at your option) any later version.
//
// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License and a copy of the GNU General Public License along with
// Eigen. If not, see <http://www.gnu.org/licenses/>.

#ifndef EIGEN_PACKET_MATH_ALTIVEC_H
#define EIGEN_PACKET_MATH_ALTIVEC_H

#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 4
#endif

typedef __vector float          v4f;
typedef __vector int            v4i;
typedef __vector unsigned int   v4ui;
typedef __vector __bool int     v4bi;

// We don't want to write the same code all the time, but we need to reuse the constants
// and it doesn't really work to declare them global, so we define macros instead

#define USE_CONST_v0i     const v4i   v0i   = vec_splat_s32(0)
#define USE_CONST_v1i     const v4i   v1i   = vec_splat_s32(1)
#define USE_CONST_v16i_   const v4i   v16i_ = vec_splat_s32(-16)
#define USE_CONST_v0f     USE_CONST_v0i; const v4f v0f = (v4f) v0i
#define USE_CONST_v1f     USE_CONST_v1i; const v4f v1f = vec_ctf(v1i, 0)
#define USE_CONST_v1i_    const v4ui  v1i_  = vec_splat_u32(-1)
#define USE_CONST_v0f_    USE_CONST_v1i_; const v4f v0f_ = (v4f) vec_sl(v1i_, v1i_)

template<> struct ei_packet_traits<float> : ei_default_packet_traits
{ typedef v4f type; enum {size=4}; };
template<> struct ei_packet_traits<int>   : ei_default_packet_traits
{ typedef v4i type; enum {size=4}; };

template<> struct ei_unpacket_traits<v4f>  { typedef float  type; enum {size=4}; };
template<> struct ei_unpacket_traits<v4i>  { typedef int    type; enum {size=4}; };

inline std::ostream & operator <<(std::ostream & s, const v4f & v)
{
  union {
    v4f   v;
    float n[4];
  } vt;
  vt.v = v;
  s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3];
  return s;
}

inline std::ostream & operator <<(std::ostream & s, const v4i & v)
{
  union {
    v4i   v;
    int n[4];
  } vt;
  vt.v = v;
  s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3];
  return s;
}

inline std::ostream & operator <<(std::ostream & s, const v4ui & v)
{
  union {
    v4ui   v;
    unsigned int n[4];
  } vt;
  vt.v = v;
  s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3];
  return s;
}

inline std::ostream & operator <<(std::ostream & s, const v4bi & v)
{
  union {
    __vector __bool int v;
    unsigned int n[4];
  } vt;
  vt.v = v;
  s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3];
  return s;
}

template<> inline v4f  ei_padd(const v4f&   a, const v4f&   b) { return vec_add(a,b); }
template<> inline v4i  ei_padd(const v4i&   a, const v4i&   b) { return vec_add(a,b); }

template<> inline v4f  ei_psub(const v4f&   a, const v4f&   b) { return vec_sub(a,b); }
template<> inline v4i  ei_psub(const v4i&   a, const v4i&   b) { return vec_sub(a,b); }

template<> EIGEN_STRONG_INLINE v4f ei_pnegate(const v4f& a)
{
  v4i mask = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
  return vec_xor(a,(v4f) mask);
}

template<> EIGEN_STRONG_INLINE v4i ei_pnegate(const v4i& a)
{
  USE_CONST_v0i;
  return ei_psub(v0i, a);
}

template<> inline v4f  ei_pmul(const v4f&   a, const v4f&   b) { USE_CONST_v0f; return vec_madd(a,b, v0f); }
template<> inline v4i  ei_pmul(const v4i&   a, const v4i&   b)
{
  // Detailed in: http://freevec.org/content/32bit_signed_integer_multiplication_altivec
  //Set up constants, variables
  v4i a1, b1, bswap, low_prod, high_prod, prod, prod_, v1sel;
  USE_CONST_v0i;
  USE_CONST_v1i;
  USE_CONST_v16i_;

  // Get the absolute values
  a1  = vec_abs(a);
  b1  = vec_abs(b);

  // Get the signs using xor
  v4bi sgn = (v4bi) vec_cmplt(vec_xor(a, b), v0i);

  // Do the multiplication for the asbolute values.
  bswap = (v4i) vec_rl((v4ui) b1, (v4ui) v16i_ );
  low_prod = vec_mulo((__vector short)a1, (__vector short)b1);
  high_prod = vec_msum((__vector short)a1, (__vector short)bswap, v0i);
  high_prod = (v4i) vec_sl((v4ui) high_prod, (v4ui) v16i_);
  prod = vec_add( low_prod, high_prod );

  // NOR the product and select only the negative elements according to the sign mask
  prod_ = vec_nor(prod, prod);
  prod_ = vec_sel(v0i, prod_, sgn);

  // Add 1 to the result to get the negative numbers
  v1sel = vec_sel(v0i, v1i, sgn);
  prod_ = vec_add(prod_, v1sel);

  // Merge the results back to the final vector.
  prod = vec_sel(prod, prod_, sgn);

  return prod;
}

template<> inline v4f  ei_pdiv(const v4f&   a, const v4f&   b) {
  v4f t, y_0, y_1, res;
  USE_CONST_v0f;
  USE_CONST_v1f;

  // Altivec does not offer a divide instruction, we have to do a reciprocal approximation
  y_0 = vec_re(b);

  // Do one Newton-Raphson iteration to get the needed accuracy
  t = vec_nmsub(y_0, b, v1f);
  y_1 = vec_madd(y_0, t, y_0);

  res = vec_madd(a, y_1, v0f);
  return res;
}

template<> inline v4f  ei_pmadd(const v4f&  a, const v4f&   b, const v4f&  c) { return vec_madd(a, b, c); }

template<> inline v4f  ei_pmin(const v4f&   a, const v4f&   b) { return vec_min(a,b); }
template<> inline v4i  ei_pmin(const v4i&   a, const v4i&   b) { return vec_min(a,b); }

template<> inline v4f  ei_pmax(const v4f&   a, const v4f&   b) { return vec_max(a,b); }
template<> inline v4i  ei_pmax(const v4i&   a, const v4i&   b) { return vec_max(a,b); }

template<> EIGEN_STRONG_INLINE v4f ei_pabs(const v4f& a) { return vec_abs(a); }
template<> EIGEN_STRONG_INLINE v4i ei_pabs(const v4i& a) { return vec_abs(a); }

template<> inline v4f  ei_pload(const float* from) { return vec_ld(0, from); }
template<> inline v4i  ei_pload(const int*   from) { return vec_ld(0, from); }

template<> inline v4f  ei_ploadu(const float*  from)
{
  // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
  __vector unsigned char MSQ, LSQ;
  __vector unsigned char mask;
  MSQ = vec_ld(0, (unsigned char *)from);          // most significant quadword
  LSQ = vec_ld(15, (unsigned char *)from);         // least significant quadword
  mask = vec_lvsl(0, from);                        // create the permute mask
  return (v4f) vec_perm(MSQ, LSQ, mask);           // align the data
}

template<> inline v4i  ei_ploadu(const int*    from)
{
  // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
  __vector unsigned char MSQ, LSQ;
  __vector unsigned char mask;
  MSQ = vec_ld(0, (unsigned char *)from);          // most significant quadword
  LSQ = vec_ld(15, (unsigned char *)from);         // least significant quadword
  mask = vec_lvsl(0, from);                        // create the permute mask
  return (v4i) vec_perm(MSQ, LSQ, mask);    // align the data
}

template<> inline v4f  ei_pset1(const float&  from)
{
  // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
  float __attribute__(aligned(16)) af[4];
  af[0] = from;
  v4f vc = vec_ld(0, af);
  vc = vec_splat(vc, 0);
  return vc;
}

template<> inline v4i  ei_pset1(const int&    from)
{
  int __attribute__(aligned(16)) ai[4];
  ai[0] = from;
  v4i vc = vec_ld(0, ai);
  vc = vec_splat(vc, 0);
  return vc;
}

template<> inline void ei_pstore(float*   to, const v4f&   from) { vec_st(from, 0, to); }
template<> inline void ei_pstore(int*     to, const v4i&   from) { vec_st(from, 0, to); }

template<> inline void ei_pstoreu(float*  to, const v4f&   from)
{
  // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
  // Warning: not thread safe!
  __vector unsigned char MSQ, LSQ, edges;
  __vector unsigned char edgeAlign, align;

  MSQ = vec_ld(0, (unsigned char *)to);                     // most significant quadword
  LSQ = vec_ld(15, (unsigned char *)to);                    // least significant quadword
  edgeAlign = vec_lvsl(0, to);                              // permute map to extract edges
  edges=vec_perm(LSQ,MSQ,edgeAlign);                        // extract the edges
  align = vec_lvsr( 0, to );                                // permute map to misalign data
  MSQ = vec_perm(edges,(__vector unsigned char)from,align);   // misalign the data (MSQ)
  LSQ = vec_perm((__vector unsigned char)from,edges,align);   // misalign the data (LSQ)
  vec_st( LSQ, 15, (unsigned char *)to );                   // Store the LSQ part first
  vec_st( MSQ, 0, (unsigned char *)to );                    // Store the MSQ part
}

template<> inline void ei_pstoreu(int*    to , const v4i&    from )
{
  // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
  // Warning: not thread safe!
  __vector unsigned char MSQ, LSQ, edges;
  __vector unsigned char edgeAlign, align;

  MSQ = vec_ld(0, (unsigned char *)to);                     // most significant quadword
  LSQ = vec_ld(15, (unsigned char *)to);                    // least significant quadword
  edgeAlign = vec_lvsl(0, to);                              // permute map to extract edges
  edges=vec_perm(LSQ,MSQ,edgeAlign);                        // extract the edges
  align = vec_lvsr( 0, to );                                // permute map to misalign data
  MSQ = vec_perm(edges,(__vector unsigned char)from,align);   // misalign the data (MSQ)
  LSQ = vec_perm((__vector unsigned char)from,edges,align);   // misalign the data (LSQ)
  vec_st( LSQ, 15, (unsigned char *)to );                   // Store the LSQ part first
  vec_st( MSQ, 0, (unsigned char *)to );                    // Store the MSQ part
}

template<> inline float  ei_pfirst(const v4f&  a)
{
  float EIGEN_ALIGN_128 af[4];
  vec_st(a, 0, af);
  return af[0];
}

template<> inline int    ei_pfirst(const v4i&  a)
{
  int EIGEN_ALIGN_128 ai[4];
  vec_st(a, 0, ai);
  return ai[0];
}

template<> EIGEN_STRONG_INLINE v4f ei_preverse(const v4f& a)
{
  static const __vector unsigned char reverse_mask =
    {12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3};
  return (v4f)vec_perm((__vector unsigned char)a,(__vector unsigned char)a,reverse_mask);
}
template<> EIGEN_STRONG_INLINE v4i ei_preverse(const v4i& a)
{
  static const __vector unsigned char __attribute__(aligned(16)) reverse_mask =
    {12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3};
  return (v4i)vec_perm((__vector unsigned char)a,(__vector unsigned char)a,reverse_mask);
}

inline v4f ei_preduxp(const v4f* vecs)
{
  v4f v[4], sum[4];

  // It's easier and faster to transpose then add as columns
  // Check: http://www.freevec.org/function/matrix_4x4_transpose_floats for explanation
  // Do the transpose, first set of moves
  v[0] = vec_mergeh(vecs[0], vecs[2]);
  v[1] = vec_mergel(vecs[0], vecs[2]);
  v[2] = vec_mergeh(vecs[1], vecs[3]);
  v[3] = vec_mergel(vecs[1], vecs[3]);
  // Get the resulting vectors
  sum[0] = vec_mergeh(v[0], v[2]);
  sum[1] = vec_mergel(v[0], v[2]);
  sum[2] = vec_mergeh(v[1], v[3]);
  sum[3] = vec_mergel(v[1], v[3]);

  // Now do the summation:
  // Lines 0+1
  sum[0] = vec_add(sum[0], sum[1]);
  // Lines 2+3
  sum[1] = vec_add(sum[2], sum[3]);
  // Add the results
  sum[0] = vec_add(sum[0], sum[1]);
  return sum[0];
}

inline v4i  ei_preduxp(const v4i* vecs)
{
  v4i v[4], sum[4];

  // It's easier and faster to transpose then add as columns
  // Check: http://www.freevec.org/function/matrix_4x4_transpose_floats for explanation
  // Do the transpose, first set of moves
  v[0] = vec_mergeh(vecs[0], vecs[2]);
  v[1] = vec_mergel(vecs[0], vecs[2]);
  v[2] = vec_mergeh(vecs[1], vecs[3]);
  v[3] = vec_mergel(vecs[1], vecs[3]);
  // Get the resulting vectors
  sum[0] = vec_mergeh(v[0], v[2]);
  sum[1] = vec_mergel(v[0], v[2]);
  sum[2] = vec_mergeh(v[1], v[3]);
  sum[3] = vec_mergel(v[1], v[3]);

  // Now do the summation:
  // Lines 0+1
  sum[0] = vec_add(sum[0], sum[1]);
  // Lines 2+3
  sum[1] = vec_add(sum[2], sum[3]);
  // Add the results
  sum[0] = vec_add(sum[0], sum[1]);
  return sum[0];
}

inline float ei_predux(const v4f& a)
{
  v4f b, sum;
  b = (v4f)vec_sld(a, a, 8);
  sum = vec_add(a, b);
  b = (v4f)vec_sld(sum, sum, 4);
  sum = vec_add(sum, b);
  return ei_pfirst(sum);
}

inline int ei_predux(const v4i& a)
{
  USE_CONST_v0i;
  v4i sum;
  sum = vec_sums(a, v0i);
  sum = vec_sld(sum, v0i, 12);
  return ei_pfirst(sum);
}

// implement other reductions operators
inline float ei_predux_mul(const v4f& a)
{
  v4f prod;
  prod = ei_pmul(a, (v4f)vec_sld(a, a, 8));
  return ei_pfirst(ei_pmul(prod, (v4f)vec_sld(prod, prod, 4)));
}

inline int ei_predux_mul(const v4i& a)
{
  EIGEN_ALIGN_128 int aux[4];
  ei_pstore(aux, a);
  return aux[0] * aux[1] * aux[2] * aux[3];
}

inline float ei_predux_min(const v4f& a)
{
  v4f b, res;
  b = vec_min(a, vec_sld(a, a, 8));
  res = vec_min(b, vec_sld(b, b, 4));
  return ei_pfirst(res);
}

inline int ei_predux_min(const v4i& a)
{
  v4i b, res;
  b = vec_min(a, vec_sld(a, a, 8));
  res = vec_min(b, vec_sld(b, b, 4));
  return ei_pfirst(res);
}

inline float ei_predux_max(const v4f& a)
{
  v4f b, res;
  b = vec_max(a, vec_sld(a, a, 8));
  res = vec_max(b, vec_sld(b, b, 4));
  return ei_pfirst(res);
}

inline int ei_predux_max(const v4i& a)
{
  v4i b, res;
  b = vec_max(a, vec_sld(a, a, 8));
  res = vec_max(b, vec_sld(b, b, 4));
  return ei_pfirst(res);
}


template<int Offset>
struct ei_palign_impl<Offset, v4f>
{
  inline static void run(v4f& first, const v4f& second)
  {
    first = vec_sld(first, second, Offset*4);
  }
};

template<int Offset>
struct ei_palign_impl<Offset, v4i>
{
  inline static void run(v4i& first, const v4i& second)
  {
    first = vec_sld(first, second, Offset*4);
  }
};

#endif // EIGEN_PACKET_MATH_ALTIVEC_H