1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
|
/*
* x86 MMX and MMX2 packed byte operations in portable C.
* Extra instructions: pdiffub, pcmpzb, psumbw, pcmpgtub
* Author: Zoltan Hidvegi
*/
#ifndef __CMMX_H
#define __CMMX_H
/* One machine word treated as sizeof(cmmx_t) independent unsigned byte lanes. */
typedef unsigned long cmmx_t;

/* 0x01 in every byte lane: the all-ones word divided by 0xff. */
#define ONE_BYTES  ((cmmx_t)-1 / 0xffu)
/* 0x80 in every byte lane (the top bit of each byte). */
#define SIGN_BITS  (ONE_BYTES * 0x80u)
/* 0x00ff in every 16-bit lane: the all-ones word divided by 0x0101. */
#define LOWBW_MASK ((cmmx_t)-1 / 0x101u)
/*
 * paddb: add the bytes of a and b lane by lane, each lane wrapping
 * modulo 256 without disturbing its neighbours.
 */
static inline cmmx_t
paddb(cmmx_t a, cmmx_t b)
{
    const cmmx_t low7 = ~SIGN_BITS;
    /* Summing only the low seven bits keeps every carry inside its lane. */
    cmmx_t partial = (b & low7) + (a & low7);
    /* Top bit of a lane is a7 ^ b7 ^ carry; the carry already sits in partial. */
    cmmx_t top = (b ^ a) & SIGN_BITS;

    return top ^ partial;
}
/*
 * psubb: subtract the bytes of b from the bytes of a lane by lane,
 * each lane wrapping modulo 256.
 */
static inline cmmx_t
psubb(cmmx_t a, cmmx_t b)
{
    /* Forcing the top bit of each minuend lane absorbs any borrow locally. */
    cmmx_t diff = (a | SIGN_BITS) - (b & ~SIGN_BITS);
    /* Where the operands' top bits agree the forced bit must be flipped. */
    cmmx_t fix = ~(b ^ a) & SIGN_BITS;

    return fix ^ diff;
}
/*
 * paddusb: unsigned saturating byte addition; every lane yields
 * a + b, clamped to 0xff.
 */
static inline cmmx_t
paddusb(cmmx_t a, cmmx_t b)
{
    /* Seven-bit sums; a carry lands in bit 7 of the same lane. */
    cmmx_t low_sum = (a & ~SIGN_BITS) + (b & ~SIGN_BITS);
    /* Lanes where at least one operand has its top bit set. */
    cmmx_t any_top = (a | b) & SIGN_BITS;
    /* Overflow: both top bits set, or one top bit plus a carry from below. */
    cmmx_t overflow = any_top & ((a & b) | low_sum);
    /* 0x7f in overflowing lanes, otherwise just any_top. */
    cmmx_t fill = any_top - (overflow >> 7);

    return low_sum | any_top | fill;
}
/*
 * paddusb_s: saturating byte addition for operands whose bytes all have
 * bit 7 clear (lane values 0..0x7f).  Each lane yields a + b clamped to
 * 0x7f, so the result keeps bit 7 clear like the results of psubusb_s,
 * pminub_s and pavgb_s.
 *
 * NOTE(review): the 0x7f clamp is inferred from the other _s helpers; the
 * former expression, sum + (sum ^ fill), returned 2 * (a + b) in lanes
 * without overflow and carried into the next lane in lanes with overflow.
 */
static inline cmmx_t
paddusb_s(cmmx_t a, cmmx_t b)
{
    /* With 7-bit inputs a lane sum is at most 0xfe: no carry between lanes. */
    cmmx_t sum = a + b;
    /* 0x80 in every lane whose sum exceeded 0x7f. */
    cmmx_t ov = sum & SIGN_BITS;
    /* 0x7f in those lanes, 0 elsewhere (0x80 - 1 never borrows). */
    cmmx_t fill = ov - (ov >> 7);

    /* Saturate the low seven bits and drop the overflow marker. */
    return (sum | fill) & ~SIGN_BITS;
}
/*
 * psubusb: unsigned saturating byte subtraction; every lane yields
 * a - b, or 0 where b is larger than a.
 */
static inline cmmx_t
psubusb(cmmx_t a, cmmx_t b)
{
    /* Low seven bits of a - b; bit 7 is set when no borrow was needed. */
    cmmx_t diff = (a | SIGN_BITS) - (b & ~SIGN_BITS);
    /* Bits set in a but not in b; bit 7 here means a has the larger top bit. */
    cmmx_t a_only = a & ~b;
    cmmx_t same = ~(a ^ b);
    /* 0x80 in lanes with a >= b: larger top bit, or equal top bits and no borrow. */
    cmmx_t ge = (a_only | (diff & same)) & SIGN_BITS;
    /* Keep the low seven bits where a >= b, plus bit 7 when only a had it set. */
    cmmx_t keep = (ge & a_only) | (ge - (ge >> 7));

    return diff & keep;
}
/*
 * psubusb_s: cheaper saturating byte subtraction (a - b, floored at 0).
 * NOTE(review): the arithmetic is only lane-safe when every byte of a and b
 * has bit 7 clear - presumably what the _s suffix stands for; confirm.
 */
static inline cmmx_t
psubusb_s(cmmx_t a, cmmx_t b)
{
    /* Bias each lane by 0x80 so the subtraction cannot borrow across lanes. */
    cmmx_t diff = (a | SIGN_BITS) - b;
    /* The bias bit survives exactly in lanes where a >= b. */
    cmmx_t ge = diff & SIGN_BITS;
    /* 0x7f where a >= b, 0 elsewhere. */
    cmmx_t keep = ge - (ge >> 7);

    return diff & keep;
}
/*
 * pcmpgtub: unsigned byte compare.  A lane of the result is 0xff where
 * the first argument's byte is greater than the second's, else 0x00.
 * Note that the first parameter is named b and the second a.
 */
static inline cmmx_t
pcmpgtub(cmmx_t b, cmmx_t a)
{
    /* Bit 7 of each lane is set when low7(a) >= low7(b). */
    cmmx_t diff = (a | SIGN_BITS) - (b & ~SIGN_BITS);
    /* Bits set in b but not in a. */
    cmmx_t b_only = b & ~a;
    cmmx_t same = ~(a ^ b);
    /* b > a: b alone has the top bit, or top bits agree and low7(a) < low7(b). */
    cmmx_t gt = (b_only | (~diff & same)) & SIGN_BITS;

    /* Spread the 0x80 marker to a full 0xff lane. */
    return gt | (gt - (gt >> 7));
}
/*
 * pdiffub: absolute difference of unsigned bytes, |a - b| in every lane.
 */
static inline cmmx_t
pdiffub(cmmx_t a, cmmx_t b)
{
    /* 0x80 in lanes where a and b have the same top bit. */
    cmmx_t same_top = ~(a ^ b) & SIGN_BITS;
    /* Lane-wise wrapping a - b (same construction as psubb). */
    cmmx_t diff = ((a | SIGN_BITS) - (b & ~SIGN_BITS)) ^ same_top;
    /* 0x80 in lanes where b > a, i.e. where diff has to be negated. */
    cmmx_t neg = ((b & ~a) | (diff & same_top)) & SIGN_BITS;
    cmmx_t one = neg >> 7;
    /* 0xff in lanes to negate: 0x80 ^ 0x7f. */
    cmmx_t flip = neg ^ (neg - one);

    /* Two's complement negate the selected lanes: invert, then add one. */
    return (diff ^ flip) + one;
}
/*
 * pdiffub_s: cheaper absolute byte difference, |a - b| in every lane.
 * NOTE(review): the arithmetic is only lane-safe when every byte of a and b
 * has bit 7 clear - presumably what the _s suffix stands for; confirm.
 */
static inline cmmx_t
pdiffub_s(cmmx_t a, cmmx_t b)
{
    /* Bias each lane by 0x80 so the subtraction stays inside the lane. */
    cmmx_t diff = (a | SIGN_BITS) - b;
    /* 1 in lanes where the bias bit was consumed, i.e. a < b. */
    cmmx_t lt = (~diff & SIGN_BITS) >> 7;
    /* 0x80 strips the bias where a >= b; 0x7f inverts the low bits where a < b. */
    cmmx_t flip = SIGN_BITS - lt;

    /* The added one completes the negation in the a < b lanes. */
    return (diff ^ flip) + lt;
}
/*
 * pmaxub: lane-wise maximum of unsigned bytes, computed as
 * b + saturating(a - b).
 */
static inline cmmx_t
pmaxub(cmmx_t a, cmmx_t b)
{
    /* Amount by which a exceeds b in each lane, 0 where it does not. */
    cmmx_t excess = psubusb(a, b);

    return b + excess;
}
/*
 * pminub: lane-wise minimum of unsigned bytes, computed as
 * saturating(a + ~b) - ~b.
 */
static inline cmmx_t
pminub(cmmx_t a, cmmx_t b)
{
    /* Each lane of ~b holds 255 - b. */
    cmmx_t nb = ~b;
    /* Saturates to 0xff exactly in the lanes where a >= b. */
    cmmx_t clamped = paddusb(a, nb);

    return clamped - nb;
}
/*
 * pminub_s: cheaper lane-wise minimum; the result always has bit 7 of
 * every byte clear.
 * NOTE(review): the arithmetic is only lane-safe when every byte of a and b
 * has bit 7 clear - presumably what the _s suffix stands for; confirm.
 */
static inline cmmx_t
pminub_s(cmmx_t a, cmmx_t b)
{
    const cmmx_t low7 = ~SIGN_BITS;
    /* Biased difference: bit 7 of a lane stays set when a >= b. */
    cmmx_t diff = (a | SIGN_BITS) - b;
    /* 1 in lanes where a >= b. */
    cmmx_t ge = (diff & SIGN_BITS) >> 7;
    /* 0x80 where a >= b (keeps only the bias), 0x7f where a < b (keeps a - b). */
    cmmx_t sel = low7 + ge;

    /* Adding b back gives b or a respectively, once bit 7 is masked off. */
    return (b + (diff & sel)) & low7;
}
/*
 * pavgb: rounded-up average of unsigned bytes, (a + b + 1) >> 1 in
 * every lane.
 */
static inline cmmx_t
pavgb(cmmx_t a, cmmx_t b)
{
    /* Halve each operand with its lowest bit cleared so nothing leaks into the lane below. */
    cmmx_t half_a = (a & ~ONE_BYTES) >> 1;
    cmmx_t half_b = (b & ~ONE_BYTES) >> 1;
    /* Round up whenever either operand had its lowest bit set. */
    cmmx_t round = (a | b) & ONE_BYTES;

    return half_a + half_b + round;
}
/*
 * pavgb_s: cheaper rounded-up byte average, (a + b + 1) >> 1 per lane,
 * with bit 7 of every result byte cleared.
 * NOTE(review): the plain word addition is only lane-safe when every byte of
 * a and b has bit 7 clear - presumably what the _s suffix stands for; confirm.
 */
static inline cmmx_t
pavgb_s(cmmx_t a, cmmx_t b)
{
    cmmx_t sum = a + b + ONE_BYTES;

    /* The shift drags the neighbour's low bit into bit 7; mask it away. */
    return (sum >> 1) & ~SIGN_BITS;
}
/*
 * p31avgb: 3:1 weighted, rounded byte average, (3*a + b + 2) >> 2 in
 * every lane.
 */
static inline cmmx_t
p31avgb(cmmx_t a, cmmx_t b)
{
    /* 0x03 in every lane: the two bits a shift by two would push out. */
    const cmmx_t low2 = 3 * ONE_BYTES;
    cmmx_t a_lo = a & low2;
    cmmx_t b_lo = b & low2;
    /* Quarter of each operand with the low two bits removed first. */
    cmmx_t a_hi = (a & ~low2) >> 2;
    cmmx_t b_hi = (b & ~low2) >> 2;
    /* Contribution of the low bits plus rounding; the mask drops bits shifted in from the lane above. */
    cmmx_t lo = ((3 * a_lo + b_lo + 2 * ONE_BYTES) >> 2) & low2;

    return 3 * a_hi + b_hi + lo;
}
/*
 * p31avgb_s: cheaper approximate 3:1 weighted average built from two
 * halving steps: first the truncated average of a and b, then the
 * rounded-up average of that value with a.
 * NOTE(review): the plain word additions are only lane-safe when every byte
 * of a and b has bit 7 clear - presumably what the _s suffix stands for;
 * confirm.
 */
static inline cmmx_t
p31avgb_s(cmmx_t a, cmmx_t b)
{
    const cmmx_t low7 = ~SIGN_BITS;
    /* (a + b) >> 1 per lane, rounding down. */
    cmmx_t mid = ((a + b) >> 1) & low7;

    /* Same expression as pavgb_s(mid, a). */
    return ((mid + a + ONE_BYTES) >> 1) & low7;
}
/*
 * psumbw: horizontal sum of all bytes in a, returned in the low 16 bits.
 */
static inline unsigned long
psumbw(cmmx_t a)
{
    /* Half the word width in bits (4 bits per byte of cmmx_t). */
    const unsigned half_bits = (unsigned)(4 * sizeof(cmmx_t));
    /* Add odd and even bytes so each 16-bit lane holds a pair sum. */
    cmmx_t pairs = ((a >> 8) & LOWBW_MASK) + (a & LOWBW_MASK);
    unsigned long total = (unsigned long)pairs;

    /* Fold the upper half of the word onto the lower half. */
    total += (unsigned long)(pairs >> half_bits);
    if (sizeof(cmmx_t) > 4) {
        /* Wider words still have two 16-bit partial sums left to merge. */
        total += total >> 16;
    }
    return total & 0xffff;
}
/*
 * psumbw_s: cheaper horizontal byte sum.  It folds the word in half
 * before widening to 16-bit lanes.
 * NOTE(review): that first fold adds bytes in place, so it is only lane-safe
 * when every byte of a has bit 7 clear - presumably what the _s suffix stands
 * for; confirm.
 */
static inline unsigned long
psumbw_s(cmmx_t a)
{
    /* Upper half added onto the lower half, byte lane on byte lane. */
    unsigned long folded =
        (unsigned long)a + (unsigned long)(a >> (4 * sizeof(cmmx_t)));

    if (sizeof(cmmx_t) > 4) {
        /* Four byte sums remain: widen to two 16-bit lanes, then merge them. */
        folded = ((folded >> 8) & 0xff00ff) + (folded & 0xff00ff);
        folded += folded >> 16;
        return folded & 0xffff;
    }
    /* Words of four bytes or fewer: only two byte sums remain. */
    return ((folded >> 8) & 0xff) + (folded & 0xff);
}
/*
 * psadbw: sum of absolute byte differences between a and b
 * (pdiffub followed by psumbw).
 */
static inline unsigned long
psadbw(cmmx_t a, cmmx_t b)
{
    cmmx_t diff = pdiffub(a, b);

    return psumbw(diff);
}
/*
 * psadbw_s: sum of absolute byte differences using the _s helpers
 * (pdiffub_s followed by psumbw_s); their input restrictions apply.
 */
static inline unsigned long
psadbw_s(cmmx_t a, cmmx_t b)
{
    cmmx_t diff = pdiffub_s(a, b);

    return psumbw_s(diff);
}
/*
 * pcmpzb: compare every byte with zero.  A lane of the result is 0xff
 * where the byte of a is 0, else 0x00.
 */
static inline cmmx_t
pcmpzb(cmmx_t a)
{
    /* Bit 7 of a lane drops out only when the low seven bits are all zero. */
    cmmx_t probe = (a | SIGN_BITS) - ONE_BYTES;
    /* 0x80 in every lane that holds a non-zero byte. */
    cmmx_t nonzero = (probe | a) & SIGN_BITS;
    /* Spread the marker to a full 0xff lane. */
    cmmx_t nonzero_mask = nonzero | (nonzero - (nonzero >> 7));

    return ~nonzero_mask;
}
/*
 * pcmpeqb: byte equality compare.  A lane of the result is 0xff where
 * the bytes of a and b are equal, else 0x00.
 */
static inline cmmx_t
pcmpeqb(cmmx_t a, cmmx_t b)
{
    /* Equal bytes XOR to zero, so this reduces to a zero test. */
    cmmx_t differing = a ^ b;

    return pcmpzb(differing);
}
#endif
|