1 /* Copyright (C) 2006-2013 Free Software Foundation, Inc.
2
3 This file is free software; you can redistribute it and/or modify it under
4 the terms of the GNU General Public License as published by the Free
5 Software Foundation; either version 3 of the License, or (at your option)
6 any later version.
7
8 This file is distributed in the hope that it will be useful, but WITHOUT
9 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
11 for more details.
12
13 Under Section 7 of GPL version 3, you are granted additional
14 permissions described in the GCC Runtime Library Exception, version
15 3.1, as published by the Free Software Foundation.
16
17 You should have received a copy of the GNU General Public License and
18 a copy of the GCC Runtime Library Exception along with this program;
19 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
20 <http://www.gnu.org/licenses/>. */
21
22 #ifndef _VMX2SPU_H_
23 #define _VMX2SPU_H_ 1
24
25 #ifdef __cplusplus
26
27 #ifdef __SPU__
28
29 #include <spu_intrinsics.h>
30 #include <vec_types.h>
31
32 /* This file maps generic VMX intrinsics and predicates to the SPU using
33 * overloaded C++ functions.
34 */
35
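/* Illustrative sketch (not part of the original header): with this file
 * included, VMX-style source such as the fragment below compiles for the
 * SPU unchanged, each vec_* call resolving to an overload or macro defined
 * here (e.g. vec_madd expands to spu_madd, and vec_add on vec_float4 maps
 * to spu_add).  The function name is hypothetical, used only for the example.
 *
 *   static inline vec_float4 example_axpy(vec_float4 a, vec_float4 x, vec_float4 y)
 *   {
 *     return (vec_madd(a, x, y));
 *   }
 */
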
36 /************************************************************************
37 * INTRINSICS
38 ************************************************************************/
39
40 /* vec_abs (vector absolute value)
41 * =======
42 */
43 static inline vec_char16 vec_abs(vec_char16 a)
44 {
45 vec_char16 minus_a;
46
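  /* Byte-wise negate built from a halfword add: complement each byte and
     clear bit 7 so that adding 1 to every byte (0x101 per halfword) cannot
     carry across byte lanes.  The cleared bit only matters for non-negative
     elements, which spu_sel below keeps unchanged.  */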
47 minus_a = (vec_char16)(spu_add((vec_ushort8)(spu_and(spu_xor(a, 0xFF), 0x7F)), 0x101));
48 return (spu_sel(minus_a, a, spu_cmpgt(a, -1)));
49 }
50
51 static inline vec_short8 vec_abs(vec_short8 a)
52 {
53 return (spu_sel(spu_sub(0, a), a, spu_cmpgt(a, -1)));
54 }
55
56 static inline vec_int4 vec_abs(vec_int4 a)
57 {
58 return (spu_sel(spu_sub(0, a), a, spu_cmpgt(a, -1)));
59 }
60
61 static inline vec_float4 vec_abs(vec_float4 a)
62 {
63 return ((vec_float4)(spu_rlmask(spu_sl((vec_uint4)(a), 1), -1)));
64 }
65
66 /* vec_abss (vector absolute value saturate)
67 * ========
68 */
69 static inline vec_char16 vec_abss(vec_char16 a)
70 {
71 vec_char16 minus_a;
72
73 minus_a = (vec_char16)spu_add((vec_short8)(spu_xor(a, -1)),
74 (vec_short8)(spu_and(spu_cmpgt((vec_uchar16)(a), 0x80), 1)));
75 return (spu_sel(minus_a, a, spu_cmpgt(a, -1)));
76 }
77
78 static inline vec_short8 vec_abss(vec_short8 a)
79 {
80 vec_short8 minus_a;
81
82 minus_a = spu_add(spu_sub(0, a), (vec_short8)(spu_cmpeq(a, ((vec_short8){0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000}))));
83 return (spu_sel(minus_a, a, spu_cmpgt(a, -1)));
84 }
85
86 static inline vec_int4 vec_abss(vec_int4 a)
87 {
88 vec_int4 minus_a;
89
90 minus_a = spu_add(spu_sub(0, a), (vec_int4)(spu_cmpeq(a, ((vec_int4){0x80000000,0x80000000,0x80000000,0x80000000}))));
91 return (spu_sel(minus_a, a, spu_cmpgt(a, -1)));
92 }
93
94
95 /* vec_add (vector add)
96 * =======
97 */
98 static inline vec_uchar16 vec_add(vec_uchar16 a, vec_uchar16 b)
99 {
100 return ((vec_uchar16)(spu_sel(spu_add((vec_ushort8)(a), (vec_ushort8)(b)),
101 spu_add(spu_and((vec_ushort8)(a), 0xFF00), spu_and((vec_ushort8)(b), 0xFF00)),
102 spu_splats((unsigned short)(0xFF00)))));
103 }
104
105 static inline vec_char16 vec_add(vec_char16 a, vec_char16 b)
106 {
107 return ((vec_char16)vec_add((vec_uchar16)(a), (vec_uchar16)(b)));
108 }
109
110 static inline vec_char16 vec_add(vec_bchar16 a, vec_char16 b)
111 {
112 return ((vec_char16)vec_add((vec_uchar16)(a), (vec_uchar16)(b)));
113 }
114
115 static inline vec_char16 vec_add(vec_char16 a, vec_bchar16 b)
116 {
117 return ((vec_char16)vec_add((vec_uchar16)(a), (vec_uchar16)(b)));
118 }
119
120 static inline vec_ushort8 vec_add(vec_ushort8 a, vec_ushort8 b)
121 {
122 return (spu_add(a, b));
123 }
124
125 static inline vec_short8 vec_add(vec_short8 a, vec_short8 b)
126 {
127 return (spu_add(a, b));
128 }
129
130 static inline vec_short8 vec_add(vec_bshort8 a, vec_short8 b)
131 {
132 return (spu_add((vec_short8)(a), b));
133 }
134
135 static inline vec_short8 vec_add(vec_short8 a, vec_bshort8 b)
136 {
137 return (spu_add(a, (vec_short8)(b)));
138 }
139
140 static inline vec_uint4 vec_add(vec_uint4 a, vec_uint4 b)
141 {
142 return (spu_add(a, b));
143 }
144
145 static inline vec_int4 vec_add(vec_int4 a, vec_int4 b)
146 {
147 return (spu_add(a, b));
148 }
149
150 static inline vec_int4 vec_add(vec_bint4 a, vec_int4 b)
151 {
152 return (spu_add((vec_int4)(a), b));
153 }
154
155 static inline vec_int4 vec_add(vec_int4 a, vec_bint4 b)
156 {
157 return (spu_add(a, (vec_int4)(b)));
158 }
159
160 static inline vec_float4 vec_add(vec_float4 a, vec_float4 b)
161 {
162 return (spu_add(a, b));
163 }
164
165 /* vec_addc (vector add carryout unsigned word)
166 * ========
167 */
168 #define vec_addc(_a, _b) spu_genc(_a, _b)
169
170 /* vec_adds (vector add saturated)
171 * ========
172 */
173 static inline vec_uchar16 vec_adds(vec_uchar16 a, vec_uchar16 b)
174 {
175 vec_uchar16 s1, s2, s, d;
176
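  /* Saturating byte add built from halfword adds: s1/s2 hold the 16-bit sums
     of the even- and odd-indexed bytes, s gathers each sum's carry byte and
     d its data byte; any carry forces the corresponding result byte to 0xFF.  */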
177 s1 = (vec_uchar16)(spu_add(spu_rlmask((vec_ushort8)(a), -8), spu_rlmask((vec_ushort8)(b), -8)));
178 s2 = (vec_uchar16)(spu_add(spu_and((vec_ushort8)(a), 0xFF), spu_and((vec_ushort8)(b), 0xFF)));
179 s = spu_shuffle(s1, s2, ((vec_uchar16){0, 16, 2, 18, 4, 20, 6, 22,
180 8, 24, 10, 26, 12, 28, 14, 30}));
181 d = spu_shuffle(s1, s2, ((vec_uchar16){1, 17, 3, 19, 5, 21, 7, 23,
182 9, 25, 11, 27, 13, 29, 15, 31}));
183 return (spu_or(d, spu_cmpeq(s, 1)));
184 }
185
186 static inline vec_char16 vec_adds(vec_char16 a, vec_char16 b)
187 {
188 vec_uchar16 s1, s2, s, d;
189
190 s1 = (vec_uchar16)(spu_add(spu_rlmask((vec_ushort8)(a), -8), spu_rlmask((vec_ushort8)(b), -8)));
191 s2 = (vec_uchar16)(spu_add(spu_and((vec_ushort8)(a), 0xFF), spu_and((vec_ushort8)(b), 0xFF)));
192 s = spu_shuffle(s1, s2, ((vec_uchar16){1, 17, 3, 19, 5, 21, 7, 23,
193 9, 25, 11, 27, 13, 29, 15, 31}));
194 d = spu_sel(s, spu_splats((unsigned char)0x7F), spu_cmpgt(spu_and(s, (vec_uchar16)(spu_nor(a, b))), 0x7F));
195 d = spu_sel(d, spu_splats((unsigned char)0x80), spu_cmpgt(spu_nor(s, (vec_uchar16)(spu_nand(a, b))), 0x7F));
196 return ((vec_char16)(d));
197 }
198
199 static inline vec_char16 vec_adds(vec_bchar16 a, vec_char16 b)
200 {
201 return (vec_adds((vec_char16)(a), b));
202 }
203
204 static inline vec_char16 vec_adds(vec_char16 a, vec_bchar16 b)
205 {
206 return (vec_adds(a, (vec_char16)(b)));
207 }
208
209 static inline vec_ushort8 vec_adds(vec_ushort8 a, vec_ushort8 b)
210 {
211 vec_ushort8 s, d;
212
213 s = spu_add(a, b);
214 d = spu_or(s, spu_rlmaska(spu_sel(spu_xor(s, -1), a, spu_eqv(a, b)), -15));
215 return (d);
216 }
217
218 static inline vec_short8 vec_adds(vec_short8 a, vec_short8 b)
219 {
220 vec_short8 s, d;
221
222 s = spu_add(a, b);
223 d = spu_sel(s, spu_splats((signed short)0x7FFF), (vec_ushort8)(spu_rlmaska(spu_and(s, spu_nor(a, b)), -15)));
224 d = spu_sel(d, spu_splats((signed short)0x8000), (vec_ushort8)(spu_rlmaska(spu_nor(s, spu_nand(a, b)), -15)));
225 return (d);
226 }
227
228 static inline vec_short8 vec_adds(vec_bshort8 a, vec_short8 b)
229 {
230 return (vec_adds((vec_short8)(a), b));
231 }
232
233 static inline vec_short8 vec_adds(vec_short8 a, vec_bshort8 b)
234 {
235 return (vec_adds(a, (vec_short8)(b)));
236 }
237
238 static inline vec_uint4 vec_adds(vec_uint4 a, vec_uint4 b)
239 {
240 return (spu_or(spu_add(a, b), spu_rlmaska(spu_sl(spu_genc(a, b), 31), -31)));
241 }
242
243 static inline vec_int4 vec_adds(vec_int4 a, vec_int4 b)
244 {
245 vec_int4 s, d;
246
247 s = spu_add(a, b);
248 d = spu_sel(s, spu_splats((signed int)0x7FFFFFFF), (vec_uint4)spu_rlmaska(spu_and(s, spu_nor(a, b)), -31));
249 d = spu_sel(d, spu_splats((signed int)0x80000000), (vec_uint4)spu_rlmaska(spu_nor(s, spu_nand(a, b)), -31));
250 return (d);
251 }
252
253 static inline vec_int4 vec_adds(vec_bint4 a, vec_int4 b)
254 {
255 return (vec_adds((vec_int4)(a), b));
256 }
257
258 static inline vec_int4 vec_adds(vec_int4 a, vec_bint4 b)
259 {
260 return (vec_adds(a, (vec_int4)(b)));
261 }
262
263 /* vec_and (vector logical and)
264 * =======
265 */
266 static inline vec_uchar16 vec_and(vec_uchar16 a, vec_uchar16 b)
267 {
268 return (spu_and(a, b));
269 }
270
271 static inline vec_char16 vec_and(vec_char16 a, vec_char16 b)
272 {
273 return (spu_and(a, b));
274 }
275
276 static inline vec_char16 vec_and(vec_bchar16 a, vec_char16 b)
277 {
278 return (spu_and((vec_char16)(a), b));
279 }
280
281 static inline vec_char16 vec_and(vec_char16 a, vec_bchar16 b)
282 {
283 return (spu_and(a, (vec_char16)(b)));
284 }
285
286 static inline vec_ushort8 vec_and(vec_ushort8 a, vec_ushort8 b)
287 {
288 return (spu_and(a, b));
289 }
290
291 static inline vec_short8 vec_and(vec_short8 a, vec_short8 b)
292 {
293 return (spu_and(a, b));
294 }
295
296 static inline vec_short8 vec_and(vec_bshort8 a, vec_short8 b)
297 {
298 return (spu_and((vec_short8)(a), b));
299 }
300
301 static inline vec_short8 vec_and(vec_short8 a, vec_bshort8 b)
302 {
303 return (spu_and(a, (vec_short8)(b)));
304 }
305
306 static inline vec_uint4 vec_and(vec_uint4 a, vec_uint4 b)
307 {
308 return (spu_and(a, b));
309 }
310
311 static inline vec_int4 vec_and(vec_int4 a, vec_int4 b)
312 {
313 return (spu_and(a, b));
314 }
315
316 static inline vec_int4 vec_and(vec_bint4 a, vec_int4 b)
317 {
318 return (spu_and((vec_int4)(a), b));
319 }
320
321 static inline vec_int4 vec_and(vec_int4 a, vec_bint4 b)
322 {
323 return (spu_and(a, (vec_int4)(b)));
324 }
325
326 static inline vec_float4 vec_and(vec_float4 a, vec_float4 b)
327 {
328 return (spu_and(a, b));
329 }
330
331 static inline vec_float4 vec_and(vec_bint4 a, vec_float4 b)
332 {
333 return (spu_and((vec_float4)(a),b));
334 }
335
336 static inline vec_float4 vec_and(vec_float4 a, vec_bint4 b)
337 {
338 return (spu_and(a, (vec_float4)(b)));
339 }
340
341
342 /* vec_andc (vector logical and with complement)
343 * ========
344 */
345 static inline vec_uchar16 vec_andc(vec_uchar16 a, vec_uchar16 b)
346 {
347 return (spu_andc(a, b));
348 }
349
350 static inline vec_char16 vec_andc(vec_char16 a, vec_char16 b)
351 {
352 return (spu_andc(a, b));
353 }
354
355 static inline vec_char16 vec_andc(vec_bchar16 a, vec_char16 b)
356 {
357 return (spu_andc((vec_char16)(a), b));
358 }
359
360 static inline vec_char16 vec_andc(vec_char16 a, vec_bchar16 b)
361 {
362 return (spu_andc(a, (vec_char16)(b)));
363 }
364
365 static inline vec_ushort8 vec_andc(vec_ushort8 a, vec_ushort8 b)
366 {
367 return (spu_andc(a, b));
368 }
369
370 static inline vec_short8 vec_andc(vec_short8 a, vec_short8 b)
371 {
372 return (spu_andc(a, b));
373 }
374
375 static inline vec_short8 vec_andc(vec_bshort8 a, vec_short8 b)
376 {
377 return (spu_andc((vec_short8)(a), b));
378 }
379
380 static inline vec_short8 vec_andc(vec_short8 a, vec_bshort8 b)
381 {
382 return (spu_andc(a, (vec_short8)(b)));
383 }
384
385 static inline vec_uint4 vec_andc(vec_uint4 a, vec_uint4 b)
386 {
387 return (spu_andc(a, b));
388 }
389
390 static inline vec_int4 vec_andc(vec_int4 a, vec_int4 b)
391 {
392 return (spu_andc(a, b));
393 }
394
395 static inline vec_int4 vec_andc(vec_bint4 a, vec_int4 b)
396 {
397 return (spu_andc((vec_int4)(a), b));
398 }
399
400 static inline vec_int4 vec_andc(vec_int4 a, vec_bint4 b)
401 {
402 return (spu_andc(a, (vec_int4)(b)));
403 }
404
405 static inline vec_float4 vec_andc(vec_float4 a, vec_float4 b)
406 {
407 return (spu_andc(a,b));
408 }
409
410 static inline vec_float4 vec_andc(vec_bint4 a, vec_float4 b)
411 {
412 return (spu_andc((vec_float4)(a),b));
413 }
414
415 static inline vec_float4 vec_andc(vec_float4 a, vec_bint4 b)
416 {
417 return (spu_andc(a, (vec_float4)(b)));
418 }
419
420 /* vec_avg (vector average)
421 * =======
422 */
423 static inline vec_uchar16 vec_avg(vec_uchar16 a, vec_uchar16 b)
424 {
425 return (spu_avg(a, b));
426 }
427
428 static inline vec_char16 vec_avg(vec_char16 a, vec_char16 b)
429 {
430 return ((vec_char16)(spu_xor(spu_avg((vec_uchar16)(a), (vec_uchar16)(b)),
431 (vec_uchar16)(spu_and(spu_xor(a,b), 0x80)))));
432 }
433
434 static inline vec_ushort8 vec_avg(vec_ushort8 a, vec_ushort8 b)
435 {
436 return (spu_add(spu_add(spu_rlmask(a, -1), spu_rlmask(b, -1)),
437 spu_and(spu_or(a, b), 1)));
438 }
439
440 static inline vec_short8 vec_avg(vec_short8 a, vec_short8 b)
441 {
442 return (spu_add(spu_add(spu_rlmaska(a, -1), spu_rlmaska(b, -1)),
443 spu_and(spu_or(a, b), 1)));
444 }
445
446 static inline vec_uint4 vec_avg(vec_uint4 a, vec_uint4 b)
447 {
448 return (spu_add(spu_add(spu_rlmask(a, -1), spu_rlmask(b, -1)),
449 spu_and(spu_or(a, b), 1)));
450 }
451
452 static inline vec_int4 vec_avg(vec_int4 a, vec_int4 b)
453 {
454 return (spu_add(spu_add(spu_rlmaska(a, -1), spu_rlmaska(b, -1)),
455 spu_and(spu_or(a, b), 1)));
456 }
457
458
459 /* vec_ceil (vector ceiling)
460 * ========
461 */
462 static inline vec_float4 vec_ceil(vec_float4 a)
463 {
464 vec_int4 exp;
465 vec_uint4 mask;
466
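  /* Add just under 1.0 (0x3F7FFFFF) to non-negative inputs only, then
     truncate toward zero by clearing the mantissa bits that lie below the
     binary point (mask width derived from the exponent).  */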
467 a = spu_add(a, (vec_float4)(spu_and(spu_xor(spu_rlmaska((vec_int4)a, -31), -1), spu_splats((signed int)0x3F7FFFFF))));
468 exp = spu_sub(127, (vec_int4)(spu_and(spu_rlmask((vec_uint4)(a), -23), 0xFF)));
469 mask = spu_rlmask(spu_splats((unsigned int)0x7FFFFF), exp);
470 mask = spu_sel(spu_splats((unsigned int)0), mask, spu_cmpgt(exp, -31));
471 mask = spu_or(mask, spu_xor((vec_uint4)(spu_rlmaska(spu_add(exp, -1), -31)), -1));
472
473 return ((vec_float4)(spu_andc((vec_uint4)(a), mask)));
474 }
475
476
477 /* vec_cmpb (vector compare bounds floating-point)
478 * ========
479 */
480 static inline vec_int4 vec_cmpb(vec_float4 a, vec_float4 b)
481 {
482 vec_int4 b0 = (vec_int4)spu_splats(0x80000000);
483 vec_int4 b1 = (vec_int4)spu_splats(0x40000000);
484
485 return (spu_or(spu_and((vec_int4)spu_cmpgt(a, b), b0),
486 spu_and((vec_int4)spu_cmpgt(spu_xor(b, (vec_float4)(b0)), a), b1)));
487 }
488
489 /* vec_cmpeq (vector compare equal)
490 * =========
491 */
492 #define vec_cmpeq(_a, _b) spu_cmpeq(_a, _b)
493
494
495 /* vec_cmpge (vector compare greater than or equal)
496 * =========
497 */
498 static inline vec_bint4 vec_cmpge(vec_float4 a, vec_float4 b)
499 {
500 return (spu_xor(spu_cmpgt(b, a), -1));
501 }
502
503
504 /* vec_cmpgt (vector compare greater than)
505 * =========
506 */
507 #define vec_cmpgt(_a, _b) spu_cmpgt(_a, _b)
508
509
510 /* vec_cmple (vector compare less than or equal)
511 * =========
512 */
513 static inline vec_bint4 vec_cmple(vec_float4 a, vec_float4 b)
514 {
515 return (spu_xor(spu_cmpgt(a, b), -1));
516 }
517
518
519 /* vec_cmplt (vector compare less than)
520 * =========
521 */
522 #define vec_cmplt(_a, _b) spu_cmpgt(_b, _a)
523
524
525 /* vec_ctf (vector convert from fixed-point word)
526 * =======
527 */
528 #define vec_ctf(_a, _b) spu_convtf(_a, _b)
529
530
531 /* vec_cts (vector convert to signed fixed-point word saturate)
532 * =======
533 */
534 #define vec_cts(_a, _b) spu_convts(_a, _b)
535
536
537 /* vec_ctu (vector convert to unsigned fixed-point word saturate)
538 * =======
539 */
540 #define vec_ctu(_a, _b) spu_convtu(_a, _b)
541
542
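/* The data stream (prefetch hint) operations below have no SPU equivalent;
 * they intentionally expand to nothing.
 */
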
543 /* vec_dss (vector data stream stop)
544 * =======
545 */
546 #define vec_dss(_a)
547
548
549 /* vec_dssall (vector data stream stop all)
550 * ==========
551 */
552 #define vec_dssall()
553
554
555 /* vec_dst (vector data stream touch)
556 * =======
557 */
558 #define vec_dst(_a, _b, _c)
559
560
561 /* vec_dstst (vector data stream touch for store)
562 * =========
563 */
564 #define vec_dstst(_a, _b, _c)
565
566
567 /* vec_dststt (vector data stream touch for store transient)
568 * ==========
569 */
570 #define vec_dststt(_a, _b, _c)
571
572
573 /* vec_dstt (vector data stream touch transient)
574 * ========
575 */
576 #define vec_dstt(_a, _b, _c)
577
578
579 /* vec_expte (vector is 2 raised to the exponent estimate floating-point)
580 * =========
581 */
582 static inline vec_float4 vec_expte(vec_float4 a)
583 {
584 vec_float4 bias, frac, exp;
585 vec_int4 ia;
586
587 bias = (vec_float4)(spu_andc(spu_splats((signed int)0x3F7FFFFF), spu_rlmaska((vec_int4)(a), -31)));
588 ia = spu_convts(spu_add(a, bias), 0);
589 frac = spu_sub(spu_convtf(ia, 0), a);
590 exp = (vec_float4)(spu_sl(spu_add(ia, 127), 23));
591
592 return (spu_mul(spu_madd(spu_madd(spu_splats(0.17157287f), frac, spu_splats(-0.67157287f)),
593 frac, spu_splats(1.0f)), exp));
594 }
595
596
597 /* vec_floor (vector floor)
598 * =========
599 */
600 static inline vec_float4 vec_floor(vec_float4 a)
601 {
602 vec_int4 exp;
603 vec_uint4 mask;
604
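  /* Mirror of vec_ceil: subtract just under 1.0 from negative inputs only,
     then truncate toward zero by clearing the sub-integer mantissa bits.  */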
605 a = spu_sub(a, (vec_float4)(spu_and(spu_rlmaska((vec_int4)a, -31), spu_splats((signed int)0x3F7FFFFF))));
606 exp = spu_sub(127, (vec_int4)(spu_and(spu_rlmask((vec_uint4)(a), -23), 0xFF)));
607 mask = spu_rlmask(spu_splats((unsigned int)0x7FFFFF), exp);
608 mask = spu_sel(spu_splats((unsigned int)0), mask, spu_cmpgt(exp, -31));
609 mask = spu_or(mask, spu_xor((vec_uint4)(spu_rlmaska(spu_add(exp, -1), -31)), -1));
610
611 return ((vec_float4)(spu_andc((vec_uint4)(a), mask)));
612 }
613
614
615 /* vec_ld (vector load indexed)
616 * ======
617 */
618 static inline vec_uchar16 vec_ld(int a, unsigned char *b)
619 {
620 return (*((vec_uchar16 *)(b+a)));
621 }
622
623 static inline vec_uchar16 vec_ld(int a, vec_uchar16 *b)
624 {
625 return (*((vec_uchar16 *)((unsigned char *)(b)+a)));
626 }
627
628 static inline vec_char16 vec_ld(int a, signed char *b)
629 {
630 return (*((vec_char16 *)(b+a)));
631 }
632
633 static inline vec_char16 vec_ld(int a, vec_char16 *b)
634 {
635 return (*((vec_char16 *)((signed char *)(b)+a)));
636 }
637
638 static inline vec_ushort8 vec_ld(int a, unsigned short *b)
639 {
640 return (*((vec_ushort8 *)((unsigned char *)(b)+a)));
641 }
642
643 static inline vec_ushort8 vec_ld(int a, vec_ushort8 *b)
644 {
645 return (*((vec_ushort8 *)((unsigned char *)(b)+a)));
646 }
647
648 static inline vec_short8 vec_ld(int a, signed short *b)
649 {
650 return (*((vec_short8 *)((unsigned char *)(b)+a)));
651 }
652
653 static inline vec_short8 vec_ld(int a, vec_short8 *b)
654 {
655 return (*((vec_short8 *)((signed char *)(b)+a)));
656 }
657
658 static inline vec_uint4 vec_ld(int a, unsigned int *b)
659 {
660 return (*((vec_uint4 *)((unsigned char *)(b)+a)));
661 }
662
663 static inline vec_uint4 vec_ld(int a, vec_uint4 *b)
664 {
665 return (*((vec_uint4 *)((unsigned char *)(b)+a)));
666 }
667
668 static inline vec_int4 vec_ld(int a, signed int *b)
669 {
670 return (*((vec_int4 *)((unsigned char *)(b)+a)));
671 }
672
673 static inline vec_int4 vec_ld(int a, vec_int4 *b)
674 {
675 return (*((vec_int4 *)((signed char *)(b)+a)));
676 }
677
678 static inline vec_float4 vec_ld(int a, float *b)
679 {
680 return (*((vec_float4 *)((unsigned char *)(b)+a)));
681 }
682
683 static inline vec_float4 vec_ld(int a, vec_float4 *b)
684 {
685 return (*((vec_float4 *)((unsigned char *)(b)+a)));
686 }
687
688 /* vec_lde (vector load element indexed)
689 * =======
690 */
691 static inline vec_uchar16 vec_lde(int a, unsigned char *b)
692 {
693 return (*((vec_uchar16 *)(b+a)));
694 }
695
696 static inline vec_char16 vec_lde(int a, signed char *b)
697 {
698 return (*((vec_char16 *)(b+a)));
699 }
700
701 static inline vec_ushort8 vec_lde(int a, unsigned short *b)
702 {
703 return (*((vec_ushort8 *)((unsigned char *)(b)+a)));
704 }
705
706 static inline vec_short8 vec_lde(int a, signed short *b)
707 {
708 return (*((vec_short8 *)((unsigned char *)(b)+a)));
709 }
710
711
712 static inline vec_uint4 vec_lde(int a, unsigned int *b)
713 {
714 return (*((vec_uint4 *)((unsigned char *)(b)+a)));
715 }
716
717 static inline vec_int4 vec_lde(int a, signed int *b)
718 {
719 return (*((vec_int4 *)((unsigned char *)(b)+a)));
720 }
721
722
723 static inline vec_float4 vec_lde(int a, float *b)
724 {
725 return (*((vec_float4 *)((unsigned char *)(b)+a)));
726 }
727
728 /* vec_ldl (vector load indexed LRU)
729 * =======
730 */
731 #define vec_ldl(_a, _b) vec_ld(_a, _b)
732
733
734 /* vec_loge (vector log2 estimate floating-point)
735 * ========
736 */
737 static inline vec_float4 vec_loge(vec_float4 a)
738 {
739 vec_int4 exp;
740 vec_float4 frac;
741
742 exp = spu_add((vec_int4)(spu_and(spu_rlmask((vec_uint4)(a), -23), 0xFF)), -127);
743 frac = (vec_float4)(spu_sub((vec_int4)(a), spu_sl(exp, 23)));
744
745 return (spu_madd(spu_madd(spu_splats(-0.33985f), frac, spu_splats(2.01955f)),
746 frac, spu_sub(spu_convtf(exp, 0), spu_splats(1.6797f))));
747 }
748
749
750 /* vec_lvsl (vector load for shift left)
751 * ========
752 */
753 static inline vec_uchar16 vec_lvsl(int a, unsigned char *b)
754 {
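  /* Build the VMX alignment (permute) vector {n, n+1, ..., n+15}, where n is
     the low four bits of the effective address a + b.  */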
755 return ((vec_uchar16)spu_add((vec_ushort8)(spu_splats((unsigned char)((a + (int)(b)) & 0xF))),
756 ((vec_ushort8){0x0001, 0x0203, 0x0405, 0x0607,
757 0x0809, 0x0A0B, 0x0C0D, 0x0E0F})));
758 }
759
760 static inline vec_uchar16 vec_lvsl(int a, signed char *b)
761 {
762 return (vec_lvsl(a, (unsigned char *)b));
763 }
764
765 static inline vec_uchar16 vec_lvsl(int a, unsigned short *b)
766 {
767 return (vec_lvsl(a, (unsigned char *)b));
768 }
769
770 static inline vec_uchar16 vec_lvsl(int a, short *b)
771 {
772 return (vec_lvsl(a, (unsigned char *)b));
773 }
774
775 static inline vec_uchar16 vec_lvsl(int a, unsigned int *b)
776 {
777 return (vec_lvsl(a, (unsigned char *)b));
778 }
779
780 static inline vec_uchar16 vec_lvsl(int a, int *b)
781 {
782 return (vec_lvsl(a, (unsigned char *)b));
783 }
784
785 static inline vec_uchar16 vec_lvsl(int a, float *b)
786 {
787 return (vec_lvsl(a, (unsigned char *)b));
788 }
789
790
791 /* vec_lvsr (vector load for shift right)
792 * ========
793 */
794 static inline vec_uchar16 vec_lvsr(int a, unsigned char *b)
795 {
796 return ((vec_uchar16)(spu_sub(((vec_ushort8){0x1011, 0x1213, 0x1415, 0x1617,
797 0x1819, 0x1A1B, 0x1C1D, 0x1E1F}),
798 (vec_ushort8)(spu_splats((unsigned char)((a + (int)(b)) & 0xF))))));
799 }
800
801 static inline vec_uchar16 vec_lvsr(int a, signed char *b)
802 {
803 return (vec_lvsr(a, (unsigned char *)b));
804 }
805
806 static inline vec_uchar16 vec_lvsr(int a, unsigned short *b)
807 {
808 return (vec_lvsr(a, (unsigned char *)b));
809 }
810
811 static inline vec_uchar16 vec_lvsr(int a, short *b)
812 {
813 return (vec_lvsr(a, (unsigned char *)b));
814 }
815
816 static inline vec_uchar16 vec_lvsr(int a, unsigned int *b)
817 {
818 return (vec_lvsr(a, (unsigned char *)b));
819 }
820
821 static inline vec_uchar16 vec_lvsr(int a, int *b)
822 {
823 return (vec_lvsr(a, (unsigned char *)b));
824 }
825
826 static inline vec_uchar16 vec_lvsr(int a, float *b)
827 {
828 return (vec_lvsr(a, (unsigned char *)b));
829 }
830
831 /* vec_madd (vector multiply add)
832 * ========
833 */
834 #define vec_madd(_a, _b, _c) spu_madd(_a, _b, _c)
835
836
837
838 /* vec_madds (vector multiply add saturate)
839 * =========
840 */
841 static inline vec_short8 vec_madds(vec_short8 a, vec_short8 b, vec_short8 c)
842 {
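  /* Each 16-bit product is rescaled to (a*b) >> 15: the even products via
     mule << 1 (keeping the high halfword), the odd products via mulo >> 15
     (keeping the low halfword); the interleaved result is added to c with
     saturation.  */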
843 return (vec_adds(c, spu_sel((vec_short8)(spu_sl(spu_mule(a, b), 1)),
844 (vec_short8)(spu_rlmask(spu_mulo(a, b), -15)),
845 ((vec_ushort8){0, 0xFFFF, 0, 0xFFFF, 0, 0xFFFF, 0, 0xFFFF}))));
846 }
847
848 /* vec_max (vector maximum)
849 * =======
850 */
851 static inline vec_uchar16 vec_max(vec_uchar16 a, vec_uchar16 b)
852 {
853 return (spu_sel(b, a, spu_cmpgt(a, b)));
854 }
855
856 static inline vec_char16 vec_max(vec_char16 a, vec_char16 b)
857 {
858 return (spu_sel(b, a, spu_cmpgt(a, b)));
859 }
860
861 static inline vec_char16 vec_max(vec_bchar16 a, vec_char16 b)
862 {
863 return (spu_sel(b, (vec_char16)(a), spu_cmpgt((vec_char16)(a), b)));
864 }
865
866 static inline vec_char16 vec_max(vec_char16 a, vec_bchar16 b)
867 {
868 return (spu_sel((vec_char16)(b), a, spu_cmpgt(a, (vec_char16)(b))));
869 }
870
871 static inline vec_ushort8 vec_max(vec_ushort8 a, vec_ushort8 b)
872 {
873 return (spu_sel(b, a, spu_cmpgt(a, b)));
874 }
875
876 static inline vec_short8 vec_max(vec_short8 a, vec_short8 b)
877 {
878 return (spu_sel(b, a, spu_cmpgt(a, b)));
879 }
880
881 static inline vec_short8 vec_max(vec_bshort8 a, vec_short8 b)
882 {
883 return (spu_sel(b, (vec_short8)(a), spu_cmpgt((vec_short8)(a), b)));
884 }
885
886 static inline vec_short8 vec_max(vec_short8 a, vec_bshort8 b)
887 {
888 return (spu_sel((vec_short8)(b), a, spu_cmpgt(a, (vec_short8)(b))));
889 }
890
891 static inline vec_uint4 vec_max(vec_uint4 a, vec_uint4 b)
892 {
893 return (spu_sel(b, a, spu_cmpgt(a, b)));
894 }
895
896 static inline vec_int4 vec_max(vec_int4 a, vec_int4 b)
897 {
898 return (spu_sel(b, a, spu_cmpgt(a, b)));
899 }
900
901 static inline vec_int4 vec_max(vec_bint4 a, vec_int4 b)
902 {
903 return (spu_sel(b, (vec_int4)(a), spu_cmpgt((vec_int4)(a), b)));
904 }
905
906 static inline vec_int4 vec_max(vec_int4 a, vec_bint4 b)
907 {
908 return (spu_sel((vec_int4)(b), a, spu_cmpgt(a, (vec_int4)(b))));
909 }
910
911 static inline vec_float4 vec_max(vec_float4 a, vec_float4 b)
912 {
913 return (spu_sel(b, a, spu_cmpgt(a, b)));
914 }
915
916
917 /* vec_mergeh (vector merge high)
918 * ==========
919 */
920 static inline vec_uchar16 vec_mergeh(vec_uchar16 a, vec_uchar16 b)
921 {
922 return (spu_shuffle(a, b, ((vec_uchar16){0, 16, 1, 17, 2, 18, 3, 19,
923 4, 20, 5, 21, 6, 22, 7, 23})));
924 }
925
926 static inline vec_char16 vec_mergeh(vec_char16 a, vec_char16 b)
927 {
928 return (spu_shuffle(a, b, ((vec_uchar16){0, 16, 1, 17, 2, 18, 3, 19,
929 4, 20, 5, 21, 6, 22, 7, 23})));
930 }
931
932 static inline vec_ushort8 vec_mergeh(vec_ushort8 a, vec_ushort8 b)
933 {
934 return (spu_shuffle(a, b, ((vec_uchar16){0, 1, 16, 17, 2, 3, 18, 19,
935 4, 5, 20, 21, 6, 7, 22, 23})));
936 }
937
938 static inline vec_short8 vec_mergeh(vec_short8 a, vec_short8 b)
939 {
940 return (spu_shuffle(a, b, ((vec_uchar16){0, 1, 16, 17, 2, 3, 18, 19,
941 4, 5, 20, 21, 6, 7, 22, 23})));
942 }
943
944 static inline vec_uint4 vec_mergeh(vec_uint4 a, vec_uint4 b)
945 {
946 return (spu_shuffle(a, b, ((vec_uchar16){0, 1, 2, 3, 16, 17, 18, 19,
947 4, 5, 6, 7, 20, 21, 22, 23})));
948 }
949
950 static inline vec_int4 vec_mergeh(vec_int4 a, vec_int4 b)
951 {
952 return (spu_shuffle(a, b, ((vec_uchar16){0, 1, 2, 3, 16, 17, 18, 19,
953 4, 5, 6, 7, 20, 21, 22, 23})));
954 }
955
956 static inline vec_float4 vec_mergeh(vec_float4 a, vec_float4 b)
957 {
958 return (spu_shuffle(a, b, ((vec_uchar16){0, 1, 2, 3, 16, 17, 18, 19,
959 4, 5, 6, 7, 20, 21, 22, 23})));
960 }
961
962 /* vec_mergel (vector merge low)
963 * ==========
964 */
965 static inline vec_uchar16 vec_mergel(vec_uchar16 a, vec_uchar16 b)
966 {
967 return (spu_shuffle(a, b, ((vec_uchar16){ 8, 24, 9, 25, 10, 26, 11, 27,
968 12, 28, 13, 29, 14, 30, 15, 31})));
969 }
970
971 static inline vec_char16 vec_mergel(vec_char16 a, vec_char16 b)
972 {
973 return (spu_shuffle(a, b, ((vec_uchar16){ 8, 24, 9, 25, 10, 26, 11, 27,
974 12, 28, 13, 29, 14, 30, 15, 31})));
975 }
976
977 static inline vec_ushort8 vec_mergel(vec_ushort8 a, vec_ushort8 b)
978 {
979 return (spu_shuffle(a, b, ((vec_uchar16){ 8, 9, 24, 25, 10, 11, 26, 27,
980 12, 13, 28, 29, 14, 15, 30, 31})));
981 }
982
983 static inline vec_short8 vec_mergel(vec_short8 a, vec_short8 b)
984 {
985 return (spu_shuffle(a, b, ((vec_uchar16){ 8, 9, 24, 25, 10, 11, 26, 27,
986 12, 13, 28, 29, 14, 15, 30, 31})));
987 }
988
989 static inline vec_uint4 vec_mergel(vec_uint4 a, vec_uint4 b)
990 {
991 return (spu_shuffle(a, b, ((vec_uchar16){ 8, 9, 10, 11, 24, 25, 26, 27,
992 12, 13, 14, 15, 28, 29, 30, 31})));
993 }
994
995 static inline vec_int4 vec_mergel(vec_int4 a, vec_int4 b)
996 {
997 return (spu_shuffle(a, b, ((vec_uchar16){ 8, 9, 10, 11, 24, 25, 26, 27,
998 12, 13, 14, 15, 28, 29, 30, 31})));
999 }
1000
1001 static inline vec_float4 vec_mergel(vec_float4 a, vec_float4 b)
1002 {
1003 return (spu_shuffle(a, b, ((vec_uchar16){ 8, 9, 10, 11, 24, 25, 26, 27,
1004 12, 13, 14, 15, 28, 29, 30, 31})));
1005 }
1006
1007 /* vec_mfvscr (vector move from vector status and control register)
1008 * ==========
1009 */
1010 static inline vec_ushort8 vec_mfvscr()
1011 {
1012 return ((vec_ushort8)spu_splats(0)); /* not supported */
1013 }
1014
1015
1016 /* vec_min (vector minimum)
1017 * =======
1018 */
1019 static inline vec_uchar16 vec_min(vec_uchar16 a, vec_uchar16 b)
1020 {
1021 return (spu_sel(a, b, spu_cmpgt(a, b)));
1022 }
1023
1024 static inline vec_char16 vec_min(vec_char16 a, vec_char16 b)
1025 {
1026 return (spu_sel(a, b, spu_cmpgt(a, b)));
1027 }
1028
1029 static inline vec_char16 vec_min(vec_bchar16 a, vec_char16 b)
1030 {
1031 return (spu_sel((vec_char16)(a), b, spu_cmpgt((vec_char16)(a), b)));
1032 }
1033
1034 static inline vec_char16 vec_min(vec_char16 a, vec_bchar16 b)
1035 {
1036 return (spu_sel(a, (vec_char16)(b), spu_cmpgt(a, (vec_char16)(b))));
1037 }
1038
1039 static inline vec_ushort8 vec_min(vec_ushort8 a, vec_ushort8 b)
1040 {
1041 return (spu_sel(a, b, spu_cmpgt(a, b)));
1042 }
1043
1044 static inline vec_short8 vec_min(vec_short8 a, vec_short8 b)
1045 {
1046 return (spu_sel(a, b, spu_cmpgt(a, b)));
1047 }
1048
1049 static inline vec_short8 vec_min(vec_bshort8 a, vec_short8 b)
1050 {
1051 return (spu_sel((vec_short8)(a), b, spu_cmpgt((vec_short8)(a), b)));
1052 }
1053
1054 static inline vec_short8 vec_min(vec_short8 a, vec_bshort8 b)
1055 {
1056 return (spu_sel(a, (vec_short8)(b), spu_cmpgt(a, (vec_short8)(b))));
1057 }
1058
1059 static inline vec_uint4 vec_min(vec_uint4 a, vec_uint4 b)
1060 {
1061 return (spu_sel(a, b, spu_cmpgt(a, b)));
1062 }
1063
1064 static inline vec_int4 vec_min(vec_int4 a, vec_int4 b)
1065 {
1066 return (spu_sel(a, b, spu_cmpgt(a, b)));
1067 }
1068
1069 static inline vec_int4 vec_min(vec_bint4 a, vec_int4 b)
1070 {
1071 return (spu_sel((vec_int4)(a), b, spu_cmpgt((vec_int4)(a), b)));
1072 }
1073
1074 static inline vec_int4 vec_min(vec_int4 a, vec_bint4 b)
1075 {
1076 return (spu_sel(a, (vec_int4)(b), spu_cmpgt(a, (vec_int4)(b))));
1077 }
1078
1079 static inline vec_float4 vec_min(vec_float4 a, vec_float4 b)
1080 {
1081 return (spu_sel(a, b, spu_cmpgt(a, b)));
1082 }
1083
1084 /* vec_mladd (vector multiply low and add unsigned half word)
1085 * =========
1086 */
1087 static inline vec_short8 vec_mladd(vec_short8 a, vec_short8 b, vec_short8 c)
1088 {
1089 return ((vec_short8)(spu_shuffle(spu_madd((vec_short8)(spu_rl((vec_uint4)(a), -16)),
1090 (vec_short8)(spu_rl((vec_uint4)(b), -16)),
1091 (vec_int4)(spu_rl((vec_uint4)(c), -16))),
1092 spu_madd(a, b, spu_extend(c)),
1093 ((vec_uchar16){ 2, 3, 18, 19, 6, 7, 22, 23,
1094 10, 11, 26, 27, 14, 15, 30, 31}))));
1095 }
1096
1097
1098 static inline vec_ushort8 vec_mladd(vec_ushort8 a, vec_ushort8 b, vec_ushort8 c)
1099 {
1100 return ((vec_ushort8)(vec_mladd((vec_short8)(a), (vec_short8)(b), (vec_short8)(c))));
1101 }
1102
1103 static inline vec_short8 vec_mladd(vec_ushort8 a, vec_short8 b, vec_short8 c)
1104 {
1105 return (vec_mladd((vec_short8)(a), b, c));
1106 }
1107
1108 static inline vec_short8 vec_mladd(vec_short8 a, vec_ushort8 b, vec_ushort8 c)
1109 {
1110 return (vec_mladd(a, (vec_short8)(b), (vec_short8)(c)));
1111 }
1112
1113
1114 /* vec_mradds (vector multiply round and add saturate)
1115 * ==========
1116 */
1117 static inline vec_short8 vec_mradds(vec_short8 a, vec_short8 b, vec_short8 c)
1118 {
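  /* The 0x4000 bias rounds each 32-bit product to nearest before the >> 15
     rescale; the even/odd results are interleaved and added to c with
     saturation.  */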
1119 vec_int4 round = (vec_int4)spu_splats(0x4000);
1120 vec_short8 hi, lo;
1121
1122 hi = (vec_short8)(spu_sl(spu_add(spu_mule(a, b), round), 1));
1123 lo = (vec_short8)(spu_rlmask(spu_add(spu_mulo(a, b), round), -15));
1124
1125 return (vec_adds(spu_sel(hi, lo, ((vec_ushort8){0, 0xFFFF, 0, 0xFFFF, 0, 0xFFFF, 0, 0xFFFF})), c));
1126 }
1127
1128
1129 /* vec_msum (vector multiply sum)
1130 * ========
1131 */
1132 static inline vec_uint4 vec_msum(vec_uchar16 a, vec_uchar16 b, vec_uint4 c)
1133 {
1134 vec_ushort8 a1, a2, b1, b2;
1135 vec_uint4 p1, p2;
1136
1137 a1 = spu_and((vec_ushort8)(a), 0xFF);
1138 a2 = spu_rlmask((vec_ushort8)(a), -8);
1139 b1 = spu_and((vec_ushort8)(b), 0xFF);
1140 b2 = spu_rlmask((vec_ushort8)(b), -8);
1141
1142 p1 = spu_add(spu_mulo(a1, b1), spu_mulo(spu_rlqwbyte(a1, -2), spu_rlqwbyte(b1, -2)));
1143 p2 = spu_add(spu_mulo(a2, b2), spu_mulo(spu_rlqwbyte(a2, -2), spu_rlqwbyte(b2, -2)));
1144 return (spu_add(p2, spu_add(p1, c)));
1145 }
1146
1147 static inline vec_int4 vec_msum(vec_char16 a, vec_uchar16 b, vec_int4 c)
1148 {
1149 vec_short8 a1, a2, b1, b2;
1150 vec_int4 p1, p2;
1151
1152 a1 = (vec_short8)(spu_extend(a));
1153 a2 = spu_rlmaska((vec_short8)(a), -8);
1154 b1 = (vec_short8)(spu_and((vec_ushort8)(b), 0xFF));
1155 b2 = (vec_short8)spu_rlmask((vec_ushort8)(b), -8);
1156
1157 p1 = spu_add(spu_mulo(a1, b1), spu_mulo(spu_rlqwbyte(a1, -2), spu_rlqwbyte(b1, -2)));
1158 p2 = spu_add(spu_mulo(a2, b2), spu_mulo(spu_rlqwbyte(a2, -2), spu_rlqwbyte(b2, -2)));
1159 return (spu_add(p2, spu_add(p1, c)));
1160 }
1161
1162 static inline vec_uint4 vec_msum(vec_ushort8 a, vec_ushort8 b, vec_uint4 c)
1163 {
1164 return (spu_add(spu_add(spu_mulo(a, b), spu_mulo(spu_rlqwbyte(a, -2), spu_rlqwbyte(b, -2))), c));
1165 }
1166
1167 static inline vec_int4 vec_msum(vec_short8 a, vec_short8 b, vec_int4 c)
1168 {
1169 return (spu_add(spu_add(spu_mulo(a, b), spu_mulo(spu_rlqwbyte(a, -2), spu_rlqwbyte(b, -2))), c));
1170 }
1171
1172
1173 /* vec_msums (vector multiply sum saturate)
1174 * ========
1175 */
1176 static inline vec_uint4 vec_msums(vec_ushort8 a, vec_ushort8 b, vec_uint4 c)
1177 {
1178 vec_uint4 p1, p2;
1179
1180 p1 = spu_mulo(a, b);
1181 p2 = spu_mulo(spu_rlqwbyte(a, -2), spu_rlqwbyte(b, -2));
1182
1183 return (vec_adds(p2, vec_adds(p1, c)));
1184 }
1185
1186 static inline vec_int4 vec_msums(vec_short8 a, vec_short8 b, vec_int4 c)
1187 {
1188 return (vec_adds(spu_add(spu_mulo(a, b), spu_mulo(spu_rlqwbyte(a, -2), spu_rlqwbyte(b, -2))), c));
1189 }
1190
1191 /* vec_mtvscr (vector move to vector status and control register)
1192 * ==========
1193 */
1194 #define vec_mtvscr(_a) /* not supported */
1195
1196
1197 /* vec_mule (vector multiply even)
1198 * ========
1199 */
1200 static inline vec_ushort8 vec_mule(vec_uchar16 a, vec_uchar16 b)
1201 {
1202 vec_ushort8 hi, lo;
1203
1204 hi = (vec_ushort8)spu_mulo((vec_ushort8)(spu_rlmask((vec_uint4)(a), -24)),
1205 (vec_ushort8)(spu_rlmask((vec_uint4)(b), -24)));
1206 lo = (vec_ushort8)spu_mulo((vec_ushort8)(spu_rlmask((vec_short8)(a), -8)),
1207 (vec_ushort8)(spu_rlmask((vec_short8)(b), -8)));
1208
1209 return (spu_shuffle(hi, lo, ((vec_uchar16){ 2, 3, 18, 19, 6, 7, 22, 23,
1210 10, 11, 26, 27, 14, 15, 30, 31})));
1211 }
1212
1213 static inline vec_short8 vec_mule(vec_char16 a, vec_char16 b)
1214 {
1215 vec_short8 hi, lo;
1216
1217 hi = (vec_short8)spu_mulo((vec_short8)(spu_rlmaska((vec_uint4)(a), -24)),
1218 (vec_short8)(spu_rlmaska((vec_uint4)(b), -24)));
1219 lo = (vec_short8)spu_mulo((vec_short8)(spu_rlmaska((vec_short8)(a), -8)),
1220 (vec_short8)(spu_rlmaska((vec_short8)(b), -8)));
1221
1222 return (spu_shuffle(hi, lo, ((vec_uchar16){ 2, 3, 18, 19, 6, 7, 22, 23,
1223 10, 11, 26, 27, 14, 15, 30, 31})));
1224 }
1225
1226 static inline vec_uint4 vec_mule(vec_ushort8 a, vec_ushort8 b)
1227 {
1228 return (spu_mulo((vec_ushort8)spu_rlmask((vec_uint4)(a), -16),
1229 (vec_ushort8)spu_rlmask((vec_uint4)(b), -16)));
1230 }
1231
1232
1233 static inline vec_int4 vec_mule(vec_short8 a, vec_short8 b)
1234 {
1235 return (spu_mulo((vec_short8)spu_rlmaska((vec_int4)(a), -16),
1236 (vec_short8)spu_rlmaska((vec_int4)(b), -16)));
1237 }
1238
1239
1240 /* vec_mulo (vector multiply odd)
1241 * ========
1242 */
1243 static inline vec_ushort8 vec_mulo(vec_uchar16 a, vec_uchar16 b)
1244 {
1245 vec_ushort8 hi, lo;
1246
1247 hi = (vec_ushort8)spu_mulo((vec_ushort8)(spu_and(spu_rlmask((vec_uint4)(a), -16), 0xFF)),
1248 (vec_ushort8)(spu_and(spu_rlmask((vec_uint4)(b), -16), 0xFF)));
1249 lo = (vec_ushort8)spu_mulo(spu_and((vec_ushort8)(a), 0xFF), spu_and((vec_ushort8)(b), 0xFF));
1250
1251 return (spu_shuffle(hi, lo, ((vec_uchar16){ 2, 3, 18, 19, 6, 7, 22, 23,
1252 10, 11, 26, 27, 14, 15, 30, 31})));
1253 }
1254
1255 static inline vec_short8 vec_mulo(vec_char16 a, vec_char16 b)
1256 {
1257 vec_short8 aa, bb, hi, lo;
1258
1259 aa = spu_extend(a);
1260 bb = spu_extend(b);
1261
1262 hi = (vec_short8)spu_mulo((vec_short8)(spu_rlmaska((vec_uint4)(aa), -16)),
1263 (vec_short8)(spu_rlmaska((vec_uint4)(bb), -16)));
1264 lo = (vec_short8)spu_mulo(aa, bb);
1265 return (spu_shuffle(hi, lo, ((vec_uchar16){ 2, 3, 18, 19, 6, 7, 22, 23,
1266 10, 11, 26, 27, 14, 15, 30, 31})));
1267 }
1268
1269 static inline vec_uint4 vec_mulo(vec_ushort8 a, vec_ushort8 b)
1270 {
1271 return (spu_mulo(a, b));
1272 }
1273
1274
1275 static inline vec_int4 vec_mulo(vec_short8 a, vec_short8 b)
1276 {
1277 return (spu_mulo(a, b));
1278 }
1279
1280
1281 /* vec_nmsub (vector negative multiply subtract)
1282 * =========
1283 */
1284 #define vec_nmsub(_a, _b, _c) spu_nmsub(_a, _b, _c)
1285
1286
1287 /* vec_nor (vector logical nor)
1288 * =======
1289 */
1290 #define vec_nor(_a, _b) spu_nor(_a, _b)
1291
1292
1293 /* vec_or (vector logical or)
1294 * ======
1295 */
1296 static inline vec_uchar16 vec_or(vec_uchar16 a, vec_uchar16 b)
1297 {
1298 return (spu_or(a, b));
1299 }
1300
1301 static inline vec_char16 vec_or(vec_char16 a, vec_char16 b)
1302 {
1303 return (spu_or(a, b));
1304 }
1305
1306 static inline vec_char16 vec_or(vec_bchar16 a, vec_char16 b)
1307 {
1308 return (spu_or((vec_char16)(a), b));
1309 }
1310
1311 static inline vec_char16 vec_or(vec_char16 a, vec_bchar16 b)
1312 {
1313 return (spu_or(a, (vec_char16)(b)));
1314 }
1315
1316 static inline vec_ushort8 vec_or(vec_ushort8 a, vec_ushort8 b)
1317 {
1318 return (spu_or(a, b));
1319 }
1320
1321 static inline vec_short8 vec_or(vec_short8 a, vec_short8 b)
1322 {
1323 return (spu_or(a, b));
1324 }
1325
1326 static inline vec_short8 vec_or(vec_bshort8 a, vec_short8 b)
1327 {
1328 return (spu_or((vec_short8)(a), b));
1329 }
1330
1331 static inline vec_short8 vec_or(vec_short8 a, vec_bshort8 b)
1332 {
1333 return (spu_or(a, (vec_short8)(b)));
1334 }
1335
1336 static inline vec_uint4 vec_or(vec_uint4 a, vec_uint4 b)
1337 {
1338 return (spu_or(a, b));
1339 }
1340
1341 static inline vec_int4 vec_or(vec_int4 a, vec_int4 b)
1342 {
1343 return (spu_or(a, b));
1344 }
1345
1346 static inline vec_int4 vec_or(vec_bint4 a, vec_int4 b)
1347 {
1348 return (spu_or((vec_int4)(a), b));
1349 }
1350
1351 static inline vec_int4 vec_or(vec_int4 a, vec_bint4 b)
1352 {
1353 return (spu_or(a, (vec_int4)(b)));
1354 }
1355
1356 static inline vec_float4 vec_or(vec_float4 a, vec_float4 b)
1357 {
1358 return (spu_or(a, b));
1359 }
1360
1361 static inline vec_float4 vec_or(vec_bint4 a, vec_float4 b)
1362 {
1363 return (spu_or((vec_float4)(a),b));
1364 }
1365
1366 static inline vec_float4 vec_or(vec_float4 a, vec_bint4 b)
1367 {
1368 return (spu_or(a, (vec_float4)(b)));
1369 }
1370
1371
1372 /* vec_pack (vector pack)
1373 * ========
1374 */
1375 static inline vec_uchar16 vec_pack(vec_ushort8 a, vec_ushort8 b)
1376 {
1377 return ((vec_uchar16)spu_shuffle(a, b, ((vec_uchar16){ 1, 3, 5, 7, 9, 11, 13, 15,
1378 17, 19, 21, 23, 25, 27, 29, 31})));
1379 }
1380
1381 static inline vec_char16 vec_pack(vec_short8 a, vec_short8 b)
1382 {
1383 return ((vec_char16)spu_shuffle(a, b, ((vec_uchar16){ 1, 3, 5, 7, 9, 11, 13, 15,
1384 17, 19, 21, 23, 25, 27, 29, 31})));
1385 }
1386
1387 static inline vec_ushort8 vec_pack(vec_uint4 a, vec_uint4 b)
1388 {
1389 return ((vec_ushort8)spu_shuffle(a, b, ((vec_uchar16){ 2, 3, 6, 7, 10, 11, 14, 15,
1390 18, 19, 22, 23, 26, 27, 30, 31})));
1391 }
1392
1393 static inline vec_short8 vec_pack(vec_int4 a, vec_int4 b)
1394 {
1395 return ((vec_short8)spu_shuffle(a, b, ((vec_uchar16){ 2, 3, 6, 7, 10, 11, 14, 15,
1396 18, 19, 22, 23, 26, 27, 30, 31})));
1397 }
1398
1399
1400 /* vec_packpx (vector pack pixel)
1401 * ==========
1402 */
1403 static inline vec_pixel8 vec_packpx(vec_uint4 a, vec_uint4 b)
1404 {
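  /* Each 32-bit x:8:8:8 pixel is packed to a 16-bit 1:5:5:5 pixel: the shifts
     by 7, 10 and 13 line up the top five bits of each colour channel (and the
     low bit of the x byte) in the upper halfword, which the shuffle extracts.  */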
1405 vec_uint4 x03FF = (vec_uint4)(spu_splats((unsigned short)0x03FF));
1406 vec_uint4 x001F = (vec_uint4)(spu_splats((unsigned short)0x001F));
1407
1408 return ((vec_pixel8)(spu_shuffle(spu_sel(spu_sel(spu_sl(a, 7), spu_sl(a, 10), x03FF),
1409 spu_sl(a, 13), x001F),
1410 spu_sel(spu_sel(spu_sl(b, 7), spu_sl(b, 10), x03FF),
1411 spu_sl(b, 13), x001F),
1412 ((vec_uchar16){ 0, 1, 4, 5, 8, 9, 12, 13,
1413 16, 17, 20, 21, 24, 25, 28, 29}))));
1414 }
1415
1416
1417 /* vec_packs (vector pack saturate)
1418 * =========
1419 */
1420 static inline vec_uchar16 vec_packs(vec_ushort8 a, vec_ushort8 b)
1421 {
1422 vec_ushort8 max = spu_splats((unsigned short)0x00FF);
1423
1424 return ((vec_uchar16)(spu_shuffle(spu_sel(a, max, spu_cmpgt(a, 255)),
1425 spu_sel(b, max, spu_cmpgt(b, 255)),
1426 ((vec_uchar16){ 1, 3, 5, 7, 9, 11, 13, 15,
1427 17, 19, 21, 23, 25, 27, 29, 31}))));
1428 }
1429
1430 static inline vec_char16 vec_packs(vec_short8 a, vec_short8 b)
1431 {
1432 vec_short8 max = spu_splats((signed short)0x007F);
1433 vec_short8 min = spu_splats((signed short)0xFF80);
1434
1435 return ((vec_char16)(spu_shuffle(spu_sel(min, spu_sel(a, max, spu_cmpgt(a, 127)), spu_cmpgt(a, -128)),
1436 spu_sel(min, spu_sel(b, max, spu_cmpgt(b, 127)), spu_cmpgt(b, -128)),
1437 ((vec_uchar16){ 1, 3, 5, 7, 9, 11, 13, 15,
1438 17, 19, 21, 23, 25, 27, 29, 31}))));
1439 }
1440
1441 static inline vec_ushort8 vec_packs(vec_uint4 a, vec_uint4 b)
1442 {
1443 vec_uint4 max = spu_splats((unsigned int)0x0000FFFF);
1444
1445 return ((vec_ushort8)(spu_shuffle(spu_sel(a, max, spu_cmpgt(a, max)),
1446 spu_sel(b, max, spu_cmpgt(b, max)),
1447 ((vec_uchar16){ 2, 3, 6, 7, 10, 11, 14, 15,
1448 18, 19, 22, 23, 26, 27, 30, 31}))));
1449 }
1450
1451 static inline vec_short8 vec_packs(vec_int4 a, vec_int4 b)
1452 {
1453 vec_int4 max = spu_splats((signed int)0x00007FFF);
1454 vec_int4 min = spu_splats((signed int)0xFFFF8000);
1455
1456 return ((vec_short8)(spu_shuffle(spu_sel(min, spu_sel(a, max, spu_cmpgt(a, max)), spu_cmpgt(a, min)),
1457 spu_sel(min, spu_sel(b, max, spu_cmpgt(b, max)), spu_cmpgt(b, min)),
1458 ((vec_uchar16){ 2, 3, 6, 7, 10, 11, 14, 15,
1459 18, 19, 22, 23, 26, 27, 30, 31}))));
1460 }
1461
1462
1463 /* vec_packsu (vector pack saturate unsigned)
1464 * ==========
1465 */
1466 static inline vec_uchar16 vec_packsu(vec_ushort8 a, vec_ushort8 b)
1467 {
1468 return ((vec_uchar16)spu_shuffle(spu_or(a, (vec_ushort8)(spu_cmpgt(a, 255))),
1469 spu_or(b, (vec_ushort8)(spu_cmpgt(b, 255))),
1470 ((vec_uchar16){ 1, 3, 5, 7, 9, 11, 13, 15,
1471 17, 19, 21, 23, 25, 27, 29, 31})));
1472 }
1473
1474 static inline vec_uchar16 vec_packsu(vec_short8 a, vec_short8 b)
1475 {
1476 vec_short8 max = spu_splats((signed short)0x00FF);
1477 vec_short8 min = spu_splats((signed short)0x0000);
1478
1479 return ((vec_uchar16)(spu_shuffle(spu_sel(min, spu_sel(a, max, spu_cmpgt(a, 255)), spu_cmpgt(a, 0)),
1480 spu_sel(min, spu_sel(b, max, spu_cmpgt(b, 255)), spu_cmpgt(b, 0)),
1481 ((vec_uchar16){ 1, 3, 5, 7, 9, 11, 13, 15,
1482 17, 19, 21, 23, 25, 27, 29, 31}))));
1485 }
1486
1487 static inline vec_ushort8 vec_packsu(vec_uint4 a, vec_uint4 b)
1488 {
1489 vec_uint4 max = spu_splats((unsigned int)0xFFFF);
1490
1491 return ((vec_ushort8)spu_shuffle(spu_or(a, (vec_uint4)(spu_cmpgt(a, max))),
1492 spu_or(b, (vec_uint4)(spu_cmpgt(b, max))),
1493 ((vec_uchar16){ 2, 3, 6, 7, 10, 11, 14, 15,
1494 18, 19, 22, 23, 26, 27, 30, 31})));
1495 }
1496
1497 static inline vec_ushort8 vec_packsu(vec_int4 a, vec_int4 b)
1498 {
1499 vec_int4 max = spu_splats((signed int)0x0000FFFF);
1500 vec_int4 min = spu_splats((signed int)0x00000000);
1501
1502 return ((vec_ushort8)(spu_shuffle(spu_sel(min, spu_sel(a, max, spu_cmpgt(a, max)), spu_cmpgt(a, min)),
1503 spu_sel(min, spu_sel(b, max, spu_cmpgt(b, max)), spu_cmpgt(b, min)),
1504 ((vec_uchar16){ 2, 3, 6, 7, 10, 11, 14, 15,
1505 18, 19, 22, 23, 26, 27, 30, 31}))));
1506 }
1507
1508
1509 /* vec_perm (vector permute)
1510 * ========
1511 */
1512 static inline vec_uchar16 vec_perm(vec_uchar16 a, vec_uchar16 b, vec_uchar16 c)
1513 {
1514 return (spu_shuffle(a, b, spu_and(c, 0x1F)));
1515 }
1516
1517 static inline vec_char16 vec_perm(vec_char16 a, vec_char16 b, vec_uchar16 c)
1518 {
1519 return ((vec_char16)(vec_perm((vec_uchar16)(a), (vec_uchar16)(b), c)));
1520 }
1521
1522 static inline vec_ushort8 vec_perm(vec_ushort8 a, vec_ushort8 b, vec_uchar16 c)
1523 {
1524 return ((vec_ushort8)(vec_perm((vec_uchar16)(a), (vec_uchar16)(b), c)));
1525 }
1526
1527 static inline vec_short8 vec_perm(vec_short8 a, vec_short8 b, vec_uchar16 c)
1528 {
1529 return ((vec_short8)(vec_perm((vec_uchar16)(a), (vec_uchar16)(b), c)));
1530 }
1531
1532 static inline vec_uint4 vec_perm(vec_uint4 a, vec_uint4 b, vec_uchar16 c)
1533 {
1534 return ((vec_uint4)(vec_perm((vec_uchar16)(a), (vec_uchar16)(b), c)));
1535 }
1536
1537 static inline vec_int4 vec_perm(vec_int4 a, vec_int4 b, vec_uchar16 c)
1538 {
1539 return ((vec_int4)(vec_perm((vec_uchar16)(a), (vec_uchar16)(b), c)));
1540 }
1541
1542 static inline vec_float4 vec_perm(vec_float4 a, vec_float4 b, vec_uchar16 c)
1543 {
1544 return ((vec_float4)(vec_perm((vec_uchar16)(a), (vec_uchar16)(b), c)));
1545 }
1546
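/* Illustrative sketch (not part of the original header): the classic VMX
 * misaligned-load idiom works through the overloads above.  It relies on SPU
 * quadword loads ignoring the low four address bits, just as VMX lvx does.
 * The function name is hypothetical, used only for this example.
 */
static inline vec_uchar16 __vmx2spu_example_unaligned_load(unsigned char *p)
{
  vec_uchar16 msq = vec_ld(0, p);     /* quadword containing the first byte */
  vec_uchar16 lsq = vec_ld(15, p);    /* quadword containing the last byte  */
  vec_uchar16 sel = vec_lvsl(0, p);   /* permute vector from the alignment  */

  return (vec_perm(msq, lsq, sel));   /* splice out the 16 requested bytes  */
}
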
1547
1548 /* vec_re (vector reciprocal estimate)
1549 * ======
1550 */
1551 #define vec_re(_a) spu_re(_a)
1552
1553
1554 /* vec_rl (vector rotate left)
1555 * ======
1556 */
1557 static inline vec_uchar16 vec_rl(vec_uchar16 a, vec_uchar16 b)
1558 {
1559 vec_ushort8 r1, r2;
1560
1561 r1 = spu_rl(spu_and((vec_ushort8)(a), 0xFF), (vec_short8)spu_and((vec_ushort8)(b), 7));
1562 r2 = spu_rl(spu_and((vec_ushort8)(a), -256), (vec_short8)spu_and(spu_rlmask((vec_ushort8)(b), -8), 7));
1563 return ((vec_uchar16)(spu_sel(spu_or(r2, spu_sl(r2, 8)), spu_or(r1, spu_rlmask(r1, -8)), spu_splats((unsigned short)0xFF))));
1564 }
1565
1566 static inline vec_char16 vec_rl(vec_char16 a, vec_uchar16 b)
1567 {
1568 return ((vec_char16)(vec_rl((vec_uchar16)(a), b)));
1569 }
1570
1571 static inline vec_ushort8 vec_rl(vec_ushort8 a, vec_ushort8 b)
1572 {
1573 return (spu_rl(a, (vec_short8)(b)));
1574 }
1575
1576 static inline vec_short8 vec_rl(vec_short8 a, vec_ushort8 b)
1577 {
1578 return (spu_rl(a, (vec_short8)(b)));
1579 }
1580
1581 static inline vec_uint4 vec_rl(vec_uint4 a, vec_uint4 b)
1582 {
1583 return (spu_rl(a, (vec_int4)(b)));
1584 }
1585
1586 static inline vec_int4 vec_rl(vec_int4 a, vec_uint4 b)
1587 {
1588 return (spu_rl(a, (vec_int4)(b)));
1589 }
1590
1591
1592 /* vec_round (vector round)
1593 * =========
1594 */
1595 static inline vec_float4 vec_round(vec_float4 a)
1596 {
1597 vec_float4 s_half, s_one, d;
1598 vec_uint4 odd;
1599 vec_uint4 msb = spu_splats((unsigned int)0x80000000);
1600 vec_float4 half = spu_splats(0.5f);
1601 vec_int4 exp;
1602 vec_uint4 mask;
1603
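  /* Round to nearest, ties to even: add a sign-matched 0.5, truncate via the
     mantissa mask, then subtract 1.0 where the fraction was exactly one half
     and the truncated result is odd.  */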
1604 s_half = (vec_float4)(spu_sel((vec_uint4)(half), (vec_uint4)(a), msb));
1605 a = spu_add(a, s_half);
1606 s_one = spu_add(s_half, s_half);
1607 exp = spu_sub(127, (vec_int4)(spu_and(spu_rlmask((vec_uint4)(a), -23), 0xFF)));
1608 mask = spu_rlmask(spu_splats((unsigned int)0x7FFFFF), exp);
1609 mask = spu_sel(spu_splats((unsigned int)0), mask, spu_cmpgt(exp, -31));
1610 mask = spu_or(mask, spu_xor((vec_uint4)(spu_rlmaska(spu_add(exp, -1), -31)), -1));
1611
1612 odd = spu_and((vec_uint4)(spu_convts(a, 0)), 1);
1613 s_one = spu_andc(s_one, (vec_float4)spu_cmpeq(mask, 0));
1614 s_one = spu_and(s_one, spu_and((vec_float4)spu_cmpeq(spu_and((vec_uint4)(a), mask), 0),
1615 (vec_float4)spu_cmpeq(odd, 1)));
1616 d = spu_andc(a, (vec_float4)(mask));
1617 d = spu_sub(d, s_one);
1618 return (d);
1619 }
1620
1621 /* vec_rsqrte (vector reciprocal square root estimate)
1622 * ==========
1623 */
1624 #define vec_rsqrte(_a) spu_rsqrte(_a)
1625
1626
1627 /* vec_sel (vector select)
1628 * =======
1629 */
1630 #define vec_sel(_a, _b, _c) spu_sel(_a, _b, _c)
1631
1632
1633 /* vec_sl (vector shift left)
1634 * ======
1635 */
1636 static inline vec_uchar16 vec_sl(vec_uchar16 a, vec_uchar16 b)
1637 {
1638 vec_ushort8 hi, lo;
1639
1640 lo = spu_and(spu_sl((vec_ushort8)(a), spu_and((vec_ushort8)(b), 7)), 0xFF);
1641 hi = spu_sl(spu_and((vec_ushort8)(a), -256), spu_and(spu_rlmask((vec_ushort8)(b), -8), 7));
1642
1643 return ((vec_uchar16)(spu_or(hi, lo)));
1644 }
1645
1646 static inline vec_char16 vec_sl(vec_char16 a, vec_uchar16 b)
1647 {
1648 return ((vec_char16)(vec_sl((vec_uchar16)(a), b)));
1649 }
1650
1651 static inline vec_ushort8 vec_sl(vec_ushort8 a, vec_ushort8 b)
1652 {
1653 return (spu_sl(a, spu_and(b, 15)));
1654 }
1655
1656 static inline vec_short8 vec_sl(vec_short8 a, vec_ushort8 b)
1657 {
1658 return (spu_sl(a, spu_and((vec_ushort8)(b), 15)));
1659 }
1660
1661 static inline vec_uint4 vec_sl(vec_uint4 a, vec_uint4 b)
1662 {
1663 return (spu_sl(a, spu_and(b, 31)));
1664 }
1665
1666 static inline vec_int4 vec_sl(vec_int4 a, vec_uint4 b)
1667 {
1668 return (spu_sl(a, spu_and(b, 31)));
1669 }
1670
1671
1672 /* vec_sld (vector shift left double)
1673 * =======
1674 */
1675 #define vec_sld(_a, _b, _c) spu_shuffle(_a, _b, ((vec_uchar16){ 0+(_c), 1+(_c), 2+(_c), 3+(_c), \
1676 4+(_c), 5+(_c), 6+(_c), 7+(_c), \
1677 8+(_c), 9+(_c), 10+(_c), 11+(_c), \
1678 12+(_c), 13+(_c), 14+(_c), 15+(_c)}))
1679
1680
1681 /* vec_sll (vector shift left long)
1682 * =======
1683 */
1684 #define vec_sll(_a, _b) spu_slqw(_a, spu_extract((vec_uint4)(_b), 0))
1685
1686
1687 /* vec_slo (vector shift left by octet)
1688 * =======
1689 */
1690 #define vec_slo(_a, _b) spu_slqwbytebc(_a, spu_extract((vec_uint4)(_b), 3) & 0x7F)
1691
1692
1693 /* vec_splat (vector splat)
1694 * =========
1695 */
1696 #define vec_splat(_a, _b) spu_splats(spu_extract(_a, _b))
1697
1698
1699 /* vec_splat_s8 (vector splat signed byte)
1700 * ============
1701 */
1702 #define vec_splat_s8(_a) spu_splats((signed char)(_a))
1703
1704
1705 /* vec_splat_s16 (vector splat signed half-word)
1706 * =============
1707 */
1708 #define vec_splat_s16(_a) spu_splats((signed short)(_a))
1709
1710
1711 /* vec_splat_s32 (vector splat signed word)
1712 * =============
1713 */
1714 #define vec_splat_s32(_a) spu_splats((signed int)(_a))
1715
1716
1717 /* vec_splat_u8 (vector splat unsigned byte)
1718 * ============
1719 */
1720 #define vec_splat_u8(_a) spu_splats((unsigned char)(_a))
1721
1722
1723 /* vec_splat_u16 (vector splat unsigned half-word)
1724 * =============
1725 */
1726 #define vec_splat_u16(_a) spu_splats((unsigned short)(_a))
1727
1728
1729 /* vec_splat_u32 (vector splat unsigned word)
1730 * =============
1731 */
1732 #define vec_splat_u32(_a) spu_splats((unsigned int)(_a))
1733
1734
1735 /* vec_sr (vector shift right)
1736 * ======
1737 */
1738 static inline vec_uchar16 vec_sr(vec_uchar16 a, vec_uchar16 b)
1739 {
1740 vec_ushort8 hi, lo;
1741
1742 lo = spu_rlmask(spu_and((vec_ushort8)(a), 0xFF), spu_sub(0, (vec_short8)(spu_and((vec_ushort8)(b), 7))));
1743 hi = spu_and(spu_rlmask((vec_ushort8)(a), spu_sub(0, (vec_short8)(spu_and(spu_rlmask((vec_ushort8)(b), -8), 7)))), -256);
1744
1745 return ((vec_uchar16)(spu_or(hi, lo)));
1746 }
1747
1748 static inline vec_char16 vec_sr(vec_char16 a, vec_uchar16 b)
1749 {
1750 return ((vec_char16)(vec_sr((vec_uchar16)(a), b)));
1751 }
1752
1753 static inline vec_ushort8 vec_sr(vec_ushort8 a, vec_ushort8 b)
1754 {
1755 return (spu_rlmask(a, spu_sub(0, (vec_short8)(spu_and(b, 15)))));
1756 }
1757
1758 static inline vec_short8 vec_sr(vec_short8 a, vec_ushort8 b)
1759 {
1760 return ((vec_short8)(vec_sr((vec_ushort8)(a), b)));
1761 }
1762
1763 static inline vec_uint4 vec_sr(vec_uint4 a, vec_uint4 b)
1764 {
1765 return (spu_rlmask(a, spu_sub(0, (vec_int4)(spu_and(b, 31)))));
1766 }
1767
1768 static inline vec_int4 vec_sr(vec_int4 a, vec_uint4 b)
1769 {
1770 return ((vec_int4)(vec_sr((vec_uint4)(a), b)));
1771 }
1772
1773
1774 /* vec_sra (vector shift right algebraic)
1775 * =======
1776 */
1777 static inline vec_char16 vec_sra(vec_char16 a, vec_uchar16 b)
1778 {
1779 vec_short8 hi, lo;
1780
1781 lo = spu_and(spu_rlmaska(spu_extend(a), spu_sub(0, (vec_short8)(spu_and((vec_ushort8)(b), 7)))), 0xFF);
1782 hi = spu_and(spu_rlmaska((vec_short8)(a), spu_sub(0, (vec_short8)(spu_and(spu_rlmask((vec_ushort8)(b), -8), 7)))), -256);
1783
1784 return ((vec_char16)(spu_or(hi, lo)));
1785 }
1786
1787 static inline vec_uchar16 vec_sra(vec_uchar16 a, vec_uchar16 b)
1788 {
1789 return ((vec_uchar16)(vec_sra((vec_char16)(a), b)));
1790 }
1791
1792 static inline vec_short8 vec_sra(vec_short8 a, vec_ushort8 b)
1793 {
1794 return (spu_rlmaska(a, spu_sub(0, (vec_short8)(spu_and(b, 15)))));
1795 }
1796
1797 static inline vec_ushort8 vec_sra(vec_ushort8 a, vec_ushort8 b)
1798 {
1799 return ((vec_ushort8)(vec_sra((vec_short8)(a), b)));
1800 }
1801
1802 static inline vec_int4 vec_sra(vec_int4 a, vec_uint4 b)
1803 {
1804 return (spu_rlmaska(a, spu_sub(0, (vec_int4)(spu_and(b, 31)))));
1805 }
1806
1807 static inline vec_uint4 vec_sra(vec_uint4 a, vec_uint4 b)
1808 {
1809 return ((vec_uint4)(vec_sra((vec_int4)(a), b)));
1810 }
1811
1812
1813 /* vec_srl (vector shift right long)
1814 * =======
1815 */
1816 #define vec_srl(_a, _b) spu_rlmaskqw(_a, 0-spu_extract((vec_int4)(_b), 3))
1817
1818
1819 /* vec_sro (vector shift right by octet)
1820 * =======
1821 */
1822 #define vec_sro(_a, _b) spu_rlmaskqwbyte(_a, 0 - ((spu_extract((vec_int4)(_b), 3) >> 3) & 0xF))
1823
1824 /* vec_st (vector store indexed)
1825 * ======
1826 */
1827 static inline void vec_st(vec_uchar16 a, int b, unsigned char *c)
1828 {
1829 *((vec_uchar16 *)(c+b)) = a;
1830 }
1831
1832 static inline void vec_st(vec_uchar16 a, int b, vec_uchar16 *c)
1833 {
1834 *((vec_uchar16 *)((unsigned char *)(c)+b)) = a;
1835 }
1836
1837 static inline void vec_st(vec_char16 a, int b, signed char *c)
1838 {
1839 *((vec_char16 *)(c+b)) = a;
1840 }
1841
1842 static inline void vec_st(vec_char16 a, int b, vec_char16 *c)
1843 {
1844 *((vec_char16 *)((signed char *)(c)+b)) = a;
1845 }
1846
1847 static inline void vec_st(vec_bchar16 a, int b, signed char *c)
1848 {
1849 *((vec_bchar16 *)((signed char *)(c)+b)) = a;
1850 }
1851
1852 static inline void vec_st(vec_ushort8 a, int b, unsigned short *c)
1853 {
1854 *((vec_ushort8 *)((unsigned char *)(c)+b)) = a;
1855 }
1856
1857 static inline void vec_st(vec_ushort8 a, int b, vec_ushort8 *c)
1858 {
1859 *((vec_ushort8 *)((unsigned char *)(c)+b)) = a;
1860 }
1861
1862 static inline void vec_st(vec_short8 a, int b, signed short *c)
1863 {
1864 *((vec_short8 *)((unsigned char *)(c)+b)) = a;
1865 }
1866
1867 static inline void vec_st(vec_short8 a, int b, vec_short8 *c)
1868 {
1869 *((vec_short8 *)((signed char *)(c)+b)) = a;
1870 }
1871
1872 static inline void vec_st(vec_bshort8 a, int b, signed short *c)
1873 {
1874 *((vec_bshort8 *)((signed char *)(c)+b)) = a;
1875 }
1876
1877 static inline void vec_st(vec_uint4 a, int b, unsigned int *c)
1878 {
1879 *((vec_uint4 *)((unsigned char *)(c)+b)) = a;
1880 }
1881
1882 static inline void vec_st(vec_uint4 a, int b, vec_uint4 *c)
1883 {
1884 *((vec_uint4 *)((unsigned char *)(c)+b)) = a;
1885 }
1886
1887 static inline void vec_st(vec_int4 a, int b, signed int *c)
1888 {
1889 *((vec_int4 *)((unsigned char *)(c)+b)) = a;
1890 }
1891
1892 static inline void vec_st(vec_int4 a, int b, vec_int4 *c)
1893 {
1894 *((vec_int4 *)((signed char *)(c)+b)) = a;
1895 }
1896
1897 static inline void vec_st(vec_bint4 a, int b, signed int *c)
1898 {
1899 *((vec_bint4 *)((signed char *)(c)+b)) = a;
1900 }
1901
1902 static inline void vec_st(vec_float4 a, int b, float *c)
1903 {
1904 *((vec_float4 *)((unsigned char *)(c)+b)) = a;
1905 }
1906
1907 static inline void vec_st(vec_float4 a, int b, vec_float4 *c)
1908 {
1909 *((vec_float4 *)((unsigned char *)(c)+b)) = a;
1910 }
1911
1912
1913 /* vec_ste (vector store element indexed)
1914 * =======
1915 */
1916 static inline void vec_ste(vec_uchar16 a, int b, unsigned char *c)
1917 {
1918 unsigned char *ptr;
1919
1920 ptr = c + b;
1921 *ptr = spu_extract(a, (int)(ptr) & 15);
1922 }
1923
1924 static inline void vec_ste(vec_char16 a, int b, signed char *c)
1925 {
1926 vec_ste((vec_uchar16)(a), b, (unsigned char *)(c));
1927 }
1928
1929 static inline void vec_ste(vec_bchar16 a, int b, signed char *c)
1930 {
1931 vec_ste((vec_uchar16)(a), b, (unsigned char *)(c));
1932 }
1933
1934 static inline void vec_ste(vec_ushort8 a, int b, unsigned short *c)
1935 {
1936 unsigned short *ptr;
1937
1938 ptr = (unsigned short *)(((unsigned int)(c) + b) & ~1);
1939 *ptr = spu_extract(a, ((int)(ptr) >> 1) & 7);
1940 }
1941
1942 static inline void vec_ste(vec_short8 a, int b, signed short *c)
1943 {
1944 vec_ste((vec_ushort8)(a), b, (unsigned short *)(c));
1945 }
1946
1947 static inline void vec_ste(vec_bshort8 a, int b, signed short *c)
1948 {
1949 vec_ste((vec_ushort8)(a), b, (unsigned short *)(c));
1950 }
1951
1952 static inline void vec_ste(vec_uint4 a, int b, unsigned int *c)
1953 {
1954 unsigned int *ptr;
1955
1956 ptr = (unsigned int *)(((unsigned int)(c) + b) & ~3);
1957 *ptr = spu_extract(a, ((int)(ptr) >> 2) & 3);
1958 }
1959
1960 static inline void vec_ste(vec_int4 a, int b, signed int *c)
1961 {
1962 vec_ste((vec_uint4)(a), b, (unsigned int *)(c));
1963 }
1964
1965 static inline void vec_ste(vec_bint4 a, int b, signed int *c)
1966 {
1967 vec_ste((vec_uint4)(a), b, (unsigned int *)(c));
1968 }
1969
1970 static inline void vec_ste(vec_float4 a, int b, float *c)
1971 {
1972 vec_ste((vec_uint4)(a), b, (unsigned int *)(c));
1973 }
1974
1975
1976 /* vec_stl (vector store indexed LRU)
1977 * =======
1978 */
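/* The SPU local store is not cached, so the LRU-hint form is simply the
 * plain store.
 */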
1979 #define vec_stl(_a, _b, _c) vec_st(_a, _b, _c)
1980
1981
1982 /* vec_sub (vector subtract)
1983 * =======
1984 */
1985 static inline vec_uchar16 vec_sub(vec_uchar16 a, vec_uchar16 b)
1986 {
1987 return ((vec_uchar16)(spu_sel(spu_sub((vec_ushort8)(a), (vec_ushort8)(b)),
1988 spu_sub(spu_and((vec_ushort8)(a), -256), spu_and((vec_ushort8)(b), -256)),
1989 spu_splats((unsigned short)0xFF00))));
1990 }
1991
1992 static inline vec_char16 vec_sub(vec_char16 a, vec_char16 b)
1993 {
1994 return ((vec_char16)(vec_sub((vec_uchar16)(a), (vec_uchar16)(b))));
1995 }
1996
1997 static inline vec_char16 vec_sub(vec_bchar16 a, vec_char16 b)
1998 {
1999 return ((vec_char16)(vec_sub((vec_uchar16)(a), (vec_uchar16)(b))));
2000 }
2001
2002 static inline vec_char16 vec_sub(vec_char16 a, vec_bchar16 b)
2003 {
2004 return ((vec_char16)(vec_sub((vec_uchar16)(a), (vec_uchar16)(b))));
2005 }
2006
2007 static inline vec_ushort8 vec_sub(vec_ushort8 a, vec_ushort8 b)
2008 {
2009 return (spu_sub(a, b));
2010 }
2011
2012 static inline vec_short8 vec_sub(vec_short8 a, vec_short8 b)
2013 {
2014 return (spu_sub(a, b));
2015 }
2016
2017 static inline vec_short8 vec_sub(vec_bshort8 a, vec_short8 b)
2018 {
2019 return (spu_sub((vec_short8)(a), b));
2020 }
2021
2022 static inline vec_short8 vec_sub(vec_short8 a, vec_bshort8 b)
2023 {
2024 return (spu_sub(a, (vec_short8)(b)));
2025 }
2026
2027 static inline vec_uint4 vec_sub(vec_uint4 a, vec_uint4 b)
2028 {
2029 return (spu_sub(a, b));
2030 }
2031
2032 static inline vec_int4 vec_sub(vec_int4 a, vec_int4 b)
2033 {
2034 return (spu_sub(a, b));
2035 }
2036
2037 static inline vec_int4 vec_sub(vec_bint4 a, vec_int4 b)
2038 {
2039 return (spu_sub((vec_int4)(a), b));
2040 }
2041
2042 static inline vec_int4 vec_sub(vec_int4 a, vec_bint4 b)
2043 {
2044 return (spu_sub(a, (vec_int4)(b)));
2045 }
2046
2047 static inline vec_float4 vec_sub(vec_float4 a, vec_float4 b)
2048 {
2049 return (spu_sub(a, b));
2050 }
2051
2052
2053 /* vec_subc (vector subtract carryout)
2054 * ========
2055 */
2056 #define vec_subc(_a, _b) spu_genb(_a, _b)
2057
2058
2059 /* vec_subs (vector subtract saturate)
2060 * ========
2061 */
2062 static inline vec_uchar16 vec_subs(vec_uchar16 a, vec_uchar16 b)
2063 {
2064 vec_ushort8 s1, s2;
2065 vec_uchar16 s, d;
2066
2067 s1 = spu_sub(spu_rlmask((vec_ushort8)(a), -8), spu_rlmask((vec_ushort8)(b), -8));
2068 s2 = spu_sub(spu_and((vec_ushort8)(a), 0xFF), spu_and((vec_ushort8)(b), 0xFF));
2069 s = (vec_uchar16)(spu_shuffle(s1, s2, ((vec_uchar16){0, 16, 2, 18, 4, 20, 6, 22,
2070 8, 24, 10, 26, 12, 28, 14, 30})));
2071 d = (vec_uchar16)(spu_shuffle(s1, s2, ((vec_uchar16){1, 17, 3, 19, 5, 21, 7, 23,
2072 9, 25, 11, 27, 13, 29, 15, 31})));
2073 return (spu_andc(d, s));
2074 }
2075
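/* For the signed byte form, the byte differences are recombined from the
 * halfword subtractions and then saturated: a non-negative minuend, negative
 * subtrahend and negative result indicate positive overflow (clamp to 0x7F),
 * while a negative minuend, non-negative subtrahend and non-negative result
 * indicate negative overflow (clamp to 0x80).
 */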
2076 static inline vec_char16 vec_subs(vec_char16 a, vec_char16 b)
2077 {
2078 vec_ushort8 s1, s2;
2079 vec_uchar16 s, d;
2080
2081 s1 = spu_sub(spu_rlmask((vec_ushort8)(a), -8), spu_rlmask((vec_ushort8)(b), -8));
2082 s2 = spu_sub(spu_and((vec_ushort8)(a), 0xFF), spu_and((vec_ushort8)(b), 0xFF));
2083 s = (vec_uchar16)(spu_shuffle(s1, s2, ((vec_uchar16){1, 17, 3, 19, 5, 21, 7, 23,
2084 9, 25, 11, 27, 13, 29, 15, 31})));
2085 d = spu_sel(s, spu_splats((unsigned char)0x7F), spu_cmpgt(spu_nor((vec_uchar16)(a), spu_nand(s, (vec_uchar16)(b))), 0x7F));
2086 d = spu_sel(d, spu_splats((unsigned char)0x80), spu_cmpgt(spu_and((vec_uchar16)(a), spu_nor(s, (vec_uchar16)(b))), 0x7F));
2087
2088 return ((vec_char16)(d));
2089 }
2090
2091 static inline vec_char16 vec_subs(vec_bchar16 a, vec_char16 b)
2092 {
2093 return (vec_subs((vec_char16)(a), b));
2094 }
2095
2096 static inline vec_char16 vec_subs(vec_char16 a, vec_bchar16 b)
2097 {
2098 return (vec_subs(a, (vec_char16)(b)));
2099 }
2100
2101 static inline vec_ushort8 vec_subs(vec_ushort8 a, vec_ushort8 b)
2102 {
2103 return (spu_andc(spu_sub(a, b), spu_cmpgt(b, a)));
2104 }
2105
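/* The signed halfword and word forms below detect overflow the same way and
 * replicate the detection bit across the element with spu_rlmaska to build
 * the select mask, clamping to the most positive or most negative value.
 */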
2106 static inline vec_short8 vec_subs(vec_short8 a, vec_short8 b)
2107 {
2108 vec_short8 s;
2109 vec_short8 d;
2110
2111 s = spu_sub(a, b);
2112 d = spu_sel(s, spu_splats((signed short)0x7FFF), (vec_ushort8)(spu_rlmaska(spu_nor(a, spu_nand(s, b)), -15)));
2113 d = spu_sel(d, spu_splats((signed short)0x8000), (vec_ushort8)(spu_rlmaska(spu_and(a, spu_nor(s, b)), -15)));
2114
2115 return (d);
2116 }
2117
2118 static inline vec_short8 vec_subs(vec_bshort8 a, vec_short8 b)
2119 {
2120 return ((vec_short8)(vec_subs((vec_short8)(a), b)));
2121 }
2122
2123 static inline vec_short8 vec_subs(vec_short8 a, vec_bshort8 b)
2124 {
2125 return ((vec_short8)(vec_subs(a, (vec_short8)(b))));
2126 }
2127
2128 static inline vec_uint4 vec_subs(vec_uint4 a, vec_uint4 b)
2129 {
2130 return (spu_andc(spu_sub(a, b), spu_cmpgt(b, a)));
2131 }
2132
2133 static inline vec_int4 vec_subs(vec_int4 a, vec_int4 b)
2134 {
2135 vec_int4 s;
2136 vec_int4 d;
2137
2138 s = spu_sub(a, b);
2139 d = spu_sel(s, spu_splats((signed int)0x7FFFFFFF), (vec_uint4)(spu_rlmaska(spu_nor(a, spu_nand(s, b)), -31)));
2140 d = spu_sel(d, spu_splats((signed int)0x80000000), (vec_uint4)(spu_rlmaska(spu_and(a, spu_nor(s, b)), -31)));
2141
2142 return (d);
2143 }
2144
2145 static inline vec_int4 vec_subs(vec_bint4 a, vec_int4 b)
2146 {
2147 return ((vec_int4)(vec_subs((vec_int4)(a), b)));
2148 }
2149
2150 static inline vec_int4 vec_subs(vec_int4 a, vec_bint4 b)
2151 {
2152 return ((vec_int4)(vec_subs(a, (vec_int4)(b))));
2153 }
2154
2155
2156 /* vec_sum4s (vector sum across partial (1/4) saturated)
2157 * =========
2158 */
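/* Each word of the result is the sum of the four bytes (or two halfwords)
 * of a that occupy that word, added to the corresponding element of b with
 * saturation via vec_adds.
 */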
2159 static inline vec_uint4 vec_sum4s(vec_uchar16 a, vec_uint4 b)
2160 {
2161 vec_uint4 a01_23, a0123;
2162
2163 a01_23 = (vec_uint4)(spu_add(spu_rlmask((vec_ushort8)(a), -8),
2164 spu_and((vec_ushort8)(a), 0xFF)));
2165 a0123 = spu_add(spu_rlmask(a01_23, -16), spu_and(a01_23, 0x1FF));
2166 return (vec_adds(a0123, b));
2167 }
2168
2169 static inline vec_int4 vec_sum4s(vec_char16 a, vec_int4 b)
2170 {
2171 vec_int4 a01_23, a0123;
2172
2173 a01_23 = (vec_int4)(spu_add(spu_rlmaska((vec_short8)(a), -8),
2174 spu_extend(a)));
2175 a0123 = spu_add(spu_rlmaska(a01_23, -16), spu_extend((vec_short8)(a01_23)));
2176 return (vec_adds(a0123, b));
2177 }
2178
2179 static inline vec_int4 vec_sum4s(vec_short8 a, vec_int4 b)
2180 {
2181 vec_int4 a0123;
2182
2183 a0123 = spu_add(spu_rlmaska((vec_int4)(a), -16), spu_extend(a));
2184 return (vec_adds(a0123, b));
2185 }
2186
2187
2188 /* vec_sum2s (vector sum across partial (1/2) saturated)
2189 * =========
2190 */
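/* Elements 1 and 3 of the result hold a[0]+a[1]+b[1] and a[2]+a[3]+b[3],
 * saturated; elements 0 and 2 are zero.  The sums are formed exactly by
 * carrying into a sign-extension word (spu_genc/spu_addx) so that 32-bit
 * overflow can be detected and clamped.
 */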
2191 static inline vec_int4 vec_sum2s(vec_int4 a, vec_int4 b)
2192 {
2193 vec_int4 c, d;
2194 vec_int4 sign1, sign2, sign3;
2195 vec_int4 carry, sum_l, sum_h, sat, sat_val;
2196
2197 sign1 = spu_rlmaska(a, -31);
2198 sign2 = spu_rlmaska(b, -31);
2199
2200 c = spu_rlqwbyte(a, -4);
2201 sign3 = spu_rlqwbyte(sign1, -4);
2202
2203 carry = spu_genc(a, b);
2204 sum_l = spu_add(a, b);
2205 sum_h = spu_addx(sign1, sign2, carry);
2206
2207 carry = spu_genc(sum_l, c);
2208 sum_l = spu_add(sum_l, c);
2209 sum_h = spu_addx(sum_h, sign3, carry);
2210
2211 sign1 = spu_rlmaska(sum_l, -31);
2212 sign2 = spu_rlmaska(sum_h, -31);
2213
2214 sat_val = spu_xor(sign2, spu_splats((signed int)0x7FFFFFFF));
2215
2216 sat = spu_orc(spu_xor(sign1, sign2), (vec_int4)spu_cmpeq(sum_h, sign2));
2217
2218 d = spu_and(spu_sel(sum_l, sat_val, (vec_uint4)(sat)), (vec_int4){0, -1, 0, -1});
2219
2220 return (d);
2221 }
2222
2223
2224 /* vec_sums (vector sum saturated)
2225 * ========
2226 */
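/* Element 3 of the result holds a[0]+a[1]+a[2]+a[3]+b[3], saturated; the
 * other elements are zero.  As in vec_sum2s, the running sum is kept in a
 * low word plus a carry/sign high word so overflow can be detected exactly.
 */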
2227 static inline vec_int4 vec_sums(vec_int4 a, vec_int4 b)
2228 {
2229 vec_int4 a0, a1, a2, c0, c1, c2, d;
2230 vec_int4 sign_a, sign_b, sign_l, sign_h;
2231 vec_int4 sum_l, sum_h, sat, sat_val;
2232
2233 sign_a = spu_rlmaska(a, -31);
2234 sign_b = spu_rlmaska(b, -31);
2235
2236 a0 = spu_rlqwbyte(a, -12);
2237 a1 = spu_rlqwbyte(a, -8);
2238 a2 = spu_rlqwbyte(a, -4);
2239
2240 sum_l = spu_add(a, b);
2241 sum_h = spu_addx(sign_a, sign_b, spu_genc(a, b));
2242
2243 c2 = spu_genc(sum_l, a2);
2244 sum_l = spu_add(sum_l, a2);
2245 sum_h = spu_addx(sum_h, spu_rlqwbyte(sign_a, -4), c2);
2246
2247 c1 = spu_genc(sum_l, a1);
2248 sum_l = spu_add(sum_l, a1);
2249 sum_h = spu_addx(sum_h, spu_rlqwbyte(sign_a, -8), c1);
2250
2251 c0 = spu_genc(sum_l, a0);
2252 sum_l = spu_add(sum_l, a0);
2253 sum_h = spu_addx(sum_h, spu_rlqwbyte(sign_a, -12), c0);
2254
2255 sign_l = spu_rlmaska(sum_l, -31);
2256 sign_h = spu_rlmaska(sum_h, -31);
2257
2258 sat_val = spu_xor(sign_h, spu_splats((signed int)0x7FFFFFFF));
2259
2260 sat = spu_orc(spu_xor(sign_l, sign_h), (vec_int4)spu_cmpeq(sum_h, sign_h));
2261
2262 d = spu_and(spu_sel(sum_l, sat_val, (vec_uint4)(sat)), ((vec_int4){0, 0, 0, -1}));
2263
2264 return (d);
2265 }
2266
2267
2268 /* vec_trunc (vector truncate)
2269 * =========
2270 */
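/* The mask covers the fraction bits that lie below the binary point for each
 * element's exponent; clearing them truncates toward zero, and elements with
 * magnitude less than one become zero.  For example:
 *
 *   vec_float4 x = (vec_float4){ 2.7f, -2.7f, 0.3f, 8.0f };
 *   vec_float4 t = vec_trunc(x);   // { 2.0f, -2.0f, 0.0f, 8.0f }
 */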
2271 static inline vec_float4 vec_trunc(vec_float4 a)
2272 {
2273 vec_int4 exp;
2274 vec_uint4 mask;
2275
2276 exp = spu_sub(127, (vec_int4)(spu_and(spu_rlmask((vec_uint4)(a), -23), 0xFF)));
2277 mask = spu_rlmask(spu_splats((unsigned int)0x7FFFFF), exp);
2278 mask = spu_sel(spu_splats((unsigned int)0), mask, spu_cmpgt(exp, -31));
2279 mask = spu_or(mask, spu_xor((vec_uint4)(spu_rlmaska(spu_add(exp, -1), -31)), -1));
2280 return (spu_andc(a, (vec_float4)(mask)));
2281 }
2282
2283 /* vec_unpackh (vector unpack high element)
2284 * ===========
2285 */
2286 static inline vec_short8 vec_unpackh(vec_char16 a)
2287 {
2288 return (spu_extend(spu_shuffle(a, a, ((vec_uchar16){0, 0, 1, 1, 2, 2, 3, 3,
2289 4, 4, 5, 5, 6, 6, 7, 7}))));
2290 }
2291
2292 static inline vec_bshort8 vec_unpackh(vec_bchar16 a)
2293 {
2294 return ((vec_bshort8)(vec_unpackh((vec_char16)(a))));
2295 }
2296
2297 static inline vec_int4 vec_unpackh(vec_short8 a)
2298 {
2299 return (spu_extend(spu_shuffle(a, a, ((vec_uchar16){0, 0, 0, 1, 0, 0, 2, 3,
2300 0, 0, 4, 5, 0, 0, 6, 7}))));
2301 }
2302
2303 #ifdef SUPPORT_UNPACK_PIXEL
2304 /* Due to type conflicts, unpacking of pixel types and boolean shorts
2305  * cannot simultaneously be supported. By default, the boolean short is
2306 * supported.
2307 */
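/* Each 1/5/5/5 pixel halfword expands to a word: the 1-bit channel is
 * sign-extended into the first byte and each 5-bit channel is zero-extended
 * into one of the remaining bytes.
 */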
2308 static inline vec_uint4 vec_unpackh(vec_pixel8 a)
2309 {
2310 vec_ushort8 p1, p2;
2311
2312 p1 = spu_shuffle((vec_ushort8)(spu_rlmaska((vec_short8)(a.p), -7)),
2313 spu_and((vec_ushort8)(a.p), 0x1F),
2314 ((vec_uchar16){ 0, 128, 128, 17, 2, 128, 128, 19,
2315 4, 128, 128, 21, 6, 128, 128, 23}));
2316 p2 = spu_shuffle(spu_and(spu_rlmask((vec_ushort8)(a.p), -5), 0x1F),
2317 spu_and(spu_rlmask((vec_ushort8)(a.p), -10), 0x1F),
2318 ((vec_uchar16){ 128, 17, 1, 128, 128, 19, 3, 128,
2319 128, 21, 5, 128, 128, 23, 7, 128}));
2320 return ((vec_uint4)(spu_or(p1, p2)));
2321 }
2322
2323 #else
2324
2325 static inline vec_bint4 vec_unpackh(vec_bshort8 a)
2326 {
2327 return ((vec_bint4)(vec_unpackh((vec_short8)(a))));
2328 }
2329 #endif
2330
2331
2332
2333
2334
2335 /* vec_unpackl (vector unpack low element)
2336 * ===========
2337 */
2338 static inline vec_short8 vec_unpackl(vec_char16 a)
2339 {
2340 return (spu_extend(spu_shuffle(a, a, ((vec_uchar16){8, 8, 9, 9, 10, 10, 11, 11,
2341 12, 12, 13, 13, 14, 14, 15, 15}))));
2342 }
2343
2344 static inline vec_bshort8 vec_unpackl(vec_bchar16 a)
2345 {
2346 return ((vec_bshort8)(vec_unpackl((vec_char16)(a))));
2347 }
2348
2349
2350 static inline vec_int4 vec_unpackl(vec_short8 a)
2351 {
2352 return (spu_extend(spu_shuffle(a, a, ((vec_uchar16){0, 0, 8, 9, 0, 0, 10, 11,
2353 0, 0,12,13, 0, 0, 14, 15}))));
2354 }
2355
2356
2357 #ifdef SUPPORT_UNPACK_PIXEL
2358 /* Due to type conflicts, unpacking of pixel types and boolean shorts
2359  * cannot simultaneously be supported. By default, the boolean short is
2360 * supported.
2361 */
2362 static inline vec_uint4 vec_unpackl(vec_pixel8 a)
2363 {
2364 vec_ushort8 p1, p2;
2365
2366   p1 = spu_shuffle((vec_ushort8)(spu_rlmaska((vec_short8)(a.p), -7)),
2367                    spu_and((vec_ushort8)(a.p), 0x1F),
2368                    ((vec_uchar16){ 8, 128, 128, 25, 10, 128, 128, 27,
2369                                   12, 128, 128, 29, 14, 128, 128, 31}));
2370   p2 = spu_shuffle(spu_and(spu_rlmask((vec_ushort8)(a.p), -5), 0x1F),
2371                    spu_and(spu_rlmask((vec_ushort8)(a.p), -10), 0x1F),
2372 ((vec_uchar16){ 128, 25, 9, 128, 128, 27, 11, 128,
2373 128, 29, 13, 128, 128, 31, 15, 128}));
2374 return ((vec_uint4)(spu_or(p1, p2)));
2375 }
2376
2377 #else
2378
2379 static inline vec_bint4 vec_unpackl(vec_bshort8 a)
2380 {
2381 return ((vec_bint4)(vec_unpackl((vec_short8)(a))));
2382
2383 }
2384 #endif
2385
2386
2387
2388 /* vec_xor (vector logical xor)
2389 * ======
2390 */
2391 static inline vec_uchar16 vec_xor(vec_uchar16 a, vec_uchar16 b)
2392 {
2393 return (spu_xor(a, b));
2394 }
2395
2396 static inline vec_char16 vec_xor(vec_char16 a, vec_char16 b)
2397 {
2398 return (spu_xor(a, b));
2399 }
2400
2401 static inline vec_char16 vec_xor(vec_bchar16 a, vec_char16 b)
2402 {
2403 return (spu_xor((vec_char16)(a), b));
2404 }
2405
2406 static inline vec_char16 vec_xor(vec_char16 a, vec_bchar16 b)
2407 {
2408 return (spu_xor(a, (vec_char16)(b)));
2409 }
2410
2411 static inline vec_ushort8 vec_xor(vec_ushort8 a, vec_ushort8 b)
2412 {
2413 return (spu_xor(a, b));
2414 }
2415
2416 static inline vec_short8 vec_xor(vec_short8 a, vec_short8 b)
2417 {
2418 return (spu_xor(a, b));
2419 }
2420
2421 static inline vec_short8 vec_xor(vec_bshort8 a, vec_short8 b)
2422 {
2423 return (spu_xor((vec_short8)(a), b));
2424 }
2425
2426 static inline vec_short8 vec_xor(vec_short8 a, vec_bshort8 b)
2427 {
2428 return (spu_xor(a, (vec_short8)(b)));
2429 }
2430
2431 static inline vec_uint4 vec_xor(vec_uint4 a, vec_uint4 b)
2432 {
2433 return (spu_xor(a, b));
2434 }
2435
2436 static inline vec_int4 vec_xor(vec_int4 a, vec_int4 b)
2437 {
2438 return (spu_xor(a, b));
2439 }
2440
2441 static inline vec_int4 vec_xor(vec_bint4 a, vec_int4 b)
2442 {
2443 return (spu_xor((vec_int4)(a), b));
2444 }
2445
2446 static inline vec_int4 vec_xor(vec_int4 a, vec_bint4 b)
2447 {
2448 return (spu_xor(a, (vec_int4)(b)));
2449 }
2450
2451 static inline vec_float4 vec_xor(vec_float4 a, vec_float4 b)
2452 {
2453 return (spu_xor(a, b));
2454 }
2455
2456 static inline vec_float4 vec_xor(vec_bint4 a, vec_float4 b)
2457 {
2458 return (spu_xor((vec_float4)(a),b));
2459 }
2460
2461 static inline vec_float4 vec_xor(vec_float4 a, vec_bint4 b)
2462 {
2463 return (spu_xor(a, (vec_float4)(b)));
2464 }
2465
2466 /************************************************************************
2467 * PREDICATES
2468 ************************************************************************/
2469
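/* These predicates reduce an element-wise comparison to a single int:
 * spu_gather packs one bit per element of the comparison mask into the
 * preferred word, and the packed value is tested against the all-elements
 * pattern (0xFFFF, 0xFF or 0xF for 16-, 8- and 4-element vectors), against
 * zero, or, for some four-element forms, reduced with spu_orx instead.
 * For example:
 *
 *   vec_int4 x = spu_splats(1);
 *   vec_int4 y = spu_splats(2);
 *   int all_less = vec_all_lt(x, y);   // 1: every element of x is < y
 */
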
2470 /* vec_all_eq (all elements equal)
2471 * ==========
2472 */
2473 static inline int vec_all_eq(vec_uchar16 a, vec_uchar16 b)
2474 {
2475 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0xFFFF));
2476 }
2477
2478 static inline int vec_all_eq(vec_char16 a, vec_char16 b)
2479 {
2480 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0xFFFF));
2481 }
2482
2483 static inline int vec_all_eq(vec_bchar16 a, vec_char16 b)
2484 {
2485 return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_char16)(a), b)), 0) == 0xFFFF));
2486 }
2487
2488 static inline int vec_all_eq(vec_char16 a, vec_bchar16 b)
2489 {
2490 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_char16)(b))), 0) == 0xFFFF));
2491 }
2492
2493 static inline int vec_all_eq(vec_ushort8 a, vec_ushort8 b)
2494 {
2495 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0xFF));
2496 }
2497
2498 static inline int vec_all_eq(vec_short8 a, vec_short8 b)
2499 {
2500 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0xFF));
2501 }
2502
2503 static inline int vec_all_eq(vec_bshort8 a, vec_short8 b)
2504 {
2505 return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_short8)(a), b)), 0) == 0xFF));
2506 }
2507
2508 static inline int vec_all_eq(vec_short8 a, vec_bshort8 b)
2509 {
2510 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_short8)(b))), 0) == 0xFF));
2511 }
2512
2513 static inline int vec_all_eq(vec_uint4 a, vec_uint4 b)
2514 {
2515 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0xF));
2516 }
2517
2518 static inline int vec_all_eq(vec_int4 a, vec_int4 b)
2519 {
2520 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0xF));
2521 }
2522
2523 static inline int vec_all_eq(vec_bint4 a, vec_int4 b)
2524 {
2525 return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_int4)(a), b)), 0) == 0xF));
2526 }
2527
2528 static inline int vec_all_eq(vec_int4 a, vec_bint4 b)
2529 {
2530 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_int4)(b))), 0) == 0xF));
2531 }
2532
2533 static inline int vec_all_eq(vec_float4 a, vec_float4 b)
2534 {
2535 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0xF));
2536 }
2537
2538
2539 /* vec_all_ge (all elements greater than or equal)
2540 * ==========
2541 */
2542 static inline int vec_all_ge(vec_uchar16 a, vec_uchar16 b)
2543 {
2544 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0));
2545 }
2546
2547 static inline int vec_all_ge(vec_char16 a, vec_char16 b)
2548 {
2549 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0));
2550 }
2551
2552 static inline int vec_all_ge(vec_bchar16 a, vec_char16 b)
2553 {
2554 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_char16)(a))), 0) == 0));
2555 }
2556
2557 static inline int vec_all_ge(vec_char16 a, vec_bchar16 b)
2558 {
2559 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_char16)(b), a)), 0) == 0));
2560 }
2561
2562 static inline int vec_all_ge(vec_ushort8 a, vec_ushort8 b)
2563 {
2564 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0));
2565 }
2566
2567 static inline int vec_all_ge(vec_short8 a, vec_short8 b)
2568 {
2569 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0));
2570 }
2571
2572 static inline int vec_all_ge(vec_bshort8 a, vec_short8 b)
2573 {
2574 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_short8)(a))), 0) == 0));
2575 }
2576
2577 static inline int vec_all_ge(vec_short8 a, vec_bshort8 b)
2578 {
2579 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_short8)(b), a)), 0) == 0));
2580 }
2581
2582 static inline int vec_all_ge(vec_uint4 a, vec_uint4 b)
2583 {
2584 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0));
2585 }
2586
2587 static inline int vec_all_ge(vec_int4 a, vec_int4 b)
2588 {
2589 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0));
2590 }
2591
2592 static inline int vec_all_ge(vec_bint4 a, vec_int4 b)
2593 {
2594 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_int4)(a))), 0) == 0));
2595 }
2596
2597 static inline int vec_all_ge(vec_int4 a, vec_bint4 b)
2598 {
2599 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_int4)(b), a)), 0) == 0));
2600 }
2601
2602 static inline int vec_all_ge(vec_float4 a, vec_float4 b)
2603 {
2604 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0));
2605 }
2606
2607
2608 /* vec_all_gt (all elements greater than)
2609 * ==========
2610 */
2611 static inline int vec_all_gt(vec_uchar16 a, vec_uchar16 b)
2612 {
2613 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0xFFFF));
2614 }
2615
2616 static inline int vec_all_gt(vec_char16 a, vec_char16 b)
2617 {
2618 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0xFFFF));
2619 }
2620
2621 static inline int vec_all_gt(vec_bchar16 a, vec_char16 b)
2622 {
2623 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_char16)(a), b)), 0) == 0xFFFF));
2624 }
2625
2626 static inline int vec_all_gt(vec_char16 a, vec_bchar16 b)
2627 {
2628 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_char16)(b))), 0) == 0xFFFF));
2629 }
2630
2631 static inline int vec_all_gt(vec_ushort8 a, vec_ushort8 b)
2632 {
2633 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0xFF));
2634 }
2635
2636 static inline int vec_all_gt(vec_short8 a, vec_short8 b)
2637 {
2638 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0xFF));
2639 }
2640
2641 static inline int vec_all_gt(vec_bshort8 a, vec_short8 b)
2642 {
2643 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_short8)(a), b)), 0) == 0xFF));
2644 }
2645
2646 static inline int vec_all_gt(vec_short8 a, vec_bshort8 b)
2647 {
2648 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_short8)(b))), 0) == 0xFF));
2649 }
2650
2651 static inline int vec_all_gt(vec_uint4 a, vec_uint4 b)
2652 {
2653 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0xF));
2654 }
2655
2656 static inline int vec_all_gt(vec_int4 a, vec_int4 b)
2657 {
2658 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0xF));
2659 }
2660
2661 static inline int vec_all_gt(vec_bint4 a, vec_int4 b)
2662 {
2663 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_int4)(a), b)), 0) == 0xF));
2664 }
2665
2666 static inline int vec_all_gt(vec_int4 a, vec_bint4 b)
2667 {
2668 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_int4)(b))), 0) == 0xF));
2669 }
2670
2671 static inline int vec_all_gt(vec_float4 a, vec_float4 b)
2672 {
2673 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0xF));
2674 }
2675
2676
2677 /* vec_all_in (all elements in bounds)
2678 * ==========
2679 */
2680 static inline int vec_all_in(vec_float4 a, vec_float4 b)
2681 {
2682 return (spu_extract(spu_gather(spu_nor(spu_cmpabsgt(a, b), (vec_uint4)(spu_rlmaska((vec_int4)(b), -31)))), 0) == 0xF);
2683 }
2684
2685
2686 /* vec_all_le (all elements less than or equal)
2687 * ==========
2688 */
2689 static inline int vec_all_le(vec_uchar16 a, vec_uchar16 b)
2690 {
2691 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0));
2692 }
2693
2694 static inline int vec_all_le(vec_char16 a, vec_char16 b)
2695 {
2696 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0));
2697 }
2698
2699 static inline int vec_all_le(vec_bchar16 a, vec_char16 b)
2700 {
2701 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_char16)(a), b)), 0) == 0));
2702 }
2703
2704 static inline int vec_all_le(vec_char16 a, vec_bchar16 b)
2705 {
2706 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_char16)(b))), 0) == 0));
2707 }
2708
2709 static inline int vec_all_le(vec_ushort8 a, vec_ushort8 b)
2710 {
2711 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0));
2712 }
2713
2714 static inline int vec_all_le(vec_short8 a, vec_short8 b)
2715 {
2716 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0));
2717 }
2718
2719 static inline int vec_all_le(vec_bshort8 a, vec_short8 b)
2720 {
2721 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_short8)(a), b)), 0) == 0));
2722 }
2723
2724 static inline int vec_all_le(vec_short8 a, vec_bshort8 b)
2725 {
2726 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_short8)(b))), 0) == 0));
2727 }
2728
2729 static inline int vec_all_le(vec_uint4 a, vec_uint4 b)
2730 {
2731 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0));
2732 }
2733
2734 static inline int vec_all_le(vec_int4 a, vec_int4 b)
2735 {
2736 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0));
2737 }
2738
2739 static inline int vec_all_le(vec_bint4 a, vec_int4 b)
2740 {
2741 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_int4)(a), b)), 0) == 0));
2742 }
2743
2744 static inline int vec_all_le(vec_int4 a, vec_bint4 b)
2745 {
2746 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_int4)(b))), 0) == 0));
2747 }
2748
2749 static inline int vec_all_le(vec_float4 a, vec_float4 b)
2750 {
2751 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0));
2752 }
2753
2754
2755 /* vec_all_lt (all elements less than)
2756 * ==========
2757 */
2758 static inline int vec_all_lt(vec_uchar16 a, vec_uchar16 b)
2759 {
2760 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0xFFFF));
2761 }
2762
2763 static inline int vec_all_lt(vec_char16 a, vec_char16 b)
2764 {
2765 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0xFFFF));
2766 }
2767
2768 static inline int vec_all_lt(vec_bchar16 a, vec_char16 b)
2769 {
2770 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_char16)(a))), 0) == 0xFFFF));
2771 }
2772
2773 static inline int vec_all_lt(vec_char16 a, vec_bchar16 b)
2774 {
2775 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_char16)(b), a)), 0) == 0xFFFF));
2776 }
2777
2778 static inline int vec_all_lt(vec_ushort8 a, vec_ushort8 b)
2779 {
2780 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0xFF));
2781 }
2782
2783 static inline int vec_all_lt(vec_short8 a, vec_short8 b)
2784 {
2785 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0xFF));
2786 }
2787
2788 static inline int vec_all_lt(vec_bshort8 a, vec_short8 b)
2789 {
2790 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_short8)(a))), 0) == 0xFF));
2791 }
2792
2793 static inline int vec_all_lt(vec_short8 a, vec_bshort8 b)
2794 {
2795 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_short8)(b), a)), 0) == 0xFF));
2796 }
2797
2798 static inline int vec_all_lt(vec_uint4 a, vec_uint4 b)
2799 {
2800 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0xF));
2801 }
2802
2803 static inline int vec_all_lt(vec_int4 a, vec_int4 b)
2804 {
2805 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0xF));
2806 }
2807
2808 static inline int vec_all_lt(vec_bint4 a, vec_int4 b)
2809 {
2810 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_int4)(a))), 0) == 0xF));
2811 }
2812
2813 static inline int vec_all_lt(vec_int4 a, vec_bint4 b)
2814 {
2815 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_int4)(b), a)), 0) == 0xF));
2816 }
2817
2818 static inline int vec_all_lt(vec_float4 a, vec_float4 b)
2819 {
2820 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0xF));
2821 }
2822
2823
2824 /* vec_all_nan (all elements not a number)
2825 * ===========
2826 */
2827 static inline int vec_all_nan(vec_float4 a)
2828 {
2829 vec_uint4 exp, man;
2830 vec_uint4 exp_mask = spu_splats((unsigned int)0x7F800000);
2831
2832 exp = spu_and((vec_uint4)(a), exp_mask);
2833 man = spu_and((vec_uint4)(a), spu_splats((unsigned int)0x007FFFFF));
2834 return ((int)(spu_extract(spu_gather(spu_andc(spu_cmpeq(exp, exp_mask),
2835 spu_cmpeq(man, 0))), 0) == 0xF));
2836 }
2837
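/* Subsequent uses of vec_all_nan expand to this macro rather than the inline
 * function above; SPU single-precision arithmetic does not generate NaNs, so
 * the result is always false.
 */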
2838 #define vec_all_nan(_a) (0)
2839
2840
2841 /* vec_all_ne (all elements not equal)
2842 * ==========
2843 */
2844 static inline int vec_all_ne(vec_uchar16 a, vec_uchar16 b)
2845 {
2846 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0));
2847 }
2848
2849 static inline int vec_all_ne(vec_char16 a, vec_char16 b)
2850 {
2851 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0));
2852 }
2853
2854 static inline int vec_all_ne(vec_bchar16 a, vec_char16 b)
2855 {
2856 return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_char16)(a), b)), 0) == 0));
2857 }
2858
2859 static inline int vec_all_ne(vec_char16 a, vec_bchar16 b)
2860 {
2861 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_char16)(b))), 0) == 0));
2862 }
2863
2864 static inline int vec_all_ne(vec_ushort8 a, vec_ushort8 b)
2865 {
2866 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0));
2867 }
2868
2869 static inline int vec_all_ne(vec_short8 a, vec_short8 b)
2870 {
2871 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0));
2872 }
2873
2874 static inline int vec_all_ne(vec_bshort8 a, vec_short8 b)
2875 {
2876 return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_short8)(a), b)), 0) == 0));
2877 }
2878
2879 static inline int vec_all_ne(vec_short8 a, vec_bshort8 b)
2880 {
2881 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_short8)(b))), 0) == 0));
2882 }
2883
2884 static inline int vec_all_ne(vec_uint4 a, vec_uint4 b)
2885 {
2886 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0));
2887 }
2888
2889 static inline int vec_all_ne(vec_int4 a, vec_int4 b)
2890 {
2891 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0));
2892 }
2893
2894 static inline int vec_all_ne(vec_bint4 a, vec_int4 b)
2895 {
2896 return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_int4)(a), b)), 0) == 0));
2897 }
2898
2899 static inline int vec_all_ne(vec_int4 a, vec_bint4 b)
2900 {
2901 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_int4)(b))), 0) == 0));
2902 }
2903
2904 static inline int vec_all_ne(vec_float4 a, vec_float4 b)
2905 {
2906 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0));
2907 }
2908
2909
2910 /* vec_all_nge (all elements not greater than or equal)
2911 * ===========
2912 */
2913 static inline int vec_all_nge(vec_float4 a, vec_float4 b)
2914 {
2915 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0xF));
2916 }
2917
2918
2919 /* vec_all_ngt (all elements not greater than)
2920 * ===========
2921 */
2922 static inline int vec_all_ngt(vec_float4 a, vec_float4 b)
2923 {
2924 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0));
2925 }
2926
2927
2928 /* vec_all_nle (all elements not less than or equal)
2929 * ===========
2930 */
2931 static inline int vec_all_nle(vec_float4 a, vec_float4 b)
2932 {
2933 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0xF));
2934 }
2935
2936
2937 /* vec_all_nlt (all elements not less than)
2938 * ===========
2939 */
2940 static inline int vec_all_nlt(vec_float4 a, vec_float4 b)
2941 {
2942 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0));
2943 }
2944
2945
2946 /* vec_all_numeric (all elements numeric)
2947  * ===============
2948 */
2949 static inline int vec_all_numeric(vec_float4 a)
2950 {
2951 vec_uint4 exp;
2952
2953 exp = spu_and(spu_rlmask((vec_uint4)(a), -23), 0xFF);
2954 return ((int)(spu_extract(spu_gather(spu_cmpeq(exp, 255)), 0) == 0));
2955 }
2956
2957
2958
2959 /* vec_any_eq (any elements equal)
2960 * ==========
2961 */
2962 static inline int vec_any_eq(vec_uchar16 a, vec_uchar16 b)
2963 {
2964 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0));
2965 }
2966
2967 static inline int vec_any_eq(vec_char16 a, vec_char16 b)
2968 {
2969 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0));
2970 }
2971
2972 static inline int vec_any_eq(vec_bchar16 a, vec_char16 b)
2973 {
2974 return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_char16)(a), b)), 0) != 0));
2975 }
2976
2977 static inline int vec_any_eq(vec_char16 a, vec_bchar16 b)
2978 {
2979 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_char16)(b))), 0) != 0));
2980 }
2981
2982 static inline int vec_any_eq(vec_ushort8 a, vec_ushort8 b)
2983 {
2984 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0));
2985 }
2986
2987 static inline int vec_any_eq(vec_short8 a, vec_short8 b)
2988 {
2989 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0));
2990 }
2991
2992 static inline int vec_any_eq(vec_bshort8 a, vec_short8 b)
2993 {
2994 return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_short8)(a), b)), 0) != 0));
2995 }
2996
2997 static inline int vec_any_eq(vec_short8 a, vec_bshort8 b)
2998 {
2999 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_short8)(b))), 0) != 0));
3000 }
3001
3002 static inline int vec_any_eq(vec_uint4 a, vec_uint4 b)
3003 {
3004 return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpeq(a, b), -31)), 0)));
3005 }
3006
3007 static inline int vec_any_eq(vec_int4 a, vec_int4 b)
3008 {
3009 return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpeq(a, b), -31)), 0)));
3010 }
3011
3012 static inline int vec_any_eq(vec_bint4 a, vec_int4 b)
3013 {
3014 return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpeq((vec_int4)(a), b), -31)), 0)));
3015 }
3016
3017 static inline int vec_any_eq(vec_int4 a, vec_bint4 b)
3018 {
3019 return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpeq(a, (vec_int4)(b)), -31)), 0)));
3020 }
3021
3022 static inline int vec_any_eq(vec_float4 a, vec_float4 b)
3023 {
3024 return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpeq(a, b), -31)), 0)));
3025 }
3026
3027 /* vec_any_ge (any elements greater than or equal)
3028 * ==========
3029 */
3030 static inline int vec_any_ge(vec_uchar16 a, vec_uchar16 b)
3031 {
3032 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0xFFFF));
3033 }
3034
3035 static inline int vec_any_ge(vec_char16 a, vec_char16 b)
3036 {
3037 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0xFFFF));
3038 }
3039
3040 static inline int vec_any_ge(vec_bchar16 a, vec_char16 b)
3041 {
3042 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_char16)(a))), 0) != 0xFFFF));
3043 }
3044
3045 static inline int vec_any_ge(vec_char16 a, vec_bchar16 b)
3046 {
3047 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_char16)(b), a)), 0) != 0xFFFF));
3048 }
3049
3050 static inline int vec_any_ge(vec_ushort8 a, vec_ushort8 b)
3051 {
3052 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0xFF));
3053 }
3054
3055 static inline int vec_any_ge(vec_short8 a, vec_short8 b)
3056 {
3057 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0xFF));
3058 }
3059
3060 static inline int vec_any_ge(vec_bshort8 a, vec_short8 b)
3061 {
3062 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_short8)(a))), 0) != 0xFF));
3063 }
3064
3065 static inline int vec_any_ge(vec_short8 a, vec_bshort8 b)
3066 {
3067 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_short8)(b), a)), 0) != 0xFF));
3068 }
3069
3070 static inline int vec_any_ge(vec_uint4 a, vec_uint4 b)
3071 {
3072 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0xF));
3073 }
3074
3075 static inline int vec_any_ge(vec_int4 a, vec_int4 b)
3076 {
3077 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0xF));
3078 }
3079
3080 static inline int vec_any_ge(vec_bint4 a, vec_int4 b)
3081 {
3082 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_int4)(a))), 0) != 0xF));
3083 }
3084
3085 static inline int vec_any_ge(vec_int4 a, vec_bint4 b)
3086 {
3087 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_int4)(b), a)), 0) != 0xF));
3088 }
3089
3090 static inline int vec_any_ge(vec_float4 a, vec_float4 b)
3091 {
3092 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0xF));
3093 }
3094
3095
3096 /* vec_any_gt (any elements greater than)
3097 * ==========
3098 */
3099 static inline int vec_any_gt(vec_uchar16 a, vec_uchar16 b)
3100 {
3101 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0));
3102 }
3103
3104 static inline int vec_any_gt(vec_char16 a, vec_char16 b)
3105 {
3106 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0));
3107 }
3108
3109 static inline int vec_any_gt(vec_bchar16 a, vec_char16 b)
3110 {
3111 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_char16)(a), b)), 0) != 0));
3112 }
3113
3114 static inline int vec_any_gt(vec_char16 a, vec_bchar16 b)
3115 {
3116 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_char16)(b))), 0) != 0));
3117 }
3118
3119 static inline int vec_any_gt(vec_ushort8 a, vec_ushort8 b)
3120 {
3121 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0));
3122 }
3123
3124 static inline int vec_any_gt(vec_short8 a, vec_short8 b)
3125 {
3126 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0));
3127 }
3128
3129 static inline int vec_any_gt(vec_bshort8 a, vec_short8 b)
3130 {
3131 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_short8)(a), b)), 0) != 0));
3132 }
3133
3134 static inline int vec_any_gt(vec_short8 a, vec_bshort8 b)
3135 {
3136 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_short8)(b))), 0) != 0));
3137 }
3138
3139
3140 static inline int vec_any_gt(vec_uint4 a, vec_uint4 b)
3141 {
3142 return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(a, b), -31)), 0)));
3143 }
3144
3145 static inline int vec_any_gt(vec_int4 a, vec_int4 b)
3146 {
3147 return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(a, b), -31)), 0)));
3148 }
3149
3150 static inline int vec_any_gt(vec_bint4 a, vec_int4 b)
3151 {
3152 return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt((vec_int4)(a), b), -31)), 0)));
3153 }
3154
3155 static inline int vec_any_gt(vec_int4 a, vec_bint4 b)
3156 {
3157 return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(a, (vec_int4)(b)), -31)), 0)));
3158 }
3159
3160 static inline int vec_any_gt(vec_float4 a, vec_float4 b)
3161 {
3162 return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(a, b), -31)), 0)));
3163 }
3164
3165 /* vec_any_le (any elements less than or equal)
3166 * ==========
3167 */
3168 static inline int vec_any_le(vec_uchar16 a, vec_uchar16 b)
3169 {
3170 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0xFFFF));
3171 }
3172
3173 static inline int vec_any_le(vec_char16 a, vec_char16 b)
3174 {
3175 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0xFFFF));
3176 }
3177
3178 static inline int vec_any_le(vec_bchar16 a, vec_char16 b)
3179 {
3180 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_char16)(a), b)), 0) != 0xFFFF));
3181 }
3182
3183 static inline int vec_any_le(vec_char16 a, vec_bchar16 b)
3184 {
3185 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_char16)(b))), 0) != 0xFFFF));
3186 }
3187
3188 static inline int vec_any_le(vec_ushort8 a, vec_ushort8 b)
3189 {
3190 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0xFF));
3191 }
3192
3193 static inline int vec_any_le(vec_short8 a, vec_short8 b)
3194 {
3195 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0xFF));
3196 }
3197
3198 static inline int vec_any_le(vec_bshort8 a, vec_short8 b)
3199 {
3200 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_short8)(a), b)), 0) != 0xFF));
3201 }
3202
3203 static inline int vec_any_le(vec_short8 a, vec_bshort8 b)
3204 {
3205 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_short8)(b))), 0) != 0xFF));
3206 }
3207
3208 static inline int vec_any_le(vec_uint4 a, vec_uint4 b)
3209 {
3210 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0xF));
3211 }
3212
3213 static inline int vec_any_le(vec_int4 a, vec_int4 b)
3214 {
3215 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0xF));
3216 }
3217
3218 static inline int vec_any_le(vec_bint4 a, vec_int4 b)
3219 {
3220 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_int4)(a), b)), 0) != 0xF));
3221 }
3222
3223 static inline int vec_any_le(vec_int4 a, vec_bint4 b)
3224 {
3225 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_int4)(b))), 0) != 0xF));
3226 }
3227
3228 static inline int vec_any_le(vec_float4 a, vec_float4 b)
3229 {
3230 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0xF));
3231 }
3232
3233
3234 /* vec_any_lt (any elements less than)
3235 * ==========
3236 */
3237 static inline int vec_any_lt(vec_uchar16 a, vec_uchar16 b)
3238 {
3239 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0));
3240 }
3241
3242 static inline int vec_any_lt(vec_char16 a, vec_char16 b)
3243 {
3244 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0));
3245 }
3246
3247 static inline int vec_any_lt(vec_bchar16 a, vec_char16 b)
3248 {
3249 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_char16)(a))), 0) != 0));
3250 }
3251
3252 static inline int vec_any_lt(vec_char16 a, vec_bchar16 b)
3253 {
3254 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_char16)(b), a)), 0) != 0));
3255 }
3256
3257 static inline int vec_any_lt(vec_ushort8 a, vec_ushort8 b)
3258 {
3259 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0));
3260 }
3261
3262 static inline int vec_any_lt(vec_short8 a, vec_short8 b)
3263 {
3264 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0));
3265 }
3266
3267 static inline int vec_any_lt(vec_bshort8 a, vec_short8 b)
3268 {
3269 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_short8)(a))), 0) != 0));
3270 }
3271
3272 static inline int vec_any_lt(vec_short8 a, vec_bshort8 b)
3273 {
3274 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_short8)(b), a)), 0) != 0));
3275 }
3276
3277 static inline int vec_any_lt(vec_uint4 a, vec_uint4 b)
3278 {
3279 return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(b, a), -31)), 0)));
3280 }
3281
3282 static inline int vec_any_lt(vec_int4 a, vec_int4 b)
3283 {
3284 return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(b, a), -31)), 0)));
3285 }
3286
3287 static inline int vec_any_lt(vec_bint4 a, vec_int4 b)
3288 {
3289 return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(b, (vec_int4)(a)), -31)), 0)));
3290 }
3291
3292 static inline int vec_any_lt(vec_int4 a, vec_bint4 b)
3293 {
3294 return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt((vec_int4)(b), a), -31)), 0)));
3295 }
3296
3297 static inline int vec_any_lt(vec_float4 a, vec_float4 b)
3298 {
3299 return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(b, a), -31)), 0)));
3300 }
3301
3302 /* vec_any_nan (any elements not a number)
3303 * ===========
3304 */
3305 static inline int vec_any_nan(vec_float4 a)
3306 {
3307 vec_uint4 exp, man;
3308 vec_uint4 exp_mask = spu_splats((unsigned int)0x7F800000);
3309
3310 exp = spu_and((vec_uint4)(a), exp_mask);
3311 man = spu_and((vec_uint4)(a), spu_splats((unsigned int)0x007FFFFF));
3312 return ((int)(spu_extract(spu_gather(spu_andc(spu_cmpeq(exp, exp_mask),
3313 spu_cmpeq(man, 0))), 0) != 0));
3314 }
3315
3316
3317 /* vec_any_ne (any elements not equal)
3318 * ==========
3319 */
3320 static inline int vec_any_ne(vec_uchar16 a, vec_uchar16 b)
3321 {
3322 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0xFFFF));
3323 }
3324
3325 static inline int vec_any_ne(vec_char16 a, vec_char16 b)
3326 {
3327 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0xFFFF));
3328 }
3329
3330 static inline int vec_any_ne(vec_bchar16 a, vec_char16 b)
3331 {
3332 return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_char16)(a), b)), 0) != 0xFFFF));
3333 }
3334
3335 static inline int vec_any_ne(vec_char16 a, vec_bchar16 b)
3336 {
3337 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_char16)(b))), 0) != 0xFFFF));
3338 }
3339
3340 static inline int vec_any_ne(vec_ushort8 a, vec_ushort8 b)
3341 {
3342 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0xFF));
3343 }
3344
3345 static inline int vec_any_ne(vec_short8 a, vec_short8 b)
3346 {
3347 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0xFF));
3348 }
3349
3350 static inline int vec_any_ne(vec_bshort8 a, vec_short8 b)
3351 {
3352 return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_short8)(a), b)), 0) != 0xFF));
3353 }
3354
3355 static inline int vec_any_ne(vec_short8 a, vec_bshort8 b)
3356 {
3357 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_short8)(b))), 0) != 0xFF));
3358 }
3359
3360 static inline int vec_any_ne(vec_uint4 a, vec_uint4 b)
3361 {
3362 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0xF));
3363 }
3364
3365 static inline int vec_any_ne(vec_int4 a, vec_int4 b)
3366 {
3367 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0xF));
3368 }
3369
3370 static inline int vec_any_ne(vec_bint4 a, vec_int4 b)
3371 {
3372 return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_int4)(a), b)), 0) != 0xF));
3373 }
3374
3375 static inline int vec_any_ne(vec_int4 a, vec_bint4 b)
3376 {
3377 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_int4)(b))), 0) != 0xF));
3378 }
3379
3380 static inline int vec_any_ne(vec_float4 a, vec_float4 b)
3381 {
3382 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0xF));
3383 }
3384
3385
3386 /* vec_any_nge (any elements not greater than or equal)
3387 * ===========
3388 */
3389 static inline int vec_any_nge(vec_float4 a, vec_float4 b)
3390 {
3391 return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(b, a), -31)), 0)));
3392 }
3393
3394 /* vec_any_ngt (any elements not greater than)
3395 * ===========
3396 */
3397 static inline int vec_any_ngt(vec_float4 a, vec_float4 b)
3398 {
3399 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0xF));
3400 }
3401
3402
3403 /* vec_any_nle (any elements not less than or equal)
3404 * ===========
3405 */
3406 static inline int vec_any_nle(vec_float4 a, vec_float4 b)
3407 {
3408 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0));
3409 }
3410
3411
3412 /* vec_any_nlt (any elements not less than)
3413 * ===========
3414 */
3415 static inline int vec_any_nlt(vec_float4 a, vec_float4 b)
3416 {
3417 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0xF));
3418 }
3419
3420
3421 /* vec_any_numeric (any elements numeric)
3422 * ===============
3423 */
3424 static inline int vec_any_numeric(vec_float4 a)
3425 {
3426 vec_uint4 exp;
3427
3428 exp = spu_and(spu_rlmask((vec_uint4)(a), -23), 0xFF);
3429 return ((int)(spu_extract(spu_gather(spu_cmpeq(exp, 255)), 0) != 0xF));
3430 }
3431
3432
3433 /* vec_any_out (any elements out of bounds)
3434 * ===========
3435 */
3436 static inline int vec_any_out(vec_float4 a, vec_float4 b)
3437 {
3438 return (spu_extract(spu_gather(spu_nor(spu_cmpabsgt(a, b), (vec_uint4)(spu_rlmaska((vec_int4)(b), -31)))), 0) != 0xF);
3439 }
3440
3441
3442 /* CBE Language Extension Intrinsics
3443 */
3444
3445 /* vec_extract (extract element from vector)
3446 * ===========
3447 */
3448 #define vec_extract(_a, _element) spu_extract(_a, _element)
3449
3450
3451 /* vec_insert (insert scalar into specified vector element)
3452 * ==========
3453 */
3454 #define vec_insert(_a, _b, _element) spu_insert(_a, _b, _element)
3455
3456 /* vec_lvlx (load vector left indexed)
3457 * ========
3458 */
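/* The quadword containing the effective address is loaded (the quadword load
 * ignores the low four address bits) and shifted left so that the addressed
 * byte becomes the first byte of the result; the trailing bytes are zero.
 */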
3459 static inline vec_uchar16 vec_lvlx(int a, unsigned char *b)
3460 {
3461 vec_uchar16 *p = (vec_uchar16 *)((unsigned char *)(b) + a);
3462 return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
3463 }
3464
3465 static inline vec_uchar16 vec_lvlx(int a, vec_uchar16 *b)
3466 {
3467 vec_uchar16 *p = (vec_uchar16 *)((unsigned char *)(b) + a);
3468 return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
3469 }
3470
3471 static inline vec_char16 vec_lvlx(int a, signed char *b)
3472 {
3473 vec_char16 *p = (vec_char16 *)((unsigned char *)(b) + a);
3474 return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
3475 }
3476
3477 static inline vec_char16 vec_lvlx(int a, vec_char16 *b)
3478 {
3479 vec_char16 *p = (vec_char16 *)((unsigned char *)(b) + a);
3480 return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
3481 }
3482
3483 static inline vec_ushort8 vec_lvlx(int a, unsigned short *b)
3484 {
3485 vec_ushort8 *p = (vec_ushort8 *)((unsigned char *)(b) + a);
3486 return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
3487 }
3488
3489 static inline vec_ushort8 vec_lvlx(int a, vec_ushort8 *b)
3490 {
3491 vec_ushort8 *p = (vec_ushort8 *)((unsigned char *)(b) + a);
3492 return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
3493 }
3494
3495 static inline vec_short8 vec_lvlx(int a, signed short *b)
3496 {
3497 vec_short8 *p = (vec_short8 *)((unsigned char *)(b) + a);
3498 return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
3499 }
3500
3501 static inline vec_short8 vec_lvlx(int a, vec_short8 *b)
3502 {
3503 vec_short8 *p = (vec_short8 *)((unsigned char *)(b) + a);
3504 return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
3505 }
3506
3507 static inline vec_uint4 vec_lvlx(int a, unsigned int *b)
3508 {
3509 vec_uint4 *p = (vec_uint4 *)((unsigned char *)(b) + a);
3510 return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
3511 }
3512
3513 static inline vec_uint4 vec_lvlx(int a, vec_uint4 *b)
3514 {
3515 vec_uint4 *p = (vec_uint4 *)((unsigned char *)(b) + a);
3516 return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
3517 }
3518
3519 static inline vec_int4 vec_lvlx(int a, signed int *b)
3520 {
3521 vec_int4 *p = (vec_int4 *)((unsigned char *)(b) + a);
3522 return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
3523 }
3524
3525 static inline vec_int4 vec_lvlx(int a, vec_int4 *b)
3526 {
3527 vec_int4 *p = (vec_int4 *)((unsigned char *)(b) + a);
3528 return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
3529 }
3530
3531 static inline vec_float4 vec_lvlx(int a, float *b)
3532 {
3533 vec_float4 *p = (vec_float4 *)((unsigned char *)(b) + a);
3534 return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
3535 }
3536
3537 static inline vec_float4 vec_lvlx(int a, vec_float4 *b)
3538 {
3539 vec_float4 *p = (vec_float4 *)((unsigned char *)(b) + a);
3540 return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
3541 }
3542
3543
3544 /* vec_lvlxl (load vector left indexed last)
3545 * =========
3546 */
3547 #define vec_lvlxl(_a, _b) vec_lvlx(_a, _b)
3548
3549
3550 /* vec_lvrx (load vector right indexed)
3551 * ========
3552 */
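/* The quadword containing the effective address is loaded and shifted right
 * so that the bytes below the address occupy the end of the result, with the
 * leading bytes zero.  A 16-byte aligned address therefore yields all zeros.
 */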
3553 static inline vec_uchar16 vec_lvrx(int a, unsigned char *b)
3554 {
3555 vec_uchar16 *p = (vec_uchar16 *)((unsigned char *)(b) + a);
3556 return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
3557 }
3558
3559 static inline vec_uchar16 vec_lvrx(int a, vec_uchar16 *b)
3560 {
3561 vec_uchar16 *p = (vec_uchar16 *)((unsigned char *)(b) + a);
3562 return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
3563 }
3564
3565 static inline vec_char16 vec_lvrx(int a, signed char *b)
3566 {
3567 vec_char16 *p = (vec_char16 *)((unsigned char *)(b) + a);
3568 return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
3569 }
3570
3571 static inline vec_char16 vec_lvrx(int a, vec_char16 *b)
3572 {
3573 vec_char16 *p = (vec_char16 *)((unsigned char *)(b) + a);
3574 return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
3575 }
3576
3577 static inline vec_ushort8 vec_lvrx(int a, unsigned short *b)
3578 {
3579 vec_ushort8 *p = (vec_ushort8 *)((unsigned char *)(b) + a);
3580 return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
3581 }
3582
3583 static inline vec_ushort8 vec_lvrx(int a, vec_ushort8 *b)
3584 {
3585 vec_ushort8 *p = (vec_ushort8 *)((unsigned char *)(b) + a);
3586 return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
3587 }
3588
3589 static inline vec_short8 vec_lvrx(int a, signed short *b)
3590 {
3591 vec_short8 *p = (vec_short8 *)((unsigned char *)(b) + a);
3592 return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
3593 }
3594
3595 static inline vec_short8 vec_lvrx(int a, vec_short8 *b)
3596 {
3597 vec_short8 *p = (vec_short8 *)((unsigned char *)(b) + a);
3598 return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
3599 }
3600
3601 static inline vec_uint4 vec_lvrx(int a, unsigned int *b)
3602 {
3603 vec_uint4 *p = (vec_uint4 *)((unsigned char *)(b) + a);
3604 return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
3605 }
3606
3607 static inline vec_uint4 vec_lvrx(int a, vec_uint4 *b)
3608 {
3609 vec_uint4 *p = (vec_uint4 *)((unsigned char *)(b) + a);
3610 return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
3611 }
3612
3613 static inline vec_int4 vec_lvrx(int a, signed int *b)
3614 {
3615 vec_int4 *p = (vec_int4 *)((unsigned char *)(b) + a);
3616 return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
3617 }
3618
3619 static inline vec_int4 vec_lvrx(int a, vec_int4 *b)
3620 {
3621 vec_int4 *p = (vec_int4 *)((unsigned char *)(b) + a);
3622 return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
3623 }
3624
3625 static inline vec_float4 vec_lvrx(int a, float *b)
3626 {
3627 vec_float4 *p = (vec_float4 *)((unsigned char *)(b) + a);
3628 return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
3629 }
3630
3631 static inline vec_float4 vec_lvrx(int a, vec_float4 *b)
3632 {
3633 vec_float4 *p = (vec_float4 *)((unsigned char *)(b) + a);
3634 return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
3635 }
3636
3637
3638
3639 /* vec_lvrxl (load vector right indexed last)
3640 * =========
3641 */
3642 #define vec_lvrxl(_a, _b) vec_lvrx(_a, _b)
3643
3644
3645 /* vec_promote (promote scalar to a vector)
3646 * ===========
3647 */
3648 #define vec_promote(_a, _element) spu_promote(_a, _element)
3649
3650
3651 /* vec_splats (splat scalar to a vector)
3652 * ==========
3653 */
3654 #define vec_splats(_a) spu_splats(_a)
3655
3656
3657 /* vec_stvlx (store vector left indexed)
3658 * =========
3659 */
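/* A read-modify-write of the containing quadword: the leading bytes of a are
 * stored from the effective address to the end of the quadword, and the
 * bytes before the address are preserved.
 */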
3660 static inline void vec_stvlx(vec_uchar16 a, int b, unsigned char *c)
3661 {
3662 int shift;
3663 vec_uchar16 *p = (vec_uchar16 *)((unsigned char *)(c) + b);
3664
3665 shift = -((int)p & 0xF);
3666 *p = spu_sel(*p,
3667 spu_rlmaskqwbyte(a, shift),
3668 spu_rlmaskqwbyte(spu_splats((unsigned char)0xFF), shift));
3669 }
3670
3671 static inline void vec_stvlx(vec_uchar16 a, int b, vec_uchar16 *c)
3672 {
3673 int shift;
3674 vec_uchar16 *p = (vec_uchar16 *)((unsigned char *)(c) + b);
3675
3676 shift = -((int)p & 0xF);
3677 *p = spu_sel(*p,
3678 spu_rlmaskqwbyte(a, shift),
3679 spu_rlmaskqwbyte(spu_splats((unsigned char)0xFF), shift));
3680 }
3681
3682 static inline void vec_stvlx(vec_char16 a, int b, signed char *c)
3683 {
3684 int shift;
3685 vec_char16 *p = (vec_char16 *)((unsigned char *)(c) + b);
3686
3687 shift = -((int)p & 0xF);
3688 *p = spu_sel(*p,
3689 spu_rlmaskqwbyte(a, shift),
3690 spu_rlmaskqwbyte(spu_splats((unsigned char)0xFF), shift));
3691 }
3692
3693 static inline void vec_stvlx(vec_char16 a, int b, vec_char16 *c)
3694 {
3695 int shift;
3696 vec_char16 *p = (vec_char16 *)((unsigned char *)(c) + b);
3697
3698 shift = -((int)p & 0xF);
3699 *p = spu_sel(*p,
3700 spu_rlmaskqwbyte(a, shift),
3701 spu_rlmaskqwbyte(spu_splats((unsigned char)0xFF), shift));
3702 }
3703
3704 static inline void vec_stvlx(vec_ushort8 a, int b, unsigned short *c)
3705 {
3706 int shift;
3707 vec_ushort8 *p = (vec_ushort8 *)((unsigned char *)(c) + b);
3708
3709 shift = -((int)p & 0xF);
3710 *p = spu_sel(*p,
3711 spu_rlmaskqwbyte(a, shift),
3712 spu_rlmaskqwbyte(spu_splats((unsigned short)0xFFFF), shift));
3713 }
3714
3715 static inline void vec_stvlx(vec_ushort8 a, int b, vec_ushort8 *c)
3716 {
3717 int shift;
3718 vec_ushort8 *p = (vec_ushort8 *)((unsigned char *)(c) + b);
3719
3720 shift = -((int)p & 0xF);
3721 *p = spu_sel(*p,
3722 spu_rlmaskqwbyte(a, shift),
3723 spu_rlmaskqwbyte(spu_splats((unsigned short)0xFFFF), shift));
3724 }
3725
3726 static inline void vec_stvlx(vec_short8 a, int b, signed short *c)
3727 {
3728 int shift;
3729 vec_short8 *p = (vec_short8 *)((unsigned char *)(c) + b);
3730
3731 shift = -((int)p & 0xF);
3732 *p = spu_sel(*p,
3733 spu_rlmaskqwbyte(a, shift),
3734 spu_rlmaskqwbyte(spu_splats((unsigned short)0xFFFF), shift));
3735 }
3736
3737 static inline void vec_stvlx(vec_short8 a, int b, vec_short8 *c)
3738 {
3739 int shift;
3740 vec_short8 *p = (vec_short8 *)((unsigned char *)(c) + b);
3741
3742 shift = -((int)p & 0xF);
3743 *p = spu_sel(*p,
3744 spu_rlmaskqwbyte(a, shift),
3745 spu_rlmaskqwbyte(spu_splats((unsigned short)0xFFFF), shift));
3746 }
3747
3748 static inline void vec_stvlx(vec_uint4 a, int b, unsigned int *c)
3749 {
3750 int shift;
3751 vec_uint4 *p = (vec_uint4 *)((unsigned char *)(c) + b);
3752
3753 shift = -((int)p & 0xF);
3754 *p = spu_sel(*p,
3755 spu_rlmaskqwbyte(a, shift),
3756 spu_rlmaskqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
3757 }
3758
3759 static inline void vec_stvlx(vec_uint4 a, int b, vec_uint4 *c)
3760 {
3761 int shift;
3762 vec_uint4 *p = (vec_uint4 *)((unsigned char *)(c) + b);
3763
3764 shift = -((int)p & 0xF);
3765 *p = spu_sel(*p,
3766 spu_rlmaskqwbyte(a, shift),
3767 spu_rlmaskqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
3768 }
3769
3770 static inline void vec_stvlx(vec_int4 a, int b, signed int *c)
3771 {
3772 int shift;
3773 vec_int4 *p = (vec_int4 *)((unsigned char *)(c) + b);
3774
3775 shift = -((int)p & 0xF);
3776 *p = spu_sel(*p,
3777 spu_rlmaskqwbyte(a, shift),
3778 spu_rlmaskqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
3779 }
3780
3781 static inline void vec_stvlx(vec_int4 a, int b, vec_int4 *c)
3782 {
3783 int shift;
3784 vec_int4 *p = (vec_int4 *)((unsigned char *)(c) + b);
3785
3786 shift = -((int)p & 0xF);
3787 *p = spu_sel(*p,
3788 spu_rlmaskqwbyte(a, shift),
3789 spu_rlmaskqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
3790 }
3791
3792 static inline void vec_stvlx(vec_float4 a, int b, float *c)
3793 {
3794 int shift;
3795 vec_float4 *p = (vec_float4 *)((unsigned char *)(c) + b);
3796
3797 shift = -((int)p & 0xF);
3798 *p = spu_sel(*p,
3799 spu_rlmaskqwbyte(a, shift),
3800 spu_rlmaskqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
3801 }
3802
3803 static inline void vec_stvlx(vec_float4 a, int b, vec_float4 *c)
3804 {
3805 int shift;
3806 vec_float4 *p = (vec_float4 *)((unsigned char *)(c) + b);
3807
3808 shift = -((int)p & 0xF);
3809 *p = spu_sel(*p,
3810 spu_rlmaskqwbyte(a, shift),
3811 spu_rlmaskqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
3812 }
3813
3814 /* vec_stvlxl (store vector left indexed last)
3815 * ==========
3816 */
3817 #define vec_stvlxl(_a, _b, _c) vec_stvlx(_a, _b, _c)
3818
3819
3820 /* vec_stvrx (store vector right indexed)
3821 * =========
3822 */
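/* A read-modify-write of the containing quadword: the trailing bytes of a
 * are stored into the bytes that precede the effective address, and the
 * bytes at and beyond the address are preserved.
 */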
3823 static inline void vec_stvrx(vec_uchar16 a, int b, unsigned char *c)
3824 {
3825 int shift;
3826 vec_uchar16 *p = (vec_uchar16 *)((unsigned char *)(c) + b);
3827
3828 shift = 16-((int)p & 0xF);
3829 *p = spu_sel(*p,
3830 spu_slqwbyte(a, shift),
3831 spu_slqwbyte(spu_splats((unsigned char)0xFF), shift));
3832 }

static inline void vec_stvrx(vec_uchar16 a, int b, vec_uchar16 *c)
{
  int shift;
  vec_uchar16 *p = (vec_uchar16 *)((unsigned char *)(c) + b);

  shift = 16-((int)p & 0xF);
  *p = spu_sel(*p,
               spu_slqwbyte(a, shift),
               spu_slqwbyte(spu_splats((unsigned char)0xFF), shift));
}

static inline void vec_stvrx(vec_char16 a, int b, signed char *c)
{
  int shift;
  vec_char16 *p = (vec_char16 *)((unsigned char *)(c) + b);

  shift = 16-((int)p & 0xF);
  *p = spu_sel(*p,
               spu_slqwbyte(a, shift),
               spu_slqwbyte(spu_splats((unsigned char)0xFF), shift));
}

static inline void vec_stvrx(vec_char16 a, int b, vec_char16 *c)
{
  int shift;
  vec_char16 *p = (vec_char16 *)((unsigned char *)(c) + b);

  shift = 16-((int)p & 0xF);
  *p = spu_sel(*p,
               spu_slqwbyte(a, shift),
               spu_slqwbyte(spu_splats((unsigned char)0xFF), shift));
}

static inline void vec_stvrx(vec_ushort8 a, int b, unsigned short *c)
{
  int shift;
  vec_ushort8 *p = (vec_ushort8 *)((unsigned char *)(c) + b);

  shift = 16-((int)p & 0xF);
  *p = spu_sel(*p,
               spu_slqwbyte(a, shift),
               spu_slqwbyte(spu_splats((unsigned short)0xFFFF), shift));
}

static inline void vec_stvrx(vec_ushort8 a, int b, vec_ushort8 *c)
{
  int shift;
  vec_ushort8 *p = (vec_ushort8 *)((unsigned char *)(c) + b);

  shift = 16-((int)p & 0xF);
  *p = spu_sel(*p,
               spu_slqwbyte(a, shift),
               spu_slqwbyte(spu_splats((unsigned short)0xFFFF), shift));
}

static inline void vec_stvrx(vec_short8 a, int b, signed short *c)
{
  int shift;
  vec_short8 *p = (vec_short8 *)((unsigned char *)(c) + b);

  shift = 16-((int)p & 0xF);
  *p = spu_sel(*p,
               spu_slqwbyte(a, shift),
               spu_slqwbyte(spu_splats((unsigned short)0xFFFF), shift));
}

static inline void vec_stvrx(vec_short8 a, int b, vec_short8 *c)
{
  int shift;
  vec_short8 *p = (vec_short8 *)((unsigned char *)(c) + b);

  shift = 16-((int)p & 0xF);
  *p = spu_sel(*p,
               spu_slqwbyte(a, shift),
               spu_slqwbyte(spu_splats((unsigned short)0xFFFF), shift));
}

static inline void vec_stvrx(vec_uint4 a, int b, unsigned int *c)
{
  int shift;
  vec_uint4 *p = (vec_uint4 *)((unsigned char *)(c) + b);

  shift = 16-((int)p & 0xF);
  *p = spu_sel(*p,
               spu_slqwbyte(a, shift),
               spu_slqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
}

static inline void vec_stvrx(vec_uint4 a, int b, vec_uint4 *c)
{
  int shift;
  vec_uint4 *p = (vec_uint4 *)((unsigned char *)(c) + b);

  shift = 16-((int)p & 0xF);
  *p = spu_sel(*p,
               spu_slqwbyte(a, shift),
               spu_slqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
}

static inline void vec_stvrx(vec_int4 a, int b, signed int *c)
{
  int shift;
  vec_int4 *p = (vec_int4 *)((unsigned char *)(c) + b);

  shift = 16-((int)p & 0xF);
  *p = spu_sel(*p,
               spu_slqwbyte(a, shift),
               spu_slqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
}

static inline void vec_stvrx(vec_int4 a, int b, vec_int4 *c)
{
  int shift;
  vec_int4 *p = (vec_int4 *)((unsigned char *)(c) + b);

  shift = 16-((int)p & 0xF);
  *p = spu_sel(*p,
               spu_slqwbyte(a, shift),
               spu_slqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
}

static inline void vec_stvrx(vec_float4 a, int b, float *c)
{
  int shift;
  vec_float4 *p = (vec_float4 *)((unsigned char *)(c) + b);

  shift = 16-((int)p & 0xF);
  *p = spu_sel(*p,
               spu_slqwbyte(a, shift),
               spu_slqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
}

static inline void vec_stvrx(vec_float4 a, int b, vec_float4 *c)
{
  int shift;
  vec_float4 *p = (vec_float4 *)((unsigned char *)(c) + b);

  shift = 16-((int)p & 0xF);
  *p = spu_sel(*p,
               spu_slqwbyte(a, shift),
               spu_slqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
}

/* vec_stvrxl (store vector right indexed last)
 * ==========
 *
 * As with vec_stvlxl, the "last" cache hint has no SPU equivalent.
 */
#define vec_stvrxl(_a, _b, _c) vec_stvrx(_a, _b, _c)
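
/* Usage sketch (an illustrative addition, not part of the original VMX
 * mapping): with the definitions above, a whole vector can be stored to a
 * possibly unaligned address by pairing a left-indexed store with a
 * right-indexed store on the following quadword.  The helper name below is
 * hypothetical.  Note that each call reads and rewrites a full quadword,
 * so the store is not atomic with respect to neighboring bytes, and the
 * second call touches the next quadword even when dst happens to be
 * aligned.
 */
#if 0  /* usage sketch only */
static inline void example_store_unaligned(vec_uchar16 v, unsigned char *dst)
{
  vec_stvlx(v, 0, dst);   /* bytes of v from dst to the end of its quadword */
  vec_stvrx(v, 16, dst);  /* remaining bytes just below dst + 16 */
}
#endif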

#endif /* __SPU__ */
#endif /* __cplusplus */
#endif /* !_VMX2SPU_H_ */