gallium/tgsi_exec: Fix up NumOutputs counting
[mesa.git] / src / gallium / auxiliary / tgsi / tgsi_exec.c
1 /**************************************************************************
2 *
3 * Copyright 2007-2008 VMware, Inc.
4 * All Rights Reserved.
5 * Copyright 2009-2010 VMware, Inc. All rights Reserved.
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining a
8 * copy of this software and associated documentation files (the
9 * "Software"), to deal in the Software without restriction, including
10 * without limitation the rights to use, copy, modify, merge, publish,
11 * distribute, sub license, and/or sell copies of the Software, and to
12 * permit persons to whom the Software is furnished to do so, subject to
13 * the following conditions:
14 *
15 * The above copyright notice and this permission notice (including the
16 * next paragraph) shall be included in all copies or substantial portions
17 * of the Software.
18 *
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
22 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
23 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
24 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
25 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 *
27 **************************************************************************/
28
29 /**
30 * TGSI interpreter/executor.
31 *
32 * Flow control information:
33 *
34 * Since we operate on 'quads' (4 pixels or 4 vertices in parallel)
35 * flow control statements (IF/ELSE/ENDIF, LOOP/ENDLOOP) require special
36 * care since a condition may be true for some quad components but false
37 * for other components.
38 *
39 * We basically execute all statements (even if they're in the part of
40 * an IF/ELSE clause that's "not taken") and use a special mask to
41 * control writing to destination registers. This is the ExecMask.
42 * See store_dest().
43 *
 * The ExecMask is computed from other masks (CondMask, LoopMask and
 * ContMask, plus Switch.mask and FuncMask; see UPDATE_EXEC_MASK). The
 * first three are controlled by the flow control instructions (namely:
 * IF/ELSE/ENDIF, LOOP/ENDLOOP and CONT).
47 *
48 *
49 * Authors:
50 * Michal Krol
51 * Brian Paul
52 */
53
54 #include "pipe/p_compiler.h"
55 #include "pipe/p_state.h"
56 #include "pipe/p_shader_tokens.h"
57 #include "tgsi/tgsi_dump.h"
58 #include "tgsi/tgsi_parse.h"
59 #include "tgsi/tgsi_util.h"
60 #include "tgsi_exec.h"
61 #include "util/u_half.h"
62 #include "util/u_memory.h"
63 #include "util/u_math.h"
64 #include "util/rounding.h"
65
66
/* Compile-time debug switch; its consumers are outside this part of the file. */
#define DEBUG_EXECUTION 0

/*
 * When non-zero, micro_exp2() and micro_lg2() use the util_fast_exp2() /
 * util_fast_log2() helpers instead of the libm based code paths.
 */
#define FAST_MATH 0

/*
 * Positions of the four elements of a quad inside a channel, as used by
 * the derivative helpers (micro_ddx(), micro_ddy() and their _fine forms).
 */
#define TILE_TOP_LEFT     0
#define TILE_TOP_RIGHT    1
#define TILE_BOTTOM_LEFT  2
#define TILE_BOTTOM_RIGHT 3
76
77 union tgsi_double_channel {
78 double d[TGSI_QUAD_SIZE];
79 unsigned u[TGSI_QUAD_SIZE][2];
80 uint64_t u64[TGSI_QUAD_SIZE];
81 int64_t i64[TGSI_QUAD_SIZE];
82 };
83
84 struct tgsi_double_vector {
85 union tgsi_double_channel xy;
86 union tgsi_double_channel zw;
87 };
88
89 static void
90 micro_abs(union tgsi_exec_channel *dst,
91 const union tgsi_exec_channel *src)
92 {
93 dst->f[0] = fabsf(src->f[0]);
94 dst->f[1] = fabsf(src->f[1]);
95 dst->f[2] = fabsf(src->f[2]);
96 dst->f[3] = fabsf(src->f[3]);
97 }
98
99 static void
100 micro_arl(union tgsi_exec_channel *dst,
101 const union tgsi_exec_channel *src)
102 {
103 dst->i[0] = (int)floorf(src->f[0]);
104 dst->i[1] = (int)floorf(src->f[1]);
105 dst->i[2] = (int)floorf(src->f[2]);
106 dst->i[3] = (int)floorf(src->f[3]);
107 }
108
109 static void
110 micro_arr(union tgsi_exec_channel *dst,
111 const union tgsi_exec_channel *src)
112 {
113 dst->i[0] = (int)floorf(src->f[0] + 0.5f);
114 dst->i[1] = (int)floorf(src->f[1] + 0.5f);
115 dst->i[2] = (int)floorf(src->f[2] + 0.5f);
116 dst->i[3] = (int)floorf(src->f[3] + 0.5f);
117 }
118
119 static void
120 micro_ceil(union tgsi_exec_channel *dst,
121 const union tgsi_exec_channel *src)
122 {
123 dst->f[0] = ceilf(src->f[0]);
124 dst->f[1] = ceilf(src->f[1]);
125 dst->f[2] = ceilf(src->f[2]);
126 dst->f[3] = ceilf(src->f[3]);
127 }
128
129 static void
130 micro_cmp(union tgsi_exec_channel *dst,
131 const union tgsi_exec_channel *src0,
132 const union tgsi_exec_channel *src1,
133 const union tgsi_exec_channel *src2)
134 {
135 dst->f[0] = src0->f[0] < 0.0f ? src1->f[0] : src2->f[0];
136 dst->f[1] = src0->f[1] < 0.0f ? src1->f[1] : src2->f[1];
137 dst->f[2] = src0->f[2] < 0.0f ? src1->f[2] : src2->f[2];
138 dst->f[3] = src0->f[3] < 0.0f ? src1->f[3] : src2->f[3];
139 }
140
141 static void
142 micro_cos(union tgsi_exec_channel *dst,
143 const union tgsi_exec_channel *src)
144 {
145 dst->f[0] = cosf(src->f[0]);
146 dst->f[1] = cosf(src->f[1]);
147 dst->f[2] = cosf(src->f[2]);
148 dst->f[3] = cosf(src->f[3]);
149 }
150
151 static void
152 micro_d2f(union tgsi_exec_channel *dst,
153 const union tgsi_double_channel *src)
154 {
155 dst->f[0] = (float)src->d[0];
156 dst->f[1] = (float)src->d[1];
157 dst->f[2] = (float)src->d[2];
158 dst->f[3] = (float)src->d[3];
159 }
160
161 static void
162 micro_d2i(union tgsi_exec_channel *dst,
163 const union tgsi_double_channel *src)
164 {
165 dst->i[0] = (int)src->d[0];
166 dst->i[1] = (int)src->d[1];
167 dst->i[2] = (int)src->d[2];
168 dst->i[3] = (int)src->d[3];
169 }
170
171 static void
172 micro_d2u(union tgsi_exec_channel *dst,
173 const union tgsi_double_channel *src)
174 {
175 dst->u[0] = (unsigned)src->d[0];
176 dst->u[1] = (unsigned)src->d[1];
177 dst->u[2] = (unsigned)src->d[2];
178 dst->u[3] = (unsigned)src->d[3];
179 }
180 static void
181 micro_dabs(union tgsi_double_channel *dst,
182 const union tgsi_double_channel *src)
183 {
184 dst->d[0] = src->d[0] >= 0.0 ? src->d[0] : -src->d[0];
185 dst->d[1] = src->d[1] >= 0.0 ? src->d[1] : -src->d[1];
186 dst->d[2] = src->d[2] >= 0.0 ? src->d[2] : -src->d[2];
187 dst->d[3] = src->d[3] >= 0.0 ? src->d[3] : -src->d[3];
188 }
189
190 static void
191 micro_dadd(union tgsi_double_channel *dst,
192 const union tgsi_double_channel *src)
193 {
194 dst->d[0] = src[0].d[0] + src[1].d[0];
195 dst->d[1] = src[0].d[1] + src[1].d[1];
196 dst->d[2] = src[0].d[2] + src[1].d[2];
197 dst->d[3] = src[0].d[3] + src[1].d[3];
198 }
199
200 static void
201 micro_ddiv(union tgsi_double_channel *dst,
202 const union tgsi_double_channel *src)
203 {
204 dst->d[0] = src[0].d[0] / src[1].d[0];
205 dst->d[1] = src[0].d[1] / src[1].d[1];
206 dst->d[2] = src[0].d[2] / src[1].d[2];
207 dst->d[3] = src[0].d[3] / src[1].d[3];
208 }
209
210 static void
211 micro_ddx(union tgsi_exec_channel *dst,
212 const union tgsi_exec_channel *src)
213 {
214 dst->f[0] =
215 dst->f[1] =
216 dst->f[2] =
217 dst->f[3] = src->f[TILE_BOTTOM_RIGHT] - src->f[TILE_BOTTOM_LEFT];
218 }
219
220 static void
221 micro_ddx_fine(union tgsi_exec_channel *dst,
222 const union tgsi_exec_channel *src)
223 {
224 dst->f[0] =
225 dst->f[1] = src->f[TILE_TOP_RIGHT] - src->f[TILE_TOP_LEFT];
226 dst->f[2] =
227 dst->f[3] = src->f[TILE_BOTTOM_RIGHT] - src->f[TILE_BOTTOM_LEFT];
228 }
229
230
231 static void
232 micro_ddy(union tgsi_exec_channel *dst,
233 const union tgsi_exec_channel *src)
234 {
235 dst->f[0] =
236 dst->f[1] =
237 dst->f[2] =
238 dst->f[3] = src->f[TILE_BOTTOM_LEFT] - src->f[TILE_TOP_LEFT];
239 }
240
241 static void
242 micro_ddy_fine(union tgsi_exec_channel *dst,
243 const union tgsi_exec_channel *src)
244 {
245 dst->f[0] =
246 dst->f[2] = src->f[TILE_BOTTOM_LEFT] - src->f[TILE_TOP_LEFT];
247 dst->f[1] =
248 dst->f[3] = src->f[TILE_BOTTOM_RIGHT] - src->f[TILE_TOP_RIGHT];
249 }
250
251 static void
252 micro_dmul(union tgsi_double_channel *dst,
253 const union tgsi_double_channel *src)
254 {
255 dst->d[0] = src[0].d[0] * src[1].d[0];
256 dst->d[1] = src[0].d[1] * src[1].d[1];
257 dst->d[2] = src[0].d[2] * src[1].d[2];
258 dst->d[3] = src[0].d[3] * src[1].d[3];
259 }
260
261 static void
262 micro_dmax(union tgsi_double_channel *dst,
263 const union tgsi_double_channel *src)
264 {
265 dst->d[0] = src[0].d[0] > src[1].d[0] ? src[0].d[0] : src[1].d[0];
266 dst->d[1] = src[0].d[1] > src[1].d[1] ? src[0].d[1] : src[1].d[1];
267 dst->d[2] = src[0].d[2] > src[1].d[2] ? src[0].d[2] : src[1].d[2];
268 dst->d[3] = src[0].d[3] > src[1].d[3] ? src[0].d[3] : src[1].d[3];
269 }
270
271 static void
272 micro_dmin(union tgsi_double_channel *dst,
273 const union tgsi_double_channel *src)
274 {
275 dst->d[0] = src[0].d[0] < src[1].d[0] ? src[0].d[0] : src[1].d[0];
276 dst->d[1] = src[0].d[1] < src[1].d[1] ? src[0].d[1] : src[1].d[1];
277 dst->d[2] = src[0].d[2] < src[1].d[2] ? src[0].d[2] : src[1].d[2];
278 dst->d[3] = src[0].d[3] < src[1].d[3] ? src[0].d[3] : src[1].d[3];
279 }
280
281 static void
282 micro_dneg(union tgsi_double_channel *dst,
283 const union tgsi_double_channel *src)
284 {
285 dst->d[0] = -src->d[0];
286 dst->d[1] = -src->d[1];
287 dst->d[2] = -src->d[2];
288 dst->d[3] = -src->d[3];
289 }
290
291 static void
292 micro_dslt(union tgsi_double_channel *dst,
293 const union tgsi_double_channel *src)
294 {
295 dst->u[0][0] = src[0].d[0] < src[1].d[0] ? ~0U : 0U;
296 dst->u[1][0] = src[0].d[1] < src[1].d[1] ? ~0U : 0U;
297 dst->u[2][0] = src[0].d[2] < src[1].d[2] ? ~0U : 0U;
298 dst->u[3][0] = src[0].d[3] < src[1].d[3] ? ~0U : 0U;
299 }
300
301 static void
302 micro_dsne(union tgsi_double_channel *dst,
303 const union tgsi_double_channel *src)
304 {
305 dst->u[0][0] = src[0].d[0] != src[1].d[0] ? ~0U : 0U;
306 dst->u[1][0] = src[0].d[1] != src[1].d[1] ? ~0U : 0U;
307 dst->u[2][0] = src[0].d[2] != src[1].d[2] ? ~0U : 0U;
308 dst->u[3][0] = src[0].d[3] != src[1].d[3] ? ~0U : 0U;
309 }
310
311 static void
312 micro_dsge(union tgsi_double_channel *dst,
313 const union tgsi_double_channel *src)
314 {
315 dst->u[0][0] = src[0].d[0] >= src[1].d[0] ? ~0U : 0U;
316 dst->u[1][0] = src[0].d[1] >= src[1].d[1] ? ~0U : 0U;
317 dst->u[2][0] = src[0].d[2] >= src[1].d[2] ? ~0U : 0U;
318 dst->u[3][0] = src[0].d[3] >= src[1].d[3] ? ~0U : 0U;
319 }
320
321 static void
322 micro_dseq(union tgsi_double_channel *dst,
323 const union tgsi_double_channel *src)
324 {
325 dst->u[0][0] = src[0].d[0] == src[1].d[0] ? ~0U : 0U;
326 dst->u[1][0] = src[0].d[1] == src[1].d[1] ? ~0U : 0U;
327 dst->u[2][0] = src[0].d[2] == src[1].d[2] ? ~0U : 0U;
328 dst->u[3][0] = src[0].d[3] == src[1].d[3] ? ~0U : 0U;
329 }
330
331 static void
332 micro_drcp(union tgsi_double_channel *dst,
333 const union tgsi_double_channel *src)
334 {
335 dst->d[0] = 1.0 / src->d[0];
336 dst->d[1] = 1.0 / src->d[1];
337 dst->d[2] = 1.0 / src->d[2];
338 dst->d[3] = 1.0 / src->d[3];
339 }
340
341 static void
342 micro_dsqrt(union tgsi_double_channel *dst,
343 const union tgsi_double_channel *src)
344 {
345 dst->d[0] = sqrt(src->d[0]);
346 dst->d[1] = sqrt(src->d[1]);
347 dst->d[2] = sqrt(src->d[2]);
348 dst->d[3] = sqrt(src->d[3]);
349 }
350
351 static void
352 micro_drsq(union tgsi_double_channel *dst,
353 const union tgsi_double_channel *src)
354 {
355 dst->d[0] = 1.0 / sqrt(src->d[0]);
356 dst->d[1] = 1.0 / sqrt(src->d[1]);
357 dst->d[2] = 1.0 / sqrt(src->d[2]);
358 dst->d[3] = 1.0 / sqrt(src->d[3]);
359 }
360
361 static void
362 micro_dmad(union tgsi_double_channel *dst,
363 const union tgsi_double_channel *src)
364 {
365 dst->d[0] = src[0].d[0] * src[1].d[0] + src[2].d[0];
366 dst->d[1] = src[0].d[1] * src[1].d[1] + src[2].d[1];
367 dst->d[2] = src[0].d[2] * src[1].d[2] + src[2].d[2];
368 dst->d[3] = src[0].d[3] * src[1].d[3] + src[2].d[3];
369 }
370
371 static void
372 micro_dfrac(union tgsi_double_channel *dst,
373 const union tgsi_double_channel *src)
374 {
375 dst->d[0] = src->d[0] - floor(src->d[0]);
376 dst->d[1] = src->d[1] - floor(src->d[1]);
377 dst->d[2] = src->d[2] - floor(src->d[2]);
378 dst->d[3] = src->d[3] - floor(src->d[3]);
379 }
380
381 static void
382 micro_dflr(union tgsi_double_channel *dst,
383 const union tgsi_double_channel *src)
384 {
385 dst->d[0] = floor(src->d[0]);
386 dst->d[1] = floor(src->d[1]);
387 dst->d[2] = floor(src->d[2]);
388 dst->d[3] = floor(src->d[3]);
389 }
390
391 static void
392 micro_dldexp(union tgsi_double_channel *dst,
393 const union tgsi_double_channel *src0,
394 union tgsi_exec_channel *src1)
395 {
396 dst->d[0] = ldexp(src0->d[0], src1->i[0]);
397 dst->d[1] = ldexp(src0->d[1], src1->i[1]);
398 dst->d[2] = ldexp(src0->d[2], src1->i[2]);
399 dst->d[3] = ldexp(src0->d[3], src1->i[3]);
400 }
401
402 static void
403 micro_dfracexp(union tgsi_double_channel *dst,
404 union tgsi_exec_channel *dst_exp,
405 const union tgsi_double_channel *src)
406 {
407 dst->d[0] = frexp(src->d[0], &dst_exp->i[0]);
408 dst->d[1] = frexp(src->d[1], &dst_exp->i[1]);
409 dst->d[2] = frexp(src->d[2], &dst_exp->i[2]);
410 dst->d[3] = frexp(src->d[3], &dst_exp->i[3]);
411 }
412
413 static void
414 micro_exp2(union tgsi_exec_channel *dst,
415 const union tgsi_exec_channel *src)
416 {
417 #if FAST_MATH
418 dst->f[0] = util_fast_exp2(src->f[0]);
419 dst->f[1] = util_fast_exp2(src->f[1]);
420 dst->f[2] = util_fast_exp2(src->f[2]);
421 dst->f[3] = util_fast_exp2(src->f[3]);
422 #else
423 #if DEBUG
424 /* Inf is okay for this instruction, so clamp it to silence assertions. */
425 uint i;
426 union tgsi_exec_channel clamped;
427
428 for (i = 0; i < 4; i++) {
429 if (src->f[i] > 127.99999f) {
430 clamped.f[i] = 127.99999f;
431 } else if (src->f[i] < -126.99999f) {
432 clamped.f[i] = -126.99999f;
433 } else {
434 clamped.f[i] = src->f[i];
435 }
436 }
437 src = &clamped;
438 #endif /* DEBUG */
439
440 dst->f[0] = powf(2.0f, src->f[0]);
441 dst->f[1] = powf(2.0f, src->f[1]);
442 dst->f[2] = powf(2.0f, src->f[2]);
443 dst->f[3] = powf(2.0f, src->f[3]);
444 #endif /* FAST_MATH */
445 }
446
447 static void
448 micro_f2d(union tgsi_double_channel *dst,
449 const union tgsi_exec_channel *src)
450 {
451 dst->d[0] = (double)src->f[0];
452 dst->d[1] = (double)src->f[1];
453 dst->d[2] = (double)src->f[2];
454 dst->d[3] = (double)src->f[3];
455 }
456
457 static void
458 micro_flr(union tgsi_exec_channel *dst,
459 const union tgsi_exec_channel *src)
460 {
461 dst->f[0] = floorf(src->f[0]);
462 dst->f[1] = floorf(src->f[1]);
463 dst->f[2] = floorf(src->f[2]);
464 dst->f[3] = floorf(src->f[3]);
465 }
466
467 static void
468 micro_frc(union tgsi_exec_channel *dst,
469 const union tgsi_exec_channel *src)
470 {
471 dst->f[0] = src->f[0] - floorf(src->f[0]);
472 dst->f[1] = src->f[1] - floorf(src->f[1]);
473 dst->f[2] = src->f[2] - floorf(src->f[2]);
474 dst->f[3] = src->f[3] - floorf(src->f[3]);
475 }
476
477 static void
478 micro_i2d(union tgsi_double_channel *dst,
479 const union tgsi_exec_channel *src)
480 {
481 dst->d[0] = (double)src->i[0];
482 dst->d[1] = (double)src->i[1];
483 dst->d[2] = (double)src->i[2];
484 dst->d[3] = (double)src->i[3];
485 }
486
487 static void
488 micro_iabs(union tgsi_exec_channel *dst,
489 const union tgsi_exec_channel *src)
490 {
491 dst->i[0] = src->i[0] >= 0 ? src->i[0] : -src->i[0];
492 dst->i[1] = src->i[1] >= 0 ? src->i[1] : -src->i[1];
493 dst->i[2] = src->i[2] >= 0 ? src->i[2] : -src->i[2];
494 dst->i[3] = src->i[3] >= 0 ? src->i[3] : -src->i[3];
495 }
496
497 static void
498 micro_ineg(union tgsi_exec_channel *dst,
499 const union tgsi_exec_channel *src)
500 {
501 dst->i[0] = -src->i[0];
502 dst->i[1] = -src->i[1];
503 dst->i[2] = -src->i[2];
504 dst->i[3] = -src->i[3];
505 }
506
507 static void
508 micro_lg2(union tgsi_exec_channel *dst,
509 const union tgsi_exec_channel *src)
510 {
511 #if FAST_MATH
512 dst->f[0] = util_fast_log2(src->f[0]);
513 dst->f[1] = util_fast_log2(src->f[1]);
514 dst->f[2] = util_fast_log2(src->f[2]);
515 dst->f[3] = util_fast_log2(src->f[3]);
516 #else
517 dst->f[0] = logf(src->f[0]) * 1.442695f;
518 dst->f[1] = logf(src->f[1]) * 1.442695f;
519 dst->f[2] = logf(src->f[2]) * 1.442695f;
520 dst->f[3] = logf(src->f[3]) * 1.442695f;
521 #endif
522 }
523
524 static void
525 micro_lrp(union tgsi_exec_channel *dst,
526 const union tgsi_exec_channel *src0,
527 const union tgsi_exec_channel *src1,
528 const union tgsi_exec_channel *src2)
529 {
530 dst->f[0] = src0->f[0] * (src1->f[0] - src2->f[0]) + src2->f[0];
531 dst->f[1] = src0->f[1] * (src1->f[1] - src2->f[1]) + src2->f[1];
532 dst->f[2] = src0->f[2] * (src1->f[2] - src2->f[2]) + src2->f[2];
533 dst->f[3] = src0->f[3] * (src1->f[3] - src2->f[3]) + src2->f[3];
534 }
535
536 static void
537 micro_mad(union tgsi_exec_channel *dst,
538 const union tgsi_exec_channel *src0,
539 const union tgsi_exec_channel *src1,
540 const union tgsi_exec_channel *src2)
541 {
542 dst->f[0] = src0->f[0] * src1->f[0] + src2->f[0];
543 dst->f[1] = src0->f[1] * src1->f[1] + src2->f[1];
544 dst->f[2] = src0->f[2] * src1->f[2] + src2->f[2];
545 dst->f[3] = src0->f[3] * src1->f[3] + src2->f[3];
546 }
547
548 static void
549 micro_mov(union tgsi_exec_channel *dst,
550 const union tgsi_exec_channel *src)
551 {
552 dst->u[0] = src->u[0];
553 dst->u[1] = src->u[1];
554 dst->u[2] = src->u[2];
555 dst->u[3] = src->u[3];
556 }
557
558 static void
559 micro_rcp(union tgsi_exec_channel *dst,
560 const union tgsi_exec_channel *src)
561 {
562 #if 0 /* for debugging */
563 assert(src->f[0] != 0.0f);
564 assert(src->f[1] != 0.0f);
565 assert(src->f[2] != 0.0f);
566 assert(src->f[3] != 0.0f);
567 #endif
568 dst->f[0] = 1.0f / src->f[0];
569 dst->f[1] = 1.0f / src->f[1];
570 dst->f[2] = 1.0f / src->f[2];
571 dst->f[3] = 1.0f / src->f[3];
572 }
573
574 static void
575 micro_rnd(union tgsi_exec_channel *dst,
576 const union tgsi_exec_channel *src)
577 {
578 dst->f[0] = _mesa_roundevenf(src->f[0]);
579 dst->f[1] = _mesa_roundevenf(src->f[1]);
580 dst->f[2] = _mesa_roundevenf(src->f[2]);
581 dst->f[3] = _mesa_roundevenf(src->f[3]);
582 }
583
584 static void
585 micro_rsq(union tgsi_exec_channel *dst,
586 const union tgsi_exec_channel *src)
587 {
588 #if 0 /* for debugging */
589 assert(src->f[0] != 0.0f);
590 assert(src->f[1] != 0.0f);
591 assert(src->f[2] != 0.0f);
592 assert(src->f[3] != 0.0f);
593 #endif
594 dst->f[0] = 1.0f / sqrtf(src->f[0]);
595 dst->f[1] = 1.0f / sqrtf(src->f[1]);
596 dst->f[2] = 1.0f / sqrtf(src->f[2]);
597 dst->f[3] = 1.0f / sqrtf(src->f[3]);
598 }
599
600 static void
601 micro_sqrt(union tgsi_exec_channel *dst,
602 const union tgsi_exec_channel *src)
603 {
604 dst->f[0] = sqrtf(src->f[0]);
605 dst->f[1] = sqrtf(src->f[1]);
606 dst->f[2] = sqrtf(src->f[2]);
607 dst->f[3] = sqrtf(src->f[3]);
608 }
609
610 static void
611 micro_seq(union tgsi_exec_channel *dst,
612 const union tgsi_exec_channel *src0,
613 const union tgsi_exec_channel *src1)
614 {
615 dst->f[0] = src0->f[0] == src1->f[0] ? 1.0f : 0.0f;
616 dst->f[1] = src0->f[1] == src1->f[1] ? 1.0f : 0.0f;
617 dst->f[2] = src0->f[2] == src1->f[2] ? 1.0f : 0.0f;
618 dst->f[3] = src0->f[3] == src1->f[3] ? 1.0f : 0.0f;
619 }
620
621 static void
622 micro_sge(union tgsi_exec_channel *dst,
623 const union tgsi_exec_channel *src0,
624 const union tgsi_exec_channel *src1)
625 {
626 dst->f[0] = src0->f[0] >= src1->f[0] ? 1.0f : 0.0f;
627 dst->f[1] = src0->f[1] >= src1->f[1] ? 1.0f : 0.0f;
628 dst->f[2] = src0->f[2] >= src1->f[2] ? 1.0f : 0.0f;
629 dst->f[3] = src0->f[3] >= src1->f[3] ? 1.0f : 0.0f;
630 }
631
632 static void
633 micro_sgn(union tgsi_exec_channel *dst,
634 const union tgsi_exec_channel *src)
635 {
636 dst->f[0] = src->f[0] < 0.0f ? -1.0f : src->f[0] > 0.0f ? 1.0f : 0.0f;
637 dst->f[1] = src->f[1] < 0.0f ? -1.0f : src->f[1] > 0.0f ? 1.0f : 0.0f;
638 dst->f[2] = src->f[2] < 0.0f ? -1.0f : src->f[2] > 0.0f ? 1.0f : 0.0f;
639 dst->f[3] = src->f[3] < 0.0f ? -1.0f : src->f[3] > 0.0f ? 1.0f : 0.0f;
640 }
641
642 static void
643 micro_isgn(union tgsi_exec_channel *dst,
644 const union tgsi_exec_channel *src)
645 {
646 dst->i[0] = src->i[0] < 0 ? -1 : src->i[0] > 0 ? 1 : 0;
647 dst->i[1] = src->i[1] < 0 ? -1 : src->i[1] > 0 ? 1 : 0;
648 dst->i[2] = src->i[2] < 0 ? -1 : src->i[2] > 0 ? 1 : 0;
649 dst->i[3] = src->i[3] < 0 ? -1 : src->i[3] > 0 ? 1 : 0;
650 }
651
652 static void
653 micro_sgt(union tgsi_exec_channel *dst,
654 const union tgsi_exec_channel *src0,
655 const union tgsi_exec_channel *src1)
656 {
657 dst->f[0] = src0->f[0] > src1->f[0] ? 1.0f : 0.0f;
658 dst->f[1] = src0->f[1] > src1->f[1] ? 1.0f : 0.0f;
659 dst->f[2] = src0->f[2] > src1->f[2] ? 1.0f : 0.0f;
660 dst->f[3] = src0->f[3] > src1->f[3] ? 1.0f : 0.0f;
661 }
662
663 static void
664 micro_sin(union tgsi_exec_channel *dst,
665 const union tgsi_exec_channel *src)
666 {
667 dst->f[0] = sinf(src->f[0]);
668 dst->f[1] = sinf(src->f[1]);
669 dst->f[2] = sinf(src->f[2]);
670 dst->f[3] = sinf(src->f[3]);
671 }
672
673 static void
674 micro_sle(union tgsi_exec_channel *dst,
675 const union tgsi_exec_channel *src0,
676 const union tgsi_exec_channel *src1)
677 {
678 dst->f[0] = src0->f[0] <= src1->f[0] ? 1.0f : 0.0f;
679 dst->f[1] = src0->f[1] <= src1->f[1] ? 1.0f : 0.0f;
680 dst->f[2] = src0->f[2] <= src1->f[2] ? 1.0f : 0.0f;
681 dst->f[3] = src0->f[3] <= src1->f[3] ? 1.0f : 0.0f;
682 }
683
684 static void
685 micro_slt(union tgsi_exec_channel *dst,
686 const union tgsi_exec_channel *src0,
687 const union tgsi_exec_channel *src1)
688 {
689 dst->f[0] = src0->f[0] < src1->f[0] ? 1.0f : 0.0f;
690 dst->f[1] = src0->f[1] < src1->f[1] ? 1.0f : 0.0f;
691 dst->f[2] = src0->f[2] < src1->f[2] ? 1.0f : 0.0f;
692 dst->f[3] = src0->f[3] < src1->f[3] ? 1.0f : 0.0f;
693 }
694
695 static void
696 micro_sne(union tgsi_exec_channel *dst,
697 const union tgsi_exec_channel *src0,
698 const union tgsi_exec_channel *src1)
699 {
700 dst->f[0] = src0->f[0] != src1->f[0] ? 1.0f : 0.0f;
701 dst->f[1] = src0->f[1] != src1->f[1] ? 1.0f : 0.0f;
702 dst->f[2] = src0->f[2] != src1->f[2] ? 1.0f : 0.0f;
703 dst->f[3] = src0->f[3] != src1->f[3] ? 1.0f : 0.0f;
704 }
705
706 static void
707 micro_trunc(union tgsi_exec_channel *dst,
708 const union tgsi_exec_channel *src)
709 {
710 dst->f[0] = truncf(src->f[0]);
711 dst->f[1] = truncf(src->f[1]);
712 dst->f[2] = truncf(src->f[2]);
713 dst->f[3] = truncf(src->f[3]);
714 }
715
716 static void
717 micro_u2d(union tgsi_double_channel *dst,
718 const union tgsi_exec_channel *src)
719 {
720 dst->d[0] = (double)src->u[0];
721 dst->d[1] = (double)src->u[1];
722 dst->d[2] = (double)src->u[2];
723 dst->d[3] = (double)src->u[3];
724 }
725
726 static void
727 micro_i64abs(union tgsi_double_channel *dst,
728 const union tgsi_double_channel *src)
729 {
730 dst->i64[0] = src->i64[0] >= 0.0 ? src->i64[0] : -src->i64[0];
731 dst->i64[1] = src->i64[1] >= 0.0 ? src->i64[1] : -src->i64[1];
732 dst->i64[2] = src->i64[2] >= 0.0 ? src->i64[2] : -src->i64[2];
733 dst->i64[3] = src->i64[3] >= 0.0 ? src->i64[3] : -src->i64[3];
734 }
735
736 static void
737 micro_i64sgn(union tgsi_double_channel *dst,
738 const union tgsi_double_channel *src)
739 {
740 dst->i64[0] = src->i64[0] < 0 ? -1 : src->i64[0] > 0 ? 1 : 0;
741 dst->i64[1] = src->i64[1] < 0 ? -1 : src->i64[1] > 0 ? 1 : 0;
742 dst->i64[2] = src->i64[2] < 0 ? -1 : src->i64[2] > 0 ? 1 : 0;
743 dst->i64[3] = src->i64[3] < 0 ? -1 : src->i64[3] > 0 ? 1 : 0;
744 }
745
746 static void
747 micro_i64neg(union tgsi_double_channel *dst,
748 const union tgsi_double_channel *src)
749 {
750 dst->i64[0] = -src->i64[0];
751 dst->i64[1] = -src->i64[1];
752 dst->i64[2] = -src->i64[2];
753 dst->i64[3] = -src->i64[3];
754 }
755
756 static void
757 micro_u64seq(union tgsi_double_channel *dst,
758 const union tgsi_double_channel *src)
759 {
760 dst->u[0][0] = src[0].u64[0] == src[1].u64[0] ? ~0U : 0U;
761 dst->u[1][0] = src[0].u64[1] == src[1].u64[1] ? ~0U : 0U;
762 dst->u[2][0] = src[0].u64[2] == src[1].u64[2] ? ~0U : 0U;
763 dst->u[3][0] = src[0].u64[3] == src[1].u64[3] ? ~0U : 0U;
764 }
765
766 static void
767 micro_u64sne(union tgsi_double_channel *dst,
768 const union tgsi_double_channel *src)
769 {
770 dst->u[0][0] = src[0].u64[0] != src[1].u64[0] ? ~0U : 0U;
771 dst->u[1][0] = src[0].u64[1] != src[1].u64[1] ? ~0U : 0U;
772 dst->u[2][0] = src[0].u64[2] != src[1].u64[2] ? ~0U : 0U;
773 dst->u[3][0] = src[0].u64[3] != src[1].u64[3] ? ~0U : 0U;
774 }
775
776 static void
777 micro_i64slt(union tgsi_double_channel *dst,
778 const union tgsi_double_channel *src)
779 {
780 dst->u[0][0] = src[0].i64[0] < src[1].i64[0] ? ~0U : 0U;
781 dst->u[1][0] = src[0].i64[1] < src[1].i64[1] ? ~0U : 0U;
782 dst->u[2][0] = src[0].i64[2] < src[1].i64[2] ? ~0U : 0U;
783 dst->u[3][0] = src[0].i64[3] < src[1].i64[3] ? ~0U : 0U;
784 }
785
786 static void
787 micro_u64slt(union tgsi_double_channel *dst,
788 const union tgsi_double_channel *src)
789 {
790 dst->u[0][0] = src[0].u64[0] < src[1].u64[0] ? ~0U : 0U;
791 dst->u[1][0] = src[0].u64[1] < src[1].u64[1] ? ~0U : 0U;
792 dst->u[2][0] = src[0].u64[2] < src[1].u64[2] ? ~0U : 0U;
793 dst->u[3][0] = src[0].u64[3] < src[1].u64[3] ? ~0U : 0U;
794 }
795
796 static void
797 micro_i64sge(union tgsi_double_channel *dst,
798 const union tgsi_double_channel *src)
799 {
800 dst->u[0][0] = src[0].i64[0] >= src[1].i64[0] ? ~0U : 0U;
801 dst->u[1][0] = src[0].i64[1] >= src[1].i64[1] ? ~0U : 0U;
802 dst->u[2][0] = src[0].i64[2] >= src[1].i64[2] ? ~0U : 0U;
803 dst->u[3][0] = src[0].i64[3] >= src[1].i64[3] ? ~0U : 0U;
804 }
805
806 static void
807 micro_u64sge(union tgsi_double_channel *dst,
808 const union tgsi_double_channel *src)
809 {
810 dst->u[0][0] = src[0].u64[0] >= src[1].u64[0] ? ~0U : 0U;
811 dst->u[1][0] = src[0].u64[1] >= src[1].u64[1] ? ~0U : 0U;
812 dst->u[2][0] = src[0].u64[2] >= src[1].u64[2] ? ~0U : 0U;
813 dst->u[3][0] = src[0].u64[3] >= src[1].u64[3] ? ~0U : 0U;
814 }
815
816 static void
817 micro_u64max(union tgsi_double_channel *dst,
818 const union tgsi_double_channel *src)
819 {
820 dst->u64[0] = src[0].u64[0] > src[1].u64[0] ? src[0].u64[0] : src[1].u64[0];
821 dst->u64[1] = src[0].u64[1] > src[1].u64[1] ? src[0].u64[1] : src[1].u64[1];
822 dst->u64[2] = src[0].u64[2] > src[1].u64[2] ? src[0].u64[2] : src[1].u64[2];
823 dst->u64[3] = src[0].u64[3] > src[1].u64[3] ? src[0].u64[3] : src[1].u64[3];
824 }
825
826 static void
827 micro_i64max(union tgsi_double_channel *dst,
828 const union tgsi_double_channel *src)
829 {
830 dst->i64[0] = src[0].i64[0] > src[1].i64[0] ? src[0].i64[0] : src[1].i64[0];
831 dst->i64[1] = src[0].i64[1] > src[1].i64[1] ? src[0].i64[1] : src[1].i64[1];
832 dst->i64[2] = src[0].i64[2] > src[1].i64[2] ? src[0].i64[2] : src[1].i64[2];
833 dst->i64[3] = src[0].i64[3] > src[1].i64[3] ? src[0].i64[3] : src[1].i64[3];
834 }
835
836 static void
837 micro_u64min(union tgsi_double_channel *dst,
838 const union tgsi_double_channel *src)
839 {
840 dst->u64[0] = src[0].u64[0] < src[1].u64[0] ? src[0].u64[0] : src[1].u64[0];
841 dst->u64[1] = src[0].u64[1] < src[1].u64[1] ? src[0].u64[1] : src[1].u64[1];
842 dst->u64[2] = src[0].u64[2] < src[1].u64[2] ? src[0].u64[2] : src[1].u64[2];
843 dst->u64[3] = src[0].u64[3] < src[1].u64[3] ? src[0].u64[3] : src[1].u64[3];
844 }
845
846 static void
847 micro_i64min(union tgsi_double_channel *dst,
848 const union tgsi_double_channel *src)
849 {
850 dst->i64[0] = src[0].i64[0] < src[1].i64[0] ? src[0].i64[0] : src[1].i64[0];
851 dst->i64[1] = src[0].i64[1] < src[1].i64[1] ? src[0].i64[1] : src[1].i64[1];
852 dst->i64[2] = src[0].i64[2] < src[1].i64[2] ? src[0].i64[2] : src[1].i64[2];
853 dst->i64[3] = src[0].i64[3] < src[1].i64[3] ? src[0].i64[3] : src[1].i64[3];
854 }
855
856 static void
857 micro_u64add(union tgsi_double_channel *dst,
858 const union tgsi_double_channel *src)
859 {
860 dst->u64[0] = src[0].u64[0] + src[1].u64[0];
861 dst->u64[1] = src[0].u64[1] + src[1].u64[1];
862 dst->u64[2] = src[0].u64[2] + src[1].u64[2];
863 dst->u64[3] = src[0].u64[3] + src[1].u64[3];
864 }
865
866 static void
867 micro_u64mul(union tgsi_double_channel *dst,
868 const union tgsi_double_channel *src)
869 {
870 dst->u64[0] = src[0].u64[0] * src[1].u64[0];
871 dst->u64[1] = src[0].u64[1] * src[1].u64[1];
872 dst->u64[2] = src[0].u64[2] * src[1].u64[2];
873 dst->u64[3] = src[0].u64[3] * src[1].u64[3];
874 }
875
876 static void
877 micro_u64div(union tgsi_double_channel *dst,
878 const union tgsi_double_channel *src)
879 {
880 dst->u64[0] = src[1].u64[0] ? src[0].u64[0] / src[1].u64[0] : ~0ull;
881 dst->u64[1] = src[1].u64[1] ? src[0].u64[1] / src[1].u64[1] : ~0ull;
882 dst->u64[2] = src[1].u64[2] ? src[0].u64[2] / src[1].u64[2] : ~0ull;
883 dst->u64[3] = src[1].u64[3] ? src[0].u64[3] / src[1].u64[3] : ~0ull;
884 }
885
886 static void
887 micro_i64div(union tgsi_double_channel *dst,
888 const union tgsi_double_channel *src)
889 {
890 dst->i64[0] = src[1].i64[0] ? src[0].i64[0] / src[1].i64[0] : 0;
891 dst->i64[1] = src[1].i64[1] ? src[0].i64[1] / src[1].i64[1] : 0;
892 dst->i64[2] = src[1].i64[2] ? src[0].i64[2] / src[1].i64[2] : 0;
893 dst->i64[3] = src[1].i64[3] ? src[0].i64[3] / src[1].i64[3] : 0;
894 }
895
896 static void
897 micro_u64mod(union tgsi_double_channel *dst,
898 const union tgsi_double_channel *src)
899 {
900 dst->u64[0] = src[1].u64[0] ? src[0].u64[0] % src[1].u64[0] : ~0ull;
901 dst->u64[1] = src[1].u64[1] ? src[0].u64[1] % src[1].u64[1] : ~0ull;
902 dst->u64[2] = src[1].u64[2] ? src[0].u64[2] % src[1].u64[2] : ~0ull;
903 dst->u64[3] = src[1].u64[3] ? src[0].u64[3] % src[1].u64[3] : ~0ull;
904 }
905
906 static void
907 micro_i64mod(union tgsi_double_channel *dst,
908 const union tgsi_double_channel *src)
909 {
910 dst->i64[0] = src[1].i64[0] ? src[0].i64[0] % src[1].i64[0] : ~0ll;
911 dst->i64[1] = src[1].i64[1] ? src[0].i64[1] % src[1].i64[1] : ~0ll;
912 dst->i64[2] = src[1].i64[2] ? src[0].i64[2] % src[1].i64[2] : ~0ll;
913 dst->i64[3] = src[1].i64[3] ? src[0].i64[3] % src[1].i64[3] : ~0ll;
914 }
915
916 static void
917 micro_u64shl(union tgsi_double_channel *dst,
918 const union tgsi_double_channel *src0,
919 union tgsi_exec_channel *src1)
920 {
921 unsigned masked_count;
922 masked_count = src1->u[0] & 0x3f;
923 dst->u64[0] = src0->u64[0] << masked_count;
924 masked_count = src1->u[1] & 0x3f;
925 dst->u64[1] = src0->u64[1] << masked_count;
926 masked_count = src1->u[2] & 0x3f;
927 dst->u64[2] = src0->u64[2] << masked_count;
928 masked_count = src1->u[3] & 0x3f;
929 dst->u64[3] = src0->u64[3] << masked_count;
930 }
931
932 static void
933 micro_i64shr(union tgsi_double_channel *dst,
934 const union tgsi_double_channel *src0,
935 union tgsi_exec_channel *src1)
936 {
937 unsigned masked_count;
938 masked_count = src1->u[0] & 0x3f;
939 dst->i64[0] = src0->i64[0] >> masked_count;
940 masked_count = src1->u[1] & 0x3f;
941 dst->i64[1] = src0->i64[1] >> masked_count;
942 masked_count = src1->u[2] & 0x3f;
943 dst->i64[2] = src0->i64[2] >> masked_count;
944 masked_count = src1->u[3] & 0x3f;
945 dst->i64[3] = src0->i64[3] >> masked_count;
946 }
947
948 static void
949 micro_u64shr(union tgsi_double_channel *dst,
950 const union tgsi_double_channel *src0,
951 union tgsi_exec_channel *src1)
952 {
953 unsigned masked_count;
954 masked_count = src1->u[0] & 0x3f;
955 dst->u64[0] = src0->u64[0] >> masked_count;
956 masked_count = src1->u[1] & 0x3f;
957 dst->u64[1] = src0->u64[1] >> masked_count;
958 masked_count = src1->u[2] & 0x3f;
959 dst->u64[2] = src0->u64[2] >> masked_count;
960 masked_count = src1->u[3] & 0x3f;
961 dst->u64[3] = src0->u64[3] >> masked_count;
962 }
963
/**
 * Tags for the data types an operand can be interpreted as.  The explicit
 * values match the implicit enumeration order.
 */
enum tgsi_exec_datatype {
   TGSI_EXEC_DATA_FLOAT  = 0,
   TGSI_EXEC_DATA_INT    = 1,
   TGSI_EXEC_DATA_UINT   = 2,
   TGSI_EXEC_DATA_DOUBLE = 3,
   TGSI_EXEC_DATA_INT64  = 4,
   TGSI_EXEC_DATA_UINT64 = 5,
};
972
/*
 * Short aliases for the TGSI_EXEC_TEMP_* utility register locations.
 * A name ending in _I is a register index, one ending in _C is a channel.
 */
#define TEMP_KILMASK_I       TGSI_EXEC_TEMP_KILMASK_I
#define TEMP_KILMASK_C       TGSI_EXEC_TEMP_KILMASK_C

#define TEMP_OUTPUT_I        TGSI_EXEC_TEMP_OUTPUT_I
#define TEMP_OUTPUT_C        TGSI_EXEC_TEMP_OUTPUT_C

#define TEMP_PRIMITIVE_I     TGSI_EXEC_TEMP_PRIMITIVE_I
#define TEMP_PRIMITIVE_C     TGSI_EXEC_TEMP_PRIMITIVE_C
#define TEMP_PRIMITIVE_S1_I  TGSI_EXEC_TEMP_PRIMITIVE_S1_I
#define TEMP_PRIMITIVE_S1_C  TGSI_EXEC_TEMP_PRIMITIVE_S1_C
#define TEMP_PRIMITIVE_S2_I  TGSI_EXEC_TEMP_PRIMITIVE_S2_I
#define TEMP_PRIMITIVE_S2_C  TGSI_EXEC_TEMP_PRIMITIVE_S2_C
#define TEMP_PRIMITIVE_S3_I  TGSI_EXEC_TEMP_PRIMITIVE_S3_I
#define TEMP_PRIMITIVE_S3_C  TGSI_EXEC_TEMP_PRIMITIVE_S3_C
988
/*
 * Lookup table from stream id (array position 0..3) to the Temps[] index
 * and channel that hold the primitive counter for that stream.
 * Used by emit_vertex(), emit_primitive() and conditional_emit_primitive().
 */
static const struct {
   int idx;   /* index into mach->Temps[] */
   int chan;  /* channel (xyzw[] slot) within that temp */
} temp_prim_idxs[] = {
   { TEMP_PRIMITIVE_I, TEMP_PRIMITIVE_C },
   { TEMP_PRIMITIVE_S1_I, TEMP_PRIMITIVE_S1_C },
   { TEMP_PRIMITIVE_S2_I, TEMP_PRIMITIVE_S2_C },
   { TEMP_PRIMITIVE_S3_I, TEMP_PRIMITIVE_S3_C },
};
998
/**
 * Recompute the execution mask.
 *
 * The execution mask is the AND of the conditional, loop, continue, switch
 * and function-call masks.
 *
 * The argument and the whole expansion are parenthesized so the macro works
 * with any pointer expression (e.g. UPDATE_EXEC_MASK(&m)) and inside larger
 * expressions.  MACH is evaluated several times, so it must not have side
 * effects.
 */
#define UPDATE_EXEC_MASK(MACH) \
   ((MACH)->ExecMask = (MACH)->CondMask & (MACH)->LoopMask & \
                       (MACH)->ContMask & (MACH)->Switch.mask & \
                       (MACH)->FuncMask)
1002
1003
/*
 * Constant channels with the same float value in all four lanes.
 *
 * ZeroVec is also passed as the (unused) 2D index when fetching indirection
 * registers; that relies on float 0.0 being all-zero bits so that the
 * integer view of the channel reads as 0 too (true for IEEE-754 floats).
 */
static const union tgsi_exec_channel ZeroVec =
   { { 0.0, 0.0, 0.0, 0.0 } };

/* All lanes 1.0f. */
static const union tgsi_exec_channel OneVec = {
   {1.0f, 1.0f, 1.0f, 1.0f}
};

/* All lanes +128.0f. */
static const union tgsi_exec_channel P128Vec = {
   {128.0f, 128.0f, 128.0f, 128.0f}
};

/* All lanes -128.0f. */
static const union tgsi_exec_channel M128Vec = {
   {-128.0f, -128.0f, -128.0f, -128.0f}
};
1018
1019
1020 /**
1021 * Assert that none of the float values in 'chan' are infinite or NaN.
1022 * NaN and Inf may occur normally during program execution and should
1023 * not lead to crashes, etc. But when debugging, it's helpful to catch
1024 * them.
1025 */
1026 static inline void
1027 check_inf_or_nan(const union tgsi_exec_channel *chan)
1028 {
1029 assert(!util_is_inf_or_nan((chan)->f[0]));
1030 assert(!util_is_inf_or_nan((chan)->f[1]));
1031 assert(!util_is_inf_or_nan((chan)->f[2]));
1032 assert(!util_is_inf_or_nan((chan)->f[3]));
1033 }
1034
1035
#ifdef DEBUG
/** Debug helper: print the four float lanes of 'chan', labelled with 'msg'. */
static void
print_chan(const char *msg, const union tgsi_exec_channel *chan)
{
   const float *lanes = chan->f;

   debug_printf("%s = {%f, %f, %f, %f}\n",
                msg, lanes[0], lanes[1], lanes[2], lanes[3]);
}
#endif
1044
1045
#ifdef DEBUG
/**
 * Debug helper: print all four channels (X, Y, Z, W) of temporary register
 * 'index', one channel per line with its four float lanes.
 */
static void
print_temp(const struct tgsi_exec_machine *mach, uint index)
{
   static const char chan_names[] = "XYZW";
   const struct tgsi_exec_vector *vec = &mach->Temps[index];
   int c;

   debug_printf("Temp[%u] =\n", index);
   for (c = 0; c < 4; c++) {
      const float *lanes = vec->xyzw[c].f;

      debug_printf(" %c: { %f, %f, %f, %f }\n",
                   chan_names[c],
                   lanes[0], lanes[1], lanes[2], lanes[3]);
   }
}
#endif
1063
1064
1065 void
1066 tgsi_exec_set_constant_buffers(struct tgsi_exec_machine *mach,
1067 unsigned num_bufs,
1068 const void **bufs,
1069 const unsigned *buf_sizes)
1070 {
1071 unsigned i;
1072
1073 for (i = 0; i < num_bufs; i++) {
1074 mach->Consts[i] = bufs[i];
1075 mach->ConstsSize[i] = buf_sizes[i];
1076 }
1077 }
1078
1079 /**
1080 * Initialize machine state by expanding tokens to full instructions,
1081 * allocating temporary storage, setting up constants, etc.
1082 * After this, we can call tgsi_exec_machine_run() many times.
1083 */
1084 void
1085 tgsi_exec_machine_bind_shader(
1086 struct tgsi_exec_machine *mach,
1087 const struct tgsi_token *tokens,
1088 struct tgsi_sampler *sampler,
1089 struct tgsi_image *image,
1090 struct tgsi_buffer *buffer)
1091 {
1092 uint k;
1093 struct tgsi_parse_context parse;
1094 struct tgsi_full_instruction *instructions;
1095 struct tgsi_full_declaration *declarations;
1096 uint maxInstructions = 10, numInstructions = 0;
1097 uint maxDeclarations = 10, numDeclarations = 0;
1098
1099 #if 0
1100 tgsi_dump(tokens, 0);
1101 #endif
1102
1103 util_init_math();
1104
1105
1106 mach->Tokens = tokens;
1107 mach->Sampler = sampler;
1108 mach->Image = image;
1109 mach->Buffer = buffer;
1110
1111 if (!tokens) {
1112 /* unbind and free all */
1113 FREE(mach->Declarations);
1114 mach->Declarations = NULL;
1115 mach->NumDeclarations = 0;
1116
1117 FREE(mach->Instructions);
1118 mach->Instructions = NULL;
1119 mach->NumInstructions = 0;
1120
1121 return;
1122 }
1123
1124 k = tgsi_parse_init (&parse, mach->Tokens);
1125 if (k != TGSI_PARSE_OK) {
1126 debug_printf( "Problem parsing!\n" );
1127 return;
1128 }
1129
1130 mach->ImmLimit = 0;
1131 mach->NumOutputs = 0;
1132
1133 for (k = 0; k < TGSI_SEMANTIC_COUNT; k++)
1134 mach->SysSemanticToIndex[k] = -1;
1135
1136 if (mach->ShaderType == PIPE_SHADER_GEOMETRY &&
1137 !mach->UsedGeometryShader) {
1138 struct tgsi_exec_vector *inputs;
1139 struct tgsi_exec_vector *outputs;
1140
1141 inputs = align_malloc(sizeof(struct tgsi_exec_vector) *
1142 TGSI_MAX_PRIM_VERTICES * PIPE_MAX_SHADER_INPUTS,
1143 16);
1144
1145 if (!inputs)
1146 return;
1147
1148 outputs = align_malloc(sizeof(struct tgsi_exec_vector) *
1149 TGSI_MAX_TOTAL_VERTICES, 16);
1150
1151 if (!outputs) {
1152 align_free(inputs);
1153 return;
1154 }
1155
1156 align_free(mach->Inputs);
1157 align_free(mach->Outputs);
1158
1159 mach->Inputs = inputs;
1160 mach->Outputs = outputs;
1161 mach->UsedGeometryShader = TRUE;
1162 }
1163
1164 declarations = (struct tgsi_full_declaration *)
1165 MALLOC( maxDeclarations * sizeof(struct tgsi_full_declaration) );
1166
1167 if (!declarations) {
1168 return;
1169 }
1170
1171 instructions = (struct tgsi_full_instruction *)
1172 MALLOC( maxInstructions * sizeof(struct tgsi_full_instruction) );
1173
1174 if (!instructions) {
1175 FREE( declarations );
1176 return;
1177 }
1178
1179 while( !tgsi_parse_end_of_tokens( &parse ) ) {
1180 uint i;
1181
1182 tgsi_parse_token( &parse );
1183 switch( parse.FullToken.Token.Type ) {
1184 case TGSI_TOKEN_TYPE_DECLARATION:
1185 /* save expanded declaration */
1186 if (numDeclarations == maxDeclarations) {
1187 declarations = REALLOC(declarations,
1188 maxDeclarations
1189 * sizeof(struct tgsi_full_declaration),
1190 (maxDeclarations + 10)
1191 * sizeof(struct tgsi_full_declaration));
1192 maxDeclarations += 10;
1193 }
1194 if (parse.FullToken.FullDeclaration.Declaration.File == TGSI_FILE_OUTPUT)
1195 mach->NumOutputs = MAX2(mach->NumOutputs, parse.FullToken.FullDeclaration.Range.Last + 1);
1196 else if (parse.FullToken.FullDeclaration.Declaration.File == TGSI_FILE_SYSTEM_VALUE) {
1197 const struct tgsi_full_declaration *decl = &parse.FullToken.FullDeclaration;
1198 mach->SysSemanticToIndex[decl->Semantic.Name] = decl->Range.First;
1199 }
1200
1201 memcpy(declarations + numDeclarations,
1202 &parse.FullToken.FullDeclaration,
1203 sizeof(declarations[0]));
1204 numDeclarations++;
1205 break;
1206
1207 case TGSI_TOKEN_TYPE_IMMEDIATE:
1208 {
1209 uint size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1;
1210 assert( size <= 4 );
1211 if (mach->ImmLimit >= mach->ImmsReserved) {
1212 unsigned newReserved = mach->ImmsReserved ? 2 * mach->ImmsReserved : 128;
1213 float4 *imms = REALLOC(mach->Imms, mach->ImmsReserved, newReserved * sizeof(float4));
1214 if (imms) {
1215 mach->ImmsReserved = newReserved;
1216 mach->Imms = imms;
1217 } else {
1218 debug_printf("Unable to (re)allocate space for immidiate constants\n");
1219 break;
1220 }
1221 }
1222
1223 for( i = 0; i < size; i++ ) {
1224 mach->Imms[mach->ImmLimit][i] =
1225 parse.FullToken.FullImmediate.u[i].Float;
1226 }
1227 mach->ImmLimit += 1;
1228 }
1229 break;
1230
1231 case TGSI_TOKEN_TYPE_INSTRUCTION:
1232
1233 /* save expanded instruction */
1234 if (numInstructions == maxInstructions) {
1235 instructions = REALLOC(instructions,
1236 maxInstructions
1237 * sizeof(struct tgsi_full_instruction),
1238 (maxInstructions + 10)
1239 * sizeof(struct tgsi_full_instruction));
1240 maxInstructions += 10;
1241 }
1242
1243 memcpy(instructions + numInstructions,
1244 &parse.FullToken.FullInstruction,
1245 sizeof(instructions[0]));
1246
1247 numInstructions++;
1248 break;
1249
1250 case TGSI_TOKEN_TYPE_PROPERTY:
1251 if (mach->ShaderType == PIPE_SHADER_GEOMETRY) {
1252 if (parse.FullToken.FullProperty.Property.PropertyName == TGSI_PROPERTY_GS_MAX_OUTPUT_VERTICES) {
1253 mach->MaxOutputVertices = parse.FullToken.FullProperty.u[0].Data;
1254 }
1255 }
1256 break;
1257
1258 default:
1259 assert( 0 );
1260 }
1261 }
1262 tgsi_parse_free (&parse);
1263
1264 FREE(mach->Declarations);
1265 mach->Declarations = declarations;
1266 mach->NumDeclarations = numDeclarations;
1267
1268 FREE(mach->Instructions);
1269 mach->Instructions = instructions;
1270 mach->NumInstructions = numInstructions;
1271 }
1272
1273
1274 struct tgsi_exec_machine *
1275 tgsi_exec_machine_create(enum pipe_shader_type shader_type)
1276 {
1277 struct tgsi_exec_machine *mach;
1278
1279 mach = align_malloc( sizeof *mach, 16 );
1280 if (!mach)
1281 goto fail;
1282
1283 memset(mach, 0, sizeof(*mach));
1284
1285 mach->ShaderType = shader_type;
1286 mach->Addrs = &mach->Temps[TGSI_EXEC_TEMP_ADDR];
1287 mach->MaxGeometryShaderOutputs = TGSI_MAX_TOTAL_VERTICES;
1288
1289 if (shader_type != PIPE_SHADER_COMPUTE) {
1290 mach->Inputs = align_malloc(sizeof(struct tgsi_exec_vector) * PIPE_MAX_SHADER_INPUTS, 16);
1291 mach->Outputs = align_malloc(sizeof(struct tgsi_exec_vector) * PIPE_MAX_SHADER_OUTPUTS, 16);
1292 if (!mach->Inputs || !mach->Outputs)
1293 goto fail;
1294 }
1295
1296 if (shader_type == PIPE_SHADER_FRAGMENT) {
1297 mach->InputSampleOffsetApply = align_malloc(sizeof(apply_sample_offset_func) * PIPE_MAX_SHADER_INPUTS, 16);
1298 if (!mach->InputSampleOffsetApply)
1299 goto fail;
1300 }
1301
1302 #ifdef DEBUG
1303 /* silence warnings */
1304 (void) print_chan;
1305 (void) print_temp;
1306 #endif
1307
1308 return mach;
1309
1310 fail:
1311 if (mach) {
1312 align_free(mach->InputSampleOffsetApply);
1313 align_free(mach->Inputs);
1314 align_free(mach->Outputs);
1315 align_free(mach);
1316 }
1317 return NULL;
1318 }
1319
1320
1321 void
1322 tgsi_exec_machine_destroy(struct tgsi_exec_machine *mach)
1323 {
1324 if (mach) {
1325 FREE(mach->Instructions);
1326 FREE(mach->Declarations);
1327 FREE(mach->Imms);
1328
1329 align_free(mach->InputSampleOffsetApply);
1330 align_free(mach->Inputs);
1331 align_free(mach->Outputs);
1332
1333 align_free(mach);
1334 }
1335 }
1336
1337 static void
1338 micro_add(union tgsi_exec_channel *dst,
1339 const union tgsi_exec_channel *src0,
1340 const union tgsi_exec_channel *src1)
1341 {
1342 dst->f[0] = src0->f[0] + src1->f[0];
1343 dst->f[1] = src0->f[1] + src1->f[1];
1344 dst->f[2] = src0->f[2] + src1->f[2];
1345 dst->f[3] = src0->f[3] + src1->f[3];
1346 }
1347
1348 static void
1349 micro_div(
1350 union tgsi_exec_channel *dst,
1351 const union tgsi_exec_channel *src0,
1352 const union tgsi_exec_channel *src1 )
1353 {
1354 if (src1->f[0] != 0) {
1355 dst->f[0] = src0->f[0] / src1->f[0];
1356 }
1357 if (src1->f[1] != 0) {
1358 dst->f[1] = src0->f[1] / src1->f[1];
1359 }
1360 if (src1->f[2] != 0) {
1361 dst->f[2] = src0->f[2] / src1->f[2];
1362 }
1363 if (src1->f[3] != 0) {
1364 dst->f[3] = src0->f[3] / src1->f[3];
1365 }
1366 }
1367
1368 static void
1369 micro_lt(
1370 union tgsi_exec_channel *dst,
1371 const union tgsi_exec_channel *src0,
1372 const union tgsi_exec_channel *src1,
1373 const union tgsi_exec_channel *src2,
1374 const union tgsi_exec_channel *src3 )
1375 {
1376 dst->f[0] = src0->f[0] < src1->f[0] ? src2->f[0] : src3->f[0];
1377 dst->f[1] = src0->f[1] < src1->f[1] ? src2->f[1] : src3->f[1];
1378 dst->f[2] = src0->f[2] < src1->f[2] ? src2->f[2] : src3->f[2];
1379 dst->f[3] = src0->f[3] < src1->f[3] ? src2->f[3] : src3->f[3];
1380 }
1381
1382 static void
1383 micro_max(union tgsi_exec_channel *dst,
1384 const union tgsi_exec_channel *src0,
1385 const union tgsi_exec_channel *src1)
1386 {
1387 dst->f[0] = src0->f[0] > src1->f[0] ? src0->f[0] : src1->f[0];
1388 dst->f[1] = src0->f[1] > src1->f[1] ? src0->f[1] : src1->f[1];
1389 dst->f[2] = src0->f[2] > src1->f[2] ? src0->f[2] : src1->f[2];
1390 dst->f[3] = src0->f[3] > src1->f[3] ? src0->f[3] : src1->f[3];
1391 }
1392
1393 static void
1394 micro_min(union tgsi_exec_channel *dst,
1395 const union tgsi_exec_channel *src0,
1396 const union tgsi_exec_channel *src1)
1397 {
1398 dst->f[0] = src0->f[0] < src1->f[0] ? src0->f[0] : src1->f[0];
1399 dst->f[1] = src0->f[1] < src1->f[1] ? src0->f[1] : src1->f[1];
1400 dst->f[2] = src0->f[2] < src1->f[2] ? src0->f[2] : src1->f[2];
1401 dst->f[3] = src0->f[3] < src1->f[3] ? src0->f[3] : src1->f[3];
1402 }
1403
1404 static void
1405 micro_mul(union tgsi_exec_channel *dst,
1406 const union tgsi_exec_channel *src0,
1407 const union tgsi_exec_channel *src1)
1408 {
1409 dst->f[0] = src0->f[0] * src1->f[0];
1410 dst->f[1] = src0->f[1] * src1->f[1];
1411 dst->f[2] = src0->f[2] * src1->f[2];
1412 dst->f[3] = src0->f[3] * src1->f[3];
1413 }
1414
1415 static void
1416 micro_neg(
1417 union tgsi_exec_channel *dst,
1418 const union tgsi_exec_channel *src )
1419 {
1420 dst->f[0] = -src->f[0];
1421 dst->f[1] = -src->f[1];
1422 dst->f[2] = -src->f[2];
1423 dst->f[3] = -src->f[3];
1424 }
1425
1426 static void
1427 micro_pow(
1428 union tgsi_exec_channel *dst,
1429 const union tgsi_exec_channel *src0,
1430 const union tgsi_exec_channel *src1 )
1431 {
1432 #if FAST_MATH
1433 dst->f[0] = util_fast_pow( src0->f[0], src1->f[0] );
1434 dst->f[1] = util_fast_pow( src0->f[1], src1->f[1] );
1435 dst->f[2] = util_fast_pow( src0->f[2], src1->f[2] );
1436 dst->f[3] = util_fast_pow( src0->f[3], src1->f[3] );
1437 #else
1438 dst->f[0] = powf( src0->f[0], src1->f[0] );
1439 dst->f[1] = powf( src0->f[1], src1->f[1] );
1440 dst->f[2] = powf( src0->f[2], src1->f[2] );
1441 dst->f[3] = powf( src0->f[3], src1->f[3] );
1442 #endif
1443 }
1444
1445 static void
1446 micro_ldexp(union tgsi_exec_channel *dst,
1447 const union tgsi_exec_channel *src0,
1448 const union tgsi_exec_channel *src1)
1449 {
1450 dst->f[0] = ldexpf(src0->f[0], src1->i[0]);
1451 dst->f[1] = ldexpf(src0->f[1], src1->i[1]);
1452 dst->f[2] = ldexpf(src0->f[2], src1->i[2]);
1453 dst->f[3] = ldexpf(src0->f[3], src1->i[3]);
1454 }
1455
1456 static void
1457 micro_sub(union tgsi_exec_channel *dst,
1458 const union tgsi_exec_channel *src0,
1459 const union tgsi_exec_channel *src1)
1460 {
1461 dst->f[0] = src0->f[0] - src1->f[0];
1462 dst->f[1] = src0->f[1] - src1->f[1];
1463 dst->f[2] = src0->f[2] - src1->f[2];
1464 dst->f[3] = src0->f[3] - src1->f[3];
1465 }
1466
1467 static void
1468 fetch_src_file_channel(const struct tgsi_exec_machine *mach,
1469 const uint file,
1470 const uint swizzle,
1471 const union tgsi_exec_channel *index,
1472 const union tgsi_exec_channel *index2D,
1473 union tgsi_exec_channel *chan)
1474 {
1475 uint i;
1476
1477 assert(swizzle < 4);
1478
1479 switch (file) {
1480 case TGSI_FILE_CONSTANT:
1481 for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1482 assert(index2D->i[i] >= 0 && index2D->i[i] < PIPE_MAX_CONSTANT_BUFFERS);
1483 assert(mach->Consts[index2D->i[i]]);
1484
1485 if (index->i[i] < 0) {
1486 chan->u[i] = 0;
1487 } else {
1488 /* NOTE: copying the const value as a uint instead of float */
1489 const uint constbuf = index2D->i[i];
1490 const uint *buf = (const uint *)mach->Consts[constbuf];
1491 const int pos = index->i[i] * 4 + swizzle;
1492 /* const buffer bounds check */
1493 if (pos < 0 || pos >= (int) mach->ConstsSize[constbuf]) {
1494 if (0) {
1495 /* Debug: print warning */
1496 static int count = 0;
1497 if (count++ < 100)
1498 debug_printf("TGSI Exec: const buffer index %d"
1499 " out of bounds\n", pos);
1500 }
1501 chan->u[i] = 0;
1502 }
1503 else
1504 chan->u[i] = buf[pos];
1505 }
1506 }
1507 break;
1508
1509 case TGSI_FILE_INPUT:
1510 for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1511 /*
1512 if (PIPE_SHADER_GEOMETRY == mach->ShaderType) {
1513 debug_printf("Fetching Input[%d] (2d=%d, 1d=%d)\n",
1514 index2D->i[i] * TGSI_EXEC_MAX_INPUT_ATTRIBS + index->i[i],
1515 index2D->i[i], index->i[i]);
1516 }*/
1517 int pos = index2D->i[i] * TGSI_EXEC_MAX_INPUT_ATTRIBS + index->i[i];
1518 assert(pos >= 0);
1519 assert(pos < TGSI_MAX_PRIM_VERTICES * PIPE_MAX_ATTRIBS);
1520 chan->u[i] = mach->Inputs[pos].xyzw[swizzle].u[i];
1521 }
1522 break;
1523
1524 case TGSI_FILE_SYSTEM_VALUE:
1525 for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1526 chan->u[i] = mach->SystemValue[index->i[i]].xyzw[swizzle].u[i];
1527 }
1528 break;
1529
1530 case TGSI_FILE_TEMPORARY:
1531 for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1532 assert(index->i[i] < TGSI_EXEC_NUM_TEMPS);
1533 assert(index2D->i[i] == 0);
1534
1535 chan->u[i] = mach->Temps[index->i[i]].xyzw[swizzle].u[i];
1536 }
1537 break;
1538
1539 case TGSI_FILE_IMMEDIATE:
1540 for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1541 assert(index->i[i] >= 0 && index->i[i] < (int)mach->ImmLimit);
1542 assert(index2D->i[i] == 0);
1543
1544 chan->f[i] = mach->Imms[index->i[i]][swizzle];
1545 }
1546 break;
1547
1548 case TGSI_FILE_ADDRESS:
1549 for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1550 assert(index->i[i] >= 0);
1551 assert(index2D->i[i] == 0);
1552
1553 chan->u[i] = mach->Addrs[index->i[i]].xyzw[swizzle].u[i];
1554 }
1555 break;
1556
1557 case TGSI_FILE_OUTPUT:
1558 /* vertex/fragment output vars can be read too */
1559 for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1560 assert(index->i[i] >= 0);
1561 assert(index2D->i[i] == 0);
1562
1563 chan->u[i] = mach->Outputs[index->i[i]].xyzw[swizzle].u[i];
1564 }
1565 break;
1566
1567 default:
1568 assert(0);
1569 for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1570 chan->u[i] = 0;
1571 }
1572 }
1573 }
1574
1575 static void
1576 get_index_registers(const struct tgsi_exec_machine *mach,
1577 const struct tgsi_full_src_register *reg,
1578 union tgsi_exec_channel *index,
1579 union tgsi_exec_channel *index2D)
1580 {
1581 uint swizzle;
1582
1583 /* We start with a direct index into a register file.
1584 *
1585 * file[1],
1586 * where:
1587 * file = Register.File
1588 * [1] = Register.Index
1589 */
1590 index->i[0] =
1591 index->i[1] =
1592 index->i[2] =
1593 index->i[3] = reg->Register.Index;
1594
1595 /* There is an extra source register that indirectly subscripts
1596 * a register file. The direct index now becomes an offset
1597 * that is being added to the indirect register.
1598 *
1599 * file[ind[2].x+1],
1600 * where:
1601 * ind = Indirect.File
1602 * [2] = Indirect.Index
1603 * .x = Indirect.SwizzleX
1604 */
1605 if (reg->Register.Indirect) {
1606 union tgsi_exec_channel index2;
1607 union tgsi_exec_channel indir_index;
1608 const uint execmask = mach->ExecMask;
1609 uint i;
1610
1611 /* which address register (always zero now) */
1612 index2.i[0] =
1613 index2.i[1] =
1614 index2.i[2] =
1615 index2.i[3] = reg->Indirect.Index;
1616 /* get current value of address register[swizzle] */
1617 swizzle = reg->Indirect.Swizzle;
1618 fetch_src_file_channel(mach,
1619 reg->Indirect.File,
1620 swizzle,
1621 &index2,
1622 &ZeroVec,
1623 &indir_index);
1624
1625 /* add value of address register to the offset */
1626 index->i[0] += indir_index.i[0];
1627 index->i[1] += indir_index.i[1];
1628 index->i[2] += indir_index.i[2];
1629 index->i[3] += indir_index.i[3];
1630
1631 /* for disabled execution channels, zero-out the index to
1632 * avoid using a potential garbage value.
1633 */
1634 for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1635 if ((execmask & (1 << i)) == 0)
1636 index->i[i] = 0;
1637 }
1638 }
1639
1640 /* There is an extra source register that is a second
1641 * subscript to a register file. Effectively it means that
1642 * the register file is actually a 2D array of registers.
1643 *
1644 * file[3][1],
1645 * where:
1646 * [3] = Dimension.Index
1647 */
1648 if (reg->Register.Dimension) {
1649 index2D->i[0] =
1650 index2D->i[1] =
1651 index2D->i[2] =
1652 index2D->i[3] = reg->Dimension.Index;
1653
1654 /* Again, the second subscript index can be addressed indirectly
1655 * identically to the first one.
1656 * Nothing stops us from indirectly addressing the indirect register,
1657 * but there is no need for that, so we won't exercise it.
1658 *
1659 * file[ind[4].y+3][1],
1660 * where:
1661 * ind = DimIndirect.File
1662 * [4] = DimIndirect.Index
1663 * .y = DimIndirect.SwizzleX
1664 */
1665 if (reg->Dimension.Indirect) {
1666 union tgsi_exec_channel index2;
1667 union tgsi_exec_channel indir_index;
1668 const uint execmask = mach->ExecMask;
1669 uint i;
1670
1671 index2.i[0] =
1672 index2.i[1] =
1673 index2.i[2] =
1674 index2.i[3] = reg->DimIndirect.Index;
1675
1676 swizzle = reg->DimIndirect.Swizzle;
1677 fetch_src_file_channel(mach,
1678 reg->DimIndirect.File,
1679 swizzle,
1680 &index2,
1681 &ZeroVec,
1682 &indir_index);
1683
1684 index2D->i[0] += indir_index.i[0];
1685 index2D->i[1] += indir_index.i[1];
1686 index2D->i[2] += indir_index.i[2];
1687 index2D->i[3] += indir_index.i[3];
1688
1689 /* for disabled execution channels, zero-out the index to
1690 * avoid using a potential garbage value.
1691 */
1692 for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1693 if ((execmask & (1 << i)) == 0) {
1694 index2D->i[i] = 0;
1695 }
1696 }
1697 }
1698
1699 /* If by any chance there was a need for a 3D array of register
1700 * files, we would have to check whether Dimension is followed
1701 * by a dimension register and continue the saga.
1702 */
1703 } else {
1704 index2D->i[0] =
1705 index2D->i[1] =
1706 index2D->i[2] =
1707 index2D->i[3] = 0;
1708 }
1709 }
1710
1711
1712 static void
1713 fetch_source_d(const struct tgsi_exec_machine *mach,
1714 union tgsi_exec_channel *chan,
1715 const struct tgsi_full_src_register *reg,
1716 const uint chan_index)
1717 {
1718 union tgsi_exec_channel index;
1719 union tgsi_exec_channel index2D;
1720 uint swizzle;
1721
1722 get_index_registers(mach, reg, &index, &index2D);
1723
1724
1725 swizzle = tgsi_util_get_full_src_register_swizzle( reg, chan_index );
1726 fetch_src_file_channel(mach,
1727 reg->Register.File,
1728 swizzle,
1729 &index,
1730 &index2D,
1731 chan);
1732 }
1733
1734 static void
1735 fetch_source(const struct tgsi_exec_machine *mach,
1736 union tgsi_exec_channel *chan,
1737 const struct tgsi_full_src_register *reg,
1738 const uint chan_index,
1739 enum tgsi_exec_datatype src_datatype)
1740 {
1741 fetch_source_d(mach, chan, reg, chan_index);
1742
1743 if (reg->Register.Absolute) {
1744 if (src_datatype == TGSI_EXEC_DATA_FLOAT) {
1745 micro_abs(chan, chan);
1746 } else {
1747 micro_iabs(chan, chan);
1748 }
1749 }
1750
1751 if (reg->Register.Negate) {
1752 if (src_datatype == TGSI_EXEC_DATA_FLOAT) {
1753 micro_neg(chan, chan);
1754 } else {
1755 micro_ineg(chan, chan);
1756 }
1757 }
1758 }
1759
1760 static union tgsi_exec_channel *
1761 store_dest_dstret(struct tgsi_exec_machine *mach,
1762 const union tgsi_exec_channel *chan,
1763 const struct tgsi_full_dst_register *reg,
1764 uint chan_index,
1765 enum tgsi_exec_datatype dst_datatype)
1766 {
1767 static union tgsi_exec_channel null;
1768 union tgsi_exec_channel *dst;
1769 union tgsi_exec_channel index2D;
1770 int offset = 0; /* indirection offset */
1771 int index;
1772
1773 /* for debugging */
1774 if (0 && dst_datatype == TGSI_EXEC_DATA_FLOAT) {
1775 check_inf_or_nan(chan);
1776 }
1777
1778 /* There is an extra source register that indirectly subscripts
1779 * a register file. The direct index now becomes an offset
1780 * that is being added to the indirect register.
1781 *
1782 * file[ind[2].x+1],
1783 * where:
1784 * ind = Indirect.File
1785 * [2] = Indirect.Index
1786 * .x = Indirect.SwizzleX
1787 */
1788 if (reg->Register.Indirect) {
1789 union tgsi_exec_channel index;
1790 union tgsi_exec_channel indir_index;
1791 uint swizzle;
1792
1793 /* which address register (always zero for now) */
1794 index.i[0] =
1795 index.i[1] =
1796 index.i[2] =
1797 index.i[3] = reg->Indirect.Index;
1798
1799 /* get current value of address register[swizzle] */
1800 swizzle = reg->Indirect.Swizzle;
1801
1802 /* fetch values from the address/indirection register */
1803 fetch_src_file_channel(mach,
1804 reg->Indirect.File,
1805 swizzle,
1806 &index,
1807 &ZeroVec,
1808 &indir_index);
1809
1810 /* save indirection offset */
1811 offset = indir_index.i[0];
1812 }
1813
1814 /* There is an extra source register that is a second
1815 * subscript to a register file. Effectively it means that
1816 * the register file is actually a 2D array of registers.
1817 *
1818 * file[3][1],
1819 * where:
1820 * [3] = Dimension.Index
1821 */
1822 if (reg->Register.Dimension) {
1823 index2D.i[0] =
1824 index2D.i[1] =
1825 index2D.i[2] =
1826 index2D.i[3] = reg->Dimension.Index;
1827
1828 /* Again, the second subscript index can be addressed indirectly
1829 * identically to the first one.
1830 * Nothing stops us from indirectly addressing the indirect register,
1831 * but there is no need for that, so we won't exercise it.
1832 *
1833 * file[ind[4].y+3][1],
1834 * where:
1835 * ind = DimIndirect.File
1836 * [4] = DimIndirect.Index
1837 * .y = DimIndirect.SwizzleX
1838 */
1839 if (reg->Dimension.Indirect) {
1840 union tgsi_exec_channel index2;
1841 union tgsi_exec_channel indir_index;
1842 const uint execmask = mach->ExecMask;
1843 unsigned swizzle;
1844 uint i;
1845
1846 index2.i[0] =
1847 index2.i[1] =
1848 index2.i[2] =
1849 index2.i[3] = reg->DimIndirect.Index;
1850
1851 swizzle = reg->DimIndirect.Swizzle;
1852 fetch_src_file_channel(mach,
1853 reg->DimIndirect.File,
1854 swizzle,
1855 &index2,
1856 &ZeroVec,
1857 &indir_index);
1858
1859 index2D.i[0] += indir_index.i[0];
1860 index2D.i[1] += indir_index.i[1];
1861 index2D.i[2] += indir_index.i[2];
1862 index2D.i[3] += indir_index.i[3];
1863
1864 /* for disabled execution channels, zero-out the index to
1865 * avoid using a potential garbage value.
1866 */
1867 for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1868 if ((execmask & (1 << i)) == 0) {
1869 index2D.i[i] = 0;
1870 }
1871 }
1872 }
1873
1874 /* If by any chance there was a need for a 3D array of register
1875 * files, we would have to check whether Dimension is followed
1876 * by a dimension register and continue the saga.
1877 */
1878 } else {
1879 index2D.i[0] =
1880 index2D.i[1] =
1881 index2D.i[2] =
1882 index2D.i[3] = 0;
1883 }
1884
1885 switch (reg->Register.File) {
1886 case TGSI_FILE_NULL:
1887 dst = &null;
1888 break;
1889
1890 case TGSI_FILE_OUTPUT:
1891 index = mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0]
1892 + reg->Register.Index;
1893 dst = &mach->Outputs[offset + index].xyzw[chan_index];
1894 #if 0
1895 debug_printf("NumOutputs = %d, TEMP_O_C/I = %d, redindex = %d\n",
1896 mach->NumOutputs, mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0],
1897 reg->Register.Index);
1898 if (PIPE_SHADER_GEOMETRY == mach->ShaderType) {
1899 debug_printf("STORING OUT[%d] mask(%d), = (", offset + index, execmask);
1900 for (i = 0; i < TGSI_QUAD_SIZE; i++)
1901 if (execmask & (1 << i))
1902 debug_printf("%f, ", chan->f[i]);
1903 debug_printf(")\n");
1904 }
1905 #endif
1906 break;
1907
1908 case TGSI_FILE_TEMPORARY:
1909 index = reg->Register.Index;
1910 assert( index < TGSI_EXEC_NUM_TEMPS );
1911 dst = &mach->Temps[offset + index].xyzw[chan_index];
1912 break;
1913
1914 case TGSI_FILE_ADDRESS:
1915 index = reg->Register.Index;
1916 dst = &mach->Addrs[index].xyzw[chan_index];
1917 break;
1918
1919 default:
1920 assert( 0 );
1921 return NULL;
1922 }
1923
1924 return dst;
1925 }
1926
1927 static void
1928 store_dest_double(struct tgsi_exec_machine *mach,
1929 const union tgsi_exec_channel *chan,
1930 const struct tgsi_full_dst_register *reg,
1931 uint chan_index,
1932 enum tgsi_exec_datatype dst_datatype)
1933 {
1934 union tgsi_exec_channel *dst;
1935 const uint execmask = mach->ExecMask;
1936 int i;
1937
1938 dst = store_dest_dstret(mach, chan, reg, chan_index, dst_datatype);
1939 if (!dst)
1940 return;
1941
1942 /* doubles path */
1943 for (i = 0; i < TGSI_QUAD_SIZE; i++)
1944 if (execmask & (1 << i))
1945 dst->i[i] = chan->i[i];
1946 }
1947
1948 static void
1949 store_dest(struct tgsi_exec_machine *mach,
1950 const union tgsi_exec_channel *chan,
1951 const struct tgsi_full_dst_register *reg,
1952 const struct tgsi_full_instruction *inst,
1953 uint chan_index,
1954 enum tgsi_exec_datatype dst_datatype)
1955 {
1956 union tgsi_exec_channel *dst;
1957 const uint execmask = mach->ExecMask;
1958 int i;
1959
1960 dst = store_dest_dstret(mach, chan, reg, chan_index, dst_datatype);
1961 if (!dst)
1962 return;
1963
1964 if (!inst->Instruction.Saturate) {
1965 for (i = 0; i < TGSI_QUAD_SIZE; i++)
1966 if (execmask & (1 << i))
1967 dst->i[i] = chan->i[i];
1968 }
1969 else {
1970 for (i = 0; i < TGSI_QUAD_SIZE; i++)
1971 if (execmask & (1 << i)) {
1972 if (chan->f[i] < 0.0f)
1973 dst->f[i] = 0.0f;
1974 else if (chan->f[i] > 1.0f)
1975 dst->f[i] = 1.0f;
1976 else
1977 dst->i[i] = chan->i[i];
1978 }
1979 }
1980 }
1981
/*
 * Convenience wrappers around fetch_source() for source operand INDEX of
 * the current instruction.  Both expect variables named 'mach' and 'inst'
 * to be in scope at the point of use.
 */

/* Fetch channel CHAN into VAL with float abs/negate modifiers. */
#define FETCH(VAL,INDEX,CHAN)\
    fetch_source(mach, VAL, &inst->Src[INDEX], CHAN, TGSI_EXEC_DATA_FLOAT)

/* Fetch channel CHAN into VAL with integer abs/negate modifiers. */
#define IFETCH(VAL,INDEX,CHAN)\
    fetch_source(mach, VAL, &inst->Src[INDEX], CHAN, TGSI_EXEC_DATA_INT)
1987
1988
1989 /**
1990 * Execute ARB-style KIL which is predicated by a src register.
1991 * Kill fragment if any of the four values is less than zero.
1992 */
1993 static void
1994 exec_kill_if(struct tgsi_exec_machine *mach,
1995 const struct tgsi_full_instruction *inst)
1996 {
1997 uint uniquemask;
1998 uint chan_index;
1999 uint kilmask = 0; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
2000 union tgsi_exec_channel r[1];
2001
2002 /* This mask stores component bits that were already tested. */
2003 uniquemask = 0;
2004
2005 for (chan_index = 0; chan_index < 4; chan_index++)
2006 {
2007 uint swizzle;
2008 uint i;
2009
2010 /* unswizzle channel */
2011 swizzle = tgsi_util_get_full_src_register_swizzle (
2012 &inst->Src[0],
2013 chan_index);
2014
2015 /* check if the component has not been already tested */
2016 if (uniquemask & (1 << swizzle))
2017 continue;
2018 uniquemask |= 1 << swizzle;
2019
2020 FETCH(&r[0], 0, chan_index);
2021 for (i = 0; i < 4; i++)
2022 if (r[0].f[i] < 0.0f)
2023 kilmask |= 1 << i;
2024 }
2025
2026 /* restrict to fragments currently executing */
2027 kilmask &= mach->ExecMask;
2028
2029 mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
2030 }
2031
2032 /**
2033 * Unconditional fragment kill/discard.
2034 */
2035 static void
2036 exec_kill(struct tgsi_exec_machine *mach)
2037 {
2038 uint kilmask; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
2039
2040 /* kill fragment for all fragments currently executing */
2041 kilmask = mach->ExecMask;
2042 mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
2043 }
2044
2045 static void
2046 emit_vertex(struct tgsi_exec_machine *mach,
2047 const struct tgsi_full_instruction *inst)
2048 {
2049 union tgsi_exec_channel r[1];
2050 unsigned stream_id;
2051 unsigned *prim_count;
2052 /* FIXME: check for exec mask correctly
2053 unsigned i;
2054 for (i = 0; i < TGSI_QUAD_SIZE; ++i) {
2055 if ((mach->ExecMask & (1 << i)))
2056 */
2057 IFETCH(&r[0], 0, TGSI_CHAN_X);
2058 stream_id = r[0].u[0];
2059 prim_count = &mach->Temps[temp_prim_idxs[stream_id].idx].xyzw[temp_prim_idxs[stream_id].chan].u[0];
2060 if (mach->ExecMask) {
2061 if (mach->Primitives[stream_id][*prim_count] >= mach->MaxOutputVertices)
2062 return;
2063
2064 if (mach->Primitives[stream_id][*prim_count] == 0)
2065 mach->PrimitiveOffsets[stream_id][*prim_count] = mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0];
2066 mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] += mach->NumOutputs;
2067 mach->Primitives[stream_id][*prim_count]++;
2068 }
2069 }
2070
2071 static void
2072 emit_primitive(struct tgsi_exec_machine *mach,
2073 const struct tgsi_full_instruction *inst)
2074 {
2075 unsigned *prim_count;
2076 union tgsi_exec_channel r[1];
2077 unsigned stream_id = 0;
2078 /* FIXME: check for exec mask correctly
2079 unsigned i;
2080 for (i = 0; i < TGSI_QUAD_SIZE; ++i) {
2081 if ((mach->ExecMask & (1 << i)))
2082 */
2083 if (inst) {
2084 IFETCH(&r[0], 0, TGSI_CHAN_X);
2085 stream_id = r[0].u[0];
2086 }
2087 prim_count = &mach->Temps[temp_prim_idxs[stream_id].idx].xyzw[temp_prim_idxs[stream_id].chan].u[0];
2088 if (mach->ExecMask) {
2089 ++(*prim_count);
2090 debug_assert((*prim_count * mach->NumOutputs) < mach->MaxGeometryShaderOutputs);
2091 mach->Primitives[stream_id][*prim_count] = 0;
2092 }
2093 }
2094
2095 static void
2096 conditional_emit_primitive(struct tgsi_exec_machine *mach)
2097 {
2098 if (PIPE_SHADER_GEOMETRY == mach->ShaderType) {
2099 int emitted_verts =
2100 mach->Primitives[0][mach->Temps[temp_prim_idxs[0].idx].xyzw[temp_prim_idxs[0].chan].u[0]];
2101 if (emitted_verts) {
2102 emit_primitive(mach, NULL);
2103 }
2104 }
2105 }
2106
2107
2108 /*
2109 * Fetch four texture samples using STR texture coordinates.
2110 */
2111 static void
2112 fetch_texel( struct tgsi_sampler *sampler,
2113 const unsigned sview_idx,
2114 const unsigned sampler_idx,
2115 const union tgsi_exec_channel *s,
2116 const union tgsi_exec_channel *t,
2117 const union tgsi_exec_channel *p,
2118 const union tgsi_exec_channel *c0,
2119 const union tgsi_exec_channel *c1,
2120 float derivs[3][2][TGSI_QUAD_SIZE],
2121 const int8_t offset[3],
2122 enum tgsi_sampler_control control,
2123 union tgsi_exec_channel *r,
2124 union tgsi_exec_channel *g,
2125 union tgsi_exec_channel *b,
2126 union tgsi_exec_channel *a )
2127 {
2128 uint j;
2129 float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
2130
2131 /* FIXME: handle explicit derivs, offsets */
2132 sampler->get_samples(sampler, sview_idx, sampler_idx,
2133 s->f, t->f, p->f, c0->f, c1->f, derivs, offset, control, rgba);
2134
2135 for (j = 0; j < 4; j++) {
2136 r->f[j] = rgba[0][j];
2137 g->f[j] = rgba[1][j];
2138 b->f[j] = rgba[2][j];
2139 a->f[j] = rgba[3][j];
2140 }
2141 }
2142
2143
/*
 * Texture instruction variants, passed as the "modifier" argument of
 * exec_tex() and exec_sample():
 *   NONE          - plain lookup, sampler control stays TGSI_SAMPLER_LOD_NONE
 *   PROJECTED     - exec_tex() divides the coordinates (and shadow reference)
 *                   by the fetched modifier value; rejected by exec_sample()
 *   LOD_BIAS      - mapped to TGSI_SAMPLER_LOD_BIAS
 *   EXPLICIT_LOD  - mapped to TGSI_SAMPLER_LOD_EXPLICIT
 *   LEVEL_ZERO    - mapped to TGSI_SAMPLER_LOD_ZERO by exec_sample();
 *                   rejected by exec_tex()
 *   GATHER        - mapped to TGSI_SAMPLER_GATHER
 */
2144 #define TEX_MODIFIER_NONE 0
2145 #define TEX_MODIFIER_PROJECTED 1
2146 #define TEX_MODIFIER_LOD_BIAS 2
2147 #define TEX_MODIFIER_EXPLICIT_LOD 3
2148 #define TEX_MODIFIER_LEVEL_ZERO 4
2149 #define TEX_MODIFIER_GATHER 5
2150
2151 /*
2152 * Fetch all 3 (for s,t,r coords) texel offsets, put them into int array.
2153 */
2154 static void
2155 fetch_texel_offsets(struct tgsi_exec_machine *mach,
2156 const struct tgsi_full_instruction *inst,
2157 int8_t offsets[3])
2158 {
2159 if (inst->Texture.NumOffsets == 1) {
2160 union tgsi_exec_channel index;
2161 union tgsi_exec_channel offset[3];
2162 index.i[0] = index.i[1] = index.i[2] = index.i[3] = inst->TexOffsets[0].Index;
2163 fetch_src_file_channel(mach, inst->TexOffsets[0].File,
2164 inst->TexOffsets[0].SwizzleX, &index, &ZeroVec, &offset[0]);
2165 fetch_src_file_channel(mach, inst->TexOffsets[0].File,
2166 inst->TexOffsets[0].SwizzleY, &index, &ZeroVec, &offset[1]);
2167 fetch_src_file_channel(mach, inst->TexOffsets[0].File,
2168 inst->TexOffsets[0].SwizzleZ, &index, &ZeroVec, &offset[2]);
2169 offsets[0] = offset[0].i[0];
2170 offsets[1] = offset[1].i[0];
2171 offsets[2] = offset[2].i[0];
2172 } else {
2173 assert(inst->Texture.NumOffsets == 0);
2174 offsets[0] = offsets[1] = offsets[2] = 0;
2175 }
2176 }
2177
2178
2179 /*
2180 * Fetch dx and dy values for one channel (s, t or r).
2181 * Put dx values into one float array, dy values into another.
2182 */
2183 static void
2184 fetch_assign_deriv_channel(struct tgsi_exec_machine *mach,
2185 const struct tgsi_full_instruction *inst,
2186 unsigned regdsrcx,
2187 unsigned chan,
2188 float derivs[2][TGSI_QUAD_SIZE])
2189 {
2190 union tgsi_exec_channel d;
2191 FETCH(&d, regdsrcx, chan);
2192 derivs[0][0] = d.f[0];
2193 derivs[0][1] = d.f[1];
2194 derivs[0][2] = d.f[2];
2195 derivs[0][3] = d.f[3];
2196 FETCH(&d, regdsrcx + 1, chan);
2197 derivs[1][0] = d.f[0];
2198 derivs[1][1] = d.f[1];
2199 derivs[1][2] = d.f[2];
2200 derivs[1][3] = d.f[3];
2201 }
2202
2203 static uint
2204 fetch_sampler_unit(struct tgsi_exec_machine *mach,
2205 const struct tgsi_full_instruction *inst,
2206 uint sampler)
2207 {
2208 uint unit = 0;
2209 int i;
2210 if (inst->Src[sampler].Register.Indirect) {
2211 const struct tgsi_full_src_register *reg = &inst->Src[sampler];
2212 union tgsi_exec_channel indir_index, index2;
2213 const uint execmask = mach->ExecMask;
2214 index2.i[0] =
2215 index2.i[1] =
2216 index2.i[2] =
2217 index2.i[3] = reg->Indirect.Index;
2218
2219 fetch_src_file_channel(mach,
2220 reg->Indirect.File,
2221 reg->Indirect.Swizzle,
2222 &index2,
2223 &ZeroVec,
2224 &indir_index);
2225 for (i = 0; i < TGSI_QUAD_SIZE; i++) {
2226 if (execmask & (1 << i)) {
2227 unit = inst->Src[sampler].Register.Index + indir_index.i[i];
2228 break;
2229 }
2230 }
2231
2232 } else {
2233 unit = inst->Src[sampler].Register.Index;
2234 }
2235 return unit;
2236 }
2237
2238 /*
2239 * execute a texture instruction.
2240 *
2241 * modifier is used to control the channel routing for the
2242 * instruction variants like proj, lod, and texture with lod bias.
2243 * sampler indicates which src register the sampler is contained in.
2244 */
2245 static void
2246 exec_tex(struct tgsi_exec_machine *mach,
2247 const struct tgsi_full_instruction *inst,
2248 uint modifier, uint sampler)
2249 {
2250 const union tgsi_exec_channel *args[5], *proj = NULL;
2251 union tgsi_exec_channel r[5];
2252 enum tgsi_sampler_control control = TGSI_SAMPLER_LOD_NONE;
2253 uint chan;
2254 uint unit;
2255 int8_t offsets[3];
2256 int dim, shadow_ref, i;
2257
2258 unit = fetch_sampler_unit(mach, inst, sampler);
2259 /* always fetch all 3 offsets, overkill but keeps code simple */
2260 fetch_texel_offsets(mach, inst, offsets);
2261
2262 assert(modifier != TEX_MODIFIER_LEVEL_ZERO);
2263 assert(inst->Texture.Texture != TGSI_TEXTURE_BUFFER);
2264
2265 dim = tgsi_util_get_texture_coord_dim(inst->Texture.Texture);
2266 shadow_ref = tgsi_util_get_shadow_ref_src_index(inst->Texture.Texture);
2267
2268 assert(dim <= 4);
2269 if (shadow_ref >= 0)
2270 assert(shadow_ref >= dim && shadow_ref < (int)ARRAY_SIZE(args));
2271
2272 /* fetch modifier to the last argument */
2273 if (modifier != TEX_MODIFIER_NONE) {
2274 const int last = ARRAY_SIZE(args) - 1;
2275
2276 /* fetch modifier from src0.w or src1.x */
2277 if (sampler == 1) {
2278 assert(dim <= TGSI_CHAN_W && shadow_ref != TGSI_CHAN_W);
2279 FETCH(&r[last], 0, TGSI_CHAN_W);
2280 }
2281 else {
2282 FETCH(&r[last], 1, TGSI_CHAN_X);
2283 }
2284
2285 if (modifier != TEX_MODIFIER_PROJECTED) {
2286 args[last] = &r[last];
2287 }
2288 else {
2289 proj = &r[last];
2290 args[last] = &ZeroVec;
2291 }
2292
2293 /* point unused arguments to zero vector */
2294 for (i = dim; i < last; i++)
2295 args[i] = &ZeroVec;
2296
2297 if (modifier == TEX_MODIFIER_EXPLICIT_LOD)
2298 control = TGSI_SAMPLER_LOD_EXPLICIT;
2299 else if (modifier == TEX_MODIFIER_LOD_BIAS)
2300 control = TGSI_SAMPLER_LOD_BIAS;
2301 else if (modifier == TEX_MODIFIER_GATHER)
2302 control = TGSI_SAMPLER_GATHER;
2303 }
2304 else {
2305 for (i = dim; i < (int)ARRAY_SIZE(args); i++)
2306 args[i] = &ZeroVec;
2307 }
2308
2309 /* fetch coordinates */
2310 for (i = 0; i < dim; i++) {
2311 FETCH(&r[i], 0, TGSI_CHAN_X + i);
2312
2313 if (proj)
2314 micro_div(&r[i], &r[i], proj);
2315
2316 args[i] = &r[i];
2317 }
2318
2319 /* fetch reference value */
2320 if (shadow_ref >= 0) {
2321 FETCH(&r[shadow_ref], shadow_ref / 4, TGSI_CHAN_X + (shadow_ref % 4));
2322
2323 if (proj)
2324 micro_div(&r[shadow_ref], &r[shadow_ref], proj);
2325
2326 args[shadow_ref] = &r[shadow_ref];
2327 }
2328
2329 fetch_texel(mach->Sampler, unit, unit,
2330 args[0], args[1], args[2], args[3], args[4],
2331 NULL, offsets, control,
2332 &r[0], &r[1], &r[2], &r[3]); /* R, G, B, A */
2333
2334 #if 0
2335 debug_printf("fetch r: %g %g %g %g\n",
2336 r[0].f[0], r[0].f[1], r[0].f[2], r[0].f[3]);
2337 debug_printf("fetch g: %g %g %g %g\n",
2338 r[1].f[0], r[1].f[1], r[1].f[2], r[1].f[3]);
2339 debug_printf("fetch b: %g %g %g %g\n",
2340 r[2].f[0], r[2].f[1], r[2].f[2], r[2].f[3]);
2341 debug_printf("fetch a: %g %g %g %g\n",
2342 r[3].f[0], r[3].f[1], r[3].f[2], r[3].f[3]);
2343 #endif
2344
2345 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2346 if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2347 store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2348 }
2349 }
2350 }
2351
2352 static void
2353 exec_lodq(struct tgsi_exec_machine *mach,
2354 const struct tgsi_full_instruction *inst)
2355 {
2356 uint resource_unit, sampler_unit;
2357 unsigned dim;
2358 unsigned i;
2359 union tgsi_exec_channel coords[4];
2360 const union tgsi_exec_channel *args[ARRAY_SIZE(coords)];
2361 union tgsi_exec_channel r[2];
2362
2363 resource_unit = fetch_sampler_unit(mach, inst, 1);
2364 if (inst->Instruction.Opcode == TGSI_OPCODE_LOD) {
2365 uint target = mach->SamplerViews[resource_unit].Resource;
2366 dim = tgsi_util_get_texture_coord_dim(target);
2367 sampler_unit = fetch_sampler_unit(mach, inst, 2);
2368 } else {
2369 dim = tgsi_util_get_texture_coord_dim(inst->Texture.Texture);
2370 sampler_unit = resource_unit;
2371 }
2372 assert(dim <= ARRAY_SIZE(coords));
2373 /* fetch coordinates */
2374 for (i = 0; i < dim; i++) {
2375 FETCH(&coords[i], 0, TGSI_CHAN_X + i);
2376 args[i] = &coords[i];
2377 }
2378 for (i = dim; i < ARRAY_SIZE(coords); i++) {
2379 args[i] = &ZeroVec;
2380 }
2381 mach->Sampler->query_lod(mach->Sampler, resource_unit, sampler_unit,
2382 args[0]->f,
2383 args[1]->f,
2384 args[2]->f,
2385 args[3]->f,
2386 TGSI_SAMPLER_LOD_NONE,
2387 r[0].f,
2388 r[1].f);
2389
2390 if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2391 store_dest(mach, &r[0], &inst->Dst[0], inst, TGSI_CHAN_X,
2392 TGSI_EXEC_DATA_FLOAT);
2393 }
2394 if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2395 store_dest(mach, &r[1], &inst->Dst[0], inst, TGSI_CHAN_Y,
2396 TGSI_EXEC_DATA_FLOAT);
2397 }
2398 if (inst->Instruction.Opcode == TGSI_OPCODE_LOD) {
2399 unsigned char swizzles[4];
2400 unsigned chan;
2401 swizzles[0] = inst->Src[1].Register.SwizzleX;
2402 swizzles[1] = inst->Src[1].Register.SwizzleY;
2403 swizzles[2] = inst->Src[1].Register.SwizzleZ;
2404 swizzles[3] = inst->Src[1].Register.SwizzleW;
2405
2406 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2407 if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2408 if (swizzles[chan] >= 2) {
2409 store_dest(mach, &ZeroVec,
2410 &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2411 } else {
2412 store_dest(mach, &r[swizzles[chan]],
2413 &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2414 }
2415 }
2416 }
2417 } else {
2418 if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2419 store_dest(mach, &r[0], &inst->Dst[0], inst, TGSI_CHAN_X,
2420 TGSI_EXEC_DATA_FLOAT);
2421 }
2422 if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2423 store_dest(mach, &r[1], &inst->Dst[0], inst, TGSI_CHAN_Y,
2424 TGSI_EXEC_DATA_FLOAT);
2425 }
2426 }
2427 }
2428
2429 static void
2430 exec_txd(struct tgsi_exec_machine *mach,
2431 const struct tgsi_full_instruction *inst)
2432 {
2433 union tgsi_exec_channel r[4];
2434 float derivs[3][2][TGSI_QUAD_SIZE];
2435 uint chan;
2436 uint unit;
2437 int8_t offsets[3];
2438
2439 unit = fetch_sampler_unit(mach, inst, 3);
2440 /* always fetch all 3 offsets, overkill but keeps code simple */
2441 fetch_texel_offsets(mach, inst, offsets);
2442
2443 switch (inst->Texture.Texture) {
2444 case TGSI_TEXTURE_1D:
2445 FETCH(&r[0], 0, TGSI_CHAN_X);
2446
2447 fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_X, derivs[0]);
2448
2449 fetch_texel(mach->Sampler, unit, unit,
2450 &r[0], &ZeroVec, &ZeroVec, &ZeroVec, &ZeroVec, /* S, T, P, C, LOD */
2451 derivs, offsets, TGSI_SAMPLER_DERIVS_EXPLICIT,
2452 &r[0], &r[1], &r[2], &r[3]); /* R, G, B, A */
2453 break;
2454
2455 case TGSI_TEXTURE_SHADOW1D:
2456 case TGSI_TEXTURE_1D_ARRAY:
2457 case TGSI_TEXTURE_SHADOW1D_ARRAY:
2458 /* SHADOW1D/1D_ARRAY would not need Y/Z respectively, but don't bother */
2459 FETCH(&r[0], 0, TGSI_CHAN_X);
2460 FETCH(&r[1], 0, TGSI_CHAN_Y);
2461 FETCH(&r[2], 0, TGSI_CHAN_Z);
2462
2463 fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_X, derivs[0]);
2464
2465 fetch_texel(mach->Sampler, unit, unit,
2466 &r[0], &r[1], &r[2], &ZeroVec, &ZeroVec, /* S, T, P, C, LOD */
2467 derivs, offsets, TGSI_SAMPLER_DERIVS_EXPLICIT,
2468 &r[0], &r[1], &r[2], &r[3]); /* R, G, B, A */
2469 break;
2470
2471 case TGSI_TEXTURE_2D:
2472 case TGSI_TEXTURE_RECT:
2473 FETCH(&r[0], 0, TGSI_CHAN_X);
2474 FETCH(&r[1], 0, TGSI_CHAN_Y);
2475
2476 fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_X, derivs[0]);
2477 fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_Y, derivs[1]);
2478
2479 fetch_texel(mach->Sampler, unit, unit,
2480 &r[0], &r[1], &ZeroVec, &ZeroVec, &ZeroVec, /* S, T, P, C, LOD */
2481 derivs, offsets, TGSI_SAMPLER_DERIVS_EXPLICIT,
2482 &r[0], &r[1], &r[2], &r[3]); /* R, G, B, A */
2483 break;
2484
2485
2486 case TGSI_TEXTURE_SHADOW2D:
2487 case TGSI_TEXTURE_SHADOWRECT:
2488 case TGSI_TEXTURE_2D_ARRAY:
2489 case TGSI_TEXTURE_SHADOW2D_ARRAY:
2490 /* only SHADOW2D_ARRAY actually needs W */
2491 FETCH(&r[0], 0, TGSI_CHAN_X);
2492 FETCH(&r[1], 0, TGSI_CHAN_Y);
2493 FETCH(&r[2], 0, TGSI_CHAN_Z);
2494 FETCH(&r[3], 0, TGSI_CHAN_W);
2495
2496 fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_X, derivs[0]);
2497 fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_Y, derivs[1]);
2498
2499 fetch_texel(mach->Sampler, unit, unit,
2500 &r[0], &r[1], &r[2], &r[3], &ZeroVec, /* inputs */
2501 derivs, offsets, TGSI_SAMPLER_DERIVS_EXPLICIT,
2502 &r[0], &r[1], &r[2], &r[3]); /* outputs */
2503 break;
2504
2505 case TGSI_TEXTURE_3D:
2506 case TGSI_TEXTURE_CUBE:
2507 case TGSI_TEXTURE_CUBE_ARRAY:
2508 case TGSI_TEXTURE_SHADOWCUBE:
2509 /* only TEXTURE_CUBE_ARRAY and TEXTURE_SHADOWCUBE actually need W */
2510 FETCH(&r[0], 0, TGSI_CHAN_X);
2511 FETCH(&r[1], 0, TGSI_CHAN_Y);
2512 FETCH(&r[2], 0, TGSI_CHAN_Z);
2513 FETCH(&r[3], 0, TGSI_CHAN_W);
2514
2515 fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_X, derivs[0]);
2516 fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_Y, derivs[1]);
2517 fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_Z, derivs[2]);
2518
2519 fetch_texel(mach->Sampler, unit, unit,
2520 &r[0], &r[1], &r[2], &r[3], &ZeroVec, /* inputs */
2521 derivs, offsets, TGSI_SAMPLER_DERIVS_EXPLICIT,
2522 &r[0], &r[1], &r[2], &r[3]); /* outputs */
2523 break;
2524
2525 default:
2526 assert(0);
2527 }
2528
2529 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2530 if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2531 store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2532 }
2533 }
2534 }
2535
2536
2537 static void
2538 exec_txf(struct tgsi_exec_machine *mach,
2539 const struct tgsi_full_instruction *inst)
2540 {
2541 union tgsi_exec_channel r[4];
2542 uint chan;
2543 uint unit;
2544 float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
2545 int j;
2546 int8_t offsets[3];
2547 unsigned target;
2548
2549 unit = fetch_sampler_unit(mach, inst, 1);
2550 /* always fetch all 3 offsets, overkill but keeps code simple */
2551 fetch_texel_offsets(mach, inst, offsets);
2552
2553 IFETCH(&r[3], 0, TGSI_CHAN_W);
2554
2555 if (inst->Instruction.Opcode == TGSI_OPCODE_SAMPLE_I ||
2556 inst->Instruction.Opcode == TGSI_OPCODE_SAMPLE_I_MS) {
2557 target = mach->SamplerViews[unit].Resource;
2558 }
2559 else {
2560 target = inst->Texture.Texture;
2561 }
2562 switch(target) {
2563 case TGSI_TEXTURE_3D:
2564 case TGSI_TEXTURE_2D_ARRAY:
2565 case TGSI_TEXTURE_SHADOW2D_ARRAY:
2566 case TGSI_TEXTURE_2D_ARRAY_MSAA:
2567 IFETCH(&r[2], 0, TGSI_CHAN_Z);
2568 /* fallthrough */
2569 case TGSI_TEXTURE_2D:
2570 case TGSI_TEXTURE_RECT:
2571 case TGSI_TEXTURE_SHADOW1D_ARRAY:
2572 case TGSI_TEXTURE_SHADOW2D:
2573 case TGSI_TEXTURE_SHADOWRECT:
2574 case TGSI_TEXTURE_1D_ARRAY:
2575 case TGSI_TEXTURE_2D_MSAA:
2576 IFETCH(&r[1], 0, TGSI_CHAN_Y);
2577 /* fallthrough */
2578 case TGSI_TEXTURE_BUFFER:
2579 case TGSI_TEXTURE_1D:
2580 case TGSI_TEXTURE_SHADOW1D:
2581 IFETCH(&r[0], 0, TGSI_CHAN_X);
2582 break;
2583 default:
2584 assert(0);
2585 break;
2586 }
2587
2588 mach->Sampler->get_texel(mach->Sampler, unit, r[0].i, r[1].i, r[2].i, r[3].i,
2589 offsets, rgba);
2590
2591 for (j = 0; j < TGSI_QUAD_SIZE; j++) {
2592 r[0].f[j] = rgba[0][j];
2593 r[1].f[j] = rgba[1][j];
2594 r[2].f[j] = rgba[2][j];
2595 r[3].f[j] = rgba[3][j];
2596 }
2597
2598 if (inst->Instruction.Opcode == TGSI_OPCODE_SAMPLE_I ||
2599 inst->Instruction.Opcode == TGSI_OPCODE_SAMPLE_I_MS) {
2600 unsigned char swizzles[4];
2601 swizzles[0] = inst->Src[1].Register.SwizzleX;
2602 swizzles[1] = inst->Src[1].Register.SwizzleY;
2603 swizzles[2] = inst->Src[1].Register.SwizzleZ;
2604 swizzles[3] = inst->Src[1].Register.SwizzleW;
2605
2606 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2607 if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2608 store_dest(mach, &r[swizzles[chan]],
2609 &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2610 }
2611 }
2612 }
2613 else {
2614 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2615 if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2616 store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2617 }
2618 }
2619 }
2620 }
2621
2622 static void
2623 exec_txq(struct tgsi_exec_machine *mach,
2624 const struct tgsi_full_instruction *inst)
2625 {
2626 int result[4];
2627 union tgsi_exec_channel r[4], src;
2628 uint chan;
2629 uint unit;
2630 int i,j;
2631
2632 unit = fetch_sampler_unit(mach, inst, 1);
2633
2634 fetch_source(mach, &src, &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_INT);
2635
2636 /* XXX: This interface can't return per-pixel values */
2637 mach->Sampler->get_dims(mach->Sampler, unit, src.i[0], result);
2638
2639 for (i = 0; i < TGSI_QUAD_SIZE; i++) {
2640 for (j = 0; j < 4; j++) {
2641 r[j].i[i] = result[j];
2642 }
2643 }
2644
2645 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2646 if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2647 store_dest(mach, &r[chan], &inst->Dst[0], inst, chan,
2648 TGSI_EXEC_DATA_INT);
2649 }
2650 }
2651 }
2652
2653 static void
2654 exec_sample(struct tgsi_exec_machine *mach,
2655 const struct tgsi_full_instruction *inst,
2656 uint modifier, boolean compare)
2657 {
2658 const uint resource_unit = inst->Src[1].Register.Index;
2659 const uint sampler_unit = inst->Src[2].Register.Index;
2660 union tgsi_exec_channel r[5], c1;
2661 const union tgsi_exec_channel *lod = &ZeroVec;
2662 enum tgsi_sampler_control control = TGSI_SAMPLER_LOD_NONE;
2663 uint chan;
2664 unsigned char swizzles[4];
2665 int8_t offsets[3];
2666
2667 /* always fetch all 3 offsets, overkill but keeps code simple */
2668 fetch_texel_offsets(mach, inst, offsets);
2669
2670 assert(modifier != TEX_MODIFIER_PROJECTED);
2671
2672 if (modifier != TEX_MODIFIER_NONE) {
2673 if (modifier == TEX_MODIFIER_LOD_BIAS) {
2674 FETCH(&c1, 3, TGSI_CHAN_X);
2675 lod = &c1;
2676 control = TGSI_SAMPLER_LOD_BIAS;
2677 }
2678 else if (modifier == TEX_MODIFIER_EXPLICIT_LOD) {
2679 FETCH(&c1, 3, TGSI_CHAN_X);
2680 lod = &c1;
2681 control = TGSI_SAMPLER_LOD_EXPLICIT;
2682 }
2683 else if (modifier == TEX_MODIFIER_GATHER) {
2684 control = TGSI_SAMPLER_GATHER;
2685 }
2686 else {
2687 assert(modifier == TEX_MODIFIER_LEVEL_ZERO);
2688 control = TGSI_SAMPLER_LOD_ZERO;
2689 }
2690 }
2691
2692 FETCH(&r[0], 0, TGSI_CHAN_X);
2693
2694 switch (mach->SamplerViews[resource_unit].Resource) {
2695 case TGSI_TEXTURE_1D:
2696 if (compare) {
2697 FETCH(&r[2], 3, TGSI_CHAN_X);
2698 fetch_texel(mach->Sampler, resource_unit, sampler_unit,
2699 &r[0], &ZeroVec, &r[2], &ZeroVec, lod, /* S, T, P, C, LOD */
2700 NULL, offsets, control,
2701 &r[0], &r[1], &r[2], &r[3]); /* R, G, B, A */
2702 }
2703 else {
2704 fetch_texel(mach->Sampler, resource_unit, sampler_unit,
2705 &r[0], &ZeroVec, &ZeroVec, &ZeroVec, lod, /* S, T, P, C, LOD */
2706 NULL, offsets, control,
2707 &r[0], &r[1], &r[2], &r[3]); /* R, G, B, A */
2708 }
2709 break;
2710
2711 case TGSI_TEXTURE_1D_ARRAY:
2712 case TGSI_TEXTURE_2D:
2713 case TGSI_TEXTURE_RECT:
2714 FETCH(&r[1], 0, TGSI_CHAN_Y);
2715 if (compare) {
2716 FETCH(&r[2], 3, TGSI_CHAN_X);
2717 fetch_texel(mach->Sampler, resource_unit, sampler_unit,
2718 &r[0], &r[1], &r[2], &ZeroVec, lod, /* S, T, P, C, LOD */
2719 NULL, offsets, control,
2720 &r[0], &r[1], &r[2], &r[3]); /* outputs */
2721 }
2722 else {
2723 fetch_texel(mach->Sampler, resource_unit, sampler_unit,
2724 &r[0], &r[1], &ZeroVec, &ZeroVec, lod, /* S, T, P, C, LOD */
2725 NULL, offsets, control,
2726 &r[0], &r[1], &r[2], &r[