Merge branch 'origin' into glsl-compiler-1
[mesa.git] / src / mesa / drivers / dri / r300 / r300_fragprog.c
1 /*
2 * Copyright (C) 2005 Ben Skeggs.
3 *
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining
7 * a copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sublicense, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial
16 * portions of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 */
27
28 /*
29 * Authors:
30 * Ben Skeggs <darktama@iinet.net.au>
31 * Jerome Glisse <j.glisse@gmail.com>
32 */
33
34 /*TODO'S
35 *
36 * - Depth write, WPOS/FOGC inputs
37 * - FogOption
38 * - Verify results of opcodes for accuracy, I've only checked them
39 * in specific cases.
40 * - and more...
41 */
42
43 #include "glheader.h"
44 #include "macros.h"
45 #include "enums.h"
46 #include "shader/prog_instruction.h"
47 #include "shader/prog_parameter.h"
48 #include "shader/prog_print.h"
49
50 #include "r300_context.h"
51 #include "r300_fragprog.h"
52 #include "r300_reg.h"
53
54 /*
55 * Useful macros and values
56 */
/* Print a diagnostic to stderr, prefixed with the current source file and
 * function name, and mark the program being compiled as failed.
 *
 * Expects a variable named `rp` (with an `error` member) to be in scope
 * at the point of use.  A newline is appended to the message.
 */
#define ERROR(fmt, ...)							\
	do {								\
		fprintf(stderr, "%s::%s(): " fmt "\n",			\
			__FILE__, __func__, ##__VA_ARGS__);		\
		rp->error = GL_TRUE;					\
	} while (0)

/* All-ones marker; used in this file as a table terminator and as the
 * "no such opcode" value.
 */
#define PFS_INVAL 0xFFFFFFFF

/* Declares `cs` as a local shortcut for the compile state attached to `rp`. */
#define COMPILE_STATE struct r300_pfs_compile_state *cs = rp->cs
65
/* Vector swizzle identifiers.  The values match the row order of the
 * v_swiz[] table defined later in this file.
 */
enum {
	SWIZZLE_XYZ = 0,
	SWIZZLE_XXX = 1,
	SWIZZLE_YYY = 2,
	SWIZZLE_ZZZ = 3,
	SWIZZLE_WWW = 4,
	SWIZZLE_YZX = 5,
	SWIZZLE_ZXY = 6,
	SWIZZLE_WZY = 7,
	SWIZZLE_111 = 8,
	SWIZZLE_000 = 9,
	SWIZZLE_HHH = 10
};

/* Swizzle register `r` by four component selectors without negation.
 * Each selector is pasted onto SWIZZLE_ (e.g. X, Y, Z, W, ONE, ZERO) and
 * packed three bits per component before being handed to do_swizzle().
 * Requires `rp` to be in scope.
 */
#define swizzle(r, x, y, z, w)					\
	do_swizzle(rp, r,					\
		   ((SWIZZLE_##x << 0) |			\
		    (SWIZZLE_##y << 3) |			\
		    (SWIZZLE_##z << 6) |			\
		    (SWIZZLE_##w << 9)),			\
		   0)
84
/*
 * Packed register descriptor.
 *
 * A register reference is carried around as a single GLuint with the
 * following bit layout:
 *
 *   bits  0-1   register type (REG_TYPE_*)
 *   bits  2-7   register index
 *   bits  8-12  vector swizzle (index into v_swiz[])
 *   bits 13-17  scalar swizzle (index into s_swiz[])
 *   bit  18     negate vector part
 *   bit  19     negate scalar part
 *   bit  20     absolute value
 *   bit  21     "no use": reading does not consume a reference
 *   bit  22     descriptor is valid
 *
 * All macro arguments are parenthesised so that compound expressions
 * (e.g. `a | b`) can be passed safely.
 */
#define REG_TYPE_INPUT		0
#define REG_TYPE_OUTPUT		1
#define REG_TYPE_TEMP		2
#define REG_TYPE_CONST		3

#define REG_TYPE_SHIFT		0
#define REG_INDEX_SHIFT		2
#define REG_VSWZ_SHIFT		8
#define REG_SSWZ_SHIFT		13
#define REG_NEGV_SHIFT		18
#define REG_NEGS_SHIFT		19
#define REG_ABS_SHIFT		20
#define REG_NO_USE_SHIFT	21
#define REG_VALID_SHIFT		22

#define REG_TYPE_MASK		(0x03 << REG_TYPE_SHIFT)
#define REG_INDEX_MASK		(0x3F << REG_INDEX_SHIFT)
#define REG_VSWZ_MASK		(0x1F << REG_VSWZ_SHIFT)
#define REG_SSWZ_MASK		(0x1F << REG_SSWZ_SHIFT)
#define REG_NEGV_MASK		(0x01 << REG_NEGV_SHIFT)
#define REG_NEGS_MASK		(0x01 << REG_NEGS_SHIFT)
#define REG_ABS_MASK		(0x01 << REG_ABS_SHIFT)
#define REG_NO_USE_MASK		(0x01 << REG_NO_USE_SHIFT)
#define REG_VALID_MASK		(0x01 << REG_VALID_SHIFT)

/* Build a descriptor from its fields (negate/abs bits start cleared). */
#define REG(type, index, vswz, sswz, nouse, valid)			\
	((((type)  << REG_TYPE_SHIFT)   & REG_TYPE_MASK)   |		\
	 (((index) << REG_INDEX_SHIFT)  & REG_INDEX_MASK)  |		\
	 (((nouse) << REG_NO_USE_SHIFT) & REG_NO_USE_MASK) |		\
	 (((valid) << REG_VALID_SHIFT)  & REG_VALID_MASK)  |		\
	 (((vswz)  << REG_VSWZ_SHIFT)   & REG_VSWZ_MASK)   |		\
	 (((sswz)  << REG_SSWZ_SHIFT)   & REG_SSWZ_MASK))

/* Field extractors. */
#define REG_GET_TYPE(reg)						\
	(((reg) & REG_TYPE_MASK) >> REG_TYPE_SHIFT)
#define REG_GET_INDEX(reg)						\
	(((reg) & REG_INDEX_MASK) >> REG_INDEX_SHIFT)
#define REG_GET_VSWZ(reg)						\
	(((reg) & REG_VSWZ_MASK) >> REG_VSWZ_SHIFT)
#define REG_GET_SSWZ(reg)						\
	(((reg) & REG_SSWZ_MASK) >> REG_SSWZ_SHIFT)
#define REG_GET_NO_USE(reg)						\
	(((reg) & REG_NO_USE_MASK) >> REG_NO_USE_SHIFT)
#define REG_GET_VALID(reg)						\
	(((reg) & REG_VALID_MASK) >> REG_VALID_SHIFT)

/* Field setters: `reg` must be an lvalue and is modified in place. */
#define REG_SET_TYPE(reg, type)						\
	((reg) = (((reg) & ~REG_TYPE_MASK) |				\
		  (((type) << REG_TYPE_SHIFT) & REG_TYPE_MASK)))
#define REG_SET_INDEX(reg, index)					\
	((reg) = (((reg) & ~REG_INDEX_MASK) |				\
		  (((index) << REG_INDEX_SHIFT) & REG_INDEX_MASK)))
#define REG_SET_VSWZ(reg, vswz)						\
	((reg) = (((reg) & ~REG_VSWZ_MASK) |				\
		  (((vswz) << REG_VSWZ_SHIFT) & REG_VSWZ_MASK)))
#define REG_SET_SSWZ(reg, sswz)						\
	((reg) = (((reg) & ~REG_SSWZ_MASK) |				\
		  (((sswz) << REG_SSWZ_SHIFT) & REG_SSWZ_MASK)))
#define REG_SET_NO_USE(reg, nouse)					\
	((reg) = (((reg) & ~REG_NO_USE_MASK) |				\
		  (((nouse) << REG_NO_USE_SHIFT) & REG_NO_USE_MASK)))
#define REG_SET_VALID(reg, valid)					\
	((reg) = (((reg) & ~REG_VALID_MASK) |				\
		  (((valid) << REG_VALID_SHIFT) & REG_VALID_MASK)))

/* Modifier setters: `reg` must be an lvalue; the bit is only ever set. */
#define REG_ABS(reg)							\
	((reg) = ((reg) | REG_ABS_MASK))
#define REG_NEGV(reg)							\
	((reg) = ((reg) | REG_NEGV_MASK))
#define REG_NEGS(reg)							\
	((reg) = ((reg) | REG_NEGS_MASK))
153
154
155 /*
156 * Data structures for fragment program generation
157 */
158
159 /* description of r300 native hw instructions */
160 static const struct {
161 const char *name;
162 int argc;
163 int v_op;
164 int s_op;
165 } r300_fpop[] = {
166 { "MAD", 3, R300_FPI0_OUTC_MAD, R300_FPI2_OUTA_MAD },
167 { "DP3", 2, R300_FPI0_OUTC_DP3, R300_FPI2_OUTA_DP4 },
168 { "DP4", 2, R300_FPI0_OUTC_DP4, R300_FPI2_OUTA_DP4 },
169 { "MIN", 2, R300_FPI0_OUTC_MIN, R300_FPI2_OUTA_MIN },
170 { "MAX", 2, R300_FPI0_OUTC_MAX, R300_FPI2_OUTA_MAX },
171 { "CMP", 3, R300_FPI0_OUTC_CMP, R300_FPI2_OUTA_CMP },
172 { "FRC", 1, R300_FPI0_OUTC_FRC, R300_FPI2_OUTA_FRC },
173 { "EX2", 1, R300_FPI0_OUTC_REPL_ALPHA, R300_FPI2_OUTA_EX2 },
174 { "LG2", 1, R300_FPI0_OUTC_REPL_ALPHA, R300_FPI2_OUTA_LG2 },
175 { "RCP", 1, R300_FPI0_OUTC_REPL_ALPHA, R300_FPI2_OUTA_RCP },
176 { "RSQ", 1, R300_FPI0_OUTC_REPL_ALPHA, R300_FPI2_OUTA_RSQ },
177 { "REPL_ALPHA", 1, R300_FPI0_OUTC_REPL_ALPHA, PFS_INVAL },
178 { "CMPH", 3, R300_FPI0_OUTC_CMPH, PFS_INVAL },
179 };
180
181
182 /* vector swizzles r300 can support natively, with a couple of
183 * cases we handle specially
184 *
185 * REG_VSWZ/REG_SSWZ is an index into this table
186 */
187 #define SLOT_VECTOR (1<<0)
188 #define SLOT_SCALAR (1<<3)
189 #define SLOT_BOTH (SLOT_VECTOR | SLOT_SCALAR)
190
191 /* mapping from SWIZZLE_* to r300 native values for scalar insns */
192 #define SWIZZLE_HALF 6
193
194 #define MAKE_SWZ3(x, y, z) (MAKE_SWIZZLE4(SWIZZLE_##x, \
195 SWIZZLE_##y, \
196 SWIZZLE_##z, \
197 SWIZZLE_ZERO))
198 static const struct r300_pfs_swizzle {
199 GLuint hash; /* swizzle value this matches */
200 GLuint base; /* base value for hw swizzle */
201 GLuint stride; /* difference in base between arg0/1/2 */
202 GLuint flags;
203 } v_swiz[] = {
204 /* native swizzles */
205 { MAKE_SWZ3(X, Y, Z), R300_FPI0_ARGC_SRC0C_XYZ, 4, SLOT_VECTOR },
206 { MAKE_SWZ3(X, X, X), R300_FPI0_ARGC_SRC0C_XXX, 4, SLOT_VECTOR },
207 { MAKE_SWZ3(Y, Y, Y), R300_FPI0_ARGC_SRC0C_YYY, 4, SLOT_VECTOR },
208 { MAKE_SWZ3(Z, Z, Z), R300_FPI0_ARGC_SRC0C_ZZZ, 4, SLOT_VECTOR },
209 { MAKE_SWZ3(W, W, W), R300_FPI0_ARGC_SRC0A, 1, SLOT_SCALAR },
210 { MAKE_SWZ3(Y, Z, X), R300_FPI0_ARGC_SRC0C_YZX, 1, SLOT_VECTOR },
211 { MAKE_SWZ3(Z, X, Y), R300_FPI0_ARGC_SRC0C_ZXY, 1, SLOT_VECTOR },
212 { MAKE_SWZ3(W, Z, Y), R300_FPI0_ARGC_SRC0CA_WZY, 1, SLOT_BOTH },
213 { MAKE_SWZ3(ONE, ONE, ONE), R300_FPI0_ARGC_ONE, 0, 0},
214 { MAKE_SWZ3(ZERO, ZERO, ZERO), R300_FPI0_ARGC_ZERO, 0, 0},
215 { MAKE_SWZ3(HALF, HALF, HALF), R300_FPI0_ARGC_HALF, 0, 0},
216 { PFS_INVAL, 0, 0, 0},
217 };
218
219 /* used during matching of non-native swizzles */
220 #define SWZ_X_MASK (7 << 0)
221 #define SWZ_Y_MASK (7 << 3)
222 #define SWZ_Z_MASK (7 << 6)
223 #define SWZ_W_MASK (7 << 9)
224 static const struct {
225 GLuint hash; /* used to mask matching swizzle components */
226 int mask; /* actual outmask */
227 int count; /* count of components matched */
228 } s_mask[] = {
229 { SWZ_X_MASK|SWZ_Y_MASK|SWZ_Z_MASK, 1|2|4, 3},
230 { SWZ_X_MASK|SWZ_Y_MASK, 1|2, 2},
231 { SWZ_X_MASK|SWZ_Z_MASK, 1|4, 2},
232 { SWZ_Y_MASK|SWZ_Z_MASK, 2|4, 2},
233 { SWZ_X_MASK, 1, 1},
234 { SWZ_Y_MASK, 2, 1},
235 { SWZ_Z_MASK, 4, 1},
236 { PFS_INVAL, PFS_INVAL, PFS_INVAL}
237 };
238
239 static const struct {
240 int base; /* hw value of swizzle */
241 int stride; /* difference between SRC0/1/2 */
242 GLuint flags;
243 } s_swiz[] = {
244 { R300_FPI2_ARGA_SRC0C_X, 3, SLOT_VECTOR },
245 { R300_FPI2_ARGA_SRC0C_Y, 3, SLOT_VECTOR },
246 { R300_FPI2_ARGA_SRC0C_Z, 3, SLOT_VECTOR },
247 { R300_FPI2_ARGA_SRC0A , 1, SLOT_SCALAR },
248 { R300_FPI2_ARGA_ZERO , 0, 0 },
249 { R300_FPI2_ARGA_ONE , 0, 0 },
250 { R300_FPI2_ARGA_HALF , 0, 0 }
251 };
252
253 /* boiler-plate reg, for convenience */
254 static const GLuint undef = REG(REG_TYPE_TEMP,
255 0,
256 SWIZZLE_XYZ,
257 SWIZZLE_W,
258 GL_FALSE,
259 GL_FALSE);
260
261 /* constant one source */
262 static const GLuint pfs_one = REG(REG_TYPE_CONST,
263 0,
264 SWIZZLE_111,
265 SWIZZLE_ONE,
266 GL_FALSE,
267 GL_TRUE);
268
269 /* constant half source */
270 static const GLuint pfs_half = REG(REG_TYPE_CONST,
271 0,
272 SWIZZLE_HHH,
273 SWIZZLE_HALF,
274 GL_FALSE,
275 GL_TRUE);
276
277 /* constant zero source */
278 static const GLuint pfs_zero = REG(REG_TYPE_CONST,
279 0,
280 SWIZZLE_000,
281 SWIZZLE_ZERO,
282 GL_FALSE,
283 GL_TRUE);
284
285 /*
286 * Common function prototypes
287 */
288 static void dump_program(struct r300_fragment_program *rp);
289 static void emit_arith(struct r300_fragment_program *rp, int op,
290 GLuint dest, int mask,
291 GLuint src0, GLuint src1, GLuint src2,
292 int flags);
293
294 /*
295 * Helper functions
296 */
297 static int get_hw_temp(struct r300_fragment_program *rp)
298 {
299 COMPILE_STATE;
300 int r = ffs(~cs->hwreg_in_use);
301 if (!r) {
302 ERROR("Out of hardware temps\n");
303 return 0;
304 }
305
306 cs->hwreg_in_use |= (1 << --r);
307 if (r > rp->max_temp_idx)
308 rp->max_temp_idx = r;
309
310 return r;
311 }
312
313 static int get_hw_temp_tex(struct r300_fragment_program *rp)
314 {
315 COMPILE_STATE;
316 int r;
317
318 r = ffs(~(cs->hwreg_in_use | cs->used_in_node));
319 if (!r)
320 return get_hw_temp(rp); /* Will cause an indirection */
321
322 cs->hwreg_in_use |= (1 << --r);
323 if (r > rp->max_temp_idx)
324 rp->max_temp_idx = r;
325
326 return r;
327 }
328
329 static void free_hw_temp(struct r300_fragment_program *rp, int idx)
330 {
331 COMPILE_STATE;
332 cs->hwreg_in_use &= ~(1<<idx);
333 }
334
335 static GLuint get_temp_reg(struct r300_fragment_program *rp)
336 {
337 COMPILE_STATE;
338 GLuint r = undef;
339 GLuint index;
340
341 index = ffs(~cs->temp_in_use);
342 if (!index) {
343 ERROR("Out of program temps\n");
344 return r;
345 }
346
347 cs->temp_in_use |= (1 << --index);
348 cs->temps[index].refcount = 0xFFFFFFFF;
349 cs->temps[index].reg = -1;
350
351 REG_SET_TYPE(r, REG_TYPE_TEMP);
352 REG_SET_INDEX(r, index);
353 REG_SET_VALID(r, GL_TRUE);
354 return r;
355 }
356
357 static GLuint get_temp_reg_tex(struct r300_fragment_program *rp)
358 {
359 COMPILE_STATE;
360 GLuint r = undef;
361 GLuint index;
362
363 index = ffs(~cs->temp_in_use);
364 if (!index) {
365 ERROR("Out of program temps\n");
366 return r;
367 }
368
369 cs->temp_in_use |= (1 << --index);
370 cs->temps[index].refcount = 0xFFFFFFFF;
371 cs->temps[index].reg = get_hw_temp_tex(rp);
372
373 REG_SET_TYPE(r, REG_TYPE_TEMP);
374 REG_SET_INDEX(r, index);
375 REG_SET_VALID(r, GL_TRUE);
376 return r;
377 }
378
379 static void free_temp(struct r300_fragment_program *rp, GLuint r)
380 {
381 COMPILE_STATE;
382 GLuint index = REG_GET_INDEX(r);
383
384 if (!(cs->temp_in_use & (1 << index)))
385 return;
386
387 if (REG_GET_TYPE(r) == REG_TYPE_TEMP) {
388 free_hw_temp(rp, cs->temps[index].reg);
389 cs->temps[index].reg = -1;
390 cs->temp_in_use &= ~(1 << index);
391 } else if (REG_GET_TYPE(r) == REG_TYPE_INPUT) {
392 free_hw_temp(rp, cs->inputs[index].reg);
393 cs->inputs[index].reg = -1;
394 }
395 }
396
397 static GLuint emit_param4fv(struct r300_fragment_program *rp,
398 GLfloat *values)
399 {
400 GLuint r = undef;
401 GLuint index;
402 int pidx;
403
404 pidx = rp->param_nr++;
405 index = rp->const_nr++;
406 if (pidx >= PFS_NUM_CONST_REGS || index >= PFS_NUM_CONST_REGS) {
407 ERROR("Out of const/param slots!\n");
408 return r;
409 }
410
411 rp->param[pidx].idx = index;
412 rp->param[pidx].values = values;
413 rp->params_uptodate = GL_FALSE;
414
415 REG_SET_TYPE(r, REG_TYPE_CONST);
416 REG_SET_INDEX(r, index);
417 REG_SET_VALID(r, GL_TRUE);
418 return r;
419 }
420
421 static GLuint emit_const4fv(struct r300_fragment_program *rp, GLfloat *cp)
422 {
423 GLuint r = undef;
424 GLuint index;
425
426 index = rp->const_nr++;
427 if (index >= PFS_NUM_CONST_REGS) {
428 ERROR("Out of hw constants!\n");
429 return r;
430 }
431
432 COPY_4V(rp->constant[index], cp);
433
434 REG_SET_TYPE(r, REG_TYPE_CONST);
435 REG_SET_INDEX(r, index);
436 REG_SET_VALID(r, GL_TRUE);
437 return r;
438 }
439
440 static inline GLuint negate(GLuint r)
441 {
442 REG_NEGS(r);
443 REG_NEGV(r);
444 return r;
445 }
446
447 /* Hack, to prevent clobbering sources used multiple times when
448 * emulating non-native instructions
449 */
450 static inline GLuint keep(GLuint r)
451 {
452 REG_SET_NO_USE(r, GL_TRUE);
453 return r;
454 }
455
456 static inline GLuint absolute(GLuint r)
457 {
458 REG_ABS(r);
459 return r;
460 }
461
462 static int swz_native(struct r300_fragment_program *rp,
463 GLuint src,
464 GLuint *r,
465 GLuint arbneg)
466 {
467 /* Native swizzle, handle negation */
468 src = (src & ~REG_NEGS_MASK) |
469 (((arbneg >> 3) & 1) << REG_NEGS_SHIFT);
470
471 if ((arbneg & 0x7) == 0x0) {
472 src = src & ~REG_NEGV_MASK;
473 *r = src;
474 } else if ((arbneg & 0x7) == 0x7) {
475 src |= REG_NEGV_MASK;
476 *r = src;
477 } else {
478 if (!REG_GET_VALID(*r))
479 *r = get_temp_reg(rp);
480 src |= REG_NEGV_MASK;
481 emit_arith(rp,
482 PFS_OP_MAD,
483 *r,
484 arbneg & 0x7,
485 keep(src),
486 pfs_one,
487 pfs_zero,
488 0);
489 src = src & ~REG_NEGV_MASK;
490 emit_arith(rp,
491 PFS_OP_MAD,
492 *r,
493 (arbneg ^ 0x7) | WRITEMASK_W,
494 src,
495 pfs_one,
496 pfs_zero,
497 0);
498 }
499
500 return 3;
501 }
502
503 static int swz_emit_partial(struct r300_fragment_program *rp,
504 GLuint src,
505 GLuint *r,
506 int mask,
507 int mc,
508 GLuint arbneg)
509 {
510 GLuint tmp;
511 GLuint wmask = 0;
512
513 if (!REG_GET_VALID(*r))
514 *r = get_temp_reg(rp);
515
516 /* A partial match, VSWZ/mask define what parts of the
517 * desired swizzle we match
518 */
519 if (mc + s_mask[mask].count == 3) {
520 wmask = WRITEMASK_W;
521 src |= ((arbneg >> 3) & 1) << REG_NEGS_SHIFT;
522 }
523
524 tmp = arbneg & s_mask[mask].mask;
525 if (tmp) {
526 tmp = tmp ^ s_mask[mask].mask;
527 if (tmp) {
528 emit_arith(rp,
529 PFS_OP_MAD,
530 *r,
531 arbneg & s_mask[mask].mask,
532 keep(src) | REG_NEGV_MASK,
533 pfs_one,
534 pfs_zero,
535 0);
536 if (!wmask) {
537 REG_SET_NO_USE(src, GL_TRUE);
538 } else {
539 REG_SET_NO_USE(src, GL_FALSE);
540 }
541 emit_arith(rp,
542 PFS_OP_MAD,
543 *r,
544 tmp | wmask,
545 src,
546 pfs_one,
547 pfs_zero,
548 0);
549 } else {
550 if (!wmask) {
551 REG_SET_NO_USE(src, GL_TRUE);
552 } else {
553 REG_SET_NO_USE(src, GL_FALSE);
554 }
555 emit_arith(rp,
556 PFS_OP_MAD,
557 *r,
558 (arbneg & s_mask[mask].mask) | wmask,
559 src | REG_NEGV_MASK,
560 pfs_one,
561 pfs_zero,
562 0);
563 }
564 } else {
565 if (!wmask) {
566 REG_SET_NO_USE(src, GL_TRUE);
567 } else {
568 REG_SET_NO_USE(src, GL_FALSE);
569 }
570 emit_arith(rp, PFS_OP_MAD,
571 *r,
572 s_mask[mask].mask | wmask,
573 src,
574 pfs_one,
575 pfs_zero,
576 0);
577 }
578
579 return s_mask[mask].count;
580 }
581
582 static GLuint do_swizzle(struct r300_fragment_program *rp,
583 GLuint src,
584 GLuint arbswz,
585 GLuint arbneg)
586 {
587 GLuint r = undef;
588 GLuint vswz;
589 int c_mask = 0;
590 int v_match = 0;
591
592 /* If swizzling from something without an XYZW native swizzle,
593 * emit result to a temp, and do new swizzle from the temp.
594 */
595 #if 0
596 if (REG_GET_VSWZ(src) != SWIZZLE_XYZ ||
597 REG_GET_SSWZ(src) != SWIZZLE_W) {
598 GLuint temp = get_temp_reg(rp);
599 emit_arith(rp,
600 PFS_OP_MAD,
601 temp,
602 WRITEMASK_XYZW,
603 src,
604 pfs_one,
605 pfs_zero,
606 0);
607 src = temp;
608 }
609 #endif
610
611 if (REG_GET_VSWZ(src) != SWIZZLE_XYZ ||
612 REG_GET_SSWZ(src) != SWIZZLE_W) {
613 GLuint vsrcswz = (v_swiz[REG_GET_VSWZ(src)].hash & (SWZ_X_MASK|SWZ_Y_MASK|SWZ_Z_MASK)) | REG_GET_SSWZ(src) << 9;
614 GLint i;
615
616 GLuint newswz = 0;
617 GLuint offset;
618 for(i=0; i < 4; ++i){
619 offset = GET_SWZ(arbswz, i);
620
621 newswz |= (offset <= 3)?GET_SWZ(vsrcswz, offset) << i*3:offset << i*3;
622 }
623
624 arbswz = newswz & (SWZ_X_MASK|SWZ_Y_MASK|SWZ_Z_MASK);
625 REG_SET_SSWZ(src, GET_SWZ(newswz, 3));
626 }
627 else
628 {
629 /* set scalar swizzling */
630 REG_SET_SSWZ(src, GET_SWZ(arbswz, 3));
631
632 }
633 do {
634 vswz = REG_GET_VSWZ(src);
635 do {
636 int chash;
637
638 REG_SET_VSWZ(src, vswz);
639 chash = v_swiz[REG_GET_VSWZ(src)].hash &
640 s_mask[c_mask].hash;
641
642 if (chash == (arbswz & s_mask[c_mask].hash)) {
643 if (s_mask[c_mask].count == 3) {
644 v_match += swz_native(rp,
645 src,
646 &r,
647 arbneg);
648 } else {
649 v_match += swz_emit_partial(rp,
650 src,
651 &r,
652 c_mask,
653 v_match,
654 arbneg);
655 }
656
657 if (v_match == 3)
658 return r;
659
660 /* Fill with something invalid.. all 0's was
661 * wrong before, matched SWIZZLE_X. So all
662 * 1's will be okay for now
663 */
664 arbswz |= (PFS_INVAL & s_mask[c_mask].hash);
665 }
666 } while(v_swiz[++vswz].hash != PFS_INVAL);
667 REG_SET_VSWZ(src, SWIZZLE_XYZ);
668 } while (s_mask[++c_mask].hash != PFS_INVAL);
669
670 ERROR("should NEVER get here\n");
671 return r;
672 }
673
674 static GLuint t_src(struct r300_fragment_program *rp,
675 struct prog_src_register fpsrc)
676 {
677 GLuint r = undef;
678
679 switch (fpsrc.File) {
680 case PROGRAM_TEMPORARY:
681 REG_SET_INDEX(r, fpsrc.Index);
682 REG_SET_VALID(r, GL_TRUE);
683 REG_SET_TYPE(r, REG_TYPE_TEMP);
684 break;
685 case PROGRAM_INPUT:
686 REG_SET_INDEX(r, fpsrc.Index);
687 REG_SET_VALID(r, GL_TRUE);
688 REG_SET_TYPE(r, REG_TYPE_INPUT);
689 break;
690 case PROGRAM_LOCAL_PARAM:
691 r = emit_param4fv(rp,
692 rp->mesa_program.Base.LocalParams[fpsrc.Index]);
693 break;
694 case PROGRAM_ENV_PARAM:
695 r = emit_param4fv(rp,
696 rp->ctx->FragmentProgram.Parameters[fpsrc.Index]);
697 break;
698 case PROGRAM_STATE_VAR:
699 case PROGRAM_NAMED_PARAM:
700 r = emit_param4fv(rp,
701 rp->mesa_program.Base.Parameters->ParameterValues[fpsrc.Index]);
702 break;
703 default:
704 ERROR("unknown SrcReg->File %x\n", fpsrc.File);
705 return r;
706 }
707
708 /* no point swizzling ONE/ZERO/HALF constants... */
709 if (REG_GET_VSWZ(r) < SWIZZLE_111 || REG_GET_SSWZ(r) < SWIZZLE_ZERO)
710 r = do_swizzle(rp, r, fpsrc.Swizzle, fpsrc.NegateBase);
711 return r;
712 }
713
714 static GLuint t_scalar_src(struct r300_fragment_program *rp,
715 struct prog_src_register fpsrc)
716 {
717 struct prog_src_register src = fpsrc;
718 int sc = GET_SWZ(fpsrc.Swizzle, 0); /* X */
719
720 src.Swizzle = ((sc<<0)|(sc<<3)|(sc<<6)|(sc<<9));
721
722 return t_src(rp, src);
723 }
724
725 static GLuint t_dst(struct r300_fragment_program *rp,
726 struct prog_dst_register dest)
727 {
728 GLuint r = undef;
729
730 switch (dest.File) {
731 case PROGRAM_TEMPORARY:
732 REG_SET_INDEX(r, dest.Index);
733 REG_SET_VALID(r, GL_TRUE);
734 REG_SET_TYPE(r, REG_TYPE_TEMP);
735 return r;
736 case PROGRAM_OUTPUT:
737 REG_SET_TYPE(r, REG_TYPE_OUTPUT);
738 switch (dest.Index) {
739 case FRAG_RESULT_COLR:
740 case FRAG_RESULT_DEPR:
741 REG_SET_INDEX(r, dest.Index);
742 REG_SET_VALID(r, GL_TRUE);
743 return r;
744 default:
745 ERROR("Bad DstReg->Index 0x%x\n", dest.Index);
746 return r;
747 }
748 default:
749 ERROR("Bad DstReg->File 0x%x\n", dest.File);
750 return r;
751 }
752 }
753
754 static int t_hw_src(struct r300_fragment_program *rp,
755 GLuint src,
756 GLboolean tex)
757 {
758 COMPILE_STATE;
759 int idx;
760 int index = REG_GET_INDEX(src);
761
762 switch(REG_GET_TYPE(src)) {
763 case REG_TYPE_TEMP:
764 /* NOTE: if reg==-1 here, a source is being read that
765 * hasn't been written to. Undefined results
766 */
767 if (cs->temps[index].reg == -1)
768 cs->temps[index].reg = get_hw_temp(rp);
769
770 idx = cs->temps[index].reg;
771
772 if (!REG_GET_NO_USE(src) &&
773 (--cs->temps[index].refcount == 0))
774 free_temp(rp, src);
775 break;
776 case REG_TYPE_INPUT:
777 idx = cs->inputs[index].reg;
778
779 if (!REG_GET_NO_USE(src) &&
780 (--cs->inputs[index].refcount == 0))
781 free_hw_temp(rp, cs->inputs[index].reg);
782 break;
783 case REG_TYPE_CONST:
784 return (index | SRC_CONST);
785 default:
786 ERROR("Invalid type for source reg\n");
787 return (0 | SRC_CONST);
788 }
789
790 if (!tex)
791 cs->used_in_node |= (1 << idx);
792
793 return idx;
794 }
795
796 static int t_hw_dst(struct r300_fragment_program *rp,
797 GLuint dest,
798 GLboolean tex)
799 {
800 COMPILE_STATE;
801 int idx;
802 GLuint index = REG_GET_INDEX(dest);
803 assert(REG_GET_VALID(dest));
804
805 switch(REG_GET_TYPE(dest)) {
806 case REG_TYPE_TEMP:
807 if (cs->temps[REG_GET_INDEX(dest)].reg == -1) {
808 if (!tex) {
809 cs->temps[index].reg = get_hw_temp(rp);
810 } else {
811 cs->temps[index].reg = get_hw_temp_tex(rp);
812 }
813 }
814 idx = cs->temps[index].reg;
815
816 if (!REG_GET_NO_USE(dest) &&
817 (--cs->temps[index].refcount == 0))
818 free_temp(rp, dest);
819
820 cs->dest_in_node |= (1 << idx);
821 cs->used_in_node |= (1 << idx);
822 break;
823 case REG_TYPE_OUTPUT:
824 switch(index) {
825 case FRAG_RESULT_COLR:
826 rp->node[rp->cur_node].flags |= R300_PFS_NODE_OUTPUT_COLOR;
827 break;
828 case FRAG_RESULT_DEPR:
829 rp->node[rp->cur_node].flags |= R300_PFS_NODE_OUTPUT_DEPTH;
830 break;
831 }
832 return index;
833 break;
834 default:
835 ERROR("invalid dest reg type %d\n", REG_GET_TYPE(dest));
836 return 0;
837 }
838
839 return idx;
840 }
841
842 static void emit_nop(struct r300_fragment_program *rp,
843 GLuint mask,
844 GLboolean sync)
845 {
846 COMPILE_STATE;
847
848 if (sync)
849 cs->v_pos = cs->s_pos = MAX2(cs->v_pos, cs->s_pos);
850
851 if (mask & WRITEMASK_XYZ) {
852 rp->alu.inst[cs->v_pos].inst0 = NOP_INST0;
853 rp->alu.inst[cs->v_pos].inst1 = NOP_INST1;
854 cs->v_pos++;
855 }
856
857 if (mask & WRITEMASK_W) {
858 rp->alu.inst[cs->s_pos].inst2 = NOP_INST2;
859 rp->alu.inst[cs->s_pos].inst3 = NOP_INST3;
860 cs->s_pos++;
861 }
862 }
863
864 static void emit_tex(struct r300_fragment_program *rp,
865 struct prog_instruction *fpi,
866 int opcode)
867 {
868 COMPILE_STATE;
869 GLuint coord = t_src(rp, fpi->SrcReg[0]);
870 GLuint dest = undef, rdest = undef;
871 GLuint din = cs->dest_in_node, uin = cs->used_in_node;
872 int unit = fpi->TexSrcUnit;
873 int hwsrc, hwdest;
874
875 /* Resolve source/dest to hardware registers */
876 hwsrc = t_hw_src(rp, coord, GL_TRUE);
877 if (opcode != R300_FPITX_OP_KIL) {
878 dest = t_dst(rp, fpi->DstReg);
879
880 /* r300 doesn't seem to be able to do TEX->output reg */
881 if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT) {
882 rdest = dest;
883 dest = get_temp_reg_tex(rp);
884 }
885 hwdest = t_hw_dst(rp, dest, GL_TRUE);
886
887 /* Use a temp that hasn't been used in this node, rather
888 * than causing an indirection
889 */
890 if (uin & (1 << hwdest)) {
891 free_hw_temp(rp, hwdest);
892 hwdest = get_hw_temp_tex(rp);
893 cs->temps[REG_GET_INDEX(dest)].reg = hwdest;
894 }
895 } else {
896 hwdest = 0;
897 unit = 0;
898 }
899
900 /* Indirection if source has been written in this node, or if the
901 * dest has been read/written in this node
902 */
903 if ((REG_GET_TYPE(coord) != REG_TYPE_CONST &&
904 (din & (1<<hwsrc))) || (uin & (1<<hwdest))) {
905
906 /* Finish off current node */
907 cs->v_pos = cs->s_pos = MAX2(cs->v_pos, cs->s_pos);
908 if (rp->node[rp->cur_node].alu_offset == cs->v_pos) {
909 /* No alu instructions in the node? Emit a NOP. */
910 emit_nop(rp, WRITEMASK_XYZW, GL_TRUE);
911 cs->v_pos = cs->s_pos = MAX2(cs->v_pos, cs->s_pos);
912 }
913
914 rp->node[rp->cur_node].alu_end =
915 cs->v_pos - rp->node[rp->cur_node].alu_offset - 1;
916 assert(rp->node[rp->cur_node].alu_end >= 0);
917
918 if (++rp->cur_node >= PFS_MAX_TEX_INDIRECT) {
919 ERROR("too many levels of texture indirection\n");
920 return;
921 }
922
923 /* Start new node */
924 rp->node[rp->cur_node].tex_offset = rp->tex.length;
925 rp->node[rp->cur_node].alu_offset = cs->v_pos;
926 rp->node[rp->cur_node].tex_end = -1;
927 rp->node[rp->cur_node].alu_end = -1;
928 rp->node[rp->cur_node].flags = 0;
929 cs->used_in_node = 0;
930 cs->dest_in_node = 0;
931 }
932
933 if (rp->cur_node == 0)
934 rp->first_node_has_tex = 1;
935
936 rp->tex.inst[rp->tex.length++] = 0
937 | (hwsrc << R300_FPITX_SRC_SHIFT)
938 | (hwdest << R300_FPITX_DST_SHIFT)
939 | (unit << R300_FPITX_IMAGE_SHIFT)
940 /* not entirely sure about this */
941 | (opcode << R300_FPITX_OPCODE_SHIFT);
942
943 cs->dest_in_node |= (1 << hwdest);
944 if (REG_GET_TYPE(coord) != REG_TYPE_CONST)
945 cs->used_in_node |= (1 << hwsrc);
946
947 rp->node[rp->cur_node].tex_end++;
948
949 /* Copy from temp to output if needed */
950 if (REG_GET_VALID(rdest)) {
951 emit_arith(rp, PFS_OP_MAD, rdest, WRITEMASK_XYZW, dest,
952 pfs_one, pfs_zero, 0);
953 free_temp(rp, dest);
954 }
955 }
956
957 /* Add sources to FPI1/FPI3 lists. If source is already on list,
958 * reuse the index instead of wasting a source.
959 */
960 static int add_src(struct r300_fragment_program *rp,
961 int reg,
962 int pos,
963 int srcmask)
964 {
965 COMPILE_STATE;
966 int csm, i;
967
968 /* Look for matches */
969 for (i=0,csm=srcmask; i<3; i++,csm=csm<<1) {
970 /* If sources have been allocated in this position(s)... */
971 if ((cs->slot[pos].umask & csm) == csm) {
972 /* ... and the register number(s) match, re-use the
973 source */
974 if (srcmask == SLOT_VECTOR &&
975 cs->slot[pos].vsrc[i] == reg)
976 return i;
977 if (srcmask == SLOT_SCALAR &&
978 cs->slot[pos].ssrc[i] == reg)
979 return i;
980 if (srcmask == SLOT_BOTH &&
981 cs->slot[pos].vsrc[i] == reg &&
982 cs->slot[pos].ssrc[i] == reg)
983 return i;
984 }
985 }
986
987 /* Look for free spaces */
988 for (i=0,csm=srcmask; i<3; i++,csm=csm<<1) {
989 /* If the position(s) haven't been allocated */
990 if ((cs->slot[pos].umask & csm) == 0) {
991 cs->slot[pos].umask |= csm;
992
993 if (srcmask & SLOT_VECTOR)
994 cs->slot[pos].vsrc[i] = reg;
995 if (srcmask & SLOT_SCALAR)
996 cs->slot[pos].ssrc[i] = reg;
997 return i;
998 }
999 }
1000
1001 //ERROR("Failed to allocate sources in FPI1/FPI3!\n");
1002 return 0;
1003 }
1004
1005 /* Determine whether or not to position opcode in the same ALU slot for both
1006 * vector and scalar portions of an instruction.
1007 *
1008 * It's not necessary to force the first case, but it makes disassembled
1009 * shaders easier to read.
1010 */
1011 static GLboolean force_same_slot(int vop,
1012 int sop,
1013 GLboolean emit_vop,
1014 GLboolean emit_sop,
1015 int argc,
1016 GLuint *src)
1017 {
1018 int i;
1019
1020 if (emit_vop && emit_sop)
1021 return GL_TRUE;
1022
1023 if (emit_vop && vop == R300_FPI0_OUTC_REPL_ALPHA)
1024 return GL_TRUE;
1025
1026 if (emit_vop) {
1027 for (i=0;i<argc;i++)
1028 if (REG_GET_VSWZ(src[i]) == SWIZZLE_WZY)
1029 return GL_TRUE;
1030 }
1031
1032 return GL_FALSE;
1033 }
1034
1035 static void emit_arith(struct r300_fragment_program *rp,
1036 int op,
1037 GLuint dest,
1038 int mask,
1039 GLuint src0,
1040 GLuint src1,
1041 GLuint src2,
1042 int flags)
1043 {
1044 COMPILE_STATE;
1045 GLuint src[3] = { src0, src1, src2 };
1046 int hwsrc[3], sswz[3], vswz[3];
1047 int hwdest;
1048 GLboolean emit_vop = GL_FALSE, emit_sop = GL_FALSE;
1049 int vop, sop, argc;
1050 int vpos, spos;
1051 int i;
1052
1053 vop = r300_fpop[op].v_op;
1054 sop = r300_fpop[op].s_op;
1055 argc = r300_fpop[op].argc;
1056
1057 if ((mask & WRITEMASK_XYZ) || vop == R300_FPI0_OUTC_DP3)
1058 emit_vop = GL_TRUE;
1059 if ((mask & WRITEMASK_W) || vop == R300_FPI0_OUTC_REPL_ALPHA)
1060 emit_sop = GL_TRUE;
1061
1062 if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT &&
1063 REG_GET_INDEX(dest) == FRAG_RESULT_DEPR)
1064 emit_vop = GL_FALSE;
1065
1066 if (force_same_slot(vop, sop, emit_vop, emit_sop, argc, src)) {
1067 vpos = spos = MAX2(cs->v_pos, cs->s_pos);
1068 } else {
1069 vpos = cs->v_pos;
1070 spos = cs->s_pos;
1071 /* Here is where we'd decide on where a safe place is to
1072 * combine this instruction with a previous one.
1073 *
1074 * This is extremely simple for now.. if a source depends
1075 * on the opposite stream, force the same instruction.
1076 */
1077 for (i=0;i<3;i++) {
1078 if (emit_vop &&
1079 (v_swiz[REG_GET_VSWZ(src[i])].flags & SLOT_SCALAR)) {
1080 vpos = spos = MAX2(vpos, spos);
1081 break;
1082 }
1083 if (emit_sop &&
1084 (s_swiz[REG_GET_SSWZ(src[i])].flags & SLOT_VECTOR)) {
1085 vpos = spos = MAX2(vpos, spos);
1086 break;
1087 }
1088 }
1089 }
1090
1091 /* - Convert src->hwsrc, record for FPI1/FPI3
1092 * - Determine ARG parts of FPI0/FPI2, unused args are filled
1093 * with ARG_ZERO.
1094 */
1095 for (i=0;i<3;i++) {
1096 int srcpos;
1097
1098 if (i >= argc) {
1099 vswz[i] = R300_FPI0_ARGC_ZERO;
1100 sswz[i] = R300_FPI2_ARGA_ZERO;
1101 continue;
1102 }
1103
1104 hwsrc[i] = t_hw_src(rp, src[i], GL_FALSE);
1105
1106 if (emit_vop && vop != R300_FPI0_OUTC_REPL_ALPHA) {
1107 srcpos = add_src(rp, hwsrc[i], vpos,
1108 v_swiz[REG_GET_VSWZ(src[i])].flags);
1109 vswz[i] = (v_swiz[REG_GET_VSWZ(src[i])].base +
1110 (srcpos *
1111 v_swiz[REG_GET_VSWZ(src[i])].stride)) |
1112 ((src[i] & REG_NEGV_MASK) ? ARG_NEG : 0) |
1113 ((src[i] & REG_ABS_MASK) ? ARG_ABS : 0);
1114 } else vswz[i] = R300_FPI0_ARGC_ZERO;
1115
1116 if (emit_sop) {
1117 srcpos = add_src(rp, hwsrc[i], spos,
1118 s_swiz[REG_GET_SSWZ(src[i])].flags);
1119 sswz[i] = (s_swiz[REG_GET_SSWZ(src[i])].base +
1120 (srcpos *
1121 s_swiz[REG_GET_SSWZ(src[i])].stride)) |
1122 ((src[i] & REG_NEGS_MASK) ? ARG_NEG : 0) |
1123 ((src[i] & REG_ABS_MASK) ? ARG_ABS : 0);
1124 } else sswz[i] = R300_FPI2_ARGA_ZERO;
1125 }
1126 hwdest = t_hw_dst(rp, dest, GL_FALSE);
1127
1128 if (flags & PFS_FLAG_SAT) {
1129 vop |= R300_FPI0_OUTC_SAT;
1130 sop |= R300_FPI2_OUTA_SAT;
1131 }
1132
1133 /* Throw the pieces together and get FPI0/1 */
1134 rp->alu.inst[vpos].inst1 =
1135 ((cs->slot[vpos].vsrc[0] << R300_FPI1_SRC0C_SHIFT) |
1136 (cs->slot[vpos].vsrc[1] << R300_FPI1_SRC1C_SHIFT) |
1137 (cs->slot[vpos].vsrc[2] << R300_FPI1_SRC2C_SHIFT));
1138 if (emit_vop) {
1139 rp->alu.inst[vpos].inst0 = vop |
1140 (vswz[0] << R300_FPI0_ARG0C_SHIFT) |
1141 (vswz[1] << R300_FPI0_ARG1C_SHIFT) |
1142 (vswz[2] << R300_FPI0_ARG2C_SHIFT);
1143
1144 rp->alu.inst[vpos].inst1 |= hwdest << R300_FPI1_DSTC_SHIFT;
1145 if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT) {
1146 if (REG_GET_INDEX(dest) == FRAG_RESULT_COLR) {
1147 rp->alu.inst[vpos].inst1 |=
1148 (mask & WRITEMASK_XYZ) << R300_FPI1_DSTC_OUTPUT_MASK_SHIFT;
1149 } else assert(0);
1150 } else {
1151 rp->alu.inst[vpos].inst1 |=
1152 (mask & WRITEMASK_XYZ) << R300_FPI1_DSTC_REG_MASK_SHIFT;
1153 }
1154 cs->v_pos = vpos+1;
1155 } else if (spos >= vpos)
1156 rp->alu.inst[spos].inst0 = NOP_INST0;
1157
1158 /* And now FPI2/3 */
1159 rp->alu.inst[spos].inst3 =
1160 ((cs->slot[spos].ssrc[0] << R300_FPI3_SRC0A_SHIFT) |
1161 (cs->slot[spos].ssrc[1] << R300_FPI3_SRC1A_SHIFT) |
1162 (cs->slot[spos].ssrc[2] << R300_FPI3_SRC2A_SHIFT));
1163 if (emit_sop) {
1164 rp->alu.inst[spos].inst2 = sop |
1165 sswz[0] << R300_FPI2_ARG0A_SHIFT |
1166 sswz[1] << R300_FPI2_ARG1A_SHIFT |
1167 sswz[2] << R300_FPI2_ARG2A_SHIFT;
1168
1169 if (mask & WRITEMASK_W) {
1170 if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT) {
1171 if (REG_GET_INDEX(dest) == FRAG_RESULT_COLR) {
1172 rp->alu.inst[spos].inst3 |=
1173 (hwdest << R300_FPI3_DSTA_SHIFT) | R300_FPI3_DSTA_OUTPUT;
1174 } else if (REG_GET_INDEX(dest) == FRAG_RESULT_DEPR) {
1175 rp->alu.inst[spos].inst3 |= R300_FPI3_DSTA_DEPTH;
1176 } else assert(0);
1177 } else {
1178 rp->alu.inst[spos].inst3 |=
1179 (hwdest << R300_FPI3_DSTA_SHIFT) | R300_FPI3_DSTA_REG;
1180 }
1181 }
1182 cs->s_pos = spos+1;
1183 } else if (vpos >= spos)
1184 rp->alu.inst[vpos].inst2 = NOP_INST2;
1185
1186 return;
1187 }
1188
#if 0
/* Disabled helper: build a register word referring to fragment input
 * `attr`.  Reports an error and returns `undef` when the Mesa program
 * does not list that attribute in InputsRead.
 */
static GLuint get_attrib(struct r300_fragment_program *rp, GLuint attr)
{
	const GLuint inputs_read = rp->mesa_program.Base.InputsRead;
	GLuint reg = undef;

	if ((inputs_read & (1<<attr)) == 0) {
		ERROR("Attribute %d was not provided!\n", attr);
		return undef;
	}

	/* Tag the word as a valid input register with the given index */
	REG_SET_TYPE(reg, REG_TYPE_INPUT);
	REG_SET_INDEX(reg, attr);
	REG_SET_VALID(reg, GL_TRUE);

	return reg;
}
#endif
1206
1207 static void make_sin_const(struct r300_fragment_program *rp)
1208 {
1209 if(rp->const_sin[0] == -1){
1210 GLfloat cnstv[4];
1211
1212 cnstv[0] = 1.273239545; // 4/PI
1213 cnstv[1] =-0.405284735; // -4/(PI*PI)
1214 cnstv[2] = 3.141592654; // PI
1215 cnstv[3] = 0.2225; // weight
1216 rp->const_sin[0] = emit_const4fv(rp, cnstv);
1217
1218 cnstv[0] = 0.75;
1219 cnstv[1] = 0.0;
1220 cnstv[2] = 0.159154943; // 1/(2*PI)
1221 cnstv[3] = 6.283185307; // 2*PI
1222 rp->const_sin[1] = emit_const4fv(rp, cnstv);
1223 }
1224 }
1225
1226 static GLboolean parse_program(struct r300_fragment_program *rp)
1227 {
1228 struct gl_fragment_program *mp = &rp->mesa_program;
1229 const struct prog_instruction *inst = mp->Base.Instructions;
1230 struct prog_instruction *fpi;
1231 GLuint src[3], dest, temp[2];
1232 GLuint cnst;
1233 int flags, mask = 0;
1234 GLfloat cnstv[4] = {0.0, 0.0, 0.0, 0.0};
1235
1236 if (!inst || inst[0].Opcode == OPCODE_END) {
1237 ERROR("empty program?\n");
1238 return GL_FALSE;
1239 }
1240
1241 for (fpi=mp->Base.Instructions; fpi->Opcode != OPCODE_END; fpi++) {
1242 if (fpi->SaturateMode == SATURATE_ZERO_ONE)
1243 flags = PFS_FLAG_SAT;
1244 else
1245 flags = 0;
1246
1247 if (fpi->Opcode != OPCODE_KIL) {
1248 dest = t_dst(rp, fpi->DstReg);
1249 mask = fpi->DstReg.WriteMask;
1250 }
1251
1252 switch (fpi->Opcode) {
1253 case OPCODE_ABS:
1254 src[0] = t_src(rp, fpi->SrcReg[0]);
1255 emit_arith(rp, PFS_OP_MAD, dest, mask,
1256 absolute(src[0]), pfs_one, pfs_zero,
1257 flags);
1258 break;
1259 case OPCODE_ADD:
1260 src[0] = t_src(rp, fpi->SrcReg[0]);
1261 src[1] = t_src(rp, fpi->SrcReg[1]);
1262 emit_arith(rp, PFS_OP_MAD, dest, mask,
1263 src[0], pfs_one, src[1],
1264 flags);
1265 break;
1266 case OPCODE_CMP:
1267 src[0] = t_src(rp, fpi->SrcReg[0]);
1268 src[1] = t_src(rp, fpi->SrcReg[1]);
1269 src[2] = t_src(rp, fpi->SrcReg[2]);
1270 /* ARB_f_p - if src0.c < 0.0 ? src1.c : src2.c
1271 * r300 - if src2.c < 0.0 ? src1.c : src0.c
1272 */
1273 emit_arith(rp, PFS_OP_CMP, dest, mask,
1274 src[2], src[1], src[0],
1275 flags);
1276 break;
1277 case OPCODE_COS:
1278 /*
1279 * cos using a parabola (see SIN):
1280 * cos(x):
1281 * x = (x/(2*PI))+0.75
1282 * x = frac(x)
1283 * x = (x*2*PI)-PI
1284 * result = sin(x)
1285 */
1286 temp[0] = get_temp_reg(rp);
1287 make_sin_const(rp);
1288 src[0] = t_scalar_src(rp, fpi->SrcReg[0]);
1289
1290 /* add 0.5*PI and do range reduction */
1291
1292 emit_arith(rp, PFS_OP_MAD, temp[0], WRITEMASK_X,
1293 swizzle(src[0], X, X, X, X),
1294 swizzle(rp->const_sin[1], Z, Z, Z, Z),
1295 swizzle(rp->const_sin[1], X, X, X, X),
1296 0);
1297
1298 emit_arith(rp, PFS_OP_FRC, temp[0], WRITEMASK_X,
1299 swizzle(temp[0], X, X, X, X),
1300 undef,
1301 undef,
1302 0);
1303
1304 emit_arith(rp, PFS_OP_MAD, temp[0], WRITEMASK_Z,
1305 swizzle(temp[0], X, X, X, X),
1306 swizzle(rp->const_sin[1], W, W, W, W), //2*PI
1307 negate(swizzle(rp->const_sin[0], Z, Z, Z, Z)), //-PI
1308 0);
1309
1310 /* SIN */
1311
1312 emit_arith(rp, PFS_OP_MAD, temp[0], WRITEMASK_X | WRITEMASK_Y,
1313 swizzle(temp[0], Z, Z, Z, Z),
1314 rp->const_sin[0],
1315 pfs_zero,
1316 0);
1317
1318 emit_arith(rp, PFS_OP_MAD, temp[0], WRITEMASK_X,
1319 swizzle(temp[0], Y, Y, Y, Y),
1320 absolute(swizzle(temp[0], Z, Z, Z, Z)),
1321 swizzle(temp[0], X, X, X, X),
1322 0);
1323
1324 emit_arith(rp, PFS_OP_MAD, temp[0], WRITEMASK_Y,
1325 swizzle(temp[0], X, X, X, X),
1326 absolute(swizzle(temp[0], X, X, X, X)),
1327 negate(swizzle(temp[0], X, X, X, X)),
1328 0);
1329
1330
1331 emit_arith(rp, PFS_OP_MAD, dest, mask,
1332 swizzle(temp[0], Y, Y, Y, Y),
1333 swizzle(rp->const_sin[0], W, W, W, W),
1334 swizzle(temp[0], X, X, X, X),
1335 flags);
1336
1337 free_temp(rp, temp[0]);
1338 break;
1339 case OPCODE_DP3:
1340 src[0] = t_src(rp, fpi->SrcReg[0]);
1341 src[1] = t_src(rp, fpi->SrcReg[1]);
1342 emit_arith(rp, PFS_OP_DP3, dest, mask,
1343 src[0], src[1], undef,
1344 flags);
1345 break;
1346 case OPCODE_DP4:
1347 src[0] = t_src(rp, fpi->SrcReg[0]);
1348 src[1] = t_src(rp, fpi->SrcReg[1]);
1349 emit_arith(rp, PFS_OP_DP4, dest, mask,
1350 src[0], src[1], undef,
1351 flags);
1352 break;
1353 case OPCODE_DPH:
1354 src[0] = t_src(rp, fpi->SrcReg[0]);
1355 src[1] = t_src(rp, fpi->SrcReg[1]);
1356 /* src0.xyz1 -> temp
1357 * DP4 dest, temp, src1
1358 */
1359 #if 0
1360 temp[0] = get_temp_reg(rp);
1361 src[0].s_swz = SWIZZLE_ONE;
1362 emit_arith(rp, PFS_OP_MAD, temp[0], mask,
1363 src[0], pfs_one, pfs_zero,
1364 0);
1365 emit_arith(rp, PFS_OP_DP4, dest, mask,
1366 temp[0], src[1], undef,
1367 flags);
1368 free_temp(rp, temp[0]);
1369 #else
1370 emit_arith(rp, PFS_OP_DP4, dest, mask,
1371 swizzle(src[0], X, Y, Z, ONE), src[1],
1372 undef, flags);
1373 #endif
1374 break;
1375 case OPCODE_DST:
1376 src[0] = t_src(rp, fpi->SrcReg[0]);
1377 src[1] = t_src(rp, fpi->SrcReg[1]);
1378 /* dest.y = src0.y * src1.y */
1379 if (mask & WRITEMASK_Y)
1380 emit_arith(rp, PFS_OP_MAD, dest, WRITEMASK_Y,
1381 keep(src[0]), keep(src[1]),
1382 pfs_zero, flags);
1383 /* dest.z = src0.z */
1384 if (mask & WRITEMASK_Z)
1385 emit_arith(rp, PFS_OP_MAD, dest, WRITEMASK_Z,
1386 src[0], pfs_one, pfs_zero, flags);
1387 /* result.x = 1.0
1388 * result.w = src1.w */
1389 if (mask & WRITEMASK_XW) {
1390 REG_SET_VSWZ(src[1], SWIZZLE_111); /*Cheat*/
1391 emit_arith(rp, PFS_OP_MAD, dest,
1392 mask & WRITEMASK_XW,
1393 src[1], pfs_one, pfs_zero,
1394 flags);
1395 }
1396 break;
1397 case OPCODE_EX2:
1398 src[0] = t_scalar_src(rp, fpi->SrcReg[0]);
1399 emit_arith(rp, PFS_OP_EX2, dest, mask,
1400 src[0], undef, undef,
1401 flags);
1402 break;
1403 case OPCODE_FLR:
1404 src[0] = t_src(rp, fpi->SrcReg[0]);
1405 temp[0] = get_temp_reg(rp);
1406 /* FRC temp, src0
1407 * MAD dest, src0, 1.0, -temp
1408 */
1409 emit_arith(rp, PFS_OP_FRC, temp[0], mask,
1410 keep(src[0]), undef, undef,
1411 0);
1412 emit_arith(rp, PFS_OP_MAD, dest, mask,
1413 src[0], pfs_one, negate(temp[0]),
1414 flags);
1415 free_temp(rp, temp[0]);
1416 break;
1417 case OPCODE_FRC:
1418 src[0] = t_src(rp, fpi->SrcReg[0]);
1419 emit_arith(rp, PFS_OP_FRC, dest, mask,
1420 src[0], undef, undef,
1421 flags);
1422 break;
1423 case OPCODE_KIL:
1424 emit_tex(rp, fpi, R300_FPITX_OP_KIL);
1425 break;
1426 case OPCODE_LG2:
1427 src[0] = t_scalar_src(rp, fpi->SrcReg[0]);
1428 emit_arith(rp, PFS_OP_LG2, dest, mask,
1429 src[0], undef, undef,
1430 flags);
1431 break;
1432 case OPCODE_LIT:
1433 /* LIT
1434 * if (s.x < 0) t.x = 0; else t.x = s.x;
1435 * if (s.y < 0) t.y = 0; else t.y = s.y;
1436 * if (s.w > 128.0) t.w = 128.0; else t.w = s.w;
1437 * if (s.w < -128.0) t.w = -128.0; else t.w = s.w;
1438 * r.x = 1.0
1439 * if (t.x > 0) r.y = pow(t.y, t.w); else r.y = 0;
1440 * Also r.y = 0 if t.y < 0
1441 * For the t.x > 0 FGLRX use the CMPH opcode which
1442 * change the compare to (t.x + 0.5) > 0.5 we may
1443 * save one instruction by doing CMP -t.x
1444 */
1445 cnstv[0] = cnstv[1] = cnstv[2] = cnstv[3] = 0.50001;
1446 src[0] = t_src(rp, fpi->SrcReg[0]);
1447 temp[0] = get_temp_reg(rp);
1448 cnst = emit_const4fv(rp, cnstv);
1449 emit_arith(rp, PFS_OP_CMP, temp[0],
1450 WRITEMASK_X | WRITEMASK_Y,
1451 src[0], pfs_zero, src[0], flags);
1452 emit_arith(rp, PFS_OP_MIN, temp[0], WRITEMASK_Z,
1453 swizzle(keep(src[0]), W, W, W, W),
1454 cnst, undef, flags);
1455 emit_arith(rp, PFS_OP_LG2, temp[0], WRITEMASK_W,
1456 swizzle(temp[0], Y, Y, Y, Y),
1457 undef, undef, flags);
1458 emit_arith(rp, PFS_OP_MAX, temp[0], WRITEMASK_Z,
1459 temp[0], negate(cnst), undef, flags);
1460 emit_arith(rp, PFS_OP_MAD, temp[0], WRITEMASK_W,
1461 temp[0], swizzle(temp[0], Z, Z, Z, Z),
1462 pfs_zero, flags);
1463 emit_arith(rp, PFS_OP_EX2, temp[0], WRITEMASK_W,
1464 temp[0], undef, undef, flags);
1465 emit_arith(rp, PFS_OP_MAD, dest, WRITEMASK_Y,
1466 swizzle(keep(temp[0]), X, X, X, X),
1467 pfs_one, pfs_zero, flags);
1468 #if 0
1469 emit_arith(rp, PFS_OP_MAD, temp[0], WRITEMASK_X,
1470 temp[0], pfs_one, pfs_half, flags);
1471 emit_arith(rp, PFS_OP_CMPH, temp[0], WRITEMASK_Z,
1472 swizzle(keep(temp[0]), W, W, W, W),
1473 pfs_zero, swizzle(keep(temp[0]), X, X, X, X),
1474 flags);
1475 #else
1476 emit_arith(rp, PFS_OP_CMP, temp[0], WRITEMASK_Z,
1477 pfs_zero,
1478 swizzle(keep(temp[0]), W, W, W, W),
1479 negate(swizzle(keep(temp[0]), X, X, X, X)),
1480 flags);
1481 #endif
1482 emit_arith(rp, PFS_OP_CMP, dest, WRITEMASK_Z,
1483 pfs_zero, temp[0],
1484 negate(swizzle(keep(temp[0]), Y, Y, Y, Y)),
1485 flags);
1486 emit_arith(rp, PFS_OP_MAD, dest,
1487 WRITEMASK_X | WRITEMASK_W,
1488 pfs_one,
1489 pfs_one,
1490 pfs_zero,
1491 flags);
1492 free_temp(rp, temp[0]);
1493 break;
1494 case OPCODE_LRP:
1495 src[0] = t_src(rp, fpi->SrcReg[0]);
1496 src[1] = t_src(rp, fpi->SrcReg[1]);
1497 src[2] = t_src(rp, fpi->SrcReg[2]);
1498 /* result = tmp0tmp1 + (1 - tmp0)tmp2
1499 * = tmp0tmp1 + tmp2 + (-tmp0)tmp2
1500 * MAD temp, -tmp0, tmp2, tmp2
1501 * MAD result, tmp0, tmp1, temp
1502 */
1503 temp[0] = get_temp_reg(rp);
1504 emit_arith(rp, PFS_OP_MAD, temp[0], mask,
1505 negate(keep(src[0])), keep(src[2]), src[2],
1506 0);
1507 emit_arith(rp, PFS_OP_MAD, dest, mask,
1508 src[0], src[1], temp[0],
1509 flags);
1510 free_temp(rp, temp[0]);
1511 break;
1512 case OPCODE_MAD:
1513 src[0] = t_src(rp, fpi->SrcReg[0]);
1514 src[1] = t_src(rp, fpi->SrcReg[1]);
1515 src[2] = t_src(rp, fpi->SrcReg[2]);
1516 emit_arith(rp, PFS_OP_MAD, dest, mask,
1517 src[0], src[1], src[2],
1518 flags);
1519 break;
1520 case OPCODE_MAX:
1521 src[0] = t_src(rp, fpi->SrcReg[0]);
1522 src[1] = t_src(rp, fpi->SrcReg[1]);
1523 emit_arith(rp, PFS_OP_MAX, dest, mask,
1524 src[0], src[1], undef,
1525 flags);
1526 break;
1527 case OPCODE_MIN:
1528 src[0] = t_src(rp, fpi->SrcReg[0]);
1529 src[1] = t_src(rp, fpi->SrcReg[1]);
1530 emit_arith(rp, PFS_OP_MIN, dest, mask,
1531 src[0], src[1], undef,
1532 flags);
1533 break;
1534 case OPCODE_MOV:
1535 case OPCODE_SWZ:
1536 src[0] = t_src(rp, fpi->SrcReg[0]);
1537 emit_arith(rp, PFS_OP_MAD, dest, mask,
1538 src[0], pfs_one, pfs_zero,
1539 flags);
1540 break;
1541 case OPCODE_MUL:
1542 src[0] = t_src(rp, fpi->SrcReg[0]);
1543 src[1] = t_src(rp, fpi->SrcReg[1]);
1544 emit_arith(rp, PFS_OP_MAD, dest, mask,
1545 src[0], src[1], pfs_zero,
1546 flags);
1547 break;
1548 case OPCODE_POW:
1549 src[0] = t_scalar_src(rp, fpi->SrcReg[0]);
1550 src[1] = t_scalar_src(rp, fpi->SrcReg[1]);
1551 temp[0] = get_temp_reg(rp);
1552 emit_arith(rp, PFS_OP_LG2, temp[0], WRITEMASK_W,
1553 src[0], undef, undef,
1554 0);
1555 emit_arith(rp, PFS_OP_MAD, temp[0], WRITEMASK_W,
1556 temp[0], src[1], pfs_zero,
1557 0);
1558 emit_arith(rp, PFS_OP_EX2, dest, fpi->DstReg.WriteMask,
1559 temp[0], undef, undef,
1560 0);
1561 free_temp(rp, temp[0]);
1562 break;
1563 case OPCODE_RCP:
1564 src[0] = t_scalar_src(rp, fpi->SrcReg[0]);
1565 emit_arith(rp, PFS_OP_RCP, dest, mask,
1566 src[0], undef, undef,
1567 flags);
1568 break;
1569 case OPCODE_RSQ:
1570 src[0] = t_scalar_src(rp, fpi->SrcReg[0]);
1571 emit_arith(rp, PFS_OP_RSQ, dest, mask,
1572 absolute(src[0]), pfs_zero, pfs_zero,
1573 flags);
1574 break;
1575 case OPCODE_SCS:
1576 /*
1577 * scs using a parabola :
1578 * scs(x):
1579 * result.x = sin(-abs(x)+0.5*PI) (cos)
1580 * result.y = sin(x) (sin)
1581 *
1582 */
1583 temp[0] = get_temp_reg(rp);
1584 temp[1] = get_temp_reg(rp);
1585 make_sin_const(rp);
1586 src[0] = t_scalar_src(rp, fpi->SrcReg[0]);
1587
1588 /* x = -abs(x)+0.5*PI */
1589 emit_arith(rp, PFS_OP_MAD, temp[0], WRITEMASK_Z,
1590 swizzle(rp->const_sin[0], Z, Z, Z, Z), //PI
1591 pfs_half,
1592 negate(abs(swizzle(keep(src[0]), X, X, X, X))),
1593 0);
1594
1595 /* C*x (sin) */
1596 emit_arith(rp, PFS_OP_MAD, temp[0], WRITEMASK_W,
1597 swizzle(rp->const_sin[0], Y, Y, Y, Y),
1598 swizzle(keep(src[0]), X, X, X, X),
1599 pfs_zero,
1600 0);
1601
1602 /* B*x, C*x (cos) */
1603 emit_arith(rp, PFS_OP_MAD, temp[0], WRITEMASK_X | WRITEMASK_Y,
1604 swizzle(temp[0], Z, Z, Z, Z),
1605 rp->const_sin[0],
1606 pfs_zero,
1607 0);
1608
1609 /* B*x (sin) */
1610 emit_arith(rp, PFS_OP_MAD, temp[1], WRITEMASK_W,
1611 swizzle(rp->const_sin[0], X, X, X, X),
1612 keep(src[0]),
1613 pfs_zero,
1614 0);
1615
1616 /* y = B*x + C*x*abs(x) (sin)*/
1617 emit_arith(rp, PFS_OP_MAD, temp[1], WRITEMASK_Z,
1618 absolute(src[0]),
1619 swizzle(temp[0], W, W, W, W),
1620 swizzle(temp[1], W, W, W, W),
1621 0);
1622
1623 /* y = B*x + C*x*abs(x) (cos)*/
1624 emit_arith(rp, PFS_OP_MAD, temp[1], WRITEMASK_W,
1625 swizzle(temp[0], Y, Y, Y, Y),
1626 absolute(swizzle(temp[0], Z, Z, Z, Z)),
1627 swizzle(temp[0], X, X, X, X),
1628 0);
1629
1630 /* y*abs(y) - y (cos), y*abs(y) - y (sin) */
1631 emit_arith(rp, PFS_OP_MAD, temp[0], WRITEMASK_X | WRITEMASK_Y,
1632 swizzle(temp[1], W, Z, Y, X),
1633 absolute(swizzle(temp[1], W, Z, Y, X)),
1634 negate(swizzle(temp[1], W, Z, Y, X)),
1635
1636 0);
1637
1638 /* dest.xy = mad(temp.xy, P, temp2.wz) */
1639 emit_arith(rp, PFS_OP_MAD, dest, mask & (WRITEMASK_X | WRITEMASK_Y),
1640 temp[0],
1641 swizzle(rp->const_sin[0], W, W, W, W),
1642 swizzle(temp[1], W, Z, Y, X),
1643 flags);
1644
1645 free_temp(rp, temp[0]);
1646 free_temp(rp, temp[1]);
1647 break;
1648 case OPCODE_SGE:
1649 src[0] = t_src(rp, fpi->SrcReg[0]);
1650 src[1] = t_src(rp, fpi->SrcReg[1]);
1651 temp[0] = get_temp_reg(rp);
1652 /* temp = src0 - src1
1653 * dest.c = (temp.c < 0.0) ? 0 : 1
1654 */
1655 emit_arith(rp, PFS_OP_MAD, temp[0], mask,
1656 src[0], pfs_one, negate(src[1]),
1657 0);
1658 emit_arith(rp, PFS_OP_CMP, dest, mask,
1659 pfs_one, pfs_zero, temp[0],
1660 0);
1661 free_temp(rp, temp[0]);
1662 break;
1663 case OPCODE_SIN:
1664 /*
1665 * using a parabola:
1666 * sin(x) = 4/pi * x + -4/(pi*pi) * x * abs(x)
1667 * extra precision is obtained by weighting against
1668 * itself squared.
1669 */
1670
1671 temp[0] = get_temp_reg(rp);
1672 make_sin_const(rp);
1673 src[0] = t_scalar_src(rp, fpi->SrcReg[0]);
1674
1675
1676 /* do range reduction */
1677
1678 emit_arith(rp, PFS_OP_MAD, temp[0], WRITEMASK_X,
1679 swizzle(keep(src[0]), X, X, X, X),
1680 swizzle(rp->const_sin[1], Z, Z, Z, Z),
1681 pfs_half,
1682 0);
1683
1684 emit_arith(rp, PFS_OP_FRC, temp[0], WRITEMASK_X,
1685 swizzle(temp[0], X, X, X, X),
1686 undef,
1687 undef,
1688 0);
1689
1690 emit_arith(rp, PFS_OP_MAD, temp[0], WRITEMASK_Z,
1691 swizzle(temp[0], X, X, X, X),
1692 swizzle(rp->const_sin[1], W, W, W, W), //2*PI
1693 negate(swizzle(rp->const_sin[0], Z, Z, Z, Z)), //PI
1694 0);
1695
1696 /* SIN */
1697
1698 emit_arith(rp, PFS_OP_MAD, temp[0], WRITEMASK_X | WRITEMASK_Y,
1699 swizzle(temp[0], Z, Z, Z, Z),
1700 rp->const_sin[0],
1701 pfs_zero,
1702 0);
1703
1704 emit_arith(rp, PFS_OP_MAD, temp[0], WRITEMASK_X,
1705 swizzle(temp[0], Y, Y, Y, Y),
1706 absolute(swizzle(temp[0], Z, Z, Z, Z)),
1707 swizzle(temp[0], X, X, X, X),
1708 0);
1709
1710 emit_arith(rp, PFS_OP_MAD, temp[0], WRITEMASK_Y,
1711 swizzle(temp[0], X, X, X, X),
1712 absolute(swizzle(temp[0], X, X, X, X)),
1713 negate(swizzle(temp[0], X, X, X, X)),
1714 0);
1715
1716
1717 emit_arith(rp, PFS_OP_MAD, dest, mask,
1718 swizzle(temp[0], Y, Y, Y, Y),
1719 swizzle(rp->const_sin[0], W, W, W, W),
1720 swizzle(temp[0], X, X, X, X),
1721 flags);
1722
1723 free_temp(rp, temp[0]);
1724 break;
1725 case OPCODE_SLT:
1726 src[0] = t_src(rp, fpi->SrcReg[0]);
1727 src[1] = t_src(rp, fpi->SrcReg[1]);
1728 temp[0] = get_temp_reg(rp);
1729 /* temp = src0 - src1
1730 * dest.c = (temp.c < 0.0) ? 1 : 0
1731 */
1732 emit_arith(rp, PFS_OP_MAD, temp[0], mask,
1733 src[0], pfs_one, negate(src[1]),
1734 0);
1735 emit_arith(rp, PFS_OP_CMP, dest, mask,
1736 pfs_zero, pfs_one, temp[0],
1737 0);
1738 free_temp(rp, temp[0]);
1739 break;
1740 case OPCODE_SUB:
1741 src[0] = t_src(rp, fpi->SrcReg[0]);
1742 src[1] = t_src(rp, fpi->SrcReg[1]);
1743 emit_arith(rp, PFS_OP_MAD, dest, mask,
1744 src[0], pfs_one, negate(src[1]),
1745 flags);
1746 break;
1747 case OPCODE_TEX:
1748 emit_tex(rp, fpi, R300_FPITX_OP_TEX);
1749 break;
1750 case OPCODE_TXB:
1751 emit_tex(rp, fpi, R300_FPITX_OP_TXB);
1752 break;
1753 case OPCODE_TXP:
1754 emit_tex(rp, fpi, R300_FPITX_OP_TXP);
1755 break;
1756 case OPCODE_XPD: {
1757 src[0] = t_src(rp, fpi->SrcReg[0]);
1758 src[1] = t_src(rp, fpi->SrcReg[1]);
1759 temp[0] = get_temp_reg(rp);
1760 /* temp = src0.zxy * src1.yzx */
1761 emit_arith(rp, PFS_OP_MAD, temp[0], WRITEMASK_XYZ,
1762 swizzle(keep(src[0]), Z, X, Y, W),
1763 swizzle(keep(src[1]), Y, Z, X, W),
1764 pfs_zero,
1765 0);
1766 /* dest.xyz = src0.yzx * src1.zxy - temp
1767 * dest.w = undefined
1768 * */
1769 emit_arith(rp, PFS_OP_MAD, dest, mask & WRITEMASK_XYZ,
1770 swizzle(src[0], Y, Z, X, W),
1771 swizzle(src[1], Z, X, Y, W),
1772 negate(temp[0]),
1773 flags);
1774 /* cleanup */
1775 free_temp(rp, temp[0]);
1776 break;
1777 }
1778 default:
1779 ERROR("unknown fpi->Opcode %d\n", fpi->Opcode);
1780 break;
1781 }
1782
1783 if (rp->error)
1784 return GL_FALSE;
1785
1786 }
1787
1788 return GL_TRUE;
1789 }
1790
1791 /* - Init structures
1792 * - Determine what hwregs each input corresponds to
1793 */
1794 static void init_program(r300ContextPtr r300, struct r300_fragment_program *rp)
1795 {
1796 struct r300_pfs_compile_state *cs = NULL;
1797 struct gl_fragment_program *mp = &rp->mesa_program;
1798 struct prog_instruction *fpi;
1799 GLuint InputsRead = mp->Base.InputsRead;
1800 GLuint temps_used = 0; /* for rp->temps[] */
1801 int i,j;
1802
1803 /* New compile, reset tracking data */
1804 rp->optimization = driQueryOptioni(&r300->radeon.optionCache, "fp_optimization");
1805 rp->translated = GL_FALSE;
1806 rp->error = GL_FALSE;
1807 rp->cs = cs = &(R300_CONTEXT(rp->ctx)->state.pfs_compile);
1808 rp->tex.length = 0;
1809 rp->cur_node = 0;
1810 rp->first_node_has_tex = 0;
1811 rp->const_nr = 0;
1812 rp->param_nr = 0;
1813 rp->params_uptodate = GL_FALSE;
1814 rp->max_temp_idx = 0;
1815 rp->node[0].alu_end = -1;
1816 rp->node[0].tex_end = -1;
1817 rp->const_sin[0] = -1;
1818
1819 _mesa_memset(cs, 0, sizeof(*rp->cs));
1820 for (i=0;i<PFS_MAX_ALU_INST;i++) {
1821 for (j=0;j<3;j++) {
1822 cs->slot[i].vsrc[j] = SRC_CONST;
1823 cs->slot[i].ssrc[j] = SRC_CONST;
1824 }
1825 }
1826
1827 /* Work out what temps the Mesa inputs correspond to, this must match
1828 * what setup_rs_unit does, which shouldn't be a problem as rs_unit
1829 * configures itself based on the fragprog's InputsRead
1830 *
1831 * NOTE: this depends on get_hw_temp() allocating registers in order,
1832 * starting from register 0.
1833 */
1834
1835 /* Texcoords come first */
1836 for (i=0;i<rp->ctx->Const.MaxTextureUnits;i++) {
1837 if (InputsRead & (FRAG_BIT_TEX0 << i)) {
1838 cs->inputs[FRAG_ATTRIB_TEX0+i].refcount = 0;
1839 cs->inputs[FRAG_ATTRIB_TEX0+i].reg = get_hw_temp(rp);
1840 }
1841 }
1842 InputsRead &= ~FRAG_BITS_TEX_ANY;
1843
1844 /* fragment position treated as a texcoord */
1845 if (InputsRead & FRAG_BIT_WPOS) {
1846 cs->inputs[FRAG_ATTRIB_WPOS].refcount = 0;
1847 cs->inputs[FRAG_ATTRIB_WPOS].reg = get_hw_temp(rp);
1848 }
1849 InputsRead &= ~FRAG_BIT_WPOS;
1850
1851 /* Then primary colour */
1852 if (InputsRead & FRAG_BIT_COL0) {
1853 cs->inputs[FRAG_ATTRIB_COL0].refcount = 0;
1854 cs->inputs[FRAG_ATTRIB_COL0].reg = get_hw_temp(rp);
1855 }
1856 InputsRead &= ~FRAG_BIT_COL0;
1857
1858 /* Secondary color */
1859 if (InputsRead & FRAG_BIT_COL1) {
1860 cs->inputs[FRAG_ATTRIB_COL1].refcount = 0;
1861 cs->inputs[FRAG_ATTRIB_COL1].reg = get_hw_temp(rp);
1862 }
1863 InputsRead &= ~FRAG_BIT_COL1;
1864
1865 /* Anything else */
1866 if (InputsRead) {
1867 WARN_ONCE("Don't know how to handle inputs 0x%x\n",
1868 InputsRead);
1869 /* force read from hwreg 0 for now */
1870 for (i=0;i<32;i++)
1871 if (InputsRead & (1<<i)) cs->inputs[i].reg = 0;
1872 }
1873
1874 /* Pre-parse the mesa program, grabbing refcounts on input/temp regs.
1875 * That way, we can free up the reg when it's no longer needed
1876 */
1877 if (!mp->Base.Instructions) {
1878 ERROR("No instructions found in program\n");
1879 return;
1880 }
1881
1882 for (fpi=mp->Base.Instructions;fpi->Opcode != OPCODE_END; fpi++) {
1883 int idx;
1884
1885 for (i=0;i<3;i++) {
1886 idx = fpi->SrcReg[i].Index;
1887 switch (fpi->SrcReg[i].File) {
1888 case PROGRAM_TEMPORARY:
1889 if (!(temps_used & (1<<idx))) {
1890 cs->temps[idx].reg = -1;
1891 cs->temps[idx].refcount = 1;
1892 temps_used |= (1 << idx);
1893 } else
1894 cs->temps[idx].refcount++;
1895 break;
1896 case PROGRAM_INPUT:
1897 cs->inputs[idx].refcount++;
1898 break;
1899 default: break;
1900 }
1901 }
1902
1903 idx = fpi->DstReg.Index;
1904 if (fpi->DstReg.File == PROGRAM_TEMPORARY) {
1905 if (!(temps_used & (1<<idx))) {
1906 cs->temps[idx].reg = -1;
1907 cs->temps[idx].refcount = 1;
1908 temps_used |= (1 << idx);
1909 } else
1910 cs->temps[idx].refcount++;
1911 }
1912 }
1913 cs->temp_in_use = temps_used;
1914 }
1915
1916 static void update_params(struct r300_fragment_program *rp)
1917 {
1918 struct gl_fragment_program *mp = &rp->mesa_program;
1919 int i;
1920
1921 /* Ask Mesa nicely to fill in ParameterValues for us */
1922 if (rp->param_nr)
1923 _mesa_load_state_parameters(rp->ctx, mp->Base.Parameters);
1924
1925 for (i=0;i<rp->param_nr;i++)
1926 COPY_4V(rp->constant[rp->param[i].idx], rp->param[i].values);
1927
1928 rp->params_uptodate = GL_TRUE;
1929 }
1930
1931 void r300_translate_fragment_shader(r300ContextPtr r300, struct r300_fragment_program *rp)
1932 {
1933 struct r300_pfs_compile_state *cs = NULL;
1934
1935 if (!rp->translated) {
1936
1937 init_program(r300, rp);
1938 cs = rp->cs;
1939
1940 if (parse_program(rp) == GL_FALSE) {
1941 dump_program(rp);
1942 return;
1943 }
1944
1945 /* Finish off */
1946 cs->v_pos = cs->s_pos = MAX2(cs->v_pos, cs->s_pos);
1947 rp->node[rp->cur_node].alu_end =
1948 cs->v_pos - rp->node[rp->cur_node].alu_offset - 1;
1949 if (rp->node[rp->cur_node].tex_end < 0)
1950 rp->node[rp->cur_node].tex_end = 0;
1951 rp->alu_offset = 0;
1952 rp->alu_end = cs->v_pos - 1;
1953 rp->tex_offset = 0;
1954 rp->tex_end = rp->tex.length ? rp->tex.length - 1 : 0;
1955 assert(rp->node[rp->cur_node].alu_end >= 0);
1956 assert(rp->alu_end >= 0);
1957
1958 rp->translated = GL_TRUE;
1959 if (0) dump_program(rp);
1960 }
1961
1962 update_params(rp);
1963 }
1964
1965 /* just some random things... */
1966 static void dump_program(struct r300_fragment_program *rp)
1967 {
1968 int i;
1969 static int pc = 0;
1970
1971 fprintf(stderr, "pc=%d*************************************\n", pc++);
1972
1973 fprintf(stderr, "Mesa program:\n");
1974 fprintf(stderr, "-------------\n");
1975 _mesa_print_program(&rp->mesa_program.Base);
1976 fflush(stdout);
1977
1978 fprintf(stderr, "Hardware program\n");
1979 fprintf(stderr, "----------------\n");
1980
1981 fprintf(stderr, "tex:\n");
1982
1983 for(i=0;i<rp->tex.length;i++) {
1984 fprintf(stderr, "%08x\n", rp->tex.inst[i]);
1985 }
1986
1987 for (i=0;i<(rp->cur_node+1);i++) {
1988 fprintf(stderr, "NODE %d: alu_offset: %d, tex_offset: %d, "\
1989 "alu_end: %d, tex_end: %d\n", i,
1990 rp->node[i].alu_offset,
1991 rp->node[i].tex_offset,
1992 rp->node[i].alu_end,
1993 rp->node[i].tex_end);
1994 }
1995
1996 fprintf(stderr, "%08x\n",
1997 ((rp->tex_end << 16) | (R300_PFS_TEXI_0 >> 2)));
1998 for (i=0;i<=rp->tex_end;i++)
1999 fprintf(stderr, "%08x\n", rp->tex.inst[i]);
2000
2001 /* dump program in pretty_print_command_stream.tcl-readable format */
2002 fprintf(stderr, "%08x\n",
2003 ((rp->alu_end << 16) | (R300_PFS_INSTR0_0 >> 2)));
2004 for (i=0;i<=rp->alu_end;i++)
2005 fprintf(stderr, "%08x\n", rp->alu.inst[i].inst0);
2006
2007 fprintf(stderr, "%08x\n",
2008 ((rp->alu_end << 16) | (R300_PFS_INSTR1_0 >> 2)));
2009 for (i=0;i<=rp->alu_end;i++)
2010 fprintf(stderr, "%08x\n", rp->alu.inst[i].inst1);
2011
2012 fprintf(stderr, "%08x\n",
2013 ((rp->alu_end << 16) | (R300_PFS_INSTR2_0 >> 2)));
2014 for (i=0;i<=rp->alu_end;i++)
2015 fprintf(stderr, "%08x\n", rp->alu.inst[i].inst2);
2016
2017 fprintf(stderr, "%08x\n",
2018 ((rp->alu_end << 16) | (R300_PFS_INSTR3_0 >> 2)));
2019 for (i=0;i<=rp->alu_end;i++)
2020 fprintf(stderr, "%08x\n", rp->alu.inst[i].inst3);
2021
2022 fprintf(stderr, "00000000\n");
2023 }