91ec4f855c20fc306a70dbc202199e547443ed2d
[mesa.git] / src / mesa / drivers / dri / r300 / r300_fragprog.c
1 /*
2 * Copyright (C) 2005 Ben Skeggs.
3 *
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining
7 * a copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sublicense, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial
16 * portions of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 */
27
28 /*
29 * Authors:
30 * Ben Skeggs <darktama@iinet.net.au>
31 */
32
33 /*TODO'S
34 *
35 * - COS/SIN/SCS/LIT instructions
36 * - Depth write, WPOS/FOGC inputs
37 * - FogOption
38 * - Negate on individual components (implement in swizzle code?)
39 * - Verify results of opcodes for accuracy, I've only checked them
40 * in specific cases.
41 * - and more...
42 */
43
44 #include "glheader.h"
45 #include "macros.h"
46 #include "enums.h"
47
48 #include "program.h"
49 #include "program_instruction.h"
50 #include "r300_context.h"
51 #include "r300_fragprog.h"
52 #include "r300_reg.h"
53
/* Sentinel meaning "no valid value".  It is used as the s_op of operations
 * that list no scalar opcode in r300_fpop[], and as the hash value that ends
 * the searches over v_swiz[] and s_mask[]. */
54 #define PFS_INVAL 0xFFFFFFFF
/* Declares a local `cs` pointing at the compile state of the program being
 * translated.  Expands to a declaration and needs a variable named `rp` in
 * scope. */
55 #define COMPILE_STATE struct r300_pfs_compile_state *cs = rp->cs
56
/* Forward declarations.
 *
 * emit_arith() is defined further down but is already called by the swizzle
 * helpers that precede it.  The definition of dump_program() is not part of
 * the code visible around here. */
57 static void dump_program(struct r300_fragment_program *rp);
58 static void emit_arith(struct r300_fragment_program *rp, int op,
59 pfs_reg_t dest, int mask,
60 pfs_reg_t src0, pfs_reg_t src1, pfs_reg_t src2,
61 int flags);
62
63 /***************************************
64 * begin: useful data structures for fragment program generation
65 ***************************************/
66
67 /* description of r300 native hw instructions */
68 static const struct {
69 const char *name;
70 int argc;
71 int v_op;
72 int s_op;
73 } r300_fpop[] = {
74 { "MAD", 3, R300_FPI0_OUTC_MAD, R300_FPI2_OUTA_MAD },
75 { "DP3", 2, R300_FPI0_OUTC_DP3, R300_FPI2_OUTA_DP4 },
76 { "DP4", 2, R300_FPI0_OUTC_DP4, R300_FPI2_OUTA_DP4 },
77 { "MIN", 2, R300_FPI0_OUTC_MIN, R300_FPI2_OUTA_MIN },
78 { "MAX", 2, R300_FPI0_OUTC_MAX, R300_FPI2_OUTA_MAX },
79 { "CMP", 3, R300_FPI0_OUTC_CMP, R300_FPI2_OUTA_CMP },
80 { "FRC", 1, R300_FPI0_OUTC_FRC, R300_FPI2_OUTA_FRC },
81 { "EX2", 1, R300_FPI0_OUTC_REPL_ALPHA, R300_FPI2_OUTA_EX2 },
82 { "LG2", 1, R300_FPI0_OUTC_REPL_ALPHA, R300_FPI2_OUTA_LG2 },
83 { "RCP", 1, R300_FPI0_OUTC_REPL_ALPHA, R300_FPI2_OUTA_RCP },
84 { "RSQ", 1, R300_FPI0_OUTC_REPL_ALPHA, R300_FPI2_OUTA_RSQ },
85 { "REPL_ALPHA", 1, R300_FPI0_OUTC_REPL_ALPHA, PFS_INVAL },
86 { "CMPH", 3, R300_FPI0_OUTC_CMPH, PFS_INVAL },
87 };
88
/* Build a swizzle word from three component names (X, Y, Z, W, ONE, ZERO);
 * the fourth component is always SWIZZLE_ZERO. */
#define MAKE_SWZ3(x, y, z) \
	(MAKE_SWIZZLE4(SWIZZLE_##x, SWIZZLE_##y, SWIZZLE_##z, SWIZZLE_ZERO))

/* Source-slot classes.  add_src() shifts these left by the source number
 * (0..2), so the vector bits occupy bits 0-2 and the scalar bits 3-5 of a
 * slot's usage mask. */
#define SLOT_VECTOR	(1<<0)
#define SLOT_SCALAR	(1<<3)
#define SLOT_BOTH	(SLOT_VECTOR|SLOT_SCALAR)
97
98 /* vector swizzles r300 can support natively, with a couple of
99 * cases we handle specially
100 *
101 * pfs_reg_t.v_swz/pfs_reg_t.s_swz is an index into this table
102 **/
103 static const struct r300_pfs_swizzle {
104 GLuint hash; /* swizzle value this matches */
105 GLuint base; /* base value for hw swizzle */
106 GLuint stride; /* difference in base between arg0/1/2 */
107 GLuint flags;
108 } v_swiz[] = {
109 /* native swizzles */
110 { MAKE_SWZ3(X, Y, Z), R300_FPI0_ARGC_SRC0C_XYZ, 4, SLOT_VECTOR },
111 { MAKE_SWZ3(X, X, X), R300_FPI0_ARGC_SRC0C_XXX, 4, SLOT_VECTOR },
112 { MAKE_SWZ3(Y, Y, Y), R300_FPI0_ARGC_SRC0C_YYY, 4, SLOT_VECTOR },
113 { MAKE_SWZ3(Z, Z, Z), R300_FPI0_ARGC_SRC0C_ZZZ, 4, SLOT_VECTOR },
114 { MAKE_SWZ3(W, W, W), R300_FPI0_ARGC_SRC0A, 1, SLOT_SCALAR },
115 { MAKE_SWZ3(Y, Z, X), R300_FPI0_ARGC_SRC0C_YZX, 1, SLOT_VECTOR },
116 { MAKE_SWZ3(Z, X, Y), R300_FPI0_ARGC_SRC0C_ZXY, 1, SLOT_VECTOR },
117 { MAKE_SWZ3(W, Z, Y), R300_FPI0_ARGC_SRC0CA_WZY, 1, SLOT_BOTH },
118 { MAKE_SWZ3(ONE, ONE, ONE), R300_FPI0_ARGC_ONE, 0, 0},
119 { MAKE_SWZ3(ZERO, ZERO, ZERO), R300_FPI0_ARGC_ZERO, 0, 0},
120 { PFS_INVAL, R300_FPI0_ARGC_HALF, 0, 0},
121 { PFS_INVAL, 0, 0, 0},
122 };
123 #define SWIZZLE_XYZ 0
124 #define SWIZZLE_XXX 1
125 #define SWIZZLE_YYY 2
126 #define SWIZZLE_ZZZ 3
127 #define SWIZZLE_WWW 4
128 #define SWIZZLE_YZX 5
129 #define SWIZZLE_ZXY 6
130 #define SWIZZLE_WZY 7
131 #define SWIZZLE_111 8
132 #define SWIZZLE_000 9
133 #define SWIZZLE_HHH 10
134
135 #define SWZ_X_MASK (7 << 0)
136 #define SWZ_Y_MASK (7 << 3)
137 #define SWZ_Z_MASK (7 << 6)
138 #define SWZ_W_MASK (7 << 9)
139 /* used during matching of non-native swizzles */
140 static const struct {
141 GLuint hash; /* used to mask matching swizzle components */
142 int mask; /* actual outmask */
143 int count; /* count of components matched */
144 } s_mask[] = {
145 { SWZ_X_MASK|SWZ_Y_MASK|SWZ_Z_MASK, 1|2|4, 3},
146 { SWZ_X_MASK|SWZ_Y_MASK, 1|2, 2},
147 { SWZ_X_MASK|SWZ_Z_MASK, 1|4, 2},
148 { SWZ_Y_MASK|SWZ_Z_MASK, 2|4, 2},
149 { SWZ_X_MASK, 1, 1},
150 { SWZ_Y_MASK, 2, 1},
151 { SWZ_Z_MASK, 4, 1},
152 { PFS_INVAL, PFS_INVAL, PFS_INVAL}
153 };
154
155 /* mapping from SWIZZLE_* to r300 native values for scalar insns */
156 static const struct {
157 int base; /* hw value of swizzle */
158 int stride; /* difference between SRC0/1/2 */
159 GLuint flags;
160 } s_swiz[] = {
161 { R300_FPI2_ARGA_SRC0C_X, 3, SLOT_VECTOR },
162 { R300_FPI2_ARGA_SRC0C_Y, 3, SLOT_VECTOR },
163 { R300_FPI2_ARGA_SRC0C_Z, 3, SLOT_VECTOR },
164 { R300_FPI2_ARGA_SRC0A , 1, SLOT_SCALAR },
165 { R300_FPI2_ARGA_ZERO , 0, 0 },
166 { R300_FPI2_ARGA_ONE , 0, 0 },
167 { R300_FPI2_ARGA_HALF , 0, 0 }
168 };
169 #define SWIZZLE_HALF 6
170
171 /* boiler-plate reg, for convenience */
172 static const pfs_reg_t undef = {
173 type: REG_TYPE_TEMP,
174 index: 0,
175 v_swz: SWIZZLE_XYZ,
176 s_swz: SWIZZLE_W,
177 negate_v: 0,
178 negate_s: 0,
179 absolute: 0,
180 no_use: GL_FALSE,
181 valid: GL_FALSE
182 };
183
184 /* constant one source */
185 static const pfs_reg_t pfs_one = {
186 type: REG_TYPE_CONST,
187 index: 0,
188 v_swz: SWIZZLE_111,
189 s_swz: SWIZZLE_ONE,
190 valid: GL_TRUE
191 };
192
193 /* constant half source */
194 static const pfs_reg_t pfs_half = {
195 type: REG_TYPE_CONST,
196 index: 0,
197 v_swz: SWIZZLE_HHH,
198 s_swz: SWIZZLE_HALF,
199 valid: GL_TRUE
200 };
201
202 /* constant zero source */
203 static const pfs_reg_t pfs_zero = {
204 type: REG_TYPE_CONST,
205 index: 0,
206 v_swz: SWIZZLE_000,
207 s_swz: SWIZZLE_ZERO,
208 valid: GL_TRUE
209 };
210
211 /***************************************
212 * end: data structures
213 ***************************************/
214
/* Report a translation error: prints "<file>::<function>(): <message>" to
 * stderr and sets rp->error.  Needs a variable named `rp` in scope.  A
 * newline is appended to the format here, so a message that ends in "\n"
 * itself is followed by an empty line. */
215 #define ERROR(fmt, args...) do { \
216 fprintf(stderr, "%s::%s(): " fmt "\n",\
217 __FILE__, __func__, ##args); \
218 rp->error = GL_TRUE; \
219 } while(0)
220
221 static int get_hw_temp(struct r300_fragment_program *rp)
222 {
223 COMPILE_STATE;
224 int r = ffs(~cs->hwreg_in_use);
225 if (!r) {
226 ERROR("Out of hardware temps\n");
227 return 0;
228 }
229
230 cs->hwreg_in_use |= (1 << --r);
231 if (r > rp->max_temp_idx)
232 rp->max_temp_idx = r;
233
234 return r;
235 }
236
237 static int get_hw_temp_tex(struct r300_fragment_program *rp)
238 {
239 COMPILE_STATE;
240 int r;
241
242 r = ffs(~(cs->hwreg_in_use | cs->used_in_node));
243 if (!r)
244 return get_hw_temp(rp); /* Will cause an indirection */
245
246 cs->hwreg_in_use |= (1 << --r);
247 if (r > rp->max_temp_idx)
248 rp->max_temp_idx = r;
249
250 return r;
251 }
252
253 static void free_hw_temp(struct r300_fragment_program *rp, int idx)
254 {
255 COMPILE_STATE;
256 cs->hwreg_in_use &= ~(1<<idx);
257 }
258
259 static pfs_reg_t get_temp_reg(struct r300_fragment_program *rp)
260 {
261 COMPILE_STATE;
262 pfs_reg_t r = undef;
263
264 r.index = ffs(~cs->temp_in_use);
265 if (!r.index) {
266 ERROR("Out of program temps\n");
267 return r;
268 }
269 cs->temp_in_use |= (1 << --r.index);
270
271 cs->temps[r.index].refcount = 0xFFFFFFFF;
272 cs->temps[r.index].reg = -1;
273 r.valid = GL_TRUE;
274 return r;
275 }
276
277 static pfs_reg_t get_temp_reg_tex(struct r300_fragment_program *rp)
278 {
279 COMPILE_STATE;
280 pfs_reg_t r = undef;
281
282 r.index = ffs(~cs->temp_in_use);
283 if (!r.index) {
284 ERROR("Out of program temps\n");
285 return r;
286 }
287 cs->temp_in_use |= (1 << --r.index);
288
289 cs->temps[r.index].refcount = 0xFFFFFFFF;
290 cs->temps[r.index].reg = get_hw_temp_tex(rp);
291 r.valid = GL_TRUE;
292 return r;
293 }
294
295 static void free_temp(struct r300_fragment_program *rp, pfs_reg_t r)
296 {
297 COMPILE_STATE;
298 if (!(cs->temp_in_use & (1<<r.index))) return;
299
300 if (r.type == REG_TYPE_TEMP) {
301 free_hw_temp(rp, cs->temps[r.index].reg);
302 cs->temps[r.index].reg = -1;
303 cs->temp_in_use &= ~(1<<r.index);
304 } else if (r.type == REG_TYPE_INPUT) {
305 free_hw_temp(rp, cs->inputs[r.index].reg);
306 cs->inputs[r.index].reg = -1;
307 }
308 }
309
310 static pfs_reg_t emit_param4fv(struct r300_fragment_program *rp,
311 GLfloat *values)
312 {
313 pfs_reg_t r = undef;
314 r.type = REG_TYPE_CONST;
315 int pidx;
316
317 pidx = rp->param_nr++;
318 r.index = rp->const_nr++;
319 if (pidx >= PFS_NUM_CONST_REGS || r.index >= PFS_NUM_CONST_REGS) {
320 ERROR("Out of const/param slots!\n");
321 return r;
322 }
323
324 rp->param[pidx].idx = r.index;
325 rp->param[pidx].values = values;
326 rp->params_uptodate = GL_FALSE;
327
328 r.valid = GL_TRUE;
329 return r;
330 }
331
332 static pfs_reg_t emit_const4fv(struct r300_fragment_program *rp, GLfloat *cp)
333 {
334 pfs_reg_t r = undef;
335 r.type = REG_TYPE_CONST;
336
337 r.index = rp->const_nr++;
338 if (r.index >= PFS_NUM_CONST_REGS) {
339 ERROR("Out of hw constants!\n");
340 return r;
341 }
342
343 COPY_4V(rp->constant[r.index], cp);
344 r.valid = GL_TRUE;
345 return r;
346 }
347
348 static __inline pfs_reg_t negate(pfs_reg_t r)
349 {
350 r.negate_v = 1;
351 r.negate_s = 1;
352 return r;
353 }
354
355 /* Hack, to prevent clobbering sources used multiple times when
356 * emulating non-native instructions
357 */
358 static __inline pfs_reg_t keep(pfs_reg_t r)
359 {
360 r.no_use = GL_TRUE;
361 return r;
362 }
363
364 static __inline pfs_reg_t absolute(pfs_reg_t r)
365 {
366 r.absolute = 1;
367 return r;
368 }
369
370 static int swz_native(struct r300_fragment_program *rp,
371 pfs_reg_t src, pfs_reg_t *r, GLuint arbneg)
372 {
373 /* Native swizzle, nothing to see here */
374 src.negate_s = (arbneg >> 3) & 1;
375
376 if ((arbneg & 0x7) == 0x0) {
377 src.negate_v = 0;
378 *r = src;
379 } else if ((arbneg & 0x7) == 0x7) {
380 src.negate_v = 1;
381 *r = src;
382 } else {
383 if (!r->valid)
384 *r = get_temp_reg(rp);
385 src.negate_v = 1;
386 emit_arith(rp, PFS_OP_MAD, *r, arbneg & 0x7,
387 keep(src), pfs_one, pfs_zero, 0);
388 src.negate_v = 0;
389 emit_arith(rp, PFS_OP_MAD, *r,
390 (arbneg ^ 0x7) | WRITEMASK_W,
391 src, pfs_one, pfs_zero, 0);
392 }
393
394 return 3;
395 }
396
397 static int swz_emit_partial(struct r300_fragment_program *rp, pfs_reg_t src,
398 pfs_reg_t *r, int mask, int mc, GLuint arbneg)
399 {
400 GLuint tmp;
401 GLuint wmask = 0;
402
403 if (!r->valid)
404 *r = get_temp_reg(rp);
405
406 /* A partial match, src.v_swz/mask define what parts of the
407 * desired swizzle we match */
408 if (mc + s_mask[mask].count == 3) {
409 wmask = WRITEMASK_W;
410 src.negate_s = (arbneg >> 3) & 1;
411 }
412
413 tmp = arbneg & s_mask[mask].mask;
414 if (tmp) {
415 tmp = tmp ^ s_mask[mask].mask;
416 if (tmp) {
417 src.negate_v = 1;
418 emit_arith(rp, PFS_OP_MAD, *r,
419 arbneg & s_mask[mask].mask,
420 keep(src), pfs_one, pfs_zero, 0);
421 src.negate_v = 0;
422 if (!wmask) src.no_use = GL_TRUE;
423 else src.no_use = GL_FALSE;
424 emit_arith(rp, PFS_OP_MAD, *r, tmp | wmask,
425 src, pfs_one, pfs_zero, 0);
426 } else {
427 src.negate_v = 1;
428 if (!wmask) src.no_use = GL_TRUE;
429 else src.no_use = GL_FALSE;
430 emit_arith(rp, PFS_OP_MAD, *r,
431 (arbneg & s_mask[mask].mask) | wmask,
432 src, pfs_one, pfs_zero, 0);
433 src.negate_v = 0;
434 }
435 } else {
436 if (!wmask) src.no_use = GL_TRUE;
437 else src.no_use = GL_FALSE;
438 emit_arith(rp, PFS_OP_MAD, *r,
439 s_mask[mask].mask | wmask,
440 src, pfs_one, pfs_zero, 0);
441 }
442
443 return s_mask[mask].count;
444 }
445
/* swizzle(r, x, y, z, w): shorthand for do_swizzle() without negation.
 * The four component names (X, Y, Z, W, ONE, ZERO) are packed three bits
 * each, x at bit 0 up to w at bit 9.  Needs a variable named `rp` in
 * scope. */
446 #define swizzle(r, x, y, z, w) do_swizzle(rp, r, \
447 ((SWIZZLE_##x<<0)| \
448 (SWIZZLE_##y<<3)| \
449 (SWIZZLE_##z<<6)| \
450 (SWIZZLE_##w<<9)), \
451 0)
452
/* Apply swizzle arbswz and per-component negation arbneg to src.
 *
 * arbswz packs four component selectors, three bits each; arbneg has one
 * negate bit per component (bits 0-2 = xyz, bit 3 = w).
 *
 * Returns a register that reads as the swizzled source: either src itself
 * with adjusted swizzle/negate fields (native case) or a temp into which
 * the result was assembled from several MADs.  If no combination matches,
 * an error is flagged on rp and whatever has been built so far is
 * returned.
 */
453 static pfs_reg_t do_swizzle(struct r300_fragment_program *rp,
454 pfs_reg_t src, GLuint arbswz, GLuint arbneg)
455 {
456 pfs_reg_t r = undef;
457
/* c_mask: current row of s_mask[] (component subset being tried);
 * v_matched: number of vector components produced so far. */
458 int c_mask = 0;
459 int v_matched = 0;
460
461 /* If swizzling from something without an XYZW native swizzle,
462 * emit result to a temp, and do new swizzle from the temp.
463 */
464 if (src.v_swz != SWIZZLE_XYZ || src.s_swz != SWIZZLE_W) {
465 pfs_reg_t temp = get_temp_reg(rp);
466 emit_arith(rp, PFS_OP_MAD, temp, WRITEMASK_XYZW, src, pfs_one,
467 pfs_zero, 0);
468 src = temp;
469 }
/* The scalar selector is taken directly from component 3 of arbswz. */
470 src.s_swz = GET_SWZ(arbswz, 3);
471
/* Outer loop: component subsets of s_mask[], starting with the full xyz
 * triple.  Inner loop: rows of v_swiz[], starting at src.v_swz and
 * stopping at the first row whose hash is PFS_INVAL. */
472 do {
473 do {
/* The candidate swizzle restricted to the components of the current
 * subset. */
474 #define CUR_HASH (v_swiz[src.v_swz].hash & s_mask[c_mask].hash)
475 if (CUR_HASH == (arbswz & s_mask[c_mask].hash)) {
476 if (s_mask[c_mask].count == 3)
477 v_matched += swz_native(rp, src, &r,
478 arbneg);
479 else
480 v_matched += swz_emit_partial(rp, src,
481 &r,
482 c_mask,
483 v_matched,
484 arbneg);
485
486 if (v_matched == 3)
487 return r;
488
489 /* Fill with something invalid.. all 0's was
490 * wrong before, matched SWIZZLE_X. So all
491 * 1's will be okay for now */
492 arbswz |= (PFS_INVAL & s_mask[c_mask].hash);
493 }
494 } while(v_swiz[++src.v_swz].hash != PFS_INVAL);
/* Restart the table walk from the first row for the next subset. */
495 src.v_swz = SWIZZLE_XYZ;
496 } while (s_mask[++c_mask].hash != PFS_INVAL);
497
498 ERROR("should NEVER get here\n");
499 return r;
500 }
501
502 static pfs_reg_t t_src(struct r300_fragment_program *rp,
503 struct prog_src_register fpsrc)
504 {
505 pfs_reg_t r = undef;
506 #if 0
507 pfs_reg_t n = undef;
508 #endif
509
510 switch (fpsrc.File) {
511 case PROGRAM_TEMPORARY:
512 r.index = fpsrc.Index;
513 r.valid = GL_TRUE;
514 break;
515 case PROGRAM_INPUT:
516 r.index = fpsrc.Index;
517 r.type = REG_TYPE_INPUT;
518 r.valid = GL_TRUE;
519 break;
520 case PROGRAM_LOCAL_PARAM:
521 r = emit_param4fv(rp,
522 rp->mesa_program.Base.LocalParams[fpsrc.Index]);
523 break;
524 case PROGRAM_ENV_PARAM:
525 r = emit_param4fv(rp,
526 rp->ctx->FragmentProgram.Parameters[fpsrc.Index]);
527 break;
528 case PROGRAM_STATE_VAR:
529 case PROGRAM_NAMED_PARAM:
530 r = emit_param4fv(rp,
531 rp->mesa_program.Base.Parameters->ParameterValues[fpsrc.Index]);
532 break;
533 default:
534 ERROR("unknown SrcReg->File %x\n", fpsrc.File);
535 return r;
536 }
537
538 /* no point swizzling ONE/ZERO/HALF constants... */
539 if (r.v_swz < SWIZZLE_111 || r.s_swz < SWIZZLE_ZERO)
540 r = do_swizzle(rp, r, fpsrc.Swizzle, fpsrc.NegateBase);
541 #if 0
542 /* WRONG! Need to be able to do individual component negation,
543 * should probably handle this in the swizzling code unless
544 * all components are negated, then we can do this natively */
545 if ((fpsrc.NegateBase & 0xf) == 0xf)
546 r.negate = GL_TRUE;
547
548 r.negate_s = (fpsrc.NegateBase >> 3) & 1;
549
550 if ((fpsrc.NegateBase & 0x7) == 0x0) {
551 r.negate_v = 0;
552 } else if ((fpsrc.NegateBase & 0x7) == 0x7) {
553 r.negate_v = 1;
554 } else {
555 if (r.type != REG_TYPE_TEMP) {
556 n = get_temp_reg(rp);
557 emit_arith(rp, PFS_OP_MAD, n, 0x7 ^ fpsrc.NegateBase,
558 keep(r), pfs_one, pfs_zero, 0);
559 r.negate_v = 1;
560 emit_arith(rp, PFS_OP_MAD, n,
561 fpsrc.NegateBase & 0x7 | WRITEMASK_W,
562 r, pfs_one, pfs_zero, 0);
563 r.negate_v = 0;
564 r = n;
565 } else {
566 r.negate_v = 1;
567 emit_arith(rp, PFS_OP_MAD, r,
568 fpsrc.NegateBase & 0x7 | WRITEMASK_W,
569 r, pfs_one, pfs_zero, 0);
570 r.negate_v = 0;
571 }
572 }
573 #endif
574
575 return r;
576 }
577
578 static pfs_reg_t t_scalar_src(struct r300_fragment_program *rp,
579 struct prog_src_register fpsrc)
580 {
581 struct prog_src_register src = fpsrc;
582 int sc = GET_SWZ(fpsrc.Swizzle, 0); /* X */
583
584 src.Swizzle = ((sc<<0)|(sc<<3)|(sc<<6)|(sc<<9));
585
586 return t_src(rp, src);
587 }
588
589 static pfs_reg_t t_dst(struct r300_fragment_program *rp,
590 struct prog_dst_register dest) {
591 pfs_reg_t r = undef;
592
593 switch (dest.File) {
594 case PROGRAM_TEMPORARY:
595 r.index = dest.Index;
596 r.valid = GL_TRUE;
597 return r;
598 case PROGRAM_OUTPUT:
599 r.type = REG_TYPE_OUTPUT;
600 switch (dest.Index) {
601 case FRAG_RESULT_COLR:
602 case FRAG_RESULT_DEPR:
603 r.index = dest.Index;
604 r.valid = GL_TRUE;
605 return r;
606 default:
607 ERROR("Bad DstReg->Index 0x%x\n", dest.Index);
608 return r;
609 }
610 default:
611 ERROR("Bad DstReg->File 0x%x\n", dest.File);
612 return r;
613 }
614 }
615
616 static int t_hw_src(struct r300_fragment_program *rp, pfs_reg_t src,
617 GLboolean tex)
618 {
619 COMPILE_STATE;
620 int idx;
621
622 switch (src.type) {
623 case REG_TYPE_TEMP:
624 /* NOTE: if reg==-1 here, a source is being read that
625 * hasn't been written to. Undefined results */
626 if (cs->temps[src.index].reg == -1)
627 cs->temps[src.index].reg = get_hw_temp(rp);
628 idx = cs->temps[src.index].reg;
629
630 if (!src.no_use && (--cs->temps[src.index].refcount == 0))
631 free_temp(rp, src);
632 break;
633 case REG_TYPE_INPUT:
634 idx = cs->inputs[src.index].reg;
635
636 if (!src.no_use && (--cs->inputs[src.index].refcount == 0))
637 free_hw_temp(rp, cs->inputs[src.index].reg);
638 break;
639 case REG_TYPE_CONST:
640 return (src.index | SRC_CONST);
641 default:
642 ERROR("Invalid type for source reg\n");
643 return (0 | SRC_CONST);
644 }
645
646 if (!tex) cs->used_in_node |= (1 << idx);
647
648 return idx;
649 }
650
651 static int t_hw_dst(struct r300_fragment_program *rp, pfs_reg_t dest,
652 GLboolean tex)
653 {
654 COMPILE_STATE;
655 int idx;
656 assert(dest.valid);
657
658 switch (dest.type) {
659 case REG_TYPE_TEMP:
660 if (cs->temps[dest.index].reg == -1) {
661 if (!tex)
662 cs->temps[dest.index].reg = get_hw_temp(rp);
663 else
664 cs->temps[dest.index].reg = get_hw_temp_tex(rp);
665 }
666 idx = cs->temps[dest.index].reg;
667
668 if (!dest.no_use && (--cs->temps[dest.index].refcount == 0))
669 free_temp(rp, dest);
670
671 cs->dest_in_node |= (1 << idx);
672 cs->used_in_node |= (1 << idx);
673 break;
674 case REG_TYPE_OUTPUT:
675 switch (dest.index) {
676 case FRAG_RESULT_COLR:
677 rp->node[rp->cur_node].flags |= R300_PFS_NODE_OUTPUT_COLOR;
678 break;
679 case FRAG_RESULT_DEPR:
680 rp->node[rp->cur_node].flags |= R300_PFS_NODE_OUTPUT_DEPTH;
681 break;
682 }
683 return dest.index;
684 break;
685 default:
686 ERROR("invalid dest reg type %d\n", dest.type);
687 return 0;
688 }
689
690 return idx;
691 }
692
693 static void emit_nop(struct r300_fragment_program *rp, GLuint mask,
694 GLboolean sync)
695 {
696 COMPILE_STATE;
697
698 if (sync)
699 cs->v_pos = cs->s_pos = MAX2(cs->v_pos, cs->s_pos);
700
701 if (mask & WRITEMASK_XYZ) {
702 rp->alu.inst[cs->v_pos].inst0 = NOP_INST0;
703 rp->alu.inst[cs->v_pos].inst1 = NOP_INST1;
704 cs->v_pos++;
705 }
706
707 if (mask & WRITEMASK_W) {
708 rp->alu.inst[cs->s_pos].inst2 = NOP_INST2;
709 rp->alu.inst[cs->s_pos].inst3 = NOP_INST3;
710 cs->s_pos++;
711 }
712 }
713
/* Emit one texture instruction (or KIL) for program instruction fpi.
 *
 * opcode is the hardware value placed in the opcode field of the texture
 * instruction word.  The coordinate comes from fpi->SrcReg[0]; unless the
 * opcode is KIL, the destination comes from fpi->DstReg.  A result that is
 * meant for an output register is fetched into a temp first and copied to
 * the output with a MAD afterwards.
 *
 * If the coordinate register was written, or the destination register was
 * used, earlier in the current node, the current node is closed and a new
 * one is started.  Exceeding PFS_MAX_TEX_INDIRECT nodes flags an error on
 * rp and returns without emitting the instruction.
 */
714 static void emit_tex(struct r300_fragment_program *rp,
715 struct prog_instruction *fpi,
716 int opcode)
717 {
718 COMPILE_STATE;
719 pfs_reg_t coord = t_src(rp, fpi->SrcReg[0]);
720 pfs_reg_t dest = undef, rdest = undef;
/* Snapshot of the node's register usage, taken before the registers of
 * this instruction are resolved below. */
721 GLuint din = cs->dest_in_node, uin = cs->used_in_node;
722 int unit = fpi->TexSrcUnit;
723 int hwsrc, hwdest;
724
725 /* Resolve source/dest to hardware registers */
726 hwsrc = t_hw_src(rp, coord, GL_TRUE);
727 if (opcode != R300_FPITX_OP_KIL) {
728 dest = t_dst(rp, fpi->DstReg);
729
730 /* r300 doesn't seem to be able to do TEX->output reg */
731 if (dest.type == REG_TYPE_OUTPUT) {
732 rdest = dest;
733 dest = get_temp_reg_tex(rp);
734 }
735 hwdest = t_hw_dst(rp, dest, GL_TRUE);
736
737 /* Use a temp that hasn't been used in this node, rather
738 * than causing an indirection
739 */
740 if (uin & (1 << hwdest)) {
741 free_hw_temp(rp, hwdest);
742 hwdest = get_hw_temp_tex(rp);
743 cs->temps[dest.index].reg = hwdest;
744 }
745 } else {
/* KIL: no destination is translated; destination and unit fields are 0. */
746 hwdest = 0;
747 unit = 0;
748 }
749
750 /* Indirection if source has been written in this node, or if the
751 * dest has been read/written in this node
752 */
753 if ((coord.type != REG_TYPE_CONST && (din & (1<<hwsrc))) ||
754 (uin & (1<<hwdest))) {
755
756 /* Finish off current node */
757 cs->v_pos = cs->s_pos = MAX2(cs->v_pos, cs->s_pos);
758 if (rp->node[rp->cur_node].alu_offset == cs->v_pos) {
759 /* No alu instructions in the node? Emit a NOP. */
760 emit_nop(rp, WRITEMASK_XYZW, GL_TRUE);
761 cs->v_pos = cs->s_pos = MAX2(cs->v_pos, cs->s_pos);
762 }
763
/* alu_end is stored relative to the node's alu_offset, as the index of
 * the node's last ALU instruction. */
764 rp->node[rp->cur_node].alu_end =
765 cs->v_pos - rp->node[rp->cur_node].alu_offset - 1;
766 assert(rp->node[rp->cur_node].alu_end >= 0);
767
768 if (++rp->cur_node >= PFS_MAX_TEX_INDIRECT) {
769 ERROR("too many levels of texture indirection\n");
770 return;
771 }
772
773 /* Start new node */
774 rp->node[rp->cur_node].tex_offset = rp->tex.length;
775 rp->node[rp->cur_node].alu_offset = cs->v_pos;
776 rp->node[rp->cur_node].tex_end = -1;
777 rp->node[rp->cur_node].alu_end = -1;
778 rp->node[rp->cur_node].flags = 0;
779 cs->used_in_node = 0;
780 cs->dest_in_node = 0;
781 }
782
783 if (rp->cur_node == 0)
784 rp->first_node_has_tex = 1;
785
/* Assemble the texture instruction word and append it to the program. */
786 rp->tex.inst[rp->tex.length++] = 0
787 | (hwsrc << R300_FPITX_SRC_SHIFT)
788 | (hwdest << R300_FPITX_DST_SHIFT)
789 | (unit << R300_FPITX_IMAGE_SHIFT)
790 /* not entirely sure about this */
791 | (opcode << R300_FPITX_OPCODE_SHIFT);
792
/* Record this instruction's registers in the node's usage masks. */
793 cs->dest_in_node |= (1 << hwdest);
794 if (coord.type != REG_TYPE_CONST)
795 cs->used_in_node |= (1 << hwsrc);
796
/* A node started above has tex_end == -1, so its first texture
 * instruction brings it to 0. */
797 rp->node[rp->cur_node].tex_end++;
798
799 /* Copy from temp to output if needed */
800 if (rdest.valid) {
801 emit_arith(rp, PFS_OP_MAD, rdest, WRITEMASK_XYZW, dest,
802 pfs_one, pfs_zero, 0);
803 free_temp(rp, dest);
804 }
805 }
806
807 /* Add sources to FPI1/FPI3 lists. If source is already on list,
808 * reuse the index instead of wasting a source.
809 */
810 static int add_src(struct r300_fragment_program *rp, int reg, int pos,
811 int srcmask)
812 {
813 COMPILE_STATE;
814 int csm, i;
815
816 /* Look for matches */
817 for (i=0,csm=srcmask; i<3; i++,csm=csm<<1) {
818 /* If sources have been allocated in this position(s)... */
819 if ((cs->slot[pos].umask & csm) == csm) {
820 /* ... and the register number(s) match, re-use the
821 source */
822 if (srcmask == SLOT_VECTOR &&
823 cs->slot[pos].vsrc[i] == reg)
824 return i;
825 if (srcmask == SLOT_SCALAR &&
826 cs->slot[pos].ssrc[i] == reg)
827 return i;
828 if (srcmask == SLOT_BOTH &&
829 cs->slot[pos].vsrc[i] == reg &&
830 cs->slot[pos].ssrc[i] == reg)
831 return i;
832 }
833 }
834
835 /* Look for free spaces */
836 for (i=0,csm=srcmask; i<3; i++,csm=csm<<1) {
837 /* If the position(s) haven't been allocated */
838 if ((cs->slot[pos].umask & csm) == 0) {
839 cs->slot[pos].umask |= csm;
840
841 if (srcmask & SLOT_VECTOR)
842 cs->slot[pos].vsrc[i] = reg;
843 if (srcmask & SLOT_SCALAR)
844 cs->slot[pos].ssrc[i] = reg;
845 return i;
846 }
847 }
848
849 //ERROR("Failed to allocate sources in FPI1/FPI3!\n");
850 return 0;
851 }
852
853 /* Determine whether or not to position opcode in the same ALU slot for both
854 * vector and scalar portions of an instruction.
855 *
856 * It's not necessary to force the first case, but it makes disassembled
857 * shaders easier to read.
858 */
859 static GLboolean force_same_slot(int vop, int sop,
860 GLboolean emit_vop, GLboolean emit_sop,
861 int argc, pfs_reg_t *src)
862 {
863 int i;
864
865 if (emit_vop && emit_sop)
866 return GL_TRUE;
867
868 if (emit_vop && vop == R300_FPI0_OUTC_REPL_ALPHA)
869 return GL_TRUE;
870
871 if (emit_vop) {
872 for (i=0;i<argc;i++)
873 if (src[i].v_swz == SWIZZLE_WZY)
874 return GL_TRUE;
875 }
876
877 return GL_FALSE;
878 }
879
/* Emit one ALU instruction.
 *
 * op indexes r300_fpop[]; dest and mask give the destination register and
 * its write mask; src0..src2 are the sources; flags may contain
 * PFS_FLAG_SAT to saturate the result.
 *
 * The vector (xyz) and scalar (w) halves of an instruction are tracked
 * with separate write positions, cs->v_pos and cs->s_pos, so the two
 * halves of one call may end up in different instruction slots unless they
 * are forced together below.
 *
 * Arguments beyond the operation's argc are encoded as zero, but note that
 * the slot-placement loop looks at the swizzles of all three sources
 * regardless of argc.
 */
880 static void emit_arith(struct r300_fragment_program *rp, int op,
881 pfs_reg_t dest, int mask,
882 pfs_reg_t src0, pfs_reg_t src1, pfs_reg_t src2,
883 int flags)
884 {
885 COMPILE_STATE;
886 pfs_reg_t src[3] = { src0, src1, src2 };
887 int hwsrc[3], sswz[3], vswz[3];
888 int hwdest;
889 GLboolean emit_vop = GL_FALSE, emit_sop = GL_FALSE;
890 int vop, sop, argc;
891 int vpos, spos;
892 int i;
893
894 vop = r300_fpop[op].v_op;
895 sop = r300_fpop[op].s_op;
896 argc = r300_fpop[op].argc;
897
/* A vector part is needed when any of xyz is written or the op is DP3; a
 * scalar part when w is written or the vector opcode is REPL_ALPHA. */
898 if ((mask & WRITEMASK_XYZ) || vop == R300_FPI0_OUTC_DP3)
899 emit_vop = GL_TRUE;
900 if ((mask & WRITEMASK_W) || vop == R300_FPI0_OUTC_REPL_ALPHA)
901 emit_sop = GL_TRUE;
902
/* No vector part is emitted for a write to the depth output. */
903 if (dest.type == REG_TYPE_OUTPUT && dest.index == FRAG_RESULT_DEPR)
904 emit_vop = GL_FALSE;
905
906 if (force_same_slot(vop, sop, emit_vop, emit_sop, argc, src)) {
907 vpos = spos = MAX2(cs->v_pos, cs->s_pos);
908 } else {
909 vpos = cs->v_pos;
910 spos = cs->s_pos;
911 /* Here is where we'd decide on where a safe place is to
912 * combine this instruction with a previous one.
913 *
914 * This is extremely simple for now.. if a source depends
915 * on the opposite stream, force the same instruction.
916 */
917 for (i=0;i<3;i++) {
918 if (emit_vop &&
919 (v_swiz[src[i].v_swz].flags & SLOT_SCALAR)) {
920 vpos = spos = MAX2(vpos, spos);
921 break;
922 }
923 if (emit_sop &&
924 (s_swiz[src[i].s_swz].flags & SLOT_VECTOR)) {
925 vpos = spos = MAX2(vpos, spos);
926 break;
927 }
928 }
929 }
930
931 /* - Convert src->hwsrc, record for FPI1/FPI3
932 * - Determine ARG parts of FPI0/FPI2, unused args are filled
933 * with ARG_ZERO.
934 */
935 for (i=0;i<3;i++) {
936 int srcpos;
937
938 if (i >= argc) {
939 vswz[i] = R300_FPI0_ARGC_ZERO;
940 sswz[i] = R300_FPI2_ARGA_ZERO;
941 continue;
942 }
943
944 hwsrc[i] = t_hw_src(rp, src[i], GL_FALSE);
945
/* Argument selector = table base + source number * stride, combined
 * with the negate/absolute modifier bits. */
946 if (emit_vop && vop != R300_FPI0_OUTC_REPL_ALPHA) {
947 srcpos = add_src(rp, hwsrc[i], vpos,
948 v_swiz[src[i].v_swz].flags);
949 vswz[i] = (v_swiz[src[i].v_swz].base +
950 (srcpos * v_swiz[src[i].v_swz].stride)) |
951 (src[i].negate_v ? ARG_NEG : 0) |
952 (src[i].absolute ? ARG_ABS : 0);
953 } else vswz[i] = R300_FPI0_ARGC_ZERO;
954
955 if (emit_sop) {
956 srcpos = add_src(rp, hwsrc[i], spos,
957 s_swiz[src[i].s_swz].flags);
958 sswz[i] = (s_swiz[src[i].s_swz].base +
959 (srcpos * s_swiz[src[i].s_swz].stride)) |
960 (src[i].negate_s ? ARG_NEG : 0) |
961 (src[i].absolute ? ARG_ABS : 0);
962 } else sswz[i] = R300_FPI2_ARGA_ZERO;
963 }
/* The destination is resolved only after all sources, so a source that is
 * released by its last read can be reused as the destination register. */
964 hwdest = t_hw_dst(rp, dest, GL_FALSE);
965
966 if (flags & PFS_FLAG_SAT) {
967 vop |= R300_FPI0_OUTC_SAT;
968 sop |= R300_FPI2_OUTA_SAT;
969 }
970
971 /* Throw the pieces together and get FPI0/1 */
/* inst1 is rewritten from the slot's recorded vector sources on every
 * call, whether or not a vector part is emitted. */
972 rp->alu.inst[vpos].inst1 =
973 ((cs->slot[vpos].vsrc[0] << R300_FPI1_SRC0C_SHIFT) |
974 (cs->slot[vpos].vsrc[1] << R300_FPI1_SRC1C_SHIFT) |
975 (cs->slot[vpos].vsrc[2] << R300_FPI1_SRC2C_SHIFT));
976 if (emit_vop) {
977 rp->alu.inst[vpos].inst0 = vop |
978 (vswz[0] << R300_FPI0_ARG0C_SHIFT) |
979 (vswz[1] << R300_FPI0_ARG1C_SHIFT) |
980 (vswz[2] << R300_FPI0_ARG2C_SHIFT);
981
982 rp->alu.inst[vpos].inst1 |= hwdest << R300_FPI1_DSTC_SHIFT;
983 if (dest.type == REG_TYPE_OUTPUT) {
984 if (dest.index == FRAG_RESULT_COLR) {
985 rp->alu.inst[vpos].inst1 |=
986 (mask & WRITEMASK_XYZ) << R300_FPI1_DSTC_OUTPUT_MASK_SHIFT;
987 } else assert(0);
988 } else {
989 rp->alu.inst[vpos].inst1 |=
990 (mask & WRITEMASK_XYZ) << R300_FPI1_DSTC_REG_MASK_SHIFT;
991 }
992 cs->v_pos = vpos+1;
993 } else if (spos >= vpos)
/* Scalar-only instruction at or beyond the vector position: the vector
 * half of that slot is filled with a NOP. */
994 rp->alu.inst[spos].inst0 = NOP_INST0;
995
996 /* And now FPI2/3 */
997 rp->alu.inst[spos].inst3 =
998 ((cs->slot[spos].ssrc[0] << R300_FPI3_SRC0A_SHIFT) |
999 (cs->slot[spos].ssrc[1] << R300_FPI3_SRC1A_SHIFT) |
1000 (cs->slot[spos].ssrc[2] << R300_FPI3_SRC2A_SHIFT));
1001 if (emit_sop) {
1002 rp->alu.inst[spos].inst2 = sop |
1003 sswz[0] << R300_FPI2_ARG0A_SHIFT |
1004 sswz[1] << R300_FPI2_ARG1A_SHIFT |
1005 sswz[2] << R300_FPI2_ARG2A_SHIFT;
1006
/* The scalar destination is only encoded when w is actually written. */
1007 if (mask & WRITEMASK_W) {
1008 if (dest.type == REG_TYPE_OUTPUT) {
1009 if (dest.index == FRAG_RESULT_COLR) {
1010 rp->alu.inst[spos].inst3 |=
1011 (hwdest << R300_FPI3_DSTA_SHIFT) | R300_FPI3_DSTA_OUTPUT;
1012 } else if (dest.index == FRAG_RESULT_DEPR) {
1013 rp->alu.inst[spos].inst3 |= R300_FPI3_DSTA_DEPTH;
1014 } else assert(0);
1015 } else {
1016 rp->alu.inst[spos].inst3 |=
1017 (hwdest << R300_FPI3_DSTA_SHIFT) | R300_FPI3_DSTA_REG;
1018 }
1019 }
1020 cs->s_pos = spos+1;
1021 } else if (vpos >= spos)
/* Vector-only instruction at or beyond the scalar position: the scalar
 * half of that slot is filled with a NOP. */
1022 rp->alu.inst[vpos].inst2 = NOP_INST2;
1023
1024 return;
1025 };
1026
/* Disabled helper (compiled out with #if 0): would return an INPUT register
 * for fragment attribute attr, or flag an error and return the invalid
 * register if the program's InputsRead mask does not contain it. */
1027 #if 0
1028 static pfs_reg_t get_attrib(struct r300_fragment_program *rp, GLuint attr)
1029 {
1030 struct gl_fragment_program *mp = &rp->mesa_program;
1031 pfs_reg_t r = undef;
1032
1033 if (!(mp->Base.InputsRead & (1<<attr))) {
1034 ERROR("Attribute %d was not provided!\n", attr);
1035 return undef;
1036 }
1037
1038 r.type = REG_TYPE_INPUT;
1039 r.index = attr;
1040 r.valid = GL_TRUE;
1041 return r;
1042 }
1043 #endif
1044
1045 static GLboolean parse_program(struct r300_fragment_program *rp)
1046 {
1047 struct gl_fragment_program *mp = &rp->mesa_program;
1048 const struct prog_instruction *inst = mp->Base.Instructions;
1049 struct prog_instruction *fpi;
1050 pfs_reg_t src[3], dest, temp;
1051 pfs_reg_t cnst;
1052 int flags, mask = 0;
1053 GLfloat cnstv[4] = {0.0, 0.0, 0.0, 0.0};
1054
1055 if (!inst || inst[0].Opcode == OPCODE_END) {
1056 ERROR("empty program?\n");
1057 return GL_FALSE;
1058 }
1059
1060 for (fpi=mp->Base.Instructions; fpi->Opcode != OPCODE_END; fpi++) {
1061 if (fpi->SaturateMode == SATURATE_ZERO_ONE)
1062 flags = PFS_FLAG_SAT;
1063 else
1064 flags = 0;
1065
1066 if (fpi->Opcode != OPCODE_KIL) {
1067 dest = t_dst(rp, fpi->DstReg);
1068 mask = fpi->DstReg.WriteMask;
1069 }
1070
1071 switch (fpi->Opcode) {
1072 case OPCODE_ABS:
1073 src[0] = t_src(rp, fpi->SrcReg[0]);
1074 emit_arith(rp, PFS_OP_MAD, dest, mask,
1075 absolute(src[0]), pfs_one, pfs_zero,
1076 flags);
1077 break;
1078 case OPCODE_ADD:
1079 src[0] = t_src(rp, fpi->SrcReg[0]);
1080 src[1] = t_src(rp, fpi->SrcReg[1]);
1081 emit_arith(rp, PFS_OP_MAD, dest, mask,
1082 src[0], pfs_one, src[1],
1083 flags);
1084 break;
1085 case OPCODE_CMP:
1086 src[0] = t_src(rp, fpi->SrcReg[0]);
1087 src[1] = t_src(rp, fpi->SrcReg[1]);
1088 src[2] = t_src(rp, fpi->SrcReg[2]);
1089 /* ARB_f_p - if src0.c < 0.0 ? src1.c : src2.c
1090 * r300 - if src2.c < 0.0 ? src1.c : src0.c
1091 */
1092 emit_arith(rp, PFS_OP_CMP, dest, mask,
1093 src[2], src[1], src[0],
1094 flags);
1095 break;
1096 case OPCODE_COS:
1097 ERROR("COS not implemented\n");
1098 break;
1099 case OPCODE_DP3:
1100 src[0] = t_src(rp, fpi->SrcReg[0]);
1101 src[1] = t_src(rp, fpi->SrcReg[1]);
1102 emit_arith(rp, PFS_OP_DP3, dest, mask,
1103 src[0], src[1], undef,
1104 flags);
1105 break;
1106 case OPCODE_DP4:
1107 src[0] = t_src(rp, fpi->SrcReg[0]);
1108 src[1] = t_src(rp, fpi->SrcReg[1]);
1109 emit_arith(rp, PFS_OP_DP4, dest, mask,
1110 src[0], src[1], undef,
1111 flags);
1112 break;
1113 case OPCODE_DPH:
1114 src[0] = t_src(rp, fpi->SrcReg[0]);
1115 src[1] = t_src(rp, fpi->SrcReg[1]);
1116 /* src0.xyz1 -> temp
1117 * DP4 dest, temp, src1
1118 */
1119 #if 0
1120 temp = get_temp_reg(rp);
1121 src[0].s_swz = SWIZZLE_ONE;
1122 emit_arith(rp, PFS_OP_MAD, temp, mask,
1123 src[0], pfs_one, pfs_zero,
1124 0);
1125 emit_arith(rp, PFS_OP_DP4, dest, mask,
1126 temp, src[1], undef,
1127 flags);
1128 free_temp(rp, temp);
1129 #else
1130 emit_arith(rp, PFS_OP_DP4, dest, mask,
1131 swizzle(src[0], X, Y, Z, ONE), src[1],
1132 undef, flags);
1133 #endif
1134 break;
1135 case OPCODE_DST:
1136 src[0] = t_src(rp, fpi->SrcReg[0]);
1137 src[1] = t_src(rp, fpi->SrcReg[1]);
1138 /* dest.y = src0.y * src1.y */
1139 if (mask & WRITEMASK_Y)
1140 emit_arith(rp, PFS_OP_MAD, dest, WRITEMASK_Y,
1141 keep(src[0]), keep(src[1]),
1142 pfs_zero, flags);
1143 /* dest.z = src0.z */
1144 if (mask & WRITEMASK_Z)
1145 emit_arith(rp, PFS_OP_MAD, dest, WRITEMASK_Z,
1146 src[0], pfs_one, pfs_zero, flags);
1147 /* result.x = 1.0
1148 * result.w = src1.w */
1149 if (mask & WRITEMASK_XW) {
1150 src[1].v_swz = SWIZZLE_111; /* Cheat.. */
1151 emit_arith(rp, PFS_OP_MAD, dest,
1152 mask & WRITEMASK_XW,
1153 src[1], pfs_one, pfs_zero,
1154 flags);
1155 }
1156 break;
1157 case OPCODE_EX2:
1158 src[0] = t_scalar_src(rp, fpi->SrcReg[0]);
1159 emit_arith(rp, PFS_OP_EX2, dest, mask,
1160 src[0], undef, undef,
1161 flags);
1162 break;
1163 case OPCODE_FLR:
1164 src[0] = t_src(rp, fpi->SrcReg[0]);
1165 temp = get_temp_reg(rp);
1166 /* FRC temp, src0
1167 * MAD dest, src0, 1.0, -temp
1168 */
1169 emit_arith(rp, PFS_OP_FRC, temp, mask,
1170 keep(src[0]), undef, undef,
1171 0);
1172 emit_arith(rp, PFS_OP_MAD, dest, mask,
1173 src[0], pfs_one, negate(temp),
1174 flags);
1175 free_temp(rp, temp);
1176 break;
1177 case OPCODE_FRC:
1178 src[0] = t_src(rp, fpi->SrcReg[0]);
1179 emit_arith(rp, PFS_OP_FRC, dest, mask,
1180 src[0], undef, undef,
1181 flags);
1182 break;
1183 case OPCODE_KIL:
1184 emit_tex(rp, fpi, R300_FPITX_OP_KIL);
1185 break;
1186 case OPCODE_LG2:
1187 src[0] = t_scalar_src(rp, fpi->SrcReg[0]);
1188 emit_arith(rp, PFS_OP_LG2, dest, mask,
1189 src[0], undef, undef,
1190 flags);
1191 break;
1192 case OPCODE_LIT:
1193 /* LIT
1194 * if (s.x < 0) t.x = 0; else t.x = s.x;
1195 * if (s.y < 0) t.y = 0; else t.y = s.y;
1196 * if (s.w > 128.0) t.w = 128.0; else t.w = s.w;
1197 * if (s.w < -128.0) t.w = -128.0; else t.w = s.w;
1198 * r.x = 1.0
1199 * if (t.x > 0) r.y = pow(t.y, t.w); else r.y = 0;
1200 * Also r.y = 0 if t.y < 0
1201 * For the t.x > 0 FGLRX use the CMPH opcode which
1202 * change the compare to (t.x + 0.5) > 0.5 we may
1203 * save one instruction by doing CMP -t.x
1204 */
1205 cnstv[0] = cnstv[1] = cnstv[2] = cnstv[4] = 0.50001;
1206 src[0] = t_src(rp, fpi->SrcReg[0]);
1207 temp = get_temp_reg(rp);
1208 cnst = emit_const4fv(rp, cnstv);
1209 emit_arith(rp, PFS_OP_CMP, temp,
1210 WRITEMASK_X | WRITEMASK_Y,
1211 src[0], pfs_zero, src[0], flags);
1212 emit_arith(rp, PFS_OP_MIN, temp, WRITEMASK_Z,
1213 swizzle(keep(src[0]), W, W, W, W),
1214 cnst, undef, flags);
1215 emit_arith(rp, PFS_OP_LG2, temp, WRITEMASK_W,
1216 swizzle(temp, Y, Y, Y, Y),
1217 undef, undef, flags);
1218 emit_arith(rp, PFS_OP_MAX, temp, WRITEMASK_Z,
1219 temp, negate(cnst), undef, flags);
1220 emit_arith(rp, PFS_OP_MAD, temp, WRITEMASK_W,
1221 temp, swizzle(temp, Z, Z, Z, Z),
1222 pfs_zero, flags);
1223 emit_arith(rp, PFS_OP_EX2, temp, WRITEMASK_W,
1224 temp, undef, undef, flags);
1225 emit_arith(rp, PFS_OP_MAD, dest, WRITEMASK_Y,
1226 swizzle(keep(temp), X, X, X, X),
1227 pfs_one, pfs_zero, flags);
1228 #if 0
1229 emit_arith(rp, PFS_OP_MAD, temp, WRITEMASK_X,
1230 temp, pfs_one, pfs_half, flags);
1231 emit_arith(rp, PFS_OP_CMPH, temp, WRITEMASK_Z,
1232 swizzle(keep(temp), W, W, W, W),
1233 pfs_zero, swizzle(keep(temp), X, X, X, X),
1234 flags);
1235 #else
1236 emit_arith(rp, PFS_OP_CMP, temp, WRITEMASK_Z,
1237 pfs_zero,
1238 swizzle(keep(temp), W, W, W, W),
1239 negate(swizzle(keep(temp), X, X, X, X)),
1240 flags);
1241 #endif
1242 emit_arith(rp, PFS_OP_CMP, dest, WRITEMASK_Z,
1243 pfs_zero, temp,
1244 negate(swizzle(keep(temp), Y, Y, Y, Y)),
1245 flags);
1246 emit_arith(rp, PFS_OP_MAD, dest,
1247 WRITEMASK_X | WRITEMASK_W,
1248 pfs_one,
1249 pfs_one,
1250 pfs_zero,
1251 flags);
1252 free_temp(rp, temp);
1253 break;
1254 case OPCODE_LRP:
1255 src[0] = t_src(rp, fpi->SrcReg[0]);
1256 src[1] = t_src(rp, fpi->SrcReg[1]);
1257 src[2] = t_src(rp, fpi->SrcReg[2]);
1258 /* result = tmp0tmp1 + (1 - tmp0)tmp2
1259 * = tmp0tmp1 + tmp2 + (-tmp0)tmp2
1260 * MAD temp, -tmp0, tmp2, tmp2
1261 * MAD result, tmp0, tmp1, temp
1262 */
1263 temp = get_temp_reg(rp);
1264 emit_arith(rp, PFS_OP_MAD, temp, mask,
1265 negate(keep(src[0])), keep(src[2]), src[2],
1266 0);
1267 emit_arith(rp, PFS_OP_MAD, dest, mask,
1268 src[0], src[1], temp,
1269 flags);
1270 free_temp(rp, temp);
1271 break;
1272 case OPCODE_MAD:
1273 src[0] = t_src(rp, fpi->SrcReg[0]);
1274 src[1] = t_src(rp, fpi->SrcReg[1]);
1275 src[2] = t_src(rp, fpi->SrcReg[2]);
1276 emit_arith(rp, PFS_OP_MAD, dest, mask,
1277 src[0], src[1], src[2],
1278 flags);
1279 break;
1280 case OPCODE_MAX:
1281 src[0] = t_src(rp, fpi->SrcReg[0]);
1282 src[1] = t_src(rp, fpi->SrcReg[1]);
1283 emit_arith(rp, PFS_OP_MAX, dest, mask,
1284 src[0], src[1], undef,
1285 flags);
1286 break;
1287 case OPCODE_MIN:
1288 src[0] = t_src(rp, fpi->SrcReg[0]);
1289 src[1] = t_src(rp, fpi->SrcReg[1]);
1290 emit_arith(rp, PFS_OP_MIN, dest, mask,
1291 src[0], src[1], undef,
1292 flags);
1293 break;
1294 case OPCODE_MOV:
1295 case OPCODE_SWZ:
1296 src[0] = t_src(rp, fpi->SrcReg[0]);
1297 emit_arith(rp, PFS_OP_MAD, dest, mask,
1298 src[0], pfs_one, pfs_zero,
1299 flags);
1300 break;
1301 case OPCODE_MUL:
1302 src[0] = t_src(rp, fpi->SrcReg[0]);
1303 src[1] = t_src(rp, fpi->SrcReg[1]);
1304 emit_arith(rp, PFS_OP_MAD, dest, mask,
1305 src[0], src[1], pfs_zero,
1306 flags);
1307 break;
1308 case OPCODE_POW:
1309 src[0] = t_scalar_src(rp, fpi->SrcReg[0]);
1310 src[1] = t_scalar_src(rp, fpi->SrcReg[1]);
1311 temp = get_temp_reg(rp);
1312 emit_arith(rp, PFS_OP_LG2, temp, WRITEMASK_W,
1313 src[0], undef, undef,
1314 0);
1315 emit_arith(rp, PFS_OP_MAD, temp, WRITEMASK_W,
1316 temp, src[1], pfs_zero,
1317 0);
1318 emit_arith(rp, PFS_OP_EX2, dest, fpi->DstReg.WriteMask,
1319 temp, undef, undef,
1320 0);
1321 free_temp(rp, temp);
1322 break;
1323 case OPCODE_RCP:
1324 src[0] = t_scalar_src(rp, fpi->SrcReg[0]);
1325 emit_arith(rp, PFS_OP_RCP, dest, mask,
1326 src[0], undef, undef,
1327 flags);
1328 break;
1329 case OPCODE_RSQ:
1330 src[0] = t_scalar_src(rp, fpi->SrcReg[0]);
1331 emit_arith(rp, PFS_OP_RSQ, dest, mask,
1332 absolute(src[0]), pfs_zero, pfs_zero,
1333 flags);
1334 break;
1335 case OPCODE_SCS:
1336 ERROR("SCS not implemented\n");
1337 break;
1338 case OPCODE_SGE:
1339 src[0] = t_src(rp, fpi->SrcReg[0]);
1340 src[1] = t_src(rp, fpi->SrcReg[1]);
1341 temp = get_temp_reg(rp);
1342 /* temp = src0 - src1
1343 * dest.c = (temp.c < 0.0) ? 0 : 1
1344 */
1345 emit_arith(rp, PFS_OP_MAD, temp, mask,
1346 src[0], pfs_one, negate(src[1]),
1347 0);
1348 emit_arith(rp, PFS_OP_CMP, dest, mask,
1349 pfs_one, pfs_zero, temp,
1350 0);
1351 free_temp(rp, temp);
1352 break;
1353 case OPCODE_SIN:
1354 ERROR("SIN not implemented\n");
1355 break;
1356 case OPCODE_SLT:
1357 src[0] = t_src(rp, fpi->SrcReg[0]);
1358 src[1] = t_src(rp, fpi->SrcReg[1]);
1359 temp = get_temp_reg(rp);
1360 /* temp = src0 - src1
1361 * dest.c = (temp.c < 0.0) ? 1 : 0
1362 */
1363 emit_arith(rp, PFS_OP_MAD, temp, mask,
1364 src[0], pfs_one, negate(src[1]),
1365 0);
1366 emit_arith(rp, PFS_OP_CMP, dest, mask,
1367 pfs_zero, pfs_one, temp,
1368 0);
1369 free_temp(rp, temp);
1370 break;
1371 case OPCODE_SUB:
1372 src[0] = t_src(rp, fpi->SrcReg[0]);
1373 src[1] = t_src(rp, fpi->SrcReg[1]);
1374 emit_arith(rp, PFS_OP_MAD, dest, mask,
1375 src[0], pfs_one, negate(src[1]),
1376 flags);
1377 break;
1378 case OPCODE_TEX:
1379 emit_tex(rp, fpi, R300_FPITX_OP_TEX);
1380 break;
1381 case OPCODE_TXB:
1382 emit_tex(rp, fpi, R300_FPITX_OP_TXB);
1383 break;
1384 case OPCODE_TXP:
1385 emit_tex(rp, fpi, R300_FPITX_OP_TXP);
1386 break;
1387 case OPCODE_XPD: {
1388 src[0] = t_src(rp, fpi->SrcReg[0]);
1389 src[1] = t_src(rp, fpi->SrcReg[1]);
1390 temp = get_temp_reg(rp);
1391 /* temp = src0.zxy * src1.yzx */
1392 emit_arith(rp, PFS_OP_MAD, temp, WRITEMASK_XYZ,
1393 swizzle(keep(src[0]), Z, X, Y, W),
1394 swizzle(keep(src[1]), Y, Z, X, W),
1395 pfs_zero,
1396 0);
1397 /* dest.xyz = src0.yzx * src1.zxy - temp
1398 * dest.w = undefined
1399 * */
1400 emit_arith(rp, PFS_OP_MAD, dest, mask & WRITEMASK_XYZ,
1401 swizzle(src[0], Y, Z, X, W),
1402 swizzle(src[1], Z, X, Y, W),
1403 negate(temp),
1404 flags);
1405 /* cleanup */
1406 free_temp(rp, temp);
1407 break;
1408 }
1409 default:
1410 ERROR("unknown fpi->Opcode %d\n", fpi->Opcode);
1411 break;
1412 }
1413
1414 if (rp->error)
1415 return GL_FALSE;
1416
1417 }
1418
1419 return GL_TRUE;
1420 }
1421
1422 /* - Init structures
1423 * - Determine what hwregs each input corresponds to
1424 */
1425 static void init_program(struct r300_fragment_program *rp)
1426 {
1427 struct r300_pfs_compile_state *cs = NULL;
1428 struct gl_fragment_program *mp = &rp->mesa_program;
1429 struct prog_instruction *fpi;
1430 GLuint InputsRead = mp->Base.InputsRead;
1431 GLuint temps_used = 0; /* for rp->temps[] */
1432 int i,j;
1433
1434 /* New compile, reset tracking data */
1435 rp->translated = GL_FALSE;
1436 rp->error = GL_FALSE;
1437 rp->cs = cs = &(R300_CONTEXT(rp->ctx)->state.pfs_compile);
1438 rp->tex.length = 0;
1439 rp->cur_node = 0;
1440 rp->first_node_has_tex = 0;
1441 rp->const_nr = 0;
1442 rp->param_nr = 0;
1443 rp->params_uptodate = GL_FALSE;
1444 rp->max_temp_idx = 0;
1445 rp->node[0].alu_end = -1;
1446 rp->node[0].tex_end = -1;
1447
1448 _mesa_memset(cs, 0, sizeof(*rp->cs));
1449 for (i=0;i<PFS_MAX_ALU_INST;i++) {
1450 for (j=0;j<3;j++) {
1451 cs->slot[i].vsrc[j] = SRC_CONST;
1452 cs->slot[i].ssrc[j] = SRC_CONST;
1453 }
1454 }
1455
1456 /* Work out what temps the Mesa inputs correspond to, this must match
1457 * what setup_rs_unit does, which shouldn't be a problem as rs_unit
1458 * configures itself based on the fragprog's InputsRead
1459 *
1460 * NOTE: this depends on get_hw_temp() allocating registers in order,
1461 * starting from register 0.
1462 */
1463
1464 /* Texcoords come first */
1465 for (i=0;i<rp->ctx->Const.MaxTextureUnits;i++) {
1466 if (InputsRead & (FRAG_BIT_TEX0 << i)) {
1467 cs->inputs[FRAG_ATTRIB_TEX0+i].refcount = 0;
1468 cs->inputs[FRAG_ATTRIB_TEX0+i].reg = get_hw_temp(rp);
1469 }
1470 }
1471 InputsRead &= ~FRAG_BITS_TEX_ANY;
1472
1473 /* Then primary colour */
1474 if (InputsRead & FRAG_BIT_COL0) {
1475 cs->inputs[FRAG_ATTRIB_COL0].refcount = 0;
1476 cs->inputs[FRAG_ATTRIB_COL0].reg = get_hw_temp(rp);
1477 }
1478 InputsRead &= ~FRAG_BIT_COL0;
1479
1480 /* Secondary color */
1481 if (InputsRead & FRAG_BIT_COL1) {
1482 cs->inputs[FRAG_ATTRIB_COL1].refcount = 0;
1483 cs->inputs[FRAG_ATTRIB_COL1].reg = get_hw_temp(rp);
1484 }
1485 InputsRead &= ~FRAG_BIT_COL1;
1486
1487 /* Anything else */
1488 if (InputsRead) {
1489 WARN_ONCE("Don't know how to handle inputs 0x%x\n",
1490 InputsRead);
1491 /* force read from hwreg 0 for now */
1492 for (i=0;i<32;i++)
1493 if (InputsRead & (1<<i)) cs->inputs[i].reg = 0;
1494 }
1495
1496 /* Pre-parse the mesa program, grabbing refcounts on input/temp regs.
1497 * That way, we can free up the reg when it's no longer needed
1498 */
1499 if (!mp->Base.Instructions) {
1500 ERROR("No instructions found in program\n");
1501 return;
1502 }
1503
1504 for (fpi=mp->Base.Instructions;fpi->Opcode != OPCODE_END; fpi++) {
1505 int idx;
1506
1507 for (i=0;i<3;i++) {
1508 idx = fpi->SrcReg[i].Index;
1509 switch (fpi->SrcReg[i].File) {
1510 case PROGRAM_TEMPORARY:
1511 if (!(temps_used & (1<<idx))) {
1512 cs->temps[idx].reg = -1;
1513 cs->temps[idx].refcount = 1;
1514 temps_used |= (1 << idx);
1515 } else
1516 cs->temps[idx].refcount++;
1517 break;
1518 case PROGRAM_INPUT:
1519 cs->inputs[idx].refcount++;
1520 break;
1521 default: break;
1522 }
1523 }
1524
1525 idx = fpi->DstReg.Index;
1526 if (fpi->DstReg.File == PROGRAM_TEMPORARY) {
1527 if (!(temps_used & (1<<idx))) {
1528 cs->temps[idx].reg = -1;
1529 cs->temps[idx].refcount = 1;
1530 temps_used |= (1 << idx);
1531 } else
1532 cs->temps[idx].refcount++;
1533 }
1534 }
1535 cs->temp_in_use = temps_used;
1536 }
1537
1538 static void update_params(struct r300_fragment_program *rp)
1539 {
1540 struct gl_fragment_program *mp = &rp->mesa_program;
1541 int i;
1542
1543 /* Ask Mesa nicely to fill in ParameterValues for us */
1544 if (rp->param_nr)
1545 _mesa_load_state_parameters(rp->ctx, mp->Base.Parameters);
1546
1547 for (i=0;i<rp->param_nr;i++)
1548 COPY_4V(rp->constant[rp->param[i].idx], rp->param[i].values);
1549
1550 rp->params_uptodate = GL_TRUE;
1551 }
1552
1553 void r300_translate_fragment_shader(struct r300_fragment_program *rp)
1554 {
1555 struct r300_pfs_compile_state *cs = NULL;
1556
1557 if (!rp->translated) {
1558
1559 init_program(rp);
1560 cs = rp->cs;
1561
1562 if (parse_program(rp) == GL_FALSE) {
1563 dump_program(rp);
1564 return;
1565 }
1566
1567 /* Finish off */
1568 cs->v_pos = cs->s_pos = MAX2(cs->v_pos, cs->s_pos);
1569 rp->node[rp->cur_node].alu_end =
1570 cs->v_pos - rp->node[rp->cur_node].alu_offset - 1;
1571 if (rp->node[rp->cur_node].tex_end < 0)
1572 rp->node[rp->cur_node].tex_end = 0;
1573 rp->alu_offset = 0;
1574 rp->alu_end = cs->v_pos - 1;
1575 rp->tex_offset = 0;
1576 rp->tex_end = rp->tex.length ? rp->tex.length - 1 : 0;
1577 assert(rp->node[rp->cur_node].alu_end >= 0);
1578 assert(rp->alu_end >= 0);
1579
1580 rp->translated = GL_TRUE;
1581 if (0) dump_program(rp);
1582 }
1583
1584 update_params(rp);
1585 }
1586
1587 /* just some random things... */
1588 static void dump_program(struct r300_fragment_program *rp)
1589 {
1590 int i;
1591 static int pc = 0;
1592
1593 fprintf(stderr, "pc=%d*************************************\n", pc++);
1594
1595 fprintf(stderr, "Mesa program:\n");
1596 fprintf(stderr, "-------------\n");
1597 _mesa_print_program(&rp->mesa_program.Base);
1598 fflush(stdout);
1599
1600 fprintf(stderr, "Hardware program\n");
1601 fprintf(stderr, "----------------\n");
1602
1603 fprintf(stderr, "tex:\n");
1604
1605 for(i=0;i<rp->tex.length;i++) {
1606 fprintf(stderr, "%08x\n", rp->tex.inst[i]);
1607 }
1608
1609 for (i=0;i<(rp->cur_node+1);i++) {
1610 fprintf(stderr, "NODE %d: alu_offset: %d, tex_offset: %d, "\
1611 "alu_end: %d, tex_end: %d\n", i,
1612 rp->node[i].alu_offset,
1613 rp->node[i].tex_offset,
1614 rp->node[i].alu_end,
1615 rp->node[i].tex_end);
1616 }
1617
1618 fprintf(stderr, "%08x\n",
1619 ((rp->tex_end << 16) | (R300_PFS_TEXI_0 >> 2)));
1620 for (i=0;i<=rp->tex_end;i++)
1621 fprintf(stderr, "%08x\n", rp->tex.inst[i]);
1622
1623 /* dump program in pretty_print_command_stream.tcl-readable format */
1624 fprintf(stderr, "%08x\n",
1625 ((rp->alu_end << 16) | (R300_PFS_INSTR0_0 >> 2)));
1626 for (i=0;i<=rp->alu_end;i++)
1627 fprintf(stderr, "%08x\n", rp->alu.inst[i].inst0);
1628
1629 fprintf(stderr, "%08x\n",
1630 ((rp->alu_end << 16) | (R300_PFS_INSTR1_0 >> 2)));
1631 for (i=0;i<=rp->alu_end;i++)
1632 fprintf(stderr, "%08x\n", rp->alu.inst[i].inst1);
1633
1634 fprintf(stderr, "%08x\n",
1635 ((rp->alu_end << 16) | (R300_PFS_INSTR2_0 >> 2)));
1636 for (i=0;i<=rp->alu_end;i++)
1637 fprintf(stderr, "%08x\n", rp->alu.inst[i].inst2);
1638
1639 fprintf(stderr, "%08x\n",
1640 ((rp->alu_end << 16) | (R300_PFS_INSTR3_0 >> 2)));
1641 for (i=0;i<=rp->alu_end;i++)
1642 fprintf(stderr, "%08x\n", rp->alu.inst[i].inst3);
1643
1644 fprintf(stderr, "00000000\n");
1645 }