r300_fragprog: Remove dead declarations, move NOP declarations into source
[mesa.git] / src / mesa / drivers / dri / r300 / r300_fragprog.c
1 /*
2 * Copyright (C) 2005 Ben Skeggs.
3 *
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining
7 * a copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sublicense, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial
16 * portions of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 */
27
28 /**
29 * \file
30 *
31 * \author Ben Skeggs <darktama@iinet.net.au>
32 *
33 * \author Jerome Glisse <j.glisse@gmail.com>
34 *
35 * \todo Depth write, WPOS/FOGC inputs
36 *
37 * \todo FogOption
38 *
39 * \todo Verify results of opcodes for accuracy, I've only checked them in
40 * specific cases.
41 */
42
43 #include "glheader.h"
44 #include "macros.h"
45 #include "enums.h"
46 #include "shader/prog_instruction.h"
47 #include "shader/prog_parameter.h"
48 #include "shader/prog_print.h"
49
50 #include "r300_context.h"
51 #include "r300_fragprog.h"
52 #include "r300_reg.h"
53 #include "r300_state.h"
54
/* Binding of one Mesa register (temporary or input) to an R300 temporary */
struct reg_acc {
	/* Hardware temporary backing this register; the code in this file
	 * uses -1 to mean "none assigned". */
	int reg;

	/* Number of uses by the Mesa program that are still outstanding */
	unsigned int refcount;
};
60
/**
 * Lifetime bookkeeping for a single R300 temporary.
 *
 * All members are ALU slot indices.
 */
struct reg_lifetime {
	/* First slot in which this register may be picked as a fresh
	 * destination.  Set to -1 while the register is bound to a Mesa
	 * register whose last access has not been emitted yet. */
	int free;

	/* First slot in which the register is currently reserved.  Keeps
	 * e.g. a scalar operation from being scheduled ahead of the moment
	 * the register was first allocated for a vector operation. */
	int reserved;

	/* First slot in which the register may be read without missing the
	 * value produced by the last emitted write (per RGB / alpha half). */
	int vector_valid;
	int scalar_valid;

	/* Slot of the most recent read; also the earliest slot in which the
	 * register may be overwritten (per RGB / alpha half). */
	int vector_lastread;
	int scalar_lastread;
};
88
/* Bits for r300_pfs_compile_slot::used */
#define SLOT_SRC_VECTOR		(1<<0)
#define SLOT_SRC_SCALAR		(1<<3)
#define SLOT_SRC_BOTH		(SLOT_SRC_VECTOR | SLOT_SRC_SCALAR)
#define SLOT_OP_VECTOR		(1<<16)
#define SLOT_OP_SCALAR		(1<<17)
#define SLOT_OP_BOTH		(SLOT_OP_VECTOR | SLOT_OP_SCALAR)

/**
 * Usage information about one ALU instruction slot, kept while a
 * fragment program is being compiled.
 */
struct r300_pfs_compile_slot {
	/* Which parts of the slot are occupied (mask of SLOT_* bits) */
	unsigned int used;

	/* Sources selected for the vector and the scalar half */
	int vsrc[3];
	int ssrc[3];
};
109
110 /**
111 * Store information during compilation of fragment programs.
112 */
113 struct r300_pfs_compile_state {
114 r300ContextPtr r300;
115 struct r300_fragment_program *fp;
116
117 int nrslots; /* number of ALU slots used so far */
118
119 /* Track which (parts of) slots are already filled with instructions */
120 struct r300_pfs_compile_slot slot[PFS_MAX_ALU_INST];
121
122 /* Track the validity of R300 temporaries */
123 struct reg_lifetime hwtemps[PFS_NUM_TEMP_REGS];
124
125 /* Used to map Mesa's inputs/temps onto hardware temps */
126 int temp_in_use;
127 struct reg_acc temps[PFS_NUM_TEMP_REGS];
128 struct reg_acc inputs[32]; /* don't actually need 32... */
129
130 /* Track usage of hardware temps, for register allocation,
131 * indirection detection, etc. */
132 GLuint used_in_node;
133 GLuint dest_in_node;
134 };
135
136
/*
 * Useful macros and values
 */

/* Print a diagnostic prefixed with file and function name and flag the
 * program as failed.  Expects a variable named `fp` in the calling scope
 * (COMPILE_STATE provides one).  A newline is appended to the message. */
#define ERROR(fmt, args...) do {				\
		fprintf(stderr, "%s::%s(): " fmt "\n",		\
			__FILE__, __FUNCTION__, ##args);	\
		fp->error = GL_TRUE;				\
	} while(0)

/* Marker for "no valid value"; also terminates the lookup tables below */
#define PFS_INVAL 0xFFFFFFFF

/* Declares the locals `fp` and `code` from a compile state named `cs` */
#define COMPILE_STATE						\
	struct r300_fragment_program *fp = cs->fp;		\
	struct r300_fragment_program_code *code = &fp->code;	\
	(void)code
151
/* Vector swizzle selectors stored in the VSWZ field of a packed register */
#define SWIZZLE_XYZ	0
#define SWIZZLE_XXX	1
#define SWIZZLE_YYY	2
#define SWIZZLE_ZZZ	3
#define SWIZZLE_WWW	4
#define SWIZZLE_YZX	5
#define SWIZZLE_ZXY	6
#define SWIZZLE_WZY	7
#define SWIZZLE_111	8
#define SWIZZLE_000	9
#define SWIZZLE_HHH	10

/* Convenience wrapper around do_swizzle(): x/y/z/w are SWIZZLE_* suffixes,
 * packed three bits per component, with no negation requested.  Expects a
 * compile state named `cs` in the calling scope. */
#define swizzle(r, x, y, z, w)				\
	do_swizzle(cs, r,				\
		   ((SWIZZLE_##x<<0)|			\
		    (SWIZZLE_##y<<3)|			\
		    (SWIZZLE_##z<<6)|			\
		    (SWIZZLE_##w<<9)),			\
		   0)
170
/*
 * Packed register descriptor.
 *
 * A register reference is carried around as a single GLuint made of the
 * bit-fields below.  Every macro parenthesizes its arguments so that
 * compound expressions (e.g. `a | b`, `c ? x : y`) can be passed safely.
 */

/* Values of the TYPE field */
#define REG_TYPE_INPUT	0
#define REG_TYPE_OUTPUT	1
#define REG_TYPE_TEMP	2
#define REG_TYPE_CONST	3

/* Bit positions of the fields */
#define REG_TYPE_SHIFT		0
#define REG_INDEX_SHIFT		2
#define REG_VSWZ_SHIFT		8
#define REG_SSWZ_SHIFT		13
#define REG_NEGV_SHIFT		18
#define REG_NEGS_SHIFT		19
#define REG_ABS_SHIFT		20
#define REG_NO_USE_SHIFT	21	// Hack for refcounting
#define REG_VALID_SHIFT		22	// Does the register contain a defined value?
#define REG_BUILTIN_SHIFT	23	// Is it a builtin (like all zero/all one)?

/* In-place masks of the fields */
#define REG_TYPE_MASK		(0x03 << REG_TYPE_SHIFT)
#define REG_INDEX_MASK		(0x3F << REG_INDEX_SHIFT)
#define REG_VSWZ_MASK		(0x1F << REG_VSWZ_SHIFT)
#define REG_SSWZ_MASK		(0x1F << REG_SSWZ_SHIFT)
#define REG_NEGV_MASK		(0x01 << REG_NEGV_SHIFT)
#define REG_NEGS_MASK		(0x01 << REG_NEGS_SHIFT)
#define REG_ABS_MASK		(0x01 << REG_ABS_SHIFT)
#define REG_NO_USE_MASK		(0x01 << REG_NO_USE_SHIFT)
#define REG_VALID_MASK		(0x01 << REG_VALID_SHIFT)
#define REG_BUILTIN_MASK	(0x01 << REG_BUILTIN_SHIFT)

/* Build a packed register from its fields (negate/abs bits start clear) */
#define REG(type, index, vswz, sswz, nouse, valid, builtin)		\
	((((type) << REG_TYPE_SHIFT) & REG_TYPE_MASK) |			\
	 (((index) << REG_INDEX_SHIFT) & REG_INDEX_MASK) |		\
	 (((nouse) << REG_NO_USE_SHIFT) & REG_NO_USE_MASK) |		\
	 (((valid) << REG_VALID_SHIFT) & REG_VALID_MASK) |		\
	 (((builtin) << REG_BUILTIN_SHIFT) & REG_BUILTIN_MASK) |	\
	 (((vswz) << REG_VSWZ_SHIFT) & REG_VSWZ_MASK) |			\
	 (((sswz) << REG_SSWZ_SHIFT) & REG_SSWZ_MASK))

/* Field extraction */
#define REG_GET_TYPE(reg)						\
	(((reg) & REG_TYPE_MASK) >> REG_TYPE_SHIFT)
#define REG_GET_INDEX(reg)						\
	(((reg) & REG_INDEX_MASK) >> REG_INDEX_SHIFT)
#define REG_GET_VSWZ(reg)						\
	(((reg) & REG_VSWZ_MASK) >> REG_VSWZ_SHIFT)
#define REG_GET_SSWZ(reg)						\
	(((reg) & REG_SSWZ_MASK) >> REG_SSWZ_SHIFT)
#define REG_GET_NO_USE(reg)						\
	(((reg) & REG_NO_USE_MASK) >> REG_NO_USE_SHIFT)
#define REG_GET_VALID(reg)						\
	(((reg) & REG_VALID_MASK) >> REG_VALID_SHIFT)
#define REG_GET_BUILTIN(reg)						\
	(((reg) & REG_BUILTIN_MASK) >> REG_BUILTIN_SHIFT)

/* Field update; `reg` must be an lvalue and is modified in place */
#define REG_SET_TYPE(reg, type)						\
	((reg) = (((reg) & ~REG_TYPE_MASK) |				\
		  (((type) << REG_TYPE_SHIFT) & REG_TYPE_MASK)))
#define REG_SET_INDEX(reg, index)					\
	((reg) = (((reg) & ~REG_INDEX_MASK) |				\
		  (((index) << REG_INDEX_SHIFT) & REG_INDEX_MASK)))
#define REG_SET_VSWZ(reg, vswz)						\
	((reg) = (((reg) & ~REG_VSWZ_MASK) |				\
		  (((vswz) << REG_VSWZ_SHIFT) & REG_VSWZ_MASK)))
#define REG_SET_SSWZ(reg, sswz)						\
	((reg) = (((reg) & ~REG_SSWZ_MASK) |				\
		  (((sswz) << REG_SSWZ_SHIFT) & REG_SSWZ_MASK)))
#define REG_SET_NO_USE(reg, nouse)					\
	((reg) = (((reg) & ~REG_NO_USE_MASK) |				\
		  (((nouse) << REG_NO_USE_SHIFT) & REG_NO_USE_MASK)))
#define REG_SET_VALID(reg, valid)					\
	((reg) = (((reg) & ~REG_VALID_MASK) |				\
		  (((valid) << REG_VALID_SHIFT) & REG_VALID_MASK)))
#define REG_SET_BUILTIN(reg, builtin)					\
	((reg) = (((reg) & ~REG_BUILTIN_MASK) |				\
		  (((builtin) << REG_BUILTIN_SHIFT) & REG_BUILTIN_MASK)))

/* Set the absolute-value / vector-negate / scalar-negate modifier bits */
#define REG_ABS(reg)							\
	((reg) = ((reg) | REG_ABS_MASK))
#define REG_NEGV(reg)							\
	((reg) = ((reg) | REG_NEGV_MASK))
#define REG_NEGS(reg)							\
	((reg) = ((reg) | REG_NEGS_MASK))
247
/*
 * The four instruction words written by emit_nop(): a MAD with all three
 * arguments set to zero in both the colour (C) and alpha (A) halves, with
 * every source address pointing at constant 0.
 */
#define NOP_INST0						\
	((R300_ALU_OUTC_MAD) |					\
	 (R300_ALU_ARGC_ZERO << R300_ALU_ARG0C_SHIFT) |		\
	 (R300_ALU_ARGC_ZERO << R300_ALU_ARG1C_SHIFT) |		\
	 (R300_ALU_ARGC_ZERO << R300_ALU_ARG2C_SHIFT))

#define NOP_INST1						\
	(((0 | SRC_CONST) << R300_ALU_SRC0C_SHIFT) |		\
	 ((0 | SRC_CONST) << R300_ALU_SRC1C_SHIFT) |		\
	 ((0 | SRC_CONST) << R300_ALU_SRC2C_SHIFT))

#define NOP_INST2						\
	((R300_ALU_OUTA_MAD) |					\
	 (R300_ALU_ARGA_ZERO << R300_ALU_ARG0A_SHIFT) |		\
	 (R300_ALU_ARGA_ZERO << R300_ALU_ARG1A_SHIFT) |		\
	 (R300_ALU_ARGA_ZERO << R300_ALU_ARG2A_SHIFT))

#define NOP_INST3						\
	(((0 | SRC_CONST) << R300_ALU_SRC0A_SHIFT) |		\
	 ((0 | SRC_CONST) << R300_ALU_SRC1A_SHIFT) |		\
	 ((0 | SRC_CONST) << R300_ALU_SRC2A_SHIFT))
266
267
268 /*
269 * Datas structures for fragment program generation
270 */
271
272 /* description of r300 native hw instructions */
273 static const struct {
274 const char *name;
275 int argc;
276 int v_op;
277 int s_op;
278 } r300_fpop[] = {
279 /* *INDENT-OFF* */
280 {"MAD", 3, R300_ALU_OUTC_MAD, R300_ALU_OUTA_MAD},
281 {"DP3", 2, R300_ALU_OUTC_DP3, R300_ALU_OUTA_DP4},
282 {"DP4", 2, R300_ALU_OUTC_DP4, R300_ALU_OUTA_DP4},
283 {"MIN", 2, R300_ALU_OUTC_MIN, R300_ALU_OUTA_MIN},
284 {"MAX", 2, R300_ALU_OUTC_MAX, R300_ALU_OUTA_MAX},
285 {"CMP", 3, R300_ALU_OUTC_CMP, R300_ALU_OUTA_CMP},
286 {"FRC", 1, R300_ALU_OUTC_FRC, R300_ALU_OUTA_FRC},
287 {"EX2", 1, R300_ALU_OUTC_REPL_ALPHA, R300_ALU_OUTA_EX2},
288 {"LG2", 1, R300_ALU_OUTC_REPL_ALPHA, R300_ALU_OUTA_LG2},
289 {"RCP", 1, R300_ALU_OUTC_REPL_ALPHA, R300_ALU_OUTA_RCP},
290 {"RSQ", 1, R300_ALU_OUTC_REPL_ALPHA, R300_ALU_OUTA_RSQ},
291 {"REPL_ALPHA", 1, R300_ALU_OUTC_REPL_ALPHA, PFS_INVAL},
292 {"CMPH", 3, R300_ALU_OUTC_CMPH, PFS_INVAL},
293 /* *INDENT-ON* */
294 };
295
296 /* vector swizzles r300 can support natively, with a couple of
297 * cases we handle specially
298 *
299 * REG_VSWZ/REG_SSWZ is an index into this table
300 */
301
302 /* mapping from SWIZZLE_* to r300 native values for scalar insns */
303 #define SWIZZLE_HALF 6
304
305 #define MAKE_SWZ3(x, y, z) (MAKE_SWIZZLE4(SWIZZLE_##x, \
306 SWIZZLE_##y, \
307 SWIZZLE_##z, \
308 SWIZZLE_ZERO))
309 /* native swizzles */
310 static const struct r300_pfs_swizzle {
311 GLuint hash; /* swizzle value this matches */
312 GLuint base; /* base value for hw swizzle */
313 GLuint stride; /* difference in base between arg0/1/2 */
314 GLuint flags;
315 } v_swiz[] = {
316 /* *INDENT-OFF* */
317 {MAKE_SWZ3(X, Y, Z), R300_ALU_ARGC_SRC0C_XYZ, 4, SLOT_SRC_VECTOR},
318 {MAKE_SWZ3(X, X, X), R300_ALU_ARGC_SRC0C_XXX, 4, SLOT_SRC_VECTOR},
319 {MAKE_SWZ3(Y, Y, Y), R300_ALU_ARGC_SRC0C_YYY, 4, SLOT_SRC_VECTOR},
320 {MAKE_SWZ3(Z, Z, Z), R300_ALU_ARGC_SRC0C_ZZZ, 4, SLOT_SRC_VECTOR},
321 {MAKE_SWZ3(W, W, W), R300_ALU_ARGC_SRC0A, 1, SLOT_SRC_SCALAR},
322 {MAKE_SWZ3(Y, Z, X), R300_ALU_ARGC_SRC0C_YZX, 1, SLOT_SRC_VECTOR},
323 {MAKE_SWZ3(Z, X, Y), R300_ALU_ARGC_SRC0C_ZXY, 1, SLOT_SRC_VECTOR},
324 {MAKE_SWZ3(W, Z, Y), R300_ALU_ARGC_SRC0CA_WZY, 1, SLOT_SRC_BOTH},
325 {MAKE_SWZ3(ONE, ONE, ONE), R300_ALU_ARGC_ONE, 0, 0},
326 {MAKE_SWZ3(ZERO, ZERO, ZERO), R300_ALU_ARGC_ZERO, 0, 0},
327 {MAKE_SWZ3(HALF, HALF, HALF), R300_ALU_ARGC_HALF, 0, 0},
328 {PFS_INVAL, 0, 0, 0},
329 /* *INDENT-ON* */
330 };
331
332 /* used during matching of non-native swizzles */
333 #define SWZ_X_MASK (7 << 0)
334 #define SWZ_Y_MASK (7 << 3)
335 #define SWZ_Z_MASK (7 << 6)
336 #define SWZ_W_MASK (7 << 9)
337 static const struct {
338 GLuint hash; /* used to mask matching swizzle components */
339 int mask; /* actual outmask */
340 int count; /* count of components matched */
341 } s_mask[] = {
342 /* *INDENT-OFF* */
343 {SWZ_X_MASK | SWZ_Y_MASK | SWZ_Z_MASK, 1 | 2 | 4, 3},
344 {SWZ_X_MASK | SWZ_Y_MASK, 1 | 2, 2},
345 {SWZ_X_MASK | SWZ_Z_MASK, 1 | 4, 2},
346 {SWZ_Y_MASK | SWZ_Z_MASK, 2 | 4, 2},
347 {SWZ_X_MASK, 1, 1},
348 {SWZ_Y_MASK, 2, 1},
349 {SWZ_Z_MASK, 4, 1},
350 {PFS_INVAL, PFS_INVAL, PFS_INVAL}
351 /* *INDENT-ON* */
352 };
353
354 static const struct {
355 int base; /* hw value of swizzle */
356 int stride; /* difference between SRC0/1/2 */
357 GLuint flags;
358 } s_swiz[] = {
359 /* *INDENT-OFF* */
360 {R300_ALU_ARGA_SRC0C_X, 3, SLOT_SRC_VECTOR},
361 {R300_ALU_ARGA_SRC0C_Y, 3, SLOT_SRC_VECTOR},
362 {R300_ALU_ARGA_SRC0C_Z, 3, SLOT_SRC_VECTOR},
363 {R300_ALU_ARGA_SRC0A, 1, SLOT_SRC_SCALAR},
364 {R300_ALU_ARGA_ZERO, 0, 0},
365 {R300_ALU_ARGA_ONE, 0, 0},
366 {R300_ALU_ARGA_HALF, 0, 0}
367 /* *INDENT-ON* */
368 };
369
370 /* boiler-plate reg, for convenience */
371 static const GLuint undef = REG(REG_TYPE_TEMP,
372 0,
373 SWIZZLE_XYZ,
374 SWIZZLE_W,
375 GL_FALSE,
376 GL_FALSE,
377 GL_FALSE);
378
379 /* constant one source */
380 static const GLuint pfs_one = REG(REG_TYPE_CONST,
381 0,
382 SWIZZLE_111,
383 SWIZZLE_ONE,
384 GL_FALSE,
385 GL_TRUE,
386 GL_TRUE);
387
388 /* constant half source */
389 static const GLuint pfs_half = REG(REG_TYPE_CONST,
390 0,
391 SWIZZLE_HHH,
392 SWIZZLE_HALF,
393 GL_FALSE,
394 GL_TRUE,
395 GL_TRUE);
396
397 /* constant zero source */
398 static const GLuint pfs_zero = REG(REG_TYPE_CONST,
399 0,
400 SWIZZLE_000,
401 SWIZZLE_ZERO,
402 GL_FALSE,
403 GL_TRUE,
404 GL_TRUE);
405
406 /*
407 * Common functions prototypes
408 */
409 static void dump_program(struct r300_fragment_program *fp,
410 struct r300_fragment_program_code *code);
411 static void emit_arith(struct r300_pfs_compile_state *cs, int op,
412 GLuint dest, int mask,
413 GLuint src0, GLuint src1, GLuint src2, int flags);
414
415 /**
416 * Get an R300 temporary that can be written to in the given slot.
417 */
418 static int get_hw_temp(struct r300_pfs_compile_state *cs, int slot)
419 {
420 COMPILE_STATE;
421 int r;
422
423 for (r = 0; r < PFS_NUM_TEMP_REGS; ++r) {
424 if (cs->hwtemps[r].free >= 0 && cs->hwtemps[r].free <= slot)
425 break;
426 }
427
428 if (r >= PFS_NUM_TEMP_REGS) {
429 ERROR("Out of hardware temps\n");
430 return 0;
431 }
432 // Reserved is used to avoid the following scenario:
433 // R300 temporary X is first assigned to Mesa temporary Y during vector ops
434 // R300 temporary X is then assigned to Mesa temporary Z for further vector ops
435 // Then scalar ops on Mesa temporary Z are emitted and move back in time
436 // to overwrite the value of temporary Y.
437 // End scenario.
438 cs->hwtemps[r].reserved = cs->hwtemps[r].free;
439 cs->hwtemps[r].free = -1;
440
441 // Reset to some value that won't mess things up when the user
442 // tries to read from a temporary that hasn't been assigned a value yet.
443 // In the normal case, vector_valid and scalar_valid should be set to
444 // a sane value by the first emit that writes to this temporary.
445 cs->hwtemps[r].vector_valid = 0;
446 cs->hwtemps[r].scalar_valid = 0;
447
448 if (r > fp->code.max_temp_idx)
449 fp->code.max_temp_idx = r;
450
451 return r;
452 }
453
454 /**
455 * Get an R300 temporary that will act as a TEX destination register.
456 */
457 static int get_hw_temp_tex(struct r300_pfs_compile_state *cs)
458 {
459 COMPILE_STATE;
460 int r;
461
462 for (r = 0; r < PFS_NUM_TEMP_REGS; ++r) {
463 if (cs->used_in_node & (1 << r))
464 continue;
465
466 // Note: Be very careful here
467 if (cs->hwtemps[r].free >= 0 && cs->hwtemps[r].free <= 0)
468 break;
469 }
470
471 if (r >= PFS_NUM_TEMP_REGS)
472 return get_hw_temp(cs, 0); /* Will cause an indirection */
473
474 cs->hwtemps[r].reserved = cs->hwtemps[r].free;
475 cs->hwtemps[r].free = -1;
476
477 // Reset to some value that won't mess things up when the user
478 // tries to read from a temporary that hasn't been assigned a value yet.
479 // In the normal case, vector_valid and scalar_valid should be set to
480 // a sane value by the first emit that writes to this temporary.
481 cs->hwtemps[r].vector_valid = cs->nrslots;
482 cs->hwtemps[r].scalar_valid = cs->nrslots;
483
484 if (r > code->max_temp_idx)
485 code->max_temp_idx = r;
486
487 return r;
488 }
489
490 /**
491 * Mark the given hardware register as free.
492 */
493 static void free_hw_temp(struct r300_pfs_compile_state *cs, int idx)
494 {
495 // Be very careful here. Consider sequences like
496 // MAD r0, r1,r2,r3
497 // TEX r4, ...
498 // The TEX instruction may be moved in front of the MAD instruction
499 // due to the way nodes work. We don't want to alias r1 and r4 in
500 // this case.
501 // I'm certain the register allocation could be further sanitized,
502 // but it's tricky because of stuff that can happen inside emit_tex
503 // and emit_arith.
504 cs->hwtemps[idx].free = cs->nrslots + 1;
505 }
506
507 /**
508 * Create a new Mesa temporary register.
509 */
510 static GLuint get_temp_reg(struct r300_pfs_compile_state *cs)
511 {
512 COMPILE_STATE;
513 GLuint r = undef;
514 GLuint index;
515
516 index = ffs(~cs->temp_in_use);
517 if (!index) {
518 ERROR("Out of program temps\n");
519 return r;
520 }
521
522 cs->temp_in_use |= (1 << --index);
523 cs->temps[index].refcount = 0xFFFFFFFF;
524 cs->temps[index].reg = -1;
525
526 REG_SET_TYPE(r, REG_TYPE_TEMP);
527 REG_SET_INDEX(r, index);
528 REG_SET_VALID(r, GL_TRUE);
529 return r;
530 }
531
532 /**
533 * Create a new Mesa temporary register that will act as the destination
534 * register for a texture read.
535 */
536 static GLuint get_temp_reg_tex(struct r300_pfs_compile_state *cs)
537 {
538 COMPILE_STATE;
539 GLuint r = undef;
540 GLuint index;
541
542 index = ffs(~cs->temp_in_use);
543 if (!index) {
544 ERROR("Out of program temps\n");
545 return r;
546 }
547
548 cs->temp_in_use |= (1 << --index);
549 cs->temps[index].refcount = 0xFFFFFFFF;
550 cs->temps[index].reg = get_hw_temp_tex(cs);
551
552 REG_SET_TYPE(r, REG_TYPE_TEMP);
553 REG_SET_INDEX(r, index);
554 REG_SET_VALID(r, GL_TRUE);
555 return r;
556 }
557
558 /**
559 * Free a Mesa temporary and the associated R300 temporary.
560 */
561 static void free_temp(struct r300_pfs_compile_state *cs, GLuint r)
562 {
563 GLuint index = REG_GET_INDEX(r);
564
565 if (!(cs->temp_in_use & (1 << index)))
566 return;
567
568 if (REG_GET_TYPE(r) == REG_TYPE_TEMP) {
569 free_hw_temp(cs, cs->temps[index].reg);
570 cs->temps[index].reg = -1;
571 cs->temp_in_use &= ~(1 << index);
572 } else if (REG_GET_TYPE(r) == REG_TYPE_INPUT) {
573 free_hw_temp(cs, cs->inputs[index].reg);
574 cs->inputs[index].reg = -1;
575 }
576 }
577
578 /**
579 * Emit a hardware constant/parameter.
580 *
581 * \p cp Stable pointer to an array of 4 floats.
582 * The pointer must be stable in the sense that it remains to be valid
583 * and hold the contents of the constant/parameter throughout the lifetime
584 * of the fragment program (actually, up until the next time the fragment
585 * program is translated).
586 */
587 static GLuint emit_const4fv(struct r300_pfs_compile_state *cs,
588 const GLfloat * cp)
589 {
590 COMPILE_STATE;
591 GLuint reg = undef;
592 int index;
593
594 for (index = 0; index < code->const_nr; ++index) {
595 if (code->constant[index] == cp)
596 break;
597 }
598
599 if (index >= code->const_nr) {
600 if (index >= PFS_NUM_CONST_REGS) {
601 ERROR("Out of hw constants!\n");
602 return reg;
603 }
604
605 code->const_nr++;
606 code->constant[index] = cp;
607 }
608
609 REG_SET_TYPE(reg, REG_TYPE_CONST);
610 REG_SET_INDEX(reg, index);
611 REG_SET_VALID(reg, GL_TRUE);
612 return reg;
613 }
614
615 static inline GLuint negate(GLuint r)
616 {
617 REG_NEGS(r);
618 REG_NEGV(r);
619 return r;
620 }
621
622 /* Hack, to prevent clobbering sources used multiple times when
623 * emulating non-native instructions
624 */
625 static inline GLuint keep(GLuint r)
626 {
627 REG_SET_NO_USE(r, GL_TRUE);
628 return r;
629 }
630
631 static inline GLuint absolute(GLuint r)
632 {
633 REG_ABS(r);
634 return r;
635 }
636
637 static int swz_native(struct r300_pfs_compile_state *cs,
638 GLuint src, GLuint * r, GLuint arbneg)
639 {
640 COMPILE_STATE;
641
642 /* Native swizzle, handle negation */
643 src = (src & ~REG_NEGS_MASK) | (((arbneg >> 3) & 1) << REG_NEGS_SHIFT);
644
645 if ((arbneg & 0x7) == 0x0) {
646 src = src & ~REG_NEGV_MASK;
647 *r = src;
648 } else if ((arbneg & 0x7) == 0x7) {
649 src |= REG_NEGV_MASK;
650 *r = src;
651 } else {
652 if (!REG_GET_VALID(*r))
653 *r = get_temp_reg(cs);
654 src |= REG_NEGV_MASK;
655 emit_arith(cs,
656 PFS_OP_MAD,
657 *r, arbneg & 0x7, keep(src), pfs_one, pfs_zero, 0);
658 src = src & ~REG_NEGV_MASK;
659 emit_arith(cs,
660 PFS_OP_MAD,
661 *r,
662 (arbneg ^ 0x7) | WRITEMASK_W,
663 src, pfs_one, pfs_zero, 0);
664 }
665
666 return 3;
667 }
668
669 static int swz_emit_partial(struct r300_pfs_compile_state *cs,
670 GLuint src,
671 GLuint * r, int mask, int mc, GLuint arbneg)
672 {
673 COMPILE_STATE;
674 GLuint tmp;
675 GLuint wmask = 0;
676
677 if (!REG_GET_VALID(*r))
678 *r = get_temp_reg(cs);
679
680 /* A partial match, VSWZ/mask define what parts of the
681 * desired swizzle we match
682 */
683 if (mc + s_mask[mask].count == 3) {
684 wmask = WRITEMASK_W;
685 src |= ((arbneg >> 3) & 1) << REG_NEGS_SHIFT;
686 }
687
688 tmp = arbneg & s_mask[mask].mask;
689 if (tmp) {
690 tmp = tmp ^ s_mask[mask].mask;
691 if (tmp) {
692 emit_arith(cs,
693 PFS_OP_MAD,
694 *r,
695 arbneg & s_mask[mask].mask,
696 keep(src) | REG_NEGV_MASK,
697 pfs_one, pfs_zero, 0);
698 if (!wmask) {
699 REG_SET_NO_USE(src, GL_TRUE);
700 } else {
701 REG_SET_NO_USE(src, GL_FALSE);
702 }
703 emit_arith(cs,
704 PFS_OP_MAD,
705 *r, tmp | wmask, src, pfs_one, pfs_zero, 0);
706 } else {
707 if (!wmask) {
708 REG_SET_NO_USE(src, GL_TRUE);
709 } else {
710 REG_SET_NO_USE(src, GL_FALSE);
711 }
712 emit_arith(cs,
713 PFS_OP_MAD,
714 *r,
715 (arbneg & s_mask[mask].mask) | wmask,
716 src | REG_NEGV_MASK, pfs_one, pfs_zero, 0);
717 }
718 } else {
719 if (!wmask) {
720 REG_SET_NO_USE(src, GL_TRUE);
721 } else {
722 REG_SET_NO_USE(src, GL_FALSE);
723 }
724 emit_arith(cs, PFS_OP_MAD,
725 *r,
726 s_mask[mask].mask | wmask,
727 src, pfs_one, pfs_zero, 0);
728 }
729
730 return s_mask[mask].count;
731 }
732
733 static GLuint do_swizzle(struct r300_pfs_compile_state *cs,
734 GLuint src, GLuint arbswz, GLuint arbneg)
735 {
736 COMPILE_STATE;
737 GLuint r = undef;
738 GLuint vswz;
739 int c_mask = 0;
740 int v_match = 0;
741
742 /* If swizzling from something without an XYZW native swizzle,
743 * emit result to a temp, and do new swizzle from the temp.
744 */
745 #if 0
746 if (REG_GET_VSWZ(src) != SWIZZLE_XYZ || REG_GET_SSWZ(src) != SWIZZLE_W) {
747 GLuint temp = get_temp_reg(fp);
748 emit_arith(fp,
749 PFS_OP_MAD,
750 temp, WRITEMASK_XYZW, src, pfs_one, pfs_zero, 0);
751 src = temp;
752 }
753 #endif
754
755 if (REG_GET_VSWZ(src) != SWIZZLE_XYZ || REG_GET_SSWZ(src) != SWIZZLE_W) {
756 GLuint vsrcswz =
757 (v_swiz[REG_GET_VSWZ(src)].
758 hash & (SWZ_X_MASK | SWZ_Y_MASK | SWZ_Z_MASK)) |
759 REG_GET_SSWZ(src) << 9;
760 GLint i;
761
762 GLuint newswz = 0;
763 GLuint offset;
764 for (i = 0; i < 4; ++i) {
765 offset = GET_SWZ(arbswz, i);
766
767 newswz |=
768 (offset <= 3) ? GET_SWZ(vsrcswz,
769 offset) << i *
770 3 : offset << i * 3;
771 }
772
773 arbswz = newswz & (SWZ_X_MASK | SWZ_Y_MASK | SWZ_Z_MASK);
774 REG_SET_SSWZ(src, GET_SWZ(newswz, 3));
775 } else {
776 /* set scalar swizzling */
777 REG_SET_SSWZ(src, GET_SWZ(arbswz, 3));
778
779 }
780 do {
781 vswz = REG_GET_VSWZ(src);
782 do {
783 int chash;
784
785 REG_SET_VSWZ(src, vswz);
786 chash = v_swiz[REG_GET_VSWZ(src)].hash &
787 s_mask[c_mask].hash;
788
789 if (chash == (arbswz & s_mask[c_mask].hash)) {
790 if (s_mask[c_mask].count == 3) {
791 v_match += swz_native(cs,
792 src, &r, arbneg);
793 } else {
794 v_match += swz_emit_partial(cs,
795 src,
796 &r,
797 c_mask,
798 v_match,
799 arbneg);
800 }
801
802 if (v_match == 3)
803 return r;
804
805 /* Fill with something invalid.. all 0's was
806 * wrong before, matched SWIZZLE_X. So all
807 * 1's will be okay for now
808 */
809 arbswz |= (PFS_INVAL & s_mask[c_mask].hash);
810 }
811 } while (v_swiz[++vswz].hash != PFS_INVAL);
812 REG_SET_VSWZ(src, SWIZZLE_XYZ);
813 } while (s_mask[++c_mask].hash != PFS_INVAL);
814
815 ERROR("should NEVER get here\n");
816 return r;
817 }
818
819 static GLuint t_src(struct r300_pfs_compile_state *cs,
820 struct prog_src_register fpsrc)
821 {
822 COMPILE_STATE;
823 GLuint r = undef;
824
825 switch (fpsrc.File) {
826 case PROGRAM_TEMPORARY:
827 REG_SET_INDEX(r, fpsrc.Index);
828 REG_SET_VALID(r, GL_TRUE);
829 REG_SET_TYPE(r, REG_TYPE_TEMP);
830 break;
831 case PROGRAM_INPUT:
832 REG_SET_INDEX(r, fpsrc.Index);
833 REG_SET_VALID(r, GL_TRUE);
834 REG_SET_TYPE(r, REG_TYPE_INPUT);
835 break;
836 case PROGRAM_LOCAL_PARAM:
837 r = emit_const4fv(cs,
838 fp->mesa_program.Base.LocalParams[fpsrc.
839 Index]);
840 break;
841 case PROGRAM_ENV_PARAM:
842 r = emit_const4fv(cs,
843 cs->r300->radeon.glCtx->FragmentProgram.Parameters[fpsrc.Index]);
844 break;
845 case PROGRAM_STATE_VAR:
846 case PROGRAM_NAMED_PARAM:
847 case PROGRAM_CONSTANT:
848 r = emit_const4fv(cs,
849 fp->mesa_program.Base.Parameters->
850 ParameterValues[fpsrc.Index]);
851 break;
852 default:
853 ERROR("unknown SrcReg->File %x\n", fpsrc.File);
854 return r;
855 }
856
857 /* no point swizzling ONE/ZERO/HALF constants... */
858 if (REG_GET_VSWZ(r) < SWIZZLE_111 || REG_GET_SSWZ(r) < SWIZZLE_ZERO)
859 r = do_swizzle(cs, r, fpsrc.Swizzle, fpsrc.NegateBase);
860 return r;
861 }
862
863 static GLuint t_scalar_src(struct r300_pfs_compile_state *cs,
864 struct prog_src_register fpsrc)
865 {
866 struct prog_src_register src = fpsrc;
867 int sc = GET_SWZ(fpsrc.Swizzle, 0); /* X */
868
869 src.Swizzle = ((sc << 0) | (sc << 3) | (sc << 6) | (sc << 9));
870
871 return t_src(cs, src);
872 }
873
874 static GLuint t_dst(struct r300_pfs_compile_state *cs,
875 struct prog_dst_register dest)
876 {
877 COMPILE_STATE;
878 GLuint r = undef;
879
880 switch (dest.File) {
881 case PROGRAM_TEMPORARY:
882 REG_SET_INDEX(r, dest.Index);
883 REG_SET_VALID(r, GL_TRUE);
884 REG_SET_TYPE(r, REG_TYPE_TEMP);
885 return r;
886 case PROGRAM_OUTPUT:
887 REG_SET_TYPE(r, REG_TYPE_OUTPUT);
888 switch (dest.Index) {
889 case FRAG_RESULT_COLR:
890 case FRAG_RESULT_DEPR:
891 REG_SET_INDEX(r, dest.Index);
892 REG_SET_VALID(r, GL_TRUE);
893 return r;
894 default:
895 ERROR("Bad DstReg->Index 0x%x\n", dest.Index);
896 return r;
897 }
898 default:
899 ERROR("Bad DstReg->File 0x%x\n", dest.File);
900 return r;
901 }
902 }
903
904 static int t_hw_src(struct r300_pfs_compile_state *cs, GLuint src, GLboolean tex)
905 {
906 COMPILE_STATE;
907 int idx;
908 int index = REG_GET_INDEX(src);
909
910 switch (REG_GET_TYPE(src)) {
911 case REG_TYPE_TEMP:
912 /* NOTE: if reg==-1 here, a source is being read that
913 * hasn't been written to. Undefined results.
914 */
915 if (cs->temps[index].reg == -1)
916 cs->temps[index].reg = get_hw_temp(cs, cs->nrslots);
917
918 idx = cs->temps[index].reg;
919
920 if (!REG_GET_NO_USE(src) && (--cs->temps[index].refcount == 0))
921 free_temp(cs, src);
922 break;
923 case REG_TYPE_INPUT:
924 idx = cs->inputs[index].reg;
925
926 if (!REG_GET_NO_USE(src) && (--cs->inputs[index].refcount == 0))
927 free_hw_temp(cs, cs->inputs[index].reg);
928 break;
929 case REG_TYPE_CONST:
930 return (index | SRC_CONST);
931 default:
932 ERROR("Invalid type for source reg\n");
933 return (0 | SRC_CONST);
934 }
935
936 if (!tex)
937 cs->used_in_node |= (1 << idx);
938
939 return idx;
940 }
941
942 static int t_hw_dst(struct r300_pfs_compile_state *cs,
943 GLuint dest, GLboolean tex, int slot)
944 {
945 COMPILE_STATE;
946 int idx;
947 GLuint index = REG_GET_INDEX(dest);
948 assert(REG_GET_VALID(dest));
949
950 switch (REG_GET_TYPE(dest)) {
951 case REG_TYPE_TEMP:
952 if (cs->temps[REG_GET_INDEX(dest)].reg == -1) {
953 if (!tex) {
954 cs->temps[index].reg = get_hw_temp(cs, slot);
955 } else {
956 cs->temps[index].reg = get_hw_temp_tex(cs);
957 }
958 }
959 idx = cs->temps[index].reg;
960
961 if (!REG_GET_NO_USE(dest) && (--cs->temps[index].refcount == 0))
962 free_temp(cs, dest);
963
964 cs->dest_in_node |= (1 << idx);
965 cs->used_in_node |= (1 << idx);
966 break;
967 case REG_TYPE_OUTPUT:
968 switch (index) {
969 case FRAG_RESULT_COLR:
970 code->node[code->cur_node].flags |= R300_RGBA_OUT;
971 break;
972 case FRAG_RESULT_DEPR:
973 fp->WritesDepth = GL_TRUE;
974 code->node[code->cur_node].flags |= R300_W_OUT;
975 break;
976 }
977 return index;
978 break;
979 default:
980 ERROR("invalid dest reg type %d\n", REG_GET_TYPE(dest));
981 return 0;
982 }
983
984 return idx;
985 }
986
987 static void emit_nop(struct r300_pfs_compile_state *cs)
988 {
989 COMPILE_STATE;
990
991 if (cs->nrslots >= PFS_MAX_ALU_INST) {
992 ERROR("Out of ALU instruction slots\n");
993 return;
994 }
995
996 code->alu.inst[cs->nrslots].inst0 = NOP_INST0;
997 code->alu.inst[cs->nrslots].inst1 = NOP_INST1;
998 code->alu.inst[cs->nrslots].inst2 = NOP_INST2;
999 code->alu.inst[cs->nrslots].inst3 = NOP_INST3;
1000 cs->nrslots++;
1001 }
1002
1003 static void emit_tex(struct r300_pfs_compile_state *cs,
1004 struct prog_instruction *fpi, int opcode)
1005 {
1006 COMPILE_STATE;
1007 GLuint coord = t_src(cs, fpi->SrcReg[0]);
1008 GLuint dest = undef, rdest = undef;
1009 GLuint din, uin;
1010 int unit = fpi->TexSrcUnit;
1011 int hwsrc, hwdest;
1012 GLuint tempreg = 0;
1013
1014 /**
1015 * Hardware uses [0..1]x[0..1] range for rectangle textures
1016 * instead of [0..Width]x[0..Height].
1017 * Add a scaling instruction.
1018 *
1019 * \todo Refactor this once we have proper rewriting/optimization
1020 * support for programs.
1021 */
1022 if (opcode != R300_TEX_OP_KIL && fpi->TexSrcTarget == TEXTURE_RECT_INDEX) {
1023 gl_state_index tokens[STATE_LENGTH] = {
1024 STATE_INTERNAL, STATE_R300_TEXRECT_FACTOR, 0, 0,
1025 0
1026 };
1027 int factor_index;
1028 GLuint factorreg;
1029
1030 tokens[2] = unit;
1031 factor_index =
1032 _mesa_add_state_reference(cs->fp->mesa_program.Base.
1033 Parameters, tokens);
1034 factorreg =
1035 emit_const4fv(cs,
1036 cs->fp->mesa_program.Base.Parameters->
1037 ParameterValues[factor_index]);
1038 tempreg = keep(get_temp_reg(cs));
1039
1040 emit_arith(cs, PFS_OP_MAD, tempreg, WRITEMASK_XYZW,
1041 coord, factorreg, pfs_zero, 0);
1042
1043 coord = tempreg;
1044 }
1045
1046 /* Texture operations do not support swizzles etc. in hardware,
1047 * so emit an additional arithmetic operation if necessary.
1048 */
1049 if (REG_GET_VSWZ(coord) != SWIZZLE_XYZ ||
1050 REG_GET_SSWZ(coord) != SWIZZLE_W ||
1051 coord & (REG_NEGV_MASK | REG_NEGS_MASK | REG_ABS_MASK)) {
1052 assert(tempreg == 0);
1053 tempreg = keep(get_temp_reg(cs));
1054 emit_arith(cs, PFS_OP_MAD, tempreg, WRITEMASK_XYZW,
1055 coord, pfs_one, pfs_zero, 0);
1056 coord = tempreg;
1057 }
1058
1059 /* Ensure correct node indirection */
1060 uin = cs->used_in_node;
1061 din = cs->dest_in_node;
1062
1063 /* Resolve source/dest to hardware registers */
1064 hwsrc = t_hw_src(cs, coord, GL_TRUE);
1065
1066 if (opcode != R300_TEX_OP_KIL) {
1067 dest = t_dst(cs, fpi->DstReg);
1068
1069 /* r300 doesn't seem to be able to do TEX->output reg */
1070 if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT) {
1071 rdest = dest;
1072 dest = get_temp_reg_tex(cs);
1073 } else if (fpi->DstReg.WriteMask != WRITEMASK_XYZW) {
1074 /* in case write mask isn't XYZW */
1075 rdest = dest;
1076 dest = get_temp_reg_tex(cs);
1077 }
1078 hwdest =
1079 t_hw_dst(cs, dest, GL_TRUE,
1080 code->node[code->cur_node].alu_offset);
1081
1082 /* Use a temp that hasn't been used in this node, rather
1083 * than causing an indirection
1084 */
1085 if (uin & (1 << hwdest)) {
1086 free_hw_temp(cs, hwdest);
1087 hwdest = get_hw_temp_tex(cs);
1088 cs->temps[REG_GET_INDEX(dest)].reg = hwdest;
1089 }
1090 } else {
1091 hwdest = 0;
1092 unit = 0;
1093 }
1094
1095 /* Indirection if source has been written in this node, or if the
1096 * dest has been read/written in this node
1097 */
1098 if ((REG_GET_TYPE(coord) != REG_TYPE_CONST &&
1099 (din & (1 << hwsrc))) || (uin & (1 << hwdest))) {
1100
1101 /* Finish off current node */
1102 if (code->node[code->cur_node].alu_offset == cs->nrslots)
1103 emit_nop(cs);
1104
1105 code->node[code->cur_node].alu_end =
1106 cs->nrslots - code->node[code->cur_node].alu_offset - 1;
1107 assert(code->node[code->cur_node].alu_end >= 0);
1108
1109 if (++code->cur_node >= PFS_MAX_TEX_INDIRECT) {
1110 ERROR("too many levels of texture indirection\n");
1111 return;
1112 }
1113
1114 /* Start new node */
1115 code->node[code->cur_node].tex_offset = code->tex.length;
1116 code->node[code->cur_node].alu_offset = cs->nrslots;
1117 code->node[code->cur_node].tex_end = -1;
1118 code->node[code->cur_node].alu_end = -1;
1119 code->node[code->cur_node].flags = 0;
1120 cs->used_in_node = 0;
1121 cs->dest_in_node = 0;
1122 }
1123
1124 if (code->cur_node == 0)
1125 code->first_node_has_tex = 1;
1126
1127 code->tex.inst[code->tex.length++] = 0 | (hwsrc << R300_SRC_ADDR_SHIFT)
1128 | (hwdest << R300_DST_ADDR_SHIFT)
1129 | (unit << R300_TEX_ID_SHIFT)
1130 | (opcode << R300_TEX_INST_SHIFT);
1131
1132 cs->dest_in_node |= (1 << hwdest);
1133 if (REG_GET_TYPE(coord) != REG_TYPE_CONST)
1134 cs->used_in_node |= (1 << hwsrc);
1135
1136 code->node[code->cur_node].tex_end++;
1137
1138 /* Copy from temp to output if needed */
1139 if (REG_GET_VALID(rdest)) {
1140 emit_arith(cs, PFS_OP_MAD, rdest, fpi->DstReg.WriteMask, dest,
1141 pfs_one, pfs_zero, 0);
1142 free_temp(cs, dest);
1143 }
1144
1145 /* Free temp register */
1146 if (tempreg != 0)
1147 free_temp(cs, tempreg);
1148 }
1149
1150 /**
1151 * Returns the first slot where we could possibly allow writing to dest,
1152 * according to register allocation.
1153 */
1154 static int get_earliest_allowed_write(struct r300_pfs_compile_state *cs,
1155 GLuint dest, int mask)
1156 {
1157 COMPILE_STATE;
1158 int idx;
1159 int pos;
1160 GLuint index = REG_GET_INDEX(dest);
1161 assert(REG_GET_VALID(dest));
1162
1163 switch (REG_GET_TYPE(dest)) {
1164 case REG_TYPE_TEMP:
1165 if (cs->temps[index].reg == -1)
1166 return 0;
1167
1168 idx = cs->temps[index].reg;
1169 break;
1170 case REG_TYPE_OUTPUT:
1171 return 0;
1172 default:
1173 ERROR("invalid dest reg type %d\n", REG_GET_TYPE(dest));
1174 return 0;
1175 }
1176
1177 pos = cs->hwtemps[idx].reserved;
1178 if (mask & WRITEMASK_XYZ) {
1179 if (pos < cs->hwtemps[idx].vector_lastread)
1180 pos = cs->hwtemps[idx].vector_lastread;
1181 }
1182 if (mask & WRITEMASK_W) {
1183 if (pos < cs->hwtemps[idx].scalar_lastread)
1184 pos = cs->hwtemps[idx].scalar_lastread;
1185 }
1186
1187 return pos;
1188 }
1189
1190 /**
1191 * Allocates a slot for an ALU instruction that can consist of
1192 * a vertex part or a scalar part or both.
1193 *
1194 * Sources from src (src[0] to src[argc-1]) are added to the slot in the
1195 * appropriate position (vector and/or scalar), and their positions are
1196 * recorded in the srcpos array.
1197 *
1198 * This function emits instruction code for the source fetch and the
1199 * argument selection. It does not emit instruction code for the
1200 * opcode or the destination selection.
1201 *
1202 * @return the index of the slot
1203 */
1204 static int find_and_prepare_slot(struct r300_pfs_compile_state *cs,
1205 GLboolean emit_vop,
1206 GLboolean emit_sop,
1207 int argc, GLuint * src, GLuint dest, int mask)
1208 {
1209 COMPILE_STATE;
1210 int hwsrc[3];
1211 int srcpos[3];
1212 unsigned int used;
1213 int tempused;
1214 int tempvsrc[3];
1215 int tempssrc[3];
1216 int pos;
1217 int regnr;
1218 int i, j;
1219
1220 // Determine instruction slots, whether sources are required on
1221 // vector or scalar side, and the smallest slot number where
1222 // all source registers are available
1223 used = 0;
1224 if (emit_vop)
1225 used |= SLOT_OP_VECTOR;
1226 if (emit_sop)
1227 used |= SLOT_OP_SCALAR;
1228
1229 pos = get_earliest_allowed_write(cs, dest, mask);
1230
1231 if (code->node[code->cur_node].alu_offset > pos)
1232 pos = code->node[code->cur_node].alu_offset;
1233 for (i = 0; i < argc; ++i) {
1234 if (!REG_GET_BUILTIN(src[i])) {
1235 if (emit_vop)
1236 used |= v_swiz[REG_GET_VSWZ(src[i])].flags << i;
1237 if (emit_sop)
1238 used |= s_swiz[REG_GET_SSWZ(src[i])].flags << i;
1239 }
1240
1241 hwsrc[i] = t_hw_src(cs, src[i], GL_FALSE); /* Note: sideeffects wrt refcounting! */
1242 regnr = hwsrc[i] & 31;
1243
1244 if (REG_GET_TYPE(src[i]) == REG_TYPE_TEMP) {
1245 if (used & (SLOT_SRC_VECTOR << i)) {
1246 if (cs->hwtemps[regnr].vector_valid > pos)
1247 pos = cs->hwtemps[regnr].vector_valid;
1248 }
1249 if (used & (SLOT_SRC_SCALAR << i)) {
1250 if (cs->hwtemps[regnr].scalar_valid > pos)
1251 pos = cs->hwtemps[regnr].scalar_valid;
1252 }
1253 }
1254 }
1255
1256 // Find a slot that fits
1257 for (;; ++pos) {
1258 if (cs->slot[pos].used & used & SLOT_OP_BOTH)
1259 continue;
1260
1261 if (pos >= cs->nrslots) {
1262 if (cs->nrslots >= PFS_MAX_ALU_INST) {
1263 ERROR("Out of ALU instruction slots\n");
1264 return -1;
1265 }
1266
1267 fp->code.alu.inst[pos].inst0 = NOP_INST0;
1268 fp->code.alu.inst[pos].inst1 = NOP_INST1;
1269 fp->code.alu.inst[pos].inst2 = NOP_INST2;
1270 fp->code.alu.inst[pos].inst3 = NOP_INST3;
1271
1272 cs->nrslots++;
1273 }
1274 // Note: When we need both parts (vector and scalar) of a source,
1275 // we always try to put them into the same position. This makes the
1276 // code easier to read, and it is optimal (i.e. one doesn't gain
1277 // anything by splitting the parts).
1278 // It also avoids headaches with swizzles that access both parts (i.e WXY)
1279 tempused = cs->slot[pos].used;
1280 for (i = 0; i < 3; ++i) {
1281 tempvsrc[i] = cs->slot[pos].vsrc[i];
1282 tempssrc[i] = cs->slot[pos].ssrc[i];
1283 }
1284
1285 for (i = 0; i < argc; ++i) {
1286 int flags = (used >> i) & SLOT_SRC_BOTH;
1287
1288 if (!flags) {
1289 srcpos[i] = 0;
1290 continue;
1291 }
1292
1293 for (j = 0; j < 3; ++j) {
1294 if ((tempused >> j) & flags & SLOT_SRC_VECTOR) {
1295 if (tempvsrc[j] != hwsrc[i])
1296 continue;
1297 }
1298
1299 if ((tempused >> j) & flags & SLOT_SRC_SCALAR) {
1300 if (tempssrc[j] != hwsrc[i])
1301 continue;
1302 }
1303
1304 break;
1305 }
1306
1307 if (j == 3)
1308 break;
1309
1310 srcpos[i] = j;
1311 tempused |= flags << j;
1312 if (flags & SLOT_SRC_VECTOR)
1313 tempvsrc[j] = hwsrc[i];
1314 if (flags & SLOT_SRC_SCALAR)
1315 tempssrc[j] = hwsrc[i];
1316 }
1317
1318 if (i == argc)
1319 break;
1320 }
1321
1322 // Found a slot, reserve it
1323 cs->slot[pos].used = tempused | (used & SLOT_OP_BOTH);
1324 for (i = 0; i < 3; ++i) {
1325 cs->slot[pos].vsrc[i] = tempvsrc[i];
1326 cs->slot[pos].ssrc[i] = tempssrc[i];
1327 }
1328
1329 for (i = 0; i < argc; ++i) {
1330 if (REG_GET_TYPE(src[i]) == REG_TYPE_TEMP) {
1331 int regnr = hwsrc[i] & 31;
1332
1333 if (used & (SLOT_SRC_VECTOR << i)) {
1334 if (cs->hwtemps[regnr].vector_lastread < pos)
1335 cs->hwtemps[regnr].vector_lastread =
1336 pos;
1337 }
1338 if (used & (SLOT_SRC_SCALAR << i)) {
1339 if (cs->hwtemps[regnr].scalar_lastread < pos)
1340 cs->hwtemps[regnr].scalar_lastread =
1341 pos;
1342 }
1343 }
1344 }
1345
1346 // Emit the source fetch code
1347 code->alu.inst[pos].inst1 &= ~R300_ALU_SRC_MASK;
1348 code->alu.inst[pos].inst1 |=
1349 ((cs->slot[pos].vsrc[0] << R300_ALU_SRC0C_SHIFT) |
1350 (cs->slot[pos].vsrc[1] << R300_ALU_SRC1C_SHIFT) |
1351 (cs->slot[pos].vsrc[2] << R300_ALU_SRC2C_SHIFT));
1352
1353 code->alu.inst[pos].inst3 &= ~R300_ALU_SRC_MASK;
1354 code->alu.inst[pos].inst3 |=
1355 ((cs->slot[pos].ssrc[0] << R300_ALU_SRC0A_SHIFT) |
1356 (cs->slot[pos].ssrc[1] << R300_ALU_SRC1A_SHIFT) |
1357 (cs->slot[pos].ssrc[2] << R300_ALU_SRC2A_SHIFT));
1358
1359 // Emit the argument selection code
1360 if (emit_vop) {
1361 int swz[3];
1362
1363 for (i = 0; i < 3; ++i) {
1364 if (i < argc) {
1365 swz[i] = (v_swiz[REG_GET_VSWZ(src[i])].base +
1366 (srcpos[i] *
1367 v_swiz[REG_GET_VSWZ(src[i])].
1368 stride)) | ((src[i] & REG_NEGV_MASK)
1369 ? ARG_NEG : 0) | ((src[i]
1370 &
1371 REG_ABS_MASK)
1372 ?
1373 ARG_ABS
1374 : 0);
1375 } else {
1376 swz[i] = R300_ALU_ARGC_ZERO;
1377 }
1378 }
1379
1380 code->alu.inst[pos].inst0 &=
1381 ~(R300_ALU_ARG0C_MASK | R300_ALU_ARG1C_MASK |
1382 R300_ALU_ARG2C_MASK);
1383 code->alu.inst[pos].inst0 |=
1384 (swz[0] << R300_ALU_ARG0C_SHIFT) | (swz[1] <<
1385 R300_ALU_ARG1C_SHIFT)
1386 | (swz[2] << R300_ALU_ARG2C_SHIFT);
1387 }
1388
1389 if (emit_sop) {
1390 int swz[3];
1391
1392 for (i = 0; i < 3; ++i) {
1393 if (i < argc) {
1394 swz[i] = (s_swiz[REG_GET_SSWZ(src[i])].base +
1395 (srcpos[i] *
1396 s_swiz[REG_GET_SSWZ(src[i])].
1397 stride)) | ((src[i] & REG_NEGV_MASK)
1398 ? ARG_NEG : 0) | ((src[i]
1399 &
1400 REG_ABS_MASK)
1401 ?
1402 ARG_ABS
1403 : 0);
1404 } else {
1405 swz[i] = R300_ALU_ARGA_ZERO;
1406 }
1407 }
1408
1409 code->alu.inst[pos].inst2 &=
1410 ~(R300_ALU_ARG0A_MASK | R300_ALU_ARG1A_MASK |
1411 R300_ALU_ARG2A_MASK);
1412 code->alu.inst[pos].inst2 |=
1413 (swz[0] << R300_ALU_ARG0A_SHIFT) | (swz[1] <<
1414 R300_ALU_ARG1A_SHIFT)
1415 | (swz[2] << R300_ALU_ARG2A_SHIFT);
1416 }
1417
1418 return pos;
1419 }
1420
1421 /**
1422 * Append an ALU instruction to the instruction list.
1423 */
1424 static void emit_arith(struct r300_pfs_compile_state *cs,
1425 int op,
1426 GLuint dest,
1427 int mask,
1428 GLuint src0, GLuint src1, GLuint src2, int flags)
1429 {
1430 COMPILE_STATE;
1431 GLuint src[3] = { src0, src1, src2 };
1432 int hwdest;
1433 GLboolean emit_vop, emit_sop;
1434 int vop, sop, argc;
1435 int pos;
1436
1437 vop = r300_fpop[op].v_op;
1438 sop = r300_fpop[op].s_op;
1439 argc = r300_fpop[op].argc;
1440
1441 if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT &&
1442 REG_GET_INDEX(dest) == FRAG_RESULT_DEPR) {
1443 if (mask & WRITEMASK_Z) {
1444 mask = WRITEMASK_W;
1445 } else {
1446 return;
1447 }
1448 }
1449
1450 emit_vop = GL_FALSE;
1451 emit_sop = GL_FALSE;
1452 if ((mask & WRITEMASK_XYZ) || vop == R300_ALU_OUTC_DP3)
1453 emit_vop = GL_TRUE;
1454 if ((mask & WRITEMASK_W) || vop == R300_ALU_OUTC_REPL_ALPHA)
1455 emit_sop = GL_TRUE;
1456
1457 pos =
1458 find_and_prepare_slot(cs, emit_vop, emit_sop, argc, src, dest,
1459 mask);
1460 if (pos < 0)
1461 return;
1462
1463 hwdest = t_hw_dst(cs, dest, GL_FALSE, pos); /* Note: Side effects wrt register allocation */
1464
1465 if (flags & PFS_FLAG_SAT) {
1466 vop |= R300_ALU_OUTC_CLAMP;
1467 sop |= R300_ALU_OUTA_CLAMP;
1468 }
1469
1470 /* Throw the pieces together and get ALU/1 */
1471 if (emit_vop) {
1472 code->alu.inst[pos].inst0 |= vop;
1473
1474 code->alu.inst[pos].inst1 |= hwdest << R300_ALU_DSTC_SHIFT;
1475
1476 if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT) {
1477 if (REG_GET_INDEX(dest) == FRAG_RESULT_COLR) {
1478 code->alu.inst[pos].inst1 |=
1479 (mask & WRITEMASK_XYZ) <<
1480 R300_ALU_DSTC_OUTPUT_MASK_SHIFT;
1481 } else
1482 assert(0);
1483 } else {
1484 code->alu.inst[pos].inst1 |=
1485 (mask & WRITEMASK_XYZ) <<
1486 R300_ALU_DSTC_REG_MASK_SHIFT;
1487
1488 cs->hwtemps[hwdest].vector_valid = pos + 1;
1489 }
1490 }
1491
1492 /* And now ALU/3 */
1493 if (emit_sop) {
1494 code->alu.inst[pos].inst2 |= sop;
1495
1496 if (mask & WRITEMASK_W) {
1497 if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT) {
1498 if (REG_GET_INDEX(dest) == FRAG_RESULT_COLR) {
1499 code->alu.inst[pos].inst3 |=
1500 (hwdest << R300_ALU_DSTA_SHIFT) |
1501 R300_ALU_DSTA_OUTPUT;
1502 } else if (REG_GET_INDEX(dest) ==
1503 FRAG_RESULT_DEPR) {
1504 code->alu.inst[pos].inst3 |=
1505 R300_ALU_DSTA_DEPTH;
1506 } else
1507 assert(0);
1508 } else {
1509 code->alu.inst[pos].inst3 |=
1510 (hwdest << R300_ALU_DSTA_SHIFT) |
1511 R300_ALU_DSTA_REG;
1512
1513 cs->hwtemps[hwdest].scalar_valid = pos + 1;
1514 }
1515 }
1516 }
1517
1518 return;
1519 }
1520
#if 0
/*
 * NOTE: this helper is compiled out by the surrounding #if 0.
 *
 * Build a register descriptor for fragment input attribute 'attr'.
 * If the program's InputsRead bitmask does not contain the attribute,
 * an ERROR is raised and 'undef' is returned.  Otherwise the result is
 * 'undef' with its type set to REG_TYPE_INPUT, its index set to 'attr'
 * and its valid flag set.
 */
static GLuint get_attrib(struct r300_fragment_program *fp, GLuint attr)
{
	struct gl_fragment_program *mp = &fp->mesa_program;
	GLuint r = undef;

	/* The attribute must be one the program actually reads */
	if (!(mp->Base.InputsRead & (1 << attr))) {
		ERROR("Attribute %d was not provided!\n", attr);
		return undef;
	}

	REG_SET_TYPE(r, REG_TYPE_INPUT);
	REG_SET_INDEX(r, attr);
	REG_SET_VALID(r, GL_TRUE);
	return r;
}
#endif
1538
1539 static GLfloat SinCosConsts[2][4] = {
1540 {
1541 1.273239545, // 4/PI
1542 -0.405284735, // -4/(PI*PI)
1543 3.141592654, // PI
1544 0.2225 // weight
1545 },
1546 {
1547 0.75,
1548 0.0,
1549 0.159154943, // 1/(2*PI)
1550 6.283185307 // 2*PI
1551 }
1552 };
1553
1554 /**
1555 * Emit a LIT instruction.
1556 * \p flags may be PFS_FLAG_SAT
1557 *
1558 * Definition of LIT (from ARB_fragment_program):
1559 * tmp = VectorLoad(op0);
1560 * if (tmp.x < 0) tmp.x = 0;
1561 * if (tmp.y < 0) tmp.y = 0;
1562 * if (tmp.w < -(128.0-epsilon)) tmp.w = -(128.0-epsilon);
1563 * else if (tmp.w > 128-epsilon) tmp.w = 128-epsilon;
1564 * result.x = 1.0;
1565 * result.y = tmp.x;
1566 * result.z = (tmp.x > 0) ? RoughApproxPower(tmp.y, tmp.w) : 0.0;
1567 * result.w = 1.0;
1568 *
1569 * The longest path of computation is the one leading to result.z,
1570 * consisting of 5 operations. This implementation of LIT takes
1571 * 5 slots. So unless there's some special undocumented opcode,
1572 * this implementation is potentially optimal. Unfortunately,
1573 * emit_arith is a bit too conservative because it doesn't understand
1574 * partial writes to the vector component.
1575 */
1576 static const GLfloat LitConst[4] =
1577 { 127.999999, 127.999999, 127.999999, -127.999999 };
1578
1579 static void emit_lit(struct r300_pfs_compile_state *cs,
1580 GLuint dest, int mask, GLuint src, int flags)
1581 {
1582 COMPILE_STATE;
1583 GLuint cnst;
1584 int needTemporary;
1585 GLuint temp;
1586
1587 cnst = emit_const4fv(cs, LitConst);
1588
1589 needTemporary = 0;
1590 if ((mask & WRITEMASK_XYZW) != WRITEMASK_XYZW) {
1591 needTemporary = 1;
1592 } else if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT) {
1593 // LIT is typically followed by DP3/DP4, so there's no point
1594 // in creating special code for this case
1595 needTemporary = 1;
1596 }
1597
1598 if (needTemporary) {
1599 temp = keep(get_temp_reg(cs));
1600 } else {
1601 temp = keep(dest);
1602 }
1603
1604 // Note: The order of emit_arith inside the slots is relevant,
1605 // because emit_arith only looks at scalar vs. vector when resolving
1606 // dependencies, and it does not consider individual vector components,
1607 // so swizzling between the two parts can create fake dependencies.
1608
1609 // First slot
1610 emit_arith(cs, PFS_OP_MAX, temp, WRITEMASK_XY,
1611 keep(src), pfs_zero, undef, 0);
1612 emit_arith(cs, PFS_OP_MAX, temp, WRITEMASK_W, src, cnst, undef, 0);
1613
1614 // Second slot
1615 emit_arith(cs, PFS_OP_MIN, temp, WRITEMASK_Z,
1616 swizzle(temp, W, W, W, W), cnst, undef, 0);
1617 emit_arith(cs, PFS_OP_LG2, temp, WRITEMASK_W,
1618 swizzle(temp, Y, Y, Y, Y), undef, undef, 0);
1619
1620 // Third slot
1621 // If desired, we saturate the y result here.
1622 // This does not affect the use as a condition variable in the CMP later
1623 emit_arith(cs, PFS_OP_MAD, temp, WRITEMASK_W,
1624 temp, swizzle(temp, Z, Z, Z, Z), pfs_zero, 0);
1625 emit_arith(cs, PFS_OP_MAD, temp, WRITEMASK_Y,
1626 swizzle(temp, X, X, X, X), pfs_one, pfs_zero, flags);
1627
1628 // Fourth slot
1629 emit_arith(cs, PFS_OP_MAD, temp, WRITEMASK_X,
1630 pfs_one, pfs_one, pfs_zero, 0);
1631 emit_arith(cs, PFS_OP_EX2, temp, WRITEMASK_W, temp, undef, undef, 0);
1632
1633 // Fifth slot
1634 emit_arith(cs, PFS_OP_CMP, temp, WRITEMASK_Z,
1635 pfs_zero, swizzle(temp, W, W, W, W),
1636 negate(swizzle(temp, Y, Y, Y, Y)), flags);
1637 emit_arith(cs, PFS_OP_MAD, temp, WRITEMASK_W, pfs_one, pfs_one,
1638 pfs_zero, 0);
1639
1640 if (needTemporary) {
1641 emit_arith(cs, PFS_OP_MAD, dest, mask,
1642 temp, pfs_one, pfs_zero, flags);
1643 free_temp(cs, temp);
1644 } else {
1645 // Decrease refcount of the destination
1646 t_hw_dst(cs, dest, GL_FALSE, cs->nrslots);
1647 }
1648 }
1649
1650 static GLboolean parse_program(struct r300_pfs_compile_state *cs)
1651 {
1652 COMPILE_STATE;
1653 struct gl_fragment_program *mp = &fp->mesa_program;
1654 const struct prog_instruction *inst = mp->Base.Instructions;
1655 struct prog_instruction *fpi;
1656 GLuint src[3], dest, temp[2];
1657 int flags, mask = 0;
1658 int const_sin[2];
1659
1660 if (!inst || inst[0].Opcode == OPCODE_END) {
1661 ERROR("empty program?\n");
1662 return GL_FALSE;
1663 }
1664
1665 for (fpi = mp->Base.Instructions; fpi->Opcode != OPCODE_END; fpi++) {
1666 if (fpi->SaturateMode == SATURATE_ZERO_ONE)
1667 flags = PFS_FLAG_SAT;
1668 else
1669 flags = 0;
1670
1671 if (fpi->Opcode != OPCODE_KIL) {
1672 dest = t_dst(cs, fpi->DstReg);
1673 mask = fpi->DstReg.WriteMask;
1674 }
1675
1676 switch (fpi->Opcode) {
1677 case OPCODE_ABS:
1678 src[0] = t_src(cs, fpi->SrcReg[0]);
1679 emit_arith(cs, PFS_OP_MAD, dest, mask,
1680 absolute(src[0]), pfs_one, pfs_zero, flags);
1681 break;
1682 case OPCODE_ADD:
1683 src[0] = t_src(cs, fpi->SrcReg[0]);
1684 src[1] = t_src(cs, fpi->SrcReg[1]);
1685 emit_arith(cs, PFS_OP_MAD, dest, mask,
1686 src[0], pfs_one, src[1], flags);
1687 break;
1688 case OPCODE_CMP:
1689 src[0] = t_src(cs, fpi->SrcReg[0]);
1690 src[1] = t_src(cs, fpi->SrcReg[1]);
1691 src[2] = t_src(cs, fpi->SrcReg[2]);
1692 /* ARB_f_p - if src0.c < 0.0 ? src1.c : src2.c
1693 * r300 - if src2.c < 0.0 ? src1.c : src0.c
1694 */
1695 emit_arith(cs, PFS_OP_CMP, dest, mask,
1696 src[2], src[1], src[0], flags);
1697 break;
1698 case OPCODE_COS:
1699 /*
1700 * cos using a parabola (see SIN):
1701 * cos(x):
1702 * x = (x/(2*PI))+0.75
1703 * x = frac(x)
1704 * x = (x*2*PI)-PI
1705 * result = sin(x)
1706 */
1707 temp[0] = get_temp_reg(cs);
1708 const_sin[0] = emit_const4fv(cs, SinCosConsts[0]);
1709 const_sin[1] = emit_const4fv(cs, SinCosConsts[1]);
1710 src[0] = t_scalar_src(cs, fpi->SrcReg[0]);
1711
1712 /* add 0.5*PI and do range reduction */
1713
1714 emit_arith(cs, PFS_OP_MAD, temp[0], WRITEMASK_X,
1715 swizzle(src[0], X, X, X, X),
1716 swizzle(const_sin[1], Z, Z, Z, Z),
1717 swizzle(const_sin[1], X, X, X, X), 0);
1718
1719 emit_arith(cs, PFS_OP_FRC, temp[0], WRITEMASK_X,
1720 swizzle(temp[0], X, X, X, X),
1721 undef, undef, 0);
1722
1723 emit_arith(cs, PFS_OP_MAD, temp[0], WRITEMASK_Z, swizzle(temp[0], X, X, X, X), swizzle(const_sin[1], W, W, W, W), //2*PI
1724 negate(swizzle(const_sin[0], Z, Z, Z, Z)), //-PI
1725 0);
1726
1727 /* SIN */
1728
1729 emit_arith(cs, PFS_OP_MAD, temp[0],
1730 WRITEMASK_X | WRITEMASK_Y, swizzle(temp[0],
1731 Z, Z, Z,
1732 Z),
1733 const_sin[0], pfs_zero, 0);
1734
1735 emit_arith(cs, PFS_OP_MAD, temp[0], WRITEMASK_X,
1736 swizzle(temp[0], Y, Y, Y, Y),
1737 absolute(swizzle(temp[0], Z, Z, Z, Z)),
1738 swizzle(temp[0], X, X, X, X), 0);
1739
1740 emit_arith(cs, PFS_OP_MAD, temp[0], WRITEMASK_Y,
1741 swizzle(temp[0], X, X, X, X),
1742 absolute(swizzle(temp[0], X, X, X, X)),
1743 negate(swizzle(temp[0], X, X, X, X)), 0);
1744
1745 emit_arith(cs, PFS_OP_MAD, dest, mask,
1746 swizzle(temp[0], Y, Y, Y, Y),
1747 swizzle(const_sin[0], W, W, W, W),
1748 swizzle(temp[0], X, X, X, X), flags);
1749
1750 free_temp(cs, temp[0]);
1751 break;
1752 case OPCODE_DP3:
1753 src[0] = t_src(cs, fpi->SrcReg[0]);
1754 src[1] = t_src(cs, fpi->SrcReg[1]);
1755 emit_arith(cs, PFS_OP_DP3, dest, mask,
1756 src[0], src[1], undef, flags);
1757 break;
1758 case OPCODE_DP4:
1759 src[0] = t_src(cs, fpi->SrcReg[0]);
1760 src[1] = t_src(cs, fpi->SrcReg[1]);
1761 emit_arith(cs, PFS_OP_DP4, dest, mask,
1762 src[0], src[1], undef, flags);
1763 break;
1764 case OPCODE_DPH:
1765 src[0] = t_src(cs, fpi->SrcReg[0]);
1766 src[1] = t_src(cs, fpi->SrcReg[1]);
1767 /* src0.xyz1 -> temp
1768 * DP4 dest, temp, src1
1769 */
1770 emit_arith(cs, PFS_OP_DP4, dest, mask,
1771 swizzle(src[0], X, Y, Z, ONE), src[1],
1772 undef, flags);
1773 break;
1774 case OPCODE_DST:
1775 src[0] = t_src(cs, fpi->SrcReg[0]);
1776 src[1] = t_src(cs, fpi->SrcReg[1]);
1777 /* dest.y = src0.y * src1.y */
1778 if (mask & WRITEMASK_Y)
1779 emit_arith(cs, PFS_OP_MAD, dest, WRITEMASK_Y,
1780 keep(src[0]), keep(src[1]),
1781 pfs_zero, flags);
1782 /* dest.z = src0.z */
1783 if (mask & WRITEMASK_Z)
1784 emit_arith(cs, PFS_OP_MAD, dest, WRITEMASK_Z,
1785 src[0], pfs_one, pfs_zero, flags);
1786 /* result.x = 1.0
1787 * result.w = src1.w */
1788 if (mask & WRITEMASK_XW) {
1789 REG_SET_VSWZ(src[1], SWIZZLE_111); /*Cheat */
1790 emit_arith(cs, PFS_OP_MAD, dest,
1791 mask & WRITEMASK_XW,
1792 src[1], pfs_one, pfs_zero, flags);
1793 }
1794 break;
1795 case OPCODE_EX2:
1796 src[0] = t_scalar_src(cs, fpi->SrcReg[0]);
1797 emit_arith(cs, PFS_OP_EX2, dest, mask,
1798 src[0], undef, undef, flags);
1799 break;
1800 case OPCODE_FLR:
1801 src[0] = t_src(cs, fpi->SrcReg[0]);
1802 temp[0] = get_temp_reg(cs);
1803 /* FRC temp, src0
1804 * MAD dest, src0, 1.0, -temp
1805 */
1806 emit_arith(cs, PFS_OP_FRC, temp[0], mask,
1807 keep(src[0]), undef, undef, 0);
1808 emit_arith(cs, PFS_OP_MAD, dest, mask,
1809 src[0], pfs_one, negate(temp[0]), flags);
1810 free_temp(cs, temp[0]);
1811 break;
1812 case OPCODE_FRC:
1813 src[0] = t_src(cs, fpi->SrcReg[0]);
1814 emit_arith(cs, PFS_OP_FRC, dest, mask,
1815 src[0], undef, undef, flags);
1816 break;
1817 case OPCODE_KIL:
1818 emit_tex(cs, fpi, R300_TEX_OP_KIL);
1819 break;
1820 case OPCODE_LG2:
1821 src[0] = t_scalar_src(cs, fpi->SrcReg[0]);
1822 emit_arith(cs, PFS_OP_LG2, dest, mask,
1823 src[0], undef, undef, flags);
1824 break;
1825 case OPCODE_LIT:
1826 src[0] = t_src(cs, fpi->SrcReg[0]);
1827 emit_lit(cs, dest, mask, src[0], flags);
1828 break;
1829 case OPCODE_LRP:
1830 src[0] = t_src(cs, fpi->SrcReg[0]);
1831 src[1] = t_src(cs, fpi->SrcReg[1]);
1832 src[2] = t_src(cs, fpi->SrcReg[2]);
1833 /* result = tmp0tmp1 + (1 - tmp0)tmp2
1834 * = tmp0tmp1 + tmp2 + (-tmp0)tmp2
1835 * MAD temp, -tmp0, tmp2, tmp2
1836 * MAD result, tmp0, tmp1, temp
1837 */
1838 temp[0] = get_temp_reg(cs);
1839 emit_arith(cs, PFS_OP_MAD, temp[0], mask,
1840 negate(keep(src[0])), keep(src[2]), src[2],
1841 0);
1842 emit_arith(cs, PFS_OP_MAD, dest, mask,
1843 src[0], src[1], temp[0], flags);
1844 free_temp(cs, temp[0]);
1845 break;
1846 case OPCODE_MAD:
1847 src[0] = t_src(cs, fpi->SrcReg[0]);
1848 src[1] = t_src(cs, fpi->SrcReg[1]);
1849 src[2] = t_src(cs, fpi->SrcReg[2]);
1850 emit_arith(cs, PFS_OP_MAD, dest, mask,
1851 src[0], src[1], src[2], flags);
1852 break;
1853 case OPCODE_MAX:
1854 src[0] = t_src(cs, fpi->SrcReg[0]);
1855 src[1] = t_src(cs, fpi->SrcReg[1]);
1856 emit_arith(cs, PFS_OP_MAX, dest, mask,
1857 src[0], src[1], undef, flags);
1858 break;
1859 case OPCODE_MIN:
1860 src[0] = t_src(cs, fpi->SrcReg[0]);
1861 src[1] = t_src(cs, fpi->SrcReg[1]);
1862 emit_arith(cs, PFS_OP_MIN, dest, mask,
1863 src[0], src[1], undef, flags);
1864 break;
1865 case OPCODE_MOV:
1866 case OPCODE_SWZ:
1867 src[0] = t_src(cs, fpi->SrcReg[0]);
1868 emit_arith(cs, PFS_OP_MAD, dest, mask,
1869 src[0], pfs_one, pfs_zero, flags);
1870 break;
1871 case OPCODE_MUL:
1872 src[0] = t_src(cs, fpi->SrcReg[0]);
1873 src[1] = t_src(cs, fpi->SrcReg[1]);
1874 emit_arith(cs, PFS_OP_MAD, dest, mask,
1875 src[0], src[1], pfs_zero, flags);
1876 break;
1877 case OPCODE_POW:
1878 src[0] = t_scalar_src(cs, fpi->SrcReg[0]);
1879 src[1] = t_scalar_src(cs, fpi->SrcReg[1]);
1880 temp[0] = get_temp_reg(cs);
1881 emit_arith(cs, PFS_OP_LG2, temp[0], WRITEMASK_W,
1882 src[0], undef, undef, 0);
1883 emit_arith(cs, PFS_OP_MAD, temp[0], WRITEMASK_W,
1884 temp[0], src[1], pfs_zero, 0);
1885 emit_arith(cs, PFS_OP_EX2, dest, fpi->DstReg.WriteMask,
1886 temp[0], undef, undef, 0);
1887 free_temp(cs, temp[0]);
1888 break;
1889 case OPCODE_RCP:
1890 src[0] = t_scalar_src(cs, fpi->SrcReg[0]);
1891 emit_arith(cs, PFS_OP_RCP, dest, mask,
1892 src[0], undef, undef, flags);
1893 break;
1894 case OPCODE_RSQ:
1895 src[0] = t_scalar_src(cs, fpi->SrcReg[0]);
1896 emit_arith(cs, PFS_OP_RSQ, dest, mask,
1897 absolute(src[0]), pfs_zero, pfs_zero, flags);
1898 break;
1899 case OPCODE_SCS:
1900 /*
1901 * scs using a parabola :
1902 * scs(x):
1903 * result.x = sin(-abs(x)+0.5*PI) (cos)
1904 * result.y = sin(x) (sin)
1905 *
1906 */
1907 temp[0] = get_temp_reg(cs);
1908 temp[1] = get_temp_reg(cs);
1909 const_sin[0] = emit_const4fv(cs, SinCosConsts[0]);
1910 const_sin[1] = emit_const4fv(cs, SinCosConsts[1]);
1911 src[0] = t_scalar_src(cs, fpi->SrcReg[0]);
1912
1913 /* x = -abs(x)+0.5*PI */
1914 emit_arith(cs, PFS_OP_MAD, temp[0], WRITEMASK_Z, swizzle(const_sin[0], Z, Z, Z, Z), //PI
1915 pfs_half,
1916 negate(abs
1917 (swizzle(keep(src[0]), X, X, X, X))),
1918 0);
1919
1920 /* C*x (sin) */
1921 emit_arith(cs, PFS_OP_MAD, temp[0], WRITEMASK_W,
1922 swizzle(const_sin[0], Y, Y, Y, Y),
1923 swizzle(keep(src[0]), X, X, X, X),
1924 pfs_zero, 0);
1925
1926 /* B*x, C*x (cos) */
1927 emit_arith(cs, PFS_OP_MAD, temp[0],
1928 WRITEMASK_X | WRITEMASK_Y, swizzle(temp[0],
1929 Z, Z, Z,
1930 Z),
1931 const_sin[0], pfs_zero, 0);
1932
1933 /* B*x (sin) */
1934 emit_arith(cs, PFS_OP_MAD, temp[1], WRITEMASK_W,
1935 swizzle(const_sin[0], X, X, X, X),
1936 keep(src[0]), pfs_zero, 0);
1937
1938 /* y = B*x + C*x*abs(x) (sin) */
1939 emit_arith(cs, PFS_OP_MAD, temp[1], WRITEMASK_Z,
1940 absolute(src[0]),
1941 swizzle(temp[0], W, W, W, W),
1942 swizzle(temp[1], W, W, W, W), 0);
1943
1944 /* y = B*x + C*x*abs(x) (cos) */
1945 emit_arith(cs, PFS_OP_MAD, temp[1], WRITEMASK_W,
1946 swizzle(temp[0], Y, Y, Y, Y),
1947 absolute(swizzle(temp[0], Z, Z, Z, Z)),
1948 swizzle(temp[0], X, X, X, X), 0);
1949
1950 /* y*abs(y) - y (cos), y*abs(y) - y (sin) */
1951 emit_arith(cs, PFS_OP_MAD, temp[0],
1952 WRITEMASK_X | WRITEMASK_Y, swizzle(temp[1],
1953 W, Z, Y,
1954 X),
1955 absolute(swizzle(temp[1], W, Z, Y, X)),
1956 negate(swizzle(temp[1], W, Z, Y, X)), 0);
1957
1958 /* dest.xy = mad(temp.xy, P, temp2.wz) */
1959 emit_arith(cs, PFS_OP_MAD, dest,
1960 mask & (WRITEMASK_X | WRITEMASK_Y), temp[0],
1961 swizzle(const_sin[0], W, W, W, W),
1962 swizzle(temp[1], W, Z, Y, X), flags);
1963
1964 free_temp(cs, temp[0]);
1965 free_temp(cs, temp[1]);
1966 break;
1967 case OPCODE_SGE:
1968 src[0] = t_src(cs, fpi->SrcReg[0]);
1969 src[1] = t_src(cs, fpi->SrcReg[1]);
1970 temp[0] = get_temp_reg(cs);
1971 /* temp = src0 - src1
1972 * dest.c = (temp.c < 0.0) ? 0 : 1
1973 */
1974 emit_arith(cs, PFS_OP_MAD, temp[0], mask,
1975 src[0], pfs_one, negate(src[1]), 0);
1976 emit_arith(cs, PFS_OP_CMP, dest, mask,
1977 pfs_one, pfs_zero, temp[0], 0);
1978 free_temp(cs, temp[0]);
1979 break;
1980 case OPCODE_SIN:
1981 /*
1982 * using a parabola:
1983 * sin(x) = 4/pi * x + -4/(pi*pi) * x * abs(x)
1984 * extra precision is obtained by weighting against
1985 * itself squared.
1986 */
1987
1988 temp[0] = get_temp_reg(cs);
1989 const_sin[0] = emit_const4fv(cs, SinCosConsts[0]);
1990 const_sin[1] = emit_const4fv(cs, SinCosConsts[1]);
1991 src[0] = t_scalar_src(cs, fpi->SrcReg[0]);
1992
1993 /* do range reduction */
1994
1995 emit_arith(cs, PFS_OP_MAD, temp[0], WRITEMASK_X,
1996 swizzle(keep(src[0]), X, X, X, X),
1997 swizzle(const_sin[1], Z, Z, Z, Z),
1998 pfs_half, 0);
1999
2000 emit_arith(cs, PFS_OP_FRC, temp[0], WRITEMASK_X,
2001 swizzle(temp[0], X, X, X, X),
2002 undef, undef, 0);
2003
2004 emit_arith(cs, PFS_OP_MAD, temp[0], WRITEMASK_Z, swizzle(temp[0], X, X, X, X), swizzle(const_sin[1], W, W, W, W), //2*PI
2005 negate(swizzle(const_sin[0], Z, Z, Z, Z)), //PI
2006 0);
2007
2008 /* SIN */
2009
2010 emit_arith(cs, PFS_OP_MAD, temp[0],
2011 WRITEMASK_X | WRITEMASK_Y, swizzle(temp[0],
2012 Z, Z, Z,
2013 Z),
2014 const_sin[0], pfs_zero, 0);
2015
2016 emit_arith(cs, PFS_OP_MAD, temp[0], WRITEMASK_X,
2017 swizzle(temp[0], Y, Y, Y, Y),
2018 absolute(swizzle(temp[0], Z, Z, Z, Z)),
2019 swizzle(temp[0], X, X, X, X), 0);
2020
2021 emit_arith(cs, PFS_OP_MAD, temp[0], WRITEMASK_Y,
2022 swizzle(temp[0], X, X, X, X),
2023 absolute(swizzle(temp[0], X, X, X, X)),
2024 negate(swizzle(temp[0], X, X, X, X)), 0);
2025
2026 emit_arith(cs, PFS_OP_MAD, dest, mask,
2027 swizzle(temp[0], Y, Y, Y, Y),
2028 swizzle(const_sin[0], W, W, W, W),
2029 swizzle(temp[0], X, X, X, X), flags);
2030
2031 free_temp(cs, temp[0]);
2032 break;
2033 case OPCODE_SLT:
2034 src[0] = t_src(cs, fpi->SrcReg[0]);
2035 src[1] = t_src(cs, fpi->SrcReg[1]);
2036 temp[0] = get_temp_reg(cs);
2037 /* temp = src0 - src1
2038 * dest.c = (temp.c < 0.0) ? 1 : 0
2039 */
2040 emit_arith(cs, PFS_OP_MAD, temp[0], mask,
2041 src[0], pfs_one, negate(src[1]), 0);
2042 emit_arith(cs, PFS_OP_CMP, dest, mask,
2043 pfs_zero, pfs_one, temp[0], 0);
2044 free_temp(cs, temp[0]);
2045 break;
2046 case OPCODE_SUB:
2047 src[0] = t_src(cs, fpi->SrcReg[0]);
2048 src[1] = t_src(cs, fpi->SrcReg[1]);
2049 emit_arith(cs, PFS_OP_MAD, dest, mask,
2050 src[0], pfs_one, negate(src[1]), flags);
2051 break;
2052 case OPCODE_TEX:
2053 emit_tex(cs, fpi, R300_TEX_OP_LD);
2054 break;
2055 case OPCODE_TXB:
2056 emit_tex(cs, fpi, R300_TEX_OP_TXB);
2057 break;
2058 case OPCODE_TXP:
2059 emit_tex(cs, fpi, R300_TEX_OP_TXP);
2060 break;
2061 case OPCODE_XPD:{
2062 src[0] = t_src(cs, fpi->SrcReg[0]);
2063 src[1] = t_src(cs, fpi->SrcReg[1]);
2064 temp[0] = get_temp_reg(cs);
2065 /* temp = src0.zxy * src1.yzx */
2066 emit_arith(cs, PFS_OP_MAD, temp[0],
2067 WRITEMASK_XYZ, swizzle(keep(src[0]),
2068 Z, X, Y, W),
2069 swizzle(keep(src[1]), Y, Z, X, W),
2070 pfs_zero, 0);
2071 /* dest.xyz = src0.yzx * src1.zxy - temp
2072 * dest.w = undefined
2073 * */
2074 emit_arith(cs, PFS_OP_MAD, dest,
2075 mask & WRITEMASK_XYZ, swizzle(src[0],
2076 Y, Z,
2077 X, W),
2078 swizzle(src[1], Z, X, Y, W),
2079 negate(temp[0]), flags);
2080 /* cleanup */
2081 free_temp(cs, temp[0]);
2082 break;
2083 }
2084 default:
2085 ERROR("unknown fpi->Opcode %d\n", fpi->Opcode);
2086 break;
2087 }
2088
2089 if (fp->error)
2090 return GL_FALSE;
2091
2092 }
2093
2094 return GL_TRUE;
2095 }
2096
2097 static void insert_wpos(struct gl_program *prog)
2098 {
2099 static gl_state_index tokens[STATE_LENGTH] = {
2100 STATE_INTERNAL, STATE_R300_WINDOW_DIMENSION, 0, 0, 0
2101 };
2102 struct prog_instruction *fpi;
2103 GLuint window_index;
2104 int i = 0;
2105 GLuint tempregi = prog->NumTemporaries;
2106 /* should do something else if no temps left... */
2107 prog->NumTemporaries++;
2108
2109 fpi = _mesa_alloc_instructions(prog->NumInstructions + 3);
2110 _mesa_init_instructions(fpi, prog->NumInstructions + 3);
2111
2112 /* perspective divide */
2113 fpi[i].Opcode = OPCODE_RCP;
2114
2115 fpi[i].DstReg.File = PROGRAM_TEMPORARY;
2116 fpi[i].DstReg.Index = tempregi;
2117 fpi[i].DstReg.WriteMask = WRITEMASK_W;
2118 fpi[i].DstReg.CondMask = COND_TR;
2119
2120 fpi[i].SrcReg[0].File = PROGRAM_INPUT;
2121 fpi[i].SrcReg[0].Index = FRAG_ATTRIB_WPOS;
2122 fpi[i].SrcReg[0].Swizzle = SWIZZLE_WWWW;
2123 i++;
2124
2125 fpi[i].Opcode = OPCODE_MUL;
2126
2127 fpi[i].DstReg.File = PROGRAM_TEMPORARY;
2128 fpi[i].DstReg.Index = tempregi;
2129 fpi[i].DstReg.WriteMask = WRITEMASK_XYZ;
2130 fpi[i].DstReg.CondMask = COND_TR;
2131
2132 fpi[i].SrcReg[0].File = PROGRAM_INPUT;
2133 fpi[i].SrcReg[0].Index = FRAG_ATTRIB_WPOS;
2134 fpi[i].SrcReg[0].Swizzle = SWIZZLE_XYZW;
2135
2136 fpi[i].SrcReg[1].File = PROGRAM_TEMPORARY;
2137 fpi[i].SrcReg[1].Index = tempregi;
2138 fpi[i].SrcReg[1].Swizzle = SWIZZLE_WWWW;
2139 i++;
2140
2141 /* viewport transformation */
2142 window_index = _mesa_add_state_reference(prog->Parameters, tokens);
2143
2144 fpi[i].Opcode = OPCODE_MAD;
2145
2146 fpi[i].DstReg.File = PROGRAM_TEMPORARY;
2147 fpi[i].DstReg.Index = tempregi;
2148 fpi[i].DstReg.WriteMask = WRITEMASK_XYZ;
2149 fpi[i].DstReg.CondMask = COND_TR;
2150
2151 fpi[i].SrcReg[0].File = PROGRAM_TEMPORARY;
2152 fpi[i].SrcReg[0].Index = tempregi;
2153 fpi[i].SrcReg[0].Swizzle =
2154 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ZERO);
2155
2156 fpi[i].SrcReg[1].File = PROGRAM_STATE_VAR;
2157 fpi[i].SrcReg[1].Index = window_index;
2158 fpi[i].SrcReg[1].Swizzle =
2159 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ZERO);
2160
2161 fpi[i].SrcReg[2].File = PROGRAM_STATE_VAR;
2162 fpi[i].SrcReg[2].Index = window_index;
2163 fpi[i].SrcReg[2].Swizzle =
2164 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ZERO);
2165 i++;
2166
2167 _mesa_copy_instructions(&fpi[i], prog->Instructions,
2168 prog->NumInstructions);
2169
2170 free(prog->Instructions);
2171
2172 prog->Instructions = fpi;
2173
2174 prog->NumInstructions += i;
2175 fpi = &prog->Instructions[prog->NumInstructions - 1];
2176
2177 assert(fpi->Opcode == OPCODE_END);
2178
2179 for (fpi = &prog->Instructions[3]; fpi->Opcode != OPCODE_END; fpi++) {
2180 for (i = 0; i < 3; i++)
2181 if (fpi->SrcReg[i].File == PROGRAM_INPUT &&
2182 fpi->SrcReg[i].Index == FRAG_ATTRIB_WPOS) {
2183 fpi->SrcReg[i].File = PROGRAM_TEMPORARY;
2184 fpi->SrcReg[i].Index = tempregi;
2185 }
2186 }
2187 }
2188
2189 /* - Init structures
2190 * - Determine what hwregs each input corresponds to
2191 */
2192 static void init_program(struct r300_pfs_compile_state *cs)
2193 {
2194 COMPILE_STATE;
2195 struct gl_fragment_program *mp = &fp->mesa_program;
2196 struct prog_instruction *fpi;
2197 GLuint InputsRead = mp->Base.InputsRead;
2198 GLuint temps_used = 0; /* for fp->temps[] */
2199 int i, j;
2200
2201 /* New compile, reset tracking data */
2202 fp->optimization =
2203 driQueryOptioni(&cs->r300->radeon.optionCache, "fp_optimization");
2204 fp->translated = GL_FALSE;
2205 fp->error = GL_FALSE;
2206 fp->WritesDepth = GL_FALSE;
2207 code->tex.length = 0;
2208 code->cur_node = 0;
2209 code->first_node_has_tex = 0;
2210 code->const_nr = 0;
2211 code->max_temp_idx = 0;
2212 code->node[0].alu_end = -1;
2213 code->node[0].tex_end = -1;
2214
2215 for (i = 0; i < PFS_MAX_ALU_INST; i++) {
2216 for (j = 0; j < 3; j++) {
2217 cs->slot[i].vsrc[j] = SRC_CONST;
2218 cs->slot[i].ssrc[j] = SRC_CONST;
2219 }
2220 }
2221
2222 /* Work out what temps the Mesa inputs correspond to, this must match
2223 * what setup_rs_unit does, which shouldn't be a problem as rs_unit
2224 * configures itself based on the fragprog's InputsRead
2225 *
2226 * NOTE: this depends on get_hw_temp() allocating registers in order,
2227 * starting from register 0.
2228 */
2229
2230 /* Texcoords come first */
2231 for (i = 0; i < cs->r300->radeon.glCtx->Const.MaxTextureUnits; i++) {
2232 if (InputsRead & (FRAG_BIT_TEX0 << i)) {
2233 cs->inputs[FRAG_ATTRIB_TEX0 + i].refcount = 0;
2234 cs->inputs[FRAG_ATTRIB_TEX0 + i].reg =
2235 get_hw_temp(cs, 0);
2236 }
2237 }
2238 InputsRead &= ~FRAG_BITS_TEX_ANY;
2239
2240 /* fragment position treated as a texcoord */
2241 if (InputsRead & FRAG_BIT_WPOS) {
2242 cs->inputs[FRAG_ATTRIB_WPOS].refcount = 0;
2243 cs->inputs[FRAG_ATTRIB_WPOS].reg = get_hw_temp(cs, 0);
2244 insert_wpos(&mp->Base);
2245 }
2246 InputsRead &= ~FRAG_BIT_WPOS;
2247
2248 /* Then primary colour */
2249 if (InputsRead & FRAG_BIT_COL0) {
2250 cs->inputs[FRAG_ATTRIB_COL0].refcount = 0;
2251 cs->inputs[FRAG_ATTRIB_COL0].reg = get_hw_temp(cs, 0);
2252 }
2253 InputsRead &= ~FRAG_BIT_COL0;
2254
2255 /* Secondary color */
2256 if (InputsRead & FRAG_BIT_COL1) {
2257 cs->inputs[FRAG_ATTRIB_COL1].refcount = 0;
2258 cs->inputs[FRAG_ATTRIB_COL1].reg = get_hw_temp(cs, 0);
2259 }
2260 InputsRead &= ~FRAG_BIT_COL1;
2261
2262 /* Anything else */
2263 if (InputsRead) {
2264 WARN_ONCE("Don't know how to handle inputs 0x%x\n", InputsRead);
2265 /* force read from hwreg 0 for now */
2266 for (i = 0; i < 32; i++)
2267 if (InputsRead & (1 << i))
2268 cs->inputs[i].reg = 0;
2269 }
2270
2271 /* Pre-parse the mesa program, grabbing refcounts on input/temp regs.
2272 * That way, we can free up the reg when it's no longer needed
2273 */
2274 if (!mp->Base.Instructions) {
2275 ERROR("No instructions found in program\n");
2276 return;
2277 }
2278
2279 for (fpi = mp->Base.Instructions; fpi->Opcode != OPCODE_END; fpi++) {
2280 int idx;
2281
2282 for (i = 0; i < 3; i++) {
2283 idx = fpi->SrcReg[i].Index;
2284 switch (fpi->SrcReg[i].File) {
2285 case PROGRAM_TEMPORARY:
2286 if (!(temps_used & (1 << idx))) {
2287 cs->temps[idx].reg = -1;
2288 cs->temps[idx].refcount = 1;
2289 temps_used |= (1 << idx);
2290 } else
2291 cs->temps[idx].refcount++;
2292 break;
2293 case PROGRAM_INPUT:
2294 cs->inputs[idx].refcount++;
2295 break;
2296 default:
2297 break;
2298 }
2299 }
2300
2301 idx = fpi->DstReg.Index;
2302 if (fpi->DstReg.File == PROGRAM_TEMPORARY) {
2303 if (!(temps_used & (1 << idx))) {
2304 cs->temps[idx].reg = -1;
2305 cs->temps[idx].refcount = 1;
2306 temps_used |= (1 << idx);
2307 } else
2308 cs->temps[idx].refcount++;
2309 }
2310 }
2311 cs->temp_in_use = temps_used;
2312 }
2313
2314 static void update_params(r300ContextPtr r300, struct r300_fragment_program *fp)
2315 {
2316 struct gl_fragment_program *mp = &fp->mesa_program;
2317
2318 /* Ask Mesa nicely to fill in ParameterValues for us */
2319 if (mp->Base.Parameters)
2320 _mesa_load_state_parameters(r300->radeon.glCtx, mp->Base.Parameters);
2321 }
2322
2323 void r300TranslateFragmentShader(r300ContextPtr r300,
2324 struct r300_fragment_program *fp)
2325 {
2326 if (!fp->translated) {
2327 struct r300_pfs_compile_state cs;
2328
2329 _mesa_memset(&cs, 0, sizeof(cs));
2330 cs.r300 = r300;
2331 cs.fp = fp;
2332 init_program(&cs);
2333
2334 if (parse_program(&cs) == GL_FALSE) {
2335 dump_program(fp, &fp->code);
2336 return;
2337 }
2338
2339 /* Finish off */
2340 fp->code.node[fp->code.cur_node].alu_end =
2341 cs.nrslots - fp->code.node[fp->code.cur_node].alu_offset - 1;
2342 if (fp->code.node[fp->code.cur_node].tex_end < 0)
2343 fp->code.node[fp->code.cur_node].tex_end = 0;
2344 fp->code.alu_offset = 0;
2345 fp->code.alu_end = cs.nrslots - 1;
2346 fp->code.tex_offset = 0;
2347 fp->code.tex_end = fp->code.tex.length ? fp->code.tex.length - 1 : 0;
2348 assert(fp->code.node[fp->code.cur_node].alu_end >= 0);
2349 assert(fp->code.alu_end >= 0);
2350
2351 fp->translated = GL_TRUE;
2352 if (RADEON_DEBUG & DEBUG_PIXEL)
2353 dump_program(fp, &fp->code);
2354 r300UpdateStateParameters(r300->radeon.glCtx, _NEW_PROGRAM);
2355 }
2356
2357 update_params(r300, fp);
2358 }
2359
2360 /* just some random things... */
2361 static void dump_program(struct r300_fragment_program *fp,
2362 struct r300_fragment_program_code *code)
2363 {
2364 int n, i, j;
2365 static int pc = 0;
2366
2367 fprintf(stderr, "pc=%d*************************************\n", pc++);
2368
2369 fprintf(stderr, "Mesa program:\n");
2370 fprintf(stderr, "-------------\n");
2371 _mesa_print_program(&fp->mesa_program.Base);
2372 fflush(stdout);
2373
2374 fprintf(stderr, "Hardware program\n");
2375 fprintf(stderr, "----------------\n");
2376
2377 for (n = 0; n < (code->cur_node + 1); n++) {
2378 fprintf(stderr, "NODE %d: alu_offset: %d, tex_offset: %d, "
2379 "alu_end: %d, tex_end: %d\n", n,
2380 code->node[n].alu_offset,
2381 code->node[n].tex_offset,
2382 code->node[n].alu_end, code->node[n].tex_end);
2383
2384 if (code->tex.length) {
2385 fprintf(stderr, " TEX:\n");
2386 for (i = code->node[n].tex_offset;
2387 i <= code->node[n].tex_offset + code->node[n].tex_end;
2388 ++i) {
2389 const char *instr;
2390
2391 switch ((code->tex.
2392 inst[i] >> R300_TEX_INST_SHIFT) &
2393 15) {
2394 case R300_TEX_OP_LD:
2395 instr = "TEX";
2396 break;
2397 case R300_TEX_OP_KIL:
2398 instr = "KIL";
2399 break;
2400 case R300_TEX_OP_TXP:
2401 instr = "TXP";
2402 break;
2403 case R300_TEX_OP_TXB:
2404 instr = "TXB";
2405 break;
2406 default:
2407 instr = "UNKNOWN";
2408 }
2409
2410 fprintf(stderr,
2411 " %s t%i, %c%i, texture[%i] (%08x)\n",
2412 instr,
2413 (code->tex.
2414 inst[i] >> R300_DST_ADDR_SHIFT) & 31,
2415 't',
2416 (code->tex.
2417 inst[i] >> R300_SRC_ADDR_SHIFT) & 31,
2418 (code->tex.
2419 inst[i] & R300_TEX_ID_MASK) >>
2420 R300_TEX_ID_SHIFT,
2421 code->tex.inst[i]);
2422 }
2423 }
2424
2425 for (i = code->node[n].alu_offset;
2426 i <= code->node[n].alu_offset + code->node[n].alu_end; ++i) {
2427 char srcc[3][10], dstc[20];
2428 char srca[3][10], dsta[20];
2429 char argc[3][20];
2430 char arga[3][20];
2431 char flags[5], tmp[10];
2432
2433 for (j = 0; j < 3; ++j) {
2434 int regc = code->alu.inst[i].inst1 >> (j * 6);
2435 int rega = code->alu.inst[i].inst3 >> (j * 6);
2436
2437 sprintf(srcc[j], "%c%i",
2438 (regc & 32) ? 'c' : 't', regc & 31);
2439 sprintf(srca[j], "%c%i",
2440 (rega & 32) ? 'c' : 't', rega & 31);
2441 }
2442
2443 dstc[0] = 0;
2444 sprintf(flags, "%s%s%s",
2445 (code->alu.inst[i].
2446 inst1 & R300_ALU_DSTC_REG_X) ? "x" : "",
2447 (code->alu.inst[i].
2448 inst1 & R300_ALU_DSTC_REG_Y) ? "y" : "",
2449 (code->alu.inst[i].
2450 inst1 & R300_ALU_DSTC_REG_Z) ? "z" : "");
2451 if (flags[0] != 0) {
2452 sprintf(dstc, "t%i.%s ",
2453 (code->alu.inst[i].
2454 inst1 >> R300_ALU_DSTC_SHIFT) & 31,
2455 flags);
2456 }
2457 sprintf(flags, "%s%s%s",
2458 (code->alu.inst[i].
2459 inst1 & R300_ALU_DSTC_OUTPUT_X) ? "x" : "",
2460 (code->alu.inst[i].
2461 inst1 & R300_ALU_DSTC_OUTPUT_Y) ? "y" : "",
2462 (code->alu.inst[i].
2463 inst1 & R300_ALU_DSTC_OUTPUT_Z) ? "z" : "");
2464 if (flags[0] != 0) {
2465 sprintf(tmp, "o%i.%s",
2466 (code->alu.inst[i].
2467 inst1 >> R300_ALU_DSTC_SHIFT) & 31,
2468 flags);
2469 strcat(dstc, tmp);
2470 }
2471
2472 dsta[0] = 0;
2473 if (code->alu.inst[i].inst3 & R300_ALU_DSTA_REG) {
2474 sprintf(dsta, "t%i.w ",
2475 (code->alu.inst[i].
2476 inst3 >> R300_ALU_DSTA_SHIFT) & 31);
2477 }
2478 if (code->alu.inst[i].inst3 & R300_ALU_DSTA_OUTPUT) {
2479 sprintf(tmp, "o%i.w ",
2480 (code->alu.inst[i].
2481 inst3 >> R300_ALU_DSTA_SHIFT) & 31);
2482 strcat(dsta, tmp);
2483 }
2484 if (code->alu.inst[i].inst3 & R300_ALU_DSTA_DEPTH) {
2485 strcat(dsta, "Z");
2486 }
2487
2488 fprintf(stderr,
2489 "%3i: xyz: %3s %3s %3s -> %-20s (%08x)\n"
2490 " w: %3s %3s %3s -> %-20s (%08x)\n", i,
2491 srcc[0], srcc[1], srcc[2], dstc,
2492 code->alu.inst[i].inst1, srca[0], srca[1],
2493 srca[2], dsta, code->alu.inst[i].inst3);
2494
2495 for (j = 0; j < 3; ++j) {
2496 int regc = code->alu.inst[i].inst0 >> (j * 7);
2497 int rega = code->alu.inst[i].inst2 >> (j * 7);
2498 int d;
2499 char buf[20];
2500
2501 d = regc & 31;
2502 if (d < 12) {
2503 switch (d % 4) {
2504 case R300_ALU_ARGC_SRC0C_XYZ:
2505 sprintf(buf, "%s.xyz",
2506 srcc[d / 4]);
2507 break;
2508 case R300_ALU_ARGC_SRC0C_XXX:
2509 sprintf(buf, "%s.xxx",
2510 srcc[d / 4]);
2511 break;
2512 case R300_ALU_ARGC_SRC0C_YYY:
2513 sprintf(buf, "%s.yyy",
2514 srcc[d / 4]);
2515 break;
2516 case R300_ALU_ARGC_SRC0C_ZZZ:
2517 sprintf(buf, "%s.zzz",
2518 srcc[d / 4]);
2519 break;
2520 }
2521 } else if (d < 15) {
2522 sprintf(buf, "%s.www", srca[d - 12]);
2523 } else if (d == 20) {
2524 sprintf(buf, "0.0");
2525 } else if (d == 21) {
2526 sprintf(buf, "1.0");
2527 } else if (d == 22) {
2528 sprintf(buf, "0.5");
2529 } else if (d >= 23 && d < 32) {
2530 d -= 23;
2531 switch (d / 3) {
2532 case 0:
2533 sprintf(buf, "%s.yzx",
2534 srcc[d % 3]);
2535 break;
2536 case 1:
2537 sprintf(buf, "%s.zxy",
2538 srcc[d % 3]);
2539 break;
2540 case 2:
2541 sprintf(buf, "%s.Wzy",
2542 srcc[d % 3]);
2543 break;
2544 }
2545 } else {
2546 sprintf(buf, "%i", d);
2547 }
2548
2549 sprintf(argc[j], "%s%s%s%s",
2550 (regc & 32) ? "-" : "",
2551 (regc & 64) ? "|" : "",
2552 buf, (regc & 64) ? "|" : "");
2553
2554 d = rega & 31;
2555 if (d < 9) {
2556 sprintf(buf, "%s.%c", srcc[d / 3],
2557 'x' + (char)(d % 3));
2558 } else if (d < 12) {
2559 sprintf(buf, "%s.w", srca[d - 9]);
2560 } else if (d == 16) {
2561 sprintf(buf, "0.0");
2562 } else if (d == 17) {
2563 sprintf(buf, "1.0");
2564 } else if (d == 18) {
2565 sprintf(buf, "0.5");
2566 } else {
2567 sprintf(buf, "%i", d);
2568 }
2569
2570 sprintf(arga[j], "%s%s%s%s",
2571 (rega & 32) ? "-" : "",
2572 (rega & 64) ? "|" : "",
2573 buf, (rega & 64) ? "|" : "");
2574 }
2575
2576 fprintf(stderr, " xyz: %8s %8s %8s op: %08x\n"
2577 " w: %8s %8s %8s op: %08x\n",
2578 argc[0], argc[1], argc[2],
2579 code->alu.inst[i].inst0, arga[0], arga[1],
2580 arga[2], code->alu.inst[i].inst2);
2581 }
2582 }
2583 }