r500: Set Saturate correctly in radeon_program_pair
[mesa.git] / src / mesa / drivers / dri / r300 / r300_fragprog_emit.c
1 /*
2 * Copyright (C) 2005 Ben Skeggs.
3 *
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining
7 * a copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sublicense, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial
16 * portions of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 */
27
28 /**
29 * \file
30 *
31 * Emit the r300_fragment_program_code that can be understood by the hardware.
32 * Input is a pre-transformed radeon_program.
33 *
34 * \author Ben Skeggs <darktama@iinet.net.au>
35 *
36 * \author Jerome Glisse <j.glisse@gmail.com>
37 *
38 * \todo FogOption
39 *
40 * \todo Verify results of opcodes for accuracy, I've only checked them in
41 * specific cases.
42 */
43
44 #include "glheader.h"
45 #include "macros.h"
46 #include "enums.h"
47 #include "shader/prog_instruction.h"
48 #include "shader/prog_parameter.h"
49 #include "shader/prog_print.h"
50
51 #include "r300_context.h"
52 #include "r300_fragprog.h"
53 #include "r300_reg.h"
54 #include "r300_state.h"
55
/* Bookkeeping that ties one Mesa register to an R300 temporary */
struct reg_acc {
	int reg;		/* assigned hw temp; the code uses -1 for "none yet" */
	unsigned refcount;	/* number of uses by the Mesa program */
};
61
/**
 * Lifetime tracking for a single R300 temporary.
 *
 * All members are ALU slot indices.
 */
struct reg_lifetime {
	/* First slot in which this register may be picked as a new
	 * destination.  Set to -1 while the register is assigned to a Mesa
	 * register whose last access has not been emitted yet. */
	int free;

	/* First slot in which this register is currently reserved.  Keeps
	 * e.g. a scalar operation from being scheduled before the point
	 * at which the register was first allocated for a vector
	 * operation. */
	int reserved;

	/* First slot in which the register can be read as a source and
	 * still yield the value produced by the most recently emitted
	 * write (tracked separately for the vector and scalar parts). */
	int vector_valid, scalar_valid;

	/* Slot of the most recent read; this is also the earliest slot in
	 * which the register may be overwritten (tracked separately for
	 * the vector and scalar parts). */
	int vector_lastread, scalar_lastread;
};
89
/**
 * Usage flags and per-slot state for one ALU instruction slot, kept while
 * a fragment program is being compiled.
 *
 * The SLOT_SRC_* flags describe source 0; the slot allocator shifts them
 * left by the source index to describe sources 1 and 2.
 */
#define SLOT_SRC_VECTOR	(1<<0)
#define SLOT_SRC_SCALAR	(1<<3)
#define SLOT_SRC_BOTH	(SLOT_SRC_VECTOR | SLOT_SRC_SCALAR)
#define SLOT_OP_VECTOR	(1<<16)
#define SLOT_OP_SCALAR	(1<<17)
#define SLOT_OP_BOTH	(SLOT_OP_VECTOR | SLOT_OP_SCALAR)

struct r300_pfs_compile_slot {
	/* Bitmask of SLOT_* constants: which parts of the slot are taken */
	unsigned int used;

	/* Sources selected for the vector (vsrc) and scalar (ssrc) side */
	int vsrc[3], ssrc[3];
};
110
111 /**
112 * Store information during compilation of fragment programs.
113 */
114 struct r300_pfs_compile_state {
115 struct r300_fragment_program_compiler *compiler;
116
117 int nrslots; /* number of ALU slots used so far */
118
119 /* Track which (parts of) slots are already filled with instructions */
120 struct r300_pfs_compile_slot slot[PFS_MAX_ALU_INST];
121
122 /* Track the validity of R300 temporaries */
123 struct reg_lifetime hwtemps[PFS_NUM_TEMP_REGS];
124
125 /* Used to map Mesa's inputs/temps onto hardware temps */
126 int temp_in_use;
127 struct reg_acc temps[PFS_NUM_TEMP_REGS];
128 struct reg_acc inputs[32]; /* don't actually need 32... */
129
130 /* Track usage of hardware temps, for register allocation,
131 * indirection detection, etc. */
132 GLuint used_in_node;
133 GLuint dest_in_node;
134 };
135
136
/*
 * Useful macros and values
 */

/* Print a printf-style message to stderr, prefixed with the source file and
 * function name, and flag the fragment program as failed.  A newline is
 * appended automatically.  Needs a variable named `fp` in scope (see
 * COMPILE_STATE). */
#define ERROR(fmt, ...) do {					\
		fprintf(stderr, "%s::%s(): " fmt "\n",		\
			__FILE__, __func__, ##__VA_ARGS__);	\
		fp->error = GL_TRUE;				\
	} while (0)

/* Marker for "no valid value" in tables and opcode fields */
#define PFS_INVAL 0xFFFFFFFF

/* Declares the `fp` and `code` locals from a compile state named `cs`.
 * The void casts keep unused-variable warnings quiet in functions that
 * need only one (or neither) of them. */
#define COMPILE_STATE							\
	struct r300_fragment_program *fp = cs->compiler->fp;		\
	struct r300_fragment_program_code *code = cs->compiler->code;	\
	(void)code; (void)fp
151
/* Values stored in the VSWZ field of a REG(); v_swiz[] is indexed by them */
#define SWIZZLE_XYZ	0
#define SWIZZLE_XXX	1
#define SWIZZLE_YYY	2
#define SWIZZLE_ZZZ	3
#define SWIZZLE_WWW	4
#define SWIZZLE_YZX	5
#define SWIZZLE_ZXY	6
#define SWIZZLE_WZY	7
#define SWIZZLE_111	8
#define SWIZZLE_000	9
#define SWIZZLE_HHH	10

/* Shorthand for do_swizzle() without negation.  x, y, z and w are SWIZZLE_
 * name suffixes; each selected value is packed into its own 3-bit field.
 * Needs a compile state named `cs` in scope. */
#define swizzle(r, x, y, z, w)				\
	do_swizzle(cs, r,				\
		   ((SWIZZLE_##x << 0) |		\
		    (SWIZZLE_##y << 3) |		\
		    (SWIZZLE_##z << 6) |		\
		    (SWIZZLE_##w << 9)),		\
		   0)
170
/*
 * Packed register descriptor.
 *
 * A register reference is carried around as a single integer made of the
 * bit fields below.  Every macro parameter is parenthesized so that
 * arbitrary expressions (e.g. `a | b`, `c ? x : y`) can be passed safely.
 */
#define REG_TYPE_INPUT	0
#define REG_TYPE_OUTPUT	1
#define REG_TYPE_TEMP	2
#define REG_TYPE_CONST	3

#define REG_TYPE_SHIFT		0
#define REG_INDEX_SHIFT		2
#define REG_VSWZ_SHIFT		8
#define REG_SSWZ_SHIFT		13
#define REG_NEGV_SHIFT		18
#define REG_NEGS_SHIFT		19
#define REG_ABS_SHIFT		20
#define REG_NO_USE_SHIFT	21	// Hack for refcounting
#define REG_VALID_SHIFT		22	// Does the register contain a defined value?
#define REG_BUILTIN_SHIFT	23	// Is it a builtin (like all zero/all one)?

#define REG_TYPE_MASK		(0x03 << REG_TYPE_SHIFT)
#define REG_INDEX_MASK		(0x3F << REG_INDEX_SHIFT)
#define REG_VSWZ_MASK		(0x1F << REG_VSWZ_SHIFT)
#define REG_SSWZ_MASK		(0x1F << REG_SSWZ_SHIFT)
#define REG_NEGV_MASK		(0x01 << REG_NEGV_SHIFT)
#define REG_NEGS_MASK		(0x01 << REG_NEGS_SHIFT)
#define REG_ABS_MASK		(0x01 << REG_ABS_SHIFT)
#define REG_NO_USE_MASK		(0x01 << REG_NO_USE_SHIFT)
#define REG_VALID_MASK		(0x01 << REG_VALID_SHIFT)
#define REG_BUILTIN_MASK	(0x01 << REG_BUILTIN_SHIFT)

/* Build a descriptor from its fields; out-of-range values are masked off */
#define REG(type, index, vswz, sswz, nouse, valid, builtin)		\
	((((type)    << REG_TYPE_SHIFT)    & REG_TYPE_MASK)    |	\
	 (((index)   << REG_INDEX_SHIFT)   & REG_INDEX_MASK)   |	\
	 (((nouse)   << REG_NO_USE_SHIFT)  & REG_NO_USE_MASK)  |	\
	 (((valid)   << REG_VALID_SHIFT)   & REG_VALID_MASK)   |	\
	 (((builtin) << REG_BUILTIN_SHIFT) & REG_BUILTIN_MASK) |	\
	 (((vswz)    << REG_VSWZ_SHIFT)    & REG_VSWZ_MASK)    |	\
	 (((sswz)    << REG_SSWZ_SHIFT)    & REG_SSWZ_MASK))

/* Field extractors */
#define REG_GET_TYPE(reg)						\
	(((reg) & REG_TYPE_MASK) >> REG_TYPE_SHIFT)
#define REG_GET_INDEX(reg)						\
	(((reg) & REG_INDEX_MASK) >> REG_INDEX_SHIFT)
#define REG_GET_VSWZ(reg)						\
	(((reg) & REG_VSWZ_MASK) >> REG_VSWZ_SHIFT)
#define REG_GET_SSWZ(reg)						\
	(((reg) & REG_SSWZ_MASK) >> REG_SSWZ_SHIFT)
#define REG_GET_NO_USE(reg)						\
	(((reg) & REG_NO_USE_MASK) >> REG_NO_USE_SHIFT)
#define REG_GET_VALID(reg)						\
	(((reg) & REG_VALID_MASK) >> REG_VALID_SHIFT)
#define REG_GET_BUILTIN(reg)						\
	(((reg) & REG_BUILTIN_MASK) >> REG_BUILTIN_SHIFT)

/* Field setters: `reg` must be an lvalue and is modified in place */
#define REG_SET_TYPE(reg, type)						\
	((reg) = (((reg) & ~REG_TYPE_MASK) |				\
		  (((type) << REG_TYPE_SHIFT) & REG_TYPE_MASK)))
#define REG_SET_INDEX(reg, index)					\
	((reg) = (((reg) & ~REG_INDEX_MASK) |				\
		  (((index) << REG_INDEX_SHIFT) & REG_INDEX_MASK)))
#define REG_SET_VSWZ(reg, vswz)						\
	((reg) = (((reg) & ~REG_VSWZ_MASK) |				\
		  (((vswz) << REG_VSWZ_SHIFT) & REG_VSWZ_MASK)))
#define REG_SET_SSWZ(reg, sswz)						\
	((reg) = (((reg) & ~REG_SSWZ_MASK) |				\
		  (((sswz) << REG_SSWZ_SHIFT) & REG_SSWZ_MASK)))
#define REG_SET_NO_USE(reg, nouse)					\
	((reg) = (((reg) & ~REG_NO_USE_MASK) |				\
		  (((nouse) << REG_NO_USE_SHIFT) & REG_NO_USE_MASK)))
#define REG_SET_VALID(reg, valid)					\
	((reg) = (((reg) & ~REG_VALID_MASK) |				\
		  (((valid) << REG_VALID_SHIFT) & REG_VALID_MASK)))
#define REG_SET_BUILTIN(reg, builtin)					\
	((reg) = (((reg) & ~REG_BUILTIN_MASK) |				\
		  (((builtin) << REG_BUILTIN_SHIFT) & REG_BUILTIN_MASK)))

/* Modifier flags: set (never clear) the corresponding bit in `reg` */
#define REG_ABS(reg)							\
	((reg) = ((reg) | REG_ABS_MASK))
#define REG_NEGV(reg)							\
	((reg) = ((reg) | REG_NEGV_MASK))
#define REG_NEGS(reg)							\
	((reg) = ((reg) | REG_NEGS_MASK))
247
/*
 * The four instruction words of an ALU no-op: a MAD with all three
 * arguments set to zero on both the colour (C) and alpha (A) side,
 * and every source pointing at constant 0.
 */
#define NOP_INST0							\
	(R300_ALU_OUTC_MAD |						\
	 (R300_ALU_ARGC_ZERO << R300_ALU_ARG0C_SHIFT) |			\
	 (R300_ALU_ARGC_ZERO << R300_ALU_ARG1C_SHIFT) |			\
	 (R300_ALU_ARGC_ZERO << R300_ALU_ARG2C_SHIFT))
#define NOP_INST1							\
	(((0 | SRC_CONST) << R300_ALU_SRC0C_SHIFT) |			\
	 ((0 | SRC_CONST) << R300_ALU_SRC1C_SHIFT) |			\
	 ((0 | SRC_CONST) << R300_ALU_SRC2C_SHIFT))
#define NOP_INST2							\
	(R300_ALU_OUTA_MAD |						\
	 (R300_ALU_ARGA_ZERO << R300_ALU_ARG0A_SHIFT) |			\
	 (R300_ALU_ARGA_ZERO << R300_ALU_ARG1A_SHIFT) |			\
	 (R300_ALU_ARGA_ZERO << R300_ALU_ARG2A_SHIFT))
#define NOP_INST3							\
	(((0 | SRC_CONST) << R300_ALU_SRC0A_SHIFT) |			\
	 ((0 | SRC_CONST) << R300_ALU_SRC1A_SHIFT) |			\
	 ((0 | SRC_CONST) << R300_ALU_SRC2A_SHIFT))
266
267
268 /*
269 * Datas structures for fragment program generation
270 */
271
272 /* description of r300 native hw instructions */
273 static const struct {
274 const char *name;
275 int argc;
276 int v_op;
277 int s_op;
278 } r300_fpop[] = {
279 /* *INDENT-OFF* */
280 {"MAD", 3, R300_ALU_OUTC_MAD, R300_ALU_OUTA_MAD},
281 {"DP3", 2, R300_ALU_OUTC_DP3, R300_ALU_OUTA_DP4},
282 {"DP4", 2, R300_ALU_OUTC_DP4, R300_ALU_OUTA_DP4},
283 {"MIN", 2, R300_ALU_OUTC_MIN, R300_ALU_OUTA_MIN},
284 {"MAX", 2, R300_ALU_OUTC_MAX, R300_ALU_OUTA_MAX},
285 {"CMP", 3, R300_ALU_OUTC_CMP, R300_ALU_OUTA_CMP},
286 {"FRC", 1, R300_ALU_OUTC_FRC, R300_ALU_OUTA_FRC},
287 {"EX2", 1, R300_ALU_OUTC_REPL_ALPHA, R300_ALU_OUTA_EX2},
288 {"LG2", 1, R300_ALU_OUTC_REPL_ALPHA, R300_ALU_OUTA_LG2},
289 {"RCP", 1, R300_ALU_OUTC_REPL_ALPHA, R300_ALU_OUTA_RCP},
290 {"RSQ", 1, R300_ALU_OUTC_REPL_ALPHA, R300_ALU_OUTA_RSQ},
291 {"REPL_ALPHA", 1, R300_ALU_OUTC_REPL_ALPHA, PFS_INVAL},
292 {"CMPH", 3, R300_ALU_OUTC_CMPH, PFS_INVAL},
293 /* *INDENT-ON* */
294 };
295
296 /* vector swizzles r300 can support natively, with a couple of
297 * cases we handle specially
298 *
299 * REG_VSWZ/REG_SSWZ is an index into this table
300 */
301
302 /* mapping from SWIZZLE_* to r300 native values for scalar insns */
303 #define SWIZZLE_HALF 6
304
305 #define MAKE_SWZ3(x, y, z) (MAKE_SWIZZLE4(SWIZZLE_##x, \
306 SWIZZLE_##y, \
307 SWIZZLE_##z, \
308 SWIZZLE_ZERO))
309 /* native swizzles */
310 static const struct r300_pfs_swizzle {
311 GLuint hash; /* swizzle value this matches */
312 GLuint base; /* base value for hw swizzle */
313 GLuint stride; /* difference in base between arg0/1/2 */
314 GLuint flags;
315 } v_swiz[] = {
316 /* *INDENT-OFF* */
317 {MAKE_SWZ3(X, Y, Z), R300_ALU_ARGC_SRC0C_XYZ, 4, SLOT_SRC_VECTOR},
318 {MAKE_SWZ3(X, X, X), R300_ALU_ARGC_SRC0C_XXX, 4, SLOT_SRC_VECTOR},
319 {MAKE_SWZ3(Y, Y, Y), R300_ALU_ARGC_SRC0C_YYY, 4, SLOT_SRC_VECTOR},
320 {MAKE_SWZ3(Z, Z, Z), R300_ALU_ARGC_SRC0C_ZZZ, 4, SLOT_SRC_VECTOR},
321 {MAKE_SWZ3(W, W, W), R300_ALU_ARGC_SRC0A, 1, SLOT_SRC_SCALAR},
322 {MAKE_SWZ3(Y, Z, X), R300_ALU_ARGC_SRC0C_YZX, 1, SLOT_SRC_VECTOR},
323 {MAKE_SWZ3(Z, X, Y), R300_ALU_ARGC_SRC0C_ZXY, 1, SLOT_SRC_VECTOR},
324 {MAKE_SWZ3(W, Z, Y), R300_ALU_ARGC_SRC0CA_WZY, 1, SLOT_SRC_BOTH},
325 {MAKE_SWZ3(ONE, ONE, ONE), R300_ALU_ARGC_ONE, 0, 0},
326 {MAKE_SWZ3(ZERO, ZERO, ZERO), R300_ALU_ARGC_ZERO, 0, 0},
327 {MAKE_SWZ3(HALF, HALF, HALF), R300_ALU_ARGC_HALF, 0, 0},
328 {PFS_INVAL, 0, 0, 0},
329 /* *INDENT-ON* */
330 };
331
332 /* used during matching of non-native swizzles */
333 #define SWZ_X_MASK (7 << 0)
334 #define SWZ_Y_MASK (7 << 3)
335 #define SWZ_Z_MASK (7 << 6)
336 #define SWZ_W_MASK (7 << 9)
337 static const struct {
338 GLuint hash; /* used to mask matching swizzle components */
339 int mask; /* actual outmask */
340 int count; /* count of components matched */
341 } s_mask[] = {
342 /* *INDENT-OFF* */
343 {SWZ_X_MASK | SWZ_Y_MASK | SWZ_Z_MASK, 1 | 2 | 4, 3},
344 {SWZ_X_MASK | SWZ_Y_MASK, 1 | 2, 2},
345 {SWZ_X_MASK | SWZ_Z_MASK, 1 | 4, 2},
346 {SWZ_Y_MASK | SWZ_Z_MASK, 2 | 4, 2},
347 {SWZ_X_MASK, 1, 1},
348 {SWZ_Y_MASK, 2, 1},
349 {SWZ_Z_MASK, 4, 1},
350 {PFS_INVAL, PFS_INVAL, PFS_INVAL}
351 /* *INDENT-ON* */
352 };
353
354 static const struct {
355 int base; /* hw value of swizzle */
356 int stride; /* difference between SRC0/1/2 */
357 GLuint flags;
358 } s_swiz[] = {
359 /* *INDENT-OFF* */
360 {R300_ALU_ARGA_SRC0C_X, 3, SLOT_SRC_VECTOR},
361 {R300_ALU_ARGA_SRC0C_Y, 3, SLOT_SRC_VECTOR},
362 {R300_ALU_ARGA_SRC0C_Z, 3, SLOT_SRC_VECTOR},
363 {R300_ALU_ARGA_SRC0A, 1, SLOT_SRC_SCALAR},
364 {R300_ALU_ARGA_ZERO, 0, 0},
365 {R300_ALU_ARGA_ONE, 0, 0},
366 {R300_ALU_ARGA_HALF, 0, 0}
367 /* *INDENT-ON* */
368 };
369
370 /* boiler-plate reg, for convenience */
371 static const GLuint undef = REG(REG_TYPE_TEMP,
372 0,
373 SWIZZLE_XYZ,
374 SWIZZLE_W,
375 GL_FALSE,
376 GL_FALSE,
377 GL_FALSE);
378
379 /* constant one source */
380 static const GLuint pfs_one = REG(REG_TYPE_CONST,
381 0,
382 SWIZZLE_111,
383 SWIZZLE_ONE,
384 GL_FALSE,
385 GL_TRUE,
386 GL_TRUE);
387
388 /* constant half source */
389 static const GLuint pfs_half = REG(REG_TYPE_CONST,
390 0,
391 SWIZZLE_HHH,
392 SWIZZLE_HALF,
393 GL_FALSE,
394 GL_TRUE,
395 GL_TRUE);
396
397 /* constant zero source */
398 static const GLuint pfs_zero = REG(REG_TYPE_CONST,
399 0,
400 SWIZZLE_000,
401 SWIZZLE_ZERO,
402 GL_FALSE,
403 GL_TRUE,
404 GL_TRUE);
405
406 /*
407 * Common functions prototypes
408 */
409 static void emit_arith(struct r300_pfs_compile_state *cs, int op,
410 GLuint dest, int mask,
411 GLuint src0, GLuint src1, GLuint src2, int flags);
412
413 /**
414 * Get an R300 temporary that can be written to in the given slot.
415 */
416 static int get_hw_temp(struct r300_pfs_compile_state *cs, int slot)
417 {
418 COMPILE_STATE;
419 int r;
420
421 for (r = 0; r < PFS_NUM_TEMP_REGS; ++r) {
422 if (cs->hwtemps[r].free >= 0 && cs->hwtemps[r].free <= slot)
423 break;
424 }
425
426 if (r >= PFS_NUM_TEMP_REGS) {
427 ERROR("Out of hardware temps\n");
428 return 0;
429 }
430 // Reserved is used to avoid the following scenario:
431 // R300 temporary X is first assigned to Mesa temporary Y during vector ops
432 // R300 temporary X is then assigned to Mesa temporary Z for further vector ops
433 // Then scalar ops on Mesa temporary Z are emitted and move back in time
434 // to overwrite the value of temporary Y.
435 // End scenario.
436 cs->hwtemps[r].reserved = cs->hwtemps[r].free;
437 cs->hwtemps[r].free = -1;
438
439 // Reset to some value that won't mess things up when the user
440 // tries to read from a temporary that hasn't been assigned a value yet.
441 // In the normal case, vector_valid and scalar_valid should be set to
442 // a sane value by the first emit that writes to this temporary.
443 cs->hwtemps[r].vector_valid = 0;
444 cs->hwtemps[r].scalar_valid = 0;
445
446 if (r > code->max_temp_idx)
447 code->max_temp_idx = r;
448
449 return r;
450 }
451
452 /**
453 * Get an R300 temporary that will act as a TEX destination register.
454 */
455 static int get_hw_temp_tex(struct r300_pfs_compile_state *cs)
456 {
457 COMPILE_STATE;
458 int r;
459
460 for (r = 0; r < PFS_NUM_TEMP_REGS; ++r) {
461 if (cs->used_in_node & (1 << r))
462 continue;
463
464 // Note: Be very careful here
465 if (cs->hwtemps[r].free >= 0 && cs->hwtemps[r].free <= 0)
466 break;
467 }
468
469 if (r >= PFS_NUM_TEMP_REGS)
470 return get_hw_temp(cs, 0); /* Will cause an indirection */
471
472 cs->hwtemps[r].reserved = cs->hwtemps[r].free;
473 cs->hwtemps[r].free = -1;
474
475 // Reset to some value that won't mess things up when the user
476 // tries to read from a temporary that hasn't been assigned a value yet.
477 // In the normal case, vector_valid and scalar_valid should be set to
478 // a sane value by the first emit that writes to this temporary.
479 cs->hwtemps[r].vector_valid = cs->nrslots;
480 cs->hwtemps[r].scalar_valid = cs->nrslots;
481
482 if (r > code->max_temp_idx)
483 code->max_temp_idx = r;
484
485 return r;
486 }
487
488 /**
489 * Mark the given hardware register as free.
490 */
491 static void free_hw_temp(struct r300_pfs_compile_state *cs, int idx)
492 {
493 // Be very careful here. Consider sequences like
494 // MAD r0, r1,r2,r3
495 // TEX r4, ...
496 // The TEX instruction may be moved in front of the MAD instruction
497 // due to the way nodes work. We don't want to alias r1 and r4 in
498 // this case.
499 // I'm certain the register allocation could be further sanitized,
500 // but it's tricky because of stuff that can happen inside emit_tex
501 // and emit_arith.
502 cs->hwtemps[idx].free = cs->nrslots + 1;
503 }
504
505 /**
506 * Create a new Mesa temporary register.
507 */
508 static GLuint get_temp_reg(struct r300_pfs_compile_state *cs)
509 {
510 COMPILE_STATE;
511 GLuint r = undef;
512 GLuint index;
513
514 index = ffs(~cs->temp_in_use);
515 if (!index) {
516 ERROR("Out of program temps\n");
517 return r;
518 }
519
520 cs->temp_in_use |= (1 << --index);
521 cs->temps[index].refcount = 0xFFFFFFFF;
522 cs->temps[index].reg = -1;
523
524 REG_SET_TYPE(r, REG_TYPE_TEMP);
525 REG_SET_INDEX(r, index);
526 REG_SET_VALID(r, GL_TRUE);
527 return r;
528 }
529
530 /**
531 * Free a Mesa temporary and the associated R300 temporary.
532 */
533 static void free_temp(struct r300_pfs_compile_state *cs, GLuint r)
534 {
535 GLuint index = REG_GET_INDEX(r);
536
537 if (!(cs->temp_in_use & (1 << index)))
538 return;
539
540 if (REG_GET_TYPE(r) == REG_TYPE_TEMP) {
541 free_hw_temp(cs, cs->temps[index].reg);
542 cs->temps[index].reg = -1;
543 cs->temp_in_use &= ~(1 << index);
544 } else if (REG_GET_TYPE(r) == REG_TYPE_INPUT) {
545 free_hw_temp(cs, cs->inputs[index].reg);
546 cs->inputs[index].reg = -1;
547 }
548 }
549
550 /**
551 * Emit a hardware constant/parameter.
552 */
553 static GLuint emit_const4fv(struct r300_pfs_compile_state *cs,
554 struct prog_src_register srcreg)
555 {
556 COMPILE_STATE;
557 GLuint reg = undef;
558 int index;
559
560 for (index = 0; index < code->const_nr; ++index) {
561 if (code->constant[index].File == srcreg.File &&
562 code->constant[index].Index == srcreg.Index)
563 break;
564 }
565
566 if (index >= code->const_nr) {
567 if (index >= PFS_NUM_CONST_REGS) {
568 ERROR("Out of hw constants!\n");
569 return reg;
570 }
571
572 code->const_nr++;
573 code->constant[index] = srcreg;
574 }
575
576 REG_SET_TYPE(reg, REG_TYPE_CONST);
577 REG_SET_INDEX(reg, index);
578 REG_SET_VALID(reg, GL_TRUE);
579 return reg;
580 }
581
582 static INLINE GLuint negate(GLuint r)
583 {
584 REG_NEGS(r);
585 REG_NEGV(r);
586 return r;
587 }
588
589 /* Hack, to prevent clobbering sources used multiple times when
590 * emulating non-native instructions
591 */
592 static INLINE GLuint keep(GLuint r)
593 {
594 REG_SET_NO_USE(r, GL_TRUE);
595 return r;
596 }
597
598 static INLINE GLuint absolute(GLuint r)
599 {
600 REG_ABS(r);
601 return r;
602 }
603
604 static int swz_native(struct r300_pfs_compile_state *cs,
605 GLuint src, GLuint * r, GLuint arbneg)
606 {
607 COMPILE_STATE;
608
609 /* Native swizzle, handle negation */
610 src = (src & ~REG_NEGS_MASK) | (((arbneg >> 3) & 1) << REG_NEGS_SHIFT);
611
612 if ((arbneg & 0x7) == 0x0) {
613 src = src & ~REG_NEGV_MASK;
614 *r = src;
615 } else if ((arbneg & 0x7) == 0x7) {
616 src |= REG_NEGV_MASK;
617 *r = src;
618 } else {
619 if (!REG_GET_VALID(*r))
620 *r = get_temp_reg(cs);
621 src |= REG_NEGV_MASK;
622 emit_arith(cs,
623 PFS_OP_MAD,
624 *r, arbneg & 0x7, keep(src), pfs_one, pfs_zero, 0);
625 src = src & ~REG_NEGV_MASK;
626 emit_arith(cs,
627 PFS_OP_MAD,
628 *r,
629 (arbneg ^ 0x7) | WRITEMASK_W,
630 src, pfs_one, pfs_zero, 0);
631 }
632
633 return 3;
634 }
635
636 static int swz_emit_partial(struct r300_pfs_compile_state *cs,
637 GLuint src,
638 GLuint * r, int mask, int mc, GLuint arbneg)
639 {
640 COMPILE_STATE;
641 GLuint tmp;
642 GLuint wmask = 0;
643
644 if (!REG_GET_VALID(*r))
645 *r = get_temp_reg(cs);
646
647 /* A partial match, VSWZ/mask define what parts of the
648 * desired swizzle we match
649 */
650 if (mc + s_mask[mask].count == 3) {
651 wmask = WRITEMASK_W;
652 src |= ((arbneg >> 3) & 1) << REG_NEGS_SHIFT;
653 }
654
655 tmp = arbneg & s_mask[mask].mask;
656 if (tmp) {
657 tmp = tmp ^ s_mask[mask].mask;
658 if (tmp) {
659 emit_arith(cs,
660 PFS_OP_MAD,
661 *r,
662 arbneg & s_mask[mask].mask,
663 keep(src) | REG_NEGV_MASK,
664 pfs_one, pfs_zero, 0);
665 if (!wmask) {
666 REG_SET_NO_USE(src, GL_TRUE);
667 } else {
668 REG_SET_NO_USE(src, GL_FALSE);
669 }
670 emit_arith(cs,
671 PFS_OP_MAD,
672 *r, tmp | wmask, src, pfs_one, pfs_zero, 0);
673 } else {
674 if (!wmask) {
675 REG_SET_NO_USE(src, GL_TRUE);
676 } else {
677 REG_SET_NO_USE(src, GL_FALSE);
678 }
679 emit_arith(cs,
680 PFS_OP_MAD,
681 *r,
682 (arbneg & s_mask[mask].mask) | wmask,
683 src | REG_NEGV_MASK, pfs_one, pfs_zero, 0);
684 }
685 } else {
686 if (!wmask) {
687 REG_SET_NO_USE(src, GL_TRUE);
688 } else {
689 REG_SET_NO_USE(src, GL_FALSE);
690 }
691 emit_arith(cs, PFS_OP_MAD,
692 *r,
693 s_mask[mask].mask | wmask,
694 src, pfs_one, pfs_zero, 0);
695 }
696
697 return s_mask[mask].count;
698 }
699
700 static GLuint do_swizzle(struct r300_pfs_compile_state *cs,
701 GLuint src, GLuint arbswz, GLuint arbneg)
702 {
703 COMPILE_STATE;
704 GLuint r = undef;
705 GLuint vswz;
706 int c_mask = 0;
707 int v_match = 0;
708
709 /* If swizzling from something without an XYZW native swizzle,
710 * emit result to a temp, and do new swizzle from the temp.
711 */
712 #if 0
713 if (REG_GET_VSWZ(src) != SWIZZLE_XYZ || REG_GET_SSWZ(src) != SWIZZLE_W) {
714 GLuint temp = get_temp_reg(fp);
715 emit_arith(fp,
716 PFS_OP_MAD,
717 temp, WRITEMASK_XYZW, src, pfs_one, pfs_zero, 0);
718 src = temp;
719 }
720 #endif
721
722 if (REG_GET_VSWZ(src) != SWIZZLE_XYZ || REG_GET_SSWZ(src) != SWIZZLE_W) {
723 GLuint vsrcswz =
724 (v_swiz[REG_GET_VSWZ(src)].
725 hash & (SWZ_X_MASK | SWZ_Y_MASK | SWZ_Z_MASK)) |
726 REG_GET_SSWZ(src) << 9;
727 GLint i;
728
729 GLuint newswz = 0;
730 GLuint offset;
731 for (i = 0; i < 4; ++i) {
732 offset = GET_SWZ(arbswz, i);
733
734 newswz |=
735 (offset <= 3) ? GET_SWZ(vsrcswz,
736 offset) << i *
737 3 : offset << i * 3;
738 }
739
740 arbswz = newswz & (SWZ_X_MASK | SWZ_Y_MASK | SWZ_Z_MASK);
741 REG_SET_SSWZ(src, GET_SWZ(newswz, 3));
742 } else {
743 /* set scalar swizzling */
744 REG_SET_SSWZ(src, GET_SWZ(arbswz, 3));
745
746 }
747 do {
748 vswz = REG_GET_VSWZ(src);
749 do {
750 int chash;
751
752 REG_SET_VSWZ(src, vswz);
753 chash = v_swiz[REG_GET_VSWZ(src)].hash &
754 s_mask[c_mask].hash;
755
756 if (chash == (arbswz & s_mask[c_mask].hash)) {
757 if (s_mask[c_mask].count == 3) {
758 v_match += swz_native(cs,
759 src, &r, arbneg);
760 } else {
761 v_match += swz_emit_partial(cs,
762 src,
763 &r,
764 c_mask,
765 v_match,
766 arbneg);
767 }
768
769 if (v_match == 3)
770 return r;
771
772 /* Fill with something invalid.. all 0's was
773 * wrong before, matched SWIZZLE_X. So all
774 * 1's will be okay for now
775 */
776 arbswz |= (PFS_INVAL & s_mask[c_mask].hash);
777 }
778 } while (v_swiz[++vswz].hash != PFS_INVAL);
779 REG_SET_VSWZ(src, SWIZZLE_XYZ);
780 } while (s_mask[++c_mask].hash != PFS_INVAL);
781
782 ERROR("should NEVER get here\n");
783 return r;
784 }
785
786 static GLuint t_src(struct r300_pfs_compile_state *cs,
787 struct prog_src_register fpsrc)
788 {
789 COMPILE_STATE;
790 GLuint r = undef;
791
792 switch (fpsrc.File) {
793 case PROGRAM_TEMPORARY:
794 REG_SET_INDEX(r, fpsrc.Index);
795 REG_SET_VALID(r, GL_TRUE);
796 REG_SET_TYPE(r, REG_TYPE_TEMP);
797 break;
798 case PROGRAM_INPUT:
799 REG_SET_INDEX(r, fpsrc.Index);
800 REG_SET_VALID(r, GL_TRUE);
801 REG_SET_TYPE(r, REG_TYPE_INPUT);
802 break;
803 case PROGRAM_LOCAL_PARAM:
804 case PROGRAM_ENV_PARAM:
805 case PROGRAM_STATE_VAR:
806 case PROGRAM_NAMED_PARAM:
807 case PROGRAM_CONSTANT:
808 r = emit_const4fv(cs, fpsrc);
809 break;
810 case PROGRAM_BUILTIN:
811 switch(fpsrc.Swizzle) {
812 case SWIZZLE_1111: r = pfs_one; break;
813 case SWIZZLE_0000: r = pfs_zero; break;
814 default:
815 ERROR("bad PROGRAM_BUILTIN swizzle %u\n", fpsrc.Swizzle);
816 break;
817 }
818 break;
819 default:
820 ERROR("unknown SrcReg->File %x\n", fpsrc.File);
821 return r;
822 }
823
824 /* no point swizzling ONE/ZERO/HALF constants... */
825 if (REG_GET_VSWZ(r) < SWIZZLE_111 || REG_GET_SSWZ(r) < SWIZZLE_ZERO)
826 r = do_swizzle(cs, r, fpsrc.Swizzle, fpsrc.NegateBase);
827 if (fpsrc.Abs)
828 r = absolute(r);
829 if (fpsrc.NegateAbs)
830 r = negate(r);
831 return r;
832 }
833
834 static GLuint t_scalar_src(struct r300_pfs_compile_state *cs,
835 struct prog_src_register fpsrc)
836 {
837 struct prog_src_register src = fpsrc;
838 int sc = GET_SWZ(fpsrc.Swizzle, 0); /* X */
839
840 src.Swizzle = ((sc << 0) | (sc << 3) | (sc << 6) | (sc << 9));
841
842 return t_src(cs, src);
843 }
844
845 static GLuint t_dst(struct r300_pfs_compile_state *cs,
846 struct prog_dst_register dest)
847 {
848 COMPILE_STATE;
849 GLuint r = undef;
850
851 switch (dest.File) {
852 case PROGRAM_TEMPORARY:
853 REG_SET_INDEX(r, dest.Index);
854 REG_SET_VALID(r, GL_TRUE);
855 REG_SET_TYPE(r, REG_TYPE_TEMP);
856 return r;
857 case PROGRAM_OUTPUT:
858 REG_SET_TYPE(r, REG_TYPE_OUTPUT);
859 switch (dest.Index) {
860 case FRAG_RESULT_COLR:
861 case FRAG_RESULT_DEPR:
862 REG_SET_INDEX(r, dest.Index);
863 REG_SET_VALID(r, GL_TRUE);
864 return r;
865 default:
866 ERROR("Bad DstReg->Index 0x%x\n", dest.Index);
867 return r;
868 }
869 default:
870 ERROR("Bad DstReg->File 0x%x\n", dest.File);
871 return r;
872 }
873 }
874
875 static int t_hw_src(struct r300_pfs_compile_state *cs, GLuint src, GLboolean tex)
876 {
877 COMPILE_STATE;
878 int idx;
879 int index = REG_GET_INDEX(src);
880
881 switch (REG_GET_TYPE(src)) {
882 case REG_TYPE_TEMP:
883 /* NOTE: if reg==-1 here, a source is being read that
884 * hasn't been written to. Undefined results.
885 */
886 if (cs->temps[index].reg == -1)
887 cs->temps[index].reg = get_hw_temp(cs, cs->nrslots);
888
889 idx = cs->temps[index].reg;
890
891 if (!REG_GET_NO_USE(src) && (--cs->temps[index].refcount == 0))
892 free_temp(cs, src);
893 break;
894 case REG_TYPE_INPUT:
895 idx = cs->inputs[index].reg;
896
897 if (!REG_GET_NO_USE(src) && (--cs->inputs[index].refcount == 0))
898 free_hw_temp(cs, cs->inputs[index].reg);
899 break;
900 case REG_TYPE_CONST:
901 return (index | SRC_CONST);
902 default:
903 ERROR("Invalid type for source reg\n");
904 return (0 | SRC_CONST);
905 }
906
907 if (!tex)
908 cs->used_in_node |= (1 << idx);
909
910 return idx;
911 }
912
913 static int t_hw_dst(struct r300_pfs_compile_state *cs,
914 GLuint dest, GLboolean tex, int slot)
915 {
916 COMPILE_STATE;
917 int idx;
918 GLuint index = REG_GET_INDEX(dest);
919 assert(REG_GET_VALID(dest));
920
921 switch (REG_GET_TYPE(dest)) {
922 case REG_TYPE_TEMP:
923 if (cs->temps[REG_GET_INDEX(dest)].reg == -1) {
924 if (!tex) {
925 cs->temps[index].reg = get_hw_temp(cs, slot);
926 } else {
927 cs->temps[index].reg = get_hw_temp_tex(cs);
928 }
929 }
930 idx = cs->temps[index].reg;
931
932 if (!REG_GET_NO_USE(dest) && (--cs->temps[index].refcount == 0))
933 free_temp(cs, dest);
934
935 cs->dest_in_node |= (1 << idx);
936 cs->used_in_node |= (1 << idx);
937 break;
938 case REG_TYPE_OUTPUT:
939 switch (index) {
940 case FRAG_RESULT_COLR:
941 code->node[code->cur_node].flags |= R300_RGBA_OUT;
942 break;
943 case FRAG_RESULT_DEPR:
944 fp->WritesDepth = GL_TRUE;
945 code->node[code->cur_node].flags |= R300_W_OUT;
946 break;
947 }
948 return index;
949 break;
950 default:
951 ERROR("invalid dest reg type %d\n", REG_GET_TYPE(dest));
952 return 0;
953 }
954
955 return idx;
956 }
957
958 static void emit_nop(struct r300_pfs_compile_state *cs)
959 {
960 COMPILE_STATE;
961
962 if (cs->nrslots >= PFS_MAX_ALU_INST) {
963 ERROR("Out of ALU instruction slots\n");
964 return;
965 }
966
967 code->alu.inst[cs->nrslots].inst0 = NOP_INST0;
968 code->alu.inst[cs->nrslots].inst1 = NOP_INST1;
969 code->alu.inst[cs->nrslots].inst2 = NOP_INST2;
970 code->alu.inst[cs->nrslots].inst3 = NOP_INST3;
971 cs->nrslots++;
972 }
973
974 static void emit_tex(struct r300_pfs_compile_state *cs,
975 struct prog_instruction *fpi, int opcode)
976 {
977 COMPILE_STATE;
978 GLuint coord = t_src(cs, fpi->SrcReg[0]);
979 GLuint dest = undef;
980 GLuint din, uin;
981 int unit = fpi->TexSrcUnit;
982 int hwsrc, hwdest;
983
984 /* Ensure correct node indirection */
985 uin = cs->used_in_node;
986 din = cs->dest_in_node;
987
988 /* Resolve source/dest to hardware registers */
989 hwsrc = t_hw_src(cs, coord, GL_TRUE);
990
991 if (opcode != R300_TEX_OP_KIL) {
992 dest = t_dst(cs, fpi->DstReg);
993
994 hwdest =
995 t_hw_dst(cs, dest, GL_TRUE,
996 code->node[code->cur_node].alu_offset);
997
998 /* Use a temp that hasn't been used in this node, rather
999 * than causing an indirection
1000 */
1001 if (uin & (1 << hwdest)) {
1002 free_hw_temp(cs, hwdest);
1003 hwdest = get_hw_temp_tex(cs);
1004 cs->temps[REG_GET_INDEX(dest)].reg = hwdest;
1005 }
1006 } else {
1007 hwdest = 0;
1008 unit = 0;
1009 }
1010
1011 /* Indirection if source has been written in this node, or if the
1012 * dest has been read/written in this node
1013 */
1014 if ((REG_GET_TYPE(coord) != REG_TYPE_CONST &&
1015 (din & (1 << hwsrc))) || (uin & (1 << hwdest))) {
1016
1017 /* Finish off current node */
1018 if (code->node[code->cur_node].alu_offset == cs->nrslots)
1019 emit_nop(cs);
1020
1021 code->node[code->cur_node].alu_end =
1022 cs->nrslots - code->node[code->cur_node].alu_offset - 1;
1023 assert(code->node[code->cur_node].alu_end >= 0);
1024
1025 if (++code->cur_node >= PFS_MAX_TEX_INDIRECT) {
1026 ERROR("too many levels of texture indirection\n");
1027 return;
1028 }
1029
1030 /* Start new node */
1031 code->node[code->cur_node].tex_offset = code->tex.length;
1032 code->node[code->cur_node].alu_offset = cs->nrslots;
1033 code->node[code->cur_node].tex_end = -1;
1034 code->node[code->cur_node].alu_end = -1;
1035 code->node[code->cur_node].flags = 0;
1036 cs->used_in_node = 0;
1037 cs->dest_in_node = 0;
1038 }
1039
1040 if (code->cur_node == 0)
1041 code->first_node_has_tex = 1;
1042
1043 code->tex.inst[code->tex.length++] = 0 | (hwsrc << R300_SRC_ADDR_SHIFT)
1044 | (hwdest << R300_DST_ADDR_SHIFT)
1045 | (unit << R300_TEX_ID_SHIFT)
1046 | (opcode << R300_TEX_INST_SHIFT);
1047
1048 cs->dest_in_node |= (1 << hwdest);
1049 if (REG_GET_TYPE(coord) != REG_TYPE_CONST)
1050 cs->used_in_node |= (1 << hwsrc);
1051
1052 code->node[code->cur_node].tex_end++;
1053 }
1054
1055 /**
1056 * Returns the first slot where we could possibly allow writing to dest,
1057 * according to register allocation.
1058 */
1059 static int get_earliest_allowed_write(struct r300_pfs_compile_state *cs,
1060 GLuint dest, int mask)
1061 {
1062 COMPILE_STATE;
1063 int idx;
1064 int pos;
1065 GLuint index = REG_GET_INDEX(dest);
1066 assert(REG_GET_VALID(dest));
1067
1068 switch (REG_GET_TYPE(dest)) {
1069 case REG_TYPE_TEMP:
1070 if (cs->temps[index].reg == -1)
1071 return 0;
1072
1073 idx = cs->temps[index].reg;
1074 break;
1075 case REG_TYPE_OUTPUT:
1076 return 0;
1077 default:
1078 ERROR("invalid dest reg type %d\n", REG_GET_TYPE(dest));
1079 return 0;
1080 }
1081
1082 pos = cs->hwtemps[idx].reserved;
1083 if (mask & WRITEMASK_XYZ) {
1084 if (pos < cs->hwtemps[idx].vector_lastread)
1085 pos = cs->hwtemps[idx].vector_lastread;
1086 }
1087 if (mask & WRITEMASK_W) {
1088 if (pos < cs->hwtemps[idx].scalar_lastread)
1089 pos = cs->hwtemps[idx].scalar_lastread;
1090 }
1091
1092 return pos;
1093 }
1094
1095 /**
1096 * Allocates a slot for an ALU instruction that can consist of
1097 * a vertex part or a scalar part or both.
1098 *
1099 * Sources from src (src[0] to src[argc-1]) are added to the slot in the
1100 * appropriate position (vector and/or scalar), and their positions are
1101 * recorded in the srcpos array.
1102 *
1103 * This function emits instruction code for the source fetch and the
1104 * argument selection. It does not emit instruction code for the
1105 * opcode or the destination selection.
1106 *
1107 * @return the index of the slot
1108 */
1109 static int find_and_prepare_slot(struct r300_pfs_compile_state *cs,
1110 GLboolean emit_vop,
1111 GLboolean emit_sop,
1112 int argc, GLuint * src, GLuint dest, int mask)
1113 {
1114 COMPILE_STATE;
1115 int hwsrc[3];
1116 int srcpos[3];
1117 unsigned int used;
1118 int tempused;
1119 int tempvsrc[3];
1120 int tempssrc[3];
1121 int pos;
1122 int regnr;
1123 int i, j;
1124
1125 // Determine instruction slots, whether sources are required on
1126 // vector or scalar side, and the smallest slot number where
1127 // all source registers are available
1128 used = 0;
1129 if (emit_vop)
1130 used |= SLOT_OP_VECTOR;
1131 if (emit_sop)
1132 used |= SLOT_OP_SCALAR;
1133
1134 pos = get_earliest_allowed_write(cs, dest, mask);
1135
1136 if (code->node[code->cur_node].alu_offset > pos)
1137 pos = code->node[code->cur_node].alu_offset;
1138 for (i = 0; i < argc; ++i) {
1139 if (!REG_GET_BUILTIN(src[i])) {
1140 if (emit_vop)
1141 used |= v_swiz[REG_GET_VSWZ(src[i])].flags << i;
1142 if (emit_sop)
1143 used |= s_swiz[REG_GET_SSWZ(src[i])].flags << i;
1144 }
1145
1146 hwsrc[i] = t_hw_src(cs, src[i], GL_FALSE); /* Note: sideeffects wrt refcounting! */
1147 regnr = hwsrc[i] & 31;
1148
1149 if (REG_GET_TYPE(src[i]) == REG_TYPE_TEMP) {
1150 if (used & (SLOT_SRC_VECTOR << i)) {
1151 if (cs->hwtemps[regnr].vector_valid > pos)
1152 pos = cs->hwtemps[regnr].vector_valid;
1153 }
1154 if (used & (SLOT_SRC_SCALAR << i)) {
1155 if (cs->hwtemps[regnr].scalar_valid > pos)
1156 pos = cs->hwtemps[regnr].scalar_valid;
1157 }
1158 }
1159 }
1160
1161 // Find a slot that fits
1162 for (;; ++pos) {
1163 if (cs->slot[pos].used & used & SLOT_OP_BOTH)
1164 continue;
1165
1166 if (pos >= cs->nrslots) {
1167 if (cs->nrslots >= PFS_MAX_ALU_INST) {
1168 ERROR("Out of ALU instruction slots\n");
1169 return -1;
1170 }
1171
1172 code->alu.inst[pos].inst0 = NOP_INST0;
1173 code->alu.inst[pos].inst1 = NOP_INST1;
1174 code->alu.inst[pos].inst2 = NOP_INST2;
1175 code->alu.inst[pos].inst3 = NOP_INST3;
1176
1177 cs->nrslots++;
1178 }
1179 // Note: When we need both parts (vector and scalar) of a source,
1180 // we always try to put them into the same position. This makes the
1181 // code easier to read, and it is optimal (i.e. one doesn't gain
1182 // anything by splitting the parts).
1183 // It also avoids headaches with swizzles that access both parts (i.e WXY)
1184 tempused = cs->slot[pos].used;
1185 for (i = 0; i < 3; ++i) {
1186 tempvsrc[i] = cs->slot[pos].vsrc[i];
1187 tempssrc[i] = cs->slot[pos].ssrc[i];
1188 }
1189
1190 for (i = 0; i < argc; ++i) {
1191 int flags = (used >> i) & SLOT_SRC_BOTH;
1192
1193 if (!flags) {
1194 srcpos[i] = 0;
1195 continue;
1196 }
1197
1198 for (j = 0; j < 3; ++j) {
1199 if ((tempused >> j) & flags & SLOT_SRC_VECTOR) {
1200 if (tempvsrc[j] != hwsrc[i])
1201 continue;
1202 }
1203
1204 if ((tempused >> j) & flags & SLOT_SRC_SCALAR) {
1205 if (tempssrc[j] != hwsrc[i])
1206 continue;
1207 }
1208
1209 break;
1210 }
1211
1212 if (j == 3)
1213 break;
1214
1215 srcpos[i] = j;
1216 tempused |= flags << j;
1217 if (flags & SLOT_SRC_VECTOR)
1218 tempvsrc[j] = hwsrc[i];
1219 if (flags & SLOT_SRC_SCALAR)
1220 tempssrc[j] = hwsrc[i];
1221 }
1222
1223 if (i == argc)
1224 break;
1225 }
1226
1227 // Found a slot, reserve it
1228 cs->slot[pos].used = tempused | (used & SLOT_OP_BOTH);
1229 for (i = 0; i < 3; ++i) {
1230 cs->slot[pos].vsrc[i] = tempvsrc[i];
1231 cs->slot[pos].ssrc[i] = tempssrc[i];
1232 }
1233
1234 for (i = 0; i < argc; ++i) {
1235 if (REG_GET_TYPE(src[i]) == REG_TYPE_TEMP) {
1236 int regnr = hwsrc[i] & 31;
1237
1238 if (used & (SLOT_SRC_VECTOR << i)) {
1239 if (cs->hwtemps[regnr].vector_lastread < pos)
1240 cs->hwtemps[regnr].vector_lastread =
1241 pos;
1242 }
1243 if (used & (SLOT_SRC_SCALAR << i)) {
1244 if (cs->hwtemps[regnr].scalar_lastread < pos)
1245 cs->hwtemps[regnr].scalar_lastread =
1246 pos;
1247 }
1248 }
1249 }
1250
1251 // Emit the source fetch code
1252 code->alu.inst[pos].inst1 &= ~R300_ALU_SRC_MASK;
1253 code->alu.inst[pos].inst1 |=
1254 ((cs->slot[pos].vsrc[0] << R300_ALU_SRC0C_SHIFT) |
1255 (cs->slot[pos].vsrc[1] << R300_ALU_SRC1C_SHIFT) |
1256 (cs->slot[pos].vsrc[2] << R300_ALU_SRC2C_SHIFT));
1257
1258 code->alu.inst[pos].inst3 &= ~R300_ALU_SRC_MASK;
1259 code->alu.inst[pos].inst3 |=
1260 ((cs->slot[pos].ssrc[0] << R300_ALU_SRC0A_SHIFT) |
1261 (cs->slot[pos].ssrc[1] << R300_ALU_SRC1A_SHIFT) |
1262 (cs->slot[pos].ssrc[2] << R300_ALU_SRC2A_SHIFT));
1263
1264 // Emit the argument selection code
1265 if (emit_vop) {
1266 int swz[3];
1267
1268 for (i = 0; i < 3; ++i) {
1269 if (i < argc) {
1270 swz[i] = (v_swiz[REG_GET_VSWZ(src[i])].base +
1271 (srcpos[i] *
1272 v_swiz[REG_GET_VSWZ(src[i])].
1273 stride)) | ((src[i] & REG_NEGV_MASK)
1274 ? ARG_NEG : 0) | ((src[i]
1275 &
1276 REG_ABS_MASK)
1277 ?
1278 ARG_ABS
1279 : 0);
1280 } else {
1281 swz[i] = R300_ALU_ARGC_ZERO;
1282 }
1283 }
1284
1285 code->alu.inst[pos].inst0 &=
1286 ~(R300_ALU_ARG0C_MASK | R300_ALU_ARG1C_MASK |
1287 R300_ALU_ARG2C_MASK);
1288 code->alu.inst[pos].inst0 |=
1289 (swz[0] << R300_ALU_ARG0C_SHIFT) | (swz[1] <<
1290 R300_ALU_ARG1C_SHIFT)
1291 | (swz[2] << R300_ALU_ARG2C_SHIFT);
1292 }
1293
1294 if (emit_sop) {
1295 int swz[3];
1296
1297 for (i = 0; i < 3; ++i) {
1298 if (i < argc) {
1299 swz[i] = (s_swiz[REG_GET_SSWZ(src[i])].base +
1300 (srcpos[i] *
1301 s_swiz[REG_GET_SSWZ(src[i])].
1302 stride)) | ((src[i] & REG_NEGS_MASK)
1303 ? ARG_NEG : 0) | ((src[i]
1304 &
1305 REG_ABS_MASK)
1306 ?
1307 ARG_ABS
1308 : 0);
1309 } else {
1310 swz[i] = R300_ALU_ARGA_ZERO;
1311 }
1312 }
1313
1314 code->alu.inst[pos].inst2 &=
1315 ~(R300_ALU_ARG0A_MASK | R300_ALU_ARG1A_MASK |
1316 R300_ALU_ARG2A_MASK);
1317 code->alu.inst[pos].inst2 |=
1318 (swz[0] << R300_ALU_ARG0A_SHIFT) | (swz[1] <<
1319 R300_ALU_ARG1A_SHIFT)
1320 | (swz[2] << R300_ALU_ARG2A_SHIFT);
1321 }
1322
1323 return pos;
1324 }
1325
1326 /**
1327 * Append an ALU instruction to the instruction list.
1328 */
1329 static void emit_arith(struct r300_pfs_compile_state *cs,
1330 int op,
1331 GLuint dest,
1332 int mask,
1333 GLuint src0, GLuint src1, GLuint src2, int flags)
1334 {
1335 COMPILE_STATE;
1336 GLuint src[3] = { src0, src1, src2 };
1337 int hwdest;
1338 GLboolean emit_vop, emit_sop;
1339 int vop, sop, argc;
1340 int pos;
1341
1342 vop = r300_fpop[op].v_op;
1343 sop = r300_fpop[op].s_op;
1344 argc = r300_fpop[op].argc;
1345
1346 if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT &&
1347 REG_GET_INDEX(dest) == FRAG_RESULT_DEPR) {
1348 if (mask & WRITEMASK_Z) {
1349 mask = WRITEMASK_W;
1350 } else {
1351 return;
1352 }
1353 }
1354
1355 emit_vop = GL_FALSE;
1356 emit_sop = GL_FALSE;
1357 if ((mask & WRITEMASK_XYZ) || vop == R300_ALU_OUTC_DP3)
1358 emit_vop = GL_TRUE;
1359 if ((mask & WRITEMASK_W) || vop == R300_ALU_OUTC_REPL_ALPHA)
1360 emit_sop = GL_TRUE;
1361
1362 pos =
1363 find_and_prepare_slot(cs, emit_vop, emit_sop, argc, src, dest,
1364 mask);
1365 if (pos < 0)
1366 return;
1367
1368 hwdest = t_hw_dst(cs, dest, GL_FALSE, pos); /* Note: Side effects wrt register allocation */
1369
1370 if (flags & PFS_FLAG_SAT) {
1371 vop |= R300_ALU_OUTC_CLAMP;
1372 sop |= R300_ALU_OUTA_CLAMP;
1373 }
1374
1375 /* Throw the pieces together and get ALU/1 */
1376 if (emit_vop) {
1377 code->alu.inst[pos].inst0 |= vop;
1378
1379 code->alu.inst[pos].inst1 |= hwdest << R300_ALU_DSTC_SHIFT;
1380
1381 if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT) {
1382 if (REG_GET_INDEX(dest) == FRAG_RESULT_COLR) {
1383 code->alu.inst[pos].inst1 |=
1384 (mask & WRITEMASK_XYZ) <<
1385 R300_ALU_DSTC_OUTPUT_MASK_SHIFT;
1386 } else
1387 assert(0);
1388 } else {
1389 code->alu.inst[pos].inst1 |=
1390 (mask & WRITEMASK_XYZ) <<
1391 R300_ALU_DSTC_REG_MASK_SHIFT;
1392
1393 cs->hwtemps[hwdest].vector_valid = pos + 1;
1394 }
1395 }
1396
1397 /* And now ALU/3 */
1398 if (emit_sop) {
1399 code->alu.inst[pos].inst2 |= sop;
1400
1401 if (mask & WRITEMASK_W) {
1402 if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT) {
1403 if (REG_GET_INDEX(dest) == FRAG_RESULT_COLR) {
1404 code->alu.inst[pos].inst3 |=
1405 (hwdest << R300_ALU_DSTA_SHIFT) |
1406 R300_ALU_DSTA_OUTPUT;
1407 } else if (REG_GET_INDEX(dest) ==
1408 FRAG_RESULT_DEPR) {
1409 code->alu.inst[pos].inst3 |=
1410 R300_ALU_DSTA_DEPTH;
1411 } else
1412 assert(0);
1413 } else {
1414 code->alu.inst[pos].inst3 |=
1415 (hwdest << R300_ALU_DSTA_SHIFT) |
1416 R300_ALU_DSTA_REG;
1417
1418 cs->hwtemps[hwdest].scalar_valid = pos + 1;
1419 }
1420 }
1421 }
1422
1423 return;
1424 }
1425
1426 static void emit_instruction(struct r300_pfs_compile_state *cs, struct prog_instruction *fpi)
1427 {
1428 COMPILE_STATE;
1429 GLuint src[3], dest;
1430 int flags, mask = 0;
1431
1432 if (fpi->SaturateMode == SATURATE_ZERO_ONE)
1433 flags = PFS_FLAG_SAT;
1434 else
1435 flags = 0;
1436
1437 if (fpi->Opcode != OPCODE_KIL) {
1438 dest = t_dst(cs, fpi->DstReg);
1439 mask = fpi->DstReg.WriteMask;
1440 }
1441
1442 switch (fpi->Opcode) {
1443 case OPCODE_ADD:
1444 src[0] = t_src(cs, fpi->SrcReg[0]);
1445 src[1] = t_src(cs, fpi->SrcReg[1]);
1446 emit_arith(cs, PFS_OP_MAD, dest, mask,
1447 src[0], pfs_one, src[1], flags);
1448 break;
1449 case OPCODE_CMP:
1450 src[0] = t_src(cs, fpi->SrcReg[0]);
1451 src[1] = t_src(cs, fpi->SrcReg[1]);
1452 src[2] = t_src(cs, fpi->SrcReg[2]);
1453 /* ARB_f_p - if src0.c < 0.0 ? src1.c : src2.c
1454 * r300 - if src2.c < 0.0 ? src1.c : src0.c
1455 */
1456 emit_arith(cs, PFS_OP_CMP, dest, mask,
1457 src[2], src[1], src[0], flags);
1458 break;
1459 case OPCODE_DP3:
1460 src[0] = t_src(cs, fpi->SrcReg[0]);
1461 src[1] = t_src(cs, fpi->SrcReg[1]);
1462 emit_arith(cs, PFS_OP_DP3, dest, mask,
1463 src[0], src[1], undef, flags);
1464 break;
1465 case OPCODE_DP4:
1466 src[0] = t_src(cs, fpi->SrcReg[0]);
1467 src[1] = t_src(cs, fpi->SrcReg[1]);
1468 emit_arith(cs, PFS_OP_DP4, dest, mask,
1469 src[0], src[1], undef, flags);
1470 break;
1471 case OPCODE_EX2:
1472 src[0] = t_scalar_src(cs, fpi->SrcReg[0]);
1473 emit_arith(cs, PFS_OP_EX2, dest, mask,
1474 src[0], undef, undef, flags);
1475 break;
1476 case OPCODE_FRC:
1477 src[0] = t_src(cs, fpi->SrcReg[0]);
1478 emit_arith(cs, PFS_OP_FRC, dest, mask,
1479 src[0], undef, undef, flags);
1480 break;
1481 case OPCODE_KIL:
1482 emit_tex(cs, fpi, R300_TEX_OP_KIL);
1483 break;
1484 case OPCODE_LG2:
1485 src[0] = t_scalar_src(cs, fpi->SrcReg[0]);
1486 emit_arith(cs, PFS_OP_LG2, dest, mask,
1487 src[0], undef, undef, flags);
1488 break;
1489 case OPCODE_MAD:
1490 src[0] = t_src(cs, fpi->SrcReg[0]);
1491 src[1] = t_src(cs, fpi->SrcReg[1]);
1492 src[2] = t_src(cs, fpi->SrcReg[2]);
1493 emit_arith(cs, PFS_OP_MAD, dest, mask,
1494 src[0], src[1], src[2], flags);
1495 break;
1496 case OPCODE_MAX:
1497 src[0] = t_src(cs, fpi->SrcReg[0]);
1498 src[1] = t_src(cs, fpi->SrcReg[1]);
1499 emit_arith(cs, PFS_OP_MAX, dest, mask,
1500 src[0], src[1], undef, flags);
1501 break;
1502 case OPCODE_MIN:
1503 src[0] = t_src(cs, fpi->SrcReg[0]);
1504 src[1] = t_src(cs, fpi->SrcReg[1]);
1505 emit_arith(cs, PFS_OP_MIN, dest, mask,
1506 src[0], src[1], undef, flags);
1507 break;
1508 case OPCODE_MOV:
1509 src[0] = t_src(cs, fpi->SrcReg[0]);
1510 emit_arith(cs, PFS_OP_MAD, dest, mask,
1511 src[0], pfs_one, pfs_zero, flags);
1512 break;
1513 case OPCODE_MUL:
1514 src[0] = t_src(cs, fpi->SrcReg[0]);
1515 src[1] = t_src(cs, fpi->SrcReg[1]);
1516 emit_arith(cs, PFS_OP_MAD, dest, mask,
1517 src[0], src[1], pfs_zero, flags);
1518 break;
1519 case OPCODE_RCP:
1520 src[0] = t_scalar_src(cs, fpi->SrcReg[0]);
1521 emit_arith(cs, PFS_OP_RCP, dest, mask,
1522 src[0], undef, undef, flags);
1523 break;
1524 case OPCODE_RSQ:
1525 src[0] = t_scalar_src(cs, fpi->SrcReg[0]);
1526 emit_arith(cs, PFS_OP_RSQ, dest, mask,
1527 absolute(src[0]), pfs_zero, pfs_zero, flags);
1528 break;
1529 case OPCODE_TEX:
1530 emit_tex(cs, fpi, R300_TEX_OP_LD);
1531 break;
1532 case OPCODE_TXB:
1533 emit_tex(cs, fpi, R300_TEX_OP_TXB);
1534 break;
1535 case OPCODE_TXP:
1536 emit_tex(cs, fpi, R300_TEX_OP_TXP);
1537 break;
1538 default:
1539 ERROR("unknown fpi->Opcode %d\n", fpi->Opcode);
1540 break;
1541 }
1542 }
1543
1544 static GLboolean parse_program(struct r300_pfs_compile_state *cs)
1545 {
1546 COMPILE_STATE;
1547 struct prog_instruction* fpi;
1548
1549 for(fpi = cs->compiler->program->Instructions; fpi->Opcode != OPCODE_END; ++fpi) {
1550 emit_instruction(cs, fpi);
1551
1552 if (fp->error)
1553 return GL_FALSE;
1554 }
1555
1556 return GL_TRUE;
1557 }
1558
1559
1560 /* - Init structures
1561 * - Determine what hwregs each input corresponds to
1562 */
1563 static void init_program(struct r300_pfs_compile_state *cs)
1564 {
1565 COMPILE_STATE;
1566 struct gl_fragment_program *mp = &fp->mesa_program;
1567 GLuint InputsRead = mp->Base.InputsRead;
1568 GLuint temps_used = 0; /* for fp->temps[] */
1569 int i, j;
1570
1571 /* New compile, reset tracking data */
1572 fp->optimization =
1573 driQueryOptioni(&cs->compiler->r300->radeon.optionCache, "fp_optimization");
1574 fp->translated = GL_FALSE;
1575 fp->error = GL_FALSE;
1576 fp->WritesDepth = GL_FALSE;
1577 code->tex.length = 0;
1578 code->cur_node = 0;
1579 code->first_node_has_tex = 0;
1580 code->const_nr = 0;
1581 code->max_temp_idx = 0;
1582 code->node[0].alu_end = -1;
1583 code->node[0].tex_end = -1;
1584
1585 for (i = 0; i < PFS_MAX_ALU_INST; i++) {
1586 for (j = 0; j < 3; j++) {
1587 cs->slot[i].vsrc[j] = SRC_CONST;
1588 cs->slot[i].ssrc[j] = SRC_CONST;
1589 }
1590 }
1591
1592 /* Work out what temps the Mesa inputs correspond to, this must match
1593 * what setup_rs_unit does, which shouldn't be a problem as rs_unit
1594 * configures itself based on the fragprog's InputsRead
1595 *
1596 * NOTE: this depends on get_hw_temp() allocating registers in order,
1597 * starting from register 0.
1598 */
1599
1600 /* Texcoords come first */
1601 for (i = 0; i < cs->compiler->r300->radeon.glCtx->Const.MaxTextureUnits; i++) {
1602 if (InputsRead & (FRAG_BIT_TEX0 << i)) {
1603 cs->inputs[FRAG_ATTRIB_TEX0 + i].refcount = 0;
1604 cs->inputs[FRAG_ATTRIB_TEX0 + i].reg =
1605 get_hw_temp(cs, 0);
1606 }
1607 }
1608 InputsRead &= ~FRAG_BITS_TEX_ANY;
1609
1610 /* fragment position treated as a texcoord */
1611 if (InputsRead & FRAG_BIT_WPOS) {
1612 cs->inputs[FRAG_ATTRIB_WPOS].refcount = 0;
1613 cs->inputs[FRAG_ATTRIB_WPOS].reg = get_hw_temp(cs, 0);
1614 }
1615 InputsRead &= ~FRAG_BIT_WPOS;
1616
1617 /* Then primary colour */
1618 if (InputsRead & FRAG_BIT_COL0) {
1619 cs->inputs[FRAG_ATTRIB_COL0].refcount = 0;
1620 cs->inputs[FRAG_ATTRIB_COL0].reg = get_hw_temp(cs, 0);
1621 }
1622 InputsRead &= ~FRAG_BIT_COL0;
1623
1624 /* Secondary color */
1625 if (InputsRead & FRAG_BIT_COL1) {
1626 cs->inputs[FRAG_ATTRIB_COL1].refcount = 0;
1627 cs->inputs[FRAG_ATTRIB_COL1].reg = get_hw_temp(cs, 0);
1628 }
1629 InputsRead &= ~FRAG_BIT_COL1;
1630
1631 /* Anything else */
1632 if (InputsRead) {
1633 WARN_ONCE("Don't know how to handle inputs 0x%x\n", InputsRead);
1634 /* force read from hwreg 0 for now */
1635 for (i = 0; i < 32; i++)
1636 if (InputsRead & (1 << i))
1637 cs->inputs[i].reg = 0;
1638 }
1639
1640 /* Pre-parse the program, grabbing refcounts on input/temp regs.
1641 * That way, we can free up the reg when it's no longer needed
1642 */
1643 for (i = 0; i < cs->compiler->program->NumInstructions; ++i) {
1644 struct prog_instruction *fpi = cs->compiler->program->Instructions + i;
1645 int idx;
1646
1647 for (j = 0; j < 3; j++) {
1648 idx = fpi->SrcReg[j].Index;
1649 switch (fpi->SrcReg[j].File) {
1650 case PROGRAM_TEMPORARY:
1651 if (!(temps_used & (1 << idx))) {
1652 cs->temps[idx].reg = -1;
1653 cs->temps[idx].refcount = 1;
1654 temps_used |= (1 << idx);
1655 } else
1656 cs->temps[idx].refcount++;
1657 break;
1658 case PROGRAM_INPUT:
1659 cs->inputs[idx].refcount++;
1660 break;
1661 default:
1662 break;
1663 }
1664 }
1665
1666 idx = fpi->DstReg.Index;
1667 if (fpi->DstReg.File == PROGRAM_TEMPORARY) {
1668 if (!(temps_used & (1 << idx))) {
1669 cs->temps[idx].reg = -1;
1670 cs->temps[idx].refcount = 1;
1671 temps_used |= (1 << idx);
1672 } else
1673 cs->temps[idx].refcount++;
1674 }
1675 }
1676 cs->temp_in_use = temps_used;
1677 }
1678
1679
1680 /**
1681 * Final compilation step: Turn the intermediate radeon_program into
1682 * machine-readable instructions.
1683 */
1684 GLboolean r300FragmentProgramEmit(struct r300_fragment_program_compiler *compiler)
1685 {
1686 struct r300_pfs_compile_state cs;
1687 struct r300_fragment_program_code *code = compiler->code;
1688
1689 _mesa_memset(&cs, 0, sizeof(cs));
1690 cs.compiler = compiler;
1691 init_program(&cs);
1692
1693 if (!parse_program(&cs))
1694 return GL_FALSE;
1695
1696 /* Finish off */
1697 code->node[code->cur_node].alu_end =
1698 cs.nrslots - code->node[code->cur_node].alu_offset - 1;
1699 if (code->node[code->cur_node].tex_end < 0)
1700 code->node[code->cur_node].tex_end = 0;
1701 code->alu_offset = 0;
1702 code->alu_end = cs.nrslots - 1;
1703 code->tex_offset = 0;
1704 code->tex_end = code->tex.length ? code->tex.length - 1 : 0;
1705 assert(code->node[code->cur_node].alu_end >= 0);
1706 assert(code->alu_end >= 0);
1707
1708 return GL_TRUE;
1709 }
1710