r300_fragprog: Refactor TEX transformation
[mesa.git] / src / mesa / drivers / dri / r300 / r300_fragprog_emit.c
1 /*
2 * Copyright (C) 2005 Ben Skeggs.
3 *
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining
7 * a copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sublicense, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial
16 * portions of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 */
27
28 /**
29 * \file
30 *
31 * Emit the r300_fragment_program_code that can be understood by the hardware.
32 * Input is a pre-transformed radeon_program.
33 *
34 * \author Ben Skeggs <darktama@iinet.net.au>
35 *
36 * \author Jerome Glisse <j.glisse@gmail.com>
37 *
38 * \todo FogOption
39 *
40 * \todo Verify results of opcodes for accuracy, I've only checked them in
41 * specific cases.
42 */
43
44 #include "glheader.h"
45 #include "macros.h"
46 #include "enums.h"
47 #include "shader/prog_instruction.h"
48 #include "shader/prog_parameter.h"
49 #include "shader/prog_print.h"
50
51 #include "r300_context.h"
52 #include "r300_fragprog.h"
53 #include "r300_reg.h"
54 #include "r300_state.h"
55
/**
 * Book-keeping record that binds one Mesa register to an R300 temporary.
 */
struct reg_acc {
	/* Hardware temporary assigned to the Mesa register */
	int reg;

	/* Number of uses of the register by the Mesa program */
	unsigned int refcount;
};
61
/**
 * Lifetime tracking for a single R300 temporary.
 * Every member is an index into the sequence of ALU instruction slots.
 */
struct reg_lifetime {
	/* First slot at which this register is free, i.e. may be picked as
	 * a new destination register.  Holds -1 while the register is
	 * assigned to a Mesa register whose last access has not been
	 * emitted yet. */
	int free;

	/* First slot at which this register is currently reserved.  It
	 * keeps e.g. a scalar operation from being moved in front of the
	 * allocation time of a register that was first allocated for a
	 * vector operation. */
	int reserved;

	/* First slot in which the register can serve as a source without
	 * losing the value written by the last emitted instruction that
	 * writes to it (tracked per vector / scalar part). */
	int vector_valid;
	int scalar_valid;

	/* Slot in which the register was last read; this is also the first
	 * slot in which it may be written again. */
	int vector_lastread;
	int scalar_lastread;
};
89
/**
 * Usage information about one ALU instruction slot, collected while a
 * fragment program is being compiled.
 *
 * The SLOT_SRC_* flags describe on which side (vector and/or scalar) a
 * source is fetched; the SLOT_OP_* flags describe which opcode parts of
 * the slot are occupied.
 */
#define SLOT_SRC_VECTOR		(1 << 0)
#define SLOT_SRC_SCALAR		(1 << 3)
#define SLOT_SRC_BOTH		(SLOT_SRC_VECTOR | SLOT_SRC_SCALAR)
#define SLOT_OP_VECTOR		(1 << 16)
#define SLOT_OP_SCALAR		(1 << 17)
#define SLOT_OP_BOTH		(SLOT_OP_VECTOR | SLOT_OP_SCALAR)

struct r300_pfs_compile_slot {
	/* Bitmask built from the SLOT_ constants above, telling which
	 * parts of the slot are in use */
	unsigned int used;

	/* Sources selected for the vector part */
	int vsrc[3];

	/* Sources selected for the scalar part */
	int ssrc[3];
};
110
111 /**
112 * Store information during compilation of fragment programs.
113 */
114 struct r300_pfs_compile_state {
115 struct r300_fragment_program_compiler *compiler;
116
117 int nrslots; /* number of ALU slots used so far */
118
119 /* Track which (parts of) slots are already filled with instructions */
120 struct r300_pfs_compile_slot slot[PFS_MAX_ALU_INST];
121
122 /* Track the validity of R300 temporaries */
123 struct reg_lifetime hwtemps[PFS_NUM_TEMP_REGS];
124
125 /* Used to map Mesa's inputs/temps onto hardware temps */
126 int temp_in_use;
127 struct reg_acc temps[PFS_NUM_TEMP_REGS];
128 struct reg_acc inputs[32]; /* don't actually need 32... */
129
130 /* Track usage of hardware temps, for register allocation,
131 * indirection detection, etc. */
132 GLuint used_in_node;
133 GLuint dest_in_node;
134 };
135
136
/*
 * Useful macros and values
 */

/* Print a message prefixed with file and function name to stderr and mark
 * the fragment program as failed.  A variable named fp must be in scope
 * (COMPILE_STATE provides one). */
#define ERROR(fmt, ...) do {						\
		fprintf(stderr, "%s::%s(): " fmt "\n",			\
			__FILE__, __FUNCTION__, ##__VA_ARGS__);		\
		fp->error = GL_TRUE;					\
	} while(0)

/* Marker for "no valid value"; also terminates the lookup tables below */
#define PFS_INVAL 0xFFFFFFFF

/* Declares the locals fp and code from a compile state pointer named cs;
 * the void casts silence unused-variable warnings. */
#define COMPILE_STATE							\
	struct r300_fragment_program *fp = cs->compiler->fp;		\
	struct r300_fragment_program_code *code = cs->compiler->code;	\
	(void)code; (void)fp
151
/* Indices of the three-component swizzles known to this file */
#define SWIZZLE_XYZ	0
#define SWIZZLE_XXX	1
#define SWIZZLE_YYY	2
#define SWIZZLE_ZZZ	3
#define SWIZZLE_WWW	4
#define SWIZZLE_YZX	5
#define SWIZZLE_ZXY	6
#define SWIZZLE_WZY	7
#define SWIZZLE_111	8
#define SWIZZLE_000	9
#define SWIZZLE_HHH	10

/* Shorthand for do_swizzle() without negation: x, y, z and w are pasted
 * onto SWIZZLE_ and packed three bits per component.  A variable named
 * cs must be in scope. */
#define swizzle(r, x, y, z, w)				\
	do_swizzle(cs, r,				\
		   ((SWIZZLE_##x << 0) |		\
		    (SWIZZLE_##y << 3) |		\
		    (SWIZZLE_##z << 6) |		\
		    (SWIZZLE_##w << 9)),		\
		   0)
170
/*
 * Packed register descriptor.
 *
 * A register reference is stored in one unsigned integer made of the bit
 * fields below.  Every macro parameter is parenthesized so that compound
 * arguments (e.g. "a | b") are packed and unpacked correctly.
 */

/* Values of the type field */
#define REG_TYPE_INPUT  0
#define REG_TYPE_OUTPUT 1
#define REG_TYPE_TEMP   2
#define REG_TYPE_CONST  3

/* Bit position of each field */
#define REG_TYPE_SHIFT    0
#define REG_INDEX_SHIFT   2
#define REG_VSWZ_SHIFT    8
#define REG_SSWZ_SHIFT    13
#define REG_NEGV_SHIFT    18
#define REG_NEGS_SHIFT    19
#define REG_ABS_SHIFT     20
#define REG_NO_USE_SHIFT  21 // Hack for refcounting
#define REG_VALID_SHIFT   22 // Does the register contain a defined value?
#define REG_BUILTIN_SHIFT 23 // Is it a builtin (like all zero/all one)?

/* Mask of each field (already shifted into place) */
#define REG_TYPE_MASK    (0x03 << REG_TYPE_SHIFT)
#define REG_INDEX_MASK   (0x3F << REG_INDEX_SHIFT)
#define REG_VSWZ_MASK    (0x1F << REG_VSWZ_SHIFT)
#define REG_SSWZ_MASK    (0x1F << REG_SSWZ_SHIFT)
#define REG_NEGV_MASK    (0x01 << REG_NEGV_SHIFT)
#define REG_NEGS_MASK    (0x01 << REG_NEGS_SHIFT)
#define REG_ABS_MASK     (0x01 << REG_ABS_SHIFT)
#define REG_NO_USE_MASK  (0x01 << REG_NO_USE_SHIFT)
#define REG_VALID_MASK   (0x01 << REG_VALID_SHIFT)
#define REG_BUILTIN_MASK (0x01 << REG_BUILTIN_SHIFT)

/* Build a descriptor from its fields; negate/abs bits start cleared */
#define REG(type, index, vswz, sswz, nouse, valid, builtin)		\
	((((type) << REG_TYPE_SHIFT) & REG_TYPE_MASK) |			\
	 (((index) << REG_INDEX_SHIFT) & REG_INDEX_MASK) |		\
	 (((nouse) << REG_NO_USE_SHIFT) & REG_NO_USE_MASK) |		\
	 (((valid) << REG_VALID_SHIFT) & REG_VALID_MASK) |		\
	 (((builtin) << REG_BUILTIN_SHIFT) & REG_BUILTIN_MASK) |	\
	 (((vswz) << REG_VSWZ_SHIFT) & REG_VSWZ_MASK) |			\
	 (((sswz) << REG_SSWZ_SHIFT) & REG_SSWZ_MASK))

/* Field extractors: yield the field value shifted down to bit 0 */
#define REG_GET_TYPE(reg) \
	(((reg) & REG_TYPE_MASK) >> REG_TYPE_SHIFT)
#define REG_GET_INDEX(reg) \
	(((reg) & REG_INDEX_MASK) >> REG_INDEX_SHIFT)
#define REG_GET_VSWZ(reg) \
	(((reg) & REG_VSWZ_MASK) >> REG_VSWZ_SHIFT)
#define REG_GET_SSWZ(reg) \
	(((reg) & REG_SSWZ_MASK) >> REG_SSWZ_SHIFT)
#define REG_GET_NO_USE(reg) \
	(((reg) & REG_NO_USE_MASK) >> REG_NO_USE_SHIFT)
#define REG_GET_VALID(reg) \
	(((reg) & REG_VALID_MASK) >> REG_VALID_SHIFT)
#define REG_GET_BUILTIN(reg) \
	(((reg) & REG_BUILTIN_MASK) >> REG_BUILTIN_SHIFT)

/* Field setters: reg must be a modifiable lvalue; it is assigned in place */
#define REG_SET_TYPE(reg, type) \
	((reg) = (((reg) & ~REG_TYPE_MASK) | \
		  (((type) << REG_TYPE_SHIFT) & REG_TYPE_MASK)))
#define REG_SET_INDEX(reg, index) \
	((reg) = (((reg) & ~REG_INDEX_MASK) | \
		  (((index) << REG_INDEX_SHIFT) & REG_INDEX_MASK)))
#define REG_SET_VSWZ(reg, vswz) \
	((reg) = (((reg) & ~REG_VSWZ_MASK) | \
		  (((vswz) << REG_VSWZ_SHIFT) & REG_VSWZ_MASK)))
#define REG_SET_SSWZ(reg, sswz) \
	((reg) = (((reg) & ~REG_SSWZ_MASK) | \
		  (((sswz) << REG_SSWZ_SHIFT) & REG_SSWZ_MASK)))
#define REG_SET_NO_USE(reg, nouse) \
	((reg) = (((reg) & ~REG_NO_USE_MASK) | \
		  (((nouse) << REG_NO_USE_SHIFT) & REG_NO_USE_MASK)))
#define REG_SET_VALID(reg, valid) \
	((reg) = (((reg) & ~REG_VALID_MASK) | \
		  (((valid) << REG_VALID_SHIFT) & REG_VALID_MASK)))
#define REG_SET_BUILTIN(reg, builtin) \
	((reg) = (((reg) & ~REG_BUILTIN_MASK) | \
		  (((builtin) << REG_BUILTIN_SHIFT) & REG_BUILTIN_MASK)))

/* Modifier setters: turn the respective flag on in place */
#define REG_ABS(reg) \
	((reg) = ((reg) | REG_ABS_MASK))
#define REG_NEGV(reg) \
	((reg) = ((reg) | REG_NEGV_MASK))
#define REG_NEGS(reg) \
	((reg) = ((reg) | REG_NEGS_MASK))
247
/*
 * The four instruction words that emit_nop() stores for an ALU slot:
 * a MAD with all-zero arguments on both the vector (C) and scalar (A)
 * side, with every source select set to constant 0.
 */
#define NOP_INST0							\
	((R300_ALU_OUTC_MAD) |						\
	 (R300_ALU_ARGC_ZERO << R300_ALU_ARG0C_SHIFT) |			\
	 (R300_ALU_ARGC_ZERO << R300_ALU_ARG1C_SHIFT) |			\
	 (R300_ALU_ARGC_ZERO << R300_ALU_ARG2C_SHIFT))

#define NOP_INST1							\
	(((0 | SRC_CONST) << R300_ALU_SRC0C_SHIFT) |			\
	 ((0 | SRC_CONST) << R300_ALU_SRC1C_SHIFT) |			\
	 ((0 | SRC_CONST) << R300_ALU_SRC2C_SHIFT))

#define NOP_INST2							\
	((R300_ALU_OUTA_MAD) |						\
	 (R300_ALU_ARGA_ZERO << R300_ALU_ARG0A_SHIFT) |			\
	 (R300_ALU_ARGA_ZERO << R300_ALU_ARG1A_SHIFT) |			\
	 (R300_ALU_ARGA_ZERO << R300_ALU_ARG2A_SHIFT))

#define NOP_INST3							\
	(((0 | SRC_CONST) << R300_ALU_SRC0A_SHIFT) |			\
	 ((0 | SRC_CONST) << R300_ALU_SRC1A_SHIFT) |			\
	 ((0 | SRC_CONST) << R300_ALU_SRC2A_SHIFT))
266
267
268 /*
269 * Datas structures for fragment program generation
270 */
271
272 /* description of r300 native hw instructions */
273 static const struct {
274 const char *name;
275 int argc;
276 int v_op;
277 int s_op;
278 } r300_fpop[] = {
279 /* *INDENT-OFF* */
280 {"MAD", 3, R300_ALU_OUTC_MAD, R300_ALU_OUTA_MAD},
281 {"DP3", 2, R300_ALU_OUTC_DP3, R300_ALU_OUTA_DP4},
282 {"DP4", 2, R300_ALU_OUTC_DP4, R300_ALU_OUTA_DP4},
283 {"MIN", 2, R300_ALU_OUTC_MIN, R300_ALU_OUTA_MIN},
284 {"MAX", 2, R300_ALU_OUTC_MAX, R300_ALU_OUTA_MAX},
285 {"CMP", 3, R300_ALU_OUTC_CMP, R300_ALU_OUTA_CMP},
286 {"FRC", 1, R300_ALU_OUTC_FRC, R300_ALU_OUTA_FRC},
287 {"EX2", 1, R300_ALU_OUTC_REPL_ALPHA, R300_ALU_OUTA_EX2},
288 {"LG2", 1, R300_ALU_OUTC_REPL_ALPHA, R300_ALU_OUTA_LG2},
289 {"RCP", 1, R300_ALU_OUTC_REPL_ALPHA, R300_ALU_OUTA_RCP},
290 {"RSQ", 1, R300_ALU_OUTC_REPL_ALPHA, R300_ALU_OUTA_RSQ},
291 {"REPL_ALPHA", 1, R300_ALU_OUTC_REPL_ALPHA, PFS_INVAL},
292 {"CMPH", 3, R300_ALU_OUTC_CMPH, PFS_INVAL},
293 /* *INDENT-ON* */
294 };
295
296 /* vector swizzles r300 can support natively, with a couple of
297 * cases we handle specially
298 *
299 * REG_VSWZ/REG_SSWZ is an index into this table
300 */
301
302 /* mapping from SWIZZLE_* to r300 native values for scalar insns */
303 #define SWIZZLE_HALF 6
304
305 #define MAKE_SWZ3(x, y, z) (MAKE_SWIZZLE4(SWIZZLE_##x, \
306 SWIZZLE_##y, \
307 SWIZZLE_##z, \
308 SWIZZLE_ZERO))
309 /* native swizzles */
310 static const struct r300_pfs_swizzle {
311 GLuint hash; /* swizzle value this matches */
312 GLuint base; /* base value for hw swizzle */
313 GLuint stride; /* difference in base between arg0/1/2 */
314 GLuint flags;
315 } v_swiz[] = {
316 /* *INDENT-OFF* */
317 {MAKE_SWZ3(X, Y, Z), R300_ALU_ARGC_SRC0C_XYZ, 4, SLOT_SRC_VECTOR},
318 {MAKE_SWZ3(X, X, X), R300_ALU_ARGC_SRC0C_XXX, 4, SLOT_SRC_VECTOR},
319 {MAKE_SWZ3(Y, Y, Y), R300_ALU_ARGC_SRC0C_YYY, 4, SLOT_SRC_VECTOR},
320 {MAKE_SWZ3(Z, Z, Z), R300_ALU_ARGC_SRC0C_ZZZ, 4, SLOT_SRC_VECTOR},
321 {MAKE_SWZ3(W, W, W), R300_ALU_ARGC_SRC0A, 1, SLOT_SRC_SCALAR},
322 {MAKE_SWZ3(Y, Z, X), R300_ALU_ARGC_SRC0C_YZX, 1, SLOT_SRC_VECTOR},
323 {MAKE_SWZ3(Z, X, Y), R300_ALU_ARGC_SRC0C_ZXY, 1, SLOT_SRC_VECTOR},
324 {MAKE_SWZ3(W, Z, Y), R300_ALU_ARGC_SRC0CA_WZY, 1, SLOT_SRC_BOTH},
325 {MAKE_SWZ3(ONE, ONE, ONE), R300_ALU_ARGC_ONE, 0, 0},
326 {MAKE_SWZ3(ZERO, ZERO, ZERO), R300_ALU_ARGC_ZERO, 0, 0},
327 {MAKE_SWZ3(HALF, HALF, HALF), R300_ALU_ARGC_HALF, 0, 0},
328 {PFS_INVAL, 0, 0, 0},
329 /* *INDENT-ON* */
330 };
331
332 /* used during matching of non-native swizzles */
333 #define SWZ_X_MASK (7 << 0)
334 #define SWZ_Y_MASK (7 << 3)
335 #define SWZ_Z_MASK (7 << 6)
336 #define SWZ_W_MASK (7 << 9)
337 static const struct {
338 GLuint hash; /* used to mask matching swizzle components */
339 int mask; /* actual outmask */
340 int count; /* count of components matched */
341 } s_mask[] = {
342 /* *INDENT-OFF* */
343 {SWZ_X_MASK | SWZ_Y_MASK | SWZ_Z_MASK, 1 | 2 | 4, 3},
344 {SWZ_X_MASK | SWZ_Y_MASK, 1 | 2, 2},
345 {SWZ_X_MASK | SWZ_Z_MASK, 1 | 4, 2},
346 {SWZ_Y_MASK | SWZ_Z_MASK, 2 | 4, 2},
347 {SWZ_X_MASK, 1, 1},
348 {SWZ_Y_MASK, 2, 1},
349 {SWZ_Z_MASK, 4, 1},
350 {PFS_INVAL, PFS_INVAL, PFS_INVAL}
351 /* *INDENT-ON* */
352 };
353
354 static const struct {
355 int base; /* hw value of swizzle */
356 int stride; /* difference between SRC0/1/2 */
357 GLuint flags;
358 } s_swiz[] = {
359 /* *INDENT-OFF* */
360 {R300_ALU_ARGA_SRC0C_X, 3, SLOT_SRC_VECTOR},
361 {R300_ALU_ARGA_SRC0C_Y, 3, SLOT_SRC_VECTOR},
362 {R300_ALU_ARGA_SRC0C_Z, 3, SLOT_SRC_VECTOR},
363 {R300_ALU_ARGA_SRC0A, 1, SLOT_SRC_SCALAR},
364 {R300_ALU_ARGA_ZERO, 0, 0},
365 {R300_ALU_ARGA_ONE, 0, 0},
366 {R300_ALU_ARGA_HALF, 0, 0}
367 /* *INDENT-ON* */
368 };
369
370 /* boiler-plate reg, for convenience */
371 static const GLuint undef = REG(REG_TYPE_TEMP,
372 0,
373 SWIZZLE_XYZ,
374 SWIZZLE_W,
375 GL_FALSE,
376 GL_FALSE,
377 GL_FALSE);
378
379 /* constant one source */
380 static const GLuint pfs_one = REG(REG_TYPE_CONST,
381 0,
382 SWIZZLE_111,
383 SWIZZLE_ONE,
384 GL_FALSE,
385 GL_TRUE,
386 GL_TRUE);
387
388 /* constant half source */
389 static const GLuint pfs_half = REG(REG_TYPE_CONST,
390 0,
391 SWIZZLE_HHH,
392 SWIZZLE_HALF,
393 GL_FALSE,
394 GL_TRUE,
395 GL_TRUE);
396
397 /* constant zero source */
398 static const GLuint pfs_zero = REG(REG_TYPE_CONST,
399 0,
400 SWIZZLE_000,
401 SWIZZLE_ZERO,
402 GL_FALSE,
403 GL_TRUE,
404 GL_TRUE);
405
406 /*
407 * Common functions prototypes
408 */
409 static void emit_arith(struct r300_pfs_compile_state *cs, int op,
410 GLuint dest, int mask,
411 GLuint src0, GLuint src1, GLuint src2, int flags);
412
413 /**
414 * Get an R300 temporary that can be written to in the given slot.
415 */
416 static int get_hw_temp(struct r300_pfs_compile_state *cs, int slot)
417 {
418 COMPILE_STATE;
419 int r;
420
421 for (r = 0; r < PFS_NUM_TEMP_REGS; ++r) {
422 if (cs->hwtemps[r].free >= 0 && cs->hwtemps[r].free <= slot)
423 break;
424 }
425
426 if (r >= PFS_NUM_TEMP_REGS) {
427 ERROR("Out of hardware temps\n");
428 return 0;
429 }
430 // Reserved is used to avoid the following scenario:
431 // R300 temporary X is first assigned to Mesa temporary Y during vector ops
432 // R300 temporary X is then assigned to Mesa temporary Z for further vector ops
433 // Then scalar ops on Mesa temporary Z are emitted and move back in time
434 // to overwrite the value of temporary Y.
435 // End scenario.
436 cs->hwtemps[r].reserved = cs->hwtemps[r].free;
437 cs->hwtemps[r].free = -1;
438
439 // Reset to some value that won't mess things up when the user
440 // tries to read from a temporary that hasn't been assigned a value yet.
441 // In the normal case, vector_valid and scalar_valid should be set to
442 // a sane value by the first emit that writes to this temporary.
443 cs->hwtemps[r].vector_valid = 0;
444 cs->hwtemps[r].scalar_valid = 0;
445
446 if (r > code->max_temp_idx)
447 code->max_temp_idx = r;
448
449 return r;
450 }
451
452 /**
453 * Get an R300 temporary that will act as a TEX destination register.
454 */
455 static int get_hw_temp_tex(struct r300_pfs_compile_state *cs)
456 {
457 COMPILE_STATE;
458 int r;
459
460 for (r = 0; r < PFS_NUM_TEMP_REGS; ++r) {
461 if (cs->used_in_node & (1 << r))
462 continue;
463
464 // Note: Be very careful here
465 if (cs->hwtemps[r].free >= 0 && cs->hwtemps[r].free <= 0)
466 break;
467 }
468
469 if (r >= PFS_NUM_TEMP_REGS)
470 return get_hw_temp(cs, 0); /* Will cause an indirection */
471
472 cs->hwtemps[r].reserved = cs->hwtemps[r].free;
473 cs->hwtemps[r].free = -1;
474
475 // Reset to some value that won't mess things up when the user
476 // tries to read from a temporary that hasn't been assigned a value yet.
477 // In the normal case, vector_valid and scalar_valid should be set to
478 // a sane value by the first emit that writes to this temporary.
479 cs->hwtemps[r].vector_valid = cs->nrslots;
480 cs->hwtemps[r].scalar_valid = cs->nrslots;
481
482 if (r > code->max_temp_idx)
483 code->max_temp_idx = r;
484
485 return r;
486 }
487
488 /**
489 * Mark the given hardware register as free.
490 */
491 static void free_hw_temp(struct r300_pfs_compile_state *cs, int idx)
492 {
493 // Be very careful here. Consider sequences like
494 // MAD r0, r1,r2,r3
495 // TEX r4, ...
496 // The TEX instruction may be moved in front of the MAD instruction
497 // due to the way nodes work. We don't want to alias r1 and r4 in
498 // this case.
499 // I'm certain the register allocation could be further sanitized,
500 // but it's tricky because of stuff that can happen inside emit_tex
501 // and emit_arith.
502 cs->hwtemps[idx].free = cs->nrslots + 1;
503 }
504
505 /**
506 * Create a new Mesa temporary register.
507 */
508 static GLuint get_temp_reg(struct r300_pfs_compile_state *cs)
509 {
510 COMPILE_STATE;
511 GLuint r = undef;
512 GLuint index;
513
514 index = ffs(~cs->temp_in_use);
515 if (!index) {
516 ERROR("Out of program temps\n");
517 return r;
518 }
519
520 cs->temp_in_use |= (1 << --index);
521 cs->temps[index].refcount = 0xFFFFFFFF;
522 cs->temps[index].reg = -1;
523
524 REG_SET_TYPE(r, REG_TYPE_TEMP);
525 REG_SET_INDEX(r, index);
526 REG_SET_VALID(r, GL_TRUE);
527 return r;
528 }
529
530 /**
531 * Free a Mesa temporary and the associated R300 temporary.
532 */
533 static void free_temp(struct r300_pfs_compile_state *cs, GLuint r)
534 {
535 GLuint index = REG_GET_INDEX(r);
536
537 if (!(cs->temp_in_use & (1 << index)))
538 return;
539
540 if (REG_GET_TYPE(r) == REG_TYPE_TEMP) {
541 free_hw_temp(cs, cs->temps[index].reg);
542 cs->temps[index].reg = -1;
543 cs->temp_in_use &= ~(1 << index);
544 } else if (REG_GET_TYPE(r) == REG_TYPE_INPUT) {
545 free_hw_temp(cs, cs->inputs[index].reg);
546 cs->inputs[index].reg = -1;
547 }
548 }
549
550 /**
551 * Emit a hardware constant/parameter.
552 *
553 * \p cp Stable pointer to an array of 4 floats.
554 * The pointer must be stable in the sense that it remains to be valid
555 * and hold the contents of the constant/parameter throughout the lifetime
556 * of the fragment program (actually, up until the next time the fragment
557 * program is translated).
558 */
559 static GLuint emit_const4fv(struct r300_pfs_compile_state *cs,
560 const GLfloat * cp)
561 {
562 COMPILE_STATE;
563 GLuint reg = undef;
564 int index;
565
566 for (index = 0; index < code->const_nr; ++index) {
567 if (code->constant[index] == cp)
568 break;
569 }
570
571 if (index >= code->const_nr) {
572 if (index >= PFS_NUM_CONST_REGS) {
573 ERROR("Out of hw constants!\n");
574 return reg;
575 }
576
577 code->const_nr++;
578 code->constant[index] = cp;
579 }
580
581 REG_SET_TYPE(reg, REG_TYPE_CONST);
582 REG_SET_INDEX(reg, index);
583 REG_SET_VALID(reg, GL_TRUE);
584 return reg;
585 }
586
587 static inline GLuint negate(GLuint r)
588 {
589 REG_NEGS(r);
590 REG_NEGV(r);
591 return r;
592 }
593
594 /* Hack, to prevent clobbering sources used multiple times when
595 * emulating non-native instructions
596 */
597 static inline GLuint keep(GLuint r)
598 {
599 REG_SET_NO_USE(r, GL_TRUE);
600 return r;
601 }
602
603 static inline GLuint absolute(GLuint r)
604 {
605 REG_ABS(r);
606 return r;
607 }
608
609 static int swz_native(struct r300_pfs_compile_state *cs,
610 GLuint src, GLuint * r, GLuint arbneg)
611 {
612 COMPILE_STATE;
613
614 /* Native swizzle, handle negation */
615 src = (src & ~REG_NEGS_MASK) | (((arbneg >> 3) & 1) << REG_NEGS_SHIFT);
616
617 if ((arbneg & 0x7) == 0x0) {
618 src = src & ~REG_NEGV_MASK;
619 *r = src;
620 } else if ((arbneg & 0x7) == 0x7) {
621 src |= REG_NEGV_MASK;
622 *r = src;
623 } else {
624 if (!REG_GET_VALID(*r))
625 *r = get_temp_reg(cs);
626 src |= REG_NEGV_MASK;
627 emit_arith(cs,
628 PFS_OP_MAD,
629 *r, arbneg & 0x7, keep(src), pfs_one, pfs_zero, 0);
630 src = src & ~REG_NEGV_MASK;
631 emit_arith(cs,
632 PFS_OP_MAD,
633 *r,
634 (arbneg ^ 0x7) | WRITEMASK_W,
635 src, pfs_one, pfs_zero, 0);
636 }
637
638 return 3;
639 }
640
641 static int swz_emit_partial(struct r300_pfs_compile_state *cs,
642 GLuint src,
643 GLuint * r, int mask, int mc, GLuint arbneg)
644 {
645 COMPILE_STATE;
646 GLuint tmp;
647 GLuint wmask = 0;
648
649 if (!REG_GET_VALID(*r))
650 *r = get_temp_reg(cs);
651
652 /* A partial match, VSWZ/mask define what parts of the
653 * desired swizzle we match
654 */
655 if (mc + s_mask[mask].count == 3) {
656 wmask = WRITEMASK_W;
657 src |= ((arbneg >> 3) & 1) << REG_NEGS_SHIFT;
658 }
659
660 tmp = arbneg & s_mask[mask].mask;
661 if (tmp) {
662 tmp = tmp ^ s_mask[mask].mask;
663 if (tmp) {
664 emit_arith(cs,
665 PFS_OP_MAD,
666 *r,
667 arbneg & s_mask[mask].mask,
668 keep(src) | REG_NEGV_MASK,
669 pfs_one, pfs_zero, 0);
670 if (!wmask) {
671 REG_SET_NO_USE(src, GL_TRUE);
672 } else {
673 REG_SET_NO_USE(src, GL_FALSE);
674 }
675 emit_arith(cs,
676 PFS_OP_MAD,
677 *r, tmp | wmask, src, pfs_one, pfs_zero, 0);
678 } else {
679 if (!wmask) {
680 REG_SET_NO_USE(src, GL_TRUE);
681 } else {
682 REG_SET_NO_USE(src, GL_FALSE);
683 }
684 emit_arith(cs,
685 PFS_OP_MAD,
686 *r,
687 (arbneg & s_mask[mask].mask) | wmask,
688 src | REG_NEGV_MASK, pfs_one, pfs_zero, 0);
689 }
690 } else {
691 if (!wmask) {
692 REG_SET_NO_USE(src, GL_TRUE);
693 } else {
694 REG_SET_NO_USE(src, GL_FALSE);
695 }
696 emit_arith(cs, PFS_OP_MAD,
697 *r,
698 s_mask[mask].mask | wmask,
699 src, pfs_one, pfs_zero, 0);
700 }
701
702 return s_mask[mask].count;
703 }
704
705 static GLuint do_swizzle(struct r300_pfs_compile_state *cs,
706 GLuint src, GLuint arbswz, GLuint arbneg)
707 {
708 COMPILE_STATE;
709 GLuint r = undef;
710 GLuint vswz;
711 int c_mask = 0;
712 int v_match = 0;
713
714 /* If swizzling from something without an XYZW native swizzle,
715 * emit result to a temp, and do new swizzle from the temp.
716 */
717 #if 0
718 if (REG_GET_VSWZ(src) != SWIZZLE_XYZ || REG_GET_SSWZ(src) != SWIZZLE_W) {
719 GLuint temp = get_temp_reg(fp);
720 emit_arith(fp,
721 PFS_OP_MAD,
722 temp, WRITEMASK_XYZW, src, pfs_one, pfs_zero, 0);
723 src = temp;
724 }
725 #endif
726
727 if (REG_GET_VSWZ(src) != SWIZZLE_XYZ || REG_GET_SSWZ(src) != SWIZZLE_W) {
728 GLuint vsrcswz =
729 (v_swiz[REG_GET_VSWZ(src)].
730 hash & (SWZ_X_MASK | SWZ_Y_MASK | SWZ_Z_MASK)) |
731 REG_GET_SSWZ(src) << 9;
732 GLint i;
733
734 GLuint newswz = 0;
735 GLuint offset;
736 for (i = 0; i < 4; ++i) {
737 offset = GET_SWZ(arbswz, i);
738
739 newswz |=
740 (offset <= 3) ? GET_SWZ(vsrcswz,
741 offset) << i *
742 3 : offset << i * 3;
743 }
744
745 arbswz = newswz & (SWZ_X_MASK | SWZ_Y_MASK | SWZ_Z_MASK);
746 REG_SET_SSWZ(src, GET_SWZ(newswz, 3));
747 } else {
748 /* set scalar swizzling */
749 REG_SET_SSWZ(src, GET_SWZ(arbswz, 3));
750
751 }
752 do {
753 vswz = REG_GET_VSWZ(src);
754 do {
755 int chash;
756
757 REG_SET_VSWZ(src, vswz);
758 chash = v_swiz[REG_GET_VSWZ(src)].hash &
759 s_mask[c_mask].hash;
760
761 if (chash == (arbswz & s_mask[c_mask].hash)) {
762 if (s_mask[c_mask].count == 3) {
763 v_match += swz_native(cs,
764 src, &r, arbneg);
765 } else {
766 v_match += swz_emit_partial(cs,
767 src,
768 &r,
769 c_mask,
770 v_match,
771 arbneg);
772 }
773
774 if (v_match == 3)
775 return r;
776
777 /* Fill with something invalid.. all 0's was
778 * wrong before, matched SWIZZLE_X. So all
779 * 1's will be okay for now
780 */
781 arbswz |= (PFS_INVAL & s_mask[c_mask].hash);
782 }
783 } while (v_swiz[++vswz].hash != PFS_INVAL);
784 REG_SET_VSWZ(src, SWIZZLE_XYZ);
785 } while (s_mask[++c_mask].hash != PFS_INVAL);
786
787 ERROR("should NEVER get here\n");
788 return r;
789 }
790
791 static GLuint t_src(struct r300_pfs_compile_state *cs,
792 struct prog_src_register fpsrc)
793 {
794 COMPILE_STATE;
795 GLuint r = undef;
796
797 switch (fpsrc.File) {
798 case PROGRAM_TEMPORARY:
799 REG_SET_INDEX(r, fpsrc.Index);
800 REG_SET_VALID(r, GL_TRUE);
801 REG_SET_TYPE(r, REG_TYPE_TEMP);
802 break;
803 case PROGRAM_INPUT:
804 REG_SET_INDEX(r, fpsrc.Index);
805 REG_SET_VALID(r, GL_TRUE);
806 REG_SET_TYPE(r, REG_TYPE_INPUT);
807 break;
808 case PROGRAM_LOCAL_PARAM:
809 r = emit_const4fv(cs,
810 fp->mesa_program.Base.LocalParams[fpsrc.
811 Index]);
812 break;
813 case PROGRAM_ENV_PARAM:
814 r = emit_const4fv(cs,
815 cs->compiler->r300->radeon.glCtx->FragmentProgram.Parameters[fpsrc.Index]);
816 break;
817 case PROGRAM_STATE_VAR:
818 case PROGRAM_NAMED_PARAM:
819 case PROGRAM_CONSTANT:
820 r = emit_const4fv(cs,
821 fp->mesa_program.Base.Parameters->
822 ParameterValues[fpsrc.Index]);
823 break;
824 case PROGRAM_BUILTIN:
825 switch(fpsrc.Swizzle) {
826 case SWIZZLE_1111: r = pfs_one; break;
827 case SWIZZLE_0000: r = pfs_zero; break;
828 default:
829 ERROR("bad PROGRAM_BUILTIN swizzle %u\n", fpsrc.Swizzle);
830 break;
831 }
832 break;
833 default:
834 ERROR("unknown SrcReg->File %x\n", fpsrc.File);
835 return r;
836 }
837
838 /* no point swizzling ONE/ZERO/HALF constants... */
839 if (REG_GET_VSWZ(r) < SWIZZLE_111 || REG_GET_SSWZ(r) < SWIZZLE_ZERO)
840 r = do_swizzle(cs, r, fpsrc.Swizzle, fpsrc.NegateBase);
841 return r;
842 }
843
844 static GLuint t_scalar_src(struct r300_pfs_compile_state *cs,
845 struct prog_src_register fpsrc)
846 {
847 struct prog_src_register src = fpsrc;
848 int sc = GET_SWZ(fpsrc.Swizzle, 0); /* X */
849
850 src.Swizzle = ((sc << 0) | (sc << 3) | (sc << 6) | (sc << 9));
851
852 return t_src(cs, src);
853 }
854
855 static GLuint t_dst(struct r300_pfs_compile_state *cs,
856 struct prog_dst_register dest)
857 {
858 COMPILE_STATE;
859 GLuint r = undef;
860
861 switch (dest.File) {
862 case PROGRAM_TEMPORARY:
863 REG_SET_INDEX(r, dest.Index);
864 REG_SET_VALID(r, GL_TRUE);
865 REG_SET_TYPE(r, REG_TYPE_TEMP);
866 return r;
867 case PROGRAM_OUTPUT:
868 REG_SET_TYPE(r, REG_TYPE_OUTPUT);
869 switch (dest.Index) {
870 case FRAG_RESULT_COLR:
871 case FRAG_RESULT_DEPR:
872 REG_SET_INDEX(r, dest.Index);
873 REG_SET_VALID(r, GL_TRUE);
874 return r;
875 default:
876 ERROR("Bad DstReg->Index 0x%x\n", dest.Index);
877 return r;
878 }
879 default:
880 ERROR("Bad DstReg->File 0x%x\n", dest.File);
881 return r;
882 }
883 }
884
885 static int t_hw_src(struct r300_pfs_compile_state *cs, GLuint src, GLboolean tex)
886 {
887 COMPILE_STATE;
888 int idx;
889 int index = REG_GET_INDEX(src);
890
891 switch (REG_GET_TYPE(src)) {
892 case REG_TYPE_TEMP:
893 /* NOTE: if reg==-1 here, a source is being read that
894 * hasn't been written to. Undefined results.
895 */
896 if (cs->temps[index].reg == -1)
897 cs->temps[index].reg = get_hw_temp(cs, cs->nrslots);
898
899 idx = cs->temps[index].reg;
900
901 if (!REG_GET_NO_USE(src) && (--cs->temps[index].refcount == 0))
902 free_temp(cs, src);
903 break;
904 case REG_TYPE_INPUT:
905 idx = cs->inputs[index].reg;
906
907 if (!REG_GET_NO_USE(src) && (--cs->inputs[index].refcount == 0))
908 free_hw_temp(cs, cs->inputs[index].reg);
909 break;
910 case REG_TYPE_CONST:
911 return (index | SRC_CONST);
912 default:
913 ERROR("Invalid type for source reg\n");
914 return (0 | SRC_CONST);
915 }
916
917 if (!tex)
918 cs->used_in_node |= (1 << idx);
919
920 return idx;
921 }
922
923 static int t_hw_dst(struct r300_pfs_compile_state *cs,
924 GLuint dest, GLboolean tex, int slot)
925 {
926 COMPILE_STATE;
927 int idx;
928 GLuint index = REG_GET_INDEX(dest);
929 assert(REG_GET_VALID(dest));
930
931 switch (REG_GET_TYPE(dest)) {
932 case REG_TYPE_TEMP:
933 if (cs->temps[REG_GET_INDEX(dest)].reg == -1) {
934 if (!tex) {
935 cs->temps[index].reg = get_hw_temp(cs, slot);
936 } else {
937 cs->temps[index].reg = get_hw_temp_tex(cs);
938 }
939 }
940 idx = cs->temps[index].reg;
941
942 if (!REG_GET_NO_USE(dest) && (--cs->temps[index].refcount == 0))
943 free_temp(cs, dest);
944
945 cs->dest_in_node |= (1 << idx);
946 cs->used_in_node |= (1 << idx);
947 break;
948 case REG_TYPE_OUTPUT:
949 switch (index) {
950 case FRAG_RESULT_COLR:
951 code->node[code->cur_node].flags |= R300_RGBA_OUT;
952 break;
953 case FRAG_RESULT_DEPR:
954 fp->WritesDepth = GL_TRUE;
955 code->node[code->cur_node].flags |= R300_W_OUT;
956 break;
957 }
958 return index;
959 break;
960 default:
961 ERROR("invalid dest reg type %d\n", REG_GET_TYPE(dest));
962 return 0;
963 }
964
965 return idx;
966 }
967
968 static void emit_nop(struct r300_pfs_compile_state *cs)
969 {
970 COMPILE_STATE;
971
972 if (cs->nrslots >= PFS_MAX_ALU_INST) {
973 ERROR("Out of ALU instruction slots\n");
974 return;
975 }
976
977 code->alu.inst[cs->nrslots].inst0 = NOP_INST0;
978 code->alu.inst[cs->nrslots].inst1 = NOP_INST1;
979 code->alu.inst[cs->nrslots].inst2 = NOP_INST2;
980 code->alu.inst[cs->nrslots].inst3 = NOP_INST3;
981 cs->nrslots++;
982 }
983
984 static void emit_tex(struct r300_pfs_compile_state *cs,
985 struct prog_instruction *fpi, int opcode)
986 {
987 COMPILE_STATE;
988 GLuint coord = t_src(cs, fpi->SrcReg[0]);
989 GLuint dest = undef;
990 GLuint din, uin;
991 int unit = fpi->TexSrcUnit;
992 int hwsrc, hwdest;
993
994 /* Ensure correct node indirection */
995 uin = cs->used_in_node;
996 din = cs->dest_in_node;
997
998 /* Resolve source/dest to hardware registers */
999 hwsrc = t_hw_src(cs, coord, GL_TRUE);
1000
1001 if (opcode != R300_TEX_OP_KIL) {
1002 dest = t_dst(cs, fpi->DstReg);
1003
1004 hwdest =
1005 t_hw_dst(cs, dest, GL_TRUE,
1006 code->node[code->cur_node].alu_offset);
1007
1008 /* Use a temp that hasn't been used in this node, rather
1009 * than causing an indirection
1010 */
1011 if (uin & (1 << hwdest)) {
1012 free_hw_temp(cs, hwdest);
1013 hwdest = get_hw_temp_tex(cs);
1014 cs->temps[REG_GET_INDEX(dest)].reg = hwdest;
1015 }
1016 } else {
1017 hwdest = 0;
1018 unit = 0;
1019 }
1020
1021 /* Indirection if source has been written in this node, or if the
1022 * dest has been read/written in this node
1023 */
1024 if ((REG_GET_TYPE(coord) != REG_TYPE_CONST &&
1025 (din & (1 << hwsrc))) || (uin & (1 << hwdest))) {
1026
1027 /* Finish off current node */
1028 if (code->node[code->cur_node].alu_offset == cs->nrslots)
1029 emit_nop(cs);
1030
1031 code->node[code->cur_node].alu_end =
1032 cs->nrslots - code->node[code->cur_node].alu_offset - 1;
1033 assert(code->node[code->cur_node].alu_end >= 0);
1034
1035 if (++code->cur_node >= PFS_MAX_TEX_INDIRECT) {
1036 ERROR("too many levels of texture indirection\n");
1037 return;
1038 }
1039
1040 /* Start new node */
1041 code->node[code->cur_node].tex_offset = code->tex.length;
1042 code->node[code->cur_node].alu_offset = cs->nrslots;
1043 code->node[code->cur_node].tex_end = -1;
1044 code->node[code->cur_node].alu_end = -1;
1045 code->node[code->cur_node].flags = 0;
1046 cs->used_in_node = 0;
1047 cs->dest_in_node = 0;
1048 }
1049
1050 if (code->cur_node == 0)
1051 code->first_node_has_tex = 1;
1052
1053 code->tex.inst[code->tex.length++] = 0 | (hwsrc << R300_SRC_ADDR_SHIFT)
1054 | (hwdest << R300_DST_ADDR_SHIFT)
1055 | (unit << R300_TEX_ID_SHIFT)
1056 | (opcode << R300_TEX_INST_SHIFT);
1057
1058 cs->dest_in_node |= (1 << hwdest);
1059 if (REG_GET_TYPE(coord) != REG_TYPE_CONST)
1060 cs->used_in_node |= (1 << hwsrc);
1061
1062 code->node[code->cur_node].tex_end++;
1063 }
1064
1065 /**
1066 * Returns the first slot where we could possibly allow writing to dest,
1067 * according to register allocation.
1068 */
1069 static int get_earliest_allowed_write(struct r300_pfs_compile_state *cs,
1070 GLuint dest, int mask)
1071 {
1072 COMPILE_STATE;
1073 int idx;
1074 int pos;
1075 GLuint index = REG_GET_INDEX(dest);
1076 assert(REG_GET_VALID(dest));
1077
1078 switch (REG_GET_TYPE(dest)) {
1079 case REG_TYPE_TEMP:
1080 if (cs->temps[index].reg == -1)
1081 return 0;
1082
1083 idx = cs->temps[index].reg;
1084 break;
1085 case REG_TYPE_OUTPUT:
1086 return 0;
1087 default:
1088 ERROR("invalid dest reg type %d\n", REG_GET_TYPE(dest));
1089 return 0;
1090 }
1091
1092 pos = cs->hwtemps[idx].reserved;
1093 if (mask & WRITEMASK_XYZ) {
1094 if (pos < cs->hwtemps[idx].vector_lastread)
1095 pos = cs->hwtemps[idx].vector_lastread;
1096 }
1097 if (mask & WRITEMASK_W) {
1098 if (pos < cs->hwtemps[idx].scalar_lastread)
1099 pos = cs->hwtemps[idx].scalar_lastread;
1100 }
1101
1102 return pos;
1103 }
1104
1105 /**
1106 * Allocates a slot for an ALU instruction that can consist of
1107 * a vertex part or a scalar part or both.
1108 *
1109 * Sources from src (src[0] to src[argc-1]) are added to the slot in the
1110 * appropriate position (vector and/or scalar), and their positions are
1111 * recorded in the srcpos array.
1112 *
1113 * This function emits instruction code for the source fetch and the
1114 * argument selection. It does not emit instruction code for the
1115 * opcode or the destination selection.
1116 *
1117 * @return the index of the slot
1118 */
1119 static int find_and_prepare_slot(struct r300_pfs_compile_state *cs,
1120 GLboolean emit_vop,
1121 GLboolean emit_sop,
1122 int argc, GLuint * src, GLuint dest, int mask)
1123 {
1124 COMPILE_STATE;
1125 int hwsrc[3];
1126 int srcpos[3];
1127 unsigned int used;
1128 int tempused;
1129 int tempvsrc[3];
1130 int tempssrc[3];
1131 int pos;
1132 int regnr;
1133 int i, j;
1134
1135 // Determine instruction slots, whether sources are required on
1136 // vector or scalar side, and the smallest slot number where
1137 // all source registers are available
1138 used = 0;
1139 if (emit_vop)
1140 used |= SLOT_OP_VECTOR;
1141 if (emit_sop)
1142 used |= SLOT_OP_SCALAR;
1143
1144 pos = get_earliest_allowed_write(cs, dest, mask);
1145
1146 if (code->node[code->cur_node].alu_offset > pos)
1147 pos = code->node[code->cur_node].alu_offset;
1148 for (i = 0; i < argc; ++i) {
1149 if (!REG_GET_BUILTIN(src[i])) {
1150 if (emit_vop)
1151 used |= v_swiz[REG_GET_VSWZ(src[i])].flags << i;
1152 if (emit_sop)
1153 used |= s_swiz[REG_GET_SSWZ(src[i])].flags << i;
1154 }
1155
1156 hwsrc[i] = t_hw_src(cs, src[i], GL_FALSE); /* Note: sideeffects wrt refcounting! */
1157 regnr = hwsrc[i] & 31;
1158
1159 if (REG_GET_TYPE(src[i]) == REG_TYPE_TEMP) {
1160 if (used & (SLOT_SRC_VECTOR << i)) {
1161 if (cs->hwtemps[regnr].vector_valid > pos)
1162 pos = cs->hwtemps[regnr].vector_valid;
1163 }
1164 if (used & (SLOT_SRC_SCALAR << i)) {
1165 if (cs->hwtemps[regnr].scalar_valid > pos)
1166 pos = cs->hwtemps[regnr].scalar_valid;
1167 }
1168 }
1169 }
1170
1171 // Find a slot that fits
1172 for (;; ++pos) {
1173 if (cs->slot[pos].used & used & SLOT_OP_BOTH)
1174 continue;
1175
1176 if (pos >= cs->nrslots) {
1177 if (cs->nrslots >= PFS_MAX_ALU_INST) {
1178 ERROR("Out of ALU instruction slots\n");
1179 return -1;
1180 }
1181
1182 code->alu.inst[pos].inst0 = NOP_INST0;
1183 code->alu.inst[pos].inst1 = NOP_INST1;
1184 code->alu.inst[pos].inst2 = NOP_INST2;
1185 code->alu.inst[pos].inst3 = NOP_INST3;
1186
1187 cs->nrslots++;
1188 }
1189 // Note: When we need both parts (vector and scalar) of a source,
1190 // we always try to put them into the same position. This makes the
1191 // code easier to read, and it is optimal (i.e. one doesn't gain
1192 // anything by splitting the parts).
1193 // It also avoids headaches with swizzles that access both parts (i.e WXY)
1194 tempused = cs->slot[pos].used;
1195 for (i = 0; i < 3; ++i) {
1196 tempvsrc[i] = cs->slot[pos].vsrc[i];
1197 tempssrc[i] = cs->slot[pos].ssrc[i];
1198 }
1199
1200 for (i = 0; i < argc; ++i) {
1201 int flags = (used >> i) & SLOT_SRC_BOTH;
1202
1203 if (!flags) {
1204 srcpos[i] = 0;
1205 continue;
1206 }
1207
1208 for (j = 0; j < 3; ++j) {
1209 if ((tempused >> j) & flags & SLOT_SRC_VECTOR) {
1210 if (tempvsrc[j] != hwsrc[i])
1211 continue;
1212 }
1213
1214 if ((tempused >> j) & flags & SLOT_SRC_SCALAR) {
1215 if (tempssrc[j] != hwsrc[i])
1216 continue;
1217 }
1218
1219 break;
1220 }
1221
1222 if (j == 3)
1223 break;
1224
1225 srcpos[i] = j;
1226 tempused |= flags << j;
1227 if (flags & SLOT_SRC_VECTOR)
1228 tempvsrc[j] = hwsrc[i];
1229 if (flags & SLOT_SRC_SCALAR)
1230 tempssrc[j] = hwsrc[i];
1231 }
1232
1233 if (i == argc)
1234 break;
1235 }
1236
1237 // Found a slot, reserve it
1238 cs->slot[pos].used = tempused | (used & SLOT_OP_BOTH);
1239 for (i = 0; i < 3; ++i) {
1240 cs->slot[pos].vsrc[i] = tempvsrc[i];
1241 cs->slot[pos].ssrc[i] = tempssrc[i];
1242 }
1243
1244 for (i = 0; i < argc; ++i) {
1245 if (REG_GET_TYPE(src[i]) == REG_TYPE_TEMP) {
1246 int regnr = hwsrc[i] & 31;
1247
1248 if (used & (SLOT_SRC_VECTOR << i)) {
1249 if (cs->hwtemps[regnr].vector_lastread < pos)
1250 cs->hwtemps[regnr].vector_lastread =
1251 pos;
1252 }
1253 if (used & (SLOT_SRC_SCALAR << i)) {
1254 if (cs->hwtemps[regnr].scalar_lastread < pos)
1255 cs->hwtemps[regnr].scalar_lastread =
1256 pos;
1257 }
1258 }
1259 }
1260
1261 // Emit the source fetch code
1262 code->alu.inst[pos].inst1 &= ~R300_ALU_SRC_MASK;
1263 code->alu.inst[pos].inst1 |=
1264 ((cs->slot[pos].vsrc[0] << R300_ALU_SRC0C_SHIFT) |
1265 (cs->slot[pos].vsrc[1] << R300_ALU_SRC1C_SHIFT) |
1266 (cs->slot[pos].vsrc[2] << R300_ALU_SRC2C_SHIFT));
1267
1268 code->alu.inst[pos].inst3 &= ~R300_ALU_SRC_MASK;
1269 code->alu.inst[pos].inst3 |=
1270 ((cs->slot[pos].ssrc[0] << R300_ALU_SRC0A_SHIFT) |
1271 (cs->slot[pos].ssrc[1] << R300_ALU_SRC1A_SHIFT) |
1272 (cs->slot[pos].ssrc[2] << R300_ALU_SRC2A_SHIFT));
1273
1274 // Emit the argument selection code
1275 if (emit_vop) {
1276 int swz[3];
1277
1278 for (i = 0; i < 3; ++i) {
1279 if (i < argc) {
1280 swz[i] = (v_swiz[REG_GET_VSWZ(src[i])].base +
1281 (srcpos[i] *
1282 v_swiz[REG_GET_VSWZ(src[i])].
1283 stride)) | ((src[i] & REG_NEGV_MASK)
1284 ? ARG_NEG : 0) | ((src[i]
1285 &
1286 REG_ABS_MASK)
1287 ?
1288 ARG_ABS
1289 : 0);
1290 } else {
1291 swz[i] = R300_ALU_ARGC_ZERO;
1292 }
1293 }
1294
1295 code->alu.inst[pos].inst0 &=
1296 ~(R300_ALU_ARG0C_MASK | R300_ALU_ARG1C_MASK |
1297 R300_ALU_ARG2C_MASK);
1298 code->alu.inst[pos].inst0 |=
1299 (swz[0] << R300_ALU_ARG0C_SHIFT) | (swz[1] <<
1300 R300_ALU_ARG1C_SHIFT)
1301 | (swz[2] << R300_ALU_ARG2C_SHIFT);
1302 }
1303
1304 if (emit_sop) {
1305 int swz[3];
1306
1307 for (i = 0; i < 3; ++i) {
1308 if (i < argc) {
1309 swz[i] = (s_swiz[REG_GET_SSWZ(src[i])].base +
1310 (srcpos[i] *
1311 s_swiz[REG_GET_SSWZ(src[i])].
1312 stride)) | ((src[i] & REG_NEGV_MASK)
1313 ? ARG_NEG : 0) | ((src[i]
1314 &
1315 REG_ABS_MASK)
1316 ?
1317 ARG_ABS
1318 : 0);
1319 } else {
1320 swz[i] = R300_ALU_ARGA_ZERO;
1321 }
1322 }
1323
1324 code->alu.inst[pos].inst2 &=
1325 ~(R300_ALU_ARG0A_MASK | R300_ALU_ARG1A_MASK |
1326 R300_ALU_ARG2A_MASK);
1327 code->alu.inst[pos].inst2 |=
1328 (swz[0] << R300_ALU_ARG0A_SHIFT) | (swz[1] <<
1329 R300_ALU_ARG1A_SHIFT)
1330 | (swz[2] << R300_ALU_ARG2A_SHIFT);
1331 }
1332
1333 return pos;
1334 }
1335
1336 /**
1337 * Append an ALU instruction to the instruction list.
1338 */
1339 static void emit_arith(struct r300_pfs_compile_state *cs,
1340 int op,
1341 GLuint dest,
1342 int mask,
1343 GLuint src0, GLuint src1, GLuint src2, int flags)
1344 {
1345 COMPILE_STATE;
1346 GLuint src[3] = { src0, src1, src2 };
1347 int hwdest;
1348 GLboolean emit_vop, emit_sop;
1349 int vop, sop, argc;
1350 int pos;
1351
1352 vop = r300_fpop[op].v_op;
1353 sop = r300_fpop[op].s_op;
1354 argc = r300_fpop[op].argc;
1355
1356 if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT &&
1357 REG_GET_INDEX(dest) == FRAG_RESULT_DEPR) {
1358 if (mask & WRITEMASK_Z) {
1359 mask = WRITEMASK_W;
1360 } else {
1361 return;
1362 }
1363 }
1364
1365 emit_vop = GL_FALSE;
1366 emit_sop = GL_FALSE;
1367 if ((mask & WRITEMASK_XYZ) || vop == R300_ALU_OUTC_DP3)
1368 emit_vop = GL_TRUE;
1369 if ((mask & WRITEMASK_W) || vop == R300_ALU_OUTC_REPL_ALPHA)
1370 emit_sop = GL_TRUE;
1371
1372 pos =
1373 find_and_prepare_slot(cs, emit_vop, emit_sop, argc, src, dest,
1374 mask);
1375 if (pos < 0)
1376 return;
1377
1378 hwdest = t_hw_dst(cs, dest, GL_FALSE, pos); /* Note: Side effects wrt register allocation */
1379
1380 if (flags & PFS_FLAG_SAT) {
1381 vop |= R300_ALU_OUTC_CLAMP;
1382 sop |= R300_ALU_OUTA_CLAMP;
1383 }
1384
1385 /* Throw the pieces together and get ALU/1 */
1386 if (emit_vop) {
1387 code->alu.inst[pos].inst0 |= vop;
1388
1389 code->alu.inst[pos].inst1 |= hwdest << R300_ALU_DSTC_SHIFT;
1390
1391 if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT) {
1392 if (REG_GET_INDEX(dest) == FRAG_RESULT_COLR) {
1393 code->alu.inst[pos].inst1 |=
1394 (mask & WRITEMASK_XYZ) <<
1395 R300_ALU_DSTC_OUTPUT_MASK_SHIFT;
1396 } else
1397 assert(0);
1398 } else {
1399 code->alu.inst[pos].inst1 |=
1400 (mask & WRITEMASK_XYZ) <<
1401 R300_ALU_DSTC_REG_MASK_SHIFT;
1402
1403 cs->hwtemps[hwdest].vector_valid = pos + 1;
1404 }
1405 }
1406
1407 /* And now ALU/3 */
1408 if (emit_sop) {
1409 code->alu.inst[pos].inst2 |= sop;
1410
1411 if (mask & WRITEMASK_W) {
1412 if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT) {
1413 if (REG_GET_INDEX(dest) == FRAG_RESULT_COLR) {
1414 code->alu.inst[pos].inst3 |=
1415 (hwdest << R300_ALU_DSTA_SHIFT) |
1416 R300_ALU_DSTA_OUTPUT;
1417 } else if (REG_GET_INDEX(dest) ==
1418 FRAG_RESULT_DEPR) {
1419 code->alu.inst[pos].inst3 |=
1420 R300_ALU_DSTA_DEPTH;
1421 } else
1422 assert(0);
1423 } else {
1424 code->alu.inst[pos].inst3 |=
1425 (hwdest << R300_ALU_DSTA_SHIFT) |
1426 R300_ALU_DSTA_REG;
1427
1428 cs->hwtemps[hwdest].scalar_valid = pos + 1;
1429 }
1430 }
1431 }
1432
1433 return;
1434 }
1435
1436 static GLfloat SinCosConsts[2][4] = {
1437 {
1438 1.273239545, // 4/PI
1439 -0.405284735, // -4/(PI*PI)
1440 3.141592654, // PI
1441 0.2225 // weight
1442 },
1443 {
1444 0.75,
1445 0.0,
1446 0.159154943, // 1/(2*PI)
1447 6.283185307 // 2*PI
1448 }
1449 };
1450
1451 /**
1452 * Emit a LIT instruction.
1453 * \p flags may be PFS_FLAG_SAT
1454 *
1455 * Definition of LIT (from ARB_fragment_program):
1456 * tmp = VectorLoad(op0);
1457 * if (tmp.x < 0) tmp.x = 0;
1458 * if (tmp.y < 0) tmp.y = 0;
1459 * if (tmp.w < -(128.0-epsilon)) tmp.w = -(128.0-epsilon);
1460 * else if (tmp.w > 128-epsilon) tmp.w = 128-epsilon;
1461 * result.x = 1.0;
1462 * result.y = tmp.x;
1463 * result.z = (tmp.x > 0) ? RoughApproxPower(tmp.y, tmp.w) : 0.0;
1464 * result.w = 1.0;
1465 *
1466 * The longest path of computation is the one leading to result.z,
1467 * consisting of 5 operations. This implementation of LIT takes
1468 * 5 slots. So unless there's some special undocumented opcode,
1469 * this implementation is potentially optimal. Unfortunately,
1470 * emit_arith is a bit too conservative because it doesn't understand
1471 * partial writes to the vector component.
1472 */
1473 static const GLfloat LitConst[4] =
1474 { 127.999999, 127.999999, 127.999999, -127.999999 };
1475
1476 static void emit_lit(struct r300_pfs_compile_state *cs,
1477 GLuint dest, int mask, GLuint src, int flags)
1478 {
1479 COMPILE_STATE;
1480 GLuint cnst;
1481 int needTemporary;
1482 GLuint temp;
1483
1484 cnst = emit_const4fv(cs, LitConst);
1485
1486 needTemporary = 0;
1487 if ((mask & WRITEMASK_XYZW) != WRITEMASK_XYZW) {
1488 needTemporary = 1;
1489 } else if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT) {
1490 // LIT is typically followed by DP3/DP4, so there's no point
1491 // in creating special code for this case
1492 needTemporary = 1;
1493 }
1494
1495 if (needTemporary) {
1496 temp = keep(get_temp_reg(cs));
1497 } else {
1498 temp = keep(dest);
1499 }
1500
1501 // Note: The order of emit_arith inside the slots is relevant,
1502 // because emit_arith only looks at scalar vs. vector when resolving
1503 // dependencies, and it does not consider individual vector components,
1504 // so swizzling between the two parts can create fake dependencies.
1505
1506 // First slot
1507 emit_arith(cs, PFS_OP_MAX, temp, WRITEMASK_XY,
1508 keep(src), pfs_zero, undef, 0);
1509 emit_arith(cs, PFS_OP_MAX, temp, WRITEMASK_W, src, cnst, undef, 0);
1510
1511 // Second slot
1512 emit_arith(cs, PFS_OP_MIN, temp, WRITEMASK_Z,
1513 swizzle(temp, W, W, W, W), cnst, undef, 0);
1514 emit_arith(cs, PFS_OP_LG2, temp, WRITEMASK_W,
1515 swizzle(temp, Y, Y, Y, Y), undef, undef, 0);
1516
1517 // Third slot
1518 // If desired, we saturate the y result here.
1519 // This does not affect the use as a condition variable in the CMP later
1520 emit_arith(cs, PFS_OP_MAD, temp, WRITEMASK_W,
1521 temp, swizzle(temp, Z, Z, Z, Z), pfs_zero, 0);
1522 emit_arith(cs, PFS_OP_MAD, temp, WRITEMASK_Y,
1523 swizzle(temp, X, X, X, X), pfs_one, pfs_zero, flags);
1524
1525 // Fourth slot
1526 emit_arith(cs, PFS_OP_MAD, temp, WRITEMASK_X,
1527 pfs_one, pfs_one, pfs_zero, 0);
1528 emit_arith(cs, PFS_OP_EX2, temp, WRITEMASK_W, temp, undef, undef, 0);
1529
1530 // Fifth slot
1531 emit_arith(cs, PFS_OP_CMP, temp, WRITEMASK_Z,
1532 pfs_zero, swizzle(temp, W, W, W, W),
1533 negate(swizzle(temp, Y, Y, Y, Y)), flags);
1534 emit_arith(cs, PFS_OP_MAD, temp, WRITEMASK_W, pfs_one, pfs_one,
1535 pfs_zero, 0);
1536
1537 if (needTemporary) {
1538 emit_arith(cs, PFS_OP_MAD, dest, mask,
1539 temp, pfs_one, pfs_zero, flags);
1540 free_temp(cs, temp);
1541 } else {
1542 // Decrease refcount of the destination
1543 t_hw_dst(cs, dest, GL_FALSE, cs->nrslots);
1544 }
1545 }
1546
1547 static void emit_instruction(struct r300_pfs_compile_state *cs, struct prog_instruction *fpi)
1548 {
1549 COMPILE_STATE;
1550 GLuint src[3], dest, temp[2];
1551 int flags, mask = 0;
1552 int const_sin[2];
1553
1554 if (fpi->SaturateMode == SATURATE_ZERO_ONE)
1555 flags = PFS_FLAG_SAT;
1556 else
1557 flags = 0;
1558
1559 if (fpi->Opcode != OPCODE_KIL) {
1560 dest = t_dst(cs, fpi->DstReg);
1561 mask = fpi->DstReg.WriteMask;
1562 }
1563
1564 switch (fpi->Opcode) {
1565 case OPCODE_ABS:
1566 src[0] = t_src(cs, fpi->SrcReg[0]);
1567 emit_arith(cs, PFS_OP_MAD, dest, mask,
1568 absolute(src[0]), pfs_one, pfs_zero, flags);
1569 break;
1570 case OPCODE_ADD:
1571 src[0] = t_src(cs, fpi->SrcReg[0]);
1572 src[1] = t_src(cs, fpi->SrcReg[1]);
1573 emit_arith(cs, PFS_OP_MAD, dest, mask,
1574 src[0], pfs_one, src[1], flags);
1575 break;
1576 case OPCODE_CMP:
1577 src[0] = t_src(cs, fpi->SrcReg[0]);
1578 src[1] = t_src(cs, fpi->SrcReg[1]);
1579 src[2] = t_src(cs, fpi->SrcReg[2]);
1580 /* ARB_f_p - if src0.c < 0.0 ? src1.c : src2.c
1581 * r300 - if src2.c < 0.0 ? src1.c : src0.c
1582 */
1583 emit_arith(cs, PFS_OP_CMP, dest, mask,
1584 src[2], src[1], src[0], flags);
1585 break;
1586 case OPCODE_COS:
1587 /*
1588 * cos using a parabola (see SIN):
1589 * cos(x):
1590 * x = (x/(2*PI))+0.75
1591 * x = frac(x)
1592 * x = (x*2*PI)-PI
1593 * result = sin(x)
1594 */
1595 temp[0] = get_temp_reg(cs);
1596 const_sin[0] = emit_const4fv(cs, SinCosConsts[0]);
1597 const_sin[1] = emit_const4fv(cs, SinCosConsts[1]);
1598 src[0] = t_scalar_src(cs, fpi->SrcReg[0]);
1599
1600 /* add 0.5*PI and do range reduction */
1601
1602 emit_arith(cs, PFS_OP_MAD, temp[0], WRITEMASK_X,
1603 swizzle(src[0], X, X, X, X),
1604 swizzle(const_sin[1], Z, Z, Z, Z),
1605 swizzle(const_sin[1], X, X, X, X), 0);
1606
1607 emit_arith(cs, PFS_OP_FRC, temp[0], WRITEMASK_X,
1608 swizzle(temp[0], X, X, X, X),
1609 undef, undef, 0);
1610
1611 emit_arith(cs, PFS_OP_MAD, temp[0], WRITEMASK_Z, swizzle(temp[0], X, X, X, X), swizzle(const_sin[1], W, W, W, W), //2*PI
1612 negate(swizzle(const_sin[0], Z, Z, Z, Z)), //-PI
1613 0);
1614
1615 /* SIN */
1616
1617 emit_arith(cs, PFS_OP_MAD, temp[0],
1618 WRITEMASK_X | WRITEMASK_Y, swizzle(temp[0],
1619 Z, Z, Z,
1620 Z),
1621 const_sin[0], pfs_zero, 0);
1622
1623 emit_arith(cs, PFS_OP_MAD, temp[0], WRITEMASK_X,
1624 swizzle(temp[0], Y, Y, Y, Y),
1625 absolute(swizzle(temp[0], Z, Z, Z, Z)),
1626 swizzle(temp[0], X, X, X, X), 0);
1627
1628 emit_arith(cs, PFS_OP_MAD, temp[0], WRITEMASK_Y,
1629 swizzle(temp[0], X, X, X, X),
1630 absolute(swizzle(temp[0], X, X, X, X)),
1631 negate(swizzle(temp[0], X, X, X, X)), 0);
1632
1633 emit_arith(cs, PFS_OP_MAD, dest, mask,
1634 swizzle(temp[0], Y, Y, Y, Y),
1635 swizzle(const_sin[0], W, W, W, W),
1636 swizzle(temp[0], X, X, X, X), flags);
1637
1638 free_temp(cs, temp[0]);
1639 break;
1640 case OPCODE_DP3:
1641 src[0] = t_src(cs, fpi->SrcReg[0]);
1642 src[1] = t_src(cs, fpi->SrcReg[1]);
1643 emit_arith(cs, PFS_OP_DP3, dest, mask,
1644 src[0], src[1], undef, flags);
1645 break;
1646 case OPCODE_DP4:
1647 src[0] = t_src(cs, fpi->SrcReg[0]);
1648 src[1] = t_src(cs, fpi->SrcReg[1]);
1649 emit_arith(cs, PFS_OP_DP4, dest, mask,
1650 src[0], src[1], undef, flags);
1651 break;
1652 case OPCODE_DPH:
1653 src[0] = t_src(cs, fpi->SrcReg[0]);
1654 src[1] = t_src(cs, fpi->SrcReg[1]);
1655 /* src0.xyz1 -> temp
1656 * DP4 dest, temp, src1
1657 */
1658 emit_arith(cs, PFS_OP_DP4, dest, mask,
1659 swizzle(src[0], X, Y, Z, ONE), src[1],
1660 undef, flags);
1661 break;
1662 case OPCODE_DST:
1663 src[0] = t_src(cs, fpi->SrcReg[0]);
1664 src[1] = t_src(cs, fpi->SrcReg[1]);
1665 /* dest.y = src0.y * src1.y */
1666 if (mask & WRITEMASK_Y)
1667 emit_arith(cs, PFS_OP_MAD, dest, WRITEMASK_Y,
1668 keep(src[0]), keep(src[1]),
1669 pfs_zero, flags);
1670 /* dest.z = src0.z */
1671 if (mask & WRITEMASK_Z)
1672 emit_arith(cs, PFS_OP_MAD, dest, WRITEMASK_Z,
1673 src[0], pfs_one, pfs_zero, flags);
1674 /* result.x = 1.0
1675 * result.w = src1.w */
1676 if (mask & WRITEMASK_XW) {
1677 REG_SET_VSWZ(src[1], SWIZZLE_111); /*Cheat */
1678 emit_arith(cs, PFS_OP_MAD, dest,
1679 mask & WRITEMASK_XW,
1680 src[1], pfs_one, pfs_zero, flags);
1681 }
1682 break;
1683 case OPCODE_EX2:
1684 src[0] = t_scalar_src(cs, fpi->SrcReg[0]);
1685 emit_arith(cs, PFS_OP_EX2, dest, mask,
1686 src[0], undef, undef, flags);
1687 break;
1688 case OPCODE_FLR:
1689 src[0] = t_src(cs, fpi->SrcReg[0]);
1690 temp[0] = get_temp_reg(cs);
1691 /* FRC temp, src0
1692 * MAD dest, src0, 1.0, -temp
1693 */
1694 emit_arith(cs, PFS_OP_FRC, temp[0], mask,
1695 keep(src[0]), undef, undef, 0);
1696 emit_arith(cs, PFS_OP_MAD, dest, mask,
1697 src[0], pfs_one, negate(temp[0]), flags);
1698 free_temp(cs, temp[0]);
1699 break;
1700 case OPCODE_FRC:
1701 src[0] = t_src(cs, fpi->SrcReg[0]);
1702 emit_arith(cs, PFS_OP_FRC, dest, mask,
1703 src[0], undef, undef, flags);
1704 break;
1705 case OPCODE_KIL:
1706 emit_tex(cs, fpi, R300_TEX_OP_KIL);
1707 break;
1708 case OPCODE_LG2:
1709 src[0] = t_scalar_src(cs, fpi->SrcReg[0]);
1710 emit_arith(cs, PFS_OP_LG2, dest, mask,
1711 src[0], undef, undef, flags);
1712 break;
1713 case OPCODE_LIT:
1714 src[0] = t_src(cs, fpi->SrcReg[0]);
1715 emit_lit(cs, dest, mask, src[0], flags);
1716 break;
1717 case OPCODE_LRP:
1718 src[0] = t_src(cs, fpi->SrcReg[0]);
1719 src[1] = t_src(cs, fpi->SrcReg[1]);
1720 src[2] = t_src(cs, fpi->SrcReg[2]);
1721 /* result = tmp0tmp1 + (1 - tmp0)tmp2
1722 * = tmp0tmp1 + tmp2 + (-tmp0)tmp2
1723 * MAD temp, -tmp0, tmp2, tmp2
1724 * MAD result, tmp0, tmp1, temp
1725 */
1726 temp[0] = get_temp_reg(cs);
1727 emit_arith(cs, PFS_OP_MAD, temp[0], mask,
1728 negate(keep(src[0])), keep(src[2]), src[2],
1729 0);
1730 emit_arith(cs, PFS_OP_MAD, dest, mask,
1731 src[0], src[1], temp[0], flags);
1732 free_temp(cs, temp[0]);
1733 break;
1734 case OPCODE_MAD:
1735 src[0] = t_src(cs, fpi->SrcReg[0]);
1736 src[1] = t_src(cs, fpi->SrcReg[1]);
1737 src[2] = t_src(cs, fpi->SrcReg[2]);
1738 emit_arith(cs, PFS_OP_MAD, dest, mask,
1739 src[0], src[1], src[2], flags);
1740 break;
1741 case OPCODE_MAX:
1742 src[0] = t_src(cs, fpi->SrcReg[0]);
1743 src[1] = t_src(cs, fpi->SrcReg[1]);
1744 emit_arith(cs, PFS_OP_MAX, dest, mask,
1745 src[0], src[1], undef, flags);
1746 break;
1747 case OPCODE_MIN:
1748 src[0] = t_src(cs, fpi->SrcReg[0]);
1749 src[1] = t_src(cs, fpi->SrcReg[1]);
1750 emit_arith(cs, PFS_OP_MIN, dest, mask,
1751 src[0], src[1], undef, flags);
1752 break;
1753 case OPCODE_MOV:
1754 case OPCODE_SWZ:
1755 src[0] = t_src(cs, fpi->SrcReg[0]);
1756 emit_arith(cs, PFS_OP_MAD, dest, mask,
1757 src[0], pfs_one, pfs_zero, flags);
1758 break;
1759 case OPCODE_MUL:
1760 src[0] = t_src(cs, fpi->SrcReg[0]);
1761 src[1] = t_src(cs, fpi->SrcReg[1]);
1762 emit_arith(cs, PFS_OP_MAD, dest, mask,
1763 src[0], src[1], pfs_zero, flags);
1764 break;
1765 case OPCODE_POW:
1766 src[0] = t_scalar_src(cs, fpi->SrcReg[0]);
1767 src[1] = t_scalar_src(cs, fpi->SrcReg[1]);
1768 temp[0] = get_temp_reg(cs);
1769 emit_arith(cs, PFS_OP_LG2, temp[0], WRITEMASK_W,
1770 src[0], undef, undef, 0);
1771 emit_arith(cs, PFS_OP_MAD, temp[0], WRITEMASK_W,
1772 temp[0], src[1], pfs_zero, 0);
1773 emit_arith(cs, PFS_OP_EX2, dest, fpi->DstReg.WriteMask,
1774 temp[0], undef, undef, 0);
1775 free_temp(cs, temp[0]);
1776 break;
1777 case OPCODE_RCP:
1778 src[0] = t_scalar_src(cs, fpi->SrcReg[0]);
1779 emit_arith(cs, PFS_OP_RCP, dest, mask,
1780 src[0], undef, undef, flags);
1781 break;
1782 case OPCODE_RSQ:
1783 src[0] = t_scalar_src(cs, fpi->SrcReg[0]);
1784 emit_arith(cs, PFS_OP_RSQ, dest, mask,
1785 absolute(src[0]), pfs_zero, pfs_zero, flags);
1786 break;
1787 case OPCODE_SCS:
1788 /*
1789 * scs using a parabola :
1790 * scs(x):
1791 * result.x = sin(-abs(x)+0.5*PI) (cos)
1792 * result.y = sin(x) (sin)
1793 *
1794 */
1795 temp[0] = get_temp_reg(cs);
1796 temp[1] = get_temp_reg(cs);
1797 const_sin[0] = emit_const4fv(cs, SinCosConsts[0]);
1798 const_sin[1] = emit_const4fv(cs, SinCosConsts[1]);
1799 src[0] = t_scalar_src(cs, fpi->SrcReg[0]);
1800
1801 /* x = -abs(x)+0.5*PI */
1802 emit_arith(cs, PFS_OP_MAD, temp[0], WRITEMASK_Z, swizzle(const_sin[0], Z, Z, Z, Z), //PI
1803 pfs_half,
1804 negate(abs
1805 (swizzle(keep(src[0]), X, X, X, X))),
1806 0);
1807
1808 /* C*x (sin) */
1809 emit_arith(cs, PFS_OP_MAD, temp[0], WRITEMASK_W,
1810 swizzle(const_sin[0], Y, Y, Y, Y),
1811 swizzle(keep(src[0]), X, X, X, X),
1812 pfs_zero, 0);
1813
1814 /* B*x, C*x (cos) */
1815 emit_arith(cs, PFS_OP_MAD, temp[0],
1816 WRITEMASK_X | WRITEMASK_Y, swizzle(temp[0],
1817 Z, Z, Z,
1818 Z),
1819 const_sin[0], pfs_zero, 0);
1820
1821 /* B*x (sin) */
1822 emit_arith(cs, PFS_OP_MAD, temp[1], WRITEMASK_W,
1823 swizzle(const_sin[0], X, X, X, X),
1824 keep(src[0]), pfs_zero, 0);
1825
1826 /* y = B*x + C*x*abs(x) (sin) */
1827 emit_arith(cs, PFS_OP_MAD, temp[1], WRITEMASK_Z,
1828 absolute(src[0]),
1829 swizzle(temp[0], W, W, W, W),
1830 swizzle(temp[1], W, W, W, W), 0);
1831
1832 /* y = B*x + C*x*abs(x) (cos) */
1833 emit_arith(cs, PFS_OP_MAD, temp[1], WRITEMASK_W,
1834 swizzle(temp[0], Y, Y, Y, Y),
1835 absolute(swizzle(temp[0], Z, Z, Z, Z)),
1836 swizzle(temp[0], X, X, X, X), 0);
1837
1838 /* y*abs(y) - y (cos), y*abs(y) - y (sin) */
1839 emit_arith(cs, PFS_OP_MAD, temp[0],
1840 WRITEMASK_X | WRITEMASK_Y, swizzle(temp[1],
1841 W, Z, Y,
1842 X),
1843 absolute(swizzle(temp[1], W, Z, Y, X)),
1844 negate(swizzle(temp[1], W, Z, Y, X)), 0);
1845
1846 /* dest.xy = mad(temp.xy, P, temp2.wz) */
1847 emit_arith(cs, PFS_OP_MAD, dest,
1848 mask & (WRITEMASK_X | WRITEMASK_Y), temp[0],
1849 swizzle(const_sin[0], W, W, W, W),
1850 swizzle(temp[1], W, Z, Y, X), flags);
1851
1852 free_temp(cs, temp[0]);
1853 free_temp(cs, temp[1]);
1854 break;
1855 case OPCODE_SGE:
1856 src[0] = t_src(cs, fpi->SrcReg[0]);
1857 src[1] = t_src(cs, fpi->SrcReg[1]);
1858 temp[0] = get_temp_reg(cs);
1859 /* temp = src0 - src1
1860 * dest.c = (temp.c < 0.0) ? 0 : 1
1861 */
1862 emit_arith(cs, PFS_OP_MAD, temp[0], mask,
1863 src[0], pfs_one, negate(src[1]), 0);
1864 emit_arith(cs, PFS_OP_CMP, dest, mask,
1865 pfs_one, pfs_zero, temp[0], 0);
1866 free_temp(cs, temp[0]);
1867 break;
1868 case OPCODE_SIN:
1869 /*
1870 * using a parabola:
1871 * sin(x) = 4/pi * x + -4/(pi*pi) * x * abs(x)
1872 * extra precision is obtained by weighting against
1873 * itself squared.
1874 */
1875
1876 temp[0] = get_temp_reg(cs);
1877 const_sin[0] = emit_const4fv(cs, SinCosConsts[0]);
1878 const_sin[1] = emit_const4fv(cs, SinCosConsts[1]);
1879 src[0] = t_scalar_src(cs, fpi->SrcReg[0]);
1880
1881 /* do range reduction */
1882
1883 emit_arith(cs, PFS_OP_MAD, temp[0], WRITEMASK_X,
1884 swizzle(keep(src[0]), X, X, X, X),
1885 swizzle(const_sin[1], Z, Z, Z, Z),
1886 pfs_half, 0);
1887
1888 emit_arith(cs, PFS_OP_FRC, temp[0], WRITEMASK_X,
1889 swizzle(temp[0], X, X, X, X),
1890 undef, undef, 0);
1891
1892 emit_arith(cs, PFS_OP_MAD, temp[0], WRITEMASK_Z, swizzle(temp[0], X, X, X, X), swizzle(const_sin[1], W, W, W, W), //2*PI
1893 negate(swizzle(const_sin[0], Z, Z, Z, Z)), //PI
1894 0);
1895
1896 /* SIN */
1897
1898 emit_arith(cs, PFS_OP_MAD, temp[0],
1899 WRITEMASK_X | WRITEMASK_Y, swizzle(temp[0],
1900 Z, Z, Z,
1901 Z),
1902 const_sin[0], pfs_zero, 0);
1903
1904 emit_arith(cs, PFS_OP_MAD, temp[0], WRITEMASK_X,
1905 swizzle(temp[0], Y, Y, Y, Y),
1906 absolute(swizzle(temp[0], Z, Z, Z, Z)),
1907 swizzle(temp[0], X, X, X, X), 0);
1908
1909 emit_arith(cs, PFS_OP_MAD, temp[0], WRITEMASK_Y,
1910 swizzle(temp[0], X, X, X, X),
1911 absolute(swizzle(temp[0], X, X, X, X)),
1912 negate(swizzle(temp[0], X, X, X, X)), 0);
1913
1914 emit_arith(cs, PFS_OP_MAD, dest, mask,
1915 swizzle(temp[0], Y, Y, Y, Y),
1916 swizzle(const_sin[0], W, W, W, W),
1917 swizzle(temp[0], X, X, X, X), flags);
1918
1919 free_temp(cs, temp[0]);
1920 break;
1921 case OPCODE_SLT:
1922 src[0] = t_src(cs, fpi->SrcReg[0]);
1923 src[1] = t_src(cs, fpi->SrcReg[1]);
1924 temp[0] = get_temp_reg(cs);
1925 /* temp = src0 - src1
1926 * dest.c = (temp.c < 0.0) ? 1 : 0
1927 */
1928 emit_arith(cs, PFS_OP_MAD, temp[0], mask,
1929 src[0], pfs_one, negate(src[1]), 0);
1930 emit_arith(cs, PFS_OP_CMP, dest, mask,
1931 pfs_zero, pfs_one, temp[0], 0);
1932 free_temp(cs, temp[0]);
1933 break;
1934 case OPCODE_SUB:
1935 src[0] = t_src(cs, fpi->SrcReg[0]);
1936 src[1] = t_src(cs, fpi->SrcReg[1]);
1937 emit_arith(cs, PFS_OP_MAD, dest, mask,
1938 src[0], pfs_one, negate(src[1]), flags);
1939 break;
1940 case OPCODE_TEX:
1941 emit_tex(cs, fpi, R300_TEX_OP_LD);
1942 break;
1943 case OPCODE_TXB:
1944 emit_tex(cs, fpi, R300_TEX_OP_TXB);
1945 break;
1946 case OPCODE_TXP:
1947 emit_tex(cs, fpi, R300_TEX_OP_TXP);
1948 break;
1949 case OPCODE_XPD:{
1950 src[0] = t_src(cs, fpi->SrcReg[0]);
1951 src[1] = t_src(cs, fpi->SrcReg[1]);
1952 temp[0] = get_temp_reg(cs);
1953 /* temp = src0.zxy * src1.yzx */
1954 emit_arith(cs, PFS_OP_MAD, temp[0],
1955 WRITEMASK_XYZ, swizzle(keep(src[0]),
1956 Z, X, Y, W),
1957 swizzle(keep(src[1]), Y, Z, X, W),
1958 pfs_zero, 0);
1959 /* dest.xyz = src0.yzx * src1.zxy - temp
1960 * dest.w = undefined
1961 * */
1962 emit_arith(cs, PFS_OP_MAD, dest,
1963 mask & WRITEMASK_XYZ, swizzle(src[0],
1964 Y, Z,
1965 X, W),
1966 swizzle(src[1], Z, X, Y, W),
1967 negate(temp[0]), flags);
1968 /* cleanup */
1969 free_temp(cs, temp[0]);
1970 break;
1971 }
1972 default:
1973 ERROR("unknown fpi->Opcode %d\n", fpi->Opcode);
1974 break;
1975 }
1976 }
1977
1978 static GLboolean parse_program(struct r300_pfs_compile_state *cs)
1979 {
1980 COMPILE_STATE;
1981 int clauseidx;
1982
1983 for (clauseidx = 0; clauseidx < cs->compiler->compiler.NumClauses; ++clauseidx) {
1984 struct radeon_clause* clause = &cs->compiler->compiler.Clauses[clauseidx];
1985 int ip;
1986
1987 for(ip = 0; ip < clause->NumInstructions; ++ip) {
1988 emit_instruction(cs, clause->Instructions + ip);
1989
1990 if (fp->error)
1991 return GL_FALSE;
1992 }
1993 }
1994
1995 return GL_TRUE;
1996 }
1997
1998
1999 /* - Init structures
2000 * - Determine what hwregs each input corresponds to
2001 */
2002 static void init_program(struct r300_pfs_compile_state *cs)
2003 {
2004 COMPILE_STATE;
2005 struct gl_fragment_program *mp = &fp->mesa_program;
2006 GLuint InputsRead = mp->Base.InputsRead;
2007 GLuint temps_used = 0; /* for fp->temps[] */
2008 int i, j;
2009
2010 /* New compile, reset tracking data */
2011 fp->optimization =
2012 driQueryOptioni(&cs->compiler->r300->radeon.optionCache, "fp_optimization");
2013 fp->translated = GL_FALSE;
2014 fp->error = GL_FALSE;
2015 fp->WritesDepth = GL_FALSE;
2016 code->tex.length = 0;
2017 code->cur_node = 0;
2018 code->first_node_has_tex = 0;
2019 code->const_nr = 0;
2020 code->max_temp_idx = 0;
2021 code->node[0].alu_end = -1;
2022 code->node[0].tex_end = -1;
2023
2024 for (i = 0; i < PFS_MAX_ALU_INST; i++) {
2025 for (j = 0; j < 3; j++) {
2026 cs->slot[i].vsrc[j] = SRC_CONST;
2027 cs->slot[i].ssrc[j] = SRC_CONST;
2028 }
2029 }
2030
2031 /* Work out what temps the Mesa inputs correspond to, this must match
2032 * what setup_rs_unit does, which shouldn't be a problem as rs_unit
2033 * configures itself based on the fragprog's InputsRead
2034 *
2035 * NOTE: this depends on get_hw_temp() allocating registers in order,
2036 * starting from register 0.
2037 */
2038
2039 /* Texcoords come first */
2040 for (i = 0; i < cs->compiler->r300->radeon.glCtx->Const.MaxTextureUnits; i++) {
2041 if (InputsRead & (FRAG_BIT_TEX0 << i)) {
2042 cs->inputs[FRAG_ATTRIB_TEX0 + i].refcount = 0;
2043 cs->inputs[FRAG_ATTRIB_TEX0 + i].reg =
2044 get_hw_temp(cs, 0);
2045 }
2046 }
2047 InputsRead &= ~FRAG_BITS_TEX_ANY;
2048
2049 /* fragment position treated as a texcoord */
2050 if (InputsRead & FRAG_BIT_WPOS) {
2051 cs->inputs[FRAG_ATTRIB_WPOS].refcount = 0;
2052 cs->inputs[FRAG_ATTRIB_WPOS].reg = get_hw_temp(cs, 0);
2053 }
2054 InputsRead &= ~FRAG_BIT_WPOS;
2055
2056 /* Then primary colour */
2057 if (InputsRead & FRAG_BIT_COL0) {
2058 cs->inputs[FRAG_ATTRIB_COL0].refcount = 0;
2059 cs->inputs[FRAG_ATTRIB_COL0].reg = get_hw_temp(cs, 0);
2060 }
2061 InputsRead &= ~FRAG_BIT_COL0;
2062
2063 /* Secondary color */
2064 if (InputsRead & FRAG_BIT_COL1) {
2065 cs->inputs[FRAG_ATTRIB_COL1].refcount = 0;
2066 cs->inputs[FRAG_ATTRIB_COL1].reg = get_hw_temp(cs, 0);
2067 }
2068 InputsRead &= ~FRAG_BIT_COL1;
2069
2070 /* Anything else */
2071 if (InputsRead) {
2072 WARN_ONCE("Don't know how to handle inputs 0x%x\n", InputsRead);
2073 /* force read from hwreg 0 for now */
2074 for (i = 0; i < 32; i++)
2075 if (InputsRead & (1 << i))
2076 cs->inputs[i].reg = 0;
2077 }
2078
2079 /* Pre-parse the program, grabbing refcounts on input/temp regs.
2080 * That way, we can free up the reg when it's no longer needed
2081 */
2082 for (i = 0; i < cs->compiler->compiler.Clauses[0].NumInstructions; ++i) {
2083 struct prog_instruction *fpi = cs->compiler->compiler.Clauses[0].Instructions + i;
2084 int idx;
2085
2086 for (j = 0; j < 3; j++) {
2087 idx = fpi->SrcReg[j].Index;
2088 switch (fpi->SrcReg[j].File) {
2089 case PROGRAM_TEMPORARY:
2090 if (!(temps_used & (1 << idx))) {
2091 cs->temps[idx].reg = -1;
2092 cs->temps[idx].refcount = 1;
2093 temps_used |= (1 << idx);
2094 } else
2095 cs->temps[idx].refcount++;
2096 break;
2097 case PROGRAM_INPUT:
2098 cs->inputs[idx].refcount++;
2099 break;
2100 default:
2101 break;
2102 }
2103 }
2104
2105 idx = fpi->DstReg.Index;
2106 if (fpi->DstReg.File == PROGRAM_TEMPORARY) {
2107 if (!(temps_used & (1 << idx))) {
2108 cs->temps[idx].reg = -1;
2109 cs->temps[idx].refcount = 1;
2110 temps_used |= (1 << idx);
2111 } else
2112 cs->temps[idx].refcount++;
2113 }
2114 }
2115 cs->temp_in_use = temps_used;
2116 }
2117
2118
2119 /**
2120 * Final compilation step: Turn the intermediate radeon_program into
2121 * machine-readable instructions.
2122 */
2123 GLboolean r300FragmentProgramEmit(struct r300_fragment_program_compiler *compiler)
2124 {
2125 struct r300_pfs_compile_state cs;
2126 struct r300_fragment_program_code *code = compiler->code;
2127
2128 _mesa_memset(&cs, 0, sizeof(cs));
2129 cs.compiler = compiler;
2130 init_program(&cs);
2131
2132 if (!parse_program(&cs))
2133 return GL_FALSE;
2134
2135 /* Finish off */
2136 code->node[code->cur_node].alu_end =
2137 cs.nrslots - code->node[code->cur_node].alu_offset - 1;
2138 if (code->node[code->cur_node].tex_end < 0)
2139 code->node[code->cur_node].tex_end = 0;
2140 code->alu_offset = 0;
2141 code->alu_end = cs.nrslots - 1;
2142 code->tex_offset = 0;
2143 code->tex_end = code->tex.length ? code->tex.length - 1 : 0;
2144 assert(code->node[code->cur_node].alu_end >= 0);
2145 assert(code->alu_end >= 0);
2146
2147 return GL_TRUE;
2148 }
2149