r300: Add radeonTransformALU and fix a bug in r300_fragprog DPH
[mesa.git] / src / mesa / drivers / dri / r300 / r300_fragprog_emit.c
1 /*
2 * Copyright (C) 2005 Ben Skeggs.
3 *
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining
7 * a copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sublicense, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial
16 * portions of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 */
27
28 /**
29 * \file
30 *
31 * Emit the r300_fragment_program_code that can be understood by the hardware.
32 * Input is a pre-transformed radeon_program.
33 *
34 * \author Ben Skeggs <darktama@iinet.net.au>
35 *
36 * \author Jerome Glisse <j.glisse@gmail.com>
37 *
38 * \todo FogOption
39 *
40 * \todo Verify results of opcodes for accuracy; I've only checked them in
41 * specific cases.
42 */
43
44 #include "glheader.h"
45 #include "macros.h"
46 #include "enums.h"
47 #include "shader/prog_instruction.h"
48 #include "shader/prog_parameter.h"
49 #include "shader/prog_print.h"
50
51 #include "r300_context.h"
52 #include "r300_fragprog.h"
53 #include "r300_reg.h"
54 #include "r300_state.h"
55
56 /* Mapping Mesa registers to R300 temporaries */
57 struct reg_acc {
58 int reg; /* Assigned hw temp */
59 unsigned int refcount; /* Number of uses by mesa program */
60 };
61
62 /**
63 * Describe the current lifetime information for an R300 temporary
64 */
65 struct reg_lifetime {
66 /* Index of the first slot where this register is free in the sense
67 that it can be used as a new destination register.
68 This is -1 if the register has been assigned to a Mesa register
69 and the last access to the register has not yet been emitted */
70 int free;
71
72 /* Index of the first slot where this register is currently reserved.
73 This is used to stop e.g. a scalar operation from being moved
74 before the allocation time of a register that was first allocated
75 for a vector operation. */
76 int reserved;
77
78 /* Index of the first slot in which the register can be used as a
79 source without losing the value that is written by the last
80 emitted instruction that writes to the register */
81 int vector_valid;
82 int scalar_valid;
83
84 /* Index to the slot where the register was last read.
85 This is also the first slot in which the register may be written again */
86 int vector_lastread;
87 int scalar_lastread;
88 };
89
90 /**
91 * Store usage information about an ALU instruction slot during the
92 * compilation of a fragment program.
93 */
94 #define SLOT_SRC_VECTOR (1<<0)
95 #define SLOT_SRC_SCALAR (1<<3)
96 #define SLOT_SRC_BOTH (SLOT_SRC_VECTOR | SLOT_SRC_SCALAR)
97 #define SLOT_OP_VECTOR (1<<16)
98 #define SLOT_OP_SCALAR (1<<17)
99 #define SLOT_OP_BOTH (SLOT_OP_VECTOR | SLOT_OP_SCALAR)
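/* The SRC flags are per-source and are shifted left by the source index
 * when a slot is filled (see find_and_prepare_slot), so bits 0..2 track
 * vector reads of src0..src2 and bits 3..5 track the scalar reads.
 * The OP flags record whether the vector (RGB) and/or scalar (alpha)
 * half of the slot already carries an opcode.  For example, a source
 * in position 1 that feeds only the scalar unit contributes
 * (SLOT_SRC_SCALAR << 1) to the slot's 'used' mask.
 */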
100
101 struct r300_pfs_compile_slot {
102 /* Bitmask indicating which parts of the slot are used, using SLOT_ constants
103 defined above */
104 unsigned int used;
105
106 /* Selected sources */
107 int vsrc[3];
108 int ssrc[3];
109 };
110
111 /**
112 * Store information during compilation of fragment programs.
113 */
114 struct r300_pfs_compile_state {
115 struct r300_fragment_program_compiler *compiler;
116
117 int nrslots; /* number of ALU slots used so far */
118
119 /* Track which (parts of) slots are already filled with instructions */
120 struct r300_pfs_compile_slot slot[PFS_MAX_ALU_INST];
121
122 /* Track the validity of R300 temporaries */
123 struct reg_lifetime hwtemps[PFS_NUM_TEMP_REGS];
124
125 /* Used to map Mesa's inputs/temps onto hardware temps */
126 int temp_in_use;
127 struct reg_acc temps[PFS_NUM_TEMP_REGS];
128 struct reg_acc inputs[32]; /* don't actually need 32... */
129
130 /* Track usage of hardware temps, for register allocation,
131 * indirection detection, etc. */
132 GLuint used_in_node;
133 GLuint dest_in_node;
134 };
135
136
137 /*
138 * Useful macros and values
139 */
140 #define ERROR(fmt, args...) do { \
141 fprintf(stderr, "%s::%s(): " fmt "\n", \
142 __FILE__, __FUNCTION__, ##args); \
143 fp->error = GL_TRUE; \
144 } while(0)
145
146 #define PFS_INVAL 0xFFFFFFFF
147 #define COMPILE_STATE \
148 struct r300_fragment_program *fp = cs->compiler->fp; \
149 struct r300_fragment_program_code *code = cs->compiler->code; \
150 (void)code; (void)fp
151
152 #define SWIZZLE_XYZ 0
153 #define SWIZZLE_XXX 1
154 #define SWIZZLE_YYY 2
155 #define SWIZZLE_ZZZ 3
156 #define SWIZZLE_WWW 4
157 #define SWIZZLE_YZX 5
158 #define SWIZZLE_ZXY 6
159 #define SWIZZLE_WZY 7
160 #define SWIZZLE_111 8
161 #define SWIZZLE_000 9
162 #define SWIZZLE_HHH 10
163
164 #define swizzle(r, x, y, z, w) do_swizzle(cs, r, \
165 ((SWIZZLE_##x<<0)| \
166 (SWIZZLE_##y<<3)| \
167 (SWIZZLE_##z<<6)| \
168 (SWIZZLE_##w<<9)), \
169 0)
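/* Convenience wrapper around do_swizzle(): the token-pasted SWIZZLE_##x
 * arguments expand to Mesa's component indices (SWIZZLE_X..SWIZZLE_W),
 * so the second argument is an ARB-style packed swizzle with 3 bits per
 * component and no negation.  For example, swizzle(temp, W, W, W, W)
 * replicates the scalar (W) component across all four channels.
 */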
170
171 #define REG_TYPE_INPUT 0
172 #define REG_TYPE_OUTPUT 1
173 #define REG_TYPE_TEMP 2
174 #define REG_TYPE_CONST 3
175
176 #define REG_TYPE_SHIFT 0
177 #define REG_INDEX_SHIFT 2
178 #define REG_VSWZ_SHIFT 8
179 #define REG_SSWZ_SHIFT 13
180 #define REG_NEGV_SHIFT 18
181 #define REG_NEGS_SHIFT 19
182 #define REG_ABS_SHIFT 20
183 #define REG_NO_USE_SHIFT 21 // Hack for refcounting
184 #define REG_VALID_SHIFT 22 // Does the register contain a defined value?
185 #define REG_BUILTIN_SHIFT 23 // Is it a builtin (like all zero/all one)?
186
187 #define REG_TYPE_MASK (0x03 << REG_TYPE_SHIFT)
188 #define REG_INDEX_MASK (0x3F << REG_INDEX_SHIFT)
189 #define REG_VSWZ_MASK (0x1F << REG_VSWZ_SHIFT)
190 #define REG_SSWZ_MASK (0x1F << REG_SSWZ_SHIFT)
191 #define REG_NEGV_MASK (0x01 << REG_NEGV_SHIFT)
192 #define REG_NEGS_MASK (0x01 << REG_NEGS_SHIFT)
193 #define REG_ABS_MASK (0x01 << REG_ABS_SHIFT)
194 #define REG_NO_USE_MASK (0x01 << REG_NO_USE_SHIFT)
195 #define REG_VALID_MASK (0x01 << REG_VALID_SHIFT)
196 #define REG_BUILTIN_MASK (0x01 << REG_BUILTIN_SHIFT)
197
198 #define REG(type, index, vswz, sswz, nouse, valid, builtin) \
199 (((type << REG_TYPE_SHIFT) & REG_TYPE_MASK) | \
200 ((index << REG_INDEX_SHIFT) & REG_INDEX_MASK) | \
201 ((nouse << REG_NO_USE_SHIFT) & REG_NO_USE_MASK) | \
202 ((valid << REG_VALID_SHIFT) & REG_VALID_MASK) | \
203 ((builtin << REG_BUILTIN_SHIFT) & REG_BUILTIN_MASK) | \
204 ((vswz << REG_VSWZ_SHIFT) & REG_VSWZ_MASK) | \
205 ((sswz << REG_SSWZ_SHIFT) & REG_SSWZ_MASK))
206 #define REG_GET_TYPE(reg) \
207 ((reg & REG_TYPE_MASK) >> REG_TYPE_SHIFT)
208 #define REG_GET_INDEX(reg) \
209 ((reg & REG_INDEX_MASK) >> REG_INDEX_SHIFT)
210 #define REG_GET_VSWZ(reg) \
211 ((reg & REG_VSWZ_MASK) >> REG_VSWZ_SHIFT)
212 #define REG_GET_SSWZ(reg) \
213 ((reg & REG_SSWZ_MASK) >> REG_SSWZ_SHIFT)
214 #define REG_GET_NO_USE(reg) \
215 ((reg & REG_NO_USE_MASK) >> REG_NO_USE_SHIFT)
216 #define REG_GET_VALID(reg) \
217 ((reg & REG_VALID_MASK) >> REG_VALID_SHIFT)
218 #define REG_GET_BUILTIN(reg) \
219 ((reg & REG_BUILTIN_MASK) >> REG_BUILTIN_SHIFT)
220 #define REG_SET_TYPE(reg, type) \
221 reg = ((reg & ~REG_TYPE_MASK) | \
222 ((type << REG_TYPE_SHIFT) & REG_TYPE_MASK))
223 #define REG_SET_INDEX(reg, index) \
224 reg = ((reg & ~REG_INDEX_MASK) | \
225 ((index << REG_INDEX_SHIFT) & REG_INDEX_MASK))
226 #define REG_SET_VSWZ(reg, vswz) \
227 reg = ((reg & ~REG_VSWZ_MASK) | \
228 ((vswz << REG_VSWZ_SHIFT) & REG_VSWZ_MASK))
229 #define REG_SET_SSWZ(reg, sswz) \
230 reg = ((reg & ~REG_SSWZ_MASK) | \
231 ((sswz << REG_SSWZ_SHIFT) & REG_SSWZ_MASK))
232 #define REG_SET_NO_USE(reg, nouse) \
233 reg = ((reg & ~REG_NO_USE_MASK) | \
234 ((nouse << REG_NO_USE_SHIFT) & REG_NO_USE_MASK))
235 #define REG_SET_VALID(reg, valid) \
236 reg = ((reg & ~REG_VALID_MASK) | \
237 ((valid << REG_VALID_SHIFT) & REG_VALID_MASK))
238 #define REG_SET_BUILTIN(reg, builtin) \
239 reg = ((reg & ~REG_BUILTIN_MASK) | \
240 ((builtin << REG_BUILTIN_SHIFT) & REG_BUILTIN_MASK))
241 #define REG_ABS(reg) \
242 reg = (reg | REG_ABS_MASK)
243 #define REG_NEGV(reg) \
244 reg = (reg | REG_NEGV_MASK)
245 #define REG_NEGS(reg) \
246 reg = (reg | REG_NEGS_MASK)
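/* Illustrative example of the packing above (the accessors simply undo
 * REG()):
 *
 *   GLuint r = REG(REG_TYPE_TEMP, 5, SWIZZLE_XYZ, SWIZZLE_W,
 *                  GL_FALSE, GL_TRUE, GL_FALSE);
 *   REG_GET_TYPE(r)  == REG_TYPE_TEMP
 *   REG_GET_INDEX(r) == 5
 *   REG_GET_VALID(r) == 1, REG_GET_BUILTIN(r) == 0
 *
 * REG_VSWZ indexes the v_swiz[] table below, while REG_SSWZ holds a
 * Mesa SWIZZLE_* component index used with s_swiz[].
 */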
247
248 #define NOP_INST0 ( \
249 (R300_ALU_OUTC_MAD) | \
250 (R300_ALU_ARGC_ZERO << R300_ALU_ARG0C_SHIFT) | \
251 (R300_ALU_ARGC_ZERO << R300_ALU_ARG1C_SHIFT) | \
252 (R300_ALU_ARGC_ZERO << R300_ALU_ARG2C_SHIFT))
253 #define NOP_INST1 ( \
254 ((0 | SRC_CONST) << R300_ALU_SRC0C_SHIFT) | \
255 ((0 | SRC_CONST) << R300_ALU_SRC1C_SHIFT) | \
256 ((0 | SRC_CONST) << R300_ALU_SRC2C_SHIFT))
257 #define NOP_INST2 ( \
258 (R300_ALU_OUTA_MAD) | \
259 (R300_ALU_ARGA_ZERO << R300_ALU_ARG0A_SHIFT) | \
260 (R300_ALU_ARGA_ZERO << R300_ALU_ARG1A_SHIFT) | \
261 (R300_ALU_ARGA_ZERO << R300_ALU_ARG2A_SHIFT))
262 #define NOP_INST3 ( \
263 ((0 | SRC_CONST) << R300_ALU_SRC0A_SHIFT) | \
264 ((0 | SRC_CONST) << R300_ALU_SRC1A_SHIFT) | \
265 ((0 | SRC_CONST) << R300_ALU_SRC2A_SHIFT))
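/* A NOP slot is a MAD of all-zero arguments reading constant 0 on both
 * the vector and scalar side; since no destination write-mask bits are
 * set in inst1/inst3, it writes nothing.  emit_nop() and
 * find_and_prepare_slot() use these values to pad the ALU stream.
 */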
266
267
268 /*
269 * Data structures for fragment program generation
270 */
271
272 /* description of r300 native hw instructions */
273 static const struct {
274 const char *name;
275 int argc;
276 int v_op;
277 int s_op;
278 } r300_fpop[] = {
279 /* *INDENT-OFF* */
280 {"MAD", 3, R300_ALU_OUTC_MAD, R300_ALU_OUTA_MAD},
281 {"DP3", 2, R300_ALU_OUTC_DP3, R300_ALU_OUTA_DP4},
282 {"DP4", 2, R300_ALU_OUTC_DP4, R300_ALU_OUTA_DP4},
283 {"MIN", 2, R300_ALU_OUTC_MIN, R300_ALU_OUTA_MIN},
284 {"MAX", 2, R300_ALU_OUTC_MAX, R300_ALU_OUTA_MAX},
285 {"CMP", 3, R300_ALU_OUTC_CMP, R300_ALU_OUTA_CMP},
286 {"FRC", 1, R300_ALU_OUTC_FRC, R300_ALU_OUTA_FRC},
287 {"EX2", 1, R300_ALU_OUTC_REPL_ALPHA, R300_ALU_OUTA_EX2},
288 {"LG2", 1, R300_ALU_OUTC_REPL_ALPHA, R300_ALU_OUTA_LG2},
289 {"RCP", 1, R300_ALU_OUTC_REPL_ALPHA, R300_ALU_OUTA_RCP},
290 {"RSQ", 1, R300_ALU_OUTC_REPL_ALPHA, R300_ALU_OUTA_RSQ},
291 {"REPL_ALPHA", 1, R300_ALU_OUTC_REPL_ALPHA, PFS_INVAL},
292 {"CMPH", 3, R300_ALU_OUTC_CMPH, PFS_INVAL},
293 /* *INDENT-ON* */
294 };
295
296 /* vector swizzles r300 can support natively, with a couple of
297 * cases we handle specially
298 *
299 * REG_VSWZ/REG_SSWZ is an index into this table
300 */
301
302 /* mapping from SWIZZLE_* to r300 native values for scalar insns */
303 #define SWIZZLE_HALF 6
304
305 #define MAKE_SWZ3(x, y, z) (MAKE_SWIZZLE4(SWIZZLE_##x, \
306 SWIZZLE_##y, \
307 SWIZZLE_##z, \
308 SWIZZLE_ZERO))
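/* MAKE_SWZ3 packs three Mesa component selects (3 bits each, with W
 * forced to SWIZZLE_ZERO) in the same layout as a Mesa source swizzle,
 * so the hashes below can be compared directly against the (masked)
 * ARB swizzle in do_swizzle().  For example, MAKE_SWZ3(Y, Z, X) ==
 * (SWIZZLE_Y << 0) | (SWIZZLE_Z << 3) | (SWIZZLE_X << 6) |
 * (SWIZZLE_ZERO << 9).
 */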
309 /* native swizzles */
310 static const struct r300_pfs_swizzle {
311 GLuint hash; /* swizzle value this matches */
312 GLuint base; /* base value for hw swizzle */
313 GLuint stride; /* difference in base between arg0/1/2 */
314 GLuint flags;
315 } v_swiz[] = {
316 /* *INDENT-OFF* */
317 {MAKE_SWZ3(X, Y, Z), R300_ALU_ARGC_SRC0C_XYZ, 4, SLOT_SRC_VECTOR},
318 {MAKE_SWZ3(X, X, X), R300_ALU_ARGC_SRC0C_XXX, 4, SLOT_SRC_VECTOR},
319 {MAKE_SWZ3(Y, Y, Y), R300_ALU_ARGC_SRC0C_YYY, 4, SLOT_SRC_VECTOR},
320 {MAKE_SWZ3(Z, Z, Z), R300_ALU_ARGC_SRC0C_ZZZ, 4, SLOT_SRC_VECTOR},
321 {MAKE_SWZ3(W, W, W), R300_ALU_ARGC_SRC0A, 1, SLOT_SRC_SCALAR},
322 {MAKE_SWZ3(Y, Z, X), R300_ALU_ARGC_SRC0C_YZX, 1, SLOT_SRC_VECTOR},
323 {MAKE_SWZ3(Z, X, Y), R300_ALU_ARGC_SRC0C_ZXY, 1, SLOT_SRC_VECTOR},
324 {MAKE_SWZ3(W, Z, Y), R300_ALU_ARGC_SRC0CA_WZY, 1, SLOT_SRC_BOTH},
325 {MAKE_SWZ3(ONE, ONE, ONE), R300_ALU_ARGC_ONE, 0, 0},
326 {MAKE_SWZ3(ZERO, ZERO, ZERO), R300_ALU_ARGC_ZERO, 0, 0},
327 {MAKE_SWZ3(HALF, HALF, HALF), R300_ALU_ARGC_HALF, 0, 0},
328 {PFS_INVAL, 0, 0, 0},
329 /* *INDENT-ON* */
330 };
331
332 /* used during matching of non-native swizzles */
333 #define SWZ_X_MASK (7 << 0)
334 #define SWZ_Y_MASK (7 << 3)
335 #define SWZ_Z_MASK (7 << 6)
336 #define SWZ_W_MASK (7 << 9)
337 static const struct {
338 GLuint hash; /* used to mask matching swizzle components */
339 int mask; /* actual outmask */
340 int count; /* count of components matched */
341 } s_mask[] = {
342 /* *INDENT-OFF* */
343 {SWZ_X_MASK | SWZ_Y_MASK | SWZ_Z_MASK, 1 | 2 | 4, 3},
344 {SWZ_X_MASK | SWZ_Y_MASK, 1 | 2, 2},
345 {SWZ_X_MASK | SWZ_Z_MASK, 1 | 4, 2},
346 {SWZ_Y_MASK | SWZ_Z_MASK, 2 | 4, 2},
347 {SWZ_X_MASK, 1, 1},
348 {SWZ_Y_MASK, 2, 1},
349 {SWZ_Z_MASK, 4, 1},
350 {PFS_INVAL, PFS_INVAL, PFS_INVAL}
351 /* *INDENT-ON* */
352 };
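/* do_swizzle() walks s_mask[] from the full XYZ match downwards: each
 * entry masks off the components it covers, and the matched components
 * are emitted (natively, or via swz_emit_partial for partial matches)
 * until all three vector components of the requested swizzle have been
 * produced.
 */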
353
354 static const struct {
355 int base; /* hw value of swizzle */
356 int stride; /* difference between SRC0/1/2 */
357 GLuint flags;
358 } s_swiz[] = {
359 /* *INDENT-OFF* */
360 {R300_ALU_ARGA_SRC0C_X, 3, SLOT_SRC_VECTOR},
361 {R300_ALU_ARGA_SRC0C_Y, 3, SLOT_SRC_VECTOR},
362 {R300_ALU_ARGA_SRC0C_Z, 3, SLOT_SRC_VECTOR},
363 {R300_ALU_ARGA_SRC0A, 1, SLOT_SRC_SCALAR},
364 {R300_ALU_ARGA_ZERO, 0, 0},
365 {R300_ALU_ARGA_ONE, 0, 0},
366 {R300_ALU_ARGA_HALF, 0, 0}
367 /* *INDENT-ON* */
368 };
369
370 /* boiler-plate reg, for convenience */
371 static const GLuint undef = REG(REG_TYPE_TEMP,
372 0,
373 SWIZZLE_XYZ,
374 SWIZZLE_W,
375 GL_FALSE,
376 GL_FALSE,
377 GL_FALSE);
378
379 /* constant one source */
380 static const GLuint pfs_one = REG(REG_TYPE_CONST,
381 0,
382 SWIZZLE_111,
383 SWIZZLE_ONE,
384 GL_FALSE,
385 GL_TRUE,
386 GL_TRUE);
387
388 /* constant half source */
389 static const GLuint pfs_half = REG(REG_TYPE_CONST,
390 0,
391 SWIZZLE_HHH,
392 SWIZZLE_HALF,
393 GL_FALSE,
394 GL_TRUE,
395 GL_TRUE);
396
397 /* constant zero source */
398 static const GLuint pfs_zero = REG(REG_TYPE_CONST,
399 0,
400 SWIZZLE_000,
401 SWIZZLE_ZERO,
402 GL_FALSE,
403 GL_TRUE,
404 GL_TRUE);
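/* These builtin sources (REG_BUILTIN set) never occupy one of the three
 * per-slot source selects: find_and_prepare_slot() skips them when
 * computing the slot's 'used' mask, and their values come from the
 * ONE/ZERO/HALF argument selects rather than from a fetched register.
 */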
405
406 /*
407 * Common functions prototypes
408 */
409 static void emit_arith(struct r300_pfs_compile_state *cs, int op,
410 GLuint dest, int mask,
411 GLuint src0, GLuint src1, GLuint src2, int flags);
412
413 /**
414 * Get an R300 temporary that can be written to in the given slot.
415 */
416 static int get_hw_temp(struct r300_pfs_compile_state *cs, int slot)
417 {
418 COMPILE_STATE;
419 int r;
420
421 for (r = 0; r < PFS_NUM_TEMP_REGS; ++r) {
422 if (cs->hwtemps[r].free >= 0 && cs->hwtemps[r].free <= slot)
423 break;
424 }
425
426 if (r >= PFS_NUM_TEMP_REGS) {
427 ERROR("Out of hardware temps\n");
428 return 0;
429 }
430 // Reserved is used to avoid the following scenario:
431 // R300 temporary X is first assigned to Mesa temporary Y during vector ops
432 // R300 temporary X is then assigned to Mesa temporary Z for further vector ops
433 // Then scalar ops on Mesa temporary Z are emitted and move back in time
434 // to overwrite the value of temporary Y.
435 // End scenario.
436 cs->hwtemps[r].reserved = cs->hwtemps[r].free;
437 cs->hwtemps[r].free = -1;
438
439 // Reset to some value that won't mess things up when the user
440 // tries to read from a temporary that hasn't been assigned a value yet.
441 // In the normal case, vector_valid and scalar_valid should be set to
442 // a sane value by the first emit that writes to this temporary.
443 cs->hwtemps[r].vector_valid = 0;
444 cs->hwtemps[r].scalar_valid = 0;
445
446 if (r > code->max_temp_idx)
447 code->max_temp_idx = r;
448
449 return r;
450 }
451
452 /**
453 * Get an R300 temporary that will act as a TEX destination register.
454 */
455 static int get_hw_temp_tex(struct r300_pfs_compile_state *cs)
456 {
457 COMPILE_STATE;
458 int r;
459
460 for (r = 0; r < PFS_NUM_TEMP_REGS; ++r) {
461 if (cs->used_in_node & (1 << r))
462 continue;
463
464 // Note: Be very careful here
465 if (cs->hwtemps[r].free >= 0 && cs->hwtemps[r].free <= 0)
466 break;
467 }
468
469 if (r >= PFS_NUM_TEMP_REGS)
470 return get_hw_temp(cs, 0); /* Will cause an indirection */
471
472 cs->hwtemps[r].reserved = cs->hwtemps[r].free;
473 cs->hwtemps[r].free = -1;
474
475 // Reset to some value that won't mess things up when the user
476 // tries to read from a temporary that hasn't been assigned a value yet.
477 // In the normal case, vector_valid and scalar_valid should be set to
478 // a sane value by the first emit that writes to this temporary.
479 cs->hwtemps[r].vector_valid = cs->nrslots;
480 cs->hwtemps[r].scalar_valid = cs->nrslots;
481
482 if (r > code->max_temp_idx)
483 code->max_temp_idx = r;
484
485 return r;
486 }
487
488 /**
489 * Mark the given hardware register as free.
490 */
491 static void free_hw_temp(struct r300_pfs_compile_state *cs, int idx)
492 {
493 // Be very careful here. Consider sequences like
494 // MAD r0, r1,r2,r3
495 // TEX r4, ...
496 // The TEX instruction may be moved in front of the MAD instruction
497 // due to the way nodes work. We don't want to alias r1 and r4 in
498 // this case.
499 // I'm certain the register allocation could be further sanitized,
500 // but it's tricky because of stuff that can happen inside emit_tex
501 // and emit_arith.
502 cs->hwtemps[idx].free = cs->nrslots + 1;
503 }
504
505 /**
506 * Create a new Mesa temporary register.
507 */
508 static GLuint get_temp_reg(struct r300_pfs_compile_state *cs)
509 {
510 COMPILE_STATE;
511 GLuint r = undef;
512 GLuint index;
513
514 index = ffs(~cs->temp_in_use);
515 if (!index) {
516 ERROR("Out of program temps\n");
517 return r;
518 }
519
520 cs->temp_in_use |= (1 << --index);
521 cs->temps[index].refcount = 0xFFFFFFFF;
522 cs->temps[index].reg = -1;
523
524 REG_SET_TYPE(r, REG_TYPE_TEMP);
525 REG_SET_INDEX(r, index);
526 REG_SET_VALID(r, GL_TRUE);
527 return r;
528 }
529
530 /**
531 * Free a Mesa temporary and the associated R300 temporary.
532 */
533 static void free_temp(struct r300_pfs_compile_state *cs, GLuint r)
534 {
535 GLuint index = REG_GET_INDEX(r);
536
537 if (!(cs->temp_in_use & (1 << index)))
538 return;
539
540 if (REG_GET_TYPE(r) == REG_TYPE_TEMP) {
541 free_hw_temp(cs, cs->temps[index].reg);
542 cs->temps[index].reg = -1;
543 cs->temp_in_use &= ~(1 << index);
544 } else if (REG_GET_TYPE(r) == REG_TYPE_INPUT) {
545 free_hw_temp(cs, cs->inputs[index].reg);
546 cs->inputs[index].reg = -1;
547 }
548 }
549
550 /**
551 * Emit a hardware constant/parameter.
552 *
553 * \p cp Stable pointer to an array of 4 floats.
554 * The pointer must be stable in the sense that it remains valid
555 * and holds the contents of the constant/parameter throughout the lifetime
556 * of the fragment program (actually, up until the next time the fragment
557 * program is translated).
558 */
559 static GLuint emit_const4fv(struct r300_pfs_compile_state *cs,
560 const GLfloat * cp)
561 {
562 COMPILE_STATE;
563 GLuint reg = undef;
564 int index;
565
566 for (index = 0; index < code->const_nr; ++index) {
567 if (code->constant[index] == cp)
568 break;
569 }
570
571 if (index >= code->const_nr) {
572 if (index >= PFS_NUM_CONST_REGS) {
573 ERROR("Out of hw constants!\n");
574 return reg;
575 }
576
577 code->const_nr++;
578 code->constant[index] = cp;
579 }
580
581 REG_SET_TYPE(reg, REG_TYPE_CONST);
582 REG_SET_INDEX(reg, index);
583 REG_SET_VALID(reg, GL_TRUE);
584 return reg;
585 }
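/* Typical callers (see t_src below) pass pointers into Mesa's parameter
 * arrays, e.g. Base.Parameters->ParameterValues[index], so the stored
 * pointer keeps tracking the current parameter value; the floats are
 * presumably only dereferenced when the constants are uploaded to the
 * hardware.
 */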
586
587 static inline GLuint negate(GLuint r)
588 {
589 REG_NEGS(r);
590 REG_NEGV(r);
591 return r;
592 }
593
594 /* Hack, to prevent clobbering sources used multiple times when
595 * emulating non-native instructions
596 */
597 static inline GLuint keep(GLuint r)
598 {
599 REG_SET_NO_USE(r, GL_TRUE);
600 return r;
601 }
602
603 static inline GLuint absolute(GLuint r)
604 {
605 REG_ABS(r);
606 return r;
607 }
608
609 static int swz_native(struct r300_pfs_compile_state *cs,
610 GLuint src, GLuint * r, GLuint arbneg)
611 {
612 COMPILE_STATE;
613
614 /* Native swizzle, handle negation */
615 src = (src & ~REG_NEGS_MASK) | (((arbneg >> 3) & 1) << REG_NEGS_SHIFT);
616
617 if ((arbneg & 0x7) == 0x0) {
618 src = src & ~REG_NEGV_MASK;
619 *r = src;
620 } else if ((arbneg & 0x7) == 0x7) {
621 src |= REG_NEGV_MASK;
622 *r = src;
623 } else {
624 if (!REG_GET_VALID(*r))
625 *r = get_temp_reg(cs);
626 src |= REG_NEGV_MASK;
627 emit_arith(cs,
628 PFS_OP_MAD,
629 *r, arbneg & 0x7, keep(src), pfs_one, pfs_zero, 0);
630 src = src & ~REG_NEGV_MASK;
631 emit_arith(cs,
632 PFS_OP_MAD,
633 *r,
634 (arbneg ^ 0x7) | WRITEMASK_W,
635 src, pfs_one, pfs_zero, 0);
636 }
637
638 return 3;
639 }
640
641 static int swz_emit_partial(struct r300_pfs_compile_state *cs,
642 GLuint src,
643 GLuint * r, int mask, int mc, GLuint arbneg)
644 {
645 COMPILE_STATE;
646 GLuint tmp;
647 GLuint wmask = 0;
648
649 if (!REG_GET_VALID(*r))
650 *r = get_temp_reg(cs);
651
652 /* A partial match, VSWZ/mask define what parts of the
653 * desired swizzle we match
654 */
655 if (mc + s_mask[mask].count == 3) {
656 wmask = WRITEMASK_W;
657 src |= ((arbneg >> 3) & 1) << REG_NEGS_SHIFT;
658 }
659
660 tmp = arbneg & s_mask[mask].mask;
661 if (tmp) {
662 tmp = tmp ^ s_mask[mask].mask;
663 if (tmp) {
664 emit_arith(cs,
665 PFS_OP_MAD,
666 *r,
667 arbneg & s_mask[mask].mask,
668 keep(src) | REG_NEGV_MASK,
669 pfs_one, pfs_zero, 0);
670 if (!wmask) {
671 REG_SET_NO_USE(src, GL_TRUE);
672 } else {
673 REG_SET_NO_USE(src, GL_FALSE);
674 }
675 emit_arith(cs,
676 PFS_OP_MAD,
677 *r, tmp | wmask, src, pfs_one, pfs_zero, 0);
678 } else {
679 if (!wmask) {
680 REG_SET_NO_USE(src, GL_TRUE);
681 } else {
682 REG_SET_NO_USE(src, GL_FALSE);
683 }
684 emit_arith(cs,
685 PFS_OP_MAD,
686 *r,
687 (arbneg & s_mask[mask].mask) | wmask,
688 src | REG_NEGV_MASK, pfs_one, pfs_zero, 0);
689 }
690 } else {
691 if (!wmask) {
692 REG_SET_NO_USE(src, GL_TRUE);
693 } else {
694 REG_SET_NO_USE(src, GL_FALSE);
695 }
696 emit_arith(cs, PFS_OP_MAD,
697 *r,
698 s_mask[mask].mask | wmask,
699 src, pfs_one, pfs_zero, 0);
700 }
701
702 return s_mask[mask].count;
703 }
704
705 static GLuint do_swizzle(struct r300_pfs_compile_state *cs,
706 GLuint src, GLuint arbswz, GLuint arbneg)
707 {
708 COMPILE_STATE;
709 GLuint r = undef;
710 GLuint vswz;
711 int c_mask = 0;
712 int v_match = 0;
713
714 /* If swizzling from something without an XYZW native swizzle,
715 * emit result to a temp, and do new swizzle from the temp.
716 */
717 #if 0
718 if (REG_GET_VSWZ(src) != SWIZZLE_XYZ || REG_GET_SSWZ(src) != SWIZZLE_W) {
719 GLuint temp = get_temp_reg(fp);
720 emit_arith(fp,
721 PFS_OP_MAD,
722 temp, WRITEMASK_XYZW, src, pfs_one, pfs_zero, 0);
723 src = temp;
724 }
725 #endif
726
727 if (REG_GET_VSWZ(src) != SWIZZLE_XYZ || REG_GET_SSWZ(src) != SWIZZLE_W) {
728 GLuint vsrcswz =
729 (v_swiz[REG_GET_VSWZ(src)].
730 hash & (SWZ_X_MASK | SWZ_Y_MASK | SWZ_Z_MASK)) |
731 REG_GET_SSWZ(src) << 9;
732 GLint i;
733
734 GLuint newswz = 0;
735 GLuint offset;
736 for (i = 0; i < 4; ++i) {
737 offset = GET_SWZ(arbswz, i);
738
739 newswz |=
740 (offset <= 3) ? GET_SWZ(vsrcswz,
741 offset) << i *
742 3 : offset << i * 3;
743 }
744
745 arbswz = newswz & (SWZ_X_MASK | SWZ_Y_MASK | SWZ_Z_MASK);
746 REG_SET_SSWZ(src, GET_SWZ(newswz, 3));
747 } else {
748 /* set scalar swizzling */
749 REG_SET_SSWZ(src, GET_SWZ(arbswz, 3));
750
751 }
752 do {
753 vswz = REG_GET_VSWZ(src);
754 do {
755 int chash;
756
757 REG_SET_VSWZ(src, vswz);
758 chash = v_swiz[REG_GET_VSWZ(src)].hash &
759 s_mask[c_mask].hash;
760
761 if (chash == (arbswz & s_mask[c_mask].hash)) {
762 if (s_mask[c_mask].count == 3) {
763 v_match += swz_native(cs,
764 src, &r, arbneg);
765 } else {
766 v_match += swz_emit_partial(cs,
767 src,
768 &r,
769 c_mask,
770 v_match,
771 arbneg);
772 }
773
774 if (v_match == 3)
775 return r;
776
777 /* Fill with something invalid: all 0's was
778 * wrong before because it matched SWIZZLE_X,
779 * so all 1's will do for now
780 */
781 arbswz |= (PFS_INVAL & s_mask[c_mask].hash);
782 }
783 } while (v_swiz[++vswz].hash != PFS_INVAL);
784 REG_SET_VSWZ(src, SWIZZLE_XYZ);
785 } while (s_mask[++c_mask].hash != PFS_INVAL);
786
787 ERROR("should NEVER get here\n");
788 return r;
789 }
790
791 static GLuint t_src(struct r300_pfs_compile_state *cs,
792 struct prog_src_register fpsrc)
793 {
794 COMPILE_STATE;
795 GLuint r = undef;
796
797 switch (fpsrc.File) {
798 case PROGRAM_TEMPORARY:
799 REG_SET_INDEX(r, fpsrc.Index);
800 REG_SET_VALID(r, GL_TRUE);
801 REG_SET_TYPE(r, REG_TYPE_TEMP);
802 break;
803 case PROGRAM_INPUT:
804 REG_SET_INDEX(r, fpsrc.Index);
805 REG_SET_VALID(r, GL_TRUE);
806 REG_SET_TYPE(r, REG_TYPE_INPUT);
807 break;
808 case PROGRAM_LOCAL_PARAM:
809 r = emit_const4fv(cs,
810 fp->mesa_program.Base.LocalParams[fpsrc.
811 Index]);
812 break;
813 case PROGRAM_ENV_PARAM:
814 r = emit_const4fv(cs,
815 cs->compiler->r300->radeon.glCtx->FragmentProgram.Parameters[fpsrc.Index]);
816 break;
817 case PROGRAM_STATE_VAR:
818 case PROGRAM_NAMED_PARAM:
819 case PROGRAM_CONSTANT:
820 r = emit_const4fv(cs,
821 fp->mesa_program.Base.Parameters->
822 ParameterValues[fpsrc.Index]);
823 break;
824 case PROGRAM_BUILTIN:
825 switch(fpsrc.Swizzle) {
826 case SWIZZLE_1111: r = pfs_one; break;
827 case SWIZZLE_0000: r = pfs_zero; break;
828 default:
829 ERROR("bad PROGRAM_BUILTIN swizzle %u\n", fpsrc.Swizzle);
830 break;
831 }
832 break;
833 default:
834 ERROR("unknown SrcReg->File %x\n", fpsrc.File);
835 return r;
836 }
837
838 /* no point swizzling ONE/ZERO/HALF constants... */
839 if (REG_GET_VSWZ(r) < SWIZZLE_111 || REG_GET_SSWZ(r) < SWIZZLE_ZERO)
840 r = do_swizzle(cs, r, fpsrc.Swizzle, fpsrc.NegateBase);
841 if (fpsrc.Abs)
842 r = absolute(r);
843 if (fpsrc.NegateAbs)
844 r = negate(r);
845 return r;
846 }
847
848 static GLuint t_scalar_src(struct r300_pfs_compile_state *cs,
849 struct prog_src_register fpsrc)
850 {
851 struct prog_src_register src = fpsrc;
852 int sc = GET_SWZ(fpsrc.Swizzle, 0); /* X */
853
854 src.Swizzle = ((sc << 0) | (sc << 3) | (sc << 6) | (sc << 9));
855
856 return t_src(cs, src);
857 }
858
859 static GLuint t_dst(struct r300_pfs_compile_state *cs,
860 struct prog_dst_register dest)
861 {
862 COMPILE_STATE;
863 GLuint r = undef;
864
865 switch (dest.File) {
866 case PROGRAM_TEMPORARY:
867 REG_SET_INDEX(r, dest.Index);
868 REG_SET_VALID(r, GL_TRUE);
869 REG_SET_TYPE(r, REG_TYPE_TEMP);
870 return r;
871 case PROGRAM_OUTPUT:
872 REG_SET_TYPE(r, REG_TYPE_OUTPUT);
873 switch (dest.Index) {
874 case FRAG_RESULT_COLR:
875 case FRAG_RESULT_DEPR:
876 REG_SET_INDEX(r, dest.Index);
877 REG_SET_VALID(r, GL_TRUE);
878 return r;
879 default:
880 ERROR("Bad DstReg->Index 0x%x\n", dest.Index);
881 return r;
882 }
883 default:
884 ERROR("Bad DstReg->File 0x%x\n", dest.File);
885 return r;
886 }
887 }
888
889 static int t_hw_src(struct r300_pfs_compile_state *cs, GLuint src, GLboolean tex)
890 {
891 COMPILE_STATE;
892 int idx;
893 int index = REG_GET_INDEX(src);
894
895 switch (REG_GET_TYPE(src)) {
896 case REG_TYPE_TEMP:
897 /* NOTE: if reg==-1 here, a source is being read that
898 * hasn't been written to. Undefined results.
899 */
900 if (cs->temps[index].reg == -1)
901 cs->temps[index].reg = get_hw_temp(cs, cs->nrslots);
902
903 idx = cs->temps[index].reg;
904
905 if (!REG_GET_NO_USE(src) && (--cs->temps[index].refcount == 0))
906 free_temp(cs, src);
907 break;
908 case REG_TYPE_INPUT:
909 idx = cs->inputs[index].reg;
910
911 if (!REG_GET_NO_USE(src) && (--cs->inputs[index].refcount == 0))
912 free_hw_temp(cs, cs->inputs[index].reg);
913 break;
914 case REG_TYPE_CONST:
915 return (index | SRC_CONST);
916 default:
917 ERROR("Invalid type for source reg\n");
918 return (0 | SRC_CONST);
919 }
920
921 if (!tex)
922 cs->used_in_node |= (1 << idx);
923
924 return idx;
925 }
926
927 static int t_hw_dst(struct r300_pfs_compile_state *cs,
928 GLuint dest, GLboolean tex, int slot)
929 {
930 COMPILE_STATE;
931 int idx;
932 GLuint index = REG_GET_INDEX(dest);
933 assert(REG_GET_VALID(dest));
934
935 switch (REG_GET_TYPE(dest)) {
936 case REG_TYPE_TEMP:
937 if (cs->temps[REG_GET_INDEX(dest)].reg == -1) {
938 if (!tex) {
939 cs->temps[index].reg = get_hw_temp(cs, slot);
940 } else {
941 cs->temps[index].reg = get_hw_temp_tex(cs);
942 }
943 }
944 idx = cs->temps[index].reg;
945
946 if (!REG_GET_NO_USE(dest) && (--cs->temps[index].refcount == 0))
947 free_temp(cs, dest);
948
949 cs->dest_in_node |= (1 << idx);
950 cs->used_in_node |= (1 << idx);
951 break;
952 case REG_TYPE_OUTPUT:
953 switch (index) {
954 case FRAG_RESULT_COLR:
955 code->node[code->cur_node].flags |= R300_RGBA_OUT;
956 break;
957 case FRAG_RESULT_DEPR:
958 fp->WritesDepth = GL_TRUE;
959 code->node[code->cur_node].flags |= R300_W_OUT;
960 break;
961 }
962 return index;
963 break;
964 default:
965 ERROR("invalid dest reg type %d\n", REG_GET_TYPE(dest));
966 return 0;
967 }
968
969 return idx;
970 }
971
972 static void emit_nop(struct r300_pfs_compile_state *cs)
973 {
974 COMPILE_STATE;
975
976 if (cs->nrslots >= PFS_MAX_ALU_INST) {
977 ERROR("Out of ALU instruction slots\n");
978 return;
979 }
980
981 code->alu.inst[cs->nrslots].inst0 = NOP_INST0;
982 code->alu.inst[cs->nrslots].inst1 = NOP_INST1;
983 code->alu.inst[cs->nrslots].inst2 = NOP_INST2;
984 code->alu.inst[cs->nrslots].inst3 = NOP_INST3;
985 cs->nrslots++;
986 }
987
988 static void emit_tex(struct r300_pfs_compile_state *cs,
989 struct prog_instruction *fpi, int opcode)
990 {
991 COMPILE_STATE;
992 GLuint coord = t_src(cs, fpi->SrcReg[0]);
993 GLuint dest = undef;
994 GLuint din, uin;
995 int unit = fpi->TexSrcUnit;
996 int hwsrc, hwdest;
997
998 /* Ensure correct node indirection */
999 uin = cs->used_in_node;
1000 din = cs->dest_in_node;
1001
1002 /* Resolve source/dest to hardware registers */
1003 hwsrc = t_hw_src(cs, coord, GL_TRUE);
1004
1005 if (opcode != R300_TEX_OP_KIL) {
1006 dest = t_dst(cs, fpi->DstReg);
1007
1008 hwdest =
1009 t_hw_dst(cs, dest, GL_TRUE,
1010 code->node[code->cur_node].alu_offset);
1011
1012 /* Use a temp that hasn't been used in this node, rather
1013 * than causing an indirection
1014 */
1015 if (uin & (1 << hwdest)) {
1016 free_hw_temp(cs, hwdest);
1017 hwdest = get_hw_temp_tex(cs);
1018 cs->temps[REG_GET_INDEX(dest)].reg = hwdest;
1019 }
1020 } else {
1021 hwdest = 0;
1022 unit = 0;
1023 }
1024
1025 /* Indirection if source has been written in this node, or if the
1026 * dest has been read/written in this node
1027 */
1028 if ((REG_GET_TYPE(coord) != REG_TYPE_CONST &&
1029 (din & (1 << hwsrc))) || (uin & (1 << hwdest))) {
1030
1031 /* Finish off current node */
1032 if (code->node[code->cur_node].alu_offset == cs->nrslots)
1033 emit_nop(cs);
1034
1035 code->node[code->cur_node].alu_end =
1036 cs->nrslots - code->node[code->cur_node].alu_offset - 1;
1037 assert(code->node[code->cur_node].alu_end >= 0);
1038
1039 if (++code->cur_node >= PFS_MAX_TEX_INDIRECT) {
1040 ERROR("too many levels of texture indirection\n");
1041 return;
1042 }
1043
1044 /* Start new node */
1045 code->node[code->cur_node].tex_offset = code->tex.length;
1046 code->node[code->cur_node].alu_offset = cs->nrslots;
1047 code->node[code->cur_node].tex_end = -1;
1048 code->node[code->cur_node].alu_end = -1;
1049 code->node[code->cur_node].flags = 0;
1050 cs->used_in_node = 0;
1051 cs->dest_in_node = 0;
1052 }
1053
1054 if (code->cur_node == 0)
1055 code->first_node_has_tex = 1;
1056
1057 code->tex.inst[code->tex.length++] = 0 | (hwsrc << R300_SRC_ADDR_SHIFT)
1058 | (hwdest << R300_DST_ADDR_SHIFT)
1059 | (unit << R300_TEX_ID_SHIFT)
1060 | (opcode << R300_TEX_INST_SHIFT);
1061
1062 cs->dest_in_node |= (1 << hwdest);
1063 if (REG_GET_TYPE(coord) != REG_TYPE_CONST)
1064 cs->used_in_node |= (1 << hwsrc);
1065
1066 code->node[code->cur_node].tex_end++;
1067 }
1068
1069 /**
1070 * Returns the first slot where we could possibly allow writing to dest,
1071 * according to register allocation.
1072 */
1073 static int get_earliest_allowed_write(struct r300_pfs_compile_state *cs,
1074 GLuint dest, int mask)
1075 {
1076 COMPILE_STATE;
1077 int idx;
1078 int pos;
1079 GLuint index = REG_GET_INDEX(dest);
1080 assert(REG_GET_VALID(dest));
1081
1082 switch (REG_GET_TYPE(dest)) {
1083 case REG_TYPE_TEMP:
1084 if (cs->temps[index].reg == -1)
1085 return 0;
1086
1087 idx = cs->temps[index].reg;
1088 break;
1089 case REG_TYPE_OUTPUT:
1090 return 0;
1091 default:
1092 ERROR("invalid dest reg type %d\n", REG_GET_TYPE(dest));
1093 return 0;
1094 }
1095
1096 pos = cs->hwtemps[idx].reserved;
1097 if (mask & WRITEMASK_XYZ) {
1098 if (pos < cs->hwtemps[idx].vector_lastread)
1099 pos = cs->hwtemps[idx].vector_lastread;
1100 }
1101 if (mask & WRITEMASK_W) {
1102 if (pos < cs->hwtemps[idx].scalar_lastread)
1103 pos = cs->hwtemps[idx].scalar_lastread;
1104 }
1105
1106 return pos;
1107 }
1108
1109 /**
1110 * Allocates a slot for an ALU instruction that can consist of
1111 * a vector part or a scalar part or both.
1112 *
1113 * Sources from src (src[0] to src[argc-1]) are added to the slot in the
1114 * appropriate position (vector and/or scalar), and their positions are
1115 * recorded in the srcpos array.
1116 *
1117 * This function emits instruction code for the source fetch and the
1118 * argument selection. It does not emit instruction code for the
1119 * opcode or the destination selection.
1120 *
1121 * @return the index of the slot
1122 */
1123 static int find_and_prepare_slot(struct r300_pfs_compile_state *cs,
1124 GLboolean emit_vop,
1125 GLboolean emit_sop,
1126 int argc, GLuint * src, GLuint dest, int mask)
1127 {
1128 COMPILE_STATE;
1129 int hwsrc[3];
1130 int srcpos[3];
1131 unsigned int used;
1132 int tempused;
1133 int tempvsrc[3];
1134 int tempssrc[3];
1135 int pos;
1136 int regnr;
1137 int i, j;
1138
1139 // Determine instruction slots, whether sources are required on
1140 // vector or scalar side, and the smallest slot number where
1141 // all source registers are available
1142 used = 0;
1143 if (emit_vop)
1144 used |= SLOT_OP_VECTOR;
1145 if (emit_sop)
1146 used |= SLOT_OP_SCALAR;
1147
1148 pos = get_earliest_allowed_write(cs, dest, mask);
1149
1150 if (code->node[code->cur_node].alu_offset > pos)
1151 pos = code->node[code->cur_node].alu_offset;
1152 for (i = 0; i < argc; ++i) {
1153 if (!REG_GET_BUILTIN(src[i])) {
1154 if (emit_vop)
1155 used |= v_swiz[REG_GET_VSWZ(src[i])].flags << i;
1156 if (emit_sop)
1157 used |= s_swiz[REG_GET_SSWZ(src[i])].flags << i;
1158 }
1159
1160 hwsrc[i] = t_hw_src(cs, src[i], GL_FALSE); /* Note: sideeffects wrt refcounting! */
1161 regnr = hwsrc[i] & 31;
1162
1163 if (REG_GET_TYPE(src[i]) == REG_TYPE_TEMP) {
1164 if (used & (SLOT_SRC_VECTOR << i)) {
1165 if (cs->hwtemps[regnr].vector_valid > pos)
1166 pos = cs->hwtemps[regnr].vector_valid;
1167 }
1168 if (used & (SLOT_SRC_SCALAR << i)) {
1169 if (cs->hwtemps[regnr].scalar_valid > pos)
1170 pos = cs->hwtemps[regnr].scalar_valid;
1171 }
1172 }
1173 }
1174
1175 // Find a slot that fits
1176 for (;; ++pos) {
1177 if (cs->slot[pos].used & used & SLOT_OP_BOTH)
1178 continue;
1179
1180 if (pos >= cs->nrslots) {
1181 if (cs->nrslots >= PFS_MAX_ALU_INST) {
1182 ERROR("Out of ALU instruction slots\n");
1183 return -1;
1184 }
1185
1186 code->alu.inst[pos].inst0 = NOP_INST0;
1187 code->alu.inst[pos].inst1 = NOP_INST1;
1188 code->alu.inst[pos].inst2 = NOP_INST2;
1189 code->alu.inst[pos].inst3 = NOP_INST3;
1190
1191 cs->nrslots++;
1192 }
1193 // Note: When we need both parts (vector and scalar) of a source,
1194 // we always try to put them into the same position. This makes the
1195 // code easier to read, and it is optimal (i.e. one doesn't gain
1196 // anything by splitting the parts).
1197 // It also avoids headaches with swizzles that access both parts (e.g. WXY)
1198 tempused = cs->slot[pos].used;
1199 for (i = 0; i < 3; ++i) {
1200 tempvsrc[i] = cs->slot[pos].vsrc[i];
1201 tempssrc[i] = cs->slot[pos].ssrc[i];
1202 }
1203
1204 for (i = 0; i < argc; ++i) {
1205 int flags = (used >> i) & SLOT_SRC_BOTH;
1206
1207 if (!flags) {
1208 srcpos[i] = 0;
1209 continue;
1210 }
1211
1212 for (j = 0; j < 3; ++j) {
1213 if ((tempused >> j) & flags & SLOT_SRC_VECTOR) {
1214 if (tempvsrc[j] != hwsrc[i])
1215 continue;
1216 }
1217
1218 if ((tempused >> j) & flags & SLOT_SRC_SCALAR) {
1219 if (tempssrc[j] != hwsrc[i])
1220 continue;
1221 }
1222
1223 break;
1224 }
1225
1226 if (j == 3)
1227 break;
1228
1229 srcpos[i] = j;
1230 tempused |= flags << j;
1231 if (flags & SLOT_SRC_VECTOR)
1232 tempvsrc[j] = hwsrc[i];
1233 if (flags & SLOT_SRC_SCALAR)
1234 tempssrc[j] = hwsrc[i];
1235 }
1236
1237 if (i == argc)
1238 break;
1239 }
1240
1241 // Found a slot, reserve it
1242 cs->slot[pos].used = tempused | (used & SLOT_OP_BOTH);
1243 for (i = 0; i < 3; ++i) {
1244 cs->slot[pos].vsrc[i] = tempvsrc[i];
1245 cs->slot[pos].ssrc[i] = tempssrc[i];
1246 }
1247
1248 for (i = 0; i < argc; ++i) {
1249 if (REG_GET_TYPE(src[i]) == REG_TYPE_TEMP) {
1250 int regnr = hwsrc[i] & 31;
1251
1252 if (used & (SLOT_SRC_VECTOR << i)) {
1253 if (cs->hwtemps[regnr].vector_lastread < pos)
1254 cs->hwtemps[regnr].vector_lastread =
1255 pos;
1256 }
1257 if (used & (SLOT_SRC_SCALAR << i)) {
1258 if (cs->hwtemps[regnr].scalar_lastread < pos)
1259 cs->hwtemps[regnr].scalar_lastread =
1260 pos;
1261 }
1262 }
1263 }
1264
1265 // Emit the source fetch code
1266 code->alu.inst[pos].inst1 &= ~R300_ALU_SRC_MASK;
1267 code->alu.inst[pos].inst1 |=
1268 ((cs->slot[pos].vsrc[0] << R300_ALU_SRC0C_SHIFT) |
1269 (cs->slot[pos].vsrc[1] << R300_ALU_SRC1C_SHIFT) |
1270 (cs->slot[pos].vsrc[2] << R300_ALU_SRC2C_SHIFT));
1271
1272 code->alu.inst[pos].inst3 &= ~R300_ALU_SRC_MASK;
1273 code->alu.inst[pos].inst3 |=
1274 ((cs->slot[pos].ssrc[0] << R300_ALU_SRC0A_SHIFT) |
1275 (cs->slot[pos].ssrc[1] << R300_ALU_SRC1A_SHIFT) |
1276 (cs->slot[pos].ssrc[2] << R300_ALU_SRC2A_SHIFT));
1277
1278 // Emit the argument selection code
1279 if (emit_vop) {
1280 int swz[3];
1281
1282 for (i = 0; i < 3; ++i) {
1283 if (i < argc) {
1284 swz[i] = (v_swiz[REG_GET_VSWZ(src[i])].base +
1285 (srcpos[i] *
1286 v_swiz[REG_GET_VSWZ(src[i])].
1287 stride)) | ((src[i] & REG_NEGV_MASK)
1288 ? ARG_NEG : 0) | ((src[i]
1289 &
1290 REG_ABS_MASK)
1291 ?
1292 ARG_ABS
1293 : 0);
1294 } else {
1295 swz[i] = R300_ALU_ARGC_ZERO;
1296 }
1297 }
1298
1299 code->alu.inst[pos].inst0 &=
1300 ~(R300_ALU_ARG0C_MASK | R300_ALU_ARG1C_MASK |
1301 R300_ALU_ARG2C_MASK);
1302 code->alu.inst[pos].inst0 |=
1303 (swz[0] << R300_ALU_ARG0C_SHIFT) | (swz[1] <<
1304 R300_ALU_ARG1C_SHIFT)
1305 | (swz[2] << R300_ALU_ARG2C_SHIFT);
1306 }
1307
1308 if (emit_sop) {
1309 int swz[3];
1310
1311 for (i = 0; i < 3; ++i) {
1312 if (i < argc) {
1313 swz[i] = (s_swiz[REG_GET_SSWZ(src[i])].base +
1314 (srcpos[i] *
1315 s_swiz[REG_GET_SSWZ(src[i])].
1316 stride)) | ((src[i] & REG_NEGS_MASK)
1317 ? ARG_NEG : 0) | ((src[i]
1318 &
1319 REG_ABS_MASK)
1320 ?
1321 ARG_ABS
1322 : 0);
1323 } else {
1324 swz[i] = R300_ALU_ARGA_ZERO;
1325 }
1326 }
1327
1328 code->alu.inst[pos].inst2 &=
1329 ~(R300_ALU_ARG0A_MASK | R300_ALU_ARG1A_MASK |
1330 R300_ALU_ARG2A_MASK);
1331 code->alu.inst[pos].inst2 |=
1332 (swz[0] << R300_ALU_ARG0A_SHIFT) | (swz[1] <<
1333 R300_ALU_ARG1A_SHIFT)
1334 | (swz[2] << R300_ALU_ARG2A_SHIFT);
1335 }
1336
1337 return pos;
1338 }
1339
1340 /**
1341 * Append an ALU instruction to the instruction list.
1342 */
1343 static void emit_arith(struct r300_pfs_compile_state *cs,
1344 int op,
1345 GLuint dest,
1346 int mask,
1347 GLuint src0, GLuint src1, GLuint src2, int flags)
1348 {
1349 COMPILE_STATE;
1350 GLuint src[3] = { src0, src1, src2 };
1351 int hwdest;
1352 GLboolean emit_vop, emit_sop;
1353 int vop, sop, argc;
1354 int pos;
1355
1356 vop = r300_fpop[op].v_op;
1357 sop = r300_fpop[op].s_op;
1358 argc = r300_fpop[op].argc;
1359
1360 if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT &&
1361 REG_GET_INDEX(dest) == FRAG_RESULT_DEPR) {
1362 if (mask & WRITEMASK_Z) {
1363 mask = WRITEMASK_W;
1364 } else {
1365 return;
1366 }
1367 }
1368
1369 emit_vop = GL_FALSE;
1370 emit_sop = GL_FALSE;
1371 if ((mask & WRITEMASK_XYZ) || vop == R300_ALU_OUTC_DP3)
1372 emit_vop = GL_TRUE;
1373 if ((mask & WRITEMASK_W) || vop == R300_ALU_OUTC_REPL_ALPHA)
1374 emit_sop = GL_TRUE;
1375
1376 pos =
1377 find_and_prepare_slot(cs, emit_vop, emit_sop, argc, src, dest,
1378 mask);
1379 if (pos < 0)
1380 return;
1381
1382 hwdest = t_hw_dst(cs, dest, GL_FALSE, pos); /* Note: Side effects wrt register allocation */
1383
1384 if (flags & PFS_FLAG_SAT) {
1385 vop |= R300_ALU_OUTC_CLAMP;
1386 sop |= R300_ALU_OUTA_CLAMP;
1387 }
1388
1389 /* Throw the pieces together and get ALU/1 */
1390 if (emit_vop) {
1391 code->alu.inst[pos].inst0 |= vop;
1392
1393 code->alu.inst[pos].inst1 |= hwdest << R300_ALU_DSTC_SHIFT;
1394
1395 if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT) {
1396 if (REG_GET_INDEX(dest) == FRAG_RESULT_COLR) {
1397 code->alu.inst[pos].inst1 |=
1398 (mask & WRITEMASK_XYZ) <<
1399 R300_ALU_DSTC_OUTPUT_MASK_SHIFT;
1400 } else
1401 assert(0);
1402 } else {
1403 code->alu.inst[pos].inst1 |=
1404 (mask & WRITEMASK_XYZ) <<
1405 R300_ALU_DSTC_REG_MASK_SHIFT;
1406
1407 cs->hwtemps[hwdest].vector_valid = pos + 1;
1408 }
1409 }
1410
1411 /* And now ALU/3 */
1412 if (emit_sop) {
1413 code->alu.inst[pos].inst2 |= sop;
1414
1415 if (mask & WRITEMASK_W) {
1416 if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT) {
1417 if (REG_GET_INDEX(dest) == FRAG_RESULT_COLR) {
1418 code->alu.inst[pos].inst3 |=
1419 (hwdest << R300_ALU_DSTA_SHIFT) |
1420 R300_ALU_DSTA_OUTPUT;
1421 } else if (REG_GET_INDEX(dest) ==
1422 FRAG_RESULT_DEPR) {
1423 code->alu.inst[pos].inst3 |=
1424 R300_ALU_DSTA_DEPTH;
1425 } else
1426 assert(0);
1427 } else {
1428 code->alu.inst[pos].inst3 |=
1429 (hwdest << R300_ALU_DSTA_SHIFT) |
1430 R300_ALU_DSTA_REG;
1431
1432 cs->hwtemps[hwdest].scalar_valid = pos + 1;
1433 }
1434 }
1435 }
1436
1437 return;
1438 }
1439
1440 static GLfloat SinCosConsts[2][4] = {
1441 {
1442 1.273239545, // 4/PI
1443 -0.405284735, // -4/(PI*PI)
1444 3.141592654, // PI
1445 0.2225 // weight
1446 },
1447 {
1448 0.75,
1449 0.0,
1450 0.159154943, // 1/(2*PI)
1451 6.283185307 // 2*PI
1452 }
1453 };
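/* The parabolic approximation used by SIN/COS/SCS below is
 *
 *   y  = (4/PI)*x + (-4/(PI*PI))*x*|x|      for x in [-PI, PI]
 *   y' = 0.2225*(y*|y| - y) + y             (precision weighting)
 *
 * SinCosConsts[0] holds the two polynomial coefficients, PI and the
 * weight; SinCosConsts[1] holds the range-reduction helpers: the 0.75
 * phase offset used by COS, 1/(2*PI) and 2*PI.
 */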
1454
1455 /**
1456 * Emit a LIT instruction.
1457 * \p flags may be PFS_FLAG_SAT
1458 *
1459 * Definition of LIT (from ARB_fragment_program):
1460 * tmp = VectorLoad(op0);
1461 * if (tmp.x < 0) tmp.x = 0;
1462 * if (tmp.y < 0) tmp.y = 0;
1463 * if (tmp.w < -(128.0-epsilon)) tmp.w = -(128.0-epsilon);
1464 * else if (tmp.w > 128-epsilon) tmp.w = 128-epsilon;
1465 * result.x = 1.0;
1466 * result.y = tmp.x;
1467 * result.z = (tmp.x > 0) ? RoughApproxPower(tmp.y, tmp.w) : 0.0;
1468 * result.w = 1.0;
1469 *
1470 * The longest path of computation is the one leading to result.z,
1471 * consisting of 5 operations. This implementation of LIT takes
1472 * 5 slots. So unless there's some special undocumented opcode,
1473 * this implementation is potentially optimal. Unfortunately,
1474 * emit_arith is a bit too conservative because it doesn't understand
1475 * partial writes to the vector component.
1476 */
1477 static const GLfloat LitConst[4] =
1478 { 127.999999, 127.999999, 127.999999, -127.999999 };
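/* Approximate mapping of emit_lit() onto the five slots
 * (vector op / scalar op per slot):
 *   1: MAX.xy  clamp x,y to >= 0           MAX.w  clamp w against -128+eps
 *   2: MIN.z   clamp w against 128-eps     LG2.w  log2(tmp.y)
 *   3: MAD.y   copy (and saturate) tmp.x   MAD.w  log2(y) * clamped w
 *   4: MAD.x   result.x = 1.0              EX2.w  y^w via exp2
 *   5: CMP.z   select pow or 0 by sign     MAD.w  result.w = 1.0
 */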
1479
1480 static void emit_lit(struct r300_pfs_compile_state *cs,
1481 GLuint dest, int mask, GLuint src, int flags)
1482 {
1483 COMPILE_STATE;
1484 GLuint cnst;
1485 int needTemporary;
1486 GLuint temp;
1487
1488 cnst = emit_const4fv(cs, LitConst);
1489
1490 needTemporary = 0;
1491 if ((mask & WRITEMASK_XYZW) != WRITEMASK_XYZW) {
1492 needTemporary = 1;
1493 } else if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT) {
1494 // LIT is typically followed by DP3/DP4, so there's no point
1495 // in creating special code for this case
1496 needTemporary = 1;
1497 }
1498
1499 if (needTemporary) {
1500 temp = keep(get_temp_reg(cs));
1501 } else {
1502 temp = keep(dest);
1503 }
1504
1505 // Note: The order of emit_arith inside the slots is relevant,
1506 // because emit_arith only looks at scalar vs. vector when resolving
1507 // dependencies, and it does not consider individual vector components,
1508 // so swizzling between the two parts can create fake dependencies.
1509
1510 // First slot
1511 emit_arith(cs, PFS_OP_MAX, temp, WRITEMASK_XY,
1512 keep(src), pfs_zero, undef, 0);
1513 emit_arith(cs, PFS_OP_MAX, temp, WRITEMASK_W, src, cnst, undef, 0);
1514
1515 // Second slot
1516 emit_arith(cs, PFS_OP_MIN, temp, WRITEMASK_Z,
1517 swizzle(temp, W, W, W, W), cnst, undef, 0);
1518 emit_arith(cs, PFS_OP_LG2, temp, WRITEMASK_W,
1519 swizzle(temp, Y, Y, Y, Y), undef, undef, 0);
1520
1521 // Third slot
1522 // If desired, we saturate the y result here.
1523 // This does not affect the use as a condition variable in the CMP later
1524 emit_arith(cs, PFS_OP_MAD, temp, WRITEMASK_W,
1525 temp, swizzle(temp, Z, Z, Z, Z), pfs_zero, 0);
1526 emit_arith(cs, PFS_OP_MAD, temp, WRITEMASK_Y,
1527 swizzle(temp, X, X, X, X), pfs_one, pfs_zero, flags);
1528
1529 // Fourth slot
1530 emit_arith(cs, PFS_OP_MAD, temp, WRITEMASK_X,
1531 pfs_one, pfs_one, pfs_zero, 0);
1532 emit_arith(cs, PFS_OP_EX2, temp, WRITEMASK_W, temp, undef, undef, 0);
1533
1534 // Fifth slot
1535 emit_arith(cs, PFS_OP_CMP, temp, WRITEMASK_Z,
1536 pfs_zero, swizzle(temp, W, W, W, W),
1537 negate(swizzle(temp, Y, Y, Y, Y)), flags);
1538 emit_arith(cs, PFS_OP_MAD, temp, WRITEMASK_W, pfs_one, pfs_one,
1539 pfs_zero, 0);
1540
1541 if (needTemporary) {
1542 emit_arith(cs, PFS_OP_MAD, dest, mask,
1543 temp, pfs_one, pfs_zero, flags);
1544 free_temp(cs, temp);
1545 } else {
1546 // Decrease refcount of the destination
1547 t_hw_dst(cs, dest, GL_FALSE, cs->nrslots);
1548 }
1549 }
1550
1551 static void emit_instruction(struct r300_pfs_compile_state *cs, struct prog_instruction *fpi)
1552 {
1553 COMPILE_STATE;
1554 GLuint src[3], dest, temp[2];
1555 int flags, mask = 0;
1556 int const_sin[2];
1557
1558 if (fpi->SaturateMode == SATURATE_ZERO_ONE)
1559 flags = PFS_FLAG_SAT;
1560 else
1561 flags = 0;
1562
1563 if (fpi->Opcode != OPCODE_KIL) {
1564 dest = t_dst(cs, fpi->DstReg);
1565 mask = fpi->DstReg.WriteMask;
1566 }
1567
1568 switch (fpi->Opcode) {
1569 case OPCODE_ADD:
1570 src[0] = t_src(cs, fpi->SrcReg[0]);
1571 src[1] = t_src(cs, fpi->SrcReg[1]);
1572 emit_arith(cs, PFS_OP_MAD, dest, mask,
1573 src[0], pfs_one, src[1], flags);
1574 break;
1575 case OPCODE_CMP:
1576 src[0] = t_src(cs, fpi->SrcReg[0]);
1577 src[1] = t_src(cs, fpi->SrcReg[1]);
1578 src[2] = t_src(cs, fpi->SrcReg[2]);
1579 /* ARB_fp:   result.c = (src0.c < 0.0) ? src1.c : src2.c
1580 * r300 CMP: result.c = (src2.c < 0.0) ? src1.c : src0.c, hence src0/src2 are swapped below
1581 */
1582 emit_arith(cs, PFS_OP_CMP, dest, mask,
1583 src[2], src[1], src[0], flags);
1584 break;
1585 case OPCODE_COS:
1586 /*
1587 * cos using a parabola (see SIN):
1588 * cos(x):
1589 * x = (x/(2*PI))+0.75
1590 * x = frac(x)
1591 * x = (x*2*PI)-PI
1592 * result = sin(x)
1593 */
1594 temp[0] = get_temp_reg(cs);
1595 const_sin[0] = emit_const4fv(cs, SinCosConsts[0]);
1596 const_sin[1] = emit_const4fv(cs, SinCosConsts[1]);
1597 src[0] = t_scalar_src(cs, fpi->SrcReg[0]);
1598
1599 /* add 0.5*PI and do range reduction */
1600
1601 emit_arith(cs, PFS_OP_MAD, temp[0], WRITEMASK_X,
1602 swizzle(src[0], X, X, X, X),
1603 swizzle(const_sin[1], Z, Z, Z, Z),
1604 swizzle(const_sin[1], X, X, X, X), 0);
1605
1606 emit_arith(cs, PFS_OP_FRC, temp[0], WRITEMASK_X,
1607 swizzle(temp[0], X, X, X, X),
1608 undef, undef, 0);
1609
1610 emit_arith(cs, PFS_OP_MAD, temp[0], WRITEMASK_Z, swizzle(temp[0], X, X, X, X), swizzle(const_sin[1], W, W, W, W), //2*PI
1611 negate(swizzle(const_sin[0], Z, Z, Z, Z)), //-PI
1612 0);
1613
1614 /* SIN */
1615
1616 emit_arith(cs, PFS_OP_MAD, temp[0],
1617 WRITEMASK_X | WRITEMASK_Y, swizzle(temp[0],
1618 Z, Z, Z,
1619 Z),
1620 const_sin[0], pfs_zero, 0);
1621
1622 emit_arith(cs, PFS_OP_MAD, temp[0], WRITEMASK_X,
1623 swizzle(temp[0], Y, Y, Y, Y),
1624 absolute(swizzle(temp[0], Z, Z, Z, Z)),
1625 swizzle(temp[0], X, X, X, X), 0);
1626
1627 emit_arith(cs, PFS_OP_MAD, temp[0], WRITEMASK_Y,
1628 swizzle(temp[0], X, X, X, X),
1629 absolute(swizzle(temp[0], X, X, X, X)),
1630 negate(swizzle(temp[0], X, X, X, X)), 0);
1631
1632 emit_arith(cs, PFS_OP_MAD, dest, mask,
1633 swizzle(temp[0], Y, Y, Y, Y),
1634 swizzle(const_sin[0], W, W, W, W),
1635 swizzle(temp[0], X, X, X, X), flags);
1636
1637 free_temp(cs, temp[0]);
1638 break;
1639 case OPCODE_DP3:
1640 src[0] = t_src(cs, fpi->SrcReg[0]);
1641 src[1] = t_src(cs, fpi->SrcReg[1]);
1642 emit_arith(cs, PFS_OP_DP3, dest, mask,
1643 src[0], src[1], undef, flags);
1644 break;
1645 case OPCODE_DP4:
1646 src[0] = t_src(cs, fpi->SrcReg[0]);
1647 src[1] = t_src(cs, fpi->SrcReg[1]);
1648 emit_arith(cs, PFS_OP_DP4, dest, mask,
1649 src[0], src[1], undef, flags);
1650 break;
1651 case OPCODE_DST:
1652 src[0] = t_src(cs, fpi->SrcReg[0]);
1653 src[1] = t_src(cs, fpi->SrcReg[1]);
1654 /* dest.y = src0.y * src1.y */
1655 if (mask & WRITEMASK_Y)
1656 emit_arith(cs, PFS_OP_MAD, dest, WRITEMASK_Y,
1657 keep(src[0]), keep(src[1]),
1658 pfs_zero, flags);
1659 /* dest.z = src0.z */
1660 if (mask & WRITEMASK_Z)
1661 emit_arith(cs, PFS_OP_MAD, dest, WRITEMASK_Z,
1662 src[0], pfs_one, pfs_zero, flags);
1663 /* result.x = 1.0
1664 * result.w = src1.w */
1665 if (mask & WRITEMASK_XW) {
1666 REG_SET_VSWZ(src[1], SWIZZLE_111); /*Cheat */
1667 emit_arith(cs, PFS_OP_MAD, dest,
1668 mask & WRITEMASK_XW,
1669 src[1], pfs_one, pfs_zero, flags);
1670 }
1671 break;
1672 case OPCODE_EX2:
1673 src[0] = t_scalar_src(cs, fpi->SrcReg[0]);
1674 emit_arith(cs, PFS_OP_EX2, dest, mask,
1675 src[0], undef, undef, flags);
1676 break;
1677 case OPCODE_FRC:
1678 src[0] = t_src(cs, fpi->SrcReg[0]);
1679 emit_arith(cs, PFS_OP_FRC, dest, mask,
1680 src[0], undef, undef, flags);
1681 break;
1682 case OPCODE_KIL:
1683 emit_tex(cs, fpi, R300_TEX_OP_KIL);
1684 break;
1685 case OPCODE_LG2:
1686 src[0] = t_scalar_src(cs, fpi->SrcReg[0]);
1687 emit_arith(cs, PFS_OP_LG2, dest, mask,
1688 src[0], undef, undef, flags);
1689 break;
1690 case OPCODE_LIT:
1691 src[0] = t_src(cs, fpi->SrcReg[0]);
1692 emit_lit(cs, dest, mask, src[0], flags);
1693 break;
1694 case OPCODE_LRP:
1695 src[0] = t_src(cs, fpi->SrcReg[0]);
1696 src[1] = t_src(cs, fpi->SrcReg[1]);
1697 src[2] = t_src(cs, fpi->SrcReg[2]);
1698 /* result = tmp0*tmp1 + (1 - tmp0)*tmp2
1699 * = tmp0*tmp1 + tmp2 + (-tmp0)*tmp2
1700 * MAD temp, -tmp0, tmp2, tmp2
1701 * MAD result, tmp0, tmp1, temp
1702 */
1703 temp[0] = get_temp_reg(cs);
1704 emit_arith(cs, PFS_OP_MAD, temp[0], mask,
1705 negate(keep(src[0])), keep(src[2]), src[2],
1706 0);
1707 emit_arith(cs, PFS_OP_MAD, dest, mask,
1708 src[0], src[1], temp[0], flags);
1709 free_temp(cs, temp[0]);
1710 break;
1711 case OPCODE_MAD:
1712 src[0] = t_src(cs, fpi->SrcReg[0]);
1713 src[1] = t_src(cs, fpi->SrcReg[1]);
1714 src[2] = t_src(cs, fpi->SrcReg[2]);
1715 emit_arith(cs, PFS_OP_MAD, dest, mask,
1716 src[0], src[1], src[2], flags);
1717 break;
1718 case OPCODE_MAX:
1719 src[0] = t_src(cs, fpi->SrcReg[0]);
1720 src[1] = t_src(cs, fpi->SrcReg[1]);
1721 emit_arith(cs, PFS_OP_MAX, dest, mask,
1722 src[0], src[1], undef, flags);
1723 break;
1724 case OPCODE_MIN:
1725 src[0] = t_src(cs, fpi->SrcReg[0]);
1726 src[1] = t_src(cs, fpi->SrcReg[1]);
1727 emit_arith(cs, PFS_OP_MIN, dest, mask,
1728 src[0], src[1], undef, flags);
1729 break;
1730 case OPCODE_MOV:
1731 src[0] = t_src(cs, fpi->SrcReg[0]);
1732 emit_arith(cs, PFS_OP_MAD, dest, mask,
1733 src[0], pfs_one, pfs_zero, flags);
1734 break;
1735 case OPCODE_MUL:
1736 src[0] = t_src(cs, fpi->SrcReg[0]);
1737 src[1] = t_src(cs, fpi->SrcReg[1]);
1738 emit_arith(cs, PFS_OP_MAD, dest, mask,
1739 src[0], src[1], pfs_zero, flags);
1740 break;
1741 case OPCODE_RCP:
1742 src[0] = t_scalar_src(cs, fpi->SrcReg[0]);
1743 emit_arith(cs, PFS_OP_RCP, dest, mask,
1744 src[0], undef, undef, flags);
1745 break;
1746 case OPCODE_RSQ:
1747 src[0] = t_scalar_src(cs, fpi->SrcReg[0]);
1748 emit_arith(cs, PFS_OP_RSQ, dest, mask,
1749 absolute(src[0]), pfs_zero, pfs_zero, flags);
1750 break;
1751 case OPCODE_SCS:
1752 /*
1753 * scs using a parabola:
1754 * scs(x):
1755 * result.x = sin(-abs(x)+0.5*PI) (cos)
1756 * result.y = sin(x) (sin)
1757 *
1758 */
1759 temp[0] = get_temp_reg(cs);
1760 temp[1] = get_temp_reg(cs);
1761 const_sin[0] = emit_const4fv(cs, SinCosConsts[0]);
1762 const_sin[1] = emit_const4fv(cs, SinCosConsts[1]);
1763 src[0] = t_scalar_src(cs, fpi->SrcReg[0]);
1764
1765 /* x = -abs(x)+0.5*PI */
1766 emit_arith(cs, PFS_OP_MAD, temp[0], WRITEMASK_Z, swizzle(const_sin[0], Z, Z, Z, Z), //PI
1767 pfs_half,
1768 negate(abs
1769 (swizzle(keep(src[0]), X, X, X, X))),
1770 0);
1771
1772 /* C*x (sin) */
1773 emit_arith(cs, PFS_OP_MAD, temp[0], WRITEMASK_W,
1774 swizzle(const_sin[0], Y, Y, Y, Y),
1775 swizzle(keep(src[0]), X, X, X, X),
1776 pfs_zero, 0);
1777
1778 /* B*x, C*x (cos) */
1779 emit_arith(cs, PFS_OP_MAD, temp[0],
1780 WRITEMASK_X | WRITEMASK_Y, swizzle(temp[0],
1781 Z, Z, Z,
1782 Z),
1783 const_sin[0], pfs_zero, 0);
1784
1785 /* B*x (sin) */
1786 emit_arith(cs, PFS_OP_MAD, temp[1], WRITEMASK_W,
1787 swizzle(const_sin[0], X, X, X, X),
1788 keep(src[0]), pfs_zero, 0);
1789
1790 /* y = B*x + C*x*abs(x) (sin) */
1791 emit_arith(cs, PFS_OP_MAD, temp[1], WRITEMASK_Z,
1792 absolute(src[0]),
1793 swizzle(temp[0], W, W, W, W),
1794 swizzle(temp[1], W, W, W, W), 0);
1795
1796 /* y = B*x + C*x*abs(x) (cos) */
1797 emit_arith(cs, PFS_OP_MAD, temp[1], WRITEMASK_W,
1798 swizzle(temp[0], Y, Y, Y, Y),
1799 absolute(swizzle(temp[0], Z, Z, Z, Z)),
1800 swizzle(temp[0], X, X, X, X), 0);
1801
1802 /* y*abs(y) - y (cos), y*abs(y) - y (sin) */
1803 emit_arith(cs, PFS_OP_MAD, temp[0],
1804 WRITEMASK_X | WRITEMASK_Y, swizzle(temp[1],
1805 W, Z, Y,
1806 X),
1807 absolute(swizzle(temp[1], W, Z, Y, X)),
1808 negate(swizzle(temp[1], W, Z, Y, X)), 0);
1809
1810 /* dest.xy = mad(temp.xy, P, temp2.wz) */
1811 emit_arith(cs, PFS_OP_MAD, dest,
1812 mask & (WRITEMASK_X | WRITEMASK_Y), temp[0],
1813 swizzle(const_sin[0], W, W, W, W),
1814 swizzle(temp[1], W, Z, Y, X), flags);
1815
1816 free_temp(cs, temp[0]);
1817 free_temp(cs, temp[1]);
1818 break;
1819 case OPCODE_SIN:
1820 /*
1821 * using a parabola:
1822 * sin(x) = 4/pi * x + -4/(pi*pi) * x * abs(x)
1823 * extra precision is obtained by weighting against
1824 * itself squared.
1825 */
1826
1827 temp[0] = get_temp_reg(cs);
1828 const_sin[0] = emit_const4fv(cs, SinCosConsts[0]);
1829 const_sin[1] = emit_const4fv(cs, SinCosConsts[1]);
1830 src[0] = t_scalar_src(cs, fpi->SrcReg[0]);
1831
1832 /* do range reduction */
1833
1834 emit_arith(cs, PFS_OP_MAD, temp[0], WRITEMASK_X,
1835 swizzle(keep(src[0]), X, X, X, X),
1836 swizzle(const_sin[1], Z, Z, Z, Z),
1837 pfs_half, 0);
1838
1839 emit_arith(cs, PFS_OP_FRC, temp[0], WRITEMASK_X,
1840 swizzle(temp[0], X, X, X, X),
1841 undef, undef, 0);
1842
1843 emit_arith(cs, PFS_OP_MAD, temp[0], WRITEMASK_Z, swizzle(temp[0], X, X, X, X), swizzle(const_sin[1], W, W, W, W), //2*PI
1844 negate(swizzle(const_sin[0], Z, Z, Z, Z)), //-PI
1845 0);
1846
1847 /* SIN */
1848
1849 emit_arith(cs, PFS_OP_MAD, temp[0],
1850 WRITEMASK_X | WRITEMASK_Y, swizzle(temp[0],
1851 Z, Z, Z,
1852 Z),
1853 const_sin[0], pfs_zero, 0);
1854
1855 emit_arith(cs, PFS_OP_MAD, temp[0], WRITEMASK_X,
1856 swizzle(temp[0], Y, Y, Y, Y),
1857 absolute(swizzle(temp[0], Z, Z, Z, Z)),
1858 swizzle(temp[0], X, X, X, X), 0);
1859
1860 emit_arith(cs, PFS_OP_MAD, temp[0], WRITEMASK_Y,
1861 swizzle(temp[0], X, X, X, X),
1862 absolute(swizzle(temp[0], X, X, X, X)),
1863 negate(swizzle(temp[0], X, X, X, X)), 0);
1864
1865 emit_arith(cs, PFS_OP_MAD, dest, mask,
1866 swizzle(temp[0], Y, Y, Y, Y),
1867 swizzle(const_sin[0], W, W, W, W),
1868 swizzle(temp[0], X, X, X, X), flags);
1869
1870 free_temp(cs, temp[0]);
1871 break;
1872 case OPCODE_TEX:
1873 emit_tex(cs, fpi, R300_TEX_OP_LD);
1874 break;
1875 case OPCODE_TXB:
1876 emit_tex(cs, fpi, R300_TEX_OP_TXB);
1877 break;
1878 case OPCODE_TXP:
1879 emit_tex(cs, fpi, R300_TEX_OP_TXP);
1880 break;
1881 default:
1882 ERROR("unknown fpi->Opcode %d\n", fpi->Opcode);
1883 break;
1884 }
1885 }
1886
1887 static GLboolean parse_program(struct r300_pfs_compile_state *cs)
1888 {
1889 COMPILE_STATE;
1890 int clauseidx;
1891
1892 for (clauseidx = 0; clauseidx < cs->compiler->compiler.NumClauses; ++clauseidx) {
1893 struct radeon_clause* clause = &cs->compiler->compiler.Clauses[clauseidx];
1894 int ip;
1895
1896 for(ip = 0; ip < clause->NumInstructions; ++ip) {
1897 emit_instruction(cs, clause->Instructions + ip);
1898
1899 if (fp->error)
1900 return GL_FALSE;
1901 }
1902 }
1903
1904 return GL_TRUE;
1905 }
1906
1907
1908 /* - Init structures
1909 * - Determine what hwregs each input corresponds to
1910 */
1911 static void init_program(struct r300_pfs_compile_state *cs)
1912 {
1913 COMPILE_STATE;
1914 struct gl_fragment_program *mp = &fp->mesa_program;
1915 GLuint InputsRead = mp->Base.InputsRead;
1916 GLuint temps_used = 0; /* for fp->temps[] */
1917 int i, j;
1918
1919 /* New compile, reset tracking data */
1920 fp->optimization =
1921 driQueryOptioni(&cs->compiler->r300->radeon.optionCache, "fp_optimization");
1922 fp->translated = GL_FALSE;
1923 fp->error = GL_FALSE;
1924 fp->WritesDepth = GL_FALSE;
1925 code->tex.length = 0;
1926 code->cur_node = 0;
1927 code->first_node_has_tex = 0;
1928 code->const_nr = 0;
1929 code->max_temp_idx = 0;
1930 code->node[0].alu_end = -1;
1931 code->node[0].tex_end = -1;
1932
1933 for (i = 0; i < PFS_MAX_ALU_INST; i++) {
1934 for (j = 0; j < 3; j++) {
1935 cs->slot[i].vsrc[j] = SRC_CONST;
1936 cs->slot[i].ssrc[j] = SRC_CONST;
1937 }
1938 }
1939
1940 /* Work out what temps the Mesa inputs correspond to; this must match
1941 * what setup_rs_unit does, which shouldn't be a problem since the RS unit
1942 * configures itself based on the fragprog's InputsRead
1943 *
1944 * NOTE: this depends on get_hw_temp() allocating registers in order,
1945 * starting from register 0.
1946 */
1947
1948 /* Texcoords come first */
1949 for (i = 0; i < cs->compiler->r300->radeon.glCtx->Const.MaxTextureUnits; i++) {
1950 if (InputsRead & (FRAG_BIT_TEX0 << i)) {
1951 cs->inputs[FRAG_ATTRIB_TEX0 + i].refcount = 0;
1952 cs->inputs[FRAG_ATTRIB_TEX0 + i].reg =
1953 get_hw_temp(cs, 0);
1954 }
1955 }
1956 InputsRead &= ~FRAG_BITS_TEX_ANY;
1957
1958 /* fragment position treated as a texcoord */
1959 if (InputsRead & FRAG_BIT_WPOS) {
1960 cs->inputs[FRAG_ATTRIB_WPOS].refcount = 0;
1961 cs->inputs[FRAG_ATTRIB_WPOS].reg = get_hw_temp(cs, 0);
1962 }
1963 InputsRead &= ~FRAG_BIT_WPOS;
1964
1965 /* Then primary colour */
1966 if (InputsRead & FRAG_BIT_COL0) {
1967 cs->inputs[FRAG_ATTRIB_COL0].refcount = 0;
1968 cs->inputs[FRAG_ATTRIB_COL0].reg = get_hw_temp(cs, 0);
1969 }
1970 InputsRead &= ~FRAG_BIT_COL0;
1971
1972 /* Secondary color */
1973 if (InputsRead & FRAG_BIT_COL1) {
1974 cs->inputs[FRAG_ATTRIB_COL1].refcount = 0;
1975 cs->inputs[FRAG_ATTRIB_COL1].reg = get_hw_temp(cs, 0);
1976 }
1977 InputsRead &= ~FRAG_BIT_COL1;
1978
1979 /* Anything else */
1980 if (InputsRead) {
1981 WARN_ONCE("Don't know how to handle inputs 0x%x\n", InputsRead);
1982 /* force read from hwreg 0 for now */
1983 for (i = 0; i < 32; i++)
1984 if (InputsRead & (1 << i))
1985 cs->inputs[i].reg = 0;
1986 }
1987
1988 /* Pre-parse the program, grabbing refcounts on input/temp regs.
1989 * That way, we can free up the reg when it's no longer needed
1990 */
1991 for (i = 0; i < cs->compiler->compiler.Clauses[0].NumInstructions; ++i) {
1992 struct prog_instruction *fpi = cs->compiler->compiler.Clauses[0].Instructions + i;
1993 int idx;
1994
1995 for (j = 0; j < 3; j++) {
1996 idx = fpi->SrcReg[j].Index;
1997 switch (fpi->SrcReg[j].File) {
1998 case PROGRAM_TEMPORARY:
1999 if (!(temps_used & (1 << idx))) {
2000 cs->temps[idx].reg = -1;
2001 cs->temps[idx].refcount = 1;
2002 temps_used |= (1 << idx);
2003 } else
2004 cs->temps[idx].refcount++;
2005 break;
2006 case PROGRAM_INPUT:
2007 cs->inputs[idx].refcount++;
2008 break;
2009 default:
2010 break;
2011 }
2012 }
2013
2014 idx = fpi->DstReg.Index;
2015 if (fpi->DstReg.File == PROGRAM_TEMPORARY) {
2016 if (!(temps_used & (1 << idx))) {
2017 cs->temps[idx].reg = -1;
2018 cs->temps[idx].refcount = 1;
2019 temps_used |= (1 << idx);
2020 } else
2021 cs->temps[idx].refcount++;
2022 }
2023 }
2024 cs->temp_in_use = temps_used;
2025 }
2026
2027
2028 /**
2029 * Final compilation step: Turn the intermediate radeon_program into
2030 * machine-readable instructions.
2031 */
2032 GLboolean r300FragmentProgramEmit(struct r300_fragment_program_compiler *compiler)
2033 {
2034 struct r300_pfs_compile_state cs;
2035 struct r300_fragment_program_code *code = compiler->code;
2036
2037 _mesa_memset(&cs, 0, sizeof(cs));
2038 cs.compiler = compiler;
2039 init_program(&cs);
2040
2041 if (!parse_program(&cs))
2042 return GL_FALSE;
2043
2044 /* Finish off */
2045 code->node[code->cur_node].alu_end =
2046 cs.nrslots - code->node[code->cur_node].alu_offset - 1;
2047 if (code->node[code->cur_node].tex_end < 0)
2048 code->node[code->cur_node].tex_end = 0;
2049 code->alu_offset = 0;
2050 code->alu_end = cs.nrslots - 1;
2051 code->tex_offset = 0;
2052 code->tex_end = code->tex.length ? code->tex.length - 1 : 0;
2053 assert(code->node[code->cur_node].alu_end >= 0);
2054 assert(code->alu_end >= 0);
2055
2056 return GL_TRUE;
2057 }
2058