r300g: implement TRUNC correctly
[mesa.git] / src / gallium / drivers / r300 / compiler / r3xx_vertprog.c
1 /*
2 * Copyright 2009 Nicolai Hähnle <nhaehnle@gmail.com>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE. */
22
23 #include "radeon_compiler.h"
24
25 #include <stdio.h>
26
27 #include "../r300_reg.h"
28
29 #include "radeon_compiler_util.h"
30 #include "radeon_dataflow.h"
31 #include "radeon_program.h"
32 #include "radeon_program_alu.h"
33 #include "radeon_swizzle.h"
34 #include "radeon_emulate_branches.h"
35 #include "radeon_emulate_loops.h"
36 #include "radeon_remove_constants.h"
37
/*
 * Build a source operand that reads a constant ZERO or ONE.
 *
 * The register index, register class and relative-addressing bit are taken
 * from source operand number x of the instruction `vpi` (which must already
 * be set up and valid); all four channels are then swizzled to the selector y.
 *
 * The expansion refers to `vp` and `vpi`, so both must be in scope wherever
 * the macro is used.
 */
#define __CONST(x, y)							\
	(PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[x]),		\
			 t_swizzle(y), t_swizzle(y),			\
			 t_swizzle(y), t_swizzle(y),			\
			 t_src_class(vpi->SrcReg[x].File),		\
			 RC_MASK_NONE) |				\
	 (vpi->SrcReg[x].RelAddr << 4))
50
51
52 static unsigned long t_dst_mask(unsigned int mask)
53 {
54 /* RC_MASK_* is equivalent to VSF_FLAG_* */
55 return mask & RC_MASK_XYZW;
56 }
57
58 static unsigned long t_dst_class(rc_register_file file)
59 {
60 switch (file) {
61 default:
62 fprintf(stderr, "%s: Bad register file %i\n", __FUNCTION__, file);
63 /* fall-through */
64 case RC_FILE_TEMPORARY:
65 return PVS_DST_REG_TEMPORARY;
66 case RC_FILE_OUTPUT:
67 return PVS_DST_REG_OUT;
68 case RC_FILE_ADDRESS:
69 return PVS_DST_REG_A0;
70 }
71 }
72
73 static unsigned long t_dst_index(struct r300_vertex_program_code *vp,
74 struct rc_dst_register *dst)
75 {
76 if (dst->File == RC_FILE_OUTPUT)
77 return vp->outputs[dst->Index];
78
79 return dst->Index;
80 }
81
82 static unsigned long t_src_class(rc_register_file file)
83 {
84 switch (file) {
85 default:
86 fprintf(stderr, "%s: Bad register file %i\n", __FUNCTION__, file);
87 /* fall-through */
88 case RC_FILE_NONE:
89 case RC_FILE_TEMPORARY:
90 return PVS_SRC_REG_TEMPORARY;
91 case RC_FILE_INPUT:
92 return PVS_SRC_REG_INPUT;
93 case RC_FILE_CONSTANT:
94 return PVS_SRC_REG_CONSTANT;
95 }
96 }
97
98 static int t_src_conflict(struct rc_src_register a, struct rc_src_register b)
99 {
100 unsigned long aclass = t_src_class(a.File);
101 unsigned long bclass = t_src_class(b.File);
102
103 if (aclass != bclass)
104 return 0;
105 if (aclass == PVS_SRC_REG_TEMPORARY)
106 return 0;
107
108 if (a.RelAddr || b.RelAddr)
109 return 1;
110 if (a.Index != b.Index)
111 return 1;
112
113 return 0;
114 }
115
/**
 * Translate a compiler swizzle selector into the hardware encoding.
 *
 * This is a no-op: the RC_SWIZZLE_* values are identical to the
 * VSF_IN_COMPONENT_* values, so the selector is returned unchanged.
 */
static inline unsigned long t_swizzle(unsigned int swizzle)
{
	const unsigned long hw_swizzle = swizzle;

	return hw_swizzle;
}
121
122 static unsigned long t_src_index(struct r300_vertex_program_code *vp,
123 struct rc_src_register *src)
124 {
125 if (src->File == RC_FILE_INPUT) {
126 assert(vp->inputs[src->Index] != -1);
127 return vp->inputs[src->Index];
128 } else {
129 if (src->Index < 0) {
130 fprintf(stderr,
131 "negative offsets for indirect addressing do not work.\n");
132 return 0;
133 }
134 return src->Index;
135 }
136 }
137
138 /* these two functions should probably be merged... */
139
140 static unsigned long t_src(struct r300_vertex_program_code *vp,
141 struct rc_src_register *src)
142 {
143 /* src->Negate uses the RC_MASK_ flags from program_instruction.h,
144 * which equal our VSF_FLAGS_ values, so it's safe to just pass it here.
145 */
146 return PVS_SRC_OPERAND(t_src_index(vp, src),
147 t_swizzle(GET_SWZ(src->Swizzle, 0)),
148 t_swizzle(GET_SWZ(src->Swizzle, 1)),
149 t_swizzle(GET_SWZ(src->Swizzle, 2)),
150 t_swizzle(GET_SWZ(src->Swizzle, 3)),
151 t_src_class(src->File),
152 src->Negate) |
153 (src->RelAddr << 4) | (src->Abs << 3);
154 }
155
156 static unsigned long t_src_scalar(struct r300_vertex_program_code *vp,
157 struct rc_src_register *src)
158 {
159 /* src->Negate uses the RC_MASK_ flags from program_instruction.h,
160 * which equal our VSF_FLAGS_ values, so it's safe to just pass it here.
161 */
162 unsigned int swz = rc_get_scalar_src_swz(src->Swizzle);
163
164 return PVS_SRC_OPERAND(t_src_index(vp, src),
165 t_swizzle(swz),
166 t_swizzle(swz),
167 t_swizzle(swz),
168 t_swizzle(swz),
169 t_src_class(src->File),
170 src->Negate ? RC_MASK_XYZW : RC_MASK_NONE) |
171 (src->RelAddr << 4) | (src->Abs << 3);
172 }
173
174 static int valid_dst(struct r300_vertex_program_code *vp,
175 struct rc_dst_register *dst)
176 {
177 if (dst->File == RC_FILE_OUTPUT && vp->outputs[dst->Index] == -1) {
178 return 0;
179 } else if (dst->File == RC_FILE_ADDRESS) {
180 assert(dst->Index == 0);
181 }
182
183 return 1;
184 }
185
186 static void ei_vector1(struct r300_vertex_program_code *vp,
187 unsigned int hw_opcode,
188 struct rc_sub_instruction *vpi,
189 unsigned int * inst)
190 {
191 inst[0] = PVS_OP_DST_OPERAND(hw_opcode,
192 0,
193 0,
194 t_dst_index(vp, &vpi->DstReg),
195 t_dst_mask(vpi->DstReg.WriteMask),
196 t_dst_class(vpi->DstReg.File));
197 inst[1] = t_src(vp, &vpi->SrcReg[0]);
198 inst[2] = __CONST(0, RC_SWIZZLE_ZERO);
199 inst[3] = __CONST(0, RC_SWIZZLE_ZERO);
200 }
201
202 static void ei_vector2(struct r300_vertex_program_code *vp,
203 unsigned int hw_opcode,
204 struct rc_sub_instruction *vpi,
205 unsigned int * inst)
206 {
207 inst[0] = PVS_OP_DST_OPERAND(hw_opcode,
208 0,
209 0,
210 t_dst_index(vp, &vpi->DstReg),
211 t_dst_mask(vpi->DstReg.WriteMask),
212 t_dst_class(vpi->DstReg.File));
213 inst[1] = t_src(vp, &vpi->SrcReg[0]);
214 inst[2] = t_src(vp, &vpi->SrcReg[1]);
215 inst[3] = __CONST(1, RC_SWIZZLE_ZERO);
216 }
217
218 static void ei_math1(struct r300_vertex_program_code *vp,
219 unsigned int hw_opcode,
220 struct rc_sub_instruction *vpi,
221 unsigned int * inst)
222 {
223 inst[0] = PVS_OP_DST_OPERAND(hw_opcode,
224 1,
225 0,
226 t_dst_index(vp, &vpi->DstReg),
227 t_dst_mask(vpi->DstReg.WriteMask),
228 t_dst_class(vpi->DstReg.File));
229 inst[1] = t_src_scalar(vp, &vpi->SrcReg[0]);
230 inst[2] = __CONST(0, RC_SWIZZLE_ZERO);
231 inst[3] = __CONST(0, RC_SWIZZLE_ZERO);
232 }
233
234 static void ei_lit(struct r300_vertex_program_code *vp,
235 struct rc_sub_instruction *vpi,
236 unsigned int * inst)
237 {
238 //LIT TMP 1.Y Z TMP 1{} {X W Z Y} TMP 1{} {Y W Z X} TMP 1{} {Y X Z W}
239
240 inst[0] = PVS_OP_DST_OPERAND(ME_LIGHT_COEFF_DX,
241 1,
242 0,
243 t_dst_index(vp, &vpi->DstReg),
244 t_dst_mask(vpi->DstReg.WriteMask),
245 t_dst_class(vpi->DstReg.File));
246 /* NOTE: Users swizzling might not work. */
247 inst[1] = PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[0]), t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 0)), // X
248 t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 3)), // W
249 PVS_SRC_SELECT_FORCE_0, // Z
250 t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 1)), // Y
251 t_src_class(vpi->SrcReg[0].File),
252 vpi->SrcReg[0].Negate ? RC_MASK_XYZW : RC_MASK_NONE) |
253 (vpi->SrcReg[0].RelAddr << 4);
254 inst[2] = PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[0]), t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 1)), // Y
255 t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 3)), // W
256 PVS_SRC_SELECT_FORCE_0, // Z
257 t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 0)), // X
258 t_src_class(vpi->SrcReg[0].File),
259 vpi->SrcReg[0].Negate ? RC_MASK_XYZW : RC_MASK_NONE) |
260 (vpi->SrcReg[0].RelAddr << 4);
261 inst[3] = PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[0]), t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 1)), // Y
262 t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 0)), // X
263 PVS_SRC_SELECT_FORCE_0, // Z
264 t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 3)), // W
265 t_src_class(vpi->SrcReg[0].File),
266 vpi->SrcReg[0].Negate ? RC_MASK_XYZW : RC_MASK_NONE) |
267 (vpi->SrcReg[0].RelAddr << 4);
268 }
269
270 static void ei_mad(struct r300_vertex_program_code *vp,
271 struct rc_sub_instruction *vpi,
272 unsigned int * inst)
273 {
274 unsigned int i;
275 /* Remarks about hardware limitations of MAD
276 * (please preserve this comment, as this information is _NOT_
277 * in the documentation provided by AMD).
278 *
279 * As described in the documentation, MAD with three unique temporary
280 * source registers requires the use of the macro version.
281 *
282 * However (and this is not mentioned in the documentation), apparently
283 * the macro version is _NOT_ a full superset of the normal version.
284 * In particular, the macro version does not always work when relative
285 * addressing is used in the source operands.
286 *
287 * This limitation caused incorrect rendering in Sauerbraten's OpenGL
288 * assembly shader path when using medium quality animations
289 * (i.e. animations with matrix blending instead of quaternion blending).
290 *
291 * Unfortunately, I (nha) have been unable to extract a Piglit regression
292 * test for this issue - for some reason, it is possible to have vertex
293 * programs whose prefix is *exactly* the same as the prefix of the
294 * offending program in Sauerbraten up to the offending instruction
295 * without causing any trouble.
296 *
297 * Bottom line: Only use the macro version only when really necessary;
298 * according to AMD docs, this should improve performance by one clock
299 * as a nice side bonus.
300 */
301 if (vpi->SrcReg[0].File == RC_FILE_TEMPORARY &&
302 vpi->SrcReg[1].File == RC_FILE_TEMPORARY &&
303 vpi->SrcReg[2].File == RC_FILE_TEMPORARY &&
304 vpi->SrcReg[0].Index != vpi->SrcReg[1].Index &&
305 vpi->SrcReg[0].Index != vpi->SrcReg[2].Index &&
306 vpi->SrcReg[1].Index != vpi->SrcReg[2].Index) {
307 inst[0] = PVS_OP_DST_OPERAND(PVS_MACRO_OP_2CLK_MADD,
308 0,
309 1,
310 t_dst_index(vp, &vpi->DstReg),
311 t_dst_mask(vpi->DstReg.WriteMask),
312 t_dst_class(vpi->DstReg.File));
313 } else {
314 inst[0] = PVS_OP_DST_OPERAND(VE_MULTIPLY_ADD,
315 0,
316 0,
317 t_dst_index(vp, &vpi->DstReg),
318 t_dst_mask(vpi->DstReg.WriteMask),
319 t_dst_class(vpi->DstReg.File));
320
321 /* Arguments with constant swizzles still count as a unique
322 * temporary, so we should make sure these arguments share a
323 * register index with one of the other arguments. */
324 for (i = 0; i < 3; i++) {
325 unsigned int j;
326 if (vpi->SrcReg[i].File != RC_FILE_NONE)
327 continue;
328
329 for (j = 0; j < 3; j++) {
330 if (i != j) {
331 vpi->SrcReg[i].Index =
332 vpi->SrcReg[j].Index;
333 break;
334 }
335 }
336 }
337 }
338 inst[1] = t_src(vp, &vpi->SrcReg[0]);
339 inst[2] = t_src(vp, &vpi->SrcReg[1]);
340 inst[3] = t_src(vp, &vpi->SrcReg[2]);
341 }
342
343 static void ei_pow(struct r300_vertex_program_code *vp,
344 struct rc_sub_instruction *vpi,
345 unsigned int * inst)
346 {
347 inst[0] = PVS_OP_DST_OPERAND(ME_POWER_FUNC_FF,
348 1,
349 0,
350 t_dst_index(vp, &vpi->DstReg),
351 t_dst_mask(vpi->DstReg.WriteMask),
352 t_dst_class(vpi->DstReg.File));
353 inst[1] = t_src_scalar(vp, &vpi->SrcReg[0]);
354 inst[2] = __CONST(0, RC_SWIZZLE_ZERO);
355 inst[3] = t_src_scalar(vp, &vpi->SrcReg[1]);
356 }
357
358 static void translate_vertex_program(struct radeon_compiler *c, void *user)
359 {
360 struct r300_vertex_program_compiler *compiler = (struct r300_vertex_program_compiler*)c;
361 struct rc_instruction *rci;
362
363 unsigned loops[R500_PVS_MAX_LOOP_DEPTH];
364 unsigned loop_depth = 0;
365
366 compiler->code->pos_end = 0; /* Not supported yet */
367 compiler->code->length = 0;
368 compiler->code->num_temporaries = 0;
369
370 compiler->SetHwInputOutput(compiler);
371
372 for(rci = compiler->Base.Program.Instructions.Next; rci != &compiler->Base.Program.Instructions; rci = rci->Next) {
373 struct rc_sub_instruction *vpi = &rci->U.I;
374 unsigned int *inst = compiler->code->body.d + compiler->code->length;
375 const struct rc_opcode_info *info = rc_get_opcode_info(vpi->Opcode);
376
377 /* Skip instructions writing to non-existing destination */
378 if (!valid_dst(compiler->code, &vpi->DstReg))
379 continue;
380
381 if (info->HasDstReg) {
382 /* Neither is Saturate. */
383 if (vpi->SaturateMode != RC_SATURATE_NONE) {
384 rc_error(&compiler->Base, "Vertex program does not support the Saturate "
385 "modifier (yet).\n");
386 }
387 }
388
389 if (compiler->code->length >= c->max_alu_insts * 4) {
390 rc_error(&compiler->Base, "Vertex program has too many instructions\n");
391 return;
392 }
393
394 assert(compiler->Base.is_r500 ||
395 (vpi->Opcode != RC_OPCODE_SEQ &&
396 vpi->Opcode != RC_OPCODE_SNE));
397
398 switch (vpi->Opcode) {
399 case RC_OPCODE_ADD: ei_vector2(compiler->code, VE_ADD, vpi, inst); break;
400 case RC_OPCODE_ARL: ei_vector1(compiler->code, VE_FLT2FIX_DX, vpi, inst); break;
401 case RC_OPCODE_COS: ei_math1(compiler->code, ME_COS, vpi, inst); break;
402 case RC_OPCODE_DP4: ei_vector2(compiler->code, VE_DOT_PRODUCT, vpi, inst); break;
403 case RC_OPCODE_DST: ei_vector2(compiler->code, VE_DISTANCE_VECTOR, vpi, inst); break;
404 case RC_OPCODE_EX2: ei_math1(compiler->code, ME_EXP_BASE2_FULL_DX, vpi, inst); break;
405 case RC_OPCODE_EXP: ei_math1(compiler->code, ME_EXP_BASE2_DX, vpi, inst); break;
406 case RC_OPCODE_FRC: ei_vector1(compiler->code, VE_FRACTION, vpi, inst); break;
407 case RC_OPCODE_LG2: ei_math1(compiler->code, ME_LOG_BASE2_FULL_DX, vpi, inst); break;
408 case RC_OPCODE_LIT: ei_lit(compiler->code, vpi, inst); break;
409 case RC_OPCODE_LOG: ei_math1(compiler->code, ME_LOG_BASE2_DX, vpi, inst); break;
410 case RC_OPCODE_MAD: ei_mad(compiler->code, vpi, inst); break;
411 case RC_OPCODE_MAX: ei_vector2(compiler->code, VE_MAXIMUM, vpi, inst); break;
412 case RC_OPCODE_MIN: ei_vector2(compiler->code, VE_MINIMUM, vpi, inst); break;
413 case RC_OPCODE_MOV: ei_vector1(compiler->code, VE_ADD, vpi, inst); break;
414 case RC_OPCODE_MUL: ei_vector2(compiler->code, VE_MULTIPLY, vpi, inst); break;
415 case RC_OPCODE_POW: ei_pow(compiler->code, vpi, inst); break;
416 case RC_OPCODE_RCP: ei_math1(compiler->code, ME_RECIP_DX, vpi, inst); break;
417 case RC_OPCODE_RSQ: ei_math1(compiler->code, ME_RECIP_SQRT_DX, vpi, inst); break;
418 case RC_OPCODE_SEQ: ei_vector2(compiler->code, VE_SET_EQUAL, vpi, inst); break;
419 case RC_OPCODE_SGE: ei_vector2(compiler->code, VE_SET_GREATER_THAN_EQUAL, vpi, inst); break;
420 case RC_OPCODE_SIN: ei_math1(compiler->code, ME_SIN, vpi, inst); break;
421 case RC_OPCODE_SLT: ei_vector2(compiler->code, VE_SET_LESS_THAN, vpi, inst); break;
422 case RC_OPCODE_SNE: ei_vector2(compiler->code, VE_SET_NOT_EQUAL, vpi, inst); break;
423 case RC_OPCODE_BGNLOOP:
424 {
425 if ((!compiler->Base.is_r500
426 && loop_depth >= R300_VS_MAX_LOOP_DEPTH)
427 || loop_depth >= R500_PVS_MAX_LOOP_DEPTH) {
428 rc_error(&compiler->Base,
429 "Loops are nested too deep.");
430 return;
431 }
432 loops[loop_depth++] = ((compiler->code->length)/ 4) + 1;
433 break;
434 }
435 case RC_OPCODE_ENDLOOP:
436 {
437 unsigned int act_addr;
438 unsigned int last_addr;
439 unsigned int ret_addr;
440
441 ret_addr = loops[--loop_depth];
442 act_addr = ret_addr - 1;
443 last_addr = (compiler->code->length / 4) - 1;
444
445 if (loop_depth >= R300_VS_MAX_FC_OPS) {
446 rc_error(&compiler->Base,
447 "Too many flow control instructions.");
448 return;
449 }
450 if (compiler->Base.is_r500) {
451 compiler->code->fc_op_addrs.r500
452 [compiler->code->num_fc_ops].lw =
453 R500_PVS_FC_ACT_ADRS(act_addr)
454 | R500_PVS_FC_LOOP_CNT_JMP_INST(0x00ff)
455 ;
456 compiler->code->fc_op_addrs.r500
457 [compiler->code->num_fc_ops].uw =
458 R500_PVS_FC_LAST_INST(last_addr)
459 | R500_PVS_FC_RTN_INST(ret_addr)
460 ;
461 } else {
462 compiler->code->fc_op_addrs.r300
463 [compiler->code->num_fc_ops] =
464 R300_PVS_FC_ACT_ADRS(act_addr)
465 | R300_PVS_FC_LOOP_CNT_JMP_INST(0xff)
466 | R300_PVS_FC_LAST_INST(last_addr)
467 | R300_PVS_FC_RTN_INST(ret_addr)
468 ;
469 }
470 compiler->code->fc_loop_index[compiler->code->num_fc_ops] =
471 R300_PVS_FC_LOOP_INIT_VAL(0x0)
472 | R300_PVS_FC_LOOP_STEP_VAL(0x1)
473 ;
474 compiler->code->fc_ops |= R300_VAP_PVS_FC_OPC_LOOP(
475 compiler->code->num_fc_ops);
476 compiler->code->num_fc_ops++;
477
478 break;
479 }
480
481 case RC_ME_PRED_SET_CLR:
482 ei_math1(compiler->code, ME_PRED_SET_CLR, vpi, inst);
483 break;
484
485 case RC_ME_PRED_SET_INV:
486 ei_math1(compiler->code, ME_PRED_SET_INV, vpi, inst);
487 break;
488
489 case RC_ME_PRED_SET_POP:
490 ei_math1(compiler->code, ME_PRED_SET_POP, vpi, inst);
491 break;
492
493 case RC_ME_PRED_SET_RESTORE:
494 ei_math1(compiler->code, ME_PRED_SET_RESTORE, vpi, inst);
495 break;
496
497 case RC_ME_PRED_SEQ:
498 ei_math1(compiler->code, ME_PRED_SET_EQ, vpi, inst);
499 break;
500
501 case RC_ME_PRED_SNEQ:
502 ei_math1(compiler->code, ME_PRED_SET_NEQ, vpi, inst);
503 break;
504
505 case RC_VE_PRED_SNEQ_PUSH:
506 ei_vector2(compiler->code, VE_PRED_SET_NEQ_PUSH,
507 vpi, inst);
508 break;
509
510 default:
511 rc_error(&compiler->Base, "Unknown opcode %s\n", info->Name);
512 return;
513 }
514
515 if (vpi->DstReg.Pred != RC_PRED_DISABLED) {
516 inst[0] |= (PVS_DST_PRED_ENABLE_MASK
517 << PVS_DST_PRED_ENABLE_SHIFT);
518 if (vpi->DstReg.Pred == RC_PRED_SET) {
519 inst[0] |= (PVS_DST_PRED_SENSE_MASK
520 << PVS_DST_PRED_SENSE_SHIFT);
521 }
522 }
523
524 /* Update the number of temporaries. */
525 if (info->HasDstReg && vpi->DstReg.File == RC_FILE_TEMPORARY &&
526 vpi->DstReg.Index >= compiler->code->num_temporaries)
527 compiler->code->num_temporaries = vpi->DstReg.Index + 1;
528
529 for (unsigned i = 0; i < info->NumSrcRegs; i++)
530 if (vpi->SrcReg[i].File == RC_FILE_TEMPORARY &&
531 vpi->SrcReg[i].Index >= compiler->code->num_temporaries)
532 compiler->code->num_temporaries = vpi->SrcReg[i].Index + 1;
533
534 if (compiler->code->num_temporaries > compiler->Base.max_temp_regs) {
535 rc_error(&compiler->Base, "Too many temporaries.\n");
536 return;
537 }
538
539 compiler->code->length += 4;
540
541 if (compiler->Base.Error)
542 return;
543 }
544 }
545
/**
 * Per-temporary bookkeeping used by allocate_temporary_registers().
 */
struct temporary_allocation {
	/** Set once a hardware temporary has been assigned. */
	unsigned int Allocated:1;
	/** Index of the assigned hardware temporary. */
	unsigned int HwTemp:15;
	/** Instruction after which the hardware temporary may be reused. */
	struct rc_instruction *LastRead;
};
551
552 static void allocate_temporary_registers(struct radeon_compiler *c, void *user)
553 {
554 struct r300_vertex_program_compiler *compiler = (struct r300_vertex_program_compiler*)c;
555 struct rc_instruction *inst;
556 struct rc_instruction *end_loop = NULL;
557 unsigned int num_orig_temps = 0;
558 char hwtemps[RC_REGISTER_MAX_INDEX];
559 struct temporary_allocation * ta;
560 unsigned int i, j;
561
562 memset(hwtemps, 0, sizeof(hwtemps));
563
564 rc_recompute_ips(c);
565
566 /* Pass 1: Count original temporaries. */
567 for(inst = compiler->Base.Program.Instructions.Next; inst != &compiler->Base.Program.Instructions; inst = inst->Next) {
568 const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
569
570 for (i = 0; i < opcode->NumSrcRegs; ++i) {
571 if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY) {
572 if (inst->U.I.SrcReg[i].Index >= num_orig_temps)
573 num_orig_temps = inst->U.I.SrcReg[i].Index + 1;
574 }
575 }
576
577 if (opcode->HasDstReg) {
578 if (inst->U.I.DstReg.File == RC_FILE_TEMPORARY) {
579 if (inst->U.I.DstReg.Index >= num_orig_temps)
580 num_orig_temps = inst->U.I.DstReg.Index + 1;
581 }
582 }
583 }
584
585 ta = (struct temporary_allocation*)memory_pool_malloc(&compiler->Base.Pool,
586 sizeof(struct temporary_allocation) * num_orig_temps);
587 memset(ta, 0, sizeof(struct temporary_allocation) * num_orig_temps);
588
589 /* Pass 2: Determine original temporary lifetimes */
590 for(inst = compiler->Base.Program.Instructions.Next; inst != &compiler->Base.Program.Instructions; inst = inst->Next) {
591 const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
592 /* Instructions inside of loops need to use the ENDLOOP
593 * instruction as their LastRead. */
594 if (!end_loop && inst->U.I.Opcode == RC_OPCODE_BGNLOOP) {
595 int endloops = 1;
596 struct rc_instruction * ptr;
597 for(ptr = inst->Next;
598 ptr != &compiler->Base.Program.Instructions;
599 ptr = ptr->Next){
600 if (ptr->U.I.Opcode == RC_OPCODE_BGNLOOP) {
601 endloops++;
602 } else if (ptr->U.I.Opcode == RC_OPCODE_ENDLOOP) {
603 endloops--;
604 if (endloops <= 0) {
605 end_loop = ptr;
606 break;
607 }
608 }
609 }
610 }
611
612 if (inst == end_loop) {
613 end_loop = NULL;
614 continue;
615 }
616
617 for (i = 0; i < opcode->NumSrcRegs; ++i) {
618 if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY) {
619 ta[inst->U.I.SrcReg[i].Index].LastRead = end_loop ? end_loop : inst;
620 }
621 }
622 }
623
624 /* Pass 3: Register allocation */
625 for(inst = compiler->Base.Program.Instructions.Next; inst != &compiler->Base.Program.Instructions; inst = inst->Next) {
626 const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
627
628 for (i = 0; i < opcode->NumSrcRegs; ++i) {
629 if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY) {
630 unsigned int orig = inst->U.I.SrcReg[i].Index;
631 inst->U.I.SrcReg[i].Index = ta[orig].HwTemp;
632
633 if (ta[orig].Allocated && inst == ta[orig].LastRead)
634 hwtemps[ta[orig].HwTemp] = 0;
635 }
636 }
637
638 if (opcode->HasDstReg) {
639 if (inst->U.I.DstReg.File == RC_FILE_TEMPORARY) {
640 unsigned int orig = inst->U.I.DstReg.Index;
641
642 if (!ta[orig].Allocated) {
643 for(j = 0; j < c->max_temp_regs; ++j) {
644 if (!hwtemps[j])
645 break;
646 }
647 ta[orig].Allocated = 1;
648 ta[orig].HwTemp = j;
649 hwtemps[ta[orig].HwTemp] = 1;
650 }
651
652 inst->U.I.DstReg.Index = ta[orig].HwTemp;
653 }
654 }
655 }
656 }
657
658 /**
659 * R3xx-R4xx vertex engine does not support the Absolute source operand modifier
660 * and the Saturate opcode modifier. Only Absolute is currently transformed.
661 */
662 static int transform_nonnative_modifiers(
663 struct radeon_compiler *c,
664 struct rc_instruction *inst,
665 void* unused)
666 {
667 const struct rc_opcode_info *opcode = rc_get_opcode_info(inst->U.I.Opcode);
668 unsigned i;
669
670 /* Transform ABS(a) to MAX(a, -a). */
671 for (i = 0; i < opcode->NumSrcRegs; i++) {
672 if (inst->U.I.SrcReg[i].Abs) {
673 struct rc_instruction *new_inst;
674 unsigned temp;
675
676 inst->U.I.SrcReg[i].Abs = 0;
677
678 temp = rc_find_free_temporary(c);
679
680 new_inst = rc_insert_new_instruction(c, inst->Prev);
681 new_inst->U.I.Opcode = RC_OPCODE_MAX;
682 new_inst->U.I.DstReg.File = RC_FILE_TEMPORARY;
683 new_inst->U.I.DstReg.Index = temp;
684 new_inst->U.I.SrcReg[0] = inst->U.I.SrcReg[i];
685 new_inst->U.I.SrcReg[1] = inst->U.I.SrcReg[i];
686 new_inst->U.I.SrcReg[1].Negate ^= RC_MASK_XYZW;
687
688 memset(&inst->U.I.SrcReg[i], 0, sizeof(inst->U.I.SrcReg[i]));
689 inst->U.I.SrcReg[i].File = RC_FILE_TEMPORARY;
690 inst->U.I.SrcReg[i].Index = temp;
691 inst->U.I.SrcReg[i].Swizzle = RC_SWIZZLE_XYZW;
692 }
693 }
694 return 1;
695 }
696
697 /**
698 * Vertex engine cannot read two inputs or two constants at the same time.
699 * Introduce intermediate MOVs to temporary registers to account for this.
700 */
701 static int transform_source_conflicts(
702 struct radeon_compiler *c,
703 struct rc_instruction* inst,
704 void* unused)
705 {
706 const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
707
708 if (opcode->NumSrcRegs == 3) {
709 if (t_src_conflict(inst->U.I.SrcReg[1], inst->U.I.SrcReg[2])
710 || t_src_conflict(inst->U.I.SrcReg[0], inst->U.I.SrcReg[2])) {
711 int tmpreg = rc_find_free_temporary(c);
712 struct rc_instruction * inst_mov = rc_insert_new_instruction(c, inst->Prev);
713 inst_mov->U.I.Opcode = RC_OPCODE_MOV;
714 inst_mov->U.I.DstReg.File = RC_FILE_TEMPORARY;
715 inst_mov->U.I.DstReg.Index = tmpreg;
716 inst_mov->U.I.SrcReg[0] = inst->U.I.SrcReg[2];
717
718 reset_srcreg(&inst->U.I.SrcReg[2]);
719 inst->U.I.SrcReg[2].File = RC_FILE_TEMPORARY;
720 inst->U.I.SrcReg[2].Index = tmpreg;
721 }
722 }
723
724 if (opcode->NumSrcRegs >= 2) {
725 if (t_src_conflict(inst->U.I.SrcReg[1], inst->U.I.SrcReg[0])) {
726 int tmpreg = rc_find_free_temporary(c);
727 struct rc_instruction * inst_mov = rc_insert_new_instruction(c, inst->Prev);
728 inst_mov->U.I.Opcode = RC_OPCODE_MOV;
729 inst_mov->U.I.DstReg.File = RC_FILE_TEMPORARY;
730 inst_mov->U.I.DstReg.Index = tmpreg;
731 inst_mov->U.I.SrcReg[0] = inst->U.I.SrcReg[1];
732
733 reset_srcreg(&inst->U.I.SrcReg[1]);
734 inst->U.I.SrcReg[1].File = RC_FILE_TEMPORARY;
735 inst->U.I.SrcReg[1].Index = tmpreg;
736 }
737 }
738
739 return 1;
740 }
741
742 static void rc_vs_add_artificial_outputs(struct radeon_compiler *c, void *user)
743 {
744 struct r300_vertex_program_compiler * compiler = (struct r300_vertex_program_compiler*)c;
745 int i;
746
747 for(i = 0; i < 32; ++i) {
748 if ((compiler->RequiredOutputs & (1 << i)) &&
749 !(compiler->Base.Program.OutputsWritten & (1 << i))) {
750 struct rc_instruction * inst = rc_insert_new_instruction(&compiler->Base, compiler->Base.Program.Instructions.Prev);
751 inst->U.I.Opcode = RC_OPCODE_MOV;
752
753 inst->U.I.DstReg.File = RC_FILE_OUTPUT;
754 inst->U.I.DstReg.Index = i;
755 inst->U.I.DstReg.WriteMask = RC_MASK_XYZW;
756
757 inst->U.I.SrcReg[0].File = RC_FILE_CONSTANT;
758 inst->U.I.SrcReg[0].Index = 0;
759 inst->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_XYZW;
760
761 compiler->Base.Program.OutputsWritten |= 1 << i;
762 }
763 }
764 }
765
766 static void dataflow_outputs_mark_used(void * userdata, void * data,
767 void (*callback)(void *, unsigned int, unsigned int))
768 {
769 struct r300_vertex_program_compiler * c = userdata;
770 int i;
771
772 for(i = 0; i < 32; ++i) {
773 if (c->RequiredOutputs & (1 << i))
774 callback(data, i, RC_MASK_XYZW);
775 }
776 }
777
778 static int swizzle_is_native(rc_opcode opcode, struct rc_src_register reg)
779 {
780 (void) opcode;
781 (void) reg;
782
783 return 1;
784 }
785
786 static void transform_negative_addressing(struct r300_vertex_program_compiler *c,
787 struct rc_instruction *arl,
788 struct rc_instruction *end,
789 int min_offset)
790 {
791 struct rc_instruction *inst, *add;
792 unsigned const_swizzle;
793
794 /* Transform ARL */
795 add = rc_insert_new_instruction(&c->Base, arl->Prev);
796 add->U.I.Opcode = RC_OPCODE_ADD;
797 add->U.I.DstReg.File = RC_FILE_TEMPORARY;
798 add->U.I.DstReg.Index = rc_find_free_temporary(&c->Base);
799 add->U.I.DstReg.WriteMask = RC_MASK_X;
800 add->U.I.SrcReg[0] = arl->U.I.SrcReg[0];
801 add->U.I.SrcReg[1].File = RC_FILE_CONSTANT;
802 add->U.I.SrcReg[1].Index = rc_constants_add_immediate_scalar(&c->Base.Program.Constants,
803 min_offset, &const_swizzle);
804 add->U.I.SrcReg[1].Swizzle = const_swizzle;
805
806 arl->U.I.SrcReg[0].File = RC_FILE_TEMPORARY;
807 arl->U.I.SrcReg[0].Index = add->U.I.DstReg.Index;
808 arl->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_XXXX;
809
810 /* Rewrite offsets up to and excluding inst. */
811 for (inst = arl->Next; inst != end; inst = inst->Next) {
812 const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
813
814 for (unsigned i = 0; i < opcode->NumSrcRegs; i++)
815 if (inst->U.I.SrcReg[i].RelAddr)
816 inst->U.I.SrcReg[i].Index -= min_offset;
817 }
818 }
819
820 static void rc_emulate_negative_addressing(struct radeon_compiler *compiler, void *user)
821 {
822 struct r300_vertex_program_compiler * c = (struct r300_vertex_program_compiler*)compiler;
823 struct rc_instruction *inst, *lastARL = NULL;
824 int min_offset = 0;
825
826 for (inst = c->Base.Program.Instructions.Next; inst != &c->Base.Program.Instructions; inst = inst->Next) {
827 const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
828
829 if (inst->U.I.Opcode == RC_OPCODE_ARL) {
830 if (lastARL != NULL && min_offset < 0)
831 transform_negative_addressing(c, lastARL, inst, min_offset);
832
833 lastARL = inst;
834 min_offset = 0;
835 continue;
836 }
837
838 for (unsigned i = 0; i < opcode->NumSrcRegs; i++) {
839 if (inst->U.I.SrcReg[i].RelAddr &&
840 inst->U.I.SrcReg[i].Index < 0) {
841 /* ARL must precede any indirect addressing. */
842 if (lastARL == NULL) {
843 rc_error(&c->Base, "Vertex shader: Found relative addressing without ARL.");
844 return;
845 }
846
847 if (inst->U.I.SrcReg[i].Index < min_offset)
848 min_offset = inst->U.I.SrcReg[i].Index;
849 }
850 }
851 }
852
853 if (lastARL != NULL && min_offset < 0)
854 transform_negative_addressing(c, lastARL, inst, min_offset);
855 }
856
857 static struct rc_swizzle_caps r300_vertprog_swizzle_caps = {
858 .IsNative = &swizzle_is_native,
859 .Split = 0 /* should never be called */
860 };
861
862 void r3xx_compile_vertex_program(struct r300_vertex_program_compiler *c)
863 {
864 int is_r500 = c->Base.is_r500;
865 int opt = !c->Base.disable_optimizations;
866
867 /* Lists of instruction transformations. */
868 struct radeon_program_transformation alu_rewrite_r500[] = {
869 { &r300_transform_vertex_alu, 0 },
870 { &r300_transform_trig_scale_vertex, 0 },
871 { 0, 0 }
872 };
873
874 struct radeon_program_transformation alu_rewrite_r300[] = {
875 { &r300_transform_vertex_alu, 0 },
876 { &r300_transform_trig_simple, 0 },
877 { 0, 0 }
878 };
879
880 /* Note: These passes have to be done seperately from ALU rewrite,
881 * otherwise non-native ALU instructions with source conflits
882 * or non-native modifiers will not be treated properly.
883 */
884 struct radeon_program_transformation emulate_modifiers[] = {
885 { &transform_nonnative_modifiers, 0 },
886 { 0, 0 }
887 };
888
889 struct radeon_program_transformation resolve_src_conflicts[] = {
890 { &transform_source_conflicts, 0 },
891 { 0, 0 }
892 };
893
894 /* List of compiler passes. */
895 struct radeon_compiler_pass vs_list[] = {
896 /* NAME DUMP PREDICATE FUNCTION PARAM */
897 {"add artificial outputs", 0, 1, rc_vs_add_artificial_outputs, NULL},
898 {"emulate branches", 1, !is_r500, rc_emulate_branches, NULL},
899 {"emulate negative addressing", 1, 1, rc_emulate_negative_addressing, NULL},
900 {"native rewrite", 1, is_r500, rc_local_transform, alu_rewrite_r500},
901 {"native rewrite", 1, !is_r500, rc_local_transform, alu_rewrite_r300},
902 {"emulate modifiers", 1, !is_r500, rc_local_transform, emulate_modifiers},
903 {"deadcode", 1, opt, rc_dataflow_deadcode, dataflow_outputs_mark_used},
904 {"dataflow optimize", 1, opt, rc_optimize, NULL},
905 /* This pass must be done after optimizations. */
906 {"source conflict resolve", 1, 1, rc_local_transform, resolve_src_conflicts},
907 {"register allocation", 1, opt, allocate_temporary_registers, NULL},
908 {"dead constants", 1, 1, rc_remove_unused_constants, &c->code->constants_remap_table},
909 {"lower control flow opcodes", 1, is_r500, rc_vert_fc, NULL},
910 {"final code validation", 0, 1, rc_validate_final_shader, NULL},
911 {"machine code generation", 0, 1, translate_vertex_program, NULL},
912 {"dump machine code", 0, c->Base.Debug & RC_DBG_LOG, r300_vertex_program_dump, NULL},
913 {NULL, 0, 0, NULL, NULL}
914 };
915
916 c->Base.type = RC_VERTEX_PROGRAM;
917 c->Base.SwizzleCaps = &r300_vertprog_swizzle_caps;
918
919 rc_run_compiler(&c->Base, vs_list);
920
921 c->code->InputsRead = c->Base.Program.InputsRead;
922 c->code->OutputsWritten = c->Base.Program.OutputsWritten;
923 rc_constants_copy(&c->code->constants, &c->Base.Program.Constants);
924 }