r300/compiler: don't terminate regalloc if we surpass max temps limit
[mesa.git] / src / mesa / drivers / dri / r300 / compiler / r3xx_vertprog.c
1 /*
2 * Copyright 2009 Nicolai Hähnle <nhaehnle@gmail.com>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE. */
22
23 #include "radeon_compiler.h"
24
25 #include <stdio.h>
26
27 #include "../r300_reg.h"
28
29 #include "radeon_compiler_util.h"
30 #include "radeon_dataflow.h"
31 #include "radeon_program_alu.h"
32 #include "radeon_swizzle.h"
33 #include "radeon_emulate_branches.h"
34 #include "radeon_emulate_loops.h"
35 #include "radeon_remove_constants.h"
36
/* Bookkeeping record for one loop that is currently open during translation. */
struct loop {
	/* Written by the BGNLOOP handler as (code->length / 4) at that point and
	 * read back by the ENDLOOP handler to build the loop addresses. */
	int BgnLoop;

};
41
/*
 * Take an already-setup and valid source then swizzle it appropriately to
 * obtain a constant ZERO or ONE source.
 *
 * x is the index into vpi->SrcReg[] whose register index, register class and
 * RelAddr bit are reused; y is the swizzle selector that gets replicated to
 * all four components. The expansion refers to `vp` and `vpi` by name, so
 * both must be in scope wherever the macro is used.
 */
#define __CONST(x, y)	\
	(PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[x]),	\
			   t_swizzle(y),	\
			   t_swizzle(y),	\
			   t_swizzle(y),	\
			   t_swizzle(y),	\
			   t_src_class(vpi->SrcReg[x].File), \
			   RC_MASK_NONE) | (vpi->SrcReg[x].RelAddr << 4))
54
55
56 static unsigned long t_dst_mask(unsigned int mask)
57 {
58 /* RC_MASK_* is equivalent to VSF_FLAG_* */
59 return mask & RC_MASK_XYZW;
60 }
61
62 static unsigned long t_dst_class(rc_register_file file)
63 {
64 switch (file) {
65 default:
66 fprintf(stderr, "%s: Bad register file %i\n", __FUNCTION__, file);
67 /* fall-through */
68 case RC_FILE_TEMPORARY:
69 return PVS_DST_REG_TEMPORARY;
70 case RC_FILE_OUTPUT:
71 return PVS_DST_REG_OUT;
72 case RC_FILE_ADDRESS:
73 return PVS_DST_REG_A0;
74 }
75 }
76
77 static unsigned long t_dst_index(struct r300_vertex_program_code *vp,
78 struct rc_dst_register *dst)
79 {
80 if (dst->File == RC_FILE_OUTPUT)
81 return vp->outputs[dst->Index];
82
83 return dst->Index;
84 }
85
86 static unsigned long t_src_class(rc_register_file file)
87 {
88 switch (file) {
89 default:
90 fprintf(stderr, "%s: Bad register file %i\n", __FUNCTION__, file);
91 /* fall-through */
92 case RC_FILE_NONE:
93 case RC_FILE_TEMPORARY:
94 return PVS_SRC_REG_TEMPORARY;
95 case RC_FILE_INPUT:
96 return PVS_SRC_REG_INPUT;
97 case RC_FILE_CONSTANT:
98 return PVS_SRC_REG_CONSTANT;
99 }
100 }
101
102 static int t_src_conflict(struct rc_src_register a, struct rc_src_register b)
103 {
104 unsigned long aclass = t_src_class(a.File);
105 unsigned long bclass = t_src_class(b.File);
106
107 if (aclass != bclass)
108 return 0;
109 if (aclass == PVS_SRC_REG_TEMPORARY)
110 return 0;
111
112 if (a.RelAddr || b.RelAddr)
113 return 1;
114 if (a.Index != b.Index)
115 return 1;
116
117 return 0;
118 }
119
/* Translate a swizzle selector to its hardware encoding (identity mapping). */
static inline unsigned long t_swizzle(unsigned int swizzle)
{
	/* The Mesa RC_SWIZZLE_* values are identical to VSF_IN_COMPONENT_*, so
	 * the value is passed through untouched. */
	const unsigned long hw_swizzle = swizzle;

	return hw_swizzle;
}
125
126 static unsigned long t_src_index(struct r300_vertex_program_code *vp,
127 struct rc_src_register *src)
128 {
129 if (src->File == RC_FILE_INPUT) {
130 assert(vp->inputs[src->Index] != -1);
131 return vp->inputs[src->Index];
132 } else {
133 if (src->Index < 0) {
134 fprintf(stderr,
135 "negative offsets for indirect addressing do not work.\n");
136 return 0;
137 }
138 return src->Index;
139 }
140 }
141
142 /* these two functions should probably be merged... */
143
144 static unsigned long t_src(struct r300_vertex_program_code *vp,
145 struct rc_src_register *src)
146 {
147 /* src->Negate uses the RC_MASK_ flags from program_instruction.h,
148 * which equal our VSF_FLAGS_ values, so it's safe to just pass it here.
149 */
150 return PVS_SRC_OPERAND(t_src_index(vp, src),
151 t_swizzle(GET_SWZ(src->Swizzle, 0)),
152 t_swizzle(GET_SWZ(src->Swizzle, 1)),
153 t_swizzle(GET_SWZ(src->Swizzle, 2)),
154 t_swizzle(GET_SWZ(src->Swizzle, 3)),
155 t_src_class(src->File),
156 src->Negate) |
157 (src->RelAddr << 4) | (src->Abs << 3);
158 }
159
160 static unsigned long t_src_scalar(struct r300_vertex_program_code *vp,
161 struct rc_src_register *src)
162 {
163 /* src->Negate uses the RC_MASK_ flags from program_instruction.h,
164 * which equal our VSF_FLAGS_ values, so it's safe to just pass it here.
165 */
166 return PVS_SRC_OPERAND(t_src_index(vp, src),
167 t_swizzle(GET_SWZ(src->Swizzle, 0)),
168 t_swizzle(GET_SWZ(src->Swizzle, 0)),
169 t_swizzle(GET_SWZ(src->Swizzle, 0)),
170 t_swizzle(GET_SWZ(src->Swizzle, 0)),
171 t_src_class(src->File),
172 src->Negate ? RC_MASK_XYZW : RC_MASK_NONE) |
173 (src->RelAddr << 4) | (src->Abs << 3);
174 }
175
176 static int valid_dst(struct r300_vertex_program_code *vp,
177 struct rc_dst_register *dst)
178 {
179 if (dst->File == RC_FILE_OUTPUT && vp->outputs[dst->Index] == -1) {
180 return 0;
181 } else if (dst->File == RC_FILE_ADDRESS) {
182 assert(dst->Index == 0);
183 }
184
185 return 1;
186 }
187
188 static void ei_vector1(struct r300_vertex_program_code *vp,
189 unsigned int hw_opcode,
190 struct rc_sub_instruction *vpi,
191 unsigned int * inst)
192 {
193 inst[0] = PVS_OP_DST_OPERAND(hw_opcode,
194 0,
195 0,
196 t_dst_index(vp, &vpi->DstReg),
197 t_dst_mask(vpi->DstReg.WriteMask),
198 t_dst_class(vpi->DstReg.File));
199 inst[1] = t_src(vp, &vpi->SrcReg[0]);
200 inst[2] = __CONST(0, RC_SWIZZLE_ZERO);
201 inst[3] = __CONST(0, RC_SWIZZLE_ZERO);
202 }
203
204 static void ei_vector2(struct r300_vertex_program_code *vp,
205 unsigned int hw_opcode,
206 struct rc_sub_instruction *vpi,
207 unsigned int * inst)
208 {
209 inst[0] = PVS_OP_DST_OPERAND(hw_opcode,
210 0,
211 0,
212 t_dst_index(vp, &vpi->DstReg),
213 t_dst_mask(vpi->DstReg.WriteMask),
214 t_dst_class(vpi->DstReg.File));
215 inst[1] = t_src(vp, &vpi->SrcReg[0]);
216 inst[2] = t_src(vp, &vpi->SrcReg[1]);
217 inst[3] = __CONST(1, RC_SWIZZLE_ZERO);
218 }
219
220 static void ei_math1(struct r300_vertex_program_code *vp,
221 unsigned int hw_opcode,
222 struct rc_sub_instruction *vpi,
223 unsigned int * inst)
224 {
225 inst[0] = PVS_OP_DST_OPERAND(hw_opcode,
226 1,
227 0,
228 t_dst_index(vp, &vpi->DstReg),
229 t_dst_mask(vpi->DstReg.WriteMask),
230 t_dst_class(vpi->DstReg.File));
231 inst[1] = t_src_scalar(vp, &vpi->SrcReg[0]);
232 inst[2] = __CONST(0, RC_SWIZZLE_ZERO);
233 inst[3] = __CONST(0, RC_SWIZZLE_ZERO);
234 }
235
236 static void ei_lit(struct r300_vertex_program_code *vp,
237 struct rc_sub_instruction *vpi,
238 unsigned int * inst)
239 {
240 //LIT TMP 1.Y Z TMP 1{} {X W Z Y} TMP 1{} {Y W Z X} TMP 1{} {Y X Z W}
241
242 inst[0] = PVS_OP_DST_OPERAND(ME_LIGHT_COEFF_DX,
243 1,
244 0,
245 t_dst_index(vp, &vpi->DstReg),
246 t_dst_mask(vpi->DstReg.WriteMask),
247 t_dst_class(vpi->DstReg.File));
248 /* NOTE: Users swizzling might not work. */
249 inst[1] = PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[0]), t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 0)), // X
250 t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 3)), // W
251 PVS_SRC_SELECT_FORCE_0, // Z
252 t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 1)), // Y
253 t_src_class(vpi->SrcReg[0].File),
254 vpi->SrcReg[0].Negate ? RC_MASK_XYZW : RC_MASK_NONE) |
255 (vpi->SrcReg[0].RelAddr << 4);
256 inst[2] = PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[0]), t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 1)), // Y
257 t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 3)), // W
258 PVS_SRC_SELECT_FORCE_0, // Z
259 t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 0)), // X
260 t_src_class(vpi->SrcReg[0].File),
261 vpi->SrcReg[0].Negate ? RC_MASK_XYZW : RC_MASK_NONE) |
262 (vpi->SrcReg[0].RelAddr << 4);
263 inst[3] = PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[0]), t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 1)), // Y
264 t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 0)), // X
265 PVS_SRC_SELECT_FORCE_0, // Z
266 t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 3)), // W
267 t_src_class(vpi->SrcReg[0].File),
268 vpi->SrcReg[0].Negate ? RC_MASK_XYZW : RC_MASK_NONE) |
269 (vpi->SrcReg[0].RelAddr << 4);
270 }
271
272 static void ei_mad(struct r300_vertex_program_code *vp,
273 struct rc_sub_instruction *vpi,
274 unsigned int * inst)
275 {
276 /* Remarks about hardware limitations of MAD
277 * (please preserve this comment, as this information is _NOT_
278 * in the documentation provided by AMD).
279 *
280 * As described in the documentation, MAD with three unique temporary
281 * source registers requires the use of the macro version.
282 *
283 * However (and this is not mentioned in the documentation), apparently
284 * the macro version is _NOT_ a full superset of the normal version.
285 * In particular, the macro version does not always work when relative
286 * addressing is used in the source operands.
287 *
288 * This limitation caused incorrect rendering in Sauerbraten's OpenGL
289 * assembly shader path when using medium quality animations
290 * (i.e. animations with matrix blending instead of quaternion blending).
291 *
292 * Unfortunately, I (nha) have been unable to extract a Piglit regression
293 * test for this issue - for some reason, it is possible to have vertex
294 * programs whose prefix is *exactly* the same as the prefix of the
295 * offending program in Sauerbraten up to the offending instruction
296 * without causing any trouble.
297 *
298 * Bottom line: Only use the macro version only when really necessary;
299 * according to AMD docs, this should improve performance by one clock
300 * as a nice side bonus.
301 */
302 if (vpi->SrcReg[0].File == RC_FILE_TEMPORARY &&
303 vpi->SrcReg[1].File == RC_FILE_TEMPORARY &&
304 vpi->SrcReg[2].File == RC_FILE_TEMPORARY &&
305 vpi->SrcReg[0].Index != vpi->SrcReg[1].Index &&
306 vpi->SrcReg[0].Index != vpi->SrcReg[2].Index &&
307 vpi->SrcReg[1].Index != vpi->SrcReg[2].Index) {
308 inst[0] = PVS_OP_DST_OPERAND(PVS_MACRO_OP_2CLK_MADD,
309 0,
310 1,
311 t_dst_index(vp, &vpi->DstReg),
312 t_dst_mask(vpi->DstReg.WriteMask),
313 t_dst_class(vpi->DstReg.File));
314 } else {
315 inst[0] = PVS_OP_DST_OPERAND(VE_MULTIPLY_ADD,
316 0,
317 0,
318 t_dst_index(vp, &vpi->DstReg),
319 t_dst_mask(vpi->DstReg.WriteMask),
320 t_dst_class(vpi->DstReg.File));
321 }
322 inst[1] = t_src(vp, &vpi->SrcReg[0]);
323 inst[2] = t_src(vp, &vpi->SrcReg[1]);
324 inst[3] = t_src(vp, &vpi->SrcReg[2]);
325 }
326
327 static void ei_pow(struct r300_vertex_program_code *vp,
328 struct rc_sub_instruction *vpi,
329 unsigned int * inst)
330 {
331 inst[0] = PVS_OP_DST_OPERAND(ME_POWER_FUNC_FF,
332 1,
333 0,
334 t_dst_index(vp, &vpi->DstReg),
335 t_dst_mask(vpi->DstReg.WriteMask),
336 t_dst_class(vpi->DstReg.File));
337 inst[1] = t_src_scalar(vp, &vpi->SrcReg[0]);
338 inst[2] = __CONST(0, RC_SWIZZLE_ZERO);
339 inst[3] = t_src_scalar(vp, &vpi->SrcReg[1]);
340 }
341
342 static void mark_write(void * userdata, struct rc_instruction * inst,
343 rc_register_file file, unsigned int index, unsigned int mask)
344 {
345 unsigned int * writemasks = userdata;
346
347 if (file != RC_FILE_TEMPORARY)
348 return;
349
350 if (index >= R300_VS_MAX_TEMPS)
351 return;
352
353 writemasks[index] |= mask;
354 }
355
356 static unsigned long t_pred_src(struct r300_vertex_program_compiler * compiler)
357 {
358 return PVS_SRC_OPERAND(compiler->PredicateIndex,
359 t_swizzle(RC_SWIZZLE_ZERO),
360 t_swizzle(RC_SWIZZLE_ZERO),
361 t_swizzle(RC_SWIZZLE_ZERO),
362 t_swizzle(RC_SWIZZLE_W),
363 t_src_class(RC_FILE_TEMPORARY),
364 0);
365 }
366
367 static unsigned long t_pred_dst(struct r300_vertex_program_compiler * compiler,
368 unsigned int hw_opcode, int is_math)
369 {
370 return PVS_OP_DST_OPERAND(hw_opcode,
371 is_math,
372 0,
373 compiler->PredicateIndex,
374 RC_MASK_W,
375 t_dst_class(RC_FILE_TEMPORARY));
376
377 }
378
379 static void ei_if(struct r300_vertex_program_compiler * compiler,
380 struct rc_instruction *rci,
381 unsigned int * inst,
382 unsigned int branch_depth)
383 {
384 unsigned int predicate_opcode;
385 int is_math = 0;
386
387 if (!compiler->Base.is_r500) {
388 rc_error(&compiler->Base,"Opcode IF not supported\n");
389 return;
390 }
391
392 /* Reserve a temporary to use as our predicate stack counter, if we
393 * don't already have one. */
394 if (!compiler->PredicateMask) {
395 unsigned int writemasks[RC_REGISTER_MAX_INDEX];
396 struct rc_instruction * inst;
397 unsigned int i;
398 memset(writemasks, 0, sizeof(writemasks));
399 for(inst = compiler->Base.Program.Instructions.Next;
400 inst != &compiler->Base.Program.Instructions;
401 inst = inst->Next) {
402 rc_for_all_writes_mask(inst, mark_write, writemasks);
403 }
404 for(i = 0; i < compiler->Base.max_temp_regs; i++) {
405 unsigned int mask = ~writemasks[i] & RC_MASK_XYZW;
406 /* Only the W component can be used fo the predicate
407 * stack counter. */
408 if (mask & RC_MASK_W) {
409 compiler->PredicateMask = RC_MASK_W;
410 compiler->PredicateIndex = i;
411 break;
412 }
413 }
414 if (i == compiler->Base.max_temp_regs) {
415 rc_error(&compiler->Base, "No free temporary to use for"
416 " predicate stack counter.\n");
417 return;
418 }
419 }
420 predicate_opcode =
421 branch_depth ? VE_PRED_SET_NEQ_PUSH : ME_PRED_SET_NEQ;
422
423 rci->U.I.SrcReg[0].Swizzle = RC_MAKE_SWIZZLE_SMEAR(GET_SWZ(rci->U.I.SrcReg[0].Swizzle,0));
424 if (branch_depth == 0) {
425 is_math = 1;
426 predicate_opcode = ME_PRED_SET_NEQ;
427 inst[1] = t_src(compiler->code, &rci->U.I.SrcReg[0]);
428 inst[2] = 0;
429 } else {
430 predicate_opcode = VE_PRED_SET_NEQ_PUSH;
431 inst[1] = t_pred_src(compiler);
432 inst[2] = t_src(compiler->code, &rci->U.I.SrcReg[0]);
433 }
434
435 inst[0] = t_pred_dst(compiler, predicate_opcode, is_math);
436 inst[3] = 0;
437
438 }
439
440 static void ei_else(struct r300_vertex_program_compiler * compiler,
441 unsigned int * inst)
442 {
443 if (!compiler->Base.is_r500) {
444 rc_error(&compiler->Base,"Opcode ELSE not supported\n");
445 return;
446 }
447 inst[0] = t_pred_dst(compiler, ME_PRED_SET_INV, 1);
448 inst[1] = t_pred_src(compiler);
449 inst[2] = 0;
450 inst[3] = 0;
451 }
452
453 static void ei_endif(struct r300_vertex_program_compiler *compiler,
454 unsigned int * inst)
455 {
456 if (!compiler->Base.is_r500) {
457 rc_error(&compiler->Base,"Opcode ENDIF not supported\n");
458 return;
459 }
460 inst[0] = t_pred_dst(compiler, ME_PRED_SET_POP, 1);
461 inst[1] = t_pred_src(compiler);
462 inst[2] = 0;
463 inst[3] = 0;
464 }
465
466 static void translate_vertex_program(struct radeon_compiler *c, void *user)
467 {
468 struct r300_vertex_program_compiler *compiler = (struct r300_vertex_program_compiler*)c;
469 struct rc_instruction *rci;
470
471 struct loop * loops = NULL;
472 int current_loop_depth = 0;
473 int loops_reserved = 0;
474
475 unsigned int branch_depth = 0;
476
477 compiler->code->pos_end = 0; /* Not supported yet */
478 compiler->code->length = 0;
479 compiler->code->num_temporaries = 0;
480
481 compiler->SetHwInputOutput(compiler);
482
483 for(rci = compiler->Base.Program.Instructions.Next; rci != &compiler->Base.Program.Instructions; rci = rci->Next) {
484 struct rc_sub_instruction *vpi = &rci->U.I;
485 unsigned int *inst = compiler->code->body.d + compiler->code->length;
486 const struct rc_opcode_info *info = rc_get_opcode_info(vpi->Opcode);
487
488 /* Skip instructions writing to non-existing destination */
489 if (!valid_dst(compiler->code, &vpi->DstReg))
490 continue;
491
492 if (info->HasDstReg) {
493 /* Relative addressing of destination operands is not supported yet. */
494 if (vpi->DstReg.RelAddr) {
495 rc_error(&compiler->Base, "Vertex program does not support relative "
496 "addressing of destination operands (yet).\n");
497 return;
498 }
499
500 /* Neither is Saturate. */
501 if (vpi->SaturateMode != RC_SATURATE_NONE) {
502 rc_error(&compiler->Base, "Vertex program does not support the Saturate "
503 "modifier (yet).\n");
504 }
505 }
506
507 if (compiler->code->length >= c->max_alu_insts * 4) {
508 rc_error(&compiler->Base, "Vertex program has too many instructions\n");
509 return;
510 }
511
512 assert(compiler->Base.is_r500 ||
513 (vpi->Opcode != RC_OPCODE_SEQ &&
514 vpi->Opcode != RC_OPCODE_SNE));
515
516 switch (vpi->Opcode) {
517 case RC_OPCODE_ADD: ei_vector2(compiler->code, VE_ADD, vpi, inst); break;
518 case RC_OPCODE_ARL: ei_vector1(compiler->code, VE_FLT2FIX_DX, vpi, inst); break;
519 case RC_OPCODE_COS: ei_math1(compiler->code, ME_COS, vpi, inst); break;
520 case RC_OPCODE_DP4: ei_vector2(compiler->code, VE_DOT_PRODUCT, vpi, inst); break;
521 case RC_OPCODE_DST: ei_vector2(compiler->code, VE_DISTANCE_VECTOR, vpi, inst); break;
522 case RC_OPCODE_ELSE: ei_else(compiler, inst); break;
523 case RC_OPCODE_ENDIF: ei_endif(compiler, inst); branch_depth--; break;
524 case RC_OPCODE_EX2: ei_math1(compiler->code, ME_EXP_BASE2_FULL_DX, vpi, inst); break;
525 case RC_OPCODE_EXP: ei_math1(compiler->code, ME_EXP_BASE2_DX, vpi, inst); break;
526 case RC_OPCODE_FRC: ei_vector1(compiler->code, VE_FRACTION, vpi, inst); break;
527 case RC_OPCODE_IF: ei_if(compiler, rci, inst, branch_depth); branch_depth++; break;
528 case RC_OPCODE_LG2: ei_math1(compiler->code, ME_LOG_BASE2_FULL_DX, vpi, inst); break;
529 case RC_OPCODE_LIT: ei_lit(compiler->code, vpi, inst); break;
530 case RC_OPCODE_LOG: ei_math1(compiler->code, ME_LOG_BASE2_DX, vpi, inst); break;
531 case RC_OPCODE_MAD: ei_mad(compiler->code, vpi, inst); break;
532 case RC_OPCODE_MAX: ei_vector2(compiler->code, VE_MAXIMUM, vpi, inst); break;
533 case RC_OPCODE_MIN: ei_vector2(compiler->code, VE_MINIMUM, vpi, inst); break;
534 case RC_OPCODE_MOV: ei_vector1(compiler->code, VE_ADD, vpi, inst); break;
535 case RC_OPCODE_MUL: ei_vector2(compiler->code, VE_MULTIPLY, vpi, inst); break;
536 case RC_OPCODE_POW: ei_pow(compiler->code, vpi, inst); break;
537 case RC_OPCODE_RCP: ei_math1(compiler->code, ME_RECIP_DX, vpi, inst); break;
538 case RC_OPCODE_RSQ: ei_math1(compiler->code, ME_RECIP_SQRT_DX, vpi, inst); break;
539 case RC_OPCODE_SEQ: ei_vector2(compiler->code, VE_SET_EQUAL, vpi, inst); break;
540 case RC_OPCODE_SGE: ei_vector2(compiler->code, VE_SET_GREATER_THAN_EQUAL, vpi, inst); break;
541 case RC_OPCODE_SIN: ei_math1(compiler->code, ME_SIN, vpi, inst); break;
542 case RC_OPCODE_SLT: ei_vector2(compiler->code, VE_SET_LESS_THAN, vpi, inst); break;
543 case RC_OPCODE_SNE: ei_vector2(compiler->code, VE_SET_NOT_EQUAL, vpi, inst); break;
544 case RC_OPCODE_BGNLOOP:
545 {
546 struct loop * l;
547
548 if ((!compiler->Base.is_r500
549 && loops_reserved >= R300_VS_MAX_LOOP_DEPTH)
550 || loops_reserved >= R500_VS_MAX_FC_DEPTH) {
551 rc_error(&compiler->Base,
552 "Loops are nested too deep.");
553 return;
554 }
555 memory_pool_array_reserve(&compiler->Base.Pool,
556 struct loop, loops, current_loop_depth,
557 loops_reserved, 1);
558 l = &loops[current_loop_depth++];
559 memset(l , 0, sizeof(struct loop));
560 l->BgnLoop = (compiler->code->length / 4);
561 continue;
562 }
563 case RC_OPCODE_ENDLOOP:
564 {
565 struct loop * l;
566 unsigned int act_addr;
567 unsigned int last_addr;
568 unsigned int ret_addr;
569
570 assert(loops);
571 l = &loops[current_loop_depth - 1];
572 act_addr = l->BgnLoop - 1;
573 last_addr = (compiler->code->length / 4) - 1;
574 ret_addr = l->BgnLoop;
575
576 if (loops_reserved >= R300_VS_MAX_FC_OPS) {
577 rc_error(&compiler->Base,
578 "Too many flow control instructions.");
579 return;
580 }
581 if (compiler->Base.is_r500) {
582 compiler->code->fc_op_addrs.r500
583 [compiler->code->num_fc_ops].lw =
584 R500_PVS_FC_ACT_ADRS(act_addr)
585 | R500_PVS_FC_LOOP_CNT_JMP_INST(0xffff)
586 ;
587 compiler->code->fc_op_addrs.r500
588 [compiler->code->num_fc_ops].uw =
589 R500_PVS_FC_LAST_INST(last_addr)
590 | R500_PVS_FC_RTN_INST(ret_addr)
591 ;
592 } else {
593 compiler->code->fc_op_addrs.r300
594 [compiler->code->num_fc_ops] =
595 R300_PVS_FC_ACT_ADRS(act_addr)
596 | R300_PVS_FC_LOOP_CNT_JMP_INST(0xff)
597 | R300_PVS_FC_LAST_INST(last_addr)
598 | R300_PVS_FC_RTN_INST(ret_addr)
599 ;
600 }
601 compiler->code->fc_loop_index[compiler->code->num_fc_ops] =
602 R300_PVS_FC_LOOP_INIT_VAL(0x0)
603 | R300_PVS_FC_LOOP_STEP_VAL(0x1)
604 ;
605 compiler->code->fc_ops |= R300_VAP_PVS_FC_OPC_LOOP(
606 compiler->code->num_fc_ops);
607 compiler->code->num_fc_ops++;
608 current_loop_depth--;
609 continue;
610 }
611
612 default:
613 rc_error(&compiler->Base, "Unknown opcode %s\n", info->Name);
614 return;
615 }
616
617 /* Non-flow control instructions that are inside an if statement
618 * need to pay attention to the predicate bit. */
619 if (branch_depth
620 && vpi->Opcode != RC_OPCODE_IF
621 && vpi->Opcode != RC_OPCODE_ELSE
622 && vpi->Opcode != RC_OPCODE_ENDIF) {
623
624 inst[0] |= (PVS_DST_PRED_ENABLE_MASK
625 << PVS_DST_PRED_ENABLE_SHIFT);
626 inst[0] |= (PVS_DST_PRED_SENSE_MASK
627 << PVS_DST_PRED_SENSE_SHIFT);
628 }
629
630 /* Update the number of temporaries. */
631 if (info->HasDstReg && vpi->DstReg.File == RC_FILE_TEMPORARY &&
632 vpi->DstReg.Index >= compiler->code->num_temporaries)
633 compiler->code->num_temporaries = vpi->DstReg.Index + 1;
634
635 for (unsigned i = 0; i < info->NumSrcRegs; i++)
636 if (vpi->SrcReg[i].File == RC_FILE_TEMPORARY &&
637 vpi->SrcReg[i].Index >= compiler->code->num_temporaries)
638 compiler->code->num_temporaries = vpi->SrcReg[i].Index + 1;
639
640 if (compiler->PredicateMask)
641 if (compiler->PredicateIndex >= compiler->code->num_temporaries)
642 compiler->code->num_temporaries = compiler->PredicateIndex + 1;
643
644 if (compiler->code->num_temporaries > compiler->Base.max_temp_regs) {
645 rc_error(&compiler->Base, "Too many temporaries.\n");
646 return;
647 }
648
649 compiler->code->length += 4;
650
651 if (compiler->Base.Error)
652 return;
653 }
654 }
655
/* Per-original-temporary state used by allocate_temporary_registers(). */
struct temporary_allocation {
	/* Set once a hardware temporary has been chosen for this register. */
	unsigned int Allocated:1;
	/* Index of the hardware temporary that replaces the original index. */
	unsigned int HwTemp:15;
	/* Instruction after which the hardware temporary is released again;
	 * NULL if no read of this temporary was recorded. */
	struct rc_instruction * LastRead;
};
661
/*
 * Renumber temporary registers so that a hardware temporary is reused once
 * the original register it holds has been read for the last time.
 *
 * Works in four passes over the instruction list: count the original
 * temporaries, look for relative addressing, record the last read of every
 * temporary, and finally rewrite source and destination indices in place.
 * Returns without changing anything if any destination uses relative
 * addressing.
 *
 * `user` is unused.
 */
static void allocate_temporary_registers(struct radeon_compiler *c, void *user)
{
	struct r300_vertex_program_compiler *compiler = (struct r300_vertex_program_compiler*)c;
	struct rc_instruction *inst;
	struct rc_instruction *end_loop = NULL;
	unsigned int num_orig_temps = 0;
	/* hwtemps[n] is non-zero while hardware temporary n is in use. */
	char hwtemps[RC_REGISTER_MAX_INDEX];
	/* One entry per original temporary, indexed by original index. */
	struct temporary_allocation * ta;
	unsigned int i, j;
	struct rc_instruction *last_inst_src_reladdr = NULL;

	memset(hwtemps, 0, sizeof(hwtemps));

	/* The passes below compare instruction IPs, so refresh them first. */
	rc_recompute_ips(c);

	/* Pass 1: Count original temporaries. */
	for(inst = compiler->Base.Program.Instructions.Next; inst != &compiler->Base.Program.Instructions; inst = inst->Next) {
		const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);

		for (i = 0; i < opcode->NumSrcRegs; ++i) {
			if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY) {
				if (inst->U.I.SrcReg[i].Index >= num_orig_temps)
					num_orig_temps = inst->U.I.SrcReg[i].Index + 1;
			}
		}

		if (opcode->HasDstReg) {
			if (inst->U.I.DstReg.File == RC_FILE_TEMPORARY) {
				if (inst->U.I.DstReg.Index >= num_orig_temps)
					num_orig_temps = inst->U.I.DstReg.Index + 1;
			}
		}
	}

	/* Pass 2: If there is relative addressing of dst temporaries, we cannot change register indices. Give up.
	 * For src temporaries, save the last instruction which uses relative addressing. */
	for (inst = compiler->Base.Program.Instructions.Next; inst != &compiler->Base.Program.Instructions; inst = inst->Next) {
		const struct rc_opcode_info *opcode = rc_get_opcode_info(inst->U.I.Opcode);

		if (opcode->HasDstReg)
			if (inst->U.I.DstReg.RelAddr)
				return;

		for (i = 0; i < opcode->NumSrcRegs; ++i) {
			if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY &&
			    inst->U.I.SrcReg[i].RelAddr) {
				last_inst_src_reladdr = inst;
			}
		}
	}

	/* Zero-initialised, so every temporary starts out unallocated with
	 * HwTemp 0 and no LastRead. */
	ta = (struct temporary_allocation*)memory_pool_malloc(&compiler->Base.Pool,
			sizeof(struct temporary_allocation) * num_orig_temps);
	memset(ta, 0, sizeof(struct temporary_allocation) * num_orig_temps);

	/* Pass 3: Determine original temporary lifetimes */
	for(inst = compiler->Base.Program.Instructions.Next; inst != &compiler->Base.Program.Instructions; inst = inst->Next) {
		const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
		/* Instructions inside of loops need to use the ENDLOOP
		 * instruction as their LastRead. */
		if (!end_loop && inst->U.I.Opcode == RC_OPCODE_BGNLOOP) {
			/* Scan forward for the ENDLOOP that matches this
			 * outermost BGNLOOP, counting nested loops on the way. */
			int endloops = 1;
			struct rc_instruction * ptr;
			for(ptr = inst->Next;
				ptr != &compiler->Base.Program.Instructions;
							ptr = ptr->Next){
				if (ptr->U.I.Opcode == RC_OPCODE_BGNLOOP) {
					endloops++;
				} else if (ptr->U.I.Opcode == RC_OPCODE_ENDLOOP) {
					endloops--;
					if (endloops <= 0) {
						end_loop = ptr;
						break;
					}
				}
			}
		}

		/* Reaching the recorded ENDLOOP means the outermost loop is over. */
		if (inst == end_loop) {
			end_loop = NULL;
			continue;
		}

		for (i = 0; i < opcode->NumSrcRegs; ++i) {
			if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY) {
				struct rc_instruction *last_read;

				/* From "last_inst_src_reladdr", "end_loop", and "inst",
				 * select the instruction with the highest instruction index (IP).
				 * Note that "end_loop", if available, has always a higher index than "inst". */
				if (last_inst_src_reladdr) {
					if (end_loop) {
						last_read = last_inst_src_reladdr->IP > end_loop->IP ?
								last_inst_src_reladdr : end_loop;
					} else {
						last_read = last_inst_src_reladdr->IP > inst->IP ?
								last_inst_src_reladdr : inst;
					}
				} else {
					last_read = end_loop ? end_loop : inst;
				}

				/* Later reads overwrite earlier ones, so the entry
				 * ends up describing the final read. */
				ta[inst->U.I.SrcReg[i].Index].LastRead = last_read;
			}
		}
	}

	/* Pass 4: Register allocation */
	for(inst = compiler->Base.Program.Instructions.Next; inst != &compiler->Base.Program.Instructions; inst = inst->Next) {
		const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);

		/* Sources are only renumbered in instructions that come after the
		 * last relatively addressed temporary read (if there is one). */
		if (!last_inst_src_reladdr || last_inst_src_reladdr->IP < inst->IP) {
			for (i = 0; i < opcode->NumSrcRegs; ++i) {
				if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY) {
					unsigned int orig = inst->U.I.SrcReg[i].Index;
					inst->U.I.SrcReg[i].Index = ta[orig].HwTemp;

					/* Release the hardware temporary after its final read. */
					if (ta[orig].Allocated && inst == ta[orig].LastRead)
						hwtemps[ta[orig].HwTemp] = 0;
				}
			}
		}

		if (opcode->HasDstReg) {
			if (inst->U.I.DstReg.File == RC_FILE_TEMPORARY) {
				unsigned int orig = inst->U.I.DstReg.Index;

				if (!ta[orig].Allocated) {
					/* Find the lowest free hardware temporary. If
					 * none is free the loop ends with
					 * j == c->max_temp_regs and that index is used
					 * as it is; no error is raised here. */
					for(j = 0; j < c->max_temp_regs; ++j) {
						if (!hwtemps[j])
							break;
					}
					ta[orig].Allocated = 1;
					/* Temporaries written before the last relatively
					 * addressed read keep their original index. */
					if (last_inst_src_reladdr &&
					    last_inst_src_reladdr->IP > inst->IP) {
						ta[orig].HwTemp = orig;
					} else {
						ta[orig].HwTemp = j;
					}
					hwtemps[ta[orig].HwTemp] = 1;
				}

				inst->U.I.DstReg.Index = ta[orig].HwTemp;
			}
		}
	}
}
809
810 /**
811 * R3xx-R4xx vertex engine does not support the Absolute source operand modifier
812 * and the Saturate opcode modifier. Only Absolute is currently transformed.
813 */
814 static int transform_nonnative_modifiers(
815 struct radeon_compiler *c,
816 struct rc_instruction *inst,
817 void* unused)
818 {
819 const struct rc_opcode_info *opcode = rc_get_opcode_info(inst->U.I.Opcode);
820 unsigned i;
821
822 /* Transform ABS(a) to MAX(a, -a). */
823 for (i = 0; i < opcode->NumSrcRegs; i++) {
824 if (inst->U.I.SrcReg[i].Abs) {
825 struct rc_instruction *new_inst;
826 unsigned temp;
827
828 inst->U.I.SrcReg[i].Abs = 0;
829
830 temp = rc_find_free_temporary(c);
831
832 new_inst = rc_insert_new_instruction(c, inst->Prev);
833 new_inst->U.I.Opcode = RC_OPCODE_MAX;
834 new_inst->U.I.DstReg.File = RC_FILE_TEMPORARY;
835 new_inst->U.I.DstReg.Index = temp;
836 new_inst->U.I.SrcReg[0] = inst->U.I.SrcReg[i];
837 new_inst->U.I.SrcReg[1] = inst->U.I.SrcReg[i];
838 new_inst->U.I.SrcReg[1].Negate ^= RC_MASK_XYZW;
839
840 memset(&inst->U.I.SrcReg[i], 0, sizeof(inst->U.I.SrcReg[i]));
841 inst->U.I.SrcReg[i].File = RC_FILE_TEMPORARY;
842 inst->U.I.SrcReg[i].Index = temp;
843 inst->U.I.SrcReg[i].Swizzle = RC_SWIZZLE_XYZW;
844 }
845 }
846 return 1;
847 }
848
849 /**
850 * Vertex engine cannot read two inputs or two constants at the same time.
851 * Introduce intermediate MOVs to temporary registers to account for this.
852 */
853 static int transform_source_conflicts(
854 struct radeon_compiler *c,
855 struct rc_instruction* inst,
856 void* unused)
857 {
858 const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
859
860 if (opcode->NumSrcRegs == 3) {
861 if (t_src_conflict(inst->U.I.SrcReg[1], inst->U.I.SrcReg[2])
862 || t_src_conflict(inst->U.I.SrcReg[0], inst->U.I.SrcReg[2])) {
863 int tmpreg = rc_find_free_temporary(c);
864 struct rc_instruction * inst_mov = rc_insert_new_instruction(c, inst->Prev);
865 inst_mov->U.I.Opcode = RC_OPCODE_MOV;
866 inst_mov->U.I.DstReg.File = RC_FILE_TEMPORARY;
867 inst_mov->U.I.DstReg.Index = tmpreg;
868 inst_mov->U.I.SrcReg[0] = inst->U.I.SrcReg[2];
869
870 reset_srcreg(&inst->U.I.SrcReg[2]);
871 inst->U.I.SrcReg[2].File = RC_FILE_TEMPORARY;
872 inst->U.I.SrcReg[2].Index = tmpreg;
873 }
874 }
875
876 if (opcode->NumSrcRegs >= 2) {
877 if (t_src_conflict(inst->U.I.SrcReg[1], inst->U.I.SrcReg[0])) {
878 int tmpreg = rc_find_free_temporary(c);
879 struct rc_instruction * inst_mov = rc_insert_new_instruction(c, inst->Prev);
880 inst_mov->U.I.Opcode = RC_OPCODE_MOV;
881 inst_mov->U.I.DstReg.File = RC_FILE_TEMPORARY;
882 inst_mov->U.I.DstReg.Index = tmpreg;
883 inst_mov->U.I.SrcReg[0] = inst->U.I.SrcReg[1];
884
885 reset_srcreg(&inst->U.I.SrcReg[1]);
886 inst->U.I.SrcReg[1].File = RC_FILE_TEMPORARY;
887 inst->U.I.SrcReg[1].Index = tmpreg;
888 }
889 }
890
891 return 1;
892 }
893
894 static void rc_vs_add_artificial_outputs(struct radeon_compiler *c, void *user)
895 {
896 struct r300_vertex_program_compiler * compiler = (struct r300_vertex_program_compiler*)c;
897 int i;
898
899 for(i = 0; i < 32; ++i) {
900 if ((compiler->RequiredOutputs & (1 << i)) &&
901 !(compiler->Base.Program.OutputsWritten & (1 << i))) {
902 struct rc_instruction * inst = rc_insert_new_instruction(&compiler->Base, compiler->Base.Program.Instructions.Prev);
903 inst->U.I.Opcode = RC_OPCODE_MOV;
904
905 inst->U.I.DstReg.File = RC_FILE_OUTPUT;
906 inst->U.I.DstReg.Index = i;
907 inst->U.I.DstReg.WriteMask = RC_MASK_XYZW;
908
909 inst->U.I.SrcReg[0].File = RC_FILE_CONSTANT;
910 inst->U.I.SrcReg[0].Index = 0;
911 inst->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_XYZW;
912
913 compiler->Base.Program.OutputsWritten |= 1 << i;
914 }
915 }
916 }
917
918 static void dataflow_outputs_mark_used(void * userdata, void * data,
919 void (*callback)(void *, unsigned int, unsigned int))
920 {
921 struct r300_vertex_program_compiler * c = userdata;
922 int i;
923
924 for(i = 0; i < 32; ++i) {
925 if (c->RequiredOutputs & (1 << i))
926 callback(data, i, RC_MASK_XYZW);
927 }
928 }
929
930 static int swizzle_is_native(rc_opcode opcode, struct rc_src_register reg)
931 {
932 (void) opcode;
933 (void) reg;
934
935 return 1;
936 }
937
938 static void transform_negative_addressing(struct r300_vertex_program_compiler *c,
939 struct rc_instruction *arl,
940 struct rc_instruction *end,
941 int min_offset)
942 {
943 struct rc_instruction *inst, *add;
944 unsigned const_swizzle;
945
946 /* Transform ARL */
947 add = rc_insert_new_instruction(&c->Base, arl->Prev);
948 add->U.I.Opcode = RC_OPCODE_ADD;
949 add->U.I.DstReg.File = RC_FILE_TEMPORARY;
950 add->U.I.DstReg.Index = rc_find_free_temporary(&c->Base);
951 add->U.I.DstReg.WriteMask = RC_MASK_X;
952 add->U.I.SrcReg[0] = arl->U.I.SrcReg[0];
953 add->U.I.SrcReg[1].File = RC_FILE_CONSTANT;
954 add->U.I.SrcReg[1].Index = rc_constants_add_immediate_scalar(&c->Base.Program.Constants,
955 min_offset, &const_swizzle);
956 add->U.I.SrcReg[1].Swizzle = const_swizzle;
957
958 arl->U.I.SrcReg[0].File = RC_FILE_TEMPORARY;
959 arl->U.I.SrcReg[0].Index = add->U.I.DstReg.Index;
960 arl->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_XXXX;
961
962 /* Rewrite offsets up to and excluding inst. */
963 for (inst = arl->Next; inst != end; inst = inst->Next) {
964 const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
965
966 for (unsigned i = 0; i < opcode->NumSrcRegs; i++)
967 if (inst->U.I.SrcReg[i].RelAddr)
968 inst->U.I.SrcReg[i].Index -= min_offset;
969 }
970 }
971
972 static void rc_emulate_negative_addressing(struct radeon_compiler *compiler, void *user)
973 {
974 struct r300_vertex_program_compiler * c = (struct r300_vertex_program_compiler*)compiler;
975 struct rc_instruction *inst, *lastARL = NULL;
976 int min_offset = 0;
977
978 for (inst = c->Base.Program.Instructions.Next; inst != &c->Base.Program.Instructions; inst = inst->Next) {
979 const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
980
981 if (inst->U.I.Opcode == RC_OPCODE_ARL) {
982 if (lastARL != NULL && min_offset < 0)
983 transform_negative_addressing(c, lastARL, inst, min_offset);
984
985 lastARL = inst;
986 min_offset = 0;
987 continue;
988 }
989
990 for (unsigned i = 0; i < opcode->NumSrcRegs; i++) {
991 if (inst->U.I.SrcReg[i].RelAddr &&
992 inst->U.I.SrcReg[i].Index < 0) {
993 /* ARL must precede any indirect addressing. */
994 if (lastARL == NULL) {
995 rc_error(&c->Base, "Vertex shader: Found relative addressing without ARL.");
996 return;
997 }
998
999 if (inst->U.I.SrcReg[i].Index < min_offset)
1000 min_offset = inst->U.I.SrcReg[i].Index;
1001 }
1002 }
1003 }
1004
1005 if (lastARL != NULL && min_offset < 0)
1006 transform_negative_addressing(c, lastARL, inst, min_offset);
1007 }
1008
1009 static struct rc_swizzle_caps r300_vertprog_swizzle_caps = {
1010 .IsNative = &swizzle_is_native,
1011 .Split = 0 /* should never be called */
1012 };
1013
1014 void r3xx_compile_vertex_program(struct r300_vertex_program_compiler *c)
1015 {
1016 int is_r500 = c->Base.is_r500;
1017 int kill_consts = c->Base.remove_unused_constants;
1018 int opt = !c->Base.disable_optimizations;
1019
1020 /* Lists of instruction transformations. */
1021 struct radeon_program_transformation alu_rewrite_r500[] = {
1022 { &r300_transform_vertex_alu, 0 },
1023 { &r300_transform_trig_scale_vertex, 0 },
1024 { 0, 0 }
1025 };
1026
1027 struct radeon_program_transformation alu_rewrite_r300[] = {
1028 { &r300_transform_vertex_alu, 0 },
1029 { &r300_transform_trig_simple, 0 },
1030 { 0, 0 }
1031 };
1032
1033 /* Note: These passes have to be done seperately from ALU rewrite,
1034 * otherwise non-native ALU instructions with source conflits
1035 * or non-native modifiers will not be treated properly.
1036 */
1037 struct radeon_program_transformation emulate_modifiers[] = {
1038 { &transform_nonnative_modifiers, 0 },
1039 { 0, 0 }
1040 };
1041
1042 struct radeon_program_transformation resolve_src_conflicts[] = {
1043 { &transform_source_conflicts, 0 },
1044 { 0, 0 }
1045 };
1046
1047 /* List of compiler passes. */
1048 struct radeon_compiler_pass vs_list[] = {
1049 /* NAME DUMP PREDICATE FUNCTION PARAM */
1050 {"add artificial outputs", 0, 1, rc_vs_add_artificial_outputs, NULL},
1051 {"transform loops", 1, 1, rc_transform_loops, NULL},
1052 {"emulate branches", 1, !is_r500, rc_emulate_branches, NULL},
1053 {"emulate negative addressing", 1, 1, rc_emulate_negative_addressing, NULL},
1054 {"native rewrite", 1, is_r500, rc_local_transform, alu_rewrite_r500},
1055 {"native rewrite", 1, !is_r500, rc_local_transform, alu_rewrite_r300},
1056 {"emulate modifiers", 1, !is_r500, rc_local_transform, emulate_modifiers},
1057 {"deadcode", 1, opt, rc_dataflow_deadcode, dataflow_outputs_mark_used},
1058 {"dataflow optimize", 1, opt, rc_optimize, NULL},
1059 /* This pass must be done after optimizations. */
1060 {"source conflict resolve", 1, 1, rc_local_transform, resolve_src_conflicts},
1061 {"register allocation", 1, opt, allocate_temporary_registers, NULL},
1062 {"dead constants", 1, kill_consts, rc_remove_unused_constants, &c->code->constants_remap_table},
1063 {"final code validation", 0, 1, rc_validate_final_shader, NULL},
1064 {"machine code generation", 0, 1, translate_vertex_program, NULL},
1065 {"dump machine code", 0, c->Base.Debug & RC_DBG_LOG, r300_vertex_program_dump, NULL},
1066 {NULL, 0, 0, NULL, NULL}
1067 };
1068
1069 c->Base.SwizzleCaps = &r300_vertprog_swizzle_caps;
1070
1071 rc_run_compiler(&c->Base, vs_list, "Vertex Program");
1072
1073 c->code->InputsRead = c->Base.Program.InputsRead;
1074 c->code->OutputsWritten = c->Base.Program.OutputsWritten;
1075 rc_constants_copy(&c->code->constants, &c->Base.Program.Constants);
1076 }