Merge branch 'llvm-cliptest-viewport'
[mesa.git] / src / mesa / drivers / dri / r300 / compiler / r3xx_vertprog.c
1 /*
2 * Copyright 2009 Nicolai Hähnle <nhaehnle@gmail.com>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE. */
22
23 #include "radeon_compiler.h"
24
25 #include <stdio.h>
26
27 #include "../r300_reg.h"
28
29 #include "radeon_dataflow.h"
30 #include "radeon_program_alu.h"
31 #include "radeon_swizzle.h"
32 #include "radeon_emulate_branches.h"
33 #include "radeon_emulate_loops.h"
34 #include "radeon_remove_constants.h"
35
/* Bookkeeping for one loop that is open while the program is translated. */
struct loop {
	/* Recorded at BGNLOOP as code->length / 4, i.e. the index of the
	 * next hardware instruction that will be emitted. */
	int BgnLoop;
};
40
/*
 * Build a source operand that reads a constant (ZERO or ONE) in all four
 * components: the register index, register class and relative-addressing
 * bit are borrowed from the already-valid operand vpi->SrcReg[x], while
 * every component selects swizzle `y`.
 *
 * Relies on `vp` and `vpi` being in scope where the macro is expanded.
 */
#define __CONST(x, y) \
	(PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[(x)]), \
			 t_swizzle((y)), \
			 t_swizzle((y)), \
			 t_swizzle((y)), \
			 t_swizzle((y)), \
			 t_src_class(vpi->SrcReg[(x)].File), \
			 RC_MASK_NONE) | (vpi->SrcReg[(x)].RelAddr << 4))
53
54
55 static unsigned long t_dst_mask(unsigned int mask)
56 {
57 /* RC_MASK_* is equivalent to VSF_FLAG_* */
58 return mask & RC_MASK_XYZW;
59 }
60
61 static unsigned long t_dst_class(rc_register_file file)
62 {
63 switch (file) {
64 default:
65 fprintf(stderr, "%s: Bad register file %i\n", __FUNCTION__, file);
66 /* fall-through */
67 case RC_FILE_TEMPORARY:
68 return PVS_DST_REG_TEMPORARY;
69 case RC_FILE_OUTPUT:
70 return PVS_DST_REG_OUT;
71 case RC_FILE_ADDRESS:
72 return PVS_DST_REG_A0;
73 }
74 }
75
76 static unsigned long t_dst_index(struct r300_vertex_program_code *vp,
77 struct rc_dst_register *dst)
78 {
79 if (dst->File == RC_FILE_OUTPUT)
80 return vp->outputs[dst->Index];
81
82 return dst->Index;
83 }
84
85 static unsigned long t_src_class(rc_register_file file)
86 {
87 switch (file) {
88 default:
89 fprintf(stderr, "%s: Bad register file %i\n", __FUNCTION__, file);
90 /* fall-through */
91 case RC_FILE_NONE:
92 case RC_FILE_TEMPORARY:
93 return PVS_SRC_REG_TEMPORARY;
94 case RC_FILE_INPUT:
95 return PVS_SRC_REG_INPUT;
96 case RC_FILE_CONSTANT:
97 return PVS_SRC_REG_CONSTANT;
98 }
99 }
100
101 static int t_src_conflict(struct rc_src_register a, struct rc_src_register b)
102 {
103 unsigned long aclass = t_src_class(a.File);
104 unsigned long bclass = t_src_class(b.File);
105
106 if (aclass != bclass)
107 return 0;
108 if (aclass == PVS_SRC_REG_TEMPORARY)
109 return 0;
110
111 if (a.RelAddr || b.RelAddr)
112 return 1;
113 if (a.Index != b.Index)
114 return 1;
115
116 return 0;
117 }
118
/*
 * Translate a compiler swizzle selector to the hardware encoding.
 * This is an identity mapping: the RC_SWIZZLE_* values are all identical
 * to VSF_IN_COMPONENT_*.
 */
static inline unsigned long t_swizzle(unsigned int swizzle)
{
	return (unsigned long)swizzle;
}
124
125 static unsigned long t_src_index(struct r300_vertex_program_code *vp,
126 struct rc_src_register *src)
127 {
128 if (src->File == RC_FILE_INPUT) {
129 assert(vp->inputs[src->Index] != -1);
130 return vp->inputs[src->Index];
131 } else {
132 if (src->Index < 0) {
133 fprintf(stderr,
134 "negative offsets for indirect addressing do not work.\n");
135 return 0;
136 }
137 return src->Index;
138 }
139 }
140
141 /* these two functions should probably be merged... */
142
143 static unsigned long t_src(struct r300_vertex_program_code *vp,
144 struct rc_src_register *src)
145 {
146 /* src->Negate uses the RC_MASK_ flags from program_instruction.h,
147 * which equal our VSF_FLAGS_ values, so it's safe to just pass it here.
148 */
149 return PVS_SRC_OPERAND(t_src_index(vp, src),
150 t_swizzle(GET_SWZ(src->Swizzle, 0)),
151 t_swizzle(GET_SWZ(src->Swizzle, 1)),
152 t_swizzle(GET_SWZ(src->Swizzle, 2)),
153 t_swizzle(GET_SWZ(src->Swizzle, 3)),
154 t_src_class(src->File),
155 src->Negate) |
156 (src->RelAddr << 4) | (src->Abs << 3);
157 }
158
159 static unsigned long t_src_scalar(struct r300_vertex_program_code *vp,
160 struct rc_src_register *src)
161 {
162 /* src->Negate uses the RC_MASK_ flags from program_instruction.h,
163 * which equal our VSF_FLAGS_ values, so it's safe to just pass it here.
164 */
165 return PVS_SRC_OPERAND(t_src_index(vp, src),
166 t_swizzle(GET_SWZ(src->Swizzle, 0)),
167 t_swizzle(GET_SWZ(src->Swizzle, 0)),
168 t_swizzle(GET_SWZ(src->Swizzle, 0)),
169 t_swizzle(GET_SWZ(src->Swizzle, 0)),
170 t_src_class(src->File),
171 src->Negate ? RC_MASK_XYZW : RC_MASK_NONE) |
172 (src->RelAddr << 4) | (src->Abs << 3);
173 }
174
175 static int valid_dst(struct r300_vertex_program_code *vp,
176 struct rc_dst_register *dst)
177 {
178 if (dst->File == RC_FILE_OUTPUT && vp->outputs[dst->Index] == -1) {
179 return 0;
180 } else if (dst->File == RC_FILE_ADDRESS) {
181 assert(dst->Index == 0);
182 }
183
184 return 1;
185 }
186
187 static void ei_vector1(struct r300_vertex_program_code *vp,
188 unsigned int hw_opcode,
189 struct rc_sub_instruction *vpi,
190 unsigned int * inst)
191 {
192 inst[0] = PVS_OP_DST_OPERAND(hw_opcode,
193 0,
194 0,
195 t_dst_index(vp, &vpi->DstReg),
196 t_dst_mask(vpi->DstReg.WriteMask),
197 t_dst_class(vpi->DstReg.File));
198 inst[1] = t_src(vp, &vpi->SrcReg[0]);
199 inst[2] = __CONST(0, RC_SWIZZLE_ZERO);
200 inst[3] = __CONST(0, RC_SWIZZLE_ZERO);
201 }
202
203 static void ei_vector2(struct r300_vertex_program_code *vp,
204 unsigned int hw_opcode,
205 struct rc_sub_instruction *vpi,
206 unsigned int * inst)
207 {
208 inst[0] = PVS_OP_DST_OPERAND(hw_opcode,
209 0,
210 0,
211 t_dst_index(vp, &vpi->DstReg),
212 t_dst_mask(vpi->DstReg.WriteMask),
213 t_dst_class(vpi->DstReg.File));
214 inst[1] = t_src(vp, &vpi->SrcReg[0]);
215 inst[2] = t_src(vp, &vpi->SrcReg[1]);
216 inst[3] = __CONST(1, RC_SWIZZLE_ZERO);
217 }
218
219 static void ei_math1(struct r300_vertex_program_code *vp,
220 unsigned int hw_opcode,
221 struct rc_sub_instruction *vpi,
222 unsigned int * inst)
223 {
224 inst[0] = PVS_OP_DST_OPERAND(hw_opcode,
225 1,
226 0,
227 t_dst_index(vp, &vpi->DstReg),
228 t_dst_mask(vpi->DstReg.WriteMask),
229 t_dst_class(vpi->DstReg.File));
230 inst[1] = t_src_scalar(vp, &vpi->SrcReg[0]);
231 inst[2] = __CONST(0, RC_SWIZZLE_ZERO);
232 inst[3] = __CONST(0, RC_SWIZZLE_ZERO);
233 }
234
235 static void ei_lit(struct r300_vertex_program_code *vp,
236 struct rc_sub_instruction *vpi,
237 unsigned int * inst)
238 {
239 //LIT TMP 1.Y Z TMP 1{} {X W Z Y} TMP 1{} {Y W Z X} TMP 1{} {Y X Z W}
240
241 inst[0] = PVS_OP_DST_OPERAND(ME_LIGHT_COEFF_DX,
242 1,
243 0,
244 t_dst_index(vp, &vpi->DstReg),
245 t_dst_mask(vpi->DstReg.WriteMask),
246 t_dst_class(vpi->DstReg.File));
247 /* NOTE: Users swizzling might not work. */
248 inst[1] = PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[0]), t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 0)), // X
249 t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 3)), // W
250 PVS_SRC_SELECT_FORCE_0, // Z
251 t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 1)), // Y
252 t_src_class(vpi->SrcReg[0].File),
253 vpi->SrcReg[0].Negate ? RC_MASK_XYZW : RC_MASK_NONE) |
254 (vpi->SrcReg[0].RelAddr << 4);
255 inst[2] = PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[0]), t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 1)), // Y
256 t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 3)), // W
257 PVS_SRC_SELECT_FORCE_0, // Z
258 t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 0)), // X
259 t_src_class(vpi->SrcReg[0].File),
260 vpi->SrcReg[0].Negate ? RC_MASK_XYZW : RC_MASK_NONE) |
261 (vpi->SrcReg[0].RelAddr << 4);
262 inst[3] = PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[0]), t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 1)), // Y
263 t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 0)), // X
264 PVS_SRC_SELECT_FORCE_0, // Z
265 t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 3)), // W
266 t_src_class(vpi->SrcReg[0].File),
267 vpi->SrcReg[0].Negate ? RC_MASK_XYZW : RC_MASK_NONE) |
268 (vpi->SrcReg[0].RelAddr << 4);
269 }
270
271 static void ei_mad(struct r300_vertex_program_code *vp,
272 struct rc_sub_instruction *vpi,
273 unsigned int * inst)
274 {
275 /* Remarks about hardware limitations of MAD
276 * (please preserve this comment, as this information is _NOT_
277 * in the documentation provided by AMD).
278 *
279 * As described in the documentation, MAD with three unique temporary
280 * source registers requires the use of the macro version.
281 *
282 * However (and this is not mentioned in the documentation), apparently
283 * the macro version is _NOT_ a full superset of the normal version.
284 * In particular, the macro version does not always work when relative
285 * addressing is used in the source operands.
286 *
287 * This limitation caused incorrect rendering in Sauerbraten's OpenGL
288 * assembly shader path when using medium quality animations
289 * (i.e. animations with matrix blending instead of quaternion blending).
290 *
291 * Unfortunately, I (nha) have been unable to extract a Piglit regression
292 * test for this issue - for some reason, it is possible to have vertex
293 * programs whose prefix is *exactly* the same as the prefix of the
294 * offending program in Sauerbraten up to the offending instruction
295 * without causing any trouble.
296 *
297 * Bottom line: Only use the macro version only when really necessary;
298 * according to AMD docs, this should improve performance by one clock
299 * as a nice side bonus.
300 */
301 if (vpi->SrcReg[0].File == RC_FILE_TEMPORARY &&
302 vpi->SrcReg[1].File == RC_FILE_TEMPORARY &&
303 vpi->SrcReg[2].File == RC_FILE_TEMPORARY &&
304 vpi->SrcReg[0].Index != vpi->SrcReg[1].Index &&
305 vpi->SrcReg[0].Index != vpi->SrcReg[2].Index &&
306 vpi->SrcReg[1].Index != vpi->SrcReg[2].Index) {
307 inst[0] = PVS_OP_DST_OPERAND(PVS_MACRO_OP_2CLK_MADD,
308 0,
309 1,
310 t_dst_index(vp, &vpi->DstReg),
311 t_dst_mask(vpi->DstReg.WriteMask),
312 t_dst_class(vpi->DstReg.File));
313 } else {
314 inst[0] = PVS_OP_DST_OPERAND(VE_MULTIPLY_ADD,
315 0,
316 0,
317 t_dst_index(vp, &vpi->DstReg),
318 t_dst_mask(vpi->DstReg.WriteMask),
319 t_dst_class(vpi->DstReg.File));
320 }
321 inst[1] = t_src(vp, &vpi->SrcReg[0]);
322 inst[2] = t_src(vp, &vpi->SrcReg[1]);
323 inst[3] = t_src(vp, &vpi->SrcReg[2]);
324 }
325
326 static void ei_pow(struct r300_vertex_program_code *vp,
327 struct rc_sub_instruction *vpi,
328 unsigned int * inst)
329 {
330 inst[0] = PVS_OP_DST_OPERAND(ME_POWER_FUNC_FF,
331 1,
332 0,
333 t_dst_index(vp, &vpi->DstReg),
334 t_dst_mask(vpi->DstReg.WriteMask),
335 t_dst_class(vpi->DstReg.File));
336 inst[1] = t_src_scalar(vp, &vpi->SrcReg[0]);
337 inst[2] = __CONST(0, RC_SWIZZLE_ZERO);
338 inst[3] = t_src_scalar(vp, &vpi->SrcReg[1]);
339 }
340
341 static void mark_write(void * userdata, struct rc_instruction * inst,
342 rc_register_file file, unsigned int index, unsigned int mask)
343 {
344 unsigned int * writemasks = userdata;
345
346 if (file != RC_FILE_TEMPORARY)
347 return;
348
349 if (index >= R300_VS_MAX_TEMPS)
350 return;
351
352 writemasks[index] |= mask;
353 }
354
355 static unsigned long t_pred_src(struct r300_vertex_program_compiler * compiler)
356 {
357 return PVS_SRC_OPERAND(compiler->PredicateIndex,
358 t_swizzle(RC_SWIZZLE_ZERO),
359 t_swizzle(RC_SWIZZLE_ZERO),
360 t_swizzle(RC_SWIZZLE_ZERO),
361 t_swizzle(RC_SWIZZLE_W),
362 t_src_class(RC_FILE_TEMPORARY),
363 0);
364 }
365
366 static unsigned long t_pred_dst(struct r300_vertex_program_compiler * compiler,
367 unsigned int hw_opcode, int is_math)
368 {
369 return PVS_OP_DST_OPERAND(hw_opcode,
370 is_math,
371 0,
372 compiler->PredicateIndex,
373 RC_MASK_W,
374 t_dst_class(RC_FILE_TEMPORARY));
375
376 }
377
378 static void ei_if(struct r300_vertex_program_compiler * compiler,
379 struct rc_instruction *rci,
380 unsigned int * inst,
381 unsigned int branch_depth)
382 {
383 unsigned int predicate_opcode;
384 int is_math = 0;
385
386 if (!compiler->Base.is_r500) {
387 rc_error(&compiler->Base,"Opcode IF not supported\n");
388 return;
389 }
390
391 /* Reserve a temporary to use as our predicate stack counter, if we
392 * don't already have one. */
393 if (!compiler->PredicateMask) {
394 unsigned int writemasks[RC_REGISTER_MAX_INDEX];
395 struct rc_instruction * inst;
396 unsigned int i;
397 memset(writemasks, 0, sizeof(writemasks));
398 for(inst = compiler->Base.Program.Instructions.Next;
399 inst != &compiler->Base.Program.Instructions;
400 inst = inst->Next) {
401 rc_for_all_writes_mask(inst, mark_write, writemasks);
402 }
403 for(i = 0; i < compiler->Base.max_temp_regs; i++) {
404 unsigned int mask = ~writemasks[i] & RC_MASK_XYZW;
405 /* Only the W component can be used fo the predicate
406 * stack counter. */
407 if (mask & RC_MASK_W) {
408 compiler->PredicateMask = RC_MASK_W;
409 compiler->PredicateIndex = i;
410 break;
411 }
412 }
413 if (i == compiler->Base.max_temp_regs) {
414 rc_error(&compiler->Base, "No free temporary to use for"
415 " predicate stack counter.\n");
416 return;
417 }
418 }
419 predicate_opcode =
420 branch_depth ? VE_PRED_SET_NEQ_PUSH : ME_PRED_SET_NEQ;
421
422 rci->U.I.SrcReg[0].Swizzle = RC_MAKE_SWIZZLE_SMEAR(GET_SWZ(rci->U.I.SrcReg[0].Swizzle,0));
423 if (branch_depth == 0) {
424 is_math = 1;
425 predicate_opcode = ME_PRED_SET_NEQ;
426 inst[1] = t_src(compiler->code, &rci->U.I.SrcReg[0]);
427 inst[2] = 0;
428 } else {
429 predicate_opcode = VE_PRED_SET_NEQ_PUSH;
430 inst[1] = t_pred_src(compiler);
431 inst[2] = t_src(compiler->code, &rci->U.I.SrcReg[0]);
432 }
433
434 inst[0] = t_pred_dst(compiler, predicate_opcode, is_math);
435 inst[3] = 0;
436
437 }
438
439 static void ei_else(struct r300_vertex_program_compiler * compiler,
440 unsigned int * inst)
441 {
442 if (!compiler->Base.is_r500) {
443 rc_error(&compiler->Base,"Opcode ELSE not supported\n");
444 return;
445 }
446 inst[0] = t_pred_dst(compiler, ME_PRED_SET_INV, 1);
447 inst[1] = t_pred_src(compiler);
448 inst[2] = 0;
449 inst[3] = 0;
450 }
451
452 static void ei_endif(struct r300_vertex_program_compiler *compiler,
453 unsigned int * inst)
454 {
455 if (!compiler->Base.is_r500) {
456 rc_error(&compiler->Base,"Opcode ENDIF not supported\n");
457 return;
458 }
459 inst[0] = t_pred_dst(compiler, ME_PRED_SET_POP, 1);
460 inst[1] = t_pred_src(compiler);
461 inst[2] = 0;
462 inst[3] = 0;
463 }
464
465 static void translate_vertex_program(struct radeon_compiler *c, void *user)
466 {
467 struct r300_vertex_program_compiler *compiler = (struct r300_vertex_program_compiler*)c;
468 struct rc_instruction *rci;
469
470 struct loop * loops = NULL;
471 int current_loop_depth = 0;
472 int loops_reserved = 0;
473
474 unsigned int branch_depth = 0;
475
476 compiler->code->pos_end = 0; /* Not supported yet */
477 compiler->code->length = 0;
478 compiler->code->num_temporaries = 0;
479
480 compiler->SetHwInputOutput(compiler);
481
482 for(rci = compiler->Base.Program.Instructions.Next; rci != &compiler->Base.Program.Instructions; rci = rci->Next) {
483 struct rc_sub_instruction *vpi = &rci->U.I;
484 unsigned int *inst = compiler->code->body.d + compiler->code->length;
485 const struct rc_opcode_info *info = rc_get_opcode_info(vpi->Opcode);
486
487 /* Skip instructions writing to non-existing destination */
488 if (!valid_dst(compiler->code, &vpi->DstReg))
489 continue;
490
491 if (info->HasDstReg) {
492 /* Relative addressing of destination operands is not supported yet. */
493 if (vpi->DstReg.RelAddr) {
494 rc_error(&compiler->Base, "Vertex program does not support relative "
495 "addressing of destination operands (yet).\n");
496 return;
497 }
498
499 /* Neither is Saturate. */
500 if (vpi->SaturateMode != RC_SATURATE_NONE) {
501 rc_error(&compiler->Base, "Vertex program does not support the Saturate "
502 "modifier (yet).\n");
503 }
504 }
505
506 if (compiler->code->length >= c->max_alu_insts * 4) {
507 rc_error(&compiler->Base, "Vertex program has too many instructions\n");
508 return;
509 }
510
511 assert(compiler->Base.is_r500 ||
512 (vpi->Opcode != RC_OPCODE_SEQ &&
513 vpi->Opcode != RC_OPCODE_SNE));
514
515 switch (vpi->Opcode) {
516 case RC_OPCODE_ADD: ei_vector2(compiler->code, VE_ADD, vpi, inst); break;
517 case RC_OPCODE_ARL: ei_vector1(compiler->code, VE_FLT2FIX_DX, vpi, inst); break;
518 case RC_OPCODE_COS: ei_math1(compiler->code, ME_COS, vpi, inst); break;
519 case RC_OPCODE_DP4: ei_vector2(compiler->code, VE_DOT_PRODUCT, vpi, inst); break;
520 case RC_OPCODE_DST: ei_vector2(compiler->code, VE_DISTANCE_VECTOR, vpi, inst); break;
521 case RC_OPCODE_ELSE: ei_else(compiler, inst); break;
522 case RC_OPCODE_ENDIF: ei_endif(compiler, inst); branch_depth--; break;
523 case RC_OPCODE_EX2: ei_math1(compiler->code, ME_EXP_BASE2_FULL_DX, vpi, inst); break;
524 case RC_OPCODE_EXP: ei_math1(compiler->code, ME_EXP_BASE2_DX, vpi, inst); break;
525 case RC_OPCODE_FRC: ei_vector1(compiler->code, VE_FRACTION, vpi, inst); break;
526 case RC_OPCODE_IF: ei_if(compiler, rci, inst, branch_depth); branch_depth++; break;
527 case RC_OPCODE_LG2: ei_math1(compiler->code, ME_LOG_BASE2_FULL_DX, vpi, inst); break;
528 case RC_OPCODE_LIT: ei_lit(compiler->code, vpi, inst); break;
529 case RC_OPCODE_LOG: ei_math1(compiler->code, ME_LOG_BASE2_DX, vpi, inst); break;
530 case RC_OPCODE_MAD: ei_mad(compiler->code, vpi, inst); break;
531 case RC_OPCODE_MAX: ei_vector2(compiler->code, VE_MAXIMUM, vpi, inst); break;
532 case RC_OPCODE_MIN: ei_vector2(compiler->code, VE_MINIMUM, vpi, inst); break;
533 case RC_OPCODE_MOV: ei_vector1(compiler->code, VE_ADD, vpi, inst); break;
534 case RC_OPCODE_MUL: ei_vector2(compiler->code, VE_MULTIPLY, vpi, inst); break;
535 case RC_OPCODE_POW: ei_pow(compiler->code, vpi, inst); break;
536 case RC_OPCODE_RCP: ei_math1(compiler->code, ME_RECIP_DX, vpi, inst); break;
537 case RC_OPCODE_RSQ: ei_math1(compiler->code, ME_RECIP_SQRT_DX, vpi, inst); break;
538 case RC_OPCODE_SEQ: ei_vector2(compiler->code, VE_SET_EQUAL, vpi, inst); break;
539 case RC_OPCODE_SGE: ei_vector2(compiler->code, VE_SET_GREATER_THAN_EQUAL, vpi, inst); break;
540 case RC_OPCODE_SIN: ei_math1(compiler->code, ME_SIN, vpi, inst); break;
541 case RC_OPCODE_SLT: ei_vector2(compiler->code, VE_SET_LESS_THAN, vpi, inst); break;
542 case RC_OPCODE_SNE: ei_vector2(compiler->code, VE_SET_NOT_EQUAL, vpi, inst); break;
543 case RC_OPCODE_BGNLOOP:
544 {
545 struct loop * l;
546
547 if ((!compiler->Base.is_r500
548 && loops_reserved >= R300_VS_MAX_LOOP_DEPTH)
549 || loops_reserved >= R500_VS_MAX_FC_DEPTH) {
550 rc_error(&compiler->Base,
551 "Loops are nested too deep.");
552 return;
553 }
554 memory_pool_array_reserve(&compiler->Base.Pool,
555 struct loop, loops, current_loop_depth,
556 loops_reserved, 1);
557 l = &loops[current_loop_depth++];
558 memset(l , 0, sizeof(struct loop));
559 l->BgnLoop = (compiler->code->length / 4);
560 continue;
561 }
562 case RC_OPCODE_ENDLOOP:
563 {
564 struct loop * l;
565 unsigned int act_addr;
566 unsigned int last_addr;
567 unsigned int ret_addr;
568
569 assert(loops);
570 l = &loops[current_loop_depth - 1];
571 act_addr = l->BgnLoop - 1;
572 last_addr = (compiler->code->length / 4) - 1;
573 ret_addr = l->BgnLoop;
574
575 if (loops_reserved >= R300_VS_MAX_FC_OPS) {
576 rc_error(&compiler->Base,
577 "Too many flow control instructions.");
578 return;
579 }
580 if (compiler->Base.is_r500) {
581 compiler->code->fc_op_addrs.r500
582 [compiler->code->num_fc_ops].lw =
583 R500_PVS_FC_ACT_ADRS(act_addr)
584 | R500_PVS_FC_LOOP_CNT_JMP_INST(0xffff)
585 ;
586 compiler->code->fc_op_addrs.r500
587 [compiler->code->num_fc_ops].uw =
588 R500_PVS_FC_LAST_INST(last_addr)
589 | R500_PVS_FC_RTN_INST(ret_addr)
590 ;
591 } else {
592 compiler->code->fc_op_addrs.r300
593 [compiler->code->num_fc_ops] =
594 R300_PVS_FC_ACT_ADRS(act_addr)
595 | R300_PVS_FC_LOOP_CNT_JMP_INST(0xff)
596 | R300_PVS_FC_LAST_INST(last_addr)
597 | R300_PVS_FC_RTN_INST(ret_addr)
598 ;
599 }
600 compiler->code->fc_loop_index[compiler->code->num_fc_ops] =
601 R300_PVS_FC_LOOP_INIT_VAL(0x0)
602 | R300_PVS_FC_LOOP_STEP_VAL(0x1)
603 ;
604 compiler->code->fc_ops |= R300_VAP_PVS_FC_OPC_LOOP(
605 compiler->code->num_fc_ops);
606 compiler->code->num_fc_ops++;
607 current_loop_depth--;
608 continue;
609 }
610
611 default:
612 rc_error(&compiler->Base, "Unknown opcode %s\n", info->Name);
613 return;
614 }
615
616 /* Non-flow control instructions that are inside an if statement
617 * need to pay attention to the predicate bit. */
618 if (branch_depth
619 && vpi->Opcode != RC_OPCODE_IF
620 && vpi->Opcode != RC_OPCODE_ELSE
621 && vpi->Opcode != RC_OPCODE_ENDIF) {
622
623 inst[0] |= (PVS_DST_PRED_ENABLE_MASK
624 << PVS_DST_PRED_ENABLE_SHIFT);
625 inst[0] |= (PVS_DST_PRED_SENSE_MASK
626 << PVS_DST_PRED_SENSE_SHIFT);
627 }
628
629 /* Update the number of temporaries. */
630 if (info->HasDstReg && vpi->DstReg.File == RC_FILE_TEMPORARY &&
631 vpi->DstReg.Index >= compiler->code->num_temporaries)
632 compiler->code->num_temporaries = vpi->DstReg.Index + 1;
633
634 for (unsigned i = 0; i < info->NumSrcRegs; i++)
635 if (vpi->SrcReg[i].File == RC_FILE_TEMPORARY &&
636 vpi->SrcReg[i].Index >= compiler->code->num_temporaries)
637 compiler->code->num_temporaries = vpi->SrcReg[i].Index + 1;
638
639 if (compiler->PredicateMask)
640 if (compiler->PredicateIndex >= compiler->code->num_temporaries)
641 compiler->code->num_temporaries = compiler->PredicateIndex + 1;
642
643 if (compiler->code->num_temporaries > compiler->Base.max_temp_regs) {
644 rc_error(&compiler->Base, "Too many temporaries.\n");
645 return;
646 }
647
648 compiler->code->length += 4;
649
650 if (compiler->Base.Error)
651 return;
652 }
653 }
654
/* Allocation state of one original temporary register, used by
 * allocate_temporary_registers(). */
struct temporary_allocation {
	/* Non-zero once a hardware temporary has been assigned. */
	unsigned int Allocated:1;
	/* Index of the assigned hardware temporary. */
	unsigned int HwTemp:15;
	/* Instruction after which the assigned hardware temporary may be
	 * reused. */
	struct rc_instruction * LastRead;
};
660
661 static void allocate_temporary_registers(struct radeon_compiler *c, void *user)
662 {
663 struct r300_vertex_program_compiler *compiler = (struct r300_vertex_program_compiler*)c;
664 struct rc_instruction *inst;
665 struct rc_instruction *end_loop = NULL;
666 unsigned int num_orig_temps = 0;
667 char hwtemps[RC_REGISTER_MAX_INDEX];
668 struct temporary_allocation * ta;
669 unsigned int i, j;
670 struct rc_instruction *last_inst_src_reladdr = NULL;
671
672 memset(hwtemps, 0, sizeof(hwtemps));
673
674 rc_recompute_ips(c);
675
676 /* Pass 1: Count original temporaries. */
677 for(inst = compiler->Base.Program.Instructions.Next; inst != &compiler->Base.Program.Instructions; inst = inst->Next) {
678 const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
679
680 for (i = 0; i < opcode->NumSrcRegs; ++i) {
681 if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY) {
682 if (inst->U.I.SrcReg[i].Index >= num_orig_temps)
683 num_orig_temps = inst->U.I.SrcReg[i].Index + 1;
684 }
685 }
686
687 if (opcode->HasDstReg) {
688 if (inst->U.I.DstReg.File == RC_FILE_TEMPORARY) {
689 if (inst->U.I.DstReg.Index >= num_orig_temps)
690 num_orig_temps = inst->U.I.DstReg.Index + 1;
691 }
692 }
693 }
694
695 /* Pass 2: If there is relative addressing of dst temporaries, we cannot change register indices. Give up.
696 * For src temporaries, save the last instruction which uses relative addressing. */
697 for (inst = compiler->Base.Program.Instructions.Next; inst != &compiler->Base.Program.Instructions; inst = inst->Next) {
698 const struct rc_opcode_info *opcode = rc_get_opcode_info(inst->U.I.Opcode);
699
700 if (opcode->HasDstReg)
701 if (inst->U.I.DstReg.RelAddr)
702 return;
703
704 for (i = 0; i < opcode->NumSrcRegs; ++i) {
705 if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY &&
706 inst->U.I.SrcReg[i].RelAddr) {
707 last_inst_src_reladdr = inst;
708 }
709 }
710 }
711
712 ta = (struct temporary_allocation*)memory_pool_malloc(&compiler->Base.Pool,
713 sizeof(struct temporary_allocation) * num_orig_temps);
714 memset(ta, 0, sizeof(struct temporary_allocation) * num_orig_temps);
715
716 /* Pass 3: Determine original temporary lifetimes */
717 for(inst = compiler->Base.Program.Instructions.Next; inst != &compiler->Base.Program.Instructions; inst = inst->Next) {
718 const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
719 /* Instructions inside of loops need to use the ENDLOOP
720 * instruction as their LastRead. */
721 if (!end_loop && inst->U.I.Opcode == RC_OPCODE_BGNLOOP) {
722 int endloops = 1;
723 struct rc_instruction * ptr;
724 for(ptr = inst->Next;
725 ptr != &compiler->Base.Program.Instructions;
726 ptr = ptr->Next){
727 if (ptr->U.I.Opcode == RC_OPCODE_BGNLOOP) {
728 endloops++;
729 } else if (ptr->U.I.Opcode == RC_OPCODE_ENDLOOP) {
730 endloops--;
731 if (endloops <= 0) {
732 end_loop = ptr;
733 break;
734 }
735 }
736 }
737 }
738
739 if (inst == end_loop) {
740 end_loop = NULL;
741 continue;
742 }
743
744 for (i = 0; i < opcode->NumSrcRegs; ++i) {
745 if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY) {
746 struct rc_instruction *last_read;
747
748 /* From "last_inst_src_reladdr", "end_loop", and "inst",
749 * select the instruction with the highest instruction index (IP).
750 * Note that "end_loop", if available, has always a higher index than "inst". */
751 if (last_inst_src_reladdr) {
752 if (end_loop) {
753 last_read = last_inst_src_reladdr->IP > end_loop->IP ?
754 last_inst_src_reladdr : end_loop;
755 } else {
756 last_read = last_inst_src_reladdr->IP > inst->IP ?
757 last_inst_src_reladdr : inst;
758 }
759 } else {
760 last_read = end_loop ? end_loop : inst;
761 }
762
763 ta[inst->U.I.SrcReg[i].Index].LastRead = last_read;
764 }
765 }
766 }
767
768 /* Pass 4: Register allocation */
769 for(inst = compiler->Base.Program.Instructions.Next; inst != &compiler->Base.Program.Instructions; inst = inst->Next) {
770 const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
771
772 if (!last_inst_src_reladdr || last_inst_src_reladdr->IP < inst->IP) {
773 for (i = 0; i < opcode->NumSrcRegs; ++i) {
774 if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY) {
775 unsigned int orig = inst->U.I.SrcReg[i].Index;
776 inst->U.I.SrcReg[i].Index = ta[orig].HwTemp;
777
778 if (ta[orig].Allocated && inst == ta[orig].LastRead)
779 hwtemps[ta[orig].HwTemp] = 0;
780 }
781 }
782 }
783
784 if (opcode->HasDstReg) {
785 if (inst->U.I.DstReg.File == RC_FILE_TEMPORARY) {
786 unsigned int orig = inst->U.I.DstReg.Index;
787
788 if (!ta[orig].Allocated) {
789 for(j = 0; j < c->max_temp_regs; ++j) {
790 if (!hwtemps[j])
791 break;
792 }
793 if (j >= c->max_temp_regs) {
794 rc_error(c, "Too many temporaries\n");
795 return;
796 } else {
797 ta[orig].Allocated = 1;
798 if (last_inst_src_reladdr &&
799 last_inst_src_reladdr->IP > inst->IP) {
800 ta[orig].HwTemp = orig;
801 } else {
802 ta[orig].HwTemp = j;
803 }
804 hwtemps[ta[orig].HwTemp] = 1;
805 }
806 }
807
808 inst->U.I.DstReg.Index = ta[orig].HwTemp;
809 }
810 }
811 }
812 }
813
814 /**
815 * R3xx-R4xx vertex engine does not support the Absolute source operand modifier
816 * and the Saturate opcode modifier. Only Absolute is currently transformed.
817 */
818 static int transform_nonnative_modifiers(
819 struct radeon_compiler *c,
820 struct rc_instruction *inst,
821 void* unused)
822 {
823 const struct rc_opcode_info *opcode = rc_get_opcode_info(inst->U.I.Opcode);
824 unsigned i;
825
826 /* Transform ABS(a) to MAX(a, -a). */
827 for (i = 0; i < opcode->NumSrcRegs; i++) {
828 if (inst->U.I.SrcReg[i].Abs) {
829 struct rc_instruction *new_inst;
830 unsigned temp;
831
832 inst->U.I.SrcReg[i].Abs = 0;
833
834 temp = rc_find_free_temporary(c);
835
836 new_inst = rc_insert_new_instruction(c, inst->Prev);
837 new_inst->U.I.Opcode = RC_OPCODE_MAX;
838 new_inst->U.I.DstReg.File = RC_FILE_TEMPORARY;
839 new_inst->U.I.DstReg.Index = temp;
840 new_inst->U.I.SrcReg[0] = inst->U.I.SrcReg[i];
841 new_inst->U.I.SrcReg[1] = inst->U.I.SrcReg[i];
842 new_inst->U.I.SrcReg[1].Negate ^= RC_MASK_XYZW;
843
844 memset(&inst->U.I.SrcReg[i], 0, sizeof(inst->U.I.SrcReg[i]));
845 inst->U.I.SrcReg[i].File = RC_FILE_TEMPORARY;
846 inst->U.I.SrcReg[i].Index = temp;
847 inst->U.I.SrcReg[i].Swizzle = RC_SWIZZLE_XYZW;
848 }
849 }
850 return 1;
851 }
852
853 /**
854 * Vertex engine cannot read two inputs or two constants at the same time.
855 * Introduce intermediate MOVs to temporary registers to account for this.
856 */
857 static int transform_source_conflicts(
858 struct radeon_compiler *c,
859 struct rc_instruction* inst,
860 void* unused)
861 {
862 const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
863
864 if (opcode->NumSrcRegs == 3) {
865 if (t_src_conflict(inst->U.I.SrcReg[1], inst->U.I.SrcReg[2])
866 || t_src_conflict(inst->U.I.SrcReg[0], inst->U.I.SrcReg[2])) {
867 int tmpreg = rc_find_free_temporary(c);
868 struct rc_instruction * inst_mov = rc_insert_new_instruction(c, inst->Prev);
869 inst_mov->U.I.Opcode = RC_OPCODE_MOV;
870 inst_mov->U.I.DstReg.File = RC_FILE_TEMPORARY;
871 inst_mov->U.I.DstReg.Index = tmpreg;
872 inst_mov->U.I.SrcReg[0] = inst->U.I.SrcReg[2];
873
874 reset_srcreg(&inst->U.I.SrcReg[2]);
875 inst->U.I.SrcReg[2].File = RC_FILE_TEMPORARY;
876 inst->U.I.SrcReg[2].Index = tmpreg;
877 }
878 }
879
880 if (opcode->NumSrcRegs >= 2) {
881 if (t_src_conflict(inst->U.I.SrcReg[1], inst->U.I.SrcReg[0])) {
882 int tmpreg = rc_find_free_temporary(c);
883 struct rc_instruction * inst_mov = rc_insert_new_instruction(c, inst->Prev);
884 inst_mov->U.I.Opcode = RC_OPCODE_MOV;
885 inst_mov->U.I.DstReg.File = RC_FILE_TEMPORARY;
886 inst_mov->U.I.DstReg.Index = tmpreg;
887 inst_mov->U.I.SrcReg[0] = inst->U.I.SrcReg[1];
888
889 reset_srcreg(&inst->U.I.SrcReg[1]);
890 inst->U.I.SrcReg[1].File = RC_FILE_TEMPORARY;
891 inst->U.I.SrcReg[1].Index = tmpreg;
892 }
893 }
894
895 return 1;
896 }
897
898 static void rc_vs_add_artificial_outputs(struct radeon_compiler *c, void *user)
899 {
900 struct r300_vertex_program_compiler * compiler = (struct r300_vertex_program_compiler*)c;
901 int i;
902
903 for(i = 0; i < 32; ++i) {
904 if ((compiler->RequiredOutputs & (1 << i)) &&
905 !(compiler->Base.Program.OutputsWritten & (1 << i))) {
906 struct rc_instruction * inst = rc_insert_new_instruction(&compiler->Base, compiler->Base.Program.Instructions.Prev);
907 inst->U.I.Opcode = RC_OPCODE_MOV;
908
909 inst->U.I.DstReg.File = RC_FILE_OUTPUT;
910 inst->U.I.DstReg.Index = i;
911 inst->U.I.DstReg.WriteMask = RC_MASK_XYZW;
912
913 inst->U.I.SrcReg[0].File = RC_FILE_CONSTANT;
914 inst->U.I.SrcReg[0].Index = 0;
915 inst->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_XYZW;
916
917 compiler->Base.Program.OutputsWritten |= 1 << i;
918 }
919 }
920 }
921
922 static void dataflow_outputs_mark_used(void * userdata, void * data,
923 void (*callback)(void *, unsigned int, unsigned int))
924 {
925 struct r300_vertex_program_compiler * c = userdata;
926 int i;
927
928 for(i = 0; i < 32; ++i) {
929 if (c->RequiredOutputs & (1 << i))
930 callback(data, i, RC_MASK_XYZW);
931 }
932 }
933
934 static int swizzle_is_native(rc_opcode opcode, struct rc_src_register reg)
935 {
936 (void) opcode;
937 (void) reg;
938
939 return 1;
940 }
941
942 static void transform_negative_addressing(struct r300_vertex_program_compiler *c,
943 struct rc_instruction *arl,
944 struct rc_instruction *end,
945 int min_offset)
946 {
947 struct rc_instruction *inst, *add;
948 unsigned const_swizzle;
949
950 /* Transform ARL */
951 add = rc_insert_new_instruction(&c->Base, arl->Prev);
952 add->U.I.Opcode = RC_OPCODE_ADD;
953 add->U.I.DstReg.File = RC_FILE_TEMPORARY;
954 add->U.I.DstReg.Index = rc_find_free_temporary(&c->Base);
955 add->U.I.DstReg.WriteMask = RC_MASK_X;
956 add->U.I.SrcReg[0] = arl->U.I.SrcReg[0];
957 add->U.I.SrcReg[1].File = RC_FILE_CONSTANT;
958 add->U.I.SrcReg[1].Index = rc_constants_add_immediate_scalar(&c->Base.Program.Constants,
959 min_offset, &const_swizzle);
960 add->U.I.SrcReg[1].Swizzle = const_swizzle;
961
962 arl->U.I.SrcReg[0].File = RC_FILE_TEMPORARY;
963 arl->U.I.SrcReg[0].Index = add->U.I.DstReg.Index;
964 arl->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_XXXX;
965
966 /* Rewrite offsets up to and excluding inst. */
967 for (inst = arl->Next; inst != end; inst = inst->Next) {
968 const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
969
970 for (unsigned i = 0; i < opcode->NumSrcRegs; i++)
971 if (inst->U.I.SrcReg[i].RelAddr)
972 inst->U.I.SrcReg[i].Index -= min_offset;
973 }
974 }
975
976 static void rc_emulate_negative_addressing(struct radeon_compiler *compiler, void *user)
977 {
978 struct r300_vertex_program_compiler * c = (struct r300_vertex_program_compiler*)compiler;
979 struct rc_instruction *inst, *lastARL = NULL;
980 int min_offset = 0;
981
982 for (inst = c->Base.Program.Instructions.Next; inst != &c->Base.Program.Instructions; inst = inst->Next) {
983 const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
984
985 if (inst->U.I.Opcode == RC_OPCODE_ARL) {
986 if (lastARL != NULL && min_offset < 0)
987 transform_negative_addressing(c, lastARL, inst, min_offset);
988
989 lastARL = inst;
990 min_offset = 0;
991 continue;
992 }
993
994 for (unsigned i = 0; i < opcode->NumSrcRegs; i++) {
995 if (inst->U.I.SrcReg[i].RelAddr &&
996 inst->U.I.SrcReg[i].Index < 0) {
997 /* ARL must precede any indirect addressing. */
998 if (lastARL == NULL) {
999 rc_error(&c->Base, "Vertex shader: Found relative addressing without ARL.");
1000 return;
1001 }
1002
1003 if (inst->U.I.SrcReg[i].Index < min_offset)
1004 min_offset = inst->U.I.SrcReg[i].Index;
1005 }
1006 }
1007 }
1008
1009 if (lastARL != NULL && min_offset < 0)
1010 transform_negative_addressing(c, lastARL, inst, min_offset);
1011 }
1012
1013 static struct rc_swizzle_caps r300_vertprog_swizzle_caps = {
1014 .IsNative = &swizzle_is_native,
1015 .Split = 0 /* should never be called */
1016 };
1017
1018 void r3xx_compile_vertex_program(struct r300_vertex_program_compiler *c)
1019 {
1020 int is_r500 = c->Base.is_r500;
1021 int kill_consts = c->Base.remove_unused_constants;
1022 int opt = !c->Base.disable_optimizations;
1023
1024 /* Lists of instruction transformations. */
1025 struct radeon_program_transformation alu_rewrite_r500[] = {
1026 { &r300_transform_vertex_alu, 0 },
1027 { &r300_transform_trig_scale_vertex, 0 },
1028 { 0, 0 }
1029 };
1030
1031 struct radeon_program_transformation alu_rewrite_r300[] = {
1032 { &r300_transform_vertex_alu, 0 },
1033 { &r300_transform_trig_simple, 0 },
1034 { 0, 0 }
1035 };
1036
1037 /* Note: These passes have to be done seperately from ALU rewrite,
1038 * otherwise non-native ALU instructions with source conflits
1039 * or non-native modifiers will not be treated properly.
1040 */
1041 struct radeon_program_transformation emulate_modifiers[] = {
1042 { &transform_nonnative_modifiers, 0 },
1043 { 0, 0 }
1044 };
1045
1046 struct radeon_program_transformation resolve_src_conflicts[] = {
1047 { &transform_source_conflicts, 0 },
1048 { 0, 0 }
1049 };
1050
1051 /* List of compiler passes. */
1052 struct radeon_compiler_pass vs_list[] = {
1053 /* NAME DUMP PREDICATE FUNCTION PARAM */
1054 {"add artificial outputs", 0, 1, rc_vs_add_artificial_outputs, NULL},
1055 {"transform loops", 1, 1, rc_transform_loops, NULL},
1056 {"emulate branches", 1, !is_r500, rc_emulate_branches, NULL},
1057 {"emulate negative addressing", 1, 1, rc_emulate_negative_addressing, NULL},
1058 {"native rewrite", 1, is_r500, rc_local_transform, alu_rewrite_r500},
1059 {"native rewrite", 1, !is_r500, rc_local_transform, alu_rewrite_r300},
1060 {"emulate modifiers", 1, !is_r500, rc_local_transform, emulate_modifiers},
1061 {"deadcode", 1, opt, rc_dataflow_deadcode, dataflow_outputs_mark_used},
1062 {"dataflow optimize", 1, opt, rc_optimize, NULL},
1063 /* This pass must be done after optimizations. */
1064 {"source conflict resolve", 1, 1, rc_local_transform, resolve_src_conflicts},
1065 {"dataflow swizzles", 1, 1, rc_dataflow_swizzles, NULL},
1066 {"register allocation", 1, opt, allocate_temporary_registers, NULL},
1067 {"dead constants", 1, kill_consts, rc_remove_unused_constants, &c->code->constants_remap_table},
1068 {"final code validation", 0, 1, rc_validate_final_shader, NULL},
1069 {"machine code generation", 0, 1, translate_vertex_program, NULL},
1070 {"dump machine code", 0, c->Base.Debug & RC_DBG_LOG, r300_vertex_program_dump, NULL},
1071 {NULL, 0, 0, NULL, NULL}
1072 };
1073
1074 c->Base.SwizzleCaps = &r300_vertprog_swizzle_caps;
1075
1076 rc_run_compiler(&c->Base, vs_list, "Vertex Program");
1077
1078 c->code->InputsRead = c->Base.Program.InputsRead;
1079 c->code->OutputsWritten = c->Base.Program.OutputsWritten;
1080 rc_constants_copy(&c->code->constants, &c->Base.Program.Constants);
1081 }