/*
 * Copyright 2009 Nicolai Hähnle <nhaehnle@gmail.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE. */

#include "radeon_compiler.h"

#include <stdio.h>

#include "../r300_reg.h"

#include "radeon_dataflow.h"
#include "radeon_program_alu.h"
#include "radeon_swizzle.h"
#include "radeon_emulate_branches.h"
#include "radeon_emulate_loops.h"
#include "radeon_remove_constants.h"

/*
 * Bookkeeping for one open BGNLOOP while machine code is generated.
 * NOTE(review): this definition was not visible in the damaged source; it is
 * restored here because translate_vertex_program() uses `struct loop` and its
 * BgnLoop member. Confirm the member type.
 */
struct loop {
	int BgnLoop;
};

/*
 * Take an already-setup and valid source then swizzle it appropriately to
 * obtain a constant ZERO or ONE source.
 *
 * x: index into vpi->SrcReg[] of the source whose register is reused.
 * y: swizzle selector replicated to all four components.
 *
 * Expects `vp` and `vpi` to be in scope at the point of expansion.
 */
#define __CONST(x, y)	\
	(PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[x]),	\
			   t_swizzle(y),	\
			   t_swizzle(y),	\
			   t_swizzle(y),	\
			   t_swizzle(y),	\
			   t_src_class(vpi->SrcReg[x].File), \
			   RC_MASK_NONE) | (vpi->SrcReg[x].RelAddr << 4))

55 static unsigned long t_dst_mask(unsigned int mask
)
57 /* RC_MASK_* is equivalent to VSF_FLAG_* */
58 return mask
& RC_MASK_XYZW
;
61 static unsigned long t_dst_class(rc_register_file file
)
65 fprintf(stderr
, "%s: Bad register file %i\n", __FUNCTION__
, file
);
67 case RC_FILE_TEMPORARY
:
68 return PVS_DST_REG_TEMPORARY
;
70 return PVS_DST_REG_OUT
;
72 return PVS_DST_REG_A0
;
76 static unsigned long t_dst_index(struct r300_vertex_program_code
*vp
,
77 struct rc_dst_register
*dst
)
79 if (dst
->File
== RC_FILE_OUTPUT
)
80 return vp
->outputs
[dst
->Index
];
85 static unsigned long t_src_class(rc_register_file file
)
89 fprintf(stderr
, "%s: Bad register file %i\n", __FUNCTION__
, file
);
92 case RC_FILE_TEMPORARY
:
93 return PVS_SRC_REG_TEMPORARY
;
95 return PVS_SRC_REG_INPUT
;
96 case RC_FILE_CONSTANT
:
97 return PVS_SRC_REG_CONSTANT
;
101 static int t_src_conflict(struct rc_src_register a
, struct rc_src_register b
)
103 unsigned long aclass
= t_src_class(a
.File
);
104 unsigned long bclass
= t_src_class(b
.File
);
106 if (aclass
!= bclass
)
108 if (aclass
== PVS_SRC_REG_TEMPORARY
)
111 if (a
.RelAddr
|| b
.RelAddr
)
113 if (a
.Index
!= b
.Index
)
/* Translate a compiler swizzle selector into the hardware encoding. */
static inline unsigned long t_swizzle(unsigned int swizzle)
{
	/* this is in fact a NOP as the Mesa RC_SWIZZLE_* are all identical to VSF_IN_COMPONENT_* */
	return swizzle;
}

125 static unsigned long t_src_index(struct r300_vertex_program_code
*vp
,
126 struct rc_src_register
*src
)
128 if (src
->File
== RC_FILE_INPUT
) {
129 assert(vp
->inputs
[src
->Index
] != -1);
130 return vp
->inputs
[src
->Index
];
132 if (src
->Index
< 0) {
134 "negative offsets for indirect addressing do not work.\n");
141 /* these two functions should probably be merged... */
143 static unsigned long t_src(struct r300_vertex_program_code
*vp
,
144 struct rc_src_register
*src
)
146 /* src->Negate uses the RC_MASK_ flags from program_instruction.h,
147 * which equal our VSF_FLAGS_ values, so it's safe to just pass it here.
149 return PVS_SRC_OPERAND(t_src_index(vp
, src
),
150 t_swizzle(GET_SWZ(src
->Swizzle
, 0)),
151 t_swizzle(GET_SWZ(src
->Swizzle
, 1)),
152 t_swizzle(GET_SWZ(src
->Swizzle
, 2)),
153 t_swizzle(GET_SWZ(src
->Swizzle
, 3)),
154 t_src_class(src
->File
),
156 (src
->RelAddr
<< 4) | (src
->Abs
<< 3);
159 static unsigned long t_src_scalar(struct r300_vertex_program_code
*vp
,
160 struct rc_src_register
*src
)
162 /* src->Negate uses the RC_MASK_ flags from program_instruction.h,
163 * which equal our VSF_FLAGS_ values, so it's safe to just pass it here.
165 return PVS_SRC_OPERAND(t_src_index(vp
, src
),
166 t_swizzle(GET_SWZ(src
->Swizzle
, 0)),
167 t_swizzle(GET_SWZ(src
->Swizzle
, 0)),
168 t_swizzle(GET_SWZ(src
->Swizzle
, 0)),
169 t_swizzle(GET_SWZ(src
->Swizzle
, 0)),
170 t_src_class(src
->File
),
171 src
->Negate
? RC_MASK_XYZW
: RC_MASK_NONE
) |
172 (src
->RelAddr
<< 4) | (src
->Abs
<< 3);
175 static int valid_dst(struct r300_vertex_program_code
*vp
,
176 struct rc_dst_register
*dst
)
178 if (dst
->File
== RC_FILE_OUTPUT
&& vp
->outputs
[dst
->Index
] == -1) {
180 } else if (dst
->File
== RC_FILE_ADDRESS
) {
181 assert(dst
->Index
== 0);
187 static void ei_vector1(struct r300_vertex_program_code
*vp
,
188 unsigned int hw_opcode
,
189 struct rc_sub_instruction
*vpi
,
192 inst
[0] = PVS_OP_DST_OPERAND(hw_opcode
,
195 t_dst_index(vp
, &vpi
->DstReg
),
196 t_dst_mask(vpi
->DstReg
.WriteMask
),
197 t_dst_class(vpi
->DstReg
.File
));
198 inst
[1] = t_src(vp
, &vpi
->SrcReg
[0]);
199 inst
[2] = __CONST(0, RC_SWIZZLE_ZERO
);
200 inst
[3] = __CONST(0, RC_SWIZZLE_ZERO
);
203 static void ei_vector2(struct r300_vertex_program_code
*vp
,
204 unsigned int hw_opcode
,
205 struct rc_sub_instruction
*vpi
,
208 inst
[0] = PVS_OP_DST_OPERAND(hw_opcode
,
211 t_dst_index(vp
, &vpi
->DstReg
),
212 t_dst_mask(vpi
->DstReg
.WriteMask
),
213 t_dst_class(vpi
->DstReg
.File
));
214 inst
[1] = t_src(vp
, &vpi
->SrcReg
[0]);
215 inst
[2] = t_src(vp
, &vpi
->SrcReg
[1]);
216 inst
[3] = __CONST(1, RC_SWIZZLE_ZERO
);
219 static void ei_math1(struct r300_vertex_program_code
*vp
,
220 unsigned int hw_opcode
,
221 struct rc_sub_instruction
*vpi
,
224 inst
[0] = PVS_OP_DST_OPERAND(hw_opcode
,
227 t_dst_index(vp
, &vpi
->DstReg
),
228 t_dst_mask(vpi
->DstReg
.WriteMask
),
229 t_dst_class(vpi
->DstReg
.File
));
230 inst
[1] = t_src_scalar(vp
, &vpi
->SrcReg
[0]);
231 inst
[2] = __CONST(0, RC_SWIZZLE_ZERO
);
232 inst
[3] = __CONST(0, RC_SWIZZLE_ZERO
);
235 static void ei_lit(struct r300_vertex_program_code
*vp
,
236 struct rc_sub_instruction
*vpi
,
239 //LIT TMP 1.Y Z TMP 1{} {X W Z Y} TMP 1{} {Y W Z X} TMP 1{} {Y X Z W}
241 inst
[0] = PVS_OP_DST_OPERAND(ME_LIGHT_COEFF_DX
,
244 t_dst_index(vp
, &vpi
->DstReg
),
245 t_dst_mask(vpi
->DstReg
.WriteMask
),
246 t_dst_class(vpi
->DstReg
.File
));
247 /* NOTE: Users swizzling might not work. */
248 inst
[1] = PVS_SRC_OPERAND(t_src_index(vp
, &vpi
->SrcReg
[0]), t_swizzle(GET_SWZ(vpi
->SrcReg
[0].Swizzle
, 0)), // X
249 t_swizzle(GET_SWZ(vpi
->SrcReg
[0].Swizzle
, 3)), // W
250 PVS_SRC_SELECT_FORCE_0
, // Z
251 t_swizzle(GET_SWZ(vpi
->SrcReg
[0].Swizzle
, 1)), // Y
252 t_src_class(vpi
->SrcReg
[0].File
),
253 vpi
->SrcReg
[0].Negate
? RC_MASK_XYZW
: RC_MASK_NONE
) |
254 (vpi
->SrcReg
[0].RelAddr
<< 4);
255 inst
[2] = PVS_SRC_OPERAND(t_src_index(vp
, &vpi
->SrcReg
[0]), t_swizzle(GET_SWZ(vpi
->SrcReg
[0].Swizzle
, 1)), // Y
256 t_swizzle(GET_SWZ(vpi
->SrcReg
[0].Swizzle
, 3)), // W
257 PVS_SRC_SELECT_FORCE_0
, // Z
258 t_swizzle(GET_SWZ(vpi
->SrcReg
[0].Swizzle
, 0)), // X
259 t_src_class(vpi
->SrcReg
[0].File
),
260 vpi
->SrcReg
[0].Negate
? RC_MASK_XYZW
: RC_MASK_NONE
) |
261 (vpi
->SrcReg
[0].RelAddr
<< 4);
262 inst
[3] = PVS_SRC_OPERAND(t_src_index(vp
, &vpi
->SrcReg
[0]), t_swizzle(GET_SWZ(vpi
->SrcReg
[0].Swizzle
, 1)), // Y
263 t_swizzle(GET_SWZ(vpi
->SrcReg
[0].Swizzle
, 0)), // X
264 PVS_SRC_SELECT_FORCE_0
, // Z
265 t_swizzle(GET_SWZ(vpi
->SrcReg
[0].Swizzle
, 3)), // W
266 t_src_class(vpi
->SrcReg
[0].File
),
267 vpi
->SrcReg
[0].Negate
? RC_MASK_XYZW
: RC_MASK_NONE
) |
268 (vpi
->SrcReg
[0].RelAddr
<< 4);
271 static void ei_mad(struct r300_vertex_program_code
*vp
,
272 struct rc_sub_instruction
*vpi
,
275 /* Remarks about hardware limitations of MAD
276 * (please preserve this comment, as this information is _NOT_
277 * in the documentation provided by AMD).
279 * As described in the documentation, MAD with three unique temporary
280 * source registers requires the use of the macro version.
282 * However (and this is not mentioned in the documentation), apparently
283 * the macro version is _NOT_ a full superset of the normal version.
284 * In particular, the macro version does not always work when relative
285 * addressing is used in the source operands.
287 * This limitation caused incorrect rendering in Sauerbraten's OpenGL
288 * assembly shader path when using medium quality animations
289 * (i.e. animations with matrix blending instead of quaternion blending).
291 * Unfortunately, I (nha) have been unable to extract a Piglit regression
292 * test for this issue - for some reason, it is possible to have vertex
293 * programs whose prefix is *exactly* the same as the prefix of the
294 * offending program in Sauerbraten up to the offending instruction
295 * without causing any trouble.
297 * Bottom line: Only use the macro version only when really necessary;
298 * according to AMD docs, this should improve performance by one clock
299 * as a nice side bonus.
301 if (vpi
->SrcReg
[0].File
== RC_FILE_TEMPORARY
&&
302 vpi
->SrcReg
[1].File
== RC_FILE_TEMPORARY
&&
303 vpi
->SrcReg
[2].File
== RC_FILE_TEMPORARY
&&
304 vpi
->SrcReg
[0].Index
!= vpi
->SrcReg
[1].Index
&&
305 vpi
->SrcReg
[0].Index
!= vpi
->SrcReg
[2].Index
&&
306 vpi
->SrcReg
[1].Index
!= vpi
->SrcReg
[2].Index
) {
307 inst
[0] = PVS_OP_DST_OPERAND(PVS_MACRO_OP_2CLK_MADD
,
310 t_dst_index(vp
, &vpi
->DstReg
),
311 t_dst_mask(vpi
->DstReg
.WriteMask
),
312 t_dst_class(vpi
->DstReg
.File
));
314 inst
[0] = PVS_OP_DST_OPERAND(VE_MULTIPLY_ADD
,
317 t_dst_index(vp
, &vpi
->DstReg
),
318 t_dst_mask(vpi
->DstReg
.WriteMask
),
319 t_dst_class(vpi
->DstReg
.File
));
321 inst
[1] = t_src(vp
, &vpi
->SrcReg
[0]);
322 inst
[2] = t_src(vp
, &vpi
->SrcReg
[1]);
323 inst
[3] = t_src(vp
, &vpi
->SrcReg
[2]);
326 static void ei_pow(struct r300_vertex_program_code
*vp
,
327 struct rc_sub_instruction
*vpi
,
330 inst
[0] = PVS_OP_DST_OPERAND(ME_POWER_FUNC_FF
,
333 t_dst_index(vp
, &vpi
->DstReg
),
334 t_dst_mask(vpi
->DstReg
.WriteMask
),
335 t_dst_class(vpi
->DstReg
.File
));
336 inst
[1] = t_src_scalar(vp
, &vpi
->SrcReg
[0]);
337 inst
[2] = __CONST(0, RC_SWIZZLE_ZERO
);
338 inst
[3] = t_src_scalar(vp
, &vpi
->SrcReg
[1]);
341 static void mark_write(void * userdata
, struct rc_instruction
* inst
,
342 rc_register_file file
, unsigned int index
, unsigned int mask
)
344 unsigned int * writemasks
= userdata
;
346 if (file
!= RC_FILE_TEMPORARY
)
349 if (index
>= R300_VS_MAX_TEMPS
)
352 writemasks
[index
] |= mask
;
355 static unsigned long t_pred_src(struct r300_vertex_program_compiler
* compiler
)
357 return PVS_SRC_OPERAND(compiler
->PredicateIndex
,
358 t_swizzle(RC_SWIZZLE_ZERO
),
359 t_swizzle(RC_SWIZZLE_ZERO
),
360 t_swizzle(RC_SWIZZLE_ZERO
),
361 t_swizzle(RC_SWIZZLE_W
),
362 t_src_class(RC_FILE_TEMPORARY
),
366 static unsigned long t_pred_dst(struct r300_vertex_program_compiler
* compiler
,
367 unsigned int hw_opcode
, int is_math
)
369 return PVS_OP_DST_OPERAND(hw_opcode
,
372 compiler
->PredicateIndex
,
374 t_dst_class(RC_FILE_TEMPORARY
));
378 static void ei_if(struct r300_vertex_program_compiler
* compiler
,
379 struct rc_instruction
*rci
,
381 unsigned int branch_depth
)
383 unsigned int predicate_opcode
;
386 if (!compiler
->Base
.is_r500
) {
387 rc_error(&compiler
->Base
,"Opcode IF not supported\n");
391 /* Reserve a temporary to use as our predicate stack counter, if we
392 * don't already have one. */
393 if (!compiler
->PredicateMask
) {
394 unsigned int writemasks
[RC_REGISTER_MAX_INDEX
];
395 struct rc_instruction
* inst
;
397 memset(writemasks
, 0, sizeof(writemasks
));
398 for(inst
= compiler
->Base
.Program
.Instructions
.Next
;
399 inst
!= &compiler
->Base
.Program
.Instructions
;
401 rc_for_all_writes_mask(inst
, mark_write
, writemasks
);
403 for(i
= 0; i
< compiler
->Base
.max_temp_regs
; i
++) {
404 unsigned int mask
= ~writemasks
[i
] & RC_MASK_XYZW
;
405 /* Only the W component can be used fo the predicate
407 if (mask
& RC_MASK_W
) {
408 compiler
->PredicateMask
= RC_MASK_W
;
409 compiler
->PredicateIndex
= i
;
413 if (i
== compiler
->Base
.max_temp_regs
) {
414 rc_error(&compiler
->Base
, "No free temporary to use for"
415 " predicate stack counter.\n");
420 branch_depth
? VE_PRED_SET_NEQ_PUSH
: ME_PRED_SET_NEQ
;
422 rci
->U
.I
.SrcReg
[0].Swizzle
= RC_MAKE_SWIZZLE_SMEAR(GET_SWZ(rci
->U
.I
.SrcReg
[0].Swizzle
,0));
423 if (branch_depth
== 0) {
425 predicate_opcode
= ME_PRED_SET_NEQ
;
426 inst
[1] = t_src(compiler
->code
, &rci
->U
.I
.SrcReg
[0]);
429 predicate_opcode
= VE_PRED_SET_NEQ_PUSH
;
430 inst
[1] = t_pred_src(compiler
);
431 inst
[2] = t_src(compiler
->code
, &rci
->U
.I
.SrcReg
[0]);
434 inst
[0] = t_pred_dst(compiler
, predicate_opcode
, is_math
);
439 static void ei_else(struct r300_vertex_program_compiler
* compiler
,
442 if (!compiler
->Base
.is_r500
) {
443 rc_error(&compiler
->Base
,"Opcode ELSE not supported\n");
446 inst
[0] = t_pred_dst(compiler
, ME_PRED_SET_INV
, 1);
447 inst
[1] = t_pred_src(compiler
);
452 static void ei_endif(struct r300_vertex_program_compiler
*compiler
,
455 if (!compiler
->Base
.is_r500
) {
456 rc_error(&compiler
->Base
,"Opcode ENDIF not supported\n");
459 inst
[0] = t_pred_dst(compiler
, ME_PRED_SET_POP
, 1);
460 inst
[1] = t_pred_src(compiler
);
465 static void translate_vertex_program(struct radeon_compiler
*c
, void *user
)
467 struct r300_vertex_program_compiler
*compiler
= (struct r300_vertex_program_compiler
*)c
;
468 struct rc_instruction
*rci
;
470 struct loop
* loops
= NULL
;
471 int current_loop_depth
= 0;
472 int loops_reserved
= 0;
474 unsigned int branch_depth
= 0;
476 compiler
->code
->pos_end
= 0; /* Not supported yet */
477 compiler
->code
->length
= 0;
478 compiler
->code
->num_temporaries
= 0;
480 compiler
->SetHwInputOutput(compiler
);
482 for(rci
= compiler
->Base
.Program
.Instructions
.Next
; rci
!= &compiler
->Base
.Program
.Instructions
; rci
= rci
->Next
) {
483 struct rc_sub_instruction
*vpi
= &rci
->U
.I
;
484 unsigned int *inst
= compiler
->code
->body
.d
+ compiler
->code
->length
;
485 const struct rc_opcode_info
*info
= rc_get_opcode_info(vpi
->Opcode
);
487 /* Skip instructions writing to non-existing destination */
488 if (!valid_dst(compiler
->code
, &vpi
->DstReg
))
491 if (info
->HasDstReg
) {
492 /* Relative addressing of destination operands is not supported yet. */
493 if (vpi
->DstReg
.RelAddr
) {
494 rc_error(&compiler
->Base
, "Vertex program does not support relative "
495 "addressing of destination operands (yet).\n");
499 /* Neither is Saturate. */
500 if (vpi
->SaturateMode
!= RC_SATURATE_NONE
) {
501 rc_error(&compiler
->Base
, "Vertex program does not support the Saturate "
502 "modifier (yet).\n");
506 if (compiler
->code
->length
>= c
->max_alu_insts
* 4) {
507 rc_error(&compiler
->Base
, "Vertex program has too many instructions\n");
511 assert(compiler
->Base
.is_r500
||
512 (vpi
->Opcode
!= RC_OPCODE_SEQ
&&
513 vpi
->Opcode
!= RC_OPCODE_SNE
));
515 switch (vpi
->Opcode
) {
516 case RC_OPCODE_ADD
: ei_vector2(compiler
->code
, VE_ADD
, vpi
, inst
); break;
517 case RC_OPCODE_ARL
: ei_vector1(compiler
->code
, VE_FLT2FIX_DX
, vpi
, inst
); break;
518 case RC_OPCODE_COS
: ei_math1(compiler
->code
, ME_COS
, vpi
, inst
); break;
519 case RC_OPCODE_DP4
: ei_vector2(compiler
->code
, VE_DOT_PRODUCT
, vpi
, inst
); break;
520 case RC_OPCODE_DST
: ei_vector2(compiler
->code
, VE_DISTANCE_VECTOR
, vpi
, inst
); break;
521 case RC_OPCODE_ELSE
: ei_else(compiler
, inst
); break;
522 case RC_OPCODE_ENDIF
: ei_endif(compiler
, inst
); branch_depth
--; break;
523 case RC_OPCODE_EX2
: ei_math1(compiler
->code
, ME_EXP_BASE2_FULL_DX
, vpi
, inst
); break;
524 case RC_OPCODE_EXP
: ei_math1(compiler
->code
, ME_EXP_BASE2_DX
, vpi
, inst
); break;
525 case RC_OPCODE_FRC
: ei_vector1(compiler
->code
, VE_FRACTION
, vpi
, inst
); break;
526 case RC_OPCODE_IF
: ei_if(compiler
, rci
, inst
, branch_depth
); branch_depth
++; break;
527 case RC_OPCODE_LG2
: ei_math1(compiler
->code
, ME_LOG_BASE2_FULL_DX
, vpi
, inst
); break;
528 case RC_OPCODE_LIT
: ei_lit(compiler
->code
, vpi
, inst
); break;
529 case RC_OPCODE_LOG
: ei_math1(compiler
->code
, ME_LOG_BASE2_DX
, vpi
, inst
); break;
530 case RC_OPCODE_MAD
: ei_mad(compiler
->code
, vpi
, inst
); break;
531 case RC_OPCODE_MAX
: ei_vector2(compiler
->code
, VE_MAXIMUM
, vpi
, inst
); break;
532 case RC_OPCODE_MIN
: ei_vector2(compiler
->code
, VE_MINIMUM
, vpi
, inst
); break;
533 case RC_OPCODE_MOV
: ei_vector1(compiler
->code
, VE_ADD
, vpi
, inst
); break;
534 case RC_OPCODE_MUL
: ei_vector2(compiler
->code
, VE_MULTIPLY
, vpi
, inst
); break;
535 case RC_OPCODE_POW
: ei_pow(compiler
->code
, vpi
, inst
); break;
536 case RC_OPCODE_RCP
: ei_math1(compiler
->code
, ME_RECIP_DX
, vpi
, inst
); break;
537 case RC_OPCODE_RSQ
: ei_math1(compiler
->code
, ME_RECIP_SQRT_DX
, vpi
, inst
); break;
538 case RC_OPCODE_SEQ
: ei_vector2(compiler
->code
, VE_SET_EQUAL
, vpi
, inst
); break;
539 case RC_OPCODE_SGE
: ei_vector2(compiler
->code
, VE_SET_GREATER_THAN_EQUAL
, vpi
, inst
); break;
540 case RC_OPCODE_SIN
: ei_math1(compiler
->code
, ME_SIN
, vpi
, inst
); break;
541 case RC_OPCODE_SLT
: ei_vector2(compiler
->code
, VE_SET_LESS_THAN
, vpi
, inst
); break;
542 case RC_OPCODE_SNE
: ei_vector2(compiler
->code
, VE_SET_NOT_EQUAL
, vpi
, inst
); break;
543 case RC_OPCODE_BGNLOOP
:
547 if ((!compiler
->Base
.is_r500
548 && loops_reserved
>= R300_VS_MAX_LOOP_DEPTH
)
549 || loops_reserved
>= R500_VS_MAX_FC_DEPTH
) {
550 rc_error(&compiler
->Base
,
551 "Loops are nested too deep.");
554 memory_pool_array_reserve(&compiler
->Base
.Pool
,
555 struct loop
, loops
, current_loop_depth
,
557 l
= &loops
[current_loop_depth
++];
558 memset(l
, 0, sizeof(struct loop
));
559 l
->BgnLoop
= (compiler
->code
->length
/ 4);
562 case RC_OPCODE_ENDLOOP
:
565 unsigned int act_addr
;
566 unsigned int last_addr
;
567 unsigned int ret_addr
;
570 l
= &loops
[current_loop_depth
- 1];
571 act_addr
= l
->BgnLoop
- 1;
572 last_addr
= (compiler
->code
->length
/ 4) - 1;
573 ret_addr
= l
->BgnLoop
;
575 if (loops_reserved
>= R300_VS_MAX_FC_OPS
) {
576 rc_error(&compiler
->Base
,
577 "Too many flow control instructions.");
580 if (compiler
->Base
.is_r500
) {
581 compiler
->code
->fc_op_addrs
.r500
582 [compiler
->code
->num_fc_ops
].lw
=
583 R500_PVS_FC_ACT_ADRS(act_addr
)
584 | R500_PVS_FC_LOOP_CNT_JMP_INST(0xffff)
586 compiler
->code
->fc_op_addrs
.r500
587 [compiler
->code
->num_fc_ops
].uw
=
588 R500_PVS_FC_LAST_INST(last_addr
)
589 | R500_PVS_FC_RTN_INST(ret_addr
)
592 compiler
->code
->fc_op_addrs
.r300
593 [compiler
->code
->num_fc_ops
] =
594 R300_PVS_FC_ACT_ADRS(act_addr
)
595 | R300_PVS_FC_LOOP_CNT_JMP_INST(0xff)
596 | R300_PVS_FC_LAST_INST(last_addr
)
597 | R300_PVS_FC_RTN_INST(ret_addr
)
600 compiler
->code
->fc_loop_index
[compiler
->code
->num_fc_ops
] =
601 R300_PVS_FC_LOOP_INIT_VAL(0x0)
602 | R300_PVS_FC_LOOP_STEP_VAL(0x1)
604 compiler
->code
->fc_ops
|= R300_VAP_PVS_FC_OPC_LOOP(
605 compiler
->code
->num_fc_ops
);
606 compiler
->code
->num_fc_ops
++;
607 current_loop_depth
--;
612 rc_error(&compiler
->Base
, "Unknown opcode %s\n", info
->Name
);
616 /* Non-flow control instructions that are inside an if statement
617 * need to pay attention to the predicate bit. */
619 && vpi
->Opcode
!= RC_OPCODE_IF
620 && vpi
->Opcode
!= RC_OPCODE_ELSE
621 && vpi
->Opcode
!= RC_OPCODE_ENDIF
) {
623 inst
[0] |= (PVS_DST_PRED_ENABLE_MASK
624 << PVS_DST_PRED_ENABLE_SHIFT
);
625 inst
[0] |= (PVS_DST_PRED_SENSE_MASK
626 << PVS_DST_PRED_SENSE_SHIFT
);
629 /* Update the number of temporaries. */
630 if (info
->HasDstReg
&& vpi
->DstReg
.File
== RC_FILE_TEMPORARY
&&
631 vpi
->DstReg
.Index
>= compiler
->code
->num_temporaries
)
632 compiler
->code
->num_temporaries
= vpi
->DstReg
.Index
+ 1;
634 for (unsigned i
= 0; i
< info
->NumSrcRegs
; i
++)
635 if (vpi
->SrcReg
[i
].File
== RC_FILE_TEMPORARY
&&
636 vpi
->SrcReg
[i
].Index
>= compiler
->code
->num_temporaries
)
637 compiler
->code
->num_temporaries
= vpi
->SrcReg
[i
].Index
+ 1;
639 if (compiler
->PredicateMask
)
640 if (compiler
->PredicateIndex
>= compiler
->code
->num_temporaries
)
641 compiler
->code
->num_temporaries
= compiler
->PredicateIndex
+ 1;
643 if (compiler
->code
->num_temporaries
> compiler
->Base
.max_temp_regs
) {
644 rc_error(&compiler
->Base
, "Too many temporaries.\n");
648 compiler
->code
->length
+= 4;
650 if (compiler
->Base
.Error
)
/* Per-original-temporary state used by allocate_temporary_registers(). */
struct temporary_allocation {
	unsigned int Allocated:1;		/* set once HwTemp has been chosen */
	unsigned int HwTemp:15;			/* register index assigned to this temporary */
	struct rc_instruction * LastRead;	/* instruction after which the register is released */
};

661 static void allocate_temporary_registers(struct radeon_compiler
*c
, void *user
)
663 struct r300_vertex_program_compiler
*compiler
= (struct r300_vertex_program_compiler
*)c
;
664 struct rc_instruction
*inst
;
665 struct rc_instruction
*end_loop
= NULL
;
666 unsigned int num_orig_temps
= 0;
667 char hwtemps
[RC_REGISTER_MAX_INDEX
];
668 struct temporary_allocation
* ta
;
670 struct rc_instruction
*last_inst_src_reladdr
= NULL
;
672 memset(hwtemps
, 0, sizeof(hwtemps
));
676 /* Pass 1: Count original temporaries. */
677 for(inst
= compiler
->Base
.Program
.Instructions
.Next
; inst
!= &compiler
->Base
.Program
.Instructions
; inst
= inst
->Next
) {
678 const struct rc_opcode_info
* opcode
= rc_get_opcode_info(inst
->U
.I
.Opcode
);
680 for (i
= 0; i
< opcode
->NumSrcRegs
; ++i
) {
681 if (inst
->U
.I
.SrcReg
[i
].File
== RC_FILE_TEMPORARY
) {
682 if (inst
->U
.I
.SrcReg
[i
].Index
>= num_orig_temps
)
683 num_orig_temps
= inst
->U
.I
.SrcReg
[i
].Index
+ 1;
687 if (opcode
->HasDstReg
) {
688 if (inst
->U
.I
.DstReg
.File
== RC_FILE_TEMPORARY
) {
689 if (inst
->U
.I
.DstReg
.Index
>= num_orig_temps
)
690 num_orig_temps
= inst
->U
.I
.DstReg
.Index
+ 1;
695 /* Pass 2: If there is relative addressing of dst temporaries, we cannot change register indices. Give up.
696 * For src temporaries, save the last instruction which uses relative addressing. */
697 for (inst
= compiler
->Base
.Program
.Instructions
.Next
; inst
!= &compiler
->Base
.Program
.Instructions
; inst
= inst
->Next
) {
698 const struct rc_opcode_info
*opcode
= rc_get_opcode_info(inst
->U
.I
.Opcode
);
700 if (opcode
->HasDstReg
)
701 if (inst
->U
.I
.DstReg
.RelAddr
)
704 for (i
= 0; i
< opcode
->NumSrcRegs
; ++i
) {
705 if (inst
->U
.I
.SrcReg
[i
].File
== RC_FILE_TEMPORARY
&&
706 inst
->U
.I
.SrcReg
[i
].RelAddr
) {
707 last_inst_src_reladdr
= inst
;
712 ta
= (struct temporary_allocation
*)memory_pool_malloc(&compiler
->Base
.Pool
,
713 sizeof(struct temporary_allocation
) * num_orig_temps
);
714 memset(ta
, 0, sizeof(struct temporary_allocation
) * num_orig_temps
);
716 /* Pass 3: Determine original temporary lifetimes */
717 for(inst
= compiler
->Base
.Program
.Instructions
.Next
; inst
!= &compiler
->Base
.Program
.Instructions
; inst
= inst
->Next
) {
718 const struct rc_opcode_info
* opcode
= rc_get_opcode_info(inst
->U
.I
.Opcode
);
719 /* Instructions inside of loops need to use the ENDLOOP
720 * instruction as their LastRead. */
721 if (!end_loop
&& inst
->U
.I
.Opcode
== RC_OPCODE_BGNLOOP
) {
723 struct rc_instruction
* ptr
;
724 for(ptr
= inst
->Next
;
725 ptr
!= &compiler
->Base
.Program
.Instructions
;
727 if (ptr
->U
.I
.Opcode
== RC_OPCODE_BGNLOOP
) {
729 } else if (ptr
->U
.I
.Opcode
== RC_OPCODE_ENDLOOP
) {
739 if (inst
== end_loop
) {
744 for (i
= 0; i
< opcode
->NumSrcRegs
; ++i
) {
745 if (inst
->U
.I
.SrcReg
[i
].File
== RC_FILE_TEMPORARY
) {
746 struct rc_instruction
*last_read
;
748 /* From "last_inst_src_reladdr", "end_loop", and "inst",
749 * select the instruction with the highest instruction index (IP).
750 * Note that "end_loop", if available, has always a higher index than "inst". */
751 if (last_inst_src_reladdr
) {
753 last_read
= last_inst_src_reladdr
->IP
> end_loop
->IP
?
754 last_inst_src_reladdr
: end_loop
;
756 last_read
= last_inst_src_reladdr
->IP
> inst
->IP
?
757 last_inst_src_reladdr
: inst
;
760 last_read
= end_loop
? end_loop
: inst
;
763 ta
[inst
->U
.I
.SrcReg
[i
].Index
].LastRead
= last_read
;
768 /* Pass 4: Register allocation */
769 for(inst
= compiler
->Base
.Program
.Instructions
.Next
; inst
!= &compiler
->Base
.Program
.Instructions
; inst
= inst
->Next
) {
770 const struct rc_opcode_info
* opcode
= rc_get_opcode_info(inst
->U
.I
.Opcode
);
772 if (!last_inst_src_reladdr
|| last_inst_src_reladdr
->IP
< inst
->IP
) {
773 for (i
= 0; i
< opcode
->NumSrcRegs
; ++i
) {
774 if (inst
->U
.I
.SrcReg
[i
].File
== RC_FILE_TEMPORARY
) {
775 unsigned int orig
= inst
->U
.I
.SrcReg
[i
].Index
;
776 inst
->U
.I
.SrcReg
[i
].Index
= ta
[orig
].HwTemp
;
778 if (ta
[orig
].Allocated
&& inst
== ta
[orig
].LastRead
)
779 hwtemps
[ta
[orig
].HwTemp
] = 0;
784 if (opcode
->HasDstReg
) {
785 if (inst
->U
.I
.DstReg
.File
== RC_FILE_TEMPORARY
) {
786 unsigned int orig
= inst
->U
.I
.DstReg
.Index
;
788 if (!ta
[orig
].Allocated
) {
789 for(j
= 0; j
< c
->max_temp_regs
; ++j
) {
793 if (j
>= c
->max_temp_regs
) {
794 rc_error(c
, "Too many temporaries\n");
797 ta
[orig
].Allocated
= 1;
798 if (last_inst_src_reladdr
&&
799 last_inst_src_reladdr
->IP
> inst
->IP
) {
800 ta
[orig
].HwTemp
= orig
;
804 hwtemps
[ta
[orig
].HwTemp
] = 1;
808 inst
->U
.I
.DstReg
.Index
= ta
[orig
].HwTemp
;
815 * R3xx-R4xx vertex engine does not support the Absolute source operand modifier
816 * and the Saturate opcode modifier. Only Absolute is currently transformed.
818 static int transform_nonnative_modifiers(
819 struct radeon_compiler
*c
,
820 struct rc_instruction
*inst
,
823 const struct rc_opcode_info
*opcode
= rc_get_opcode_info(inst
->U
.I
.Opcode
);
826 /* Transform ABS(a) to MAX(a, -a). */
827 for (i
= 0; i
< opcode
->NumSrcRegs
; i
++) {
828 if (inst
->U
.I
.SrcReg
[i
].Abs
) {
829 struct rc_instruction
*new_inst
;
832 inst
->U
.I
.SrcReg
[i
].Abs
= 0;
834 temp
= rc_find_free_temporary(c
);
836 new_inst
= rc_insert_new_instruction(c
, inst
->Prev
);
837 new_inst
->U
.I
.Opcode
= RC_OPCODE_MAX
;
838 new_inst
->U
.I
.DstReg
.File
= RC_FILE_TEMPORARY
;
839 new_inst
->U
.I
.DstReg
.Index
= temp
;
840 new_inst
->U
.I
.SrcReg
[0] = inst
->U
.I
.SrcReg
[i
];
841 new_inst
->U
.I
.SrcReg
[1] = inst
->U
.I
.SrcReg
[i
];
842 new_inst
->U
.I
.SrcReg
[1].Negate
^= RC_MASK_XYZW
;
844 memset(&inst
->U
.I
.SrcReg
[i
], 0, sizeof(inst
->U
.I
.SrcReg
[i
]));
845 inst
->U
.I
.SrcReg
[i
].File
= RC_FILE_TEMPORARY
;
846 inst
->U
.I
.SrcReg
[i
].Index
= temp
;
847 inst
->U
.I
.SrcReg
[i
].Swizzle
= RC_SWIZZLE_XYZW
;
854 * Vertex engine cannot read two inputs or two constants at the same time.
855 * Introduce intermediate MOVs to temporary registers to account for this.
857 static int transform_source_conflicts(
858 struct radeon_compiler
*c
,
859 struct rc_instruction
* inst
,
862 const struct rc_opcode_info
* opcode
= rc_get_opcode_info(inst
->U
.I
.Opcode
);
864 if (opcode
->NumSrcRegs
== 3) {
865 if (t_src_conflict(inst
->U
.I
.SrcReg
[1], inst
->U
.I
.SrcReg
[2])
866 || t_src_conflict(inst
->U
.I
.SrcReg
[0], inst
->U
.I
.SrcReg
[2])) {
867 int tmpreg
= rc_find_free_temporary(c
);
868 struct rc_instruction
* inst_mov
= rc_insert_new_instruction(c
, inst
->Prev
);
869 inst_mov
->U
.I
.Opcode
= RC_OPCODE_MOV
;
870 inst_mov
->U
.I
.DstReg
.File
= RC_FILE_TEMPORARY
;
871 inst_mov
->U
.I
.DstReg
.Index
= tmpreg
;
872 inst_mov
->U
.I
.SrcReg
[0] = inst
->U
.I
.SrcReg
[2];
874 reset_srcreg(&inst
->U
.I
.SrcReg
[2]);
875 inst
->U
.I
.SrcReg
[2].File
= RC_FILE_TEMPORARY
;
876 inst
->U
.I
.SrcReg
[2].Index
= tmpreg
;
880 if (opcode
->NumSrcRegs
>= 2) {
881 if (t_src_conflict(inst
->U
.I
.SrcReg
[1], inst
->U
.I
.SrcReg
[0])) {
882 int tmpreg
= rc_find_free_temporary(c
);
883 struct rc_instruction
* inst_mov
= rc_insert_new_instruction(c
, inst
->Prev
);
884 inst_mov
->U
.I
.Opcode
= RC_OPCODE_MOV
;
885 inst_mov
->U
.I
.DstReg
.File
= RC_FILE_TEMPORARY
;
886 inst_mov
->U
.I
.DstReg
.Index
= tmpreg
;
887 inst_mov
->U
.I
.SrcReg
[0] = inst
->U
.I
.SrcReg
[1];
889 reset_srcreg(&inst
->U
.I
.SrcReg
[1]);
890 inst
->U
.I
.SrcReg
[1].File
= RC_FILE_TEMPORARY
;
891 inst
->U
.I
.SrcReg
[1].Index
= tmpreg
;
898 static void rc_vs_add_artificial_outputs(struct radeon_compiler
*c
, void *user
)
900 struct r300_vertex_program_compiler
* compiler
= (struct r300_vertex_program_compiler
*)c
;
903 for(i
= 0; i
< 32; ++i
) {
904 if ((compiler
->RequiredOutputs
& (1 << i
)) &&
905 !(compiler
->Base
.Program
.OutputsWritten
& (1 << i
))) {
906 struct rc_instruction
* inst
= rc_insert_new_instruction(&compiler
->Base
, compiler
->Base
.Program
.Instructions
.Prev
);
907 inst
->U
.I
.Opcode
= RC_OPCODE_MOV
;
909 inst
->U
.I
.DstReg
.File
= RC_FILE_OUTPUT
;
910 inst
->U
.I
.DstReg
.Index
= i
;
911 inst
->U
.I
.DstReg
.WriteMask
= RC_MASK_XYZW
;
913 inst
->U
.I
.SrcReg
[0].File
= RC_FILE_CONSTANT
;
914 inst
->U
.I
.SrcReg
[0].Index
= 0;
915 inst
->U
.I
.SrcReg
[0].Swizzle
= RC_SWIZZLE_XYZW
;
917 compiler
->Base
.Program
.OutputsWritten
|= 1 << i
;
922 static void dataflow_outputs_mark_used(void * userdata
, void * data
,
923 void (*callback
)(void *, unsigned int, unsigned int))
925 struct r300_vertex_program_compiler
* c
= userdata
;
928 for(i
= 0; i
< 32; ++i
) {
929 if (c
->RequiredOutputs
& (1 << i
))
930 callback(data
, i
, RC_MASK_XYZW
);
934 static int swizzle_is_native(rc_opcode opcode
, struct rc_src_register reg
)
942 static void transform_negative_addressing(struct r300_vertex_program_compiler
*c
,
943 struct rc_instruction
*arl
,
944 struct rc_instruction
*end
,
947 struct rc_instruction
*inst
, *add
;
948 unsigned const_swizzle
;
951 add
= rc_insert_new_instruction(&c
->Base
, arl
->Prev
);
952 add
->U
.I
.Opcode
= RC_OPCODE_ADD
;
953 add
->U
.I
.DstReg
.File
= RC_FILE_TEMPORARY
;
954 add
->U
.I
.DstReg
.Index
= rc_find_free_temporary(&c
->Base
);
955 add
->U
.I
.DstReg
.WriteMask
= RC_MASK_X
;
956 add
->U
.I
.SrcReg
[0] = arl
->U
.I
.SrcReg
[0];
957 add
->U
.I
.SrcReg
[1].File
= RC_FILE_CONSTANT
;
958 add
->U
.I
.SrcReg
[1].Index
= rc_constants_add_immediate_scalar(&c
->Base
.Program
.Constants
,
959 min_offset
, &const_swizzle
);
960 add
->U
.I
.SrcReg
[1].Swizzle
= const_swizzle
;
962 arl
->U
.I
.SrcReg
[0].File
= RC_FILE_TEMPORARY
;
963 arl
->U
.I
.SrcReg
[0].Index
= add
->U
.I
.DstReg
.Index
;
964 arl
->U
.I
.SrcReg
[0].Swizzle
= RC_SWIZZLE_XXXX
;
966 /* Rewrite offsets up to and excluding inst. */
967 for (inst
= arl
->Next
; inst
!= end
; inst
= inst
->Next
) {
968 const struct rc_opcode_info
* opcode
= rc_get_opcode_info(inst
->U
.I
.Opcode
);
970 for (unsigned i
= 0; i
< opcode
->NumSrcRegs
; i
++)
971 if (inst
->U
.I
.SrcReg
[i
].RelAddr
)
972 inst
->U
.I
.SrcReg
[i
].Index
-= min_offset
;
976 static void rc_emulate_negative_addressing(struct radeon_compiler
*compiler
, void *user
)
978 struct r300_vertex_program_compiler
* c
= (struct r300_vertex_program_compiler
*)compiler
;
979 struct rc_instruction
*inst
, *lastARL
= NULL
;
982 for (inst
= c
->Base
.Program
.Instructions
.Next
; inst
!= &c
->Base
.Program
.Instructions
; inst
= inst
->Next
) {
983 const struct rc_opcode_info
* opcode
= rc_get_opcode_info(inst
->U
.I
.Opcode
);
985 if (inst
->U
.I
.Opcode
== RC_OPCODE_ARL
) {
986 if (lastARL
!= NULL
&& min_offset
< 0)
987 transform_negative_addressing(c
, lastARL
, inst
, min_offset
);
994 for (unsigned i
= 0; i
< opcode
->NumSrcRegs
; i
++) {
995 if (inst
->U
.I
.SrcReg
[i
].RelAddr
&&
996 inst
->U
.I
.SrcReg
[i
].Index
< 0) {
997 /* ARL must precede any indirect addressing. */
998 if (lastARL
== NULL
) {
999 rc_error(&c
->Base
, "Vertex shader: Found relative addressing without ARL.");
1003 if (inst
->U
.I
.SrcReg
[i
].Index
< min_offset
)
1004 min_offset
= inst
->U
.I
.SrcReg
[i
].Index
;
1009 if (lastARL
!= NULL
&& min_offset
< 0)
1010 transform_negative_addressing(c
, lastARL
, inst
, min_offset
);
1013 static struct rc_swizzle_caps r300_vertprog_swizzle_caps
= {
1014 .IsNative
= &swizzle_is_native
,
1015 .Split
= 0 /* should never be called */
1018 void r3xx_compile_vertex_program(struct r300_vertex_program_compiler
*c
)
1020 int is_r500
= c
->Base
.is_r500
;
1021 int kill_consts
= c
->Base
.remove_unused_constants
;
1022 int opt
= !c
->Base
.disable_optimizations
;
1024 /* Lists of instruction transformations. */
1025 struct radeon_program_transformation alu_rewrite_r500
[] = {
1026 { &r300_transform_vertex_alu
, 0 },
1027 { &r300_transform_trig_scale_vertex
, 0 },
1031 struct radeon_program_transformation alu_rewrite_r300
[] = {
1032 { &r300_transform_vertex_alu
, 0 },
1033 { &r300_transform_trig_simple
, 0 },
1037 /* Note: These passes have to be done seperately from ALU rewrite,
1038 * otherwise non-native ALU instructions with source conflits
1039 * or non-native modifiers will not be treated properly.
1041 struct radeon_program_transformation emulate_modifiers
[] = {
1042 { &transform_nonnative_modifiers
, 0 },
1046 struct radeon_program_transformation resolve_src_conflicts
[] = {
1047 { &transform_source_conflicts
, 0 },
1051 /* List of compiler passes. */
1052 struct radeon_compiler_pass vs_list
[] = {
1053 /* NAME DUMP PREDICATE FUNCTION PARAM */
1054 {"add artificial outputs", 0, 1, rc_vs_add_artificial_outputs
, NULL
},
1055 {"transform loops", 1, 1, rc_transform_loops
, NULL
},
1056 {"emulate branches", 1, !is_r500
, rc_emulate_branches
, NULL
},
1057 {"emulate negative addressing", 1, 1, rc_emulate_negative_addressing
, NULL
},
1058 {"native rewrite", 1, is_r500
, rc_local_transform
, alu_rewrite_r500
},
1059 {"native rewrite", 1, !is_r500
, rc_local_transform
, alu_rewrite_r300
},
1060 {"emulate modifiers", 1, !is_r500
, rc_local_transform
, emulate_modifiers
},
1061 {"deadcode", 1, opt
, rc_dataflow_deadcode
, dataflow_outputs_mark_used
},
1062 {"dataflow optimize", 1, opt
, rc_optimize
, NULL
},
1063 /* This pass must be done after optimizations. */
1064 {"source conflict resolve", 1, 1, rc_local_transform
, resolve_src_conflicts
},
1065 {"dataflow swizzles", 1, 1, rc_dataflow_swizzles
, NULL
},
1066 {"register allocation", 1, opt
, allocate_temporary_registers
, NULL
},
1067 {"dead constants", 1, kill_consts
, rc_remove_unused_constants
, &c
->code
->constants_remap_table
},
1068 {"final code validation", 0, 1, rc_validate_final_shader
, NULL
},
1069 {"machine code generation", 0, 1, translate_vertex_program
, NULL
},
1070 {"dump machine code", 0, c
->Base
.Debug
& RC_DBG_LOG
, r300_vertex_program_dump
, NULL
},
1071 {NULL
, 0, 0, NULL
, NULL
}
1074 c
->Base
.SwizzleCaps
= &r300_vertprog_swizzle_caps
;
1076 rc_run_compiler(&c
->Base
, vs_list
, "Vertex Program");
1078 c
->code
->InputsRead
= c
->Base
.Program
.InputsRead
;
1079 c
->code
->OutputsWritten
= c
->Base
.Program
.OutputsWritten
;
1080 rc_constants_copy(&c
->code
->constants
, &c
->Base
.Program
.Constants
);