2 * Copyright 2009 Nicolai Hähnle <nhaehnle@gmail.com>
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE. */
23 #include "radeon_compiler.h"
29 #include "radeon_compiler_util.h"
30 #include "radeon_dataflow.h"
31 #include "radeon_program.h"
32 #include "radeon_program_alu.h"
33 #include "radeon_swizzle.h"
34 #include "radeon_emulate_branches.h"
35 #include "radeon_emulate_loops.h"
36 #include "radeon_remove_constants.h"
/*
 * Take an already-setup and valid source then swizzle it appropriately to
 * obtain a constant ZERO or ONE source.
 *
 * x: index into vpi->SrcReg[] of the source whose register index, register
 *    class and RelAddr bit are reused.
 * y: swizzle selector replicated into all four component slots.
 *
 * The expansion refers to the locals `vp` and `vpi` of the calling function.
 *
 * NOTE(review): the four swizzle arguments were absent from the damaged
 * listing this was restored from; they are reconstructed from the otherwise
 * unused parameter `y` and the seven-argument shape of every other
 * PVS_SRC_OPERAND call in this file -- confirm.
 */
#define __CONST(x, y)	\
	(PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[x]),	\
			   t_swizzle(y),	\
			   t_swizzle(y),	\
			   t_swizzle(y),	\
			   t_swizzle(y),	\
			   t_src_class(vpi->SrcReg[x].File), \
			   RC_MASK_NONE) | (vpi->SrcReg[x].RelAddr << 4))
52 static unsigned long t_dst_mask(unsigned int mask
)
54 /* RC_MASK_* is equivalent to VSF_FLAG_* */
55 return mask
& RC_MASK_XYZW
;
58 static unsigned long t_dst_class(rc_register_file file
)
62 fprintf(stderr
, "%s: Bad register file %i\n", __FUNCTION__
, file
);
64 case RC_FILE_TEMPORARY
:
65 return PVS_DST_REG_TEMPORARY
;
67 return PVS_DST_REG_OUT
;
69 return PVS_DST_REG_A0
;
73 static unsigned long t_dst_index(struct r300_vertex_program_code
*vp
,
74 struct rc_dst_register
*dst
)
76 if (dst
->File
== RC_FILE_OUTPUT
)
77 return vp
->outputs
[dst
->Index
];
82 static unsigned long t_src_class(rc_register_file file
)
86 fprintf(stderr
, "%s: Bad register file %i\n", __FUNCTION__
, file
);
89 case RC_FILE_TEMPORARY
:
90 return PVS_SRC_REG_TEMPORARY
;
92 return PVS_SRC_REG_INPUT
;
93 case RC_FILE_CONSTANT
:
94 return PVS_SRC_REG_CONSTANT
;
98 static int t_src_conflict(struct rc_src_register a
, struct rc_src_register b
)
100 unsigned long aclass
= t_src_class(a
.File
);
101 unsigned long bclass
= t_src_class(b
.File
);
103 if (aclass
!= bclass
)
105 if (aclass
== PVS_SRC_REG_TEMPORARY
)
108 if (a
.RelAddr
|| b
.RelAddr
)
110 if (a
.Index
!= b
.Index
)
/*
 * Translate a compiler swizzle selector into the hardware encoding.
 *
 * Returns the selector unchanged.
 */
static inline unsigned long t_swizzle(unsigned int swizzle)
{
	/* this is in fact a NOP as the Mesa RC_SWIZZLE_* are all identical to VSF_IN_COMPONENT_* */
	return swizzle;
}
122 static unsigned long t_src_index(struct r300_vertex_program_code
*vp
,
123 struct rc_src_register
*src
)
125 if (src
->File
== RC_FILE_INPUT
) {
126 assert(vp
->inputs
[src
->Index
] != -1);
127 return vp
->inputs
[src
->Index
];
129 if (src
->Index
< 0) {
131 "negative offsets for indirect addressing do not work.\n");
138 /* these two functions should probably be merged... */
140 static unsigned long t_src(struct r300_vertex_program_code
*vp
,
141 struct rc_src_register
*src
)
143 /* src->Negate uses the RC_MASK_ flags from program_instruction.h,
144 * which equal our VSF_FLAGS_ values, so it's safe to just pass it here.
146 return PVS_SRC_OPERAND(t_src_index(vp
, src
),
147 t_swizzle(GET_SWZ(src
->Swizzle
, 0)),
148 t_swizzle(GET_SWZ(src
->Swizzle
, 1)),
149 t_swizzle(GET_SWZ(src
->Swizzle
, 2)),
150 t_swizzle(GET_SWZ(src
->Swizzle
, 3)),
151 t_src_class(src
->File
),
153 (src
->RelAddr
<< 4) | (src
->Abs
<< 3);
156 static unsigned long t_src_scalar(struct r300_vertex_program_code
*vp
,
157 struct rc_src_register
*src
)
159 /* src->Negate uses the RC_MASK_ flags from program_instruction.h,
160 * which equal our VSF_FLAGS_ values, so it's safe to just pass it here.
162 unsigned int swz
= rc_get_scalar_src_swz(src
->Swizzle
);
164 return PVS_SRC_OPERAND(t_src_index(vp
, src
),
169 t_src_class(src
->File
),
170 src
->Negate
? RC_MASK_XYZW
: RC_MASK_NONE
) |
171 (src
->RelAddr
<< 4) | (src
->Abs
<< 3);
174 static int valid_dst(struct r300_vertex_program_code
*vp
,
175 struct rc_dst_register
*dst
)
177 if (dst
->File
== RC_FILE_OUTPUT
&& vp
->outputs
[dst
->Index
] == -1) {
179 } else if (dst
->File
== RC_FILE_ADDRESS
) {
180 assert(dst
->Index
== 0);
186 static void ei_vector1(struct r300_vertex_program_code
*vp
,
187 unsigned int hw_opcode
,
188 struct rc_sub_instruction
*vpi
,
191 inst
[0] = PVS_OP_DST_OPERAND(hw_opcode
,
194 t_dst_index(vp
, &vpi
->DstReg
),
195 t_dst_mask(vpi
->DstReg
.WriteMask
),
196 t_dst_class(vpi
->DstReg
.File
),
197 vpi
->SaturateMode
== RC_SATURATE_ZERO_ONE
);
198 inst
[1] = t_src(vp
, &vpi
->SrcReg
[0]);
199 inst
[2] = __CONST(0, RC_SWIZZLE_ZERO
);
200 inst
[3] = __CONST(0, RC_SWIZZLE_ZERO
);
203 static void ei_vector2(struct r300_vertex_program_code
*vp
,
204 unsigned int hw_opcode
,
205 struct rc_sub_instruction
*vpi
,
208 inst
[0] = PVS_OP_DST_OPERAND(hw_opcode
,
211 t_dst_index(vp
, &vpi
->DstReg
),
212 t_dst_mask(vpi
->DstReg
.WriteMask
),
213 t_dst_class(vpi
->DstReg
.File
),
214 vpi
->SaturateMode
== RC_SATURATE_ZERO_ONE
);
215 inst
[1] = t_src(vp
, &vpi
->SrcReg
[0]);
216 inst
[2] = t_src(vp
, &vpi
->SrcReg
[1]);
217 inst
[3] = __CONST(1, RC_SWIZZLE_ZERO
);
220 static void ei_math1(struct r300_vertex_program_code
*vp
,
221 unsigned int hw_opcode
,
222 struct rc_sub_instruction
*vpi
,
225 inst
[0] = PVS_OP_DST_OPERAND(hw_opcode
,
228 t_dst_index(vp
, &vpi
->DstReg
),
229 t_dst_mask(vpi
->DstReg
.WriteMask
),
230 t_dst_class(vpi
->DstReg
.File
),
231 vpi
->SaturateMode
== RC_SATURATE_ZERO_ONE
);
232 inst
[1] = t_src_scalar(vp
, &vpi
->SrcReg
[0]);
233 inst
[2] = __CONST(0, RC_SWIZZLE_ZERO
);
234 inst
[3] = __CONST(0, RC_SWIZZLE_ZERO
);
237 static void ei_lit(struct r300_vertex_program_code
*vp
,
238 struct rc_sub_instruction
*vpi
,
241 //LIT TMP 1.Y Z TMP 1{} {X W Z Y} TMP 1{} {Y W Z X} TMP 1{} {Y X Z W}
243 inst
[0] = PVS_OP_DST_OPERAND(ME_LIGHT_COEFF_DX
,
246 t_dst_index(vp
, &vpi
->DstReg
),
247 t_dst_mask(vpi
->DstReg
.WriteMask
),
248 t_dst_class(vpi
->DstReg
.File
),
249 vpi
->SaturateMode
== RC_SATURATE_ZERO_ONE
);
250 /* NOTE: Users swizzling might not work. */
251 inst
[1] = PVS_SRC_OPERAND(t_src_index(vp
, &vpi
->SrcReg
[0]), t_swizzle(GET_SWZ(vpi
->SrcReg
[0].Swizzle
, 0)), // X
252 t_swizzle(GET_SWZ(vpi
->SrcReg
[0].Swizzle
, 3)), // W
253 PVS_SRC_SELECT_FORCE_0
, // Z
254 t_swizzle(GET_SWZ(vpi
->SrcReg
[0].Swizzle
, 1)), // Y
255 t_src_class(vpi
->SrcReg
[0].File
),
256 vpi
->SrcReg
[0].Negate
? RC_MASK_XYZW
: RC_MASK_NONE
) |
257 (vpi
->SrcReg
[0].RelAddr
<< 4);
258 inst
[2] = PVS_SRC_OPERAND(t_src_index(vp
, &vpi
->SrcReg
[0]), t_swizzle(GET_SWZ(vpi
->SrcReg
[0].Swizzle
, 1)), // Y
259 t_swizzle(GET_SWZ(vpi
->SrcReg
[0].Swizzle
, 3)), // W
260 PVS_SRC_SELECT_FORCE_0
, // Z
261 t_swizzle(GET_SWZ(vpi
->SrcReg
[0].Swizzle
, 0)), // X
262 t_src_class(vpi
->SrcReg
[0].File
),
263 vpi
->SrcReg
[0].Negate
? RC_MASK_XYZW
: RC_MASK_NONE
) |
264 (vpi
->SrcReg
[0].RelAddr
<< 4);
265 inst
[3] = PVS_SRC_OPERAND(t_src_index(vp
, &vpi
->SrcReg
[0]), t_swizzle(GET_SWZ(vpi
->SrcReg
[0].Swizzle
, 1)), // Y
266 t_swizzle(GET_SWZ(vpi
->SrcReg
[0].Swizzle
, 0)), // X
267 PVS_SRC_SELECT_FORCE_0
, // Z
268 t_swizzle(GET_SWZ(vpi
->SrcReg
[0].Swizzle
, 3)), // W
269 t_src_class(vpi
->SrcReg
[0].File
),
270 vpi
->SrcReg
[0].Negate
? RC_MASK_XYZW
: RC_MASK_NONE
) |
271 (vpi
->SrcReg
[0].RelAddr
<< 4);
274 static void ei_mad(struct r300_vertex_program_code
*vp
,
275 struct rc_sub_instruction
*vpi
,
279 /* Remarks about hardware limitations of MAD
280 * (please preserve this comment, as this information is _NOT_
281 * in the documentation provided by AMD).
283 * As described in the documentation, MAD with three unique temporary
284 * source registers requires the use of the macro version.
286 * However (and this is not mentioned in the documentation), apparently
287 * the macro version is _NOT_ a full superset of the normal version.
288 * In particular, the macro version does not always work when relative
289 * addressing is used in the source operands.
291 * This limitation caused incorrect rendering in Sauerbraten's OpenGL
292 * assembly shader path when using medium quality animations
293 * (i.e. animations with matrix blending instead of quaternion blending).
295 * Unfortunately, I (nha) have been unable to extract a Piglit regression
296 * test for this issue - for some reason, it is possible to have vertex
297 * programs whose prefix is *exactly* the same as the prefix of the
298 * offending program in Sauerbraten up to the offending instruction
299 * without causing any trouble.
301 * Bottom line: Only use the macro version only when really necessary;
302 * according to AMD docs, this should improve performance by one clock
303 * as a nice side bonus.
305 if (vpi
->SrcReg
[0].File
== RC_FILE_TEMPORARY
&&
306 vpi
->SrcReg
[1].File
== RC_FILE_TEMPORARY
&&
307 vpi
->SrcReg
[2].File
== RC_FILE_TEMPORARY
&&
308 vpi
->SrcReg
[0].Index
!= vpi
->SrcReg
[1].Index
&&
309 vpi
->SrcReg
[0].Index
!= vpi
->SrcReg
[2].Index
&&
310 vpi
->SrcReg
[1].Index
!= vpi
->SrcReg
[2].Index
) {
311 inst
[0] = PVS_OP_DST_OPERAND(PVS_MACRO_OP_2CLK_MADD
,
314 t_dst_index(vp
, &vpi
->DstReg
),
315 t_dst_mask(vpi
->DstReg
.WriteMask
),
316 t_dst_class(vpi
->DstReg
.File
),
317 vpi
->SaturateMode
== RC_SATURATE_ZERO_ONE
);
319 inst
[0] = PVS_OP_DST_OPERAND(VE_MULTIPLY_ADD
,
322 t_dst_index(vp
, &vpi
->DstReg
),
323 t_dst_mask(vpi
->DstReg
.WriteMask
),
324 t_dst_class(vpi
->DstReg
.File
),
325 vpi
->SaturateMode
== RC_SATURATE_ZERO_ONE
);
327 /* Arguments with constant swizzles still count as a unique
328 * temporary, so we should make sure these arguments share a
329 * register index with one of the other arguments. */
330 for (i
= 0; i
< 3; i
++) {
332 if (vpi
->SrcReg
[i
].File
!= RC_FILE_NONE
)
335 for (j
= 0; j
< 3; j
++) {
337 vpi
->SrcReg
[i
].Index
=
338 vpi
->SrcReg
[j
].Index
;
344 inst
[1] = t_src(vp
, &vpi
->SrcReg
[0]);
345 inst
[2] = t_src(vp
, &vpi
->SrcReg
[1]);
346 inst
[3] = t_src(vp
, &vpi
->SrcReg
[2]);
349 static void ei_pow(struct r300_vertex_program_code
*vp
,
350 struct rc_sub_instruction
*vpi
,
353 inst
[0] = PVS_OP_DST_OPERAND(ME_POWER_FUNC_FF
,
356 t_dst_index(vp
, &vpi
->DstReg
),
357 t_dst_mask(vpi
->DstReg
.WriteMask
),
358 t_dst_class(vpi
->DstReg
.File
),
359 vpi
->SaturateMode
== RC_SATURATE_ZERO_ONE
);
360 inst
[1] = t_src_scalar(vp
, &vpi
->SrcReg
[0]);
361 inst
[2] = __CONST(0, RC_SWIZZLE_ZERO
);
362 inst
[3] = t_src_scalar(vp
, &vpi
->SrcReg
[1]);
365 static void translate_vertex_program(struct radeon_compiler
*c
, void *user
)
367 struct r300_vertex_program_compiler
*compiler
= (struct r300_vertex_program_compiler
*)c
;
368 struct rc_instruction
*rci
;
370 unsigned loops
[R500_PVS_MAX_LOOP_DEPTH
];
371 unsigned loop_depth
= 0;
373 compiler
->code
->pos_end
= 0; /* Not supported yet */
374 compiler
->code
->length
= 0;
375 compiler
->code
->num_temporaries
= 0;
377 compiler
->SetHwInputOutput(compiler
);
379 for(rci
= compiler
->Base
.Program
.Instructions
.Next
; rci
!= &compiler
->Base
.Program
.Instructions
; rci
= rci
->Next
) {
380 struct rc_sub_instruction
*vpi
= &rci
->U
.I
;
381 unsigned int *inst
= compiler
->code
->body
.d
+ compiler
->code
->length
;
382 const struct rc_opcode_info
*info
= rc_get_opcode_info(vpi
->Opcode
);
384 /* Skip instructions writing to non-existing destination */
385 if (!valid_dst(compiler
->code
, &vpi
->DstReg
))
388 if (info
->HasDstReg
) {
389 /* Neither is Saturate. */
390 if (vpi
->SaturateMode
!= RC_SATURATE_NONE
&& !c
->is_r500
) {
391 rc_error(&compiler
->Base
, "Vertex program does not support the Saturate "
392 "modifier (yet).\n");
396 if (compiler
->code
->length
>= c
->max_alu_insts
* 4) {
397 rc_error(&compiler
->Base
, "Vertex program has too many instructions\n");
401 assert(compiler
->Base
.is_r500
||
402 (vpi
->Opcode
!= RC_OPCODE_SEQ
&&
403 vpi
->Opcode
!= RC_OPCODE_SNE
));
405 switch (vpi
->Opcode
) {
406 case RC_OPCODE_ADD
: ei_vector2(compiler
->code
, VE_ADD
, vpi
, inst
); break;
407 case RC_OPCODE_ARL
: ei_vector1(compiler
->code
, VE_FLT2FIX_DX
, vpi
, inst
); break;
408 case RC_OPCODE_ARR
: ei_vector1(compiler
->code
, VE_FLT2FIX_DX_RND
, vpi
, inst
); break;
409 case RC_OPCODE_COS
: ei_math1(compiler
->code
, ME_COS
, vpi
, inst
); break;
410 case RC_OPCODE_DP4
: ei_vector2(compiler
->code
, VE_DOT_PRODUCT
, vpi
, inst
); break;
411 case RC_OPCODE_DST
: ei_vector2(compiler
->code
, VE_DISTANCE_VECTOR
, vpi
, inst
); break;
412 case RC_OPCODE_EX2
: ei_math1(compiler
->code
, ME_EXP_BASE2_FULL_DX
, vpi
, inst
); break;
413 case RC_OPCODE_EXP
: ei_math1(compiler
->code
, ME_EXP_BASE2_DX
, vpi
, inst
); break;
414 case RC_OPCODE_FRC
: ei_vector1(compiler
->code
, VE_FRACTION
, vpi
, inst
); break;
415 case RC_OPCODE_LG2
: ei_math1(compiler
->code
, ME_LOG_BASE2_FULL_DX
, vpi
, inst
); break;
416 case RC_OPCODE_LIT
: ei_lit(compiler
->code
, vpi
, inst
); break;
417 case RC_OPCODE_LOG
: ei_math1(compiler
->code
, ME_LOG_BASE2_DX
, vpi
, inst
); break;
418 case RC_OPCODE_MAD
: ei_mad(compiler
->code
, vpi
, inst
); break;
419 case RC_OPCODE_MAX
: ei_vector2(compiler
->code
, VE_MAXIMUM
, vpi
, inst
); break;
420 case RC_OPCODE_MIN
: ei_vector2(compiler
->code
, VE_MINIMUM
, vpi
, inst
); break;
421 case RC_OPCODE_MOV
: ei_vector1(compiler
->code
, VE_ADD
, vpi
, inst
); break;
422 case RC_OPCODE_MUL
: ei_vector2(compiler
->code
, VE_MULTIPLY
, vpi
, inst
); break;
423 case RC_OPCODE_POW
: ei_pow(compiler
->code
, vpi
, inst
); break;
424 case RC_OPCODE_RCP
: ei_math1(compiler
->code
, ME_RECIP_DX
, vpi
, inst
); break;
425 case RC_OPCODE_RSQ
: ei_math1(compiler
->code
, ME_RECIP_SQRT_DX
, vpi
, inst
); break;
426 case RC_OPCODE_SEQ
: ei_vector2(compiler
->code
, VE_SET_EQUAL
, vpi
, inst
); break;
427 case RC_OPCODE_SGE
: ei_vector2(compiler
->code
, VE_SET_GREATER_THAN_EQUAL
, vpi
, inst
); break;
428 case RC_OPCODE_SIN
: ei_math1(compiler
->code
, ME_SIN
, vpi
, inst
); break;
429 case RC_OPCODE_SLT
: ei_vector2(compiler
->code
, VE_SET_LESS_THAN
, vpi
, inst
); break;
430 case RC_OPCODE_SNE
: ei_vector2(compiler
->code
, VE_SET_NOT_EQUAL
, vpi
, inst
); break;
431 case RC_OPCODE_BGNLOOP
:
433 if ((!compiler
->Base
.is_r500
434 && loop_depth
>= R300_VS_MAX_LOOP_DEPTH
)
435 || loop_depth
>= R500_PVS_MAX_LOOP_DEPTH
) {
436 rc_error(&compiler
->Base
,
437 "Loops are nested too deep.");
440 loops
[loop_depth
++] = ((compiler
->code
->length
)/ 4) + 1;
443 case RC_OPCODE_ENDLOOP
:
445 unsigned int act_addr
;
446 unsigned int last_addr
;
447 unsigned int ret_addr
;
449 ret_addr
= loops
[--loop_depth
];
450 act_addr
= ret_addr
- 1;
451 last_addr
= (compiler
->code
->length
/ 4) - 1;
453 if (loop_depth
>= R300_VS_MAX_FC_OPS
) {
454 rc_error(&compiler
->Base
,
455 "Too many flow control instructions.");
458 if (compiler
->Base
.is_r500
) {
459 compiler
->code
->fc_op_addrs
.r500
460 [compiler
->code
->num_fc_ops
].lw
=
461 R500_PVS_FC_ACT_ADRS(act_addr
)
462 | R500_PVS_FC_LOOP_CNT_JMP_INST(0x00ff)
464 compiler
->code
->fc_op_addrs
.r500
465 [compiler
->code
->num_fc_ops
].uw
=
466 R500_PVS_FC_LAST_INST(last_addr
)
467 | R500_PVS_FC_RTN_INST(ret_addr
)
470 compiler
->code
->fc_op_addrs
.r300
471 [compiler
->code
->num_fc_ops
] =
472 R300_PVS_FC_ACT_ADRS(act_addr
)
473 | R300_PVS_FC_LOOP_CNT_JMP_INST(0xff)
474 | R300_PVS_FC_LAST_INST(last_addr
)
475 | R300_PVS_FC_RTN_INST(ret_addr
)
478 compiler
->code
->fc_loop_index
[compiler
->code
->num_fc_ops
] =
479 R300_PVS_FC_LOOP_INIT_VAL(0x0)
480 | R300_PVS_FC_LOOP_STEP_VAL(0x1)
482 compiler
->code
->fc_ops
|= R300_VAP_PVS_FC_OPC_LOOP(
483 compiler
->code
->num_fc_ops
);
484 compiler
->code
->num_fc_ops
++;
489 case RC_ME_PRED_SET_CLR
:
490 ei_math1(compiler
->code
, ME_PRED_SET_CLR
, vpi
, inst
);
493 case RC_ME_PRED_SET_INV
:
494 ei_math1(compiler
->code
, ME_PRED_SET_INV
, vpi
, inst
);
497 case RC_ME_PRED_SET_POP
:
498 ei_math1(compiler
->code
, ME_PRED_SET_POP
, vpi
, inst
);
501 case RC_ME_PRED_SET_RESTORE
:
502 ei_math1(compiler
->code
, ME_PRED_SET_RESTORE
, vpi
, inst
);
506 ei_math1(compiler
->code
, ME_PRED_SET_EQ
, vpi
, inst
);
509 case RC_ME_PRED_SNEQ
:
510 ei_math1(compiler
->code
, ME_PRED_SET_NEQ
, vpi
, inst
);
513 case RC_VE_PRED_SNEQ_PUSH
:
514 ei_vector2(compiler
->code
, VE_PRED_SET_NEQ_PUSH
,
519 rc_error(&compiler
->Base
, "Unknown opcode %s\n", info
->Name
);
523 if (vpi
->DstReg
.Pred
!= RC_PRED_DISABLED
) {
524 inst
[0] |= (PVS_DST_PRED_ENABLE_MASK
525 << PVS_DST_PRED_ENABLE_SHIFT
);
526 if (vpi
->DstReg
.Pred
== RC_PRED_SET
) {
527 inst
[0] |= (PVS_DST_PRED_SENSE_MASK
528 << PVS_DST_PRED_SENSE_SHIFT
);
532 /* Update the number of temporaries. */
533 if (info
->HasDstReg
&& vpi
->DstReg
.File
== RC_FILE_TEMPORARY
&&
534 vpi
->DstReg
.Index
>= compiler
->code
->num_temporaries
)
535 compiler
->code
->num_temporaries
= vpi
->DstReg
.Index
+ 1;
537 for (unsigned i
= 0; i
< info
->NumSrcRegs
; i
++)
538 if (vpi
->SrcReg
[i
].File
== RC_FILE_TEMPORARY
&&
539 vpi
->SrcReg
[i
].Index
>= compiler
->code
->num_temporaries
)
540 compiler
->code
->num_temporaries
= vpi
->SrcReg
[i
].Index
+ 1;
542 if (compiler
->code
->num_temporaries
> compiler
->Base
.max_temp_regs
) {
543 rc_error(&compiler
->Base
, "Too many temporaries.\n");
547 compiler
->code
->length
+= 4;
549 if (compiler
->Base
.Error
)
/* Per-original-temporary bookkeeping used by allocate_temporary_registers. */
struct temporary_allocation {
	/* Set to 1 once a hardware temporary has been assigned. */
	unsigned int Allocated:1;
	/* Hardware temporary index that replaces the original index. */
	unsigned int HwTemp:15;
	/* Instruction recorded as the last reader of this temporary. */
	struct rc_instruction * LastRead;
};
/*
 * Compiler pass that rewrites temporary register indices to hardware
 * temporary slots (it is listed as "register allocation" in the vs_list
 * pass table of r3xx_compile_vertex_program).
 *
 * c:    base compiler; cast to the enclosing r300_vertex_program_compiler.
 * user: not referenced by the visible code.
 *
 * Pass 1 walks every instruction and sets num_orig_temps to one more than
 * the highest RC_FILE_TEMPORARY index used as a source or destination.
 * A zeroed temporary_allocation array of that size is then taken from the
 * compiler's memory pool.
 * Pass 2 records, per original temporary, the instruction that reads it;
 * later reads overwrite earlier ones, and while end_loop is set the
 * end_loop instruction is recorded instead of the reader itself.
 * Pass 3 rewrites source and destination temporary indices to ta[].HwTemp.
 * A slot in hwtemps[] is cleared when the current instruction is the
 * recorded LastRead of an allocated temporary, and a destination that is
 * not yet allocated scans slots j in [0, c->max_temp_regs).
 *
 * NOTE(review): this listing is a damaged extraction -- lines holding only
 * braces and several statements (e.g. the declarations of i and j, the
 * assignments to end_loop and to ta[orig].HwTemp) are not present, so the
 * code below is left untouched rather than reconstructed.
 */
560 static void allocate_temporary_registers(struct radeon_compiler
*c
, void *user
)
562 struct r300_vertex_program_compiler
*compiler
= (struct r300_vertex_program_compiler
*)c
;
563 struct rc_instruction
*inst
;
564 struct rc_instruction
*end_loop
= NULL
;
565 unsigned int num_orig_temps
= 0;
566 char hwtemps
[RC_REGISTER_MAX_INDEX
];
567 struct temporary_allocation
* ta
;
570 memset(hwtemps
, 0, sizeof(hwtemps
));
574 /* Pass 1: Count original temporaries. */
575 for(inst
= compiler
->Base
.Program
.Instructions
.Next
; inst
!= &compiler
->Base
.Program
.Instructions
; inst
= inst
->Next
) {
576 const struct rc_opcode_info
* opcode
= rc_get_opcode_info(inst
->U
.I
.Opcode
);
578 for (i
= 0; i
< opcode
->NumSrcRegs
; ++i
) {
579 if (inst
->U
.I
.SrcReg
[i
].File
== RC_FILE_TEMPORARY
) {
580 if (inst
->U
.I
.SrcReg
[i
].Index
>= num_orig_temps
)
581 num_orig_temps
= inst
->U
.I
.SrcReg
[i
].Index
+ 1;
585 if (opcode
->HasDstReg
) {
586 if (inst
->U
.I
.DstReg
.File
== RC_FILE_TEMPORARY
) {
587 if (inst
->U
.I
.DstReg
.Index
>= num_orig_temps
)
588 num_orig_temps
= inst
->U
.I
.DstReg
.Index
+ 1;
593 ta
= (struct temporary_allocation
*)memory_pool_malloc(&compiler
->Base
.Pool
,
594 sizeof(struct temporary_allocation
) * num_orig_temps
);
595 memset(ta
, 0, sizeof(struct temporary_allocation
) * num_orig_temps
);
597 /* Pass 2: Determine original temporary lifetimes */
598 for(inst
= compiler
->Base
.Program
.Instructions
.Next
; inst
!= &compiler
->Base
.Program
.Instructions
; inst
= inst
->Next
) {
599 const struct rc_opcode_info
* opcode
= rc_get_opcode_info(inst
->U
.I
.Opcode
);
600 /* Instructions inside of loops need to use the ENDLOOP
601 * instruction as their LastRead. */
602 if (!end_loop
&& inst
->U
.I
.Opcode
== RC_OPCODE_BGNLOOP
) {
604 struct rc_instruction
* ptr
;
605 for(ptr
= inst
->Next
;
606 ptr
!= &compiler
->Base
.Program
.Instructions
;
608 if (ptr
->U
.I
.Opcode
== RC_OPCODE_BGNLOOP
) {
610 } else if (ptr
->U
.I
.Opcode
== RC_OPCODE_ENDLOOP
) {
620 if (inst
== end_loop
) {
625 for (i
= 0; i
< opcode
->NumSrcRegs
; ++i
) {
626 if (inst
->U
.I
.SrcReg
[i
].File
== RC_FILE_TEMPORARY
) {
627 ta
[inst
->U
.I
.SrcReg
[i
].Index
].LastRead
= end_loop
? end_loop
: inst
;
632 /* Pass 3: Register allocation */
633 for(inst
= compiler
->Base
.Program
.Instructions
.Next
; inst
!= &compiler
->Base
.Program
.Instructions
; inst
= inst
->Next
) {
634 const struct rc_opcode_info
* opcode
= rc_get_opcode_info(inst
->U
.I
.Opcode
);
636 for (i
= 0; i
< opcode
->NumSrcRegs
; ++i
) {
637 if (inst
->U
.I
.SrcReg
[i
].File
== RC_FILE_TEMPORARY
) {
638 unsigned int orig
= inst
->U
.I
.SrcReg
[i
].Index
;
639 inst
->U
.I
.SrcReg
[i
].Index
= ta
[orig
].HwTemp
;
641 if (ta
[orig
].Allocated
&& inst
== ta
[orig
].LastRead
)
642 hwtemps
[ta
[orig
].HwTemp
] = 0;
646 if (opcode
->HasDstReg
) {
647 if (inst
->U
.I
.DstReg
.File
== RC_FILE_TEMPORARY
) {
648 unsigned int orig
= inst
->U
.I
.DstReg
.Index
;
650 if (!ta
[orig
].Allocated
) {
651 for(j
= 0; j
< c
->max_temp_regs
; ++j
) {
655 ta
[orig
].Allocated
= 1;
657 hwtemps
[ta
[orig
].HwTemp
] = 1;
660 inst
->U
.I
.DstReg
.Index
= ta
[orig
].HwTemp
;
667 * R3xx-R4xx vertex engine does not support the Absolute source operand modifier
668 * and the Saturate opcode modifier. Only Absolute is currently transformed.
670 static int transform_nonnative_modifiers(
671 struct radeon_compiler
*c
,
672 struct rc_instruction
*inst
,
675 const struct rc_opcode_info
*opcode
= rc_get_opcode_info(inst
->U
.I
.Opcode
);
678 /* Transform ABS(a) to MAX(a, -a). */
679 for (i
= 0; i
< opcode
->NumSrcRegs
; i
++) {
680 if (inst
->U
.I
.SrcReg
[i
].Abs
) {
681 struct rc_instruction
*new_inst
;
684 inst
->U
.I
.SrcReg
[i
].Abs
= 0;
686 temp
= rc_find_free_temporary(c
);
688 new_inst
= rc_insert_new_instruction(c
, inst
->Prev
);
689 new_inst
->U
.I
.Opcode
= RC_OPCODE_MAX
;
690 new_inst
->U
.I
.DstReg
.File
= RC_FILE_TEMPORARY
;
691 new_inst
->U
.I
.DstReg
.Index
= temp
;
692 new_inst
->U
.I
.SrcReg
[0] = inst
->U
.I
.SrcReg
[i
];
693 new_inst
->U
.I
.SrcReg
[1] = inst
->U
.I
.SrcReg
[i
];
694 new_inst
->U
.I
.SrcReg
[1].Negate
^= RC_MASK_XYZW
;
696 memset(&inst
->U
.I
.SrcReg
[i
], 0, sizeof(inst
->U
.I
.SrcReg
[i
]));
697 inst
->U
.I
.SrcReg
[i
].File
= RC_FILE_TEMPORARY
;
698 inst
->U
.I
.SrcReg
[i
].Index
= temp
;
699 inst
->U
.I
.SrcReg
[i
].Swizzle
= RC_SWIZZLE_XYZW
;
706 * Vertex engine cannot read two inputs or two constants at the same time.
707 * Introduce intermediate MOVs to temporary registers to account for this.
709 static int transform_source_conflicts(
710 struct radeon_compiler
*c
,
711 struct rc_instruction
* inst
,
714 const struct rc_opcode_info
* opcode
= rc_get_opcode_info(inst
->U
.I
.Opcode
);
716 if (opcode
->NumSrcRegs
== 3) {
717 if (t_src_conflict(inst
->U
.I
.SrcReg
[1], inst
->U
.I
.SrcReg
[2])
718 || t_src_conflict(inst
->U
.I
.SrcReg
[0], inst
->U
.I
.SrcReg
[2])) {
719 int tmpreg
= rc_find_free_temporary(c
);
720 struct rc_instruction
* inst_mov
= rc_insert_new_instruction(c
, inst
->Prev
);
721 inst_mov
->U
.I
.Opcode
= RC_OPCODE_MOV
;
722 inst_mov
->U
.I
.DstReg
.File
= RC_FILE_TEMPORARY
;
723 inst_mov
->U
.I
.DstReg
.Index
= tmpreg
;
724 inst_mov
->U
.I
.SrcReg
[0] = inst
->U
.I
.SrcReg
[2];
726 reset_srcreg(&inst
->U
.I
.SrcReg
[2]);
727 inst
->U
.I
.SrcReg
[2].File
= RC_FILE_TEMPORARY
;
728 inst
->U
.I
.SrcReg
[2].Index
= tmpreg
;
732 if (opcode
->NumSrcRegs
>= 2) {
733 if (t_src_conflict(inst
->U
.I
.SrcReg
[1], inst
->U
.I
.SrcReg
[0])) {
734 int tmpreg
= rc_find_free_temporary(c
);
735 struct rc_instruction
* inst_mov
= rc_insert_new_instruction(c
, inst
->Prev
);
736 inst_mov
->U
.I
.Opcode
= RC_OPCODE_MOV
;
737 inst_mov
->U
.I
.DstReg
.File
= RC_FILE_TEMPORARY
;
738 inst_mov
->U
.I
.DstReg
.Index
= tmpreg
;
739 inst_mov
->U
.I
.SrcReg
[0] = inst
->U
.I
.SrcReg
[1];
741 reset_srcreg(&inst
->U
.I
.SrcReg
[1]);
742 inst
->U
.I
.SrcReg
[1].File
= RC_FILE_TEMPORARY
;
743 inst
->U
.I
.SrcReg
[1].Index
= tmpreg
;
750 static void rc_vs_add_artificial_outputs(struct radeon_compiler
*c
, void *user
)
752 struct r300_vertex_program_compiler
* compiler
= (struct r300_vertex_program_compiler
*)c
;
755 for(i
= 0; i
< 32; ++i
) {
756 if ((compiler
->RequiredOutputs
& (1 << i
)) &&
757 !(compiler
->Base
.Program
.OutputsWritten
& (1 << i
))) {
758 struct rc_instruction
* inst
= rc_insert_new_instruction(&compiler
->Base
, compiler
->Base
.Program
.Instructions
.Prev
);
759 inst
->U
.I
.Opcode
= RC_OPCODE_MOV
;
761 inst
->U
.I
.DstReg
.File
= RC_FILE_OUTPUT
;
762 inst
->U
.I
.DstReg
.Index
= i
;
763 inst
->U
.I
.DstReg
.WriteMask
= RC_MASK_XYZW
;
765 inst
->U
.I
.SrcReg
[0].File
= RC_FILE_CONSTANT
;
766 inst
->U
.I
.SrcReg
[0].Index
= 0;
767 inst
->U
.I
.SrcReg
[0].Swizzle
= RC_SWIZZLE_XYZW
;
769 compiler
->Base
.Program
.OutputsWritten
|= 1 << i
;
774 static void dataflow_outputs_mark_used(void * userdata
, void * data
,
775 void (*callback
)(void *, unsigned int, unsigned int))
777 struct r300_vertex_program_compiler
* c
= userdata
;
780 for(i
= 0; i
< 32; ++i
) {
781 if (c
->RequiredOutputs
& (1 << i
))
782 callback(data
, i
, RC_MASK_XYZW
);
/*
 * Swizzle capability callback; its address is stored in the .IsNative
 * member of r300_vertprog_swizzle_caps.
 *
 * NOTE(review): the body of this function is absent from this damaged
 * listing, so its result cannot be documented from here.
 */
786 static int swizzle_is_native(rc_opcode opcode
, struct rc_src_register reg
)
794 static void transform_negative_addressing(struct r300_vertex_program_compiler
*c
,
795 struct rc_instruction
*arl
,
796 struct rc_instruction
*end
,
799 struct rc_instruction
*inst
, *add
;
800 unsigned const_swizzle
;
802 /* Transform ARL/ARR */
803 add
= rc_insert_new_instruction(&c
->Base
, arl
->Prev
);
804 add
->U
.I
.Opcode
= RC_OPCODE_ADD
;
805 add
->U
.I
.DstReg
.File
= RC_FILE_TEMPORARY
;
806 add
->U
.I
.DstReg
.Index
= rc_find_free_temporary(&c
->Base
);
807 add
->U
.I
.DstReg
.WriteMask
= RC_MASK_X
;
808 add
->U
.I
.SrcReg
[0] = arl
->U
.I
.SrcReg
[0];
809 add
->U
.I
.SrcReg
[1].File
= RC_FILE_CONSTANT
;
810 add
->U
.I
.SrcReg
[1].Index
= rc_constants_add_immediate_scalar(&c
->Base
.Program
.Constants
,
811 min_offset
, &const_swizzle
);
812 add
->U
.I
.SrcReg
[1].Swizzle
= const_swizzle
;
814 arl
->U
.I
.SrcReg
[0].File
= RC_FILE_TEMPORARY
;
815 arl
->U
.I
.SrcReg
[0].Index
= add
->U
.I
.DstReg
.Index
;
816 arl
->U
.I
.SrcReg
[0].Swizzle
= RC_SWIZZLE_XXXX
;
818 /* Rewrite offsets up to and excluding inst. */
819 for (inst
= arl
->Next
; inst
!= end
; inst
= inst
->Next
) {
820 const struct rc_opcode_info
* opcode
= rc_get_opcode_info(inst
->U
.I
.Opcode
);
822 for (unsigned i
= 0; i
< opcode
->NumSrcRegs
; i
++)
823 if (inst
->U
.I
.SrcReg
[i
].RelAddr
)
824 inst
->U
.I
.SrcReg
[i
].Index
-= min_offset
;
828 static void rc_emulate_negative_addressing(struct radeon_compiler
*compiler
, void *user
)
830 struct r300_vertex_program_compiler
* c
= (struct r300_vertex_program_compiler
*)compiler
;
831 struct rc_instruction
*inst
, *lastARL
= NULL
;
834 for (inst
= c
->Base
.Program
.Instructions
.Next
; inst
!= &c
->Base
.Program
.Instructions
; inst
= inst
->Next
) {
835 const struct rc_opcode_info
* opcode
= rc_get_opcode_info(inst
->U
.I
.Opcode
);
837 if (inst
->U
.I
.Opcode
== RC_OPCODE_ARL
|| inst
->U
.I
.Opcode
== RC_OPCODE_ARR
) {
838 if (lastARL
!= NULL
&& min_offset
< 0)
839 transform_negative_addressing(c
, lastARL
, inst
, min_offset
);
846 for (unsigned i
= 0; i
< opcode
->NumSrcRegs
; i
++) {
847 if (inst
->U
.I
.SrcReg
[i
].RelAddr
&&
848 inst
->U
.I
.SrcReg
[i
].Index
< 0) {
849 /* ARL must precede any indirect addressing. */
851 rc_error(&c
->Base
, "Vertex shader: Found relative addressing without ARL/ARR.");
855 if (inst
->U
.I
.SrcReg
[i
].Index
< min_offset
)
856 min_offset
= inst
->U
.I
.SrcReg
[i
].Index
;
861 if (lastARL
!= NULL
&& min_offset
< 0)
862 transform_negative_addressing(c
, lastARL
, inst
, min_offset
);
865 struct rc_swizzle_caps r300_vertprog_swizzle_caps
= {
866 .IsNative
= &swizzle_is_native
,
867 .Split
= 0 /* should never be called */
870 void r3xx_compile_vertex_program(struct r300_vertex_program_compiler
*c
)
872 int is_r500
= c
->Base
.is_r500
;
873 int opt
= !c
->Base
.disable_optimizations
;
875 /* Lists of instruction transformations. */
876 struct radeon_program_transformation alu_rewrite_r500
[] = {
877 { &r300_transform_vertex_alu
, 0 },
878 { &r300_transform_trig_scale_vertex
, 0 },
882 struct radeon_program_transformation alu_rewrite_r300
[] = {
883 { &r300_transform_vertex_alu
, 0 },
884 { &r300_transform_trig_simple
, 0 },
888 /* Note: These passes have to be done seperately from ALU rewrite,
889 * otherwise non-native ALU instructions with source conflits
890 * or non-native modifiers will not be treated properly.
892 struct radeon_program_transformation emulate_modifiers
[] = {
893 { &transform_nonnative_modifiers
, 0 },
897 struct radeon_program_transformation resolve_src_conflicts
[] = {
898 { &transform_source_conflicts
, 0 },
902 /* List of compiler passes. */
903 struct radeon_compiler_pass vs_list
[] = {
904 /* NAME DUMP PREDICATE FUNCTION PARAM */
905 {"add artificial outputs", 0, 1, rc_vs_add_artificial_outputs
, NULL
},
906 {"emulate branches", 1, !is_r500
, rc_emulate_branches
, NULL
},
907 {"emulate negative addressing", 1, 1, rc_emulate_negative_addressing
, NULL
},
908 {"native rewrite", 1, is_r500
, rc_local_transform
, alu_rewrite_r500
},
909 {"native rewrite", 1, !is_r500
, rc_local_transform
, alu_rewrite_r300
},
910 {"emulate modifiers", 1, !is_r500
, rc_local_transform
, emulate_modifiers
},
911 {"deadcode", 1, opt
, rc_dataflow_deadcode
, dataflow_outputs_mark_used
},
912 {"dataflow optimize", 1, opt
, rc_optimize
, NULL
},
913 /* This pass must be done after optimizations. */
914 {"source conflict resolve", 1, 1, rc_local_transform
, resolve_src_conflicts
},
915 {"register allocation", 1, opt
, allocate_temporary_registers
, NULL
},
916 {"dead constants", 1, 1, rc_remove_unused_constants
, &c
->code
->constants_remap_table
},
917 {"lower control flow opcodes", 1, is_r500
, rc_vert_fc
, NULL
},
918 {"final code validation", 0, 1, rc_validate_final_shader
, NULL
},
919 {"machine code generation", 0, 1, translate_vertex_program
, NULL
},
920 {"dump machine code", 0, c
->Base
.Debug
& RC_DBG_LOG
, r300_vertex_program_dump
, NULL
},
921 {NULL
, 0, 0, NULL
, NULL
}
924 c
->Base
.type
= RC_VERTEX_PROGRAM
;
925 c
->Base
.SwizzleCaps
= &r300_vertprog_swizzle_caps
;
927 rc_run_compiler(&c
->Base
, vs_list
);
929 c
->code
->InputsRead
= c
->Base
.Program
.InputsRead
;
930 c
->code
->OutputsWritten
= c
->Base
.Program
.OutputsWritten
;
931 rc_constants_copy(&c
->code
->constants
, &c
->Base
.Program
.Constants
);