2 * Copyright 2009 Nicolai Hähnle <nhaehnle@gmail.com>
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE. */
23 #include "radeon_compiler.h"
27 #include "../r300_reg.h"
29 #include "radeon_dataflow.h"
30 #include "radeon_program_alu.h"
31 #include "radeon_swizzle.h"
32 #include "radeon_emulate_branches.h"
33 #include "radeon_emulate_loops.h"
41 * Take an already-setup and valid source then swizzle it appropriately to
42 * obtain a constant ZERO or ONE source.
44 #define __CONST(x, y) \
45 (PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[x]), \
50 t_src_class(vpi->SrcReg[x].File), \
51 RC_MASK_NONE) | (vpi->SrcReg[x].RelAddr << 4))
54 static unsigned long t_dst_mask(unsigned int mask
)
56 /* RC_MASK_* is equivalent to VSF_FLAG_* */
57 return mask
& RC_MASK_XYZW
;
/*
 * Translate a compiler register file into a PVS_DST_REG_* destination class.
 *
 * The visible paths return PVS_DST_REG_TEMPORARY for RC_FILE_TEMPORARY and
 * also produce PVS_DST_REG_OUT and PVS_DST_REG_A0; a "Bad register file"
 * diagnostic is written to stderr for a file the function does not expect.
 *
 * NOTE(review): the switch header and several case labels are not present
 * in this copy of the file - confirm the exact file-to-class mapping
 * against the complete source.
 */
60 static unsigned long t_dst_class(rc_register_file file
)
64 fprintf(stderr
, "%s: Bad register file %i\n", __FUNCTION__
, file
);
66 case RC_FILE_TEMPORARY
:
67 return PVS_DST_REG_TEMPORARY
;
69 return PVS_DST_REG_OUT
;
71 return PVS_DST_REG_A0
;
75 static unsigned long t_dst_index(struct r300_vertex_program_code
*vp
,
76 struct rc_dst_register
*dst
)
78 if (dst
->File
== RC_FILE_OUTPUT
)
79 return vp
->outputs
[dst
->Index
];
84 static unsigned long t_src_class(rc_register_file file
)
88 fprintf(stderr
, "%s: Bad register file %i\n", __FUNCTION__
, file
);
91 case RC_FILE_TEMPORARY
:
92 return PVS_SRC_REG_TEMPORARY
;
94 return PVS_SRC_REG_INPUT
;
95 case RC_FILE_CONSTANT
:
96 return PVS_SRC_REG_CONSTANT
;
100 static int t_src_conflict(struct rc_src_register a
, struct rc_src_register b
)
102 unsigned long aclass
= t_src_class(a
.File
);
103 unsigned long bclass
= t_src_class(b
.File
);
105 if (aclass
!= bclass
)
107 if (aclass
== PVS_SRC_REG_TEMPORARY
)
110 if (a
.RelAddr
|| b
.RelAddr
)
112 if (a
.Index
!= b
.Index
)
/*
 * Convert a compiler swizzle selector (RC_SWIZZLE_*) into the value the
 * hardware expects for a source component (VSF_IN_COMPONENT_*).
 */
118 static inline unsigned long t_swizzle(unsigned int swizzle
)
120 /* this is in fact a NOP as the Mesa RC_SWIZZLE_* are all identical to VSF_IN_COMPONENT_* */
124 static unsigned long t_src_index(struct r300_vertex_program_code
*vp
,
125 struct rc_src_register
*src
)
127 if (src
->File
== RC_FILE_INPUT
) {
128 assert(vp
->inputs
[src
->Index
] != -1);
129 return vp
->inputs
[src
->Index
];
131 if (src
->Index
< 0) {
133 "negative offsets for indirect addressing do not work.\n");
140 /* these two functions should probably be merged... */
142 static unsigned long t_src(struct r300_vertex_program_code
*vp
,
143 struct rc_src_register
*src
)
145 /* src->Negate uses the RC_MASK_ flags from program_instruction.h,
146 * which equal our VSF_FLAGS_ values, so it's safe to just pass it here.
148 return PVS_SRC_OPERAND(t_src_index(vp
, src
),
149 t_swizzle(GET_SWZ(src
->Swizzle
, 0)),
150 t_swizzle(GET_SWZ(src
->Swizzle
, 1)),
151 t_swizzle(GET_SWZ(src
->Swizzle
, 2)),
152 t_swizzle(GET_SWZ(src
->Swizzle
, 3)),
153 t_src_class(src
->File
),
155 (src
->RelAddr
<< 4) | (src
->Abs
<< 3);
158 static unsigned long t_src_scalar(struct r300_vertex_program_code
*vp
,
159 struct rc_src_register
*src
)
161 /* src->Negate uses the RC_MASK_ flags from program_instruction.h,
162 * which equal our VSF_FLAGS_ values, so it's safe to just pass it here.
164 return PVS_SRC_OPERAND(t_src_index(vp
, src
),
165 t_swizzle(GET_SWZ(src
->Swizzle
, 0)),
166 t_swizzle(GET_SWZ(src
->Swizzle
, 0)),
167 t_swizzle(GET_SWZ(src
->Swizzle
, 0)),
168 t_swizzle(GET_SWZ(src
->Swizzle
, 0)),
169 t_src_class(src
->File
),
170 src
->Negate
? RC_MASK_XYZW
: RC_MASK_NONE
) |
171 (src
->RelAddr
<< 4) | (src
->Abs
<< 3);
174 static int valid_dst(struct r300_vertex_program_code
*vp
,
175 struct rc_dst_register
*dst
)
177 if (dst
->File
== RC_FILE_OUTPUT
&& vp
->outputs
[dst
->Index
] == -1) {
179 } else if (dst
->File
== RC_FILE_ADDRESS
) {
180 assert(dst
->Index
== 0);
186 static void ei_vector1(struct r300_vertex_program_code
*vp
,
187 unsigned int hw_opcode
,
188 struct rc_sub_instruction
*vpi
,
191 inst
[0] = PVS_OP_DST_OPERAND(hw_opcode
,
194 t_dst_index(vp
, &vpi
->DstReg
),
195 t_dst_mask(vpi
->DstReg
.WriteMask
),
196 t_dst_class(vpi
->DstReg
.File
));
197 inst
[1] = t_src(vp
, &vpi
->SrcReg
[0]);
198 inst
[2] = __CONST(0, RC_SWIZZLE_ZERO
);
199 inst
[3] = __CONST(0, RC_SWIZZLE_ZERO
);
202 static void ei_vector2(struct r300_vertex_program_code
*vp
,
203 unsigned int hw_opcode
,
204 struct rc_sub_instruction
*vpi
,
207 inst
[0] = PVS_OP_DST_OPERAND(hw_opcode
,
210 t_dst_index(vp
, &vpi
->DstReg
),
211 t_dst_mask(vpi
->DstReg
.WriteMask
),
212 t_dst_class(vpi
->DstReg
.File
));
213 inst
[1] = t_src(vp
, &vpi
->SrcReg
[0]);
214 inst
[2] = t_src(vp
, &vpi
->SrcReg
[1]);
215 inst
[3] = __CONST(1, RC_SWIZZLE_ZERO
);
218 static void ei_math1(struct r300_vertex_program_code
*vp
,
219 unsigned int hw_opcode
,
220 struct rc_sub_instruction
*vpi
,
223 inst
[0] = PVS_OP_DST_OPERAND(hw_opcode
,
226 t_dst_index(vp
, &vpi
->DstReg
),
227 t_dst_mask(vpi
->DstReg
.WriteMask
),
228 t_dst_class(vpi
->DstReg
.File
));
229 inst
[1] = t_src_scalar(vp
, &vpi
->SrcReg
[0]);
230 inst
[2] = __CONST(0, RC_SWIZZLE_ZERO
);
231 inst
[3] = __CONST(0, RC_SWIZZLE_ZERO
);
234 static void ei_lit(struct r300_vertex_program_code
*vp
,
235 struct rc_sub_instruction
*vpi
,
238 //LIT TMP 1.Y Z TMP 1{} {X W Z Y} TMP 1{} {Y W Z X} TMP 1{} {Y X Z W}
240 inst
[0] = PVS_OP_DST_OPERAND(ME_LIGHT_COEFF_DX
,
243 t_dst_index(vp
, &vpi
->DstReg
),
244 t_dst_mask(vpi
->DstReg
.WriteMask
),
245 t_dst_class(vpi
->DstReg
.File
));
246 /* NOTE: Users swizzling might not work. */
247 inst
[1] = PVS_SRC_OPERAND(t_src_index(vp
, &vpi
->SrcReg
[0]), t_swizzle(GET_SWZ(vpi
->SrcReg
[0].Swizzle
, 0)), // X
248 t_swizzle(GET_SWZ(vpi
->SrcReg
[0].Swizzle
, 3)), // W
249 PVS_SRC_SELECT_FORCE_0
, // Z
250 t_swizzle(GET_SWZ(vpi
->SrcReg
[0].Swizzle
, 1)), // Y
251 t_src_class(vpi
->SrcReg
[0].File
),
252 vpi
->SrcReg
[0].Negate
? RC_MASK_XYZW
: RC_MASK_NONE
) |
253 (vpi
->SrcReg
[0].RelAddr
<< 4);
254 inst
[2] = PVS_SRC_OPERAND(t_src_index(vp
, &vpi
->SrcReg
[0]), t_swizzle(GET_SWZ(vpi
->SrcReg
[0].Swizzle
, 1)), // Y
255 t_swizzle(GET_SWZ(vpi
->SrcReg
[0].Swizzle
, 3)), // W
256 PVS_SRC_SELECT_FORCE_0
, // Z
257 t_swizzle(GET_SWZ(vpi
->SrcReg
[0].Swizzle
, 0)), // X
258 t_src_class(vpi
->SrcReg
[0].File
),
259 vpi
->SrcReg
[0].Negate
? RC_MASK_XYZW
: RC_MASK_NONE
) |
260 (vpi
->SrcReg
[0].RelAddr
<< 4);
261 inst
[3] = PVS_SRC_OPERAND(t_src_index(vp
, &vpi
->SrcReg
[0]), t_swizzle(GET_SWZ(vpi
->SrcReg
[0].Swizzle
, 1)), // Y
262 t_swizzle(GET_SWZ(vpi
->SrcReg
[0].Swizzle
, 0)), // X
263 PVS_SRC_SELECT_FORCE_0
, // Z
264 t_swizzle(GET_SWZ(vpi
->SrcReg
[0].Swizzle
, 3)), // W
265 t_src_class(vpi
->SrcReg
[0].File
),
266 vpi
->SrcReg
[0].Negate
? RC_MASK_XYZW
: RC_MASK_NONE
) |
267 (vpi
->SrcReg
[0].RelAddr
<< 4);
270 static void ei_mad(struct r300_vertex_program_code
*vp
,
271 struct rc_sub_instruction
*vpi
,
274 /* Remarks about hardware limitations of MAD
275 * (please preserve this comment, as this information is _NOT_
276 * in the documentation provided by AMD).
278 * As described in the documentation, MAD with three unique temporary
279 * source registers requires the use of the macro version.
281 * However (and this is not mentioned in the documentation), apparently
282 * the macro version is _NOT_ a full superset of the normal version.
283 * In particular, the macro version does not always work when relative
284 * addressing is used in the source operands.
286 * This limitation caused incorrect rendering in Sauerbraten's OpenGL
287 * assembly shader path when using medium quality animations
288 * (i.e. animations with matrix blending instead of quaternion blending).
290 * Unfortunately, I (nha) have been unable to extract a Piglit regression
291 * test for this issue - for some reason, it is possible to have vertex
292 * programs whose prefix is *exactly* the same as the prefix of the
293 * offending program in Sauerbraten up to the offending instruction
294 * without causing any trouble.
296 * Bottom line: Use the macro version only when really necessary;
297 * according to AMD docs, this should improve performance by one clock
298 * as a nice side bonus.
300 if (vpi
->SrcReg
[0].File
== RC_FILE_TEMPORARY
&&
301 vpi
->SrcReg
[1].File
== RC_FILE_TEMPORARY
&&
302 vpi
->SrcReg
[2].File
== RC_FILE_TEMPORARY
&&
303 vpi
->SrcReg
[0].Index
!= vpi
->SrcReg
[1].Index
&&
304 vpi
->SrcReg
[0].Index
!= vpi
->SrcReg
[2].Index
&&
305 vpi
->SrcReg
[1].Index
!= vpi
->SrcReg
[2].Index
) {
306 inst
[0] = PVS_OP_DST_OPERAND(PVS_MACRO_OP_2CLK_MADD
,
309 t_dst_index(vp
, &vpi
->DstReg
),
310 t_dst_mask(vpi
->DstReg
.WriteMask
),
311 t_dst_class(vpi
->DstReg
.File
));
313 inst
[0] = PVS_OP_DST_OPERAND(VE_MULTIPLY_ADD
,
316 t_dst_index(vp
, &vpi
->DstReg
),
317 t_dst_mask(vpi
->DstReg
.WriteMask
),
318 t_dst_class(vpi
->DstReg
.File
));
320 inst
[1] = t_src(vp
, &vpi
->SrcReg
[0]);
321 inst
[2] = t_src(vp
, &vpi
->SrcReg
[1]);
322 inst
[3] = t_src(vp
, &vpi
->SrcReg
[2]);
325 static void ei_pow(struct r300_vertex_program_code
*vp
,
326 struct rc_sub_instruction
*vpi
,
329 inst
[0] = PVS_OP_DST_OPERAND(ME_POWER_FUNC_FF
,
332 t_dst_index(vp
, &vpi
->DstReg
),
333 t_dst_mask(vpi
->DstReg
.WriteMask
),
334 t_dst_class(vpi
->DstReg
.File
));
335 inst
[1] = t_src_scalar(vp
, &vpi
->SrcReg
[0]);
336 inst
[2] = __CONST(0, RC_SWIZZLE_ZERO
);
337 inst
[3] = t_src_scalar(vp
, &vpi
->SrcReg
[1]);
/*
 * Write-tracking callback (handed to rc_for_all_writes_mask by ei_if).
 *
 * userdata is treated as an array of unsigned int indexed by temporary
 * register index; the channel mask written by the instruction is OR-ed into
 * the entry for `index`.  The register file is compared against
 * RC_FILE_TEMPORARY and the index against R300_VS_MAX_TEMPS before the
 * update.
 *
 * NOTE(review): the statements guarded by those two tests are not present
 * in this copy of the file - presumably early returns that skip
 * non-temporaries and out-of-range indices; confirm against the complete
 * source.
 */
340 static void mark_write(void * userdata
, struct rc_instruction
* inst
,
341 rc_register_file file
, unsigned int index
, unsigned int mask
)
343 unsigned int * writemasks
= userdata
;
345 if (file
!= RC_FILE_TEMPORARY
)
348 if (index
>= R300_VS_MAX_TEMPS
)
351 writemasks
[index
] |= mask
;
354 static unsigned long t_pred_src(struct r300_vertex_program_compiler
* compiler
)
356 return PVS_SRC_OPERAND(compiler
->PredicateIndex
,
357 t_swizzle(RC_SWIZZLE_ZERO
),
358 t_swizzle(RC_SWIZZLE_ZERO
),
359 t_swizzle(RC_SWIZZLE_ZERO
),
360 t_swizzle(RC_SWIZZLE_W
),
361 t_src_class(RC_FILE_TEMPORARY
),
365 static unsigned long t_pred_dst(struct r300_vertex_program_compiler
* compiler
,
366 unsigned int hw_opcode
, int is_math
)
368 return PVS_OP_DST_OPERAND(hw_opcode
,
371 compiler
->PredicateIndex
,
373 t_dst_class(RC_FILE_TEMPORARY
));
377 static void ei_if(struct r300_vertex_program_compiler
* compiler
,
378 struct rc_instruction
*rci
,
380 unsigned int branch_depth
)
382 unsigned int predicate_opcode
;
385 if (!compiler
->Base
.is_r500
) {
386 rc_error(&compiler
->Base
,"Opcode IF not supported\n");
390 /* Reserve a temporary to use as our predicate stack counter, if we
391 * don't already have one. */
392 if (!compiler
->PredicateMask
) {
393 unsigned int writemasks
[R300_VS_MAX_TEMPS
];
394 memset(writemasks
, 0, sizeof(writemasks
));
395 struct rc_instruction
* inst
;
397 for(inst
= compiler
->Base
.Program
.Instructions
.Next
;
398 inst
!= &compiler
->Base
.Program
.Instructions
;
400 rc_for_all_writes_mask(inst
, mark_write
, writemasks
);
402 for(i
= 0; i
< R300_VS_MAX_TEMPS
; i
++) {
403 unsigned int mask
= ~writemasks
[i
] & RC_MASK_XYZW
;
404 /* Only the W component can be used for the predicate
406 if (mask
& RC_MASK_W
) {
407 compiler
->PredicateMask
= RC_MASK_W
;
408 compiler
->PredicateIndex
= i
;
412 if (i
== R300_VS_MAX_TEMPS
) {
413 rc_error(&compiler
->Base
, "No free temporary to use for"
414 " predicate stack counter.\n");
419 branch_depth
? VE_PRED_SET_NEQ_PUSH
: ME_PRED_SET_NEQ
;
421 rci
->U
.I
.SrcReg
[0].Swizzle
= RC_MAKE_SWIZZLE_SMEAR(GET_SWZ(rci
->U
.I
.SrcReg
[0].Swizzle
,0));
422 if (branch_depth
== 0) {
424 predicate_opcode
= ME_PRED_SET_NEQ
;
425 inst
[1] = t_src(compiler
->code
, &rci
->U
.I
.SrcReg
[0]);
428 predicate_opcode
= VE_PRED_SET_NEQ_PUSH
;
429 inst
[1] = t_pred_src(compiler
);
430 inst
[2] = t_src(compiler
->code
, &rci
->U
.I
.SrcReg
[0]);
433 inst
[0] = t_pred_dst(compiler
, predicate_opcode
, is_math
);
438 static void ei_else(struct r300_vertex_program_compiler
* compiler
,
441 if (!compiler
->Base
.is_r500
) {
442 rc_error(&compiler
->Base
,"Opcode ELSE not supported\n");
445 inst
[0] = t_pred_dst(compiler
, ME_PRED_SET_INV
, 1);
446 inst
[1] = t_pred_src(compiler
);
451 static void ei_endif(struct r300_vertex_program_compiler
*compiler
,
454 if (!compiler
->Base
.is_r500
) {
455 rc_error(&compiler
->Base
,"Opcode ENDIF not supported\n");
458 inst
[0] = t_pred_dst(compiler
, ME_PRED_SET_POP
, 1);
459 inst
[1] = t_pred_src(compiler
);
464 static void translate_vertex_program(struct r300_vertex_program_compiler
* compiler
)
466 struct rc_instruction
*rci
;
469 int current_loop_depth
= 0;
470 int loops_reserved
= 0;
472 unsigned int branch_depth
= 0;
474 compiler
->code
->pos_end
= 0; /* Not supported yet */
475 compiler
->code
->length
= 0;
477 compiler
->SetHwInputOutput(compiler
);
479 for(rci
= compiler
->Base
.Program
.Instructions
.Next
; rci
!= &compiler
->Base
.Program
.Instructions
; rci
= rci
->Next
) {
480 struct rc_sub_instruction
*vpi
= &rci
->U
.I
;
481 unsigned int *inst
= compiler
->code
->body
.d
+ compiler
->code
->length
;
483 /* Skip instructions writing to non-existing destination */
484 if (!valid_dst(compiler
->code
, &vpi
->DstReg
))
487 if (rc_get_opcode_info(vpi
->Opcode
)->HasDstReg
) {
488 /* Relative addressing of destination operands is not supported yet. */
489 if (vpi
->DstReg
.RelAddr
) {
490 rc_error(&compiler
->Base
, "Vertex program does not support relative "
491 "addressing of destination operands (yet).\n");
495 /* Neither is Saturate. */
496 if (vpi
->SaturateMode
!= RC_SATURATE_NONE
) {
497 rc_error(&compiler
->Base
, "Vertex program does not support the Saturate "
498 "modifier (yet).\n");
502 if (compiler
->code
->length
>= R500_VS_MAX_ALU_DWORDS
||
503 (compiler
->code
->length
>= R300_VS_MAX_ALU_DWORDS
&& !compiler
->Base
.is_r500
)) {
504 rc_error(&compiler
->Base
, "Vertex program has too many instructions\n");
508 assert(compiler
->Base
.is_r500
||
509 (vpi
->Opcode
!= RC_OPCODE_SEQ
&&
510 vpi
->Opcode
!= RC_OPCODE_SNE
));
512 switch (vpi
->Opcode
) {
513 case RC_OPCODE_ADD
: ei_vector2(compiler
->code
, VE_ADD
, vpi
, inst
); break;
514 case RC_OPCODE_ARL
: ei_vector1(compiler
->code
, VE_FLT2FIX_DX
, vpi
, inst
); break;
515 case RC_OPCODE_COS
: ei_math1(compiler
->code
, ME_COS
, vpi
, inst
); break;
516 case RC_OPCODE_DP4
: ei_vector2(compiler
->code
, VE_DOT_PRODUCT
, vpi
, inst
); break;
517 case RC_OPCODE_DST
: ei_vector2(compiler
->code
, VE_DISTANCE_VECTOR
, vpi
, inst
); break;
518 case RC_OPCODE_ELSE
: ei_else(compiler
, inst
); break;
519 case RC_OPCODE_ENDIF
: ei_endif(compiler
, inst
); branch_depth
--; break;
520 case RC_OPCODE_EX2
: ei_math1(compiler
->code
, ME_EXP_BASE2_FULL_DX
, vpi
, inst
); break;
521 case RC_OPCODE_EXP
: ei_math1(compiler
->code
, ME_EXP_BASE2_DX
, vpi
, inst
); break;
522 case RC_OPCODE_FRC
: ei_vector1(compiler
->code
, VE_FRACTION
, vpi
, inst
); break;
523 case RC_OPCODE_IF
: ei_if(compiler
, rci
, inst
, branch_depth
); branch_depth
++; break;
524 case RC_OPCODE_LG2
: ei_math1(compiler
->code
, ME_LOG_BASE2_FULL_DX
, vpi
, inst
); break;
525 case RC_OPCODE_LIT
: ei_lit(compiler
->code
, vpi
, inst
); break;
526 case RC_OPCODE_LOG
: ei_math1(compiler
->code
, ME_LOG_BASE2_DX
, vpi
, inst
); break;
527 case RC_OPCODE_MAD
: ei_mad(compiler
->code
, vpi
, inst
); break;
528 case RC_OPCODE_MAX
: ei_vector2(compiler
->code
, VE_MAXIMUM
, vpi
, inst
); break;
529 case RC_OPCODE_MIN
: ei_vector2(compiler
->code
, VE_MINIMUM
, vpi
, inst
); break;
530 case RC_OPCODE_MOV
: ei_vector1(compiler
->code
, VE_ADD
, vpi
, inst
); break;
531 case RC_OPCODE_MUL
: ei_vector2(compiler
->code
, VE_MULTIPLY
, vpi
, inst
); break;
532 case RC_OPCODE_POW
: ei_pow(compiler
->code
, vpi
, inst
); break;
533 case RC_OPCODE_RCP
: ei_math1(compiler
->code
, ME_RECIP_DX
, vpi
, inst
); break;
534 case RC_OPCODE_RSQ
: ei_math1(compiler
->code
, ME_RECIP_SQRT_DX
, vpi
, inst
); break;
535 case RC_OPCODE_SEQ
: ei_vector2(compiler
->code
, VE_SET_EQUAL
, vpi
, inst
); break;
536 case RC_OPCODE_SGE
: ei_vector2(compiler
->code
, VE_SET_GREATER_THAN_EQUAL
, vpi
, inst
); break;
537 case RC_OPCODE_SIN
: ei_math1(compiler
->code
, ME_SIN
, vpi
, inst
); break;
538 case RC_OPCODE_SLT
: ei_vector2(compiler
->code
, VE_SET_LESS_THAN
, vpi
, inst
); break;
539 case RC_OPCODE_SNE
: ei_vector2(compiler
->code
, VE_SET_NOT_EQUAL
, vpi
, inst
); break;
540 case RC_OPCODE_BGNLOOP
:
544 if ((!compiler
->Base
.is_r500
545 && loops_reserved
>= R300_VS_MAX_LOOP_DEPTH
)
546 || loops_reserved
>= R500_VS_MAX_FC_DEPTH
) {
547 rc_error(&compiler
->Base
,
548 "Loops are nested too deep.");
551 memory_pool_array_reserve(&compiler
->Base
.Pool
,
552 struct loop
, loops
, current_loop_depth
,
554 l
= &loops
[current_loop_depth
++];
555 memset(l
, 0, sizeof(struct loop
));
556 l
->BgnLoop
= (compiler
->code
->length
/ 4);
559 case RC_OPCODE_ENDLOOP
:
561 struct loop
* l
= &loops
[current_loop_depth
- 1];
562 unsigned int act_addr
= l
->BgnLoop
- 1;
563 unsigned int last_addr
= (compiler
->code
->length
/ 4) - 1;
564 unsigned int ret_addr
= l
->BgnLoop
;
566 if (loops_reserved
>= R300_VS_MAX_FC_OPS
) {
567 rc_error(&compiler
->Base
,
568 "Too many flow control instructions.");
571 if (compiler
->Base
.is_r500
) {
572 compiler
->code
->fc_op_addrs
.r500
573 [compiler
->code
->num_fc_ops
].lw
=
574 R500_PVS_FC_ACT_ADRS(act_addr
)
575 | R500_PVS_FC_LOOP_CNT_JMP_INST(0xffff)
577 compiler
->code
->fc_op_addrs
.r500
578 [compiler
->code
->num_fc_ops
].uw
=
579 R500_PVS_FC_LAST_INST(last_addr
)
580 | R500_PVS_FC_RTN_INST(ret_addr
)
583 compiler
->code
->fc_op_addrs
.r300
584 [compiler
->code
->num_fc_ops
] =
585 R300_PVS_FC_ACT_ADRS(act_addr
)
586 | R300_PVS_FC_LOOP_CNT_JMP_INST(0xff)
587 | R300_PVS_FC_LAST_INST(last_addr
)
588 | R300_PVS_FC_RTN_INST(ret_addr
)
591 compiler
->code
->fc_loop_index
[compiler
->code
->num_fc_ops
] =
592 R300_PVS_FC_LOOP_INIT_VAL(0x0)
593 | R300_PVS_FC_LOOP_STEP_VAL(0x1)
595 compiler
->code
->fc_ops
|= R300_VAP_PVS_FC_OPC_LOOP(
596 compiler
->code
->num_fc_ops
);
597 compiler
->code
->num_fc_ops
++;
598 current_loop_depth
--;
603 rc_error(&compiler
->Base
, "Unknown opcode %s\n", rc_get_opcode_info(vpi
->Opcode
)->Name
);
607 /* Non-flow control instructions that are inside an if statement
608 * need to pay attention to the predicate bit. */
610 && vpi
->Opcode
!= RC_OPCODE_IF
611 && vpi
->Opcode
!= RC_OPCODE_ELSE
612 && vpi
->Opcode
!= RC_OPCODE_ENDIF
) {
614 inst
[0] |= (PVS_DST_PRED_ENABLE_MASK
615 << PVS_DST_PRED_ENABLE_SHIFT
);
616 inst
[0] |= (PVS_DST_PRED_SENSE_MASK
617 << PVS_DST_PRED_SENSE_SHIFT
);
620 compiler
->code
->length
+= 4;
622 if (compiler
->Base
.Error
)
/*
 * Per-temporary bookkeeping used by allocate_temporary_registers(): one
 * entry exists for each temporary index of the original program.
 */
struct temporary_allocation {
	/* Set to 1 once a hardware temporary has been assigned to this entry. */
	unsigned int Allocated:1;

	/* Hardware temporary index that replaces the original register index. */
	unsigned int HwTemp:15;

	/* Instruction recorded as the last reader of this temporary (the
	 * enclosing ENDLOOP when the read happens inside a loop). */
	struct rc_instruction *LastRead;
};
633 static void allocate_temporary_registers(struct r300_vertex_program_compiler
* compiler
)
635 struct rc_instruction
*inst
;
636 struct rc_instruction
*end_loop
= NULL
;
637 unsigned int num_orig_temps
= 0;
638 char hwtemps
[R300_VS_MAX_TEMPS
];
639 struct temporary_allocation
* ta
;
642 memset(hwtemps
, 0, sizeof(hwtemps
));
644 /* Pass 1: Count original temporaries. */
645 for(inst
= compiler
->Base
.Program
.Instructions
.Next
; inst
!= &compiler
->Base
.Program
.Instructions
; inst
= inst
->Next
) {
646 const struct rc_opcode_info
* opcode
= rc_get_opcode_info(inst
->U
.I
.Opcode
);
648 for (i
= 0; i
< opcode
->NumSrcRegs
; ++i
) {
649 if (inst
->U
.I
.SrcReg
[i
].File
== RC_FILE_TEMPORARY
) {
650 if (inst
->U
.I
.SrcReg
[i
].Index
>= num_orig_temps
)
651 num_orig_temps
= inst
->U
.I
.SrcReg
[i
].Index
+ 1;
655 if (opcode
->HasDstReg
) {
656 if (inst
->U
.I
.DstReg
.File
== RC_FILE_TEMPORARY
) {
657 if (inst
->U
.I
.DstReg
.Index
>= num_orig_temps
)
658 num_orig_temps
= inst
->U
.I
.DstReg
.Index
+ 1;
662 compiler
->code
->num_temporaries
= num_orig_temps
;
664 /* Pass 2: If there is relative addressing of temporaries, we cannot change register indices. Give up. */
665 for (inst
= compiler
->Base
.Program
.Instructions
.Next
; inst
!= &compiler
->Base
.Program
.Instructions
; inst
= inst
->Next
) {
666 const struct rc_opcode_info
*opcode
= rc_get_opcode_info(inst
->U
.I
.Opcode
);
668 if (opcode
->HasDstReg
)
669 if (inst
->U
.I
.DstReg
.RelAddr
)
672 for (i
= 0; i
< opcode
->NumSrcRegs
; ++i
) {
673 if (inst
->U
.I
.SrcReg
[i
].File
== RC_FILE_TEMPORARY
&&
674 inst
->U
.I
.SrcReg
[i
].RelAddr
) {
680 compiler
->code
->num_temporaries
= 0;
681 ta
= (struct temporary_allocation
*)memory_pool_malloc(&compiler
->Base
.Pool
,
682 sizeof(struct temporary_allocation
) * num_orig_temps
);
683 memset(ta
, 0, sizeof(struct temporary_allocation
) * num_orig_temps
);
685 /* Pass 3: Determine original temporary lifetimes */
686 for(inst
= compiler
->Base
.Program
.Instructions
.Next
; inst
!= &compiler
->Base
.Program
.Instructions
; inst
= inst
->Next
) {
687 const struct rc_opcode_info
* opcode
= rc_get_opcode_info(inst
->U
.I
.Opcode
);
688 /* Instructions inside of loops need to use the ENDLOOP
689 * instruction as their LastRead. */
690 if (!end_loop
&& inst
->U
.I
.Opcode
== RC_OPCODE_BGNLOOP
) {
692 struct rc_instruction
* ptr
;
693 for(ptr
= inst
->Next
;
694 ptr
!= &compiler
->Base
.Program
.Instructions
;
696 if (ptr
->U
.I
.Opcode
== RC_OPCODE_BGNLOOP
) {
698 } else if (ptr
->U
.I
.Opcode
== RC_OPCODE_ENDLOOP
) {
708 if (inst
== end_loop
) {
713 for (i
= 0; i
< opcode
->NumSrcRegs
; ++i
) {
714 if (inst
->U
.I
.SrcReg
[i
].File
== RC_FILE_TEMPORARY
)
715 ta
[inst
->U
.I
.SrcReg
[i
].Index
].LastRead
=
716 end_loop
? end_loop
: inst
;
720 /* Pass 4: Register allocation */
721 for(inst
= compiler
->Base
.Program
.Instructions
.Next
; inst
!= &compiler
->Base
.Program
.Instructions
; inst
= inst
->Next
) {
722 const struct rc_opcode_info
* opcode
= rc_get_opcode_info(inst
->U
.I
.Opcode
);
724 for (i
= 0; i
< opcode
->NumSrcRegs
; ++i
) {
725 if (inst
->U
.I
.SrcReg
[i
].File
== RC_FILE_TEMPORARY
) {
726 unsigned int orig
= inst
->U
.I
.SrcReg
[i
].Index
;
727 inst
->U
.I
.SrcReg
[i
].Index
= ta
[orig
].HwTemp
;
729 if (ta
[orig
].Allocated
&& inst
== ta
[orig
].LastRead
)
730 hwtemps
[ta
[orig
].HwTemp
] = 0;
734 if (opcode
->HasDstReg
) {
735 if (inst
->U
.I
.DstReg
.File
== RC_FILE_TEMPORARY
) {
736 unsigned int orig
= inst
->U
.I
.DstReg
.Index
;
738 if (!ta
[orig
].Allocated
) {
739 for(j
= 0; j
< R300_VS_MAX_TEMPS
; ++j
) {
743 if (j
>= R300_VS_MAX_TEMPS
) {
744 fprintf(stderr
, "Out of hw temporaries\n");
746 ta
[orig
].Allocated
= 1;
750 if (j
>= compiler
->code
->num_temporaries
)
751 compiler
->code
->num_temporaries
= j
+ 1;
755 inst
->U
.I
.DstReg
.Index
= ta
[orig
].HwTemp
;
762 * R3xx-R4xx vertex engine does not support the Absolute source operand modifier
763 * and the Saturate opcode modifier. Only Absolute is currently transformed.
765 static int transform_nonnative_modifiers(
766 struct radeon_compiler
*c
,
767 struct rc_instruction
*inst
,
770 const struct rc_opcode_info
*opcode
= rc_get_opcode_info(inst
->U
.I
.Opcode
);
773 /* Transform ABS(a) to MAX(a, -a). */
774 for (i
= 0; i
< opcode
->NumSrcRegs
; i
++) {
775 if (inst
->U
.I
.SrcReg
[i
].Abs
) {
776 struct rc_instruction
*new_inst
;
779 inst
->U
.I
.SrcReg
[i
].Abs
= 0;
781 temp
= rc_find_free_temporary(c
);
783 new_inst
= rc_insert_new_instruction(c
, inst
->Prev
);
784 new_inst
->U
.I
.Opcode
= RC_OPCODE_MAX
;
785 new_inst
->U
.I
.DstReg
.File
= RC_FILE_TEMPORARY
;
786 new_inst
->U
.I
.DstReg
.Index
= temp
;
787 new_inst
->U
.I
.SrcReg
[0] = inst
->U
.I
.SrcReg
[i
];
788 new_inst
->U
.I
.SrcReg
[1] = inst
->U
.I
.SrcReg
[i
];
789 new_inst
->U
.I
.SrcReg
[1].Negate
^= RC_MASK_XYZW
;
791 memset(&inst
->U
.I
.SrcReg
[i
], 0, sizeof(inst
->U
.I
.SrcReg
[i
]));
792 inst
->U
.I
.SrcReg
[i
].File
= RC_FILE_TEMPORARY
;
793 inst
->U
.I
.SrcReg
[i
].Index
= temp
;
794 inst
->U
.I
.SrcReg
[i
].Swizzle
= RC_SWIZZLE_XYZW
;
801 * Vertex engine cannot read two inputs or two constants at the same time.
802 * Introduce intermediate MOVs to temporary registers to account for this.
804 static int transform_source_conflicts(
805 struct radeon_compiler
*c
,
806 struct rc_instruction
* inst
,
809 const struct rc_opcode_info
* opcode
= rc_get_opcode_info(inst
->U
.I
.Opcode
);
811 if (opcode
->NumSrcRegs
== 3) {
812 if (t_src_conflict(inst
->U
.I
.SrcReg
[1], inst
->U
.I
.SrcReg
[2])
813 || t_src_conflict(inst
->U
.I
.SrcReg
[0], inst
->U
.I
.SrcReg
[2])) {
814 int tmpreg
= rc_find_free_temporary(c
);
815 struct rc_instruction
* inst_mov
= rc_insert_new_instruction(c
, inst
->Prev
);
816 inst_mov
->U
.I
.Opcode
= RC_OPCODE_MOV
;
817 inst_mov
->U
.I
.DstReg
.File
= RC_FILE_TEMPORARY
;
818 inst_mov
->U
.I
.DstReg
.Index
= tmpreg
;
819 inst_mov
->U
.I
.SrcReg
[0] = inst
->U
.I
.SrcReg
[2];
821 reset_srcreg(&inst
->U
.I
.SrcReg
[2]);
822 inst
->U
.I
.SrcReg
[2].File
= RC_FILE_TEMPORARY
;
823 inst
->U
.I
.SrcReg
[2].Index
= tmpreg
;
827 if (opcode
->NumSrcRegs
>= 2) {
828 if (t_src_conflict(inst
->U
.I
.SrcReg
[1], inst
->U
.I
.SrcReg
[0])) {
829 int tmpreg
= rc_find_free_temporary(c
);
830 struct rc_instruction
* inst_mov
= rc_insert_new_instruction(c
, inst
->Prev
);
831 inst_mov
->U
.I
.Opcode
= RC_OPCODE_MOV
;
832 inst_mov
->U
.I
.DstReg
.File
= RC_FILE_TEMPORARY
;
833 inst_mov
->U
.I
.DstReg
.Index
= tmpreg
;
834 inst_mov
->U
.I
.SrcReg
[0] = inst
->U
.I
.SrcReg
[1];
836 reset_srcreg(&inst
->U
.I
.SrcReg
[1]);
837 inst
->U
.I
.SrcReg
[1].File
= RC_FILE_TEMPORARY
;
838 inst
->U
.I
.SrcReg
[1].Index
= tmpreg
;
845 static void addArtificialOutputs(struct r300_vertex_program_compiler
* compiler
)
849 for(i
= 0; i
< 32; ++i
) {
850 if ((compiler
->RequiredOutputs
& (1 << i
)) &&
851 !(compiler
->Base
.Program
.OutputsWritten
& (1 << i
))) {
852 struct rc_instruction
* inst
= rc_insert_new_instruction(&compiler
->Base
, compiler
->Base
.Program
.Instructions
.Prev
);
853 inst
->U
.I
.Opcode
= RC_OPCODE_MOV
;
855 inst
->U
.I
.DstReg
.File
= RC_FILE_OUTPUT
;
856 inst
->U
.I
.DstReg
.Index
= i
;
857 inst
->U
.I
.DstReg
.WriteMask
= RC_MASK_XYZW
;
859 inst
->U
.I
.SrcReg
[0].File
= RC_FILE_CONSTANT
;
860 inst
->U
.I
.SrcReg
[0].Index
= 0;
861 inst
->U
.I
.SrcReg
[0].Swizzle
= RC_SWIZZLE_XYZW
;
863 compiler
->Base
.Program
.OutputsWritten
|= 1 << i
;
868 static void dataflow_outputs_mark_used(void * userdata
, void * data
,
869 void (*callback
)(void *, unsigned int, unsigned int))
871 struct r300_vertex_program_compiler
* c
= userdata
;
874 for(i
= 0; i
< 32; ++i
) {
875 if (c
->RequiredOutputs
& (1 << i
))
876 callback(data
, i
, RC_MASK_XYZW
);
880 static int swizzle_is_native(rc_opcode opcode
, struct rc_src_register reg
)
/*
 * Debug helper: writes "Vertex Program: <where>" to stderr and then dumps
 * the current program with rc_print_program().
 *
 * c:     compiler whose Base.Program is printed.
 * where: label naming the compilation stage that has just run.
 *
 * NOTE(review): a line between the signature and the fprintf is not
 * present in this copy of the file - the output may be conditional (e.g. on
 * a debug flag); confirm against the complete source.
 */
888 static void debug_program_log(struct r300_vertex_program_compiler
* c
, const char * where
)
891 fprintf(stderr
, "Vertex Program: %s\n", where
);
892 rc_print_program(&c
->Base
.Program
);
897 static struct rc_swizzle_caps r300_vertprog_swizzle_caps
= {
898 .IsNative
= &swizzle_is_native
,
899 .Split
= 0 /* should never be called */
903 void r3xx_compile_vertex_program(struct r300_vertex_program_compiler
*c
)
905 struct emulate_loop_state loop_state
;
907 c
->Base
.SwizzleCaps
= &r300_vertprog_swizzle_caps
;
909 addArtificialOutputs(c
);
911 debug_program_log(c
, "before compilation");
914 rc_transform_loops(&c
->Base
, &loop_state
, R500_VS_MAX_ALU
);
916 rc_transform_loops(&c
->Base
, &loop_state
, R300_VS_MAX_ALU
);
920 debug_program_log(c
, "after emulate loops");
922 if (!c
->Base
.is_r500
) {
923 rc_emulate_branches(&c
->Base
);
926 debug_program_log(c
, "after emulate branches");
929 if (c
->Base
.is_r500
) {
930 struct radeon_program_transformation transformations
[] = {
931 { &r300_transform_vertex_alu
, 0 },
932 { &r300_transform_trig_scale_vertex
, 0 }
934 radeonLocalTransform(&c
->Base
, 2, transformations
);
938 debug_program_log(c
, "after native rewrite");
940 struct radeon_program_transformation transformations
[] = {
941 { &r300_transform_vertex_alu
, 0 },
942 { &radeonTransformTrigSimple
, 0 }
944 radeonLocalTransform(&c
->Base
, 2, transformations
);
948 debug_program_log(c
, "after native rewrite");
950 /* Note: This pass has to be done separately from ALU rewrite,
951 * because it needs to check every instruction.
953 struct radeon_program_transformation transformations2
[] = {
954 { &transform_nonnative_modifiers
, 0 },
956 radeonLocalTransform(&c
->Base
, 1, transformations2
);
960 debug_program_log(c
, "after emulate modifiers");
964 /* Note: This pass has to be done separately from ALU rewrite,
965 * otherwise non-native ALU instructions with source conflicts
966 * will not be treated properly.
968 struct radeon_program_transformation transformations
[] = {
969 { &transform_source_conflicts
, 0 },
971 radeonLocalTransform(&c
->Base
, 1, transformations
);
976 debug_program_log(c
, "after source conflict resolve");
978 rc_dataflow_deadcode(&c
->Base
, &dataflow_outputs_mark_used
, c
);
982 debug_program_log(c
, "after deadcode");
984 rc_dataflow_swizzles(&c
->Base
);
988 debug_program_log(c
, "after dataflow");
990 allocate_temporary_registers(c
);
994 debug_program_log(c
, "after register allocation");
997 translate_vertex_program(c
);
1001 rc_constants_copy(&c
->code
->constants
, &c
->Base
.Program
.Constants
);
1003 c
->code
->InputsRead
= c
->Base
.Program
.InputsRead
;
1004 c
->code
->OutputsWritten
= c
->Base
.Program
.OutputsWritten
;
1006 if (c
->Base
.Debug
) {
1007 fprintf(stderr
, "Final vertex program code:\n");
1008 r300_vertex_program_dump(c
);
1011 /* Check the number of constants. */
1012 if (!c
->Base
.Error
&&
1013 c
->Base
.Program
.Constants
.Count
> 256) {
1014 rc_error(&c
->Base
, "Too many constants. Max: 256, Got: %i\n",
1015 c
->Base
.Program
.Constants
.Count
);