2 * Copyright 2009 Nicolai Hähnle <nhaehnle@gmail.com>
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE. */
23 #include "radeon_compiler.h"
27 #include "../r300_reg.h"
29 #include "radeon_dataflow.h"
30 #include "radeon_program_alu.h"
31 #include "radeon_swizzle.h"
32 #include "radeon_emulate_branches.h"
33 #include "radeon_emulate_loops.h"
/**
 * Take an already-setup and valid source then swizzle it appropriately to
 * obtain a constant ZERO or ONE source.
 *
 * The macro expands in a scope that provides `vp` and `vpi`.  `x` selects
 * which source slot of `vpi` supplies the register index, class and
 * relative-addressing bit; `y` is the swizzle selector replicated into all
 * four components.  No negation is applied.
 */
#define __CONST(x, y)	\
	(PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[x]),	\
			 t_swizzle(y),	\
			 t_swizzle(y),	\
			 t_swizzle(y),	\
			 t_swizzle(y),	\
			 t_src_class(vpi->SrcReg[x].File), \
			 RC_MASK_NONE) | (vpi->SrcReg[x].RelAddr << 4))
49 static unsigned long t_dst_mask(unsigned int mask
)
51 /* RC_MASK_* is equivalent to VSF_FLAG_* */
52 return mask
& RC_MASK_XYZW
;
55 static unsigned long t_dst_class(rc_register_file file
)
59 fprintf(stderr
, "%s: Bad register file %i\n", __FUNCTION__
, file
);
61 case RC_FILE_TEMPORARY
:
62 return PVS_DST_REG_TEMPORARY
;
64 return PVS_DST_REG_OUT
;
66 return PVS_DST_REG_A0
;
70 static unsigned long t_dst_index(struct r300_vertex_program_code
*vp
,
71 struct rc_dst_register
*dst
)
73 if (dst
->File
== RC_FILE_OUTPUT
)
74 return vp
->outputs
[dst
->Index
];
79 static unsigned long t_src_class(rc_register_file file
)
83 fprintf(stderr
, "%s: Bad register file %i\n", __FUNCTION__
, file
);
86 case RC_FILE_TEMPORARY
:
87 return PVS_SRC_REG_TEMPORARY
;
89 return PVS_SRC_REG_INPUT
;
90 case RC_FILE_CONSTANT
:
91 return PVS_SRC_REG_CONSTANT
;
95 static int t_src_conflict(struct rc_src_register a
, struct rc_src_register b
)
97 unsigned long aclass
= t_src_class(a
.File
);
98 unsigned long bclass
= t_src_class(b
.File
);
100 if (aclass
!= bclass
)
102 if (aclass
== PVS_SRC_REG_TEMPORARY
)
105 if (a
.RelAddr
|| b
.RelAddr
)
107 if (a
.Index
!= b
.Index
)
/**
 * Translate a compiler swizzle selector into the hardware selector.
 *
 * @param swizzle  RC_SWIZZLE_* selector for a single component
 * @return the same value, widened to unsigned long
 */
static inline unsigned long t_swizzle(unsigned int swizzle)
{
	/* this is in fact a NOP as the Mesa RC_SWIZZLE_* are all identical to VSF_IN_COMPONENT_* */
	return swizzle;
}
119 static unsigned long t_src_index(struct r300_vertex_program_code
*vp
,
120 struct rc_src_register
*src
)
122 if (src
->File
== RC_FILE_INPUT
) {
123 assert(vp
->inputs
[src
->Index
] != -1);
124 return vp
->inputs
[src
->Index
];
126 if (src
->Index
< 0) {
128 "negative offsets for indirect addressing do not work.\n");
135 /* these two functions should probably be merged... */
137 static unsigned long t_src(struct r300_vertex_program_code
*vp
,
138 struct rc_src_register
*src
)
140 /* src->Negate uses the RC_MASK_ flags from program_instruction.h,
141 * which equal our VSF_FLAGS_ values, so it's safe to just pass it here.
143 return PVS_SRC_OPERAND(t_src_index(vp
, src
),
144 t_swizzle(GET_SWZ(src
->Swizzle
, 0)),
145 t_swizzle(GET_SWZ(src
->Swizzle
, 1)),
146 t_swizzle(GET_SWZ(src
->Swizzle
, 2)),
147 t_swizzle(GET_SWZ(src
->Swizzle
, 3)),
148 t_src_class(src
->File
),
150 (src
->RelAddr
<< 4) | (src
->Abs
<< 3);
153 static unsigned long t_src_scalar(struct r300_vertex_program_code
*vp
,
154 struct rc_src_register
*src
)
156 /* src->Negate uses the RC_MASK_ flags from program_instruction.h,
157 * which equal our VSF_FLAGS_ values, so it's safe to just pass it here.
159 return PVS_SRC_OPERAND(t_src_index(vp
, src
),
160 t_swizzle(GET_SWZ(src
->Swizzle
, 0)),
161 t_swizzle(GET_SWZ(src
->Swizzle
, 0)),
162 t_swizzle(GET_SWZ(src
->Swizzle
, 0)),
163 t_swizzle(GET_SWZ(src
->Swizzle
, 0)),
164 t_src_class(src
->File
),
165 src
->Negate
? RC_MASK_XYZW
: RC_MASK_NONE
) |
166 (src
->RelAddr
<< 4) | (src
->Abs
<< 3);
169 static int valid_dst(struct r300_vertex_program_code
*vp
,
170 struct rc_dst_register
*dst
)
172 if (dst
->File
== RC_FILE_OUTPUT
&& vp
->outputs
[dst
->Index
] == -1) {
174 } else if (dst
->File
== RC_FILE_ADDRESS
) {
175 assert(dst
->Index
== 0);
181 static void ei_vector1(struct r300_vertex_program_code
*vp
,
182 unsigned int hw_opcode
,
183 struct rc_sub_instruction
*vpi
,
186 inst
[0] = PVS_OP_DST_OPERAND(hw_opcode
,
189 t_dst_index(vp
, &vpi
->DstReg
),
190 t_dst_mask(vpi
->DstReg
.WriteMask
),
191 t_dst_class(vpi
->DstReg
.File
));
192 inst
[1] = t_src(vp
, &vpi
->SrcReg
[0]);
193 inst
[2] = __CONST(0, RC_SWIZZLE_ZERO
);
194 inst
[3] = __CONST(0, RC_SWIZZLE_ZERO
);
197 static void ei_vector2(struct r300_vertex_program_code
*vp
,
198 unsigned int hw_opcode
,
199 struct rc_sub_instruction
*vpi
,
202 inst
[0] = PVS_OP_DST_OPERAND(hw_opcode
,
205 t_dst_index(vp
, &vpi
->DstReg
),
206 t_dst_mask(vpi
->DstReg
.WriteMask
),
207 t_dst_class(vpi
->DstReg
.File
));
208 inst
[1] = t_src(vp
, &vpi
->SrcReg
[0]);
209 inst
[2] = t_src(vp
, &vpi
->SrcReg
[1]);
210 inst
[3] = __CONST(1, RC_SWIZZLE_ZERO
);
213 static void ei_math1(struct r300_vertex_program_code
*vp
,
214 unsigned int hw_opcode
,
215 struct rc_sub_instruction
*vpi
,
218 inst
[0] = PVS_OP_DST_OPERAND(hw_opcode
,
221 t_dst_index(vp
, &vpi
->DstReg
),
222 t_dst_mask(vpi
->DstReg
.WriteMask
),
223 t_dst_class(vpi
->DstReg
.File
));
224 inst
[1] = t_src_scalar(vp
, &vpi
->SrcReg
[0]);
225 inst
[2] = __CONST(0, RC_SWIZZLE_ZERO
);
226 inst
[3] = __CONST(0, RC_SWIZZLE_ZERO
);
229 static void ei_lit(struct r300_vertex_program_code
*vp
,
230 struct rc_sub_instruction
*vpi
,
233 //LIT TMP 1.Y Z TMP 1{} {X W Z Y} TMP 1{} {Y W Z X} TMP 1{} {Y X Z W}
235 inst
[0] = PVS_OP_DST_OPERAND(ME_LIGHT_COEFF_DX
,
238 t_dst_index(vp
, &vpi
->DstReg
),
239 t_dst_mask(vpi
->DstReg
.WriteMask
),
240 t_dst_class(vpi
->DstReg
.File
));
241 /* NOTE: Users swizzling might not work. */
242 inst
[1] = PVS_SRC_OPERAND(t_src_index(vp
, &vpi
->SrcReg
[0]), t_swizzle(GET_SWZ(vpi
->SrcReg
[0].Swizzle
, 0)), // X
243 t_swizzle(GET_SWZ(vpi
->SrcReg
[0].Swizzle
, 3)), // W
244 PVS_SRC_SELECT_FORCE_0
, // Z
245 t_swizzle(GET_SWZ(vpi
->SrcReg
[0].Swizzle
, 1)), // Y
246 t_src_class(vpi
->SrcReg
[0].File
),
247 vpi
->SrcReg
[0].Negate
? RC_MASK_XYZW
: RC_MASK_NONE
) |
248 (vpi
->SrcReg
[0].RelAddr
<< 4);
249 inst
[2] = PVS_SRC_OPERAND(t_src_index(vp
, &vpi
->SrcReg
[0]), t_swizzle(GET_SWZ(vpi
->SrcReg
[0].Swizzle
, 1)), // Y
250 t_swizzle(GET_SWZ(vpi
->SrcReg
[0].Swizzle
, 3)), // W
251 PVS_SRC_SELECT_FORCE_0
, // Z
252 t_swizzle(GET_SWZ(vpi
->SrcReg
[0].Swizzle
, 0)), // X
253 t_src_class(vpi
->SrcReg
[0].File
),
254 vpi
->SrcReg
[0].Negate
? RC_MASK_XYZW
: RC_MASK_NONE
) |
255 (vpi
->SrcReg
[0].RelAddr
<< 4);
256 inst
[3] = PVS_SRC_OPERAND(t_src_index(vp
, &vpi
->SrcReg
[0]), t_swizzle(GET_SWZ(vpi
->SrcReg
[0].Swizzle
, 1)), // Y
257 t_swizzle(GET_SWZ(vpi
->SrcReg
[0].Swizzle
, 0)), // X
258 PVS_SRC_SELECT_FORCE_0
, // Z
259 t_swizzle(GET_SWZ(vpi
->SrcReg
[0].Swizzle
, 3)), // W
260 t_src_class(vpi
->SrcReg
[0].File
),
261 vpi
->SrcReg
[0].Negate
? RC_MASK_XYZW
: RC_MASK_NONE
) |
262 (vpi
->SrcReg
[0].RelAddr
<< 4);
265 static void ei_mad(struct r300_vertex_program_code
*vp
,
266 struct rc_sub_instruction
*vpi
,
269 /* Remarks about hardware limitations of MAD
270 * (please preserve this comment, as this information is _NOT_
271 * in the documentation provided by AMD).
273 * As described in the documentation, MAD with three unique temporary
274 * source registers requires the use of the macro version.
276 * However (and this is not mentioned in the documentation), apparently
277 * the macro version is _NOT_ a full superset of the normal version.
278 * In particular, the macro version does not always work when relative
279 * addressing is used in the source operands.
281 * This limitation caused incorrect rendering in Sauerbraten's OpenGL
282 * assembly shader path when using medium quality animations
283 * (i.e. animations with matrix blending instead of quaternion blending).
285 * Unfortunately, I (nha) have been unable to extract a Piglit regression
286 * test for this issue - for some reason, it is possible to have vertex
287 * programs whose prefix is *exactly* the same as the prefix of the
288 * offending program in Sauerbraten up to the offending instruction
289 * without causing any trouble.
291 * Bottom line: Only use the macro version only when really necessary;
292 * according to AMD docs, this should improve performance by one clock
293 * as a nice side bonus.
295 if (vpi
->SrcReg
[0].File
== RC_FILE_TEMPORARY
&&
296 vpi
->SrcReg
[1].File
== RC_FILE_TEMPORARY
&&
297 vpi
->SrcReg
[2].File
== RC_FILE_TEMPORARY
&&
298 vpi
->SrcReg
[0].Index
!= vpi
->SrcReg
[1].Index
&&
299 vpi
->SrcReg
[0].Index
!= vpi
->SrcReg
[2].Index
&&
300 vpi
->SrcReg
[1].Index
!= vpi
->SrcReg
[2].Index
) {
301 inst
[0] = PVS_OP_DST_OPERAND(PVS_MACRO_OP_2CLK_MADD
,
304 t_dst_index(vp
, &vpi
->DstReg
),
305 t_dst_mask(vpi
->DstReg
.WriteMask
),
306 t_dst_class(vpi
->DstReg
.File
));
308 inst
[0] = PVS_OP_DST_OPERAND(VE_MULTIPLY_ADD
,
311 t_dst_index(vp
, &vpi
->DstReg
),
312 t_dst_mask(vpi
->DstReg
.WriteMask
),
313 t_dst_class(vpi
->DstReg
.File
));
315 inst
[1] = t_src(vp
, &vpi
->SrcReg
[0]);
316 inst
[2] = t_src(vp
, &vpi
->SrcReg
[1]);
317 inst
[3] = t_src(vp
, &vpi
->SrcReg
[2]);
320 static void ei_pow(struct r300_vertex_program_code
*vp
,
321 struct rc_sub_instruction
*vpi
,
324 inst
[0] = PVS_OP_DST_OPERAND(ME_POWER_FUNC_FF
,
327 t_dst_index(vp
, &vpi
->DstReg
),
328 t_dst_mask(vpi
->DstReg
.WriteMask
),
329 t_dst_class(vpi
->DstReg
.File
));
330 inst
[1] = t_src_scalar(vp
, &vpi
->SrcReg
[0]);
331 inst
[2] = __CONST(0, RC_SWIZZLE_ZERO
);
332 inst
[3] = t_src_scalar(vp
, &vpi
->SrcReg
[1]);
336 static void translate_vertex_program(struct r300_vertex_program_compiler
* compiler
)
338 struct rc_instruction
*rci
;
340 compiler
->code
->pos_end
= 0; /* Not supported yet */
341 compiler
->code
->length
= 0;
343 compiler
->SetHwInputOutput(compiler
);
345 for(rci
= compiler
->Base
.Program
.Instructions
.Next
; rci
!= &compiler
->Base
.Program
.Instructions
; rci
= rci
->Next
) {
346 struct rc_sub_instruction
*vpi
= &rci
->U
.I
;
347 unsigned int *inst
= compiler
->code
->body
.d
+ compiler
->code
->length
;
349 /* Skip instructions writing to non-existing destination */
350 if (!valid_dst(compiler
->code
, &vpi
->DstReg
))
353 if (compiler
->code
->length
>= R500_VS_MAX_ALU_DWORDS
||
354 (compiler
->code
->length
>= R300_VS_MAX_ALU_DWORDS
&& !compiler
->Base
.is_r500
)) {
355 rc_error(&compiler
->Base
, "Vertex program has too many instructions\n");
359 assert(compiler
->Base
.is_r500
||
360 (vpi
->Opcode
!= RC_OPCODE_SEQ
&&
361 vpi
->Opcode
!= RC_OPCODE_SNE
));
363 switch (vpi
->Opcode
) {
364 case RC_OPCODE_ADD
: ei_vector2(compiler
->code
, VE_ADD
, vpi
, inst
); break;
365 case RC_OPCODE_ARL
: ei_vector1(compiler
->code
, VE_FLT2FIX_DX
, vpi
, inst
); break;
366 case RC_OPCODE_COS
: ei_math1(compiler
->code
, ME_COS
, vpi
, inst
); break;
367 case RC_OPCODE_DP4
: ei_vector2(compiler
->code
, VE_DOT_PRODUCT
, vpi
, inst
); break;
368 case RC_OPCODE_DST
: ei_vector2(compiler
->code
, VE_DISTANCE_VECTOR
, vpi
, inst
); break;
369 case RC_OPCODE_EX2
: ei_math1(compiler
->code
, ME_EXP_BASE2_FULL_DX
, vpi
, inst
); break;
370 case RC_OPCODE_EXP
: ei_math1(compiler
->code
, ME_EXP_BASE2_DX
, vpi
, inst
); break;
371 case RC_OPCODE_FRC
: ei_vector1(compiler
->code
, VE_FRACTION
, vpi
, inst
); break;
372 case RC_OPCODE_LG2
: ei_math1(compiler
->code
, ME_LOG_BASE2_FULL_DX
, vpi
, inst
); break;
373 case RC_OPCODE_LIT
: ei_lit(compiler
->code
, vpi
, inst
); break;
374 case RC_OPCODE_LOG
: ei_math1(compiler
->code
, ME_LOG_BASE2_DX
, vpi
, inst
); break;
375 case RC_OPCODE_MAD
: ei_mad(compiler
->code
, vpi
, inst
); break;
376 case RC_OPCODE_MAX
: ei_vector2(compiler
->code
, VE_MAXIMUM
, vpi
, inst
); break;
377 case RC_OPCODE_MIN
: ei_vector2(compiler
->code
, VE_MINIMUM
, vpi
, inst
); break;
378 case RC_OPCODE_MOV
: ei_vector1(compiler
->code
, VE_ADD
, vpi
, inst
); break;
379 case RC_OPCODE_MUL
: ei_vector2(compiler
->code
, VE_MULTIPLY
, vpi
, inst
); break;
380 case RC_OPCODE_POW
: ei_pow(compiler
->code
, vpi
, inst
); break;
381 case RC_OPCODE_RCP
: ei_math1(compiler
->code
, ME_RECIP_DX
, vpi
, inst
); break;
382 case RC_OPCODE_RSQ
: ei_math1(compiler
->code
, ME_RECIP_SQRT_DX
, vpi
, inst
); break;
383 case RC_OPCODE_SEQ
: ei_vector2(compiler
->code
, VE_SET_EQUAL
, vpi
, inst
); break;
384 case RC_OPCODE_SGE
: ei_vector2(compiler
->code
, VE_SET_GREATER_THAN_EQUAL
, vpi
, inst
); break;
385 case RC_OPCODE_SIN
: ei_math1(compiler
->code
, ME_SIN
, vpi
, inst
); break;
386 case RC_OPCODE_SLT
: ei_vector2(compiler
->code
, VE_SET_LESS_THAN
, vpi
, inst
); break;
387 case RC_OPCODE_SNE
: ei_vector2(compiler
->code
, VE_SET_NOT_EQUAL
, vpi
, inst
); break;
389 rc_error(&compiler
->Base
, "Unknown opcode %s\n", rc_get_opcode_info(vpi
->Opcode
)->Name
);
393 compiler
->code
->length
+= 4;
395 if (compiler
->Base
.Error
)
/**
 * Per-temporary bookkeeping used by allocate_temporary_registers.
 */
struct temporary_allocation {
	/* Set once a hardware temporary has been assigned. */
	unsigned int Allocated : 1;
	/* Index of the assigned hardware temporary. */
	unsigned int HwTemp : 15;
	/* Last instruction that reads this temporary as a source. */
	struct rc_instruction *LastRead;
};
406 static void allocate_temporary_registers(struct r300_vertex_program_compiler
* compiler
)
408 struct rc_instruction
*inst
;
409 unsigned int num_orig_temps
= 0;
410 char hwtemps
[R300_VS_MAX_TEMPS
];
411 struct temporary_allocation
* ta
;
414 compiler
->code
->num_temporaries
= 0;
415 memset(hwtemps
, 0, sizeof(hwtemps
));
417 /* Pass 1: Count original temporaries and allocate structures */
418 for(inst
= compiler
->Base
.Program
.Instructions
.Next
; inst
!= &compiler
->Base
.Program
.Instructions
; inst
= inst
->Next
) {
419 const struct rc_opcode_info
* opcode
= rc_get_opcode_info(inst
->U
.I
.Opcode
);
421 for (i
= 0; i
< opcode
->NumSrcRegs
; ++i
) {
422 if (inst
->U
.I
.SrcReg
[i
].File
== RC_FILE_TEMPORARY
) {
423 if (inst
->U
.I
.SrcReg
[i
].Index
>= num_orig_temps
)
424 num_orig_temps
= inst
->U
.I
.SrcReg
[i
].Index
+ 1;
428 if (opcode
->HasDstReg
) {
429 if (inst
->U
.I
.DstReg
.File
== RC_FILE_TEMPORARY
) {
430 if (inst
->U
.I
.DstReg
.Index
>= num_orig_temps
)
431 num_orig_temps
= inst
->U
.I
.DstReg
.Index
+ 1;
436 ta
= (struct temporary_allocation
*)memory_pool_malloc(&compiler
->Base
.Pool
,
437 sizeof(struct temporary_allocation
) * num_orig_temps
);
438 memset(ta
, 0, sizeof(struct temporary_allocation
) * num_orig_temps
);
440 /* Pass 2: Determine original temporary lifetimes */
441 for(inst
= compiler
->Base
.Program
.Instructions
.Next
; inst
!= &compiler
->Base
.Program
.Instructions
; inst
= inst
->Next
) {
442 const struct rc_opcode_info
* opcode
= rc_get_opcode_info(inst
->U
.I
.Opcode
);
444 for (i
= 0; i
< opcode
->NumSrcRegs
; ++i
) {
445 if (inst
->U
.I
.SrcReg
[i
].File
== RC_FILE_TEMPORARY
)
446 ta
[inst
->U
.I
.SrcReg
[i
].Index
].LastRead
= inst
;
450 /* Pass 3: Register allocation */
451 for(inst
= compiler
->Base
.Program
.Instructions
.Next
; inst
!= &compiler
->Base
.Program
.Instructions
; inst
= inst
->Next
) {
452 const struct rc_opcode_info
* opcode
= rc_get_opcode_info(inst
->U
.I
.Opcode
);
454 for (i
= 0; i
< opcode
->NumSrcRegs
; ++i
) {
455 if (inst
->U
.I
.SrcReg
[i
].File
== RC_FILE_TEMPORARY
) {
456 unsigned int orig
= inst
->U
.I
.SrcReg
[i
].Index
;
457 inst
->U
.I
.SrcReg
[i
].Index
= ta
[orig
].HwTemp
;
459 if (ta
[orig
].Allocated
&& inst
== ta
[orig
].LastRead
)
460 hwtemps
[ta
[orig
].HwTemp
] = 0;
464 if (opcode
->HasDstReg
) {
465 if (inst
->U
.I
.DstReg
.File
== RC_FILE_TEMPORARY
) {
466 unsigned int orig
= inst
->U
.I
.DstReg
.Index
;
468 if (!ta
[orig
].Allocated
) {
469 for(j
= 0; j
< R300_VS_MAX_TEMPS
; ++j
) {
473 if (j
>= R300_VS_MAX_TEMPS
) {
474 fprintf(stderr
, "Out of hw temporaries\n");
476 ta
[orig
].Allocated
= 1;
480 if (j
>= compiler
->code
->num_temporaries
)
481 compiler
->code
->num_temporaries
= j
+ 1;
485 inst
->U
.I
.DstReg
.Index
= ta
[orig
].HwTemp
;
492 * R3xx-R4xx vertex engine does not support the Absolute source operand modifier
493 * and the Saturate opcode modifier. Only Absolute is currently transformed.
495 static int transform_nonnative_modifiers(
496 struct radeon_compiler
*c
,
497 struct rc_instruction
*inst
,
500 const struct rc_opcode_info
*opcode
= rc_get_opcode_info(inst
->U
.I
.Opcode
);
503 /* Transform ABS(a) to MAX(a, -a). */
504 for (i
= 0; i
< opcode
->NumSrcRegs
; i
++) {
505 if (inst
->U
.I
.SrcReg
[i
].Abs
) {
506 struct rc_instruction
*new_inst
;
509 inst
->U
.I
.SrcReg
[i
].Abs
= 0;
511 temp
= rc_find_free_temporary(c
);
513 new_inst
= rc_insert_new_instruction(c
, inst
->Prev
);
514 new_inst
->U
.I
.Opcode
= RC_OPCODE_MAX
;
515 new_inst
->U
.I
.DstReg
.File
= RC_FILE_TEMPORARY
;
516 new_inst
->U
.I
.DstReg
.Index
= temp
;
517 new_inst
->U
.I
.SrcReg
[0] = inst
->U
.I
.SrcReg
[i
];
518 new_inst
->U
.I
.SrcReg
[1] = inst
->U
.I
.SrcReg
[i
];
519 new_inst
->U
.I
.SrcReg
[1].Negate
^= RC_MASK_XYZW
;
521 memset(&inst
->U
.I
.SrcReg
[i
], 0, sizeof(inst
->U
.I
.SrcReg
[i
]));
522 inst
->U
.I
.SrcReg
[i
].File
= RC_FILE_TEMPORARY
;
523 inst
->U
.I
.SrcReg
[i
].Index
= temp
;
524 inst
->U
.I
.SrcReg
[i
].Swizzle
= RC_SWIZZLE_XYZW
;
531 * Vertex engine cannot read two inputs or two constants at the same time.
532 * Introduce intermediate MOVs to temporary registers to account for this.
534 static int transform_source_conflicts(
535 struct radeon_compiler
*c
,
536 struct rc_instruction
* inst
,
539 const struct rc_opcode_info
* opcode
= rc_get_opcode_info(inst
->U
.I
.Opcode
);
541 if (opcode
->NumSrcRegs
== 3) {
542 if (t_src_conflict(inst
->U
.I
.SrcReg
[1], inst
->U
.I
.SrcReg
[2])
543 || t_src_conflict(inst
->U
.I
.SrcReg
[0], inst
->U
.I
.SrcReg
[2])) {
544 int tmpreg
= rc_find_free_temporary(c
);
545 struct rc_instruction
* inst_mov
= rc_insert_new_instruction(c
, inst
->Prev
);
546 inst_mov
->U
.I
.Opcode
= RC_OPCODE_MOV
;
547 inst_mov
->U
.I
.DstReg
.File
= RC_FILE_TEMPORARY
;
548 inst_mov
->U
.I
.DstReg
.Index
= tmpreg
;
549 inst_mov
->U
.I
.SrcReg
[0] = inst
->U
.I
.SrcReg
[2];
551 reset_srcreg(&inst
->U
.I
.SrcReg
[2]);
552 inst
->U
.I
.SrcReg
[2].File
= RC_FILE_TEMPORARY
;
553 inst
->U
.I
.SrcReg
[2].Index
= tmpreg
;
557 if (opcode
->NumSrcRegs
>= 2) {
558 if (t_src_conflict(inst
->U
.I
.SrcReg
[1], inst
->U
.I
.SrcReg
[0])) {
559 int tmpreg
= rc_find_free_temporary(c
);
560 struct rc_instruction
* inst_mov
= rc_insert_new_instruction(c
, inst
->Prev
);
561 inst_mov
->U
.I
.Opcode
= RC_OPCODE_MOV
;
562 inst_mov
->U
.I
.DstReg
.File
= RC_FILE_TEMPORARY
;
563 inst_mov
->U
.I
.DstReg
.Index
= tmpreg
;
564 inst_mov
->U
.I
.SrcReg
[0] = inst
->U
.I
.SrcReg
[1];
566 reset_srcreg(&inst
->U
.I
.SrcReg
[1]);
567 inst
->U
.I
.SrcReg
[1].File
= RC_FILE_TEMPORARY
;
568 inst
->U
.I
.SrcReg
[1].Index
= tmpreg
;
575 static void addArtificialOutputs(struct r300_vertex_program_compiler
* compiler
)
579 for(i
= 0; i
< 32; ++i
) {
580 if ((compiler
->RequiredOutputs
& (1 << i
)) &&
581 !(compiler
->Base
.Program
.OutputsWritten
& (1 << i
))) {
582 struct rc_instruction
* inst
= rc_insert_new_instruction(&compiler
->Base
, compiler
->Base
.Program
.Instructions
.Prev
);
583 inst
->U
.I
.Opcode
= RC_OPCODE_MOV
;
585 inst
->U
.I
.DstReg
.File
= RC_FILE_OUTPUT
;
586 inst
->U
.I
.DstReg
.Index
= i
;
587 inst
->U
.I
.DstReg
.WriteMask
= RC_MASK_XYZW
;
589 inst
->U
.I
.SrcReg
[0].File
= RC_FILE_CONSTANT
;
590 inst
->U
.I
.SrcReg
[0].Index
= 0;
591 inst
->U
.I
.SrcReg
[0].Swizzle
= RC_SWIZZLE_XYZW
;
593 compiler
->Base
.Program
.OutputsWritten
|= 1 << i
;
598 static void dataflow_outputs_mark_used(void * userdata
, void * data
,
599 void (*callback
)(void *, unsigned int, unsigned int))
601 struct r300_vertex_program_compiler
* c
= userdata
;
604 for(i
= 0; i
< 32; ++i
) {
605 if (c
->RequiredOutputs
& (1 << i
))
606 callback(data
, i
, RC_MASK_XYZW
);
610 static int swizzle_is_native(rc_opcode opcode
, struct rc_src_register reg
)
618 static void debug_program_log(struct r300_vertex_program_compiler
* c
, const char * where
)
621 fprintf(stderr
, "Vertex Program: %s\n", where
);
622 rc_print_program(&c
->Base
.Program
);
627 static struct rc_swizzle_caps r300_vertprog_swizzle_caps
= {
628 .IsNative
= &swizzle_is_native
,
629 .Split
= 0 /* should never be called */
633 void r3xx_compile_vertex_program(struct r300_vertex_program_compiler
* compiler
)
635 struct emulate_loop_state loop_state
;
637 compiler
->Base
.SwizzleCaps
= &r300_vertprog_swizzle_caps
;
639 addArtificialOutputs(compiler
);
641 debug_program_log(compiler
, "before compilation");
643 /* XXX Ideally this should be done only for r3xx, but since
644 * we don't have branching support for r5xx, we use the emulation
645 * on all chipsets. */
646 rc_transform_unroll_loops(&compiler
->Base
, &loop_state
);
648 debug_program_log(compiler
, "after transform loops");
650 if (compiler
->Base
.is_r500
){
651 rc_emulate_loops(&loop_state
, R500_VS_MAX_ALU
);
653 rc_emulate_loops(&loop_state
, R300_VS_MAX_ALU
);
655 debug_program_log(compiler
, "after emulate loops");
657 rc_emulate_branches(&compiler
->Base
);
659 debug_program_log(compiler
, "after emulate branches");
661 if (compiler
->Base
.is_r500
) {
662 struct radeon_program_transformation transformations
[] = {
663 { &r300_transform_vertex_alu
, 0 },
664 { &r300_transform_trig_scale_vertex
, 0 }
666 radeonLocalTransform(&compiler
->Base
, 2, transformations
);
668 debug_program_log(compiler
, "after native rewrite");
670 struct radeon_program_transformation transformations
[] = {
671 { &r300_transform_vertex_alu
, 0 },
672 { &radeonTransformTrigSimple
, 0 }
674 radeonLocalTransform(&compiler
->Base
, 2, transformations
);
676 debug_program_log(compiler
, "after native rewrite");
678 /* Note: This pass has to be done seperately from ALU rewrite,
679 * because it needs to check every instruction.
681 struct radeon_program_transformation transformations2
[] = {
682 { &transform_nonnative_modifiers
, 0 },
684 radeonLocalTransform(&compiler
->Base
, 1, transformations2
);
686 debug_program_log(compiler
, "after emulate modifiers");
690 /* Note: This pass has to be done seperately from ALU rewrite,
691 * otherwise non-native ALU instructions with source conflits
692 * will not be treated properly.
694 struct radeon_program_transformation transformations
[] = {
695 { &transform_source_conflicts
, 0 },
697 radeonLocalTransform(&compiler
->Base
, 1, transformations
);
700 debug_program_log(compiler
, "after source conflict resolve");
702 rc_dataflow_deadcode(&compiler
->Base
, &dataflow_outputs_mark_used
, compiler
);
704 debug_program_log(compiler
, "after deadcode");
706 rc_dataflow_swizzles(&compiler
->Base
);
708 allocate_temporary_registers(compiler
);
710 debug_program_log(compiler
, "after dataflow");
712 translate_vertex_program(compiler
);
714 rc_constants_copy(&compiler
->code
->constants
, &compiler
->Base
.Program
.Constants
);
716 compiler
->code
->InputsRead
= compiler
->Base
.Program
.InputsRead
;
717 compiler
->code
->OutputsWritten
= compiler
->Base
.Program
.OutputsWritten
;
719 if (compiler
->Base
.Debug
) {
720 fprintf(stderr
, "Final vertex program code:\n");
721 r300_vertex_program_dump(compiler
->code
);