r300/compiler: fix swizzling in the transformation of Abs modifiers
[mesa.git] / src / mesa / drivers / dri / r300 / compiler / r3xx_vertprog.c
1 /*
2 * Copyright 2009 Nicolai Hähnle <nhaehnle@gmail.com>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE. */
22
23 #include "radeon_compiler.h"
24
25 #include <stdio.h>
26
27 #include "../r300_reg.h"
28
29 #include "radeon_dataflow.h"
30 #include "radeon_program_alu.h"
31 #include "radeon_swizzle.h"
32 #include "radeon_emulate_branches.h"
33 #include "radeon_emulate_loops.h"
34
/*
 * Build a hardware source operand that reads the same constant selector
 * (e.g. RC_SWIZZLE_ZERO or RC_SWIZZLE_ONE) in all four channels.
 *
 * The operand reuses the register index, register class and relative
 * addressing bit of source `src_slot` of the current instruction, so that
 * it refers to an already-valid source; only the swizzle is replaced and
 * negation is disabled (RC_MASK_NONE).
 *
 * The expansion relies on `vp` and `vpi` being in scope at the use site.
 */
#define __CONST(src_slot, swz) \
	(PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[src_slot]), \
			 t_swizzle(swz), \
			 t_swizzle(swz), \
			 t_swizzle(swz), \
			 t_swizzle(swz), \
			 t_src_class(vpi->SrcReg[src_slot].File), \
			 RC_MASK_NONE) | (vpi->SrcReg[src_slot].RelAddr << 4))
47
48
49 static unsigned long t_dst_mask(unsigned int mask)
50 {
51 /* RC_MASK_* is equivalent to VSF_FLAG_* */
52 return mask & RC_MASK_XYZW;
53 }
54
55 static unsigned long t_dst_class(rc_register_file file)
56 {
57 switch (file) {
58 default:
59 fprintf(stderr, "%s: Bad register file %i\n", __FUNCTION__, file);
60 /* fall-through */
61 case RC_FILE_TEMPORARY:
62 return PVS_DST_REG_TEMPORARY;
63 case RC_FILE_OUTPUT:
64 return PVS_DST_REG_OUT;
65 case RC_FILE_ADDRESS:
66 return PVS_DST_REG_A0;
67 }
68 }
69
70 static unsigned long t_dst_index(struct r300_vertex_program_code *vp,
71 struct rc_dst_register *dst)
72 {
73 if (dst->File == RC_FILE_OUTPUT)
74 return vp->outputs[dst->Index];
75
76 return dst->Index;
77 }
78
79 static unsigned long t_src_class(rc_register_file file)
80 {
81 switch (file) {
82 default:
83 fprintf(stderr, "%s: Bad register file %i\n", __FUNCTION__, file);
84 /* fall-through */
85 case RC_FILE_NONE:
86 case RC_FILE_TEMPORARY:
87 return PVS_SRC_REG_TEMPORARY;
88 case RC_FILE_INPUT:
89 return PVS_SRC_REG_INPUT;
90 case RC_FILE_CONSTANT:
91 return PVS_SRC_REG_CONSTANT;
92 }
93 }
94
95 static int t_src_conflict(struct rc_src_register a, struct rc_src_register b)
96 {
97 unsigned long aclass = t_src_class(a.File);
98 unsigned long bclass = t_src_class(b.File);
99
100 if (aclass != bclass)
101 return 0;
102 if (aclass == PVS_SRC_REG_TEMPORARY)
103 return 0;
104
105 if (a.RelAddr || b.RelAddr)
106 return 1;
107 if (a.Index != b.Index)
108 return 1;
109
110 return 0;
111 }
112
/**
 * Translate a compiler swizzle selector into the hardware selector.
 *
 * This is an identity mapping: per the original author's note, the
 * RC_SWIZZLE_* values are identical to VSF_IN_COMPONENT_*.
 */
static inline unsigned long t_swizzle(unsigned int swizzle)
{
	const unsigned long hw_selector = swizzle;

	return hw_selector;
}
118
119 static unsigned long t_src_index(struct r300_vertex_program_code *vp,
120 struct rc_src_register *src)
121 {
122 if (src->File == RC_FILE_INPUT) {
123 assert(vp->inputs[src->Index] != -1);
124 return vp->inputs[src->Index];
125 } else {
126 if (src->Index < 0) {
127 fprintf(stderr,
128 "negative offsets for indirect addressing do not work.\n");
129 return 0;
130 }
131 return src->Index;
132 }
133 }
134
135 /* these two functions should probably be merged... */
136
137 static unsigned long t_src(struct r300_vertex_program_code *vp,
138 struct rc_src_register *src)
139 {
140 /* src->Negate uses the RC_MASK_ flags from program_instruction.h,
141 * which equal our VSF_FLAGS_ values, so it's safe to just pass it here.
142 */
143 return PVS_SRC_OPERAND(t_src_index(vp, src),
144 t_swizzle(GET_SWZ(src->Swizzle, 0)),
145 t_swizzle(GET_SWZ(src->Swizzle, 1)),
146 t_swizzle(GET_SWZ(src->Swizzle, 2)),
147 t_swizzle(GET_SWZ(src->Swizzle, 3)),
148 t_src_class(src->File),
149 src->Negate) |
150 (src->RelAddr << 4) | (src->Abs << 3);
151 }
152
153 static unsigned long t_src_scalar(struct r300_vertex_program_code *vp,
154 struct rc_src_register *src)
155 {
156 /* src->Negate uses the RC_MASK_ flags from program_instruction.h,
157 * which equal our VSF_FLAGS_ values, so it's safe to just pass it here.
158 */
159 return PVS_SRC_OPERAND(t_src_index(vp, src),
160 t_swizzle(GET_SWZ(src->Swizzle, 0)),
161 t_swizzle(GET_SWZ(src->Swizzle, 0)),
162 t_swizzle(GET_SWZ(src->Swizzle, 0)),
163 t_swizzle(GET_SWZ(src->Swizzle, 0)),
164 t_src_class(src->File),
165 src->Negate ? RC_MASK_XYZW : RC_MASK_NONE) |
166 (src->RelAddr << 4) | (src->Abs << 3);
167 }
168
169 static int valid_dst(struct r300_vertex_program_code *vp,
170 struct rc_dst_register *dst)
171 {
172 if (dst->File == RC_FILE_OUTPUT && vp->outputs[dst->Index] == -1) {
173 return 0;
174 } else if (dst->File == RC_FILE_ADDRESS) {
175 assert(dst->Index == 0);
176 }
177
178 return 1;
179 }
180
181 static void ei_vector1(struct r300_vertex_program_code *vp,
182 unsigned int hw_opcode,
183 struct rc_sub_instruction *vpi,
184 unsigned int * inst)
185 {
186 inst[0] = PVS_OP_DST_OPERAND(hw_opcode,
187 0,
188 0,
189 t_dst_index(vp, &vpi->DstReg),
190 t_dst_mask(vpi->DstReg.WriteMask),
191 t_dst_class(vpi->DstReg.File));
192 inst[1] = t_src(vp, &vpi->SrcReg[0]);
193 inst[2] = __CONST(0, RC_SWIZZLE_ZERO);
194 inst[3] = __CONST(0, RC_SWIZZLE_ZERO);
195 }
196
197 static void ei_vector2(struct r300_vertex_program_code *vp,
198 unsigned int hw_opcode,
199 struct rc_sub_instruction *vpi,
200 unsigned int * inst)
201 {
202 inst[0] = PVS_OP_DST_OPERAND(hw_opcode,
203 0,
204 0,
205 t_dst_index(vp, &vpi->DstReg),
206 t_dst_mask(vpi->DstReg.WriteMask),
207 t_dst_class(vpi->DstReg.File));
208 inst[1] = t_src(vp, &vpi->SrcReg[0]);
209 inst[2] = t_src(vp, &vpi->SrcReg[1]);
210 inst[3] = __CONST(1, RC_SWIZZLE_ZERO);
211 }
212
213 static void ei_math1(struct r300_vertex_program_code *vp,
214 unsigned int hw_opcode,
215 struct rc_sub_instruction *vpi,
216 unsigned int * inst)
217 {
218 inst[0] = PVS_OP_DST_OPERAND(hw_opcode,
219 1,
220 0,
221 t_dst_index(vp, &vpi->DstReg),
222 t_dst_mask(vpi->DstReg.WriteMask),
223 t_dst_class(vpi->DstReg.File));
224 inst[1] = t_src_scalar(vp, &vpi->SrcReg[0]);
225 inst[2] = __CONST(0, RC_SWIZZLE_ZERO);
226 inst[3] = __CONST(0, RC_SWIZZLE_ZERO);
227 }
228
229 static void ei_lit(struct r300_vertex_program_code *vp,
230 struct rc_sub_instruction *vpi,
231 unsigned int * inst)
232 {
233 //LIT TMP 1.Y Z TMP 1{} {X W Z Y} TMP 1{} {Y W Z X} TMP 1{} {Y X Z W}
234
235 inst[0] = PVS_OP_DST_OPERAND(ME_LIGHT_COEFF_DX,
236 1,
237 0,
238 t_dst_index(vp, &vpi->DstReg),
239 t_dst_mask(vpi->DstReg.WriteMask),
240 t_dst_class(vpi->DstReg.File));
241 /* NOTE: Users swizzling might not work. */
242 inst[1] = PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[0]), t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 0)), // X
243 t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 3)), // W
244 PVS_SRC_SELECT_FORCE_0, // Z
245 t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 1)), // Y
246 t_src_class(vpi->SrcReg[0].File),
247 vpi->SrcReg[0].Negate ? RC_MASK_XYZW : RC_MASK_NONE) |
248 (vpi->SrcReg[0].RelAddr << 4);
249 inst[2] = PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[0]), t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 1)), // Y
250 t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 3)), // W
251 PVS_SRC_SELECT_FORCE_0, // Z
252 t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 0)), // X
253 t_src_class(vpi->SrcReg[0].File),
254 vpi->SrcReg[0].Negate ? RC_MASK_XYZW : RC_MASK_NONE) |
255 (vpi->SrcReg[0].RelAddr << 4);
256 inst[3] = PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[0]), t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 1)), // Y
257 t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 0)), // X
258 PVS_SRC_SELECT_FORCE_0, // Z
259 t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 3)), // W
260 t_src_class(vpi->SrcReg[0].File),
261 vpi->SrcReg[0].Negate ? RC_MASK_XYZW : RC_MASK_NONE) |
262 (vpi->SrcReg[0].RelAddr << 4);
263 }
264
265 static void ei_mad(struct r300_vertex_program_code *vp,
266 struct rc_sub_instruction *vpi,
267 unsigned int * inst)
268 {
269 /* Remarks about hardware limitations of MAD
270 * (please preserve this comment, as this information is _NOT_
271 * in the documentation provided by AMD).
272 *
273 * As described in the documentation, MAD with three unique temporary
274 * source registers requires the use of the macro version.
275 *
276 * However (and this is not mentioned in the documentation), apparently
277 * the macro version is _NOT_ a full superset of the normal version.
278 * In particular, the macro version does not always work when relative
279 * addressing is used in the source operands.
280 *
281 * This limitation caused incorrect rendering in Sauerbraten's OpenGL
282 * assembly shader path when using medium quality animations
283 * (i.e. animations with matrix blending instead of quaternion blending).
284 *
285 * Unfortunately, I (nha) have been unable to extract a Piglit regression
286 * test for this issue - for some reason, it is possible to have vertex
287 * programs whose prefix is *exactly* the same as the prefix of the
288 * offending program in Sauerbraten up to the offending instruction
289 * without causing any trouble.
290 *
291 * Bottom line: Only use the macro version only when really necessary;
292 * according to AMD docs, this should improve performance by one clock
293 * as a nice side bonus.
294 */
295 if (vpi->SrcReg[0].File == RC_FILE_TEMPORARY &&
296 vpi->SrcReg[1].File == RC_FILE_TEMPORARY &&
297 vpi->SrcReg[2].File == RC_FILE_TEMPORARY &&
298 vpi->SrcReg[0].Index != vpi->SrcReg[1].Index &&
299 vpi->SrcReg[0].Index != vpi->SrcReg[2].Index &&
300 vpi->SrcReg[1].Index != vpi->SrcReg[2].Index) {
301 inst[0] = PVS_OP_DST_OPERAND(PVS_MACRO_OP_2CLK_MADD,
302 0,
303 1,
304 t_dst_index(vp, &vpi->DstReg),
305 t_dst_mask(vpi->DstReg.WriteMask),
306 t_dst_class(vpi->DstReg.File));
307 } else {
308 inst[0] = PVS_OP_DST_OPERAND(VE_MULTIPLY_ADD,
309 0,
310 0,
311 t_dst_index(vp, &vpi->DstReg),
312 t_dst_mask(vpi->DstReg.WriteMask),
313 t_dst_class(vpi->DstReg.File));
314 }
315 inst[1] = t_src(vp, &vpi->SrcReg[0]);
316 inst[2] = t_src(vp, &vpi->SrcReg[1]);
317 inst[3] = t_src(vp, &vpi->SrcReg[2]);
318 }
319
320 static void ei_pow(struct r300_vertex_program_code *vp,
321 struct rc_sub_instruction *vpi,
322 unsigned int * inst)
323 {
324 inst[0] = PVS_OP_DST_OPERAND(ME_POWER_FUNC_FF,
325 1,
326 0,
327 t_dst_index(vp, &vpi->DstReg),
328 t_dst_mask(vpi->DstReg.WriteMask),
329 t_dst_class(vpi->DstReg.File));
330 inst[1] = t_src_scalar(vp, &vpi->SrcReg[0]);
331 inst[2] = __CONST(0, RC_SWIZZLE_ZERO);
332 inst[3] = t_src_scalar(vp, &vpi->SrcReg[1]);
333 }
334
335
336 static void translate_vertex_program(struct r300_vertex_program_compiler * compiler)
337 {
338 struct rc_instruction *rci;
339
340 compiler->code->pos_end = 0; /* Not supported yet */
341 compiler->code->length = 0;
342
343 compiler->SetHwInputOutput(compiler);
344
345 for(rci = compiler->Base.Program.Instructions.Next; rci != &compiler->Base.Program.Instructions; rci = rci->Next) {
346 struct rc_sub_instruction *vpi = &rci->U.I;
347 unsigned int *inst = compiler->code->body.d + compiler->code->length;
348
349 /* Skip instructions writing to non-existing destination */
350 if (!valid_dst(compiler->code, &vpi->DstReg))
351 continue;
352
353 if (compiler->code->length >= R500_VS_MAX_ALU_DWORDS ||
354 (compiler->code->length >= R300_VS_MAX_ALU_DWORDS && !compiler->Base.is_r500)) {
355 rc_error(&compiler->Base, "Vertex program has too many instructions\n");
356 return;
357 }
358
359 assert(compiler->Base.is_r500 ||
360 (vpi->Opcode != RC_OPCODE_SEQ &&
361 vpi->Opcode != RC_OPCODE_SNE));
362
363 switch (vpi->Opcode) {
364 case RC_OPCODE_ADD: ei_vector2(compiler->code, VE_ADD, vpi, inst); break;
365 case RC_OPCODE_ARL: ei_vector1(compiler->code, VE_FLT2FIX_DX, vpi, inst); break;
366 case RC_OPCODE_COS: ei_math1(compiler->code, ME_COS, vpi, inst); break;
367 case RC_OPCODE_DP4: ei_vector2(compiler->code, VE_DOT_PRODUCT, vpi, inst); break;
368 case RC_OPCODE_DST: ei_vector2(compiler->code, VE_DISTANCE_VECTOR, vpi, inst); break;
369 case RC_OPCODE_EX2: ei_math1(compiler->code, ME_EXP_BASE2_FULL_DX, vpi, inst); break;
370 case RC_OPCODE_EXP: ei_math1(compiler->code, ME_EXP_BASE2_DX, vpi, inst); break;
371 case RC_OPCODE_FRC: ei_vector1(compiler->code, VE_FRACTION, vpi, inst); break;
372 case RC_OPCODE_LG2: ei_math1(compiler->code, ME_LOG_BASE2_FULL_DX, vpi, inst); break;
373 case RC_OPCODE_LIT: ei_lit(compiler->code, vpi, inst); break;
374 case RC_OPCODE_LOG: ei_math1(compiler->code, ME_LOG_BASE2_DX, vpi, inst); break;
375 case RC_OPCODE_MAD: ei_mad(compiler->code, vpi, inst); break;
376 case RC_OPCODE_MAX: ei_vector2(compiler->code, VE_MAXIMUM, vpi, inst); break;
377 case RC_OPCODE_MIN: ei_vector2(compiler->code, VE_MINIMUM, vpi, inst); break;
378 case RC_OPCODE_MOV: ei_vector1(compiler->code, VE_ADD, vpi, inst); break;
379 case RC_OPCODE_MUL: ei_vector2(compiler->code, VE_MULTIPLY, vpi, inst); break;
380 case RC_OPCODE_POW: ei_pow(compiler->code, vpi, inst); break;
381 case RC_OPCODE_RCP: ei_math1(compiler->code, ME_RECIP_DX, vpi, inst); break;
382 case RC_OPCODE_RSQ: ei_math1(compiler->code, ME_RECIP_SQRT_DX, vpi, inst); break;
383 case RC_OPCODE_SEQ: ei_vector2(compiler->code, VE_SET_EQUAL, vpi, inst); break;
384 case RC_OPCODE_SGE: ei_vector2(compiler->code, VE_SET_GREATER_THAN_EQUAL, vpi, inst); break;
385 case RC_OPCODE_SIN: ei_math1(compiler->code, ME_SIN, vpi, inst); break;
386 case RC_OPCODE_SLT: ei_vector2(compiler->code, VE_SET_LESS_THAN, vpi, inst); break;
387 case RC_OPCODE_SNE: ei_vector2(compiler->code, VE_SET_NOT_EQUAL, vpi, inst); break;
388 default:
389 rc_error(&compiler->Base, "Unknown opcode %s\n", rc_get_opcode_info(vpi->Opcode)->Name);
390 return;
391 }
392
393 compiler->code->length += 4;
394
395 if (compiler->Base.Error)
396 return;
397 }
398 }
399
/**
 * Per-original-temporary bookkeeping used by allocate_temporary_registers().
 */
struct temporary_allocation {
	/* Set once a hardware temporary has been assigned. */
	unsigned int Allocated : 1;
	/* Index of the assigned hardware temporary. */
	unsigned int HwTemp : 15;
	/* Last instruction that reads this temporary. */
	struct rc_instruction *LastRead;
};
405
406 static void allocate_temporary_registers(struct r300_vertex_program_compiler * compiler)
407 {
408 struct rc_instruction *inst;
409 unsigned int num_orig_temps = 0;
410 char hwtemps[R300_VS_MAX_TEMPS];
411 struct temporary_allocation * ta;
412 unsigned int i, j;
413
414 compiler->code->num_temporaries = 0;
415 memset(hwtemps, 0, sizeof(hwtemps));
416
417 /* Pass 1: Count original temporaries and allocate structures */
418 for(inst = compiler->Base.Program.Instructions.Next; inst != &compiler->Base.Program.Instructions; inst = inst->Next) {
419 const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
420
421 for (i = 0; i < opcode->NumSrcRegs; ++i) {
422 if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY) {
423 if (inst->U.I.SrcReg[i].Index >= num_orig_temps)
424 num_orig_temps = inst->U.I.SrcReg[i].Index + 1;
425 }
426 }
427
428 if (opcode->HasDstReg) {
429 if (inst->U.I.DstReg.File == RC_FILE_TEMPORARY) {
430 if (inst->U.I.DstReg.Index >= num_orig_temps)
431 num_orig_temps = inst->U.I.DstReg.Index + 1;
432 }
433 }
434 }
435
436 ta = (struct temporary_allocation*)memory_pool_malloc(&compiler->Base.Pool,
437 sizeof(struct temporary_allocation) * num_orig_temps);
438 memset(ta, 0, sizeof(struct temporary_allocation) * num_orig_temps);
439
440 /* Pass 2: Determine original temporary lifetimes */
441 for(inst = compiler->Base.Program.Instructions.Next; inst != &compiler->Base.Program.Instructions; inst = inst->Next) {
442 const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
443
444 for (i = 0; i < opcode->NumSrcRegs; ++i) {
445 if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY)
446 ta[inst->U.I.SrcReg[i].Index].LastRead = inst;
447 }
448 }
449
450 /* Pass 3: Register allocation */
451 for(inst = compiler->Base.Program.Instructions.Next; inst != &compiler->Base.Program.Instructions; inst = inst->Next) {
452 const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
453
454 for (i = 0; i < opcode->NumSrcRegs; ++i) {
455 if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY) {
456 unsigned int orig = inst->U.I.SrcReg[i].Index;
457 inst->U.I.SrcReg[i].Index = ta[orig].HwTemp;
458
459 if (ta[orig].Allocated && inst == ta[orig].LastRead)
460 hwtemps[ta[orig].HwTemp] = 0;
461 }
462 }
463
464 if (opcode->HasDstReg) {
465 if (inst->U.I.DstReg.File == RC_FILE_TEMPORARY) {
466 unsigned int orig = inst->U.I.DstReg.Index;
467
468 if (!ta[orig].Allocated) {
469 for(j = 0; j < R300_VS_MAX_TEMPS; ++j) {
470 if (!hwtemps[j])
471 break;
472 }
473 if (j >= R300_VS_MAX_TEMPS) {
474 fprintf(stderr, "Out of hw temporaries\n");
475 } else {
476 ta[orig].Allocated = 1;
477 ta[orig].HwTemp = j;
478 hwtemps[j] = 1;
479
480 if (j >= compiler->code->num_temporaries)
481 compiler->code->num_temporaries = j + 1;
482 }
483 }
484
485 inst->U.I.DstReg.Index = ta[orig].HwTemp;
486 }
487 }
488 }
489 }
490
491 /**
492 * R3xx-R4xx vertex engine does not support the Absolute source operand modifier
493 * and the Saturate opcode modifier. Only Absolute is currently transformed.
494 */
495 static int transform_nonnative_modifiers(
496 struct radeon_compiler *c,
497 struct rc_instruction *inst,
498 void* unused)
499 {
500 const struct rc_opcode_info *opcode = rc_get_opcode_info(inst->U.I.Opcode);
501 unsigned i;
502
503 /* Transform ABS(a) to MAX(a, -a). */
504 for (i = 0; i < opcode->NumSrcRegs; i++) {
505 if (inst->U.I.SrcReg[i].Abs) {
506 struct rc_instruction *new_inst;
507 unsigned temp;
508
509 inst->U.I.SrcReg[i].Abs = 0;
510
511 temp = rc_find_free_temporary(c);
512
513 new_inst = rc_insert_new_instruction(c, inst->Prev);
514 new_inst->U.I.Opcode = RC_OPCODE_MAX;
515 new_inst->U.I.DstReg.File = RC_FILE_TEMPORARY;
516 new_inst->U.I.DstReg.Index = temp;
517 new_inst->U.I.SrcReg[0] = inst->U.I.SrcReg[i];
518 new_inst->U.I.SrcReg[1] = inst->U.I.SrcReg[i];
519 new_inst->U.I.SrcReg[1].Negate ^= RC_MASK_XYZW;
520
521 memset(&inst->U.I.SrcReg[i], 0, sizeof(inst->U.I.SrcReg[i]));
522 inst->U.I.SrcReg[i].File = RC_FILE_TEMPORARY;
523 inst->U.I.SrcReg[i].Index = temp;
524 inst->U.I.SrcReg[i].Swizzle = RC_SWIZZLE_XYZW;
525 }
526 }
527 return 1;
528 }
529
530 /**
531 * Vertex engine cannot read two inputs or two constants at the same time.
532 * Introduce intermediate MOVs to temporary registers to account for this.
533 */
534 static int transform_source_conflicts(
535 struct radeon_compiler *c,
536 struct rc_instruction* inst,
537 void* unused)
538 {
539 const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
540
541 if (opcode->NumSrcRegs == 3) {
542 if (t_src_conflict(inst->U.I.SrcReg[1], inst->U.I.SrcReg[2])
543 || t_src_conflict(inst->U.I.SrcReg[0], inst->U.I.SrcReg[2])) {
544 int tmpreg = rc_find_free_temporary(c);
545 struct rc_instruction * inst_mov = rc_insert_new_instruction(c, inst->Prev);
546 inst_mov->U.I.Opcode = RC_OPCODE_MOV;
547 inst_mov->U.I.DstReg.File = RC_FILE_TEMPORARY;
548 inst_mov->U.I.DstReg.Index = tmpreg;
549 inst_mov->U.I.SrcReg[0] = inst->U.I.SrcReg[2];
550
551 reset_srcreg(&inst->U.I.SrcReg[2]);
552 inst->U.I.SrcReg[2].File = RC_FILE_TEMPORARY;
553 inst->U.I.SrcReg[2].Index = tmpreg;
554 }
555 }
556
557 if (opcode->NumSrcRegs >= 2) {
558 if (t_src_conflict(inst->U.I.SrcReg[1], inst->U.I.SrcReg[0])) {
559 int tmpreg = rc_find_free_temporary(c);
560 struct rc_instruction * inst_mov = rc_insert_new_instruction(c, inst->Prev);
561 inst_mov->U.I.Opcode = RC_OPCODE_MOV;
562 inst_mov->U.I.DstReg.File = RC_FILE_TEMPORARY;
563 inst_mov->U.I.DstReg.Index = tmpreg;
564 inst_mov->U.I.SrcReg[0] = inst->U.I.SrcReg[1];
565
566 reset_srcreg(&inst->U.I.SrcReg[1]);
567 inst->U.I.SrcReg[1].File = RC_FILE_TEMPORARY;
568 inst->U.I.SrcReg[1].Index = tmpreg;
569 }
570 }
571
572 return 1;
573 }
574
575 static void addArtificialOutputs(struct r300_vertex_program_compiler * compiler)
576 {
577 int i;
578
579 for(i = 0; i < 32; ++i) {
580 if ((compiler->RequiredOutputs & (1 << i)) &&
581 !(compiler->Base.Program.OutputsWritten & (1 << i))) {
582 struct rc_instruction * inst = rc_insert_new_instruction(&compiler->Base, compiler->Base.Program.Instructions.Prev);
583 inst->U.I.Opcode = RC_OPCODE_MOV;
584
585 inst->U.I.DstReg.File = RC_FILE_OUTPUT;
586 inst->U.I.DstReg.Index = i;
587 inst->U.I.DstReg.WriteMask = RC_MASK_XYZW;
588
589 inst->U.I.SrcReg[0].File = RC_FILE_CONSTANT;
590 inst->U.I.SrcReg[0].Index = 0;
591 inst->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_XYZW;
592
593 compiler->Base.Program.OutputsWritten |= 1 << i;
594 }
595 }
596 }
597
598 static void dataflow_outputs_mark_used(void * userdata, void * data,
599 void (*callback)(void *, unsigned int, unsigned int))
600 {
601 struct r300_vertex_program_compiler * c = userdata;
602 int i;
603
604 for(i = 0; i < 32; ++i) {
605 if (c->RequiredOutputs & (1 << i))
606 callback(data, i, RC_MASK_XYZW);
607 }
608 }
609
610 static int swizzle_is_native(rc_opcode opcode, struct rc_src_register reg)
611 {
612 (void) opcode;
613 (void) reg;
614
615 return 1;
616 }
617
618 static void debug_program_log(struct r300_vertex_program_compiler* c, const char * where)
619 {
620 if (c->Base.Debug) {
621 fprintf(stderr, "Vertex Program: %s\n", where);
622 rc_print_program(&c->Base.Program);
623 }
624 }
625
626
627 static struct rc_swizzle_caps r300_vertprog_swizzle_caps = {
628 .IsNative = &swizzle_is_native,
629 .Split = 0 /* should never be called */
630 };
631
632
633 void r3xx_compile_vertex_program(struct r300_vertex_program_compiler* compiler)
634 {
635 struct emulate_loop_state loop_state;
636
637 compiler->Base.SwizzleCaps = &r300_vertprog_swizzle_caps;
638
639 addArtificialOutputs(compiler);
640
641 debug_program_log(compiler, "before compilation");
642
643 /* XXX Ideally this should be done only for r3xx, but since
644 * we don't have branching support for r5xx, we use the emulation
645 * on all chipsets. */
646 rc_transform_unroll_loops(&compiler->Base, &loop_state);
647
648 debug_program_log(compiler, "after transform loops");
649
650 if (compiler->Base.is_r500){
651 rc_emulate_loops(&loop_state, R500_VS_MAX_ALU);
652 } else {
653 rc_emulate_loops(&loop_state, R300_VS_MAX_ALU);
654 }
655 debug_program_log(compiler, "after emulate loops");
656
657 rc_emulate_branches(&compiler->Base);
658
659 debug_program_log(compiler, "after emulate branches");
660
661 if (compiler->Base.is_r500) {
662 struct radeon_program_transformation transformations[] = {
663 { &r300_transform_vertex_alu, 0 },
664 { &r300_transform_trig_scale_vertex, 0 }
665 };
666 radeonLocalTransform(&compiler->Base, 2, transformations);
667
668 debug_program_log(compiler, "after native rewrite");
669 } else {
670 struct radeon_program_transformation transformations[] = {
671 { &r300_transform_vertex_alu, 0 },
672 { &radeonTransformTrigSimple, 0 }
673 };
674 radeonLocalTransform(&compiler->Base, 2, transformations);
675
676 debug_program_log(compiler, "after native rewrite");
677
678 /* Note: This pass has to be done seperately from ALU rewrite,
679 * because it needs to check every instruction.
680 */
681 struct radeon_program_transformation transformations2[] = {
682 { &transform_nonnative_modifiers, 0 },
683 };
684 radeonLocalTransform(&compiler->Base, 1, transformations2);
685
686 debug_program_log(compiler, "after emulate modifiers");
687 }
688
689 {
690 /* Note: This pass has to be done seperately from ALU rewrite,
691 * otherwise non-native ALU instructions with source conflits
692 * will not be treated properly.
693 */
694 struct radeon_program_transformation transformations[] = {
695 { &transform_source_conflicts, 0 },
696 };
697 radeonLocalTransform(&compiler->Base, 1, transformations);
698 }
699
700 debug_program_log(compiler, "after source conflict resolve");
701
702 rc_dataflow_deadcode(&compiler->Base, &dataflow_outputs_mark_used, compiler);
703
704 debug_program_log(compiler, "after deadcode");
705
706 rc_dataflow_swizzles(&compiler->Base);
707
708 allocate_temporary_registers(compiler);
709
710 debug_program_log(compiler, "after dataflow");
711
712 translate_vertex_program(compiler);
713
714 rc_constants_copy(&compiler->code->constants, &compiler->Base.Program.Constants);
715
716 compiler->code->InputsRead = compiler->Base.Program.InputsRead;
717 compiler->code->OutputsWritten = compiler->Base.Program.OutputsWritten;
718
719 if (compiler->Base.Debug) {
720 fprintf(stderr, "Final vertex program code:\n");
721 r300_vertex_program_dump(compiler->code);
722 }
723 }