r300: Remove GLcontext requirement from radeon_program_pair
[mesa.git] / src / mesa / drivers / dri / r300 / compiler / radeon_program_pair.c
1 /*
2 * Copyright (C) 2008 Nicolai Haehnle.
3 *
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining
7 * a copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sublicense, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial
16 * portions of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 */
27
28 /**
29 * @file
30 *
31 * Perform temporary register allocation and attempt to pair off instructions
32 * in RGB and Alpha pairs. Also attempts to optimize the TEX instruction
33 * vs. ALU instruction scheduling.
34 */
35
36 #include "radeon_program_pair.h"
37
38 #include "radeon_common.h"
39
40 #include "memory_pool.h"
41 #include "shader/prog_print.h"
42
/*
 * Print a driver problem report to stderr, prefixed with the current file
 * and function name, and flag the failure in the pairing state.
 *
 * A pointer named `s` whose target has an `Error` member must be in scope
 * wherever this macro is used. A newline is appended after the message.
 */
#define error(fmt, ...) do { \
		fprintf(stderr, "r300 driver problem: %s::%s(): " fmt "\n", \
			__FILE__, __FUNCTION__, ##__VA_ARGS__); \
		s->Error = GL_TRUE; \
	} while(0)
48
49 struct pair_state_instruction {
50 struct prog_instruction Instruction;
51 GLuint IP; /**< Position of this instruction in original program */
52
53 GLuint IsTex:1; /**< Is a texture instruction */
54 GLuint NeedRGB:1; /**< Needs the RGB ALU */
55 GLuint NeedAlpha:1; /**< Needs the Alpha ALU */
56 GLuint IsTranscendent:1; /**< Is a special transcendent instruction */
57
58 /**
59 * Number of (read and write) dependencies that must be resolved before
60 * this instruction can be scheduled.
61 */
62 GLuint NumDependencies:5;
63
64 /**
65 * Next instruction in the linked list of ready instructions.
66 */
67 struct pair_state_instruction *NextReady;
68
69 /**
70 * Values that this instruction writes
71 */
72 struct reg_value *Values[4];
73 };
74
75
/**
 * One node in the singly linked list of instructions that read a given
 * value (see reg_value::Readers).
 */
struct reg_value_reader {
	struct pair_state_instruction *Reader; /**< The reading instruction */
	struct reg_value_reader *Next;         /**< Next reader of the same value */
};
83
84 /**
85 * Used to keep track which values are stored in each component of a
86 * PROGRAM_TEMPORARY.
87 */
88 struct reg_value {
89 struct pair_state_instruction *Writer;
90 struct reg_value *Next; /**< Pointer to the next value to be written to the same PROGRAM_TEMPORARY component */
91
92 /**
93 * Unordered linked list of instructions that read from this value.
94 */
95 struct reg_value_reader *Readers;
96
97 /**
98 * Number of readers of this value. This is calculated during @ref scan_instructions
99 * and continually decremented during code emission.
100 * When this count reaches zero, the instruction that writes the @ref Next value
101 * can be scheduled.
102 */
103 GLuint NumReaders;
104 };
105
106 /**
107 * Used to translate a PROGRAM_INPUT or PROGRAM_TEMPORARY Mesa register
108 * to the proper hardware temporary.
109 */
110 struct pair_register_translation {
111 GLuint Allocated:1;
112 GLuint HwIndex:8;
113 GLuint RefCount:23; /**< # of times this occurs in an unscheduled instruction SrcReg or DstReg */
114
115 /**
116 * Notes the value that is currently contained in each component
117 * (only used for PROGRAM_TEMPORARY registers).
118 */
119 struct reg_value *Value[4];
120 };
121
122 struct pair_state {
123 struct memory_pool Pool;
124 struct gl_program *Program;
125 const struct radeon_pair_handler *Handler;
126 GLboolean Error;
127 GLboolean Debug;
128 GLboolean Verbose;
129 void *UserData;
130
131 /**
132 * Translate Mesa registers to hardware registers
133 */
134 struct pair_register_translation Inputs[FRAG_ATTRIB_MAX];
135 struct pair_register_translation Temps[MAX_PROGRAM_TEMPS];
136
137 struct {
138 GLuint RefCount; /**< # of times this occurs in an unscheduled SrcReg or DstReg */
139 } HwTemps[128];
140
141 /**
142 * Linked list of instructions that can be scheduled right now,
143 * based on which ALU/TEX resources they require.
144 */
145 struct pair_state_instruction *ReadyFullALU;
146 struct pair_state_instruction *ReadyRGB;
147 struct pair_state_instruction *ReadyAlpha;
148 struct pair_state_instruction *ReadyTEX;
149 };
150
151
152 static struct pair_register_translation *get_register(struct pair_state *s, GLuint file, GLuint index)
153 {
154 switch(file) {
155 case PROGRAM_TEMPORARY: return &s->Temps[index];
156 case PROGRAM_INPUT: return &s->Inputs[index];
157 default: return 0;
158 }
159 }
160
161 static void alloc_hw_reg(struct pair_state *s, GLuint file, GLuint index, GLuint hwindex)
162 {
163 struct pair_register_translation *t = get_register(s, file, index);
164 ASSERT(!s->HwTemps[hwindex].RefCount);
165 ASSERT(!t->Allocated);
166 s->HwTemps[hwindex].RefCount = t->RefCount;
167 t->Allocated = 1;
168 t->HwIndex = hwindex;
169 }
170
171 static GLuint get_hw_reg(struct pair_state *s, GLuint file, GLuint index)
172 {
173 GLuint hwindex;
174
175 struct pair_register_translation *t = get_register(s, file, index);
176 if (!t) {
177 error("get_hw_reg: %i[%i]\n", file, index);
178 return 0;
179 }
180
181 if (t->Allocated)
182 return t->HwIndex;
183
184 for(hwindex = 0; hwindex < s->Handler->MaxHwTemps; ++hwindex)
185 if (!s->HwTemps[hwindex].RefCount)
186 break;
187
188 if (hwindex >= s->Handler->MaxHwTemps) {
189 error("Ran out of hardware temporaries");
190 return 0;
191 }
192
193 alloc_hw_reg(s, file, index, hwindex);
194 return hwindex;
195 }
196
197
198 static void deref_hw_reg(struct pair_state *s, GLuint hwindex)
199 {
200 if (!s->HwTemps[hwindex].RefCount) {
201 error("Hwindex %i refcount error", hwindex);
202 return;
203 }
204
205 s->HwTemps[hwindex].RefCount--;
206 }
207
208 static void add_pairinst_to_list(struct pair_state_instruction **list, struct pair_state_instruction *pairinst)
209 {
210 pairinst->NextReady = *list;
211 *list = pairinst;
212 }
213
214 /**
215 * The given instruction has become ready. Link it into the ready
216 * instructions.
217 */
218 static void instruction_ready(struct pair_state *s, struct pair_state_instruction *pairinst)
219 {
220 if (s->Verbose)
221 _mesa_printf("instruction_ready(%i)\n", pairinst->IP);
222
223 if (pairinst->IsTex)
224 add_pairinst_to_list(&s->ReadyTEX, pairinst);
225 else if (!pairinst->NeedAlpha)
226 add_pairinst_to_list(&s->ReadyRGB, pairinst);
227 else if (!pairinst->NeedRGB)
228 add_pairinst_to_list(&s->ReadyAlpha, pairinst);
229 else
230 add_pairinst_to_list(&s->ReadyFullALU, pairinst);
231 }
232
233
234 /**
235 * Finally rewrite ADD, MOV, MUL as the appropriate native instruction
236 * and reverse the order of arguments for CMP.
237 */
238 static void final_rewrite(struct pair_state *s, struct prog_instruction *inst)
239 {
240 struct prog_src_register tmp;
241
242 switch(inst->Opcode) {
243 case OPCODE_ADD:
244 inst->SrcReg[2] = inst->SrcReg[1];
245 inst->SrcReg[1].File = PROGRAM_BUILTIN;
246 inst->SrcReg[1].Swizzle = SWIZZLE_1111;
247 inst->SrcReg[1].Negate = NEGATE_NONE;
248 inst->Opcode = OPCODE_MAD;
249 break;
250 case OPCODE_CMP:
251 tmp = inst->SrcReg[2];
252 inst->SrcReg[2] = inst->SrcReg[0];
253 inst->SrcReg[0] = tmp;
254 break;
255 case OPCODE_MOV:
256 /* AMD say we should use CMP.
257 * However, when we transform
258 * KIL -r0;
259 * into
260 * CMP tmp, -r0, -r0, 0;
261 * KIL tmp;
262 * we get incorrect behaviour on R500 when r0 == 0.0.
263 * It appears that the R500 KIL hardware treats -0.0 as less
264 * than zero.
265 */
266 inst->SrcReg[1].File = PROGRAM_BUILTIN;
267 inst->SrcReg[1].Swizzle = SWIZZLE_1111;
268 inst->SrcReg[2].File = PROGRAM_BUILTIN;
269 inst->SrcReg[2].Swizzle = SWIZZLE_0000;
270 inst->Opcode = OPCODE_MAD;
271 break;
272 case OPCODE_MUL:
273 inst->SrcReg[2].File = PROGRAM_BUILTIN;
274 inst->SrcReg[2].Swizzle = SWIZZLE_0000;
275 inst->Opcode = OPCODE_MAD;
276 break;
277 default:
278 /* nothing to do */
279 break;
280 }
281 }
282
283
284 /**
285 * Classify an instruction according to which ALUs etc. it needs
286 */
287 static void classify_instruction(struct pair_state *s,
288 struct pair_state_instruction *psi)
289 {
290 psi->NeedRGB = (psi->Instruction.DstReg.WriteMask & WRITEMASK_XYZ) ? 1 : 0;
291 psi->NeedAlpha = (psi->Instruction.DstReg.WriteMask & WRITEMASK_W) ? 1 : 0;
292
293 switch(psi->Instruction.Opcode) {
294 case OPCODE_ADD:
295 case OPCODE_CMP:
296 case OPCODE_DDX:
297 case OPCODE_DDY:
298 case OPCODE_FRC:
299 case OPCODE_MAD:
300 case OPCODE_MAX:
301 case OPCODE_MIN:
302 case OPCODE_MOV:
303 case OPCODE_MUL:
304 break;
305 case OPCODE_COS:
306 case OPCODE_EX2:
307 case OPCODE_LG2:
308 case OPCODE_RCP:
309 case OPCODE_RSQ:
310 case OPCODE_SIN:
311 psi->IsTranscendent = 1;
312 psi->NeedAlpha = 1;
313 break;
314 case OPCODE_DP4:
315 psi->NeedAlpha = 1;
316 /* fall through */
317 case OPCODE_DP3:
318 psi->NeedRGB = 1;
319 break;
320 case OPCODE_KIL:
321 case OPCODE_TEX:
322 case OPCODE_TXB:
323 case OPCODE_TXP:
324 case OPCODE_END:
325 psi->IsTex = 1;
326 break;
327 default:
328 error("Unknown opcode %d\n", psi->Instruction.Opcode);
329 break;
330 }
331 }
332
333
334 /**
335 * Count which (input, temporary) register is read and written how often,
336 * and scan the instruction stream to find dependencies.
337 */
338 static void scan_instructions(struct pair_state *s)
339 {
340 struct prog_instruction *source;
341 GLuint ip;
342
343 for(source = s->Program->Instructions, ip = 0;
344 source->Opcode != OPCODE_END;
345 ++source, ++ip) {
346 struct pair_state_instruction *pairinst = memory_pool_malloc(&s->Pool, sizeof(*pairinst));
347 memset(pairinst, 0, sizeof(struct pair_state_instruction));
348
349 pairinst->Instruction = *source;
350 pairinst->IP = ip;
351 final_rewrite(s, &pairinst->Instruction);
352 classify_instruction(s, pairinst);
353
354 int nsrc = _mesa_num_inst_src_regs(pairinst->Instruction.Opcode);
355 int j;
356 for(j = 0; j < nsrc; j++) {
357 struct pair_register_translation *t =
358 get_register(s, pairinst->Instruction.SrcReg[j].File, pairinst->Instruction.SrcReg[j].Index);
359 if (!t)
360 continue;
361
362 t->RefCount++;
363
364 if (pairinst->Instruction.SrcReg[j].File == PROGRAM_TEMPORARY) {
365 int i;
366 for(i = 0; i < 4; ++i) {
367 GLuint swz = GET_SWZ(pairinst->Instruction.SrcReg[j].Swizzle, i);
368 if (swz >= 4)
369 continue; /* constant or NIL swizzle */
370 if (!t->Value[swz])
371 continue; /* this is an undefined read */
372
373 /* Do not add a dependency if this instruction
374 * also rewrites the value. The code below adds
375 * a dependency for the DstReg, which is a superset
376 * of the SrcReg dependency. */
377 if (pairinst->Instruction.DstReg.File == PROGRAM_TEMPORARY &&
378 pairinst->Instruction.DstReg.Index == pairinst->Instruction.SrcReg[j].Index &&
379 GET_BIT(pairinst->Instruction.DstReg.WriteMask, swz))
380 continue;
381
382 struct reg_value_reader* r = memory_pool_malloc(&s->Pool, sizeof(*r));
383 pairinst->NumDependencies++;
384 t->Value[swz]->NumReaders++;
385 r->Reader = pairinst;
386 r->Next = t->Value[swz]->Readers;
387 t->Value[swz]->Readers = r;
388 }
389 }
390 }
391
392 int ndst = _mesa_num_inst_dst_regs(pairinst->Instruction.Opcode);
393 if (ndst) {
394 struct pair_register_translation *t =
395 get_register(s, pairinst->Instruction.DstReg.File, pairinst->Instruction.DstReg.Index);
396 if (t) {
397 t->RefCount++;
398
399 if (pairinst->Instruction.DstReg.File == PROGRAM_TEMPORARY) {
400 int j;
401 for(j = 0; j < 4; ++j) {
402 if (!GET_BIT(pairinst->Instruction.DstReg.WriteMask, j))
403 continue;
404
405 struct reg_value* v = memory_pool_malloc(&s->Pool, sizeof(*v));
406 memset(v, 0, sizeof(struct reg_value));
407 v->Writer = pairinst;
408 if (t->Value[j]) {
409 pairinst->NumDependencies++;
410 t->Value[j]->Next = v;
411 }
412 t->Value[j] = v;
413 pairinst->Values[j] = v;
414 }
415 }
416 }
417 }
418
419 if (s->Verbose)
420 _mesa_printf("scan(%i): NumDeps = %i\n", ip, pairinst->NumDependencies);
421
422 if (!pairinst->NumDependencies)
423 instruction_ready(s, pairinst);
424 }
425
426 /* Clear the PROGRAM_TEMPORARY state */
427 int i, j;
428 for(i = 0; i < MAX_PROGRAM_TEMPS; ++i) {
429 for(j = 0; j < 4; ++j)
430 s->Temps[i].Value[j] = 0;
431 }
432 }
433
434
435 /**
436 * Reserve hardware temporary registers for the program inputs.
437 *
438 * @note This allocation is performed explicitly, because the order of inputs
439 * is determined by the RS hardware.
440 */
441 static void allocate_input_registers(struct pair_state *s)
442 {
443 GLuint InputsRead = s->Program->InputsRead;
444 int i;
445 GLuint hwindex = 0;
446
447 /* Primary colour */
448 if (InputsRead & FRAG_BIT_COL0)
449 alloc_hw_reg(s, PROGRAM_INPUT, FRAG_ATTRIB_COL0, hwindex++);
450 InputsRead &= ~FRAG_BIT_COL0;
451
452 /* Secondary color */
453 if (InputsRead & FRAG_BIT_COL1)
454 alloc_hw_reg(s, PROGRAM_INPUT, FRAG_ATTRIB_COL1, hwindex++);
455 InputsRead &= ~FRAG_BIT_COL1;
456
457 /* Texcoords */
458 for (i = 0; i < 8; i++) {
459 if (InputsRead & (FRAG_BIT_TEX0 << i))
460 alloc_hw_reg(s, PROGRAM_INPUT, FRAG_ATTRIB_TEX0+i, hwindex++);
461 }
462 InputsRead &= ~FRAG_BITS_TEX_ANY;
463
464 /* Fogcoords treated as a texcoord */
465 if (InputsRead & FRAG_BIT_FOGC)
466 alloc_hw_reg(s, PROGRAM_INPUT, FRAG_ATTRIB_FOGC, hwindex++);
467 InputsRead &= ~FRAG_BIT_FOGC;
468
469 /* fragment position treated as a texcoord */
470 if (InputsRead & FRAG_BIT_WPOS)
471 alloc_hw_reg(s, PROGRAM_INPUT, FRAG_ATTRIB_WPOS, hwindex++);
472 InputsRead &= ~FRAG_BIT_WPOS;
473
474 /* Anything else */
475 if (InputsRead)
476 error("Don't know how to handle inputs 0x%x\n", InputsRead);
477 }
478
479
480 static void decrement_dependencies(struct pair_state *s, struct pair_state_instruction *pairinst)
481 {
482 ASSERT(pairinst->NumDependencies > 0);
483 if (!--pairinst->NumDependencies)
484 instruction_ready(s, pairinst);
485 }
486
487 /**
488 * Update the dependency tracking state based on what the instruction
489 * at the given IP does.
490 */
491 static void commit_instruction(struct pair_state *s, struct pair_state_instruction *pairinst)
492 {
493 struct prog_instruction *inst = &pairinst->Instruction;
494
495 if (s->Verbose)
496 _mesa_printf("commit_instruction(%i)\n", pairinst->IP);
497
498 if (inst->DstReg.File == PROGRAM_TEMPORARY) {
499 struct pair_register_translation *t = &s->Temps[inst->DstReg.Index];
500 deref_hw_reg(s, t->HwIndex);
501
502 int i;
503 for(i = 0; i < 4; ++i) {
504 if (!GET_BIT(inst->DstReg.WriteMask, i))
505 continue;
506
507 t->Value[i] = pairinst->Values[i];
508 if (t->Value[i]->NumReaders) {
509 struct reg_value_reader *r;
510 for(r = pairinst->Values[i]->Readers; r; r = r->Next)
511 decrement_dependencies(s, r->Reader);
512 } else if (t->Value[i]->Next) {
513 /* This happens when the only reader writes
514 * the register at the same time */
515 decrement_dependencies(s, t->Value[i]->Next->Writer);
516 }
517 }
518 }
519
520 int nsrc = _mesa_num_inst_src_regs(inst->Opcode);
521 int i;
522 for(i = 0; i < nsrc; i++) {
523 struct pair_register_translation *t = get_register(s, inst->SrcReg[i].File, inst->SrcReg[i].Index);
524 if (!t)
525 continue;
526
527 deref_hw_reg(s, get_hw_reg(s, inst->SrcReg[i].File, inst->SrcReg[i].Index));
528
529 if (inst->SrcReg[i].File != PROGRAM_TEMPORARY)
530 continue;
531
532 int j;
533 for(j = 0; j < 4; ++j) {
534 GLuint swz = GET_SWZ(inst->SrcReg[i].Swizzle, j);
535 if (swz >= 4)
536 continue;
537 if (!t->Value[swz])
538 continue;
539
540 /* Do not free a dependency if this instruction
541 * also rewrites the value. See scan_instructions. */
542 if (inst->DstReg.File == PROGRAM_TEMPORARY &&
543 inst->DstReg.Index == inst->SrcReg[i].Index &&
544 GET_BIT(inst->DstReg.WriteMask, swz))
545 continue;
546
547 if (!--t->Value[swz]->NumReaders) {
548 if (t->Value[swz]->Next)
549 decrement_dependencies(s, t->Value[swz]->Next->Writer);
550 }
551 }
552 }
553 }
554
555
556 /**
557 * Emit all ready texture instructions in a single block.
558 *
559 * Emit as a single block to (hopefully) sample many textures in parallel,
560 * and to avoid hardware indirections on R300.
561 *
562 * In R500, we don't really know when the result of a texture instruction
563 * arrives. So allocate all destinations first, to make sure they do not
564 * arrive early and overwrite a texture coordinate we're going to use later
565 * in the block.
566 */
567 static void emit_all_tex(struct pair_state *s)
568 {
569 struct pair_state_instruction *readytex;
570 struct pair_state_instruction *pairinst;
571
572 ASSERT(s->ReadyTEX);
573
574 // Don't let the ready list change under us!
575 readytex = s->ReadyTEX;
576 s->ReadyTEX = 0;
577
578 // Allocate destination hardware registers in one block to avoid conflicts.
579 for(pairinst = readytex; pairinst; pairinst = pairinst->NextReady) {
580 struct prog_instruction *inst = &pairinst->Instruction;
581 if (inst->Opcode != OPCODE_KIL)
582 get_hw_reg(s, inst->DstReg.File, inst->DstReg.Index);
583 }
584
585 if (s->Debug)
586 _mesa_printf(" BEGIN_TEX\n");
587
588 if (s->Handler->BeginTexBlock)
589 s->Error = s->Error || !s->Handler->BeginTexBlock(s->UserData);
590
591 for(pairinst = readytex; pairinst; pairinst = pairinst->NextReady) {
592 struct prog_instruction *inst = &pairinst->Instruction;
593 commit_instruction(s, pairinst);
594
595 if (inst->Opcode != OPCODE_KIL)
596 inst->DstReg.Index = get_hw_reg(s, inst->DstReg.File, inst->DstReg.Index);
597 inst->SrcReg[0].Index = get_hw_reg(s, inst->SrcReg[0].File, inst->SrcReg[0].Index);
598
599 if (s->Debug) {
600 _mesa_printf(" ");
601 _mesa_print_instruction(inst);
602 fflush(stdout);
603 }
604
605 struct radeon_pair_texture_instruction rpti;
606
607 switch(inst->Opcode) {
608 case OPCODE_TEX: rpti.Opcode = RADEON_OPCODE_TEX; break;
609 case OPCODE_TXB: rpti.Opcode = RADEON_OPCODE_TXB; break;
610 case OPCODE_TXP: rpti.Opcode = RADEON_OPCODE_TXP; break;
611 default:
612 case OPCODE_KIL: rpti.Opcode = RADEON_OPCODE_KIL; break;
613 }
614
615 rpti.DestIndex = inst->DstReg.Index;
616 rpti.WriteMask = inst->DstReg.WriteMask;
617 rpti.TexSrcUnit = inst->TexSrcUnit;
618 rpti.TexSrcTarget = inst->TexSrcTarget;
619 rpti.SrcIndex = inst->SrcReg[0].Index;
620 rpti.SrcSwizzle = inst->SrcReg[0].Swizzle;
621
622 s->Error = s->Error || !s->Handler->EmitTex(s->UserData, &rpti);
623 }
624
625 if (s->Debug)
626 _mesa_printf(" END_TEX\n");
627 }
628
629
630 static int alloc_pair_source(struct pair_state *s, struct radeon_pair_instruction *pair,
631 struct prog_src_register src, GLboolean rgb, GLboolean alpha)
632 {
633 int candidate = -1;
634 int candidate_quality = -1;
635 int i;
636
637 if (!rgb && !alpha)
638 return 0;
639
640 GLuint constant;
641 GLuint index;
642
643 if (src.File == PROGRAM_TEMPORARY || src.File == PROGRAM_INPUT) {
644 constant = 0;
645 index = get_hw_reg(s, src.File, src.Index);
646 } else {
647 constant = 1;
648 s->Error |= !s->Handler->EmitConst(s->UserData, src.File, src.Index, &index);
649 }
650
651 for(i = 0; i < 3; ++i) {
652 int q = 0;
653 if (rgb) {
654 if (pair->RGB.Src[i].Used) {
655 if (pair->RGB.Src[i].Constant != constant ||
656 pair->RGB.Src[i].Index != index)
657 continue;
658 q++;
659 }
660 }
661 if (alpha) {
662 if (pair->Alpha.Src[i].Used) {
663 if (pair->Alpha.Src[i].Constant != constant ||
664 pair->Alpha.Src[i].Index != index)
665 continue;
666 q++;
667 }
668 }
669 if (q > candidate_quality) {
670 candidate_quality = q;
671 candidate = i;
672 }
673 }
674
675 if (candidate >= 0) {
676 if (rgb) {
677 pair->RGB.Src[candidate].Used = 1;
678 pair->RGB.Src[candidate].Constant = constant;
679 pair->RGB.Src[candidate].Index = index;
680 }
681 if (alpha) {
682 pair->Alpha.Src[candidate].Used = 1;
683 pair->Alpha.Src[candidate].Constant = constant;
684 pair->Alpha.Src[candidate].Index = index;
685 }
686 }
687
688 return candidate;
689 }
690
691 /**
692 * Fill the given ALU instruction's opcodes and source operands into the given pair,
693 * if possible.
694 */
695 static GLboolean fill_instruction_into_pair(
696 struct pair_state *s,
697 struct radeon_pair_instruction *pair,
698 struct pair_state_instruction *pairinst)
699 {
700 struct prog_instruction *inst = &pairinst->Instruction;
701
702 ASSERT(!pairinst->NeedRGB || pair->RGB.Opcode == OPCODE_NOP);
703 ASSERT(!pairinst->NeedAlpha || pair->Alpha.Opcode == OPCODE_NOP);
704
705 if (pairinst->NeedRGB) {
706 if (pairinst->IsTranscendent)
707 pair->RGB.Opcode = OPCODE_REPL_ALPHA;
708 else
709 pair->RGB.Opcode = inst->Opcode;
710 if (inst->SaturateMode == SATURATE_ZERO_ONE)
711 pair->RGB.Saturate = 1;
712 }
713 if (pairinst->NeedAlpha) {
714 pair->Alpha.Opcode = inst->Opcode;
715 if (inst->SaturateMode == SATURATE_ZERO_ONE)
716 pair->Alpha.Saturate = 1;
717 }
718
719 int nargs = _mesa_num_inst_src_regs(inst->Opcode);
720 int i;
721
722 /* Special case for DDX/DDY (MDH/MDV). */
723 if (inst->Opcode == OPCODE_DDX || inst->Opcode == OPCODE_DDY) {
724 if (pair->RGB.Src[0].Used || pair->Alpha.Src[0].Used)
725 return GL_FALSE;
726 else
727 nargs++;
728 }
729
730 for(i = 0; i < nargs; ++i) {
731 int source;
732 if (pairinst->NeedRGB && !pairinst->IsTranscendent) {
733 GLboolean srcrgb = GL_FALSE;
734 GLboolean srcalpha = GL_FALSE;
735 int j;
736 for(j = 0; j < 3; ++j) {
737 GLuint swz = GET_SWZ(inst->SrcReg[i].Swizzle, j);
738 if (swz < 3)
739 srcrgb = GL_TRUE;
740 else if (swz < 4)
741 srcalpha = GL_TRUE;
742 }
743 source = alloc_pair_source(s, pair, inst->SrcReg[i], srcrgb, srcalpha);
744 if (source < 0)
745 return GL_FALSE;
746 pair->RGB.Arg[i].Source = source;
747 pair->RGB.Arg[i].Swizzle = inst->SrcReg[i].Swizzle & 0x1ff;
748 pair->RGB.Arg[i].Abs = inst->SrcReg[i].Abs;
749 pair->RGB.Arg[i].Negate = !!(inst->SrcReg[i].Negate & (NEGATE_X | NEGATE_Y | NEGATE_Z));
750 }
751 if (pairinst->NeedAlpha) {
752 GLboolean srcrgb = GL_FALSE;
753 GLboolean srcalpha = GL_FALSE;
754 GLuint swz = GET_SWZ(inst->SrcReg[i].Swizzle, pairinst->IsTranscendent ? 0 : 3);
755 if (swz < 3)
756 srcrgb = GL_TRUE;
757 else if (swz < 4)
758 srcalpha = GL_TRUE;
759 source = alloc_pair_source(s, pair, inst->SrcReg[i], srcrgb, srcalpha);
760 if (source < 0)
761 return GL_FALSE;
762 pair->Alpha.Arg[i].Source = source;
763 pair->Alpha.Arg[i].Swizzle = swz;
764 pair->Alpha.Arg[i].Abs = inst->SrcReg[i].Abs;
765 pair->Alpha.Arg[i].Negate = !!(inst->SrcReg[i].Negate & NEGATE_W);
766 }
767 }
768
769 return GL_TRUE;
770 }
771
772
773 /**
774 * Fill in the destination register information.
775 *
776 * This is split from filling in source registers because we want
777 * to avoid allocating hardware temporaries for destinations until
778 * we are absolutely certain that we're going to emit a certain
779 * instruction pairing.
780 */
781 static void fill_dest_into_pair(
782 struct pair_state *s,
783 struct radeon_pair_instruction *pair,
784 struct pair_state_instruction *pairinst)
785 {
786 struct prog_instruction *inst = &pairinst->Instruction;
787
788 if (inst->DstReg.File == PROGRAM_OUTPUT) {
789 if (inst->DstReg.Index == FRAG_RESULT_COLOR) {
790 pair->RGB.OutputWriteMask |= inst->DstReg.WriteMask & WRITEMASK_XYZ;
791 pair->Alpha.OutputWriteMask |= GET_BIT(inst->DstReg.WriteMask, 3);
792 } else if (inst->DstReg.Index == FRAG_RESULT_DEPTH) {
793 pair->Alpha.DepthWriteMask |= GET_BIT(inst->DstReg.WriteMask, 3);
794 }
795 } else {
796 GLuint hwindex = get_hw_reg(s, inst->DstReg.File, inst->DstReg.Index);
797 if (pairinst->NeedRGB) {
798 pair->RGB.DestIndex = hwindex;
799 pair->RGB.WriteMask |= inst->DstReg.WriteMask & WRITEMASK_XYZ;
800 }
801 if (pairinst->NeedAlpha) {
802 pair->Alpha.DestIndex = hwindex;
803 pair->Alpha.WriteMask |= GET_BIT(inst->DstReg.WriteMask, 3);
804 }
805 }
806 }
807
808
809 /**
810 * Find a good ALU instruction or pair of ALU instruction and emit it.
811 *
812 * Prefer emitting full ALU instructions, so that when we reach a point
813 * where no full ALU instruction can be emitted, we have more candidates
814 * for RGB/Alpha pairing.
815 */
816 static void emit_alu(struct pair_state *s)
817 {
818 struct radeon_pair_instruction pair;
819 struct pair_state_instruction *psi;
820
821 if (s->ReadyFullALU || !(s->ReadyRGB && s->ReadyAlpha)) {
822 if (s->ReadyFullALU) {
823 psi = s->ReadyFullALU;
824 s->ReadyFullALU = s->ReadyFullALU->NextReady;
825 } else if (s->ReadyRGB) {
826 psi = s->ReadyRGB;
827 s->ReadyRGB = s->ReadyRGB->NextReady;
828 } else {
829 psi = s->ReadyAlpha;
830 s->ReadyAlpha = s->ReadyAlpha->NextReady;
831 }
832
833 _mesa_bzero(&pair, sizeof(pair));
834 fill_instruction_into_pair(s, &pair, psi);
835 fill_dest_into_pair(s, &pair, psi);
836 commit_instruction(s, psi);
837 } else {
838 struct pair_state_instruction **prgb;
839 struct pair_state_instruction **palpha;
840
841 /* Some pairings might fail because they require too
842 * many source slots; try all possible pairings if necessary */
843 for(prgb = &s->ReadyRGB; *prgb; prgb = &(*prgb)->NextReady) {
844 for(palpha = &s->ReadyAlpha; *palpha; palpha = &(*palpha)->NextReady) {
845 struct pair_state_instruction * psirgb = *prgb;
846 struct pair_state_instruction * psialpha = *palpha;
847 _mesa_bzero(&pair, sizeof(pair));
848 fill_instruction_into_pair(s, &pair, psirgb);
849 if (!fill_instruction_into_pair(s, &pair, psialpha))
850 continue;
851 *prgb = (*prgb)->NextReady;
852 *palpha = (*palpha)->NextReady;
853 fill_dest_into_pair(s, &pair, psirgb);
854 fill_dest_into_pair(s, &pair, psialpha);
855 commit_instruction(s, psirgb);
856 commit_instruction(s, psialpha);
857 goto success;
858 }
859 }
860
861 /* No success in pairing; just take the first RGB instruction */
862 psi = s->ReadyRGB;
863 s->ReadyRGB = s->ReadyRGB->NextReady;
864
865 _mesa_bzero(&pair, sizeof(pair));
866 fill_instruction_into_pair(s, &pair, psi);
867 fill_dest_into_pair(s, &pair, psi);
868 commit_instruction(s, psi);
869 success: ;
870 }
871
872 if (s->Debug)
873 radeonPrintPairInstruction(&pair);
874
875 s->Error = s->Error || !s->Handler->EmitPaired(s->UserData, &pair);
876 }
877
878
879 GLboolean radeonPairProgram(struct gl_program *program,
880 const struct radeon_pair_handler* handler, void *userdata)
881 {
882 struct pair_state s;
883
884 _mesa_bzero(&s, sizeof(s));
885 memory_pool_init(&s.Pool);
886 s.Program = program;
887 s.Handler = handler;
888 s.UserData = userdata;
889 s.Debug = (RADEON_DEBUG & DEBUG_PIXEL) ? GL_TRUE : GL_FALSE;
890 s.Verbose = GL_FALSE && s.Debug;
891
892 if (s.Debug)
893 _mesa_printf("Emit paired program\n");
894
895 scan_instructions(&s);
896 allocate_input_registers(&s);
897
898 while(!s.Error &&
899 (s.ReadyTEX || s.ReadyRGB || s.ReadyAlpha || s.ReadyFullALU)) {
900 if (s.ReadyTEX)
901 emit_all_tex(&s);
902
903 while(s.ReadyFullALU || s.ReadyRGB || s.ReadyAlpha)
904 emit_alu(&s);
905 }
906
907 if (s.Debug)
908 _mesa_printf(" END\n");
909
910 memory_pool_destroy(&s.Pool);
911
912 return !s.Error;
913 }
914
915
916 static void print_pair_src(int i, struct radeon_pair_instruction_source* src)
917 {
918 _mesa_printf(" Src%i = %s[%i]", i, src->Constant ? "CNST" : "TEMP", src->Index);
919 }
920
921 static const char* opcode_string(GLuint opcode)
922 {
923 if (opcode == OPCODE_REPL_ALPHA)
924 return "SOP";
925 else
926 return _mesa_opcode_string(opcode);
927 }
928
929 static int num_pairinst_args(GLuint opcode)
930 {
931 if (opcode == OPCODE_REPL_ALPHA)
932 return 0;
933 else
934 return _mesa_num_inst_src_regs(opcode);
935 }
936
937 static char swizzle_char(GLuint swz)
938 {
939 switch(swz) {
940 case SWIZZLE_X: return 'x';
941 case SWIZZLE_Y: return 'y';
942 case SWIZZLE_Z: return 'z';
943 case SWIZZLE_W: return 'w';
944 case SWIZZLE_ZERO: return '0';
945 case SWIZZLE_ONE: return '1';
946 case SWIZZLE_NIL: return '_';
947 default: return '?';
948 }
949 }
950
951 void radeonPrintPairInstruction(struct radeon_pair_instruction *inst)
952 {
953 int nargs;
954 int i;
955
956 _mesa_printf(" RGB: ");
957 for(i = 0; i < 3; ++i) {
958 if (inst->RGB.Src[i].Used)
959 print_pair_src(i, inst->RGB.Src + i);
960 }
961 _mesa_printf("\n");
962 _mesa_printf(" Alpha:");
963 for(i = 0; i < 3; ++i) {
964 if (inst->Alpha.Src[i].Used)
965 print_pair_src(i, inst->Alpha.Src + i);
966 }
967 _mesa_printf("\n");
968
969 _mesa_printf(" %s%s", opcode_string(inst->RGB.Opcode), inst->RGB.Saturate ? "_SAT" : "");
970 if (inst->RGB.WriteMask)
971 _mesa_printf(" TEMP[%i].%s%s%s", inst->RGB.DestIndex,
972 (inst->RGB.WriteMask & 1) ? "x" : "",
973 (inst->RGB.WriteMask & 2) ? "y" : "",
974 (inst->RGB.WriteMask & 4) ? "z" : "");
975 if (inst->RGB.OutputWriteMask)
976 _mesa_printf(" COLOR.%s%s%s",
977 (inst->RGB.OutputWriteMask & 1) ? "x" : "",
978 (inst->RGB.OutputWriteMask & 2) ? "y" : "",
979 (inst->RGB.OutputWriteMask & 4) ? "z" : "");
980 nargs = num_pairinst_args(inst->RGB.Opcode);
981 for(i = 0; i < nargs; ++i) {
982 const char* abs = inst->RGB.Arg[i].Abs ? "|" : "";
983 const char* neg = inst->RGB.Arg[i].Negate ? "-" : "";
984 _mesa_printf(", %s%sSrc%i.%c%c%c%s", neg, abs, inst->RGB.Arg[i].Source,
985 swizzle_char(GET_SWZ(inst->RGB.Arg[i].Swizzle, 0)),
986 swizzle_char(GET_SWZ(inst->RGB.Arg[i].Swizzle, 1)),
987 swizzle_char(GET_SWZ(inst->RGB.Arg[i].Swizzle, 2)),
988 abs);
989 }
990 _mesa_printf("\n");
991
992 _mesa_printf(" %s%s", opcode_string(inst->Alpha.Opcode), inst->Alpha.Saturate ? "_SAT" : "");
993 if (inst->Alpha.WriteMask)
994 _mesa_printf(" TEMP[%i].w", inst->Alpha.DestIndex);
995 if (inst->Alpha.OutputWriteMask)
996 _mesa_printf(" COLOR.w");
997 if (inst->Alpha.DepthWriteMask)
998 _mesa_printf(" DEPTH.w");
999 nargs = num_pairinst_args(inst->Alpha.Opcode);
1000 for(i = 0; i < nargs; ++i) {
1001 const char* abs = inst->Alpha.Arg[i].Abs ? "|" : "";
1002 const char* neg = inst->Alpha.Arg[i].Negate ? "-" : "";
1003 _mesa_printf(", %s%sSrc%i.%c%s", neg, abs, inst->Alpha.Arg[i].Source,
1004 swizzle_char(inst->Alpha.Arg[i].Swizzle), abs);
1005 }
1006 _mesa_printf("\n");
1007 }