nv50: use actual loads/stores if TEMPs are accessed indirectly
[mesa.git] / src/gallium/drivers/nv50/nv50_pc_optimize.c
/*
 * Copyright 2010 Christoph Bumiller
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
 * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

/* #define NV50PC_DEBUG */

#include "nv50_pc.h"

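/* Visit the not yet processed successors of block @b in arbitrary order,
 * using pc->pass_seq as the visited marker; used by passes that do not
 * depend on dominance or any particular traversal order.
 */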
#define DESCEND_ARBITRARY(j, f)                                 \
do {                                                            \
   b->pass_seq = ctx->pc->pass_seq;                             \
                                                                \
   for (j = 0; j < 2; ++j)                                      \
      if (b->out[j] && b->out[j]->pass_seq < ctx->pc->pass_seq) \
         f(ctx, b->out[j]);                                     \
} while (0)

extern unsigned nv50_inst_min_size(struct nv_instruction *);

struct nv_pc_pass {
   struct nv_pc *pc;
};

static INLINE boolean
values_equal(struct nv_value *a, struct nv_value *b)
{
   /* XXX: sizes */
   return (a->reg.file == b->reg.file && a->join->reg.id == b->join->reg.id);
}

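/* TRUE if instruction @b reads neither a value that @a defines nor the
 * flags @a writes; checked in both directions this tells whether the two
 * instructions may trade places.
 */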
static INLINE boolean
inst_commutation_check(struct nv_instruction *a,
                       struct nv_instruction *b)
{
   int si, di;

   for (di = 0; di < 4; ++di) {
      if (!a->def[di])
         break;
      for (si = 0; si < 5; ++si) {
         if (!b->src[si])
            continue;
         if (values_equal(a->def[di], b->src[si]->value))
            return FALSE;
      }
   }

   if (b->flags_src && b->flags_src->value == a->flags_def)
      return FALSE;

   return TRUE;
}

/* Check whether we can swap the order of the instructions,
 * where a & b may be either the earlier or the later one.
 */
static boolean
inst_commutation_legal(struct nv_instruction *a,
                       struct nv_instruction *b)
{
   return inst_commutation_check(a, b) && inst_commutation_check(b, a);
}

static INLINE boolean
inst_cullable(struct nv_instruction *nvi)
{
   if (nvi->opcode == NV_OP_STA)
      return FALSE;
   return (!(nvi->is_terminator || nvi->is_join ||
             nvi->target ||
             nvi->fixed ||
             nv_nvi_refcount(nvi)));
}

static INLINE boolean
nvi_isnop(struct nv_instruction *nvi)
{
   if (nvi->opcode == NV_OP_EXPORT || nvi->opcode == NV_OP_UNDEF)
      return TRUE;

   /* NOTE: 'fixed' now only means that it shouldn't be optimized away,
    * but we can still remove it if it is a no-op move.
    */
   if (/* nvi->fixed || */
       /* nvi->flags_src || */ /* cond. MOV to same register is still NOP */
       nvi->flags_def ||
       nvi->is_terminator ||
       nvi->is_join)
      return FALSE;

   if (nvi->def[0] && nvi->def[0]->join->reg.id < 0)
      return TRUE;

   if (nvi->opcode != NV_OP_MOV && nvi->opcode != NV_OP_SELECT)
      return FALSE;

   if (nvi->def[0]->reg.file != nvi->src[0]->value->reg.file)
      return FALSE;

   if (nvi->src[0]->value->join->reg.id < 0) {
      NV50_DBGMSG("nvi_isnop: orphaned value detected\n");
      return TRUE;
   }

   if (nvi->opcode == NV_OP_SELECT)
      if (!values_equal(nvi->def[0], nvi->src[1]->value))
         return FALSE;

   return values_equal(nvi->def[0], nvi->src[0]->value);
}

struct nv_pass {
   struct nv_pc *pc;
   int n;
   void *priv;
};

static int
nv_pass_flatten(struct nv_pass *ctx, struct nv_basic_block *b);

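/* Pre-emission layout pass: delete no-op instructions and no-op branches
 * (a BRA to the block immediately following), then compute each block's
 * bin_pos and bin_size.  nv50 instructions are 4 or 8 bytes long and two
 * short (4 byte) instructions have to pair up within an 8 byte slot, so a
 * lone short instruction is either swapped with a short neighbour (if that
 * reordering is legal) or promoted to the long encoding.  Sizes are
 * counted in 32-bit words here and converted to bytes at the end.
 */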
static void
nv_pc_pass_pre_emission(void *priv, struct nv_basic_block *b)
{
   struct nv_pc *pc = (struct nv_pc *)priv;
   struct nv_basic_block *in;
   struct nv_instruction *nvi, *next;
   int j;
   uint size, n32 = 0;

   for (j = pc->num_blocks - 1; j >= 0 && !pc->bb_list[j]->bin_size; --j);
   if (j >= 0) {
      in = pc->bb_list[j];

      /* check for no-op branches (BRA $PC+8) */
      if (in->exit && in->exit->opcode == NV_OP_BRA && in->exit->target == b) {
         in->bin_size -= 8;
         pc->bin_size -= 8;

         for (++j; j < pc->num_blocks; ++j)
            pc->bb_list[j]->bin_pos -= 8;

         nv_nvi_delete(in->exit);
      }
      b->bin_pos = in->bin_pos + in->bin_size;
   }

   pc->bb_list[pc->num_blocks++] = b;

   /* visit node */

   for (nvi = b->entry; nvi; nvi = next) {
      next = nvi->next;
      if (nvi_isnop(nvi))
         nv_nvi_delete(nvi);
   }

   for (nvi = b->entry; nvi; nvi = next) {
      next = nvi->next;

      size = nv50_inst_min_size(nvi);
      if (nvi->next && size < 8)
         ++n32;
      else
      if ((n32 & 1) && nvi->next &&
          nv50_inst_min_size(nvi->next) == 4 &&
          inst_commutation_legal(nvi, nvi->next)) {
         ++n32;
         nv_nvi_permute(nvi, nvi->next);
         next = nvi;
      } else {
         nvi->is_long = 1;

         b->bin_size += n32 & 1;
         if (n32 & 1)
            nvi->prev->is_long = 1;
         n32 = 0;
      }
      b->bin_size += 1 + nvi->is_long;
   }

   if (!b->entry) {
      NV50_DBGMSG("block %p is now empty\n", b);
   } else
   if (!b->exit->is_long) {
      assert(n32);
      b->exit->is_long = 1;
      b->bin_size += 1;

      /* might have deleted a whole tail of instructions */
      if (!b->exit->prev->is_long && !(n32 & 1)) {
         b->bin_size += 1;
         b->exit->prev->is_long = 1;
      }
   }
   assert(!b->entry || (b->exit && b->exit->is_long));

   pc->bin_size += b->bin_size *= 4;
}

static int
nv_pc_pass2(struct nv_pc *pc, struct nv_basic_block *root)
{
   struct nv_pass pass;

   pass.pc = pc;

   pc->pass_seq++;

   nv_pass_flatten(&pass, root);

   nv_pc_pass_in_order(root, nv_pc_pass_pre_emission, pc);

   return 0;
}

int
nv_pc_exec_pass2(struct nv_pc *pc)
{
   int i, ret;

   NV50_DBGMSG("preparing %u blocks for emission\n", pc->num_blocks);

   pc->bb_list = CALLOC(pc->num_blocks, sizeof(pc->bb_list[0]));

   pc->num_blocks = 0;

   for (i = 0; i < pc->num_subroutines + 1; ++i)
      if (pc->root[i] && (ret = nv_pc_pass2(pc, pc->root[i])))
         return ret;
   return 0;
}

static INLINE boolean
is_cmem_load(struct nv_instruction *nvi)
{
   return (nvi->opcode == NV_OP_LDA &&
           nvi->src[0]->value->reg.file >= NV_FILE_MEM_C(0) &&
           nvi->src[0]->value->reg.file <= NV_FILE_MEM_C(15));
}

static INLINE boolean
is_smem_load(struct nv_instruction *nvi)
{
   return (nvi->opcode == NV_OP_LDA &&
           (nvi->src[0]->value->reg.file == NV_FILE_MEM_S ||
            nvi->src[0]->value->reg.file == NV_FILE_MEM_P));
}

static INLINE boolean
is_immd_move(struct nv_instruction *nvi)
{
   return (nvi->opcode == NV_OP_MOV &&
           nvi->src[0]->value->reg.file == NV_FILE_IMM);
}

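/* For commutative instructions, swap sources so that each operand lands in
 * a slot that can encode it: a c[] (const space) load is moved to src1 and
 * an s[] (shader input) load to src0, which appear to be the only source
 * slots able to reference those files directly.  For SET, the condition
 * code is mirrored accordingly (e.g. LT becomes GT); cc_swapped below maps
 * each of the 8 condition codes to its operand-swapped equivalent.
 */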
static INLINE void
check_swap_src_0_1(struct nv_instruction *nvi)
{
   static const ubyte cc_swapped[8] = { 0, 4, 2, 6, 1, 5, 3, 7 };

   struct nv_ref *src0 = nvi->src[0], *src1 = nvi->src[1];

   if (!nv_op_commutative(nvi->opcode))
      return;
   assert(src0 && src1);

   if (src1->value->reg.file == NV_FILE_IMM)
      return;

   if (is_cmem_load(src0->value->insn)) {
      if (!is_cmem_load(src1->value->insn)) {
         nvi->src[0] = src1;
         nvi->src[1] = src0;
         /* debug_printf("swapping cmem load to 1\n"); */
      }
   } else
   if (is_smem_load(src1->value->insn)) {
      if (!is_smem_load(src0->value->insn)) {
         nvi->src[0] = src1;
         nvi->src[1] = src0;
         /* debug_printf("swapping smem load to 0\n"); */
      }
   }

   if (nvi->opcode == NV_OP_SET && nvi->src[0] != src0)
      nvi->set_cond = cc_swapped[nvi->set_cond];
}

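/* Fold a plain move (MOV/STA) of a temporary to an output register into
 * the instruction computing the temporary, e.g.
 *
 *    add $r0 a b
 *    mov $o0 $r0    -->    add $o0 a b
 *
 * Only done if the temporary has no other uses and the producer reads no
 * immediate (instructions writing $oX cannot encode immediates).
 */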
static int
nv_pass_fold_stores(struct nv_pass *ctx, struct nv_basic_block *b)
{
   struct nv_instruction *nvi, *sti, *next;
   int j;

   for (sti = b->entry; sti; sti = next) {
      next = sti->next;

      /* only handling MOV to $oX here */
      if (!sti->def[0] || sti->def[0]->reg.file != NV_FILE_OUT)
         continue;
      if (sti->opcode != NV_OP_MOV && sti->opcode != NV_OP_STA)
         continue;

      nvi = sti->src[0]->value->insn;
      if (!nvi || nvi->opcode == NV_OP_PHI || nv_is_vector_op(nvi->opcode))
         continue;
      assert(nvi->def[0] == sti->src[0]->value);

      if (nvi->def[0]->refc > 1)
         continue;

      /* cannot write to $oX when using immediate */
      for (j = 0; j < 4 && nvi->src[j]; ++j)
         if (nvi->src[j]->value->reg.file == NV_FILE_IMM)
            break;
      if (j < 4 && nvi->src[j])
         continue;

      nvi->def[0] = sti->def[0];
      nvi->fixed = sti->fixed;

      nv_nvi_delete(sti);
   }
   DESCEND_ARBITRARY(j, nv_pass_fold_stores);

   return 0;
}

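/* Fold loads (LDA from memory, MOV from immediates) directly into the
 * instructions that use their results, where the target opcode and source
 * slot permit it, and delete a load once it becomes unreferenced.
 */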
static int
nv_pass_fold_loads(struct nv_pass *ctx, struct nv_basic_block *b)
{
   struct nv_instruction *nvi, *ld;
   int j;

   for (nvi = b->entry; nvi; nvi = nvi->next) {
      check_swap_src_0_1(nvi);

      for (j = 0; j < 3; ++j) {
         if (!nvi->src[j])
            break;
         ld = nvi->src[j]->value->insn;
         if (!ld)
            continue;

         if (is_immd_move(ld) && nv50_nvi_can_use_imm(nvi, j)) {
            nv_reference(ctx->pc, &nvi->src[j], ld->src[0]->value);
            continue;
         }

         if (ld->opcode != NV_OP_LDA)
            continue;
         if (!nv50_nvi_can_load(nvi, j, ld->src[0]->value))
            continue;

         if (j == 0 && ld->src[4]) /* can't load shared mem */
            continue;

         /* fold it ! */ /* XXX: ref->insn */
         nv_reference(ctx->pc, &nvi->src[j], ld->src[0]->value);
         if (ld->src[4])
            nv_reference(ctx->pc, &nvi->src[4], ld->src[4]->value);

         if (!nv_nvi_refcount(ld))
            nv_nvi_delete(ld);
      }
   }
   DESCEND_ARBITRARY(j, nv_pass_fold_loads);

   return 0;
}

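/* Lower SUB to ADD with a negated second source, absorb NEG/ABS
 * instructions into source modifiers of their single user where the
 * opcode supports them, e.g.
 *
 *    neg $r1 $r0
 *    add $r2 a $r1    -->    add $r2 a -$r0
 *
 * and absorb SAT into a preceding MAD as its saturate flag.
 */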
static int
nv_pass_lower_mods(struct nv_pass *ctx, struct nv_basic_block *b)
{
   int j;
   struct nv_instruction *nvi, *mi, *next;
   ubyte mod;

   for (nvi = b->entry; nvi; nvi = next) {
      next = nvi->next;
      if (nvi->opcode == NV_OP_SUB) {
         nvi->opcode = NV_OP_ADD;
         nvi->src[1]->mod ^= NV_MOD_NEG;
      }

      /* should not put any modifiers on NEG and ABS */
      assert(nvi->opcode != NV_OP_NEG || !nvi->src[0]->mod);
      assert(nvi->opcode != NV_OP_ABS || !nvi->src[0]->mod);

      for (j = 0; j < 4; ++j) {
         if (!nvi->src[j])
            break;

         mi = nvi->src[j]->value->insn;
         if (!mi)
            continue;
         if (mi->def[0]->refc > 1)
            continue;

         if (mi->opcode == NV_OP_NEG) mod = NV_MOD_NEG;
         else
         if (mi->opcode == NV_OP_ABS) mod = NV_MOD_ABS;
         else
            continue;

         if (nvi->opcode == NV_OP_ABS)
            mod &= ~(NV_MOD_NEG | NV_MOD_ABS);
         else
         if (nvi->opcode == NV_OP_NEG && mod == NV_MOD_NEG) {
            nvi->opcode = NV_OP_MOV;
            mod = 0;
         }

         if ((nv50_supported_src_mods(nvi->opcode, j) & mod) != mod)
            continue;

         nv_reference(ctx->pc, &nvi->src[j], mi->src[0]->value);

         nvi->src[j]->mod ^= mod;
      }

      if (nvi->opcode == NV_OP_SAT) {
         mi = nvi->src[0]->value->insn;

         if (mi && mi->opcode == NV_OP_MAD && !mi->flags_def) {
            mi->saturate = 1;
            mi->def[0] = nvi->def[0];
            nv_nvi_delete(nvi);
         }
      }
   }
   DESCEND_ARBITRARY(j, nv_pass_lower_mods);

   return 0;
}

#define SRC_IS_MUL(s) ((s)->insn && (s)->insn->opcode == NV_OP_MUL)

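/* Apply NEG/ABS modifiers to an immediate value: sign-bit operations for
 * 32-bit floats, two's complement negation for integers.  modifiers_opcode
 * returns the instruction a lone modifier corresponds to.
 */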
static void
modifiers_apply(uint32_t *val, ubyte type, ubyte mod)
{
   if (mod & NV_MOD_ABS) {
      if (type == NV_TYPE_F32)
         *val &= 0x7fffffff;
      else
      if ((*val) & (1 << 31))
         *val = ~(*val) + 1;
   }
   if (mod & NV_MOD_NEG) {
      if (type == NV_TYPE_F32)
         *val ^= 0x80000000;
      else
         *val = ~(*val) + 1;
   }
}

static INLINE uint
modifiers_opcode(ubyte mod)
{
   switch (mod) {
   case NV_MOD_NEG: return NV_OP_NEG;
   case NV_MOD_ABS: return NV_OP_ABS;
   case 0:
      return NV_OP_MOV;
   default:
      return NV_OP_NOP;
   }
}

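/* Fold an instruction whose sources are all immediates into a MOV of the
 * precomputed result; for MAD, only the MUL part is folded and the
 * instruction becomes an ADD of the remaining source and the constant.
 */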
static void
constant_expression(struct nv_pc *pc, struct nv_instruction *nvi,
                    struct nv_value *src0, struct nv_value *src1)
{
   struct nv_value *val;
   union {
      float f32;
      uint32_t u32;
      int32_t s32;
   } u0, u1, u;
   ubyte type;

   if (!nvi->def[0])
      return;
   type = nvi->def[0]->reg.type;

   u.u32 = 0;
   u0.u32 = src0->reg.imm.u32;
   u1.u32 = src1->reg.imm.u32;

   modifiers_apply(&u0.u32, type, nvi->src[0]->mod);
   modifiers_apply(&u1.u32, type, nvi->src[1]->mod);

   switch (nvi->opcode) {
   case NV_OP_MAD:
      if (nvi->src[2]->value->reg.file != NV_FILE_GPR)
         return;
      /* fall through */
   case NV_OP_MUL:
      switch (type) {
      case NV_TYPE_F32: u.f32 = u0.f32 * u1.f32; break;
      case NV_TYPE_U32: u.u32 = u0.u32 * u1.u32; break;
      case NV_TYPE_S32: u.s32 = u0.s32 * u1.s32; break;
      default:
         assert(0);
         break;
      }
      break;
   case NV_OP_ADD:
      switch (type) {
      case NV_TYPE_F32: u.f32 = u0.f32 + u1.f32; break;
      case NV_TYPE_U32: u.u32 = u0.u32 + u1.u32; break;
      case NV_TYPE_S32: u.s32 = u0.s32 + u1.s32; break;
      default:
         assert(0);
         break;
      }
      break;
   case NV_OP_SUB:
      switch (type) {
      case NV_TYPE_F32: u.f32 = u0.f32 - u1.f32; break;
      case NV_TYPE_U32: u.u32 = u0.u32 - u1.u32; break;
      case NV_TYPE_S32: u.s32 = u0.s32 - u1.s32; break;
      default:
         assert(0);
         break;
      }
      break;
   default:
      return;
   }

   nvi->opcode = NV_OP_MOV;

   val = new_value(pc, NV_FILE_IMM, type);

   val->reg.imm.u32 = u.u32;

   nv_reference(pc, &nvi->src[1], NULL);
   nv_reference(pc, &nvi->src[0], val);

   if (nvi->src[2]) { /* from MAD */
      nvi->src[1] = nvi->src[0];
      nvi->src[0] = nvi->src[2];
      nvi->src[2] = NULL;
      nvi->opcode = NV_OP_ADD;
   }
}

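/* Algebraic simplifications for an instruction with one immediate source
 * @s (@t denotes the other source), e.g.
 *
 *    x * 1 -> x,  x * 2 -> x + x,  x * -1 -> -x,  x * 0 -> 0,  x + 0 -> x
 *
 * RCP and RSQ of an immediate are evaluated at compile time.  If the
 * instruction decays into a MOV but also wrote flags, a CVT reading the
 * MOV's result is inserted to preserve the flags definition.
 */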
static void
constant_operand(struct nv_pc *pc,
                 struct nv_instruction *nvi, struct nv_value *val, int s)
{
   union {
      float f32;
      uint32_t u32;
      int32_t s32;
   } u;
   int t = s ? 0 : 1;
   uint op;
   ubyte type;

   if (!nvi->def[0])
      return;
   type = nvi->def[0]->reg.type;

   u.u32 = val->reg.imm.u32;
   modifiers_apply(&u.u32, type, nvi->src[s]->mod);

   switch (nvi->opcode) {
   case NV_OP_MUL:
      if ((type == NV_TYPE_F32 && u.f32 == 1.0f) ||
          (NV_TYPE_ISINT(type) && u.u32 == 1)) {
         if ((op = modifiers_opcode(nvi->src[t]->mod)) == NV_OP_NOP)
            break;
         nvi->opcode = op;
         nv_reference(pc, &nvi->src[s], NULL);
         nvi->src[0] = nvi->src[t];
         nvi->src[1] = NULL;
      } else
      if ((type == NV_TYPE_F32 && u.f32 == 2.0f) ||
          (NV_TYPE_ISINT(type) && u.u32 == 2)) {
         nvi->opcode = NV_OP_ADD;
         nv_reference(pc, &nvi->src[s], nvi->src[t]->value);
         nvi->src[s]->mod = nvi->src[t]->mod;
      } else
      if (type == NV_TYPE_F32 && u.f32 == -1.0f) {
         if (nvi->src[t]->mod & NV_MOD_NEG)
            nvi->opcode = NV_OP_MOV;
         else
            nvi->opcode = NV_OP_NEG;
         nv_reference(pc, &nvi->src[s], NULL);
         nvi->src[0] = nvi->src[t];
         nvi->src[1] = NULL;
      } else
      if (type == NV_TYPE_F32 && u.f32 == -2.0f) {
         nvi->opcode = NV_OP_ADD;
         nv_reference(pc, &nvi->src[s], nvi->src[t]->value);
         nvi->src[s]->mod = (nvi->src[t]->mod ^= NV_MOD_NEG);
      } else
      if (u.u32 == 0) {
         nvi->opcode = NV_OP_MOV;
         nv_reference(pc, &nvi->src[t], NULL);
         if (s) {
            nvi->src[0] = nvi->src[1];
            nvi->src[1] = NULL;
         }
      }
      break;
   case NV_OP_ADD:
      if (u.u32 == 0) {
         if ((op = modifiers_opcode(nvi->src[t]->mod)) == NV_OP_NOP)
            break;
         nvi->opcode = op;
         nv_reference(pc, &nvi->src[s], NULL);
         nvi->src[0] = nvi->src[t];
         nvi->src[1] = NULL;
      }
      break;
   case NV_OP_RCP:
      u.f32 = 1.0f / u.f32;
      (val = new_value(pc, NV_FILE_IMM, NV_TYPE_F32))->reg.imm.f32 = u.f32;
      nvi->opcode = NV_OP_MOV;
      assert(s == 0);
      nv_reference(pc, &nvi->src[0], val);
      break;
   case NV_OP_RSQ:
      u.f32 = 1.0f / sqrtf(u.f32);
      (val = new_value(pc, NV_FILE_IMM, NV_TYPE_F32))->reg.imm.f32 = u.f32;
      nvi->opcode = NV_OP_MOV;
      assert(s == 0);
      nv_reference(pc, &nvi->src[0], val);
      break;
   default:
      break;
   }

   if (nvi->opcode == NV_OP_MOV && nvi->flags_def) {
      struct nv_instruction *cvt = new_instruction_at(pc, nvi, NV_OP_CVT);

      nv_reference(pc, &cvt->src[0], nvi->def[0]);

      cvt->flags_def = nvi->flags_def;
      nvi->flags_def = NULL;
   }
}

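/* Perform constant folding on immediate sources, then fuse a MUL whose
 * only use is an ADD into a single MAD:
 *
 *    mul $r0 a b
 *    add $r2 $r0 c    -->    mad $r2 a b c
 *
 * A NEG modifier on the consumed MUL result is pushed into the first MAD
 * source.
 */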
static int
nv_pass_lower_arith(struct nv_pass *ctx, struct nv_basic_block *b)
{
   struct nv_instruction *nvi, *next;
   int j;

   for (nvi = b->entry; nvi; nvi = next) {
      struct nv_value *src0, *src1, *src;
      int mod;

      next = nvi->next;

      src0 = nvcg_find_immediate(nvi->src[0]);
      src1 = nvcg_find_immediate(nvi->src[1]);

      if (src0 && src1)
         constant_expression(ctx->pc, nvi, src0, src1);
      else {
         if (src0)
            constant_operand(ctx->pc, nvi, src0, 0);
         else
         if (src1)
            constant_operand(ctx->pc, nvi, src1, 1);
      }

      /* try to combine MUL, ADD into MAD */
      if (nvi->opcode != NV_OP_ADD)
         continue;

      src0 = nvi->src[0]->value;
      src1 = nvi->src[1]->value;

      if (SRC_IS_MUL(src0) && src0->refc == 1)
         src = src0;
      else
      if (SRC_IS_MUL(src1) && src1->refc == 1)
         src = src1;
      else
         continue;

      nvi->opcode = NV_OP_MAD;
      mod = nvi->src[(src == src0) ? 0 : 1]->mod;
      nv_reference(ctx->pc, &nvi->src[(src == src0) ? 0 : 1], NULL);
      nvi->src[2] = nvi->src[(src == src0) ? 1 : 0];

      assert(!(mod & ~NV_MOD_NEG));
      nvi->src[0] = new_ref(ctx->pc, src->insn->src[0]->value);
      nvi->src[1] = new_ref(ctx->pc, src->insn->src[1]->value);
      nvi->src[0]->mod = src->insn->src[0]->mod ^ mod;
      nvi->src[1]->mod = src->insn->src[1]->mod;
   }
   DESCEND_ARBITRARY(j, nv_pass_lower_arith);

   return 0;
}

/* TODO: redundant store elimination */

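/* Per-block records of already loaded values, keyed by their source
 * (memory offset or immediate bits) and bucketed by file, so repeated
 * loads of the same datum can be replaced by the first load's result.
 */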
struct load_record {
   struct load_record *next;
   uint64_t data;
   struct nv_value *value;
};

#define LOAD_RECORD_POOL_SIZE 1024

struct nv_pass_reld_elim {
   struct nv_pc *pc;

   struct load_record *imm;
   struct load_record *mem_s;
   struct load_record *mem_v;
   struct load_record *mem_c[16];
   struct load_record *mem_l;

   struct load_record pool[LOAD_RECORD_POOL_SIZE];
   int alloc;
};

/* TODO: properly handle loads from l[] memory in the presence of stores */
static int
nv_pass_reload_elim(struct nv_pass_reld_elim *ctx, struct nv_basic_block *b)
{
   struct load_record **rec, *it;
   struct nv_instruction *ld, *next;
   uint64_t data;
   struct nv_value *val;
   int j;

   for (ld = b->entry; ld; ld = next) {
      next = ld->next;
      if (!ld->src[0])
         continue;
      val = ld->src[0]->value;
      rec = NULL;

      if (ld->opcode == NV_OP_LINTERP || ld->opcode == NV_OP_PINTERP) {
         data = val->reg.id;
         rec = &ctx->mem_v;
      } else
      if (ld->opcode == NV_OP_LDA) {
         data = val->reg.id;
         if (val->reg.file >= NV_FILE_MEM_C(0) &&
             val->reg.file <= NV_FILE_MEM_C(15))
            rec = &ctx->mem_c[val->reg.file - NV_FILE_MEM_C(0)];
         else
         if (val->reg.file == NV_FILE_MEM_S)
            rec = &ctx->mem_s;
         else
         if (val->reg.file == NV_FILE_MEM_L)
            rec = &ctx->mem_l;
      } else
      if ((ld->opcode == NV_OP_MOV) && (val->reg.file == NV_FILE_IMM)) {
         data = val->reg.imm.u32;
         rec = &ctx->imm;
      }

      if (!rec || !ld->def[0]->refc)
         continue;

      for (it = *rec; it; it = it->next)
         if (it->data == data)
            break;

      if (it) {
         if (ld->def[0]->reg.id >= 0)
            it->value = ld->def[0];
         else
         if (!ld->fixed)
            nvcg_replace_value(ctx->pc, ld->def[0], it->value);
      } else {
         if (ctx->alloc == LOAD_RECORD_POOL_SIZE)
            continue;
         it = &ctx->pool[ctx->alloc++];
         it->next = *rec;
         it->data = data;
         it->value = ld->def[0];
         *rec = it;
      }
   }

   ctx->imm = NULL;
   ctx->mem_s = NULL;
   ctx->mem_v = NULL;
   for (j = 0; j < 16; ++j)
      ctx->mem_c[j] = NULL;
   ctx->mem_l = NULL;
   ctx->alloc = 0;

   DESCEND_ARBITRARY(j, nv_pass_reload_elim);

   return 0;
}

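/* For vector instructions (TEX), record which destination components are
 * actually referenced in tex_mask and move the live definitions to the
 * front of the def[] array, presumably so the emitter can fetch only
 * those components.
 */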
static int
nv_pass_tex_mask(struct nv_pass *ctx, struct nv_basic_block *b)
{
   int i, c, j;

   for (i = 0; i < ctx->pc->num_instructions; ++i) {
      struct nv_instruction *nvi = &ctx->pc->instructions[i];
      struct nv_value *def[4];

      if (!nv_is_vector_op(nvi->opcode))
         continue;
      nvi->tex_mask = 0;

      for (c = 0; c < 4; ++c) {
         if (nvi->def[c]->refc)
            nvi->tex_mask |= 1 << c;
         def[c] = nvi->def[c];
      }

      j = 0;
      for (c = 0; c < 4; ++c)
         if (nvi->tex_mask & (1 << c))
            nvi->def[j++] = def[c];
      for (c = 0; c < 4; ++c)
         if (!(nvi->tex_mask & (1 << c)))
            nvi->def[j++] = def[c];
      assert(j == 4);
   }
   return 0;
}

struct nv_pass_dce {
   struct nv_pc *pc;
   uint removed;
};

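/* Dead code elimination: delete instructions whose results are never used
 * and that have no side effects (stores, branches, joins and 'fixed'
 * instructions are kept).  The caller reruns this until nothing more is
 * removed.
 */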
static int
nv_pass_dce(struct nv_pass_dce *ctx, struct nv_basic_block *b)
{
   int j;
   struct nv_instruction *nvi, *next;

   for (nvi = b->phi ? b->phi : b->entry; nvi; nvi = next) {
      next = nvi->next;

      if (inst_cullable(nvi)) {
         nv_nvi_delete(nvi);

         ++ctx->removed;
      }
   }
   DESCEND_ARBITRARY(j, nv_pass_dce);

   return 0;
}

/* Register allocation inserted ELSE blocks for all IF/ENDIF without ELSE.
 * Returns TRUE if @bb initiates an IF/ELSE/ENDIF clause, or is an IF with
 * BREAK and dummy ELSE block.
 */
static INLINE boolean
bb_is_if_else_endif(struct nv_basic_block *bb)
{
   if (!bb->out[0] || !bb->out[1])
      return FALSE;

   if (bb->out[0]->out_kind[0] == CFG_EDGE_LOOP_LEAVE) {
      return (bb->out[0]->out[1] == bb->out[1]->out[0] &&
              !bb->out[1]->out[1]);
   } else {
      return (bb->out[0]->out[0] == bb->out[1]->out[0] &&
              !bb->out[0]->out[1] &&
              !bb->out[1]->out[1]);
   }
}

/* predicate instructions and remove branch at the end */
static void
predicate_instructions(struct nv_pc *pc, struct nv_basic_block *b,
                       struct nv_value *p, ubyte cc)
{
   struct nv_instruction *nvi;

   if (!b->entry)
      return;
   for (nvi = b->entry; nvi->next; nvi = nvi->next) {
      if (!nvi_isnop(nvi)) {
         nvi->cc = cc;
         nv_reference(pc, &nvi->flags_src, p);
      }
   }

   if (nvi->opcode == NV_OP_BRA)
      nv_nvi_delete(nvi);
   else
   if (!nvi_isnop(nvi)) {
      nvi->cc = cc;
      nv_reference(pc, &nvi->flags_src, p);
   }
}

/* NOTE: Run this after register allocation, we can just cut out the cflow
 * instructions and hook the predicates to the conditional OPs if they are
 * not using immediates; better than inserting SELECT to join definitions.
 *
 * NOTE: Should adapt prior optimization to make this possible more often.
 */
static int
nv_pass_flatten(struct nv_pass *ctx, struct nv_basic_block *b)
{
   struct nv_instruction *nvi;
   struct nv_value *pred;
   int i;
   int n0 = 0, n1 = 0;

   if (bb_is_if_else_endif(b)) {

      NV50_DBGMSG("pass_flatten: IF/ELSE/ENDIF construct at BB:%i\n", b->id);

      for (n0 = 0, nvi = b->out[0]->entry; nvi; nvi = nvi->next, ++n0)
         if (!nv50_nvi_can_predicate(nvi))
            break;
      if (!nvi) {
         for (n1 = 0, nvi = b->out[1]->entry; nvi; nvi = nvi->next, ++n1)
            if (!nv50_nvi_can_predicate(nvi))
               break;
#ifdef NV50PC_DEBUG
         if (nvi) {
            debug_printf("cannot predicate: "); nv_print_instruction(nvi);
         }
      } else {
         debug_printf("cannot predicate: "); nv_print_instruction(nvi);
#endif
      }

      if (!nvi && n0 < 12 && n1 < 12) { /* 12 as arbitrary limit */
         assert(b->exit && b->exit->flags_src);
         pred = b->exit->flags_src->value;

         predicate_instructions(ctx->pc, b->out[0], pred, NV_CC_NE | NV_CC_U);
         predicate_instructions(ctx->pc, b->out[1], pred, NV_CC_EQ);

         assert(b->exit && b->exit->opcode == NV_OP_BRA);
         nv_nvi_delete(b->exit);

         if (b->exit && b->exit->opcode == NV_OP_JOINAT)
            nv_nvi_delete(b->exit);

         i = (b->out[0]->out_kind[0] == CFG_EDGE_LOOP_LEAVE) ? 1 : 0;

         if ((nvi = b->out[0]->out[i]->entry)) {
            nvi->is_join = 0;
            if (nvi->opcode == NV_OP_JOIN)
               nv_nvi_delete(nvi);
         }
      }
   }
   DESCEND_ARBITRARY(i, nv_pass_flatten);

   return 0;
}

/* local common subexpression elimination, stupid O(n^2) implementation */
static int
nv_pass_cse(struct nv_pass *ctx, struct nv_basic_block *b)
{
   struct nv_instruction *ir, *ik, *next;
   struct nv_instruction *entry = b->phi ? b->phi : b->entry;
   int s;
   unsigned int reps;

   do {
      reps = 0;
      for (ir = entry; ir; ir = next) {
         next = ir->next;
         for (ik = entry; ik != ir; ik = ik->next) {
            if (ir->opcode != ik->opcode || ir->fixed)
               continue;

            if (!ir->def[0] || !ik->def[0] ||
                ik->opcode == NV_OP_LDA ||
                ik->opcode == NV_OP_STA ||
                ik->opcode == NV_OP_MOV ||
                nv_is_vector_op(ik->opcode))
               continue; /* ignore loads, stores & moves */

            if (ik->src[4] || ir->src[4])
               continue; /* don't mess with address registers */

            if (ik->flags_src || ir->flags_src ||
                ik->flags_def || ir->flags_def)
               continue; /* and also not with flags, for now */

            if (ik->def[0]->reg.file == NV_FILE_OUT ||
                ir->def[0]->reg.file == NV_FILE_OUT ||
                !values_equal(ik->def[0], ir->def[0]))
               continue;

            for (s = 0; s < 3; ++s) {
               struct nv_value *va, *vb;

               if (!ik->src[s]) {
                  if (ir->src[s])
                     break;
                  continue;
               }
               if (ik->src[s]->mod != ir->src[s]->mod)
                  break;
               va = ik->src[s]->value;
               vb = ir->src[s]->value;
               if (va == vb)
                  continue;
               if (va->reg.file != vb->reg.file ||
                   va->reg.id < 0 ||
                   va->reg.id != vb->reg.id)
                  break;
            }
            if (s == 3) {
               nv_nvi_delete(ir);
               ++reps;
               nvcg_replace_value(ctx->pc, ir->def[0], ik->def[0]);
               break;
            }
         }
      }
   } while (reps);

   DESCEND_ARBITRARY(s, nv_pass_cse);

   return 0;
}

static int
nv_pc_pass0(struct nv_pc *pc, struct nv_basic_block *root)
{
   struct nv_pass_reld_elim *reldelim;
   struct nv_pass pass;
   struct nv_pass_dce dce;
   int ret;

   pass.n = 0;
   pass.pc = pc;

   /* Do this first, so we don't have to pay attention
    * to whether sources are supported memory loads.
    */
   pc->pass_seq++;
   ret = nv_pass_lower_arith(&pass, root);
   if (ret)
      return ret;

   pc->pass_seq++;
   ret = nv_pass_fold_loads(&pass, root);
   if (ret)
      return ret;

   pc->pass_seq++;
   ret = nv_pass_fold_stores(&pass, root);
   if (ret)
      return ret;

   if (pc->opt_reload_elim) {
      reldelim = CALLOC_STRUCT(nv_pass_reld_elim);
      reldelim->pc = pc;
      pc->pass_seq++;
      ret = nv_pass_reload_elim(reldelim, root);
      FREE(reldelim);
      if (ret)
         return ret;
   }

   pc->pass_seq++;
   ret = nv_pass_cse(&pass, root);
   if (ret)
      return ret;

   pc->pass_seq++;
   ret = nv_pass_lower_mods(&pass, root);
   if (ret)
      return ret;

   dce.pc = pc;
   do {
      dce.removed = 0;
      pc->pass_seq++;
      ret = nv_pass_dce(&dce, root);
      if (ret)
         return ret;
   } while (dce.removed);

   ret = nv_pass_tex_mask(&pass, root);
   if (ret)
      return ret;

   return ret;
}

int
nv_pc_exec_pass0(struct nv_pc *pc)
{
   int i, ret;

   for (i = 0; i < pc->num_subroutines + 1; ++i)
      if (pc->root[i] && (ret = nv_pc_pass0(pc, pc->root[i])))
         return ret;
   return 0;
}