/*
 * Copyright © 2018 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Daniel Schürmann (daniel.schuermann@campus.tu-berlin.de)
 *
 */

#include <algorithm>
#include <math.h>

#include "aco_ir.h"
#include "util/half_float.h"
#include "util/u_math.h"

namespace aco {

#ifndef NDEBUG
void perfwarn(Program *program, bool cond, const char *msg, Instruction *instr)
{
   if (cond) {
      char *out;
      size_t outsize;
      FILE *memf = open_memstream(&out, &outsize);

      fprintf(memf, "%s: ", msg);
      aco_print_instr(instr, memf);
      fclose(memf);

      aco_perfwarn(program, out);
      free(out);

      if (debug_flags & DEBUG_PERFWARN)
         exit(1);
   }
}
#endif

/**
 * The optimizer works in 4 phases:
 * (1) The first pass collects information for each ssa-def:
 *     it propagates reg->reg operands of the same type, inline constants
 *     and neg/abs input modifiers.
 * (2) The second pass combines instructions like mad, omod and clamp, and
 *     propagates sgprs into VALU instructions.
 *     This pass depends on information collected in the first pass.
 * (3) The third pass goes backwards and selects instructions,
 *     i.e. it decides whether a mad instruction is profitable and eliminates dead code.
 * (4) The fourth pass cleans up the sequence: literals get applied and dead
 *     instructions are removed from the sequence.
 */
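
/* A sketch of the phases on a tiny example (IR syntax simplified):
 *
 *    v1: %a = v_add_f32 %x, %y
 *    v1: %t = v_mul_f32 2.0, %a
 *
 * Phase (1) labels %a as "multiplied by 2.0", i.e. an omod2 candidate.
 * Phase (2) can fold that multiplication into the add as the *2 output
 * modifier, leaving the v_mul_f32 dead. Phase (3) walks backwards to decide
 * questions like whether a mad formed in phase (2) is worth keeping, and
 * phase (4) drops dead instructions and applies literals. */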

struct mad_info {
   aco_ptr<Instruction> add_instr;
   uint32_t mul_temp_id;
   uint16_t literal_idx;
   bool check_literal;

   mad_info(aco_ptr<Instruction> instr, uint32_t id)
   : add_instr(std::move(instr)), mul_temp_id(id), check_literal(false) {}
};

enum Label {
   label_vec = 1 << 0,
   label_constant_32bit = 1 << 1,
   /* label_{abs,neg,mul,omod2,omod4,omod5,clamp} are used for both 16 and
    * 32-bit operations but this shouldn't cause any issues because we don't
    * look through any conversions */
   label_abs = 1 << 2,
   label_neg = 1 << 3,
   label_mul = 1 << 4,
   label_temp = 1 << 5,
   label_literal = 1 << 6,
   label_mad = 1 << 7,
   label_omod2 = 1 << 8,
   label_omod4 = 1 << 9,
   label_omod5 = 1 << 10,
   label_clamp = 1 << 12,
   label_undefined = 1 << 14,
   label_vcc = 1 << 15,
   label_b2f = 1 << 16,
   label_add_sub = 1 << 17,
   label_bitwise = 1 << 18,
   label_minmax = 1 << 19,
   label_vopc = 1 << 20,
   label_uniform_bool = 1 << 21,
   label_constant_64bit = 1 << 22,
   label_uniform_bitwise = 1 << 23,
   label_scc_invert = 1 << 24,
   label_vcc_hint = 1 << 25,
   label_scc_needed = 1 << 26,
   label_b2i = 1 << 27,
   label_constant_16bit = 1 << 29,
};

static constexpr uint64_t instr_usedef_labels = label_vec | label_mul | label_mad | label_add_sub |
                                                label_bitwise | label_uniform_bitwise | label_minmax | label_vopc;
static constexpr uint64_t instr_mod_labels = label_omod2 | label_omod4 | label_omod5 | label_clamp;

static constexpr uint64_t instr_labels = instr_usedef_labels | instr_mod_labels;
static constexpr uint64_t temp_labels = label_abs | label_neg | label_temp | label_vcc | label_b2f | label_uniform_bool |
                                        label_scc_invert | label_b2i;
static constexpr uint32_t val_labels = label_constant_32bit | label_constant_64bit | label_constant_16bit | label_literal;

struct ssa_info {
   uint64_t label;
   union {
      uint32_t val;
      Temp temp;
      Instruction* instr;
   };

   ssa_info() : label(0) {}

   void add_label(Label new_label)
   {
      /* Since all the instr_usedef_labels use instr for the same thing
       * (indicating the defining instruction), there is usually no need to
       * clear any other instr labels. */
      if (new_label & instr_usedef_labels)
         label &= ~(instr_mod_labels | temp_labels | val_labels); /* instr, temp and val alias */

      if (new_label & instr_mod_labels) {
         label &= ~instr_labels;
         label &= ~(temp_labels | val_labels); /* instr, temp and val alias */
      }

      if (new_label & temp_labels) {
         label &= ~temp_labels;
         label &= ~(instr_labels | val_labels); /* instr, temp and val alias */
      }

      uint32_t const_labels = label_literal | label_constant_32bit | label_constant_64bit | label_constant_16bit;
      if (new_label & const_labels) {
         label &= ~val_labels | const_labels;
         label &= ~(instr_labels | temp_labels); /* instr, temp and val alias */
      } else if (new_label & val_labels) {
         label &= ~val_labels;
         label &= ~(instr_labels | temp_labels); /* instr, temp and val alias */
      }

      label |= new_label;
   }
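
   /* A sketch of the aliasing rules above: temp, instr and val share storage,
    * so e.g. calling set_mul() after set_abs() clears label_abs before the
    * Instruction* overwrites the stored Temp. Labels within one group also
    * replace each other: set_neg() after set_abs() drops label_abs, which is
    * why set_neg_abs() below adds both labels in a single add_label() call. */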

   void set_vec(Instruction* vec)
   {
      add_label(label_vec);
      instr = vec;
   }

   bool is_vec()
   {
      return label & label_vec;
   }

   void set_constant(chip_class chip, uint64_t constant)
   {
      Operand op16((uint16_t)constant);
      Operand op32((uint32_t)constant);
      add_label(label_literal);
      val = constant;

      if (chip >= GFX8 && !op16.isLiteral())
         add_label(label_constant_16bit);

      if (!op32.isLiteral() || ((uint32_t)constant == 0x3e22f983 && chip >= GFX8))
         add_label(label_constant_32bit);

      if (constant <= 64) {
         add_label(label_constant_64bit);
      } else if (constant >= 0xFFFFFFFFFFFFFFF0) { /* [-16 .. -1] */
         add_label(label_constant_64bit);
      } else if (constant == 0x3FE0000000000000) { /* 0.5 */
         add_label(label_constant_64bit);
      } else if (constant == 0xBFE0000000000000) { /* -0.5 */
         add_label(label_constant_64bit);
      } else if (constant == 0x3FF0000000000000) { /* 1.0 */
         add_label(label_constant_64bit);
      } else if (constant == 0xBFF0000000000000) { /* -1.0 */
         add_label(label_constant_64bit);
      } else if (constant == 0x4000000000000000) { /* 2.0 */
         add_label(label_constant_64bit);
      } else if (constant == 0xC000000000000000) { /* -2.0 */
         add_label(label_constant_64bit);
      } else if (constant == 0x4010000000000000) { /* 4.0 */
         add_label(label_constant_64bit);
      } else if (constant == 0xC010000000000000) { /* -4.0 */
         add_label(label_constant_64bit);
      }

      if (label & label_constant_64bit) {
         val = Operand(constant).constantValue();
         if (val != constant)
            label &= ~(label_literal | label_constant_16bit | label_constant_32bit);
      }
   }
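
   /* For illustration, assuming GFX9: set_constant() with 0x3f800000 (1.0f)
    * sets label_constant_32bit in addition to label_literal, because 1.0 is
    * a VALU inline constant; 0x12345678 only gets label_literal, since it
    * fits no inline-constant encoding and must be emitted as a literal. */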

   bool is_constant(unsigned bits)
   {
      switch (bits) {
      case 8:
         return label & label_literal;
      case 16:
         return label & label_constant_16bit;
      case 32:
         return label & label_constant_32bit;
      case 64:
         return label & label_constant_64bit;
      }
      return false;
   }

   bool is_literal(unsigned bits)
   {
      bool is_lit = label & label_literal;
      switch (bits) {
      case 8:
         return false;
      case 16:
         return is_lit && !(label & label_constant_16bit);
      case 32:
         return is_lit && !(label & label_constant_32bit);
      case 64:
         return false;
      }
      return false;
   }

   bool is_constant_or_literal(unsigned bits)
   {
      if (bits == 64)
         return label & label_constant_64bit;
      else
         return label & label_literal;
   }

   void set_abs(Temp abs_temp)
   {
      add_label(label_abs);
      temp = abs_temp;
   }

   bool is_abs()
   {
      return label & label_abs;
   }

   void set_neg(Temp neg_temp)
   {
      add_label(label_neg);
      temp = neg_temp;
   }

   bool is_neg()
   {
      return label & label_neg;
   }

   void set_neg_abs(Temp neg_abs_temp)
   {
      add_label((Label)((uint32_t)label_abs | (uint32_t)label_neg));
      temp = neg_abs_temp;
   }

   void set_mul(Instruction* mul)
   {
      add_label(label_mul);
      instr = mul;
   }

   bool is_mul()
   {
      return label & label_mul;
   }

   void set_temp(Temp tmp)
   {
      add_label(label_temp);
      temp = tmp;
   }

   bool is_temp()
   {
      return label & label_temp;
   }

   void set_mad(Instruction* mad, uint32_t mad_info_idx)
   {
      add_label(label_mad);
      mad->pass_flags = mad_info_idx;
      instr = mad;
   }

   bool is_mad()
   {
      return label & label_mad;
   }

   void set_omod2(Instruction* mul)
   {
      add_label(label_omod2);
      instr = mul;
   }

   bool is_omod2()
   {
      return label & label_omod2;
   }

   void set_omod4(Instruction* mul)
   {
      add_label(label_omod4);
      instr = mul;
   }

   bool is_omod4()
   {
      return label & label_omod4;
   }

   void set_omod5(Instruction* mul)
   {
      add_label(label_omod5);
      instr = mul;
   }

   bool is_omod5()
   {
      return label & label_omod5;
   }

   void set_clamp(Instruction *med3)
   {
      add_label(label_clamp);
      instr = med3;
   }

   bool is_clamp()
   {
      return label & label_clamp;
   }

   void set_undefined()
   {
      add_label(label_undefined);
   }

   bool is_undefined()
   {
      return label & label_undefined;
   }

   void set_vcc(Temp vcc)
   {
      add_label(label_vcc);
      temp = vcc;
   }

   bool is_vcc()
   {
      return label & label_vcc;
   }

   void set_b2f(Temp val)
   {
      add_label(label_b2f);
      temp = val;
   }

   bool is_b2f()
   {
      return label & label_b2f;
   }

   void set_add_sub(Instruction *add_sub_instr)
   {
      add_label(label_add_sub);
      instr = add_sub_instr;
   }

   bool is_add_sub()
   {
      return label & label_add_sub;
   }

   void set_bitwise(Instruction *bitwise_instr)
   {
      add_label(label_bitwise);
      instr = bitwise_instr;
   }

   bool is_bitwise()
   {
      return label & label_bitwise;
   }

   void set_uniform_bitwise()
   {
      add_label(label_uniform_bitwise);
   }

   bool is_uniform_bitwise()
   {
      return label & label_uniform_bitwise;
   }

   void set_minmax(Instruction *minmax_instr)
   {
      add_label(label_minmax);
      instr = minmax_instr;
   }

   bool is_minmax()
   {
      return label & label_minmax;
   }

   void set_vopc(Instruction *vopc_instr)
   {
      add_label(label_vopc);
      instr = vopc_instr;
   }

   bool is_vopc()
   {
      return label & label_vopc;
   }

   void set_scc_needed()
   {
      add_label(label_scc_needed);
   }

   bool is_scc_needed()
   {
      return label & label_scc_needed;
   }

   void set_scc_invert(Temp scc_inv)
   {
      add_label(label_scc_invert);
      temp = scc_inv;
   }

   bool is_scc_invert()
   {
      return label & label_scc_invert;
   }

   void set_uniform_bool(Temp uniform_bool)
   {
      add_label(label_uniform_bool);
      temp = uniform_bool;
   }

   bool is_uniform_bool()
   {
      return label & label_uniform_bool;
   }

   void set_vcc_hint()
   {
      add_label(label_vcc_hint);
   }

   bool is_vcc_hint()
   {
      return label & label_vcc_hint;
   }

   void set_b2i(Temp val)
   {
      add_label(label_b2i);
      temp = val;
   }

   bool is_b2i()
   {
      return label & label_b2i;
   }

};

struct opt_ctx {
   Program* program;
   std::vector<aco_ptr<Instruction>> instructions;
   ssa_info* info;
   std::pair<uint32_t,Temp> last_literal;
   std::vector<mad_info> mad_infos;
   std::vector<uint16_t> uses;
};

struct CmpInfo {
   aco_opcode ordered;
   aco_opcode unordered;
   aco_opcode ordered_swapped;
   aco_opcode unordered_swapped;
   aco_opcode inverse;
   aco_opcode f32;
   unsigned size;
};

ALWAYS_INLINE bool get_cmp_info(aco_opcode op, CmpInfo *info);

bool can_swap_operands(aco_ptr<Instruction>& instr)
{
   if (instr->operands[0].isConstant() ||
       (instr->operands[0].isTemp() && instr->operands[0].getTemp().type() == RegType::sgpr))
      return false;

   switch (instr->opcode) {
   case aco_opcode::v_add_u32:
   case aco_opcode::v_add_co_u32:
   case aco_opcode::v_add_co_u32_e64:
   case aco_opcode::v_add_i32:
   case aco_opcode::v_add_f16:
   case aco_opcode::v_add_f32:
   case aco_opcode::v_mul_f16:
   case aco_opcode::v_mul_f32:
   case aco_opcode::v_or_b32:
   case aco_opcode::v_and_b32:
   case aco_opcode::v_xor_b32:
   case aco_opcode::v_max_f16:
   case aco_opcode::v_max_f32:
   case aco_opcode::v_min_f16:
   case aco_opcode::v_min_f32:
   case aco_opcode::v_max_i32:
   case aco_opcode::v_min_i32:
   case aco_opcode::v_max_u32:
   case aco_opcode::v_min_u32:
   case aco_opcode::v_max_i16:
   case aco_opcode::v_min_i16:
   case aco_opcode::v_max_u16:
   case aco_opcode::v_min_u16:
   case aco_opcode::v_max_i16_e64:
   case aco_opcode::v_min_i16_e64:
   case aco_opcode::v_max_u16_e64:
   case aco_opcode::v_min_u16_e64:
      return true;
   case aco_opcode::v_sub_f16:
      instr->opcode = aco_opcode::v_subrev_f16;
      return true;
   case aco_opcode::v_sub_f32:
      instr->opcode = aco_opcode::v_subrev_f32;
      return true;
   case aco_opcode::v_sub_co_u32:
      instr->opcode = aco_opcode::v_subrev_co_u32;
      return true;
   case aco_opcode::v_sub_u16:
      instr->opcode = aco_opcode::v_subrev_u16;
      return true;
   case aco_opcode::v_sub_u32:
      instr->opcode = aco_opcode::v_subrev_u32;
      return true;
   default: {
      CmpInfo info;
      get_cmp_info(instr->opcode, &info);
      if (info.ordered == instr->opcode) {
         instr->opcode = info.ordered_swapped;
         return true;
      }
      if (info.unordered == instr->opcode) {
         instr->opcode = info.unordered_swapped;
         return true;
      }
      return false;
   }
   }
}
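
/* For example: swapping the operands of v_sub_f32 %a, %b is legal because the
 * opcode is rewritten to v_subrev_f32, which computes operand1 - operand0;
 * comparisons are handled the same way (v_cmp_lt_f32 becomes v_cmp_gt_f32).
 * Non-commutative opcodes without such a twin simply return false. */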

bool can_use_VOP3(opt_ctx& ctx, const aco_ptr<Instruction>& instr)
{
   if (instr->isVOP3())
      return true;

   if (instr->operands.size() && instr->operands[0].isLiteral() && ctx.program->chip_class < GFX10)
      return false;

   if (instr->isDPP() || instr->isSDWA())
      return false;

   return instr->opcode != aco_opcode::v_madmk_f32 &&
          instr->opcode != aco_opcode::v_madak_f32 &&
          instr->opcode != aco_opcode::v_madmk_f16 &&
          instr->opcode != aco_opcode::v_madak_f16 &&
          instr->opcode != aco_opcode::v_fmamk_f32 &&
          instr->opcode != aco_opcode::v_fmaak_f32 &&
          instr->opcode != aco_opcode::v_fmamk_f16 &&
          instr->opcode != aco_opcode::v_fmaak_f16 &&
          instr->opcode != aco_opcode::v_readlane_b32 &&
          instr->opcode != aco_opcode::v_writelane_b32 &&
          instr->opcode != aco_opcode::v_readfirstlane_b32;
}

bool can_apply_sgprs(aco_ptr<Instruction>& instr)
{
   return instr->opcode != aco_opcode::v_readfirstlane_b32 &&
          instr->opcode != aco_opcode::v_readlane_b32 &&
          instr->opcode != aco_opcode::v_readlane_b32_e64 &&
          instr->opcode != aco_opcode::v_writelane_b32 &&
          instr->opcode != aco_opcode::v_writelane_b32_e64;
}

void to_VOP3(opt_ctx& ctx, aco_ptr<Instruction>& instr)
{
   if (instr->isVOP3())
      return;

   aco_ptr<Instruction> tmp = std::move(instr);
   Format format = asVOP3(tmp->format);
   instr.reset(create_instruction<VOP3A_instruction>(tmp->opcode, format, tmp->operands.size(), tmp->definitions.size()));
   std::copy(tmp->operands.cbegin(), tmp->operands.cend(), instr->operands.begin());
   for (unsigned i = 0; i < instr->definitions.size(); i++) {
      instr->definitions[i] = tmp->definitions[i];
      if (instr->definitions[i].isTemp()) {
         ssa_info& info = ctx.info[instr->definitions[i].tempId()];
         if (info.label & instr_usedef_labels && info.instr == tmp.get())
            info.instr = instr.get();
      }
   }
   /* we don't need to update any instr_mod_labels because they either haven't
    * been applied yet or this instruction isn't dead and so they've been ignored */
}

/* only covers special cases */
bool alu_can_accept_constant(aco_opcode opcode, unsigned operand)
{
   switch (opcode) {
   case aco_opcode::v_interp_p2_f32:
   case aco_opcode::v_mac_f32:
   case aco_opcode::v_writelane_b32:
   case aco_opcode::v_writelane_b32_e64:
   case aco_opcode::v_cndmask_b32:
      return operand != 2;
   case aco_opcode::s_addk_i32:
   case aco_opcode::s_mulk_i32:
   case aco_opcode::p_wqm:
   case aco_opcode::p_extract_vector:
   case aco_opcode::p_split_vector:
   case aco_opcode::v_readlane_b32:
   case aco_opcode::v_readlane_b32_e64:
   case aco_opcode::v_readfirstlane_b32:
      return operand != 0;
   default:
      return true;
   }
}

bool valu_can_accept_vgpr(aco_ptr<Instruction>& instr, unsigned operand)
{
   if (instr->opcode == aco_opcode::v_readlane_b32 || instr->opcode == aco_opcode::v_readlane_b32_e64 ||
       instr->opcode == aco_opcode::v_writelane_b32 || instr->opcode == aco_opcode::v_writelane_b32_e64)
      return operand != 1;
   return true;
}

/* check constant bus and literal limitations */
bool check_vop3_operands(opt_ctx& ctx, unsigned num_operands, Operand *operands)
{
   int limit = ctx.program->chip_class >= GFX10 ? 2 : 1;
   Operand literal32(s1);
   Operand literal64(s2);
   unsigned num_sgprs = 0;
   unsigned sgpr[] = {0, 0};

   for (unsigned i = 0; i < num_operands; i++) {
      Operand op = operands[i];

      if (op.hasRegClass() && op.regClass().type() == RegType::sgpr) {
         /* two reads of the same SGPR count as 1 to the limit */
         if (op.tempId() != sgpr[0] && op.tempId() != sgpr[1]) {
            if (num_sgprs < 2)
               sgpr[num_sgprs++] = op.tempId();
            limit--;
            if (limit < 0)
               return false;
         }
      } else if (op.isLiteral()) {
         if (ctx.program->chip_class < GFX10)
            return false;

         if (!literal32.isUndefined() && literal32.constantValue() != op.constantValue())
            return false;
         if (!literal64.isUndefined() && literal64.constantValue() != op.constantValue())
            return false;

         /* Any number of 32-bit literals counts as only 1 to the limit. Same
          * (but separately) for 64-bit literals. */
         if (op.size() == 1 && literal32.isUndefined()) {
            limit--;
            literal32 = op;
         } else if (op.size() == 2 && literal64.isUndefined()) {
            limit--;
            literal64 = op;
         }

         if (limit < 0)
            return false;
      }
   }

   return true;
}
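
/* A worked example: before GFX10 (limit = 1), two distinct SGPR operands
 * fail the check because they need two constant-bus slots, while reading the
 * same SGPR twice passes since repeated reads count once. On GFX10
 * (limit = 2), one SGPR plus one 32-bit literal is still acceptable, and any
 * literal at all fails on older chips. */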

bool parse_base_offset(opt_ctx &ctx, Instruction* instr, unsigned op_index, Temp *base, uint32_t *offset, bool prevent_overflow)
{
   Operand op = instr->operands[op_index];

   if (!op.isTemp())
      return false;
   Temp tmp = op.getTemp();
   if (!ctx.info[tmp.id()].is_add_sub())
      return false;

   Instruction *add_instr = ctx.info[tmp.id()].instr;

   switch (add_instr->opcode) {
   case aco_opcode::v_add_u32:
   case aco_opcode::v_add_co_u32:
   case aco_opcode::v_add_co_u32_e64:
   case aco_opcode::s_add_i32:
   case aco_opcode::s_add_u32:
      break;
   default:
      return false;
   }
   if (prevent_overflow && !add_instr->definitions[0].isNUW())
      return false;

   if (add_instr->usesModifiers())
      return false;

   for (unsigned i = 0; i < 2; i++) {
      if (add_instr->operands[i].isConstant()) {
         *offset = add_instr->operands[i].constantValue();
      } else if (add_instr->operands[i].isTemp() &&
                 ctx.info[add_instr->operands[i].tempId()].is_constant_or_literal(32)) {
         *offset = ctx.info[add_instr->operands[i].tempId()].val;
      } else {
         continue;
      }
      if (!add_instr->operands[!i].isTemp())
         continue;

      uint32_t offset2 = 0;
      if (parse_base_offset(ctx, add_instr, !i, base, &offset2, prevent_overflow)) {
         *offset += offset2;
      } else {
         *base = add_instr->operands[!i].getTemp();
      }
      return true;
   }

   return false;
}
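
/* A sketch of what this recovers: for %t = v_add_u32 %a, 16 followed by
 * %u = v_add_u32 %t, 4, calling parse_base_offset() on an operand holding %u
 * recurses through both additions and returns base = %a, offset = 20, so a
 * memory instruction can fold the constant into its immediate offset field
 * (provided the adds are NUW when prevent_overflow is set). */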

unsigned get_operand_size(aco_ptr<Instruction>& instr, unsigned index)
{
   if (instr->format == Format::PSEUDO)
      return instr->operands[index].bytes() * 8u;
   else if (instr->opcode == aco_opcode::v_mad_u64_u32 || instr->opcode == aco_opcode::v_mad_i64_i32)
      return index == 2 ? 64 : 32;
   else if (instr->isVALU() || instr->isSALU())
      return instr_info.operand_size[(int)instr->opcode];
   else
      return 0;
}

Operand get_constant_op(opt_ctx &ctx, ssa_info info, uint32_t bits)
{
   if (bits == 8)
      return Operand((uint8_t)info.val);
   if (bits == 16)
      return Operand((uint16_t)info.val);
   // TODO: this function shouldn't be needed if we store Operand instead of value.
   Operand op(info.val, bits == 64);
   if (info.is_literal(32) && info.val == 0x3e22f983 && ctx.program->chip_class >= GFX8)
      op.setFixed(PhysReg{248}); /* 1/2 PI can be an inline constant on GFX8+ */
   return op;
}

bool fixed_to_exec(Operand op)
{
   return op.isFixed() && op.physReg() == exec;
}

void label_instruction(opt_ctx &ctx, Block& block, aco_ptr<Instruction>& instr)
{
   if (instr->isSALU() || instr->isVALU() || instr->format == Format::PSEUDO) {
      ASSERTED bool all_const = !instr->operands.empty();
      for (Operand& op : instr->operands)
         all_const = all_const && (!op.isTemp() || ctx.info[op.tempId()].is_constant_or_literal(32));
      perfwarn(ctx.program, all_const, "All instruction operands are constant", instr.get());
   }

   for (unsigned i = 0; i < instr->operands.size(); i++)
   {
      if (!instr->operands[i].isTemp())
         continue;

      ssa_info info = ctx.info[instr->operands[i].tempId()];
      /* propagate undef */
      if (info.is_undefined() && is_phi(instr))
         instr->operands[i] = Operand(instr->operands[i].regClass());
      /* propagate reg->reg of same type */
      if (info.is_temp() && info.temp.regClass() == instr->operands[i].getTemp().regClass()) {
         instr->operands[i].setTemp(ctx.info[instr->operands[i].tempId()].temp);
         info = ctx.info[info.temp.id()];
      }

      /* SALU / PSEUDO: propagate inline constants */
      if (instr->isSALU() || instr->format == Format::PSEUDO) {
         bool is_subdword = false;
         // TODO: optimize SGPR propagation for subdword pseudo instructions on gfx9+
         if (instr->format == Format::PSEUDO) {
            is_subdword = std::any_of(instr->definitions.begin(), instr->definitions.end(),
                                      [] (const Definition& def) { return def.regClass().is_subdword();});
            is_subdword = is_subdword || std::any_of(instr->operands.begin(), instr->operands.end(),
                                                     [] (const Operand& op) { return op.hasRegClass() && op.regClass().is_subdword();});
            if (is_subdword && ctx.program->chip_class < GFX9)
               continue;
         }

         if (info.is_temp() && info.temp.type() == RegType::sgpr) {
            instr->operands[i].setTemp(info.temp);
            info = ctx.info[info.temp.id()];
         } else if (info.is_temp() && info.temp.type() == RegType::vgpr) {
            /* propagate vgpr if it can take it */
            switch (instr->opcode) {
            case aco_opcode::p_create_vector:
            case aco_opcode::p_split_vector:
            case aco_opcode::p_extract_vector:
            case aco_opcode::p_phi: {
               const bool all_vgpr = std::none_of(instr->definitions.begin(), instr->definitions.end(),
                                                  [] (const Definition& def) { return def.getTemp().type() != RegType::vgpr;});
               if (all_vgpr) {
                  instr->operands[i] = Operand(info.temp);
                  info = ctx.info[info.temp.id()];
               }
               break;
            }
            default:
               break;
            }
         }
         unsigned bits = get_operand_size(instr, i);
         if ((info.is_constant(bits) || (!is_subdword && info.is_literal(bits) && instr->format == Format::PSEUDO)) &&
             !instr->operands[i].isFixed() && alu_can_accept_constant(instr->opcode, i)) {
            instr->operands[i] = get_constant_op(ctx, info, bits);
            continue;
         }
      }

      /* VALU: propagate neg, abs & inline constants */
      else if (instr->isVALU()) {
         if (info.is_temp() && info.temp.type() == RegType::vgpr && valu_can_accept_vgpr(instr, i)) {
            instr->operands[i].setTemp(info.temp);
            info = ctx.info[info.temp.id()];
         }

         /* for instructions other than v_cndmask_b32, the size of the instruction should match the operand size */
         bool can_use_mod = instr->opcode != aco_opcode::v_cndmask_b32 || instr->operands[i].getTemp().bytes() == 4;
         can_use_mod = can_use_mod && instr_info.can_use_input_modifiers[(int)instr->opcode];

         if (info.is_abs() && (can_use_VOP3(ctx, instr) || instr->isDPP()) && can_use_mod) {
            if (!instr->isDPP())
               to_VOP3(ctx, instr);
            instr->operands[i] = Operand(info.temp);
            if (instr->isDPP())
               static_cast<DPP_instruction*>(instr.get())->abs[i] = true;
            else
               static_cast<VOP3A_instruction*>(instr.get())->abs[i] = true;
         }
         if (info.is_neg() && instr->opcode == aco_opcode::v_add_f32) {
            instr->opcode = i ? aco_opcode::v_sub_f32 : aco_opcode::v_subrev_f32;
            instr->operands[i].setTemp(info.temp);
            continue;
         } else if (info.is_neg() && instr->opcode == aco_opcode::v_add_f16) {
            instr->opcode = i ? aco_opcode::v_sub_f16 : aco_opcode::v_subrev_f16;
            instr->operands[i].setTemp(info.temp);
            continue;
         } else if (info.is_neg() && (can_use_VOP3(ctx, instr) || instr->isDPP()) && can_use_mod) {
            if (!instr->isDPP())
               to_VOP3(ctx, instr);
            instr->operands[i].setTemp(info.temp);
            if (instr->isDPP())
               static_cast<DPP_instruction*>(instr.get())->neg[i] = true;
            else
               static_cast<VOP3A_instruction*>(instr.get())->neg[i] = true;
            continue;
         }
         unsigned bits = get_operand_size(instr, i);
         if (info.is_constant(bits) && alu_can_accept_constant(instr->opcode, i)) {
            Operand op = get_constant_op(ctx, info, bits);
            perfwarn(ctx.program, instr->opcode == aco_opcode::v_cndmask_b32 && i == 2, "v_cndmask_b32 with a constant selector", instr.get());
            if (i == 0 || instr->opcode == aco_opcode::v_readlane_b32 || instr->opcode == aco_opcode::v_writelane_b32) {
               instr->operands[i] = op;
               continue;
            } else if (!instr->isVOP3() && can_swap_operands(instr)) {
               instr->operands[i] = instr->operands[0];
               instr->operands[0] = op;
               continue;
            } else if (can_use_VOP3(ctx, instr)) {
               to_VOP3(ctx, instr);
               instr->operands[i] = op;
               continue;
            }
         }
      }

      /* MUBUF: propagate constants and combine additions */
      else if (instr->format == Format::MUBUF) {
         MUBUF_instruction *mubuf = static_cast<MUBUF_instruction *>(instr.get());
         Temp base;
         uint32_t offset;
         while (info.is_temp())
            info = ctx.info[info.temp.id()];

         /* According to AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(), vaddr
          * overflow for scratch accesses works only on GFX9+ and saddr overflow
          * never works. Since swizzling is the only thing that separates
          * scratch accesses from other accesses, and swizzling changes how
          * addressing works significantly, this probably applies to all
          * swizzled MUBUF accesses. */
         bool vaddr_prevent_overflow = mubuf->swizzled && ctx.program->chip_class < GFX9;
         bool saddr_prevent_overflow = mubuf->swizzled;

         if (mubuf->offen && i == 1 && info.is_constant_or_literal(32) && mubuf->offset + info.val < 4096) {
            assert(!mubuf->idxen);
            instr->operands[1] = Operand(v1);
            mubuf->offset += info.val;
            mubuf->offen = false;
            continue;
         } else if (i == 2 && info.is_constant_or_literal(32) && mubuf->offset + info.val < 4096) {
            instr->operands[2] = Operand((uint32_t) 0);
            mubuf->offset += info.val;
            continue;
         } else if (mubuf->offen && i == 1 && parse_base_offset(ctx, instr.get(), i, &base, &offset, vaddr_prevent_overflow) &&
                    base.regClass() == v1 && mubuf->offset + offset < 4096) {
            assert(!mubuf->idxen);
            instr->operands[1].setTemp(base);
            mubuf->offset += offset;
            continue;
         } else if (i == 2 && parse_base_offset(ctx, instr.get(), i, &base, &offset, saddr_prevent_overflow) &&
                    base.regClass() == s1 && mubuf->offset + offset < 4096) {
            instr->operands[i].setTemp(base);
            mubuf->offset += offset;
            continue;
         }
      }

      /* DS: combine additions */
      else if (instr->format == Format::DS) {

         DS_instruction *ds = static_cast<DS_instruction *>(instr.get());
         Temp base;
         uint32_t offset;
         bool has_usable_ds_offset = ctx.program->chip_class >= GFX7;
         if (has_usable_ds_offset &&
             i == 0 && parse_base_offset(ctx, instr.get(), i, &base, &offset, false) &&
             base.regClass() == instr->operands[i].regClass() &&
             instr->opcode != aco_opcode::ds_swizzle_b32) {
            if (instr->opcode == aco_opcode::ds_write2_b32 || instr->opcode == aco_opcode::ds_read2_b32 ||
                instr->opcode == aco_opcode::ds_write2_b64 || instr->opcode == aco_opcode::ds_read2_b64) {
               unsigned mask = (instr->opcode == aco_opcode::ds_write2_b64 || instr->opcode == aco_opcode::ds_read2_b64) ? 0x7 : 0x3;
               unsigned shifts = (instr->opcode == aco_opcode::ds_write2_b64 || instr->opcode == aco_opcode::ds_read2_b64) ? 3 : 2;

               if ((offset & mask) == 0 &&
                   ds->offset0 + (offset >> shifts) <= 255 &&
                   ds->offset1 + (offset >> shifts) <= 255) {
                  instr->operands[i].setTemp(base);
                  ds->offset0 += offset >> shifts;
                  ds->offset1 += offset >> shifts;
               }
            } else {
               if (ds->offset0 + offset <= 65535) {
                  instr->operands[i].setTemp(base);
                  ds->offset0 += offset;
               }
            }
         }
      }
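
      /* For instance, %addr = v_add_u32 %base, 8 feeding ds_read2_b32 with
       * offset0=0, offset1=1 can drop the add: 8 is dword-aligned, so
       * offset0 becomes 2 and offset1 becomes 3 (these fields count in
       * elements, hence the shift by 2 for the b32 and 3 for the b64 forms). */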

      /* SMEM: propagate constants and combine additions */
      else if (instr->format == Format::SMEM) {

         SMEM_instruction *smem = static_cast<SMEM_instruction *>(instr.get());
         Temp base;
         uint32_t offset;
         bool prevent_overflow = smem->operands[0].size() > 2 || smem->prevent_overflow;
         if (i == 1 && info.is_constant_or_literal(32) &&
             ((ctx.program->chip_class == GFX6 && info.val <= 0x3FF) ||
              (ctx.program->chip_class == GFX7 && info.val <= 0xFFFFFFFF) ||
              (ctx.program->chip_class >= GFX8 && info.val <= 0xFFFFF))) {
            instr->operands[i] = Operand(info.val);
            continue;
         } else if (i == 1 && parse_base_offset(ctx, instr.get(), i, &base, &offset, prevent_overflow) &&
                    base.regClass() == s1 && offset <= 0xFFFFF && ctx.program->chip_class >= GFX9) {
            bool soe = smem->operands.size() >= (!smem->definitions.empty() ? 3 : 4);
            if (soe &&
                (!ctx.info[smem->operands.back().tempId()].is_constant_or_literal(32) ||
                 ctx.info[smem->operands.back().tempId()].val != 0)) {
               continue;
            }
            if (soe) {
               smem->operands[1] = Operand(offset);
               smem->operands.back() = Operand(base);
            } else {
               SMEM_instruction *new_instr = create_instruction<SMEM_instruction>(smem->opcode, Format::SMEM, smem->operands.size() + 1, smem->definitions.size());
               new_instr->operands[0] = smem->operands[0];
               new_instr->operands[1] = Operand(offset);
               if (smem->definitions.empty())
                  new_instr->operands[2] = smem->operands[2];
               new_instr->operands.back() = Operand(base);
               if (!smem->definitions.empty())
                  new_instr->definitions[0] = smem->definitions[0];
               new_instr->sync = smem->sync;
               new_instr->glc = smem->glc;
               new_instr->dlc = smem->dlc;
               new_instr->nv = smem->nv;
               new_instr->disable_wqm = smem->disable_wqm;
               instr.reset(new_instr);
               smem = static_cast<SMEM_instruction *>(instr.get());
            }
            continue;
         }
      }

      else if (instr->format == Format::PSEUDO_BRANCH) {
         if (ctx.info[instr->operands[0].tempId()].is_scc_invert()) {
            /* Flip the branch instruction to get rid of the scc_invert instruction */
            instr->opcode = instr->opcode == aco_opcode::p_cbranch_z ? aco_opcode::p_cbranch_nz : aco_opcode::p_cbranch_z;
            instr->operands[0].setTemp(ctx.info[instr->operands[0].tempId()].temp);
         }
      }
   }

   /* if this instruction doesn't define anything, return */
   if (instr->definitions.empty())
      return;

   if ((uint16_t) instr->format & (uint16_t) Format::VOPC) {
      ctx.info[instr->definitions[0].tempId()].set_vopc(instr.get());
      return;
   }

   switch (instr->opcode) {
   case aco_opcode::p_create_vector: {
      bool copy_prop = instr->operands.size() == 1 && instr->operands[0].isTemp() &&
                       instr->operands[0].regClass() == instr->definitions[0].regClass();
      if (copy_prop) {
         ctx.info[instr->definitions[0].tempId()].set_temp(instr->operands[0].getTemp());
         break;
      }

      unsigned num_ops = instr->operands.size();
      for (const Operand& op : instr->operands) {
         if (op.isTemp() && ctx.info[op.tempId()].is_vec())
            num_ops += ctx.info[op.tempId()].instr->operands.size() - 1;
      }
      if (num_ops != instr->operands.size()) {
         aco_ptr<Instruction> old_vec = std::move(instr);
         instr.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, num_ops, 1));
         instr->definitions[0] = old_vec->definitions[0];
         unsigned k = 0;
         for (Operand& old_op : old_vec->operands) {
            if (old_op.isTemp() && ctx.info[old_op.tempId()].is_vec()) {
               for (unsigned j = 0; j < ctx.info[old_op.tempId()].instr->operands.size(); j++) {
                  Operand op = ctx.info[old_op.tempId()].instr->operands[j];
                  if (op.isTemp() && ctx.info[op.tempId()].is_temp() &&
                      ctx.info[op.tempId()].temp.type() == instr->definitions[0].regClass().type())
                     op.setTemp(ctx.info[op.tempId()].temp);
                  instr->operands[k++] = op;
               }
            } else {
               instr->operands[k++] = old_op;
            }
         }
         assert(k == num_ops);
      }

      ctx.info[instr->definitions[0].tempId()].set_vec(instr.get());
      break;
   }
   case aco_opcode::p_split_vector: {
      ssa_info& info = ctx.info[instr->operands[0].tempId()];

      if (info.is_constant_or_literal(32)) {
         uint32_t val = info.val;
         for (Definition def : instr->definitions) {
            uint32_t mask = u_bit_consecutive(0, def.bytes() * 8u);
            ctx.info[def.tempId()].set_constant(ctx.program->chip_class, val & mask);
            val >>= def.bytes() * 8u;
         }
         break;
      } else if (!info.is_vec()) {
         break;
      }

      Instruction* vec = ctx.info[instr->operands[0].tempId()].instr;
      unsigned split_offset = 0;
      unsigned vec_offset = 0;
      unsigned vec_index = 0;
      for (unsigned i = 0; i < instr->definitions.size(); split_offset += instr->definitions[i++].bytes()) {
         while (vec_offset < split_offset && vec_index < vec->operands.size())
            vec_offset += vec->operands[vec_index++].bytes();

         if (vec_offset != split_offset || vec->operands[vec_index].bytes() != instr->definitions[i].bytes())
            continue;

         Operand vec_op = vec->operands[vec_index];
         if (vec_op.isConstant()) {
            ctx.info[instr->definitions[i].tempId()].set_constant(ctx.program->chip_class, vec_op.constantValue64());
         } else if (vec_op.isUndefined()) {
            ctx.info[instr->definitions[i].tempId()].set_undefined();
         } else {
            assert(vec_op.isTemp());
            ctx.info[instr->definitions[i].tempId()].set_temp(vec_op.getTemp());
         }
      }
      break;
   }
   case aco_opcode::p_extract_vector: { /* mov */
      ssa_info& info = ctx.info[instr->operands[0].tempId()];
      const unsigned index = instr->operands[1].constantValue();
      const unsigned dst_offset = index * instr->definitions[0].bytes();

      if (info.is_constant_or_literal(32)) {
         uint32_t mask = u_bit_consecutive(0, instr->definitions[0].bytes() * 8u);
         ctx.info[instr->definitions[0].tempId()].set_constant(ctx.program->chip_class, (info.val >> (dst_offset * 8u)) & mask);
         break;
      } else if (!info.is_vec()) {
         break;
      }

      /* check if we index directly into a vector element */
      Instruction* vec = info.instr;
      unsigned offset = 0;

      for (const Operand& op : vec->operands) {
         if (offset < dst_offset) {
            offset += op.bytes();
            continue;
         } else if (offset != dst_offset || op.bytes() != instr->definitions[0].bytes()) {
            break;
         }

         /* convert this extract into a copy instruction */
         instr->opcode = aco_opcode::p_parallelcopy;
         instr->operands.pop_back();
         instr->operands[0] = op;

         if (op.isConstant()) {
            ctx.info[instr->definitions[0].tempId()].set_constant(ctx.program->chip_class, op.constantValue64());
         } else if (op.isUndefined()) {
            ctx.info[instr->definitions[0].tempId()].set_undefined();
         } else {
            assert(op.isTemp());
            ctx.info[instr->definitions[0].tempId()].set_temp(op.getTemp());
         }
         break;
      }
      break;
   }
   case aco_opcode::s_mov_b32: /* propagate */
   case aco_opcode::s_mov_b64:
   case aco_opcode::v_mov_b32:
   case aco_opcode::p_as_uniform:
      if (instr->definitions[0].isFixed()) {
         /* don't copy-propagate copies into fixed registers */
      } else if (instr->usesModifiers()) {
         // TODO
      } else if (instr->operands[0].isConstant()) {
         ctx.info[instr->definitions[0].tempId()].set_constant(ctx.program->chip_class, instr->operands[0].constantValue64());
      } else if (instr->operands[0].isTemp()) {
         ctx.info[instr->definitions[0].tempId()].set_temp(instr->operands[0].getTemp());
      } else {
         assert(instr->operands[0].isFixed());
      }
      break;
   case aco_opcode::p_is_helper:
      if (!ctx.program->needs_wqm)
         ctx.info[instr->definitions[0].tempId()].set_constant(ctx.program->chip_class, 0u);
      break;
   case aco_opcode::s_movk_i32: {
      uint32_t v = static_cast<SOPK_instruction*>(instr.get())->imm;
      v = v & 0x8000 ? (v | 0xffff0000) : v;
      ctx.info[instr->definitions[0].tempId()].set_constant(ctx.program->chip_class, v);
      break;
   }
   case aco_opcode::v_bfrev_b32:
   case aco_opcode::s_brev_b32: {
      if (instr->operands[0].isConstant()) {
         uint32_t v = util_bitreverse(instr->operands[0].constantValue());
         ctx.info[instr->definitions[0].tempId()].set_constant(ctx.program->chip_class, v);
      }
      break;
   }
   case aco_opcode::s_bfm_b32: {
      if (instr->operands[0].isConstant() && instr->operands[1].isConstant()) {
         unsigned size = instr->operands[0].constantValue() & 0x1f;
         unsigned start = instr->operands[1].constantValue() & 0x1f;
         uint32_t v = ((1u << size) - 1u) << start;
         ctx.info[instr->definitions[0].tempId()].set_constant(ctx.program->chip_class, v);
      }
      break;
   }
   case aco_opcode::v_mul_f16:
   case aco_opcode::v_mul_f32: { /* omod */
      ctx.info[instr->definitions[0].tempId()].set_mul(instr.get());

      /* TODO: try to move the negate/abs modifier to the consumer instead */
      if (instr->usesModifiers())
         break;

      bool fp16 = instr->opcode == aco_opcode::v_mul_f16;

      for (unsigned i = 0; i < 2; i++) {
         if (instr->operands[!i].isConstant() && instr->operands[i].isTemp()) {
            if (instr->operands[!i].constantValue() == (fp16 ? 0x4000 : 0x40000000)) { /* 2.0 */
               ctx.info[instr->operands[i].tempId()].set_omod2(instr.get());
            } else if (instr->operands[!i].constantValue() == (fp16 ? 0x4400 : 0x40800000)) { /* 4.0 */
               ctx.info[instr->operands[i].tempId()].set_omod4(instr.get());
            } else if (instr->operands[!i].constantValue() == (fp16 ? 0x3800 : 0x3f000000)) { /* 0.5 */
               ctx.info[instr->operands[i].tempId()].set_omod5(instr.get());
            } else if (instr->operands[!i].constantValue() == (fp16 ? 0x3c00 : 0x3f800000) &&
                       !(fp16 ? block.fp_mode.must_flush_denorms16_64 : block.fp_mode.must_flush_denorms32)) { /* 1.0 */
               ctx.info[instr->definitions[0].tempId()].set_temp(instr->operands[i].getTemp());
            } else {
               continue;
            }
            break;
         }
      }
      break;
   }
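   /* To make the omod labels concrete: v_mul_f32 4.0, %x marks %x as an
    * omod4 candidate, i.e. the instruction defining %x may later absorb the
    * multiplication as the *4 output modifier; multiplication by 1.0 is
    * instead folded away entirely (unless denormals must be flushed). */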
   case aco_opcode::v_and_b32: { /* abs */
      if (!instr->usesModifiers() && instr->operands[1].isTemp() &&
          instr->operands[1].getTemp().type() == RegType::vgpr &&
          ((instr->definitions[0].bytes() == 4 && instr->operands[0].constantEquals(0x7FFFFFFFu)) ||
           (instr->definitions[0].bytes() == 2 && instr->operands[0].constantEquals(0x7FFFu))))
         ctx.info[instr->definitions[0].tempId()].set_abs(instr->operands[1].getTemp());
      else
         ctx.info[instr->definitions[0].tempId()].set_bitwise(instr.get());
      break;
   }
   case aco_opcode::v_xor_b32: { /* neg */
      if (!instr->usesModifiers() && instr->operands[1].isTemp() &&
          ((instr->definitions[0].bytes() == 4 && instr->operands[0].constantEquals(0x80000000u)) ||
           (instr->definitions[0].bytes() == 2 && instr->operands[0].constantEquals(0x8000u)))) {
         if (ctx.info[instr->operands[1].tempId()].is_neg()) {
            ctx.info[instr->definitions[0].tempId()].set_temp(ctx.info[instr->operands[1].tempId()].temp);
         } else if (instr->operands[1].getTemp().type() == RegType::vgpr) {
            if (ctx.info[instr->operands[1].tempId()].is_abs()) { /* neg(abs(x)) */
               instr->operands[1].setTemp(ctx.info[instr->operands[1].tempId()].temp);
               instr->opcode = aco_opcode::v_or_b32;
               ctx.info[instr->definitions[0].tempId()].set_neg_abs(instr->operands[1].getTemp());
            } else {
               ctx.info[instr->definitions[0].tempId()].set_neg(instr->operands[1].getTemp());
            }
         }
      } else {
         ctx.info[instr->definitions[0].tempId()].set_bitwise(instr.get());
      }
      break;
   }
   case aco_opcode::v_med3_f16:
   case aco_opcode::v_med3_f32: { /* clamp */
      VOP3A_instruction* vop3 = static_cast<VOP3A_instruction*>(instr.get());
      if (vop3->abs[0] || vop3->abs[1] || vop3->abs[2] ||
          vop3->neg[0] || vop3->neg[1] || vop3->neg[2] ||
          vop3->omod != 0 || vop3->opsel != 0)
         break;

      unsigned idx = 0;
      bool found_zero = false, found_one = false;
      bool is_fp16 = instr->opcode == aco_opcode::v_med3_f16;
      for (unsigned i = 0; i < 3; i++)
      {
         if (instr->operands[i].constantEquals(0))
            found_zero = true;
         else if (instr->operands[i].constantEquals(is_fp16 ? 0x3c00 : 0x3f800000)) /* 1.0 */
            found_one = true;
         else
            idx = i;
      }
      if (found_zero && found_one && instr->operands[idx].isTemp())
         ctx.info[instr->operands[idx].tempId()].set_clamp(instr.get());
      break;
   }
   case aco_opcode::v_cndmask_b32:
      if (instr->operands[0].constantEquals(0) &&
          instr->operands[1].constantEquals(0xFFFFFFFF))
         ctx.info[instr->definitions[0].tempId()].set_vcc(instr->operands[2].getTemp());
      else if (instr->operands[0].constantEquals(0) &&
               instr->operands[1].constantEquals(0x3f800000u))
         ctx.info[instr->definitions[0].tempId()].set_b2f(instr->operands[2].getTemp());
      else if (instr->operands[0].constantEquals(0) &&
               instr->operands[1].constantEquals(1))
         ctx.info[instr->definitions[0].tempId()].set_b2i(instr->operands[2].getTemp());

      ctx.info[instr->operands[2].tempId()].set_vcc_hint();
      break;
   case aco_opcode::v_cmp_lg_u32:
      if (instr->format == Format::VOPC && /* don't optimize VOP3 / SDWA / DPP */
          instr->operands[0].constantEquals(0) &&
          instr->operands[1].isTemp() && ctx.info[instr->operands[1].tempId()].is_vcc())
         ctx.info[instr->definitions[0].tempId()].set_temp(ctx.info[instr->operands[1].tempId()].temp);
      break;
   case aco_opcode::p_phi:
   case aco_opcode::p_linear_phi: {
      /* lower_bool_phis() can create phis like this */
      bool all_same_temp = instr->operands[0].isTemp();
      /* this check is needed when moving uniform loop counters out of a divergent loop */
      if (all_same_temp)
         all_same_temp = instr->definitions[0].regClass() == instr->operands[0].regClass();
      for (unsigned i = 1; all_same_temp && (i < instr->operands.size()); i++) {
         if (!instr->operands[i].isTemp() || instr->operands[i].tempId() != instr->operands[0].tempId())
            all_same_temp = false;
      }
      if (all_same_temp) {
         ctx.info[instr->definitions[0].tempId()].set_temp(instr->operands[0].getTemp());
      } else {
         bool all_undef = instr->operands[0].isUndefined();
         for (unsigned i = 1; all_undef && (i < instr->operands.size()); i++) {
            if (!instr->operands[i].isUndefined())
               all_undef = false;
         }
         if (all_undef)
            ctx.info[instr->definitions[0].tempId()].set_undefined();
      }
      break;
   }
   case aco_opcode::v_add_u32:
   case aco_opcode::v_add_co_u32:
   case aco_opcode::v_add_co_u32_e64:
   case aco_opcode::s_add_i32:
   case aco_opcode::s_add_u32:
      ctx.info[instr->definitions[0].tempId()].set_add_sub(instr.get());
      break;
   case aco_opcode::s_not_b32:
   case aco_opcode::s_not_b64:
      if (ctx.info[instr->operands[0].tempId()].is_uniform_bool()) {
         ctx.info[instr->definitions[0].tempId()].set_uniform_bitwise();
         ctx.info[instr->definitions[1].tempId()].set_scc_invert(ctx.info[instr->operands[0].tempId()].temp);
      } else if (ctx.info[instr->operands[0].tempId()].is_uniform_bitwise()) {
         ctx.info[instr->definitions[0].tempId()].set_uniform_bitwise();
         ctx.info[instr->definitions[1].tempId()].set_scc_invert(ctx.info[instr->operands[0].tempId()].instr->definitions[1].getTemp());
      }
      ctx.info[instr->definitions[0].tempId()].set_bitwise(instr.get());
      break;
   case aco_opcode::s_and_b32:
   case aco_opcode::s_and_b64:
      if (fixed_to_exec(instr->operands[1]) && instr->operands[0].isTemp()) {
         if (ctx.info[instr->operands[0].tempId()].is_uniform_bool()) {
            /* Try to get rid of the superfluous s_cselect + s_and_b64 that comes from turning a uniform bool into divergent */
            ctx.info[instr->definitions[1].tempId()].set_temp(ctx.info[instr->operands[0].tempId()].temp);
            ctx.info[instr->definitions[0].tempId()].set_uniform_bool(ctx.info[instr->operands[0].tempId()].temp);
            break;
         } else if (ctx.info[instr->operands[0].tempId()].is_uniform_bitwise()) {
            /* Try to get rid of the superfluous s_and_b64, since the uniform bitwise instruction already produces the same SCC */
            ctx.info[instr->definitions[1].tempId()].set_temp(ctx.info[instr->operands[0].tempId()].instr->definitions[1].getTemp());
            ctx.info[instr->definitions[0].tempId()].set_uniform_bool(ctx.info[instr->operands[0].tempId()].instr->definitions[1].getTemp());
            break;
         } else if (ctx.info[instr->operands[0].tempId()].is_vopc()) {
            Instruction* vopc_instr = ctx.info[instr->operands[0].tempId()].instr;
            /* Remove superfluous s_and when the VOPC instruction uses the same exec and thus already produces the same result */
            if (vopc_instr->pass_flags == instr->pass_flags) {
               assert(instr->pass_flags > 0);
               ctx.info[instr->definitions[0].tempId()].set_temp(vopc_instr->definitions[0].getTemp());
               break;
            }
         }
      }
      /* fallthrough */
   case aco_opcode::s_or_b32:
   case aco_opcode::s_or_b64:
   case aco_opcode::s_xor_b32:
   case aco_opcode::s_xor_b64:
      if (std::all_of(instr->operands.begin(), instr->operands.end(), [&ctx](const Operand& op) {
             return op.isTemp() && (ctx.info[op.tempId()].is_uniform_bool() || ctx.info[op.tempId()].is_uniform_bitwise());
          })) {
         ctx.info[instr->definitions[0].tempId()].set_uniform_bitwise();
      }
      /* fallthrough */
   case aco_opcode::s_lshl_b32:
   case aco_opcode::v_or_b32:
   case aco_opcode::v_lshlrev_b32:
      ctx.info[instr->definitions[0].tempId()].set_bitwise(instr.get());
      break;
   case aco_opcode::v_min_f32:
   case aco_opcode::v_min_f16:
   case aco_opcode::v_min_u32:
   case aco_opcode::v_min_i32:
   case aco_opcode::v_min_u16:
   case aco_opcode::v_min_i16:
   case aco_opcode::v_max_f32:
   case aco_opcode::v_max_f16:
   case aco_opcode::v_max_u32:
   case aco_opcode::v_max_i32:
   case aco_opcode::v_max_u16:
   case aco_opcode::v_max_i16:
      ctx.info[instr->definitions[0].tempId()].set_minmax(instr.get());
      break;
   case aco_opcode::s_cselect_b64:
   case aco_opcode::s_cselect_b32:
      if (instr->operands[0].constantEquals((unsigned) -1) &&
          instr->operands[1].constantEquals(0)) {
         /* Found a cselect that operates on a uniform bool that comes from eg. s_cmp */
         ctx.info[instr->definitions[0].tempId()].set_uniform_bool(instr->operands[2].getTemp());
      }
      if (instr->operands[2].isTemp() && ctx.info[instr->operands[2].tempId()].is_scc_invert()) {
         /* Flip the operands to get rid of the scc_invert instruction */
         std::swap(instr->operands[0], instr->operands[1]);
         instr->operands[2].setTemp(ctx.info[instr->operands[2].tempId()].temp);
      }
      break;
   case aco_opcode::p_wqm:
      if (instr->operands[0].isTemp() &&
          ctx.info[instr->operands[0].tempId()].is_scc_invert()) {
         ctx.info[instr->definitions[0].tempId()].set_temp(instr->operands[0].getTemp());
      }
      break;
   default:
      break;
   }
}

ALWAYS_INLINE bool get_cmp_info(aco_opcode op, CmpInfo *info)
{
   info->ordered = aco_opcode::num_opcodes;
   info->unordered = aco_opcode::num_opcodes;
   info->ordered_swapped = aco_opcode::num_opcodes;
   info->unordered_swapped = aco_opcode::num_opcodes;
   switch (op) {
   #define CMP2(ord, unord, ord_swap, unord_swap, sz) \
   case aco_opcode::v_cmp_##ord##_f##sz:\
   case aco_opcode::v_cmp_n##unord##_f##sz:\
      info->ordered = aco_opcode::v_cmp_##ord##_f##sz;\
      info->unordered = aco_opcode::v_cmp_n##unord##_f##sz;\
      info->ordered_swapped = aco_opcode::v_cmp_##ord_swap##_f##sz;\
      info->unordered_swapped = aco_opcode::v_cmp_n##unord_swap##_f##sz;\
      info->inverse = op == aco_opcode::v_cmp_n##unord##_f##sz ? aco_opcode::v_cmp_##unord##_f##sz : aco_opcode::v_cmp_n##ord##_f##sz;\
      info->f32 = op == aco_opcode::v_cmp_##ord##_f##sz ? aco_opcode::v_cmp_##ord##_f32 : aco_opcode::v_cmp_n##unord##_f32;\
      info->size = sz;\
      return true;
   #define CMP(ord, unord, ord_swap, unord_swap) \
   CMP2(ord, unord, ord_swap, unord_swap, 16)\
   CMP2(ord, unord, ord_swap, unord_swap, 32)\
   CMP2(ord, unord, ord_swap, unord_swap, 64)
   CMP(lt, /*n*/ge, gt, /*n*/le)
   CMP(eq, /*n*/lg, eq, /*n*/lg)
   CMP(le, /*n*/gt, ge, /*n*/lt)
   CMP(gt, /*n*/le, lt, /*n*/ge)
   CMP(lg, /*n*/eq, lg, /*n*/eq)
   CMP(ge, /*n*/lt, le, /*n*/gt)
   #undef CMP
   #undef CMP2
   #define ORD_TEST(sz) \
   case aco_opcode::v_cmp_u_f##sz:\
      info->f32 = aco_opcode::v_cmp_u_f32;\
      info->inverse = aco_opcode::v_cmp_o_f##sz;\
      info->size = sz;\
      return true;\
   case aco_opcode::v_cmp_o_f##sz:\
      info->f32 = aco_opcode::v_cmp_o_f32;\
      info->inverse = aco_opcode::v_cmp_u_f##sz;\
      info->size = sz;\
      return true;
   ORD_TEST(16)
   ORD_TEST(32)
   ORD_TEST(64)
   #undef ORD_TEST
   default:
      return false;
   }
}
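
/* Spelled out for one instantiation: CMP2(lt, ge, gt, le, 32) makes both
 * v_cmp_lt_f32 and v_cmp_nge_f32 report ordered = v_cmp_lt_f32,
 * unordered = v_cmp_nge_f32 and ordered_swapped = v_cmp_gt_f32; the inverse
 * of v_cmp_lt_f32 is v_cmp_nlt_f32, i.e. the logical negation rather than
 * the unordered counterpart. */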
1497
1498 aco_opcode get_ordered(aco_opcode op)
1499 {
1500 CmpInfo info;
1501 return get_cmp_info(op, &info) ? info.ordered : aco_opcode::num_opcodes;
1502 }
1503
1504 aco_opcode get_unordered(aco_opcode op)
1505 {
1506 CmpInfo info;
1507 return get_cmp_info(op, &info) ? info.unordered : aco_opcode::num_opcodes;
1508 }
1509
1510 aco_opcode get_inverse(aco_opcode op)
1511 {
1512 CmpInfo info;
1513 return get_cmp_info(op, &info) ? info.inverse : aco_opcode::num_opcodes;
1514 }
1515
1516 aco_opcode get_f32_cmp(aco_opcode op)
1517 {
1518 CmpInfo info;
1519 return get_cmp_info(op, &info) ? info.f32 : aco_opcode::num_opcodes;
1520 }
1521
1522 unsigned get_cmp_bitsize(aco_opcode op)
1523 {
1524 CmpInfo info;
1525 return get_cmp_info(op, &info) ? info.size : 0;
1526 }
1527
1528 bool is_cmp(aco_opcode op)
1529 {
1530 CmpInfo info;
1531 return get_cmp_info(op, &info) && info.ordered != aco_opcode::num_opcodes;
1532 }
1533
1534 unsigned original_temp_id(opt_ctx &ctx, Temp tmp)
1535 {
1536 if (ctx.info[tmp.id()].is_temp())
1537 return ctx.info[tmp.id()].temp.id();
1538 else
1539 return tmp.id();
1540 }
1541
1542 void decrease_uses(opt_ctx &ctx, Instruction* instr)
1543 {
1544 if (!--ctx.uses[instr->definitions[0].tempId()]) {
1545 for (const Operand& op : instr->operands) {
1546 if (op.isTemp())
1547 ctx.uses[op.tempId()]--;
1548 }
1549 }
1550 }
1551
1552 Instruction *follow_operand(opt_ctx &ctx, Operand op, bool ignore_uses=false)
1553 {
1554 if (!op.isTemp() || !(ctx.info[op.tempId()].label & instr_usedef_labels))
1555 return nullptr;
1556 if (!ignore_uses && ctx.uses[op.tempId()] > 1)
1557 return nullptr;
1558
1559 Instruction *instr = ctx.info[op.tempId()].instr;
1560
1561 if (instr->definitions.size() == 2) {
1562 assert(instr->definitions[0].isTemp() && instr->definitions[0].tempId() == op.tempId());
1563 if (instr->definitions[1].isTemp() && ctx.uses[instr->definitions[1].tempId()])
1564 return nullptr;
1565 }
1566
1567 return instr;
1568 }
1569
1570 /* s_or_b64(neq(a, a), neq(b, b)) -> v_cmp_u_f32(a, b)
1571 * s_and_b64(eq(a, a), eq(b, b)) -> v_cmp_o_f32(a, b) */
1572 bool combine_ordering_test(opt_ctx &ctx, aco_ptr<Instruction>& instr)
1573 {
1574 if (instr->definitions[0].regClass() != ctx.program->lane_mask)
1575 return false;
1576 if (instr->definitions[1].isTemp() && ctx.uses[instr->definitions[1].tempId()])
1577 return false;
1578
1579 bool is_or = instr->opcode == aco_opcode::s_or_b64 || instr->opcode == aco_opcode::s_or_b32;
1580
1581 bool neg[2] = {false, false};
1582 bool abs[2] = {false, false};
1583 uint8_t opsel = 0;
1584 Instruction *op_instr[2];
1585 Temp op[2];
1586
1587 unsigned bitsize = 0;
1588 for (unsigned i = 0; i < 2; i++) {
1589 op_instr[i] = follow_operand(ctx, instr->operands[i], true);
1590 if (!op_instr[i])
1591 return false;
1592
1593 aco_opcode expected_cmp = is_or ? aco_opcode::v_cmp_neq_f32 : aco_opcode::v_cmp_eq_f32;
1594 unsigned op_bitsize = get_cmp_bitsize(op_instr[i]->opcode);
1595
1596 if (get_f32_cmp(op_instr[i]->opcode) != expected_cmp)
1597 return false;
1598 if (bitsize && op_bitsize != bitsize)
1599 return false;
1600 if (!op_instr[i]->operands[0].isTemp() || !op_instr[i]->operands[1].isTemp())
1601 return false;
1602
1603 if (op_instr[i]->isVOP3()) {
1604 VOP3A_instruction *vop3 = static_cast<VOP3A_instruction*>(op_instr[i]);
1605 if (vop3->neg[0] != vop3->neg[1] || vop3->abs[0] != vop3->abs[1] || vop3->opsel == 1 || vop3->opsel == 2)
1606 return false;
1607 neg[i] = vop3->neg[0];
1608 abs[i] = vop3->abs[0];
1609 opsel |= (vop3->opsel & 1) << i;
1610 }
1611
1612 Temp op0 = op_instr[i]->operands[0].getTemp();
1613 Temp op1 = op_instr[i]->operands[1].getTemp();
1614 if (original_temp_id(ctx, op0) != original_temp_id(ctx, op1))
1615 return false;
1616
1617 op[i] = op1;
1618 bitsize = op_bitsize;
1619 }
1620
1621 if (op[1].type() == RegType::sgpr)
1622 std::swap(op[0], op[1]);
1623 unsigned num_sgprs = (op[0].type() == RegType::sgpr) + (op[1].type() == RegType::sgpr);
1624 if (num_sgprs > (ctx.program->chip_class >= GFX10 ? 2 : 1))
1625 return false;
1626
1627 ctx.uses[op[0].id()]++;
1628 ctx.uses[op[1].id()]++;
1629 decrease_uses(ctx, op_instr[0]);
1630 decrease_uses(ctx, op_instr[1]);
1631
1632 aco_opcode new_op = aco_opcode::num_opcodes;
1633 switch (bitsize) {
1634 case 16:
1635 new_op = is_or ? aco_opcode::v_cmp_u_f16 : aco_opcode::v_cmp_o_f16;
1636 break;
1637 case 32:
1638 new_op = is_or ? aco_opcode::v_cmp_u_f32 : aco_opcode::v_cmp_o_f32;
1639 break;
1640 case 64:
1641 new_op = is_or ? aco_opcode::v_cmp_u_f64 : aco_opcode::v_cmp_o_f64;
1642 break;
1643 }
1644 Instruction *new_instr;
1645 if (neg[0] || neg[1] || abs[0] || abs[1] || opsel || num_sgprs > 1) {
1646 VOP3A_instruction *vop3 = create_instruction<VOP3A_instruction>(new_op, asVOP3(Format::VOPC), 2, 1);
1647 for (unsigned i = 0; i < 2; i++) {
1648 vop3->neg[i] = neg[i];
1649 vop3->abs[i] = abs[i];
1650 }
1651 vop3->opsel = opsel;
1652 new_instr = static_cast<Instruction *>(vop3);
1653 } else {
1654 new_instr = create_instruction<VOPC_instruction>(new_op, Format::VOPC, 2, 1);
1655 }
1656 new_instr->operands[0] = Operand(op[0]);
1657 new_instr->operands[1] = Operand(op[1]);
1658 new_instr->definitions[0] = instr->definitions[0];
1659
1660 ctx.info[instr->definitions[0].tempId()].label = 0;
1661 ctx.info[instr->definitions[0].tempId()].set_vopc(new_instr);
1662
1663 instr.reset(new_instr);
1664
1665 return true;
1666 }
1667
1668 /* s_or_b64(v_cmp_u_f32(a, b), cmp(a, b)) -> get_unordered(cmp)(a, b)
1669 * s_and_b64(v_cmp_o_f32(a, b), cmp(a, b)) -> get_ordered(cmp)(a, b) */
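/* Illustrative sketch (hypothetical temps %a, %b):
 *   %u = v_cmp_u_f32 %a, %b      ; unordered: a or b is NaN
 *   %c = v_cmp_lt_f32 %a, %b
 *   %r = s_or_b64 %u, %c
 * becomes the single NaN-inclusive compare
 *   %r = v_cmp_nge_f32 %a, %b    ; get_unordered(v_cmp_lt_f32)
 */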
1670 bool combine_comparison_ordering(opt_ctx &ctx, aco_ptr<Instruction>& instr)
1671 {
1672 if (instr->definitions[0].regClass() != ctx.program->lane_mask)
1673 return false;
1674 if (instr->definitions[1].isTemp() && ctx.uses[instr->definitions[1].tempId()])
1675 return false;
1676
1677 bool is_or = instr->opcode == aco_opcode::s_or_b64 || instr->opcode == aco_opcode::s_or_b32;
1678 aco_opcode expected_nan_test = is_or ? aco_opcode::v_cmp_u_f32 : aco_opcode::v_cmp_o_f32;
1679
1680 Instruction *nan_test = follow_operand(ctx, instr->operands[0], true);
1681 Instruction *cmp = follow_operand(ctx, instr->operands[1], true);
1682 if (!nan_test || !cmp)
1683 return false;
1684
1685 if (get_f32_cmp(cmp->opcode) == expected_nan_test)
1686 std::swap(nan_test, cmp);
1687 else if (get_f32_cmp(nan_test->opcode) != expected_nan_test)
1688 return false;
1689
1690 if (!is_cmp(cmp->opcode) || get_cmp_bitsize(cmp->opcode) != get_cmp_bitsize(nan_test->opcode))
1691 return false;
1692
1693 if (!nan_test->operands[0].isTemp() || !nan_test->operands[1].isTemp())
1694 return false;
1695 if (!cmp->operands[0].isTemp() || !cmp->operands[1].isTemp())
1696 return false;
1697
1698 unsigned prop_cmp0 = original_temp_id(ctx, cmp->operands[0].getTemp());
1699 unsigned prop_cmp1 = original_temp_id(ctx, cmp->operands[1].getTemp());
1700 unsigned prop_nan0 = original_temp_id(ctx, nan_test->operands[0].getTemp());
1701 unsigned prop_nan1 = original_temp_id(ctx, nan_test->operands[1].getTemp());
1702 if (prop_cmp0 != prop_nan0 && prop_cmp0 != prop_nan1)
1703 return false;
1704 if (prop_cmp1 != prop_nan0 && prop_cmp1 != prop_nan1)
1705 return false;
1706
1707 ctx.uses[cmp->operands[0].tempId()]++;
1708 ctx.uses[cmp->operands[1].tempId()]++;
1709 decrease_uses(ctx, nan_test);
1710 decrease_uses(ctx, cmp);
1711
1712 aco_opcode new_op = is_or ? get_unordered(cmp->opcode) : get_ordered(cmp->opcode);
1713 Instruction *new_instr;
1714 if (cmp->isVOP3()) {
1715 VOP3A_instruction *new_vop3 = create_instruction<VOP3A_instruction>(new_op, asVOP3(Format::VOPC), 2, 1);
1716 VOP3A_instruction *cmp_vop3 = static_cast<VOP3A_instruction*>(cmp);
1717 memcpy(new_vop3->abs, cmp_vop3->abs, sizeof(new_vop3->abs));
1718 memcpy(new_vop3->neg, cmp_vop3->neg, sizeof(new_vop3->neg));
1719 new_vop3->clamp = cmp_vop3->clamp;
1720 new_vop3->omod = cmp_vop3->omod;
1721 new_vop3->opsel = cmp_vop3->opsel;
1722 new_instr = new_vop3;
1723 } else {
1724 new_instr = create_instruction<VOPC_instruction>(new_op, Format::VOPC, 2, 1);
1725 }
1726 new_instr->operands[0] = cmp->operands[0];
1727 new_instr->operands[1] = cmp->operands[1];
1728 new_instr->definitions[0] = instr->definitions[0];
1729
1730 ctx.info[instr->definitions[0].tempId()].label = 0;
1731 ctx.info[instr->definitions[0].tempId()].set_vopc(new_instr);
1732
1733 instr.reset(new_instr);
1734
1735 return true;
1736 }
1737
1738 /* s_or_b64(v_cmp_neq_f32(a, a), cmp(a, #b)) and b is not NaN -> get_unordered(cmp)(a, b)
1739 * s_and_b64(v_cmp_eq_f32(a, a), cmp(a, #b)) and b is not NaN -> get_ordered(cmp)(a, b) */
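/* Illustrative sketch (hypothetical temp %a): with the non-NaN constant 1.0,
 *   %n = v_cmp_neq_f32 %a, %a
 *   %c = v_cmp_lt_f32 %a, 1.0
 *   %r = s_or_b64 %n, %c
 * can become
 *   %r = v_cmp_nge_f32 %a, 1.0
 * because the constant operand can never contribute a NaN. */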
1740 bool combine_constant_comparison_ordering(opt_ctx &ctx, aco_ptr<Instruction>& instr)
1741 {
1742 if (instr->definitions[0].regClass() != ctx.program->lane_mask)
1743 return false;
1744 if (instr->definitions[1].isTemp() && ctx.uses[instr->definitions[1].tempId()])
1745 return false;
1746
1747 bool is_or = instr->opcode == aco_opcode::s_or_b64 || instr->opcode == aco_opcode::s_or_b32;
1748
1749 Instruction *nan_test = follow_operand(ctx, instr->operands[0], true);
1750 Instruction *cmp = follow_operand(ctx, instr->operands[1], true);
1751
1752 if (!nan_test || !cmp)
1753 return false;
1754
1755 aco_opcode expected_nan_test = is_or ? aco_opcode::v_cmp_neq_f32 : aco_opcode::v_cmp_eq_f32;
1756 if (get_f32_cmp(cmp->opcode) == expected_nan_test)
1757 std::swap(nan_test, cmp);
1758 else if (get_f32_cmp(nan_test->opcode) != expected_nan_test)
1759 return false;
1760
1761 if (!is_cmp(cmp->opcode) || get_cmp_bitsize(cmp->opcode) != get_cmp_bitsize(nan_test->opcode))
1762 return false;
1763
1764 if (!nan_test->operands[0].isTemp() || !nan_test->operands[1].isTemp())
1765 return false;
1766 if (!cmp->operands[0].isTemp() && !cmp->operands[1].isTemp())
1767 return false;
1768
1769 unsigned prop_nan0 = original_temp_id(ctx, nan_test->operands[0].getTemp());
1770 unsigned prop_nan1 = original_temp_id(ctx, nan_test->operands[1].getTemp());
1771 if (prop_nan0 != prop_nan1)
1772 return false;
1773
1774 if (nan_test->isVOP3()) {
1775 VOP3A_instruction *vop3 = static_cast<VOP3A_instruction*>(nan_test);
1776 if (vop3->neg[0] != vop3->neg[1] || vop3->abs[0] != vop3->abs[1] || vop3->opsel == 1 || vop3->opsel == 2)
1777 return false;
1778 }
1779
1780 int constant_operand = -1;
1781 for (unsigned i = 0; i < 2; i++) {
1782 if (cmp->operands[i].isTemp() && original_temp_id(ctx, cmp->operands[i].getTemp()) == prop_nan0) {
1783 constant_operand = !i;
1784 break;
1785 }
1786 }
1787 if (constant_operand == -1)
1788 return false;
1789
1790 uint32_t constant;
1791 if (cmp->operands[constant_operand].isConstant()) {
1792 constant = cmp->operands[constant_operand].constantValue();
1793 } else if (cmp->operands[constant_operand].isTemp()) {
1794 Temp tmp = cmp->operands[constant_operand].getTemp();
1795 unsigned id = original_temp_id(ctx, tmp);
1796 if (!ctx.info[id].is_constant_or_literal(32))
1797 return false;
1798 constant = ctx.info[id].val;
1799 } else {
1800 return false;
1801 }
1802
1803 float constantf;
1804 memcpy(&constantf, &constant, 4);
1805 if (isnan(constantf))
1806 return false;
1807
1808 if (cmp->operands[0].isTemp())
1809 ctx.uses[cmp->operands[0].tempId()]++;
1810 if (cmp->operands[1].isTemp())
1811 ctx.uses[cmp->operands[1].tempId()]++;
1812 decrease_uses(ctx, nan_test);
1813 decrease_uses(ctx, cmp);
1814
1815 aco_opcode new_op = is_or ? get_unordered(cmp->opcode) : get_ordered(cmp->opcode);
1816 Instruction *new_instr;
1817 if (cmp->isVOP3()) {
1818 VOP3A_instruction *new_vop3 = create_instruction<VOP3A_instruction>(new_op, asVOP3(Format::VOPC), 2, 1);
1819 VOP3A_instruction *cmp_vop3 = static_cast<VOP3A_instruction*>(cmp);
1820 memcpy(new_vop3->abs, cmp_vop3->abs, sizeof(new_vop3->abs));
1821 memcpy(new_vop3->neg, cmp_vop3->neg, sizeof(new_vop3->neg));
1822 new_vop3->clamp = cmp_vop3->clamp;
1823 new_vop3->omod = cmp_vop3->omod;
1824 new_vop3->opsel = cmp_vop3->opsel;
1825 new_instr = new_vop3;
1826 } else {
1827 new_instr = create_instruction<VOPC_instruction>(new_op, Format::VOPC, 2, 1);
1828 }
1829 new_instr->operands[0] = cmp->operands[0];
1830 new_instr->operands[1] = cmp->operands[1];
1831 new_instr->definitions[0] = instr->definitions[0];
1832
1833 ctx.info[instr->definitions[0].tempId()].label = 0;
1834 ctx.info[instr->definitions[0].tempId()].set_vopc(new_instr);
1835
1836 instr.reset(new_instr);
1837
1838 return true;
1839 }
1840
1841 /* s_not_b64(cmp(a, b)) -> get_inverse(cmp)(a, b) */
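/* Illustrative sketch (hypothetical temps %a, %b):
 *   %c = v_cmp_lt_f32 %a, %b
 *   %r = s_not_b64 %c
 * becomes
 *   %r = v_cmp_nlt_f32 %a, %b    ; get_inverse(v_cmp_lt_f32)
 * The inverse must stay NaN-correct, hence nlt rather than ge. */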
1842 bool combine_inverse_comparison(opt_ctx &ctx, aco_ptr<Instruction>& instr)
1843 {
1844 if (instr->opcode != aco_opcode::s_not_b64)
1845 return false;
1846 if (instr->definitions[1].isTemp() && ctx.uses[instr->definitions[1].tempId()])
1847 return false;
1848 if (!instr->operands[0].isTemp())
1849 return false;
1850
1851 Instruction *cmp = follow_operand(ctx, instr->operands[0]);
1852 if (!cmp)
1853 return false;
1854
1855 aco_opcode new_opcode = get_inverse(cmp->opcode);
1856 if (new_opcode == aco_opcode::num_opcodes)
1857 return false;
1858
1859 if (cmp->operands[0].isTemp())
1860 ctx.uses[cmp->operands[0].tempId()]++;
1861 if (cmp->operands[1].isTemp())
1862 ctx.uses[cmp->operands[1].tempId()]++;
1863 decrease_uses(ctx, cmp);
1864
1865 Instruction *new_instr;
1866 if (cmp->isVOP3()) {
1867 VOP3A_instruction *new_vop3 = create_instruction<VOP3A_instruction>(new_opcode, asVOP3(Format::VOPC), 2, 1);
1868 VOP3A_instruction *cmp_vop3 = static_cast<VOP3A_instruction*>(cmp);
1869 memcpy(new_vop3->abs, cmp_vop3->abs, sizeof(new_vop3->abs));
1870 memcpy(new_vop3->neg, cmp_vop3->neg, sizeof(new_vop3->neg));
1871 new_vop3->clamp = cmp_vop3->clamp;
1872 new_vop3->omod = cmp_vop3->omod;
1873 new_vop3->opsel = cmp_vop3->opsel;
1874 new_instr = new_vop3;
1875 } else {
1876 new_instr = create_instruction<VOPC_instruction>(new_opcode, Format::VOPC, 2, 1);
1877 }
1878 new_instr->operands[0] = cmp->operands[0];
1879 new_instr->operands[1] = cmp->operands[1];
1880 new_instr->definitions[0] = instr->definitions[0];
1881
1882 ctx.info[instr->definitions[0].tempId()].label = 0;
1883 ctx.info[instr->definitions[0].tempId()].set_vopc(new_instr);
1884
1885 instr.reset(new_instr);
1886
1887 return true;
1888 }
1889
1890 /* op1(op2(1, 2), 0) if swap = false
1891 * op1(0, op2(1, 2)) if swap = true */
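/* Illustrative sketch (hypothetical temps %a, %n, %b): matching
 *   %r = v_add_u32 %b, v_lshlrev_b32(%n, %a)
 * with swap = true and shuffle_str = "210" fills
 *   operands[] = { %a, %n, %b }
 * which is the order v_lshl_add_u32 expects for (%a << %n) + %b. */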
1892 bool match_op3_for_vop3(opt_ctx &ctx, aco_opcode op1, aco_opcode op2,
1893 Instruction* op1_instr, bool swap, const char *shuffle_str,
1894 Operand operands[3], bool neg[3], bool abs[3], uint8_t *opsel,
1895 bool *op1_clamp, uint8_t *op1_omod,
1896 bool *inbetween_neg, bool *inbetween_abs, bool *inbetween_opsel)
1897 {
1898 /* checks */
1899 if (op1_instr->opcode != op1)
1900 return false;
1901
1902 Instruction *op2_instr = follow_operand(ctx, op1_instr->operands[swap]);
1903 if (!op2_instr || op2_instr->opcode != op2)
1904 return false;
1905 if (fixed_to_exec(op2_instr->operands[0]) || fixed_to_exec(op2_instr->operands[1]))
1906 return false;
1907
1908 VOP3A_instruction *op1_vop3 = op1_instr->isVOP3() ? static_cast<VOP3A_instruction *>(op1_instr) : NULL;
1909 VOP3A_instruction *op2_vop3 = op2_instr->isVOP3() ? static_cast<VOP3A_instruction *>(op2_instr) : NULL;
1910
1911 /* don't support inbetween clamp/omod */
1912 if (op2_vop3 && (op2_vop3->clamp || op2_vop3->omod))
1913 return false;
1914
1915 /* get operands and modifiers and check inbetween modifiers */
1916 *op1_clamp = op1_vop3 ? op1_vop3->clamp : false;
1917 *op1_omod = op1_vop3 ? op1_vop3->omod : 0u;
1918
1919 if (inbetween_neg)
1920 *inbetween_neg = op1_vop3 ? op1_vop3->neg[swap] : false;
1921 else if (op1_vop3 && op1_vop3->neg[swap])
1922 return false;
1923
1924 if (inbetween_abs)
1925 *inbetween_abs = op1_vop3 ? op1_vop3->abs[swap] : false;
1926 else if (op1_vop3 && op1_vop3->abs[swap])
1927 return false;
1928
1929 if (inbetween_opsel)
1930 *inbetween_opsel = op1_vop3 ? op1_vop3->opsel & (1 << swap) : false;
1931 else if (op1_vop3 && op1_vop3->opsel & (1 << swap))
1932 return false;
1933
1934 int shuffle[3];
1935 shuffle[shuffle_str[0] - '0'] = 0;
1936 shuffle[shuffle_str[1] - '0'] = 1;
1937 shuffle[shuffle_str[2] - '0'] = 2;
1938
1939 operands[shuffle[0]] = op1_instr->operands[!swap];
1940 neg[shuffle[0]] = op1_vop3 ? op1_vop3->neg[!swap] : false;
1941 abs[shuffle[0]] = op1_vop3 ? op1_vop3->abs[!swap] : false;
1942 if (op1_vop3 && op1_vop3->opsel & (1 << !swap))
1943 *opsel |= 1 << shuffle[0];
1944
1945 for (unsigned i = 0; i < 2; i++) {
1946 operands[shuffle[i + 1]] = op2_instr->operands[i];
1947 neg[shuffle[i + 1]] = op2_vop3 ? op2_vop3->neg[i] : false;
1948 abs[shuffle[i + 1]] = op2_vop3 ? op2_vop3->abs[i] : false;
1949 if (op2_vop3 && op2_vop3->opsel & (1 << i))
1950 *opsel |= 1 << shuffle[i + 1];
1951 }
1952
1953 /* check operands */
1954 if (!check_vop3_operands(ctx, 3, operands))
1955 return false;
1956
1957 return true;
1958 }
1959
1960 void create_vop3_for_op3(opt_ctx& ctx, aco_opcode opcode, aco_ptr<Instruction>& instr,
1961 Operand operands[3], bool neg[3], bool abs[3], uint8_t opsel,
1962 bool clamp, unsigned omod)
1963 {
1964 VOP3A_instruction *new_instr = create_instruction<VOP3A_instruction>(opcode, Format::VOP3A, 3, 1);
1965 memcpy(new_instr->abs, abs, sizeof(bool[3]));
1966 memcpy(new_instr->neg, neg, sizeof(bool[3]));
1967 new_instr->clamp = clamp;
1968 new_instr->omod = omod;
1969 new_instr->opsel = opsel;
1970 new_instr->operands[0] = operands[0];
1971 new_instr->operands[1] = operands[1];
1972 new_instr->operands[2] = operands[2];
1973 new_instr->definitions[0] = instr->definitions[0];
1974 ctx.info[instr->definitions[0].tempId()].label = 0;
1975
1976 instr.reset(new_instr);
1977 }
1978
1979 bool combine_three_valu_op(opt_ctx& ctx, aco_ptr<Instruction>& instr, aco_opcode op2, aco_opcode new_op, const char *shuffle, uint8_t ops)
1980 {
1981 for (unsigned swap = 0; swap < 2; swap++) {
1982 if (!((1 << swap) & ops))
1983 continue;
1984
1985 Operand operands[3];
1986 bool neg[3], abs[3], clamp;
1987 uint8_t opsel = 0, omod = 0;
1988 if (match_op3_for_vop3(ctx, instr->opcode, op2,
1989 instr.get(), swap, shuffle,
1990 operands, neg, abs, &opsel,
1991 &clamp, &omod, NULL, NULL, NULL)) {
1992 ctx.uses[instr->operands[swap].tempId()]--;
1993 create_vop3_for_op3(ctx, new_op, instr, operands, neg, abs, opsel, clamp, omod);
1994 return true;
1995 }
1996 }
1997 return false;
1998 }
1999
2000 bool combine_minmax(opt_ctx& ctx, aco_ptr<Instruction>& instr, aco_opcode opposite, aco_opcode minmax3)
2001 {
2002 if (combine_three_valu_op(ctx, instr, instr->opcode, minmax3, "012", 1 | 2))
2003 return true;
2004
2005 /* min(-max(a, b), c) -> min3(-a, -b, c) *
2006 * max(-min(a, b), c) -> max3(-a, -b, c) */
2007 for (unsigned swap = 0; swap < 2; swap++) {
2008 Operand operands[3];
2009 bool neg[3], abs[3], clamp;
2010 uint8_t opsel = 0, omod = 0;
2011 bool inbetween_neg;
2012 if (match_op3_for_vop3(ctx, instr->opcode, opposite,
2013 instr.get(), swap, "012",
2014 operands, neg, abs, &opsel,
2015 &clamp, &omod, &inbetween_neg, NULL, NULL) &&
2016 inbetween_neg) {
2017 ctx.uses[instr->operands[swap].tempId()]--;
2018 neg[1] = true;
2019 neg[2] = true;
2020 create_vop3_for_op3(ctx, minmax3, instr, operands, neg, abs, opsel, clamp, omod);
2021 return true;
2022 }
2023 }
2024 return false;
2025 }
2026
2027 /* s_not_b32(s_and_b32(a, b)) -> s_nand_b32(a, b)
2028 * s_not_b32(s_or_b32(a, b)) -> s_nor_b32(a, b)
2029 * s_not_b32(s_xor_b32(a, b)) -> s_xnor_b32(a, b)
2030 * s_not_b64(s_and_b64(a, b)) -> s_nand_b64(a, b)
2031 * s_not_b64(s_or_b64(a, b)) -> s_nor_b64(a, b)
2032 * s_not_b64(s_xor_b64(a, b)) -> s_xnor_b64(a, b) */
2033 bool combine_salu_not_bitwise(opt_ctx& ctx, aco_ptr<Instruction>& instr)
2034 {
2035 /* checks */
2036 if (!instr->operands[0].isTemp())
2037 return false;
2038 if (instr->definitions[1].isTemp() && ctx.uses[instr->definitions[1].tempId()])
2039 return false;
2040
2041 Instruction *op2_instr = follow_operand(ctx, instr->operands[0]);
2042 if (!op2_instr)
2043 return false;
2044 switch (op2_instr->opcode) {
2045 case aco_opcode::s_and_b32:
2046 case aco_opcode::s_or_b32:
2047 case aco_opcode::s_xor_b32:
2048 case aco_opcode::s_and_b64:
2049 case aco_opcode::s_or_b64:
2050 case aco_opcode::s_xor_b64:
2051 break;
2052 default:
2053 return false;
2054 }
2055
2056 /* create instruction */
2057 std::swap(instr->definitions[0], op2_instr->definitions[0]);
2058 std::swap(instr->definitions[1], op2_instr->definitions[1]);
2059 ctx.uses[instr->operands[0].tempId()]--;
2060 ctx.info[op2_instr->definitions[0].tempId()].label = 0;
2061
2062 switch (op2_instr->opcode) {
2063 case aco_opcode::s_and_b32:
2064 op2_instr->opcode = aco_opcode::s_nand_b32;
2065 break;
2066 case aco_opcode::s_or_b32:
2067 op2_instr->opcode = aco_opcode::s_nor_b32;
2068 break;
2069 case aco_opcode::s_xor_b32:
2070 op2_instr->opcode = aco_opcode::s_xnor_b32;
2071 break;
2072 case aco_opcode::s_and_b64:
2073 op2_instr->opcode = aco_opcode::s_nand_b64;
2074 break;
2075 case aco_opcode::s_or_b64:
2076 op2_instr->opcode = aco_opcode::s_nor_b64;
2077 break;
2078 case aco_opcode::s_xor_b64:
2079 op2_instr->opcode = aco_opcode::s_xnor_b64;
2080 break;
2081 default:
2082 break;
2083 }
2084
2085 return true;
2086 }
2087
2088 /* s_and_b32(a, s_not_b32(b)) -> s_andn2_b32(a, b)
2089 * s_or_b32(a, s_not_b32(b)) -> s_orn2_b32(a, b)
2090 * s_and_b64(a, s_not_b64(b)) -> s_andn2_b64(a, b)
2091 * s_or_b64(a, s_not_b64(b)) -> s_orn2_b64(a, b) */
2092 bool combine_salu_n2(opt_ctx& ctx, aco_ptr<Instruction>& instr)
2093 {
2094 if (instr->definitions[0].isTemp() && ctx.info[instr->definitions[0].tempId()].is_uniform_bool())
2095 return false;
2096
2097 for (unsigned i = 0; i < 2; i++) {
2098 Instruction *op2_instr = follow_operand(ctx, instr->operands[i]);
2099 if (!op2_instr || (op2_instr->opcode != aco_opcode::s_not_b32 && op2_instr->opcode != aco_opcode::s_not_b64))
2100 continue;
2101 if (ctx.uses[op2_instr->definitions[1].tempId()] || fixed_to_exec(op2_instr->operands[0]))
2102 continue;
2103
2104 if (instr->operands[!i].isLiteral() && op2_instr->operands[0].isLiteral() &&
2105 instr->operands[!i].constantValue() != op2_instr->operands[0].constantValue())
2106 continue;
2107
2108 ctx.uses[instr->operands[i].tempId()]--;
2109 instr->operands[0] = instr->operands[!i];
2110 instr->operands[1] = op2_instr->operands[0];
2111 ctx.info[instr->definitions[0].tempId()].label = 0;
2112
2113 switch (instr->opcode) {
2114 case aco_opcode::s_and_b32:
2115 instr->opcode = aco_opcode::s_andn2_b32;
2116 break;
2117 case aco_opcode::s_or_b32:
2118 instr->opcode = aco_opcode::s_orn2_b32;
2119 break;
2120 case aco_opcode::s_and_b64:
2121 instr->opcode = aco_opcode::s_andn2_b64;
2122 break;
2123 case aco_opcode::s_or_b64:
2124 instr->opcode = aco_opcode::s_orn2_b64;
2125 break;
2126 default:
2127 break;
2128 }
2129
2130 return true;
2131 }
2132 return false;
2133 }
2134
2135 /* s_add_{i32,u32}(a, s_lshl_b32(b, <n>)) -> s_lshl<n>_add_u32(b, a) */
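/* Illustrative sketch (hypothetical temps %a, %b):
 *   %s = s_lshl_b32 %b, 2
 *   %r = s_add_u32 %a, %s
 * becomes
 *   %r = s_lshl2_add_u32 %b, %a    ; (%b << 2) + %a
 * for shift amounts 1 through 4. */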
2136 bool combine_salu_lshl_add(opt_ctx& ctx, aco_ptr<Instruction>& instr)
2137 {
2138 if (instr->opcode == aco_opcode::s_add_i32 && ctx.uses[instr->definitions[1].tempId()])
2139 return false;
2140
2141 for (unsigned i = 0; i < 2; i++) {
2142 Instruction *op2_instr = follow_operand(ctx, instr->operands[i]);
2143 if (!op2_instr || op2_instr->opcode != aco_opcode::s_lshl_b32 ||
2144 ctx.uses[op2_instr->definitions[1].tempId()])
2145 continue;
2146 if (!op2_instr->operands[1].isConstant() || fixed_to_exec(op2_instr->operands[0]))
2147 continue;
2148
2149 uint32_t shift = op2_instr->operands[1].constantValue();
2150 if (shift < 1 || shift > 4)
2151 continue;
2152
2153 if (instr->operands[!i].isLiteral() && op2_instr->operands[0].isLiteral() &&
2154 instr->operands[!i].constantValue() != op2_instr->operands[0].constantValue())
2155 continue;
2156
2157 ctx.uses[instr->operands[i].tempId()]--;
2158 instr->operands[1] = instr->operands[!i];
2159 instr->operands[0] = op2_instr->operands[0];
2160 ctx.info[instr->definitions[0].tempId()].label = 0;
2161
2162 instr->opcode = ((aco_opcode[]){aco_opcode::s_lshl1_add_u32,
2163 aco_opcode::s_lshl2_add_u32,
2164 aco_opcode::s_lshl3_add_u32,
2165 aco_opcode::s_lshl4_add_u32})[shift - 1];
2166
2167 return true;
2168 }
2169 return false;
2170 }
2171
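/* v_add_u32(a, b2i(cond)) -> v_addc_co_u32(0, a, cond)
 * v_sub_u32(a, b2i(cond)) -> v_subbrev_co_u32(0, a, cond)
 * The b2i boolean is consumed directly as the carry/borrow input. */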
2172 bool combine_add_sub_b2i(opt_ctx& ctx, aco_ptr<Instruction>& instr, aco_opcode new_op, uint8_t ops)
2173 {
2174 if (instr->usesModifiers())
2175 return false;
2176
2177 for (unsigned i = 0; i < 2; i++) {
2178 if (!((1 << i) & ops))
2179 continue;
2180 if (instr->operands[i].isTemp() &&
2181 ctx.info[instr->operands[i].tempId()].is_b2i() &&
2182 ctx.uses[instr->operands[i].tempId()] == 1) {
2183
2184 aco_ptr<Instruction> new_instr;
2185 if (instr->operands[!i].isTemp() && instr->operands[!i].getTemp().type() == RegType::vgpr) {
2186 new_instr.reset(create_instruction<VOP2_instruction>(new_op, Format::VOP2, 3, 2));
2187 } else if (ctx.program->chip_class >= GFX10 ||
2188 (instr->operands[!i].isConstant() && !instr->operands[!i].isLiteral())) {
2189 new_instr.reset(create_instruction<VOP3A_instruction>(new_op, asVOP3(Format::VOP2), 3, 2));
2190 } else {
2191 return false;
2192 }
2193 ctx.uses[instr->operands[i].tempId()]--;
2194 new_instr->definitions[0] = instr->definitions[0];
2195 new_instr->definitions[1] = instr->definitions.size() == 2 ? instr->definitions[1] :
2196 Definition(ctx.program->allocateId(), ctx.program->lane_mask);
2197 new_instr->definitions[1].setHint(vcc);
2198 new_instr->operands[0] = Operand(0u);
2199 new_instr->operands[1] = instr->operands[!i];
2200 new_instr->operands[2] = Operand(ctx.info[instr->operands[i].tempId()].temp);
2201 instr = std::move(new_instr);
2202 ctx.info[instr->definitions[0].tempId()].label = 0;
2203 return true;
2204 }
2205 }
2206
2207 return false;
2208 }
2209
2210 bool get_minmax_info(aco_opcode op, aco_opcode *min, aco_opcode *max, aco_opcode *min3, aco_opcode *max3, aco_opcode *med3, bool *some_gfx9_only)
2211 {
2212 switch (op) {
2213 #define MINMAX(type, gfx9) \
2214 case aco_opcode::v_min_##type:\
2215 case aco_opcode::v_max_##type:\
2216 case aco_opcode::v_med3_##type:\
2217 *min = aco_opcode::v_min_##type;\
2218 *max = aco_opcode::v_max_##type;\
2219 *med3 = aco_opcode::v_med3_##type;\
2220 *min3 = aco_opcode::v_min3_##type;\
2221 *max3 = aco_opcode::v_max3_##type;\
2222 *some_gfx9_only = gfx9;\
2223 return true;
2224 MINMAX(f32, false)
2225 MINMAX(u32, false)
2226 MINMAX(i32, false)
2227 MINMAX(f16, true)
2228 MINMAX(u16, true)
2229 MINMAX(i16, true)
2230 #undef MINMAX
2231 default:
2232 return false;
2233 }
2234 }
2235
2236 /* v_min_{f,u,i}{16,32}(v_max_{f,u,i}{16,32}(a, lb), ub) -> v_med3_{f,u,i}{16,32}(a, lb, ub) when ub > lb
2237 * v_max_{f,u,i}{16,32}(v_min_{f,u,i}{16,32}(a, ub), lb) -> v_med3_{f,u,i}{16,32}(a, lb, ub) when ub > lb */
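/* Illustrative sketch (hypothetical temp %x): a GLSL clamp(x, 0.0, 1.0)
 * typically reaches the optimizer as
 *   %m = v_max_f32 %x, 0
 *   %r = v_min_f32 %m, 1.0
 * and is folded into
 *   %r = v_med3_f32 %x, 0, 1.0 */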
2238 bool combine_clamp(opt_ctx& ctx, aco_ptr<Instruction>& instr,
2239 aco_opcode min, aco_opcode max, aco_opcode med)
2240 {
2241 /* TODO: GLSL's clamp(x, minVal, maxVal) and SPIR-V's
2242 * FClamp(x, minVal, maxVal)/NClamp(x, minVal, maxVal) are undefined if
2243 * minVal > maxVal, which means we can always lower it to a v_med3_f32 */
2244 aco_opcode other_op;
2245 if (instr->opcode == min)
2246 other_op = max;
2247 else if (instr->opcode == max)
2248 other_op = min;
2249 else
2250 return false;
2251
2252 for (unsigned swap = 0; swap < 2; swap++) {
2253 Operand operands[3];
2254 bool neg[3], abs[3], clamp;
2255 uint8_t opsel = 0, omod = 0;
2256 if (match_op3_for_vop3(ctx, instr->opcode, other_op, instr.get(), swap,
2257 "012", operands, neg, abs, &opsel,
2258 &clamp, &omod, NULL, NULL, NULL)) {
2259 int const0_idx = -1, const1_idx = -1;
2260 uint32_t const0 = 0, const1 = 0;
2261 for (int i = 0; i < 3; i++) {
2262 uint32_t val;
2263 if (operands[i].isConstant()) {
2264 val = operands[i].constantValue();
2265 } else if (operands[i].isTemp() && ctx.info[operands[i].tempId()].is_constant_or_literal(32)) {
2266 val = ctx.info[operands[i].tempId()].val;
2267 } else {
2268 continue;
2269 }
2270 if (const0_idx >= 0) {
2271 const1_idx = i;
2272 const1 = val;
2273 } else {
2274 const0_idx = i;
2275 const0 = val;
2276 }
2277 }
2278 if (const0_idx < 0 || const1_idx < 0)
2279 continue;
2280
2281 if (opsel & (1 << const0_idx))
2282 const0 >>= 16;
2283 if (opsel & (1 << const1_idx))
2284 const1 >>= 16;
2285
2286 int lower_idx = const0_idx;
2287 switch (min) {
2288 case aco_opcode::v_min_f32:
2289 case aco_opcode::v_min_f16: {
2290 float const0_f, const1_f;
2291 if (min == aco_opcode::v_min_f32) {
2292 memcpy(&const0_f, &const0, 4);
2293 memcpy(&const1_f, &const1, 4);
2294 } else {
2295 const0_f = _mesa_half_to_float(const0);
2296 const1_f = _mesa_half_to_float(const1);
2297 }
2298 if (abs[const0_idx]) const0_f = fabsf(const0_f);
2299 if (abs[const1_idx]) const1_f = fabsf(const1_f);
2300 if (neg[const0_idx]) const0_f = -const0_f;
2301 if (neg[const1_idx]) const1_f = -const1_f;
2302 lower_idx = const0_f < const1_f ? const0_idx : const1_idx;
2303 break;
2304 }
2305 case aco_opcode::v_min_u32: {
2306 lower_idx = const0 < const1 ? const0_idx : const1_idx;
2307 break;
2308 }
2309 case aco_opcode::v_min_u16: {
2310 lower_idx = (uint16_t)const0 < (uint16_t)const1 ? const0_idx : const1_idx;
2311 break;
2312 }
2313 case aco_opcode::v_min_i32: {
2314 int32_t const0_i = const0 & 0x80000000u ? -2147483648 + (int32_t)(const0 & 0x7fffffffu) : const0;
2315 int32_t const1_i = const1 & 0x80000000u ? -2147483648 + (int32_t)(const1 & 0x7fffffffu) : const1;
2316 lower_idx = const0_i < const1_i ? const0_idx : const1_idx;
2317 break;
2318 }
2319 case aco_opcode::v_min_i16: {
2320 int16_t const0_i = const0 & 0x8000u ? -32768 + (int16_t)(const0 & 0x7fffu) : const0;
2321 int16_t const1_i = const1 & 0x8000u ? -32768 + (int16_t)(const1 & 0x7fffu) : const1;
2322 lower_idx = const0_i < const1_i ? const0_idx : const1_idx;
2323 break;
2324 }
2325 default:
2326 break;
2327 }
2328 int upper_idx = lower_idx == const0_idx ? const1_idx : const0_idx;
2329
2330 if (instr->opcode == min) {
2331 if (upper_idx != 0 || lower_idx == 0)
2332 return false;
2333 } else {
2334 if (upper_idx == 0 || lower_idx != 0)
2335 return false;
2336 }
2337
2338 ctx.uses[instr->operands[swap].tempId()]--;
2339 create_vop3_for_op3(ctx, med, instr, operands, neg, abs, opsel, clamp, omod);
2340
2341 return true;
2342 }
2343 }
2344
2345 return false;
2346 }
2347
2348
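/* Replace VALU operands that are labelled as copies of an sgpr with the
 * sgpr itself, e.g. (illustrative) using %s directly in a v_add_f32 instead
 * of a v_mov_b32 copy of it. This is limited by the constant bus: one sgpr,
 * or two on GFX10+ (except for 64-bit shifts), minus one if a literal is
 * already present. */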
2349 void apply_sgprs(opt_ctx &ctx, aco_ptr<Instruction>& instr)
2350 {
2351 bool is_shift64 = instr->opcode == aco_opcode::v_lshlrev_b64 ||
2352 instr->opcode == aco_opcode::v_lshrrev_b64 ||
2353 instr->opcode == aco_opcode::v_ashrrev_i64;
2354
2355 /* find candidates and create the set of sgprs already read */
2356 unsigned sgpr_ids[2] = {0, 0};
2357 uint32_t operand_mask = 0;
2358 bool has_literal = false;
2359 for (unsigned i = 0; i < instr->operands.size(); i++) {
2360 if (instr->operands[i].isLiteral())
2361 has_literal = true;
2362 if (!instr->operands[i].isTemp())
2363 continue;
2364 if (instr->operands[i].getTemp().type() == RegType::sgpr) {
2365 if (instr->operands[i].tempId() != sgpr_ids[0])
2366 sgpr_ids[!!sgpr_ids[0]] = instr->operands[i].tempId();
2367 }
2368 ssa_info& info = ctx.info[instr->operands[i].tempId()];
2369 if (info.is_temp() && info.temp.type() == RegType::sgpr)
2370 operand_mask |= 1u << i;
2371 }
2372 unsigned max_sgprs = 1;
2373 if (ctx.program->chip_class >= GFX10 && !is_shift64)
2374 max_sgprs = 2;
2375 if (has_literal)
2376 max_sgprs--;
2377
2378 unsigned num_sgprs = !!sgpr_ids[0] + !!sgpr_ids[1];
2379
2380 /* keep on applying sgprs until there is nothing left to be done */
2381 while (operand_mask) {
2382 uint32_t sgpr_idx = 0;
2383 uint32_t sgpr_info_id = 0;
2384 uint32_t mask = operand_mask;
2385 /* choose an sgpr */
2386 while (mask) {
2387 unsigned i = u_bit_scan(&mask);
2388 uint16_t uses = ctx.uses[instr->operands[i].tempId()];
2389 if (sgpr_info_id == 0 || uses < ctx.uses[sgpr_info_id]) {
2390 sgpr_idx = i;
2391 sgpr_info_id = instr->operands[i].tempId();
2392 }
2393 }
2394 operand_mask &= ~(1u << sgpr_idx);
2395
2396 /* Applying two sgprs requires making the instruction VOP3, so don't do
2397 * it unless it's definitely beneficial.
2398 * TODO: this is too conservative because later the use count could be reduced to 1 */
2399 if (num_sgprs && ctx.uses[sgpr_info_id] > 1 && !instr->isVOP3())
2400 break;
2401
2402 Temp sgpr = ctx.info[sgpr_info_id].temp;
2403 bool new_sgpr = sgpr.id() != sgpr_ids[0] && sgpr.id() != sgpr_ids[1];
2404 if (new_sgpr && num_sgprs >= max_sgprs)
2405 continue;
2406
2407 if (sgpr_idx == 0 || instr->isVOP3()) {
2408 instr->operands[sgpr_idx] = Operand(sgpr);
2409 } else if (can_swap_operands(instr)) {
2410 instr->operands[sgpr_idx] = instr->operands[0];
2411 instr->operands[0] = Operand(sgpr);
2412 /* swap bits using a 4-entry LUT */
2413 uint32_t swapped = (0x3120 >> (operand_mask & 0x3)) & 0xf;
2414 operand_mask = (operand_mask & ~0x3) | swapped;
2415 } else if (can_use_VOP3(ctx, instr)) {
2416 to_VOP3(ctx, instr);
2417 instr->operands[sgpr_idx] = Operand(sgpr);
2418 } else {
2419 continue;
2420 }
2421
2422 if (new_sgpr)
2423 sgpr_ids[num_sgprs++] = sgpr.id();
2424 ctx.uses[sgpr_info_id]--;
2425 ctx.uses[sgpr.id()]++;
2426 }
2427 }
2428
2429 bool apply_omod_clamp_helper(opt_ctx &ctx, aco_ptr<Instruction>& instr, ssa_info& def_info)
2430 {
2431 to_VOP3(ctx, instr);
2432
2433 if (!def_info.is_clamp() && (static_cast<VOP3A_instruction*>(instr.get())->clamp ||
2434 static_cast<VOP3A_instruction*>(instr.get())->omod))
2435 return false;
2436
2437 if (def_info.is_omod2())
2438 static_cast<VOP3A_instruction*>(instr.get())->omod = 1;
2439 else if (def_info.is_omod4())
2440 static_cast<VOP3A_instruction*>(instr.get())->omod = 2;
2441 else if (def_info.is_omod5())
2442 static_cast<VOP3A_instruction*>(instr.get())->omod = 3;
2443 else if (def_info.is_clamp())
2444 static_cast<VOP3A_instruction*>(instr.get())->clamp = true;
2445
2446 return true;
2447 }
2448
2449 /* apply omod / clamp modifiers if the def is used only once and the instruction can have modifiers */
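/* Illustrative sketch (hypothetical temps): with 32-bit denormals disabled
 * (block.fp_mode.denorm32 == 0),
 *   %m = v_mul_f32 %a, %b
 *   %r = v_mul_f32 %m, 2.0    ; labelled omod2 in the first pass
 * becomes a single
 *   %r = v_mul_f32 %a, %b    ; VOP3 with omod:*2
 */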
2450 bool apply_omod_clamp(opt_ctx &ctx, Block& block, aco_ptr<Instruction>& instr)
2451 {
2452 if (instr->definitions.empty() || ctx.uses[instr->definitions[0].tempId()] != 1 ||
2453 !instr_info.can_use_output_modifiers[(int)instr->opcode])
2454 return false;
2455
2456 if (!can_use_VOP3(ctx, instr))
2457 return false;
2458
2459 /* omod has no effect if denormals are enabled */
2460 bool can_use_omod = (instr->definitions[0].bytes() == 4 ? block.fp_mode.denorm32 : block.fp_mode.denorm16_64) == 0;
2461 ssa_info& def_info = ctx.info[instr->definitions[0].tempId()];
2462
2463 uint64_t omod_labels = label_omod2 | label_omod4 | label_omod5;
2464 if (!def_info.is_clamp() && !(can_use_omod && (def_info.label & omod_labels)))
2465 return false;
2466 /* if the omod/clamp instruction is dead, then the single user of this
2467 * instruction is a different instruction */
2468 if (!ctx.uses[def_info.instr->definitions[0].tempId()])
2469 return false;
2470
2471 /* MADs/FMAs are created later, so we don't have to update the original add */
2472 assert(!ctx.info[instr->definitions[0].tempId()].is_mad());
2473
2474 if (!apply_omod_clamp_helper(ctx, instr, def_info))
2475 return false;
2476
2477 std::swap(instr->definitions[0], def_info.instr->definitions[0]);
2478 ctx.info[instr->definitions[0].tempId()].label &= label_clamp;
2479 ctx.uses[def_info.instr->definitions[0].tempId()]--;
2480
2481 return true;
2482 }
2483
2484 // TODO: we could possibly move the whole label_instruction pass to combine_instruction:
2485 // this would mean that we'd have to fix the instruction uses during value propagation
2486
2487 void combine_instruction(opt_ctx &ctx, Block& block, aco_ptr<Instruction>& instr)
2488 {
2489 if (instr->definitions.empty() || is_dead(ctx.uses, instr.get()))
2490 return;
2491
2492 if (instr->isVALU()) {
2493 if (can_apply_sgprs(instr))
2494 apply_sgprs(ctx, instr);
2495 while (apply_omod_clamp(ctx, block, instr)) ;
2496 }
2497
2498 if (ctx.info[instr->definitions[0].tempId()].is_vcc_hint()) {
2499 instr->definitions[0].setHint(vcc);
2500 }
2501
2502 /* TODO: There are still some peephole optimizations that could be done:
2503 * - abs(a - b) -> s_absdiff_i32
2504 * - various patterns for s_bitcmp{0,1}_b32 and s_bitset{0,1}_b32
2505 * - patterns for v_alignbit_b32 and v_alignbyte_b32
2506 * These probably aren't too interesting, though.
2507 * There are also patterns for v_cmp_class_f{16,32,64}. This is difficult but
2508 * probably more useful than the previously mentioned optimizations.
2509 * The various comparison optimizations also currently only work with 32-bit
2510 * floats. */
2511
2512 /* neg(mul(a, b)) -> mul(neg(a), b) */
2513 if (ctx.info[instr->definitions[0].tempId()].is_neg() && ctx.uses[instr->operands[1].tempId()] == 1) {
2514 Temp val = ctx.info[instr->definitions[0].tempId()].temp;
2515
2516 if (!ctx.info[val.id()].is_mul())
2517 return;
2518
2519 Instruction* mul_instr = ctx.info[val.id()].instr;
2520
2521 if (mul_instr->operands[0].isLiteral())
2522 return;
2523 if (mul_instr->isVOP3() && static_cast<VOP3A_instruction*>(mul_instr)->clamp)
2524 return;
2525
2526 /* convert to mul(neg(a), b) */
2527 ctx.uses[mul_instr->definitions[0].tempId()]--;
2528 Definition def = instr->definitions[0];
2529 /* neg(abs(mul(a, b))) -> mul(neg(abs(a)), abs(b)) */
2530 bool is_abs = ctx.info[instr->definitions[0].tempId()].is_abs();
2531 instr.reset(create_instruction<VOP3A_instruction>(mul_instr->opcode, asVOP3(Format::VOP2), 2, 1));
2532 instr->operands[0] = mul_instr->operands[0];
2533 instr->operands[1] = mul_instr->operands[1];
2534 instr->definitions[0] = def;
2535 VOP3A_instruction* new_mul = static_cast<VOP3A_instruction*>(instr.get());
2536 if (mul_instr->isVOP3()) {
2537 VOP3A_instruction* mul = static_cast<VOP3A_instruction*>(mul_instr);
2538 new_mul->neg[0] = mul->neg[0] && !is_abs;
2539 new_mul->neg[1] = mul->neg[1] && !is_abs;
2540 new_mul->abs[0] = mul->abs[0] || is_abs;
2541 new_mul->abs[1] = mul->abs[1] || is_abs;
2542 new_mul->omod = mul->omod;
2543 }
2544 new_mul->neg[0] ^= true;
2545 new_mul->clamp = false;
2546
2547 ctx.info[instr->definitions[0].tempId()].set_mul(instr.get());
2548 return;
2549 }
2550
2551 /* combine mul+add -> mad */
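/* Illustrative sketch (hypothetical temps):
 *   %m = v_mul_f32 %a, %b
 *   %r = v_add_f32 %m, %c
 * becomes %r = v_mad_f32 %a, %b, %c, or v_fma_f32 when denormals or the
 * chip require exact behaviour (see need_fma below). */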
2552 bool mad32 = instr->opcode == aco_opcode::v_add_f32 ||
2553 instr->opcode == aco_opcode::v_sub_f32 ||
2554 instr->opcode == aco_opcode::v_subrev_f32;
2555 bool mad16 = instr->opcode == aco_opcode::v_add_f16 ||
2556 instr->opcode == aco_opcode::v_sub_f16 ||
2557 instr->opcode == aco_opcode::v_subrev_f16;
2558 if (mad16 || mad32) {
2559 bool need_fma = mad32 ? (block.fp_mode.denorm32 != 0 || ctx.program->chip_class >= GFX10_3) :
2560 (block.fp_mode.denorm16_64 != 0 || ctx.program->chip_class >= GFX10);
2561 if (need_fma && instr->definitions[0].isPrecise())
2562 return;
2563 if (need_fma && mad32 && !ctx.program->has_fast_fma32)
2564 return;
2565
2566 uint32_t uses_src0 = UINT32_MAX;
2567 uint32_t uses_src1 = UINT32_MAX;
2568 Instruction* mul_instr = nullptr;
2569 unsigned add_op_idx;
2570 /* check if any of the operands is a multiplication */
2571 ssa_info *op0_info = instr->operands[0].isTemp() ? &ctx.info[instr->operands[0].tempId()] : NULL;
2572 ssa_info *op1_info = instr->operands[1].isTemp() ? &ctx.info[instr->operands[1].tempId()] : NULL;
2573 if (op0_info && op0_info->is_mul() && (!need_fma || !op0_info->instr->definitions[0].isPrecise()))
2574 uses_src0 = ctx.uses[instr->operands[0].tempId()];
2575 if (op1_info && op1_info->is_mul() && (!need_fma || !op1_info->instr->definitions[0].isPrecise()))
2576 uses_src1 = ctx.uses[instr->operands[1].tempId()];
2577
2578 /* find the 'best' mul instruction to combine with the add */
2579 if (uses_src0 < uses_src1) {
2580 mul_instr = op0_info->instr;
2581 add_op_idx = 1;
2582 } else if (uses_src1 < uses_src0) {
2583 mul_instr = op1_info->instr;
2584 add_op_idx = 0;
2585 } else if (uses_src0 != UINT32_MAX) {
2586 /* tiebreaker: quite random what to pick */
2587 if (op0_info->instr->operands[0].isLiteral()) {
2588 mul_instr = op1_info->instr;
2589 add_op_idx = 0;
2590 } else {
2591 mul_instr = op0_info->instr;
2592 add_op_idx = 1;
2593 }
2594 }
2595 if (mul_instr) {
2596 Operand op[3] = {Operand(v1), Operand(v1), Operand(v1)};
2597 bool neg[3] = {false, false, false};
2598 bool abs[3] = {false, false, false};
2599 unsigned omod = 0;
2600 bool clamp = false;
2601 op[0] = mul_instr->operands[0];
2602 op[1] = mul_instr->operands[1];
2603 op[2] = instr->operands[add_op_idx];
2604 // TODO: would be better to check this before selecting a mul instr?
2605 if (!check_vop3_operands(ctx, 3, op))
2606 return;
2607
2608 if (mul_instr->isVOP3()) {
2609 VOP3A_instruction* vop3 = static_cast<VOP3A_instruction*> (mul_instr);
2610 neg[0] = vop3->neg[0];
2611 neg[1] = vop3->neg[1];
2612 abs[0] = vop3->abs[0];
2613 abs[1] = vop3->abs[1];
2614 /* we cannot use these modifiers between mul and add */
2615 if (vop3->clamp || vop3->omod)
2616 return;
2617 }
2618
2619 /* convert to mad */
2620 ctx.uses[mul_instr->definitions[0].tempId()]--;
2621 if (ctx.uses[mul_instr->definitions[0].tempId()]) {
2622 if (op[0].isTemp())
2623 ctx.uses[op[0].tempId()]++;
2624 if (op[1].isTemp())
2625 ctx.uses[op[1].tempId()]++;
2626 }
2627
2628 if (instr->isVOP3()) {
2629 VOP3A_instruction* vop3 = static_cast<VOP3A_instruction*> (instr.get());
2630 neg[2] = vop3->neg[add_op_idx];
2631 abs[2] = vop3->abs[add_op_idx];
2632 omod = vop3->omod;
2633 clamp = vop3->clamp;
2634 /* abs of the multiplication result */
2635 if (vop3->abs[1 - add_op_idx]) {
2636 neg[0] = false;
2637 neg[1] = false;
2638 abs[0] = true;
2639 abs[1] = true;
2640 }
2641 /* neg of the multiplication result */
2642 neg[1] = neg[1] ^ vop3->neg[1 - add_op_idx];
2643 }
2644 if (instr->opcode == aco_opcode::v_sub_f32 || instr->opcode == aco_opcode::v_sub_f16)
2645 neg[1 + add_op_idx] = neg[1 + add_op_idx] ^ true;
2646 else if (instr->opcode == aco_opcode::v_subrev_f32 || instr->opcode == aco_opcode::v_subrev_f16)
2647 neg[2 - add_op_idx] = neg[2 - add_op_idx] ^ true;
2648
2649 aco_opcode mad_op = need_fma ? aco_opcode::v_fma_f32 : aco_opcode::v_mad_f32;
2650 if (mad16)
2651 mad_op = need_fma ? (ctx.program->chip_class == GFX8 ? aco_opcode::v_fma_legacy_f16 : aco_opcode::v_fma_f16) :
2652 (ctx.program->chip_class == GFX8 ? aco_opcode::v_mad_legacy_f16 : aco_opcode::v_mad_f16);
2653
2654 aco_ptr<VOP3A_instruction> mad{create_instruction<VOP3A_instruction>(mad_op, Format::VOP3A, 3, 1)};
2655 for (unsigned i = 0; i < 3; i++)
2656 {
2657 mad->operands[i] = op[i];
2658 mad->neg[i] = neg[i];
2659 mad->abs[i] = abs[i];
2660 }
2661 mad->omod = omod;
2662 mad->clamp = clamp;
2663 mad->definitions[0] = instr->definitions[0];
2664
2665 /* mark this ssa_def to be re-checked for profitability and literals */
2666 ctx.mad_infos.emplace_back(std::move(instr), mul_instr->definitions[0].tempId());
2667 ctx.info[mad->definitions[0].tempId()].set_mad(mad.get(), ctx.mad_infos.size() - 1);
2668 instr.reset(mad.release());
2669 return;
2670 }
2671 }
2672 /* v_mul_f32(v_cndmask_b32(0, 1.0, cond), a) -> v_cndmask_b32(0, a, cond) */
2673 else if (instr->opcode == aco_opcode::v_mul_f32 && !instr->isVOP3()) {
2674 for (unsigned i = 0; i < 2; i++) {
2675 if (instr->operands[i].isTemp() && ctx.info[instr->operands[i].tempId()].is_b2f() &&
2676 ctx.uses[instr->operands[i].tempId()] == 1 &&
2677 instr->operands[!i].isTemp() && instr->operands[!i].getTemp().type() == RegType::vgpr) {
2678 ctx.uses[instr->operands[i].tempId()]--;
2679 ctx.uses[ctx.info[instr->operands[i].tempId()].temp.id()]++;
2680
2681 aco_ptr<VOP2_instruction> new_instr{create_instruction<VOP2_instruction>(aco_opcode::v_cndmask_b32, Format::VOP2, 3, 1)};
2682 new_instr->operands[0] = Operand(0u);
2683 new_instr->operands[1] = instr->operands[!i];
2684 new_instr->operands[2] = Operand(ctx.info[instr->operands[i].tempId()].temp);
2685 new_instr->definitions[0] = instr->definitions[0];
2686 instr.reset(new_instr.release());
2687 ctx.info[instr->definitions[0].tempId()].label = 0;
2688 return;
2689 }
2690 }
2691 } else if (instr->opcode == aco_opcode::v_or_b32 && ctx.program->chip_class >= GFX9) {
2692 if (combine_three_valu_op(ctx, instr, aco_opcode::s_or_b32, aco_opcode::v_or3_b32, "012", 1 | 2)) ;
2693 else if (combine_three_valu_op(ctx, instr, aco_opcode::v_or_b32, aco_opcode::v_or3_b32, "012", 1 | 2)) ;
2694 else if (combine_three_valu_op(ctx, instr, aco_opcode::s_and_b32, aco_opcode::v_and_or_b32, "120", 1 | 2)) ;
2695 else if (combine_three_valu_op(ctx, instr, aco_opcode::v_and_b32, aco_opcode::v_and_or_b32, "120", 1 | 2)) ;
2696 else if (combine_three_valu_op(ctx, instr, aco_opcode::s_lshl_b32, aco_opcode::v_lshl_or_b32, "120", 1 | 2)) ;
2697 else combine_three_valu_op(ctx, instr, aco_opcode::v_lshlrev_b32, aco_opcode::v_lshl_or_b32, "210", 1 | 2);
2698 } else if (instr->opcode == aco_opcode::v_xor_b32 && ctx.program->chip_class >= GFX10) {
2699 if (combine_three_valu_op(ctx, instr, aco_opcode::v_xor_b32, aco_opcode::v_xor3_b32, "012", 1 | 2)) ;
2700 else combine_three_valu_op(ctx, instr, aco_opcode::s_xor_b32, aco_opcode::v_xor3_b32, "012", 1 | 2);
2701 } else if (instr->opcode == aco_opcode::v_add_u32) {
2702 if (combine_add_sub_b2i(ctx, instr, aco_opcode::v_addc_co_u32, 1 | 2)) ;
2703 else if (ctx.program->chip_class >= GFX9) {
2704 if (combine_three_valu_op(ctx, instr, aco_opcode::s_xor_b32, aco_opcode::v_xad_u32, "120", 1 | 2)) ;
2705 else if (combine_three_valu_op(ctx, instr, aco_opcode::v_xor_b32, aco_opcode::v_xad_u32, "120", 1 | 2)) ;
2706 else if (combine_three_valu_op(ctx, instr, aco_opcode::s_add_i32, aco_opcode::v_add3_u32, "012", 1 | 2)) ;
2707 else if (combine_three_valu_op(ctx, instr, aco_opcode::s_add_u32, aco_opcode::v_add3_u32, "012", 1 | 2)) ;
2708 else if (combine_three_valu_op(ctx, instr, aco_opcode::v_add_u32, aco_opcode::v_add3_u32, "012", 1 | 2)) ;
2709 else if (combine_three_valu_op(ctx, instr, aco_opcode::s_lshl_b32, aco_opcode::v_lshl_add_u32, "120", 1 | 2)) ;
2710 else combine_three_valu_op(ctx, instr, aco_opcode::v_lshlrev_b32, aco_opcode::v_lshl_add_u32, "210", 1 | 2);
2711 }
2712 } else if (instr->opcode == aco_opcode::v_add_co_u32 ||
2713 instr->opcode == aco_opcode::v_add_co_u32_e64) {
2714 combine_add_sub_b2i(ctx, instr, aco_opcode::v_addc_co_u32, 1 | 2);
2715 } else if (instr->opcode == aco_opcode::v_sub_u32 ||
2716 instr->opcode == aco_opcode::v_sub_co_u32 ||
2717 instr->opcode == aco_opcode::v_sub_co_u32_e64) {
2718 combine_add_sub_b2i(ctx, instr, aco_opcode::v_subbrev_co_u32, 2);
2719 } else if (instr->opcode == aco_opcode::v_subrev_u32 ||
2720 instr->opcode == aco_opcode::v_subrev_co_u32 ||
2721 instr->opcode == aco_opcode::v_subrev_co_u32_e64) {
2722 combine_add_sub_b2i(ctx, instr, aco_opcode::v_subbrev_co_u32, 1);
2723 } else if (instr->opcode == aco_opcode::v_lshlrev_b32 && ctx.program->chip_class >= GFX9) {
2724 combine_three_valu_op(ctx, instr, aco_opcode::v_add_u32, aco_opcode::v_add_lshl_u32, "120", 2);
2725 } else if ((instr->opcode == aco_opcode::s_add_u32 || instr->opcode == aco_opcode::s_add_i32) && ctx.program->chip_class >= GFX9) {
2726 combine_salu_lshl_add(ctx, instr);
2727 } else if (instr->opcode == aco_opcode::s_not_b32) {
2728 combine_salu_not_bitwise(ctx, instr);
2729 } else if (instr->opcode == aco_opcode::s_not_b64) {
2730 if (combine_inverse_comparison(ctx, instr)) ;
2731 else combine_salu_not_bitwise(ctx, instr);
2732 } else if (instr->opcode == aco_opcode::s_and_b32 || instr->opcode == aco_opcode::s_or_b32 ||
2733 instr->opcode == aco_opcode::s_and_b64 || instr->opcode == aco_opcode::s_or_b64) {
2734 if (combine_ordering_test(ctx, instr)) ;
2735 else if (combine_comparison_ordering(ctx, instr)) ;
2736 else if (combine_constant_comparison_ordering(ctx, instr)) ;
2737 else combine_salu_n2(ctx, instr);
2738 } else {
2739 aco_opcode min, max, min3, max3, med3;
2740 bool some_gfx9_only;
2741 if (get_minmax_info(instr->opcode, &min, &max, &min3, &max3, &med3, &some_gfx9_only) &&
2742 (!some_gfx9_only || ctx.program->chip_class >= GFX9)) {
2743 if (combine_minmax(ctx, instr, instr->opcode == min ? max : min, instr->opcode == min ? min3 : max3)) ;
2744 else combine_clamp(ctx, instr, min, max, med3);
2745 }
2746 }
2747 }
2748
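/* Rewrite a uniform bitwise boolean op to its 32-bit SCC-based form, e.g.
 * (illustrative) s_and_b64 on two uniform booleans becomes s_and_b32 on
 * their scalar temps. s_xor uses s_absdiff_i32, which computes the same
 * 0/1 result for boolean inputs. */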
2749 bool to_uniform_bool_instr(opt_ctx &ctx, aco_ptr<Instruction> &instr)
2750 {
2751 switch (instr->opcode) {
2752 case aco_opcode::s_and_b32:
2753 case aco_opcode::s_and_b64:
2754 instr->opcode = aco_opcode::s_and_b32;
2755 break;
2756 case aco_opcode::s_or_b32:
2757 case aco_opcode::s_or_b64:
2758 instr->opcode = aco_opcode::s_or_b32;
2759 break;
2760 case aco_opcode::s_xor_b32:
2761 case aco_opcode::s_xor_b64:
2762 instr->opcode = aco_opcode::s_absdiff_i32;
2763 break;
2764 default:
2765 /* Don't transform other instructions. They are very unlikely to appear here. */
2766 return false;
2767 }
2768
2769 for (Operand &op : instr->operands) {
2770 ctx.uses[op.tempId()]--;
2771
2772 if (ctx.info[op.tempId()].is_uniform_bool()) {
2773 /* Just use the uniform boolean temp. */
2774 op.setTemp(ctx.info[op.tempId()].temp);
2775 } else if (ctx.info[op.tempId()].is_uniform_bitwise()) {
2776 /* Use the SCC definition of the predecessor instruction.
2777 * This allows the predecessor to get picked up by the same optimization (if it has no divergent users),
2778 * and it also makes sure that the current instruction will keep working even if the predecessor won't be transformed.
2779 */
2780 Instruction *pred_instr = ctx.info[op.tempId()].instr;
2781 assert(pred_instr->definitions.size() >= 2);
2782 assert(pred_instr->definitions[1].isFixed() && pred_instr->definitions[1].physReg() == scc);
2783 op.setTemp(pred_instr->definitions[1].getTemp());
2784 } else {
2785 unreachable("Invalid operand on uniform bitwise instruction.");
2786 }
2787
2788 ctx.uses[op.tempId()]++;
2789 }
2790
2791 instr->definitions[0].setTemp(Temp(instr->definitions[0].tempId(), s1));
2792 assert(instr->operands[0].regClass() == s1);
2793 assert(instr->operands[1].regClass() == s1);
2794 return true;
2795 }
2796
2797 void select_instruction(opt_ctx &ctx, aco_ptr<Instruction>& instr)
2798 {
2799 const uint32_t threshold = 4;
2800
2801 if (is_dead(ctx.uses, instr.get())) {
2802 instr.reset();
2803 return;
2804 }
2805
2806 /* convert split_vector into a copy or extract_vector if only one definition is ever used */
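/* Illustrative sketch (hypothetical temps): in
 *   %lo, %hi = p_split_vector %v
 * where only %hi is used, the split becomes
 *   %hi = p_extract_vector %v, 1
 * and if %v itself comes from a single-use p_create_vector, the matching
 * source operand is copied directly instead. */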
2807 if (instr->opcode == aco_opcode::p_split_vector) {
2808 unsigned num_used = 0;
2809 unsigned idx = 0;
2810 unsigned split_offset = 0;
2811 for (unsigned i = 0, offset = 0; i < instr->definitions.size(); offset += instr->definitions[i++].bytes()) {
2812 if (ctx.uses[instr->definitions[i].tempId()]) {
2813 num_used++;
2814 idx = i;
2815 split_offset = offset;
2816 }
2817 }
2818 bool done = false;
2819 if (num_used == 1 && ctx.info[instr->operands[0].tempId()].is_vec() &&
2820 ctx.uses[instr->operands[0].tempId()] == 1) {
2821 Instruction *vec = ctx.info[instr->operands[0].tempId()].instr;
2822
2823 unsigned off = 0;
2824 Operand op;
2825 for (Operand& vec_op : vec->operands) {
2826 if (off == split_offset) {
2827 op = vec_op;
2828 break;
2829 }
2830 off += vec_op.bytes();
2831 }
2832 if (off != instr->operands[0].bytes() && op.bytes() == instr->definitions[idx].bytes()) {
2833 ctx.uses[instr->operands[0].tempId()]--;
2834 for (Operand& vec_op : vec->operands) {
2835 if (vec_op.isTemp())
2836 ctx.uses[vec_op.tempId()]--;
2837 }
2838 if (op.isTemp())
2839 ctx.uses[op.tempId()]++;
2840
2841 aco_ptr<Pseudo_instruction> extract{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, 1, 1)};
2842 extract->operands[0] = op;
2843 extract->definitions[0] = instr->definitions[idx];
2844 instr.reset(extract.release());
2845
2846 done = true;
2847 }
2848 }
2849
2850 if (!done && num_used == 1 &&
2851 instr->operands[0].bytes() % instr->definitions[idx].bytes() == 0 &&
2852 split_offset % instr->definitions[idx].bytes() == 0) {
2853 aco_ptr<Pseudo_instruction> extract{create_instruction<Pseudo_instruction>(aco_opcode::p_extract_vector, Format::PSEUDO, 2, 1)};
2854 extract->operands[0] = instr->operands[0];
2855 extract->operands[1] = Operand((uint32_t) split_offset / instr->definitions[idx].bytes());
2856 extract->definitions[0] = instr->definitions[idx];
2857 instr.reset(extract.release());
2858 }
2859 }
2860
2861 mad_info* mad_info = NULL;
2862 if (!instr->definitions.empty() && ctx.info[instr->definitions[0].tempId()].is_mad()) {
2863 mad_info = &ctx.mad_infos[ctx.info[instr->definitions[0].tempId()].instr->pass_flags];
2864 /* re-check mad instructions */
2865 if (ctx.uses[mad_info->mul_temp_id]) {
2866 ctx.uses[mad_info->mul_temp_id]++;
2867 if (instr->operands[0].isTemp())
2868 ctx.uses[instr->operands[0].tempId()]--;
2869 if (instr->operands[1].isTemp())
2870 ctx.uses[instr->operands[1].tempId()]--;
2871 instr.swap(mad_info->add_instr);
2872 mad_info = NULL;
2873 }
2874 /* check literals */
2875 else if (!instr->usesModifiers()) {
2876 /* FMA can only take literals on GFX10+ */
2877 if ((instr->opcode == aco_opcode::v_fma_f32 || instr->opcode == aco_opcode::v_fma_f16) &&
2878 ctx.program->chip_class < GFX10)
2879 return;
2880
2881 bool sgpr_used = false;
2882 uint32_t literal_idx = 0;
2883 uint32_t literal_uses = UINT32_MAX;
2884 for (unsigned i = 0; i < instr->operands.size(); i++)
2885 {
2886 if (instr->operands[i].isConstant() && i > 0) {
2887 literal_uses = UINT32_MAX;
2888 break;
2889 }
2890 if (!instr->operands[i].isTemp())
2891 continue;
2892 unsigned bits = get_operand_size(instr, i);
2893 /* if one of the operands is an sgpr, we cannot add a literal anywhere else on pre-GFX10, nor to operands other than the 1st */
2894 if (instr->operands[i].getTemp().type() == RegType::sgpr && (i > 0 || ctx.program->chip_class < GFX10)) {
2895 if (!sgpr_used && ctx.info[instr->operands[i].tempId()].is_literal(bits)) {
2896 literal_uses = ctx.uses[instr->operands[i].tempId()];
2897 literal_idx = i;
2898 } else {
2899 literal_uses = UINT32_MAX;
2900 }
2901 sgpr_used = true;
2902 /* don't break because we still need to check constants */
2903 } else if (!sgpr_used &&
2904 ctx.info[instr->operands[i].tempId()].is_literal(bits) &&
2905 ctx.uses[instr->operands[i].tempId()] < literal_uses) {
2906 literal_uses = ctx.uses[instr->operands[i].tempId()];
2907 literal_idx = i;
2908 }
2909 }
2910
2911 /* Limit the number of literals applied so as not to increase the code
2912 * size too much, but always apply literals for v_mad->v_madak
2913 * because both instructions are 64-bit and this doesn't increase
2914 * code size.
2915 * TODO: try to apply the literals earlier to lower the number of
2916 * uses below threshold
2917 */
2918 if (literal_uses < threshold || literal_idx == 2) {
2919 ctx.uses[instr->operands[literal_idx].tempId()]--;
2920 mad_info->check_literal = true;
2921 mad_info->literal_idx = literal_idx;
2922 return;
2923 }
2924 }
2925 }
2926
2927 /* Mark SCC needed, so the uniform boolean transformation won't swap the definitions when it isn't beneficial */
2928 if (instr->format == Format::PSEUDO_BRANCH &&
2929 instr->operands.size() &&
2930 instr->operands[0].isTemp()) {
2931 ctx.info[instr->operands[0].tempId()].set_scc_needed();
2932 return;
2933 } else if ((instr->opcode == aco_opcode::s_cselect_b64 ||
2934 instr->opcode == aco_opcode::s_cselect_b32) &&
2935 instr->operands[2].isTemp()) {
2936 ctx.info[instr->operands[2].tempId()].set_scc_needed();
2937 }
2938
2939 /* check for literals */
2940 if (!instr->isSALU() && !instr->isVALU())
2941 return;
2942
2943 /* Transform uniform bitwise boolean operations to 32-bit when there are no divergent uses. */
2944 if (instr->definitions.size() &&
2945 ctx.uses[instr->definitions[0].tempId()] == 0 &&
2946 ctx.info[instr->definitions[0].tempId()].is_uniform_bitwise()) {
2947 bool transform_done = to_uniform_bool_instr(ctx, instr);
2948
2949 if (transform_done && !ctx.info[instr->definitions[1].tempId()].is_scc_needed()) {
2950 /* Swap the two definition IDs in order to avoid overusing the SCC. This reduces extra moves generated by RA. */
2951 uint32_t def0_id = instr->definitions[0].getTemp().id();
2952 uint32_t def1_id = instr->definitions[1].getTemp().id();
2953 instr->definitions[0].setTemp(Temp(def1_id, s1));
2954 instr->definitions[1].setTemp(Temp(def0_id, s1));
2955 }
2956
2957 return;
2958 }
2959
2960 if (instr->isSDWA() || instr->isDPP() || (instr->isVOP3() && ctx.program->chip_class < GFX10))
2961 return; /* some encodings can't ever take literals */
2962
2963 /* we do not apply the literals yet as we don't know if it is profitable */
2964 Operand current_literal(s1);
2965
2966 unsigned literal_id = 0;
2967 unsigned literal_uses = UINT32_MAX;
2968 Operand literal(s1);
2969 unsigned num_operands = 1;
2970 if (instr->isSALU() || (ctx.program->chip_class >= GFX10 && can_use_VOP3(ctx, instr)))
2971 num_operands = instr->operands.size();
2972 /* catch VOP2 with a 3rd SGPR operand (e.g. v_cndmask_b32, v_addc_co_u32) */
2973 else if (instr->isVALU() && instr->operands.size() >= 3)
2974 return;
2975
2976 unsigned sgpr_ids[2] = {0, 0};
2977 bool is_literal_sgpr = false;
2978 uint32_t mask = 0;
2979
2980 /* choose a literal to apply */
2981 for (unsigned i = 0; i < num_operands; i++) {
2982 Operand op = instr->operands[i];
2983 unsigned bits = get_operand_size(instr, i);
2984
2985 if (instr->isVALU() && op.isTemp() && op.getTemp().type() == RegType::sgpr &&
2986 op.tempId() != sgpr_ids[0])
2987 sgpr_ids[!!sgpr_ids[0]] = op.tempId();
2988
2989 if (op.isLiteral()) {
2990 current_literal = op;
2991 continue;
2992 } else if (!op.isTemp() || !ctx.info[op.tempId()].is_literal(bits)) {
2993 continue;
2994 }
2995
2996 if (!alu_can_accept_constant(instr->opcode, i))
2997 continue;
2998
2999 if (ctx.uses[op.tempId()] < literal_uses) {
3000 is_literal_sgpr = op.getTemp().type() == RegType::sgpr;
3001 mask = 0;
3002 literal = Operand(ctx.info[op.tempId()].val);
3003 literal_uses = ctx.uses[op.tempId()];
3004 literal_id = op.tempId();
3005 }
3006
3007 mask |= (op.tempId() == literal_id) << i;
3008 }
3009
3010
3011 /* don't go over the constant bus limit */
3012 bool is_shift64 = instr->opcode == aco_opcode::v_lshlrev_b64 ||
3013 instr->opcode == aco_opcode::v_lshrrev_b64 ||
3014 instr->opcode == aco_opcode::v_ashrrev_i64;
3015 unsigned const_bus_limit = instr->isVALU() ? 1 : UINT32_MAX;
3016 if (ctx.program->chip_class >= GFX10 && !is_shift64)
3017 const_bus_limit = 2;
3018
3019 unsigned num_sgprs = !!sgpr_ids[0] + !!sgpr_ids[1];
3020 if (num_sgprs == const_bus_limit && !is_literal_sgpr)
3021 return;
3022
3023 if (literal_id && literal_uses < threshold &&
3024 (current_literal.isUndefined() ||
3025 (current_literal.size() == literal.size() &&
3026 current_literal.constantValue() == literal.constantValue()))) {
3027 /* mark the literal to be applied */
3028 while (mask) {
3029 unsigned i = u_bit_scan(&mask);
3030 if (instr->operands[i].isTemp() && instr->operands[i].tempId() == literal_id)
3031 ctx.uses[instr->operands[i].tempId()]--;
3032 }
3033 }
3034 }
3035
3036
3037 void apply_literals(opt_ctx &ctx, aco_ptr<Instruction>& instr)
3038 {
3039 /* clean up dead instructions */
3040 if (!instr)
3041 return;
3042
3043 /* apply literals on MAD */
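/* Illustrative sketch (hypothetical temps): if the addend of
 *   %r = v_mad_f32 %a, %b, %c
 * turns out to be an otherwise unused literal, the mad becomes
 *   %r = v_madak_f32 %a, %b, #lit
 * while a literal multiplicand yields v_madmk_f32. Both encodings are
 * 64-bit, so this never grows code size. */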
3044 if (!instr->definitions.empty() && ctx.info[instr->definitions[0].tempId()].is_mad()) {
3045 mad_info* info = &ctx.mad_infos[ctx.info[instr->definitions[0].tempId()].instr->pass_flags];
3046 if (info->check_literal &&
3047 (ctx.uses[instr->operands[info->literal_idx].tempId()] == 0 || info->literal_idx == 2)) {
3048 aco_ptr<Instruction> new_mad;
3049
3050 aco_opcode new_op = info->literal_idx == 2 ? aco_opcode::v_madak_f32 : aco_opcode::v_madmk_f32;
3051 if (instr->opcode == aco_opcode::v_fma_f32)
3052 new_op = info->literal_idx == 2 ? aco_opcode::v_fmaak_f32 : aco_opcode::v_fmamk_f32;
3053 else if (instr->opcode == aco_opcode::v_mad_f16 || instr->opcode == aco_opcode::v_mad_legacy_f16)
3054 new_op = info->literal_idx == 2 ? aco_opcode::v_madak_f16 : aco_opcode::v_madmk_f16;
3055 else if (instr->opcode == aco_opcode::v_fma_f16)
3056 new_op = info->literal_idx == 2 ? aco_opcode::v_fmaak_f16 : aco_opcode::v_fmamk_f16;
3057
3058 new_mad.reset(create_instruction<VOP2_instruction>(new_op, Format::VOP2, 3, 1));
3059 if (info->literal_idx == 2) { /* add literal -> madak */
3060 new_mad->operands[0] = instr->operands[0];
3061 new_mad->operands[1] = instr->operands[1];
3062 } else { /* mul literal -> madmk */
3063 new_mad->operands[0] = instr->operands[1 - info->literal_idx];
3064 new_mad->operands[1] = instr->operands[2];
3065 }
3066 new_mad->operands[2] = Operand(ctx.info[instr->operands[info->literal_idx].tempId()].val);
3067 new_mad->definitions[0] = instr->definitions[0];
3068 ctx.instructions.emplace_back(std::move(new_mad));
3069 return;
3070 }
3071 }
3072
3073 /* apply literals on other SALU/VALU */
3074 if (instr->isSALU() || instr->isVALU()) {
3075 for (unsigned i = 0; i < instr->operands.size(); i++) {
3076 Operand op = instr->operands[i];
3077 unsigned bits = get_operand_size(instr, i);
3078 if (op.isTemp() && ctx.info[op.tempId()].is_literal(bits) && ctx.uses[op.tempId()] == 0) {
3079 Operand literal(ctx.info[op.tempId()].val);
3080 if (instr->isVALU() && i > 0)
3081 to_VOP3(ctx, instr);
3082 instr->operands[i] = literal;
3083 }
3084 }
3085 }
3086
3087 ctx.instructions.emplace_back(std::move(instr));
3088 }
3089
3090
3091 void optimize(Program* program)
3092 {
3093 opt_ctx ctx;
3094 ctx.program = program;
3095 std::vector<ssa_info> info(program->peekAllocationId());
3096 ctx.info = info.data();
3097
3098 /* 1. Bottom-Up DAG pass (forward) to label all ssa-defs */
3099 for (Block& block : program->blocks) {
3100 for (aco_ptr<Instruction>& instr : block.instructions)
3101 label_instruction(ctx, block, instr);
3102 }
3103
3104 ctx.uses = dead_code_analysis(program);
3105
3106 /* 2. Combine v_mad, omod, clamp and propagate sgpr on VALU instructions */
3107 for (Block& block : program->blocks) {
3108 for (aco_ptr<Instruction>& instr : block.instructions)
3109 combine_instruction(ctx, block, instr);
3110 }
3111
3112 /* 3. Top-Down DAG pass (backward) to select instructions (includes DCE) */
3113 for (std::vector<Block>::reverse_iterator it = program->blocks.rbegin(); it != program->blocks.rend(); ++it) {
3114 Block* block = &(*it);
3115 for (std::vector<aco_ptr<Instruction>>::reverse_iterator it = block->instructions.rbegin(); it != block->instructions.rend(); ++it)
3116 select_instruction(ctx, *it);
3117 }
3118
3119 /* 4. Add literals to instructions */
3120 for (Block& block : program->blocks) {
3121 ctx.instructions.clear();
3122 for (aco_ptr<Instruction>& instr : block.instructions)
3123 apply_literals(ctx, instr);
3124 block.instructions.swap(ctx.instructions);
3125 }
3126
3127 }
3128
3129 }