37564b7e993fe60fe5814b25e198ab503c173e91
[mesa.git] / src / amd / compiler / aco_optimizer.cpp
1 /*
2 * Copyright © 2018 Valve Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * Authors:
24 * Daniel Schürmann (daniel.schuermann@campus.tu-berlin.de)
25 *
26 */
27
28 #include <algorithm>
29 #include <math.h>
30
31 #include "aco_ir.h"
32 #include "util/half_float.h"
33 #include "util/u_math.h"
34
35 namespace aco {
36
37 /**
38 * The optimizer works in 4 phases:
39 * (1) The first pass collects information for each ssa-def,
40 * propagates reg->reg operands of the same type, inline constants
41 * and neg/abs input modifiers.
42 * (2) The second pass combines instructions like mad, omod, clamp and
43 * propagates sgpr's on VALU instructions.
44 * This pass depends on information collected in the first pass.
45 * (3) The third pass goes backwards, and selects instructions,
46 * i.e. decides if a mad instruction is profitable and eliminates dead code.
47 * (4) The fourth pass cleans up the sequence: literals get applied and dead
48 * instructions are removed from the sequence.
49 */
50
51
52 struct mad_info {
53 aco_ptr<Instruction> add_instr;
54 uint32_t mul_temp_id;
55 uint32_t literal_idx;
56 bool check_literal;
57
58 mad_info(aco_ptr<Instruction> instr, uint32_t id)
59 : add_instr(std::move(instr)), mul_temp_id(id), check_literal(false) {}
60 };
61
/* Labels attached to each SSA definition during the first pass. They are
 * stored as a bitmask in ssa_info::label; each label determines which
 * ssa_info payload field (val, temp or instr) is valid. */
enum Label {
   label_vec = 1 << 0,
   label_constant = 1 << 1,
   /* label_{abs,neg,mul,omod2,omod4,omod5,clamp} are used for both 16 and
    * 32-bit operations but this shouldn't cause any issues because we don't
    * look through any conversions */
   label_abs = 1 << 2,
   label_neg = 1 << 3,
   label_mul = 1 << 4,
   label_temp = 1 << 5,
   label_literal = 1 << 6,
   label_mad = 1 << 7,
   label_omod2 = 1 << 8,
   label_omod4 = 1 << 9,
   label_omod5 = 1 << 10,
   label_omod_success = 1 << 11,
   label_clamp = 1 << 12,
   label_clamp_success = 1 << 13,
   label_undefined = 1 << 14,
   label_vcc = 1 << 15,
   label_b2f = 1 << 16,
   label_add_sub = 1 << 17,
   label_bitwise = 1 << 18,
   label_minmax = 1 << 19,
   label_fcmp = 1 << 20,
   label_uniform_bool = 1 << 21,
   label_constant_64bit = 1 << 22,
   label_uniform_bitwise = 1 << 23,
   label_scc_invert = 1 << 24,
   label_vcc_hint = 1 << 25,
   label_scc_needed = 1 << 26,
   label_b2i = 1 << 27,
};
95
/* Label groups by the ssa_info payload field they use. ssa_info::add_label()
 * clears conflicting groups before setting a new label ("temp" and "instr"
 * alias through a union). Note that label_mad appears in both instr_labels
 * and val_labels: set_mad() stores the instruction pointer and the mad_info
 * index at the same time. */
static constexpr uint32_t instr_labels = label_vec | label_mul | label_mad | label_omod_success | label_clamp_success |
                                         label_add_sub | label_bitwise | label_uniform_bitwise | label_minmax | label_fcmp;
static constexpr uint32_t temp_labels = label_abs | label_neg | label_temp | label_vcc | label_b2f | label_uniform_bool |
                                        label_omod2 | label_omod4 | label_omod5 | label_clamp | label_scc_invert | label_b2i;
static constexpr uint32_t val_labels = label_constant | label_constant_64bit | label_literal | label_mad;
101
/* Per-SSA-definition analysis record, indexed by temp id in opt_ctx::info.
 *
 * "label" is a bitmask of the Label flags above. Depending on which labels
 * are set, the record additionally carries a raw value ("val"), a temporary
 * ("temp") or a pointer to the labeled instruction ("instr"). "temp" and
 * "instr" share storage, so add_label() must clear aliasing label groups
 * before a new label of a different group is applied. */
struct ssa_info {
   uint32_t val;
   union {
      Temp temp;
      Instruction* instr;
   };
   uint32_t label;

   ssa_info() : label(0) {}

   /* Set a label, clearing any labels whose payload storage it would
    * otherwise corrupt. */
   void add_label(Label new_label)
   {
      /* Since all labels which use "instr" use it for the same thing
       * (indicating the defining instruction), there is no need to clear
       * any other instr labels. */
      if (new_label & instr_labels)
         label &= ~temp_labels; /* instr and temp alias */

      if (new_label & temp_labels) {
         label &= ~temp_labels;
         label &= ~instr_labels; /* instr and temp alias */
      }

      if (new_label & val_labels)
         label &= ~val_labels;

      label |= new_label;
   }

   /* --- instr-payload labels --- */

   void set_vec(Instruction* vec)
   {
      add_label(label_vec);
      instr = vec;
   }

   bool is_vec()
   {
      return label & label_vec;
   }

   /* --- val-payload labels --- */

   void set_constant(uint32_t constant)
   {
      add_label(label_constant);
      val = constant;
   }

   bool is_constant()
   {
      return label & label_constant;
   }

   void set_constant_64bit(uint32_t constant)
   {
      add_label(label_constant_64bit);
      val = constant;
   }

   bool is_constant_64bit()
   {
      return label & label_constant_64bit;
   }

   /* --- temp-payload labels (input-modifier tracking) --- */

   void set_abs(Temp abs_temp)
   {
      add_label(label_abs);
      temp = abs_temp;
   }

   bool is_abs()
   {
      return label & label_abs;
   }

   void set_neg(Temp neg_temp)
   {
      add_label(label_neg);
      temp = neg_temp;
   }

   bool is_neg()
   {
      return label & label_neg;
   }

   /* neg and abs set together; both share the same source temp */
   void set_neg_abs(Temp neg_abs_temp)
   {
      add_label((Label)((uint32_t)label_abs | (uint32_t)label_neg));
      temp = neg_abs_temp;
   }

   void set_mul(Instruction* mul)
   {
      add_label(label_mul);
      instr = mul;
   }

   bool is_mul()
   {
      return label & label_mul;
   }

   void set_temp(Temp tmp)
   {
      add_label(label_temp);
      temp = tmp;
   }

   bool is_temp()
   {
      return label & label_temp;
   }

   void set_literal(uint32_t lit)
   {
      add_label(label_literal);
      val = lit;
   }

   bool is_literal()
   {
      return label & label_literal;
   }

   /* label_mad is the one label in both instr_labels and val_labels:
    * it stores the mad instruction and an index into ctx.mad_infos. */
   void set_mad(Instruction* mad, uint32_t mad_info_idx)
   {
      add_label(label_mad);
      val = mad_info_idx;
      instr = mad;
   }

   bool is_mad()
   {
      return label & label_mad;
   }

   /* --- output-modifier candidates (omod / clamp) --- */

   void set_omod2(Temp def)
   {
      add_label(label_omod2);
      temp = def;
   }

   bool is_omod2()
   {
      return label & label_omod2;
   }

   void set_omod4(Temp def)
   {
      add_label(label_omod4);
      temp = def;
   }

   bool is_omod4()
   {
      return label & label_omod4;
   }

   void set_omod5(Temp def)
   {
      add_label(label_omod5);
      temp = def;
   }

   bool is_omod5()
   {
      return label & label_omod5;
   }

   void set_omod_success(Instruction* omod_instr)
   {
      add_label(label_omod_success);
      instr = omod_instr;
   }

   bool is_omod_success()
   {
      return label & label_omod_success;
   }

   void set_clamp(Temp def)
   {
      add_label(label_clamp);
      temp = def;
   }

   bool is_clamp()
   {
      return label & label_clamp;
   }

   void set_clamp_success(Instruction* clamp_instr)
   {
      add_label(label_clamp_success);
      instr = clamp_instr;
   }

   bool is_clamp_success()
   {
      return label & label_clamp_success;
   }

   /* no payload; marks an undefined source value */
   void set_undefined()
   {
      add_label(label_undefined);
   }

   bool is_undefined()
   {
      return label & label_undefined;
   }

   void set_vcc(Temp vcc)
   {
      add_label(label_vcc);
      temp = vcc;
   }

   bool is_vcc()
   {
      return label & label_vcc;
   }

   bool is_constant_or_literal()
   {
      return is_constant() || is_literal();
   }

   /* boolean-to-float conversion source */
   void set_b2f(Temp val)
   {
      add_label(label_b2f);
      temp = val;
   }

   bool is_b2f()
   {
      return label & label_b2f;
   }

   /* --- instruction-kind labels used by the combine pass --- */

   void set_add_sub(Instruction *add_sub_instr)
   {
      add_label(label_add_sub);
      instr = add_sub_instr;
   }

   bool is_add_sub()
   {
      return label & label_add_sub;
   }

   void set_bitwise(Instruction *bitwise_instr)
   {
      add_label(label_bitwise);
      instr = bitwise_instr;
   }

   bool is_bitwise()
   {
      return label & label_bitwise;
   }

   void set_uniform_bitwise()
   {
      add_label(label_uniform_bitwise);
   }

   bool is_uniform_bitwise()
   {
      return label & label_uniform_bitwise;
   }

   void set_minmax(Instruction *minmax_instr)
   {
      add_label(label_minmax);
      instr = minmax_instr;
   }

   bool is_minmax()
   {
      return label & label_minmax;
   }

   void set_fcmp(Instruction *fcmp_instr)
   {
      add_label(label_fcmp);
      instr = fcmp_instr;
   }

   bool is_fcmp()
   {
      return label & label_fcmp;
   }

   /* --- SCC / uniform-boolean tracking --- */

   void set_scc_needed()
   {
      add_label(label_scc_needed);
   }

   bool is_scc_needed()
   {
      return label & label_scc_needed;
   }

   void set_scc_invert(Temp scc_inv)
   {
      add_label(label_scc_invert);
      temp = scc_inv;
   }

   bool is_scc_invert()
   {
      return label & label_scc_invert;
   }

   void set_uniform_bool(Temp uniform_bool)
   {
      add_label(label_uniform_bool);
      temp = uniform_bool;
   }

   bool is_uniform_bool()
   {
      return label & label_uniform_bool;
   }

   /* no payload; hints that the value should live in vcc */
   void set_vcc_hint()
   {
      add_label(label_vcc_hint);
   }

   bool is_vcc_hint()
   {
      return label & label_vcc_hint;
   }

   /* boolean-to-integer conversion source */
   void set_b2i(Temp val)
   {
      add_label(label_b2i);
      temp = val;
   }

   bool is_b2i()
   {
      return label & label_b2i;
   }

};
448
/* State shared by all optimizer passes. */
struct opt_ctx {
   Program* program;
   std::vector<aco_ptr<Instruction>> instructions; /* NOTE(review): filled/consumed by the later passes — usage outside this chunk */
   ssa_info* info;                                 /* per-SSA analysis records, indexed by temp id */
   std::pair<uint32_t,Temp> last_literal;          /* presumably the most recent literal seen (id, temp) — confirm against later passes */
   std::vector<mad_info> mad_infos;                /* records referenced by ssa_info::set_mad() via index */
   std::vector<uint16_t> uses;                     /* presumably per-temp use counts — confirm against the selection pass */
};
457
458 bool can_swap_operands(aco_ptr<Instruction>& instr)
459 {
460 if (instr->operands[0].isConstant() ||
461 (instr->operands[0].isTemp() && instr->operands[0].getTemp().type() == RegType::sgpr))
462 return false;
463
464 switch (instr->opcode) {
465 case aco_opcode::v_add_f32:
466 case aco_opcode::v_mul_f32:
467 case aco_opcode::v_or_b32:
468 case aco_opcode::v_and_b32:
469 case aco_opcode::v_xor_b32:
470 case aco_opcode::v_max_f32:
471 case aco_opcode::v_min_f32:
472 case aco_opcode::v_max_i32:
473 case aco_opcode::v_min_i32:
474 case aco_opcode::v_max_u32:
475 case aco_opcode::v_min_u32:
476 case aco_opcode::v_cmp_eq_f32:
477 case aco_opcode::v_cmp_lg_f32:
478 return true;
479 case aco_opcode::v_sub_f32:
480 instr->opcode = aco_opcode::v_subrev_f32;
481 return true;
482 case aco_opcode::v_cmp_lt_f32:
483 instr->opcode = aco_opcode::v_cmp_gt_f32;
484 return true;
485 case aco_opcode::v_cmp_ge_f32:
486 instr->opcode = aco_opcode::v_cmp_le_f32;
487 return true;
488 case aco_opcode::v_cmp_lt_i32:
489 instr->opcode = aco_opcode::v_cmp_gt_i32;
490 return true;
491 default:
492 return false;
493 }
494 }
495
496 bool can_use_VOP3(opt_ctx& ctx, const aco_ptr<Instruction>& instr)
497 {
498 if (instr->isVOP3())
499 return true;
500
501 if (instr->operands.size() && instr->operands[0].isLiteral() && ctx.program->chip_class < GFX10)
502 return false;
503
504 if (instr->isDPP() || instr->isSDWA())
505 return false;
506
507 return instr->opcode != aco_opcode::v_madmk_f32 &&
508 instr->opcode != aco_opcode::v_madak_f32 &&
509 instr->opcode != aco_opcode::v_madmk_f16 &&
510 instr->opcode != aco_opcode::v_madak_f16 &&
511 instr->opcode != aco_opcode::v_fmamk_f32 &&
512 instr->opcode != aco_opcode::v_fmaak_f32 &&
513 instr->opcode != aco_opcode::v_fmamk_f16 &&
514 instr->opcode != aco_opcode::v_fmaak_f16 &&
515 instr->opcode != aco_opcode::v_readlane_b32 &&
516 instr->opcode != aco_opcode::v_writelane_b32 &&
517 instr->opcode != aco_opcode::v_readfirstlane_b32;
518 }
519
520 bool can_apply_sgprs(aco_ptr<Instruction>& instr)
521 {
522 return instr->opcode != aco_opcode::v_readfirstlane_b32 &&
523 instr->opcode != aco_opcode::v_readlane_b32 &&
524 instr->opcode != aco_opcode::v_readlane_b32_e64 &&
525 instr->opcode != aco_opcode::v_writelane_b32 &&
526 instr->opcode != aco_opcode::v_writelane_b32_e64;
527 }
528
529 void to_VOP3(opt_ctx& ctx, aco_ptr<Instruction>& instr)
530 {
531 if (instr->isVOP3())
532 return;
533
534 aco_ptr<Instruction> tmp = std::move(instr);
535 Format format = asVOP3(tmp->format);
536 instr.reset(create_instruction<VOP3A_instruction>(tmp->opcode, format, tmp->operands.size(), tmp->definitions.size()));
537 std::copy(tmp->operands.cbegin(), tmp->operands.cend(), instr->operands.begin());
538 for (unsigned i = 0; i < instr->definitions.size(); i++) {
539 instr->definitions[i] = tmp->definitions[i];
540 if (instr->definitions[i].isTemp()) {
541 ssa_info& info = ctx.info[instr->definitions[i].tempId()];
542 if (info.label & instr_labels && info.instr == tmp.get())
543 info.instr = instr.get();
544 }
545 }
546 }
547
548 /* only covers special cases */
549 bool alu_can_accept_constant(aco_opcode opcode, unsigned operand)
550 {
551 switch (opcode) {
552 case aco_opcode::v_interp_p2_f32:
553 case aco_opcode::v_mac_f32:
554 case aco_opcode::v_writelane_b32:
555 case aco_opcode::v_writelane_b32_e64:
556 case aco_opcode::v_cndmask_b32:
557 return operand != 2;
558 case aco_opcode::s_addk_i32:
559 case aco_opcode::s_mulk_i32:
560 case aco_opcode::p_wqm:
561 case aco_opcode::p_extract_vector:
562 case aco_opcode::p_split_vector:
563 case aco_opcode::v_readlane_b32:
564 case aco_opcode::v_readlane_b32_e64:
565 case aco_opcode::v_readfirstlane_b32:
566 return operand != 0;
567 default:
568 return true;
569 }
570 }
571
572 bool valu_can_accept_vgpr(aco_ptr<Instruction>& instr, unsigned operand)
573 {
574 if (instr->opcode == aco_opcode::v_readlane_b32 || instr->opcode == aco_opcode::v_readlane_b32_e64 ||
575 instr->opcode == aco_opcode::v_writelane_b32 || instr->opcode == aco_opcode::v_writelane_b32_e64)
576 return operand != 1;
577 return true;
578 }
579
/* check constant bus and literal limitations */
/* Returns true if the given operand set satisfies the constant-bus limit
 * (1 read before GFX10, 2 from GFX10) and the literal rules: no literals
 * before GFX10 here, and at most one distinct 32-bit plus one distinct
 * 64-bit literal afterwards. Duplicate SGPR reads and duplicate literals
 * are counted only once against the limit. */
bool check_vop3_operands(opt_ctx& ctx, unsigned num_operands, Operand *operands)
{
   int limit = ctx.program->chip_class >= GFX10 ? 2 : 1;
   Operand literal32(s1); /* starts undefined; records the first 32-bit literal seen */
   Operand literal64(s2); /* starts undefined; records the first 64-bit literal seen */
   unsigned num_sgprs = 0;
   unsigned sgpr[] = {0, 0};

   for (unsigned i = 0; i < num_operands; i++) {
      Operand op = operands[i];

      if (op.hasRegClass() && op.regClass().type() == RegType::sgpr) {
         /* two reads of the same SGPR count as 1 to the limit */
         if (op.tempId() != sgpr[0] && op.tempId() != sgpr[1]) {
            if (num_sgprs < 2)
               sgpr[num_sgprs++] = op.tempId();
            limit--;
            if (limit < 0)
               return false;
         }
      } else if (op.isLiteral()) {
         if (ctx.program->chip_class < GFX10)
            return false;

         /* a second, different literal of the same width is not allowed */
         if (!literal32.isUndefined() && literal32.constantValue() != op.constantValue())
            return false;
         if (!literal64.isUndefined() && literal64.constantValue() != op.constantValue())
            return false;

         /* Any number of 32-bit literals counts as only 1 to the limit. Same
          * (but separately) for 64-bit literals. */
         if (op.size() == 1 && literal32.isUndefined()) {
            limit--;
            literal32 = op;
         } else if (op.size() == 2 && literal64.isUndefined()) {
            limit--;
            literal64 = op;
         }

         if (limit < 0)
            return false;
      }
   }

   return true;
}
627
/* Try to split operand "op_index" of "instr" into a temporary base plus a
 * constant offset by looking through the add instruction that defined it.
 * Recurses through chains of additions, accumulating the offsets.
 * On success writes *base and *offset and returns true; on failure the
 * output parameters are left untouched. */
bool parse_base_offset(opt_ctx &ctx, Instruction* instr, unsigned op_index, Temp *base, uint32_t *offset)
{
   Operand op = instr->operands[op_index];

   if (!op.isTemp())
      return false;
   Temp tmp = op.getTemp();
   /* only values labeled as produced by an add/sub are considered */
   if (!ctx.info[tmp.id()].is_add_sub())
      return false;

   Instruction *add_instr = ctx.info[tmp.id()].instr;

   switch (add_instr->opcode) {
   case aco_opcode::v_add_u32:
   case aco_opcode::v_add_co_u32:
   case aco_opcode::v_add_co_u32_e64:
   case aco_opcode::s_add_i32:
   case aco_opcode::s_add_u32:
      break;
   default:
      return false;
   }

   /* neg/abs/omod/clamp etc. would change the value of the addition */
   if (add_instr->usesModifiers())
      return false;

   /* find the constant side of the addition; the other side becomes the base */
   for (unsigned i = 0; i < 2; i++) {
      if (add_instr->operands[i].isConstant()) {
         *offset = add_instr->operands[i].constantValue();
      } else if (add_instr->operands[i].isTemp() &&
                 ctx.info[add_instr->operands[i].tempId()].is_constant_or_literal()) {
         *offset = ctx.info[add_instr->operands[i].tempId()].val;
      } else {
         continue;
      }
      if (!add_instr->operands[!i].isTemp())
         continue;

      uint32_t offset2 = 0;
      /* the base may itself be an add of a base and a constant: fold it in */
      if (parse_base_offset(ctx, add_instr, !i, base, &offset2)) {
         *offset += offset2;
      } else {
         *base = add_instr->operands[!i].getTemp();
      }
      return true;
   }

   return false;
}
677
678 unsigned get_operand_size(aco_ptr<Instruction>& instr, unsigned index)
679 {
680 if (instr->format == Format::PSEUDO)
681 return instr->operands[index].bytes() * 8u;
682 else if (instr->opcode == aco_opcode::v_mad_u64_u32 || instr->opcode == aco_opcode::v_mad_i64_i32)
683 return index == 2 ? 64 : 32;
684 else if (instr->isVALU() || instr->isSALU())
685 return instr_info.operand_size[(int)instr->opcode];
686 else
687 return 0;
688 }
689
690 Operand get_constant_op(opt_ctx &ctx, uint32_t val, bool is64bit = false)
691 {
692 // TODO: this functions shouldn't be needed if we store Operand instead of value.
693 Operand op(val, is64bit);
694 if (val == 0x3e22f983 && ctx.program->chip_class >= GFX8)
695 op.setFixed(PhysReg{248}); /* 1/2 PI can be an inline constant on GFX8+ */
696 return op;
697 }
698
699 bool fixed_to_exec(Operand op)
700 {
701 return op.isFixed() && op.physReg() == exec;
702 }
703
704 void label_instruction(opt_ctx &ctx, Block& block, aco_ptr<Instruction>& instr)
705 {
706 if (instr->isSALU() || instr->isVALU() || instr->format == Format::PSEUDO) {
707 ASSERTED bool all_const = false;
708 for (Operand& op : instr->operands)
709 all_const = all_const && (!op.isTemp() || ctx.info[op.tempId()].is_constant_or_literal());
710 perfwarn(all_const, "All instruction operands are constant", instr.get());
711 }
712
713 for (unsigned i = 0; i < instr->operands.size(); i++)
714 {
715 if (!instr->operands[i].isTemp())
716 continue;
717
718 ssa_info info = ctx.info[instr->operands[i].tempId()];
719 /* propagate undef */
720 if (info.is_undefined() && is_phi(instr))
721 instr->operands[i] = Operand(instr->operands[i].regClass());
722 /* propagate reg->reg of same type */
723 if (info.is_temp() && info.temp.regClass() == instr->operands[i].getTemp().regClass()) {
724 instr->operands[i].setTemp(ctx.info[instr->operands[i].tempId()].temp);
725 info = ctx.info[info.temp.id()];
726 }
727
728 /* SALU / PSEUDO: propagate inline constants */
729 if (instr->isSALU() || instr->format == Format::PSEUDO) {
730 bool is_subdword = false;
731 // TODO: optimize SGPR and constant propagation for subdword pseudo instructions on gfx9+
732 if (instr->format == Format::PSEUDO) {
733 is_subdword = std::any_of(instr->definitions.begin(), instr->definitions.end(),
734 [] (const Definition& def) { return def.regClass().is_subdword();});
735 is_subdword = is_subdword || std::any_of(instr->operands.begin(), instr->operands.end(),
736 [] (const Operand& op) { return op.hasRegClass() && op.regClass().is_subdword();});
737 if (is_subdword)
738 continue;
739 }
740
741 if (info.is_temp() && info.temp.type() == RegType::sgpr) {
742 instr->operands[i].setTemp(info.temp);
743 info = ctx.info[info.temp.id()];
744 } else if (info.is_temp() && info.temp.type() == RegType::vgpr) {
745 /* propagate vgpr if it can take it */
746 switch (instr->opcode) {
747 case aco_opcode::p_create_vector:
748 case aco_opcode::p_split_vector:
749 case aco_opcode::p_extract_vector:
750 case aco_opcode::p_phi: {
751 const bool all_vgpr = std::none_of(instr->definitions.begin(), instr->definitions.end(),
752 [] (const Definition& def) { return def.getTemp().type() != RegType::vgpr;});
753 if (all_vgpr) {
754 instr->operands[i] = Operand(info.temp);
755 info = ctx.info[info.temp.id()];
756 }
757 break;
758 }
759 default:
760 break;
761 }
762 }
763 if ((info.is_constant() || info.is_constant_64bit() || (info.is_literal() && instr->format == Format::PSEUDO)) &&
764 !instr->operands[i].isFixed() && alu_can_accept_constant(instr->opcode, i)) {
765 instr->operands[i] = get_constant_op(ctx, info.val, info.is_constant_64bit());
766 continue;
767 }
768 }
769
770 /* VALU: propagate neg, abs & inline constants */
771 else if (instr->isVALU()) {
772 if (info.is_temp() && info.temp.type() == RegType::vgpr && valu_can_accept_vgpr(instr, i)) {
773 instr->operands[i].setTemp(info.temp);
774 info = ctx.info[info.temp.id()];
775 }
776
777 /* for instructions other than v_cndmask_b32, the size of the instruction should match the operand size */
778 unsigned can_use_mod = instr->opcode != aco_opcode::v_cndmask_b32 || instr->operands[i].getTemp().bytes() == 4;
779 can_use_mod = can_use_mod && instr_info.can_use_input_modifiers[(int)instr->opcode];
780
781 if (info.is_abs() && (can_use_VOP3(ctx, instr) || instr->isDPP()) && can_use_mod) {
782 if (!instr->isDPP())
783 to_VOP3(ctx, instr);
784 instr->operands[i] = Operand(info.temp);
785 if (instr->isDPP())
786 static_cast<DPP_instruction*>(instr.get())->abs[i] = true;
787 else
788 static_cast<VOP3A_instruction*>(instr.get())->abs[i] = true;
789 }
790 if (info.is_neg() && instr->opcode == aco_opcode::v_add_f32) {
791 instr->opcode = i ? aco_opcode::v_sub_f32 : aco_opcode::v_subrev_f32;
792 instr->operands[i].setTemp(info.temp);
793 continue;
794 } else if (info.is_neg() && instr->opcode == aco_opcode::v_add_f16) {
795 instr->opcode = i ? aco_opcode::v_sub_f16 : aco_opcode::v_subrev_f16;
796 instr->operands[i].setTemp(info.temp);
797 continue;
798 } else if (info.is_neg() && (can_use_VOP3(ctx, instr) || instr->isDPP()) && can_use_mod) {
799 if (!instr->isDPP())
800 to_VOP3(ctx, instr);
801 instr->operands[i].setTemp(info.temp);
802 if (instr->isDPP())
803 static_cast<DPP_instruction*>(instr.get())->neg[i] = true;
804 else
805 static_cast<VOP3A_instruction*>(instr.get())->neg[i] = true;
806 continue;
807 }
808 if ((info.is_constant() || info.is_constant_64bit()) && alu_can_accept_constant(instr->opcode, i)) {
809 Operand op = get_constant_op(ctx, info.val, info.is_constant_64bit());
810 perfwarn(instr->opcode == aco_opcode::v_cndmask_b32 && i == 2, "v_cndmask_b32 with a constant selector", instr.get());
811 if (i == 0 || instr->opcode == aco_opcode::v_readlane_b32 || instr->opcode == aco_opcode::v_writelane_b32) {
812 instr->operands[i] = op;
813 continue;
814 } else if (!instr->isVOP3() && can_swap_operands(instr)) {
815 instr->operands[i] = instr->operands[0];
816 instr->operands[0] = op;
817 continue;
818 } else if (can_use_VOP3(ctx, instr)) {
819 to_VOP3(ctx, instr);
820 instr->operands[i] = op;
821 continue;
822 }
823 }
824 }
825
826 /* MUBUF: propagate constants and combine additions */
827 else if (instr->format == Format::MUBUF) {
828 MUBUF_instruction *mubuf = static_cast<MUBUF_instruction *>(instr.get());
829 Temp base;
830 uint32_t offset;
831 while (info.is_temp())
832 info = ctx.info[info.temp.id()];
833
834 if (mubuf->offen && i == 1 && info.is_constant_or_literal() && mubuf->offset + info.val < 4096) {
835 assert(!mubuf->idxen);
836 instr->operands[1] = Operand(v1);
837 mubuf->offset += info.val;
838 mubuf->offen = false;
839 continue;
840 } else if (i == 2 && info.is_constant_or_literal() && mubuf->offset + info.val < 4096) {
841 instr->operands[2] = Operand((uint32_t) 0);
842 mubuf->offset += info.val;
843 continue;
844 } else if (mubuf->offen && i == 1 && parse_base_offset(ctx, instr.get(), i, &base, &offset) && base.regClass() == v1 && mubuf->offset + offset < 4096) {
845 assert(!mubuf->idxen);
846 instr->operands[1].setTemp(base);
847 mubuf->offset += offset;
848 continue;
849 } else if (i == 2 && parse_base_offset(ctx, instr.get(), i, &base, &offset) && base.regClass() == s1 && mubuf->offset + offset < 4096) {
850 instr->operands[i].setTemp(base);
851 mubuf->offset += offset;
852 continue;
853 }
854 }
855
856 /* DS: combine additions */
857 else if (instr->format == Format::DS) {
858
859 DS_instruction *ds = static_cast<DS_instruction *>(instr.get());
860 Temp base;
861 uint32_t offset;
862 bool has_usable_ds_offset = ctx.program->chip_class >= GFX7;
863 if (has_usable_ds_offset &&
864 i == 0 && parse_base_offset(ctx, instr.get(), i, &base, &offset) &&
865 base.regClass() == instr->operands[i].regClass() &&
866 instr->opcode != aco_opcode::ds_swizzle_b32) {
867 if (instr->opcode == aco_opcode::ds_write2_b32 || instr->opcode == aco_opcode::ds_read2_b32 ||
868 instr->opcode == aco_opcode::ds_write2_b64 || instr->opcode == aco_opcode::ds_read2_b64) {
869 unsigned mask = (instr->opcode == aco_opcode::ds_write2_b64 || instr->opcode == aco_opcode::ds_read2_b64) ? 0x7 : 0x3;
870 unsigned shifts = (instr->opcode == aco_opcode::ds_write2_b64 || instr->opcode == aco_opcode::ds_read2_b64) ? 3 : 2;
871
872 if ((offset & mask) == 0 &&
873 ds->offset0 + (offset >> shifts) <= 255 &&
874 ds->offset1 + (offset >> shifts) <= 255) {
875 instr->operands[i].setTemp(base);
876 ds->offset0 += offset >> shifts;
877 ds->offset1 += offset >> shifts;
878 }
879 } else {
880 if (ds->offset0 + offset <= 65535) {
881 instr->operands[i].setTemp(base);
882 ds->offset0 += offset;
883 }
884 }
885 }
886 }
887
888 /* SMEM: propagate constants and combine additions */
889 else if (instr->format == Format::SMEM) {
890
891 SMEM_instruction *smem = static_cast<SMEM_instruction *>(instr.get());
892 Temp base;
893 uint32_t offset;
894 if (i == 1 && info.is_constant_or_literal() &&
895 ((ctx.program->chip_class == GFX6 && info.val <= 0x3FF) ||
896 (ctx.program->chip_class == GFX7 && info.val <= 0xFFFFFFFF) ||
897 (ctx.program->chip_class >= GFX8 && info.val <= 0xFFFFF))) {
898 instr->operands[i] = Operand(info.val);
899 continue;
900 } else if (i == 1 && parse_base_offset(ctx, instr.get(), i, &base, &offset) && base.regClass() == s1 && offset <= 0xFFFFF && ctx.program->chip_class >= GFX9) {
901 bool soe = smem->operands.size() >= (!smem->definitions.empty() ? 3 : 4);
902 if (soe &&
903 (!ctx.info[smem->operands.back().tempId()].is_constant_or_literal() ||
904 ctx.info[smem->operands.back().tempId()].val != 0)) {
905 continue;
906 }
907 if (soe) {
908 smem->operands[1] = Operand(offset);
909 smem->operands.back() = Operand(base);
910 } else {
911 SMEM_instruction *new_instr = create_instruction<SMEM_instruction>(smem->opcode, Format::SMEM, smem->operands.size() + 1, smem->definitions.size());
912 new_instr->operands[0] = smem->operands[0];
913 new_instr->operands[1] = Operand(offset);
914 if (smem->definitions.empty())
915 new_instr->operands[2] = smem->operands[2];
916 new_instr->operands.back() = Operand(base);
917 if (!smem->definitions.empty())
918 new_instr->definitions[0] = smem->definitions[0];
919 new_instr->can_reorder = smem->can_reorder;
920 new_instr->barrier = smem->barrier;
921 new_instr->glc = smem->glc;
922 new_instr->dlc = smem->dlc;
923 new_instr->nv = smem->nv;
924 new_instr->disable_wqm = smem->disable_wqm;
925 instr.reset(new_instr);
926 smem = static_cast<SMEM_instruction *>(instr.get());
927 }
928 continue;
929 }
930 }
931
932 else if (instr->format == Format::PSEUDO_BRANCH) {
933 if (ctx.info[instr->operands[0].tempId()].is_scc_invert()) {
934 /* Flip the branch instruction to get rid of the scc_invert instruction */
935 instr->opcode = instr->opcode == aco_opcode::p_cbranch_z ? aco_opcode::p_cbranch_nz : aco_opcode::p_cbranch_z;
936 instr->operands[0].setTemp(ctx.info[instr->operands[0].tempId()].temp);
937 }
938 }
939 }
940
941 /* if this instruction doesn't define anything, return */
942 if (instr->definitions.empty())
943 return;
944
945 switch (instr->opcode) {
946 case aco_opcode::p_create_vector: {
947 bool copy_prop = instr->operands.size() == 1 && instr->operands[0].isTemp() &&
948 instr->operands[0].regClass() == instr->definitions[0].regClass();
949 if (copy_prop) {
950 ctx.info[instr->definitions[0].tempId()].set_temp(instr->operands[0].getTemp());
951 break;
952 }
953
954 unsigned num_ops = instr->operands.size();
955 for (const Operand& op : instr->operands) {
956 if (op.isTemp() && ctx.info[op.tempId()].is_vec())
957 num_ops += ctx.info[op.tempId()].instr->operands.size() - 1;
958 }
959 if (num_ops != instr->operands.size()) {
960 aco_ptr<Instruction> old_vec = std::move(instr);
961 instr.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, num_ops, 1));
962 instr->definitions[0] = old_vec->definitions[0];
963 unsigned k = 0;
964 for (Operand& old_op : old_vec->operands) {
965 if (old_op.isTemp() && ctx.info[old_op.tempId()].is_vec()) {
966 for (unsigned j = 0; j < ctx.info[old_op.tempId()].instr->operands.size(); j++) {
967 Operand op = ctx.info[old_op.tempId()].instr->operands[j];
968 if (op.isTemp() && ctx.info[op.tempId()].is_temp() &&
969 ctx.info[op.tempId()].temp.type() == instr->definitions[0].regClass().type())
970 op.setTemp(ctx.info[op.tempId()].temp);
971 instr->operands[k++] = op;
972 }
973 } else {
974 instr->operands[k++] = old_op;
975 }
976 }
977 assert(k == num_ops);
978 }
979
980 ctx.info[instr->definitions[0].tempId()].set_vec(instr.get());
981 break;
982 }
983 case aco_opcode::p_split_vector: {
984 if (!ctx.info[instr->operands[0].tempId()].is_vec())
985 break;
986 Instruction* vec = ctx.info[instr->operands[0].tempId()].instr;
987 unsigned split_offset = 0;
988 unsigned vec_offset = 0;
989 unsigned vec_index = 0;
990 for (unsigned i = 0; i < instr->definitions.size(); split_offset += instr->definitions[i++].bytes()) {
991 while (vec_offset < split_offset && vec_index < vec->operands.size())
992 vec_offset += vec->operands[vec_index++].bytes();
993
994 if (vec_offset != split_offset || vec->operands[vec_index].bytes() != instr->definitions[i].bytes())
995 continue;
996
997 Operand vec_op = vec->operands[vec_index];
998 if (vec_op.isConstant()) {
999 if (vec_op.isLiteral())
1000 ctx.info[instr->definitions[i].tempId()].set_literal(vec_op.constantValue());
1001 else if (vec_op.size() == 1)
1002 ctx.info[instr->definitions[i].tempId()].set_constant(vec_op.constantValue());
1003 else if (vec_op.size() == 2)
1004 ctx.info[instr->definitions[i].tempId()].set_constant_64bit(vec_op.constantValue());
1005 } else if (vec_op.isUndefined()) {
1006 ctx.info[instr->definitions[i].tempId()].set_undefined();
1007 } else {
1008 assert(vec_op.isTemp());
1009 ctx.info[instr->definitions[i].tempId()].set_temp(vec_op.getTemp());
1010 }
1011 }
1012 break;
1013 }
1014 case aco_opcode::p_extract_vector: { /* mov */
1015 if (!ctx.info[instr->operands[0].tempId()].is_vec())
1016 break;
1017
1018 /* check if we index directly into a vector element */
1019 Instruction* vec = ctx.info[instr->operands[0].tempId()].instr;
1020 const unsigned index = instr->operands[1].constantValue();
1021 const unsigned dst_offset = index * instr->definitions[0].bytes();
1022 unsigned offset = 0;
1023
1024 for (const Operand& op : vec->operands) {
1025 if (offset < dst_offset) {
1026 offset += op.bytes();
1027 continue;
1028 } else if (offset != dst_offset || op.bytes() != instr->definitions[0].bytes()) {
1029 break;
1030 }
1031
1032 /* convert this extract into a copy instruction */
1033 instr->opcode = aco_opcode::p_parallelcopy;
1034 instr->operands.pop_back();
1035 instr->operands[0] = op;
1036
1037 if (op.isConstant()) {
1038 if (op.isLiteral())
1039 ctx.info[instr->definitions[0].tempId()].set_literal(op.constantValue());
1040 else if (op.size() == 1)
1041 ctx.info[instr->definitions[0].tempId()].set_constant(op.constantValue());
1042 else if (op.size() == 2)
1043 ctx.info[instr->definitions[0].tempId()].set_constant_64bit(op.constantValue());
1044 } else if (op.isUndefined()) {
1045 ctx.info[instr->definitions[0].tempId()].set_undefined();
1046 } else {
1047 assert(op.isTemp());
1048 ctx.info[instr->definitions[0].tempId()].set_temp(op.getTemp());
1049 }
1050 break;
1051 }
1052 break;
1053 }
1054 case aco_opcode::s_mov_b32: /* propagate */
1055 case aco_opcode::s_mov_b64:
1056 case aco_opcode::v_mov_b32:
1057 case aco_opcode::p_as_uniform:
1058 if (instr->definitions[0].isFixed()) {
1059 /* don't copy-propagate copies into fixed registers */
1060 } else if (instr->usesModifiers()) {
1061 // TODO
1062 } else if (instr->operands[0].isConstant()) {
1063 if (instr->operands[0].isLiteral())
1064 ctx.info[instr->definitions[0].tempId()].set_literal(instr->operands[0].constantValue());
1065 else if (instr->operands[0].size() == 1)
1066 ctx.info[instr->definitions[0].tempId()].set_constant(instr->operands[0].constantValue());
1067 else if (instr->operands[0].size() == 2)
1068 ctx.info[instr->definitions[0].tempId()].set_constant_64bit(instr->operands[0].constantValue());
1069 } else if (instr->operands[0].isTemp()) {
1070 ctx.info[instr->definitions[0].tempId()].set_temp(instr->operands[0].getTemp());
1071 } else {
1072 assert(instr->operands[0].isFixed());
1073 }
1074 break;
1075 case aco_opcode::p_is_helper:
1076 if (!ctx.program->needs_wqm)
1077 ctx.info[instr->definitions[0].tempId()].set_constant(0u);
1078 break;
1079 case aco_opcode::s_movk_i32: {
1080 uint32_t v = static_cast<SOPK_instruction*>(instr.get())->imm;
1081 v = v & 0x8000 ? (v | 0xffff0000) : v;
1082 if (v <= 64 || v >= 0xfffffff0)
1083 ctx.info[instr->definitions[0].tempId()].set_constant(v);
1084 else
1085 ctx.info[instr->definitions[0].tempId()].set_literal(v);
1086 break;
1087 }
1088 case aco_opcode::v_bfrev_b32:
1089 case aco_opcode::s_brev_b32: {
1090 if (instr->operands[0].isConstant()) {
1091 uint32_t v = util_bitreverse(instr->operands[0].constantValue());
1092 if (v <= 64 || v >= 0xfffffff0)
1093 ctx.info[instr->definitions[0].tempId()].set_constant(v);
1094 else
1095 ctx.info[instr->definitions[0].tempId()].set_literal(v);
1096 }
1097 break;
1098 }
1099 case aco_opcode::s_bfm_b32: {
1100 if (instr->operands[0].isConstant() && instr->operands[1].isConstant()) {
1101 unsigned size = instr->operands[0].constantValue() & 0x1f;
1102 unsigned start = instr->operands[1].constantValue() & 0x1f;
1103 uint32_t v = ((1u << size) - 1u) << start;
1104 if (v <= 64 || v >= 0xfffffff0)
1105 ctx.info[instr->definitions[0].tempId()].set_constant(v);
1106 else
1107 ctx.info[instr->definitions[0].tempId()].set_literal(v);
1108 }
1109 break;
1110 }
1111 case aco_opcode::v_mul_f16:
1112 case aco_opcode::v_mul_f32: { /* omod */
1113 /* TODO: try to move the negate/abs modifier to the consumer instead */
1114 if (instr->usesModifiers())
1115 break;
1116
1117 bool fp16 = instr->opcode == aco_opcode::v_mul_f16;
1118
1119 for (unsigned i = 0; i < 2; i++) {
1120 if (instr->operands[!i].isConstant() && instr->operands[i].isTemp()) {
1121 if (instr->operands[!i].constantValue() == (fp16 ? 0x4000 : 0x40000000)) { /* 2.0 */
1122 ctx.info[instr->operands[i].tempId()].set_omod2(instr->definitions[0].getTemp());
1123 } else if (instr->operands[!i].constantValue() == (fp16 ? 0x4400 : 0x40800000)) { /* 4.0 */
1124 ctx.info[instr->operands[i].tempId()].set_omod4(instr->definitions[0].getTemp());
1125 } else if (instr->operands[!i].constantValue() == (fp16 ? 0xb800 : 0x3f000000)) { /* 0.5 */
1126 ctx.info[instr->operands[i].tempId()].set_omod5(instr->definitions[0].getTemp());
1127 } else if (instr->operands[!i].constantValue() == (fp16 ? 0x3c00 : 0x3f800000) &&
1128 !(fp16 ? block.fp_mode.must_flush_denorms16_64 : block.fp_mode.must_flush_denorms32)) { /* 1.0 */
1129 ctx.info[instr->definitions[0].tempId()].set_temp(instr->operands[i].getTemp());
1130 } else {
1131 continue;
1132 }
1133 break;
1134 }
1135 }
1136 break;
1137 }
1138 case aco_opcode::v_and_b32: { /* abs */
1139 if (!instr->usesModifiers() && instr->operands[1].isTemp() &&
1140 instr->operands[1].getTemp().type() == RegType::vgpr &&
1141 ((instr->definitions[0].bytes() == 4 && instr->operands[0].constantEquals(0x7FFFFFFFu)) ||
1142 (instr->definitions[0].bytes() == 2 && instr->operands[0].constantEquals(0x7FFFu))))
1143 ctx.info[instr->definitions[0].tempId()].set_abs(instr->operands[1].getTemp());
1144 else
1145 ctx.info[instr->definitions[0].tempId()].set_bitwise(instr.get());
1146 break;
1147 }
1148 case aco_opcode::v_xor_b32: { /* neg */
1149 if (!instr->usesModifiers() && instr->operands[1].isTemp() &&
1150 ((instr->definitions[0].bytes() == 4 && instr->operands[0].constantEquals(0x80000000u)) ||
1151 (instr->definitions[0].bytes() == 2 && instr->operands[0].constantEquals(0x8000u)))) {
1152 if (ctx.info[instr->operands[1].tempId()].is_neg()) {
1153 ctx.info[instr->definitions[0].tempId()].set_temp(ctx.info[instr->operands[1].tempId()].temp);
1154 } else if (instr->operands[1].getTemp().type() == RegType::vgpr) {
1155 if (ctx.info[instr->operands[1].tempId()].is_abs()) { /* neg(abs(x)) */
1156 instr->operands[1].setTemp(ctx.info[instr->operands[1].tempId()].temp);
1157 instr->opcode = aco_opcode::v_or_b32;
1158 ctx.info[instr->definitions[0].tempId()].set_neg_abs(instr->operands[1].getTemp());
1159 } else {
1160 ctx.info[instr->definitions[0].tempId()].set_neg(instr->operands[1].getTemp());
1161 }
1162 }
1163 } else {
1164 ctx.info[instr->definitions[0].tempId()].set_bitwise(instr.get());
1165 }
1166 break;
1167 }
1168 case aco_opcode::v_med3_f16:
1169 case aco_opcode::v_med3_f32: { /* clamp */
1170 VOP3A_instruction* vop3 = static_cast<VOP3A_instruction*>(instr.get());
1171 if (vop3->abs[0] || vop3->abs[1] || vop3->abs[2] ||
1172 vop3->neg[0] || vop3->neg[1] || vop3->neg[2] ||
1173 vop3->omod != 0 || vop3->opsel != 0)
1174 break;
1175
1176 unsigned idx = 0;
1177 bool found_zero = false, found_one = false;
1178 bool is_fp16 = instr->opcode == aco_opcode::v_med3_f16;
1179 for (unsigned i = 0; i < 3; i++)
1180 {
1181 if (instr->operands[i].constantEquals(0))
1182 found_zero = true;
1183 else if (instr->operands[i].constantEquals(is_fp16 ? 0x3c00 : 0x3f800000)) /* 1.0 */
1184 found_one = true;
1185 else
1186 idx = i;
1187 }
1188 if (found_zero && found_one && instr->operands[idx].isTemp()) {
1189 ctx.info[instr->operands[idx].tempId()].set_clamp(instr->definitions[0].getTemp());
1190 }
1191 break;
1192 }
1193 case aco_opcode::v_cndmask_b32:
1194 if (instr->operands[0].constantEquals(0) &&
1195 instr->operands[1].constantEquals(0xFFFFFFFF))
1196 ctx.info[instr->definitions[0].tempId()].set_vcc(instr->operands[2].getTemp());
1197 else if (instr->operands[0].constantEquals(0) &&
1198 instr->operands[1].constantEquals(0x3f800000u))
1199 ctx.info[instr->definitions[0].tempId()].set_b2f(instr->operands[2].getTemp());
1200 else if (instr->operands[0].constantEquals(0) &&
1201 instr->operands[1].constantEquals(1))
1202 ctx.info[instr->definitions[0].tempId()].set_b2i(instr->operands[2].getTemp());
1203
1204 ctx.info[instr->operands[2].tempId()].set_vcc_hint();
1205 break;
1206 case aco_opcode::v_cmp_lg_u32:
1207 if (instr->format == Format::VOPC && /* don't optimize VOP3 / SDWA / DPP */
1208 instr->operands[0].constantEquals(0) &&
1209 instr->operands[1].isTemp() && ctx.info[instr->operands[1].tempId()].is_vcc())
1210 ctx.info[instr->definitions[0].tempId()].set_temp(ctx.info[instr->operands[1].tempId()].temp);
1211 break;
1212 case aco_opcode::p_phi:
1213 case aco_opcode::p_linear_phi: {
1214 /* lower_bool_phis() can create phis like this */
1215 bool all_same_temp = instr->operands[0].isTemp();
1216 /* this check is needed when moving uniform loop counters out of a divergent loop */
1217 if (all_same_temp)
1218 all_same_temp = instr->definitions[0].regClass() == instr->operands[0].regClass();
1219 for (unsigned i = 1; all_same_temp && (i < instr->operands.size()); i++) {
1220 if (!instr->operands[i].isTemp() || instr->operands[i].tempId() != instr->operands[0].tempId())
1221 all_same_temp = false;
1222 }
1223 if (all_same_temp) {
1224 ctx.info[instr->definitions[0].tempId()].set_temp(instr->operands[0].getTemp());
1225 } else {
1226 bool all_undef = instr->operands[0].isUndefined();
1227 for (unsigned i = 1; all_undef && (i < instr->operands.size()); i++) {
1228 if (!instr->operands[i].isUndefined())
1229 all_undef = false;
1230 }
1231 if (all_undef)
1232 ctx.info[instr->definitions[0].tempId()].set_undefined();
1233 }
1234 break;
1235 }
1236 case aco_opcode::v_add_u32:
1237 case aco_opcode::v_add_co_u32:
1238 case aco_opcode::v_add_co_u32_e64:
1239 case aco_opcode::s_add_i32:
1240 case aco_opcode::s_add_u32:
1241 ctx.info[instr->definitions[0].tempId()].set_add_sub(instr.get());
1242 break;
1243 case aco_opcode::s_not_b32:
1244 case aco_opcode::s_not_b64:
1245 if (ctx.info[instr->operands[0].tempId()].is_uniform_bool()) {
1246 ctx.info[instr->definitions[0].tempId()].set_uniform_bitwise();
1247 ctx.info[instr->definitions[1].tempId()].set_scc_invert(ctx.info[instr->operands[0].tempId()].temp);
1248 } else if (ctx.info[instr->operands[0].tempId()].is_uniform_bitwise()) {
1249 ctx.info[instr->definitions[0].tempId()].set_uniform_bitwise();
1250 ctx.info[instr->definitions[1].tempId()].set_scc_invert(ctx.info[instr->operands[0].tempId()].instr->definitions[1].getTemp());
1251 }
1252 ctx.info[instr->definitions[0].tempId()].set_bitwise(instr.get());
1253 break;
1254 case aco_opcode::s_and_b32:
1255 case aco_opcode::s_and_b64:
1256 if (fixed_to_exec(instr->operands[1]) && instr->operands[0].isTemp()) {
1257 if (ctx.info[instr->operands[0].tempId()].is_uniform_bool()) {
1258 /* Try to get rid of the superfluous s_cselect + s_and_b64 that comes from turning a uniform bool into divergent */
1259 ctx.info[instr->definitions[1].tempId()].set_temp(ctx.info[instr->operands[0].tempId()].temp);
1260 ctx.info[instr->definitions[0].tempId()].set_uniform_bool(ctx.info[instr->operands[0].tempId()].temp);
1261 break;
1262 } else if (ctx.info[instr->operands[0].tempId()].is_uniform_bitwise()) {
1263 /* Try to get rid of the superfluous s_and_b64, since the uniform bitwise instruction already produces the same SCC */
1264 ctx.info[instr->definitions[1].tempId()].set_temp(ctx.info[instr->operands[0].tempId()].instr->definitions[1].getTemp());
1265 ctx.info[instr->definitions[0].tempId()].set_uniform_bool(ctx.info[instr->operands[0].tempId()].instr->definitions[1].getTemp());
1266 break;
1267 }
1268 }
1269 /* fallthrough */
1270 case aco_opcode::s_or_b32:
1271 case aco_opcode::s_or_b64:
1272 case aco_opcode::s_xor_b32:
1273 case aco_opcode::s_xor_b64:
1274 if (std::all_of(instr->operands.begin(), instr->operands.end(), [&ctx](const Operand& op) {
1275 return op.isTemp() && (ctx.info[op.tempId()].is_uniform_bool() || ctx.info[op.tempId()].is_uniform_bitwise());
1276 })) {
1277 ctx.info[instr->definitions[0].tempId()].set_uniform_bitwise();
1278 }
1279 /* fallthrough */
1280 case aco_opcode::s_lshl_b32:
1281 case aco_opcode::v_or_b32:
1282 case aco_opcode::v_lshlrev_b32:
1283 ctx.info[instr->definitions[0].tempId()].set_bitwise(instr.get());
1284 break;
1285 case aco_opcode::v_min_f32:
1286 case aco_opcode::v_min_f16:
1287 case aco_opcode::v_min_u32:
1288 case aco_opcode::v_min_i32:
1289 case aco_opcode::v_min_u16:
1290 case aco_opcode::v_min_i16:
1291 case aco_opcode::v_max_f32:
1292 case aco_opcode::v_max_f16:
1293 case aco_opcode::v_max_u32:
1294 case aco_opcode::v_max_i32:
1295 case aco_opcode::v_max_u16:
1296 case aco_opcode::v_max_i16:
1297 ctx.info[instr->definitions[0].tempId()].set_minmax(instr.get());
1298 break;
1299 case aco_opcode::v_cmp_lt_f32:
1300 case aco_opcode::v_cmp_eq_f32:
1301 case aco_opcode::v_cmp_le_f32:
1302 case aco_opcode::v_cmp_gt_f32:
1303 case aco_opcode::v_cmp_lg_f32:
1304 case aco_opcode::v_cmp_ge_f32:
1305 case aco_opcode::v_cmp_o_f32:
1306 case aco_opcode::v_cmp_u_f32:
1307 case aco_opcode::v_cmp_nge_f32:
1308 case aco_opcode::v_cmp_nlg_f32:
1309 case aco_opcode::v_cmp_ngt_f32:
1310 case aco_opcode::v_cmp_nle_f32:
1311 case aco_opcode::v_cmp_neq_f32:
1312 case aco_opcode::v_cmp_nlt_f32:
1313 ctx.info[instr->definitions[0].tempId()].set_fcmp(instr.get());
1314 break;
1315 case aco_opcode::s_cselect_b64:
1316 case aco_opcode::s_cselect_b32:
1317 if (instr->operands[0].constantEquals((unsigned) -1) &&
1318 instr->operands[1].constantEquals(0)) {
1319 /* Found a cselect that operates on a uniform bool that comes from eg. s_cmp */
1320 ctx.info[instr->definitions[0].tempId()].set_uniform_bool(instr->operands[2].getTemp());
1321 }
1322 if (instr->operands[2].isTemp() && ctx.info[instr->operands[2].tempId()].is_scc_invert()) {
1323 /* Flip the operands to get rid of the scc_invert instruction */
1324 std::swap(instr->operands[0], instr->operands[1]);
1325 instr->operands[2].setTemp(ctx.info[instr->operands[2].tempId()].temp);
1326 }
1327 break;
1328 case aco_opcode::p_wqm:
1329 if (instr->operands[0].isTemp() &&
1330 ctx.info[instr->operands[0].tempId()].is_scc_invert()) {
1331 ctx.info[instr->definitions[0].tempId()].set_temp(instr->operands[0].getTemp());
1332 }
1333 break;
1334 default:
1335 break;
1336 }
1337 }
1338
1339 ALWAYS_INLINE bool get_cmp_info(aco_opcode op, aco_opcode *ordered, aco_opcode *unordered, aco_opcode *inverse)
1340 {
1341 *ordered = *unordered = op;
1342 switch (op) {
1343 #define CMP(ord, unord) \
1344 case aco_opcode::v_cmp_##ord##_f32:\
1345 case aco_opcode::v_cmp_n##unord##_f32:\
1346 *ordered = aco_opcode::v_cmp_##ord##_f32;\
1347 *unordered = aco_opcode::v_cmp_n##unord##_f32;\
1348 *inverse = op == aco_opcode::v_cmp_n##unord##_f32 ? aco_opcode::v_cmp_##unord##_f32 : aco_opcode::v_cmp_n##ord##_f32;\
1349 return true;
1350 CMP(lt, /*n*/ge)
1351 CMP(eq, /*n*/lg)
1352 CMP(le, /*n*/gt)
1353 CMP(gt, /*n*/le)
1354 CMP(lg, /*n*/eq)
1355 CMP(ge, /*n*/lt)
1356 #undef CMP
1357 default:
1358 return false;
1359 }
1360 }
1361
1362 aco_opcode get_ordered(aco_opcode op)
1363 {
1364 aco_opcode ordered, unordered, inverse;
1365 return get_cmp_info(op, &ordered, &unordered, &inverse) ? ordered : aco_opcode::num_opcodes;
1366 }
1367
1368 aco_opcode get_unordered(aco_opcode op)
1369 {
1370 aco_opcode ordered, unordered, inverse;
1371 return get_cmp_info(op, &ordered, &unordered, &inverse) ? unordered : aco_opcode::num_opcodes;
1372 }
1373
1374 aco_opcode get_inverse(aco_opcode op)
1375 {
1376 aco_opcode ordered, unordered, inverse;
1377 return get_cmp_info(op, &ordered, &unordered, &inverse) ? inverse : aco_opcode::num_opcodes;
1378 }
1379
1380 bool is_cmp(aco_opcode op)
1381 {
1382 aco_opcode ordered, unordered, inverse;
1383 return get_cmp_info(op, &ordered, &unordered, &inverse);
1384 }
1385
1386 unsigned original_temp_id(opt_ctx &ctx, Temp tmp)
1387 {
1388 if (ctx.info[tmp.id()].is_temp())
1389 return ctx.info[tmp.id()].temp.id();
1390 else
1391 return tmp.id();
1392 }
1393
1394 void decrease_uses(opt_ctx &ctx, Instruction* instr)
1395 {
1396 if (!--ctx.uses[instr->definitions[0].tempId()]) {
1397 for (const Operand& op : instr->operands) {
1398 if (op.isTemp())
1399 ctx.uses[op.tempId()]--;
1400 }
1401 }
1402 }
1403
1404 Instruction *follow_operand(opt_ctx &ctx, Operand op, bool ignore_uses=false)
1405 {
1406 if (!op.isTemp() || !(ctx.info[op.tempId()].label & instr_labels))
1407 return nullptr;
1408 if (!ignore_uses && ctx.uses[op.tempId()] > 1)
1409 return nullptr;
1410
1411 Instruction *instr = ctx.info[op.tempId()].instr;
1412
1413 if (instr->definitions.size() == 2) {
1414 assert(instr->definitions[0].isTemp() && instr->definitions[0].tempId() == op.tempId());
1415 if (instr->definitions[1].isTemp() && ctx.uses[instr->definitions[1].tempId()])
1416 return nullptr;
1417 }
1418
1419 return instr;
1420 }
1421
/* s_or_b64(neq(a, a), neq(b, b)) -> v_cmp_u_f32(a, b)
 * s_and_b64(eq(a, a), eq(b, b)) -> v_cmp_o_f32(a, b)
 *
 * Both operands must be self-comparisons (NaN tests); the pair is merged
 * into a single ordered/unordered comparison of the two tested values.
 * Returns true and replaces *instr on success. */
bool combine_ordering_test(opt_ctx &ctx, aco_ptr<Instruction>& instr)
{
   if (instr->definitions[0].regClass() != ctx.program->lane_mask)
      return false;
   /* the SCC result of the s_and/s_or must be dead, it is not reproduced */
   if (instr->definitions[1].isTemp() && ctx.uses[instr->definitions[1].tempId()])
      return false;

   bool is_or = instr->opcode == aco_opcode::s_or_b64 || instr->opcode == aco_opcode::s_or_b32;

   bool neg[2] = {false, false};
   bool abs[2] = {false, false};
   uint8_t opsel = 0;
   Instruction *op_instr[2];
   Temp op[2];

   for (unsigned i = 0; i < 2; i++) {
      op_instr[i] = follow_operand(ctx, instr->operands[i], true);
      if (!op_instr[i])
         return false;

      /* neq(x, x) is true iff x is NaN; eq(x, x) iff x is ordered */
      aco_opcode expected_cmp = is_or ? aco_opcode::v_cmp_neq_f32 : aco_opcode::v_cmp_eq_f32;

      if (op_instr[i]->opcode != expected_cmp)
         return false;
      if (!op_instr[i]->operands[0].isTemp() || !op_instr[i]->operands[1].isTemp())
         return false;

      if (op_instr[i]->isVOP3()) {
         VOP3A_instruction *vop3 = static_cast<VOP3A_instruction*>(op_instr[i]);
         /* modifiers must apply to both sources identically, otherwise it is
          * not a plain self-comparison and cannot be folded */
         if (vop3->neg[0] != vop3->neg[1] || vop3->abs[0] != vop3->abs[1] || vop3->opsel == 1 || vop3->opsel == 2)
            return false;
         neg[i] = vop3->neg[0];
         abs[i] = vop3->abs[0];
         opsel |= (vop3->opsel & 1) << i;
      }

      /* both sources of the NaN test must be (copies of) the same temp */
      Temp op0 = op_instr[i]->operands[0].getTemp();
      Temp op1 = op_instr[i]->operands[1].getTemp();
      if (original_temp_id(ctx, op0) != original_temp_id(ctx, op1))
         return false;

      op[i] = op1;
   }

   /* put the SGPR (if any) second; VOPC reads at most one SGPR pre-GFX10 */
   if (op[1].type() == RegType::sgpr)
      std::swap(op[0], op[1]);
   unsigned num_sgprs = (op[0].type() == RegType::sgpr) + (op[1].type() == RegType::sgpr);
   if (num_sgprs > (ctx.program->chip_class >= GFX10 ? 2 : 1))
      return false;

   /* account for the new uses before releasing the old instructions */
   ctx.uses[op[0].id()]++;
   ctx.uses[op[1].id()]++;
   decrease_uses(ctx, op_instr[0]);
   decrease_uses(ctx, op_instr[1]);

   aco_opcode new_op = is_or ? aco_opcode::v_cmp_u_f32 : aco_opcode::v_cmp_o_f32;
   Instruction *new_instr;
   if (neg[0] || neg[1] || abs[0] || abs[1] || opsel || num_sgprs > 1) {
      /* modifiers or a second SGPR require the VOP3 encoding */
      VOP3A_instruction *vop3 = create_instruction<VOP3A_instruction>(new_op, asVOP3(Format::VOPC), 2, 1);
      for (unsigned i = 0; i < 2; i++) {
         vop3->neg[i] = neg[i];
         vop3->abs[i] = abs[i];
      }
      vop3->opsel = opsel;
      new_instr = static_cast<Instruction *>(vop3);
   } else {
      new_instr = create_instruction<VOPC_instruction>(new_op, Format::VOPC, 2, 1);
   }
   new_instr->operands[0] = Operand(op[0]);
   new_instr->operands[1] = Operand(op[1]);
   new_instr->definitions[0] = instr->definitions[0];

   ctx.info[instr->definitions[0].tempId()].label = 0;
   ctx.info[instr->definitions[0].tempId()].set_fcmp(new_instr);

   instr.reset(new_instr);

   return true;
}
1503
/* s_or_b64(v_cmp_u_f32(a, b), cmp(a, b)) -> get_unordered(cmp)(a, b)
 * s_and_b64(v_cmp_o_f32(a, b), cmp(a, b)) -> get_ordered(cmp)(a, b)
 *
 * Merges an explicit ordered/unordered test with a comparison of the same
 * two values into the single (un)ordered form of that comparison.
 * Returns true and replaces *instr on success. */
bool combine_comparison_ordering(opt_ctx &ctx, aco_ptr<Instruction>& instr)
{
   if (instr->definitions[0].regClass() != ctx.program->lane_mask)
      return false;
   /* the SCC result of the s_and/s_or must be dead */
   if (instr->definitions[1].isTemp() && ctx.uses[instr->definitions[1].tempId()])
      return false;

   bool is_or = instr->opcode == aco_opcode::s_or_b64 || instr->opcode == aco_opcode::s_or_b32;
   aco_opcode expected_nan_test = is_or ? aco_opcode::v_cmp_u_f32 : aco_opcode::v_cmp_o_f32;

   Instruction *nan_test = follow_operand(ctx, instr->operands[0], true);
   Instruction *cmp = follow_operand(ctx, instr->operands[1], true);
   if (!nan_test || !cmp)
      return false;

   /* the two operands may appear in either order */
   if (cmp->opcode == expected_nan_test)
      std::swap(nan_test, cmp);
   else if (nan_test->opcode != expected_nan_test)
      return false;

   if (!is_cmp(cmp->opcode))
      return false;

   if (!nan_test->operands[0].isTemp() || !nan_test->operands[1].isTemp())
      return false;
   if (!cmp->operands[0].isTemp() || !cmp->operands[1].isTemp())
      return false;

   /* both instructions must compare the same two values (after resolving
    * copy-propagation), in either operand order */
   unsigned prop_cmp0 = original_temp_id(ctx, cmp->operands[0].getTemp());
   unsigned prop_cmp1 = original_temp_id(ctx, cmp->operands[1].getTemp());
   unsigned prop_nan0 = original_temp_id(ctx, nan_test->operands[0].getTemp());
   unsigned prop_nan1 = original_temp_id(ctx, nan_test->operands[1].getTemp());
   if (prop_cmp0 != prop_nan0 && prop_cmp0 != prop_nan1)
      return false;
   if (prop_cmp1 != prop_nan0 && prop_cmp1 != prop_nan1)
      return false;

   /* account for the new uses before releasing the old instructions */
   ctx.uses[cmp->operands[0].tempId()]++;
   ctx.uses[cmp->operands[1].tempId()]++;
   decrease_uses(ctx, nan_test);
   decrease_uses(ctx, cmp);

   aco_opcode new_op = is_or ? get_unordered(cmp->opcode) : get_ordered(cmp->opcode);
   Instruction *new_instr;
   if (cmp->isVOP3()) {
      /* preserve the original comparison's modifiers */
      VOP3A_instruction *new_vop3 = create_instruction<VOP3A_instruction>(new_op, asVOP3(Format::VOPC), 2, 1);
      VOP3A_instruction *cmp_vop3 = static_cast<VOP3A_instruction*>(cmp);
      memcpy(new_vop3->abs, cmp_vop3->abs, sizeof(new_vop3->abs));
      memcpy(new_vop3->neg, cmp_vop3->neg, sizeof(new_vop3->neg));
      new_vop3->clamp = cmp_vop3->clamp;
      new_vop3->omod = cmp_vop3->omod;
      new_vop3->opsel = cmp_vop3->opsel;
      new_instr = new_vop3;
   } else {
      new_instr = create_instruction<VOPC_instruction>(new_op, Format::VOPC, 2, 1);
   }
   new_instr->operands[0] = cmp->operands[0];
   new_instr->operands[1] = cmp->operands[1];
   new_instr->definitions[0] = instr->definitions[0];

   ctx.info[instr->definitions[0].tempId()].label = 0;
   ctx.info[instr->definitions[0].tempId()].set_fcmp(new_instr);

   instr.reset(new_instr);

   return true;
}
1573
/* s_or_b64(v_cmp_neq_f32(a, a), cmp(a, #b)) and b is not NaN -> get_unordered(cmp)(a, b)
 * s_and_b64(v_cmp_eq_f32(a, a), cmp(a, #b)) and b is not NaN -> get_ordered(cmp)(a, b)
 *
 * Like combine_comparison_ordering(), but one source of the comparison is a
 * known constant: the NaN test only checks 'a', which is sufficient when the
 * constant is provably not NaN. Returns true and replaces *instr on success. */
bool combine_constant_comparison_ordering(opt_ctx &ctx, aco_ptr<Instruction>& instr)
{
   if (instr->definitions[0].regClass() != ctx.program->lane_mask)
      return false;
   /* the SCC result of the s_and/s_or must be dead */
   if (instr->definitions[1].isTemp() && ctx.uses[instr->definitions[1].tempId()])
      return false;

   bool is_or = instr->opcode == aco_opcode::s_or_b64 || instr->opcode == aco_opcode::s_or_b32;

   Instruction *nan_test = follow_operand(ctx, instr->operands[0], true);
   Instruction *cmp = follow_operand(ctx, instr->operands[1], true);

   if (!nan_test || !cmp)
      return false;

   /* the two operands may appear in either order */
   aco_opcode expected_nan_test = is_or ? aco_opcode::v_cmp_neq_f32 : aco_opcode::v_cmp_eq_f32;
   if (cmp->opcode == expected_nan_test)
      std::swap(nan_test, cmp);
   else if (nan_test->opcode != expected_nan_test)
      return false;

   if (!is_cmp(cmp->opcode))
      return false;

   if (!nan_test->operands[0].isTemp() || !nan_test->operands[1].isTemp())
      return false;
   /* at least one comparison source must be a temp to match against */
   if (!cmp->operands[0].isTemp() && !cmp->operands[1].isTemp())
      return false;

   /* the NaN test must be a self-comparison */
   unsigned prop_nan0 = original_temp_id(ctx, nan_test->operands[0].getTemp());
   unsigned prop_nan1 = original_temp_id(ctx, nan_test->operands[1].getTemp());
   if (prop_nan0 != prop_nan1)
      return false;

   /* modifiers on the NaN test must apply to both sources identically */
   if (nan_test->isVOP3()) {
      VOP3A_instruction *vop3 = static_cast<VOP3A_instruction*>(nan_test);
      if (vop3->neg[0] != vop3->neg[1] || vop3->abs[0] != vop3->abs[1] || vop3->opsel == 1 || vop3->opsel == 2)
         return false;
   }

   /* find which comparison operand is the tested temp; the other one is
    * expected to hold the constant */
   int constant_operand = -1;
   for (unsigned i = 0; i < 2; i++) {
      if (cmp->operands[i].isTemp() && original_temp_id(ctx, cmp->operands[i].getTemp()) == prop_nan0) {
         constant_operand = !i;
         break;
      }
   }
   if (constant_operand == -1)
      return false;

   /* fetch the constant bits, either inline or via a constant/literal label */
   uint32_t constant;
   if (cmp->operands[constant_operand].isConstant()) {
      constant = cmp->operands[constant_operand].constantValue();
   } else if (cmp->operands[constant_operand].isTemp()) {
      Temp tmp = cmp->operands[constant_operand].getTemp();
      unsigned id = original_temp_id(ctx, tmp);
      if (!ctx.info[id].is_constant() && !ctx.info[id].is_literal())
         return false;
      constant = ctx.info[id].val;
   } else {
      return false;
   }

   /* the transformation is only valid if the constant cannot be NaN */
   float constantf;
   memcpy(&constantf, &constant, 4);
   if (isnan(constantf))
      return false;

   /* account for the new uses before releasing the old instructions */
   if (cmp->operands[0].isTemp())
      ctx.uses[cmp->operands[0].tempId()]++;
   if (cmp->operands[1].isTemp())
      ctx.uses[cmp->operands[1].tempId()]++;
   decrease_uses(ctx, nan_test);
   decrease_uses(ctx, cmp);

   aco_opcode new_op = is_or ? get_unordered(cmp->opcode) : get_ordered(cmp->opcode);
   Instruction *new_instr;
   if (cmp->isVOP3()) {
      /* preserve the original comparison's modifiers */
      VOP3A_instruction *new_vop3 = create_instruction<VOP3A_instruction>(new_op, asVOP3(Format::VOPC), 2, 1);
      VOP3A_instruction *cmp_vop3 = static_cast<VOP3A_instruction*>(cmp);
      memcpy(new_vop3->abs, cmp_vop3->abs, sizeof(new_vop3->abs));
      memcpy(new_vop3->neg, cmp_vop3->neg, sizeof(new_vop3->neg));
      new_vop3->clamp = cmp_vop3->clamp;
      new_vop3->omod = cmp_vop3->omod;
      new_vop3->opsel = cmp_vop3->opsel;
      new_instr = new_vop3;
   } else {
      new_instr = create_instruction<VOPC_instruction>(new_op, Format::VOPC, 2, 1);
   }
   new_instr->operands[0] = cmp->operands[0];
   new_instr->operands[1] = cmp->operands[1];
   new_instr->definitions[0] = instr->definitions[0];

   ctx.info[instr->definitions[0].tempId()].label = 0;
   ctx.info[instr->definitions[0].tempId()].set_fcmp(new_instr);

   instr.reset(new_instr);

   return true;
}
1676
/* s_not_b64(cmp(a, b)) -> get_inverse(cmp)(a, b)
 *
 * Folds a logical-not of a comparison into the inverse comparison.
 * Returns true and replaces *instr on success. */
bool combine_inverse_comparison(opt_ctx &ctx, aco_ptr<Instruction>& instr)
{
   if (instr->opcode != aco_opcode::s_not_b64)
      return false;
   /* the SCC result of the s_not must be dead */
   if (instr->definitions[1].isTemp() && ctx.uses[instr->definitions[1].tempId()])
      return false;
   if (!instr->operands[0].isTemp())
      return false;

   Instruction *cmp = follow_operand(ctx, instr->operands[0]);
   if (!cmp)
      return false;

   aco_opcode new_opcode = get_inverse(cmp->opcode);
   if (new_opcode == aco_opcode::num_opcodes)
      return false;

   /* account for the new uses before releasing the comparison */
   if (cmp->operands[0].isTemp())
      ctx.uses[cmp->operands[0].tempId()]++;
   if (cmp->operands[1].isTemp())
      ctx.uses[cmp->operands[1].tempId()]++;
   decrease_uses(ctx, cmp);

   Instruction *new_instr;
   if (cmp->isVOP3()) {
      /* preserve the original comparison's modifiers */
      VOP3A_instruction *new_vop3 = create_instruction<VOP3A_instruction>(new_opcode, asVOP3(Format::VOPC), 2, 1);
      VOP3A_instruction *cmp_vop3 = static_cast<VOP3A_instruction*>(cmp);
      memcpy(new_vop3->abs, cmp_vop3->abs, sizeof(new_vop3->abs));
      memcpy(new_vop3->neg, cmp_vop3->neg, sizeof(new_vop3->neg));
      new_vop3->clamp = cmp_vop3->clamp;
      new_vop3->omod = cmp_vop3->omod;
      new_vop3->opsel = cmp_vop3->opsel;
      new_instr = new_vop3;
   } else {
      new_instr = create_instruction<VOPC_instruction>(new_opcode, Format::VOPC, 2, 1);
   }
   new_instr->operands[0] = cmp->operands[0];
   new_instr->operands[1] = cmp->operands[1];
   new_instr->definitions[0] = instr->definitions[0];

   ctx.info[instr->definitions[0].tempId()].label = 0;
   ctx.info[instr->definitions[0].tempId()].set_fcmp(new_instr);

   instr.reset(new_instr);

   return true;
}
1725
/* op1(op2(1, 2), 0) if swap = false
 * op1(0, op2(1, 2)) if swap = true
 *
 * Checks whether op1_instr and the defining instruction of one of its
 * operands can be fused into a single three-operand VOP3. On success, fills
 * 'operands'/'neg'/'abs'/'opsel' with the three gathered sources and their
 * modifiers, permuted according to shuffle_str (a 3-char digit string that
 * maps the conceptual positions above to output slots). op1's clamp/omod are
 * returned via op1_clamp/op1_omod. 'inbetween_*' return any modifier op1
 * applied to op2's result; passing NULL instead makes such a modifier a
 * mismatch. Returns false if no fusion is possible. */
bool match_op3_for_vop3(opt_ctx &ctx, aco_opcode op1, aco_opcode op2,
                        Instruction* op1_instr, bool swap, const char *shuffle_str,
                        Operand operands[3], bool neg[3], bool abs[3], uint8_t *opsel,
                        bool *op1_clamp, uint8_t *op1_omod,
                        bool *inbetween_neg, bool *inbetween_abs, bool *inbetween_opsel)
{
   /* checks */
   if (op1_instr->opcode != op1)
      return false;

   Instruction *op2_instr = follow_operand(ctx, op1_instr->operands[swap]);
   if (!op2_instr || op2_instr->opcode != op2)
      return false;
   if (fixed_to_exec(op2_instr->operands[0]) || fixed_to_exec(op2_instr->operands[1]))
      return false;

   VOP3A_instruction *op1_vop3 = op1_instr->isVOP3() ? static_cast<VOP3A_instruction *>(op1_instr) : NULL;
   VOP3A_instruction *op2_vop3 = op2_instr->isVOP3() ? static_cast<VOP3A_instruction *>(op2_instr) : NULL;

   /* don't support inbetween clamp/omod */
   if (op2_vop3 && (op2_vop3->clamp || op2_vop3->omod))
      return false;

   /* get operands and modifiers and check inbetween modifiers */
   *op1_clamp = op1_vop3 ? op1_vop3->clamp : false;
   *op1_omod = op1_vop3 ? op1_vop3->omod : 0u;

   if (inbetween_neg)
      *inbetween_neg = op1_vop3 ? op1_vop3->neg[swap] : false;
   else if (op1_vop3 && op1_vop3->neg[swap])
      return false;

   if (inbetween_abs)
      *inbetween_abs = op1_vop3 ? op1_vop3->abs[swap] : false;
   else if (op1_vop3 && op1_vop3->abs[swap])
      return false;

   if (inbetween_opsel)
      *inbetween_opsel = op1_vop3 ? op1_vop3->opsel & (1 << swap) : false;
   else if (op1_vop3 && op1_vop3->opsel & (1 << swap))
      return false;

   /* shuffle[conceptual position] = output slot; e.g. "012" is identity */
   int shuffle[3];
   shuffle[shuffle_str[0] - '0'] = 0;
   shuffle[shuffle_str[1] - '0'] = 1;
   shuffle[shuffle_str[2] - '0'] = 2;

   /* op1's remaining source is conceptual position 0 */
   operands[shuffle[0]] = op1_instr->operands[!swap];
   neg[shuffle[0]] = op1_vop3 ? op1_vop3->neg[!swap] : false;
   abs[shuffle[0]] = op1_vop3 ? op1_vop3->abs[!swap] : false;
   if (op1_vop3 && op1_vop3->opsel & (1 << !swap))
      *opsel |= 1 << shuffle[0];

   /* op2's sources are conceptual positions 1 and 2 */
   for (unsigned i = 0; i < 2; i++) {
      operands[shuffle[i + 1]] = op2_instr->operands[i];
      neg[shuffle[i + 1]] = op2_vop3 ? op2_vop3->neg[i] : false;
      abs[shuffle[i + 1]] = op2_vop3 ? op2_vop3->abs[i] : false;
      if (op2_vop3 && op2_vop3->opsel & (1 << i))
         *opsel |= 1 << shuffle[i + 1];
   }

   /* check operands */
   if (!check_vop3_operands(ctx, 3, operands))
      return false;

   return true;
}
1795
1796 void create_vop3_for_op3(opt_ctx& ctx, aco_opcode opcode, aco_ptr<Instruction>& instr,
1797 Operand operands[3], bool neg[3], bool abs[3], uint8_t opsel,
1798 bool clamp, unsigned omod)
1799 {
1800 VOP3A_instruction *new_instr = create_instruction<VOP3A_instruction>(opcode, Format::VOP3A, 3, 1);
1801 memcpy(new_instr->abs, abs, sizeof(bool[3]));
1802 memcpy(new_instr->neg, neg, sizeof(bool[3]));
1803 new_instr->clamp = clamp;
1804 new_instr->omod = omod;
1805 new_instr->opsel = opsel;
1806 new_instr->operands[0] = operands[0];
1807 new_instr->operands[1] = operands[1];
1808 new_instr->operands[2] = operands[2];
1809 new_instr->definitions[0] = instr->definitions[0];
1810 ctx.info[instr->definitions[0].tempId()].label = 0;
1811
1812 instr.reset(new_instr);
1813 }
1814
/* Try to fuse this instruction with the op2-producing instruction behind one
 * of its operands into a single three-operand VOP3 'new_op' (sources permuted
 * per 'shuffle'; see match_op3_for_vop3()). 'ops' is a bitmask selecting
 * which operand(s) may be the inner instruction (bit 0: operand 0,
 * bit 1: operand 1). Returns true and replaces *instr on success. */
bool combine_three_valu_op(opt_ctx& ctx, aco_ptr<Instruction>& instr, aco_opcode op2, aco_opcode new_op, const char *shuffle, uint8_t ops)
{
   /* remember pending omod/clamp labels so they survive the rewrite */
   uint32_t omod_clamp = ctx.info[instr->definitions[0].tempId()].label &
                         (label_omod_success | label_clamp_success);

   for (unsigned swap = 0; swap < 2; swap++) {
      if (!((1 << swap) & ops))
         continue;

      Operand operands[3];
      bool neg[3], abs[3], clamp;
      uint8_t opsel = 0, omod = 0;
      if (match_op3_for_vop3(ctx, instr->opcode, op2,
                             instr.get(), swap, shuffle,
                             operands, neg, abs, &opsel,
                             &clamp, &omod, NULL, NULL, NULL)) {
         /* the inner instruction's result loses its use by this instruction */
         ctx.uses[instr->operands[swap].tempId()]--;
         create_vop3_for_op3(ctx, new_op, instr, operands, neg, abs, opsel, clamp, omod);
         if (omod_clamp & label_omod_success)
            ctx.info[instr->definitions[0].tempId()].set_omod_success(instr.get());
         if (omod_clamp & label_clamp_success)
            ctx.info[instr->definitions[0].tempId()].set_clamp_success(instr.get());
         return true;
      }
   }
   return false;
}
1842
/* Combine nested min/max into the three-operand min3/max3 forms:
 *   min(min(a, b), c) -> min3(a, b, c)  (and likewise for max)
 * and, via the opposite opcode with an inbetween negate:
 *   min(-max(a, b), c) -> min3(-a, -b, c)
 *   max(-min(a, b), c) -> max3(-a, -b, c)
 * Returns true and replaces *instr on success. */
bool combine_minmax(opt_ctx& ctx, aco_ptr<Instruction>& instr, aco_opcode opposite, aco_opcode minmax3)
{
   /* same-opcode nesting: either operand may be the inner min/max */
   if (combine_three_valu_op(ctx, instr, instr->opcode, minmax3, "012", 1 | 2))
      return true;

   /* remember pending omod/clamp labels so they survive the rewrite */
   uint32_t omod_clamp = ctx.info[instr->definitions[0].tempId()].label &
                         (label_omod_success | label_clamp_success);

   /* min(-max(a, b), c) -> min3(-a, -b, c) *
    * max(-min(a, b), c) -> max3(-a, -b, c) */
   for (unsigned swap = 0; swap < 2; swap++) {
      Operand operands[3];
      bool neg[3], abs[3], clamp;
      uint8_t opsel = 0, omod = 0;
      bool inbetween_neg;
      if (match_op3_for_vop3(ctx, instr->opcode, opposite,
                             instr.get(), swap, "012",
                             operands, neg, abs, &opsel,
                             &clamp, &omod, &inbetween_neg, NULL, NULL) &&
          inbetween_neg) {
         ctx.uses[instr->operands[swap].tempId()]--;
         /* push the negate onto the inner instruction's two sources
          * (-max(a, b) == min(-a, -b)) */
         neg[1] = true;
         neg[2] = true;
         create_vop3_for_op3(ctx, minmax3, instr, operands, neg, abs, opsel, clamp, omod);
         if (omod_clamp & label_omod_success)
            ctx.info[instr->definitions[0].tempId()].set_omod_success(instr.get());
         if (omod_clamp & label_clamp_success)
            ctx.info[instr->definitions[0].tempId()].set_clamp_success(instr.get());
         return true;
      }
   }
   return false;
}
1876
1877 /* s_not_b32(s_and_b32(a, b)) -> s_nand_b32(a, b)
1878 * s_not_b32(s_or_b32(a, b)) -> s_nor_b32(a, b)
1879 * s_not_b32(s_xor_b32(a, b)) -> s_xnor_b32(a, b)
1880 * s_not_b64(s_and_b64(a, b)) -> s_nand_b64(a, b)
1881 * s_not_b64(s_or_b64(a, b)) -> s_nor_b64(a, b)
1882 * s_not_b64(s_xor_b64(a, b)) -> s_xnor_b64(a, b) */
1883 bool combine_salu_not_bitwise(opt_ctx& ctx, aco_ptr<Instruction>& instr)
1884 {
1885 /* checks */
1886 if (!instr->operands[0].isTemp())
1887 return false;
1888 if (instr->definitions[1].isTemp() && ctx.uses[instr->definitions[1].tempId()])
1889 return false;
1890
1891 Instruction *op2_instr = follow_operand(ctx, instr->operands[0]);
1892 if (!op2_instr)
1893 return false;
1894 switch (op2_instr->opcode) {
1895 case aco_opcode::s_and_b32:
1896 case aco_opcode::s_or_b32:
1897 case aco_opcode::s_xor_b32:
1898 case aco_opcode::s_and_b64:
1899 case aco_opcode::s_or_b64:
1900 case aco_opcode::s_xor_b64:
1901 break;
1902 default:
1903 return false;
1904 }
1905
1906 /* create instruction */
1907 std::swap(instr->definitions[0], op2_instr->definitions[0]);
1908 std::swap(instr->definitions[1], op2_instr->definitions[1]);
1909 ctx.uses[instr->operands[0].tempId()]--;
1910 ctx.info[op2_instr->definitions[0].tempId()].label = 0;
1911
1912 switch (op2_instr->opcode) {
1913 case aco_opcode::s_and_b32:
1914 op2_instr->opcode = aco_opcode::s_nand_b32;
1915 break;
1916 case aco_opcode::s_or_b32:
1917 op2_instr->opcode = aco_opcode::s_nor_b32;
1918 break;
1919 case aco_opcode::s_xor_b32:
1920 op2_instr->opcode = aco_opcode::s_xnor_b32;
1921 break;
1922 case aco_opcode::s_and_b64:
1923 op2_instr->opcode = aco_opcode::s_nand_b64;
1924 break;
1925 case aco_opcode::s_or_b64:
1926 op2_instr->opcode = aco_opcode::s_nor_b64;
1927 break;
1928 case aco_opcode::s_xor_b64:
1929 op2_instr->opcode = aco_opcode::s_xnor_b64;
1930 break;
1931 default:
1932 break;
1933 }
1934
1935 return true;
1936 }
1937
1938 /* s_and_b32(a, s_not_b32(b)) -> s_andn2_b32(a, b)
1939 * s_or_b32(a, s_not_b32(b)) -> s_orn2_b32(a, b)
1940 * s_and_b64(a, s_not_b64(b)) -> s_andn2_b64(a, b)
1941 * s_or_b64(a, s_not_b64(b)) -> s_orn2_b64(a, b) */
1942 bool combine_salu_n2(opt_ctx& ctx, aco_ptr<Instruction>& instr)
1943 {
1944 if (instr->definitions[0].isTemp() && ctx.info[instr->definitions[0].tempId()].is_uniform_bool())
1945 return false;
1946
1947 for (unsigned i = 0; i < 2; i++) {
1948 Instruction *op2_instr = follow_operand(ctx, instr->operands[i]);
1949 if (!op2_instr || (op2_instr->opcode != aco_opcode::s_not_b32 && op2_instr->opcode != aco_opcode::s_not_b64))
1950 continue;
1951 if (ctx.uses[op2_instr->definitions[1].tempId()] || fixed_to_exec(op2_instr->operands[0]))
1952 continue;
1953
1954 if (instr->operands[!i].isLiteral() && op2_instr->operands[0].isLiteral() &&
1955 instr->operands[!i].constantValue() != op2_instr->operands[0].constantValue())
1956 continue;
1957
1958 ctx.uses[instr->operands[i].tempId()]--;
1959 instr->operands[0] = instr->operands[!i];
1960 instr->operands[1] = op2_instr->operands[0];
1961 ctx.info[instr->definitions[0].tempId()].label = 0;
1962
1963 switch (instr->opcode) {
1964 case aco_opcode::s_and_b32:
1965 instr->opcode = aco_opcode::s_andn2_b32;
1966 break;
1967 case aco_opcode::s_or_b32:
1968 instr->opcode = aco_opcode::s_orn2_b32;
1969 break;
1970 case aco_opcode::s_and_b64:
1971 instr->opcode = aco_opcode::s_andn2_b64;
1972 break;
1973 case aco_opcode::s_or_b64:
1974 instr->opcode = aco_opcode::s_orn2_b64;
1975 break;
1976 default:
1977 break;
1978 }
1979
1980 return true;
1981 }
1982 return false;
1983 }
1984
1985 /* s_add_{i32,u32}(a, s_lshl_b32(b, <n>)) -> s_lshl<n>_add_u32(a, b) */
1986 bool combine_salu_lshl_add(opt_ctx& ctx, aco_ptr<Instruction>& instr)
1987 {
1988 if (instr->opcode == aco_opcode::s_add_i32 && ctx.uses[instr->definitions[1].tempId()])
1989 return false;
1990
1991 for (unsigned i = 0; i < 2; i++) {
1992 Instruction *op2_instr = follow_operand(ctx, instr->operands[i]);
1993 if (!op2_instr || op2_instr->opcode != aco_opcode::s_lshl_b32 ||
1994 ctx.uses[op2_instr->definitions[1].tempId()])
1995 continue;
1996 if (!op2_instr->operands[1].isConstant() || fixed_to_exec(op2_instr->operands[0]))
1997 continue;
1998
1999 uint32_t shift = op2_instr->operands[1].constantValue();
2000 if (shift < 1 || shift > 4)
2001 continue;
2002
2003 if (instr->operands[!i].isLiteral() && op2_instr->operands[0].isLiteral() &&
2004 instr->operands[!i].constantValue() != op2_instr->operands[0].constantValue())
2005 continue;
2006
2007 ctx.uses[instr->operands[i].tempId()]--;
2008 instr->operands[1] = instr->operands[!i];
2009 instr->operands[0] = op2_instr->operands[0];
2010 ctx.info[instr->definitions[0].tempId()].label = 0;
2011
2012 instr->opcode = ((aco_opcode[]){aco_opcode::s_lshl1_add_u32,
2013 aco_opcode::s_lshl2_add_u32,
2014 aco_opcode::s_lshl3_add_u32,
2015 aco_opcode::s_lshl4_add_u32})[shift - 1];
2016
2017 return true;
2018 }
2019 return false;
2020 }
2021
2022 bool combine_add_sub_b2i(opt_ctx& ctx, aco_ptr<Instruction>& instr, aco_opcode new_op, uint8_t ops)
2023 {
2024 if (instr->usesModifiers())
2025 return false;
2026
2027 for (unsigned i = 0; i < 2; i++) {
2028 if (!((1 << i) & ops))
2029 continue;
2030 if (instr->operands[i].isTemp() &&
2031 ctx.info[instr->operands[i].tempId()].is_b2i() &&
2032 ctx.uses[instr->operands[i].tempId()] == 1) {
2033
2034 aco_ptr<Instruction> new_instr;
2035 if (instr->operands[!i].isTemp() && instr->operands[!i].getTemp().type() == RegType::vgpr) {
2036 new_instr.reset(create_instruction<VOP2_instruction>(new_op, Format::VOP2, 3, 2));
2037 } else if (ctx.program->chip_class >= GFX10 ||
2038 (instr->operands[!i].isConstant() && !instr->operands[!i].isLiteral())) {
2039 new_instr.reset(create_instruction<VOP3A_instruction>(new_op, asVOP3(Format::VOP2), 3, 2));
2040 } else {
2041 return false;
2042 }
2043 ctx.uses[instr->operands[i].tempId()]--;
2044 new_instr->definitions[0] = instr->definitions[0];
2045 new_instr->definitions[1] = instr->definitions.size() == 2 ? instr->definitions[1] :
2046 Definition(ctx.program->allocateId(), ctx.program->lane_mask);
2047 new_instr->definitions[1].setHint(vcc);
2048 new_instr->operands[0] = Operand(0u);
2049 new_instr->operands[1] = instr->operands[!i];
2050 new_instr->operands[2] = Operand(ctx.info[instr->operands[i].tempId()].temp);
2051 instr = std::move(new_instr);
2052 ctx.info[instr->definitions[0].tempId()].label = 0;
2053 return true;
2054 }
2055 }
2056
2057 return false;
2058 }
2059
/* For a min/max/med3 opcode of any supported type, fill in the whole opcode
 * family (min, max, min3, max3, med3) of that type. The gfx9 flag is set true
 * for the 16-bit variants (caller checks it against the chip class).
 * Returns false for any opcode outside these families. */
bool get_minmax_info(aco_opcode op, aco_opcode *min, aco_opcode *max, aco_opcode *min3, aco_opcode *max3, aco_opcode *med3, bool *some_gfx9_only)
{
   switch (op) {
   /* expands to the three case labels for one type and assigns all outputs */
   #define MINMAX(type, gfx9) \
   case aco_opcode::v_min_##type:\
   case aco_opcode::v_max_##type:\
   case aco_opcode::v_med3_##type:\
      *min = aco_opcode::v_min_##type;\
      *max = aco_opcode::v_max_##type;\
      *med3 = aco_opcode::v_med3_##type;\
      *min3 = aco_opcode::v_min3_##type;\
      *max3 = aco_opcode::v_max3_##type;\
      *some_gfx9_only = gfx9;\
      return true;
   MINMAX(f32, false)
   MINMAX(u32, false)
   MINMAX(i32, false)
   MINMAX(f16, true)
   MINMAX(u16, true)
   MINMAX(i16, true)
   #undef MINMAX
   default:
      return false;
   }
}
2085
/* v_min_{f,u,i}{16,32}(v_max_{f,u,i}{16,32}(a, lb), ub) -> v_med3_{f,u,i}{16,32}(a, lb, ub) when ub > lb
 * v_max_{f,u,i}{16,32}(v_min_{f,u,i}{16,32}(a, ub), lb) -> v_med3_{f,u,i}{16,32}(a, lb, ub) when ub > lb */
bool combine_clamp(opt_ctx& ctx, aco_ptr<Instruction>& instr,
                   aco_opcode min, aco_opcode max, aco_opcode med)
{
   /* TODO: GLSL's clamp(x, minVal, maxVal) and SPIR-V's
    * FClamp(x, minVal, maxVal)/NClamp(x, minVal, maxVal) are undefined if
    * minVal > maxVal, which means we can always select it to a v_med3_f32 */
   aco_opcode other_op;
   if (instr->opcode == min)
      other_op = max;
   else if (instr->opcode == max)
      other_op = min;
   else
      return false;

   /* remember previously applied omod/clamp so they can be re-labeled */
   uint32_t omod_clamp = ctx.info[instr->definitions[0].tempId()].label &
                         (label_omod_success | label_clamp_success);

   for (unsigned swap = 0; swap < 2; swap++) {
      Operand operands[3];
      bool neg[3], abs[3], clamp;
      uint8_t opsel = 0, omod = 0;
      if (match_op3_for_vop3(ctx, instr->opcode, other_op, instr.get(), swap,
                             "012", operands, neg, abs, &opsel,
                             &clamp, &omod, NULL, NULL, NULL)) {
         /* find the two constant operands among the three matched sources
          * (either inline constants or temps labeled constant/literal) */
         int const0_idx = -1, const1_idx = -1;
         uint32_t const0 = 0, const1 = 0;
         for (int i = 0; i < 3; i++) {
            uint32_t val;
            if (operands[i].isConstant()) {
               val = operands[i].constantValue();
            } else if (operands[i].isTemp() && ctx.info[operands[i].tempId()].is_constant_or_literal()) {
               val = ctx.info[operands[i].tempId()].val;
            } else {
               continue;
            }
            if (const0_idx >= 0) {
               const1_idx = i;
               const1 = val;
            } else {
               const0_idx = i;
               const0 = val;
            }
         }
         if (const0_idx < 0 || const1_idx < 0)
            continue;

         /* when opsel selects the high half of a source, compare that half */
         if (opsel & (1 << const0_idx))
            const0 >>= 16;
         if (opsel & (1 << const1_idx))
            const1 >>= 16;

         /* decide which constant is the lower bound, interpreting the bits
          * per the min opcode's type and honoring abs/neg for floats */
         int lower_idx = const0_idx;
         switch (min) {
         case aco_opcode::v_min_f32:
         case aco_opcode::v_min_f16: {
            float const0_f, const1_f;
            if (min == aco_opcode::v_min_f32) {
               memcpy(&const0_f, &const0, 4);
               memcpy(&const1_f, &const1, 4);
            } else {
               const0_f = _mesa_half_to_float(const0);
               const1_f = _mesa_half_to_float(const1);
            }
            if (abs[const0_idx]) const0_f = fabsf(const0_f);
            if (abs[const1_idx]) const1_f = fabsf(const1_f);
            if (neg[const0_idx]) const0_f = -const0_f;
            if (neg[const1_idx]) const1_f = -const1_f;
            lower_idx = const0_f < const1_f ? const0_idx : const1_idx;
            break;
         }
         case aco_opcode::v_min_u32: {
            lower_idx = const0 < const1 ? const0_idx : const1_idx;
            break;
         }
         case aco_opcode::v_min_u16: {
            lower_idx = (uint16_t)const0 < (uint16_t)const1 ? const0_idx : const1_idx;
            break;
         }
         case aco_opcode::v_min_i32: {
            /* reconstruct the signed value from the raw bits */
            int32_t const0_i = const0 & 0x80000000u ? -2147483648 + (int32_t)(const0 & 0x7fffffffu) : const0;
            int32_t const1_i = const1 & 0x80000000u ? -2147483648 + (int32_t)(const1 & 0x7fffffffu) : const1;
            lower_idx = const0_i < const1_i ? const0_idx : const1_idx;
            break;
         }
         case aco_opcode::v_min_i16: {
            int16_t const0_i = const0 & 0x8000u ? -32768 + (int16_t)(const0 & 0x7fffu) : const0;
            int16_t const1_i = const1 & 0x8000u ? -32768 + (int16_t)(const1 & 0x7fffu) : const1;
            lower_idx = const0_i < const1_i ? const0_idx : const1_idx;
            break;
         }
         default:
            break;
         }
         int upper_idx = lower_idx == const0_idx ? const1_idx : const0_idx;

         /* check that the bounds ended up in the slots that make the
          * transform valid (i.e. the pattern really is a clamp, ub > lb) */
         if (instr->opcode == min) {
            if (upper_idx != 0 || lower_idx == 0)
               return false;
         } else {
            if (upper_idx == 0 || lower_idx != 0)
               return false;
         }

         ctx.uses[instr->operands[swap].tempId()]--;
         create_vop3_for_op3(ctx, med, instr, operands, neg, abs, opsel, clamp, omod);
         if (omod_clamp & label_omod_success)
            ctx.info[instr->definitions[0].tempId()].set_omod_success(instr.get());
         if (omod_clamp & label_clamp_success)
            ctx.info[instr->definitions[0].tempId()].set_clamp_success(instr.get());

         return true;
      }
   }

   return false;
}
2204
2205
2206 void apply_sgprs(opt_ctx &ctx, aco_ptr<Instruction>& instr)
2207 {
2208 bool is_shift64 = instr->opcode == aco_opcode::v_lshlrev_b64 ||
2209 instr->opcode == aco_opcode::v_lshrrev_b64 ||
2210 instr->opcode == aco_opcode::v_ashrrev_i64;
2211
2212 /* find candidates and create the set of sgprs already read */
2213 unsigned sgpr_ids[2] = {0, 0};
2214 uint32_t operand_mask = 0;
2215 bool has_literal = false;
2216 for (unsigned i = 0; i < instr->operands.size(); i++) {
2217 if (instr->operands[i].isLiteral())
2218 has_literal = true;
2219 if (!instr->operands[i].isTemp())
2220 continue;
2221 if (instr->operands[i].getTemp().type() == RegType::sgpr) {
2222 if (instr->operands[i].tempId() != sgpr_ids[0])
2223 sgpr_ids[!!sgpr_ids[0]] = instr->operands[i].tempId();
2224 }
2225 ssa_info& info = ctx.info[instr->operands[i].tempId()];
2226 if (info.is_temp() && info.temp.type() == RegType::sgpr)
2227 operand_mask |= 1u << i;
2228 }
2229 unsigned max_sgprs = 1;
2230 if (ctx.program->chip_class >= GFX10 && !is_shift64)
2231 max_sgprs = 2;
2232 if (has_literal)
2233 max_sgprs--;
2234
2235 unsigned num_sgprs = !!sgpr_ids[0] + !!sgpr_ids[1];
2236
2237 /* keep on applying sgprs until there is nothing left to be done */
2238 while (operand_mask) {
2239 uint32_t sgpr_idx = 0;
2240 uint32_t sgpr_info_id = 0;
2241 uint32_t mask = operand_mask;
2242 /* choose a sgpr */
2243 while (mask) {
2244 unsigned i = u_bit_scan(&mask);
2245 uint16_t uses = ctx.uses[instr->operands[i].tempId()];
2246 if (sgpr_info_id == 0 || uses < ctx.uses[sgpr_info_id]) {
2247 sgpr_idx = i;
2248 sgpr_info_id = instr->operands[i].tempId();
2249 }
2250 }
2251 operand_mask &= ~(1u << sgpr_idx);
2252
2253 /* Applying two sgprs require making it VOP3, so don't do it unless it's
2254 * definitively beneficial.
2255 * TODO: this is too conservative because later the use count could be reduced to 1 */
2256 if (num_sgprs && ctx.uses[sgpr_info_id] > 1 && !instr->isVOP3())
2257 break;
2258
2259 Temp sgpr = ctx.info[sgpr_info_id].temp;
2260 bool new_sgpr = sgpr.id() != sgpr_ids[0] && sgpr.id() != sgpr_ids[1];
2261 if (new_sgpr && num_sgprs >= max_sgprs)
2262 continue;
2263
2264 if (sgpr_idx == 0 || instr->isVOP3()) {
2265 instr->operands[sgpr_idx] = Operand(sgpr);
2266 } else if (can_swap_operands(instr)) {
2267 instr->operands[sgpr_idx] = instr->operands[0];
2268 instr->operands[0] = Operand(sgpr);
2269 /* swap bits using a 4-entry LUT */
2270 uint32_t swapped = (0x3120 >> (operand_mask & 0x3)) & 0xf;
2271 operand_mask = (operand_mask & ~0x3) | swapped;
2272 } else if (can_use_VOP3(ctx, instr)) {
2273 to_VOP3(ctx, instr);
2274 instr->operands[sgpr_idx] = Operand(sgpr);
2275 } else {
2276 continue;
2277 }
2278
2279 if (new_sgpr)
2280 sgpr_ids[num_sgprs++] = sgpr.id();
2281 ctx.uses[sgpr_info_id]--;
2282 ctx.uses[sgpr.id()]++;
2283 }
2284 }
2285
/* Fold output modifiers (omod) and clamp into the instruction that produced
 * the value, when this instruction only exists to apply them. Returns true
 * when instr has been made dead (its modifier was absorbed elsewhere). */
bool apply_omod_clamp(opt_ctx &ctx, Block& block, aco_ptr<Instruction>& instr)
{
   /* check if we could apply omod on predecessor */
   if (instr->opcode == aco_opcode::v_mul_f32 || instr->opcode == aco_opcode::v_mul_f16) {
      bool op0 = instr->operands[0].isTemp() && ctx.info[instr->operands[0].tempId()].is_omod_success();
      bool op1 = instr->operands[1].isTemp() && ctx.info[instr->operands[1].tempId()].is_omod_success();
      if (op0 || op1) {
         unsigned idx = op0 ? 0 : 1;
         /* omod was successfully applied */
         /* if the omod instruction is v_mad, we also have to change the original add */
         if (ctx.info[instr->operands[idx].tempId()].is_mad()) {
            Instruction* add_instr = ctx.mad_infos[ctx.info[instr->operands[idx].tempId()].val].add_instr.get();
            if (ctx.info[instr->definitions[0].tempId()].is_clamp())
               static_cast<VOP3A_instruction*>(add_instr)->clamp = true;
            add_instr->definitions[0] = instr->definitions[0];
         }

         Instruction* omod_instr = ctx.info[instr->operands[idx].tempId()].instr;
         /* check if we have an additional clamp modifier */
         if (ctx.info[instr->definitions[0].tempId()].is_clamp() && ctx.uses[instr->definitions[0].tempId()] == 1 &&
             ctx.uses[ctx.info[instr->definitions[0].tempId()].temp.id()]) {
            static_cast<VOP3A_instruction*>(omod_instr)->clamp = true;
            ctx.info[instr->definitions[0].tempId()].set_clamp_success(omod_instr);
         }
         /* change definition ssa-id of modified instruction */
         omod_instr->definitions[0] = instr->definitions[0];

         /* change the definition of instr to something unused, e.g. the original omod def */
         instr->definitions[0] = Definition(instr->operands[idx].getTemp());
         ctx.uses[instr->definitions[0].tempId()] = 0;
         return true;
      }
      if (!ctx.info[instr->definitions[0].tempId()].label) {
         /* in all other cases, label this instruction as option for multiply-add */
         ctx.info[instr->definitions[0].tempId()].set_mul(instr.get());
      }
   }

   /* check if we could apply clamp on predecessor */
   if (instr->opcode == aco_opcode::v_med3_f32 || instr->opcode == aco_opcode::v_med3_f16) {
      bool is_fp16 = instr->opcode == aco_opcode::v_med3_f16;
      unsigned idx = 0;
      /* a med3 with constants 0.0 and 1.0 is a clamp of the third operand */
      bool found_zero = false, found_one = false;
      for (unsigned i = 0; i < 3; i++)
      {
         if (instr->operands[i].constantEquals(0))
            found_zero = true;
         else if (instr->operands[i].constantEquals(is_fp16 ? 0x3c00 : 0x3f800000)) /* 1.0 */
            found_one = true;
         else
            idx = i;
      }
      if (found_zero && found_one && instr->operands[idx].isTemp() &&
          ctx.info[instr->operands[idx].tempId()].is_clamp_success()) {
         /* clamp was successfully applied */
         /* if the clamp instruction is v_mad, we also have to change the original add */
         if (ctx.info[instr->operands[idx].tempId()].is_mad()) {
            Instruction* add_instr = ctx.mad_infos[ctx.info[instr->operands[idx].tempId()].val].add_instr.get();
            add_instr->definitions[0] = instr->definitions[0];
         }
         Instruction* clamp_instr = ctx.info[instr->operands[idx].tempId()].instr;
         /* change definition ssa-id of modified instruction */
         clamp_instr->definitions[0] = instr->definitions[0];

         /* change the definition of instr to something unused, e.g. the original omod def */
         instr->definitions[0] = Definition(instr->operands[idx].getTemp());
         ctx.uses[instr->definitions[0].tempId()] = 0;
         return true;
      }
   }

   /* omod has no effect if denormals are enabled */
   /* apply omod / clamp modifiers if the def is used only once and the instruction can have modifiers */
   if (!instr->definitions.empty() && ctx.uses[instr->definitions[0].tempId()] == 1 &&
       can_use_VOP3(ctx, instr) && instr_info.can_use_output_modifiers[(int)instr->opcode]) {
      bool can_use_omod = (instr->definitions[0].bytes() == 4 ? block.fp_mode.denorm32 : block.fp_mode.denorm16_64) == 0;
      ssa_info& def_info = ctx.info[instr->definitions[0].tempId()];
      /* omod field values 1/2/3 correspond to the is_omod2/is_omod4/is_omod5
       * labels (presumably *2, *4, *0.5 — set where these labels originate) */
      if (can_use_omod && def_info.is_omod2() && ctx.uses[def_info.temp.id()]) {
         to_VOP3(ctx, instr);
         static_cast<VOP3A_instruction*>(instr.get())->omod = 1;
         def_info.set_omod_success(instr.get());
      } else if (can_use_omod && def_info.is_omod4() && ctx.uses[def_info.temp.id()]) {
         to_VOP3(ctx, instr);
         static_cast<VOP3A_instruction*>(instr.get())->omod = 2;
         def_info.set_omod_success(instr.get());
      } else if (can_use_omod && def_info.is_omod5() && ctx.uses[def_info.temp.id()]) {
         to_VOP3(ctx, instr);
         static_cast<VOP3A_instruction*>(instr.get())->omod = 3;
         def_info.set_omod_success(instr.get());
      } else if (def_info.is_clamp() && ctx.uses[def_info.temp.id()]) {
         to_VOP3(ctx, instr);
         static_cast<VOP3A_instruction*>(instr.get())->clamp = true;
         def_info.set_clamp_success(instr.get());
      }
   }

   return false;
}
2384
2385 // TODO: we could possibly move the whole label_instruction pass to combine_instruction:
2386 // this would mean that we'd have to fix the instruction uses while value propagation
2387
/* Second optimizer pass: combine instructions into more powerful forms
 * (mad/fma, min3/max3/med3, add3, lshl_add, nand/nor, ...) using the labels
 * collected by the first pass, and apply sgpr/omod/clamp propagation. */
void combine_instruction(opt_ctx &ctx, Block& block, aco_ptr<Instruction>& instr)
{
   if (instr->definitions.empty() || is_dead(ctx.uses, instr.get()))
      return;

   if (instr->isVALU()) {
      if (can_apply_sgprs(instr))
         apply_sgprs(ctx, instr);
      if (apply_omod_clamp(ctx, block, instr))
         return; /* instr became dead, nothing more to do */
   }

   if (ctx.info[instr->definitions[0].tempId()].is_vcc_hint()) {
      instr->definitions[0].setHint(vcc);
   }

   /* TODO: There are still some peephole optimizations that could be done:
    * - abs(a - b) -> s_absdiff_i32
    * - various patterns for s_bitcmp{0,1}_b32 and s_bitset{0,1}_b32
    * - patterns for v_alignbit_b32 and v_alignbyte_b32
    * These aren't probably too interesting though.
    * There are also patterns for v_cmp_class_f{16,32,64}. This is difficult but
    * probably more useful than the previously mentioned optimizations.
    * The various comparison optimizations also currently only work with 32-bit
    * floats. */

   /* neg(mul(a, b)) -> mul(neg(a), b) */
   if (ctx.info[instr->definitions[0].tempId()].is_neg() && ctx.uses[instr->operands[1].tempId()] == 1) {
      Temp val = ctx.info[instr->definitions[0].tempId()].temp;

      if (!ctx.info[val.id()].is_mul())
         return;

      Instruction* mul_instr = ctx.info[val.id()].instr;

      if (mul_instr->operands[0].isLiteral())
         return;
      if (mul_instr->isVOP3() && static_cast<VOP3A_instruction*>(mul_instr)->clamp)
         return;

      /* convert to mul(neg(a), b) */
      ctx.uses[mul_instr->definitions[0].tempId()]--;
      Definition def = instr->definitions[0];
      /* neg(abs(mul(a, b))) -> mul(neg(abs(a)), abs(b)) */
      bool is_abs = ctx.info[instr->definitions[0].tempId()].is_abs();
      instr.reset(create_instruction<VOP3A_instruction>(mul_instr->opcode, asVOP3(Format::VOP2), 2, 1));
      instr->operands[0] = mul_instr->operands[0];
      instr->operands[1] = mul_instr->operands[1];
      instr->definitions[0] = def;
      VOP3A_instruction* new_mul = static_cast<VOP3A_instruction*>(instr.get());
      if (mul_instr->isVOP3()) {
         VOP3A_instruction* mul = static_cast<VOP3A_instruction*>(mul_instr);
         /* abs wins over an inner neg; keep the old omod */
         new_mul->neg[0] = mul->neg[0] && !is_abs;
         new_mul->neg[1] = mul->neg[1] && !is_abs;
         new_mul->abs[0] = mul->abs[0] || is_abs;
         new_mul->abs[1] = mul->abs[1] || is_abs;
         new_mul->omod = mul->omod;
      }
      new_mul->neg[0] ^= true;
      new_mul->clamp = false;

      ctx.info[instr->definitions[0].tempId()].set_mul(instr.get());
      return;
   }

   /* combine mul+add -> mad */
   bool mad32 = instr->opcode == aco_opcode::v_add_f32 ||
                instr->opcode == aco_opcode::v_sub_f32 ||
                instr->opcode == aco_opcode::v_subrev_f32;
   bool mad16 = instr->opcode == aco_opcode::v_add_f16 ||
                instr->opcode == aco_opcode::v_sub_f16 ||
                instr->opcode == aco_opcode::v_subrev_f16;
   if (mad16 || mad32) {
      /* with denormals (or on GFX10+ for fp16) only fma preserves the result */
      bool need_fma = mad32 ? block.fp_mode.denorm32 != 0 :
                      (block.fp_mode.denorm16_64 != 0 || ctx.program->chip_class >= GFX10);
      if (need_fma && instr->definitions[0].isPrecise())
         return;
      if (need_fma && mad32 && !ctx.program->has_fast_fma32)
         return;

      uint32_t uses_src0 = UINT32_MAX;
      uint32_t uses_src1 = UINT32_MAX;
      Instruction* mul_instr = nullptr;
      unsigned add_op_idx;
      /* check if any of the operands is a multiplication */
      ssa_info *op0_info = instr->operands[0].isTemp() ? &ctx.info[instr->operands[0].tempId()] : NULL;
      ssa_info *op1_info = instr->operands[1].isTemp() ? &ctx.info[instr->operands[1].tempId()] : NULL;
      if (op0_info && op0_info->is_mul() && (!need_fma || !op0_info->instr->definitions[0].isPrecise()))
         uses_src0 = ctx.uses[instr->operands[0].tempId()];
      if (op1_info && op1_info->is_mul() && (!need_fma || !op1_info->instr->definitions[0].isPrecise()))
         uses_src1 = ctx.uses[instr->operands[1].tempId()];

      /* find the 'best' mul instruction to combine with the add */
      if (uses_src0 < uses_src1) {
         mul_instr = op0_info->instr;
         add_op_idx = 1;
      } else if (uses_src1 < uses_src0) {
         mul_instr = op1_info->instr;
         add_op_idx = 0;
      } else if (uses_src0 != UINT32_MAX) {
         /* tiebreaker: quite random what to pick */
         if (op0_info->instr->operands[0].isLiteral()) {
            mul_instr = op1_info->instr;
            add_op_idx = 0;
         } else {
            mul_instr = op0_info->instr;
            add_op_idx = 1;
         }
      }
      if (mul_instr) {
         Operand op[3] = {Operand(v1), Operand(v1), Operand(v1)};
         bool neg[3] = {false, false, false};
         bool abs[3] = {false, false, false};
         unsigned omod = 0;
         bool clamp = false;
         op[0] = mul_instr->operands[0];
         op[1] = mul_instr->operands[1];
         op[2] = instr->operands[add_op_idx];
         // TODO: would be better to check this before selecting a mul instr?
         if (!check_vop3_operands(ctx, 3, op))
            return;

         if (mul_instr->isVOP3()) {
            VOP3A_instruction* vop3 = static_cast<VOP3A_instruction*> (mul_instr);
            neg[0] = vop3->neg[0];
            neg[1] = vop3->neg[1];
            abs[0] = vop3->abs[0];
            abs[1] = vop3->abs[1];
            /* we cannot use these modifiers between mul and add */
            if (vop3->clamp || vop3->omod)
               return;
         }

         /* convert to mad */
         ctx.uses[mul_instr->definitions[0].tempId()]--;
         if (ctx.uses[mul_instr->definitions[0].tempId()]) {
            /* the mul survives, so its sources gain one use each */
            if (op[0].isTemp())
               ctx.uses[op[0].tempId()]++;
            if (op[1].isTemp())
               ctx.uses[op[1].tempId()]++;
         }

         if (instr->isVOP3()) {
            VOP3A_instruction* vop3 = static_cast<VOP3A_instruction*> (instr.get());
            neg[2] = vop3->neg[add_op_idx];
            abs[2] = vop3->abs[add_op_idx];
            omod = vop3->omod;
            clamp = vop3->clamp;
            /* abs of the multiplication result */
            if (vop3->abs[1 - add_op_idx]) {
               neg[0] = false;
               neg[1] = false;
               abs[0] = true;
               abs[1] = true;
            }
            /* neg of the multiplication result */
            neg[1] = neg[1] ^ vop3->neg[1 - add_op_idx];
         }
         if (instr->opcode == aco_opcode::v_sub_f32 || instr->opcode == aco_opcode::v_sub_f16)
            neg[1 + add_op_idx] = neg[1 + add_op_idx] ^ true;
         else if (instr->opcode == aco_opcode::v_subrev_f32 || instr->opcode == aco_opcode::v_subrev_f16)
            neg[2 - add_op_idx] = neg[2 - add_op_idx] ^ true;

         aco_opcode mad_op = need_fma ? aco_opcode::v_fma_f32 : aco_opcode::v_mad_f32;
         if (mad16)
            mad_op = need_fma ? (ctx.program->chip_class == GFX8 ? aco_opcode::v_fma_legacy_f16 : aco_opcode::v_fma_f16) :
                                (ctx.program->chip_class == GFX8 ? aco_opcode::v_mad_legacy_f16 : aco_opcode::v_mad_f16);

         aco_ptr<VOP3A_instruction> mad{create_instruction<VOP3A_instruction>(mad_op, Format::VOP3A, 3, 1)};
         for (unsigned i = 0; i < 3; i++)
         {
            mad->operands[i] = op[i];
            mad->neg[i] = neg[i];
            mad->abs[i] = abs[i];
         }
         mad->omod = omod;
         mad->clamp = clamp;
         mad->definitions[0] = instr->definitions[0];

         /* mark this ssa_def to be re-checked for profitability and literals */
         ctx.mad_infos.emplace_back(std::move(instr), mul_instr->definitions[0].tempId());
         ctx.info[mad->definitions[0].tempId()].set_mad(mad.get(), ctx.mad_infos.size() - 1);
         instr.reset(mad.release());
         return;
      }
   }
   /* v_mul_f32(v_cndmask_b32(0, 1.0, cond), a) -> v_cndmask_b32(0, a, cond) */
   else if (instr->opcode == aco_opcode::v_mul_f32 && !instr->isVOP3()) {
      for (unsigned i = 0; i < 2; i++) {
         if (instr->operands[i].isTemp() && ctx.info[instr->operands[i].tempId()].is_b2f() &&
             ctx.uses[instr->operands[i].tempId()] == 1 &&
             instr->operands[!i].isTemp() && instr->operands[!i].getTemp().type() == RegType::vgpr) {
            ctx.uses[instr->operands[i].tempId()]--;
            ctx.uses[ctx.info[instr->operands[i].tempId()].temp.id()]++;

            aco_ptr<VOP2_instruction> new_instr{create_instruction<VOP2_instruction>(aco_opcode::v_cndmask_b32, Format::VOP2, 3, 1)};
            new_instr->operands[0] = Operand(0u);
            new_instr->operands[1] = instr->operands[!i];
            new_instr->operands[2] = Operand(ctx.info[instr->operands[i].tempId()].temp);
            new_instr->definitions[0] = instr->definitions[0];
            instr.reset(new_instr.release());
            ctx.info[instr->definitions[0].tempId()].label = 0;
            return;
         }
      }
   } else if (instr->opcode == aco_opcode::v_or_b32 && ctx.program->chip_class >= GFX9) {
      /* try or3 / and_or / lshl_or, first match wins */
      if (combine_three_valu_op(ctx, instr, aco_opcode::s_or_b32, aco_opcode::v_or3_b32, "012", 1 | 2)) ;
      else if (combine_three_valu_op(ctx, instr, aco_opcode::v_or_b32, aco_opcode::v_or3_b32, "012", 1 | 2)) ;
      else if (combine_three_valu_op(ctx, instr, aco_opcode::s_and_b32, aco_opcode::v_and_or_b32, "120", 1 | 2)) ;
      else if (combine_three_valu_op(ctx, instr, aco_opcode::v_and_b32, aco_opcode::v_and_or_b32, "120", 1 | 2)) ;
      else if (combine_three_valu_op(ctx, instr, aco_opcode::s_lshl_b32, aco_opcode::v_lshl_or_b32, "120", 1 | 2)) ;
      else combine_three_valu_op(ctx, instr, aco_opcode::v_lshlrev_b32, aco_opcode::v_lshl_or_b32, "210", 1 | 2);
   } else if (instr->opcode == aco_opcode::v_xor_b32 && ctx.program->chip_class >= GFX10) {
      if (combine_three_valu_op(ctx, instr, aco_opcode::v_xor_b32, aco_opcode::v_xor3_b32, "012", 1 | 2)) ;
      else combine_three_valu_op(ctx, instr, aco_opcode::s_xor_b32, aco_opcode::v_xor3_b32, "012", 1 | 2);
   } else if (instr->opcode == aco_opcode::v_add_u32) {
      if (combine_add_sub_b2i(ctx, instr, aco_opcode::v_addc_co_u32, 1 | 2)) ;
      else if (ctx.program->chip_class >= GFX9) {
         if (combine_three_valu_op(ctx, instr, aco_opcode::s_xor_b32, aco_opcode::v_xad_u32, "120", 1 | 2)) ;
         else if (combine_three_valu_op(ctx, instr, aco_opcode::v_xor_b32, aco_opcode::v_xad_u32, "120", 1 | 2)) ;
         else if (combine_three_valu_op(ctx, instr, aco_opcode::s_add_i32, aco_opcode::v_add3_u32, "012", 1 | 2)) ;
         else if (combine_three_valu_op(ctx, instr, aco_opcode::s_add_u32, aco_opcode::v_add3_u32, "012", 1 | 2)) ;
         else if (combine_three_valu_op(ctx, instr, aco_opcode::v_add_u32, aco_opcode::v_add3_u32, "012", 1 | 2)) ;
         else if (combine_three_valu_op(ctx, instr, aco_opcode::s_lshl_b32, aco_opcode::v_lshl_add_u32, "120", 1 | 2)) ;
         else combine_three_valu_op(ctx, instr, aco_opcode::v_lshlrev_b32, aco_opcode::v_lshl_add_u32, "210", 1 | 2);
      }
   } else if (instr->opcode == aco_opcode::v_add_co_u32 ||
              instr->opcode == aco_opcode::v_add_co_u32_e64) {
      combine_add_sub_b2i(ctx, instr, aco_opcode::v_addc_co_u32, 1 | 2);
   } else if (instr->opcode == aco_opcode::v_sub_u32 ||
              instr->opcode == aco_opcode::v_sub_co_u32 ||
              instr->opcode == aco_opcode::v_sub_co_u32_e64) {
      combine_add_sub_b2i(ctx, instr, aco_opcode::v_subbrev_co_u32, 2);
   } else if (instr->opcode == aco_opcode::v_subrev_u32 ||
              instr->opcode == aco_opcode::v_subrev_co_u32 ||
              instr->opcode == aco_opcode::v_subrev_co_u32_e64) {
      combine_add_sub_b2i(ctx, instr, aco_opcode::v_subbrev_co_u32, 1);
   } else if (instr->opcode == aco_opcode::v_lshlrev_b32 && ctx.program->chip_class >= GFX9) {
      combine_three_valu_op(ctx, instr, aco_opcode::v_add_u32, aco_opcode::v_add_lshl_u32, "120", 2);
   } else if ((instr->opcode == aco_opcode::s_add_u32 || instr->opcode == aco_opcode::s_add_i32) && ctx.program->chip_class >= GFX9) {
      combine_salu_lshl_add(ctx, instr);
   } else if (instr->opcode == aco_opcode::s_not_b32) {
      combine_salu_not_bitwise(ctx, instr);
   } else if (instr->opcode == aco_opcode::s_not_b64) {
      if (combine_inverse_comparison(ctx, instr)) ;
      else combine_salu_not_bitwise(ctx, instr);
   } else if (instr->opcode == aco_opcode::s_and_b32 || instr->opcode == aco_opcode::s_or_b32 ||
              instr->opcode == aco_opcode::s_and_b64 || instr->opcode == aco_opcode::s_or_b64) {
      if (combine_ordering_test(ctx, instr)) ;
      else if (combine_comparison_ordering(ctx, instr)) ;
      else if (combine_constant_comparison_ordering(ctx, instr)) ;
      else combine_salu_n2(ctx, instr);
   } else {
      /* min/max/med family: try min3/max3 first, then the clamp->med3 pattern */
      aco_opcode min, max, min3, max3, med3;
      bool some_gfx9_only;
      if (get_minmax_info(instr->opcode, &min, &max, &min3, &max3, &med3, &some_gfx9_only) &&
          (!some_gfx9_only || ctx.program->chip_class >= GFX9)) {
         if (combine_minmax(ctx, instr, instr->opcode == min ? max : min, instr->opcode == min ? min3 : max3)) ;
         else combine_clamp(ctx, instr, min, max, med3);
      }
   }
}
2650
2651 bool to_uniform_bool_instr(opt_ctx &ctx, aco_ptr<Instruction> &instr)
2652 {
2653 switch (instr->opcode) {
2654 case aco_opcode::s_and_b32:
2655 case aco_opcode::s_and_b64:
2656 instr->opcode = aco_opcode::s_and_b32;
2657 break;
2658 case aco_opcode::s_or_b32:
2659 case aco_opcode::s_or_b64:
2660 instr->opcode = aco_opcode::s_or_b32;
2661 break;
2662 case aco_opcode::s_xor_b32:
2663 case aco_opcode::s_xor_b64:
2664 instr->opcode = aco_opcode::s_absdiff_i32;
2665 break;
2666 default:
2667 /* Don't transform other instructions. They are very unlikely to appear here. */
2668 return false;
2669 }
2670
2671 for (Operand &op : instr->operands) {
2672 ctx.uses[op.tempId()]--;
2673
2674 if (ctx.info[op.tempId()].is_uniform_bool()) {
2675 /* Just use the uniform boolean temp. */
2676 op.setTemp(ctx.info[op.tempId()].temp);
2677 } else if (ctx.info[op.tempId()].is_uniform_bitwise()) {
2678 /* Use the SCC definition of the predecessor instruction.
2679 * This allows the predecessor to get picked up by the same optimization (if it has no divergent users),
2680 * and it also makes sure that the current instruction will keep working even if the predecessor won't be transformed.
2681 */
2682 Instruction *pred_instr = ctx.info[op.tempId()].instr;
2683 assert(pred_instr->definitions.size() >= 2);
2684 assert(pred_instr->definitions[1].isFixed() && pred_instr->definitions[1].physReg() == scc);
2685 op.setTemp(pred_instr->definitions[1].getTemp());
2686 } else {
2687 unreachable("Invalid operand on uniform bitwise instruction.");
2688 }
2689
2690 ctx.uses[op.tempId()]++;
2691 }
2692
2693 instr->definitions[0].setTemp(Temp(instr->definitions[0].tempId(), s1));
2694 assert(instr->operands[0].regClass() == s1);
2695 assert(instr->operands[1].regClass() == s1);
2696 return true;
2697 }
2698
/* Backward (top-down) selection pass: eliminates dead instructions, re-checks
 * whether v_mad candidates are still profitable, simplifies p_split_vector with
 * a single used definition, and decides which literal (if any) is profitable to
 * inline. Use counts are only decremented here; the literal itself is applied
 * later by apply_literals(). */
void select_instruction(opt_ctx &ctx, aco_ptr<Instruction>& instr)
{
   /* only inline a literal if its temp has fewer than this many remaining uses */
   const uint32_t threshold = 4;

   if (is_dead(ctx.uses, instr.get())) {
      instr.reset();
      return;
   }

   /* convert split_vector into a copy or extract_vector if only one definition is ever used */
   if (instr->opcode == aco_opcode::p_split_vector) {
      /* find the (last) used definition and its byte offset inside the source vector */
      unsigned num_used = 0;
      unsigned idx = 0;
      unsigned split_offset = 0;
      for (unsigned i = 0, offset = 0; i < instr->definitions.size(); offset += instr->definitions[i++].bytes()) {
         if (ctx.uses[instr->definitions[i].tempId()]) {
            num_used++;
            idx = i;
            split_offset = offset;
         }
      }
      bool done = false;
      if (num_used == 1 && ctx.info[instr->operands[0].tempId()].is_vec() &&
          ctx.uses[instr->operands[0].tempId()] == 1) {
         Instruction *vec = ctx.info[instr->operands[0].tempId()].instr;

         /* look for the create_vector operand that lines up exactly with the used definition */
         unsigned off = 0;
         Operand op;
         for (Operand& vec_op : vec->operands) {
            if (off == split_offset) {
               op = vec_op;
               break;
            }
            off += vec_op.bytes();
         }
         /* off != total bytes means the loop broke at a matching offset */
         if (off != instr->operands[0].bytes() && op.bytes() == instr->definitions[idx].bytes()) {
            ctx.uses[instr->operands[0].tempId()]--;
            for (Operand& vec_op : vec->operands) {
               if (vec_op.isTemp())
                  ctx.uses[vec_op.tempId()]--;
            }
            if (op.isTemp())
               ctx.uses[op.tempId()]++;

            /* replace the split with a plain copy of the single matching operand */
            aco_ptr<Pseudo_instruction> extract{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, 1, 1)};
            extract->operands[0] = op;
            extract->definitions[0] = instr->definitions[idx];
            instr.reset(extract.release());

            done = true;
         }
      }

      /* otherwise extract the single used definition by index, if it is aligned */
      if (!done && num_used == 1 &&
          instr->operands[0].bytes() % instr->definitions[idx].bytes() == 0 &&
          split_offset % instr->definitions[idx].bytes() == 0) {
         aco_ptr<Pseudo_instruction> extract{create_instruction<Pseudo_instruction>(aco_opcode::p_extract_vector, Format::PSEUDO, 2, 1)};
         extract->operands[0] = instr->operands[0];
         extract->operands[1] = Operand((uint32_t) split_offset / instr->definitions[idx].bytes());
         extract->definitions[0] = instr->definitions[idx];
         instr.reset(extract.release());
      }
   }

   mad_info* mad_info = NULL;
   if (!instr->definitions.empty() && ctx.info[instr->definitions[0].tempId()].is_mad()) {
      mad_info = &ctx.mad_infos[ctx.info[instr->definitions[0].tempId()].val];
      /* re-check mad instructions */
      if (ctx.uses[mad_info->mul_temp_id]) {
         /* the multiply result is still used elsewhere: revert to the original add instruction */
         ctx.uses[mad_info->mul_temp_id]++;
         if (instr->operands[0].isTemp())
            ctx.uses[instr->operands[0].tempId()]--;
         if (instr->operands[1].isTemp())
            ctx.uses[instr->operands[1].tempId()]--;
         instr.swap(mad_info->add_instr);
         mad_info = NULL;
      }
      /* check literals */
      else if (!instr->usesModifiers()) {
         /* FMA can only take literals on GFX10+ */
         if ((instr->opcode == aco_opcode::v_fma_f32 || instr->opcode == aco_opcode::v_fma_f16) &&
             ctx.program->chip_class < GFX10)
            return;

         /* pick the operand whose literal temp has the fewest remaining uses */
         bool sgpr_used = false;
         uint32_t literal_idx = 0;
         uint32_t literal_uses = UINT32_MAX;
         for (unsigned i = 0; i < instr->operands.size(); i++)
         {
            /* an inline constant on a non-first operand blocks any literal */
            if (instr->operands[i].isConstant() && i > 0) {
               literal_uses = UINT32_MAX;
               break;
            }
            if (!instr->operands[i].isTemp())
               continue;
            /* if one of the operands is sgpr, we cannot add a literal somewhere else on pre-GFX10 or operands other than the 1st */
            if (instr->operands[i].getTemp().type() == RegType::sgpr && (i > 0 || ctx.program->chip_class < GFX10)) {
               if (!sgpr_used && ctx.info[instr->operands[i].tempId()].is_literal()) {
                  literal_uses = ctx.uses[instr->operands[i].tempId()];
                  literal_idx = i;
               } else {
                  literal_uses = UINT32_MAX;
               }
               sgpr_used = true;
               /* don't break because we still need to check constants */
            } else if (!sgpr_used &&
                       ctx.info[instr->operands[i].tempId()].is_literal() &&
                       ctx.uses[instr->operands[i].tempId()] < literal_uses) {
               literal_uses = ctx.uses[instr->operands[i].tempId()];
               literal_idx = i;
            }
         }

         /* Limit the number of literals to apply to not increase the code
          * size too much, but always apply literals for v_mad->v_madak
          * because both instructions are 64-bit and this doesn't increase
          * code size.
          * TODO: try to apply the literals earlier to lower the number of
          * uses below threshold
          */
         if (literal_uses < threshold || literal_idx == 2) {
            ctx.uses[instr->operands[literal_idx].tempId()]--;
            mad_info->check_literal = true;
            mad_info->literal_idx = literal_idx;
            return;
         }
      }
   }

   /* Mark SCC needed, so the uniform boolean transformation won't swap the definitions when it isn't beneficial */
   if (instr->format == Format::PSEUDO_BRANCH &&
       instr->operands.size() &&
       instr->operands[0].isTemp()) {
      ctx.info[instr->operands[0].tempId()].set_scc_needed();
      return;
   } else if ((instr->opcode == aco_opcode::s_cselect_b64 ||
               instr->opcode == aco_opcode::s_cselect_b32) &&
              instr->operands[2].isTemp()) {
      ctx.info[instr->operands[2].tempId()].set_scc_needed();
   }

   /* check for literals */
   if (!instr->isSALU() && !instr->isVALU())
      return;

   /* Transform uniform bitwise boolean operations to 32-bit when there are no divergent uses. */
   if (instr->definitions.size() &&
       ctx.uses[instr->definitions[0].tempId()] == 0 &&
       ctx.info[instr->definitions[0].tempId()].is_uniform_bitwise()) {
      bool transform_done = to_uniform_bool_instr(ctx, instr);

      if (transform_done && !ctx.info[instr->definitions[1].tempId()].is_scc_needed()) {
         /* Swap the two definition IDs in order to avoid overusing the SCC. This reduces extra moves generated by RA. */
         uint32_t def0_id = instr->definitions[0].getTemp().id();
         uint32_t def1_id = instr->definitions[1].getTemp().id();
         instr->definitions[0].setTemp(Temp(def1_id, s1));
         instr->definitions[1].setTemp(Temp(def0_id, s1));
      }

      return;
   }

   if (instr->isSDWA() || instr->isDPP() || (instr->isVOP3() && ctx.program->chip_class < GFX10))
      return; /* some encodings can't ever take literals */

   /* we do not apply the literals yet as we don't know if it is profitable */
   Operand current_literal(s1);

   unsigned literal_id = 0;
   unsigned literal_uses = UINT32_MAX;
   Operand literal(s1);
   unsigned num_operands = 1;
   if (instr->isSALU() || (ctx.program->chip_class >= GFX10 && can_use_VOP3(ctx, instr)))
      num_operands = instr->operands.size();
   /* catch VOP2 with a 3rd SGPR operand (e.g. v_cndmask_b32, v_addc_co_u32) */
   else if (instr->isVALU() && instr->operands.size() >= 3)
      return;

   unsigned sgpr_ids[2] = {0, 0};
   bool is_literal_sgpr = false;
   uint32_t mask = 0;

   /* choose a literal to apply */
   for (unsigned i = 0; i < num_operands; i++) {
      Operand op = instr->operands[i];

      /* remember up to two distinct SGPR temps read by this VALU instruction
       * (needed for the constant-bus-limit check below) */
      if (instr->isVALU() && op.isTemp() && op.getTemp().type() == RegType::sgpr &&
          op.tempId() != sgpr_ids[0])
         sgpr_ids[!!sgpr_ids[0]] = op.tempId();

      if (op.isLiteral()) {
         current_literal = op;
         continue;
      } else if (!op.isTemp() || !ctx.info[op.tempId()].is_literal()) {
         continue;
      }

      if (!alu_can_accept_constant(instr->opcode, i))
         continue;

      /* prefer the literal temp with the fewest remaining uses */
      if (ctx.uses[op.tempId()] < literal_uses) {
         is_literal_sgpr = op.getTemp().type() == RegType::sgpr;
         mask = 0;
         literal = Operand(ctx.info[op.tempId()].val);
         literal_uses = ctx.uses[op.tempId()];
         literal_id = op.tempId();
      }

      /* bitmask of the operand indices that hold the chosen literal temp */
      mask |= (op.tempId() == literal_id) << i;
   }


   /* don't go over the constant bus limit */
   bool is_shift64 = instr->opcode == aco_opcode::v_lshlrev_b64 ||
                     instr->opcode == aco_opcode::v_lshrrev_b64 ||
                     instr->opcode == aco_opcode::v_ashrrev_i64;
   unsigned const_bus_limit = instr->isVALU() ? 1 : UINT32_MAX;
   if (ctx.program->chip_class >= GFX10 && !is_shift64)
      const_bus_limit = 2;

   unsigned num_sgprs = !!sgpr_ids[0] + !!sgpr_ids[1];
   if (num_sgprs == const_bus_limit && !is_literal_sgpr)
      return;

   /* a pre-existing literal on the instruction must match the one we want to add */
   if (literal_id && literal_uses < threshold &&
       (current_literal.isUndefined() ||
        (current_literal.size() == literal.size() &&
         current_literal.constantValue() == literal.constantValue()))) {
      /* mark the literal to be applied */
      while (mask) {
         unsigned i = u_bit_scan(&mask);
         if (instr->operands[i].isTemp() && instr->operands[i].tempId() == literal_id)
            ctx.uses[instr->operands[i].tempId()]--;
      }
   }
}
2935
2936
2937 void apply_literals(opt_ctx &ctx, aco_ptr<Instruction>& instr)
2938 {
2939 /* Cleanup Dead Instructions */
2940 if (!instr)
2941 return;
2942
2943 /* apply literals on MAD */
2944 if (!instr->definitions.empty() && ctx.info[instr->definitions[0].tempId()].is_mad()) {
2945 mad_info* info = &ctx.mad_infos[ctx.info[instr->definitions[0].tempId()].val];
2946 if (info->check_literal &&
2947 (ctx.uses[instr->operands[info->literal_idx].tempId()] == 0 || info->literal_idx == 2)) {
2948 aco_ptr<Instruction> new_mad;
2949
2950 aco_opcode new_op = info->literal_idx == 2 ? aco_opcode::v_madak_f32 : aco_opcode::v_madmk_f32;
2951 if (instr->opcode == aco_opcode::v_fma_f32)
2952 new_op = info->literal_idx == 2 ? aco_opcode::v_fmaak_f32 : aco_opcode::v_fmamk_f32;
2953 else if (instr->opcode == aco_opcode::v_mad_f16 || instr->opcode == aco_opcode::v_mad_legacy_f16)
2954 new_op = info->literal_idx == 2 ? aco_opcode::v_madak_f16 : aco_opcode::v_madmk_f16;
2955 else if (instr->opcode == aco_opcode::v_fma_f16)
2956 new_op = info->literal_idx == 2 ? aco_opcode::v_fmaak_f16 : aco_opcode::v_fmamk_f16;
2957
2958 new_mad.reset(create_instruction<VOP2_instruction>(new_op, Format::VOP2, 3, 1));
2959 if (info->literal_idx == 2) { /* add literal -> madak */
2960 new_mad->operands[0] = instr->operands[0];
2961 new_mad->operands[1] = instr->operands[1];
2962 } else { /* mul literal -> madmk */
2963 new_mad->operands[0] = instr->operands[1 - info->literal_idx];
2964 new_mad->operands[1] = instr->operands[2];
2965 }
2966 new_mad->operands[2] = Operand(ctx.info[instr->operands[info->literal_idx].tempId()].val);
2967 new_mad->definitions[0] = instr->definitions[0];
2968 ctx.instructions.emplace_back(std::move(new_mad));
2969 return;
2970 }
2971 }
2972
2973 /* apply literals on other SALU/VALU */
2974 if (instr->isSALU() || instr->isVALU()) {
2975 for (unsigned i = 0; i < instr->operands.size(); i++) {
2976 Operand op = instr->operands[i];
2977 if (op.isTemp() && ctx.info[op.tempId()].is_literal() && ctx.uses[op.tempId()] == 0) {
2978 Operand literal(ctx.info[op.tempId()].val);
2979 if (instr->isVALU() && i > 0)
2980 to_VOP3(ctx, instr);
2981 instr->operands[i] = literal;
2982 }
2983 }
2984 }
2985
2986 ctx.instructions.emplace_back(std::move(instr));
2987 }
2988
2989
2990 void optimize(Program* program)
2991 {
2992 opt_ctx ctx;
2993 ctx.program = program;
2994 std::vector<ssa_info> info(program->peekAllocationId());
2995 ctx.info = info.data();
2996
2997 /* 1. Bottom-Up DAG pass (forward) to label all ssa-defs */
2998 for (Block& block : program->blocks) {
2999 for (aco_ptr<Instruction>& instr : block.instructions)
3000 label_instruction(ctx, block, instr);
3001 }
3002
3003 ctx.uses = dead_code_analysis(program);
3004
3005 /* 2. Combine v_mad, omod, clamp and propagate sgpr on VALU instructions */
3006 for (Block& block : program->blocks) {
3007 for (aco_ptr<Instruction>& instr : block.instructions)
3008 combine_instruction(ctx, block, instr);
3009 }
3010
3011 /* 3. Top-Down DAG pass (backward) to select instructions (includes DCE) */
3012 for (std::vector<Block>::reverse_iterator it = program->blocks.rbegin(); it != program->blocks.rend(); ++it) {
3013 Block* block = &(*it);
3014 for (std::vector<aco_ptr<Instruction>>::reverse_iterator it = block->instructions.rbegin(); it != block->instructions.rend(); ++it)
3015 select_instruction(ctx, *it);
3016 }
3017
3018 /* 4. Add literals to instructions */
3019 for (Block& block : program->blocks) {
3020 ctx.instructions.clear();
3021 for (aco_ptr<Instruction>& instr : block.instructions)
3022 apply_literals(ctx, instr);
3023 block.instructions.swap(ctx.instructions);
3024 }
3025
3026 }
3027
3028 }