aco: create 16-bit input and output modifiers
[mesa.git] / src / amd / compiler / aco_optimizer.cpp
1 /*
2 * Copyright © 2018 Valve Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * Authors:
24 * Daniel Schürmann (daniel.schuermann@campus.tu-berlin.de)
25 *
26 */
27
28 #include <algorithm>
29 #include <math.h>
30
31 #include "aco_ir.h"
32 #include "util/half_float.h"
33 #include "util/u_math.h"
34
35 namespace aco {
36
/**
 * The optimizer works in 4 phases:
 * (1) The first pass collects information for each ssa-def,
 *     propagates reg->reg operands of the same type, inlines constants
 *     and applies neg/abs input modifiers.
 * (2) The second pass combines instructions like mad, omod, clamp and
 *     propagates sgprs on VALU instructions.
 *     This pass depends on information collected in the first pass.
 * (3) The third pass goes backwards, and selects instructions,
 *     i.e. decides if a mad instruction is profitable and eliminates dead code.
 * (4) The fourth pass cleans up the sequence: literals get applied and dead
 *     instructions are removed from the sequence.
 */
50
51
52 struct mad_info {
53 aco_ptr<Instruction> add_instr;
54 uint32_t mul_temp_id;
55 uint32_t literal_idx;
56 bool check_literal;
57
58 mad_info(aco_ptr<Instruction> instr, uint32_t id)
59 : add_instr(std::move(instr)), mul_temp_id(id), check_literal(false) {}
60 };
61
/* Labels attached to SSA definitions by the first pass. Each label's payload
 * lives in one of ssa_info's fields; the instr_labels/temp_labels/val_labels
 * masks below record which field a label uses. */
enum Label {
   label_vec = 1 << 0,
   label_constant = 1 << 1,
   /* label_{abs,neg,mul,omod2,omod4,omod5,clamp} are used for both 16 and
    * 32-bit operations but this shouldn't cause any issues because we don't
    * look through any conversions */
   label_abs = 1 << 2,
   label_neg = 1 << 3,
   label_mul = 1 << 4,
   label_temp = 1 << 5,
   label_literal = 1 << 6,
   label_mad = 1 << 7,
   label_omod2 = 1 << 8,
   label_omod4 = 1 << 9,
   label_omod5 = 1 << 10,
   label_omod_success = 1 << 11,
   label_clamp = 1 << 12,
   label_clamp_success = 1 << 13,
   label_undefined = 1 << 14,
   label_vcc = 1 << 15,
   label_b2f = 1 << 16,
   label_add_sub = 1 << 17,
   label_bitwise = 1 << 18,
   label_minmax = 1 << 19,
   label_fcmp = 1 << 20,
   label_uniform_bool = 1 << 21,
   label_constant_64bit = 1 << 22,
   label_uniform_bitwise = 1 << 23,
   label_scc_invert = 1 << 24,
   label_vcc_hint = 1 << 25,
   label_scc_needed = 1 << 26,
   label_b2i = 1 << 27,
};
95
/* Labels whose payload is stored in ssa_info::instr. */
static constexpr uint32_t instr_labels = label_vec | label_mul | label_mad | label_omod_success | label_clamp_success |
                                         label_add_sub | label_bitwise | label_uniform_bitwise | label_minmax | label_fcmp;
/* Labels whose payload is stored in ssa_info::temp (which shares storage
 * with ssa_info::instr, so these two groups are mutually exclusive). */
static constexpr uint32_t temp_labels = label_abs | label_neg | label_temp | label_vcc | label_b2f | label_uniform_bool |
                                        label_omod2 | label_omod4 | label_omod5 | label_clamp | label_scc_invert | label_b2i;
/* Labels whose payload is stored in ssa_info::val. label_mad appears in both
 * instr_labels and val_labels because set_mad() stores the mad_info index in
 * val as well as the instruction pointer in instr. */
static constexpr uint32_t val_labels = label_constant | label_constant_64bit | label_literal | label_mad;
101
/* Per-SSA-definition scratch data for the optimizer (one entry per temp id
 * in opt_ctx::info). "label" is a bitset of Label flags; depending on which
 * flags are set, the payload is carried in "val", "temp" or "instr". temp
 * and instr share storage, so temp_labels and instr_labels never coexist
 * (enforced by add_label). */
struct ssa_info {
   uint32_t val;           /* payload for val_labels (constant/literal value, mad_info index) */
   union {
      Temp temp;           /* payload for temp_labels */
      Instruction* instr;  /* payload for instr_labels: the defining instruction */
   };
   uint32_t label;         /* bitset of Label flags currently set */

   ssa_info() : label(0) {}

   /* Sets new_label, first clearing any labels whose payload storage would
    * be clobbered by the new label's payload. */
   void add_label(Label new_label)
   {
      /* Since all labels which use "instr" use it for the same thing
       * (indicating the defining instruction), there is no need to clear
       * any other instr labels. */
      if (new_label & instr_labels)
         label &= ~temp_labels; /* instr and temp alias */

      if (new_label & temp_labels) {
         label &= ~temp_labels;
         label &= ~instr_labels; /* instr and temp alias */
      }

      if (new_label & val_labels)
         label &= ~val_labels;

      label |= new_label;
   }

   void set_vec(Instruction* vec)
   {
      add_label(label_vec);
      instr = vec;
   }

   bool is_vec()
   {
      return label & label_vec;
   }

   void set_constant(uint32_t constant)
   {
      add_label(label_constant);
      val = constant;
   }

   bool is_constant()
   {
      return label & label_constant;
   }

   /* 64-bit inline constants are tracked by their 32-bit representation */
   void set_constant_64bit(uint32_t constant)
   {
      add_label(label_constant_64bit);
      val = constant;
   }

   bool is_constant_64bit()
   {
      return label & label_constant_64bit;
   }

   void set_abs(Temp abs_temp)
   {
      add_label(label_abs);
      temp = abs_temp;
   }

   bool is_abs()
   {
      return label & label_abs;
   }

   void set_neg(Temp neg_temp)
   {
      add_label(label_neg);
      temp = neg_temp;
   }

   bool is_neg()
   {
      return label & label_neg;
   }

   /* sets abs and neg at once; both share the same source temp */
   void set_neg_abs(Temp neg_abs_temp)
   {
      add_label((Label)((uint32_t)label_abs | (uint32_t)label_neg));
      temp = neg_abs_temp;
   }

   void set_mul(Instruction* mul)
   {
      add_label(label_mul);
      instr = mul;
   }

   bool is_mul()
   {
      return label & label_mul;
   }

   void set_temp(Temp tmp)
   {
      add_label(label_temp);
      temp = tmp;
   }

   bool is_temp()
   {
      return label & label_temp;
   }

   void set_literal(uint32_t lit)
   {
      add_label(label_literal);
      val = lit;
   }

   bool is_literal()
   {
      return label & label_literal;
   }

   /* label_mad uses two payloads: the mad instruction in "instr" and the
    * index into opt_ctx::mad_infos in "val" */
   void set_mad(Instruction* mad, uint32_t mad_info_idx)
   {
      add_label(label_mad);
      val = mad_info_idx;
      instr = mad;
   }

   bool is_mad()
   {
      return label & label_mad;
   }

   void set_omod2(Temp def)
   {
      add_label(label_omod2);
      temp = def;
   }

   bool is_omod2()
   {
      return label & label_omod2;
   }

   void set_omod4(Temp def)
   {
      add_label(label_omod4);
      temp = def;
   }

   bool is_omod4()
   {
      return label & label_omod4;
   }

   void set_omod5(Temp def)
   {
      add_label(label_omod5);
      temp = def;
   }

   bool is_omod5()
   {
      return label & label_omod5;
   }

   void set_omod_success(Instruction* omod_instr)
   {
      add_label(label_omod_success);
      instr = omod_instr;
   }

   bool is_omod_success()
   {
      return label & label_omod_success;
   }

   void set_clamp(Temp def)
   {
      add_label(label_clamp);
      temp = def;
   }

   bool is_clamp()
   {
      return label & label_clamp;
   }

   void set_clamp_success(Instruction* clamp_instr)
   {
      add_label(label_clamp_success);
      instr = clamp_instr;
   }

   bool is_clamp_success()
   {
      return label & label_clamp_success;
   }

   /* no payload: only the flag itself is meaningful */
   void set_undefined()
   {
      add_label(label_undefined);
   }

   bool is_undefined()
   {
      return label & label_undefined;
   }

   void set_vcc(Temp vcc)
   {
      add_label(label_vcc);
      temp = vcc;
   }

   bool is_vcc()
   {
      return label & label_vcc;
   }

   bool is_constant_or_literal()
   {
      return is_constant() || is_literal();
   }

   /* note: the parameter shadows the "val" member; the payload goes to temp */
   void set_b2f(Temp val)
   {
      add_label(label_b2f);
      temp = val;
   }

   bool is_b2f()
   {
      return label & label_b2f;
   }

   void set_add_sub(Instruction *add_sub_instr)
   {
      add_label(label_add_sub);
      instr = add_sub_instr;
   }

   bool is_add_sub()
   {
      return label & label_add_sub;
   }

   void set_bitwise(Instruction *bitwise_instr)
   {
      add_label(label_bitwise);
      instr = bitwise_instr;
   }

   bool is_bitwise()
   {
      return label & label_bitwise;
   }

   void set_uniform_bitwise()
   {
      add_label(label_uniform_bitwise);
   }

   bool is_uniform_bitwise()
   {
      return label & label_uniform_bitwise;
   }

   void set_minmax(Instruction *minmax_instr)
   {
      add_label(label_minmax);
      instr = minmax_instr;
   }

   bool is_minmax()
   {
      return label & label_minmax;
   }

   void set_fcmp(Instruction *fcmp_instr)
   {
      add_label(label_fcmp);
      instr = fcmp_instr;
   }

   bool is_fcmp()
   {
      return label & label_fcmp;
   }

   void set_scc_needed()
   {
      add_label(label_scc_needed);
   }

   bool is_scc_needed()
   {
      return label & label_scc_needed;
   }

   void set_scc_invert(Temp scc_inv)
   {
      add_label(label_scc_invert);
      temp = scc_inv;
   }

   bool is_scc_invert()
   {
      return label & label_scc_invert;
   }

   void set_uniform_bool(Temp uniform_bool)
   {
      add_label(label_uniform_bool);
      temp = uniform_bool;
   }

   bool is_uniform_bool()
   {
      return label & label_uniform_bool;
   }

   void set_vcc_hint()
   {
      add_label(label_vcc_hint);
   }

   bool is_vcc_hint()
   {
      return label & label_vcc_hint;
   }

   /* note: the parameter shadows the "val" member; the payload goes to temp */
   void set_b2i(Temp val)
   {
      add_label(label_b2i);
      temp = val;
   }

   bool is_b2i()
   {
      return label & label_b2i;
   }

};
448
/* Shared state for all optimizer passes. */
struct opt_ctx {
   Program* program;
   std::vector<aco_ptr<Instruction>> instructions;
   ssa_info* info;                        /* per-temp info, indexed by temp id */
   std::pair<uint32_t,Temp> last_literal; /* presumably the most recently applied literal — confirm in later passes */
   std::vector<mad_info> mad_infos;       /* entries referenced via ssa_info::val when label_mad is set */
   std::vector<uint16_t> uses;            /* presumably per-temp use counts — confirm in later passes */
};
457
458 bool can_swap_operands(aco_ptr<Instruction>& instr)
459 {
460 if (instr->operands[0].isConstant() ||
461 (instr->operands[0].isTemp() && instr->operands[0].getTemp().type() == RegType::sgpr))
462 return false;
463
464 switch (instr->opcode) {
465 case aco_opcode::v_add_f32:
466 case aco_opcode::v_mul_f32:
467 case aco_opcode::v_or_b32:
468 case aco_opcode::v_and_b32:
469 case aco_opcode::v_xor_b32:
470 case aco_opcode::v_max_f32:
471 case aco_opcode::v_min_f32:
472 case aco_opcode::v_max_i32:
473 case aco_opcode::v_min_i32:
474 case aco_opcode::v_max_u32:
475 case aco_opcode::v_min_u32:
476 case aco_opcode::v_cmp_eq_f32:
477 case aco_opcode::v_cmp_lg_f32:
478 return true;
479 case aco_opcode::v_sub_f32:
480 instr->opcode = aco_opcode::v_subrev_f32;
481 return true;
482 case aco_opcode::v_cmp_lt_f32:
483 instr->opcode = aco_opcode::v_cmp_gt_f32;
484 return true;
485 case aco_opcode::v_cmp_ge_f32:
486 instr->opcode = aco_opcode::v_cmp_le_f32;
487 return true;
488 case aco_opcode::v_cmp_lt_i32:
489 instr->opcode = aco_opcode::v_cmp_gt_i32;
490 return true;
491 default:
492 return false;
493 }
494 }
495
496 bool can_use_VOP3(opt_ctx& ctx, const aco_ptr<Instruction>& instr)
497 {
498 if (instr->isVOP3())
499 return true;
500
501 if (instr->operands.size() && instr->operands[0].isLiteral() && ctx.program->chip_class < GFX10)
502 return false;
503
504 if (instr->isDPP() || instr->isSDWA())
505 return false;
506
507 return instr->opcode != aco_opcode::v_madmk_f32 &&
508 instr->opcode != aco_opcode::v_madak_f32 &&
509 instr->opcode != aco_opcode::v_madmk_f16 &&
510 instr->opcode != aco_opcode::v_madak_f16 &&
511 instr->opcode != aco_opcode::v_fmamk_f32 &&
512 instr->opcode != aco_opcode::v_fmaak_f32 &&
513 instr->opcode != aco_opcode::v_fmamk_f16 &&
514 instr->opcode != aco_opcode::v_fmaak_f16 &&
515 instr->opcode != aco_opcode::v_readlane_b32 &&
516 instr->opcode != aco_opcode::v_writelane_b32 &&
517 instr->opcode != aco_opcode::v_readfirstlane_b32;
518 }
519
520 bool can_apply_sgprs(aco_ptr<Instruction>& instr)
521 {
522 return instr->opcode != aco_opcode::v_readfirstlane_b32 &&
523 instr->opcode != aco_opcode::v_readlane_b32 &&
524 instr->opcode != aco_opcode::v_readlane_b32_e64 &&
525 instr->opcode != aco_opcode::v_writelane_b32 &&
526 instr->opcode != aco_opcode::v_writelane_b32_e64;
527 }
528
529 void to_VOP3(opt_ctx& ctx, aco_ptr<Instruction>& instr)
530 {
531 if (instr->isVOP3())
532 return;
533
534 aco_ptr<Instruction> tmp = std::move(instr);
535 Format format = asVOP3(tmp->format);
536 instr.reset(create_instruction<VOP3A_instruction>(tmp->opcode, format, tmp->operands.size(), tmp->definitions.size()));
537 std::copy(tmp->operands.cbegin(), tmp->operands.cend(), instr->operands.begin());
538 for (unsigned i = 0; i < instr->definitions.size(); i++) {
539 instr->definitions[i] = tmp->definitions[i];
540 if (instr->definitions[i].isTemp()) {
541 ssa_info& info = ctx.info[instr->definitions[i].tempId()];
542 if (info.label & instr_labels && info.instr == tmp.get())
543 info.instr = instr.get();
544 }
545 }
546 }
547
548 /* only covers special cases */
549 bool alu_can_accept_constant(aco_opcode opcode, unsigned operand)
550 {
551 switch (opcode) {
552 case aco_opcode::v_interp_p2_f32:
553 case aco_opcode::v_mac_f32:
554 case aco_opcode::v_writelane_b32:
555 case aco_opcode::v_writelane_b32_e64:
556 case aco_opcode::v_cndmask_b32:
557 return operand != 2;
558 case aco_opcode::s_addk_i32:
559 case aco_opcode::s_mulk_i32:
560 case aco_opcode::p_wqm:
561 case aco_opcode::p_extract_vector:
562 case aco_opcode::p_split_vector:
563 case aco_opcode::v_readlane_b32:
564 case aco_opcode::v_readlane_b32_e64:
565 case aco_opcode::v_readfirstlane_b32:
566 return operand != 0;
567 default:
568 return true;
569 }
570 }
571
572 bool valu_can_accept_vgpr(aco_ptr<Instruction>& instr, unsigned operand)
573 {
574 if (instr->opcode == aco_opcode::v_readlane_b32 || instr->opcode == aco_opcode::v_readlane_b32_e64 ||
575 instr->opcode == aco_opcode::v_writelane_b32 || instr->opcode == aco_opcode::v_writelane_b32_e64)
576 return operand != 1;
577 return true;
578 }
579
580 /* check constant bus and literal limitations */
581 bool check_vop3_operands(opt_ctx& ctx, unsigned num_operands, Operand *operands)
582 {
583 int limit = ctx.program->chip_class >= GFX10 ? 2 : 1;
584 Operand literal32(s1);
585 Operand literal64(s2);
586 unsigned num_sgprs = 0;
587 unsigned sgpr[] = {0, 0};
588
589 for (unsigned i = 0; i < num_operands; i++) {
590 Operand op = operands[i];
591
592 if (op.hasRegClass() && op.regClass().type() == RegType::sgpr) {
593 /* two reads of the same SGPR count as 1 to the limit */
594 if (op.tempId() != sgpr[0] && op.tempId() != sgpr[1]) {
595 if (num_sgprs < 2)
596 sgpr[num_sgprs++] = op.tempId();
597 limit--;
598 if (limit < 0)
599 return false;
600 }
601 } else if (op.isLiteral()) {
602 if (ctx.program->chip_class < GFX10)
603 return false;
604
605 if (!literal32.isUndefined() && literal32.constantValue() != op.constantValue())
606 return false;
607 if (!literal64.isUndefined() && literal64.constantValue() != op.constantValue())
608 return false;
609
610 /* Any number of 32-bit literals counts as only 1 to the limit. Same
611 * (but separately) for 64-bit literals. */
612 if (op.size() == 1 && literal32.isUndefined()) {
613 limit--;
614 literal32 = op;
615 } else if (op.size() == 2 && literal64.isUndefined()) {
616 limit--;
617 literal64 = op;
618 }
619
620 if (limit < 0)
621 return false;
622 }
623 }
624
625 return true;
626 }
627
/* Tries to decompose instr->operands[op_index] into base temp + constant
 * offset by looking through the add instruction (recorded via label_add_sub)
 * that defines it, recursing through chains of adds. On success, writes
 * *base and *offset and returns true.
 * NOTE(review): no liveness check is done here — the caller is presumably
 * responsible for the base temp still being usable at this point; confirm. */
bool parse_base_offset(opt_ctx &ctx, Instruction* instr, unsigned op_index, Temp *base, uint32_t *offset)
{
   Operand op = instr->operands[op_index];

   if (!op.isTemp())
      return false;
   Temp tmp = op.getTemp();
   if (!ctx.info[tmp.id()].is_add_sub())
      return false;

   Instruction *add_instr = ctx.info[tmp.id()].instr;

   /* only plain additions can be folded into a memory offset */
   switch (add_instr->opcode) {
   case aco_opcode::v_add_u32:
   case aco_opcode::v_add_co_u32:
   case aco_opcode::v_add_co_u32_e64:
   case aco_opcode::s_add_i32:
   case aco_opcode::s_add_u32:
      break;
   default:
      return false;
   }

   if (add_instr->usesModifiers())
      return false;

   /* look for a constant operand of the add; the other operand is the base */
   for (unsigned i = 0; i < 2; i++) {
      if (add_instr->operands[i].isConstant()) {
         *offset = add_instr->operands[i].constantValue();
      } else if (add_instr->operands[i].isTemp() &&
                 ctx.info[add_instr->operands[i].tempId()].is_constant_or_literal()) {
         *offset = ctx.info[add_instr->operands[i].tempId()].val;
      } else {
         continue;
      }
      if (!add_instr->operands[!i].isTemp())
         continue;

      /* recurse: the non-constant operand may itself be an add of a constant,
       * in which case the offsets accumulate */
      uint32_t offset2 = 0;
      if (parse_base_offset(ctx, add_instr, !i, base, &offset2)) {
         *offset += offset2;
      } else {
         *base = add_instr->operands[!i].getTemp();
      }
      return true;
   }

   return false;
}
677
678 unsigned get_operand_size(aco_ptr<Instruction>& instr, unsigned index)
679 {
680 if (instr->format == Format::PSEUDO)
681 return instr->operands[index].bytes() * 8u;
682 else if (instr->opcode == aco_opcode::v_mad_u64_u32 || instr->opcode == aco_opcode::v_mad_i64_i32)
683 return index == 2 ? 64 : 32;
684 else if (instr->isVALU() || instr->isSALU())
685 return instr_info.operand_size[(int)instr->opcode];
686 else
687 return 0;
688 }
689
690 Operand get_constant_op(opt_ctx &ctx, uint32_t val, bool is64bit = false)
691 {
692 // TODO: this functions shouldn't be needed if we store Operand instead of value.
693 Operand op(val, is64bit);
694 if (val == 0x3e22f983 && ctx.program->chip_class >= GFX8)
695 op.setFixed(PhysReg{248}); /* 1/2 PI can be an inline constant on GFX8+ */
696 return op;
697 }
698
699 bool fixed_to_exec(Operand op)
700 {
701 return op.isFixed() && op.physReg() == exec;
702 }
703
704 void label_instruction(opt_ctx &ctx, Block& block, aco_ptr<Instruction>& instr)
705 {
706 if (instr->isSALU() || instr->isVALU() || instr->format == Format::PSEUDO) {
707 ASSERTED bool all_const = false;
708 for (Operand& op : instr->operands)
709 all_const = all_const && (!op.isTemp() || ctx.info[op.tempId()].is_constant_or_literal());
710 perfwarn(all_const, "All instruction operands are constant", instr.get());
711 }
712
713 for (unsigned i = 0; i < instr->operands.size(); i++)
714 {
715 if (!instr->operands[i].isTemp())
716 continue;
717
718 ssa_info info = ctx.info[instr->operands[i].tempId()];
719 /* propagate undef */
720 if (info.is_undefined() && is_phi(instr))
721 instr->operands[i] = Operand(instr->operands[i].regClass());
722 /* propagate reg->reg of same type */
723 if (info.is_temp() && info.temp.regClass() == instr->operands[i].getTemp().regClass()) {
724 instr->operands[i].setTemp(ctx.info[instr->operands[i].tempId()].temp);
725 info = ctx.info[info.temp.id()];
726 }
727
728 /* SALU / PSEUDO: propagate inline constants */
729 if (instr->isSALU() || instr->format == Format::PSEUDO) {
730 const bool is_subdword = std::any_of(instr->definitions.begin(), instr->definitions.end(),
731 [] (const Definition& def) { return def.regClass().is_subdword();});
732 // TODO: optimize SGPR and constant propagation for subdword pseudo instructions on gfx9+
733 if (is_subdword)
734 continue;
735
736 if (info.is_temp() && info.temp.type() == RegType::sgpr) {
737 instr->operands[i].setTemp(info.temp);
738 info = ctx.info[info.temp.id()];
739 } else if (info.is_temp() && info.temp.type() == RegType::vgpr) {
740 /* propagate vgpr if it can take it */
741 switch (instr->opcode) {
742 case aco_opcode::p_create_vector:
743 case aco_opcode::p_split_vector:
744 case aco_opcode::p_extract_vector:
745 case aco_opcode::p_phi: {
746 const bool all_vgpr = std::none_of(instr->definitions.begin(), instr->definitions.end(),
747 [] (const Definition& def) { return def.getTemp().type() != RegType::vgpr;});
748 if (all_vgpr) {
749 instr->operands[i] = Operand(info.temp);
750 info = ctx.info[info.temp.id()];
751 }
752 break;
753 }
754 default:
755 break;
756 }
757 }
758 if ((info.is_constant() || info.is_constant_64bit() || (info.is_literal() && instr->format == Format::PSEUDO)) &&
759 !instr->operands[i].isFixed() && alu_can_accept_constant(instr->opcode, i)) {
760 instr->operands[i] = get_constant_op(ctx, info.val, info.is_constant_64bit());
761 continue;
762 }
763 }
764
765 /* VALU: propagate neg, abs & inline constants */
766 else if (instr->isVALU()) {
767 if (info.is_temp() && info.temp.type() == RegType::vgpr && valu_can_accept_vgpr(instr, i)) {
768 instr->operands[i].setTemp(info.temp);
769 info = ctx.info[info.temp.id()];
770 }
771
772 /* for instructions other than v_cndmask_b32, the size of the instruction should match the operand size */
773 unsigned can_use_mod = instr->opcode != aco_opcode::v_cndmask_b32 || instr->operands[i].getTemp().bytes() == 4;
774 can_use_mod = can_use_mod && instr_info.can_use_input_modifiers[(int)instr->opcode];
775
776 if (info.is_abs() && (can_use_VOP3(ctx, instr) || instr->isDPP()) && can_use_mod) {
777 if (!instr->isDPP())
778 to_VOP3(ctx, instr);
779 instr->operands[i] = Operand(info.temp);
780 if (instr->isDPP())
781 static_cast<DPP_instruction*>(instr.get())->abs[i] = true;
782 else
783 static_cast<VOP3A_instruction*>(instr.get())->abs[i] = true;
784 }
785 if (info.is_neg() && instr->opcode == aco_opcode::v_add_f32) {
786 instr->opcode = i ? aco_opcode::v_sub_f32 : aco_opcode::v_subrev_f32;
787 instr->operands[i].setTemp(info.temp);
788 continue;
789 } else if (info.is_neg() && instr->opcode == aco_opcode::v_add_f16) {
790 instr->opcode = i ? aco_opcode::v_sub_f16 : aco_opcode::v_subrev_f16;
791 instr->operands[i].setTemp(info.temp);
792 continue;
793 } else if (info.is_neg() && (can_use_VOP3(ctx, instr) || instr->isDPP()) && can_use_mod) {
794 if (!instr->isDPP())
795 to_VOP3(ctx, instr);
796 instr->operands[i].setTemp(info.temp);
797 if (instr->isDPP())
798 static_cast<DPP_instruction*>(instr.get())->neg[i] = true;
799 else
800 static_cast<VOP3A_instruction*>(instr.get())->neg[i] = true;
801 continue;
802 }
803 if ((info.is_constant() || info.is_constant_64bit()) && alu_can_accept_constant(instr->opcode, i)) {
804 Operand op = get_constant_op(ctx, info.val, info.is_constant_64bit());
805 perfwarn(instr->opcode == aco_opcode::v_cndmask_b32 && i == 2, "v_cndmask_b32 with a constant selector", instr.get());
806 if (i == 0 || instr->opcode == aco_opcode::v_readlane_b32 || instr->opcode == aco_opcode::v_writelane_b32) {
807 instr->operands[i] = op;
808 continue;
809 } else if (!instr->isVOP3() && can_swap_operands(instr)) {
810 instr->operands[i] = instr->operands[0];
811 instr->operands[0] = op;
812 continue;
813 } else if (can_use_VOP3(ctx, instr)) {
814 to_VOP3(ctx, instr);
815 instr->operands[i] = op;
816 continue;
817 }
818 }
819 }
820
821 /* MUBUF: propagate constants and combine additions */
822 else if (instr->format == Format::MUBUF) {
823 MUBUF_instruction *mubuf = static_cast<MUBUF_instruction *>(instr.get());
824 Temp base;
825 uint32_t offset;
826 while (info.is_temp())
827 info = ctx.info[info.temp.id()];
828
829 if (mubuf->offen && i == 1 && info.is_constant_or_literal() && mubuf->offset + info.val < 4096) {
830 assert(!mubuf->idxen);
831 instr->operands[1] = Operand(v1);
832 mubuf->offset += info.val;
833 mubuf->offen = false;
834 continue;
835 } else if (i == 2 && info.is_constant_or_literal() && mubuf->offset + info.val < 4096) {
836 instr->operands[2] = Operand((uint32_t) 0);
837 mubuf->offset += info.val;
838 continue;
839 } else if (mubuf->offen && i == 1 && parse_base_offset(ctx, instr.get(), i, &base, &offset) && base.regClass() == v1 && mubuf->offset + offset < 4096) {
840 assert(!mubuf->idxen);
841 instr->operands[1].setTemp(base);
842 mubuf->offset += offset;
843 continue;
844 } else if (i == 2 && parse_base_offset(ctx, instr.get(), i, &base, &offset) && base.regClass() == s1 && mubuf->offset + offset < 4096) {
845 instr->operands[i].setTemp(base);
846 mubuf->offset += offset;
847 continue;
848 }
849 }
850
851 /* DS: combine additions */
852 else if (instr->format == Format::DS) {
853
854 DS_instruction *ds = static_cast<DS_instruction *>(instr.get());
855 Temp base;
856 uint32_t offset;
857 bool has_usable_ds_offset = ctx.program->chip_class >= GFX7;
858 if (has_usable_ds_offset &&
859 i == 0 && parse_base_offset(ctx, instr.get(), i, &base, &offset) &&
860 base.regClass() == instr->operands[i].regClass() &&
861 instr->opcode != aco_opcode::ds_swizzle_b32) {
862 if (instr->opcode == aco_opcode::ds_write2_b32 || instr->opcode == aco_opcode::ds_read2_b32 ||
863 instr->opcode == aco_opcode::ds_write2_b64 || instr->opcode == aco_opcode::ds_read2_b64) {
864 unsigned mask = (instr->opcode == aco_opcode::ds_write2_b64 || instr->opcode == aco_opcode::ds_read2_b64) ? 0x7 : 0x3;
865 unsigned shifts = (instr->opcode == aco_opcode::ds_write2_b64 || instr->opcode == aco_opcode::ds_read2_b64) ? 3 : 2;
866
867 if ((offset & mask) == 0 &&
868 ds->offset0 + (offset >> shifts) <= 255 &&
869 ds->offset1 + (offset >> shifts) <= 255) {
870 instr->operands[i].setTemp(base);
871 ds->offset0 += offset >> shifts;
872 ds->offset1 += offset >> shifts;
873 }
874 } else {
875 if (ds->offset0 + offset <= 65535) {
876 instr->operands[i].setTemp(base);
877 ds->offset0 += offset;
878 }
879 }
880 }
881 }
882
883 /* SMEM: propagate constants and combine additions */
884 else if (instr->format == Format::SMEM) {
885
886 SMEM_instruction *smem = static_cast<SMEM_instruction *>(instr.get());
887 Temp base;
888 uint32_t offset;
889 if (i == 1 && info.is_constant_or_literal() &&
890 ((ctx.program->chip_class == GFX6 && info.val <= 0x3FF) ||
891 (ctx.program->chip_class == GFX7 && info.val <= 0xFFFFFFFF) ||
892 (ctx.program->chip_class >= GFX8 && info.val <= 0xFFFFF))) {
893 instr->operands[i] = Operand(info.val);
894 continue;
895 } else if (i == 1 && parse_base_offset(ctx, instr.get(), i, &base, &offset) && base.regClass() == s1 && offset <= 0xFFFFF && ctx.program->chip_class >= GFX9) {
896 bool soe = smem->operands.size() >= (!smem->definitions.empty() ? 3 : 4);
897 if (soe &&
898 (!ctx.info[smem->operands.back().tempId()].is_constant_or_literal() ||
899 ctx.info[smem->operands.back().tempId()].val != 0)) {
900 continue;
901 }
902 if (soe) {
903 smem->operands[1] = Operand(offset);
904 smem->operands.back() = Operand(base);
905 } else {
906 SMEM_instruction *new_instr = create_instruction<SMEM_instruction>(smem->opcode, Format::SMEM, smem->operands.size() + 1, smem->definitions.size());
907 new_instr->operands[0] = smem->operands[0];
908 new_instr->operands[1] = Operand(offset);
909 if (smem->definitions.empty())
910 new_instr->operands[2] = smem->operands[2];
911 new_instr->operands.back() = Operand(base);
912 if (!smem->definitions.empty())
913 new_instr->definitions[0] = smem->definitions[0];
914 new_instr->can_reorder = smem->can_reorder;
915 new_instr->barrier = smem->barrier;
916 new_instr->glc = smem->glc;
917 new_instr->dlc = smem->dlc;
918 new_instr->nv = smem->nv;
919 new_instr->disable_wqm = smem->disable_wqm;
920 instr.reset(new_instr);
921 smem = static_cast<SMEM_instruction *>(instr.get());
922 }
923 continue;
924 }
925 }
926
927 else if (instr->format == Format::PSEUDO_BRANCH) {
928 if (ctx.info[instr->operands[0].tempId()].is_scc_invert()) {
929 /* Flip the branch instruction to get rid of the scc_invert instruction */
930 instr->opcode = instr->opcode == aco_opcode::p_cbranch_z ? aco_opcode::p_cbranch_nz : aco_opcode::p_cbranch_z;
931 instr->operands[0].setTemp(ctx.info[instr->operands[0].tempId()].temp);
932 }
933 }
934 }
935
936 /* if this instruction doesn't define anything, return */
937 if (instr->definitions.empty())
938 return;
939
940 switch (instr->opcode) {
941 case aco_opcode::p_create_vector: {
942 bool copy_prop = instr->operands.size() == 1 && instr->operands[0].isTemp() &&
943 instr->operands[0].regClass() == instr->definitions[0].regClass();
944 if (copy_prop) {
945 ctx.info[instr->definitions[0].tempId()].set_temp(instr->operands[0].getTemp());
946 break;
947 }
948
949 unsigned num_ops = instr->operands.size();
950 for (const Operand& op : instr->operands) {
951 if (op.isTemp() && ctx.info[op.tempId()].is_vec())
952 num_ops += ctx.info[op.tempId()].instr->operands.size() - 1;
953 }
954 if (num_ops != instr->operands.size()) {
955 aco_ptr<Instruction> old_vec = std::move(instr);
956 instr.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, num_ops, 1));
957 instr->definitions[0] = old_vec->definitions[0];
958 unsigned k = 0;
959 for (Operand& old_op : old_vec->operands) {
960 if (old_op.isTemp() && ctx.info[old_op.tempId()].is_vec()) {
961 for (unsigned j = 0; j < ctx.info[old_op.tempId()].instr->operands.size(); j++) {
962 Operand op = ctx.info[old_op.tempId()].instr->operands[j];
963 if (op.isTemp() && ctx.info[op.tempId()].is_temp() &&
964 ctx.info[op.tempId()].temp.type() == instr->definitions[0].regClass().type())
965 op.setTemp(ctx.info[op.tempId()].temp);
966 instr->operands[k++] = op;
967 }
968 } else {
969 instr->operands[k++] = old_op;
970 }
971 }
972 assert(k == num_ops);
973 }
974
975 ctx.info[instr->definitions[0].tempId()].set_vec(instr.get());
976 break;
977 }
978 case aco_opcode::p_split_vector: {
979 if (!ctx.info[instr->operands[0].tempId()].is_vec())
980 break;
981 Instruction* vec = ctx.info[instr->operands[0].tempId()].instr;
982 unsigned split_offset = 0;
983 unsigned vec_offset = 0;
984 unsigned vec_index = 0;
985 for (unsigned i = 0; i < instr->definitions.size(); split_offset += instr->definitions[i++].bytes()) {
986 while (vec_offset < split_offset && vec_index < vec->operands.size())
987 vec_offset += vec->operands[vec_index++].bytes();
988
989 if (vec_offset != split_offset || vec->operands[vec_index].bytes() != instr->definitions[i].bytes())
990 continue;
991
992 Operand vec_op = vec->operands[vec_index];
993 if (vec_op.isConstant()) {
994 if (vec_op.isLiteral())
995 ctx.info[instr->definitions[i].tempId()].set_literal(vec_op.constantValue());
996 else if (vec_op.size() == 1)
997 ctx.info[instr->definitions[i].tempId()].set_constant(vec_op.constantValue());
998 else if (vec_op.size() == 2)
999 ctx.info[instr->definitions[i].tempId()].set_constant_64bit(vec_op.constantValue());
1000 } else if (vec_op.isUndefined()) {
1001 ctx.info[instr->definitions[i].tempId()].set_undefined();
1002 } else {
1003 assert(vec_op.isTemp());
1004 ctx.info[instr->definitions[i].tempId()].set_temp(vec_op.getTemp());
1005 }
1006 }
1007 break;
1008 }
1009 case aco_opcode::p_extract_vector: { /* mov */
1010 if (!ctx.info[instr->operands[0].tempId()].is_vec())
1011 break;
1012
1013 /* check if we index directly into a vector element */
1014 Instruction* vec = ctx.info[instr->operands[0].tempId()].instr;
1015 const unsigned index = instr->operands[1].constantValue();
1016 const unsigned dst_offset = index * instr->definitions[0].bytes();
1017 unsigned offset = 0;
1018
1019 for (const Operand& op : vec->operands) {
1020 if (offset < dst_offset) {
1021 offset += op.bytes();
1022 continue;
1023 } else if (offset != dst_offset || op.bytes() != instr->definitions[0].bytes()) {
1024 break;
1025 }
1026
1027 /* convert this extract into a copy instruction */
1028 instr->opcode = aco_opcode::p_parallelcopy;
1029 instr->operands.pop_back();
1030 instr->operands[0] = op;
1031
1032 if (op.isConstant()) {
1033 if (op.isLiteral())
1034 ctx.info[instr->definitions[0].tempId()].set_literal(op.constantValue());
1035 else if (op.size() == 1)
1036 ctx.info[instr->definitions[0].tempId()].set_constant(op.constantValue());
1037 else if (op.size() == 2)
1038 ctx.info[instr->definitions[0].tempId()].set_constant_64bit(op.constantValue());
1039 } else if (op.isUndefined()) {
1040 ctx.info[instr->definitions[0].tempId()].set_undefined();
1041 } else {
1042 assert(op.isTemp());
1043 ctx.info[instr->definitions[0].tempId()].set_temp(op.getTemp());
1044 }
1045 break;
1046 }
1047 break;
1048 }
1049 case aco_opcode::s_mov_b32: /* propagate */
1050 case aco_opcode::s_mov_b64:
1051 case aco_opcode::v_mov_b32:
1052 case aco_opcode::p_as_uniform:
1053 if (instr->definitions[0].isFixed()) {
1054 /* don't copy-propagate copies into fixed registers */
1055 } else if (instr->usesModifiers()) {
1056 // TODO
1057 } else if (instr->operands[0].isConstant()) {
1058 if (instr->operands[0].isLiteral())
1059 ctx.info[instr->definitions[0].tempId()].set_literal(instr->operands[0].constantValue());
1060 else if (instr->operands[0].size() == 1)
1061 ctx.info[instr->definitions[0].tempId()].set_constant(instr->operands[0].constantValue());
1062 else if (instr->operands[0].size() == 2)
1063 ctx.info[instr->definitions[0].tempId()].set_constant_64bit(instr->operands[0].constantValue());
1064 } else if (instr->operands[0].isTemp()) {
1065 ctx.info[instr->definitions[0].tempId()].set_temp(instr->operands[0].getTemp());
1066 } else {
1067 assert(instr->operands[0].isFixed());
1068 }
1069 break;
1070 case aco_opcode::p_is_helper:
1071 if (!ctx.program->needs_wqm)
1072 ctx.info[instr->definitions[0].tempId()].set_constant(0u);
1073 break;
1074 case aco_opcode::s_movk_i32: {
1075 uint32_t v = static_cast<SOPK_instruction*>(instr.get())->imm;
1076 v = v & 0x8000 ? (v | 0xffff0000) : v;
1077 if (v <= 64 || v >= 0xfffffff0)
1078 ctx.info[instr->definitions[0].tempId()].set_constant(v);
1079 else
1080 ctx.info[instr->definitions[0].tempId()].set_literal(v);
1081 break;
1082 }
1083 case aco_opcode::v_bfrev_b32:
1084 case aco_opcode::s_brev_b32: {
1085 if (instr->operands[0].isConstant()) {
1086 uint32_t v = util_bitreverse(instr->operands[0].constantValue());
1087 if (v <= 64 || v >= 0xfffffff0)
1088 ctx.info[instr->definitions[0].tempId()].set_constant(v);
1089 else
1090 ctx.info[instr->definitions[0].tempId()].set_literal(v);
1091 }
1092 break;
1093 }
1094 case aco_opcode::s_bfm_b32: {
1095 if (instr->operands[0].isConstant() && instr->operands[1].isConstant()) {
1096 unsigned size = instr->operands[0].constantValue() & 0x1f;
1097 unsigned start = instr->operands[1].constantValue() & 0x1f;
1098 uint32_t v = ((1u << size) - 1u) << start;
1099 if (v <= 64 || v >= 0xfffffff0)
1100 ctx.info[instr->definitions[0].tempId()].set_constant(v);
1101 else
1102 ctx.info[instr->definitions[0].tempId()].set_literal(v);
1103 }
1104 break;
1105 }
1106 case aco_opcode::v_mul_f16:
1107 case aco_opcode::v_mul_f32: { /* omod */
1108 /* TODO: try to move the negate/abs modifier to the consumer instead */
1109 if (instr->usesModifiers())
1110 break;
1111
1112 bool fp16 = instr->opcode == aco_opcode::v_mul_f16;
1113
1114 for (unsigned i = 0; i < 2; i++) {
1115 if (instr->operands[!i].isConstant() && instr->operands[i].isTemp()) {
1116 if (instr->operands[!i].constantValue() == (fp16 ? 0x4000 : 0x40000000)) { /* 2.0 */
1117 ctx.info[instr->operands[i].tempId()].set_omod2(instr->definitions[0].getTemp());
1118 } else if (instr->operands[!i].constantValue() == (fp16 ? 0x4400 : 0x40800000)) { /* 4.0 */
1119 ctx.info[instr->operands[i].tempId()].set_omod4(instr->definitions[0].getTemp());
1120 } else if (instr->operands[!i].constantValue() == (fp16 ? 0xb800 : 0x3f000000)) { /* 0.5 */
1121 ctx.info[instr->operands[i].tempId()].set_omod5(instr->definitions[0].getTemp());
1122 } else if (instr->operands[!i].constantValue() == (fp16 ? 0x3c00 : 0x3f800000) &&
1123 !(fp16 ? block.fp_mode.must_flush_denorms16_64 : block.fp_mode.must_flush_denorms32)) { /* 1.0 */
1124 ctx.info[instr->definitions[0].tempId()].set_temp(instr->operands[i].getTemp());
1125 } else {
1126 continue;
1127 }
1128 break;
1129 }
1130 }
1131 break;
1132 }
1133 case aco_opcode::v_and_b32: { /* abs */
1134 if (!instr->usesModifiers() && instr->operands[1].isTemp() &&
1135 instr->operands[1].getTemp().type() == RegType::vgpr &&
1136 ((instr->definitions[0].bytes() == 4 && instr->operands[0].constantEquals(0x7FFFFFFFu)) ||
1137 (instr->definitions[0].bytes() == 2 && instr->operands[0].constantEquals(0x7FFFu))))
1138 ctx.info[instr->definitions[0].tempId()].set_abs(instr->operands[1].getTemp());
1139 else
1140 ctx.info[instr->definitions[0].tempId()].set_bitwise(instr.get());
1141 break;
1142 }
1143 case aco_opcode::v_xor_b32: { /* neg */
1144 if (!instr->usesModifiers() && instr->operands[1].isTemp() &&
1145 ((instr->definitions[0].bytes() == 4 && instr->operands[0].constantEquals(0x80000000u)) ||
1146 (instr->definitions[0].bytes() == 2 && instr->operands[0].constantEquals(0x8000u)))) {
1147 if (ctx.info[instr->operands[1].tempId()].is_neg()) {
1148 ctx.info[instr->definitions[0].tempId()].set_temp(ctx.info[instr->operands[1].tempId()].temp);
1149 } else if (instr->operands[1].getTemp().type() == RegType::vgpr) {
1150 if (ctx.info[instr->operands[1].tempId()].is_abs()) { /* neg(abs(x)) */
1151 instr->operands[1].setTemp(ctx.info[instr->operands[1].tempId()].temp);
1152 instr->opcode = aco_opcode::v_or_b32;
1153 ctx.info[instr->definitions[0].tempId()].set_neg_abs(instr->operands[1].getTemp());
1154 } else {
1155 ctx.info[instr->definitions[0].tempId()].set_neg(instr->operands[1].getTemp());
1156 }
1157 }
1158 } else {
1159 ctx.info[instr->definitions[0].tempId()].set_bitwise(instr.get());
1160 }
1161 break;
1162 }
1163 case aco_opcode::v_med3_f16:
1164 case aco_opcode::v_med3_f32: { /* clamp */
1165 VOP3A_instruction* vop3 = static_cast<VOP3A_instruction*>(instr.get());
1166 if (vop3->abs[0] || vop3->abs[1] || vop3->abs[2] ||
1167 vop3->neg[0] || vop3->neg[1] || vop3->neg[2] ||
1168 vop3->omod != 0 || vop3->opsel != 0)
1169 break;
1170
1171 unsigned idx = 0;
1172 bool found_zero = false, found_one = false;
1173 bool is_fp16 = instr->opcode == aco_opcode::v_med3_f16;
1174 for (unsigned i = 0; i < 3; i++)
1175 {
1176 if (instr->operands[i].constantEquals(0))
1177 found_zero = true;
1178 else if (instr->operands[i].constantEquals(is_fp16 ? 0x3c00 : 0x3f800000)) /* 1.0 */
1179 found_one = true;
1180 else
1181 idx = i;
1182 }
1183 if (found_zero && found_one && instr->operands[idx].isTemp()) {
1184 ctx.info[instr->operands[idx].tempId()].set_clamp(instr->definitions[0].getTemp());
1185 }
1186 break;
1187 }
1188 case aco_opcode::v_cndmask_b32:
1189 if (instr->operands[0].constantEquals(0) &&
1190 instr->operands[1].constantEquals(0xFFFFFFFF))
1191 ctx.info[instr->definitions[0].tempId()].set_vcc(instr->operands[2].getTemp());
1192 else if (instr->operands[0].constantEquals(0) &&
1193 instr->operands[1].constantEquals(0x3f800000u))
1194 ctx.info[instr->definitions[0].tempId()].set_b2f(instr->operands[2].getTemp());
1195 else if (instr->operands[0].constantEquals(0) &&
1196 instr->operands[1].constantEquals(1))
1197 ctx.info[instr->definitions[0].tempId()].set_b2i(instr->operands[2].getTemp());
1198
1199 ctx.info[instr->operands[2].tempId()].set_vcc_hint();
1200 break;
1201 case aco_opcode::v_cmp_lg_u32:
1202 if (instr->format == Format::VOPC && /* don't optimize VOP3 / SDWA / DPP */
1203 instr->operands[0].constantEquals(0) &&
1204 instr->operands[1].isTemp() && ctx.info[instr->operands[1].tempId()].is_vcc())
1205 ctx.info[instr->definitions[0].tempId()].set_temp(ctx.info[instr->operands[1].tempId()].temp);
1206 break;
1207 case aco_opcode::p_phi:
1208 case aco_opcode::p_linear_phi: {
1209 /* lower_bool_phis() can create phis like this */
1210 bool all_same_temp = instr->operands[0].isTemp();
1211 /* this check is needed when moving uniform loop counters out of a divergent loop */
1212 if (all_same_temp)
1213 all_same_temp = instr->definitions[0].regClass() == instr->operands[0].regClass();
1214 for (unsigned i = 1; all_same_temp && (i < instr->operands.size()); i++) {
1215 if (!instr->operands[i].isTemp() || instr->operands[i].tempId() != instr->operands[0].tempId())
1216 all_same_temp = false;
1217 }
1218 if (all_same_temp) {
1219 ctx.info[instr->definitions[0].tempId()].set_temp(instr->operands[0].getTemp());
1220 } else {
1221 bool all_undef = instr->operands[0].isUndefined();
1222 for (unsigned i = 1; all_undef && (i < instr->operands.size()); i++) {
1223 if (!instr->operands[i].isUndefined())
1224 all_undef = false;
1225 }
1226 if (all_undef)
1227 ctx.info[instr->definitions[0].tempId()].set_undefined();
1228 }
1229 break;
1230 }
1231 case aco_opcode::v_add_u32:
1232 case aco_opcode::v_add_co_u32:
1233 case aco_opcode::v_add_co_u32_e64:
1234 case aco_opcode::s_add_i32:
1235 case aco_opcode::s_add_u32:
1236 ctx.info[instr->definitions[0].tempId()].set_add_sub(instr.get());
1237 break;
1238 case aco_opcode::s_not_b32:
1239 case aco_opcode::s_not_b64:
1240 if (ctx.info[instr->operands[0].tempId()].is_uniform_bool()) {
1241 ctx.info[instr->definitions[0].tempId()].set_uniform_bitwise();
1242 ctx.info[instr->definitions[1].tempId()].set_scc_invert(ctx.info[instr->operands[0].tempId()].temp);
1243 } else if (ctx.info[instr->operands[0].tempId()].is_uniform_bitwise()) {
1244 ctx.info[instr->definitions[0].tempId()].set_uniform_bitwise();
1245 ctx.info[instr->definitions[1].tempId()].set_scc_invert(ctx.info[instr->operands[0].tempId()].instr->definitions[1].getTemp());
1246 }
1247 ctx.info[instr->definitions[0].tempId()].set_bitwise(instr.get());
1248 break;
1249 case aco_opcode::s_and_b32:
1250 case aco_opcode::s_and_b64:
1251 if (fixed_to_exec(instr->operands[1]) && instr->operands[0].isTemp()) {
1252 if (ctx.info[instr->operands[0].tempId()].is_uniform_bool()) {
1253 /* Try to get rid of the superfluous s_cselect + s_and_b64 that comes from turning a uniform bool into divergent */
1254 ctx.info[instr->definitions[1].tempId()].set_temp(ctx.info[instr->operands[0].tempId()].temp);
1255 ctx.info[instr->definitions[0].tempId()].set_uniform_bool(ctx.info[instr->operands[0].tempId()].temp);
1256 break;
1257 } else if (ctx.info[instr->operands[0].tempId()].is_uniform_bitwise()) {
1258 /* Try to get rid of the superfluous s_and_b64, since the uniform bitwise instruction already produces the same SCC */
1259 ctx.info[instr->definitions[1].tempId()].set_temp(ctx.info[instr->operands[0].tempId()].instr->definitions[1].getTemp());
1260 ctx.info[instr->definitions[0].tempId()].set_uniform_bool(ctx.info[instr->operands[0].tempId()].instr->definitions[1].getTemp());
1261 break;
1262 }
1263 }
1264 /* fallthrough */
1265 case aco_opcode::s_or_b32:
1266 case aco_opcode::s_or_b64:
1267 case aco_opcode::s_xor_b32:
1268 case aco_opcode::s_xor_b64:
1269 if (std::all_of(instr->operands.begin(), instr->operands.end(), [&ctx](const Operand& op) {
1270 return op.isTemp() && (ctx.info[op.tempId()].is_uniform_bool() || ctx.info[op.tempId()].is_uniform_bitwise());
1271 })) {
1272 ctx.info[instr->definitions[0].tempId()].set_uniform_bitwise();
1273 }
1274 /* fallthrough */
1275 case aco_opcode::s_lshl_b32:
1276 case aco_opcode::v_or_b32:
1277 case aco_opcode::v_lshlrev_b32:
1278 ctx.info[instr->definitions[0].tempId()].set_bitwise(instr.get());
1279 break;
1280 case aco_opcode::v_min_f32:
1281 case aco_opcode::v_min_f16:
1282 case aco_opcode::v_min_u32:
1283 case aco_opcode::v_min_i32:
1284 case aco_opcode::v_min_u16:
1285 case aco_opcode::v_min_i16:
1286 case aco_opcode::v_max_f32:
1287 case aco_opcode::v_max_f16:
1288 case aco_opcode::v_max_u32:
1289 case aco_opcode::v_max_i32:
1290 case aco_opcode::v_max_u16:
1291 case aco_opcode::v_max_i16:
1292 ctx.info[instr->definitions[0].tempId()].set_minmax(instr.get());
1293 break;
1294 case aco_opcode::v_cmp_lt_f32:
1295 case aco_opcode::v_cmp_eq_f32:
1296 case aco_opcode::v_cmp_le_f32:
1297 case aco_opcode::v_cmp_gt_f32:
1298 case aco_opcode::v_cmp_lg_f32:
1299 case aco_opcode::v_cmp_ge_f32:
1300 case aco_opcode::v_cmp_o_f32:
1301 case aco_opcode::v_cmp_u_f32:
1302 case aco_opcode::v_cmp_nge_f32:
1303 case aco_opcode::v_cmp_nlg_f32:
1304 case aco_opcode::v_cmp_ngt_f32:
1305 case aco_opcode::v_cmp_nle_f32:
1306 case aco_opcode::v_cmp_neq_f32:
1307 case aco_opcode::v_cmp_nlt_f32:
1308 ctx.info[instr->definitions[0].tempId()].set_fcmp(instr.get());
1309 break;
1310 case aco_opcode::s_cselect_b64:
1311 case aco_opcode::s_cselect_b32:
1312 if (instr->operands[0].constantEquals((unsigned) -1) &&
1313 instr->operands[1].constantEquals(0)) {
1314 /* Found a cselect that operates on a uniform bool that comes from eg. s_cmp */
1315 ctx.info[instr->definitions[0].tempId()].set_uniform_bool(instr->operands[2].getTemp());
1316 }
1317 if (instr->operands[2].isTemp() && ctx.info[instr->operands[2].tempId()].is_scc_invert()) {
1318 /* Flip the operands to get rid of the scc_invert instruction */
1319 std::swap(instr->operands[0], instr->operands[1]);
1320 instr->operands[2].setTemp(ctx.info[instr->operands[2].tempId()].temp);
1321 }
1322 break;
1323 case aco_opcode::p_wqm:
1324 if (instr->operands[0].isTemp() &&
1325 ctx.info[instr->operands[0].tempId()].is_scc_invert()) {
1326 ctx.info[instr->definitions[0].tempId()].set_temp(instr->operands[0].getTemp());
1327 }
1328 break;
1329 default:
1330 break;
1331 }
1332 }
1333
/* For a supported f32 comparison opcode, report the related opcodes:
 * - *ordered: the ordered form of the same relation (e.g. v_cmp_lt_f32)
 * - *unordered: the unordered compare of the complementary relation
 *   (e.g. v_cmp_nge_f32) — it accepts the same operand order
 * - *inverse: the logical negation of 'op' (lt <-> nlt, nge <-> ge)
 * Returns false (outputs left as 'op'/undefined) for any other opcode. */
ALWAYS_INLINE bool get_cmp_info(aco_opcode op, aco_opcode *ordered, aco_opcode *unordered, aco_opcode *inverse)
{
   *ordered = *unordered = op;
   switch (op) {
   /* each CMP() entry handles both the ordered compare and the unordered
    * compare of the complementary relation */
   #define CMP(ord, unord) \
   case aco_opcode::v_cmp_##ord##_f32:\
   case aco_opcode::v_cmp_n##unord##_f32:\
      *ordered = aco_opcode::v_cmp_##ord##_f32;\
      *unordered = aco_opcode::v_cmp_n##unord##_f32;\
      *inverse = op == aco_opcode::v_cmp_n##unord##_f32 ? aco_opcode::v_cmp_##unord##_f32 : aco_opcode::v_cmp_n##ord##_f32;\
      return true;
   CMP(lt, /*n*/ge)
   CMP(eq, /*n*/lg)
   CMP(le, /*n*/gt)
   CMP(gt, /*n*/le)
   CMP(lg, /*n*/eq)
   CMP(ge, /*n*/lt)
   #undef CMP
   default:
      return false;
   }
}
1356
1357 aco_opcode get_ordered(aco_opcode op)
1358 {
1359 aco_opcode ordered, unordered, inverse;
1360 return get_cmp_info(op, &ordered, &unordered, &inverse) ? ordered : aco_opcode::num_opcodes;
1361 }
1362
1363 aco_opcode get_unordered(aco_opcode op)
1364 {
1365 aco_opcode ordered, unordered, inverse;
1366 return get_cmp_info(op, &ordered, &unordered, &inverse) ? unordered : aco_opcode::num_opcodes;
1367 }
1368
1369 aco_opcode get_inverse(aco_opcode op)
1370 {
1371 aco_opcode ordered, unordered, inverse;
1372 return get_cmp_info(op, &ordered, &unordered, &inverse) ? inverse : aco_opcode::num_opcodes;
1373 }
1374
1375 bool is_cmp(aco_opcode op)
1376 {
1377 aco_opcode ordered, unordered, inverse;
1378 return get_cmp_info(op, &ordered, &unordered, &inverse);
1379 }
1380
1381 unsigned original_temp_id(opt_ctx &ctx, Temp tmp)
1382 {
1383 if (ctx.info[tmp.id()].is_temp())
1384 return ctx.info[tmp.id()].temp.id();
1385 else
1386 return tmp.id();
1387 }
1388
1389 void decrease_uses(opt_ctx &ctx, Instruction* instr)
1390 {
1391 if (!--ctx.uses[instr->definitions[0].tempId()]) {
1392 for (const Operand& op : instr->operands) {
1393 if (op.isTemp())
1394 ctx.uses[op.tempId()]--;
1395 }
1396 }
1397 }
1398
1399 Instruction *follow_operand(opt_ctx &ctx, Operand op, bool ignore_uses=false)
1400 {
1401 if (!op.isTemp() || !(ctx.info[op.tempId()].label & instr_labels))
1402 return nullptr;
1403 if (!ignore_uses && ctx.uses[op.tempId()] > 1)
1404 return nullptr;
1405
1406 Instruction *instr = ctx.info[op.tempId()].instr;
1407
1408 if (instr->definitions.size() == 2) {
1409 assert(instr->definitions[0].isTemp() && instr->definitions[0].tempId() == op.tempId());
1410 if (instr->definitions[1].isTemp() && ctx.uses[instr->definitions[1].tempId()])
1411 return nullptr;
1412 }
1413
1414 return instr;
1415 }
1416
/* s_or_b64(neq(a, a), neq(b, b)) -> v_cmp_u_f32(a, b)
 * s_and_b64(eq(a, a), eq(b, b)) -> v_cmp_o_f32(a, b)
 *
 * Both operands must be self-comparisons (x compared against a copy of
 * itself) so the result only tests whether x is NaN; the pair can then be
 * fused into a single ordered/unordered compare of the two values. */
bool combine_ordering_test(opt_ctx &ctx, aco_ptr<Instruction>& instr)
{
   /* only boolean (lane-mask) results; the SCC def must be dead */
   if (instr->definitions[0].regClass() != ctx.program->lane_mask)
      return false;
   if (instr->definitions[1].isTemp() && ctx.uses[instr->definitions[1].tempId()])
      return false;

   bool is_or = instr->opcode == aco_opcode::s_or_b64 || instr->opcode == aco_opcode::s_or_b32;

   bool neg[2] = {false, false};
   bool abs[2] = {false, false};
   uint8_t opsel = 0;
   Instruction *op_instr[2];
   Temp op[2];

   for (unsigned i = 0; i < 2; i++) {
      op_instr[i] = follow_operand(ctx, instr->operands[i], true);
      if (!op_instr[i])
         return false;

      /* neq(x, x) is the "is NaN" test, eq(x, x) the "is ordered" test */
      aco_opcode expected_cmp = is_or ? aco_opcode::v_cmp_neq_f32 : aco_opcode::v_cmp_eq_f32;

      if (op_instr[i]->opcode != expected_cmp)
         return false;
      if (!op_instr[i]->operands[0].isTemp() || !op_instr[i]->operands[1].isTemp())
         return false;

      if (op_instr[i]->isVOP3()) {
         VOP3A_instruction *vop3 = static_cast<VOP3A_instruction*>(op_instr[i]);
         /* both sides of the self-compare must carry identical modifiers,
          * otherwise it is not really comparing x against itself */
         if (vop3->neg[0] != vop3->neg[1] || vop3->abs[0] != vop3->abs[1] || vop3->opsel == 1 || vop3->opsel == 2)
            return false;
         neg[i] = vop3->neg[0];
         abs[i] = vop3->abs[0];
         opsel |= (vop3->opsel & 1) << i;
      }

      /* verify this is a self-comparison, modulo copy propagation */
      Temp op0 = op_instr[i]->operands[0].getTemp();
      Temp op1 = op_instr[i]->operands[1].getTemp();
      if (original_temp_id(ctx, op0) != original_temp_id(ctx, op1))
         return false;

      op[i] = op1;
   }

   /* canonicalize: put an SGPR (if any) first, then check the hardware's
    * SGPR-operand limit (GFX10 allows two, earlier chips one) */
   if (op[1].type() == RegType::sgpr)
      std::swap(op[0], op[1]);
   unsigned num_sgprs = (op[0].type() == RegType::sgpr) + (op[1].type() == RegType::sgpr);
   if (num_sgprs > (ctx.program->chip_class >= GFX10 ? 2 : 1))
      return false;

   /* the fused compare keeps both values alive; the two old compares die */
   ctx.uses[op[0].id()]++;
   ctx.uses[op[1].id()]++;
   decrease_uses(ctx, op_instr[0]);
   decrease_uses(ctx, op_instr[1]);

   aco_opcode new_op = is_or ? aco_opcode::v_cmp_u_f32 : aco_opcode::v_cmp_o_f32;
   Instruction *new_instr;
   /* VOP3 encoding is only needed when modifiers survive or a second SGPR
    * operand forces it; plain VOPC otherwise */
   if (neg[0] || neg[1] || abs[0] || abs[1] || opsel || num_sgprs > 1) {
      VOP3A_instruction *vop3 = create_instruction<VOP3A_instruction>(new_op, asVOP3(Format::VOPC), 2, 1);
      for (unsigned i = 0; i < 2; i++) {
         vop3->neg[i] = neg[i];
         vop3->abs[i] = abs[i];
      }
      vop3->opsel = opsel;
      new_instr = static_cast<Instruction *>(vop3);
   } else {
      new_instr = create_instruction<VOPC_instruction>(new_op, Format::VOPC, 2, 1);
   }
   new_instr->operands[0] = Operand(op[0]);
   new_instr->operands[1] = Operand(op[1]);
   new_instr->definitions[0] = instr->definitions[0];

   /* re-label the result as a compare so later passes can combine it further */
   ctx.info[instr->definitions[0].tempId()].label = 0;
   ctx.info[instr->definitions[0].tempId()].set_fcmp(new_instr);

   instr.reset(new_instr);

   return true;
}
1498
/* s_or_b64(v_cmp_u_f32(a, b), cmp(a, b)) -> get_unordered(cmp)(a, b)
 * s_and_b64(v_cmp_o_f32(a, b), cmp(a, b)) -> get_ordered(cmp)(a, b)
 *
 * A NaN test combined with a compare of the same two values folds into the
 * unordered (for OR) or ordered (for AND) version of that compare. */
bool combine_comparison_ordering(opt_ctx &ctx, aco_ptr<Instruction>& instr)
{
   /* only boolean (lane-mask) results; the SCC def must be dead */
   if (instr->definitions[0].regClass() != ctx.program->lane_mask)
      return false;
   if (instr->definitions[1].isTemp() && ctx.uses[instr->definitions[1].tempId()])
      return false;

   bool is_or = instr->opcode == aco_opcode::s_or_b64 || instr->opcode == aco_opcode::s_or_b32;
   aco_opcode expected_nan_test = is_or ? aco_opcode::v_cmp_u_f32 : aco_opcode::v_cmp_o_f32;

   Instruction *nan_test = follow_operand(ctx, instr->operands[0], true);
   Instruction *cmp = follow_operand(ctx, instr->operands[1], true);
   if (!nan_test || !cmp)
      return false;

   /* the NaN test may appear on either side of the boolean op */
   if (cmp->opcode == expected_nan_test)
      std::swap(nan_test, cmp);
   else if (nan_test->opcode != expected_nan_test)
      return false;

   if (!is_cmp(cmp->opcode))
      return false;

   if (!nan_test->operands[0].isTemp() || !nan_test->operands[1].isTemp())
      return false;
   if (!cmp->operands[0].isTemp() || !cmp->operands[1].isTemp())
      return false;

   /* both instructions must compare the same two values (in either order),
    * modulo copy propagation */
   unsigned prop_cmp0 = original_temp_id(ctx, cmp->operands[0].getTemp());
   unsigned prop_cmp1 = original_temp_id(ctx, cmp->operands[1].getTemp());
   unsigned prop_nan0 = original_temp_id(ctx, nan_test->operands[0].getTemp());
   unsigned prop_nan1 = original_temp_id(ctx, nan_test->operands[1].getTemp());
   if (prop_cmp0 != prop_nan0 && prop_cmp0 != prop_nan1)
      return false;
   if (prop_cmp1 != prop_nan0 && prop_cmp1 != prop_nan1)
      return false;

   /* the fused compare keeps cmp's operands alive; the old compares die */
   ctx.uses[cmp->operands[0].tempId()]++;
   ctx.uses[cmp->operands[1].tempId()]++;
   decrease_uses(ctx, nan_test);
   decrease_uses(ctx, cmp);

   aco_opcode new_op = is_or ? get_unordered(cmp->opcode) : get_ordered(cmp->opcode);
   Instruction *new_instr;
   if (cmp->isVOP3()) {
      /* preserve all of cmp's input/output modifiers on the fused compare */
      VOP3A_instruction *new_vop3 = create_instruction<VOP3A_instruction>(new_op, asVOP3(Format::VOPC), 2, 1);
      VOP3A_instruction *cmp_vop3 = static_cast<VOP3A_instruction*>(cmp);
      memcpy(new_vop3->abs, cmp_vop3->abs, sizeof(new_vop3->abs));
      memcpy(new_vop3->neg, cmp_vop3->neg, sizeof(new_vop3->neg));
      new_vop3->clamp = cmp_vop3->clamp;
      new_vop3->omod = cmp_vop3->omod;
      new_vop3->opsel = cmp_vop3->opsel;
      new_instr = new_vop3;
   } else {
      new_instr = create_instruction<VOPC_instruction>(new_op, Format::VOPC, 2, 1);
   }
   new_instr->operands[0] = cmp->operands[0];
   new_instr->operands[1] = cmp->operands[1];
   new_instr->definitions[0] = instr->definitions[0];

   /* re-label the result as a compare so later passes can combine it further */
   ctx.info[instr->definitions[0].tempId()].label = 0;
   ctx.info[instr->definitions[0].tempId()].set_fcmp(new_instr);

   instr.reset(new_instr);

   return true;
}
1568
/* s_or_b64(v_cmp_neq_f32(a, a), cmp(a, #b)) and b is not NaN -> get_unordered(cmp)(a, b)
 * s_and_b64(v_cmp_eq_f32(a, a), cmp(a, #b)) and b is not NaN -> get_ordered(cmp)(a, b)
 *
 * Like combine_comparison_ordering(), but the second compare is against a
 * known constant: if that constant is not NaN, the NaN test on 'a' alone
 * is equivalent to the full unordered/ordered condition. */
bool combine_constant_comparison_ordering(opt_ctx &ctx, aco_ptr<Instruction>& instr)
{
   /* only boolean (lane-mask) results; the SCC def must be dead */
   if (instr->definitions[0].regClass() != ctx.program->lane_mask)
      return false;
   if (instr->definitions[1].isTemp() && ctx.uses[instr->definitions[1].tempId()])
      return false;

   bool is_or = instr->opcode == aco_opcode::s_or_b64 || instr->opcode == aco_opcode::s_or_b32;

   Instruction *nan_test = follow_operand(ctx, instr->operands[0], true);
   Instruction *cmp = follow_operand(ctx, instr->operands[1], true);

   if (!nan_test || !cmp)
      return false;

   /* the NaN self-test may appear on either side of the boolean op */
   aco_opcode expected_nan_test = is_or ? aco_opcode::v_cmp_neq_f32 : aco_opcode::v_cmp_eq_f32;
   if (cmp->opcode == expected_nan_test)
      std::swap(nan_test, cmp);
   else if (nan_test->opcode != expected_nan_test)
      return false;

   if (!is_cmp(cmp->opcode))
      return false;

   if (!nan_test->operands[0].isTemp() || !nan_test->operands[1].isTemp())
      return false;
   if (!cmp->operands[0].isTemp() && !cmp->operands[1].isTemp())
      return false;

   /* the NaN test must compare a value against itself (modulo copies) */
   unsigned prop_nan0 = original_temp_id(ctx, nan_test->operands[0].getTemp());
   unsigned prop_nan1 = original_temp_id(ctx, nan_test->operands[1].getTemp());
   if (prop_nan0 != prop_nan1)
      return false;

   if (nan_test->isVOP3()) {
      VOP3A_instruction *vop3 = static_cast<VOP3A_instruction*>(nan_test);
      /* mismatched modifiers would mean it is not a true self-comparison */
      if (vop3->neg[0] != vop3->neg[1] || vop3->abs[0] != vop3->abs[1] || vop3->opsel == 1 || vop3->opsel == 2)
         return false;
   }

   /* find which cmp operand is the tested value; the other is the constant */
   int constant_operand = -1;
   for (unsigned i = 0; i < 2; i++) {
      if (cmp->operands[i].isTemp() && original_temp_id(ctx, cmp->operands[i].getTemp()) == prop_nan0) {
         constant_operand = !i;
         break;
      }
   }
   if (constant_operand == -1)
      return false;

   /* fetch the constant's bits, either inline or via a labeled temp */
   uint32_t constant;
   if (cmp->operands[constant_operand].isConstant()) {
      constant = cmp->operands[constant_operand].constantValue();
   } else if (cmp->operands[constant_operand].isTemp()) {
      Temp tmp = cmp->operands[constant_operand].getTemp();
      unsigned id = original_temp_id(ctx, tmp);
      if (!ctx.info[id].is_constant() && !ctx.info[id].is_literal())
         return false;
      constant = ctx.info[id].val;
   } else {
      return false;
   }

   /* the transform is only valid if the constant itself cannot be NaN */
   float constantf;
   memcpy(&constantf, &constant, 4);
   if (isnan(constantf))
      return false;

   /* the fused compare keeps cmp's operands alive; the old compares die */
   if (cmp->operands[0].isTemp())
      ctx.uses[cmp->operands[0].tempId()]++;
   if (cmp->operands[1].isTemp())
      ctx.uses[cmp->operands[1].tempId()]++;
   decrease_uses(ctx, nan_test);
   decrease_uses(ctx, cmp);

   aco_opcode new_op = is_or ? get_unordered(cmp->opcode) : get_ordered(cmp->opcode);
   Instruction *new_instr;
   if (cmp->isVOP3()) {
      /* preserve all of cmp's input/output modifiers on the fused compare */
      VOP3A_instruction *new_vop3 = create_instruction<VOP3A_instruction>(new_op, asVOP3(Format::VOPC), 2, 1);
      VOP3A_instruction *cmp_vop3 = static_cast<VOP3A_instruction*>(cmp);
      memcpy(new_vop3->abs, cmp_vop3->abs, sizeof(new_vop3->abs));
      memcpy(new_vop3->neg, cmp_vop3->neg, sizeof(new_vop3->neg));
      new_vop3->clamp = cmp_vop3->clamp;
      new_vop3->omod = cmp_vop3->omod;
      new_vop3->opsel = cmp_vop3->opsel;
      new_instr = new_vop3;
   } else {
      new_instr = create_instruction<VOPC_instruction>(new_op, Format::VOPC, 2, 1);
   }
   new_instr->operands[0] = cmp->operands[0];
   new_instr->operands[1] = cmp->operands[1];
   new_instr->definitions[0] = instr->definitions[0];

   /* re-label the result as a compare so later passes can combine it further */
   ctx.info[instr->definitions[0].tempId()].label = 0;
   ctx.info[instr->definitions[0].tempId()].set_fcmp(new_instr);

   instr.reset(new_instr);

   return true;
}
1671
/* s_not_b64(cmp(a, b)) -> get_inverse(cmp)(a, b)
 *
 * Folds a boolean negation of a comparison into the inverse comparison. */
bool combine_inverse_comparison(opt_ctx &ctx, aco_ptr<Instruction>& instr)
{
   if (instr->opcode != aco_opcode::s_not_b64)
      return false;
   /* the SCC def must be dead, and the operand must be a followable temp */
   if (instr->definitions[1].isTemp() && ctx.uses[instr->definitions[1].tempId()])
      return false;
   if (!instr->operands[0].isTemp())
      return false;

   Instruction *cmp = follow_operand(ctx, instr->operands[0]);
   if (!cmp)
      return false;

   aco_opcode new_opcode = get_inverse(cmp->opcode);
   if (new_opcode == aco_opcode::num_opcodes)
      return false;

   /* the inverse compare keeps cmp's operands alive; the old compare dies */
   if (cmp->operands[0].isTemp())
      ctx.uses[cmp->operands[0].tempId()]++;
   if (cmp->operands[1].isTemp())
      ctx.uses[cmp->operands[1].tempId()]++;
   decrease_uses(ctx, cmp);

   Instruction *new_instr;
   if (cmp->isVOP3()) {
      /* preserve all of cmp's input/output modifiers on the inverse compare */
      VOP3A_instruction *new_vop3 = create_instruction<VOP3A_instruction>(new_opcode, asVOP3(Format::VOPC), 2, 1);
      VOP3A_instruction *cmp_vop3 = static_cast<VOP3A_instruction*>(cmp);
      memcpy(new_vop3->abs, cmp_vop3->abs, sizeof(new_vop3->abs));
      memcpy(new_vop3->neg, cmp_vop3->neg, sizeof(new_vop3->neg));
      new_vop3->clamp = cmp_vop3->clamp;
      new_vop3->omod = cmp_vop3->omod;
      new_vop3->opsel = cmp_vop3->opsel;
      new_instr = new_vop3;
   } else {
      new_instr = create_instruction<VOPC_instruction>(new_opcode, Format::VOPC, 2, 1);
   }
   new_instr->operands[0] = cmp->operands[0];
   new_instr->operands[1] = cmp->operands[1];
   new_instr->definitions[0] = instr->definitions[0];

   /* re-label the result as a compare so later passes can combine it further */
   ctx.info[instr->definitions[0].tempId()].label = 0;
   ctx.info[instr->definitions[0].tempId()].set_fcmp(new_instr);

   instr.reset(new_instr);

   return true;
}
1720
/* op1(op2(1, 2), 0) if swap = false
 * op1(0, op2(1, 2)) if swap = true
 *
 * Tries to match a two-instruction chain that can be fused into one
 * three-operand VOP3 (e.g. min(min(a, b), c) -> min3(a, b, c)).
 * 'shuffle_str' maps the three gathered operands (op1's other operand,
 * then op2's two operands) to their positions in the fused instruction.
 * Outputs: the three operands with their neg/abs/opsel modifiers, op1's
 * clamp/omod, and (if the caller accepts them via non-NULL pointers) any
 * "inbetween" modifiers sitting on op1's input that comes from op2.
 * Returns false if the pattern or the VOP3 operand rules don't match. */
bool match_op3_for_vop3(opt_ctx &ctx, aco_opcode op1, aco_opcode op2,
                        Instruction* op1_instr, bool swap, const char *shuffle_str,
                        Operand operands[3], bool neg[3], bool abs[3], uint8_t *opsel,
                        bool *op1_clamp, uint8_t *op1_omod,
                        bool *inbetween_neg, bool *inbetween_abs, bool *inbetween_opsel)
{
   /* checks */
   if (op1_instr->opcode != op1)
      return false;

   Instruction *op2_instr = follow_operand(ctx, op1_instr->operands[swap]);
   if (!op2_instr || op2_instr->opcode != op2)
      return false;
   if (fixed_to_exec(op2_instr->operands[0]) || fixed_to_exec(op2_instr->operands[1]))
      return false;

   VOP3A_instruction *op1_vop3 = op1_instr->isVOP3() ? static_cast<VOP3A_instruction *>(op1_instr) : NULL;
   VOP3A_instruction *op2_vop3 = op2_instr->isVOP3() ? static_cast<VOP3A_instruction *>(op2_instr) : NULL;

   /* don't support inbetween clamp/omod */
   if (op2_vop3 && (op2_vop3->clamp || op2_vop3->omod))
      return false;

   /* get operands and modifiers and check inbetween modifiers */
   *op1_clamp = op1_vop3 ? op1_vop3->clamp : false;
   *op1_omod = op1_vop3 ? op1_vop3->omod : 0u;

   /* an inbetween modifier sits on the op1 input fed by op2's result; it is
    * only matchable when the caller can handle it (pointer is non-NULL) */
   if (inbetween_neg)
      *inbetween_neg = op1_vop3 ? op1_vop3->neg[swap] : false;
   else if (op1_vop3 && op1_vop3->neg[swap])
      return false;

   if (inbetween_abs)
      *inbetween_abs = op1_vop3 ? op1_vop3->abs[swap] : false;
   else if (op1_vop3 && op1_vop3->abs[swap])
      return false;

   if (inbetween_opsel)
      *inbetween_opsel = op1_vop3 ? op1_vop3->opsel & (1 << swap) : false;
   else if (op1_vop3 && op1_vop3->opsel & (1 << swap))
      return false;

   /* invert shuffle_str: shuffle[i] is the destination slot of source i */
   int shuffle[3];
   shuffle[shuffle_str[0] - '0'] = 0;
   shuffle[shuffle_str[1] - '0'] = 1;
   shuffle[shuffle_str[2] - '0'] = 2;

   /* source 0: op1's operand that does not come from op2 */
   operands[shuffle[0]] = op1_instr->operands[!swap];
   neg[shuffle[0]] = op1_vop3 ? op1_vop3->neg[!swap] : false;
   abs[shuffle[0]] = op1_vop3 ? op1_vop3->abs[!swap] : false;
   if (op1_vop3 && op1_vop3->opsel & (1 << !swap))
      *opsel |= 1 << shuffle[0];

   /* sources 1 and 2: op2's operands, with their modifiers */
   for (unsigned i = 0; i < 2; i++) {
      operands[shuffle[i + 1]] = op2_instr->operands[i];
      neg[shuffle[i + 1]] = op2_vop3 ? op2_vop3->neg[i] : false;
      abs[shuffle[i + 1]] = op2_vop3 ? op2_vop3->abs[i] : false;
      if (op2_vop3 && op2_vop3->opsel & (1 << i))
         *opsel |= 1 << shuffle[i + 1];
   }

   /* check operands */
   if (!check_vop3_operands(ctx, 3, operands))
      return false;

   return true;
}
1790
1791 void create_vop3_for_op3(opt_ctx& ctx, aco_opcode opcode, aco_ptr<Instruction>& instr,
1792 Operand operands[3], bool neg[3], bool abs[3], uint8_t opsel,
1793 bool clamp, unsigned omod)
1794 {
1795 VOP3A_instruction *new_instr = create_instruction<VOP3A_instruction>(opcode, Format::VOP3A, 3, 1);
1796 memcpy(new_instr->abs, abs, sizeof(bool[3]));
1797 memcpy(new_instr->neg, neg, sizeof(bool[3]));
1798 new_instr->clamp = clamp;
1799 new_instr->omod = omod;
1800 new_instr->opsel = opsel;
1801 new_instr->operands[0] = operands[0];
1802 new_instr->operands[1] = operands[1];
1803 new_instr->operands[2] = operands[2];
1804 new_instr->definitions[0] = instr->definitions[0];
1805 ctx.info[instr->definitions[0].tempId()].label = 0;
1806
1807 instr.reset(new_instr);
1808 }
1809
/* Tries to fuse instr with one of its operand-producing instructions into a
 * single three-operand VALU op: instr's operand 'swap' (bit 0 and/or 1 of
 * 'ops' selects which sides to try) must come from an 'op2' instruction;
 * the result is a 'new_op' with operands rearranged per 'shuffle'.
 * Preserves any omod/clamp-success labels across the replacement.
 * Returns true if the fusion was performed. */
bool combine_three_valu_op(opt_ctx& ctx, aco_ptr<Instruction>& instr, aco_opcode op2, aco_opcode new_op, const char *shuffle, uint8_t ops)
{
   /* remember labels that the replacement would otherwise wipe */
   uint32_t omod_clamp = ctx.info[instr->definitions[0].tempId()].label &
                         (label_omod_success | label_clamp_success);

   for (unsigned swap = 0; swap < 2; swap++) {
      if (!((1 << swap) & ops))
         continue;

      Operand operands[3];
      bool neg[3], abs[3], clamp;
      uint8_t opsel = 0, omod = 0;
      if (match_op3_for_vop3(ctx, instr->opcode, op2,
                             instr.get(), swap, shuffle,
                             operands, neg, abs, &opsel,
                             &clamp, &omod, NULL, NULL, NULL)) {
         /* the fused op no longer reads the intermediate result */
         ctx.uses[instr->operands[swap].tempId()]--;
         create_vop3_for_op3(ctx, new_op, instr, operands, neg, abs, opsel, clamp, omod);
         /* restore the omod/clamp labels, now pointing at the new instruction */
         if (omod_clamp & label_omod_success)
            ctx.info[instr->definitions[0].tempId()].set_omod_success(instr.get());
         if (omod_clamp & label_clamp_success)
            ctx.info[instr->definitions[0].tempId()].set_clamp_success(instr.get());
         return true;
      }
   }
   return false;
}
1837
/* Combines a min/max with a nested min/max into the three-operand form:
 *   min(min(a, b), c) -> min3(a, b, c)  (same-direction nesting), or
 *   min(-max(a, b), c) -> min3(-a, -b, c)  (opposite direction through an
 * inbetween negate, which distributes onto both inner operands).
 * 'opposite' is the opposite-direction opcode, 'minmax3' the fused one. */
bool combine_minmax(opt_ctx& ctx, aco_ptr<Instruction>& instr, aco_opcode opposite, aco_opcode minmax3)
{
   /* same-direction case: try both operand sides */
   if (combine_three_valu_op(ctx, instr, instr->opcode, minmax3, "012", 1 | 2))
      return true;

   /* remember labels that the replacement would otherwise wipe */
   uint32_t omod_clamp = ctx.info[instr->definitions[0].tempId()].label &
                         (label_omod_success | label_clamp_success);

   /* min(-max(a, b), c) -> min3(-a, -b, c) *
    * max(-min(a, b), c) -> max3(-a, -b, c) */
   for (unsigned swap = 0; swap < 2; swap++) {
      Operand operands[3];
      bool neg[3], abs[3], clamp;
      uint8_t opsel = 0, omod = 0;
      bool inbetween_neg;
      if (match_op3_for_vop3(ctx, instr->opcode, opposite,
                             instr.get(), swap, "012",
                             operands, neg, abs, &opsel,
                             &clamp, &omod, &inbetween_neg, NULL, NULL) &&
          inbetween_neg) {
         /* the fused op no longer reads the intermediate result */
         ctx.uses[instr->operands[swap].tempId()]--;
         /* push the inbetween negate onto the two inner operands (slots 1, 2
          * after the "012" shuffle) */
         neg[1] = true;
         neg[2] = true;
         create_vop3_for_op3(ctx, minmax3, instr, operands, neg, abs, opsel, clamp, omod);
         /* restore the omod/clamp labels, now pointing at the new instruction */
         if (omod_clamp & label_omod_success)
            ctx.info[instr->definitions[0].tempId()].set_omod_success(instr.get());
         if (omod_clamp & label_clamp_success)
            ctx.info[instr->definitions[0].tempId()].set_clamp_success(instr.get());
         return true;
      }
   }
   return false;
}
1871
/* s_not_b32(s_and_b32(a, b)) -> s_nand_b32(a, b)
 * s_not_b32(s_or_b32(a, b)) -> s_nor_b32(a, b)
 * s_not_b32(s_xor_b32(a, b)) -> s_xnor_b32(a, b)
 * s_not_b64(s_and_b64(a, b)) -> s_nand_b64(a, b)
 * s_not_b64(s_or_b64(a, b)) -> s_nor_b64(a, b)
 * s_not_b64(s_xor_b64(a, b)) -> s_xnor_b64(a, b)
 *
 * Returns true iff the combination was performed. */
bool combine_salu_not_bitwise(opt_ctx& ctx, aco_ptr<Instruction>& instr)
{
   /* checks */
   if (!instr->operands[0].isTemp())
      return false;
   /* bail out if the s_not's second (SCC) definition is still needed: after
    * combining, SCC would come from the fused instruction instead */
   if (instr->definitions[1].isTemp() && ctx.uses[instr->definitions[1].tempId()])
      return false;

   /* the operand must be directly produced by a bitwise SALU instruction */
   Instruction *op2_instr = follow_operand(ctx, instr->operands[0]);
   if (!op2_instr)
      return false;
   switch (op2_instr->opcode) {
   case aco_opcode::s_and_b32:
   case aco_opcode::s_or_b32:
   case aco_opcode::s_xor_b32:
   case aco_opcode::s_and_b64:
   case aco_opcode::s_or_b64:
   case aco_opcode::s_xor_b64:
      break;
   default:
      return false;
   }

   /* create instruction */
   /* Move the result (and SCC) definitions onto the inner instruction, which
    * is then rewritten to the fused negated opcode. The s_not is left
    * defining the inner temp, whose use count is decremented below so dead
    * code elimination (pass 3) can remove it. */
   std::swap(instr->definitions[0], op2_instr->definitions[0]);
   std::swap(instr->definitions[1], op2_instr->definitions[1]);
   ctx.uses[instr->operands[0].tempId()]--;
   /* the rewritten instruction computes a different value: drop stale labels */
   ctx.info[op2_instr->definitions[0].tempId()].label = 0;

   /* rewrite the bitwise op to its negated form */
   switch (op2_instr->opcode) {
   case aco_opcode::s_and_b32:
      op2_instr->opcode = aco_opcode::s_nand_b32;
      break;
   case aco_opcode::s_or_b32:
      op2_instr->opcode = aco_opcode::s_nor_b32;
      break;
   case aco_opcode::s_xor_b32:
      op2_instr->opcode = aco_opcode::s_xnor_b32;
      break;
   case aco_opcode::s_and_b64:
      op2_instr->opcode = aco_opcode::s_nand_b64;
      break;
   case aco_opcode::s_or_b64:
      op2_instr->opcode = aco_opcode::s_nor_b64;
      break;
   case aco_opcode::s_xor_b64:
      op2_instr->opcode = aco_opcode::s_xnor_b64;
      break;
   default:
      break;
   }

   return true;
}
1932
/* s_and_b32(a, s_not_b32(b)) -> s_andn2_b32(a, b)
 * s_or_b32(a, s_not_b32(b)) -> s_orn2_b32(a, b)
 * s_and_b64(a, s_not_b64(b)) -> s_andn2_b64(a, b)
 * s_or_b64(a, s_not_b64(b)) -> s_orn2_b64(a, b)
 *
 * Returns true iff the combination was performed. */
bool combine_salu_n2(opt_ctx& ctx, aco_ptr<Instruction>& instr)
{
   /* don't rewrite results that are labelled as uniform booleans */
   if (instr->definitions[0].isTemp() && ctx.info[instr->definitions[0].tempId()].is_uniform_bool())
      return false;

   /* look for an s_not producing either operand */
   for (unsigned i = 0; i < 2; i++) {
      Instruction *op2_instr = follow_operand(ctx, instr->operands[i]);
      if (!op2_instr || (op2_instr->opcode != aco_opcode::s_not_b32 && op2_instr->opcode != aco_opcode::s_not_b64))
         continue;
      /* skip if the s_not's second (SCC) definition is still used, or if its
       * source is fixed to exec (it could not be moved into source slot 1) */
      if (ctx.uses[op2_instr->definitions[1].tempId()] || fixed_to_exec(op2_instr->operands[0]))
         continue;

      /* two *different* literals cannot both be encoded in one instruction */
      if (instr->operands[!i].isLiteral() && op2_instr->operands[0].isLiteral() &&
          instr->operands[!i].constantValue() != op2_instr->operands[0].constantValue())
         continue;

      ctx.uses[instr->operands[i].tempId()]--;
      /* operand 0: the untouched source; operand 1: the value the *n2
       * instruction negates. Order matters when i == 0. */
      instr->operands[0] = instr->operands[!i];
      instr->operands[1] = op2_instr->operands[0];
      ctx.info[instr->definitions[0].tempId()].label = 0;

      switch (instr->opcode) {
      case aco_opcode::s_and_b32:
         instr->opcode = aco_opcode::s_andn2_b32;
         break;
      case aco_opcode::s_or_b32:
         instr->opcode = aco_opcode::s_orn2_b32;
         break;
      case aco_opcode::s_and_b64:
         instr->opcode = aco_opcode::s_andn2_b64;
         break;
      case aco_opcode::s_or_b64:
         instr->opcode = aco_opcode::s_orn2_b64;
         break;
      default:
         break;
      }

      return true;
   }
   return false;
}
1979
1980 /* s_add_{i32,u32}(a, s_lshl_b32(b, <n>)) -> s_lshl<n>_add_u32(a, b) */
1981 bool combine_salu_lshl_add(opt_ctx& ctx, aco_ptr<Instruction>& instr)
1982 {
1983 if (instr->opcode == aco_opcode::s_add_i32 && ctx.uses[instr->definitions[1].tempId()])
1984 return false;
1985
1986 for (unsigned i = 0; i < 2; i++) {
1987 Instruction *op2_instr = follow_operand(ctx, instr->operands[i]);
1988 if (!op2_instr || op2_instr->opcode != aco_opcode::s_lshl_b32 ||
1989 ctx.uses[op2_instr->definitions[1].tempId()])
1990 continue;
1991 if (!op2_instr->operands[1].isConstant() || fixed_to_exec(op2_instr->operands[0]))
1992 continue;
1993
1994 uint32_t shift = op2_instr->operands[1].constantValue();
1995 if (shift < 1 || shift > 4)
1996 continue;
1997
1998 if (instr->operands[!i].isLiteral() && op2_instr->operands[0].isLiteral() &&
1999 instr->operands[!i].constantValue() != op2_instr->operands[0].constantValue())
2000 continue;
2001
2002 ctx.uses[instr->operands[i].tempId()]--;
2003 instr->operands[1] = instr->operands[!i];
2004 instr->operands[0] = op2_instr->operands[0];
2005 ctx.info[instr->definitions[0].tempId()].label = 0;
2006
2007 instr->opcode = ((aco_opcode[]){aco_opcode::s_lshl1_add_u32,
2008 aco_opcode::s_lshl2_add_u32,
2009 aco_opcode::s_lshl3_add_u32,
2010 aco_opcode::s_lshl4_add_u32})[shift - 1];
2011
2012 return true;
2013 }
2014 return false;
2015 }
2016
/* Folds a boolean-to-integer conversion (b2i label) operand into a
 * carry-consuming add/sub, e.g. v_add_u32(a, b2i(c)) becomes
 * v_addc_co_u32(0, a) with c as the carry-in (operand 2).
 *
 * new_op: the carry-in opcode to emit
 * ops:    bitmask of which of instr's first two operand positions may hold
 *         the b2i value (bit 0 -> operand 0, bit 1 -> operand 1)
 *
 * Returns true iff the combination was performed (instr is replaced). */
bool combine_add_sub_b2i(opt_ctx& ctx, aco_ptr<Instruction>& instr, aco_opcode new_op, uint8_t ops)
{
   /* input/output modifiers cannot be carried over to the new instruction */
   if (instr->usesModifiers())
      return false;

   for (unsigned i = 0; i < 2; i++) {
      if (!((1 << i) & ops))
         continue;
      if (instr->operands[i].isTemp() &&
          ctx.info[instr->operands[i].tempId()].is_b2i() &&
          ctx.uses[instr->operands[i].tempId()] == 1) {

         aco_ptr<Instruction> new_instr;
         if (instr->operands[!i].isTemp() && instr->operands[!i].getTemp().type() == RegType::vgpr) {
            /* a VGPR operand fits the plain VOP2 encoding */
            new_instr.reset(create_instruction<VOP2_instruction>(new_op, Format::VOP2, 3, 2));
         } else if (ctx.program->chip_class >= GFX10 ||
                    (instr->operands[!i].isConstant() && !instr->operands[!i].isLiteral())) {
            /* otherwise use the VOP3 form; before GFX10 that rules out literals */
            new_instr.reset(create_instruction<VOP3A_instruction>(new_op, asVOP3(Format::VOP2), 3, 2));
         } else {
            return false;
         }
         ctx.uses[instr->operands[i].tempId()]--;
         new_instr->definitions[0] = instr->definitions[0];
         /* carry-out: reuse instr's second definition if it has one, otherwise
          * allocate a fresh (unused) lane-mask temp */
         new_instr->definitions[1] = instr->definitions.size() == 2 ? instr->definitions[1] :
                                     Definition(ctx.program->allocateId(), ctx.program->lane_mask);
         new_instr->definitions[1].setHint(vcc);
         new_instr->operands[0] = Operand(0u);
         new_instr->operands[1] = instr->operands[!i];
         /* the boolean becomes the carry-in */
         new_instr->operands[2] = Operand(ctx.info[instr->operands[i].tempId()].temp);
         instr = std::move(new_instr);
         ctx.info[instr->definitions[0].tempId()].label = 0;
         return true;
      }
   }

   return false;
}
2054
2055 bool get_minmax_info(aco_opcode op, aco_opcode *min, aco_opcode *max, aco_opcode *min3, aco_opcode *max3, aco_opcode *med3, bool *some_gfx9_only)
2056 {
2057 switch (op) {
2058 #define MINMAX(type, gfx9) \
2059 case aco_opcode::v_min_##type:\
2060 case aco_opcode::v_max_##type:\
2061 case aco_opcode::v_med3_##type:\
2062 *min = aco_opcode::v_min_##type;\
2063 *max = aco_opcode::v_max_##type;\
2064 *med3 = aco_opcode::v_med3_##type;\
2065 *min3 = aco_opcode::v_min3_##type;\
2066 *max3 = aco_opcode::v_max3_##type;\
2067 *some_gfx9_only = gfx9;\
2068 return true;
2069 MINMAX(f32, false)
2070 MINMAX(u32, false)
2071 MINMAX(i32, false)
2072 MINMAX(f16, true)
2073 MINMAX(u16, true)
2074 MINMAX(i16, true)
2075 #undef MINMAX
2076 default:
2077 return false;
2078 }
2079 }
2080
/* v_min_{f,u,i}{16,32}(v_max_{f,u,i}{16,32}(a, lb), ub) -> v_med3_{f,u,i}{16,32}(a, lb, ub) when ub > lb
 * v_max_{f,u,i}{16,32}(v_min_{f,u,i}{16,32}(a, ub), lb) -> v_med3_{f,u,i}{16,32}(a, lb, ub) when ub > lb
 *
 * Both bounds have to be known constants; the pattern is only combined when
 * their ordering proves the pair really is a clamp. */
bool combine_clamp(opt_ctx& ctx, aco_ptr<Instruction>& instr,
                   aco_opcode min, aco_opcode max, aco_opcode med)
{
   /* TODO: GLSL's clamp(x, minVal, maxVal) and SPIR-V's
    * FClamp(x, minVal, maxVal)/NClamp(x, minVal, maxVal) are undefined if
    * minVal > maxVal, which means we can always select it to a v_med3_f32 */
   aco_opcode other_op;
   if (instr->opcode == min)
      other_op = max;
   else if (instr->opcode == max)
      other_op = min;
   else
      return false;

   /* save the omod/clamp success labels so they can be restored after
    * create_vop3_for_op3() has cleared the definition's label */
   uint32_t omod_clamp = ctx.info[instr->definitions[0].tempId()].label &
                         (label_omod_success | label_clamp_success);

   for (unsigned swap = 0; swap < 2; swap++) {
      Operand operands[3];
      bool neg[3], abs[3], clamp;
      uint8_t opsel = 0, omod = 0;
      if (match_op3_for_vop3(ctx, instr->opcode, other_op, instr.get(), swap,
                             "012", operands, neg, abs, &opsel,
                             &clamp, &omod, NULL, NULL, NULL)) {
         /* find the two constant operands (the clamp bounds) */
         int const0_idx = -1, const1_idx = -1;
         uint32_t const0 = 0, const1 = 0;
         for (int i = 0; i < 3; i++) {
            uint32_t val;
            if (operands[i].isConstant()) {
               val = operands[i].constantValue();
            } else if (operands[i].isTemp() && ctx.info[operands[i].tempId()].is_constant_or_literal()) {
               val = ctx.info[operands[i].tempId()].val;
            } else {
               continue;
            }
            if (const0_idx >= 0) {
               const1_idx = i;
               const1 = val;
            } else {
               const0_idx = i;
               const0 = val;
            }
         }
         if (const0_idx < 0 || const1_idx < 0)
            continue;

         /* opsel selects the high 16 bits of an operand */
         if (opsel & (1 << const0_idx))
            const0 >>= 16;
         if (opsel & (1 << const1_idx))
            const1 >>= 16;

         /* determine which constant is the smaller bound, interpreting the
          * raw bits according to the min/max type */
         int lower_idx = const0_idx;
         switch (min) {
         case aco_opcode::v_min_f32:
         case aco_opcode::v_min_f16: {
            float const0_f, const1_f;
            if (min == aco_opcode::v_min_f32) {
               memcpy(&const0_f, &const0, 4);
               memcpy(&const1_f, &const1, 4);
            } else {
               const0_f = _mesa_half_to_float(const0);
               const1_f = _mesa_half_to_float(const1);
            }
            /* apply the input modifiers before comparing */
            if (abs[const0_idx]) const0_f = fabsf(const0_f);
            if (abs[const1_idx]) const1_f = fabsf(const1_f);
            if (neg[const0_idx]) const0_f = -const0_f;
            if (neg[const1_idx]) const1_f = -const1_f;
            lower_idx = const0_f < const1_f ? const0_idx : const1_idx;
            break;
         }
         case aco_opcode::v_min_u32: {
            lower_idx = const0 < const1 ? const0_idx : const1_idx;
            break;
         }
         case aco_opcode::v_min_u16: {
            lower_idx = (uint16_t)const0 < (uint16_t)const1 ? const0_idx : const1_idx;
            break;
         }
         case aco_opcode::v_min_i32: {
            /* reconstruct the signed value manually instead of relying on
             * implementation-defined unsigned->signed conversion */
            int32_t const0_i = const0 & 0x80000000u ? -2147483648 + (int32_t)(const0 & 0x7fffffffu) : const0;
            int32_t const1_i = const1 & 0x80000000u ? -2147483648 + (int32_t)(const1 & 0x7fffffffu) : const1;
            lower_idx = const0_i < const1_i ? const0_idx : const1_idx;
            break;
         }
         case aco_opcode::v_min_i16: {
            int16_t const0_i = const0 & 0x8000u ? -32768 + (int16_t)(const0 & 0x7fffu) : const0;
            int16_t const1_i = const1 & 0x8000u ? -32768 + (int16_t)(const1 & 0x7fffu) : const1;
            lower_idx = const0_i < const1_i ? const0_idx : const1_idx;
            break;
         }
         default:
            break;
         }
         int upper_idx = lower_idx == const0_idx ? const1_idx : const0_idx;

         /* the bound applied by the outer instruction (operand index 0 with
          * the "012" shuffle) must be the upper bound for a min, and the
          * lower bound for a max — otherwise this is not a clamp */
         if (instr->opcode == min) {
            if (upper_idx != 0 || lower_idx == 0)
               return false;
         } else {
            if (upper_idx == 0 || lower_idx != 0)
               return false;
         }

         ctx.uses[instr->operands[swap].tempId()]--;
         create_vop3_for_op3(ctx, med, instr, operands, neg, abs, opsel, clamp, omod);
         /* restore the labels cleared by create_vop3_for_op3() */
         if (omod_clamp & label_omod_success)
            ctx.info[instr->definitions[0].tempId()].set_omod_success(instr.get());
         if (omod_clamp & label_clamp_success)
            ctx.info[instr->definitions[0].tempId()].set_clamp_success(instr.get());

         return true;
      }
   }

   return false;
}
2199
2200
2201 void apply_sgprs(opt_ctx &ctx, aco_ptr<Instruction>& instr)
2202 {
2203 bool is_shift64 = instr->opcode == aco_opcode::v_lshlrev_b64 ||
2204 instr->opcode == aco_opcode::v_lshrrev_b64 ||
2205 instr->opcode == aco_opcode::v_ashrrev_i64;
2206
2207 /* find candidates and create the set of sgprs already read */
2208 unsigned sgpr_ids[2] = {0, 0};
2209 uint32_t operand_mask = 0;
2210 bool has_literal = false;
2211 for (unsigned i = 0; i < instr->operands.size(); i++) {
2212 if (instr->operands[i].isLiteral())
2213 has_literal = true;
2214 if (!instr->operands[i].isTemp())
2215 continue;
2216 if (instr->operands[i].getTemp().type() == RegType::sgpr) {
2217 if (instr->operands[i].tempId() != sgpr_ids[0])
2218 sgpr_ids[!!sgpr_ids[0]] = instr->operands[i].tempId();
2219 }
2220 ssa_info& info = ctx.info[instr->operands[i].tempId()];
2221 if (info.is_temp() && info.temp.type() == RegType::sgpr)
2222 operand_mask |= 1u << i;
2223 }
2224 unsigned max_sgprs = 1;
2225 if (ctx.program->chip_class >= GFX10 && !is_shift64)
2226 max_sgprs = 2;
2227 if (has_literal)
2228 max_sgprs--;
2229
2230 unsigned num_sgprs = !!sgpr_ids[0] + !!sgpr_ids[1];
2231
2232 /* keep on applying sgprs until there is nothing left to be done */
2233 while (operand_mask) {
2234 uint32_t sgpr_idx = 0;
2235 uint32_t sgpr_info_id = 0;
2236 uint32_t mask = operand_mask;
2237 /* choose a sgpr */
2238 while (mask) {
2239 unsigned i = u_bit_scan(&mask);
2240 uint16_t uses = ctx.uses[instr->operands[i].tempId()];
2241 if (sgpr_info_id == 0 || uses < ctx.uses[sgpr_info_id]) {
2242 sgpr_idx = i;
2243 sgpr_info_id = instr->operands[i].tempId();
2244 }
2245 }
2246 operand_mask &= ~(1u << sgpr_idx);
2247
2248 /* Applying two sgprs require making it VOP3, so don't do it unless it's
2249 * definitively beneficial.
2250 * TODO: this is too conservative because later the use count could be reduced to 1 */
2251 if (num_sgprs && ctx.uses[sgpr_info_id] > 1 && !instr->isVOP3())
2252 break;
2253
2254 Temp sgpr = ctx.info[sgpr_info_id].temp;
2255 bool new_sgpr = sgpr.id() != sgpr_ids[0] && sgpr.id() != sgpr_ids[1];
2256 if (new_sgpr && num_sgprs >= max_sgprs)
2257 continue;
2258
2259 if (sgpr_idx == 0 || instr->isVOP3()) {
2260 instr->operands[sgpr_idx] = Operand(sgpr);
2261 } else if (can_swap_operands(instr)) {
2262 instr->operands[sgpr_idx] = instr->operands[0];
2263 instr->operands[0] = Operand(sgpr);
2264 /* swap bits using a 4-entry LUT */
2265 uint32_t swapped = (0x3120 >> (operand_mask & 0x3)) & 0xf;
2266 operand_mask = (operand_mask & ~0x3) | swapped;
2267 } else if (can_use_VOP3(ctx, instr)) {
2268 to_VOP3(ctx, instr);
2269 instr->operands[sgpr_idx] = Operand(sgpr);
2270 } else {
2271 continue;
2272 }
2273
2274 if (new_sgpr)
2275 sgpr_ids[num_sgprs++] = sgpr.id();
2276 ctx.uses[sgpr_info_id]--;
2277 ctx.uses[sgpr.id()]++;
2278 }
2279 }
2280
/* Handles omod (output multiplier) and clamp folding:
 * - if this v_mul/v_med3 matches a modifier that was already applied to the
 *   instruction producing its operand, redirect that instruction's definition
 *   to this one's and leave this instruction dead (returns true)
 * - otherwise, if this instruction's single use is a labelled omod/clamp
 *   consumer, apply the modifier to this instruction directly.
 * Returns true iff instr was made dead and must not be processed further. */
bool apply_omod_clamp(opt_ctx &ctx, Block& block, aco_ptr<Instruction>& instr)
{
   /* check if we could apply omod on predecessor */
   if (instr->opcode == aco_opcode::v_mul_f32 || instr->opcode == aco_opcode::v_mul_f16) {
      bool op0 = instr->operands[0].isTemp() && ctx.info[instr->operands[0].tempId()].is_omod_success();
      bool op1 = instr->operands[1].isTemp() && ctx.info[instr->operands[1].tempId()].is_omod_success();
      if (op0 || op1) {
         unsigned idx = op0 ? 0 : 1;
         /* omod was successfully applied */
         /* if the omod instruction is v_mad, we also have to change the original add */
         if (ctx.info[instr->operands[idx].tempId()].is_mad()) {
            Instruction* add_instr = ctx.mad_infos[ctx.info[instr->operands[idx].tempId()].val].add_instr.get();
            if (ctx.info[instr->definitions[0].tempId()].is_clamp())
               static_cast<VOP3A_instruction*>(add_instr)->clamp = true;
            add_instr->definitions[0] = instr->definitions[0];
         }

         Instruction* omod_instr = ctx.info[instr->operands[idx].tempId()].instr;
         /* check if we have an additional clamp modifier */
         if (ctx.info[instr->definitions[0].tempId()].is_clamp() && ctx.uses[instr->definitions[0].tempId()] == 1 &&
             ctx.uses[ctx.info[instr->definitions[0].tempId()].temp.id()]) {
            static_cast<VOP3A_instruction*>(omod_instr)->clamp = true;
            ctx.info[instr->definitions[0].tempId()].set_clamp_success(omod_instr);
         }
         /* change definition ssa-id of modified instruction */
         omod_instr->definitions[0] = instr->definitions[0];

         /* change the definition of instr to something unused, e.g. the original omod def */
         instr->definitions[0] = Definition(instr->operands[idx].getTemp());
         ctx.uses[instr->definitions[0].tempId()] = 0;
         return true;
      }
      if (!ctx.info[instr->definitions[0].tempId()].label) {
         /* in all other cases, label this instruction as option for multiply-add */
         ctx.info[instr->definitions[0].tempId()].set_mul(instr.get());
      }
   }

   /* check if we could apply clamp on predecessor */
   if (instr->opcode == aco_opcode::v_med3_f32 || instr->opcode == aco_opcode::v_med3_f16) {
      bool is_fp16 = instr->opcode == aco_opcode::v_med3_f16;
      unsigned idx = 0;
      bool found_zero = false, found_one = false;
      /* a med3 with the constants 0 and 1.0 clamps the remaining operand */
      for (unsigned i = 0; i < 3; i++)
      {
         if (instr->operands[i].constantEquals(0))
            found_zero = true;
         else if (instr->operands[i].constantEquals(is_fp16 ? 0x3c00 : 0x3f800000)) /* 1.0 */
            found_one = true;
         else
            idx = i;
      }
      if (found_zero && found_one && instr->operands[idx].isTemp() &&
          ctx.info[instr->operands[idx].tempId()].is_clamp_success()) {
         /* clamp was successfully applied */
         /* if the clamp instruction is v_mad, we also have to change the original add */
         if (ctx.info[instr->operands[idx].tempId()].is_mad()) {
            Instruction* add_instr = ctx.mad_infos[ctx.info[instr->operands[idx].tempId()].val].add_instr.get();
            add_instr->definitions[0] = instr->definitions[0];
         }
         Instruction* clamp_instr = ctx.info[instr->operands[idx].tempId()].instr;
         /* change definition ssa-id of modified instruction */
         clamp_instr->definitions[0] = instr->definitions[0];

         /* change the definition of instr to something unused, e.g. the original omod def */
         instr->definitions[0] = Definition(instr->operands[idx].getTemp());
         ctx.uses[instr->definitions[0].tempId()] = 0;
         return true;
      }
   }

   /* omod has no effect if denormals are enabled */
   /* apply omod / clamp modifiers if the def is used only once and the instruction can have modifiers */
   if (!instr->definitions.empty() && ctx.uses[instr->definitions[0].tempId()] == 1 &&
       can_use_VOP3(ctx, instr) && instr_info.can_use_output_modifiers[(int)instr->opcode]) {
      bool can_use_omod = (instr->definitions[0].bytes() == 4 ? block.fp_mode.denorm32 : block.fp_mode.denorm16_64) == 0;
      ssa_info& def_info = ctx.info[instr->definitions[0].tempId()];
      /* VOP3 omod encoding: 1 = *2, 2 = *4, 3 = *0.5 (matching the
       * omod2/omod4/omod5 labels) */
      if (can_use_omod && def_info.is_omod2() && ctx.uses[def_info.temp.id()]) {
         to_VOP3(ctx, instr);
         static_cast<VOP3A_instruction*>(instr.get())->omod = 1;
         def_info.set_omod_success(instr.get());
      } else if (can_use_omod && def_info.is_omod4() && ctx.uses[def_info.temp.id()]) {
         to_VOP3(ctx, instr);
         static_cast<VOP3A_instruction*>(instr.get())->omod = 2;
         def_info.set_omod_success(instr.get());
      } else if (can_use_omod && def_info.is_omod5() && ctx.uses[def_info.temp.id()]) {
         to_VOP3(ctx, instr);
         static_cast<VOP3A_instruction*>(instr.get())->omod = 3;
         def_info.set_omod_success(instr.get());
      } else if (def_info.is_clamp() && ctx.uses[def_info.temp.id()]) {
         to_VOP3(ctx, instr);
         static_cast<VOP3A_instruction*>(instr.get())->clamp = true;
         def_info.set_clamp_success(instr.get());
      }
   }

   return false;
}
2379
// TODO: we could possibly move the whole label_instruction pass to combine_instruction:
// this would mean that we'd have to fix the instruction uses while doing value propagation
2382
/* Second optimization pass (see the overview at the top of the file): tries
 * to combine the given instruction with the instructions producing its
 * operands (mad/fma fusion, min3/max3/med3, add3/or3, fused SALU forms, ...)
 * and applies the sgpr/omod/clamp information collected in the first pass.
 * ctx.uses is kept up to date so later blocks see correct use counts. */
void combine_instruction(opt_ctx &ctx, Block& block, aco_ptr<Instruction>& instr)
{
   if (instr->definitions.empty() || is_dead(ctx.uses, instr.get()))
      return;

   if (instr->isVALU()) {
      if (can_apply_sgprs(instr))
         apply_sgprs(ctx, instr);
      /* if the omod/clamp folding made this instruction dead, stop here */
      if (apply_omod_clamp(ctx, block, instr))
         return;
   }

   if (ctx.info[instr->definitions[0].tempId()].is_vcc_hint()) {
      instr->definitions[0].setHint(vcc);
   }

   /* TODO: There are still some peephole optimizations that could be done:
    * - abs(a - b) -> s_absdiff_i32
    * - various patterns for s_bitcmp{0,1}_b32 and s_bitset{0,1}_b32
    * - patterns for v_alignbit_b32 and v_alignbyte_b32
    * These aren't probably too interesting though.
    * There are also patterns for v_cmp_class_f{16,32,64}. This is difficult but
    * probably more useful than the previously mentioned optimizations.
    * The various comparison optimizations also currently only work with 32-bit
    * floats. */

   /* neg(mul(a, b)) -> mul(neg(a), b) */
   if (ctx.info[instr->definitions[0].tempId()].is_neg() && ctx.uses[instr->operands[1].tempId()] == 1) {
      Temp val = ctx.info[instr->definitions[0].tempId()].temp;

      if (!ctx.info[val.id()].is_mul())
         return;

      Instruction* mul_instr = ctx.info[val.id()].instr;

      if (mul_instr->operands[0].isLiteral())
         return;
      if (mul_instr->isVOP3() && static_cast<VOP3A_instruction*>(mul_instr)->clamp)
         return;

      /* convert to mul(neg(a), b) */
      ctx.uses[mul_instr->definitions[0].tempId()]--;
      Definition def = instr->definitions[0];
      /* neg(abs(mul(a, b))) -> mul(neg(abs(a)), abs(b)) */
      bool is_abs = ctx.info[instr->definitions[0].tempId()].is_abs();
      instr.reset(create_instruction<VOP3A_instruction>(mul_instr->opcode, asVOP3(Format::VOP2), 2, 1));
      instr->operands[0] = mul_instr->operands[0];
      instr->operands[1] = mul_instr->operands[1];
      instr->definitions[0] = def;
      VOP3A_instruction* new_mul = static_cast<VOP3A_instruction*>(instr.get());
      if (mul_instr->isVOP3()) {
         VOP3A_instruction* mul = static_cast<VOP3A_instruction*>(mul_instr);
         /* an abs modifier on the result overrides the sources' neg/abs */
         new_mul->neg[0] = mul->neg[0] && !is_abs;
         new_mul->neg[1] = mul->neg[1] && !is_abs;
         new_mul->abs[0] = mul->abs[0] || is_abs;
         new_mul->abs[1] = mul->abs[1] || is_abs;
         new_mul->omod = mul->omod;
      }
      /* push the negation onto the first source */
      new_mul->neg[0] ^= true;
      new_mul->clamp = false;

      ctx.info[instr->definitions[0].tempId()].set_mul(instr.get());
      return;
   }

   /* combine mul+add -> mad */
   bool mad32 = instr->opcode == aco_opcode::v_add_f32 ||
                instr->opcode == aco_opcode::v_sub_f32 ||
                instr->opcode == aco_opcode::v_subrev_f32;
   bool mad16 = instr->opcode == aco_opcode::v_add_f16 ||
                instr->opcode == aco_opcode::v_sub_f16 ||
                instr->opcode == aco_opcode::v_subrev_f16;
   if (mad16 || mad32) {
      /* fma is required when denormals must be preserved and, for 16-bit,
       * on GFX10+ (NOTE(review): presumably because v_mad flushes denormals
       * / is unavailable there — confirm against the ISA docs) */
      bool need_fma = mad32 ? block.fp_mode.denorm32 != 0 :
                              (block.fp_mode.denorm16_64 != 0 || ctx.program->chip_class >= GFX10);
      if (need_fma && instr->definitions[0].isPrecise())
         return;
      if (need_fma && mad32 && !ctx.program->has_fast_fma32)
         return;

      uint32_t uses_src0 = UINT32_MAX;
      uint32_t uses_src1 = UINT32_MAX;
      Instruction* mul_instr = nullptr;
      unsigned add_op_idx;
      /* check if any of the operands is a multiplication */
      ssa_info *op0_info = instr->operands[0].isTemp() ? &ctx.info[instr->operands[0].tempId()] : NULL;
      ssa_info *op1_info = instr->operands[1].isTemp() ? &ctx.info[instr->operands[1].tempId()] : NULL;
      if (op0_info && op0_info->is_mul() && (!need_fma || !op0_info->instr->definitions[0].isPrecise()))
         uses_src0 = ctx.uses[instr->operands[0].tempId()];
      if (op1_info && op1_info->is_mul() && (!need_fma || !op1_info->instr->definitions[0].isPrecise()))
         uses_src1 = ctx.uses[instr->operands[1].tempId()];

      /* find the 'best' mul instruction to combine with the add:
       * prefer the one whose result has fewer remaining uses */
      if (uses_src0 < uses_src1) {
         mul_instr = op0_info->instr;
         add_op_idx = 1;
      } else if (uses_src1 < uses_src0) {
         mul_instr = op1_info->instr;
         add_op_idx = 0;
      } else if (uses_src0 != UINT32_MAX) {
         /* tiebreaker: quite random what to pick */
         if (op0_info->instr->operands[0].isLiteral()) {
            mul_instr = op1_info->instr;
            add_op_idx = 0;
         } else {
            mul_instr = op0_info->instr;
            add_op_idx = 1;
         }
      }
      if (mul_instr) {
         Operand op[3] = {Operand(v1), Operand(v1), Operand(v1)};
         bool neg[3] = {false, false, false};
         bool abs[3] = {false, false, false};
         unsigned omod = 0;
         bool clamp = false;
         op[0] = mul_instr->operands[0];
         op[1] = mul_instr->operands[1];
         op[2] = instr->operands[add_op_idx];
         // TODO: would be better to check this before selecting a mul instr?
         if (!check_vop3_operands(ctx, 3, op))
            return;

         /* carry over the mul's input modifiers */
         if (mul_instr->isVOP3()) {
            VOP3A_instruction* vop3 = static_cast<VOP3A_instruction*> (mul_instr);
            neg[0] = vop3->neg[0];
            neg[1] = vop3->neg[1];
            abs[0] = vop3->abs[0];
            abs[1] = vop3->abs[1];
            /* we cannot use these modifiers between mul and add */
            if (vop3->clamp || vop3->omod)
               return;
         }

         /* convert to mad */
         ctx.uses[mul_instr->definitions[0].tempId()]--;
         /* if the mul result is still used elsewhere, its sources stay live */
         if (ctx.uses[mul_instr->definitions[0].tempId()]) {
            if (op[0].isTemp())
               ctx.uses[op[0].tempId()]++;
            if (op[1].isTemp())
               ctx.uses[op[1].tempId()]++;
         }

         if (instr->isVOP3()) {
            VOP3A_instruction* vop3 = static_cast<VOP3A_instruction*> (instr.get());
            neg[2] = vop3->neg[add_op_idx];
            abs[2] = vop3->abs[add_op_idx];
            omod = vop3->omod;
            clamp = vop3->clamp;
            /* abs of the multiplication result */
            if (vop3->abs[1 - add_op_idx]) {
               neg[0] = false;
               neg[1] = false;
               abs[0] = true;
               abs[1] = true;
            }
            /* neg of the multiplication result */
            neg[1] = neg[1] ^ vop3->neg[1 - add_op_idx];
         }
         /* fold the subtraction into a neg input modifier */
         if (instr->opcode == aco_opcode::v_sub_f32 || instr->opcode == aco_opcode::v_sub_f16)
            neg[1 + add_op_idx] = neg[1 + add_op_idx] ^ true;
         else if (instr->opcode == aco_opcode::v_subrev_f32 || instr->opcode == aco_opcode::v_subrev_f16)
            neg[2 - add_op_idx] = neg[2 - add_op_idx] ^ true;

         aco_opcode mad_op = need_fma ? aco_opcode::v_fma_f32 : aco_opcode::v_mad_f32;
         if (mad16)
            mad_op = need_fma ? (ctx.program->chip_class == GFX8 ? aco_opcode::v_fma_legacy_f16 : aco_opcode::v_fma_f16) :
                                (ctx.program->chip_class == GFX8 ? aco_opcode::v_mad_legacy_f16 : aco_opcode::v_mad_f16);

         aco_ptr<VOP3A_instruction> mad{create_instruction<VOP3A_instruction>(mad_op, Format::VOP3A, 3, 1)};
         for (unsigned i = 0; i < 3; i++)
         {
            mad->operands[i] = op[i];
            mad->neg[i] = neg[i];
            mad->abs[i] = abs[i];
         }
         mad->omod = omod;
         mad->clamp = clamp;
         mad->definitions[0] = instr->definitions[0];

         /* mark this ssa_def to be re-checked for profitability and literals */
         ctx.mad_infos.emplace_back(std::move(instr), mul_instr->definitions[0].tempId());
         ctx.info[mad->definitions[0].tempId()].set_mad(mad.get(), ctx.mad_infos.size() - 1);
         instr.reset(mad.release());
         return;
      }
   }
   /* v_mul_f32(v_cndmask_b32(0, 1.0, cond), a) -> v_cndmask_b32(0, a, cond) */
   else if (instr->opcode == aco_opcode::v_mul_f32 && !instr->isVOP3()) {
      for (unsigned i = 0; i < 2; i++) {
         if (instr->operands[i].isTemp() && ctx.info[instr->operands[i].tempId()].is_b2f() &&
             ctx.uses[instr->operands[i].tempId()] == 1 &&
             instr->operands[!i].isTemp() && instr->operands[!i].getTemp().type() == RegType::vgpr) {
            ctx.uses[instr->operands[i].tempId()]--;
            ctx.uses[ctx.info[instr->operands[i].tempId()].temp.id()]++;

            aco_ptr<VOP2_instruction> new_instr{create_instruction<VOP2_instruction>(aco_opcode::v_cndmask_b32, Format::VOP2, 3, 1)};
            new_instr->operands[0] = Operand(0u);
            new_instr->operands[1] = instr->operands[!i];
            new_instr->operands[2] = Operand(ctx.info[instr->operands[i].tempId()].temp);
            new_instr->definitions[0] = instr->definitions[0];
            instr.reset(new_instr.release());
            ctx.info[instr->definitions[0].tempId()].label = 0;
            return;
         }
      }
   } else if (instr->opcode == aco_opcode::v_or_b32 && ctx.program->chip_class >= GFX9) {
      if (combine_three_valu_op(ctx, instr, aco_opcode::s_or_b32, aco_opcode::v_or3_b32, "012", 1 | 2)) ;
      else if (combine_three_valu_op(ctx, instr, aco_opcode::v_or_b32, aco_opcode::v_or3_b32, "012", 1 | 2)) ;
      else if (combine_three_valu_op(ctx, instr, aco_opcode::s_and_b32, aco_opcode::v_and_or_b32, "120", 1 | 2)) ;
      else if (combine_three_valu_op(ctx, instr, aco_opcode::v_and_b32, aco_opcode::v_and_or_b32, "120", 1 | 2)) ;
      else if (combine_three_valu_op(ctx, instr, aco_opcode::s_lshl_b32, aco_opcode::v_lshl_or_b32, "120", 1 | 2)) ;
      else combine_three_valu_op(ctx, instr, aco_opcode::v_lshlrev_b32, aco_opcode::v_lshl_or_b32, "210", 1 | 2);
   } else if (instr->opcode == aco_opcode::v_xor_b32 && ctx.program->chip_class >= GFX10) {
      if (combine_three_valu_op(ctx, instr, aco_opcode::v_xor_b32, aco_opcode::v_xor3_b32, "012", 1 | 2)) ;
      else combine_three_valu_op(ctx, instr, aco_opcode::s_xor_b32, aco_opcode::v_xor3_b32, "012", 1 | 2);
   } else if (instr->opcode == aco_opcode::v_add_u32) {
      if (combine_add_sub_b2i(ctx, instr, aco_opcode::v_addc_co_u32, 1 | 2)) ;
      else if (ctx.program->chip_class >= GFX9) {
         if (combine_three_valu_op(ctx, instr, aco_opcode::s_xor_b32, aco_opcode::v_xad_u32, "120", 1 | 2)) ;
         else if (combine_three_valu_op(ctx, instr, aco_opcode::v_xor_b32, aco_opcode::v_xad_u32, "120", 1 | 2)) ;
         else if (combine_three_valu_op(ctx, instr, aco_opcode::s_add_i32, aco_opcode::v_add3_u32, "012", 1 | 2)) ;
         else if (combine_three_valu_op(ctx, instr, aco_opcode::s_add_u32, aco_opcode::v_add3_u32, "012", 1 | 2)) ;
         else if (combine_three_valu_op(ctx, instr, aco_opcode::v_add_u32, aco_opcode::v_add3_u32, "012", 1 | 2)) ;
         else if (combine_three_valu_op(ctx, instr, aco_opcode::s_lshl_b32, aco_opcode::v_lshl_add_u32, "120", 1 | 2)) ;
         else combine_three_valu_op(ctx, instr, aco_opcode::v_lshlrev_b32, aco_opcode::v_lshl_add_u32, "210", 1 | 2);
      }
   } else if (instr->opcode == aco_opcode::v_add_co_u32 ||
              instr->opcode == aco_opcode::v_add_co_u32_e64) {
      combine_add_sub_b2i(ctx, instr, aco_opcode::v_addc_co_u32, 1 | 2);
   } else if (instr->opcode == aco_opcode::v_sub_u32 ||
              instr->opcode == aco_opcode::v_sub_co_u32 ||
              instr->opcode == aco_opcode::v_sub_co_u32_e64) {
      combine_add_sub_b2i(ctx, instr, aco_opcode::v_subbrev_co_u32, 2);
   } else if (instr->opcode == aco_opcode::v_subrev_u32 ||
              instr->opcode == aco_opcode::v_subrev_co_u32 ||
              instr->opcode == aco_opcode::v_subrev_co_u32_e64) {
      combine_add_sub_b2i(ctx, instr, aco_opcode::v_subbrev_co_u32, 1);
   } else if (instr->opcode == aco_opcode::v_lshlrev_b32 && ctx.program->chip_class >= GFX9) {
      combine_three_valu_op(ctx, instr, aco_opcode::v_add_u32, aco_opcode::v_add_lshl_u32, "120", 2);
   } else if ((instr->opcode == aco_opcode::s_add_u32 || instr->opcode == aco_opcode::s_add_i32) && ctx.program->chip_class >= GFX9) {
      combine_salu_lshl_add(ctx, instr);
   } else if (instr->opcode == aco_opcode::s_not_b32) {
      combine_salu_not_bitwise(ctx, instr);
   } else if (instr->opcode == aco_opcode::s_not_b64) {
      if (combine_inverse_comparison(ctx, instr)) ;
      else combine_salu_not_bitwise(ctx, instr);
   } else if (instr->opcode == aco_opcode::s_and_b32 || instr->opcode == aco_opcode::s_or_b32 ||
              instr->opcode == aco_opcode::s_and_b64 || instr->opcode == aco_opcode::s_or_b64) {
      if (combine_ordering_test(ctx, instr)) ;
      else if (combine_comparison_ordering(ctx, instr)) ;
      else if (combine_constant_comparison_ordering(ctx, instr)) ;
      else combine_salu_n2(ctx, instr);
   } else {
      /* min/max family: try min3/max3 fusion first, then the clamp pattern */
      aco_opcode min, max, min3, max3, med3;
      bool some_gfx9_only;
      if (get_minmax_info(instr->opcode, &min, &max, &min3, &max3, &med3, &some_gfx9_only) &&
          (!some_gfx9_only || ctx.program->chip_class >= GFX9)) {
         if (combine_minmax(ctx, instr, instr->opcode == min ? max : min, instr->opcode == min ? min3 : max3)) ;
         else combine_clamp(ctx, instr, min, max, med3);
      }
   }
}
2645
2646 bool to_uniform_bool_instr(opt_ctx &ctx, aco_ptr<Instruction> &instr)
2647 {
2648 switch (instr->opcode) {
2649 case aco_opcode::s_and_b32:
2650 case aco_opcode::s_and_b64:
2651 instr->opcode = aco_opcode::s_and_b32;
2652 break;
2653 case aco_opcode::s_or_b32:
2654 case aco_opcode::s_or_b64:
2655 instr->opcode = aco_opcode::s_or_b32;
2656 break;
2657 case aco_opcode::s_xor_b32:
2658 case aco_opcode::s_xor_b64:
2659 instr->opcode = aco_opcode::s_absdiff_i32;
2660 break;
2661 default:
2662 /* Don't transform other instructions. They are very unlikely to appear here. */
2663 return false;
2664 }
2665
2666 for (Operand &op : instr->operands) {
2667 ctx.uses[op.tempId()]--;
2668
2669 if (ctx.info[op.tempId()].is_uniform_bool()) {
2670 /* Just use the uniform boolean temp. */
2671 op.setTemp(ctx.info[op.tempId()].temp);
2672 } else if (ctx.info[op.tempId()].is_uniform_bitwise()) {
2673 /* Use the SCC definition of the predecessor instruction.
2674 * This allows the predecessor to get picked up by the same optimization (if it has no divergent users),
2675 * and it also makes sure that the current instruction will keep working even if the predecessor won't be transformed.
2676 */
2677 Instruction *pred_instr = ctx.info[op.tempId()].instr;
2678 assert(pred_instr->definitions.size() >= 2);
2679 assert(pred_instr->definitions[1].isFixed() && pred_instr->definitions[1].physReg() == scc);
2680 op.setTemp(pred_instr->definitions[1].getTemp());
2681 } else {
2682 unreachable("Invalid operand on uniform bitwise instruction.");
2683 }
2684
2685 ctx.uses[op.tempId()]++;
2686 }
2687
2688 instr->definitions[0].setTemp(Temp(instr->definitions[0].tempId(), s1));
2689 assert(instr->operands[0].regClass() == s1);
2690 assert(instr->operands[1].regClass() == s1);
2691 return true;
2692 }
2693
/* Backward pass over one instruction: performs dead-code elimination, finalizes
 * mad/fma selection, marks SCC uses, applies the uniform-boolean transform and
 * decides which literal constants are profitable to inline.
 * May replace or null out *instr in place. */
void select_instruction(opt_ctx &ctx, aco_ptr<Instruction>& instr)
{
   /* A literal is only inlined if its temp has fewer than this many uses
    * (otherwise repeating the 32-bit literal would grow code size too much). */
   const uint32_t threshold = 4;

   /* DCE: drop instructions whose results are unused and that have no side effects. */
   if (is_dead(ctx.uses, instr.get())) {
      instr.reset();
      return;
   }

   /* convert split_vector into a copy or extract_vector if only one definition is ever used */
   if (instr->opcode == aco_opcode::p_split_vector) {
      /* Find the (last) used definition, its index and its byte offset in the source. */
      unsigned num_used = 0;
      unsigned idx = 0;
      unsigned split_offset = 0;
      for (unsigned i = 0, offset = 0; i < instr->definitions.size(); offset += instr->definitions[i++].bytes()) {
         if (ctx.uses[instr->definitions[i].tempId()]) {
            num_used++;
            idx = i;
            split_offset = offset;
         }
      }
      bool done = false;
      /* If the source is a p_create_vector only consumed by this split, try to
       * forward the matching create_vector operand directly (vec+split folds to a copy). */
      if (num_used == 1 && ctx.info[instr->operands[0].tempId()].is_vec() &&
          ctx.uses[instr->operands[0].tempId()] == 1) {
         Instruction *vec = ctx.info[instr->operands[0].tempId()].instr;

         /* Scan the create_vector operands for one starting exactly at split_offset. */
         unsigned off = 0;
         Operand op;
         for (Operand& vec_op : vec->operands) {
            if (off == split_offset) {
               op = vec_op;
               break;
            }
            off += vec_op.bytes();
         }
         /* off == total size means no operand boundary matched (scan ran off the end);
          * the operand must also be exactly as large as the used definition. */
         if (off != instr->operands[0].bytes() && op.bytes() == instr->definitions[idx].bytes()) {
            /* The create_vector and its other operands become dead; only the
             * forwarded operand gains a use. */
            ctx.uses[instr->operands[0].tempId()]--;
            for (Operand& vec_op : vec->operands) {
               if (vec_op.isTemp())
                  ctx.uses[vec_op.tempId()]--;
            }
            if (op.isTemp())
               ctx.uses[op.tempId()]++;

            /* A single-operand p_create_vector is effectively a copy. */
            aco_ptr<Pseudo_instruction> extract{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, 1, 1)};
            extract->operands[0] = op;
            extract->definitions[0] = instr->definitions[idx];
            instr.reset(extract.release());

            done = true;
         }
      }

      /* Otherwise, if the single used definition is aligned within the source,
       * use p_extract_vector (index = offset / element size). */
      if (!done && num_used == 1 &&
          instr->operands[0].bytes() % instr->definitions[idx].bytes() == 0 &&
          split_offset % instr->definitions[idx].bytes() == 0) {
         aco_ptr<Pseudo_instruction> extract{create_instruction<Pseudo_instruction>(aco_opcode::p_extract_vector, Format::PSEUDO, 2, 1)};
         extract->operands[0] = instr->operands[0];
         extract->operands[1] = Operand((uint32_t) split_offset / instr->definitions[idx].bytes());
         extract->definitions[0] = instr->definitions[idx];
         instr.reset(extract.release());
      }
   }

   /* NOTE: local shadows the mad_info struct type. */
   mad_info* mad_info = NULL;
   if (!instr->definitions.empty() && ctx.info[instr->definitions[0].tempId()].is_mad()) {
      mad_info = &ctx.mad_infos[ctx.info[instr->definitions[0].tempId()].val];
      /* re-check mad instructions */
      if (ctx.uses[mad_info->mul_temp_id]) {
         /* The multiply result is still needed elsewhere, so fusing into a
          * mad isn't profitable: restore the original separate add. */
         ctx.uses[mad_info->mul_temp_id]++;
         if (instr->operands[0].isTemp())
            ctx.uses[instr->operands[0].tempId()]--;
         if (instr->operands[1].isTemp())
            ctx.uses[instr->operands[1].tempId()]--;
         instr.swap(mad_info->add_instr);
         mad_info = NULL;
      }
      /* check literals */
      else if (!instr->usesModifiers()) {
         /* FMA can only take literals on GFX10+ */
         if ((instr->opcode == aco_opcode::v_fma_f32 || instr->opcode == aco_opcode::v_fma_f16) &&
             ctx.program->chip_class < GFX10)
            return;

         /* Pick at most one operand whose value could become an inline literal
          * (madmk for a mul operand, madak for the add operand). */
         bool sgpr_used = false;
         uint32_t literal_idx = 0;
         uint32_t literal_uses = UINT32_MAX;
         for (unsigned i = 0; i < instr->operands.size(); i++)
         {
            /* A hard-coded constant beyond src0 already blocks the literal slot. */
            if (instr->operands[i].isConstant() && i > 0) {
               literal_uses = UINT32_MAX;
               break;
            }
            if (!instr->operands[i].isTemp())
               continue;
            /* if one of the operands is sgpr, we cannot add a literal somewhere else on pre-GFX10 or operands other than the 1st */
            if (instr->operands[i].getTemp().type() == RegType::sgpr && (i > 0 || ctx.program->chip_class < GFX10)) {
               if (!sgpr_used && ctx.info[instr->operands[i].tempId()].is_literal()) {
                  literal_uses = ctx.uses[instr->operands[i].tempId()];
                  literal_idx = i;
               } else {
                  literal_uses = UINT32_MAX;
               }
               sgpr_used = true;
               /* don't break because we still need to check constants */
            } else if (!sgpr_used &&
                       ctx.info[instr->operands[i].tempId()].is_literal() &&
                       ctx.uses[instr->operands[i].tempId()] < literal_uses) {
               /* Prefer the literal temp with the fewest remaining uses. */
               literal_uses = ctx.uses[instr->operands[i].tempId()];
               literal_idx = i;
            }
         }

         /* Limit the number of literals to apply to not increase the code
          * size too much, but always apply literals for v_mad->v_madak
          * because both instructions are 64-bit and this doesn't increase
          * code size.
          * TODO: try to apply the literals earlier to lower the number of
          * uses below threshold
          */
         if (literal_uses < threshold || literal_idx == 2) {
            ctx.uses[instr->operands[literal_idx].tempId()]--;
            mad_info->check_literal = true;
            mad_info->literal_idx = literal_idx;
            return;
         }
      }
   }

   /* Mark SCC needed, so the uniform boolean transformation won't swap the definitions when it isn't beneficial */
   if (instr->format == Format::PSEUDO_BRANCH &&
       instr->operands.size() &&
       instr->operands[0].isTemp()) {
      ctx.info[instr->operands[0].tempId()].set_scc_needed();
      return;
   } else if ((instr->opcode == aco_opcode::s_cselect_b64 ||
               instr->opcode == aco_opcode::s_cselect_b32) &&
              instr->operands[2].isTemp()) {
      /* s_cselect reads its condition from SCC (operand 2). */
      ctx.info[instr->operands[2].tempId()].set_scc_needed();
   }

   /* check for literals */
   if (!instr->isSALU() && !instr->isVALU())
      return;

   /* Transform uniform bitwise boolean operations to 32-bit when there are no divergent uses. */
   if (instr->definitions.size() &&
       ctx.uses[instr->definitions[0].tempId()] == 0 &&
       ctx.info[instr->definitions[0].tempId()].is_uniform_bitwise()) {
      bool transform_done = to_uniform_bool_instr(ctx, instr);

      if (transform_done && !ctx.info[instr->definitions[1].tempId()].is_scc_needed()) {
         /* Swap the two definition IDs in order to avoid overusing the SCC. This reduces extra moves generated by RA. */
         uint32_t def0_id = instr->definitions[0].getTemp().id();
         uint32_t def1_id = instr->definitions[1].getTemp().id();
         instr->definitions[0].setTemp(Temp(def1_id, s1));
         instr->definitions[1].setTemp(Temp(def0_id, s1));
      }

      return;
   }

   if (instr->isSDWA() || instr->isDPP() || (instr->isVOP3() && ctx.program->chip_class < GFX10))
      return; /* some encodings can't ever take literals */

   /* we do not apply the literals yet as we don't know if it is profitable */
   Operand current_literal(s1);

   unsigned literal_id = 0;
   unsigned literal_uses = UINT32_MAX;
   Operand literal(s1);
   /* SALU (and GFX10+ VOP3-able VALU) can take a literal in any source;
    * otherwise only src0 of VOP2 is considered. */
   unsigned num_operands = 1;
   if (instr->isSALU() || (ctx.program->chip_class >= GFX10 && can_use_VOP3(ctx, instr)))
      num_operands = instr->operands.size();
   /* catch VOP2 with a 3rd SGPR operand (e.g. v_cndmask_b32, v_addc_co_u32) */
   else if (instr->isVALU() && instr->operands.size() >= 3)
      return;

   /* Track up to two distinct SGPRs read by this VALU (constant bus consumers). */
   unsigned sgpr_ids[2] = {0, 0};
   bool is_literal_sgpr = false;
   /* Bitmask of operand indices that hold the chosen literal temp. */
   uint32_t mask = 0;

   /* choose a literal to apply */
   for (unsigned i = 0; i < num_operands; i++) {
      Operand op = instr->operands[i];

      if (instr->isVALU() && op.isTemp() && op.getTemp().type() == RegType::sgpr &&
          op.tempId() != sgpr_ids[0])
         sgpr_ids[!!sgpr_ids[0]] = op.tempId();

      if (op.isLiteral()) {
         /* Remember an already-encoded literal: a new one must match it. */
         current_literal = op;
         continue;
      } else if (!op.isTemp() || !ctx.info[op.tempId()].is_literal()) {
         continue;
      }

      if (!alu_can_accept_constant(instr->opcode, i))
         continue;

      /* Prefer the literal temp with the fewest uses (cheapest to duplicate). */
      if (ctx.uses[op.tempId()] < literal_uses) {
         is_literal_sgpr = op.getTemp().type() == RegType::sgpr;
         mask = 0;
         literal = Operand(ctx.info[op.tempId()].val);
         literal_uses = ctx.uses[op.tempId()];
         literal_id = op.tempId();
      }

      mask |= (op.tempId() == literal_id) << i;
   }


   /* don't go over the constant bus limit */
   bool is_shift64 = instr->opcode == aco_opcode::v_lshlrev_b64 ||
                     instr->opcode == aco_opcode::v_lshrrev_b64 ||
                     instr->opcode == aco_opcode::v_ashrrev_i64;
   unsigned const_bus_limit = instr->isVALU() ? 1 : UINT32_MAX;
   if (ctx.program->chip_class >= GFX10 && !is_shift64)
      const_bus_limit = 2;

   /* If the bus is already full of SGPRs, a literal only fits when it replaces
    * one of those SGPR reads. */
   unsigned num_sgprs = !!sgpr_ids[0] + !!sgpr_ids[1];
   if (num_sgprs == const_bus_limit && !is_literal_sgpr)
      return;

   if (literal_id && literal_uses < threshold &&
       (current_literal.isUndefined() ||
        (current_literal.size() == literal.size() &&
         current_literal.constantValue() == literal.constantValue()))) {
      /* mark the literal to be applied */
      while (mask) {
         unsigned i = u_bit_scan(&mask);
         if (instr->operands[i].isTemp() && instr->operands[i].tempId() == literal_id)
            ctx.uses[instr->operands[i].tempId()]--;
      }
   }
}
2930
2931
/* Final pass: materializes the literals that select_instruction() marked as
 * profitable (use count dropped to 0) and emits the instruction into
 * ctx.instructions. Null instructions (killed by DCE) are dropped. */
void apply_literals(opt_ctx &ctx, aco_ptr<Instruction>& instr)
{
   /* Cleanup Dead Instructions */
   if (!instr)
      return;

   /* apply literals on MAD */
   if (!instr->definitions.empty() && ctx.info[instr->definitions[0].tempId()].is_mad()) {
      mad_info* info = &ctx.mad_infos[ctx.info[instr->definitions[0].tempId()].val];
      if (info->check_literal &&
          (ctx.uses[instr->operands[info->literal_idx].tempId()] == 0 || info->literal_idx == 2)) {
         aco_ptr<Instruction> new_mad;

         /* literal_idx == 2 means the addend is the literal (madak/fmaak),
          * otherwise one of the multiply operands is (madmk/fmamk). */
         aco_opcode new_op = info->literal_idx == 2 ? aco_opcode::v_madak_f32 : aco_opcode::v_madmk_f32;
         if (instr->opcode == aco_opcode::v_fma_f32)
            new_op = info->literal_idx == 2 ? aco_opcode::v_fmaak_f32 : aco_opcode::v_fmamk_f32;
         else if (instr->opcode == aco_opcode::v_mad_f16 || instr->opcode == aco_opcode::v_mad_legacy_f16)
            new_op = info->literal_idx == 2 ? aco_opcode::v_madak_f16 : aco_opcode::v_madmk_f16;
         else if (instr->opcode == aco_opcode::v_fma_f16)
            new_op = info->literal_idx == 2 ? aco_opcode::v_fmaak_f16 : aco_opcode::v_fmamk_f16;

         new_mad.reset(create_instruction<VOP2_instruction>(new_op, Format::VOP2, 3, 1));
         if (info->literal_idx == 2) { /* add literal -> madak */
            new_mad->operands[0] = instr->operands[0];
            new_mad->operands[1] = instr->operands[1];
         } else { /* mul literal -> madmk */
            /* src0 = the non-literal multiply operand, src1 = the addend;
             * the literal constant K goes last in both encodings. */
            new_mad->operands[0] = instr->operands[1 - info->literal_idx];
            new_mad->operands[1] = instr->operands[2];
         }
         new_mad->operands[2] = Operand(ctx.info[instr->operands[info->literal_idx].tempId()].val);
         new_mad->definitions[0] = instr->definitions[0];
         ctx.instructions.emplace_back(std::move(new_mad));
         return;
      }
   }

   /* apply literals on other SALU/VALU */
   if (instr->isSALU() || instr->isVALU()) {
      for (unsigned i = 0; i < instr->operands.size(); i++) {
         Operand op = instr->operands[i];
         /* use count 0 means select_instruction() decided to inline this literal */
         if (op.isTemp() && ctx.info[op.tempId()].is_literal() && ctx.uses[op.tempId()] == 0) {
            Operand literal(ctx.info[op.tempId()].val);
            /* VOP2 only allows a literal in src0; other sources need the VOP3
             * encoding (select_instruction() only picks i > 0 where that's legal). */
            if (instr->isVALU() && i > 0)
               to_VOP3(ctx, instr);
            instr->operands[i] = literal;
         }
      }
   }

   ctx.instructions.emplace_back(std::move(instr));
}
2983
2984
2985 void optimize(Program* program)
2986 {
2987 opt_ctx ctx;
2988 ctx.program = program;
2989 std::vector<ssa_info> info(program->peekAllocationId());
2990 ctx.info = info.data();
2991
2992 /* 1. Bottom-Up DAG pass (forward) to label all ssa-defs */
2993 for (Block& block : program->blocks) {
2994 for (aco_ptr<Instruction>& instr : block.instructions)
2995 label_instruction(ctx, block, instr);
2996 }
2997
2998 ctx.uses = dead_code_analysis(program);
2999
3000 /* 2. Combine v_mad, omod, clamp and propagate sgpr on VALU instructions */
3001 for (Block& block : program->blocks) {
3002 for (aco_ptr<Instruction>& instr : block.instructions)
3003 combine_instruction(ctx, block, instr);
3004 }
3005
3006 /* 3. Top-Down DAG pass (backward) to select instructions (includes DCE) */
3007 for (std::vector<Block>::reverse_iterator it = program->blocks.rbegin(); it != program->blocks.rend(); ++it) {
3008 Block* block = &(*it);
3009 for (std::vector<aco_ptr<Instruction>>::reverse_iterator it = block->instructions.rbegin(); it != block->instructions.rend(); ++it)
3010 select_instruction(ctx, *it);
3011 }
3012
3013 /* 4. Add literals to instructions */
3014 for (Block& block : program->blocks) {
3015 ctx.instructions.clear();
3016 for (aco_ptr<Instruction>& instr : block.instructions)
3017 apply_literals(ctx, instr);
3018 block.instructions.swap(ctx.instructions);
3019 }
3020
3021 }
3022
3023 }