2 * Copyright © 2019 Valve Corporation
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
26 #include "aco_builder.h"
27 #include "util/u_math.h"
/* Execution state an instruction (or a whole block) requires.
 * Values form a bitmask so a block's needs can be OR-accumulated.
 * NOTE(review): Unspecified/Exact were missing from the mangled source but
 * are referenced later in this file; restored — confirm against upstream. */
enum WQMState : uint8_t {
   Unspecified = 0,
   Exact = 1 << 0,        /* must run with the exact (helper-lane-free) exec mask */
   WQM = 1 << 1,          /* with control flow applied */
   Preserve_WQM = 1 << 2, /* the WQM mask must be kept live across this point */
   Exact_Branch = 1 << 3, /* branch must be taken with the exact mask */
};
/* Flags describing what each entry on the per-block exec-mask stack
 * represents; an entry may carry several of these at once. */
enum mask_type : uint8_t {
   mask_type_global = 1 << 0,  /* outermost mask for the whole invocation group */
   mask_type_exact = 1 << 1,   /* exact mask: helper invocations disabled */
   mask_type_wqm = 1 << 2,     /* whole-quad-mode mask */
   mask_type_loop = 1 << 3,    /* active lanes of a loop */
   mask_type_initial = 1 << 4, /* initially active lanes */
};
51 /* state for WQM propagation */
52 std::set
<unsigned> worklist
;
53 std::vector
<uint16_t> defined_in
;
54 std::vector
<bool> needs_wqm
;
55 std::vector
<bool> branch_wqm
; /* true if the branch condition in this block should be in wqm */
58 wqm_ctx(Program
* program
) : program(program
),
59 defined_in(program
->peekAllocationId(), 0xFFFF),
60 needs_wqm(program
->peekAllocationId()),
61 branch_wqm(program
->blocks
.size()),
65 for (unsigned i
= 0; i
< program
->blocks
.size(); i
++)
72 uint16_t num_exec_masks
;
74 bool has_divergent_break
;
75 bool has_divergent_continue
;
76 bool has_discard
; /* has a discard or demote */
77 loop_info(Block
* b
, uint16_t num
, uint8_t needs
, bool breaks
, bool cont
, bool discard
) :
78 loop_header(b
), num_exec_masks(num
), needs(needs
), has_divergent_break(breaks
),
79 has_divergent_continue(cont
), has_discard(discard
) {}
83 std::vector
<std::pair
<Temp
, uint8_t>> exec
;
84 std::vector
<WQMState
> instr_needs
;
86 uint8_t ever_again_needs
;
93 std::vector
<block_info
> info
;
94 std::vector
<loop_info
> loop
;
95 bool handle_wqm
= false;
96 exec_ctx(Program
*program
) : program(program
), info(program
->blocks
.size()) {}
99 bool pred_by_exec_mask(aco_ptr
<Instruction
>& instr
) {
101 return instr
->reads_exec();
102 if (instr
->format
== Format::SMEM
|| instr
->isSALU())
104 if (instr
->format
== Format::PSEUDO_BARRIER
)
107 if (instr
->format
== Format::PSEUDO
) {
108 switch (instr
->opcode
) {
109 case aco_opcode::p_create_vector
:
110 case aco_opcode::p_extract_vector
:
111 case aco_opcode::p_split_vector
:
112 for (Definition def
: instr
->definitions
) {
113 if (def
.getTemp().type() == RegType::vgpr
)
117 case aco_opcode::p_spill
:
118 case aco_opcode::p_reload
:
125 if (instr
->opcode
== aco_opcode::v_readlane_b32
||
126 instr
->opcode
== aco_opcode::v_readlane_b32_e64
||
127 instr
->opcode
== aco_opcode::v_writelane_b32
||
128 instr
->opcode
== aco_opcode::v_writelane_b32_e64
)
134 bool needs_exact(aco_ptr
<Instruction
>& instr
) {
135 if (instr
->format
== Format::MUBUF
) {
136 MUBUF_instruction
*mubuf
= static_cast<MUBUF_instruction
*>(instr
.get());
137 return mubuf
->disable_wqm
;
138 } else if (instr
->format
== Format::MTBUF
) {
139 MTBUF_instruction
*mtbuf
= static_cast<MTBUF_instruction
*>(instr
.get());
140 return mtbuf
->disable_wqm
;
141 } else if (instr
->format
== Format::MIMG
) {
142 MIMG_instruction
*mimg
= static_cast<MIMG_instruction
*>(instr
.get());
143 return mimg
->disable_wqm
;
144 } else if (instr
->format
== Format::FLAT
|| instr
->format
== Format::GLOBAL
) {
145 FLAT_instruction
*flat
= static_cast<FLAT_instruction
*>(instr
.get());
146 return flat
->disable_wqm
;
148 return instr
->format
== Format::EXP
|| instr
->opcode
== aco_opcode::p_fs_buffer_store_smem
;
152 void set_needs_wqm(wqm_ctx
&ctx
, Temp tmp
)
154 if (!ctx
.needs_wqm
[tmp
.id()]) {
155 ctx
.needs_wqm
[tmp
.id()] = true;
156 if (ctx
.defined_in
[tmp
.id()] != 0xFFFF)
157 ctx
.worklist
.insert(ctx
.defined_in
[tmp
.id()]);
161 void mark_block_wqm(wqm_ctx
&ctx
, unsigned block_idx
)
163 if (ctx
.branch_wqm
[block_idx
])
166 ctx
.branch_wqm
[block_idx
] = true;
167 Block
& block
= ctx
.program
->blocks
[block_idx
];
169 /* TODO: this sets more branch conditions to WQM than it needs to
170 * it should be enough to stop at the "exec mask top level" */
171 if (block
.kind
& block_kind_top_level
)
174 for (unsigned pred_idx
: block
.logical_preds
)
175 mark_block_wqm(ctx
, pred_idx
);
178 void get_block_needs(wqm_ctx
&ctx
, exec_ctx
&exec_ctx
, Block
* block
)
180 block_info
& info
= exec_ctx
.info
[block
->index
];
182 std::vector
<WQMState
> instr_needs(block
->instructions
.size());
184 if (block
->kind
& block_kind_top_level
) {
185 if (ctx
.loop
&& ctx
.wqm
) {
186 unsigned block_idx
= block
->index
+ 1;
187 while (!(ctx
.program
->blocks
[block_idx
].kind
& block_kind_top_level
)) {
188 /* flag all break conditions as WQM:
189 * the conditions might be computed outside the nested CF */
190 if (ctx
.program
->blocks
[block_idx
].kind
& block_kind_break
)
191 mark_block_wqm(ctx
, block_idx
);
192 /* flag all blocks as WQM to ensure we enter all (nested) loops in WQM */
193 exec_ctx
.info
[block_idx
].block_needs
|= WQM
;
196 } else if (ctx
.loop
&& !ctx
.wqm
) {
197 /* Ensure a branch never results in an exec mask with only helper
198 * invocations (which can cause a loop to repeat infinitively if it's
199 * break branches are done in exact). */
200 unsigned block_idx
= block
->index
;
202 if ((ctx
.program
->blocks
[block_idx
].kind
& block_kind_branch
))
203 exec_ctx
.info
[block_idx
].block_needs
|= Exact_Branch
;
205 } while (!(ctx
.program
->blocks
[block_idx
].kind
& block_kind_top_level
));
212 for (int i
= block
->instructions
.size() - 1; i
>= 0; --i
) {
213 aco_ptr
<Instruction
>& instr
= block
->instructions
[i
];
215 WQMState needs
= needs_exact(instr
) ? Exact
: Unspecified
;
216 bool propagate_wqm
= instr
->opcode
== aco_opcode::p_wqm
;
217 bool preserve_wqm
= instr
->opcode
== aco_opcode::p_discard_if
;
218 bool pred_by_exec
= pred_by_exec_mask(instr
);
219 for (const Definition
& definition
: instr
->definitions
) {
220 if (!definition
.isTemp())
222 const unsigned def
= definition
.tempId();
223 ctx
.defined_in
[def
] = block
->index
;
224 if (needs
== Unspecified
&& ctx
.needs_wqm
[def
]) {
225 needs
= pred_by_exec
? WQM
: Unspecified
;
226 propagate_wqm
= true;
230 if (instr
->format
== Format::PSEUDO_BRANCH
&& ctx
.branch_wqm
[block
->index
]) {
232 propagate_wqm
= true;
236 for (const Operand
& op
: instr
->operands
) {
238 set_needs_wqm(ctx
, op
.getTemp());
241 } else if (preserve_wqm
&& info
.block_needs
& WQM
) {
242 needs
= Preserve_WQM
;
245 /* ensure the condition controlling the control flow for this phi is in WQM */
246 if (needs
== WQM
&& instr
->opcode
== aco_opcode::p_phi
) {
247 for (unsigned pred_idx
: block
->logical_preds
) {
248 mark_block_wqm(ctx
, pred_idx
);
249 exec_ctx
.info
[pred_idx
].logical_end_wqm
= true;
250 ctx
.worklist
.insert(pred_idx
);
254 if ((instr
->opcode
== aco_opcode::p_logical_end
&& info
.logical_end_wqm
) ||
255 instr
->opcode
== aco_opcode::p_wqm
) {
256 assert(needs
!= Exact
);
260 instr_needs
[i
] = needs
;
261 info
.block_needs
|= needs
;
264 info
.instr_needs
= instr_needs
;
266 /* for "if (<cond>) <wqm code>" or "while (<cond>) <wqm code>",
267 * <cond> should be computed in WQM */
268 if (info
.block_needs
& WQM
&& !(block
->kind
& block_kind_top_level
)) {
269 for (unsigned pred_idx
: block
->logical_preds
)
270 mark_block_wqm(ctx
, pred_idx
);
273 if (block
->kind
& block_kind_loop_header
)
277 void calculate_wqm_needs(exec_ctx
& exec_ctx
)
279 wqm_ctx
ctx(exec_ctx
.program
);
281 while (!ctx
.worklist
.empty()) {
282 unsigned block_index
= *std::prev(ctx
.worklist
.end());
283 ctx
.worklist
.erase(std::prev(ctx
.worklist
.end()));
285 get_block_needs(ctx
, exec_ctx
, &exec_ctx
.program
->blocks
[block_index
]);
288 uint8_t ever_again_needs
= 0;
289 for (int i
= exec_ctx
.program
->blocks
.size() - 1; i
>= 0; i
--) {
290 exec_ctx
.info
[i
].ever_again_needs
= ever_again_needs
;
291 Block
& block
= exec_ctx
.program
->blocks
[i
];
293 if (block
.kind
& block_kind_needs_lowering
)
294 exec_ctx
.info
[i
].block_needs
|= Exact
;
296 /* if discard is used somewhere in nested CF, we need to preserve the WQM mask */
297 if ((block
.kind
& block_kind_discard
||
298 block
.kind
& block_kind_uses_discard_if
) &&
299 ever_again_needs
& WQM
)
300 exec_ctx
.info
[i
].block_needs
|= Preserve_WQM
;
302 ever_again_needs
|= exec_ctx
.info
[i
].block_needs
& ~Exact_Branch
;
303 if (block
.kind
& block_kind_discard
||
304 block
.kind
& block_kind_uses_discard_if
||
305 block
.kind
& block_kind_uses_demote
)
306 ever_again_needs
|= Exact
;
308 /* don't propagate WQM preservation further than the next top_level block */
309 if (block
.kind
& block_kind_top_level
)
310 ever_again_needs
&= ~Preserve_WQM
;
312 exec_ctx
.info
[i
].block_needs
&= ~Preserve_WQM
;
314 exec_ctx
.handle_wqm
= true;
317 void transition_to_WQM(exec_ctx
& ctx
, Builder bld
, unsigned idx
)
319 if (ctx
.info
[idx
].exec
.back().second
& mask_type_wqm
)
321 if (ctx
.info
[idx
].exec
.back().second
& mask_type_global
) {
322 Temp exec_mask
= ctx
.info
[idx
].exec
.back().first
;
323 /* TODO: we might generate better code if we pass the uncopied "exec_mask"
324 * directly to the s_wqm (we still need to keep this parallelcopy for
325 * potential later uses of exec_mask though). We currently can't do this
326 * because of a RA bug. */
327 exec_mask
= bld
.pseudo(aco_opcode::p_parallelcopy
, bld
.def(bld
.lm
), bld
.exec(exec_mask
));
328 ctx
.info
[idx
].exec
.back().first
= exec_mask
;
330 exec_mask
= bld
.sop1(Builder::s_wqm
, bld
.def(bld
.lm
, exec
), bld
.def(s1
, scc
), exec_mask
);
331 ctx
.info
[idx
].exec
.emplace_back(exec_mask
, mask_type_global
| mask_type_wqm
);
334 /* otherwise, the WQM mask should be one below the current mask */
335 ctx
.info
[idx
].exec
.pop_back();
336 assert(ctx
.info
[idx
].exec
.back().second
& mask_type_wqm
);
337 assert(ctx
.info
[idx
].exec
.back().first
.size() == bld
.lm
.size());
338 ctx
.info
[idx
].exec
.back().first
= bld
.pseudo(aco_opcode::p_parallelcopy
, bld
.def(bld
.lm
, exec
),
339 ctx
.info
[idx
].exec
.back().first
);
342 void transition_to_Exact(exec_ctx
& ctx
, Builder bld
, unsigned idx
)
344 if (ctx
.info
[idx
].exec
.back().second
& mask_type_exact
)
346 /* We can't remove the loop exec mask, because that can cause exec.size() to
347 * be less than num_exec_masks. The loop exec mask also needs to be kept
348 * around for various uses. */
349 if ((ctx
.info
[idx
].exec
.back().second
& mask_type_global
) &&
350 !(ctx
.info
[idx
].exec
.back().second
& mask_type_loop
)) {
351 ctx
.info
[idx
].exec
.pop_back();
352 assert(ctx
.info
[idx
].exec
.back().second
& mask_type_exact
);
353 assert(ctx
.info
[idx
].exec
.back().first
.size() == bld
.lm
.size());
354 ctx
.info
[idx
].exec
.back().first
= bld
.pseudo(aco_opcode::p_parallelcopy
, bld
.def(bld
.lm
, exec
),
355 ctx
.info
[idx
].exec
.back().first
);
358 /* otherwise, we create an exact mask and push to the stack */
359 Temp wqm
= ctx
.info
[idx
].exec
.back().first
;
360 Temp exact
= bld
.tmp(bld
.lm
);
361 wqm
= bld
.sop1(Builder::s_and_saveexec
, bld
.def(bld
.lm
), bld
.def(s1
, scc
),
362 bld
.exec(Definition(exact
)), ctx
.info
[idx
].exec
[0].first
, bld
.exec(wqm
));
363 ctx
.info
[idx
].exec
.back().first
= wqm
;
364 ctx
.info
[idx
].exec
.emplace_back(exact
, mask_type_exact
);
367 unsigned add_coupling_code(exec_ctx
& ctx
, Block
* block
,
368 std::vector
<aco_ptr
<Instruction
>>& instructions
)
370 unsigned idx
= block
->index
;
371 Builder
bld(ctx
.program
, &instructions
);
372 std::vector
<unsigned>& preds
= block
->linear_preds
;
376 aco_ptr
<Instruction
>& startpgm
= block
->instructions
[0];
377 assert(startpgm
->opcode
== aco_opcode::p_startpgm
);
378 Temp exec_mask
= startpgm
->definitions
.back().getTemp();
379 bld
.insert(std::move(startpgm
));
381 /* exec seems to need to be manually initialized with combined shaders */
382 if (util_bitcount(ctx
.program
->stage
& sw_mask
) > 1 || (ctx
.program
->stage
& hw_ngg_gs
)) {
383 bld
.sop1(Builder::s_mov
, bld
.exec(Definition(exec_mask
)), bld
.lm
== s2
? Operand(UINT64_MAX
) : Operand(UINT32_MAX
));
384 instructions
[0]->definitions
.pop_back();
387 if (ctx
.handle_wqm
) {
388 ctx
.info
[0].exec
.emplace_back(exec_mask
, mask_type_global
| mask_type_exact
| mask_type_initial
);
389 /* if this block only needs WQM, initialize already */
390 if (ctx
.info
[0].block_needs
== WQM
)
391 transition_to_WQM(ctx
, bld
, 0);
393 uint8_t mask
= mask_type_global
;
394 if (ctx
.program
->needs_wqm
) {
395 exec_mask
= bld
.sop1(Builder::s_wqm
, bld
.def(bld
.lm
, exec
), bld
.def(s1
, scc
), bld
.exec(exec_mask
));
396 mask
|= mask_type_wqm
;
398 mask
|= mask_type_exact
;
400 ctx
.info
[0].exec
.emplace_back(exec_mask
, mask
);
406 /* loop entry block */
407 if (block
->kind
& block_kind_loop_header
) {
408 assert(preds
[0] == idx
- 1);
409 ctx
.info
[idx
].exec
= ctx
.info
[idx
- 1].exec
;
410 loop_info
& info
= ctx
.loop
.back();
411 while (ctx
.info
[idx
].exec
.size() > info
.num_exec_masks
)
412 ctx
.info
[idx
].exec
.pop_back();
414 /* create ssa names for outer exec masks */
415 if (info
.has_discard
) {
416 aco_ptr
<Pseudo_instruction
> phi
;
417 for (int i
= 0; i
< info
.num_exec_masks
- 1; i
++) {
418 phi
.reset(create_instruction
<Pseudo_instruction
>(aco_opcode::p_linear_phi
, Format::PSEUDO
, preds
.size(), 1));
419 phi
->definitions
[0] = bld
.def(bld
.lm
);
420 phi
->operands
[0] = Operand(ctx
.info
[preds
[0]].exec
[i
].first
);
421 ctx
.info
[idx
].exec
[i
].first
= bld
.insert(std::move(phi
));
425 /* create ssa name for restore mask */
426 if (info
.has_divergent_break
) {
427 /* this phi might be trivial but ensures a parallelcopy on the loop header */
428 aco_ptr
<Pseudo_instruction
> phi
{create_instruction
<Pseudo_instruction
>(aco_opcode::p_linear_phi
, Format::PSEUDO
, preds
.size(), 1)};
429 phi
->definitions
[0] = bld
.def(bld
.lm
);
430 phi
->operands
[0] = Operand(ctx
.info
[preds
[0]].exec
[info
.num_exec_masks
- 1].first
);
431 ctx
.info
[idx
].exec
.back().first
= bld
.insert(std::move(phi
));
434 /* create ssa name for loop active mask */
435 aco_ptr
<Pseudo_instruction
> phi
{create_instruction
<Pseudo_instruction
>(aco_opcode::p_linear_phi
, Format::PSEUDO
, preds
.size(), 1)};
436 if (info
.has_divergent_continue
)
437 phi
->definitions
[0] = bld
.def(bld
.lm
);
439 phi
->definitions
[0] = bld
.def(bld
.lm
, exec
);
440 phi
->operands
[0] = Operand(ctx
.info
[preds
[0]].exec
.back().first
);
441 Temp loop_active
= bld
.insert(std::move(phi
));
443 if (info
.has_divergent_break
) {
444 uint8_t mask_type
= (ctx
.info
[idx
].exec
.back().second
& (mask_type_wqm
| mask_type_exact
)) | mask_type_loop
;
445 ctx
.info
[idx
].exec
.emplace_back(loop_active
, mask_type
);
447 ctx
.info
[idx
].exec
.back().first
= loop_active
;
448 ctx
.info
[idx
].exec
.back().second
|= mask_type_loop
;
451 /* create a parallelcopy to move the active mask to exec */
453 if (info
.has_divergent_continue
) {
454 while (block
->instructions
[i
]->opcode
!= aco_opcode::p_logical_start
) {
455 bld
.insert(std::move(block
->instructions
[i
]));
458 uint8_t mask_type
= ctx
.info
[idx
].exec
.back().second
& (mask_type_wqm
| mask_type_exact
);
459 assert(ctx
.info
[idx
].exec
.back().first
.size() == bld
.lm
.size());
460 ctx
.info
[idx
].exec
.emplace_back(bld
.pseudo(aco_opcode::p_parallelcopy
, bld
.def(bld
.lm
, exec
),
461 ctx
.info
[idx
].exec
.back().first
), mask_type
);
467 /* loop exit block */
468 if (block
->kind
& block_kind_loop_exit
) {
469 Block
* header
= ctx
.loop
.back().loop_header
;
470 loop_info
& info
= ctx
.loop
.back();
472 for (ASSERTED
unsigned pred
: preds
)
473 assert(ctx
.info
[pred
].exec
.size() >= info
.num_exec_masks
);
475 /* fill the loop header phis */
476 std::vector
<unsigned>& header_preds
= header
->linear_preds
;
478 if (info
.has_discard
) {
479 while (k
< info
.num_exec_masks
- 1) {
480 aco_ptr
<Instruction
>& phi
= header
->instructions
[k
];
481 assert(phi
->opcode
== aco_opcode::p_linear_phi
);
482 for (unsigned i
= 1; i
< phi
->operands
.size(); i
++)
483 phi
->operands
[i
] = Operand(ctx
.info
[header_preds
[i
]].exec
[k
].first
);
487 aco_ptr
<Instruction
>& phi
= header
->instructions
[k
++];
488 assert(phi
->opcode
== aco_opcode::p_linear_phi
);
489 for (unsigned i
= 1; i
< phi
->operands
.size(); i
++)
490 phi
->operands
[i
] = Operand(ctx
.info
[header_preds
[i
]].exec
[info
.num_exec_masks
- 1].first
);
492 if (info
.has_divergent_break
) {
493 aco_ptr
<Instruction
>& phi
= header
->instructions
[k
];
494 assert(phi
->opcode
== aco_opcode::p_linear_phi
);
495 for (unsigned i
= 1; i
< phi
->operands
.size(); i
++)
496 phi
->operands
[i
] = Operand(ctx
.info
[header_preds
[i
]].exec
[info
.num_exec_masks
].first
);
499 assert(!(block
->kind
& block_kind_top_level
) || info
.num_exec_masks
<= 2);
501 /* create the loop exit phis if not trivial */
502 bool need_parallelcopy
= false;
503 for (unsigned k
= 0; k
< info
.num_exec_masks
; k
++) {
504 Temp same
= ctx
.info
[preds
[0]].exec
[k
].first
;
505 uint8_t type
= ctx
.info
[header_preds
[0]].exec
[k
].second
;
508 for (unsigned i
= 1; i
< preds
.size() && trivial
; i
++) {
509 if (ctx
.info
[preds
[i
]].exec
[k
].first
!= same
)
513 if (k
== info
.num_exec_masks
- 1u) {
514 bool all_liveout_exec
= true;
515 bool all_not_liveout_exec
= true;
516 for (unsigned pred
: preds
) {
517 all_liveout_exec
= all_liveout_exec
&& same
== ctx
.program
->blocks
[pred
].live_out_exec
;
518 all_not_liveout_exec
= all_not_liveout_exec
&& same
!= ctx
.program
->blocks
[pred
].live_out_exec
;
520 if (!all_liveout_exec
&& !all_not_liveout_exec
)
522 else if (all_not_liveout_exec
)
523 need_parallelcopy
= true;
525 need_parallelcopy
|= !trivial
;
529 ctx
.info
[idx
].exec
.emplace_back(same
, type
);
531 /* create phi for loop footer */
532 aco_ptr
<Pseudo_instruction
> phi
{create_instruction
<Pseudo_instruction
>(aco_opcode::p_linear_phi
, Format::PSEUDO
, preds
.size(), 1)};
533 phi
->definitions
[0] = bld
.def(bld
.lm
);
534 if (k
== info
.num_exec_masks
- 1u) {
535 phi
->definitions
[0].setFixed(exec
);
536 need_parallelcopy
= false;
538 for (unsigned i
= 0; i
< phi
->operands
.size(); i
++)
539 phi
->operands
[i
] = Operand(ctx
.info
[preds
[i
]].exec
[k
].first
);
540 ctx
.info
[idx
].exec
.emplace_back(bld
.insert(std::move(phi
)), type
);
543 assert(ctx
.info
[idx
].exec
.size() == info
.num_exec_masks
);
545 /* create a parallelcopy to move the live mask to exec */
547 while (block
->instructions
[i
]->opcode
!= aco_opcode::p_logical_start
) {
548 bld
.insert(std::move(block
->instructions
[i
]));
552 if (ctx
.handle_wqm
) {
553 if (block
->kind
& block_kind_top_level
&& ctx
.info
[idx
].exec
.size() == 2) {
554 if ((ctx
.info
[idx
].block_needs
| ctx
.info
[idx
].ever_again_needs
) == 0 ||
555 (ctx
.info
[idx
].block_needs
| ctx
.info
[idx
].ever_again_needs
) == Exact
) {
556 ctx
.info
[idx
].exec
.back().second
|= mask_type_global
;
557 transition_to_Exact(ctx
, bld
, idx
);
558 ctx
.handle_wqm
= false;
561 if (ctx
.info
[idx
].block_needs
== WQM
)
562 transition_to_WQM(ctx
, bld
, idx
);
563 else if (ctx
.info
[idx
].block_needs
== Exact
)
564 transition_to_Exact(ctx
, bld
, idx
);
567 assert(ctx
.info
[idx
].exec
.back().first
.size() == bld
.lm
.size());
568 if (need_parallelcopy
) {
569 /* only create this parallelcopy is needed, since the operand isn't
570 * fixed to exec which causes the spiller to miscalculate register demand */
571 /* TODO: Fix register_demand calculation for spilling on loop exits.
572 * The problem is only mitigated because the register demand could be
573 * higher if the exec phi doesn't get assigned to exec. */
574 ctx
.info
[idx
].exec
.back().first
= bld
.pseudo(aco_opcode::p_parallelcopy
, bld
.def(bld
.lm
, exec
),
575 ctx
.info
[idx
].exec
.back().first
);
582 if (preds
.size() == 1) {
583 ctx
.info
[idx
].exec
= ctx
.info
[preds
[0]].exec
;
585 assert(preds
.size() == 2);
586 /* if one of the predecessors ends in exact mask, we pop it from stack */
587 unsigned num_exec_masks
= std::min(ctx
.info
[preds
[0]].exec
.size(),
588 ctx
.info
[preds
[1]].exec
.size());
589 if (block
->kind
& block_kind_top_level
&& !(block
->kind
& block_kind_merge
))
590 num_exec_masks
= std::min(num_exec_masks
, 2u);
592 /* create phis for diverged exec masks */
593 for (unsigned i
= 0; i
< num_exec_masks
; i
++) {
594 bool in_exec
= i
== num_exec_masks
- 1 && !(block
->kind
& block_kind_merge
);
595 if (!in_exec
&& ctx
.info
[preds
[0]].exec
[i
].first
== ctx
.info
[preds
[1]].exec
[i
].first
) {
596 assert(ctx
.info
[preds
[0]].exec
[i
].second
== ctx
.info
[preds
[1]].exec
[i
].second
);
597 ctx
.info
[idx
].exec
.emplace_back(ctx
.info
[preds
[0]].exec
[i
]);
601 Temp phi
= bld
.pseudo(aco_opcode::p_linear_phi
, in_exec
? bld
.def(bld
.lm
, exec
) : bld
.def(bld
.lm
),
602 ctx
.info
[preds
[0]].exec
[i
].first
,
603 ctx
.info
[preds
[1]].exec
[i
].first
);
604 uint8_t mask_type
= ctx
.info
[preds
[0]].exec
[i
].second
& ctx
.info
[preds
[1]].exec
[i
].second
;
605 ctx
.info
[idx
].exec
.emplace_back(phi
, mask_type
);
610 while (block
->instructions
[i
]->opcode
== aco_opcode::p_phi
||
611 block
->instructions
[i
]->opcode
== aco_opcode::p_linear_phi
) {
612 bld
.insert(std::move(block
->instructions
[i
]));
616 if (block
->kind
& block_kind_merge
)
617 ctx
.info
[idx
].exec
.pop_back();
619 if (block
->kind
& block_kind_top_level
&& ctx
.info
[idx
].exec
.size() == 3) {
620 assert(ctx
.info
[idx
].exec
.back().second
== mask_type_exact
);
621 assert(block
->kind
& block_kind_merge
);
622 ctx
.info
[idx
].exec
.pop_back();
625 /* try to satisfy the block's needs */
626 if (ctx
.handle_wqm
) {
627 if (block
->kind
& block_kind_top_level
&& ctx
.info
[idx
].exec
.size() == 2) {
628 if ((ctx
.info
[idx
].block_needs
| ctx
.info
[idx
].ever_again_needs
) == 0 ||
629 (ctx
.info
[idx
].block_needs
| ctx
.info
[idx
].ever_again_needs
) == Exact
) {
630 ctx
.info
[idx
].exec
.back().second
|= mask_type_global
;
631 transition_to_Exact(ctx
, bld
, idx
);
632 ctx
.handle_wqm
= false;
635 if (ctx
.info
[idx
].block_needs
== WQM
)
636 transition_to_WQM(ctx
, bld
, idx
);
637 else if (ctx
.info
[idx
].block_needs
== Exact
)
638 transition_to_Exact(ctx
, bld
, idx
);
641 if (block
->kind
& block_kind_merge
) {
642 Temp restore
= ctx
.info
[idx
].exec
.back().first
;
643 assert(restore
.size() == bld
.lm
.size());
644 ctx
.info
[idx
].exec
.back().first
= bld
.pseudo(aco_opcode::p_parallelcopy
, bld
.def(bld
.lm
, exec
), restore
);
650 void lower_fs_buffer_store_smem(Builder
& bld
, bool need_check
, aco_ptr
<Instruction
>& instr
, Temp cur_exec
)
652 Operand offset
= instr
->operands
[1];
654 /* if exec is zero, then use UINT32_MAX as an offset and make this store a no-op */
655 Temp nonempty
= bld
.sopc(Builder::s_cmp_lg
, bld
.def(s1
, scc
), cur_exec
, Operand(0u));
657 if (offset
.isLiteral())
658 offset
= bld
.sop1(aco_opcode::s_mov_b32
, bld
.def(s1
), offset
);
660 offset
= bld
.sop2(aco_opcode::s_cselect_b32
, bld
.hint_m0(bld
.def(s1
)),
661 offset
, Operand(UINT32_MAX
), bld
.scc(nonempty
));
662 } else if (offset
.isConstant() && offset
.constantValue() > 0xFFFFF) {
663 offset
= bld
.sop1(aco_opcode::s_mov_b32
, bld
.hint_m0(bld
.def(s1
)), offset
);
665 if (!offset
.isConstant())
668 switch (instr
->operands
[2].size()) {
670 instr
->opcode
= aco_opcode::s_buffer_store_dword
;
673 instr
->opcode
= aco_opcode::s_buffer_store_dwordx2
;
676 instr
->opcode
= aco_opcode::s_buffer_store_dwordx4
;
679 unreachable("Invalid SMEM buffer store size");
681 instr
->operands
[1] = offset
;
682 /* as_uniform() needs to be done here so it's done in exact mode and helper
683 * lanes don't contribute. */
684 instr
->operands
[2] = Operand(bld
.as_uniform(instr
->operands
[2]));
687 void process_instructions(exec_ctx
& ctx
, Block
* block
,
688 std::vector
<aco_ptr
<Instruction
>>& instructions
,
692 if (ctx
.info
[block
->index
].exec
.back().second
& mask_type_wqm
)
695 assert(!ctx
.handle_wqm
|| ctx
.info
[block
->index
].exec
.back().second
& mask_type_exact
);
699 /* if the block doesn't need both, WQM and Exact, we can skip processing the instructions */
700 bool process
= (ctx
.handle_wqm
&&
701 (ctx
.info
[block
->index
].block_needs
& state
) !=
702 (ctx
.info
[block
->index
].block_needs
& (WQM
| Exact
))) ||
703 block
->kind
& block_kind_uses_discard_if
||
704 block
->kind
& block_kind_uses_demote
||
705 block
->kind
& block_kind_needs_lowering
;
707 std::vector
<aco_ptr
<Instruction
>>::iterator it
= std::next(block
->instructions
.begin(), idx
);
708 instructions
.insert(instructions
.end(),
709 std::move_iterator
<std::vector
<aco_ptr
<Instruction
>>::iterator
>(it
),
710 std::move_iterator
<std::vector
<aco_ptr
<Instruction
>>::iterator
>(block
->instructions
.end()));
714 Builder
bld(ctx
.program
, &instructions
);
716 for (; idx
< block
->instructions
.size(); idx
++) {
717 aco_ptr
<Instruction
> instr
= std::move(block
->instructions
[idx
]);
719 WQMState needs
= ctx
.handle_wqm
? ctx
.info
[block
->index
].instr_needs
[idx
] : Unspecified
;
721 if (instr
->opcode
== aco_opcode::p_discard_if
) {
722 if (ctx
.info
[block
->index
].block_needs
& Preserve_WQM
) {
723 assert(block
->kind
& block_kind_top_level
);
724 transition_to_WQM(ctx
, bld
, block
->index
);
725 ctx
.info
[block
->index
].exec
.back().second
&= ~mask_type_global
;
727 int num
= ctx
.info
[block
->index
].exec
.size();
729 Operand cond
= instr
->operands
[0];
730 for (int i
= num
- 1; i
>= 0; i
--) {
731 Instruction
*andn2
= bld
.sop2(Builder::s_andn2
, bld
.def(bld
.lm
), bld
.def(s1
, scc
),
732 ctx
.info
[block
->index
].exec
[i
].first
, cond
);
734 andn2
->operands
[0].setFixed(exec
);
735 andn2
->definitions
[0].setFixed(exec
);
738 instr
->opcode
= aco_opcode::p_exit_early_if
;
739 instr
->operands
[0] = bld
.scc(andn2
->definitions
[1].getTemp());
741 ctx
.info
[block
->index
].exec
[i
].first
= andn2
->definitions
[0].getTemp();
743 assert(!ctx
.handle_wqm
|| (ctx
.info
[block
->index
].exec
[0].second
& mask_type_wqm
) == 0);
745 } else if (needs
== WQM
&& state
!= WQM
) {
746 transition_to_WQM(ctx
, bld
, block
->index
);
748 } else if (needs
== Exact
&& state
!= Exact
) {
749 transition_to_Exact(ctx
, bld
, block
->index
);
753 if (instr
->opcode
== aco_opcode::p_is_helper
|| instr
->opcode
== aco_opcode::p_load_helper
) {
754 Definition dst
= instr
->definitions
[0];
755 assert(dst
.size() == bld
.lm
.size());
756 if (state
== Exact
) {
757 instr
.reset(create_instruction
<SOP1_instruction
>(bld
.w64or32(Builder::s_mov
), Format::SOP1
, 1, 1));
758 instr
->operands
[0] = Operand(0u);
759 instr
->definitions
[0] = dst
;
761 std::pair
<Temp
, uint8_t>& exact_mask
= ctx
.info
[block
->index
].exec
[0];
762 if (instr
->opcode
== aco_opcode::p_load_helper
&&
763 !(ctx
.info
[block
->index
].exec
[0].second
& mask_type_initial
)) {
764 /* find last initial exact mask */
765 for (int i
= block
->index
; i
>= 0; i
--) {
766 if (ctx
.program
->blocks
[i
].kind
& block_kind_top_level
&&
767 ctx
.info
[i
].exec
[0].second
& mask_type_initial
) {
768 exact_mask
= ctx
.info
[i
].exec
[0];
774 assert(instr
->opcode
== aco_opcode::p_is_helper
|| exact_mask
.second
& mask_type_initial
);
775 assert(exact_mask
.second
& mask_type_exact
);
777 instr
.reset(create_instruction
<SOP2_instruction
>(bld
.w64or32(Builder::s_andn2
), Format::SOP2
, 2, 2));
778 instr
->operands
[0] = Operand(ctx
.info
[block
->index
].exec
.back().first
); /* current exec */
779 instr
->operands
[1] = Operand(exact_mask
.first
);
780 instr
->definitions
[0] = dst
;
781 instr
->definitions
[1] = bld
.def(s1
, scc
);
783 } else if (instr
->opcode
== aco_opcode::p_demote_to_helper
) {
784 /* turn demote into discard_if with only exact masks */
785 assert((ctx
.info
[block
->index
].exec
[0].second
& (mask_type_exact
| mask_type_global
)) == (mask_type_exact
| mask_type_global
));
786 ctx
.info
[block
->index
].exec
[0].second
&= ~mask_type_initial
;
789 Temp cond
, exit_cond
;
790 if (instr
->operands
[0].isConstant()) {
791 assert(instr
->operands
[0].constantValue() == -1u);
792 /* transition to exact and set exec to zero */
793 Temp old_exec
= ctx
.info
[block
->index
].exec
.back().first
;
794 Temp new_exec
= bld
.tmp(bld
.lm
);
795 exit_cond
= bld
.tmp(s1
);
796 cond
= bld
.sop1(Builder::s_and_saveexec
, bld
.def(bld
.lm
), bld
.scc(Definition(exit_cond
)),
797 bld
.exec(Definition(new_exec
)), Operand(0u), bld
.exec(old_exec
));
799 num
= ctx
.info
[block
->index
].exec
.size() - 2;
800 if (ctx
.info
[block
->index
].exec
.back().second
& mask_type_exact
) {
801 ctx
.info
[block
->index
].exec
.back().first
= new_exec
;
803 ctx
.info
[block
->index
].exec
.back().first
= cond
;
804 ctx
.info
[block
->index
].exec
.emplace_back(new_exec
, mask_type_exact
);
807 /* demote_if: transition to exact */
808 transition_to_Exact(ctx
, bld
, block
->index
);
809 assert(instr
->operands
[0].isTemp());
810 cond
= instr
->operands
[0].getTemp();
811 num
= ctx
.info
[block
->index
].exec
.size() - 1;
814 for (int i
= num
; i
>= 0; i
--) {
815 if (ctx
.info
[block
->index
].exec
[i
].second
& mask_type_exact
) {
816 Instruction
*andn2
= bld
.sop2(Builder::s_andn2
, bld
.def(bld
.lm
), bld
.def(s1
, scc
),
817 ctx
.info
[block
->index
].exec
[i
].first
, cond
);
818 if (i
== (int)ctx
.info
[block
->index
].exec
.size() - 1) {
819 andn2
->operands
[0].setFixed(exec
);
820 andn2
->definitions
[0].setFixed(exec
);
823 ctx
.info
[block
->index
].exec
[i
].first
= andn2
->definitions
[0].getTemp();
824 exit_cond
= andn2
->definitions
[1].getTemp();
829 instr
->opcode
= aco_opcode::p_exit_early_if
;
830 instr
->operands
[0] = bld
.scc(exit_cond
);
833 } else if (instr
->opcode
== aco_opcode::p_fs_buffer_store_smem
) {
834 bool need_check
= ctx
.info
[block
->index
].exec
.size() != 1 &&
835 !(ctx
.info
[block
->index
].exec
[ctx
.info
[block
->index
].exec
.size() - 2].second
& Exact
);
836 lower_fs_buffer_store_smem(bld
, need_check
, instr
, ctx
.info
[block
->index
].exec
.back().first
);
839 bld
.insert(std::move(instr
));
843 void add_branch_code(exec_ctx
& ctx
, Block
* block
)
845 unsigned idx
= block
->index
;
846 Builder
bld(ctx
.program
, block
);
848 if (idx
== ctx
.program
->blocks
.size() - 1)
851 /* try to disable wqm handling */
852 if (ctx
.handle_wqm
&& block
->kind
& block_kind_top_level
) {
853 if (ctx
.info
[idx
].exec
.size() == 3) {
854 assert(ctx
.info
[idx
].exec
[1].second
== mask_type_wqm
);
855 ctx
.info
[idx
].exec
.pop_back();
857 assert(ctx
.info
[idx
].exec
.size() <= 2);
859 if (ctx
.info
[idx
].ever_again_needs
== 0 ||
860 ctx
.info
[idx
].ever_again_needs
== Exact
) {
861 /* transition to Exact */
862 aco_ptr
<Instruction
> branch
= std::move(block
->instructions
.back());
863 block
->instructions
.pop_back();
864 ctx
.info
[idx
].exec
.back().second
|= mask_type_global
;
865 transition_to_Exact(ctx
, bld
, idx
);
866 bld
.insert(std::move(branch
));
867 ctx
.handle_wqm
= false;
869 } else if (ctx
.info
[idx
].block_needs
& Preserve_WQM
) {
870 /* transition to WQM and remove global flag */
871 aco_ptr
<Instruction
> branch
= std::move(block
->instructions
.back());
872 block
->instructions
.pop_back();
873 transition_to_WQM(ctx
, bld
, idx
);
874 ctx
.info
[idx
].exec
.back().second
&= ~mask_type_global
;
875 bld
.insert(std::move(branch
));
879 if (block
->kind
& block_kind_loop_preheader
) {
880 /* collect information about the succeeding loop */
881 bool has_divergent_break
= false;
882 bool has_divergent_continue
= false;
883 bool has_discard
= false;
885 unsigned loop_nest_depth
= ctx
.program
->blocks
[idx
+ 1].loop_nest_depth
;
887 for (unsigned i
= idx
+ 1; ctx
.program
->blocks
[i
].loop_nest_depth
>= loop_nest_depth
; i
++) {
888 Block
& loop_block
= ctx
.program
->blocks
[i
];
889 needs
|= ctx
.info
[i
].block_needs
;
891 if (loop_block
.kind
& block_kind_uses_discard_if
||
892 loop_block
.kind
& block_kind_discard
||
893 loop_block
.kind
& block_kind_uses_demote
)
895 if (loop_block
.loop_nest_depth
!= loop_nest_depth
)
898 if (loop_block
.kind
& block_kind_uniform
)
900 else if (loop_block
.kind
& block_kind_break
)
901 has_divergent_break
= true;
902 else if (loop_block
.kind
& block_kind_continue
)
903 has_divergent_continue
= true;
906 if (ctx
.handle_wqm
) {
908 aco_ptr
<Instruction
> branch
= std::move(block
->instructions
.back());
909 block
->instructions
.pop_back();
910 transition_to_WQM(ctx
, bld
, idx
);
911 bld
.insert(std::move(branch
));
913 aco_ptr
<Instruction
> branch
= std::move(block
->instructions
.back());
914 block
->instructions
.pop_back();
915 transition_to_Exact(ctx
, bld
, idx
);
916 bld
.insert(std::move(branch
));
920 unsigned num_exec_masks
= ctx
.info
[idx
].exec
.size();
921 if (block
->kind
& block_kind_top_level
)
922 num_exec_masks
= std::min(num_exec_masks
, 2u);
924 ctx
.loop
.emplace_back(&ctx
.program
->blocks
[block
->linear_succs
[0]],
928 has_divergent_continue
,
932 /* For normal breaks, this is the exec mask. For discard+break, it's the
933 * old exec mask before it was zero'd.
935 Operand break_cond
= bld
.exec(ctx
.info
[idx
].exec
.back().first
);
937 if (block
->kind
& block_kind_discard
) {
939 assert(block
->instructions
.back()->format
== Format::PSEUDO_BRANCH
);
940 aco_ptr
<Instruction
> branch
= std::move(block
->instructions
.back());
941 block
->instructions
.pop_back();
943 /* create a discard_if() instruction with the exec mask as condition */
945 if (ctx
.loop
.size()) {
946 /* if we're in a loop, only discard from the outer exec masks */
947 num
= ctx
.loop
.back().num_exec_masks
;
949 num
= ctx
.info
[idx
].exec
.size() - 1;
952 Temp old_exec
= ctx
.info
[idx
].exec
.back().first
;
953 Temp new_exec
= bld
.tmp(bld
.lm
);
954 Temp cond
= bld
.sop1(Builder::s_and_saveexec
, bld
.def(bld
.lm
), bld
.def(s1
, scc
),
955 bld
.exec(Definition(new_exec
)), Operand(0u), bld
.exec(old_exec
));
956 ctx
.info
[idx
].exec
.back().first
= new_exec
;
958 for (int i
= num
- 1; i
>= 0; i
--) {
959 Instruction
*andn2
= bld
.sop2(Builder::s_andn2
, bld
.def(bld
.lm
), bld
.def(s1
, scc
),
960 ctx
.info
[block
->index
].exec
[i
].first
, cond
);
961 if (i
== (int)ctx
.info
[idx
].exec
.size() - 1)
962 andn2
->definitions
[0].setFixed(exec
);
964 bld
.pseudo(aco_opcode::p_exit_early_if
, bld
.scc(andn2
->definitions
[1].getTemp()));
965 ctx
.info
[block
->index
].exec
[i
].first
= andn2
->definitions
[0].getTemp();
967 assert(!ctx
.handle_wqm
|| (ctx
.info
[block
->index
].exec
[0].second
& mask_type_wqm
) == 0);
969 break_cond
= Operand(cond
);
970 bld
.insert(std::move(branch
));
971 /* no return here as it can be followed by a divergent break */
974 if (block
->kind
& block_kind_continue_or_break
) {
975 assert(ctx
.program
->blocks
[ctx
.program
->blocks
[block
->linear_succs
[1]].linear_succs
[0]].kind
& block_kind_loop_header
);
976 assert(ctx
.program
->blocks
[ctx
.program
->blocks
[block
->linear_succs
[0]].linear_succs
[0]].kind
& block_kind_loop_exit
);
977 assert(block
->instructions
.back()->opcode
== aco_opcode::p_branch
);
978 block
->instructions
.pop_back();
980 bool need_parallelcopy
= false;
981 while (!(ctx
.info
[idx
].exec
.back().second
& mask_type_loop
)) {
982 ctx
.info
[idx
].exec
.pop_back();
983 need_parallelcopy
= true;
986 if (need_parallelcopy
)
987 ctx
.info
[idx
].exec
.back().first
= bld
.pseudo(aco_opcode::p_parallelcopy
, bld
.def(bld
.lm
, exec
), ctx
.info
[idx
].exec
.back().first
);
988 bld
.branch(aco_opcode::p_cbranch_nz
, bld
.hint_vcc(bld
.def(s2
)), bld
.exec(ctx
.info
[idx
].exec
.back().first
), block
->linear_succs
[1], block
->linear_succs
[0]);
992 if (block
->kind
& block_kind_uniform
) {
993 Pseudo_branch_instruction
* branch
= static_cast<Pseudo_branch_instruction
*>(block
->instructions
.back().get());
994 if (branch
->opcode
== aco_opcode::p_branch
) {
995 branch
->target
[0] = block
->linear_succs
[0];
997 branch
->target
[0] = block
->linear_succs
[1];
998 branch
->target
[1] = block
->linear_succs
[0];
1003 if (block
->kind
& block_kind_branch
) {
1005 if (ctx
.handle_wqm
&&
1006 ctx
.info
[idx
].exec
.size() >= 2 &&
1007 ctx
.info
[idx
].exec
.back().second
== mask_type_exact
&&
1008 !(ctx
.info
[idx
].block_needs
& Exact_Branch
) &&
1009 ctx
.info
[idx
].exec
[ctx
.info
[idx
].exec
.size() - 2].second
& mask_type_wqm
) {
1010 /* return to wqm before branching */
1011 ctx
.info
[idx
].exec
.pop_back();
1014 // orig = s_and_saveexec_b64
1015 assert(block
->linear_succs
.size() == 2);
1016 assert(block
->instructions
.back()->opcode
== aco_opcode::p_cbranch_z
);
1017 Temp cond
= block
->instructions
.back()->operands
[0].getTemp();
1018 block
->instructions
.pop_back();
1020 if (ctx
.info
[idx
].block_needs
& Exact_Branch
)
1021 transition_to_Exact(ctx
, bld
, idx
);
1023 Temp current_exec
= ctx
.info
[idx
].exec
.back().first
;
1024 uint8_t mask_type
= ctx
.info
[idx
].exec
.back().second
& (mask_type_wqm
| mask_type_exact
);
1026 Temp then_mask
= bld
.tmp(bld
.lm
);
1027 Temp old_exec
= bld
.sop1(Builder::s_and_saveexec
, bld
.def(bld
.lm
), bld
.def(s1
, scc
),
1028 bld
.exec(Definition(then_mask
)), cond
, bld
.exec(current_exec
));
1030 ctx
.info
[idx
].exec
.back().first
= old_exec
;
1032 /* add next current exec to the stack */
1033 ctx
.info
[idx
].exec
.emplace_back(then_mask
, mask_type
);
1035 bld
.branch(aco_opcode::p_cbranch_z
, bld
.hint_vcc(bld
.def(s2
)), bld
.exec(then_mask
), block
->linear_succs
[1], block
->linear_succs
[0]);
1039 if (block
->kind
& block_kind_invert
) {
1040 // exec = s_andn2_b64 (original_exec, exec)
1041 assert(block
->instructions
.back()->opcode
== aco_opcode::p_cbranch_nz
);
1042 block
->instructions
.pop_back();
1043 Temp then_mask
= ctx
.info
[idx
].exec
.back().first
;
1044 uint8_t mask_type
= ctx
.info
[idx
].exec
.back().second
;
1045 ctx
.info
[idx
].exec
.pop_back();
1046 Temp orig_exec
= ctx
.info
[idx
].exec
.back().first
;
1047 Temp else_mask
= bld
.sop2(Builder::s_andn2
, bld
.def(bld
.lm
, exec
),
1048 bld
.def(s1
, scc
), orig_exec
, bld
.exec(then_mask
));
1050 /* add next current exec to the stack */
1051 ctx
.info
[idx
].exec
.emplace_back(else_mask
, mask_type
);
1053 bld
.branch(aco_opcode::p_cbranch_z
, bld
.hint_vcc(bld
.def(s2
)), bld
.exec(else_mask
), block
->linear_succs
[1], block
->linear_succs
[0]);
1057 if (block
->kind
& block_kind_break
) {
1058 // loop_mask = s_andn2_b64 (loop_mask, exec)
1059 assert(block
->instructions
.back()->opcode
== aco_opcode::p_branch
);
1060 block
->instructions
.pop_back();
1063 for (int exec_idx
= ctx
.info
[idx
].exec
.size() - 2; exec_idx
>= 0; exec_idx
--) {
1065 Temp exec_mask
= ctx
.info
[idx
].exec
[exec_idx
].first
;
1066 exec_mask
= bld
.sop2(Builder::s_andn2
, bld
.def(bld
.lm
), bld
.scc(Definition(cond
)),
1067 exec_mask
, break_cond
);
1068 ctx
.info
[idx
].exec
[exec_idx
].first
= exec_mask
;
1069 if (ctx
.info
[idx
].exec
[exec_idx
].second
& mask_type_loop
)
1073 /* check if the successor is the merge block, otherwise set exec to 0 */
1074 // TODO: this could be done better by directly branching to the merge block
1075 unsigned succ_idx
= ctx
.program
->blocks
[block
->linear_succs
[1]].linear_succs
[0];
1076 Block
& succ
= ctx
.program
->blocks
[succ_idx
];
1077 if (!(succ
.kind
& block_kind_invert
|| succ
.kind
& block_kind_merge
)) {
1078 ctx
.info
[idx
].exec
.back().first
= bld
.sop1(Builder::s_mov
, bld
.def(bld
.lm
, exec
), Operand(0u));
1081 bld
.branch(aco_opcode::p_cbranch_nz
, bld
.hint_vcc(bld
.def(s2
)), bld
.scc(cond
), block
->linear_succs
[1], block
->linear_succs
[0]);
1085 if (block
->kind
& block_kind_continue
) {
1086 assert(block
->instructions
.back()->opcode
== aco_opcode::p_branch
);
1087 block
->instructions
.pop_back();
1089 Temp current_exec
= ctx
.info
[idx
].exec
.back().first
;
1091 for (int exec_idx
= ctx
.info
[idx
].exec
.size() - 2; exec_idx
>= 0; exec_idx
--) {
1092 if (ctx
.info
[idx
].exec
[exec_idx
].second
& mask_type_loop
)
1095 Temp exec_mask
= ctx
.info
[idx
].exec
[exec_idx
].first
;
1096 exec_mask
= bld
.sop2(Builder::s_andn2
, bld
.def(bld
.lm
), bld
.scc(Definition(cond
)),
1097 exec_mask
, bld
.exec(current_exec
));
1098 ctx
.info
[idx
].exec
[exec_idx
].first
= exec_mask
;
1100 assert(cond
!= Temp());
1102 /* check if the successor is the merge block, otherwise set exec to 0 */
1103 // TODO: this could be done better by directly branching to the merge block
1104 unsigned succ_idx
= ctx
.program
->blocks
[block
->linear_succs
[1]].linear_succs
[0];
1105 Block
& succ
= ctx
.program
->blocks
[succ_idx
];
1106 if (!(succ
.kind
& block_kind_invert
|| succ
.kind
& block_kind_merge
)) {
1107 ctx
.info
[idx
].exec
.back().first
= bld
.sop1(Builder::s_mov
, bld
.def(bld
.lm
, exec
), Operand(0u));
1110 bld
.branch(aco_opcode::p_cbranch_nz
, bld
.hint_vcc(bld
.def(s2
)), bld
.scc(cond
), block
->linear_succs
[1], block
->linear_succs
[0]);
/* Run the full exec-mask lowering pipeline over a single block:
 * 1) add_coupling_code emits the mask setup/phis at the block start and
 *    returns the index of the first original instruction to process,
 * 2) process_instructions rewrites the remaining instructions under the
 *    current mask state,
 * 3) add_branch_code lowers the terminating pseudo branch,
 * 4) the innermost exec mask is recorded as the block's live-out exec.
 */
void process_block(exec_ctx& ctx, Block* block)
{
   std::vector<aco_ptr<Instruction>> instructions;
   instructions.reserve(block->instructions.size());

   unsigned idx = add_coupling_code(ctx, block, instructions);

   /* the final block may only carry the global (and one wqm) mask */
   assert(block->index != ctx.program->blocks.size() - 1 ||
          ctx.info[block->index].exec.size() <= 2);

   process_instructions(ctx, block, instructions, idx);

   block->instructions = std::move(instructions);

   add_branch_code(ctx, block);

   block->live_out_exec = ctx.info[block->index].exec.back().first;
}
1134 } /* end namespace */
/* Pass entry point: insert explicit exec-mask handling into `program`.
 * WQM propagation (calculate_wqm_needs) is only required when the program
 * mixes WQM and exact instructions; otherwise every block is processed with
 * the default mask state.
 *
 * NOTE(review): the extracted source ends at the process_block call; the
 * closing of the loop/function is restored here — verify against the
 * original file.
 */
void insert_exec_mask(Program *program)
{
   exec_ctx ctx(program);

   if (program->needs_wqm && program->needs_exact)
      calculate_wqm_needs(ctx);

   for (Block& block : program->blocks)
      process_block(ctx, &block);
}