/*
 * Copyright © 2018 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
#include <cstdint>
#include <cstring>
#include <map>
#include <unordered_map>
#include <vector>
/*
 * Implements the algorithm for dominator-tree value numbering
 * from "Value Numbering" by Briggs, Cooper, and Simpson.
 */
/* One mixing round of the 32-bit Murmur3 hash: folds the 32-bit block `k`
 * into the running hash state `h` and returns the new state.
 * The source was truncated — the two multiply constants, the xor step and
 * the return statement are restored from the Murmur3 reference algorithm. */
uint32_t murmur_32_scramble(uint32_t h, uint32_t k) {
   k *= 0xcc9e2d51;
   k = (k << 15) | (k >> 17); /* rotl(k, 15) */
   h ^= k * 0x1b873593;
   h = (h << 13) | (h >> 19); /* rotl(h, 13) */
   h = h * 5 + 0xe6546b64;
   return h;
}
48 uint32_t hash_murmur_32(Instruction
* instr
)
50 uint32_t hash
= uint32_t(instr
->format
) << 16 | uint32_t(instr
->opcode
);
52 for (const Operand
& op
: instr
->operands
)
53 hash
= murmur_32_scramble(hash
, op
.constantValue());
55 /* skip format, opcode and pass_flags */
56 for (unsigned i
= 2; i
< (sizeof(T
) >> 2); i
++) {
58 /* Accesses it though a byte array, so doesn't violate the strict aliasing rule */
59 memcpy(&u
, reinterpret_cast<uint8_t *>(instr
) + i
* 4, 4);
60 hash
= murmur_32_scramble(hash
, u
);
64 uint32_t len
= instr
->operands
.size() + instr
->definitions
.size() + sizeof(T
);
75 /* This hash function uses the Murmur3 algorithm written by Austin Appleby
76 * https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp
78 * In order to calculate the expression set, only the right-hand-side of an
79 * instruction is used for the hash, i.e. everything except the definitions.
81 std::size_t operator()(Instruction
* instr
) const
84 return hash_murmur_32
<VOP3A_instruction
>(instr
);
87 return hash_murmur_32
<DPP_instruction
>(instr
);
90 return hash_murmur_32
<SDWA_instruction
>(instr
);
92 switch (instr
->format
) {
94 return hash_murmur_32
<SMEM_instruction
>(instr
);
96 return hash_murmur_32
<Interp_instruction
>(instr
);
98 return hash_murmur_32
<DS_instruction
>(instr
);
100 return hash_murmur_32
<SOPP_instruction
>(instr
);
102 return hash_murmur_32
<SOPK_instruction
>(instr
);
104 return hash_murmur_32
<Export_instruction
>(instr
);
106 return hash_murmur_32
<MUBUF_instruction
>(instr
);
108 return hash_murmur_32
<MIMG_instruction
>(instr
);
110 return hash_murmur_32
<MTBUF_instruction
>(instr
);
112 return hash_murmur_32
<FLAT_instruction
>(instr
);
113 case Format::PSEUDO_BRANCH
:
114 return hash_murmur_32
<Pseudo_branch_instruction
>(instr
);
115 case Format::PSEUDO_REDUCTION
:
116 return hash_murmur_32
<Pseudo_reduction_instruction
>(instr
);
118 return hash_murmur_32
<Instruction
>(instr
);
124 bool operator()(Instruction
* a
, Instruction
* b
) const
126 if (a
->format
!= b
->format
)
128 if (a
->opcode
!= b
->opcode
)
130 if (a
->operands
.size() != b
->operands
.size() || a
->definitions
.size() != b
->definitions
.size())
131 return false; /* possible with pseudo-instructions */
132 for (unsigned i
= 0; i
< a
->operands
.size(); i
++) {
133 if (a
->operands
[i
].isConstant()) {
134 if (!b
->operands
[i
].isConstant())
136 if (a
->operands
[i
].constantValue() != b
->operands
[i
].constantValue())
139 else if (a
->operands
[i
].isTemp()) {
140 if (!b
->operands
[i
].isTemp())
142 if (a
->operands
[i
].tempId() != b
->operands
[i
].tempId())
145 else if (a
->operands
[i
].isUndefined() ^ b
->operands
[i
].isUndefined())
147 if (a
->operands
[i
].isFixed()) {
148 if (!b
->operands
[i
].isFixed())
150 if (a
->operands
[i
].physReg() != b
->operands
[i
].physReg())
152 if (a
->operands
[i
].physReg() == exec
&& a
->pass_flags
!= b
->pass_flags
)
156 for (unsigned i
= 0; i
< a
->definitions
.size(); i
++) {
157 if (a
->definitions
[i
].isTemp()) {
158 if (!b
->definitions
[i
].isTemp())
160 if (a
->definitions
[i
].regClass() != b
->definitions
[i
].regClass())
163 if (a
->definitions
[i
].isFixed()) {
164 if (!b
->definitions
[i
].isFixed())
166 if (a
->definitions
[i
].physReg() != b
->definitions
[i
].physReg())
168 if (a
->definitions
[i
].physReg() == exec
)
173 if (a
->opcode
== aco_opcode::v_readfirstlane_b32
)
174 return a
->pass_flags
== b
->pass_flags
;
176 /* The results of VOPC depend on the exec mask if used for subgroup operations. */
177 if ((uint32_t) a
->format
& (uint32_t) Format::VOPC
&& a
->pass_flags
!= b
->pass_flags
)
181 VOP3A_instruction
* a3
= static_cast<VOP3A_instruction
*>(a
);
182 VOP3A_instruction
* b3
= static_cast<VOP3A_instruction
*>(b
);
183 for (unsigned i
= 0; i
< 3; i
++) {
184 if (a3
->abs
[i
] != b3
->abs
[i
] ||
185 a3
->neg
[i
] != b3
->neg
[i
])
188 return a3
->clamp
== b3
->clamp
&&
189 a3
->omod
== b3
->omod
&&
190 a3
->opsel
== b3
->opsel
;
193 DPP_instruction
* aDPP
= static_cast<DPP_instruction
*>(a
);
194 DPP_instruction
* bDPP
= static_cast<DPP_instruction
*>(b
);
195 return aDPP
->pass_flags
== bDPP
->pass_flags
&&
196 aDPP
->dpp_ctrl
== bDPP
->dpp_ctrl
&&
197 aDPP
->bank_mask
== bDPP
->bank_mask
&&
198 aDPP
->row_mask
== bDPP
->row_mask
&&
199 aDPP
->bound_ctrl
== bDPP
->bound_ctrl
&&
200 aDPP
->abs
[0] == bDPP
->abs
[0] &&
201 aDPP
->abs
[1] == bDPP
->abs
[1] &&
202 aDPP
->neg
[0] == bDPP
->neg
[0] &&
203 aDPP
->neg
[1] == bDPP
->neg
[1];
206 SDWA_instruction
* aSDWA
= static_cast<SDWA_instruction
*>(a
);
207 SDWA_instruction
* bSDWA
= static_cast<SDWA_instruction
*>(b
);
208 return aSDWA
->sel
[0] == bSDWA
->sel
[0] &&
209 aSDWA
->sel
[1] == bSDWA
->sel
[1] &&
210 aSDWA
->dst_sel
== bSDWA
->dst_sel
&&
211 aSDWA
->abs
[0] == bSDWA
->abs
[0] &&
212 aSDWA
->abs
[1] == bSDWA
->abs
[1] &&
213 aSDWA
->neg
[0] == bSDWA
->neg
[0] &&
214 aSDWA
->neg
[1] == bSDWA
->neg
[1] &&
215 aSDWA
->dst_preserve
== bSDWA
->dst_preserve
&&
216 aSDWA
->clamp
== bSDWA
->clamp
&&
217 aSDWA
->omod
== bSDWA
->omod
;
222 SOPK_instruction
* aK
= static_cast<SOPK_instruction
*>(a
);
223 SOPK_instruction
* bK
= static_cast<SOPK_instruction
*>(b
);
224 return aK
->imm
== bK
->imm
;
227 if (!a
->operands
.empty() && a
->operands
[0].bytes() == 16)
229 SMEM_instruction
* aS
= static_cast<SMEM_instruction
*>(a
);
230 SMEM_instruction
* bS
= static_cast<SMEM_instruction
*>(b
);
231 /* isel shouldn't be creating situations where this assertion fails */
232 assert(aS
->prevent_overflow
== bS
->prevent_overflow
);
233 return aS
->sync
.can_reorder() && bS
->sync
.can_reorder() &&
234 aS
->sync
== bS
->sync
&& aS
->glc
== bS
->glc
&& aS
->dlc
== bS
->dlc
&&
235 aS
->nv
== bS
->nv
&& aS
->disable_wqm
== bS
->disable_wqm
&&
236 aS
->prevent_overflow
== bS
->prevent_overflow
;
238 case Format::VINTRP
: {
239 Interp_instruction
* aI
= static_cast<Interp_instruction
*>(a
);
240 Interp_instruction
* bI
= static_cast<Interp_instruction
*>(b
);
241 if (aI
->attribute
!= bI
->attribute
)
243 if (aI
->component
!= bI
->component
)
247 case Format::PSEUDO_REDUCTION
: {
248 Pseudo_reduction_instruction
*aR
= static_cast<Pseudo_reduction_instruction
*>(a
);
249 Pseudo_reduction_instruction
*bR
= static_cast<Pseudo_reduction_instruction
*>(b
);
250 return aR
->pass_flags
== bR
->pass_flags
&&
251 aR
->reduce_op
== bR
->reduce_op
&&
252 aR
->cluster_size
== bR
->cluster_size
;
254 case Format::MTBUF
: {
255 MTBUF_instruction
* aM
= static_cast<MTBUF_instruction
*>(a
);
256 MTBUF_instruction
* bM
= static_cast<MTBUF_instruction
*>(b
);
257 return aM
->sync
.can_reorder() && bM
->sync
.can_reorder() &&
258 aM
->sync
== bM
->sync
&&
259 aM
->dfmt
== bM
->dfmt
&&
260 aM
->nfmt
== bM
->nfmt
&&
261 aM
->offset
== bM
->offset
&&
262 aM
->offen
== bM
->offen
&&
263 aM
->idxen
== bM
->idxen
&&
264 aM
->glc
== bM
->glc
&&
265 aM
->dlc
== bM
->dlc
&&
266 aM
->slc
== bM
->slc
&&
267 aM
->tfe
== bM
->tfe
&&
268 aM
->disable_wqm
== bM
->disable_wqm
;
270 case Format::MUBUF
: {
271 MUBUF_instruction
* aM
= static_cast<MUBUF_instruction
*>(a
);
272 MUBUF_instruction
* bM
= static_cast<MUBUF_instruction
*>(b
);
273 return aM
->sync
.can_reorder() && bM
->sync
.can_reorder() &&
274 aM
->sync
== bM
->sync
&&
275 aM
->offset
== bM
->offset
&&
276 aM
->offen
== bM
->offen
&&
277 aM
->idxen
== bM
->idxen
&&
278 aM
->glc
== bM
->glc
&&
279 aM
->dlc
== bM
->dlc
&&
280 aM
->slc
== bM
->slc
&&
281 aM
->tfe
== bM
->tfe
&&
282 aM
->lds
== bM
->lds
&&
283 aM
->disable_wqm
== bM
->disable_wqm
;
285 /* we want to optimize these in NIR and don't hassle with load-store dependencies */
288 case Format::SCRATCH
:
291 case Format::PSEUDO_BRANCH
:
292 case Format::PSEUDO_BARRIER
:
295 if (a
->opcode
!= aco_opcode::ds_bpermute_b32
&&
296 a
->opcode
!= aco_opcode::ds_permute_b32
&&
297 a
->opcode
!= aco_opcode::ds_swizzle_b32
)
299 DS_instruction
* aD
= static_cast<DS_instruction
*>(a
);
300 DS_instruction
* bD
= static_cast<DS_instruction
*>(b
);
301 return aD
->sync
.can_reorder() && bD
->sync
.can_reorder() &&
302 aD
->sync
== bD
->sync
&&
303 aD
->pass_flags
== bD
->pass_flags
&&
304 aD
->gds
== bD
->gds
&&
305 aD
->offset0
== bD
->offset0
&&
306 aD
->offset1
== bD
->offset1
;
309 MIMG_instruction
* aM
= static_cast<MIMG_instruction
*>(a
);
310 MIMG_instruction
* bM
= static_cast<MIMG_instruction
*>(b
);
311 return aM
->sync
.can_reorder() && bM
->sync
.can_reorder() &&
312 aM
->sync
== bM
->sync
&&
313 aM
->dmask
== bM
->dmask
&&
314 aM
->unrm
== bM
->unrm
&&
315 aM
->glc
== bM
->glc
&&
316 aM
->slc
== bM
->slc
&&
317 aM
->tfe
== bM
->tfe
&&
319 aM
->lwe
== bM
->lwe
&&
320 aM
->r128
== bM
->r128
&&
321 aM
->a16
== bM
->a16
&&
322 aM
->d16
== bM
->d16
&&
323 aM
->disable_wqm
== bM
->disable_wqm
;
331 using expr_set
= std::unordered_map
<Instruction
*, uint32_t, InstrHash
, InstrPred
>;
335 expr_set expr_values
;
336 std::map
<uint32_t, Temp
> renames
;
338 /* The exec id should be the same on the same level of control flow depth.
339 * Together with the check for dominator relations, it is safe to assume
340 * that the same exec_id also means the same execution mask.
341 * Discards increment the exec_id, so that it won't return to the previous value.
343 uint32_t exec_id
= 1;
345 vn_ctx(Program
* program
) : program(program
) {
346 static_assert(sizeof(Temp
) == 4, "Temp must fit in 32bits");
348 for (Block
& block
: program
->blocks
)
349 size
+= block
.instructions
.size();
350 expr_values
.reserve(size
);
355 /* dominates() returns true if the parent block dominates the child block and
356 * if the parent block is part of the same loop or has a smaller loop nest depth.
358 bool dominates(vn_ctx
& ctx
, uint32_t parent
, uint32_t child
)
360 unsigned parent_loop_nest_depth
= ctx
.program
->blocks
[parent
].loop_nest_depth
;
361 while (parent
< child
&& parent_loop_nest_depth
<= ctx
.program
->blocks
[child
].loop_nest_depth
)
362 child
= ctx
.program
->blocks
[child
].logical_idom
;
364 return parent
== child
;
367 void process_block(vn_ctx
& ctx
, Block
& block
)
369 std::vector
<aco_ptr
<Instruction
>> new_instructions
;
370 new_instructions
.reserve(block
.instructions
.size());
372 for (aco_ptr
<Instruction
>& instr
: block
.instructions
) {
373 /* first, rename operands */
374 for (Operand
& op
: instr
->operands
) {
377 auto it
= ctx
.renames
.find(op
.tempId());
378 if (it
!= ctx
.renames
.end())
379 op
.setTemp(it
->second
);
382 if (instr
->opcode
== aco_opcode::p_discard_if
||
383 instr
->opcode
== aco_opcode::p_demote_to_helper
)
386 if (instr
->definitions
.empty() || instr
->opcode
== aco_opcode::p_phi
|| instr
->opcode
== aco_opcode::p_linear_phi
) {
387 new_instructions
.emplace_back(std::move(instr
));
391 /* simple copy-propagation through renaming */
392 if ((instr
->opcode
== aco_opcode::s_mov_b32
|| instr
->opcode
== aco_opcode::s_mov_b64
|| instr
->opcode
== aco_opcode::v_mov_b32
) &&
393 !instr
->definitions
[0].isFixed() && instr
->operands
[0].isTemp() && instr
->operands
[0].regClass() == instr
->definitions
[0].regClass() &&
394 !instr
->isDPP() && !((int)instr
->format
& (int)Format::SDWA
)) {
395 ctx
.renames
[instr
->definitions
[0].tempId()] = instr
->operands
[0].getTemp();
398 instr
->pass_flags
= ctx
.exec_id
;
399 std::pair
<expr_set::iterator
, bool> res
= ctx
.expr_values
.emplace(instr
.get(), block
.index
);
401 /* if there was already an expression with the same value number */
403 Instruction
* orig_instr
= res
.first
->first
;
404 assert(instr
->definitions
.size() == orig_instr
->definitions
.size());
405 /* check if the original instruction dominates the current one */
406 if (dominates(ctx
, res
.first
->second
, block
.index
) &&
407 ctx
.program
->blocks
[res
.first
->second
].fp_mode
.canReplace(block
.fp_mode
)) {
408 for (unsigned i
= 0; i
< instr
->definitions
.size(); i
++) {
409 assert(instr
->definitions
[i
].regClass() == orig_instr
->definitions
[i
].regClass());
410 assert(instr
->definitions
[i
].isTemp());
411 ctx
.renames
[instr
->definitions
[i
].tempId()] = orig_instr
->definitions
[i
].getTemp();
412 if (instr
->definitions
[i
].isPrecise())
413 orig_instr
->definitions
[i
].setPrecise(true);
414 /* SPIR_V spec says that an instruction marked with NUW wrapping
415 * around is undefined behaviour, so we can break additions in
418 if (instr
->definitions
[i
].isNUW())
419 orig_instr
->definitions
[i
].setNUW(true);
422 ctx
.expr_values
.erase(res
.first
);
423 ctx
.expr_values
.emplace(instr
.get(), block
.index
);
424 new_instructions
.emplace_back(std::move(instr
));
427 new_instructions
.emplace_back(std::move(instr
));
431 block
.instructions
= std::move(new_instructions
);
434 void rename_phi_operands(Block
& block
, std::map
<uint32_t, Temp
>& renames
)
436 for (aco_ptr
<Instruction
>& phi
: block
.instructions
) {
437 if (phi
->opcode
!= aco_opcode::p_phi
&& phi
->opcode
!= aco_opcode::p_linear_phi
)
440 for (Operand
& op
: phi
->operands
) {
443 auto it
= renames
.find(op
.tempId());
444 if (it
!= renames
.end())
445 op
.setTemp(it
->second
);
449 } /* end namespace */
452 void value_numbering(Program
* program
)
455 std::vector
<unsigned> loop_headers
;
457 for (Block
& block
: program
->blocks
) {
458 assert(ctx
.exec_id
> 0);
459 /* decrement exec_id when leaving nested control flow */
460 if (block
.kind
& block_kind_loop_header
)
461 loop_headers
.push_back(block
.index
);
462 if (block
.kind
& block_kind_merge
) {
464 } else if (block
.kind
& block_kind_loop_exit
) {
465 ctx
.exec_id
-= program
->blocks
[loop_headers
.back()].linear_preds
.size();
466 ctx
.exec_id
-= block
.linear_preds
.size();
467 loop_headers
.pop_back();
470 if (block
.logical_idom
!= -1)
471 process_block(ctx
, block
);
473 rename_phi_operands(block
, ctx
.renames
);
475 /* increment exec_id when entering nested control flow */
476 if (block
.kind
& block_kind_branch
||
477 block
.kind
& block_kind_loop_preheader
||
478 block
.kind
& block_kind_break
||
479 block
.kind
& block_kind_continue
||
480 block
.kind
& block_kind_discard
)
482 else if (block
.kind
& block_kind_continue_or_break
)
486 /* rename loop header phi operands */
487 for (Block
& block
: program
->blocks
) {
488 if (block
.kind
& block_kind_loop_header
)
489 rename_phi_operands(block
, ctx
.renames
);