2 * Copyright © 2020 Valve Corporation
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the Software.
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 #include "vulkan/radv_shader.h"
26 #include "c11/threads.h"
27 #include "util/debug.h"
/* Bitmask of the enabled ACO debug options. It starts out as 0; init_once()
 * overwrites it with the flags parsed from the ACO_DEBUG environment variable. */
31 uint64_t debug_flags
= 0;
/* Table pairing each option name accepted in ACO_DEBUG with its DEBUG_* flag.
 * init_once() passes this table to parse_debug_string().
 * NOTE(review): the terminating entry and the closing brace of the
 * initializer are not visible in this excerpt. */
33 static const struct debug_control aco_debug_options
[] = {
34 {"validateir", DEBUG_VALIDATE_IR
},
35 {"validatera", DEBUG_VALIDATE_RA
},
36 {"perfwarn", DEBUG_PERFWARN
},
37 {"force-waitcnt", DEBUG_FORCE_WAITCNT
},
38 {"novn", DEBUG_NO_VN
},
39 {"noopt", DEBUG_NO_OPT
},
40 {"nosched", DEBUG_NO_SCHED
},
/* Flag object handed to call_once() together with init_once, so that the
 * initializer runs a single time no matter how often it is requested. */
44 static once_flag init_once_flag
= ONCE_FLAG_INIT
;
/* One-time initializer for the global debug_flags. It is registered with
 * call_once() through init_once_flag and is not meant to be called directly. */
46 static void init_once()
/* Read ACO_DEBUG from the environment and translate it into flag bits using
 * the aco_debug_options table. */
48 debug_flags
= parse_debug_string(getenv("ACO_DEBUG"), aco_debug_options
);
51 /* enable some flags by default on debug builds */
/* NOTE(review): the preprocessor guard that limits the next statement to
 * debug builds is not visible in this excerpt - confirm in the full file. */
52 debug_flags
|= aco::DEBUG_VALIDATE_IR
;
/* Runs init_once through call_once(), keyed on init_once_flag, so the debug
 * flags are parsed only once.
 * NOTE(review): the header of the function that contains this call is not
 * visible in this excerpt. */
58 call_once(&init_once_flag
, init_once
);
/* Fills in the target-dependent fields of a Program.
 *
 * program    - object to initialize; only the fields assigned below are touched
 *              in the visible code.
 * stage      - stored in program->stage.
 * info       - source of the wave size (info->wave_size).
 * chip_class - hardware generation; stored and used to pick LDS/SGPR/VGPR
 *              parameters.
 * family     - specific chip; CHIP_UNKNOWN requests a fallback choice.
 * config     - stored in program->config.
 *
 * NOTE(review): several structural lines (braces, the selector for the
 * fallback family, some else keywords) are not visible in this excerpt; the
 * comments below describe only what can be seen. */
61 void init_program(Program
*program
, Stage stage
, struct radv_shader_info
*info
,
62 enum chip_class chip_class
, enum radeon_family family
,
63 ac_shader_config
*config
)
65 program
->stage
= stage
;
66 program
->config
= config
;
68 program
->chip_class
= chip_class
;
/* No family given: choose a fallback family.
 * NOTE(review): the statement that selects between the assignments below
 * (presumably keyed on chip_class) and its labels are not visible here. */
69 if (family
== CHIP_UNKNOWN
) {
72 program
->family
= CHIP_TAHITI
;
75 program
->family
= CHIP_BONAIRE
;
78 program
->family
= CHIP_POLARIS10
;
81 program
->family
= CHIP_VEGA10
;
84 program
->family
= CHIP_NAVI10
;
87 program
->family
= CHIP_UNKNOWN
;
/* NOTE(review): presumably the else-branch of the CHIP_UNKNOWN check - the
 * caller-supplied family is kept as is. */
91 program
->family
= family
;
/* Wave size comes from the shader info; lane_mask is s1 when the wave size is
 * 32 and s2 otherwise. */
93 program
->wave_size
= info
->wave_size
;
94 program
->lane_mask
= program
->wave_size
== 32 ? s1
: s2
;
/* LDS parameters: GFX7 and newer use 512 / 65536, older chips 256 / 32768. */
96 program
->lds_alloc_granule
= chip_class
>= GFX7
? 512 : 256;
97 program
->lds_limit
= chip_class
>= GFX7
? 65536 : 32768;
98 /* apparently gfx702 also has 16-bank LDS but I can't find a family for that */
/* This tests the `family` argument, not program->family, so a fallback family
 * chosen above never enables the flag. */
99 program
->has_16bank_lds
= family
== CHIP_KABINI
|| family
== CHIP_STONEY
;
/* VGPR defaults; the allocation granule is overridden for GFX10+ below. */
101 program
->vgpr_limit
= 256;
102 program
->vgpr_alloc_granule
= 3;
/* SGPR parameters (and the GFX10+ VGPR granule) per hardware generation. */
104 if (chip_class
>= GFX10
) {
105 program
->physical_sgprs
= 2560; /* doesn't matter as long as it's at least 128 * 20 */
106 program
->sgpr_alloc_granule
= 127;
107 program
->sgpr_limit
= 106;
/* GFX10.3 gets a different VGPR granule, chosen by wave size. */
108 if (chip_class
>= GFX10_3
)
109 program
->vgpr_alloc_granule
= program
->wave_size
== 32 ? 15 : 7;
/* NOTE(review): presumably the else-branch of the GFX10_3 check (the else
 * keyword is not visible here). */
111 program
->vgpr_alloc_granule
= program
->wave_size
== 32 ? 7 : 3;
112 } else if (program
->chip_class
>= GFX8
) {
113 program
->physical_sgprs
= 800;
114 program
->sgpr_alloc_granule
= 15;
/* Tonga and Iceland get a lower SGPR limit than the other chips handled in
 * this branch. */
115 if (family
== CHIP_TONGA
|| family
== CHIP_ICELAND
)
116 program
->sgpr_limit
= 94; /* workaround hardware bug */
/* NOTE(review): presumably the else-branch of the Tonga/Iceland check. */
118 program
->sgpr_limit
= 102;
/* NOTE(review): presumably the final else-branch, i.e. chips older than GFX8. */
120 program
->physical_sgprs
= 512;
121 program
->sgpr_alloc_granule
= 7;
122 program
->sgpr_limit
= 104;
/* Initial floating-point mode: every "preserve", "must flush" and "care
 * about rounding" flag is off, denorm16_64 is fp_denorm_keep, denorm32 is 0,
 * and both rounding modes are fp_round_ne. */
125 program
->next_fp_mode
.preserve_signed_zero_inf_nan32
= false;
126 program
->next_fp_mode
.preserve_signed_zero_inf_nan16_64
= false;
127 program
->next_fp_mode
.must_flush_denorms32
= false;
128 program
->next_fp_mode
.must_flush_denorms16_64
= false;
129 program
->next_fp_mode
.care_about_round32
= false;
130 program
->next_fp_mode
.care_about_round16_64
= false;
131 program
->next_fp_mode
.denorm16_64
= fp_denorm_keep
;
132 program
->next_fp_mode
.denorm32
= 0;
133 program
->next_fp_mode
.round16_64
= fp_round_ne
;
134 program
->next_fp_mode
.round32
= fp_round_ne
;
/* Returns the `sync` member of a memory instruction.
 *
 * The instruction is dispatched on instr->format and cast to the matching
 * format-specific type (SMEM, MUBUF, MIMG, MTBUF, FLAT or DS) before `sync`
 * is read. The last return yields a default-constructed memory_sync_info.
 *
 * NOTE(review): apart from Format::SCRATCH, the case labels (and the default
 * label that presumably guards the last return) are not visible in this
 * excerpt. */
137 memory_sync_info
get_sync_info(const Instruction
* instr
)
139 switch (instr
->format
) {
141 return static_cast<const SMEM_instruction
*>(instr
)->sync
;
143 return static_cast<const MUBUF_instruction
*>(instr
)->sync
;
145 return static_cast<const MIMG_instruction
*>(instr
)->sync
;
147 return static_cast<const MTBUF_instruction
*>(instr
)->sync
;
/* Format::SCRATCH is read through FLAT_instruction. */
150 case Format::SCRATCH
:
151 return static_cast<const FLAT_instruction
*>(instr
)->sync
;
153 return static_cast<const DS_instruction
*>(instr
)->sync
;
/* Fallback: a default-constructed sync info. */
155 return memory_sync_info();
/* Decides whether `instr` may be encoded as SDWA on hardware generation `chip`.
 *
 * The function is a sequence of guards followed by a final opcode exclusion
 * list.
 * NOTE(review): the statement executed by each guard is not visible in this
 * excerpt; the guards read as reasons SDWA cannot be used (presumably
 * `return false`) - confirm against the full file. */
159 bool can_use_SDWA(chip_class chip
, const aco_ptr
<Instruction
>& instr
)
/* Guard: the instruction is not a VALU instruction. */
161 if (!instr
->isVALU())
/* Guard: chip older than GFX8, or the instruction is DPP. */
164 if (chip
< GFX8
|| instr
->isDPP())
/* Extra checks for instructions carrying the VOP3 encoding. */
170 if (instr
->isVOP3()) {
171 VOP3A_instruction
*vop3
= static_cast<VOP3A_instruction
*>(instr
.get());
/* Guard: the format is exactly Format::VOP3. */
172 if (instr
->format
== Format::VOP3
)
/* Guard: clamp on a VOP3-encoded VOPC instruction, on any chip except GFX8. */
174 if (vop3
->clamp
&& instr
->format
== asVOP3(Format::VOPC
) && chip
!= GFX8
)
/* Guard: omod is set and the chip is older than GFX9. */
176 if (vop3
->omod
&& chip
< GFX9
)
179 //TODO: return true if we know we will use vcc
/* Guard: two or more definitions. */
180 if (instr
->definitions
.size() >= 2)
/* Check every operand after the first one. */
183 for (unsigned i
= 1; i
< instr
->operands
.size(); i
++) {
/* Guard: the operand is a literal. */
184 if (instr
->operands
[i
].isLiteral())
/* Guard: before GFX9 the operand must be of VGPR type. */
186 if (chip
< GFX9
&& !instr
->operands
[i
].isOfType(RegType::vgpr
))
/* The same two checks for operand 0, when there is one. */
191 if (!instr
->operands
.empty()) {
192 if (instr
->operands
[0].isLiteral())
194 if (chip
< GFX9
&& !instr
->operands
[0].isOfType(RegType::vgpr
))
/* The four multiply-accumulate opcodes are handled specially below. */
198 bool is_mac
= instr
->opcode
== aco_opcode::v_mac_f32
||
199 instr
->opcode
== aco_opcode::v_mac_f16
||
200 instr
->opcode
== aco_opcode::v_fmac_f32
||
201 instr
->opcode
== aco_opcode::v_fmac_f16
;
/* Guard: a mac opcode on any chip other than GFX8. */
203 if (chip
!= GFX8
&& is_mac
)
206 //TODO: return true if we know we will use vcc
/* Guard: the VOPC bit is set in the format. */
207 if ((unsigned)instr
->format
& (unsigned)Format::VOPC
)
/* Guard: three or more operands, unless the opcode is a mac. */
209 if (instr
->operands
.size() >= 3 && !is_mac
)
/* Final result: true unless the opcode is one of the ones listed here. */
212 return instr
->opcode
!= aco_opcode::v_madmk_f32
&&
213 instr
->opcode
!= aco_opcode::v_madak_f32
&&
214 instr
->opcode
!= aco_opcode::v_madmk_f16
&&
215 instr
->opcode
!= aco_opcode::v_madak_f16
&&
216 instr
->opcode
!= aco_opcode::v_readfirstlane_b32
&&
217 instr
->opcode
!= aco_opcode::v_clrexcp
&&
218 instr
->opcode
!= aco_opcode::v_swap_b32
;
221 /* updates "instr" and returns the old instruction (or NULL if no update was needed) */
/* chip  - hardware generation; only compared against GFX8 in the visible code.
 * instr - replaced in place by a freshly created SDWA_instruction that has the
 *         same opcode, operands and definitions as the old one.
 * NOTE(review): the early-out for the "no update needed" case and the final
 * return statement are not visible in this excerpt. */
222 aco_ptr
<Instruction
> convert_to_SDWA(chip_class chip
, aco_ptr
<Instruction
>& instr
)
/* Take ownership of the old instruction so `instr` can be re-pointed. */
227 aco_ptr
<Instruction
> tmp
= std::move(instr
);
/* New format: the old format with the VOP3 bit cleared and the SDWA bit set. */
228 Format format
= (Format
)(((uint16_t)tmp
->format
& ~(uint16_t)Format::VOP3
) | (uint16_t)Format::SDWA
);
/* Create the replacement with the same operand and definition counts, then
 * copy both lists over from the old instruction. */
229 instr
.reset(create_instruction
<SDWA_instruction
>(tmp
->opcode
, format
, tmp
->operands
.size(), tmp
->definitions
.size()));
230 std::copy(tmp
->operands
.cbegin(), tmp
->operands
.cend(), instr
->operands
.begin());
231 std::copy(tmp
->definitions
.cbegin(), tmp
->definitions
.cend(), instr
->definitions
.begin());
233 SDWA_instruction
*sdwa
= static_cast<SDWA_instruction
*>(instr
.get());
/* Carry the neg/abs/omod/clamp modifiers over from the old instruction.
 * NOTE(review): presumably done only when the old instruction was VOP3; the
 * guarding condition is not visible in this excerpt. */
236 VOP3A_instruction
*vop3
= static_cast<VOP3A_instruction
*>(tmp
.get());
237 memcpy(sdwa
->neg
, vop3
->neg
, sizeof(sdwa
->neg
));
238 memcpy(sdwa
->abs
, vop3
->abs
, sizeof(sdwa
->abs
));
239 sdwa
->omod
= vop3
->omod
;
240 sdwa
->clamp
= vop3
->clamp
;
/* Choose a source selector for each operand from its size in bytes.
 * NOTE(review): the loop exit for operands beyond index 1 and the case
 * labels of the switch are not visible in this excerpt. */
243 for (unsigned i
= 0; i
< instr
->operands
.size(); i
++) {
244 /* SDWA only uses operands 0 and 1. */
248 switch (instr
->operands
[i
].bytes()) {
250 sdwa
->sel
[i
] = sdwa_ubyte
;
253 sdwa
->sel
[i
] = sdwa_uword
;
256 sdwa
->sel
[i
] = sdwa_udword
;
/* Choose the destination selector from the size of definition 0. The byte
 * and word selections also set dst_preserve; the dword selection does not.
 * NOTE(review): the case labels are not visible in this excerpt. */
260 switch (instr
->definitions
[0].bytes()) {
262 sdwa
->dst_sel
= sdwa_ubyte
;
263 sdwa
->dst_preserve
= true;
266 sdwa
->dst_sel
= sdwa_uword
;
267 sdwa
->dst_preserve
= true;
270 sdwa
->dst_sel
= sdwa_udword
;
/* Register fixing: on GFX8 an SGPR-typed first definition is fixed to vcc. */
274 if (instr
->definitions
[0].getTemp().type() == RegType::sgpr
&& chip
== GFX8
)
275 instr
->definitions
[0].setFixed(vcc
);
/* A second definition, if present, is fixed to vcc. */
276 if (instr
->definitions
.size() >= 2)
277 instr
->definitions
[1].setFixed(vcc
);
/* A third operand, if present, is fixed to vcc. */
278 if (instr
->operands
.size() >= 3)
279 instr
->operands
[2].setFixed(vcc
);
/* Reports whether opsel can be applied to operand/definition slot `idx` of
 * opcode `op` on hardware generation `chip`, with `high` as the requested
 * selection.
 * NOTE(review): the switch header, the results of most case groups and the
 * end of the function are not visible in this excerpt, so only the guard and
 * the last visible return are described below. */
284 bool can_use_opsel(chip_class chip
, aco_opcode op
, int idx
, bool high
)
286 /* opsel is only GFX9+ */
/* Guard: before GFX9, a request with `high` set or with idx == -1 is singled
 * out (the guarded statement is not visible - presumably a rejection). */
287 if ((high
|| idx
== -1) && chip
< GFX9
)
/* First group of opcodes; the result returned for this group is not visible
 * in this excerpt. */
291 case aco_opcode::v_div_fixup_f16
:
292 case aco_opcode::v_fma_f16
:
293 case aco_opcode::v_mad_f16
:
294 case aco_opcode::v_mad_u16
:
295 case aco_opcode::v_mad_i16
:
296 case aco_opcode::v_med3_f16
:
297 case aco_opcode::v_med3_i16
:
298 case aco_opcode::v_med3_u16
:
299 case aco_opcode::v_min3_f16
:
300 case aco_opcode::v_min3_i16
:
301 case aco_opcode::v_min3_u16
:
302 case aco_opcode::v_max3_f16
:
303 case aco_opcode::v_max3_i16
:
304 case aco_opcode::v_max3_u16
:
305 case aco_opcode::v_max_u16_e64
:
306 case aco_opcode::v_max_i16_e64
:
307 case aco_opcode::v_min_u16_e64
:
308 case aco_opcode::v_min_i16_e64
:
309 case aco_opcode::v_add_i16
:
310 case aco_opcode::v_sub_i16
:
311 case aco_opcode::v_add_u16_e64
:
312 case aco_opcode::v_sub_u16_e64
:
313 case aco_opcode::v_cvt_pknorm_i16_f16
:
314 case aco_opcode::v_cvt_pknorm_u16_f16
:
315 case aco_opcode::v_lshlrev_b16_e64
:
316 case aco_opcode::v_lshrrev_b16_e64
:
317 case aco_opcode::v_ashrrev_i16_e64
:
318 case aco_opcode::v_mul_lo_u16_e64
:
/* v_pack_b32_f16 has its own result, which is not visible in this excerpt. */
320 case aco_opcode::v_pack_b32_f16
:
/* For these two opcodes only slots 0 and 1 qualify. */
322 case aco_opcode::v_mad_u32_u16
:
323 case aco_opcode::v_mad_i32_i16
:
324 return idx
>= 0 && idx
< 2;