2 * Copyright © 2020 Valve Corporation
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25 #include "vulkan/radv_shader.h"
26 #include "c11/threads.h"
27 #include "util/debug.h"
/* Debug flag word. Starts out as 0; init_once() overwrites it with the result
 * of parsing the ACO_DEBUG environment variable and ORs in DEBUG_VALIDATE. */
31 uint64_t debug_flags
= 0;
/* Option-name -> flag table that init_once() passes to parse_debug_string()
 * together with the value of the ACO_DEBUG environment variable.
 * NOTE(review): the end of the table (any terminator entry and the closing
 * brace) is not visible in this listing. */
33 static const struct debug_control aco_debug_options
[] = {
34 {"validateir", DEBUG_VALIDATE
},
35 {"validatera", DEBUG_VALIDATE_RA
},
36 {"perfwarn", DEBUG_PERFWARN
},
/* once_flag handed to call_once() together with init_once. */
40 static once_flag init_once_flag
= ONCE_FLAG_INIT
;
/* Initialization callback passed to call_once().
 *
 * Sets debug_flags to whatever parse_debug_string() returns for the ACO_DEBUG
 * environment variable, using aco_debug_options as the name table, and then
 * ORs DEBUG_VALIDATE into it.
 *
 * NOTE(review): the existing comment below says the extra flag is meant for
 * debug builds; the preprocessor guard that would restrict it to such builds
 * is not visible in this listing -- confirm against the full file. */
42 static void init_once()
44 debug_flags
= parse_debug_string(getenv("ACO_DEBUG"), aco_debug_options
);
47 /* enable some flags by default on debug builds */
48 debug_flags
|= aco::DEBUG_VALIDATE
;
/* Invokes init_once through call_once(), guarded by init_once_flag.
 * NOTE(review): the signature of the function containing this statement is
 * not visible in this listing. */
54 call_once(&init_once_flag
, init_once
);
/* Fills in the fields of `program` that are set up before compilation.
 *
 * program:    object whose fields are written.
 * stage:      stored in program->stage.
 * info:       only info->wave_size is read in the visible code.
 * chip_class: stored in program->chip_class; also selects the LDS and
 *             SGPR/VGPR parameters below.
 * family:     stored in program->family unless it is CHIP_UNKNOWN, in which
 *             case a family value is chosen here.
 * config:     stored in program->config.
 *
 * NOTE(review): this listing is missing lines (braces, the selector around the
 * per-family assignments, `else` keywords). The comments below describe only
 * what the visible statements do. */
57 void init_program(Program
*program
, Stage stage
, struct radv_shader_info
*info
,
58 enum chip_class chip_class
, enum radeon_family family
,
59 ac_shader_config
*config
)
61 program
->stage
= stage
;
62 program
->config
= config
;
64 program
->chip_class
= chip_class
;
/* No explicit family supplied: one of the assignments below provides the
 * value. The construct that chooses between them is not visible here;
 * presumably it switches on chip_class -- TODO confirm. */
65 if (family
== CHIP_UNKNOWN
) {
68 program
->family
= CHIP_TAHITI
;
71 program
->family
= CHIP_BONAIRE
;
74 program
->family
= CHIP_POLARIS10
;
77 program
->family
= CHIP_VEGA10
;
80 program
->family
= CHIP_NAVI10
;
83 program
->family
= CHIP_UNKNOWN
;
/* Otherwise the caller-supplied family is used as-is. */
87 program
->family
= family
;
89 program
->wave_size
= info
->wave_size
;
/* lane_mask is s1 for a wave size of 32, s2 for any other wave size. */
90 program
->lane_mask
= program
->wave_size
== 32 ? s1
: s2
;
/* LDS parameters: larger granule and limit from GFX7 onwards. */
92 program
->lds_alloc_granule
= chip_class
>= GFX7
? 512 : 256;
93 program
->lds_limit
= chip_class
>= GFX7
? 65536 : 32768;
94 /* apparently gfx702 also has 16-bank LDS but I can't find a family for that */
/* This tests the `family` argument, not program->family, so it evaluates to
 * false whenever the caller passed CHIP_UNKNOWN. */
95 program
->has_16bank_lds
= family
== CHIP_KABINI
|| family
== CHIP_STONEY
;
/* VGPR defaults; vgpr_alloc_granule is overridden below for GFX10+. */
97 program
->vgpr_limit
= 256;
98 program
->vgpr_alloc_granule
= 3;
/* SGPR parameters per hardware generation: GFX10+ first, then GFX8+, then
 * the remaining (older) chips. */
100 if (chip_class
>= GFX10
) {
101 program
->physical_sgprs
= 2560; /* doesn't matter as long as it's at least 128 * 20 */
102 program
->sgpr_alloc_granule
= 127;
103 program
->sgpr_limit
= 106;
/* On GFX10+ the VGPR granule depends on the wave size. */
104 program
->vgpr_alloc_granule
= program
->wave_size
== 32 ? 7 : 3;
105 } else if (program
->chip_class
>= GFX8
) {
106 program
->physical_sgprs
= 800;
107 program
->sgpr_alloc_granule
= 15;
/* Tonga and Iceland get a lower SGPR limit than the 102 assigned just
 * after (the keyword separating the two assignments is not visible here). */
108 if (family
== CHIP_TONGA
|| family
== CHIP_ICELAND
)
109 program
->sgpr_limit
= 94; /* workaround hardware bug */
111 program
->sgpr_limit
= 102;
/* Values for chips that matched neither branch above. */
113 program
->physical_sgprs
= 512;
114 program
->sgpr_alloc_granule
= 7;
115 program
->sgpr_limit
= 104;
/* Initial next_fp_mode: no preservation / flush / rounding requirements,
 * denorm16_64 = fp_denorm_keep, denorm32 = 0, both rounding modes
 * fp_round_ne. */
118 program
->next_fp_mode
.preserve_signed_zero_inf_nan32
= false;
119 program
->next_fp_mode
.preserve_signed_zero_inf_nan16_64
= false;
120 program
->next_fp_mode
.must_flush_denorms32
= false;
121 program
->next_fp_mode
.must_flush_denorms16_64
= false;
122 program
->next_fp_mode
.care_about_round32
= false;
123 program
->next_fp_mode
.care_about_round16_64
= false;
124 program
->next_fp_mode
.denorm16_64
= fp_denorm_keep
;
125 program
->next_fp_mode
.denorm32
= 0;
126 program
->next_fp_mode
.round16_64
= fp_round_ne
;
127 program
->next_fp_mode
.round32
= fp_round_ne
;
/* Reports whether `instr` can use the SDWA encoding on `chip`.
 *
 * chip:  hardware generation the instruction is compiled for.
 * instr: instruction to inspect; it is not modified.
 *
 * The function is a sequence of guard conditions followed by a final opcode
 * exclusion list.
 * NOTE(review): the statement executed when each guard matches is not visible
 * in this listing (presumably an early `return false` -- TODO confirm), so the
 * comments below describe only the conditions being tested. */
130 bool can_use_SDWA(chip_class chip
, const aco_ptr
<Instruction
>& instr
)
/* Guard: instruction is not a VALU instruction. */
132 if (!instr
->isVALU())
/* Guard: chip older than GFX8, or the instruction is DPP. */
135 if (chip
< GFX8
|| instr
->isDPP())
/* Extra guards that apply only to instructions for which isVOP3() is true. */
141 if (instr
->isVOP3()) {
142 VOP3A_instruction
*vop3
= static_cast<VOP3A_instruction
*>(instr
.get());
/* Guard: the format is exactly Format::VOP3. */
143 if (instr
->format
== Format::VOP3
)
/* Guard: clamp set on a VOP3-encoded VOPC instruction on any chip but GFX8. */
145 if (vop3
->clamp
&& instr
->format
== asVOP3(Format::VOPC
) && chip
!= GFX8
)
/* Guard: omod set on a chip older than GFX9. */
147 if (vop3
->omod
&& chip
< GFX9
)
150 //TODO: return true if we know we will use vcc
/* Guard: two or more definitions. */
151 if (instr
->definitions
.size() >= 2)
/* Operands 1..n-1: guard on literals and, before GFX9, on any operand that
 * is not of type vgpr. Operand 0 is checked separately below. */
154 for (unsigned i
= 1; i
< instr
->operands
.size(); i
++) {
155 if (instr
->operands
[i
].isLiteral())
157 if (chip
< GFX9
&& !instr
->operands
[i
].isOfType(RegType::vgpr
))
/* Operand 0 gets the same two checks, for every instruction that has at
 * least one operand (not only VOP3). */
162 if (!instr
->operands
.empty()) {
163 if (instr
->operands
[0].isLiteral())
165 if (chip
< GFX9
&& !instr
->operands
[0].isOfType(RegType::vgpr
))
/* True for the four multiply-accumulate opcodes listed here. */
169 bool is_mac
= instr
->opcode
== aco_opcode::v_mac_f32
||
170 instr
->opcode
== aco_opcode::v_mac_f16
||
171 instr
->opcode
== aco_opcode::v_fmac_f32
||
172 instr
->opcode
== aco_opcode::v_fmac_f16
;
/* Guard: a mac opcode on any chip other than GFX8. */
174 if (chip
!= GFX8
&& is_mac
)
177 //TODO: return true if we know we will use vcc
/* Guard: the format has the VOPC bit set. */
178 if ((unsigned)instr
->format
& (unsigned)Format::VOPC
)
/* Guard: three or more operands on a non-mac instruction. */
180 if (instr
->operands
.size() >= 3 && !is_mac
)
/* Final answer: true unless the opcode is one of the seven listed below. */
183 return instr
->opcode
!= aco_opcode::v_madmk_f32
&&
184 instr
->opcode
!= aco_opcode::v_madak_f32
&&
185 instr
->opcode
!= aco_opcode::v_madmk_f16
&&
186 instr
->opcode
!= aco_opcode::v_madak_f16
&&
187 instr
->opcode
!= aco_opcode::v_readfirstlane_b32
&&
188 instr
->opcode
!= aco_opcode::v_clrexcp
&&
189 instr
->opcode
!= aco_opcode::v_swap_b32
;
192 /* updates "instr" and returns the old instruction (or NULL if no update was needed) */
/* chip:  hardware generation; read only by the GFX8 vcc fix-up at the end.
 * instr: on the conversion path it is replaced by a newly created
 *        SDWA_instruction carrying the old opcode, operands and definitions.
 *
 * NOTE(review): this listing is missing lines (the early-out described in the
 * comment above, the guard around the VOP3 modifier copy, the `case` labels of
 * both switches and the final return). Comments below cover only the visible
 * statements. */
193 aco_ptr
<Instruction
> convert_to_SDWA(chip_class chip
, aco_ptr
<Instruction
>& instr
)
/* Move the old instruction out of `instr`; `tmp` owns it from here on. */
198 aco_ptr
<Instruction
> tmp
= std::move(instr
);
/* New format: the old format with the VOP3 bit cleared and the SDWA bit set. */
199 Format format
= (Format
)(((uint16_t)tmp
->format
& ~(uint16_t)Format::VOP3
) | (uint16_t)Format::SDWA
);
/* Create the replacement with the same opcode and the same operand and
 * definition counts, then copy both lists across unchanged. */
200 instr
.reset(create_instruction
<SDWA_instruction
>(tmp
->opcode
, format
, tmp
->operands
.size(), tmp
->definitions
.size()));
201 std::copy(tmp
->operands
.cbegin(), tmp
->operands
.cend(), instr
->operands
.begin());
202 std::copy(tmp
->definitions
.cbegin(), tmp
->definitions
.cend(), instr
->definitions
.begin());
204 SDWA_instruction
*sdwa
= static_cast<SDWA_instruction
*>(instr
.get());
/* Carry the neg/abs/omod/clamp modifiers over from the old instruction viewed
 * as a VOP3A_instruction. The memcpy sizes are those of the SDWA arrays.
 * Presumably this runs only when the old instruction was VOP3-encoded; the
 * guard is not visible here -- TODO confirm. */
207 VOP3A_instruction
*vop3
= static_cast<VOP3A_instruction
*>(tmp
.get());
208 memcpy(sdwa
->neg
, vop3
->neg
, sizeof(sdwa
->neg
));
209 memcpy(sdwa
->abs
, vop3
->abs
, sizeof(sdwa
->abs
));
210 sdwa
->omod
= vop3
->omod
;
211 sdwa
->clamp
= vop3
->clamp
;
/* Per-operand source select, chosen from the operand's size in bytes:
 * sdwa_ubyte, sdwa_uword or sdwa_udword (case labels not visible). */
214 for (unsigned i
= 0; i
< instr
->operands
.size(); i
++) {
215 switch (instr
->operands
[i
].bytes()) {
217 sdwa
->sel
[i
] = sdwa_ubyte
;
220 sdwa
->sel
[i
] = sdwa_uword
;
223 sdwa
->sel
[i
] = sdwa_udword
;
/* Destination select, chosen from the size in bytes of definition 0. The
 * ubyte and uword selections also set dst_preserve; udword does not. */
227 switch (instr
->definitions
[0].bytes()) {
229 sdwa
->dst_sel
= sdwa_ubyte
;
230 sdwa
->dst_preserve
= true;
233 sdwa
->dst_sel
= sdwa_uword
;
234 sdwa
->dst_preserve
= true;
237 sdwa
->dst_sel
= sdwa_udword
;
/* Fix registers to vcc: definition 0 when it is an SGPR temp and the chip is
 * GFX8; definition 1 whenever it exists; operand 2 whenever it exists. */
241 if (instr
->definitions
[0].getTemp().type() == RegType::sgpr
&& chip
== GFX8
)
242 instr
->definitions
[0].setFixed(vcc
);
243 if (instr
->definitions
.size() >= 2)
244 instr
->definitions
[1].setFixed(vcc
);
245 if (instr
->operands
.size() >= 3)
246 instr
->operands
[2].setFixed(vcc
);
/* Reports whether opsel can be used with opcode `op` on `chip`.
 *
 * chip: hardware generation.
 * op:   opcode being queried.
 * idx:  index being queried; the value -1 is treated specially (presumably it
 *       denotes the definition rather than an operand -- TODO confirm).
 * high: presumably selects the high half -- TODO confirm.
 *
 * NOTE(review): the switch header, the results for the first two case groups,
 * any default case and the end of the function are not visible in this
 * listing; only the conditions and the last group's result are documented. */
251 bool can_use_opsel(chip_class chip
, aco_opcode op
, int idx
, bool high
)
253 /* opsel is only GFX9+ */
/* Guard: before GFX9, a request with `high` set or with idx == -1. */
254 if ((high
|| idx
== -1) && chip
< GFX9
)
/* First group of opcodes (shared result not visible). */
258 case aco_opcode::v_div_fixup_f16
:
259 case aco_opcode::v_fma_f16
:
260 case aco_opcode::v_mad_f16
:
261 case aco_opcode::v_mad_u16
:
262 case aco_opcode::v_mad_i16
:
263 case aco_opcode::v_med3_f16
:
264 case aco_opcode::v_med3_i16
:
265 case aco_opcode::v_med3_u16
:
266 case aco_opcode::v_min3_f16
:
267 case aco_opcode::v_min3_i16
:
268 case aco_opcode::v_min3_u16
:
269 case aco_opcode::v_max3_f16
:
270 case aco_opcode::v_max3_i16
:
271 case aco_opcode::v_max3_u16
:
272 case aco_opcode::v_max_u16_e64
:
273 case aco_opcode::v_max_i16_e64
:
274 case aco_opcode::v_min_u16_e64
:
275 case aco_opcode::v_min_i16_e64
:
276 case aco_opcode::v_add_i16
:
277 case aco_opcode::v_sub_i16
:
278 case aco_opcode::v_add_u16_e64
:
279 case aco_opcode::v_sub_u16_e64
:
280 case aco_opcode::v_cvt_pknorm_i16_f16
:
281 case aco_opcode::v_cvt_pknorm_u16_f16
:
282 case aco_opcode::v_lshlrev_b16_e64
:
283 case aco_opcode::v_lshrrev_b16_e64
:
284 case aco_opcode::v_ashrrev_i16_e64
:
285 case aco_opcode::v_mul_lo_u16_e64
:
/* v_pack_b32_f16 is handled on its own (result not visible). */
287 case aco_opcode::v_pack_b32_f16
:
/* For these two opcodes the result is true only when idx is 0 or 1. */
289 case aco_opcode::v_mad_u32_u16
:
290 case aco_opcode::v_mad_i32_i16
:
291 return idx
>= 0 && idx
< 2;