2 * Copyright © 2018 Valve Corporation
3 * Copyright © 2018 Google
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
12 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Daniel Schürmann (daniel.schuermann@campus.tu-berlin.de)
 *    Bas Nieuwenhuizen (bas@basnieuwenhuizen.nl)
 *
 */
#include "util/u_math.h"

#include <algorithm>
#include <cassert>
#include <climits>
#include <cstdint>
#include <cstdio>
#include <set>
#include <vector>

#include "vulkan/radv_shader.h"
39 RegisterDemand
get_live_changes(aco_ptr
<Instruction
>& instr
)
41 RegisterDemand changes
;
42 for (const Definition
& def
: instr
->definitions
) {
43 if (!def
.isTemp() || def
.isKill())
45 changes
+= def
.getTemp();
48 for (const Operand
& op
: instr
->operands
) {
49 if (!op
.isTemp() || !op
.isFirstKill())
51 changes
-= op
.getTemp();
57 RegisterDemand
get_temp_registers(aco_ptr
<Instruction
>& instr
)
59 RegisterDemand temp_registers
;
61 for (Definition def
: instr
->definitions
) {
65 temp_registers
+= def
.getTemp();
68 for (Operand op
: instr
->operands
) {
69 if (op
.isTemp() && op
.isLateKill() && op
.isFirstKill())
70 temp_registers
+= op
.getTemp();
73 return temp_registers
;
76 RegisterDemand
get_demand_before(RegisterDemand demand
, aco_ptr
<Instruction
>& instr
, aco_ptr
<Instruction
>& instr_before
)
78 demand
-= get_live_changes(instr
);
79 demand
-= get_temp_registers(instr
);
81 demand
+= get_temp_registers(instr_before
);
86 void process_live_temps_per_block(Program
*program
, live
& lives
, Block
* block
,
87 std::set
<unsigned>& worklist
, std::vector
<uint16_t>& phi_sgpr_ops
)
89 std::vector
<RegisterDemand
>& register_demand
= lives
.register_demand
[block
->index
];
90 RegisterDemand new_demand
;
92 register_demand
.resize(block
->instructions
.size());
93 block
->register_demand
= RegisterDemand();
94 TempSet live
= lives
.live_out
[block
->index
];
96 /* add the live_out_exec to live */
97 bool exec_live
= false;
98 if (block
->live_out_exec
!= Temp()) {
99 live
.insert(block
->live_out_exec
);
103 /* initialize register demand */
106 new_demand
.sgpr
-= phi_sgpr_ops
[block
->index
];
108 /* traverse the instructions backwards */
110 for (idx
= block
->instructions
.size() -1; idx
>= 0; idx
--) {
111 Instruction
*insn
= block
->instructions
[idx
].get();
115 /* substract the 1 or 2 sgprs from exec */
117 assert(new_demand
.sgpr
>= (int16_t) program
->lane_mask
.size());
118 register_demand
[idx
] = RegisterDemand(new_demand
.vgpr
, new_demand
.sgpr
- (exec_live
? program
->lane_mask
.size() : 0));
121 for (Definition
& definition
: insn
->definitions
) {
122 if (!definition
.isTemp()) {
125 if ((definition
.isFixed() || definition
.hasHint()) && definition
.physReg() == vcc
)
126 program
->needs_vcc
= true;
128 const Temp temp
= definition
.getTemp();
129 const size_t n
= live
.erase(temp
);
133 definition
.setKill(false);
135 register_demand
[idx
] += temp
;
136 definition
.setKill(true);
139 if (definition
.isFixed() && definition
.physReg() == exec
)
144 if (insn
->opcode
== aco_opcode::p_logical_end
) {
145 new_demand
.sgpr
+= phi_sgpr_ops
[block
->index
];
147 /* we need to do this in a separate loop because the next one can
148 * setKill() for several operands at once and we don't want to
149 * overwrite that in a later iteration */
150 for (Operand
& op
: insn
->operands
)
153 for (unsigned i
= 0; i
< insn
->operands
.size(); ++i
)
155 Operand
& operand
= insn
->operands
[i
];
156 if (!operand
.isTemp())
158 if (operand
.isFixed() && operand
.physReg() == vcc
)
159 program
->needs_vcc
= true;
160 const Temp temp
= operand
.getTemp();
161 const bool inserted
= live
.insert(temp
).second
;
163 operand
.setFirstKill(true);
164 for (unsigned j
= i
+ 1; j
< insn
->operands
.size(); ++j
) {
165 if (insn
->operands
[j
].isTemp() && insn
->operands
[j
].tempId() == operand
.tempId()) {
166 insn
->operands
[j
].setFirstKill(false);
167 insn
->operands
[j
].setKill(true);
170 if (operand
.isLateKill())
171 register_demand
[idx
] += temp
;
175 if (operand
.isFixed() && operand
.physReg() == exec
)
180 block
->register_demand
.update(register_demand
[idx
]);
183 /* update block's register demand for a last time */
185 assert(new_demand
.sgpr
>= (int16_t) program
->lane_mask
.size());
186 new_demand
.sgpr
-= exec_live
? program
->lane_mask
.size() : 0;
187 block
->register_demand
.update(new_demand
);
189 /* handle phi definitions */
191 while (phi_idx
>= 0) {
192 register_demand
[phi_idx
] = new_demand
;
193 Instruction
*insn
= block
->instructions
[phi_idx
].get();
195 assert(is_phi(insn
));
196 assert(insn
->definitions
.size() == 1 && insn
->definitions
[0].isTemp());
197 Definition
& definition
= insn
->definitions
[0];
198 if ((definition
.isFixed() || definition
.hasHint()) && definition
.physReg() == vcc
)
199 program
->needs_vcc
= true;
200 const Temp temp
= definition
.getTemp();
201 const size_t n
= live
.erase(temp
);
204 definition
.setKill(false);
206 definition
.setKill(true);
211 /* now, we need to merge the live-ins into the live-out sets */
212 for (Temp t
: live
) {
213 std::vector
<unsigned>& preds
= t
.is_linear() ? block
->linear_preds
: block
->logical_preds
;
217 fprintf(stderr
, "Temporary never defined or are defined after use: %%%d in BB%d\n", t
.id(), block
->index
);
220 for (unsigned pred_idx
: preds
) {
221 auto it
= lives
.live_out
[pred_idx
].insert(t
);
223 worklist
.insert(pred_idx
);
227 /* handle phi operands */
229 while (phi_idx
>= 0) {
230 Instruction
*insn
= block
->instructions
[phi_idx
].get();
231 assert(is_phi(insn
));
232 /* directly insert into the predecessors live-out set */
233 std::vector
<unsigned>& preds
= insn
->opcode
== aco_opcode::p_phi
234 ? block
->logical_preds
235 : block
->linear_preds
;
236 for (unsigned i
= 0; i
< preds
.size(); ++i
) {
237 Operand
&operand
= insn
->operands
[i
];
238 if (!operand
.isTemp())
240 if (operand
.isFixed() && operand
.physReg() == vcc
)
241 program
->needs_vcc
= true;
242 /* check if we changed an already processed block */
243 const bool inserted
= lives
.live_out
[preds
[i
]].insert(operand
.getTemp()).second
;
245 operand
.setKill(true);
246 worklist
.insert(preds
[i
]);
247 if (insn
->opcode
== aco_opcode::p_phi
&& operand
.getTemp().type() == RegType::sgpr
)
248 phi_sgpr_ops
[preds
[i
]] += operand
.size();
254 assert(block
->index
!= 0 || (new_demand
== RegisterDemand() && live
.empty()));
257 unsigned calc_waves_per_workgroup(Program
*program
)
259 /* When workgroup size is not known, just go with wave_size */
260 unsigned workgroup_size
= program
->workgroup_size
== UINT_MAX
262 : program
->workgroup_size
;
264 return align(workgroup_size
, program
->wave_size
) / program
->wave_size
;
266 } /* end namespace */
268 uint16_t get_extra_sgprs(Program
*program
)
270 if (program
->chip_class
>= GFX10
) {
271 assert(!program
->needs_flat_scr
);
272 assert(!program
->xnack_enabled
);
274 } else if (program
->chip_class
>= GFX8
) {
275 if (program
->needs_flat_scr
)
277 else if (program
->xnack_enabled
)
279 else if (program
->needs_vcc
)
284 assert(!program
->xnack_enabled
);
285 if (program
->needs_flat_scr
)
287 else if (program
->needs_vcc
)
294 uint16_t get_sgpr_alloc(Program
*program
, uint16_t addressable_sgprs
)
296 assert(addressable_sgprs
<= program
->sgpr_limit
);
297 uint16_t sgprs
= addressable_sgprs
+ get_extra_sgprs(program
);
298 uint16_t granule
= program
->sgpr_alloc_granule
+ 1;
299 return align(std::max(sgprs
, granule
), granule
);
302 uint16_t get_vgpr_alloc(Program
*program
, uint16_t addressable_vgprs
)
304 assert(addressable_vgprs
<= program
->vgpr_limit
);
305 uint16_t granule
= program
->vgpr_alloc_granule
+ 1;
306 return align(std::max(addressable_vgprs
, granule
), granule
);
309 uint16_t get_addr_sgpr_from_waves(Program
*program
, uint16_t max_waves
)
311 uint16_t sgprs
= program
->physical_sgprs
/ max_waves
& ~program
->sgpr_alloc_granule
;
312 sgprs
-= get_extra_sgprs(program
);
313 return std::min(sgprs
, program
->sgpr_limit
);
316 uint16_t get_addr_vgpr_from_waves(Program
*program
, uint16_t max_waves
)
318 uint16_t vgprs
= 256 / max_waves
& ~program
->vgpr_alloc_granule
;
319 return std::min(vgprs
, program
->vgpr_limit
);
322 void calc_min_waves(Program
* program
)
324 unsigned waves_per_workgroup
= calc_waves_per_workgroup(program
);
325 /* currently min_waves is in wave64 waves */
326 if (program
->wave_size
== 32)
327 waves_per_workgroup
= DIV_ROUND_UP(waves_per_workgroup
, 2);
329 unsigned simd_per_cu
= 4; /* TODO: different on Navi */
330 bool wgp
= program
->chip_class
>= GFX10
; /* assume WGP is used on Navi */
331 unsigned simd_per_cu_wgp
= wgp
? simd_per_cu
* 2 : simd_per_cu
;
333 program
->min_waves
= DIV_ROUND_UP(waves_per_workgroup
, simd_per_cu_wgp
);
336 void update_vgpr_sgpr_demand(Program
* program
, const RegisterDemand new_demand
)
338 /* TODO: max_waves_per_simd, simd_per_cu and the number of physical vgprs for Navi */
339 unsigned max_waves_per_simd
= 10;
340 unsigned simd_per_cu
= 4;
342 bool wgp
= program
->chip_class
>= GFX10
; /* assume WGP is used on Navi */
343 unsigned simd_per_cu_wgp
= wgp
? simd_per_cu
* 2 : simd_per_cu
;
344 unsigned lds_limit
= wgp
? program
->lds_limit
* 2 : program
->lds_limit
;
346 /* this won't compile, register pressure reduction necessary */
347 if (new_demand
.vgpr
> program
->vgpr_limit
|| new_demand
.sgpr
> program
->sgpr_limit
) {
348 program
->num_waves
= 0;
349 program
->max_reg_demand
= new_demand
;
351 program
->num_waves
= program
->physical_sgprs
/ get_sgpr_alloc(program
, new_demand
.sgpr
);
352 program
->num_waves
= std::min
<uint16_t>(program
->num_waves
, 256 / get_vgpr_alloc(program
, new_demand
.vgpr
));
353 program
->max_waves
= max_waves_per_simd
;
355 /* adjust max_waves for workgroup and LDS limits */
356 unsigned waves_per_workgroup
= calc_waves_per_workgroup(program
);
357 unsigned workgroups_per_cu_wgp
= max_waves_per_simd
* simd_per_cu_wgp
/ waves_per_workgroup
;
358 if (program
->config
->lds_size
) {
359 unsigned lds
= program
->config
->lds_size
* program
->lds_alloc_granule
;
360 workgroups_per_cu_wgp
= std::min(workgroups_per_cu_wgp
, lds_limit
/ lds
);
362 if (waves_per_workgroup
> 1 && program
->chip_class
< GFX10
)
363 workgroups_per_cu_wgp
= std::min(workgroups_per_cu_wgp
, 16u); /* TODO: is this a SI-only limit? what about Navi? */
365 /* in cases like waves_per_workgroup=3 or lds=65536 and
366 * waves_per_workgroup=1, we want the maximum possible number of waves per
367 * SIMD and not the minimum. so DIV_ROUND_UP is used */
368 program
->max_waves
= std::min
<uint16_t>(program
->max_waves
, DIV_ROUND_UP(workgroups_per_cu_wgp
* waves_per_workgroup
, simd_per_cu_wgp
));
370 /* incorporate max_waves and calculate max_reg_demand */
371 program
->num_waves
= std::min
<uint16_t>(program
->num_waves
, program
->max_waves
);
372 program
->max_reg_demand
.vgpr
= get_addr_vgpr_from_waves(program
, program
->num_waves
);
373 program
->max_reg_demand
.sgpr
= get_addr_sgpr_from_waves(program
, program
->num_waves
);
377 live
live_var_analysis(Program
* program
,
378 const struct radv_nir_compiler_options
*options
)
381 result
.live_out
.resize(program
->blocks
.size());
382 result
.register_demand
.resize(program
->blocks
.size());
383 std::set
<unsigned> worklist
;
384 std::vector
<uint16_t> phi_sgpr_ops(program
->blocks
.size());
385 RegisterDemand new_demand
;
387 program
->needs_vcc
= false;
389 /* this implementation assumes that the block idx corresponds to the block's position in program->blocks vector */
390 for (Block
& block
: program
->blocks
)
391 worklist
.insert(block
.index
);
392 while (!worklist
.empty()) {
393 std::set
<unsigned>::reverse_iterator b_it
= worklist
.rbegin();
394 unsigned block_idx
= *b_it
;
395 worklist
.erase(block_idx
);
396 process_live_temps_per_block(program
, result
, &program
->blocks
[block_idx
], worklist
, phi_sgpr_ops
);
397 new_demand
.update(program
->blocks
[block_idx
].register_demand
);
400 /* calculate the program's register demand and number of waves */
401 update_vgpr_sgpr_demand(program
, new_demand
);