2 * Copyright © 2018 Valve Corporation
3 * Copyright © 2018 Google
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25 * Daniel Schürmann (daniel.schuermann@campus.tu-berlin.de)
26 * Bas Nieuwenhuizen (bas@basnieuwenhuizen.nl)
31 #include "util/u_math.h"
36 #include "vulkan/radv_shader.h"
41 void process_live_temps_per_block(Program
*program
, live
& lives
, Block
* block
,
42 std::set
<unsigned>& worklist
, std::vector
<uint16_t>& phi_sgpr_ops
)
44 std::vector
<RegisterDemand
>& register_demand
= lives
.register_demand
[block
->index
];
45 RegisterDemand new_demand
;
47 register_demand
.resize(block
->instructions
.size());
48 block
->register_demand
= RegisterDemand();
50 std::set
<Temp
> live_sgprs
;
51 std::set
<Temp
> live_vgprs
;
53 /* add the live_out_exec to live */
54 bool exec_live
= false;
55 if (block
->live_out_exec
!= Temp()) {
56 live_sgprs
.insert(block
->live_out_exec
);
61 /* split the live-outs from this block into the temporary sets */
62 std::vector
<std::set
<Temp
>>& live_temps
= lives
.live_out
;
63 for (const Temp temp
: live_temps
[block
->index
]) {
64 const bool inserted
= temp
.is_linear()
65 ? live_sgprs
.insert(temp
).second
66 : live_vgprs
.insert(temp
).second
;
71 new_demand
.sgpr
-= phi_sgpr_ops
[block
->index
];
73 /* traverse the instructions backwards */
75 for (idx
= block
->instructions
.size() -1; idx
>= 0; idx
--) {
76 Instruction
*insn
= block
->instructions
[idx
].get();
80 /* substract the 2 sgprs from exec */
82 assert(new_demand
.sgpr
>= 2);
83 register_demand
[idx
] = RegisterDemand(new_demand
.vgpr
, new_demand
.sgpr
- (exec_live
? 2 : 0));
86 for (Definition
& definition
: insn
->definitions
) {
87 if (!definition
.isTemp()) {
91 const Temp temp
= definition
.getTemp();
94 n
= live_sgprs
.erase(temp
);
96 n
= live_vgprs
.erase(temp
);
100 definition
.setKill(false);
102 register_demand
[idx
] += temp
;
103 definition
.setKill(true);
106 if (definition
.isFixed() && definition
.physReg() == exec
)
111 if (insn
->opcode
== aco_opcode::p_logical_end
) {
112 new_demand
.sgpr
+= phi_sgpr_ops
[block
->index
];
114 for (unsigned i
= 0; i
< insn
->operands
.size(); ++i
)
116 Operand
& operand
= insn
->operands
[i
];
117 if (!operand
.isTemp()) {
120 const Temp temp
= operand
.getTemp();
121 const bool inserted
= temp
.is_linear()
122 ? live_sgprs
.insert(temp
).second
123 : live_vgprs
.insert(temp
).second
;
125 operand
.setFirstKill(true);
126 for (unsigned j
= i
+ 1; j
< insn
->operands
.size(); ++j
) {
127 if (insn
->operands
[j
].isTemp() && insn
->operands
[j
].tempId() == operand
.tempId()) {
128 insn
->operands
[j
].setFirstKill(false);
129 insn
->operands
[j
].setKill(true);
134 operand
.setKill(false);
137 if (operand
.isFixed() && operand
.physReg() == exec
)
142 block
->register_demand
.update(register_demand
[idx
]);
145 /* update block's register demand for a last time */
147 assert(new_demand
.sgpr
>= 2);
148 new_demand
.sgpr
-= exec_live
? 2 : 0;
149 block
->register_demand
.update(new_demand
);
151 /* handle phi definitions */
153 while (phi_idx
>= 0) {
154 register_demand
[phi_idx
] = new_demand
;
155 Instruction
*insn
= block
->instructions
[phi_idx
].get();
157 assert(is_phi(insn
));
158 assert(insn
->definitions
.size() == 1 && insn
->definitions
[0].isTemp());
159 Definition
& definition
= insn
->definitions
[0];
160 const Temp temp
= definition
.getTemp();
163 if (temp
.is_linear())
164 n
= live_sgprs
.erase(temp
);
166 n
= live_vgprs
.erase(temp
);
169 definition
.setKill(false);
171 definition
.setKill(true);
176 /* now, we have the live-in sets and need to merge them into the live-out sets */
177 for (unsigned pred_idx
: block
->logical_preds
) {
178 for (Temp vgpr
: live_vgprs
) {
179 auto it
= live_temps
[pred_idx
].insert(vgpr
);
181 worklist
.insert(pred_idx
);
185 for (unsigned pred_idx
: block
->linear_preds
) {
186 for (Temp sgpr
: live_sgprs
) {
187 auto it
= live_temps
[pred_idx
].insert(sgpr
);
189 worklist
.insert(pred_idx
);
193 /* handle phi operands */
195 while (phi_idx
>= 0) {
196 Instruction
*insn
= block
->instructions
[phi_idx
].get();
197 assert(is_phi(insn
));
198 /* directly insert into the predecessors live-out set */
199 std::vector
<unsigned>& preds
= insn
->opcode
== aco_opcode::p_phi
200 ? block
->logical_preds
201 : block
->linear_preds
;
202 for (unsigned i
= 0; i
< preds
.size(); ++i
) {
203 Operand
&operand
= insn
->operands
[i
];
204 if (!operand
.isTemp()) {
207 /* check if we changed an already processed block */
208 const bool inserted
= live_temps
[preds
[i
]].insert(operand
.getTemp()).second
;
210 operand
.setKill(true);
211 worklist
.insert(preds
[i
]);
212 if (insn
->opcode
== aco_opcode::p_phi
&& operand
.getTemp().type() == RegType::sgpr
)
213 phi_sgpr_ops
[preds
[i
]] += operand
.size();
219 if (!(block
->index
!= 0 || (live_vgprs
.empty() && live_sgprs
.empty()))) {
220 aco_print_program(program
, stderr
);
221 fprintf(stderr
, "These temporaries are never defined or are defined after use:\n");
222 for (Temp vgpr
: live_vgprs
)
223 fprintf(stderr
, "%%%d\n", vgpr
.id());
224 for (Temp sgpr
: live_sgprs
)
225 fprintf(stderr
, "%%%d\n", sgpr
.id());
229 assert(block
->index
!= 0 || new_demand
== RegisterDemand());
231 } /* end namespace */
233 uint16_t get_extra_sgprs(Program
*program
)
235 if (program
->chip_class
>= GFX10
) {
236 assert(!program
->needs_flat_scr
);
237 assert(!program
->needs_xnack_mask
);
239 } else if (program
->chip_class
>= GFX8
) {
240 if (program
->needs_flat_scr
)
242 else if (program
->needs_xnack_mask
)
244 else if (program
->needs_vcc
)
249 assert(!program
->needs_xnack_mask
);
250 if (program
->needs_flat_scr
)
252 else if (program
->needs_vcc
)
259 uint16_t get_sgpr_alloc(Program
*program
, uint16_t addressable_sgprs
)
261 assert(addressable_sgprs
<= program
->sgpr_limit
);
262 uint16_t sgprs
= addressable_sgprs
+ get_extra_sgprs(program
);
263 uint16_t granule
= program
->sgpr_alloc_granule
+ 1;
264 return align(std::max(sgprs
, granule
), granule
);
267 uint16_t get_addr_sgpr_from_waves(Program
*program
, uint16_t max_waves
)
269 uint16_t sgprs
= program
->physical_sgprs
/ max_waves
& ~program
->sgpr_alloc_granule
;
270 sgprs
-= get_extra_sgprs(program
);
271 return std::min(sgprs
, program
->sgpr_limit
);
274 void update_vgpr_sgpr_demand(Program
* program
, const RegisterDemand new_demand
)
276 /* TODO: max_waves_per_simd, simd_per_cu and the number of physical vgprs for Navi */
277 unsigned max_waves_per_simd
= 10;
278 unsigned simd_per_cu
= 4;
280 bool wgp
= program
->chip_class
>= GFX10
; /* assume WGP is used on Navi */
281 unsigned simd_per_cu_wgp
= wgp
? simd_per_cu
* 2 : simd_per_cu
;
282 unsigned lds_limit
= wgp
? program
->lds_limit
* 2 : program
->lds_limit
;
284 const int16_t vgpr_alloc
= std::max
<int16_t>(4, (new_demand
.vgpr
+ 3) & ~3);
285 /* this won't compile, register pressure reduction necessary */
286 if (new_demand
.vgpr
> program
->vgpr_limit
|| new_demand
.sgpr
> program
->sgpr_limit
) {
287 program
->num_waves
= 0;
288 program
->max_reg_demand
= new_demand
;
290 program
->num_waves
= program
->physical_sgprs
/ get_sgpr_alloc(program
, new_demand
.sgpr
);
291 program
->num_waves
= std::min
<uint16_t>(program
->num_waves
, 256 / vgpr_alloc
);
292 program
->max_waves
= max_waves_per_simd
;
294 /* adjust max_waves for workgroup and LDS limits */
295 unsigned workgroup_size
= program
->wave_size
;
296 if (program
->stage
== compute_cs
) {
297 unsigned* bsize
= program
->info
->cs
.block_size
;
298 workgroup_size
= bsize
[0] * bsize
[1] * bsize
[2];
300 unsigned waves_per_workgroup
= align(workgroup_size
, program
->wave_size
) / program
->wave_size
;
302 unsigned workgroups_per_cu_wgp
= max_waves_per_simd
* simd_per_cu_wgp
/ waves_per_workgroup
;
303 if (program
->config
->lds_size
) {
304 unsigned lds
= program
->config
->lds_size
* program
->lds_alloc_granule
;
305 workgroups_per_cu_wgp
= std::min(workgroups_per_cu_wgp
, lds_limit
/ lds
);
307 if (waves_per_workgroup
> 1 && program
->chip_class
< GFX10
)
308 workgroups_per_cu_wgp
= std::min(workgroups_per_cu_wgp
, 16u); /* TODO: is this a SI-only limit? what about Navi? */
310 /* in cases like waves_per_workgroup=3 or lds=65536 and
311 * waves_per_workgroup=1, we want the maximum possible number of waves per
312 * SIMD and not the minimum. so DIV_ROUND_UP is used */
313 program
->max_waves
= std::min
<uint16_t>(program
->max_waves
, DIV_ROUND_UP(workgroups_per_cu_wgp
* waves_per_workgroup
, simd_per_cu_wgp
));
315 /* incorporate max_waves and calculate max_reg_demand */
316 program
->num_waves
= std::min
<uint16_t>(program
->num_waves
, program
->max_waves
);
317 program
->max_reg_demand
.vgpr
= int16_t((256 / program
->num_waves
) & ~3);
318 program
->max_reg_demand
.sgpr
= get_addr_sgpr_from_waves(program
, program
->num_waves
);
322 live
live_var_analysis(Program
* program
,
323 const struct radv_nir_compiler_options
*options
)
326 result
.live_out
.resize(program
->blocks
.size());
327 result
.register_demand
.resize(program
->blocks
.size());
328 std::set
<unsigned> worklist
;
329 std::vector
<uint16_t> phi_sgpr_ops(program
->blocks
.size());
330 RegisterDemand new_demand
;
332 /* this implementation assumes that the block idx corresponds to the block's position in program->blocks vector */
333 for (Block
& block
: program
->blocks
)
334 worklist
.insert(block
.index
);
335 while (!worklist
.empty()) {
336 std::set
<unsigned>::reverse_iterator b_it
= worklist
.rbegin();
337 unsigned block_idx
= *b_it
;
338 worklist
.erase(block_idx
);
339 process_live_temps_per_block(program
, result
, &program
->blocks
[block_idx
], worklist
, phi_sgpr_ops
);
340 new_demand
.update(program
->blocks
[block_idx
].register_demand
);
343 /* calculate the program's register demand and number of waves */
344 update_vgpr_sgpr_demand(program
, new_demand
);