mesa.git: src/amd/compiler/aco_instruction_selection.cpp (commit 72c5284437be59fa8f9214df1a97b0c0f4e16ebd)
1 /*
2 * Copyright © 2018 Valve Corporation
3 * Copyright © 2018 Google
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22 * IN THE SOFTWARE.
23 *
24 */
25
26 #include <algorithm>
27 #include <array>
28 #include <stack>
29 #include <map>
30
31 #include "ac_shader_util.h"
32 #include "aco_ir.h"
33 #include "aco_builder.h"
34 #include "aco_interface.h"
35 #include "aco_instruction_selection_setup.cpp"
36 #include "util/fast_idiv_by_const.h"
37
38 namespace aco {
39 namespace {
40
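/* RAII helper: saves the parent-loop / parent-if control-flow info and the
 * divergence flags on construction and restores them on destruction; the
 * loop nest depth is incremented for the lifetime of the object. */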
41 class loop_info_RAII {
42 isel_context* ctx;
43 unsigned header_idx_old;
44 Block* exit_old;
45 bool divergent_cont_old;
46 bool divergent_branch_old;
47 bool divergent_if_old;
48
49 public:
50 loop_info_RAII(isel_context* ctx, unsigned loop_header_idx, Block* loop_exit)
51 : ctx(ctx),
52 header_idx_old(ctx->cf_info.parent_loop.header_idx), exit_old(ctx->cf_info.parent_loop.exit),
53 divergent_cont_old(ctx->cf_info.parent_loop.has_divergent_continue),
54 divergent_branch_old(ctx->cf_info.parent_loop.has_divergent_branch),
55 divergent_if_old(ctx->cf_info.parent_if.is_divergent)
56 {
57 ctx->cf_info.parent_loop.header_idx = loop_header_idx;
58 ctx->cf_info.parent_loop.exit = loop_exit;
59 ctx->cf_info.parent_loop.has_divergent_continue = false;
60 ctx->cf_info.parent_loop.has_divergent_branch = false;
61 ctx->cf_info.parent_if.is_divergent = false;
62 ctx->cf_info.loop_nest_depth = ctx->cf_info.loop_nest_depth + 1;
63 }
64
65 ~loop_info_RAII()
66 {
67 ctx->cf_info.parent_loop.header_idx = header_idx_old;
68 ctx->cf_info.parent_loop.exit = exit_old;
69 ctx->cf_info.parent_loop.has_divergent_continue = divergent_cont_old;
70 ctx->cf_info.parent_loop.has_divergent_branch = divergent_branch_old;
71 ctx->cf_info.parent_if.is_divergent = divergent_if_old;
72 ctx->cf_info.loop_nest_depth = ctx->cf_info.loop_nest_depth - 1;
73 if (!ctx->cf_info.loop_nest_depth && !ctx->cf_info.parent_if.is_divergent)
74 ctx->cf_info.exec_potentially_empty_discard = false;
75 }
76 };
77
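/* Bookkeeping for translating an if-statement: the condition, the saved
 * divergence / exec-emptiness flags, and the blocks and indices needed to
 * wire up the then, invert and endif blocks. */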
78 struct if_context {
79 Temp cond;
80
81 bool divergent_old;
82 bool exec_potentially_empty_discard_old;
83 bool exec_potentially_empty_break_old;
84 uint16_t exec_potentially_empty_break_depth_old;
85
86 unsigned BB_if_idx;
87 unsigned invert_idx;
88 bool then_branch_divergent;
89 Block BB_invert;
90 Block BB_endif;
91 };
92
93 static void visit_cf_list(struct isel_context *ctx,
94 struct exec_list *list);
95
96 static void add_logical_edge(unsigned pred_idx, Block *succ)
97 {
98 succ->logical_preds.emplace_back(pred_idx);
99 }
100
101
102 static void add_linear_edge(unsigned pred_idx, Block *succ)
103 {
104 succ->linear_preds.emplace_back(pred_idx);
105 }
106
107 static void add_edge(unsigned pred_idx, Block *succ)
108 {
109 add_logical_edge(pred_idx, succ);
110 add_linear_edge(pred_idx, succ);
111 }
112
113 static void append_logical_start(Block *b)
114 {
115 Builder(NULL, b).pseudo(aco_opcode::p_logical_start);
116 }
117
118 static void append_logical_end(Block *b)
119 {
120 Builder(NULL, b).pseudo(aco_opcode::p_logical_end);
121 }
122
123 Temp get_ssa_temp(struct isel_context *ctx, nir_ssa_def *def)
124 {
125 assert(ctx->allocated[def->index].id());
126 return ctx->allocated[def->index];
127 }
128
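/* Counts, per lane, the set bits of mask_lo/mask_hi belonging to lower-numbered
 * lanes; with the default all-ones masks this is simply the lane index within
 * the wave. */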
129 Temp emit_mbcnt(isel_context *ctx, Definition dst,
130 Operand mask_lo = Operand((uint32_t) -1), Operand mask_hi = Operand((uint32_t) -1))
131 {
132 Builder bld(ctx->program, ctx->block);
133 Definition lo_def = ctx->program->wave_size == 32 ? dst : bld.def(v1);
134 Temp thread_id_lo = bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, lo_def, mask_lo, Operand(0u));
135
136 if (ctx->program->wave_size == 32) {
137 return thread_id_lo;
138 } else {
139 Temp thread_id_hi = bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32, dst, mask_hi, thread_id_lo);
140 return thread_id_hi;
141 }
142 }
143
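/* In fragment shaders, wraps src in a p_wqm copy so that the value is computed
 * in whole-quad mode (i.e. also valid in helper lanes); other stages just copy. */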
144 Temp emit_wqm(isel_context *ctx, Temp src, Temp dst=Temp(0, s1), bool program_needs_wqm = false)
145 {
146 Builder bld(ctx->program, ctx->block);
147
148 if (!dst.id())
149 dst = bld.tmp(src.regClass());
150
151 assert(src.size() == dst.size());
152
153 if (ctx->stage != fragment_fs) {
154 if (!dst.id())
155 return src;
156
157 bld.copy(Definition(dst), src);
158 return dst;
159 }
160
161 bld.pseudo(aco_opcode::p_wqm, Definition(dst), src);
162 ctx->program->needs_wqm |= program_needs_wqm;
163 return dst;
164 }
165
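/* Wave-wide shuffle ("backwards permute"): every lane reads data from the lane
 * selected by index. A uniform index degenerates to a readlane. */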
166 static Temp emit_bpermute(isel_context *ctx, Builder &bld, Temp index, Temp data)
167 {
168 if (index.regClass() == s1)
169 return bld.readlane(bld.def(s1), data, index);
170
171 Temp index_x4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), index);
172
173 /* Currently not implemented on GFX6-7 */
174 assert(ctx->options->chip_class >= GFX8);
175
176 if (ctx->options->chip_class <= GFX9 || ctx->program->wave_size == 32) {
177 return bld.ds(aco_opcode::ds_bpermute_b32, bld.def(v1), index_x4, data);
178 }
179
180 /* GFX10, wave64 mode:
181 * The bpermute instruction is limited to half-wave operation, which means that it can't
182 * properly support subgroup shuffle like older generations (or wave32 mode), so we
183 * emulate it here.
184 */
185 if (!ctx->has_gfx10_wave64_bpermute) {
186 ctx->has_gfx10_wave64_bpermute = true;
187 ctx->program->config->num_shared_vgprs = 8; /* Shared VGPRs are allocated in groups of 8 */
188 ctx->program->vgpr_limit -= 4; /* We allocate 8 shared VGPRs, so we'll have 4 fewer normal VGPRs */
189 }
190
191 Temp lane_id = emit_mbcnt(ctx, bld.def(v1));
192 Temp lane_is_hi = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x20u), lane_id);
193 Temp index_is_hi = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x20u), index);
194 Temp cmp = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm, vcc), lane_is_hi, index_is_hi);
195
196 return bld.reduction(aco_opcode::p_wave64_bpermute, bld.def(v1), bld.def(s2), bld.def(s1, scc),
197 bld.vcc(cmp), Operand(v2.as_linear()), index_x4, data, gfx10_wave64_bpermute);
198 }
199
200 Temp as_vgpr(isel_context *ctx, Temp val)
201 {
202 if (val.type() == RegType::sgpr) {
203 Builder bld(ctx->program, ctx->block);
204 return bld.copy(bld.def(RegType::vgpr, val.size()), val);
205 }
206 assert(val.type() == RegType::vgpr);
207 return val;
208 }
209
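/* Unsigned division by the constant b using util_fast_udiv_info
 * (util/fast_idiv_by_const.h):
 *    dst = (((a >> pre_shift) + increment) * multiplier >> 32) >> post_shift */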
210 //assumes a != 0xffffffff (the optional +increment step must not overflow)
211 void emit_v_div_u32(isel_context *ctx, Temp dst, Temp a, uint32_t b)
212 {
213 assert(b != 0);
214 Builder bld(ctx->program, ctx->block);
215
216 if (util_is_power_of_two_or_zero(b)) {
217 bld.vop2(aco_opcode::v_lshrrev_b32, Definition(dst), Operand((uint32_t)util_logbase2(b)), a);
218 return;
219 }
220
221 util_fast_udiv_info info = util_compute_fast_udiv_info(b, 32, 32);
222
223 assert(info.multiplier <= 0xffffffff);
224
225 bool pre_shift = info.pre_shift != 0;
226 bool increment = info.increment != 0;
227 bool multiply = true;
228 bool post_shift = info.post_shift != 0;
229
230 if (!pre_shift && !increment && !multiply && !post_shift) {
231 bld.vop1(aco_opcode::v_mov_b32, Definition(dst), a);
232 return;
233 }
234
235 Temp pre_shift_dst = a;
236 if (pre_shift) {
237 pre_shift_dst = (increment || multiply || post_shift) ? bld.tmp(v1) : dst;
238 bld.vop2(aco_opcode::v_lshrrev_b32, Definition(pre_shift_dst), Operand((uint32_t)info.pre_shift), a);
239 }
240
241 Temp increment_dst = pre_shift_dst;
242 if (increment) {
243 increment_dst = (post_shift || multiply) ? bld.tmp(v1) : dst;
244 bld.vadd32(Definition(increment_dst), Operand((uint32_t) info.increment), pre_shift_dst);
245 }
246
247 Temp multiply_dst = increment_dst;
248 if (multiply) {
249 multiply_dst = post_shift ? bld.tmp(v1) : dst;
250 bld.vop3(aco_opcode::v_mul_hi_u32, Definition(multiply_dst), increment_dst,
251 bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand((uint32_t)info.multiplier)));
252 }
253
254 if (post_shift) {
255 bld.vop2(aco_opcode::v_lshrrev_b32, Definition(dst), Operand((uint32_t)info.post_shift), multiply_dst);
256 }
257 }
258
259 void emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, Temp dst)
260 {
261 Builder bld(ctx->program, ctx->block);
262 bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand(idx));
263 }
264
265
266 Temp emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, RegClass dst_rc)
267 {
268 /* no need to extract the whole vector */
269 if (src.regClass() == dst_rc) {
270 assert(idx == 0);
271 return src;
272 }
273 assert(src.size() > idx);
274 Builder bld(ctx->program, ctx->block);
275 auto it = ctx->allocated_vec.find(src.id());
276 /* the size check needs to be early because elements other than 0 may be garbage */
277 if (it != ctx->allocated_vec.end() && it->second[0].size() == dst_rc.size()) {
278 if (it->second[idx].regClass() == dst_rc) {
279 return it->second[idx];
280 } else {
281 assert(dst_rc.size() == it->second[idx].regClass().size());
282 assert(dst_rc.type() == RegType::vgpr && it->second[idx].type() == RegType::sgpr);
283 return bld.copy(bld.def(dst_rc), it->second[idx]);
284 }
285 }
286
287 if (src.size() == dst_rc.size()) {
288 assert(idx == 0);
289 return bld.copy(bld.def(dst_rc), src);
290 } else {
291 Temp dst = bld.tmp(dst_rc);
292 emit_extract_vector(ctx, src, idx, dst);
293 return dst;
294 }
295 }
296
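/* Splits vec_src into equally sized components and caches them in
 * ctx->allocated_vec so that later extractions can reuse them. */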
297 void emit_split_vector(isel_context* ctx, Temp vec_src, unsigned num_components)
298 {
299 if (num_components == 1)
300 return;
301 if (ctx->allocated_vec.find(vec_src.id()) != ctx->allocated_vec.end())
302 return;
303 aco_ptr<Pseudo_instruction> split{create_instruction<Pseudo_instruction>(aco_opcode::p_split_vector, Format::PSEUDO, 1, num_components)};
304 split->operands[0] = Operand(vec_src);
305 std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
306 for (unsigned i = 0; i < num_components; i++) {
307 elems[i] = {ctx->program->allocateId(), RegClass(vec_src.type(), vec_src.size() / num_components)};
308 split->definitions[i] = Definition(elems[i]);
309 }
310 ctx->block->instructions.emplace_back(std::move(split));
311 ctx->allocated_vec.emplace(vec_src.id(), elems);
312 }
313
314 /* This vector expansion uses a mask to determine which elements in the new vector
315 * come from the original vector. The remaining elements are zero-filled and must be treated as undefined by the caller. */
316 void expand_vector(isel_context* ctx, Temp vec_src, Temp dst, unsigned num_components, unsigned mask)
317 {
318 emit_split_vector(ctx, vec_src, util_bitcount(mask));
319
320 if (vec_src == dst)
321 return;
322
323 Builder bld(ctx->program, ctx->block);
324 if (num_components == 1) {
325 if (dst.type() == RegType::sgpr)
326 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec_src);
327 else
328 bld.copy(Definition(dst), vec_src);
329 return;
330 }
331
332 unsigned component_size = dst.size() / num_components;
333 std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
334
335 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
336 vec->definitions[0] = Definition(dst);
337 unsigned k = 0;
338 for (unsigned i = 0; i < num_components; i++) {
339 if (mask & (1 << i)) {
340 Temp src = emit_extract_vector(ctx, vec_src, k++, RegClass(vec_src.type(), component_size));
341 if (dst.type() == RegType::sgpr)
342 src = bld.as_uniform(src);
343 vec->operands[i] = Operand(src);
344 } else {
345 vec->operands[i] = Operand(0u);
346 }
347 elems[i] = vec->operands[i].getTemp();
348 }
349 ctx->block->instructions.emplace_back(std::move(vec));
350 ctx->allocated_vec.emplace(dst.id(), elems);
351 }
352
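/* Expands a scalar (SCC-style) boolean into a lane mask: all bits set when val
 * is true, zero otherwise. */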
353 Temp bool_to_vector_condition(isel_context *ctx, Temp val, Temp dst = Temp(0, s2))
354 {
355 Builder bld(ctx->program, ctx->block);
356 if (!dst.id())
357 dst = bld.tmp(bld.lm);
358
359 assert(val.regClass() == s1);
360 assert(dst.regClass() == bld.lm);
361
362 return bld.sop2(Builder::s_cselect, Definition(dst), Operand((uint32_t) -1), Operand(0u), bld.scc(val));
363 }
364
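/* Collapses a lane mask into a scalar boolean: true iff the bool is set in any
 * lane that is active in exec. */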
365 Temp bool_to_scalar_condition(isel_context *ctx, Temp val, Temp dst = Temp(0, s1))
366 {
367 Builder bld(ctx->program, ctx->block);
368 if (!dst.id())
369 dst = bld.tmp(s1);
370
371 assert(val.regClass() == bld.lm);
372 assert(dst.regClass() == s1);
373
374 /* if we're currently in WQM mode, ensure that the source is also computed in WQM */
375 Temp tmp = bld.tmp(s1);
376 bld.sop2(Builder::s_and, bld.def(bld.lm), bld.scc(Definition(tmp)), val, Operand(exec, bld.lm));
377 return emit_wqm(ctx, tmp, dst);
378 }
379
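/* Returns the NIR ALU source as a Temp of `size` components with the swizzle
 * applied; a new vector is built when the swizzle is not the identity. */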
380 Temp get_alu_src(struct isel_context *ctx, nir_alu_src src, unsigned size=1)
381 {
382 if (src.src.ssa->num_components == 1 && src.swizzle[0] == 0 && size == 1)
383 return get_ssa_temp(ctx, src.src.ssa);
384
385 if (src.src.ssa->num_components == size) {
386 bool identity_swizzle = true;
387 for (unsigned i = 0; identity_swizzle && i < size; i++) {
388 if (src.swizzle[i] != i)
389 identity_swizzle = false;
390 }
391 if (identity_swizzle)
392 return get_ssa_temp(ctx, src.src.ssa);
393 }
394
395 Temp vec = get_ssa_temp(ctx, src.src.ssa);
396 unsigned elem_size = vec.size() / src.src.ssa->num_components;
397 assert(elem_size > 0); /* TODO: 8 and 16-bit vectors not supported */
398 assert(vec.size() % elem_size == 0);
399
400 RegClass elem_rc = RegClass(vec.type(), elem_size);
401 if (size == 1) {
402 return emit_extract_vector(ctx, vec, src.swizzle[0], elem_rc);
403 } else {
404 assert(size <= 4);
405 std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
406 aco_ptr<Pseudo_instruction> vec_instr{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, size, 1)};
407 for (unsigned i = 0; i < size; ++i) {
408 elems[i] = emit_extract_vector(ctx, vec, src.swizzle[i], elem_rc);
409 vec_instr->operands[i] = Operand{elems[i]};
410 }
411 Temp dst{ctx->program->allocateId(), RegClass(vec.type(), elem_size * size)};
412 vec_instr->definitions[0] = Definition(dst);
413 ctx->block->instructions.emplace_back(std::move(vec_instr));
414 ctx->allocated_vec.emplace(dst.id(), elems);
415 return dst;
416 }
417 }
418
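/* Widens a 32-bit address to a 64-bit pointer by appending the known high
 * 32 bits (options->address32_hi). */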
419 Temp convert_pointer_to_64_bit(isel_context *ctx, Temp ptr)
420 {
421 if (ptr.size() == 2)
422 return ptr;
423 Builder bld(ctx->program, ctx->block);
424 if (ptr.type() == RegType::vgpr)
425 ptr = bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), ptr);
426 return bld.pseudo(aco_opcode::p_create_vector, bld.def(s2),
427 ptr, Operand((unsigned)ctx->options->address32_hi));
428 }
429
430 void emit_sop2_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst, bool writes_scc)
431 {
432 aco_ptr<SOP2_instruction> sop2{create_instruction<SOP2_instruction>(op, Format::SOP2, 2, writes_scc ? 2 : 1)};
433 sop2->operands[0] = Operand(get_alu_src(ctx, instr->src[0]));
434 sop2->operands[1] = Operand(get_alu_src(ctx, instr->src[1]));
435 sop2->definitions[0] = Definition(dst);
436 if (writes_scc)
437 sop2->definitions[1] = Definition(ctx->program->allocateId(), scc, s1);
438 ctx->block->instructions.emplace_back(std::move(sop2));
439 }
440
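/* VOP2 only allows an SGPR or constant in src0: if src1 is an SGPR, swap the
 * operands when the op is commutative, promote to VOP3 when possible, or fall
 * back to copying src1 into a VGPR. */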
441 void emit_vop2_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst,
442 bool commutative, bool swap_srcs=false, bool flush_denorms = false)
443 {
444 Builder bld(ctx->program, ctx->block);
445 Temp src0 = get_alu_src(ctx, instr->src[swap_srcs ? 1 : 0]);
446 Temp src1 = get_alu_src(ctx, instr->src[swap_srcs ? 0 : 1]);
447 if (src1.type() == RegType::sgpr) {
448 if (commutative && src0.type() == RegType::vgpr) {
449 Temp t = src0;
450 src0 = src1;
451 src1 = t;
452 } else if (src0.type() == RegType::vgpr &&
453 op != aco_opcode::v_madmk_f32 &&
454 op != aco_opcode::v_madak_f32 &&
455 op != aco_opcode::v_madmk_f16 &&
456 op != aco_opcode::v_madak_f16) {
457 /* If the instruction is not commutative, we emit a VOP3A instruction */
458 bld.vop2_e64(op, Definition(dst), src0, src1);
459 return;
460 } else {
461 src1 = bld.copy(bld.def(RegType::vgpr, src1.size()), src1); //TODO: as_vgpr
462 }
463 }
464
465 if (flush_denorms && ctx->program->chip_class < GFX9) {
466 assert(dst.size() == 1);
467 Temp tmp = bld.vop2(op, bld.def(v1), src0, src1);
468 bld.vop2(aco_opcode::v_mul_f32, Definition(dst), Operand(0x3f800000u), tmp);
469 } else {
470 bld.vop2(op, Definition(dst), src0, src1);
471 }
472 }
473
474 void emit_vop3a_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst,
475 bool flush_denorms = false)
476 {
477 Temp src0 = get_alu_src(ctx, instr->src[0]);
478 Temp src1 = get_alu_src(ctx, instr->src[1]);
479 Temp src2 = get_alu_src(ctx, instr->src[2]);
480
481 /* ensure that the instruction has at most 1 sgpr operand;
482 * the optimizer will inline constants for us */
483 if (src0.type() == RegType::sgpr && src1.type() == RegType::sgpr)
484 src0 = as_vgpr(ctx, src0);
485 if (src1.type() == RegType::sgpr && src2.type() == RegType::sgpr)
486 src1 = as_vgpr(ctx, src1);
487 if (src2.type() == RegType::sgpr && src0.type() == RegType::sgpr)
488 src2 = as_vgpr(ctx, src2);
489
490 Builder bld(ctx->program, ctx->block);
491 if (flush_denorms && ctx->program->chip_class < GFX9) {
492 assert(dst.size() == 1);
493 Temp tmp = bld.vop3(op, Definition(dst), src0, src1, src2);
494 bld.vop2(aco_opcode::v_mul_f32, Definition(dst), Operand(0x3f800000u), tmp);
495 } else {
496 bld.vop3(op, Definition(dst), src0, src1, src2);
497 }
498 }
499
500 void emit_vop1_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst)
501 {
502 Builder bld(ctx->program, ctx->block);
503 bld.vop1(op, Definition(dst), get_alu_src(ctx, instr->src[0]));
504 }
505
506 void emit_vopc_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst)
507 {
508 Temp src0 = get_alu_src(ctx, instr->src[0]);
509 Temp src1 = get_alu_src(ctx, instr->src[1]);
510 assert(src0.size() == src1.size());
511
512 aco_ptr<Instruction> vopc;
513 if (src1.type() == RegType::sgpr) {
514 if (src0.type() == RegType::vgpr) {
515 /* to swap the operands, we might also have to change the opcode */
516 switch (op) {
517 case aco_opcode::v_cmp_lt_f32:
518 op = aco_opcode::v_cmp_gt_f32;
519 break;
520 case aco_opcode::v_cmp_ge_f32:
521 op = aco_opcode::v_cmp_le_f32;
522 break;
523 case aco_opcode::v_cmp_lt_i32:
524 op = aco_opcode::v_cmp_gt_i32;
525 break;
526 case aco_opcode::v_cmp_ge_i32:
527 op = aco_opcode::v_cmp_le_i32;
528 break;
529 case aco_opcode::v_cmp_lt_u32:
530 op = aco_opcode::v_cmp_gt_u32;
531 break;
532 case aco_opcode::v_cmp_ge_u32:
533 op = aco_opcode::v_cmp_le_u32;
534 break;
535 case aco_opcode::v_cmp_lt_f64:
536 op = aco_opcode::v_cmp_gt_f64;
537 break;
538 case aco_opcode::v_cmp_ge_f64:
539 op = aco_opcode::v_cmp_le_f64;
540 break;
541 case aco_opcode::v_cmp_lt_i64:
542 op = aco_opcode::v_cmp_gt_i64;
543 break;
544 case aco_opcode::v_cmp_ge_i64:
545 op = aco_opcode::v_cmp_le_i64;
546 break;
547 case aco_opcode::v_cmp_lt_u64:
548 op = aco_opcode::v_cmp_gt_u64;
549 break;
550 case aco_opcode::v_cmp_ge_u64:
551 op = aco_opcode::v_cmp_le_u64;
552 break;
553 default: /* eq and ne are commutative */
554 break;
555 }
556 Temp t = src0;
557 src0 = src1;
558 src1 = t;
559 } else {
560 src1 = as_vgpr(ctx, src1);
561 }
562 }
563
564 Builder bld(ctx->program, ctx->block);
565 bld.vopc(op, bld.hint_vcc(Definition(dst)), src0, src1);
566 }
567
568 void emit_sopc_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst)
569 {
570 Temp src0 = get_alu_src(ctx, instr->src[0]);
571 Temp src1 = get_alu_src(ctx, instr->src[1]);
572 Builder bld(ctx->program, ctx->block);
573
574 assert(dst.regClass() == bld.lm);
575 assert(src0.type() == RegType::sgpr);
576 assert(src1.type() == RegType::sgpr);
577 assert(src0.regClass() == src1.regClass());
578
579 /* Emit the SALU comparison instruction */
580 Temp cmp = bld.sopc(op, bld.scc(bld.def(s1)), src0, src1);
581 /* Turn the result into a per-lane bool */
582 bool_to_vector_condition(ctx, cmp, dst);
583 }
584
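/* Chooses between the SALU and VALU form of a comparison: the VALU form is
 * required when the result is divergent, when a source already lives in a
 * VGPR, or when no SALU opcode exists. */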
585 void emit_comparison(isel_context *ctx, nir_alu_instr *instr, Temp dst,
586 aco_opcode v32_op, aco_opcode v64_op, aco_opcode s32_op = aco_opcode::num_opcodes, aco_opcode s64_op = aco_opcode::num_opcodes)
587 {
588 aco_opcode s_op = instr->src[0].src.ssa->bit_size == 64 ? s64_op : s32_op;
589 aco_opcode v_op = instr->src[0].src.ssa->bit_size == 64 ? v64_op : v32_op;
590 bool divergent_vals = ctx->divergent_vals[instr->dest.dest.ssa.index];
591 bool use_valu = s_op == aco_opcode::num_opcodes ||
592 divergent_vals ||
593 ctx->allocated[instr->src[0].src.ssa->index].type() == RegType::vgpr ||
594 ctx->allocated[instr->src[1].src.ssa->index].type() == RegType::vgpr;
595 aco_opcode op = use_valu ? v_op : s_op;
596 assert(op != aco_opcode::num_opcodes);
597 assert(dst.regClass() == ctx->program->lane_mask);
598
599 if (use_valu)
600 emit_vopc_instruction(ctx, instr, op, dst);
601 else
602 emit_sopc_instruction(ctx, instr, op, dst);
603 }
604
605 void emit_boolean_logic(isel_context *ctx, nir_alu_instr *instr, Builder::WaveSpecificOpcode op, Temp dst)
606 {
607 Builder bld(ctx->program, ctx->block);
608 Temp src0 = get_alu_src(ctx, instr->src[0]);
609 Temp src1 = get_alu_src(ctx, instr->src[1]);
610
611 assert(dst.regClass() == bld.lm);
612 assert(src0.regClass() == bld.lm);
613 assert(src1.regClass() == bld.lm);
614
615 bld.sop2(op, Definition(dst), bld.def(s1, scc), src0, src1);
616 }
617
618 void emit_bcsel(isel_context *ctx, nir_alu_instr *instr, Temp dst)
619 {
620 Builder bld(ctx->program, ctx->block);
621 Temp cond = get_alu_src(ctx, instr->src[0]);
622 Temp then = get_alu_src(ctx, instr->src[1]);
623 Temp els = get_alu_src(ctx, instr->src[2]);
624
625 assert(cond.regClass() == bld.lm);
626
627 if (dst.type() == RegType::vgpr) {
628 aco_ptr<Instruction> bcsel;
629 if (dst.size() == 1) {
630 then = as_vgpr(ctx, then);
631 els = as_vgpr(ctx, els);
632
633 bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), els, then, cond);
634 } else if (dst.size() == 2) {
635 Temp then_lo = bld.tmp(v1), then_hi = bld.tmp(v1);
636 bld.pseudo(aco_opcode::p_split_vector, Definition(then_lo), Definition(then_hi), then);
637 Temp else_lo = bld.tmp(v1), else_hi = bld.tmp(v1);
638 bld.pseudo(aco_opcode::p_split_vector, Definition(else_lo), Definition(else_hi), els);
639
640 Temp dst0 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_lo, then_lo, cond);
641 Temp dst1 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_hi, then_hi, cond);
642
643 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
644 } else {
645 fprintf(stderr, "Unimplemented NIR instr bit size: ");
646 nir_print_instr(&instr->instr, stderr);
647 fprintf(stderr, "\n");
648 }
649 return;
650 }
651
652 if (instr->dest.dest.ssa.bit_size == 1) {
653 assert(dst.regClass() == bld.lm);
654 assert(then.regClass() == bld.lm);
655 assert(els.regClass() == bld.lm);
656 }
657
658 if (!ctx->divergent_vals[instr->src[0].src.ssa->index]) { /* uniform condition and values in sgpr */
659 if (dst.regClass() == s1 || dst.regClass() == s2) {
660 assert((then.regClass() == s1 || then.regClass() == s2) && els.regClass() == then.regClass());
661 assert(dst.size() == then.size());
662 aco_opcode op = dst.regClass() == s1 ? aco_opcode::s_cselect_b32 : aco_opcode::s_cselect_b64;
663 bld.sop2(op, Definition(dst), then, els, bld.scc(bool_to_scalar_condition(ctx, cond)));
664 } else {
665 fprintf(stderr, "Unimplemented uniform bcsel bit size: ");
666 nir_print_instr(&instr->instr, stderr);
667 fprintf(stderr, "\n");
668 }
669 return;
670 }
671
672 /* divergent boolean bcsel
673 * this implements bcsel on bools: dst = s0 ? s1 : s2,
674 * which is lowered to: dst = (s0 & s1) | (~s0 & s2) */
675 assert(instr->dest.dest.ssa.bit_size == 1);
676
677 if (cond.id() != then.id())
678 then = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), cond, then);
679
680 if (cond.id() == els.id())
681 bld.sop1(Builder::s_mov, Definition(dst), then);
682 else
683 bld.sop2(Builder::s_or, Definition(dst), bld.def(s1, scc), then,
684 bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), els, cond));
685 }
686
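/* Helper for ops whose hardware implementation flushes denormals: denormal
 * inputs (v_cmp_class bits 4 and 7 = negative/positive denormal) are
 * pre-scaled by 2^24 (0x4b800000), the op is applied, and `undo` rescales
 * the result; normal inputs take the unscaled path. */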
687 void emit_scaled_op(isel_context *ctx, Builder& bld, Definition dst, Temp val,
688 aco_opcode op, uint32_t undo)
689 {
690 /* multiply by 16777216 to handle denormals */
691 Temp is_denormal = bld.vopc(aco_opcode::v_cmp_class_f32, bld.hint_vcc(bld.def(bld.lm)),
692 as_vgpr(ctx, val), bld.copy(bld.def(v1), Operand((1u << 7) | (1u << 4))));
693 Temp scaled = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0x4b800000u), val);
694 scaled = bld.vop1(op, bld.def(v1), scaled);
695 scaled = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(undo), scaled);
696
697 Temp not_scaled = bld.vop1(op, bld.def(v1), val);
698
699 bld.vop2(aco_opcode::v_cndmask_b32, dst, not_scaled, scaled, is_denormal);
700 }
701
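/* If denormals are flushed anyway (denorm32 == 0), the plain hardware opcode
 * suffices; otherwise go through the scaled-op path above. */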
702 void emit_rcp(isel_context *ctx, Builder& bld, Definition dst, Temp val)
703 {
704 if (ctx->block->fp_mode.denorm32 == 0) {
705 bld.vop1(aco_opcode::v_rcp_f32, dst, val);
706 return;
707 }
708
709 emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_rcp_f32, 0x4b800000u);
710 }
711
712 void emit_rsq(isel_context *ctx, Builder& bld, Definition dst, Temp val)
713 {
714 if (ctx->block->fp_mode.denorm32 == 0) {
715 bld.vop1(aco_opcode::v_rsq_f32, dst, val);
716 return;
717 }
718
719 emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_rsq_f32, 0x45800000u);
720 }
721
722 void emit_sqrt(isel_context *ctx, Builder& bld, Definition dst, Temp val)
723 {
724 if (ctx->block->fp_mode.denorm32 == 0) {
725 bld.vop1(aco_opcode::v_sqrt_f32, dst, val);
726 return;
727 }
728
729 emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_sqrt_f32, 0x39800000u);
730 }
731
732 void emit_log2(isel_context *ctx, Builder& bld, Definition dst, Temp val)
733 {
734 if (ctx->block->fp_mode.denorm32 == 0) {
735 bld.vop1(aco_opcode::v_log_f32, dst, val);
736 return;
737 }
738
739 emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_log_f32, 0xc1c00000u);
740 }
741
742 Temp emit_trunc_f64(isel_context *ctx, Builder& bld, Definition dst, Temp val)
743 {
744 if (ctx->options->chip_class >= GFX7)
745 return bld.vop1(aco_opcode::v_trunc_f64, Definition(dst), val);
746
747 /* GFX6 doesn't support V_TRUNC_F64, lower it. */
748 /* TODO: create more efficient code! */
749 if (val.type() == RegType::sgpr)
750 val = as_vgpr(ctx, val);
751
752 /* Split the input value. */
753 Temp val_lo = bld.tmp(v1), val_hi = bld.tmp(v1);
754 bld.pseudo(aco_opcode::p_split_vector, Definition(val_lo), Definition(val_hi), val);
755
756 /* Extract the exponent and compute the unbiased value. */
757 Temp exponent = bld.vop1(aco_opcode::v_frexp_exp_i32_f64, bld.def(v1), val);
758
759 /* Extract the fractional part. */
760 Temp fract_mask = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(-1u), Operand(0x000fffffu));
761 fract_mask = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), fract_mask, exponent);
762
763 Temp fract_mask_lo = bld.tmp(v1), fract_mask_hi = bld.tmp(v1);
764 bld.pseudo(aco_opcode::p_split_vector, Definition(fract_mask_lo), Definition(fract_mask_hi), fract_mask);
765
766 Temp fract_lo = bld.tmp(v1), fract_hi = bld.tmp(v1);
767 Temp tmp = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), fract_mask_lo);
768 fract_lo = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), val_lo, tmp);
769 tmp = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), fract_mask_hi);
770 fract_hi = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), val_hi, tmp);
771
772 /* Get the sign bit. */
773 Temp sign = bld.vop2(aco_opcode::v_ashr_i32, bld.def(v1), Operand(31u), val_hi);
774
775 /* Decide the operation to apply depending on the unbiased exponent. */
776 Temp exp_lt0 = bld.vopc_e64(aco_opcode::v_cmp_lt_i32, bld.hint_vcc(bld.def(bld.lm)), exponent, Operand(0u));
777 Temp dst_lo = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), fract_lo, bld.copy(bld.def(v1), Operand(0u)), exp_lt0);
778 Temp dst_hi = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), fract_hi, sign, exp_lt0);
779 Temp exp_gt51 = bld.vopc_e64(aco_opcode::v_cmp_gt_i32, bld.def(s2), exponent, Operand(51u));
780 dst_lo = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), dst_lo, val_lo, exp_gt51);
781 dst_hi = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), dst_hi, val_hi, exp_gt51);
782
783 return bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst_lo, dst_hi);
784 }
785
786 Temp emit_floor_f64(isel_context *ctx, Builder& bld, Definition dst, Temp val)
787 {
788 if (ctx->options->chip_class >= GFX7)
789 return bld.vop1(aco_opcode::v_floor_f64, Definition(dst), val);
790
791 /* GFX6 doesn't support V_FLOOR_F64, lower it. */
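   /* Lowered as floor(x) = x - min(fract(x), 1.0 - ulp); for NaN inputs the
    * subtrahend is x itself, so the result stays NaN. */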
792 Temp src0 = as_vgpr(ctx, val);
793
794 Temp mask = bld.copy(bld.def(s1), Operand(3u)); /* isnan */
795 Temp min_val = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(-1u), Operand(0x3fefffffu));
796
797 Temp isnan = bld.vopc_e64(aco_opcode::v_cmp_class_f64, bld.hint_vcc(bld.def(bld.lm)), src0, mask);
798 Temp fract = bld.vop1(aco_opcode::v_fract_f64, bld.def(v2), src0);
799 Temp min = bld.vop3(aco_opcode::v_min_f64, bld.def(v2), fract, min_val);
800
801 Temp then_lo = bld.tmp(v1), then_hi = bld.tmp(v1);
802 bld.pseudo(aco_opcode::p_split_vector, Definition(then_lo), Definition(then_hi), src0);
803 Temp else_lo = bld.tmp(v1), else_hi = bld.tmp(v1);
804 bld.pseudo(aco_opcode::p_split_vector, Definition(else_lo), Definition(else_hi), min);
805
806 Temp dst0 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_lo, then_lo, isnan);
807 Temp dst1 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_hi, then_hi, isnan);
808
809 Temp v = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), dst0, dst1);
810
811 Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst), src0, v);
812 static_cast<VOP3A_instruction*>(add)->neg[1] = true;
813
814 return add->definitions[0].getTemp();
815 }
816
817 void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
818 {
819 if (!instr->dest.dest.is_ssa) {
820 fprintf(stderr, "nir alu dst not in ssa: ");
821 nir_print_instr(&instr->instr, stderr);
822 fprintf(stderr, "\n");
823 abort();
824 }
825 Builder bld(ctx->program, ctx->block);
826 Temp dst = get_ssa_temp(ctx, &instr->dest.dest.ssa);
827 switch(instr->op) {
828 case nir_op_vec2:
829 case nir_op_vec3:
830 case nir_op_vec4: {
831 std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
832 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, instr->dest.dest.ssa.num_components, 1)};
833 for (unsigned i = 0; i < instr->dest.dest.ssa.num_components; ++i) {
834 elems[i] = get_alu_src(ctx, instr->src[i]);
835 vec->operands[i] = Operand{elems[i]};
836 }
837 vec->definitions[0] = Definition(dst);
838 ctx->block->instructions.emplace_back(std::move(vec));
839 ctx->allocated_vec.emplace(dst.id(), elems);
840 break;
841 }
842 case nir_op_mov: {
843 Temp src = get_alu_src(ctx, instr->src[0]);
844 aco_ptr<Instruction> mov;
845 if (dst.type() == RegType::sgpr) {
846 if (src.type() == RegType::vgpr)
847 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), src);
848 else if (src.regClass() == s1)
849 bld.sop1(aco_opcode::s_mov_b32, Definition(dst), src);
850 else if (src.regClass() == s2)
851 bld.sop1(aco_opcode::s_mov_b64, Definition(dst), src);
852 else
853 unreachable("wrong src register class for nir_op_mov");
854 } else if (dst.regClass() == v1) {
855 bld.vop1(aco_opcode::v_mov_b32, Definition(dst), src);
856 } else if (dst.regClass() == v2) {
857 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src);
858 } else {
859 nir_print_instr(&instr->instr, stderr);
860 unreachable("Should have been lowered to scalar.");
861 }
862 break;
863 }
864 case nir_op_inot: {
865 Temp src = get_alu_src(ctx, instr->src[0]);
866 if (instr->dest.dest.ssa.bit_size == 1) {
867 assert(src.regClass() == bld.lm);
868 assert(dst.regClass() == bld.lm);
869 /* Don't use s_andn2 here; this allows the optimizer to make a better decision */
870 Temp tmp = bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc), src);
871 bld.sop2(Builder::s_and, Definition(dst), bld.def(s1, scc), tmp, Operand(exec, bld.lm));
872 } else if (dst.regClass() == v1) {
873 emit_vop1_instruction(ctx, instr, aco_opcode::v_not_b32, dst);
874 } else if (dst.type() == RegType::sgpr) {
875 aco_opcode opcode = dst.size() == 1 ? aco_opcode::s_not_b32 : aco_opcode::s_not_b64;
876 bld.sop1(opcode, Definition(dst), bld.def(s1, scc), src);
877 } else {
878 fprintf(stderr, "Unimplemented NIR instr bit size: ");
879 nir_print_instr(&instr->instr, stderr);
880 fprintf(stderr, "\n");
881 }
882 break;
883 }
884 case nir_op_ineg: {
885 Temp src = get_alu_src(ctx, instr->src[0]);
886 if (dst.regClass() == v1) {
887 bld.vsub32(Definition(dst), Operand(0u), Operand(src));
888 } else if (dst.regClass() == s1) {
889 bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand((uint32_t) -1), src);
890 } else if (dst.size() == 2) {
891 Temp src0 = bld.tmp(dst.type(), 1);
892 Temp src1 = bld.tmp(dst.type(), 1);
893 bld.pseudo(aco_opcode::p_split_vector, Definition(src0), Definition(src1), src);
894
895 if (dst.regClass() == s2) {
896 Temp carry = bld.tmp(s1);
897 Temp dst0 = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(carry)), Operand(0u), src0);
898 Temp dst1 = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), Operand(0u), src1, carry);
899 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
900 } else {
901 Temp lower = bld.tmp(v1);
902 Temp borrow = bld.vsub32(Definition(lower), Operand(0u), src0, true).def(1).getTemp();
903 Temp upper = bld.vsub32(bld.def(v1), Operand(0u), src1, false, borrow);
904 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
905 }
906 } else {
907 fprintf(stderr, "Unimplemented NIR instr bit size: ");
908 nir_print_instr(&instr->instr, stderr);
909 fprintf(stderr, "\n");
910 }
911 break;
912 }
913 case nir_op_iabs: {
914 if (dst.regClass() == s1) {
915 bld.sop1(aco_opcode::s_abs_i32, Definition(dst), bld.def(s1, scc), get_alu_src(ctx, instr->src[0]));
916 } else if (dst.regClass() == v1) {
917 Temp src = get_alu_src(ctx, instr->src[0]);
918 bld.vop2(aco_opcode::v_max_i32, Definition(dst), src, bld.vsub32(bld.def(v1), Operand(0u), src));
919 } else {
920 fprintf(stderr, "Unimplemented NIR instr bit size: ");
921 nir_print_instr(&instr->instr, stderr);
922 fprintf(stderr, "\n");
923 }
924 break;
925 }
926 case nir_op_isign: {
927 Temp src = get_alu_src(ctx, instr->src[0]);
928 if (dst.regClass() == s1) {
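      /* sign(x) = (x > 0 ? 1 : 0) + (x >> 31): the arithmetic shift yields -1
       * for negative x and 0 otherwise. */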
929 Temp tmp = bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), src, Operand(31u));
930 Temp gtz = bld.sopc(aco_opcode::s_cmp_gt_i32, bld.def(s1, scc), src, Operand(0u));
931 bld.sop2(aco_opcode::s_add_i32, Definition(dst), bld.def(s1, scc), gtz, tmp);
932 } else if (dst.regClass() == s2) {
933 Temp neg = bld.sop2(aco_opcode::s_ashr_i64, bld.def(s2), bld.def(s1, scc), src, Operand(63u));
934 Temp neqz;
935 if (ctx->program->chip_class >= GFX8)
936 neqz = bld.sopc(aco_opcode::s_cmp_lg_u64, bld.def(s1, scc), src, Operand(0u));
937 else
938 neqz = bld.sop2(aco_opcode::s_or_b64, bld.def(s2), bld.def(s1, scc), src, Operand(0u)).def(1).getTemp();
939 /* SCC gets zero-extended to 64 bit */
940 bld.sop2(aco_opcode::s_or_b64, Definition(dst), bld.def(s1, scc), neg, bld.scc(neqz));
941 } else if (dst.regClass() == v1) {
942 Temp tmp = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), src);
943 Temp gtz = bld.vopc(aco_opcode::v_cmp_ge_i32, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
944 bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand(1u), tmp, gtz);
945 } else if (dst.regClass() == v2) {
946 Temp upper = emit_extract_vector(ctx, src, 1, v1);
947 Temp neg = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), upper);
948 Temp gtz = bld.vopc(aco_opcode::v_cmp_ge_i64, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
949 Temp lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(1u), neg, gtz);
950 upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), neg, gtz);
951 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
952 } else {
953 fprintf(stderr, "Unimplemented NIR instr bit size: ");
954 nir_print_instr(&instr->instr, stderr);
955 fprintf(stderr, "\n");
956 }
957 break;
958 }
959 case nir_op_imax: {
960 if (dst.regClass() == v1) {
961 emit_vop2_instruction(ctx, instr, aco_opcode::v_max_i32, dst, true);
962 } else if (dst.regClass() == s1) {
963 emit_sop2_instruction(ctx, instr, aco_opcode::s_max_i32, dst, true);
964 } else {
965 fprintf(stderr, "Unimplemented NIR instr bit size: ");
966 nir_print_instr(&instr->instr, stderr);
967 fprintf(stderr, "\n");
968 }
969 break;
970 }
971 case nir_op_umax: {
972 if (dst.regClass() == v1) {
973 emit_vop2_instruction(ctx, instr, aco_opcode::v_max_u32, dst, true);
974 } else if (dst.regClass() == s1) {
975 emit_sop2_instruction(ctx, instr, aco_opcode::s_max_u32, dst, true);
976 } else {
977 fprintf(stderr, "Unimplemented NIR instr bit size: ");
978 nir_print_instr(&instr->instr, stderr);
979 fprintf(stderr, "\n");
980 }
981 break;
982 }
983 case nir_op_imin: {
984 if (dst.regClass() == v1) {
985 emit_vop2_instruction(ctx, instr, aco_opcode::v_min_i32, dst, true);
986 } else if (dst.regClass() == s1) {
987 emit_sop2_instruction(ctx, instr, aco_opcode::s_min_i32, dst, true);
988 } else {
989 fprintf(stderr, "Unimplemented NIR instr bit size: ");
990 nir_print_instr(&instr->instr, stderr);
991 fprintf(stderr, "\n");
992 }
993 break;
994 }
995 case nir_op_umin: {
996 if (dst.regClass() == v1) {
997 emit_vop2_instruction(ctx, instr, aco_opcode::v_min_u32, dst, true);
998 } else if (dst.regClass() == s1) {
999 emit_sop2_instruction(ctx, instr, aco_opcode::s_min_u32, dst, true);
1000 } else {
1001 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1002 nir_print_instr(&instr->instr, stderr);
1003 fprintf(stderr, "\n");
1004 }
1005 break;
1006 }
1007 case nir_op_ior: {
1008 if (instr->dest.dest.ssa.bit_size == 1) {
1009 emit_boolean_logic(ctx, instr, Builder::s_or, dst);
1010 } else if (dst.regClass() == v1) {
1011 emit_vop2_instruction(ctx, instr, aco_opcode::v_or_b32, dst, true);
1012 } else if (dst.regClass() == s1) {
1013 emit_sop2_instruction(ctx, instr, aco_opcode::s_or_b32, dst, true);
1014 } else if (dst.regClass() == s2) {
1015 emit_sop2_instruction(ctx, instr, aco_opcode::s_or_b64, dst, true);
1016 } else {
1017 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1018 nir_print_instr(&instr->instr, stderr);
1019 fprintf(stderr, "\n");
1020 }
1021 break;
1022 }
1023 case nir_op_iand: {
1024 if (instr->dest.dest.ssa.bit_size == 1) {
1025 emit_boolean_logic(ctx, instr, Builder::s_and, dst);
1026 } else if (dst.regClass() == v1) {
1027 emit_vop2_instruction(ctx, instr, aco_opcode::v_and_b32, dst, true);
1028 } else if (dst.regClass() == s1) {
1029 emit_sop2_instruction(ctx, instr, aco_opcode::s_and_b32, dst, true);
1030 } else if (dst.regClass() == s2) {
1031 emit_sop2_instruction(ctx, instr, aco_opcode::s_and_b64, dst, true);
1032 } else {
1033 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1034 nir_print_instr(&instr->instr, stderr);
1035 fprintf(stderr, "\n");
1036 }
1037 break;
1038 }
1039 case nir_op_ixor: {
1040 if (instr->dest.dest.ssa.bit_size == 1) {
1041 emit_boolean_logic(ctx, instr, Builder::s_xor, dst);
1042 } else if (dst.regClass() == v1) {
1043 emit_vop2_instruction(ctx, instr, aco_opcode::v_xor_b32, dst, true);
1044 } else if (dst.regClass() == s1) {
1045 emit_sop2_instruction(ctx, instr, aco_opcode::s_xor_b32, dst, true);
1046 } else if (dst.regClass() == s2) {
1047 emit_sop2_instruction(ctx, instr, aco_opcode::s_xor_b64, dst, true);
1048 } else {
1049 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1050 nir_print_instr(&instr->instr, stderr);
1051 fprintf(stderr, "\n");
1052 }
1053 break;
1054 }
1055 case nir_op_ushr: {
1056 if (dst.regClass() == v1) {
1057 emit_vop2_instruction(ctx, instr, aco_opcode::v_lshrrev_b32, dst, false, true);
1058 } else if (dst.regClass() == v2 && ctx->program->chip_class >= GFX8) {
1059 bld.vop3(aco_opcode::v_lshrrev_b64, Definition(dst),
1060 get_alu_src(ctx, instr->src[1]), get_alu_src(ctx, instr->src[0]));
1061 } else if (dst.regClass() == v2) {
1062 bld.vop3(aco_opcode::v_lshr_b64, Definition(dst),
1063 get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1064 } else if (dst.regClass() == s2) {
1065 emit_sop2_instruction(ctx, instr, aco_opcode::s_lshr_b64, dst, true);
1066 } else if (dst.regClass() == s1) {
1067 emit_sop2_instruction(ctx, instr, aco_opcode::s_lshr_b32, dst, true);
1068 } else {
1069 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1070 nir_print_instr(&instr->instr, stderr);
1071 fprintf(stderr, "\n");
1072 }
1073 break;
1074 }
1075 case nir_op_ishl: {
1076 if (dst.regClass() == v1) {
1077 emit_vop2_instruction(ctx, instr, aco_opcode::v_lshlrev_b32, dst, false, true);
1078 } else if (dst.regClass() == v2 && ctx->program->chip_class >= GFX8) {
1079 bld.vop3(aco_opcode::v_lshlrev_b64, Definition(dst),
1080 get_alu_src(ctx, instr->src[1]), get_alu_src(ctx, instr->src[0]));
1081 } else if (dst.regClass() == v2) {
1082 bld.vop3(aco_opcode::v_lshl_b64, Definition(dst),
1083 get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1084 } else if (dst.regClass() == s1) {
1085 emit_sop2_instruction(ctx, instr, aco_opcode::s_lshl_b32, dst, true);
1086 } else if (dst.regClass() == s2) {
1087 emit_sop2_instruction(ctx, instr, aco_opcode::s_lshl_b64, dst, true);
1088 } else {
1089 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1090 nir_print_instr(&instr->instr, stderr);
1091 fprintf(stderr, "\n");
1092 }
1093 break;
1094 }
1095 case nir_op_ishr: {
1096 if (dst.regClass() == v1) {
1097 emit_vop2_instruction(ctx, instr, aco_opcode::v_ashrrev_i32, dst, false, true);
1098 } else if (dst.regClass() == v2 && ctx->program->chip_class >= GFX8) {
1099 bld.vop3(aco_opcode::v_ashrrev_i64, Definition(dst),
1100 get_alu_src(ctx, instr->src[1]), get_alu_src(ctx, instr->src[0]));
1101 } else if (dst.regClass() == v2) {
1102 bld.vop3(aco_opcode::v_ashr_i64, Definition(dst),
1103 get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1104 } else if (dst.regClass() == s1) {
1105 emit_sop2_instruction(ctx, instr, aco_opcode::s_ashr_i32, dst, true);
1106 } else if (dst.regClass() == s2) {
1107 emit_sop2_instruction(ctx, instr, aco_opcode::s_ashr_i64, dst, true);
1108 } else {
1109 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1110 nir_print_instr(&instr->instr, stderr);
1111 fprintf(stderr, "\n");
1112 }
1113 break;
1114 }
1115 case nir_op_find_lsb: {
1116 Temp src = get_alu_src(ctx, instr->src[0]);
1117 if (src.regClass() == s1) {
1118 bld.sop1(aco_opcode::s_ff1_i32_b32, Definition(dst), src);
1119 } else if (src.regClass() == v1) {
1120 emit_vop1_instruction(ctx, instr, aco_opcode::v_ffbl_b32, dst);
1121 } else if (src.regClass() == s2) {
1122 bld.sop1(aco_opcode::s_ff1_i32_b64, Definition(dst), src);
1123 } else {
1124 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1125 nir_print_instr(&instr->instr, stderr);
1126 fprintf(stderr, "\n");
1127 }
1128 break;
1129 }
1130 case nir_op_ufind_msb:
1131 case nir_op_ifind_msb: {
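      /* s_flbit/v_ffbh count from the MSB (and return -1 if no bit is found),
       * so the result is converted to an LSB-based index; the not-found case
       * is detected via the borrow of the subtraction. */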
1132 Temp src = get_alu_src(ctx, instr->src[0]);
1133 if (src.regClass() == s1 || src.regClass() == s2) {
1134 aco_opcode op = src.regClass() == s2 ?
1135 (instr->op == nir_op_ufind_msb ? aco_opcode::s_flbit_i32_b64 : aco_opcode::s_flbit_i32_i64) :
1136 (instr->op == nir_op_ufind_msb ? aco_opcode::s_flbit_i32_b32 : aco_opcode::s_flbit_i32);
1137 Temp msb_rev = bld.sop1(op, bld.def(s1), src);
1138
1139 Builder::Result sub = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc),
1140 Operand(src.size() * 32u - 1u), msb_rev);
1141 Temp msb = sub.def(0).getTemp();
1142 Temp carry = sub.def(1).getTemp();
1143
1144 bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand((uint32_t)-1), msb, bld.scc(carry));
1145 } else if (src.regClass() == v1) {
1146 aco_opcode op = instr->op == nir_op_ufind_msb ? aco_opcode::v_ffbh_u32 : aco_opcode::v_ffbh_i32;
1147 Temp msb_rev = bld.tmp(v1);
1148 emit_vop1_instruction(ctx, instr, op, msb_rev);
1149 Temp msb = bld.tmp(v1);
1150 Temp carry = bld.vsub32(Definition(msb), Operand(31u), Operand(msb_rev), true).def(1).getTemp();
1151 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), msb, Operand((uint32_t)-1), carry);
1152 } else {
1153 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1154 nir_print_instr(&instr->instr, stderr);
1155 fprintf(stderr, "\n");
1156 }
1157 break;
1158 }
1159 case nir_op_bitfield_reverse: {
1160 if (dst.regClass() == s1) {
1161 bld.sop1(aco_opcode::s_brev_b32, Definition(dst), get_alu_src(ctx, instr->src[0]));
1162 } else if (dst.regClass() == v1) {
1163 bld.vop1(aco_opcode::v_bfrev_b32, Definition(dst), get_alu_src(ctx, instr->src[0]));
1164 } else {
1165 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1166 nir_print_instr(&instr->instr, stderr);
1167 fprintf(stderr, "\n");
1168 }
1169 break;
1170 }
1171 case nir_op_iadd: {
1172 if (dst.regClass() == s1) {
1173 emit_sop2_instruction(ctx, instr, aco_opcode::s_add_u32, dst, true);
1174 break;
1175 }
1176
1177 Temp src0 = get_alu_src(ctx, instr->src[0]);
1178 Temp src1 = get_alu_src(ctx, instr->src[1]);
1179 if (dst.regClass() == v1) {
1180 bld.vadd32(Definition(dst), Operand(src0), Operand(src1));
1181 break;
1182 }
1183
1184 assert(src0.size() == 2 && src1.size() == 2);
1185 Temp src00 = bld.tmp(src0.type(), 1);
1186 Temp src01 = bld.tmp(dst.type(), 1);
1187 bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1188 Temp src10 = bld.tmp(src1.type(), 1);
1189 Temp src11 = bld.tmp(dst.type(), 1);
1190 bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1191
1192 if (dst.regClass() == s2) {
1193 Temp carry = bld.tmp(s1);
1194 Temp dst0 = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10);
1195 Temp dst1 = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), src01, src11, bld.scc(carry));
1196 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
1197 } else if (dst.regClass() == v2) {
1198 Temp dst0 = bld.tmp(v1);
1199 Temp carry = bld.vadd32(Definition(dst0), src00, src10, true).def(1).getTemp();
1200 Temp dst1 = bld.vadd32(bld.def(v1), src01, src11, false, carry);
1201 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
1202 } else {
1203 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1204 nir_print_instr(&instr->instr, stderr);
1205 fprintf(stderr, "\n");
1206 }
1207 break;
1208 }
1209 case nir_op_uadd_sat: {
1210 Temp src0 = get_alu_src(ctx, instr->src[0]);
1211 Temp src1 = get_alu_src(ctx, instr->src[1]);
1212 if (dst.regClass() == s1) {
1213 Temp tmp = bld.tmp(s1), carry = bld.tmp(s1);
1214 bld.sop2(aco_opcode::s_add_u32, Definition(tmp), bld.scc(Definition(carry)),
1215 src0, src1);
1216 bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand((uint32_t) -1), tmp, bld.scc(carry));
1217 } else if (dst.regClass() == v1) {
1218 if (ctx->options->chip_class >= GFX9) {
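         /* On GFX9+, the VOP3 clamp bit turns v_add_u32 into a saturating add. */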
1219 aco_ptr<VOP3A_instruction> add{create_instruction<VOP3A_instruction>(aco_opcode::v_add_u32, asVOP3(Format::VOP2), 2, 1)};
1220 add->operands[0] = Operand(src0);
1221 add->operands[1] = Operand(src1);
1222 add->definitions[0] = Definition(dst);
1223 add->clamp = 1;
1224 ctx->block->instructions.emplace_back(std::move(add));
1225 } else {
1226 if (src1.regClass() != v1)
1227 std::swap(src0, src1);
1228 assert(src1.regClass() == v1);
1229 Temp tmp = bld.tmp(v1);
1230 Temp carry = bld.vadd32(Definition(tmp), src0, src1, true).def(1).getTemp();
1231 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), tmp, Operand((uint32_t) -1), carry);
1232 }
1233 } else {
1234 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1235 nir_print_instr(&instr->instr, stderr);
1236 fprintf(stderr, "\n");
1237 }
1238 break;
1239 }
1240 case nir_op_uadd_carry: {
1241 Temp src0 = get_alu_src(ctx, instr->src[0]);
1242 Temp src1 = get_alu_src(ctx, instr->src[1]);
1243 if (dst.regClass() == s1) {
1244 bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(dst)), src0, src1);
1245 break;
1246 }
1247 if (dst.regClass() == v1) {
1248 Temp carry = bld.vadd32(bld.def(v1), src0, src1, true).def(1).getTemp();
1249 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand(1u), carry);
1250 break;
1251 }
1252
1253 Temp src00 = bld.tmp(src0.type(), 1);
1254 Temp src01 = bld.tmp(dst.type(), 1);
1255 bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1256 Temp src10 = bld.tmp(src1.type(), 1);
1257 Temp src11 = bld.tmp(dst.type(), 1);
1258 bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1259 if (dst.regClass() == s2) {
1260 Temp carry = bld.tmp(s1);
1261 bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10);
1262 carry = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.scc(bld.def(s1)), src01, src11, bld.scc(carry)).def(1).getTemp();
1263 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), carry, Operand(0u));
1264 } else if (dst.regClass() == v2) {
1265 Temp carry = bld.vadd32(bld.def(v1), src00, src10, true).def(1).getTemp();
1266 carry = bld.vadd32(bld.def(v1), src01, src11, true, carry).def(1).getTemp();
1267 carry = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), Operand(1u), carry);
1268 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), carry, Operand(0u));
1269 } else {
1270 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1271 nir_print_instr(&instr->instr, stderr);
1272 fprintf(stderr, "\n");
1273 }
1274 break;
1275 }
1276 case nir_op_isub: {
1277 if (dst.regClass() == s1) {
1278 emit_sop2_instruction(ctx, instr, aco_opcode::s_sub_i32, dst, true);
1279 break;
1280 }
1281
1282 Temp src0 = get_alu_src(ctx, instr->src[0]);
1283 Temp src1 = get_alu_src(ctx, instr->src[1]);
1284 if (dst.regClass() == v1) {
1285 bld.vsub32(Definition(dst), src0, src1);
1286 break;
1287 }
1288
1289 Temp src00 = bld.tmp(src0.type(), 1);
1290 Temp src01 = bld.tmp(dst.type(), 1);
1291 bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1292 Temp src10 = bld.tmp(src1.type(), 1);
1293 Temp src11 = bld.tmp(dst.type(), 1);
1294 bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1295 if (dst.regClass() == s2) {
1296 Temp carry = bld.tmp(s1);
1297 Temp dst0 = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10);
1298 Temp dst1 = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), src01, src11, carry);
1299 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
1300 } else if (dst.regClass() == v2) {
1301 Temp lower = bld.tmp(v1);
1302 Temp borrow = bld.vsub32(Definition(lower), src00, src10, true).def(1).getTemp();
1303 Temp upper = bld.vsub32(bld.def(v1), src01, src11, false, borrow);
1304 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1305 } else {
1306 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1307 nir_print_instr(&instr->instr, stderr);
1308 fprintf(stderr, "\n");
1309 }
1310 break;
1311 }
1312 case nir_op_usub_borrow: {
1313 Temp src0 = get_alu_src(ctx, instr->src[0]);
1314 Temp src1 = get_alu_src(ctx, instr->src[1]);
1315 if (dst.regClass() == s1) {
1316 bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(dst)), src0, src1);
1317 break;
1318 } else if (dst.regClass() == v1) {
1319 Temp borrow = bld.vsub32(bld.def(v1), src0, src1, true).def(1).getTemp();
1320 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand(1u), borrow);
1321 break;
1322 }
1323
1324 Temp src00 = bld.tmp(src0.type(), 1);
1325 Temp src01 = bld.tmp(dst.type(), 1);
1326 bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1327 Temp src10 = bld.tmp(src1.type(), 1);
1328 Temp src11 = bld.tmp(dst.type(), 1);
1329 bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1330 if (dst.regClass() == s2) {
1331 Temp borrow = bld.tmp(s1);
1332 bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), src00, src10);
1333 borrow = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.scc(bld.def(s1)), src01, src11, bld.scc(borrow)).def(1).getTemp();
1334 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), borrow, Operand(0u));
1335 } else if (dst.regClass() == v2) {
1336 Temp borrow = bld.vsub32(bld.def(v1), src00, src10, true).def(1).getTemp();
1337 borrow = bld.vsub32(bld.def(v1), src01, src11, true, Operand(borrow)).def(1).getTemp();
1338 borrow = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), Operand(1u), borrow);
1339 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), borrow, Operand(0u));
1340 } else {
1341 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1342 nir_print_instr(&instr->instr, stderr);
1343 fprintf(stderr, "\n");
1344 }
1345 break;
1346 }
1347 case nir_op_imul: {
1348 if (dst.regClass() == v1) {
1349 bld.vop3(aco_opcode::v_mul_lo_u32, Definition(dst),
1350 get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1351 } else if (dst.regClass() == s1) {
1352 emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_i32, dst, false);
1353 } else {
1354 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1355 nir_print_instr(&instr->instr, stderr);
1356 fprintf(stderr, "\n");
1357 }
1358 break;
1359 }
1360 case nir_op_umul_high: {
1361 if (dst.regClass() == v1) {
1362 bld.vop3(aco_opcode::v_mul_hi_u32, Definition(dst), get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1363 } else if (dst.regClass() == s1 && ctx->options->chip_class >= GFX9) {
1364 bld.sop2(aco_opcode::s_mul_hi_u32, Definition(dst), get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1365 } else if (dst.regClass() == s1) {
1366 Temp tmp = bld.vop3(aco_opcode::v_mul_hi_u32, bld.def(v1), get_alu_src(ctx, instr->src[0]),
1367 as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
1368 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
1369 } else {
1370 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1371 nir_print_instr(&instr->instr, stderr);
1372 fprintf(stderr, "\n");
1373 }
1374 break;
1375 }
1376 case nir_op_imul_high: {
1377 if (dst.regClass() == v1) {
1378 bld.vop3(aco_opcode::v_mul_hi_i32, Definition(dst), get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1379 } else if (dst.regClass() == s1 && ctx->options->chip_class >= GFX9) {
1380 bld.sop2(aco_opcode::s_mul_hi_i32, Definition(dst), get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1381 } else if (dst.regClass() == s1) {
1382 Temp tmp = bld.vop3(aco_opcode::v_mul_hi_i32, bld.def(v1), get_alu_src(ctx, instr->src[0]),
1383 as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
1384 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
1385 } else {
1386 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1387 nir_print_instr(&instr->instr, stderr);
1388 fprintf(stderr, "\n");
1389 }
1390 break;
1391 }
1392 case nir_op_fmul: {
1393 if (dst.size() == 1) {
1394 emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_f32, dst, true);
1395 } else if (dst.size() == 2) {
1396 bld.vop3(aco_opcode::v_mul_f64, Definition(dst), get_alu_src(ctx, instr->src[0]),
1397 as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
1398 } else {
1399 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1400 nir_print_instr(&instr->instr, stderr);
1401 fprintf(stderr, "\n");
1402 }
1403 break;
1404 }
1405 case nir_op_fadd: {
1406 if (dst.size() == 1) {
1407 emit_vop2_instruction(ctx, instr, aco_opcode::v_add_f32, dst, true);
1408 } else if (dst.size() == 2) {
1409 bld.vop3(aco_opcode::v_add_f64, Definition(dst), get_alu_src(ctx, instr->src[0]),
1410 as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
1411 } else {
1412 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1413 nir_print_instr(&instr->instr, stderr);
1414 fprintf(stderr, "\n");
1415 }
1416 break;
1417 }
1418 case nir_op_fsub: {
1419 Temp src0 = get_alu_src(ctx, instr->src[0]);
1420 Temp src1 = get_alu_src(ctx, instr->src[1]);
1421 if (dst.size() == 1) {
1422 if (src1.type() == RegType::vgpr || src0.type() != RegType::vgpr)
1423 emit_vop2_instruction(ctx, instr, aco_opcode::v_sub_f32, dst, false);
1424 else
1425 emit_vop2_instruction(ctx, instr, aco_opcode::v_subrev_f32, dst, true);
1426 } else if (dst.size() == 2) {
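/* There is no v_sub_f64: use v_add_f64 and set the VOP3 neg modifier on the
 * second source, so the hardware computes src0 + (-src1). */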
1427 Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst),
1428 get_alu_src(ctx, instr->src[0]),
1429 as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
1430 VOP3A_instruction* sub = static_cast<VOP3A_instruction*>(add);
1431 sub->neg[1] = true;
1432 } else {
1433 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1434 nir_print_instr(&instr->instr, stderr);
1435 fprintf(stderr, "\n");
1436 }
1437 break;
1438 }
1439 case nir_op_fmax: {
1440 if (dst.size() == 1) {
1441 emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f32, dst, true, false, ctx->block->fp_mode.must_flush_denorms32);
1442 } else if (dst.size() == 2) {
1443 if (ctx->block->fp_mode.must_flush_denorms16_64 && ctx->program->chip_class < GFX9) {
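/* v_max_f64 apparently doesn't flush denormals on pre-GFX9 chips, so multiply
 * the result by 1.0 to apply the flush required by the FP mode. */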
1444 Temp tmp = bld.vop3(aco_opcode::v_max_f64, bld.def(v2),
1445 get_alu_src(ctx, instr->src[0]),
1446 as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
1447 bld.vop3(aco_opcode::v_mul_f64, Definition(dst), Operand(0x3FF0000000000000lu), tmp);
1448 } else {
1449 bld.vop3(aco_opcode::v_max_f64, Definition(dst),
1450 get_alu_src(ctx, instr->src[0]),
1451 as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
1452 }
1453 } else {
1454 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1455 nir_print_instr(&instr->instr, stderr);
1456 fprintf(stderr, "\n");
1457 }
1458 break;
1459 }
1460 case nir_op_fmin: {
1461 if (dst.size() == 1) {
1462 emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f32, dst, true, false, ctx->block->fp_mode.must_flush_denorms32);
1463 } else if (dst.size() == 2) {
1464 if (ctx->block->fp_mode.must_flush_denorms16_64 && ctx->program->chip_class < GFX9) {
1465 Temp tmp = bld.vop3(aco_opcode::v_min_f64, bld.def(v2),
1466 get_alu_src(ctx, instr->src[0]),
1467 as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
1468 bld.vop3(aco_opcode::v_mul_f64, Definition(dst), Operand(0x3FF0000000000000lu), tmp);
1469 } else {
1470 bld.vop3(aco_opcode::v_min_f64, Definition(dst),
1471 get_alu_src(ctx, instr->src[0]),
1472 as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
1473 }
1474 } else {
1475 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1476 nir_print_instr(&instr->instr, stderr);
1477 fprintf(stderr, "\n");
1478 }
1479 break;
1480 }
1481 case nir_op_fmax3: {
1482 if (dst.size() == 1) {
1483 emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_f32, dst, ctx->block->fp_mode.must_flush_denorms32);
1484 } else {
1485 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1486 nir_print_instr(&instr->instr, stderr);
1487 fprintf(stderr, "\n");
1488 }
1489 break;
1490 }
1491 case nir_op_fmin3: {
1492 if (dst.size() == 1) {
1493 emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_f32, dst, ctx->block->fp_mode.must_flush_denorms32);
1494 } else {
1495 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1496 nir_print_instr(&instr->instr, stderr);
1497 fprintf(stderr, "\n");
1498 }
1499 break;
1500 }
1501 case nir_op_fmed3: {
1502 if (dst.size() == 1) {
1503 emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_f32, dst, ctx->block->fp_mode.must_flush_denorms32);
1504 } else {
1505 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1506 nir_print_instr(&instr->instr, stderr);
1507 fprintf(stderr, "\n");
1508 }
1509 break;
1510 }
1511 case nir_op_umax3: {
1512 if (dst.size() == 1) {
1513 emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_u32, dst);
1514 } else {
1515 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1516 nir_print_instr(&instr->instr, stderr);
1517 fprintf(stderr, "\n");
1518 }
1519 break;
1520 }
1521 case nir_op_umin3: {
1522 if (dst.size() == 1) {
1523 emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_u32, dst);
1524 } else {
1525 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1526 nir_print_instr(&instr->instr, stderr);
1527 fprintf(stderr, "\n");
1528 }
1529 break;
1530 }
1531 case nir_op_umed3: {
1532 if (dst.size() == 1) {
1533 emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_u32, dst);
1534 } else {
1535 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1536 nir_print_instr(&instr->instr, stderr);
1537 fprintf(stderr, "\n");
1538 }
1539 break;
1540 }
1541 case nir_op_imax3: {
1542 if (dst.size() == 1) {
1543 emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_i32, dst);
1544 } else {
1545 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1546 nir_print_instr(&instr->instr, stderr);
1547 fprintf(stderr, "\n");
1548 }
1549 break;
1550 }
1551 case nir_op_imin3: {
1552 if (dst.size() == 1) {
1553 emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_i32, dst);
1554 } else {
1555 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1556 nir_print_instr(&instr->instr, stderr);
1557 fprintf(stderr, "\n");
1558 }
1559 break;
1560 }
1561 case nir_op_imed3: {
1562 if (dst.size() == 1) {
1563 emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_i32, dst);
1564 } else {
1565 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1566 nir_print_instr(&instr->instr, stderr);
1567 fprintf(stderr, "\n");
1568 }
1569 break;
1570 }
1571 case nir_op_cube_face_coord: {
1572 Temp in = get_alu_src(ctx, instr->src[0], 3);
1573 Temp src[3] = { emit_extract_vector(ctx, in, 0, v1),
1574 emit_extract_vector(ctx, in, 1, v1),
1575 emit_extract_vector(ctx, in, 2, v1) };
1576 Temp ma = bld.vop3(aco_opcode::v_cubema_f32, bld.def(v1), src[0], src[1], src[2]);
1577 ma = bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), ma);
1578 Temp sc = bld.vop3(aco_opcode::v_cubesc_f32, bld.def(v1), src[0], src[1], src[2]);
1579 Temp tc = bld.vop3(aco_opcode::v_cubetc_f32, bld.def(v1), src[0], src[1], src[2]);
1580 sc = bld.vop2(aco_opcode::v_madak_f32, bld.def(v1), sc, ma, Operand(0x3f000000u/*0.5*/));
1581 tc = bld.vop2(aco_opcode::v_madak_f32, bld.def(v1), tc, ma, Operand(0x3f000000u/*0.5*/));
1582 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), sc, tc);
1583 break;
1584 }
1585 case nir_op_cube_face_index: {
1586 Temp in = get_alu_src(ctx, instr->src[0], 3);
1587 Temp src[3] = { emit_extract_vector(ctx, in, 0, v1),
1588 emit_extract_vector(ctx, in, 1, v1),
1589 emit_extract_vector(ctx, in, 2, v1) };
1590 bld.vop3(aco_opcode::v_cubeid_f32, Definition(dst), src[0], src[1], src[2]);
1591 break;
1592 }
1593 case nir_op_bcsel: {
1594 emit_bcsel(ctx, instr, dst);
1595 break;
1596 }
1597 case nir_op_frsq: {
1598 if (dst.size() == 1) {
1599 emit_rsq(ctx, bld, Definition(dst), get_alu_src(ctx, instr->src[0]));
1600 } else if (dst.size() == 2) {
1601 emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f64, dst);
1602 } else {
1603 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1604 nir_print_instr(&instr->instr, stderr);
1605 fprintf(stderr, "\n");
1606 }
1607 break;
1608 }
1609 case nir_op_fneg: {
1610 Temp src = get_alu_src(ctx, instr->src[0]);
1611 if (dst.size() == 1) {
1612 if (ctx->block->fp_mode.must_flush_denorms32)
1613 src = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0x3f800000u), as_vgpr(ctx, src));
1614 bld.vop2(aco_opcode::v_xor_b32, Definition(dst), Operand(0x80000000u), as_vgpr(ctx, src));
1615 } else if (dst.size() == 2) {
1616 if (ctx->block->fp_mode.must_flush_denorms16_64)
1617 src = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), Operand(0x3FF0000000000000lu), as_vgpr(ctx, src));
1618 Temp upper = bld.tmp(v1), lower = bld.tmp(v1);
1619 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
1620 upper = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), Operand(0x80000000u), upper);
1621 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1622 } else {
1623 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1624 nir_print_instr(&instr->instr, stderr);
1625 fprintf(stderr, "\n");
1626 }
1627 break;
1628 }
1629 case nir_op_fabs: {
1630 Temp src = get_alu_src(ctx, instr->src[0]);
1631 if (dst.size() == 1) {
1632 if (ctx->block->fp_mode.must_flush_denorms32)
1633 src = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0x3f800000u), as_vgpr(ctx, src));
1634 bld.vop2(aco_opcode::v_and_b32, Definition(dst), Operand(0x7FFFFFFFu), as_vgpr(ctx, src));
1635 } else if (dst.size() == 2) {
1636 if (ctx->block->fp_mode.must_flush_denorms16_64)
1637 src = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), Operand(0x3FF0000000000000lu), as_vgpr(ctx, src));
1638 Temp upper = bld.tmp(v1), lower = bld.tmp(v1);
1639 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
1640 upper = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7FFFFFFFu), upper);
1641 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1642 } else {
1643 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1644 nir_print_instr(&instr->instr, stderr);
1645 fprintf(stderr, "\n");
1646 }
1647 break;
1648 }
1649 case nir_op_fsat: {
1650 Temp src = get_alu_src(ctx, instr->src[0]);
1651 if (dst.size() == 1) {
1652 bld.vop3(aco_opcode::v_med3_f32, Definition(dst), Operand(0u), Operand(0x3f800000u), src);
1653 /* apparently, it is not necessary to flush denorms if this instruction is used with these operands */
1654 // TODO: confirm that this holds under all circumstances
1655 } else if (dst.size() == 2) {
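/* There is no v_med3_f64: saturate by adding 0.0 with the VOP3 clamp bit set,
 * which clamps the result to [0.0, 1.0]. */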
1656 Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst), src, Operand(0u));
1657 VOP3A_instruction* vop3 = static_cast<VOP3A_instruction*>(add);
1658 vop3->clamp = true;
1659 } else {
1660 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1661 nir_print_instr(&instr->instr, stderr);
1662 fprintf(stderr, "\n");
1663 }
1664 break;
1665 }
1666 case nir_op_flog2: {
1667 if (dst.size() == 1) {
1668 emit_log2(ctx, bld, Definition(dst), get_alu_src(ctx, instr->src[0]));
1669 } else {
1670 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1671 nir_print_instr(&instr->instr, stderr);
1672 fprintf(stderr, "\n");
1673 }
1674 break;
1675 }
1676 case nir_op_frcp: {
1677 if (dst.size() == 1) {
1678 emit_rcp(ctx, bld, Definition(dst), get_alu_src(ctx, instr->src[0]));
1679 } else if (dst.size() == 2) {
1680 emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f64, dst);
1681 } else {
1682 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1683 nir_print_instr(&instr->instr, stderr);
1684 fprintf(stderr, "\n");
1685 }
1686 break;
1687 }
1688 case nir_op_fexp2: {
1689 if (dst.size() == 1) {
1690 emit_vop1_instruction(ctx, instr, aco_opcode::v_exp_f32, dst);
1691 } else {
1692 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1693 nir_print_instr(&instr->instr, stderr);
1694 fprintf(stderr, "\n");
1695 }
1696 break;
1697 }
1698 case nir_op_fsqrt: {
1699 if (dst.size() == 1) {
1700 emit_sqrt(ctx, bld, Definition(dst), get_alu_src(ctx, instr->src[0]));
1701 } else if (dst.size() == 2) {
1702 emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f64, dst);
1703 } else {
1704 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1705 nir_print_instr(&instr->instr, stderr);
1706 fprintf(stderr, "\n");
1707 }
1708 break;
1709 }
1710 case nir_op_ffract: {
1711 if (dst.size() == 1) {
1712 emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f32, dst);
1713 } else if (dst.size() == 2) {
1714 emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f64, dst);
1715 } else {
1716 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1717 nir_print_instr(&instr->instr, stderr);
1718 fprintf(stderr, "\n");
1719 }
1720 break;
1721 }
1722 case nir_op_ffloor: {
1723 if (dst.size() == 1) {
1724 emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f32, dst);
1725 } else if (dst.size() == 2) {
1726 emit_floor_f64(ctx, bld, Definition(dst), get_alu_src(ctx, instr->src[0]));
1727 } else {
1728 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1729 nir_print_instr(&instr->instr, stderr);
1730 fprintf(stderr, "\n");
1731 }
1732 break;
1733 }
1734 case nir_op_fceil: {
1735 if (dst.size() == 1) {
1736 emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f32, dst);
1737 } else if (dst.size() == 2) {
1738 if (ctx->options->chip_class >= GFX7) {
1739 emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f64, dst);
1740 } else {
1741 /* GFX6 doesn't support V_CEIL_F64, lower it. */
1742 Temp src0 = get_alu_src(ctx, instr->src[0]);
1743
1744 /* trunc = trunc(src0)
1745 * if (src0 > 0.0 && src0 != trunc)
1746 * trunc += 1.0
1747 */
1748 Temp trunc = emit_trunc_f64(ctx, bld, bld.def(v2), src0);
1749 Temp tmp0 = bld.vopc_e64(aco_opcode::v_cmp_gt_f64, bld.def(bld.lm), src0, Operand(0u));
1750 Temp tmp1 = bld.vopc(aco_opcode::v_cmp_lg_f64, bld.hint_vcc(bld.def(bld.lm)), src0, trunc);
1751 Temp cond = bld.sop2(aco_opcode::s_and_b64, bld.hint_vcc(bld.def(s2)), bld.def(s1, scc), tmp0, tmp1);
1752 Temp add = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), bld.copy(bld.def(v1), Operand(0u)), bld.copy(bld.def(v1), Operand(0x3ff00000u)), cond);
1753 add = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), bld.copy(bld.def(v1), Operand(0u)), add);
1754 bld.vop3(aco_opcode::v_add_f64, Definition(dst), trunc, add);
1755 }
1756 } else {
1757 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1758 nir_print_instr(&instr->instr, stderr);
1759 fprintf(stderr, "\n");
1760 }
1761 break;
1762 }
1763 case nir_op_ftrunc: {
1764 if (dst.size() == 1) {
1765 emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f32, dst);
1766 } else if (dst.size() == 2) {
1767 emit_trunc_f64(ctx, bld, Definition(dst), get_alu_src(ctx, instr->src[0]));
1768 } else {
1769 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1770 nir_print_instr(&instr->instr, stderr);
1771 fprintf(stderr, "\n");
1772 }
1773 break;
1774 }
1775 case nir_op_fround_even: {
1776 if (dst.size() == 1) {
1777 emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f32, dst);
1778 } else if (dst.size() == 2) {
1779 if (ctx->options->chip_class >= GFX7) {
1780 emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f64, dst);
1781 } else {
1782 /* GFX6 doesn't support V_RNDNE_F64, lower it. */
1783 Temp src0 = get_alu_src(ctx, instr->src[0]);
1784
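/* Round to nearest even by adding and then subtracting 2^52 (high dword 0x43300000),
 * with the sign of the source copied onto the constant via v_bfi so negative values
 * round correctly. Sources with |src| above ~2^52 (the 0x432fffff... compare, done
 * with the abs modifier) are already integral and are passed through unchanged. */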
1785 Temp src0_lo = bld.tmp(v1), src0_hi = bld.tmp(v1);
1786 bld.pseudo(aco_opcode::p_split_vector, Definition(src0_lo), Definition(src0_hi), src0);
1787
1788 Temp bitmask = bld.sop1(aco_opcode::s_brev_b32, bld.def(s1), bld.copy(bld.def(s1), Operand(-2u)));
1789 Temp bfi = bld.vop3(aco_opcode::v_bfi_b32, bld.def(v1), bitmask, bld.copy(bld.def(v1), Operand(0x43300000u)), as_vgpr(ctx, src0_hi));
1790 Temp tmp = bld.vop3(aco_opcode::v_add_f64, bld.def(v2), src0, bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(0u), bfi));
1791 Instruction *sub = bld.vop3(aco_opcode::v_add_f64, bld.def(v2), tmp, bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(0u), bfi));
1792 static_cast<VOP3A_instruction*>(sub)->neg[1] = true;
1793 tmp = sub->definitions[0].getTemp();
1794
1795 Temp v = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(-1u), Operand(0x432fffffu));
1796 Instruction* vop3 = bld.vopc_e64(aco_opcode::v_cmp_gt_f64, bld.hint_vcc(bld.def(bld.lm)), src0, v);
1797 static_cast<VOP3A_instruction*>(vop3)->abs[0] = true;
1798 Temp cond = vop3->definitions[0].getTemp();
1799
1800 Temp tmp_lo = bld.tmp(v1), tmp_hi = bld.tmp(v1);
1801 bld.pseudo(aco_opcode::p_split_vector, Definition(tmp_lo), Definition(tmp_hi), tmp);
1802 Temp dst0 = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp_lo, as_vgpr(ctx, src0_lo), cond);
1803 Temp dst1 = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp_hi, as_vgpr(ctx, src0_hi), cond);
1804
1805 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
1806 }
1807 } else {
1808 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1809 nir_print_instr(&instr->instr, stderr);
1810 fprintf(stderr, "\n");
1811 }
1812 break;
1813 }
1814 case nir_op_fsin:
1815 case nir_op_fcos: {
1816 Temp src = get_alu_src(ctx, instr->src[0]);
1818 if (dst.size() == 1) {
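/* 0x3e22f983 is 1/(2*PI): v_sin_f32/v_cos_f32 take the angle in revolutions
 * rather than radians. */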
1819 Temp half_pi = bld.copy(bld.def(s1), Operand(0x3e22f983u));
1820 Temp tmp = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), half_pi, as_vgpr(ctx, src));
1821
1822 /* before GFX9, v_sin_f32 and v_cos_f32 had a valid input domain of [-256, +256] */
1823 if (ctx->options->chip_class < GFX9)
1824 tmp = bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), tmp);
1825
1826 aco_opcode opcode = instr->op == nir_op_fsin ? aco_opcode::v_sin_f32 : aco_opcode::v_cos_f32;
1827 bld.vop1(opcode, Definition(dst), tmp);
1828 } else {
1829 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1830 nir_print_instr(&instr->instr, stderr);
1831 fprintf(stderr, "\n");
1832 }
1833 break;
1834 }
1835 case nir_op_ldexp: {
1836 if (dst.size() == 1) {
1837 bld.vop3(aco_opcode::v_ldexp_f32, Definition(dst),
1838 as_vgpr(ctx, get_alu_src(ctx, instr->src[0])),
1839 get_alu_src(ctx, instr->src[1]));
1840 } else if (dst.size() == 2) {
1841 bld.vop3(aco_opcode::v_ldexp_f64, Definition(dst),
1842 as_vgpr(ctx, get_alu_src(ctx, instr->src[0])),
1843 get_alu_src(ctx, instr->src[1]));
1844 } else {
1845 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1846 nir_print_instr(&instr->instr, stderr);
1847 fprintf(stderr, "\n");
1848 }
1849 break;
1850 }
1851 case nir_op_frexp_sig: {
1852 if (dst.size() == 1) {
1853 bld.vop1(aco_opcode::v_frexp_mant_f32, Definition(dst),
1854 get_alu_src(ctx, instr->src[0]));
1855 } else if (dst.size() == 2) {
1856 bld.vop1(aco_opcode::v_frexp_mant_f64, Definition(dst),
1857 get_alu_src(ctx, instr->src[0]));
1858 } else {
1859 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1860 nir_print_instr(&instr->instr, stderr);
1861 fprintf(stderr, "\n");
1862 }
1863 break;
1864 }
1865 case nir_op_frexp_exp: {
1866 if (instr->src[0].src.ssa->bit_size == 32) {
1867 bld.vop1(aco_opcode::v_frexp_exp_i32_f32, Definition(dst),
1868 get_alu_src(ctx, instr->src[0]));
1869 } else if (instr->src[0].src.ssa->bit_size == 64) {
1870 bld.vop1(aco_opcode::v_frexp_exp_i32_f64, Definition(dst),
1871 get_alu_src(ctx, instr->src[0]));
1872 } else {
1873 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1874 nir_print_instr(&instr->instr, stderr);
1875 fprintf(stderr, "\n");
1876 }
1877 break;
1878 }
1879 case nir_op_fsign: {
1880 Temp src = as_vgpr(ctx, get_alu_src(ctx, instr->src[0]));
1881 if (dst.size() == 1) {
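/* Two cndmasks: first replace strictly positive values with 1.0, then replace
 * values below zero with -1.0; signed zeros fall through unchanged. */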
1882 Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f32, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
1883 src = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0x3f800000u), src, cond);
1884 cond = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
1885 bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0xbf800000u), src, cond);
1886 } else if (dst.size() == 2) {
1887 Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f64, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
1888 Temp tmp = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0x3FF00000u));
1889 Temp upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp, emit_extract_vector(ctx, src, 1, v1), cond);
1890
1891 cond = bld.vopc(aco_opcode::v_cmp_le_f64, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
1892 tmp = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0xBFF00000u));
1893 upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), tmp, upper, cond);
1894
1895 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand(0u), upper);
1896 } else {
1897 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1898 nir_print_instr(&instr->instr, stderr);
1899 fprintf(stderr, "\n");
1900 }
1901 break;
1902 }
1903 case nir_op_f2f32: {
1904 if (instr->src[0].src.ssa->bit_size == 64) {
1905 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f64, dst);
1906 } else {
1907 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1908 nir_print_instr(&instr->instr, stderr);
1909 fprintf(stderr, "\n");
1910 }
1911 break;
1912 }
1913 case nir_op_f2f64: {
1914 if (instr->src[0].src.ssa->bit_size == 32) {
1915 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f64_f32, dst);
1916 } else {
1917 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1918 nir_print_instr(&instr->instr, stderr);
1919 fprintf(stderr, "\n");
1920 }
1921 break;
1922 }
1923 case nir_op_i2f32: {
1924 assert(dst.size() == 1);
1925 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_i32, dst);
1926 break;
1927 }
1928 case nir_op_i2f64: {
1929 if (instr->src[0].src.ssa->bit_size == 32) {
1930 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f64_i32, dst);
1931 } else if (instr->src[0].src.ssa->bit_size == 64) {
1932 Temp src = get_alu_src(ctx, instr->src[0]);
1933 RegClass rc = RegClass(src.type(), 1);
1934 Temp lower = bld.tmp(rc), upper = bld.tmp(rc);
1935 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
1936 lower = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), lower);
1937 upper = bld.vop1(aco_opcode::v_cvt_f64_i32, bld.def(v2), upper);
1938 upper = bld.vop3(aco_opcode::v_ldexp_f64, bld.def(v2), upper, Operand(32u));
1939 bld.vop3(aco_opcode::v_add_f64, Definition(dst), lower, upper);
1940
1941 } else {
1942 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1943 nir_print_instr(&instr->instr, stderr);
1944 fprintf(stderr, "\n");
1945 }
1946 break;
1947 }
1948 case nir_op_u2f32: {
1949 assert(dst.size() == 1);
1950 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_u32, dst);
1951 break;
1952 }
1953 case nir_op_u2f64: {
1954 if (instr->src[0].src.ssa->bit_size == 32) {
1955 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f64_u32, dst);
1956 } else if (instr->src[0].src.ssa->bit_size == 64) {
1957 Temp src = get_alu_src(ctx, instr->src[0]);
1958 RegClass rc = RegClass(src.type(), 1);
1959 Temp lower = bld.tmp(rc), upper = bld.tmp(rc);
1960 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
1961 lower = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), lower);
1962 upper = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), upper);
1963 upper = bld.vop3(aco_opcode::v_ldexp_f64, bld.def(v2), upper, Operand(32u));
1964 bld.vop3(aco_opcode::v_add_f64, Definition(dst), lower, upper);
1965 } else {
1966 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1967 nir_print_instr(&instr->instr, stderr);
1968 fprintf(stderr, "\n");
1969 }
1970 break;
1971 }
1972 case nir_op_f2i32: {
1973 Temp src = get_alu_src(ctx, instr->src[0]);
1974 if (instr->src[0].src.ssa->bit_size == 32) {
1975 if (dst.type() == RegType::vgpr)
1976 bld.vop1(aco_opcode::v_cvt_i32_f32, Definition(dst), src);
1977 else
1978 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
1979 bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), src));
1980
1981 } else if (instr->src[0].src.ssa->bit_size == 64) {
1982 if (dst.type() == RegType::vgpr)
1983 bld.vop1(aco_opcode::v_cvt_i32_f64, Definition(dst), src);
1984 else
1985 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
1986 bld.vop1(aco_opcode::v_cvt_i32_f64, bld.def(v1), src));
1987
1988 } else {
1989 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1990 nir_print_instr(&instr->instr, stderr);
1991 fprintf(stderr, "\n");
1992 }
1993 break;
1994 }
1995 case nir_op_f2u32: {
1996 Temp src = get_alu_src(ctx, instr->src[0]);
1997 if (instr->src[0].src.ssa->bit_size == 32) {
1998 if (dst.type() == RegType::vgpr)
1999 bld.vop1(aco_opcode::v_cvt_u32_f32, Definition(dst), src);
2000 else
2001 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
2002 bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), src));
2003
2004 } else if (instr->src[0].src.ssa->bit_size == 64) {
2005 if (dst.type() == RegType::vgpr)
2006 bld.vop1(aco_opcode::v_cvt_u32_f64, Definition(dst), src);
2007 else
2008 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
2009 bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), src));
2010
2011 } else {
2012 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2013 nir_print_instr(&instr->instr, stderr);
2014 fprintf(stderr, "\n");
2015 }
2016 break;
2017 }
2018 case nir_op_f2i64: {
2019 Temp src = get_alu_src(ctx, instr->src[0]);
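/* There is no f32/f64 -> 64-bit integer conversion instruction, so the result is
 * built manually: for f32 sources, shift the (implicit-1) mantissa by an amount
 * derived from the exponent, saturate when the exponent is out of range and apply
 * the sign at the end; f64 sources are split into two 32-bit conversions below. */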
2020 if (instr->src[0].src.ssa->bit_size == 32 && dst.type() == RegType::vgpr) {
2021 Temp exponent = bld.vop1(aco_opcode::v_frexp_exp_i32_f32, bld.def(v1), src);
2022 exponent = bld.vop3(aco_opcode::v_med3_i32, bld.def(v1), Operand(0x0u), exponent, Operand(64u));
2023 Temp mantissa = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7fffffu), src);
2024 Temp sign = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), src);
2025 mantissa = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand(0x800000u), mantissa);
2026 mantissa = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(7u), mantissa);
2027 mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(0u), mantissa);
2028 Temp new_exponent = bld.tmp(v1);
2029 Temp borrow = bld.vsub32(Definition(new_exponent), Operand(63u), exponent, true).def(1).getTemp();
2030 if (ctx->program->chip_class >= GFX8)
2031 mantissa = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), new_exponent, mantissa);
2032 else
2033 mantissa = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), mantissa, new_exponent);
2034 Temp saturate = bld.vop1(aco_opcode::v_bfrev_b32, bld.def(v1), Operand(0xfffffffeu));
2035 Temp lower = bld.tmp(v1), upper = bld.tmp(v1);
2036 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
2037 lower = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), lower, Operand(0xffffffffu), borrow);
2038 upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), upper, saturate, borrow);
2039 lower = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), sign, lower);
2040 upper = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), sign, upper);
2041 Temp new_lower = bld.tmp(v1);
2042 borrow = bld.vsub32(Definition(new_lower), lower, sign, true).def(1).getTemp();
2043 Temp new_upper = bld.vsub32(bld.def(v1), upper, sign, false, borrow);
2044 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), new_lower, new_upper);
2045
2046 } else if (instr->src[0].src.ssa->bit_size == 32 && dst.type() == RegType::sgpr) {
2047 if (src.type() == RegType::vgpr)
2048 src = bld.as_uniform(src);
2049 Temp exponent = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), src, Operand(0x80017u));
2050 exponent = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), exponent, Operand(126u));
2051 exponent = bld.sop2(aco_opcode::s_max_u32, bld.def(s1), bld.def(s1, scc), Operand(0u), exponent);
2052 exponent = bld.sop2(aco_opcode::s_min_u32, bld.def(s1), bld.def(s1, scc), Operand(64u), exponent);
2053 Temp mantissa = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), Operand(0x7fffffu), src);
2054 Temp sign = bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), src, Operand(31u));
2055 mantissa = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), Operand(0x800000u), mantissa);
2056 mantissa = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), mantissa, Operand(7u));
2057 mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), mantissa);
2058 exponent = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), Operand(63u), exponent);
2059 mantissa = bld.sop2(aco_opcode::s_lshr_b64, bld.def(s2), bld.def(s1, scc), mantissa, exponent);
2060 Temp cond = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), exponent, Operand(0xffffffffu)); // exp >= 64
2061 Temp saturate = bld.sop1(aco_opcode::s_brev_b64, bld.def(s2), Operand(0xfffffffeu));
2062 mantissa = bld.sop2(aco_opcode::s_cselect_b64, bld.def(s2), saturate, mantissa, cond);
2063 Temp lower = bld.tmp(s1), upper = bld.tmp(s1);
2064 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
2065 lower = bld.sop2(aco_opcode::s_xor_b32, bld.def(s1), bld.def(s1, scc), sign, lower);
2066 upper = bld.sop2(aco_opcode::s_xor_b32, bld.def(s1), bld.def(s1, scc), sign, upper);
2067 Temp borrow = bld.tmp(s1);
2068 lower = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), lower, sign);
2069 upper = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), upper, sign, borrow);
2070 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
2071
2072 } else if (instr->src[0].src.ssa->bit_size == 64) {
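/* Split the truncated value into two 32-bit halves: upper = floor(trunc(src) * 2^-32)
 * (0x3df00000 is the high dword of 2^-32) and lower = fma(upper, -2^32, trunc(src))
 * (0xc1f00000 is the high dword of -2^32). */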
2073 Temp vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0x3df00000u));
2074 Temp trunc = emit_trunc_f64(ctx, bld, bld.def(v2), src);
2075 Temp mul = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), trunc, vec);
2076 vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0xc1f00000u));
2077 Temp floor = emit_floor_f64(ctx, bld, bld.def(v2), mul);
2078 Temp fma = bld.vop3(aco_opcode::v_fma_f64, bld.def(v2), floor, vec, trunc);
2079 Temp lower = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), fma);
2080 Temp upper = bld.vop1(aco_opcode::v_cvt_i32_f64, bld.def(v1), floor);
2081 if (dst.type() == RegType::sgpr) {
2082 lower = bld.as_uniform(lower);
2083 upper = bld.as_uniform(upper);
2084 }
2085 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
2086
2087 } else {
2088 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2089 nir_print_instr(&instr->instr, stderr);
2090 fprintf(stderr, "\n");
2091 }
2092 break;
2093 }
2094 case nir_op_f2u64: {
2095 Temp src = get_alu_src(ctx, instr->src[0]);
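/* Unsigned variant of the f2i64 lowering above: no sign handling, small exponents
 * take a plain 32-bit shift path, and out-of-range exponents saturate to all ones. */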
2096 if (instr->src[0].src.ssa->bit_size == 32 && dst.type() == RegType::vgpr) {
2097 Temp exponent = bld.vop1(aco_opcode::v_frexp_exp_i32_f32, bld.def(v1), src);
2098 Temp exponent_in_range = bld.vopc(aco_opcode::v_cmp_ge_i32, bld.hint_vcc(bld.def(bld.lm)), Operand(64u), exponent);
2099 exponent = bld.vop2(aco_opcode::v_max_i32, bld.def(v1), Operand(0x0u), exponent);
2100 Temp mantissa = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7fffffu), src);
2101 mantissa = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand(0x800000u), mantissa);
2102 Temp exponent_small = bld.vsub32(bld.def(v1), Operand(24u), exponent);
2103 Temp small = bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), exponent_small, mantissa);
2104 mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(0u), mantissa);
2105 Temp new_exponent = bld.tmp(v1);
2106 Temp cond_small = bld.vsub32(Definition(new_exponent), exponent, Operand(24u), true).def(1).getTemp();
2107 if (ctx->program->chip_class >= GFX8)
2108 mantissa = bld.vop3(aco_opcode::v_lshlrev_b64, bld.def(v2), new_exponent, mantissa);
2109 else
2110 mantissa = bld.vop3(aco_opcode::v_lshl_b64, bld.def(v2), mantissa, new_exponent);
2111 Temp lower = bld.tmp(v1), upper = bld.tmp(v1);
2112 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
2113 lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), lower, small, cond_small);
2114 upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), upper, Operand(0u), cond_small);
2115 lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0xffffffffu), lower, exponent_in_range);
2116 upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0xffffffffu), upper, exponent_in_range);
2117 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
2118
2119 } else if (instr->src[0].src.ssa->bit_size == 32 && dst.type() == RegType::sgpr) {
2120 if (src.type() == RegType::vgpr)
2121 src = bld.as_uniform(src);
2122 Temp exponent = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), src, Operand(0x80017u));
2123 exponent = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), exponent, Operand(126u));
2124 exponent = bld.sop2(aco_opcode::s_max_u32, bld.def(s1), bld.def(s1, scc), Operand(0u), exponent);
2125 Temp mantissa = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), Operand(0x7fffffu), src);
2126 mantissa = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), Operand(0x800000u), mantissa);
2127 Temp exponent_small = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), Operand(24u), exponent);
2128 Temp small = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), mantissa, exponent_small);
2129 mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), mantissa);
2130 Temp exponent_large = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), exponent, Operand(24u));
2131 mantissa = bld.sop2(aco_opcode::s_lshl_b64, bld.def(s2), bld.def(s1, scc), mantissa, exponent_large);
2132 Temp cond = bld.sopc(aco_opcode::s_cmp_ge_i32, bld.def(s1, scc), Operand(64u), exponent);
2133 mantissa = bld.sop2(aco_opcode::s_cselect_b64, bld.def(s2), mantissa, Operand(0xffffffffu), cond);
2134 Temp lower = bld.tmp(s1), upper = bld.tmp(s1);
2135 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
2136 Temp cond_small = bld.sopc(aco_opcode::s_cmp_le_i32, bld.def(s1, scc), exponent, Operand(24u));
2137 lower = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), small, lower, cond_small);
2138 upper = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), Operand(0u), upper, cond_small);
2139 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
2140
2141 } else if (instr->src[0].src.ssa->bit_size == 64) {
2142 Temp vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0x3df00000u));
2143 Temp trunc = emit_trunc_f64(ctx, bld, bld.def(v2), src);
2144 Temp mul = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), trunc, vec);
2145 vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0xc1f00000u));
2146 Temp floor = emit_floor_f64(ctx, bld, bld.def(v2), mul);
2147 Temp fma = bld.vop3(aco_opcode::v_fma_f64, bld.def(v2), floor, vec, trunc);
2148 Temp lower = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), fma);
2149 Temp upper = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), floor);
2150 if (dst.type() == RegType::sgpr) {
2151 lower = bld.as_uniform(lower);
2152 upper = bld.as_uniform(upper);
2153 }
2154 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
2155
2156 } else {
2157 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2158 nir_print_instr(&instr->instr, stderr);
2159 fprintf(stderr, "\n");
2160 }
2161 break;
2162 }
2163 case nir_op_b2f32: {
2164 Temp src = get_alu_src(ctx, instr->src[0]);
2165 assert(src.regClass() == bld.lm);
2166
2167 if (dst.regClass() == s1) {
2168 src = bool_to_scalar_condition(ctx, src);
2169 bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand(0x3f800000u), src);
2170 } else if (dst.regClass() == v1) {
2171 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand(0x3f800000u), src);
2172 } else {
2173 unreachable("Wrong destination register class for nir_op_b2f32.");
2174 }
2175 break;
2176 }
2177 case nir_op_b2f64: {
2178 Temp src = get_alu_src(ctx, instr->src[0]);
2179 assert(src.regClass() == bld.lm);
2180
2181 if (dst.regClass() == s2) {
2182 src = bool_to_scalar_condition(ctx, src);
2183 bld.sop2(aco_opcode::s_cselect_b64, Definition(dst), Operand(0x3f800000u), Operand(0u), bld.scc(src));
2184 } else if (dst.regClass() == v2) {
2185 Temp one = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0x3FF00000u));
2186 Temp upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), one, src);
2187 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand(0u), upper);
2188 } else {
2189 unreachable("Wrong destination register class for nir_op_b2f64.");
2190 }
2191 break;
2192 }
2193 case nir_op_i2i32: {
2194 Temp src = get_alu_src(ctx, instr->src[0]);
2195 if (instr->src[0].src.ssa->bit_size == 64) {
2196 /* we can just emit dst = src here, since dst maps to the lower 32 bits of src */
2197 emit_extract_vector(ctx, src, 0, dst);
2198 } else {
2199 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2200 nir_print_instr(&instr->instr, stderr);
2201 fprintf(stderr, "\n");
2202 }
2203 break;
2204 }
2205 case nir_op_u2u32: {
2206 Temp src = get_alu_src(ctx, instr->src[0]);
2207 if (instr->src[0].src.ssa->bit_size == 16) {
2208 if (dst.regClass() == s1) {
2209 bld.sop2(aco_opcode::s_and_b32, Definition(dst), bld.def(s1, scc), Operand(0xFFFFu), src);
2210 } else {
2211 // TODO: do better with SDWA
2212 bld.vop2(aco_opcode::v_and_b32, Definition(dst), Operand(0xFFFFu), src);
2213 }
2214 } else if (instr->src[0].src.ssa->bit_size == 64) {
2215 /* we can just emit dst = src here, since dst maps to the lower 32 bits of src */
2216 emit_extract_vector(ctx, src, 0, dst);
2217 } else {
2218 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2219 nir_print_instr(&instr->instr, stderr);
2220 fprintf(stderr, "\n");
2221 }
2222 break;
2223 }
2224 case nir_op_i2i64: {
2225 Temp src = get_alu_src(ctx, instr->src[0]);
2226 if (src.regClass() == s1) {
2227 Temp high = bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), src, Operand(31u));
2228 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src, high);
2229 } else if (src.regClass() == v1) {
2230 Temp high = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), src);
2231 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src, high);
2232 } else {
2233 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2234 nir_print_instr(&instr->instr, stderr);
2235 fprintf(stderr, "\n");
2236 }
2237 break;
2238 }
2239 case nir_op_u2u64: {
2240 Temp src = get_alu_src(ctx, instr->src[0]);
2241 if (instr->src[0].src.ssa->bit_size == 32) {
2242 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src, Operand(0u));
2243 } else {
2244 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2245 nir_print_instr(&instr->instr, stderr);
2246 fprintf(stderr, "\n");
2247 }
2248 break;
2249 }
2250 case nir_op_b2i32: {
2251 Temp src = get_alu_src(ctx, instr->src[0]);
2252 assert(src.regClass() == bld.lm);
2253
2254 if (dst.regClass() == s1) {
2255 // TODO: in a post-RA optimization, we can check if src is in VCC, and directly use VCCNZ
2256 bool_to_scalar_condition(ctx, src, dst);
2257 } else if (dst.regClass() == v1) {
2258 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand(1u), src);
2259 } else {
2260 unreachable("Invalid register class for b2i32");
2261 }
2262 break;
2263 }
2264 case nir_op_i2b1: {
2265 Temp src = get_alu_src(ctx, instr->src[0]);
2266 assert(dst.regClass() == bld.lm);
2267
2268 if (src.type() == RegType::vgpr) {
2269 assert(src.regClass() == v1 || src.regClass() == v2);
2270 assert(dst.regClass() == bld.lm);
2271 bld.vopc(src.size() == 2 ? aco_opcode::v_cmp_lg_u64 : aco_opcode::v_cmp_lg_u32,
2272 Definition(dst), Operand(0u), src).def(0).setHint(vcc);
2273 } else {
2274 assert(src.regClass() == s1 || src.regClass() == s2);
2275 Temp tmp;
2276 if (src.regClass() == s2 && ctx->program->chip_class <= GFX7) {
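/* s_cmp_lg_u64 only exists on GFX8+; on older chips OR the two halves with zero
 * and use the resulting SCC instead. */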
2277 tmp = bld.sop2(aco_opcode::s_or_b64, bld.def(s2), bld.def(s1, scc), Operand(0u), src).def(1).getTemp();
2278 } else {
2279 tmp = bld.sopc(src.size() == 2 ? aco_opcode::s_cmp_lg_u64 : aco_opcode::s_cmp_lg_u32,
2280 bld.scc(bld.def(s1)), Operand(0u), src);
2281 }
2282 bool_to_vector_condition(ctx, tmp, dst);
2283 }
2284 break;
2285 }
2286 case nir_op_pack_64_2x32_split: {
2287 Temp src0 = get_alu_src(ctx, instr->src[0]);
2288 Temp src1 = get_alu_src(ctx, instr->src[1]);
2289
2290 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src0, src1);
2291 break;
2292 }
2293 case nir_op_unpack_64_2x32_split_x:
2294 bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(dst.regClass()), get_alu_src(ctx, instr->src[0]));
2295 break;
2296 case nir_op_unpack_64_2x32_split_y:
2297 bld.pseudo(aco_opcode::p_split_vector, bld.def(dst.regClass()), Definition(dst), get_alu_src(ctx, instr->src[0]));
2298 break;
2299 case nir_op_pack_half_2x16: {
2300 Temp src = get_alu_src(ctx, instr->src[0], 2);
2301
2302 if (dst.regClass() == v1) {
2303 Temp src0 = bld.tmp(v1);
2304 Temp src1 = bld.tmp(v1);
2305 bld.pseudo(aco_opcode::p_split_vector, Definition(src0), Definition(src1), src);
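/* v_cvt_pkrtz_f16_f32 always rounds toward zero; if the shader asks for a different
 * f32 rounding mode, convert each component separately using the current mode and
 * pack the two 16-bit results with v_cvt_pk_u16_u32. */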
2306 if (!ctx->block->fp_mode.care_about_round32 || ctx->block->fp_mode.round32 == fp_round_tz)
2307 bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32, Definition(dst), src0, src1);
2308 else
2309 bld.vop3(aco_opcode::v_cvt_pk_u16_u32, Definition(dst),
2310 bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v1), src0),
2311 bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v1), src1));
2312 } else {
2313 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2314 nir_print_instr(&instr->instr, stderr);
2315 fprintf(stderr, "\n");
2316 }
2317 break;
2318 }
2319 case nir_op_unpack_half_2x16_split_x: {
2320 if (dst.regClass() == v1) {
2321 Builder bld(ctx->program, ctx->block);
2322 bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst), get_alu_src(ctx, instr->src[0]));
2323 } else {
2324 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2325 nir_print_instr(&instr->instr, stderr);
2326 fprintf(stderr, "\n");
2327 }
2328 break;
2329 }
2330 case nir_op_unpack_half_2x16_split_y: {
2331 if (dst.regClass() == v1) {
2332 Builder bld(ctx->program, ctx->block);
2333 /* TODO: use SDWA here */
2334 bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst),
2335 bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), Operand(16u), as_vgpr(ctx, get_alu_src(ctx, instr->src[0]))));
2336 } else {
2337 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2338 nir_print_instr(&instr->instr, stderr);
2339 fprintf(stderr, "\n");
2340 }
2341 break;
2342 }
2343 case nir_op_fquantize2f16: {
2344 Temp src = get_alu_src(ctx, instr->src[0]);
2345 Temp f16 = bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v1), src);
2346 Temp f32, cmp_res;
2347
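/* Quantize by converting to f16 and back to f32; values that would be f16 denormals
 * are flushed to zero (keeping the sign of the source when signed zeros must be
 * preserved). */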
2348 if (ctx->program->chip_class >= GFX8) {
2349 Temp mask = bld.copy(bld.def(s1), Operand(0x36Fu)); /* all v_cmp_class classes except negative/positive denormal */
2350 cmp_res = bld.vopc_e64(aco_opcode::v_cmp_class_f16, bld.hint_vcc(bld.def(bld.lm)), f16, mask);
2351 f32 = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), f16);
2352 } else {
2353 /* 0x38800000 is the smallest normalized half-float value (2^-14) as a 32-bit float,
2354 * so compare the result against it and flush to zero if its magnitude is smaller.
2355 */
2356 f32 = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), f16);
2357 Temp smallest = bld.copy(bld.def(s1), Operand(0x38800000u));
2358 Instruction* vop3 = bld.vopc_e64(aco_opcode::v_cmp_nlt_f32, bld.hint_vcc(bld.def(bld.lm)), f32, smallest);
2359 static_cast<VOP3A_instruction*>(vop3)->abs[0] = true;
2360 cmp_res = vop3->definitions[0].getTemp();
2361 }
2362
2363 if (ctx->block->fp_mode.preserve_signed_zero_inf_nan32 || ctx->program->chip_class < GFX8) {
2364 Temp copysign_0 = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0u), as_vgpr(ctx, src));
2365 bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), copysign_0, f32, cmp_res);
2366 } else {
2367 bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), f32, cmp_res);
2368 }
2369 break;
2370 }
2371 case nir_op_bfm: {
2372 Temp bits = get_alu_src(ctx, instr->src[0]);
2373 Temp offset = get_alu_src(ctx, instr->src[1]);
2374
2375 if (dst.regClass() == s1) {
2376 bld.sop2(aco_opcode::s_bfm_b32, Definition(dst), bits, offset);
2377 } else if (dst.regClass() == v1) {
2378 bld.vop3(aco_opcode::v_bfm_b32, Definition(dst), bits, offset);
2379 } else {
2380 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2381 nir_print_instr(&instr->instr, stderr);
2382 fprintf(stderr, "\n");
2383 }
2384 break;
2385 }
2386 case nir_op_bitfield_select: {
2387 /* (mask & insert) | (~mask & base) */
2388 Temp bitmask = get_alu_src(ctx, instr->src[0]);
2389 Temp insert = get_alu_src(ctx, instr->src[1]);
2390 Temp base = get_alu_src(ctx, instr->src[2]);
2391
2392 /* dst = (insert & bitmask) | (base & ~bitmask) */
2393 if (dst.regClass() == s1) {
2394 aco_ptr<Instruction> sop2;
2395 nir_const_value* const_bitmask = nir_src_as_const_value(instr->src[0].src);
2396 nir_const_value* const_insert = nir_src_as_const_value(instr->src[1].src);
2397 Operand lhs;
2398 if (const_insert && const_bitmask) {
2399 lhs = Operand(const_insert->u32 & const_bitmask->u32);
2400 } else {
2401 insert = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), insert, bitmask);
2402 lhs = Operand(insert);
2403 }
2404
2405 Operand rhs;
2406 nir_const_value* const_base = nir_src_as_const_value(instr->src[2].src);
2407 if (const_base && const_bitmask) {
2408 rhs = Operand(const_base->u32 & ~const_bitmask->u32);
2409 } else {
2410 base = bld.sop2(aco_opcode::s_andn2_b32, bld.def(s1), bld.def(s1, scc), base, bitmask);
2411 rhs = Operand(base);
2412 }
2413
2414 bld.sop2(aco_opcode::s_or_b32, Definition(dst), bld.def(s1, scc), rhs, lhs);
2415
2416 } else if (dst.regClass() == v1) {
2417 if (base.type() == RegType::sgpr && (bitmask.type() == RegType::sgpr || (insert.type() == RegType::sgpr)))
2418 base = as_vgpr(ctx, base);
2419 if (insert.type() == RegType::sgpr && bitmask.type() == RegType::sgpr)
2420 insert = as_vgpr(ctx, insert);
2421
2422 bld.vop3(aco_opcode::v_bfi_b32, Definition(dst), bitmask, insert, base);
2423
2424 } else {
2425 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2426 nir_print_instr(&instr->instr, stderr);
2427 fprintf(stderr, "\n");
2428 }
2429 break;
2430 }
2431 case nir_op_ubfe:
2432 case nir_op_ibfe: {
2433 Temp base = get_alu_src(ctx, instr->src[0]);
2434 Temp offset = get_alu_src(ctx, instr->src[1]);
2435 Temp bits = get_alu_src(ctx, instr->src[2]);
2436
2437 if (dst.type() == RegType::sgpr) {
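/* s_bfe_* takes the extraction parameters packed into one operand: the bit offset
 * in the low bits and the field width shifted left by 16. */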
2438 Operand extract;
2439 nir_const_value* const_offset = nir_src_as_const_value(instr->src[1].src);
2440 nir_const_value* const_bits = nir_src_as_const_value(instr->src[2].src);
2441 if (const_offset && const_bits) {
2442 uint32_t const_extract = (const_bits->u32 << 16) | const_offset->u32;
2443 extract = Operand(const_extract);
2444 } else {
2445 Operand width;
2446 if (const_bits) {
2447 width = Operand(const_bits->u32 << 16);
2448 } else {
2449 width = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), bits, Operand(16u));
2450 }
2451 extract = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), offset, width);
2452 }
2453
2454 aco_opcode opcode;
2455 if (dst.regClass() == s1) {
2456 if (instr->op == nir_op_ubfe)
2457 opcode = aco_opcode::s_bfe_u32;
2458 else
2459 opcode = aco_opcode::s_bfe_i32;
2460 } else if (dst.regClass() == s2) {
2461 if (instr->op == nir_op_ubfe)
2462 opcode = aco_opcode::s_bfe_u64;
2463 else
2464 opcode = aco_opcode::s_bfe_i64;
2465 } else {
2466 unreachable("Unsupported BFE bit size");
2467 }
2468
2469 bld.sop2(opcode, Definition(dst), bld.def(s1, scc), base, extract);
2470
2471 } else {
2472 aco_opcode opcode;
2473 if (dst.regClass() == v1) {
2474 if (instr->op == nir_op_ubfe)
2475 opcode = aco_opcode::v_bfe_u32;
2476 else
2477 opcode = aco_opcode::v_bfe_i32;
2478 } else {
2479 unreachable("Unsupported BFE bit size");
2480 }
2481
2482 emit_vop3a_instruction(ctx, instr, opcode, dst);
2483 }
2484 break;
2485 }
2486 case nir_op_bit_count: {
2487 Temp src = get_alu_src(ctx, instr->src[0]);
2488 if (src.regClass() == s1) {
2489 bld.sop1(aco_opcode::s_bcnt1_i32_b32, Definition(dst), bld.def(s1, scc), src);
2490 } else if (src.regClass() == v1) {
2491 bld.vop3(aco_opcode::v_bcnt_u32_b32, Definition(dst), src, Operand(0u));
2492 } else if (src.regClass() == v2) {
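/* v_bcnt_u32_b32 adds its second operand to the popcount, so chain it: the count
 * of the low dword becomes the accumulator for the high dword. */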
2493 bld.vop3(aco_opcode::v_bcnt_u32_b32, Definition(dst),
2494 emit_extract_vector(ctx, src, 1, v1),
2495 bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1),
2496 emit_extract_vector(ctx, src, 0, v1), Operand(0u)));
2497 } else if (src.regClass() == s2) {
2498 bld.sop1(aco_opcode::s_bcnt1_i32_b64, Definition(dst), bld.def(s1, scc), src);
2499 } else {
2500 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2501 nir_print_instr(&instr->instr, stderr);
2502 fprintf(stderr, "\n");
2503 }
2504 break;
2505 }
2506 case nir_op_flt: {
2507 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_f32, aco_opcode::v_cmp_lt_f64);
2508 break;
2509 }
2510 case nir_op_fge: {
2511 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_f32, aco_opcode::v_cmp_ge_f64);
2512 break;
2513 }
2514 case nir_op_feq: {
2515 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_eq_f32, aco_opcode::v_cmp_eq_f64);
2516 break;
2517 }
2518 case nir_op_fne: {
2519 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_neq_f32, aco_opcode::v_cmp_neq_f64);
2520 break;
2521 }
2522 case nir_op_ilt: {
2523 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_i32, aco_opcode::v_cmp_lt_i64, aco_opcode::s_cmp_lt_i32);
2524 break;
2525 }
2526 case nir_op_ige: {
2527 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_i32, aco_opcode::v_cmp_ge_i64, aco_opcode::s_cmp_ge_i32);
2528 break;
2529 }
2530 case nir_op_ieq: {
2531 if (instr->src[0].src.ssa->bit_size == 1)
2532 emit_boolean_logic(ctx, instr, Builder::s_xnor, dst);
2533 else
2534 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_eq_i32, aco_opcode::v_cmp_eq_i64, aco_opcode::s_cmp_eq_i32,
2535 ctx->program->chip_class >= GFX8 ? aco_opcode::s_cmp_eq_u64 : aco_opcode::num_opcodes);
2536 break;
2537 }
2538 case nir_op_ine: {
2539 if (instr->src[0].src.ssa->bit_size == 1)
2540 emit_boolean_logic(ctx, instr, Builder::s_xor, dst);
2541 else
2542 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lg_i32, aco_opcode::v_cmp_lg_i64, aco_opcode::s_cmp_lg_i32,
2543 ctx->program->chip_class >= GFX8 ? aco_opcode::s_cmp_lg_u64 : aco_opcode::num_opcodes);
2544 break;
2545 }
2546 case nir_op_ult: {
2547 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_u32, aco_opcode::v_cmp_lt_u64, aco_opcode::s_cmp_lt_u32);
2548 break;
2549 }
2550 case nir_op_uge: {
2551 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_u32, aco_opcode::v_cmp_ge_u64, aco_opcode::s_cmp_ge_u32);
2552 break;
2553 }
2554 case nir_op_fddx:
2555 case nir_op_fddy:
2556 case nir_op_fddx_fine:
2557 case nir_op_fddy_fine:
2558 case nir_op_fddx_coarse:
2559 case nir_op_fddy_coarse: {
2560 Temp src = get_alu_src(ctx, instr->src[0]);
2561 uint16_t dpp_ctrl1, dpp_ctrl2;
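/* Derivatives operate on 2x2 pixel quads: dpp_ctrl1 selects the base lane(s) of
 * each quad and dpp_ctrl2 the neighbouring lane(s); the result is neighbour - base.
 * The coarse variants broadcast a single difference to the whole quad. GFX8+ can
 * apply the quad permute with DPP, older chips emulate it with ds_swizzle. */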
2562 if (instr->op == nir_op_fddx_fine) {
2563 dpp_ctrl1 = dpp_quad_perm(0, 0, 2, 2);
2564 dpp_ctrl2 = dpp_quad_perm(1, 1, 3, 3);
2565 } else if (instr->op == nir_op_fddy_fine) {
2566 dpp_ctrl1 = dpp_quad_perm(0, 1, 0, 1);
2567 dpp_ctrl2 = dpp_quad_perm(2, 3, 2, 3);
2568 } else {
2569 dpp_ctrl1 = dpp_quad_perm(0, 0, 0, 0);
2570 if (instr->op == nir_op_fddx || instr->op == nir_op_fddx_coarse)
2571 dpp_ctrl2 = dpp_quad_perm(1, 1, 1, 1);
2572 else
2573 dpp_ctrl2 = dpp_quad_perm(2, 2, 2, 2);
2574 }
2575
2576 Temp tmp;
2577 if (ctx->program->chip_class >= GFX8) {
2578 Temp tl = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl1);
2579 tmp = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), src, tl, dpp_ctrl2);
2580 } else {
2581 Temp tl = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl1);
2582 Temp tr = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl2);
2583 tmp = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), tr, tl);
2584 }
2585 emit_wqm(ctx, tmp, dst, true);
2586 break;
2587 }
2588 default:
2589 fprintf(stderr, "Unknown NIR ALU instr: ");
2590 nir_print_instr(&instr->instr, stderr);
2591 fprintf(stderr, "\n");
2592 }
2593 }
2594
2595 void visit_load_const(isel_context *ctx, nir_load_const_instr *instr)
2596 {
2597 Temp dst = get_ssa_temp(ctx, &instr->def);
2598
2599 // TODO: we really want to know the resulting type here, as that would allow 64-bit literals,
2600 // which currently get truncated (losing the LSBs for doubles and the MSBs for integers)
2601 // for now, we only use s_mov_b64 with 64-bit inline constants
2602 assert(instr->def.num_components == 1 && "Vector load_const should be lowered to scalar.");
2603 assert(dst.type() == RegType::sgpr);
2604
2605 Builder bld(ctx->program, ctx->block);
2606
2607 if (instr->def.bit_size == 1) {
2608 assert(dst.regClass() == bld.lm);
2609 int val = instr->value[0].b ? -1 : 0;
2610 Operand op = bld.lm.size() == 1 ? Operand((uint32_t) val) : Operand((uint64_t) val);
2611 bld.sop1(Builder::s_mov, Definition(dst), op);
2612 } else if (dst.size() == 1) {
2613 bld.copy(Definition(dst), Operand(instr->value[0].u32));
2614 } else {
2615 assert(dst.size() != 1);
2616 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
2617 if (instr->def.bit_size == 64)
2618 for (unsigned i = 0; i < dst.size(); i++)
2619 vec->operands[i] = Operand{(uint32_t)(instr->value[0].u64 >> i * 32)};
2620 else {
2621 for (unsigned i = 0; i < dst.size(); i++)
2622 vec->operands[i] = Operand{instr->value[i].u32};
2623 }
2624 vec->definitions[0] = Definition(dst);
2625 ctx->block->instructions.emplace_back(std::move(vec));
2626 }
2627 }
2628
2629 uint32_t widen_mask(uint32_t mask, unsigned multiplier)
2630 {
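/* Expand each bit of a component mask to 'multiplier' consecutive bits,
 * e.g. widen_mask(0b101, 2) == 0b110011. */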
2631 uint32_t new_mask = 0;
2632 for(unsigned i = 0; i < 32 && (1u << i) <= mask; ++i)
2633 if (mask & (1u << i))
2634 new_mask |= ((1u << multiplier) - 1u) << (i * multiplier);
2635 return new_mask;
2636 }
2637
2638 Operand load_lds_size_m0(isel_context *ctx)
2639 {
2640 /* TODO: m0 does not need to be initialized on GFX9+ */
2641 Builder bld(ctx->program, ctx->block);
2642 return bld.m0((Temp)bld.sopk(aco_opcode::s_movk_i32, bld.def(s1, m0), 0xffff));
2643 }
2644
2645 Temp load_lds(isel_context *ctx, unsigned elem_size_bytes, Temp dst,
2646 Temp address, unsigned base_offset, unsigned align)
2647 {
2648 assert(util_is_power_of_two_nonzero(align) && align >= 4);
2649
2650 Builder bld(ctx->program, ctx->block);
2651
2652 Operand m = load_lds_size_m0(ctx);
2653
2654 unsigned num_components = dst.size() * 4u / elem_size_bytes;
2655 unsigned bytes_read = 0;
2656 unsigned result_size = 0;
2657 unsigned total_bytes = num_components * elem_size_bytes;
2658 std::array<Temp, NIR_MAX_VEC_COMPONENTS> result;
2659 bool large_ds_read = ctx->options->chip_class >= GFX7;
2660 bool usable_read2 = ctx->options->chip_class >= GFX7;
2661
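/* Pick the widest DS read that the remaining size and known alignment allow
 * (ds_read_b128/b96/b64/b32, or the read2 variants which take two element-sized
 * offsets), then split and recombine the results into the requested components. */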
2662 while (bytes_read < total_bytes) {
2663 unsigned todo = total_bytes - bytes_read;
2664 bool aligned8 = bytes_read % 8 == 0 && align % 8 == 0;
2665 bool aligned16 = bytes_read % 16 == 0 && align % 16 == 0;
2666
2667 aco_opcode op = aco_opcode::last_opcode;
2668 bool read2 = false;
2669 if (todo >= 16 && aligned16 && large_ds_read) {
2670 op = aco_opcode::ds_read_b128;
2671 todo = 16;
2672 } else if (todo >= 16 && aligned8 && usable_read2) {
2673 op = aco_opcode::ds_read2_b64;
2674 read2 = true;
2675 todo = 16;
2676 } else if (todo >= 12 && aligned16 && large_ds_read) {
2677 op = aco_opcode::ds_read_b96;
2678 todo = 12;
2679 } else if (todo >= 8 && aligned8) {
2680 op = aco_opcode::ds_read_b64;
2681 todo = 8;
2682 } else if (todo >= 8 && usable_read2) {
2683 op = aco_opcode::ds_read2_b32;
2684 read2 = true;
2685 todo = 8;
2686 } else if (todo >= 4) {
2687 op = aco_opcode::ds_read_b32;
2688 todo = 4;
2689 } else {
2690 assert(false);
2691 }
2692 assert(todo % elem_size_bytes == 0);
2693 unsigned num_elements = todo / elem_size_bytes;
2694 unsigned offset = base_offset + bytes_read;
2695 unsigned max_offset = read2 ? 1019 : 65535;
2696
2697 Temp address_offset = address;
2698 if (offset > max_offset) {
2699 address_offset = bld.vadd32(bld.def(v1), Operand(base_offset), address_offset);
2700 offset = bytes_read;
2701 }
2702 assert(offset <= max_offset); /* bytes_read shouldn't be large enough for this to happen */
2703
2704 Temp res;
2705 if (num_components == 1 && dst.type() == RegType::vgpr)
2706 res = dst;
2707 else
2708 res = bld.tmp(RegClass(RegType::vgpr, todo / 4));
2709
2710 if (read2)
2711 res = bld.ds(op, Definition(res), address_offset, m, offset / (todo / 2), (offset / (todo / 2)) + 1);
2712 else
2713 res = bld.ds(op, Definition(res), address_offset, m, offset);
2714
2715 if (num_components == 1) {
2716 assert(todo == total_bytes);
2717 if (dst.type() == RegType::sgpr)
2718 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), res);
2719 return dst;
2720 }
2721
2722 if (dst.type() == RegType::sgpr) {
2723 Temp new_res = bld.tmp(RegType::sgpr, res.size());
2724 expand_vector(ctx, res, new_res, res.size(), (1 << res.size()) - 1);
2725 res = new_res;
2726 }
2727
2728 if (num_elements == 1) {
2729 result[result_size++] = res;
2730 } else {
2731 assert(res != dst && res.size() % num_elements == 0);
2732 aco_ptr<Pseudo_instruction> split{create_instruction<Pseudo_instruction>(aco_opcode::p_split_vector, Format::PSEUDO, 1, num_elements)};
2733 split->operands[0] = Operand(res);
2734 for (unsigned i = 0; i < num_elements; i++)
2735 split->definitions[i] = Definition(result[result_size++] = bld.tmp(res.type(), elem_size_bytes / 4));
2736 ctx->block->instructions.emplace_back(std::move(split));
2737 }
2738
2739 bytes_read += todo;
2740 }
2741
2742 assert(result_size == num_components && result_size > 1);
2743 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, result_size, 1)};
2744 for (unsigned i = 0; i < result_size; i++)
2745 vec->operands[i] = Operand(result[i]);
2746 vec->definitions[0] = Definition(dst);
2747 ctx->block->instructions.emplace_back(std::move(vec));
2748 ctx->allocated_vec.emplace(dst.id(), result);
2749
2750 return dst;
2751 }
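/* Illustrative example: a 16-byte load with elem_size_bytes = 4 and
 * num_components = 4 lowers as follows on GFX7+ (where the large reads and
 * the read2 variants are usable):
 *   align = 16 -> one ds_read_b128
 *   align = 8  -> one ds_read2_b64  (offsets in 8-byte units: off/8, off/8 + 1)
 *   align = 4  -> two ds_read2_b32  (offsets in 4-byte units)
 * On GFX6 the align = 4 case instead becomes four ds_read_b32. */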
2752
2753 Temp extract_subvector(isel_context *ctx, Temp data, unsigned start, unsigned size, RegType type)
2754 {
2755 if (start == 0 && size == data.size())
2756 return type == RegType::vgpr ? as_vgpr(ctx, data) : data;
2757
2758 unsigned size_hint = 1;
2759 auto it = ctx->allocated_vec.find(data.id());
2760 if (it != ctx->allocated_vec.end())
2761 size_hint = it->second[0].size();
2762 if (size % size_hint || start % size_hint)
2763 size_hint = 1;
2764
2765 start /= size_hint;
2766 size /= size_hint;
2767
2768 Temp elems[size];
2769 for (unsigned i = 0; i < size; i++)
2770 elems[i] = emit_extract_vector(ctx, data, start + i, RegClass(type, size_hint));
2771
2772 if (size == 1)
2773 return type == RegType::vgpr ? as_vgpr(ctx, elems[0]) : elems[0];
2774
2775 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, size, 1)};
2776 for (unsigned i = 0; i < size; i++)
2777 vec->operands[i] = Operand(elems[i]);
2778 Temp res = {ctx->program->allocateId(), RegClass(type, size * size_hint)};
2779 vec->definitions[0] = Definition(res);
2780 ctx->block->instructions.emplace_back(std::move(vec));
2781 return res;
2782 }
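/* Illustrative example: if "data" is an 8-dword vector originally built from
 * four 2-dword elements (so allocated_vec yields size_hint = 2), then
 * extract_subvector(ctx, data, 2, 4, RegType::vgpr) re-uses two of those v2
 * chunks (indices 1 and 2 in 2-dword units) and joins them with a single
 * p_create_vector instead of splitting everything down to dwords. */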
2783
2784 void ds_write_helper(isel_context *ctx, Operand m, Temp address, Temp data, unsigned data_start, unsigned total_size, unsigned offset0, unsigned offset1, unsigned align)
2785 {
2786 Builder bld(ctx->program, ctx->block);
2787 unsigned bytes_written = 0;
2788 bool large_ds_write = ctx->options->chip_class >= GFX7;
2789 bool usable_write2 = ctx->options->chip_class >= GFX7;
2790
2791 while (bytes_written < total_size * 4) {
2792 unsigned todo = total_size * 4 - bytes_written;
2793 bool aligned8 = bytes_written % 8 == 0 && align % 8 == 0;
2794 bool aligned16 = bytes_written % 16 == 0 && align % 16 == 0;
2795
2796 aco_opcode op = aco_opcode::last_opcode;
2797 bool write2 = false;
2798 unsigned size = 0;
2799 if (todo >= 16 && aligned16 && large_ds_write) {
2800 op = aco_opcode::ds_write_b128;
2801 size = 4;
2802 } else if (todo >= 16 && aligned8 && usable_write2) {
2803 op = aco_opcode::ds_write2_b64;
2804 write2 = true;
2805 size = 4;
2806 } else if (todo >= 12 && aligned16 && large_ds_write) {
2807 op = aco_opcode::ds_write_b96;
2808 size = 3;
2809 } else if (todo >= 8 && aligned8) {
2810 op = aco_opcode::ds_write_b64;
2811 size = 2;
2812 } else if (todo >= 8 && usable_write2) {
2813 op = aco_opcode::ds_write2_b32;
2814 write2 = true;
2815 size = 2;
2816 } else if (todo >= 4) {
2817 op = aco_opcode::ds_write_b32;
2818 size = 1;
2819 } else {
2820 assert(false);
2821 }
2822
2823 unsigned offset = offset0 + offset1 + bytes_written;
2824 unsigned max_offset = write2 ? 1020 : 65535;
2825 Temp address_offset = address;
2826 if (offset > max_offset) {
2827 address_offset = bld.vadd32(bld.def(v1), Operand(offset0), address_offset);
2828 offset = offset1 + bytes_written;
2829 }
2830 assert(offset <= max_offset); /* offset1 shouldn't be large enough for this to happen */
2831
2832 if (write2) {
2833 Temp val0 = extract_subvector(ctx, data, data_start + (bytes_written >> 2), size / 2, RegType::vgpr);
2834 Temp val1 = extract_subvector(ctx, data, data_start + (bytes_written >> 2) + 1, size / 2, RegType::vgpr);
2835 bld.ds(op, address_offset, val0, val1, m, offset / size / 2, (offset / size / 2) + 1);
2836 } else {
2837 Temp val = extract_subvector(ctx, data, data_start + (bytes_written >> 2), size, RegType::vgpr);
2838 bld.ds(op, address_offset, val, m, offset);
2839 }
2840
2841 bytes_written += size * 4;
2842 }
2843 }
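/* Illustrative example: a 16-byte write with align = 8 on GFX7+ uses one
 * ds_write2_b64 (size = 4 dwords); its two offsets are encoded in 8-byte
 * units, i.e. offset / size / 2 == offset / 8 and offset / 8 + 1. With only
 * 4-byte alignment the same write becomes two ds_write2_b32, and on GFX6
 * (no write2/b96/b128 variants) the 4-byte-aligned case needs four
 * ds_write_b32. */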
2844
2845 void store_lds(isel_context *ctx, unsigned elem_size_bytes, Temp data, uint32_t wrmask,
2846 Temp address, unsigned base_offset, unsigned align)
2847 {
2848 assert(util_is_power_of_two_nonzero(align) && align >= 4);
2849 assert(elem_size_bytes == 4 || elem_size_bytes == 8);
2850
2851 Operand m = load_lds_size_m0(ctx);
2852
2853 /* we need at most two stores, assuming that the writemask is at most 4 bits wide */
2854 assert(wrmask <= 0x0f);
2855 int start[2], count[2];
2856 u_bit_scan_consecutive_range(&wrmask, &start[0], &count[0]);
2857 u_bit_scan_consecutive_range(&wrmask, &start[1], &count[1]);
2858 assert(wrmask == 0);
2859
2860 /* a single combined ds_write2 is sufficient when both ranges consist of one element each */
2861 if (count[0] == count[1] && (align % elem_size_bytes) == 0 && (base_offset % elem_size_bytes) == 0) {
2862 Builder bld(ctx->program, ctx->block);
2863
2864 Temp address_offset = address;
2865 if ((base_offset / elem_size_bytes) + start[1] > 255) {
2866 address_offset = bld.vadd32(bld.def(v1), Operand(base_offset), address_offset);
2867 base_offset = 0;
2868 }
2869
2870 assert(count[0] == 1);
2871 RegClass xtract_rc(RegType::vgpr, elem_size_bytes / 4);
2872
2873 Temp val0 = emit_extract_vector(ctx, data, start[0], xtract_rc);
2874 Temp val1 = emit_extract_vector(ctx, data, start[1], xtract_rc);
2875 aco_opcode op = elem_size_bytes == 4 ? aco_opcode::ds_write2_b32 : aco_opcode::ds_write2_b64;
2876 base_offset = base_offset / elem_size_bytes;
2877 bld.ds(op, address_offset, val0, val1, m,
2878 base_offset + start[0], base_offset + start[1]);
2879 return;
2880 }
2881
2882 for (unsigned i = 0; i < 2; i++) {
2883 if (count[i] == 0)
2884 continue;
2885
2886 unsigned elem_size_words = elem_size_bytes / 4;
2887 ds_write_helper(ctx, m, address, data, start[i] * elem_size_words, count[i] * elem_size_words,
2888 base_offset, start[i] * elem_size_bytes, align);
2889 }
2890 return;
2891 }
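/* Illustrative example: a writemask of 0b0101 (components 0 and 2) with
 * elem_size_bytes = 4 and an aligned base_offset takes the combined path
 * above: a single ds_write2_b32 whose per-slot offsets are
 * base_offset/4 + 0 and base_offset/4 + 2. Those offset fields are only
 * 8 bits wide, hence the "> 255" check that folds a too-large base offset
 * into the address VGPR first. */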
2892
2893 unsigned calculate_lds_alignment(isel_context *ctx, unsigned const_offset)
2894 {
2895 unsigned align = 16;
2896 if (const_offset)
2897 align = std::min(align, 1u << (ffs(const_offset) - 1));
2898
2899 return align;
2900 }
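/* Illustrative examples: const_offset = 0 keeps the maximum alignment of 16,
 * const_offset = 12 gives min(16, 1 << (ffs(12) - 1)) = 4, and any multiple
 * of 16 (e.g. 32) keeps 16. */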
2901
2902
2903 Temp create_vec_from_array(isel_context *ctx, Temp arr[], unsigned cnt, RegType reg_type, unsigned split_cnt = 0u, Temp dst = Temp())
2904 {
2905 Builder bld(ctx->program, ctx->block);
2906
2907 if (!dst.id())
2908 dst = bld.tmp(RegClass(reg_type, cnt * arr[0].size()));
2909
2910 std::array<Temp, NIR_MAX_VEC_COMPONENTS> allocated_vec;
2911 aco_ptr<Pseudo_instruction> instr {create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, cnt, 1)};
2912 instr->definitions[0] = Definition(dst);
2913
2914 for (unsigned i = 0; i < cnt; ++i) {
2915 assert(arr[i].size() == arr[0].size());
2916 allocated_vec[i] = arr[i];
2917 instr->operands[i] = Operand(arr[i]);
2918 }
2919
2920 bld.insert(std::move(instr));
2921
2922 if (split_cnt)
2923 emit_split_vector(ctx, dst, split_cnt);
2924 else
2925 ctx->allocated_vec.emplace(dst.id(), allocated_vec); /* if split_cnt is set, emit_split_vector already does this */
2926
2927 return dst;
2928 }
2929
2930 inline unsigned resolve_excess_vmem_const_offset(Builder &bld, Temp &voffset, unsigned const_offset)
2931 {
2932 if (const_offset >= 4096) {
2933 unsigned excess_const_offset = const_offset / 4096u * 4096u;
2934 const_offset %= 4096u;
2935
2936 if (!voffset.id())
2937 voffset = bld.copy(bld.def(v1), Operand(excess_const_offset));
2938 else if (unlikely(voffset.regClass() == s1))
2939 voffset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), Operand(excess_const_offset), Operand(voffset));
2940 else if (likely(voffset.regClass() == v1))
2941 voffset = bld.vadd32(bld.def(v1), Operand(voffset), Operand(excess_const_offset));
2942 else
2943 unreachable("Unsupported register class of voffset");
2944 }
2945
2946 return const_offset;
2947 }
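/* Illustrative example: the MUBUF immediate offset field is limited to
 * 12 bits, so a const_offset of 5000 is split here: 4096 gets folded into
 * voffset (via copy, s_add_u32 or vadd32 depending on its register class)
 * and the remaining 904 is returned to be used as the immediate offset. */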
2948
2949 void emit_single_mubuf_store(isel_context *ctx, Temp descriptor, Temp voffset, Temp soffset, Temp vdata,
2950 unsigned const_offset = 0u, bool allow_reorder = true, bool slc = false)
2951 {
2952 assert(vdata.id());
2953 assert(vdata.size() != 3 || ctx->program->chip_class != GFX6);
2954 assert(vdata.size() >= 1 && vdata.size() <= 4);
2955
2956 Builder bld(ctx->program, ctx->block);
2957 aco_opcode op = (aco_opcode) ((unsigned) aco_opcode::buffer_store_dword + vdata.size() - 1);
2958 const_offset = resolve_excess_vmem_const_offset(bld, voffset, const_offset);
2959
2960 Operand voffset_op = voffset.id() ? Operand(as_vgpr(ctx, voffset)) : Operand(v1);
2961 Operand soffset_op = soffset.id() ? Operand(soffset) : Operand(0u);
2962 Builder::Result r = bld.mubuf(op, Operand(descriptor), voffset_op, soffset_op, Operand(vdata), const_offset,
2963 /* offen */ !voffset_op.isUndefined(), /* idxen*/ false, /* addr64 */ false,
2964 /* disable_wqm */ false, /* glc */ true, /* dlc*/ false, /* slc */ slc);
2965
2966 static_cast<MUBUF_instruction *>(r.instr)->can_reorder = allow_reorder;
2967 }
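/* Note: the opcode is derived arithmetically from the store size, so
 * vdata.size() == 1..4 maps to buffer_store_dword .. buffer_store_dwordx4.
 * The assertion above rules out the dwordx3 variant on GFX6, which lacks it;
 * store_vmem_mubuf below splits such stores before calling this helper. */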
2968
2969 void store_vmem_mubuf(isel_context *ctx, Temp src, Temp descriptor, Temp voffset, Temp soffset,
2970 unsigned base_const_offset, unsigned elem_size_bytes, unsigned write_mask,
2971 bool allow_combining = true, bool reorder = true, bool slc = false)
2972 {
2973 Builder bld(ctx->program, ctx->block);
2974 assert(elem_size_bytes == 4 || elem_size_bytes == 8);
2975 assert(write_mask);
2976
2977 if (elem_size_bytes == 8) {
2978 elem_size_bytes = 4;
2979 write_mask = widen_mask(write_mask, 2);
2980 }
2981
2982 while (write_mask) {
2983 int start = 0;
2984 int count = 0;
2985 u_bit_scan_consecutive_range(&write_mask, &start, &count);
2986 assert(count > 0);
2987 assert(start >= 0);
2988
2989 while (count > 0) {
2990 unsigned sub_count = allow_combining ? MIN2(count, 4) : 1;
2991 unsigned const_offset = (unsigned) start * elem_size_bytes + base_const_offset;
2992
2993 /* GFX6 doesn't have buffer_store_dwordx3, so make sure not to emit that here either. */
2994 if (unlikely(ctx->program->chip_class == GFX6 && sub_count == 3))
2995 sub_count = 2;
2996
2997 Temp elem = extract_subvector(ctx, src, start, sub_count, RegType::vgpr);
2998 emit_single_mubuf_store(ctx, descriptor, voffset, soffset, elem, const_offset, reorder, slc);
2999
3000 count -= sub_count;
3001 start += sub_count;
3002 }
3003
3004 assert(count == 0);
3005 }
3006 }
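/* Illustrative example: storing a 64-bit vec4 (elem_size_bytes = 8,
 * write_mask = 0xf) is first widened to eight 32-bit components
 * (write_mask = 0xff) and then, with allow_combining, emitted as two
 * buffer_store_dwordx4 at constant offsets base and base + 16. On GFX6 a
 * 3-dword remainder would instead be split into dwordx2 + dword. */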
3007
3008 Temp emit_single_mubuf_load(isel_context *ctx, Temp descriptor, Temp voffset, Temp soffset,
3009 unsigned const_offset, unsigned size_dwords, bool allow_reorder = true)
3010 {
3011 assert(size_dwords != 3 || ctx->program->chip_class != GFX6);
3012 assert(size_dwords >= 1 && size_dwords <= 4);
3013
3014 Builder bld(ctx->program, ctx->block);
3015 Temp vdata = bld.tmp(RegClass(RegType::vgpr, size_dwords));
3016 aco_opcode op = (aco_opcode) ((unsigned) aco_opcode::buffer_load_dword + size_dwords - 1);
3017 const_offset = resolve_excess_vmem_const_offset(bld, voffset, const_offset);
3018
3019 Operand voffset_op = voffset.id() ? Operand(as_vgpr(ctx, voffset)) : Operand(v1);
3020 Operand soffset_op = soffset.id() ? Operand(soffset) : Operand(0u);
3021 Builder::Result r = bld.mubuf(op, Definition(vdata), Operand(descriptor), voffset_op, soffset_op, const_offset,
3022 /* offen */ !voffset_op.isUndefined(), /* idxen*/ false, /* addr64 */ false,
3023 /* disable_wqm */ false, /* glc */ true,
3024 /* dlc*/ ctx->program->chip_class >= GFX10, /* slc */ false);
3025
3026 static_cast<MUBUF_instruction *>(r.instr)->can_reorder = allow_reorder;
3027
3028 return vdata;
3029 }
3030
3031 void load_vmem_mubuf(isel_context *ctx, Temp dst, Temp descriptor, Temp voffset, Temp soffset,
3032 unsigned base_const_offset, unsigned elem_size_bytes, unsigned num_components,
3033 unsigned stride = 0u, bool allow_combining = true, bool allow_reorder = true)
3034 {
3035 assert(elem_size_bytes == 4 || elem_size_bytes == 8);
3036 assert((num_components * elem_size_bytes / 4) == dst.size());
3037 assert(!!stride != allow_combining);
3038
3039 Builder bld(ctx->program, ctx->block);
3040 unsigned split_cnt = num_components;
3041
3042 if (elem_size_bytes == 8) {
3043 elem_size_bytes = 4;
3044 num_components *= 2;
3045 }
3046
3047 if (!stride)
3048 stride = elem_size_bytes;
3049
3050 unsigned load_size = 1;
3051 if (allow_combining) {
3052 if ((num_components % 4) == 0)
3053 load_size = 4;
3054 else if ((num_components % 3) == 0 && ctx->program->chip_class != GFX6)
3055 load_size = 3;
3056 else if ((num_components % 2) == 0)
3057 load_size = 2;
3058 }
3059
3060 unsigned num_loads = num_components / load_size;
3061 std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
3062
3063 for (unsigned i = 0; i < num_loads; ++i) {
3064 unsigned const_offset = i * stride * load_size + base_const_offset;
3065 elems[i] = emit_single_mubuf_load(ctx, descriptor, voffset, soffset, const_offset, load_size, allow_reorder);
3066 }
3067
3068 create_vec_from_array(ctx, elems.data(), num_loads, RegType::vgpr, split_cnt, dst);
3069 }
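/* Illustrative example: num_components = 6 with elem_size_bytes = 4 and
 * allow_combining picks load_size = 3, i.e. two buffer_load_dwordx3 (except
 * on GFX6, which lacks dwordx3 and uses three dwordx2 instead). The constant
 * offset of each load advances by stride * load_size. */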
3070
3071 std::pair<Temp, unsigned> offset_add_from_nir(isel_context *ctx, const std::pair<Temp, unsigned> &base_offset, nir_src *off_src, unsigned stride = 1u)
3072 {
3073 Builder bld(ctx->program, ctx->block);
3074 Temp offset = base_offset.first;
3075 unsigned const_offset = base_offset.second;
3076
3077 if (!nir_src_is_const(*off_src)) {
3078 Temp indirect_offset_arg = get_ssa_temp(ctx, off_src->ssa);
3079 Temp with_stride;
3080
3081 /* Calculate indirect offset with stride */
3082 if (likely(indirect_offset_arg.regClass() == v1))
3083 with_stride = bld.v_mul_imm(bld.def(v1), indirect_offset_arg, stride);
3084 else if (indirect_offset_arg.regClass() == s1)
3085 with_stride = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(stride), indirect_offset_arg);
3086 else
3087 unreachable("Unsupported register class of indirect offset");
3088
3089 /* Add to the supplied base offset */
3090 if (offset.id() == 0)
3091 offset = with_stride;
3092 else if (unlikely(offset.regClass() == s1 && with_stride.regClass() == s1))
3093 offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), with_stride, offset);
3094 else if (offset.size() == 1 && with_stride.size() == 1)
3095 offset = bld.vadd32(bld.def(v1), with_stride, offset);
3096 else
3097 unreachable("Unsupported register class of indirect offset");
3098 } else {
3099 unsigned const_offset_arg = nir_src_as_uint(*off_src);
3100 const_offset += const_offset_arg * stride;
3101 }
3102
3103 return std::make_pair(offset, const_offset);
3104 }
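/* Illustrative example: a constant NIR offset of 2 with stride = 16 simply
 * adds 32 to the constant half of the pair. A divergent (VGPR) offset instead
 * becomes a v_mul_imm by the stride followed by a vadd32 onto the existing
 * base, keeping the (Temp, constant) split intact. */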
3105
3106 std::pair<Temp, unsigned> offset_add(isel_context *ctx, const std::pair<Temp, unsigned> &off1, const std::pair<Temp, unsigned> &off2)
3107 {
3108 Builder bld(ctx->program, ctx->block);
3109 Temp offset;
3110
3111 if (off1.first.id() && off2.first.id()) {
3112 if (unlikely(off1.first.regClass() == s1 && off2.first.regClass() == s1))
3113 offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), off1.first, off2.first);
3114 else if (off1.first.size() == 1 && off2.first.size() == 1)
3115 offset = bld.vadd32(bld.def(v1), off1.first, off2.first);
3116 else
3117 unreachable("Unsupported register class of indirect offset");
3118 } else {
3119 offset = off1.first.id() ? off1.first : off2.first;
3120 }
3121
3122 return std::make_pair(offset, off1.second + off2.second);
3123 }
3124
3125 std::pair<Temp, unsigned> offset_mul(isel_context *ctx, const std::pair<Temp, unsigned> &offs, unsigned multiplier)
3126 {
3127 Builder bld(ctx->program, ctx->block);
3128 unsigned const_offset = offs.second * multiplier;
3129
3130 if (!offs.first.id())
3131 return std::make_pair(offs.first, const_offset);
3132
3133 Temp offset = unlikely(offs.first.regClass() == s1)
3134 ? bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(multiplier), offs.first)
3135 : bld.v_mul_imm(bld.def(v1), offs.first, multiplier);
3136
3137 return std::make_pair(offset, const_offset);
3138 }
3139
3140 std::pair<Temp, unsigned> get_intrinsic_io_basic_offset(isel_context *ctx, nir_intrinsic_instr *instr, unsigned base_stride, unsigned component_stride)
3141 {
3142 Builder bld(ctx->program, ctx->block);
3143
3144 /* base is the driver_location, which is already multiplied by 4, so it is in dwords */
3145 unsigned const_offset = nir_intrinsic_base(instr) * base_stride;
3146 /* component is in bytes */
3147 const_offset += nir_intrinsic_component(instr) * component_stride;
3148
3149 /* offset should be interpreted in relation to the base, so the instruction effectively reads/writes another input/output when it has an offset */
3150 nir_src *off_src = nir_get_io_offset_src(instr);
3151 return offset_add_from_nir(ctx, std::make_pair(Temp(), const_offset), off_src, 4u * base_stride);
3152 }
3153
3154 std::pair<Temp, unsigned> get_intrinsic_io_basic_offset(isel_context *ctx, nir_intrinsic_instr *instr, unsigned stride = 1u)
3155 {
3156 return get_intrinsic_io_basic_offset(ctx, instr, stride, stride);
3157 }
3158
3159 Temp get_tess_rel_patch_id(isel_context *ctx)
3160 {
3161 Builder bld(ctx->program, ctx->block);
3162
3163 switch (ctx->shader->info.stage) {
3164 case MESA_SHADER_TESS_CTRL:
3165 return bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0xffu),
3166 get_arg(ctx, ctx->args->ac.tcs_rel_ids));
3167 case MESA_SHADER_TESS_EVAL:
3168 return get_arg(ctx, ctx->args->tes_rel_patch_id);
3169 default:
3170 unreachable("Unsupported stage in get_tess_rel_patch_id");
3171 }
3172 }
3173
3174 std::pair<Temp, unsigned> get_tcs_per_vertex_input_lds_offset(isel_context *ctx, nir_intrinsic_instr *instr)
3175 {
3176 assert(ctx->shader->info.stage == MESA_SHADER_TESS_CTRL);
3177 Builder bld(ctx->program, ctx->block);
3178
3179 uint32_t tcs_in_patch_stride = ctx->args->options->key.tcs.input_vertices * ctx->tcs_num_inputs * 4;
3180 uint32_t tcs_in_vertex_stride = ctx->tcs_num_inputs * 4;
3181
3182 std::pair<Temp, unsigned> offs = get_intrinsic_io_basic_offset(ctx, instr);
3183
3184 nir_src *vertex_index_src = nir_get_io_vertex_index_src(instr);
3185 offs = offset_add_from_nir(ctx, offs, vertex_index_src, tcs_in_vertex_stride);
3186
3187 Temp rel_patch_id = get_tess_rel_patch_id(ctx);
3188 Temp tcs_in_current_patch_offset = bld.v_mul24_imm(bld.def(v1), rel_patch_id, tcs_in_patch_stride);
3189 offs = offset_add(ctx, offs, std::make_pair(tcs_in_current_patch_offset, 0));
3190
3191 return offset_mul(ctx, offs, 4u);
3192 }
3193
3194 std::pair<Temp, unsigned> get_tcs_output_lds_offset(isel_context *ctx, nir_intrinsic_instr *instr = nullptr, bool per_vertex = false)
3195 {
3196 assert(ctx->shader->info.stage == MESA_SHADER_TESS_CTRL);
3197 Builder bld(ctx->program, ctx->block);
3198
3199 uint32_t input_patch_size = ctx->args->options->key.tcs.input_vertices * ctx->tcs_num_inputs * 16;
3200 uint32_t num_tcs_outputs = util_last_bit64(ctx->args->shader_info->tcs.outputs_written);
3201 uint32_t num_tcs_patch_outputs = util_last_bit64(ctx->args->shader_info->tcs.patch_outputs_written);
3202 uint32_t output_vertex_size = num_tcs_outputs * 16;
3203 uint32_t pervertex_output_patch_size = ctx->shader->info.tess.tcs_vertices_out * output_vertex_size;
3204 uint32_t output_patch_stride = pervertex_output_patch_size + num_tcs_patch_outputs * 16;
3205
3206 std::pair<Temp, unsigned> offs = instr
3207 ? get_intrinsic_io_basic_offset(ctx, instr, 4u)
3208 : std::make_pair(Temp(), 0u);
3209
3210 Temp rel_patch_id = get_tess_rel_patch_id(ctx);
3211 Temp patch_off = bld.v_mul24_imm(bld.def(v1), rel_patch_id, output_patch_stride);
3212
3213 if (per_vertex) {
3214 assert(instr);
3215
3216 nir_src *vertex_index_src = nir_get_io_vertex_index_src(instr);
3217 offs = offset_add_from_nir(ctx, offs, vertex_index_src, output_vertex_size);
3218
3219 uint32_t output_patch0_offset = (input_patch_size * ctx->tcs_num_patches);
3220 offs = offset_add(ctx, offs, std::make_pair(patch_off, output_patch0_offset));
3221 } else {
3222 uint32_t output_patch0_patch_data_offset = (input_patch_size * ctx->tcs_num_patches + pervertex_output_patch_size);
3223 offs = offset_add(ctx, offs, std::make_pair(patch_off, output_patch0_patch_data_offset));
3224 }
3225
3226 return offs;
3227 }
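/* Rough sketch of the LDS layout assumed above:
 *   [ per-vertex TCS inputs of all patches ]              input_patch_size * tcs_num_patches
 *   [ patch 0: per-vertex outputs | per-patch outputs ]   output_patch_stride
 *   [ patch 1: per-vertex outputs | per-patch outputs ]
 *   ...
 * output_patch0_offset points at the first per-vertex output block, while
 * output_patch0_patch_data_offset additionally skips one patch worth of
 * per-vertex outputs to reach the per-patch data. */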
3228
3229 std::pair<Temp, unsigned> get_tcs_per_vertex_output_vmem_offset(isel_context *ctx, nir_intrinsic_instr *instr)
3230 {
3231 Builder bld(ctx->program, ctx->block);
3232
3233 unsigned vertices_per_patch = ctx->shader->info.tess.tcs_vertices_out;
3234 unsigned attr_stride = vertices_per_patch * ctx->tcs_num_patches;
3235
3236 std::pair<Temp, unsigned> offs = get_intrinsic_io_basic_offset(ctx, instr, attr_stride * 4u, 4u);
3237
3238 Temp rel_patch_id = get_tess_rel_patch_id(ctx);
3239 Temp patch_off = bld.v_mul24_imm(bld.def(v1), rel_patch_id, vertices_per_patch * 16u);
3240 offs = offset_add(ctx, offs, std::make_pair(patch_off, 0u));
3241
3242 nir_src *vertex_index_src = nir_get_io_vertex_index_src(instr);
3243 offs = offset_add_from_nir(ctx, offs, vertex_index_src, 16u);
3244
3245 return offs;
3246 }
3247
3248 std::pair<Temp, unsigned> get_tcs_per_patch_output_vmem_offset(isel_context *ctx, nir_intrinsic_instr *instr = nullptr, unsigned const_base_offset = 0u)
3249 {
3250 Builder bld(ctx->program, ctx->block);
3251
3252 unsigned num_tcs_outputs = ctx->shader->info.stage == MESA_SHADER_TESS_CTRL
3253 ? util_last_bit64(ctx->args->shader_info->tcs.outputs_written)
3254 : ctx->args->options->key.tes.tcs_num_outputs;
3255
3256 unsigned output_vertex_size = num_tcs_outputs * 16;
3257 unsigned per_vertex_output_patch_size = ctx->shader->info.tess.tcs_vertices_out * output_vertex_size;
3258 unsigned per_patch_data_offset = per_vertex_output_patch_size * ctx->tcs_num_patches;
3259 unsigned attr_stride = ctx->tcs_num_patches;
3260
3261 std::pair<Temp, unsigned> offs = instr
3262 ? get_intrinsic_io_basic_offset(ctx, instr, attr_stride * 4u, 4u)
3263 : std::make_pair(Temp(), 0u);
3264
3265 if (const_base_offset)
3266 offs.second += const_base_offset * attr_stride;
3267
3268 Temp rel_patch_id = get_tess_rel_patch_id(ctx);
3269 Temp patch_off = bld.v_mul_imm(bld.def(v1), rel_patch_id, 16u);
3270 offs = offset_add(ctx, offs, std::make_pair(patch_off, per_patch_data_offset));
3271
3272 return offs;
3273 }
3274
3275 void visit_store_ls_or_es_output(isel_context *ctx, nir_intrinsic_instr *instr)
3276 {
3277 Builder bld(ctx->program, ctx->block);
3278
3279 std::pair<Temp, unsigned> offs = get_intrinsic_io_basic_offset(ctx, instr, 4u);
3280 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
3281 unsigned write_mask = nir_intrinsic_write_mask(instr);
3282 unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8u;
3283
3284 if (ctx->stage == vertex_es) {
3285 /* GFX6-8: ES stage is not merged into GS, data is passed from ES to GS in VMEM. */
3286 Temp esgs_ring = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ctx->program->private_segment_buffer, Operand(RING_ESGS_VS * 16u));
3287 Temp es2gs_offset = get_arg(ctx, ctx->args->es2gs_offset);
3288 store_vmem_mubuf(ctx, src, esgs_ring, offs.first, es2gs_offset, offs.second, elem_size_bytes, write_mask, false, true, true);
3289 } else {
3290 Temp lds_base;
3291
3292 if (ctx->stage == vertex_geometry_gs) {
3293 /* GFX9+: ES stage is merged into GS, data is passed between them using LDS. */
3294 unsigned itemsize = ctx->program->info->vs.es_info.esgs_itemsize;
3295 Temp thread_id = emit_mbcnt(ctx, bld.def(v1));
3296 Temp wave_idx = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), get_arg(ctx, ctx->args->merged_wave_info), Operand(4u << 16 | 24));
3297 Temp vertex_idx = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), thread_id,
3298 bld.v_mul24_imm(bld.def(v1), as_vgpr(ctx, wave_idx), ctx->program->wave_size));
3299 lds_base = bld.v_mul24_imm(bld.def(v1), vertex_idx, itemsize);
3300 } else if (ctx->stage == vertex_ls || ctx->stage == vertex_tess_control_hs) {
3301 /* GFX6-8: VS runs on the LS stage when tessellation is used, and LS shares its LDS space with HS.
3302 * GFX9+: LS is merged into HS, but still uses the same LDS layout.
3303 */
3304 unsigned num_tcs_inputs = util_last_bit64(ctx->args->shader_info->vs.ls_outputs_written);
3305 Temp vertex_idx = get_arg(ctx, ctx->args->rel_auto_id);
3306 lds_base = bld.v_mul_imm(bld.def(v1), vertex_idx, num_tcs_inputs * 16u);
3307 } else {
3308 unreachable("Invalid LS or ES stage");
3309 }
3310
3311 offs = offset_add(ctx, offs, std::make_pair(lds_base, 0u));
3312 unsigned lds_align = calculate_lds_alignment(ctx, offs.second);
3313 store_lds(ctx, elem_size_bytes, src, write_mask, offs.first, offs.second, lds_align);
3314 }
3315 }
3316
3317 void visit_store_tcs_output(isel_context *ctx, nir_intrinsic_instr *instr, bool per_vertex)
3318 {
3319 assert(ctx->stage == tess_control_hs || ctx->stage == vertex_tess_control_hs);
3320 assert(ctx->shader->info.stage == MESA_SHADER_TESS_CTRL);
3321
3322 Builder bld(ctx->program, ctx->block);
3323
3324 Temp store_val = get_ssa_temp(ctx, instr->src[0].ssa);
3325 unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
3326 unsigned write_mask = nir_intrinsic_write_mask(instr);
3327
3328 /* TODO: Only write to VMEM if the output is per-vertex or it's a per-patch non-tess-factor output */
3329 bool write_to_vmem = true;
3330 /* TODO: Only write to LDS if the output is read by the shader, or it's a per-patch tess factor */
3331 bool write_to_lds = true;
3332
3333 if (write_to_vmem) {
3334 std::pair<Temp, unsigned> vmem_offs = per_vertex
3335 ? get_tcs_per_vertex_output_vmem_offset(ctx, instr)
3336 : get_tcs_per_patch_output_vmem_offset(ctx, instr);
3337
3338 Temp hs_ring_tess_offchip = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ctx->program->private_segment_buffer, Operand(RING_HS_TESS_OFFCHIP * 16u));
3339 Temp oc_lds = get_arg(ctx, ctx->args->oc_lds);
3340 store_vmem_mubuf(ctx, store_val, hs_ring_tess_offchip, vmem_offs.first, oc_lds, vmem_offs.second, elem_size_bytes, write_mask, false, false);
3341 }
3342
3343 if (write_to_lds) {
3344 std::pair<Temp, unsigned> lds_offs = get_tcs_output_lds_offset(ctx, instr, per_vertex);
3345 unsigned lds_align = calculate_lds_alignment(ctx, lds_offs.second);
3346 store_lds(ctx, elem_size_bytes, store_val, write_mask, lds_offs.first, lds_offs.second, lds_align);
3347 }
3348 }
3349
3350 void visit_load_tcs_output(isel_context *ctx, nir_intrinsic_instr *instr, bool per_vertex)
3351 {
3352 assert(ctx->stage == tess_control_hs || ctx->stage == vertex_tess_control_hs);
3353 assert(ctx->shader->info.stage == MESA_SHADER_TESS_CTRL);
3354
3355 Builder bld(ctx->program, ctx->block);
3356
3357 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
3358 std::pair<Temp, unsigned> lds_offs = get_tcs_output_lds_offset(ctx, instr, per_vertex);
3359 unsigned lds_align = calculate_lds_alignment(ctx, lds_offs.second);
3360 unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
3361
3362 load_lds(ctx, elem_size_bytes, dst, lds_offs.first, lds_offs.second, lds_align);
3363 }
3364
3365 void visit_store_output(isel_context *ctx, nir_intrinsic_instr *instr)
3366 {
3367 if (ctx->stage == vertex_vs ||
3368 ctx->stage == fragment_fs ||
3369 ctx->shader->info.stage == MESA_SHADER_GEOMETRY) {
3370 unsigned write_mask = nir_intrinsic_write_mask(instr);
3371 unsigned component = nir_intrinsic_component(instr);
3372 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
3373 unsigned idx = nir_intrinsic_base(instr) + component;
3374
3375 nir_instr *off_instr = instr->src[1].ssa->parent_instr;
3376 if (off_instr->type != nir_instr_type_load_const) {
3377 fprintf(stderr, "Unimplemented nir_intrinsic_load_input offset\n");
3378 nir_print_instr(off_instr, stderr);
3379 fprintf(stderr, "\n");
3380 }
3381 idx += nir_instr_as_load_const(off_instr)->value[0].u32 * 4u;
3382
3383 if (instr->src[0].ssa->bit_size == 64)
3384 write_mask = widen_mask(write_mask, 2);
3385
3386 for (unsigned i = 0; i < 8; ++i) {
3387 if (write_mask & (1 << i)) {
3388 ctx->outputs.mask[idx / 4u] |= 1 << (idx % 4u);
3389 ctx->outputs.outputs[idx / 4u][idx % 4u] = emit_extract_vector(ctx, src, i, v1);
3390 }
3391 idx++;
3392 }
3393 } else if (ctx->stage == vertex_es ||
3394 ctx->stage == vertex_ls ||
3395 (ctx->stage == vertex_tess_control_hs && ctx->shader->info.stage == MESA_SHADER_VERTEX) ||
3396 (ctx->stage == vertex_geometry_gs && ctx->shader->info.stage == MESA_SHADER_VERTEX)) {
3397 visit_store_ls_or_es_output(ctx, instr);
3398 } else if (ctx->shader->info.stage == MESA_SHADER_TESS_CTRL) {
3399 visit_store_tcs_output(ctx, instr, false);
3400 } else {
3401 unreachable("Shader stage not implemented");
3402 }
3403 }
3404
3405 void visit_load_output(isel_context *ctx, nir_intrinsic_instr *instr)
3406 {
3407 visit_load_tcs_output(ctx, instr, false);
3408 }
3409
3410 void emit_interp_instr(isel_context *ctx, unsigned idx, unsigned component, Temp src, Temp dst, Temp prim_mask)
3411 {
3412 Temp coord1 = emit_extract_vector(ctx, src, 0, v1);
3413 Temp coord2 = emit_extract_vector(ctx, src, 1, v1);
3414
3415 Builder bld(ctx->program, ctx->block);
3416 Temp tmp = bld.vintrp(aco_opcode::v_interp_p1_f32, bld.def(v1), coord1, bld.m0(prim_mask), idx, component);
3417 bld.vintrp(aco_opcode::v_interp_p2_f32, Definition(dst), coord2, bld.m0(prim_mask), tmp, idx, component);
3418 }
3419
3420 void emit_load_frag_coord(isel_context *ctx, Temp dst, unsigned num_components)
3421 {
3422 aco_ptr<Pseudo_instruction> vec(create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1));
3423 for (unsigned i = 0; i < num_components; i++)
3424 vec->operands[i] = Operand(get_arg(ctx, ctx->args->ac.frag_pos[i]));
3425 if (G_0286CC_POS_W_FLOAT_ENA(ctx->program->config->spi_ps_input_ena)) {
3426 assert(num_components == 4);
3427 Builder bld(ctx->program, ctx->block);
3428 vec->operands[3] = bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), get_arg(ctx, ctx->args->ac.frag_pos[3]));
3429 }
3430
3431 for (Operand& op : vec->operands)
3432 op = op.isUndefined() ? Operand(0u) : op;
3433
3434 vec->definitions[0] = Definition(dst);
3435 ctx->block->instructions.emplace_back(std::move(vec));
3436 emit_split_vector(ctx, dst, num_components);
3437 return;
3438 }
3439
3440 void visit_load_interpolated_input(isel_context *ctx, nir_intrinsic_instr *instr)
3441 {
3442 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
3443 Temp coords = get_ssa_temp(ctx, instr->src[0].ssa);
3444 unsigned idx = nir_intrinsic_base(instr);
3445 unsigned component = nir_intrinsic_component(instr);
3446 Temp prim_mask = get_arg(ctx, ctx->args->ac.prim_mask);
3447
3448 nir_const_value* offset = nir_src_as_const_value(instr->src[1]);
3449 if (offset) {
3450 assert(offset->u32 == 0);
3451 } else {
3452 /* the lower 15 bits of the prim_mask contain the offset into LDS,
3453 * while the upper bits contain the number of prims */
3454 Temp offset_src = get_ssa_temp(ctx, instr->src[1].ssa);
3455 assert(offset_src.regClass() == s1 && "TODO: divergent offsets...");
3456 Builder bld(ctx->program, ctx->block);
3457 Temp stride = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), prim_mask, Operand(16u));
3458 stride = bld.sop1(aco_opcode::s_bcnt1_i32_b32, bld.def(s1), bld.def(s1, scc), stride);
3459 stride = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), stride, Operand(48u));
3460 offset_src = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), stride, offset_src);
3461 prim_mask = bld.sop2(aco_opcode::s_add_i32, bld.def(s1, m0), bld.def(s1, scc), offset_src, prim_mask);
3462 }
3463
3464 if (instr->dest.ssa.num_components == 1) {
3465 emit_interp_instr(ctx, idx, component, coords, dst, prim_mask);
3466 } else {
3467 aco_ptr<Pseudo_instruction> vec(create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, instr->dest.ssa.num_components, 1));
3468 for (unsigned i = 0; i < instr->dest.ssa.num_components; i++)
3469 {
3470 Temp tmp = {ctx->program->allocateId(), v1};
3471 emit_interp_instr(ctx, idx, component+i, coords, tmp, prim_mask);
3472 vec->operands[i] = Operand(tmp);
3473 }
3474 vec->definitions[0] = Definition(dst);
3475 ctx->block->instructions.emplace_back(std::move(vec));
3476 }
3477 }
3478
3479 bool check_vertex_fetch_size(isel_context *ctx, const ac_data_format_info *vtx_info,
3480 unsigned offset, unsigned stride, unsigned channels)
3481 {
3482 unsigned vertex_byte_size = vtx_info->chan_byte_size * channels;
3483 if (vtx_info->chan_byte_size != 4 && channels == 3)
3484 return false;
3485 return (ctx->options->chip_class != GFX6 && ctx->options->chip_class != GFX10) ||
3486 (offset % vertex_byte_size == 0 && stride % vertex_byte_size == 0);
3487 }
3488
3489 uint8_t get_fetch_data_format(isel_context *ctx, const ac_data_format_info *vtx_info,
3490 unsigned offset, unsigned stride, unsigned *channels)
3491 {
3492 if (!vtx_info->chan_byte_size) {
3493 *channels = vtx_info->num_channels;
3494 return vtx_info->chan_format;
3495 }
3496
3497 unsigned num_channels = *channels;
3498 if (!check_vertex_fetch_size(ctx, vtx_info, offset, stride, *channels)) {
3499 unsigned new_channels = num_channels + 1;
3500 /* first, assume that more loads are worse and try using a larger data format */
3501 while (new_channels <= 4 && !check_vertex_fetch_size(ctx, vtx_info, offset, stride, new_channels)) {
3502 new_channels++;
3503 /* don't make the attribute potentially out-of-bounds */
3504 if (offset + new_channels * vtx_info->chan_byte_size > stride)
3505 new_channels = 5;
3506 }
3507
3508 if (new_channels == 5) {
3509 /* then try decreasing load size (at the cost of more loads) */
3510 new_channels = *channels;
3511 while (new_channels > 1 && !check_vertex_fetch_size(ctx, vtx_info, offset, stride, new_channels))
3512 new_channels--;
3513 }
3514
3515 if (new_channels < *channels)
3516 *channels = new_channels;
3517 num_channels = new_channels;
3518 }
3519
3520 switch (vtx_info->chan_format) {
3521 case V_008F0C_BUF_DATA_FORMAT_8:
3522 return (uint8_t[]){V_008F0C_BUF_DATA_FORMAT_8, V_008F0C_BUF_DATA_FORMAT_8_8,
3523 V_008F0C_BUF_DATA_FORMAT_INVALID, V_008F0C_BUF_DATA_FORMAT_8_8_8_8}[num_channels - 1];
3524 case V_008F0C_BUF_DATA_FORMAT_16:
3525 return (uint8_t[]){V_008F0C_BUF_DATA_FORMAT_16, V_008F0C_BUF_DATA_FORMAT_16_16,
3526 V_008F0C_BUF_DATA_FORMAT_INVALID, V_008F0C_BUF_DATA_FORMAT_16_16_16_16}[num_channels - 1];
3527 case V_008F0C_BUF_DATA_FORMAT_32:
3528 return (uint8_t[]){V_008F0C_BUF_DATA_FORMAT_32, V_008F0C_BUF_DATA_FORMAT_32_32,
3529 V_008F0C_BUF_DATA_FORMAT_32_32_32, V_008F0C_BUF_DATA_FORMAT_32_32_32_32}[num_channels - 1];
3530 }
3531 unreachable("shouldn't reach here");
3532 return V_008F0C_BUF_DATA_FORMAT_INVALID;
3533 }
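/* Illustrative example: there is no 16_16_16 data format (note the INVALID
 * entry above), so a 3-channel 16-bit attribute first tries the next larger
 * format, 16_16_16_16; only when no larger format is usable (it would read
 * past the vertex stride, or GFX6/GFX10 alignment rules reject it) does the
 * code fall back to fewer channels per fetch. */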
3534
3535 /* For 2_10_10_10 formats the alpha is handled as unsigned by pre-Vega HW,
3536 * so we may need to fix it up. */
3537 Temp adjust_vertex_fetch_alpha(isel_context *ctx, unsigned adjustment, Temp alpha)
3538 {
3539 Builder bld(ctx->program, ctx->block);
3540
3541 if (adjustment == RADV_ALPHA_ADJUST_SSCALED)
3542 alpha = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), alpha);
3543
3544 /* For the integer-like cases, do a natural sign extension.
3545 *
3546 * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0
3547 * and happen to contain 0, 1, 2, 3 as the two LSBs of the
3548 * exponent.
3549 */
3550 alpha = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(adjustment == RADV_ALPHA_ADJUST_SNORM ? 7u : 30u), alpha);
3551 alpha = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(30u), alpha);
3552
3553 /* Convert back to the right type. */
3554 if (adjustment == RADV_ALPHA_ADJUST_SNORM) {
3555 alpha = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), alpha);
3556 Temp clamp = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(bld.lm)), Operand(0xbf800000u), alpha);
3557 alpha = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0xbf800000u), alpha, clamp);
3558 } else if (adjustment == RADV_ALPHA_ADJUST_SSCALED) {
3559 alpha = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), alpha);
3560 }
3561
3562 return alpha;
3563 }
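/* Note on the shift amounts: a left shift by 30 followed by an arithmetic
 * right shift by 30 sign-extends a value held in the two LSBs. For the SNORM
 * case the interesting bits are the two low exponent bits of the float
 * (bits 24:23), so the left shift is by 7 to move them into bits 31:30
 * before the same arithmetic shift right. */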
3564
3565 void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr)
3566 {
3567 Builder bld(ctx->program, ctx->block);
3568 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
3569 if (ctx->shader->info.stage == MESA_SHADER_VERTEX) {
3570
3571 nir_instr *off_instr = instr->src[0].ssa->parent_instr;
3572 if (off_instr->type != nir_instr_type_load_const) {
3573 fprintf(stderr, "Unimplemented nir_intrinsic_load_input offset\n");
3574 nir_print_instr(off_instr, stderr);
3575 fprintf(stderr, "\n");
3576 }
3577 uint32_t offset = nir_instr_as_load_const(off_instr)->value[0].u32;
3578
3579 Temp vertex_buffers = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->vertex_buffers));
3580
3581 unsigned location = nir_intrinsic_base(instr) / 4 - VERT_ATTRIB_GENERIC0 + offset;
3582 unsigned component = nir_intrinsic_component(instr);
3583 unsigned attrib_binding = ctx->options->key.vs.vertex_attribute_bindings[location];
3584 uint32_t attrib_offset = ctx->options->key.vs.vertex_attribute_offsets[location];
3585 uint32_t attrib_stride = ctx->options->key.vs.vertex_attribute_strides[location];
3586 unsigned attrib_format = ctx->options->key.vs.vertex_attribute_formats[location];
3587
3588 unsigned dfmt = attrib_format & 0xf;
3589 unsigned nfmt = (attrib_format >> 4) & 0x7;
3590 const struct ac_data_format_info *vtx_info = ac_get_data_format_info(dfmt);
3591
3592 unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa) << component;
3593 unsigned num_channels = MIN2(util_last_bit(mask), vtx_info->num_channels);
3594 unsigned alpha_adjust = (ctx->options->key.vs.alpha_adjust >> (location * 2)) & 3;
3595 bool post_shuffle = ctx->options->key.vs.post_shuffle & (1 << location);
3596 if (post_shuffle)
3597 num_channels = MAX2(num_channels, 3);
3598
3599 Operand off = bld.copy(bld.def(s1), Operand(attrib_binding * 16u));
3600 Temp list = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), vertex_buffers, off);
3601
3602 Temp index;
3603 if (ctx->options->key.vs.instance_rate_inputs & (1u << location)) {
3604 uint32_t divisor = ctx->options->key.vs.instance_rate_divisors[location];
3605 Temp start_instance = get_arg(ctx, ctx->args->ac.start_instance);
3606 if (divisor) {
3607 Temp instance_id = get_arg(ctx, ctx->args->ac.instance_id);
3608 if (divisor != 1) {
3609 Temp divided = bld.tmp(v1);
3610 emit_v_div_u32(ctx, divided, as_vgpr(ctx, instance_id), divisor);
3611 index = bld.vadd32(bld.def(v1), start_instance, divided);
3612 } else {
3613 index = bld.vadd32(bld.def(v1), start_instance, instance_id);
3614 }
3615 } else {
3616 index = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), start_instance);
3617 }
3618 } else {
3619 index = bld.vadd32(bld.def(v1),
3620 get_arg(ctx, ctx->args->ac.base_vertex),
3621 get_arg(ctx, ctx->args->ac.vertex_id));
3622 }
3623
3624 Temp channels[num_channels];
3625 unsigned channel_start = 0;
3626 bool direct_fetch = false;
3627
3628 /* skip unused channels at the start */
3629 if (vtx_info->chan_byte_size && !post_shuffle) {
3630 channel_start = ffs(mask) - 1;
3631 for (unsigned i = 0; i < channel_start; i++)
3632 channels[i] = Temp(0, s1);
3633 } else if (vtx_info->chan_byte_size && post_shuffle && !(mask & 0x8)) {
3634 num_channels = 3 - (ffs(mask) - 1);
3635 }
3636
3637 /* load channels */
3638 while (channel_start < num_channels) {
3639 unsigned fetch_size = num_channels - channel_start;
3640 unsigned fetch_offset = attrib_offset + channel_start * vtx_info->chan_byte_size;
3641 bool expanded = false;
3642
3643 /* use MUBUF when possible to avoid potential alignment issues */
3644 /* TODO: we could use SDWA to unpack 8/16-bit attributes without extra instructions */
3645 bool use_mubuf = (nfmt == V_008F0C_BUF_NUM_FORMAT_FLOAT ||
3646 nfmt == V_008F0C_BUF_NUM_FORMAT_UINT ||
3647 nfmt == V_008F0C_BUF_NUM_FORMAT_SINT) &&
3648 vtx_info->chan_byte_size == 4;
3649 unsigned fetch_dfmt = V_008F0C_BUF_DATA_FORMAT_INVALID;
3650 if (!use_mubuf) {
3651 fetch_dfmt = get_fetch_data_format(ctx, vtx_info, fetch_offset, attrib_stride, &fetch_size);
3652 } else {
3653 if (fetch_size == 3 && ctx->options->chip_class == GFX6) {
3654 /* GFX6 only supports loading vec3 with MTBUF, expand to vec4. */
3655 fetch_size = 4;
3656 expanded = true;
3657 }
3658 }
3659
3660 Temp fetch_index = index;
3661 if (attrib_stride != 0 && fetch_offset > attrib_stride) {
3662 fetch_index = bld.vadd32(bld.def(v1), Operand(fetch_offset / attrib_stride), fetch_index);
3663 fetch_offset = fetch_offset % attrib_stride;
3664 }
3665
3666 Operand soffset(0u);
3667 if (fetch_offset >= 4096) {
3668 soffset = bld.copy(bld.def(s1), Operand(fetch_offset / 4096 * 4096));
3669 fetch_offset %= 4096;
3670 }
3671
3672 aco_opcode opcode;
3673 switch (fetch_size) {
3674 case 1:
3675 opcode = use_mubuf ? aco_opcode::buffer_load_dword : aco_opcode::tbuffer_load_format_x;
3676 break;
3677 case 2:
3678 opcode = use_mubuf ? aco_opcode::buffer_load_dwordx2 : aco_opcode::tbuffer_load_format_xy;
3679 break;
3680 case 3:
3681 assert(ctx->options->chip_class >= GFX7 ||
3682 (!use_mubuf && ctx->options->chip_class == GFX6));
3683 opcode = use_mubuf ? aco_opcode::buffer_load_dwordx3 : aco_opcode::tbuffer_load_format_xyz;
3684 break;
3685 case 4:
3686 opcode = use_mubuf ? aco_opcode::buffer_load_dwordx4 : aco_opcode::tbuffer_load_format_xyzw;
3687 break;
3688 default:
3689 unreachable("Unimplemented load_input vector size");
3690 }
3691
3692 Temp fetch_dst;
3693 if (channel_start == 0 && fetch_size == dst.size() && !post_shuffle &&
3694 !expanded && (alpha_adjust == RADV_ALPHA_ADJUST_NONE ||
3695 num_channels <= 3)) {
3696 direct_fetch = true;
3697 fetch_dst = dst;
3698 } else {
3699 fetch_dst = bld.tmp(RegType::vgpr, fetch_size);
3700 }
3701
3702 if (use_mubuf) {
3703 Instruction *mubuf = bld.mubuf(opcode,
3704 Definition(fetch_dst), list, fetch_index, soffset,
3705 fetch_offset, false, true).instr;
3706 static_cast<MUBUF_instruction*>(mubuf)->can_reorder = true;
3707 } else {
3708 Instruction *mtbuf = bld.mtbuf(opcode,
3709 Definition(fetch_dst), list, fetch_index, soffset,
3710 fetch_dfmt, nfmt, fetch_offset, false, true).instr;
3711 static_cast<MTBUF_instruction*>(mtbuf)->can_reorder = true;
3712 }
3713
3714 emit_split_vector(ctx, fetch_dst, fetch_dst.size());
3715
3716 if (fetch_size == 1) {
3717 channels[channel_start] = fetch_dst;
3718 } else {
3719 for (unsigned i = 0; i < MIN2(fetch_size, num_channels - channel_start); i++)
3720 channels[channel_start + i] = emit_extract_vector(ctx, fetch_dst, i, v1);
3721 }
3722
3723 channel_start += fetch_size;
3724 }
3725
3726 if (!direct_fetch) {
3727 bool is_float = nfmt != V_008F0C_BUF_NUM_FORMAT_UINT &&
3728 nfmt != V_008F0C_BUF_NUM_FORMAT_SINT;
3729
3730 static const unsigned swizzle_normal[4] = {0, 1, 2, 3};
3731 static const unsigned swizzle_post_shuffle[4] = {2, 1, 0, 3};
3732 const unsigned *swizzle = post_shuffle ? swizzle_post_shuffle : swizzle_normal;
3733
3734 aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
3735 std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
3736 unsigned num_temp = 0;
3737 for (unsigned i = 0; i < dst.size(); i++) {
3738 unsigned idx = i + component;
3739 if (swizzle[idx] < num_channels && channels[swizzle[idx]].id()) {
3740 Temp channel = channels[swizzle[idx]];
3741 if (idx == 3 && alpha_adjust != RADV_ALPHA_ADJUST_NONE)
3742 channel = adjust_vertex_fetch_alpha(ctx, alpha_adjust, channel);
3743 vec->operands[i] = Operand(channel);
3744
3745 num_temp++;
3746 elems[i] = channel;
3747 } else if (is_float && idx == 3) {
3748 vec->operands[i] = Operand(0x3f800000u);
3749 } else if (!is_float && idx == 3) {
3750 vec->operands[i] = Operand(1u);
3751 } else {
3752 vec->operands[i] = Operand(0u);
3753 }
3754 }
3755 vec->definitions[0] = Definition(dst);
3756 ctx->block->instructions.emplace_back(std::move(vec));
3757 emit_split_vector(ctx, dst, dst.size());
3758
3759 if (num_temp == dst.size())
3760 ctx->allocated_vec.emplace(dst.id(), elems);
3761 }
3762 } else if (ctx->shader->info.stage == MESA_SHADER_FRAGMENT) {
3763 unsigned offset_idx = instr->intrinsic == nir_intrinsic_load_input ? 0 : 1;
3764 nir_instr *off_instr = instr->src[offset_idx].ssa->parent_instr;
3765 if (off_instr->type != nir_instr_type_load_const ||
3766 nir_instr_as_load_const(off_instr)->value[0].u32 != 0) {
3767 fprintf(stderr, "Unimplemented nir_intrinsic_load_input offset\n");
3768 nir_print_instr(off_instr, stderr);
3769 fprintf(stderr, "\n");
3770 }
3771
3772 Temp prim_mask = get_arg(ctx, ctx->args->ac.prim_mask);
3773 nir_const_value* offset = nir_src_as_const_value(instr->src[offset_idx]);
3774 if (offset) {
3775 assert(offset->u32 == 0);
3776 } else {
3777 /* the lower 15 bits of the prim_mask contain the offset into LDS,
3778 * while the upper bits contain the number of prims */
3779 Temp offset_src = get_ssa_temp(ctx, instr->src[offset_idx].ssa);
3780 assert(offset_src.regClass() == s1 && "TODO: divergent offsets...");
3781 Builder bld(ctx->program, ctx->block);
3782 Temp stride = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), prim_mask, Operand(16u));
3783 stride = bld.sop1(aco_opcode::s_bcnt1_i32_b32, bld.def(s1), bld.def(s1, scc), stride);
3784 stride = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), stride, Operand(48u));
3785 offset_src = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), stride, offset_src);
3786 prim_mask = bld.sop2(aco_opcode::s_add_i32, bld.def(s1, m0), bld.def(s1, scc), offset_src, prim_mask);
3787 }
3788
3789 unsigned idx = nir_intrinsic_base(instr);
3790 unsigned component = nir_intrinsic_component(instr);
3791 unsigned vertex_id = 2; /* P0 */
3792
3793 if (instr->intrinsic == nir_intrinsic_load_input_vertex) {
3794 nir_const_value* src0 = nir_src_as_const_value(instr->src[0]);
3795 switch (src0->u32) {
3796 case 0:
3797 vertex_id = 2; /* P0 */
3798 break;
3799 case 1:
3800 vertex_id = 0; /* P10 */
3801 break;
3802 case 2:
3803 vertex_id = 1; /* P20 */
3804 break;
3805 default:
3806 unreachable("invalid vertex index");
3807 }
3808 }
3809
3810 if (dst.size() == 1) {
3811 bld.vintrp(aco_opcode::v_interp_mov_f32, Definition(dst), Operand(vertex_id), bld.m0(prim_mask), idx, component);
3812 } else {
3813 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
3814 for (unsigned i = 0; i < dst.size(); i++)
3815 vec->operands[i] = bld.vintrp(aco_opcode::v_interp_mov_f32, bld.def(v1), Operand(vertex_id), bld.m0(prim_mask), idx, component + i);
3816 vec->definitions[0] = Definition(dst);
3817 bld.insert(std::move(vec));
3818 }
3819
3820 } else {
3821 unreachable("Shader stage not implemented");
3822 }
3823 }
3824
3825 std::pair<Temp, unsigned> get_gs_per_vertex_input_offset(isel_context *ctx, nir_intrinsic_instr *instr, unsigned base_stride = 1u)
3826 {
3827 assert(ctx->shader->info.stage == MESA_SHADER_GEOMETRY);
3828
3829 Builder bld(ctx->program, ctx->block);
3830 nir_src *vertex_src = nir_get_io_vertex_index_src(instr);
3831 Temp vertex_offset;
3832
3833 if (!nir_src_is_const(*vertex_src)) {
3834 /* better code could be created, but this case probably doesn't happen
3835 * much in practice */
3836 Temp indirect_vertex = as_vgpr(ctx, get_ssa_temp(ctx, vertex_src->ssa));
3837 for (unsigned i = 0; i < ctx->shader->info.gs.vertices_in; i++) {
3838 Temp elem;
3839
3840 if (ctx->stage == vertex_geometry_gs) {
3841 elem = get_arg(ctx, ctx->args->gs_vtx_offset[i / 2u * 2u]);
3842 if (i % 2u)
3843 elem = bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), Operand(16u), elem);
3844 } else {
3845 elem = get_arg(ctx, ctx->args->gs_vtx_offset[i]);
3846 }
3847
3848 if (vertex_offset.id()) {
3849 Temp cond = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.hint_vcc(bld.def(bld.lm)),
3850 Operand(i), indirect_vertex);
3851 vertex_offset = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), vertex_offset, elem, cond);
3852 } else {
3853 vertex_offset = elem;
3854 }
3855 }
3856
3857 if (ctx->stage == vertex_geometry_gs)
3858 vertex_offset = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0xffffu), vertex_offset);
3859 } else {
3860 unsigned vertex = nir_src_as_uint(*vertex_src);
3861 if (ctx->stage == vertex_geometry_gs)
3862 vertex_offset = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1),
3863 get_arg(ctx, ctx->args->gs_vtx_offset[vertex / 2u * 2u]),
3864 Operand((vertex % 2u) * 16u), Operand(16u));
3865 else
3866 vertex_offset = get_arg(ctx, ctx->args->gs_vtx_offset[vertex]);
3867 }
3868
3869 std::pair<Temp, unsigned> offs = get_intrinsic_io_basic_offset(ctx, instr, base_stride);
3870 offs = offset_add(ctx, offs, std::make_pair(vertex_offset, 0u));
3871 return offset_mul(ctx, offs, 4u);
3872 }
3873
3874 void visit_load_gs_per_vertex_input(isel_context *ctx, nir_intrinsic_instr *instr)
3875 {
3876 assert(ctx->shader->info.stage == MESA_SHADER_GEOMETRY);
3877
3878 Builder bld(ctx->program, ctx->block);
3879 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
3880 unsigned elem_size_bytes = instr->dest.ssa.bit_size / 8;
3881
3882 if (ctx->stage == geometry_gs) {
3883 std::pair<Temp, unsigned> offs = get_gs_per_vertex_input_offset(ctx, instr, ctx->program->wave_size);
3884 Temp ring = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ctx->program->private_segment_buffer, Operand(RING_ESGS_GS * 16u));
3885 load_vmem_mubuf(ctx, dst, ring, offs.first, Temp(), offs.second, elem_size_bytes, instr->dest.ssa.num_components, 4u * ctx->program->wave_size, false, true);
3886 } else if (ctx->stage == vertex_geometry_gs) {
3887 std::pair<Temp, unsigned> offs = get_gs_per_vertex_input_offset(ctx, instr);
3888 unsigned lds_align = calculate_lds_alignment(ctx, offs.second);
3889 load_lds(ctx, elem_size_bytes, dst, offs.first, offs.second, lds_align);
3890 } else {
3891 unreachable("Unsupported GS stage.");
3892 }
3893 }
3894
3895 void visit_load_tcs_per_vertex_input(isel_context *ctx, nir_intrinsic_instr *instr)
3896 {
3897 assert(ctx->shader->info.stage == MESA_SHADER_TESS_CTRL);
3898
3899 Builder bld(ctx->program, ctx->block);
3900 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
3901 std::pair<Temp, unsigned> offs = get_tcs_per_vertex_input_lds_offset(ctx, instr);
3902 unsigned elem_size_bytes = instr->dest.ssa.bit_size / 8;
3903 unsigned lds_align = calculate_lds_alignment(ctx, offs.second);
3904
3905 load_lds(ctx, elem_size_bytes, dst, offs.first, offs.second, lds_align);
3906 }
3907
3908 void visit_load_per_vertex_input(isel_context *ctx, nir_intrinsic_instr *instr)
3909 {
3910 switch (ctx->shader->info.stage) {
3911 case MESA_SHADER_GEOMETRY:
3912 visit_load_gs_per_vertex_input(ctx, instr);
3913 break;
3914 case MESA_SHADER_TESS_CTRL:
3915 visit_load_tcs_per_vertex_input(ctx, instr);
3916 break;
3917 default:
3918 unreachable("Unimplemented shader stage");
3919 }
3920 }
3921
3922 void visit_load_per_vertex_output(isel_context *ctx, nir_intrinsic_instr *instr)
3923 {
3924 visit_load_tcs_output(ctx, instr, true);
3925 }
3926
3927 void visit_store_per_vertex_output(isel_context *ctx, nir_intrinsic_instr *instr)
3928 {
3929 assert(ctx->stage == tess_control_hs || ctx->stage == vertex_tess_control_hs);
3930 assert(ctx->shader->info.stage == MESA_SHADER_TESS_CTRL);
3931
3932 visit_store_tcs_output(ctx, instr, true);
3933 }
3934
3935 void visit_load_tess_coord(isel_context *ctx, nir_intrinsic_instr *instr)
3936 {
3937 assert(ctx->shader->info.stage == MESA_SHADER_TESS_EVAL);
3938
3939 Builder bld(ctx->program, ctx->block);
3940 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
3941
3942 Operand tes_u(get_arg(ctx, ctx->args->tes_u));
3943 Operand tes_v(get_arg(ctx, ctx->args->tes_v));
3944 Operand tes_w(0u);
3945
3946 if (ctx->shader->info.tess.primitive_mode == GL_TRIANGLES) {
3947 Temp tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), tes_u, tes_v);
3948 tmp = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), Operand(0x3f800000u /* 1.0f */), tmp);
3949 tes_w = Operand(tmp);
3950 }
3951
3952 Temp tess_coord = bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tes_u, tes_v, tes_w);
3953 emit_split_vector(ctx, tess_coord, 3);
3954 }
3955
3956 Temp load_desc_ptr(isel_context *ctx, unsigned desc_set)
3957 {
3958 if (ctx->program->info->need_indirect_descriptor_sets) {
3959 Builder bld(ctx->program, ctx->block);
3960 Temp ptr64 = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->descriptor_sets[0]));
3961 Operand off = bld.copy(bld.def(s1), Operand(desc_set << 2));
3962 return bld.smem(aco_opcode::s_load_dword, bld.def(s1), ptr64, off);//, false, false, false);
3963 }
3964
3965 return get_arg(ctx, ctx->args->descriptor_sets[desc_set]);
3966 }
3967
3968
3969 void visit_load_resource(isel_context *ctx, nir_intrinsic_instr *instr)
3970 {
3971 Builder bld(ctx->program, ctx->block);
3972 Temp index = get_ssa_temp(ctx, instr->src[0].ssa);
3973 if (!ctx->divergent_vals[instr->dest.ssa.index])
3974 index = bld.as_uniform(index);
3975 unsigned desc_set = nir_intrinsic_desc_set(instr);
3976 unsigned binding = nir_intrinsic_binding(instr);
3977
3978 Temp desc_ptr;
3979 radv_pipeline_layout *pipeline_layout = ctx->options->layout;
3980 radv_descriptor_set_layout *layout = pipeline_layout->set[desc_set].layout;
3981 unsigned offset = layout->binding[binding].offset;
3982 unsigned stride;
3983 if (layout->binding[binding].type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC ||
3984 layout->binding[binding].type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC) {
3985 unsigned idx = pipeline_layout->set[desc_set].dynamic_offset_start + layout->binding[binding].dynamic_offset_offset;
3986 desc_ptr = get_arg(ctx, ctx->args->ac.push_constants);
3987 offset = pipeline_layout->push_constant_size + 16 * idx;
3988 stride = 16;
3989 } else {
3990 desc_ptr = load_desc_ptr(ctx, desc_set);
3991 stride = layout->binding[binding].size;
3992 }
3993
3994 nir_const_value* nir_const_index = nir_src_as_const_value(instr->src[0]);
3995 unsigned const_index = nir_const_index ? nir_const_index->u32 : 0;
3996 if (stride != 1) {
3997 if (nir_const_index) {
3998 const_index = const_index * stride;
3999 } else if (index.type() == RegType::vgpr) {
4000 bool index24bit = layout->binding[binding].array_size <= 0x1000000;
4001 index = bld.v_mul_imm(bld.def(v1), index, stride, index24bit);
4002 } else {
4003 index = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(stride), Operand(index));
4004 }
4005 }
4006 if (offset) {
4007 if (nir_const_index) {
4008 const_index = const_index + offset;
4009 } else if (index.type() == RegType::vgpr) {
4010 index = bld.vadd32(bld.def(v1), Operand(offset), index);
4011 } else {
4012 index = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), Operand(offset), Operand(index));
4013 }
4014 }
4015
4016 if (nir_const_index && const_index == 0) {
4017 index = desc_ptr;
4018 } else if (index.type() == RegType::vgpr) {
4019 index = bld.vadd32(bld.def(v1),
4020 nir_const_index ? Operand(const_index) : Operand(index),
4021 Operand(desc_ptr));
4022 } else {
4023 index = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
4024 nir_const_index ? Operand(const_index) : Operand(index),
4025 Operand(desc_ptr));
4026 }
4027
4028 bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), index);
4029 }
4030
4031 void load_buffer(isel_context *ctx, unsigned num_components, Temp dst,
4032 Temp rsrc, Temp offset, bool glc=false, bool readonly=true)
4033 {
4034 Builder bld(ctx->program, ctx->block);
4035
4036 unsigned num_bytes = dst.size() * 4;
4037 bool dlc = glc && ctx->options->chip_class >= GFX10;
4038
4039 aco_opcode op;
4040 if (dst.type() == RegType::vgpr || (ctx->options->chip_class < GFX8 && !readonly)) {
4041 Operand vaddr = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
4042 Operand soffset = offset.type() == RegType::sgpr ? Operand(offset) : Operand((uint32_t) 0);
4043 unsigned const_offset = 0;
4044
4045 Temp lower = Temp();
4046 if (num_bytes > 16) {
4047 assert(num_components == 3 || num_components == 4);
4048 op = aco_opcode::buffer_load_dwordx4;
4049 lower = bld.tmp(v4);
4050 aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)};
4051 mubuf->definitions[0] = Definition(lower);
4052 mubuf->operands[0] = Operand(rsrc);
4053 mubuf->operands[1] = vaddr;
4054 mubuf->operands[2] = soffset;
4055 mubuf->offen = (offset.type() == RegType::vgpr);
4056 mubuf->glc = glc;
4057 mubuf->dlc = dlc;
4058 mubuf->barrier = readonly ? barrier_none : barrier_buffer;
4059 mubuf->can_reorder = readonly;
4060 bld.insert(std::move(mubuf));
4061 emit_split_vector(ctx, lower, 2);
4062 num_bytes -= 16;
4063 const_offset = 16;
4064 } else if (num_bytes == 12 && ctx->options->chip_class == GFX6) {
4065 /* GFX6 doesn't support loading vec3, expand to vec4. */
4066 num_bytes = 16;
4067 }
4068
4069 switch (num_bytes) {
4070 case 4:
4071 op = aco_opcode::buffer_load_dword;
4072 break;
4073 case 8:
4074 op = aco_opcode::buffer_load_dwordx2;
4075 break;
4076 case 12:
4077 assert(ctx->options->chip_class > GFX6);
4078 op = aco_opcode::buffer_load_dwordx3;
4079 break;
4080 case 16:
4081 op = aco_opcode::buffer_load_dwordx4;
4082 break;
4083 default:
4084 unreachable("Load SSBO not implemented for this size.");
4085 }
4086 aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)};
4087 mubuf->operands[0] = Operand(rsrc);
4088 mubuf->operands[1] = vaddr;
4089 mubuf->operands[2] = soffset;
4090 mubuf->offen = (offset.type() == RegType::vgpr);
4091 mubuf->glc = glc;
4092 mubuf->dlc = dlc;
4093 mubuf->barrier = readonly ? barrier_none : barrier_buffer;
4094 mubuf->can_reorder = readonly;
4095 mubuf->offset = const_offset;
4096 aco_ptr<Instruction> instr = std::move(mubuf);
4097
4098 if (dst.size() > 4) {
4099 assert(lower != Temp());
4100 Temp upper = bld.tmp(RegType::vgpr, dst.size() - lower.size());
4101 instr->definitions[0] = Definition(upper);
4102 bld.insert(std::move(instr));
4103 if (dst.size() == 8)
4104 emit_split_vector(ctx, upper, 2);
4105 instr.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size() / 2, 1));
4106 instr->operands[0] = Operand(emit_extract_vector(ctx, lower, 0, v2));
4107 instr->operands[1] = Operand(emit_extract_vector(ctx, lower, 1, v2));
4108 instr->operands[2] = Operand(emit_extract_vector(ctx, upper, 0, v2));
4109 if (dst.size() == 8)
4110 instr->operands[3] = Operand(emit_extract_vector(ctx, upper, 1, v2));
4111 } else if (dst.size() == 3 && ctx->options->chip_class == GFX6) {
4112 Temp vec = bld.tmp(v4);
4113 instr->definitions[0] = Definition(vec);
4114 bld.insert(std::move(instr));
4115 emit_split_vector(ctx, vec, 4);
4116
4117 instr.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, 3, 1));
4118 instr->operands[0] = Operand(emit_extract_vector(ctx, vec, 0, v1));
4119 instr->operands[1] = Operand(emit_extract_vector(ctx, vec, 1, v1));
4120 instr->operands[2] = Operand(emit_extract_vector(ctx, vec, 2, v1));
4121 }
4122
4123 if (dst.type() == RegType::sgpr) {
4124 Temp vec = bld.tmp(RegType::vgpr, dst.size());
4125 instr->definitions[0] = Definition(vec);
4126 bld.insert(std::move(instr));
4127 expand_vector(ctx, vec, dst, num_components, (1 << num_components) - 1);
4128 } else {
4129 instr->definitions[0] = Definition(dst);
4130 bld.insert(std::move(instr));
4131 emit_split_vector(ctx, dst, num_components);
4132 }
4133 } else {
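      /* SMEM has no 3- or 6-dword loads, so 12- and 24-byte loads are rounded
       * up to the next supported size and trimmed below */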
4134 switch (num_bytes) {
4135 case 4:
4136 op = aco_opcode::s_buffer_load_dword;
4137 break;
4138 case 8:
4139 op = aco_opcode::s_buffer_load_dwordx2;
4140 break;
4141 case 12:
4142 case 16:
4143 op = aco_opcode::s_buffer_load_dwordx4;
4144 break;
4145 case 24:
4146 case 32:
4147 op = aco_opcode::s_buffer_load_dwordx8;
4148 break;
4149 default:
4150 unreachable("Load SSBO not implemented for this size.");
4151 }
4152 aco_ptr<SMEM_instruction> load{create_instruction<SMEM_instruction>(op, Format::SMEM, 2, 1)};
4153 load->operands[0] = Operand(rsrc);
4154 load->operands[1] = Operand(bld.as_uniform(offset));
4155 assert(load->operands[1].getTemp().type() == RegType::sgpr);
4156 load->definitions[0] = Definition(dst);
4157 load->glc = glc;
4158 load->dlc = dlc;
4159 load->barrier = readonly ? barrier_none : barrier_buffer;
4160 load->can_reorder = false; // FIXME: currently, it doesn't seem beneficial due to how our scheduler works
4161 assert(ctx->options->chip_class >= GFX8 || !glc);
4162
4163 /* trim vector */
4164 if (dst.size() == 3) {
4165 Temp vec = bld.tmp(s4);
4166 load->definitions[0] = Definition(vec);
4167 bld.insert(std::move(load));
4168 emit_split_vector(ctx, vec, 4);
4169
4170 bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
4171 emit_extract_vector(ctx, vec, 0, s1),
4172 emit_extract_vector(ctx, vec, 1, s1),
4173 emit_extract_vector(ctx, vec, 2, s1));
4174 } else if (dst.size() == 6) {
4175 Temp vec = bld.tmp(s8);
4176 load->definitions[0] = Definition(vec);
4177 bld.insert(std::move(load));
4178 emit_split_vector(ctx, vec, 4);
4179
4180 bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
4181 emit_extract_vector(ctx, vec, 0, s2),
4182 emit_extract_vector(ctx, vec, 1, s2),
4183 emit_extract_vector(ctx, vec, 2, s2));
4184 } else {
4185 bld.insert(std::move(load));
4186 }
4187 emit_split_vector(ctx, dst, num_components);
4188 }
4189 }
4190
4191 void visit_load_ubo(isel_context *ctx, nir_intrinsic_instr *instr)
4192 {
4193 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
4194 Temp rsrc = get_ssa_temp(ctx, instr->src[0].ssa);
4195
4196 Builder bld(ctx->program, ctx->block);
4197
4198 nir_intrinsic_instr* idx_instr = nir_instr_as_intrinsic(instr->src[0].ssa->parent_instr);
4199 unsigned desc_set = nir_intrinsic_desc_set(idx_instr);
4200 unsigned binding = nir_intrinsic_binding(idx_instr);
4201 radv_descriptor_set_layout *layout = ctx->options->layout->set[desc_set].layout;
4202
4203 if (layout->binding[binding].type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) {
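      /* inline uniform block data is addressed directly: rsrc holds the 32-bit
       * address of the data, so build a buffer descriptor around it
       * (base = rsrc | address32_hi << 32, num_records = ~0u) */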
4204 uint32_t desc_type = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
4205 S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
4206 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
4207 S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
4208 if (ctx->options->chip_class >= GFX10) {
4209 desc_type |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
4210 S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) |
4211 S_008F0C_RESOURCE_LEVEL(1);
4212 } else {
4213 desc_type |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
4214 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
4215 }
4216 Temp upper_dwords = bld.pseudo(aco_opcode::p_create_vector, bld.def(s3),
4217 Operand(S_008F04_BASE_ADDRESS_HI(ctx->options->address32_hi)),
4218 Operand(0xFFFFFFFFu),
4219 Operand(desc_type));
4220 rsrc = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4),
4221 rsrc, upper_dwords);
4222 } else {
4223 rsrc = convert_pointer_to_64_bit(ctx, rsrc);
4224 rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u));
4225 }
4226
4227 load_buffer(ctx, instr->num_components, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa));
4228 }
4229
4230 void visit_load_push_constant(isel_context *ctx, nir_intrinsic_instr *instr)
4231 {
4232 Builder bld(ctx->program, ctx->block);
4233 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
4234
4235 unsigned offset = nir_intrinsic_base(instr);
4236 nir_const_value *index_cv = nir_src_as_const_value(instr->src[0]);
4237 if (index_cv && instr->dest.ssa.bit_size == 32) {
4238
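      /* if the whole range was uploaded as inline push constants (user SGPRs),
       * assemble the result from those arguments instead of loading from memory */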
4239 unsigned count = instr->dest.ssa.num_components;
4240 unsigned start = (offset + index_cv->u32) / 4u;
4241 start -= ctx->args->ac.base_inline_push_consts;
4242 if (start + count <= ctx->args->ac.num_inline_push_consts) {
4243 std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
4244 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
4245 for (unsigned i = 0; i < count; ++i) {
4246 elems[i] = get_arg(ctx, ctx->args->ac.inline_push_consts[start + i]);
4247 vec->operands[i] = Operand{elems[i]};
4248 }
4249 vec->definitions[0] = Definition(dst);
4250 ctx->block->instructions.emplace_back(std::move(vec));
4251 ctx->allocated_vec.emplace(dst.id(), elems);
4252 return;
4253 }
4254 }
4255
4256 Temp index = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
4257 if (offset != 0) // TODO check if index != 0 as well
4258 index = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), Operand(offset), index);
4259 Temp ptr = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->ac.push_constants));
4260 Temp vec = dst;
4261 bool trim = false;
4262 aco_opcode op;
4263
4264 switch (dst.size()) {
4265 case 1:
4266 op = aco_opcode::s_load_dword;
4267 break;
4268 case 2:
4269 op = aco_opcode::s_load_dwordx2;
4270 break;
4271 case 3:
4272 vec = bld.tmp(s4);
4273       trim = true; /* fallthrough */
4274 case 4:
4275 op = aco_opcode::s_load_dwordx4;
4276 break;
4277 case 6:
4278 vec = bld.tmp(s8);
4279       trim = true; /* fallthrough */
4280 case 8:
4281 op = aco_opcode::s_load_dwordx8;
4282 break;
4283 default:
4284 unreachable("unimplemented or forbidden load_push_constant.");
4285 }
4286
4287 bld.smem(op, Definition(vec), ptr, index);
4288
4289 if (trim) {
4290 emit_split_vector(ctx, vec, 4);
4291 RegClass rc = dst.size() == 3 ? s1 : s2;
4292 bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
4293 emit_extract_vector(ctx, vec, 0, rc),
4294 emit_extract_vector(ctx, vec, 1, rc),
4295 emit_extract_vector(ctx, vec, 2, rc));
4296
4297 }
4298 emit_split_vector(ctx, dst, instr->dest.ssa.num_components);
4299 }
4300
4301 void visit_load_constant(isel_context *ctx, nir_intrinsic_instr *instr)
4302 {
4303 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
4304
4305 Builder bld(ctx->program, ctx->block);
4306
4307 uint32_t desc_type = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
4308 S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
4309 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
4310 S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
4311 if (ctx->options->chip_class >= GFX10) {
4312 desc_type |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
4313 S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) |
4314 S_008F0C_RESOURCE_LEVEL(1);
4315 } else {
4316 desc_type |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
4317 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
4318 }
4319
4320 unsigned base = nir_intrinsic_base(instr);
4321 unsigned range = nir_intrinsic_range(instr);
4322
4323 Temp offset = get_ssa_temp(ctx, instr->src[0].ssa);
4324 if (base && offset.type() == RegType::sgpr)
4325 offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset, Operand(base));
4326 else if (base && offset.type() == RegType::vgpr)
4327 offset = bld.vadd32(bld.def(v1), Operand(base), offset);
4328
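   /* build a buffer descriptor over the shader's constant data: p_constaddr
    * materializes its 64-bit address and num_records is clamped to the
    * constant data size */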
4329 Temp rsrc = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4),
4330 bld.sop1(aco_opcode::p_constaddr, bld.def(s2), bld.def(s1, scc), Operand(ctx->constant_data_offset)),
4331 Operand(MIN2(base + range, ctx->shader->constant_data_size)),
4332 Operand(desc_type));
4333
4334 load_buffer(ctx, instr->num_components, dst, rsrc, offset);
4335 }
4336
4337 void visit_discard_if(isel_context *ctx, nir_intrinsic_instr *instr)
4338 {
4339 if (ctx->cf_info.loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
4340 ctx->cf_info.exec_potentially_empty_discard = true;
4341
4342 ctx->program->needs_exact = true;
4343
4344 // TODO: optimize uniform conditions
4345 Builder bld(ctx->program, ctx->block);
4346 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
4347 assert(src.regClass() == bld.lm);
4348 src = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
4349 bld.pseudo(aco_opcode::p_discard_if, src);
4350 ctx->block->kind |= block_kind_uses_discard_if;
4351 return;
4352 }
4353
4354 void visit_discard(isel_context* ctx, nir_intrinsic_instr *instr)
4355 {
4356 Builder bld(ctx->program, ctx->block);
4357
4358 if (ctx->cf_info.loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
4359 ctx->cf_info.exec_potentially_empty_discard = true;
4360
4361 bool divergent = ctx->cf_info.parent_if.is_divergent ||
4362 ctx->cf_info.parent_loop.has_divergent_continue;
4363
4364 if (ctx->block->loop_nest_depth &&
4365 ((nir_instr_is_last(&instr->instr) && !divergent) || divergent)) {
4366 /* we handle discards the same way as jump instructions */
4367 append_logical_end(ctx->block);
4368
4369 /* in loops, discard behaves like break */
4370 Block *linear_target = ctx->cf_info.parent_loop.exit;
4371 ctx->block->kind |= block_kind_discard;
4372
4373 if (!divergent) {
4374 /* uniform discard - loop ends here */
4375 assert(nir_instr_is_last(&instr->instr));
4376 ctx->block->kind |= block_kind_uniform;
4377 ctx->cf_info.has_branch = true;
4378 bld.branch(aco_opcode::p_branch);
4379 add_linear_edge(ctx->block->index, linear_target);
4380 return;
4381 }
4382
4383       /* we add a break right after the discard() instruction */
4384 ctx->block->kind |= block_kind_break;
4385 unsigned idx = ctx->block->index;
4386
4387 /* remove critical edges from linear CFG */
4388 bld.branch(aco_opcode::p_branch);
4389 Block* break_block = ctx->program->create_and_insert_block();
4390 break_block->loop_nest_depth = ctx->cf_info.loop_nest_depth;
4391 break_block->kind |= block_kind_uniform;
4392 add_linear_edge(idx, break_block);
4393 add_linear_edge(break_block->index, linear_target);
4394 bld.reset(break_block);
4395 bld.branch(aco_opcode::p_branch);
4396
4397 Block* continue_block = ctx->program->create_and_insert_block();
4398 continue_block->loop_nest_depth = ctx->cf_info.loop_nest_depth;
4399 add_linear_edge(idx, continue_block);
4400 append_logical_start(continue_block);
4401 ctx->block = continue_block;
4402
4403 return;
4404 }
4405
4406 /* it can currently happen that NIR doesn't remove the unreachable code */
4407 if (!nir_instr_is_last(&instr->instr)) {
4408 ctx->program->needs_exact = true;
4409       /* save exec in a temporary so that it doesn't get
4410        * overwritten by outer exec masks before the discard */
4411 Temp cond = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), Operand(0xFFFFFFFF), Operand(exec, bld.lm));
4412 bld.pseudo(aco_opcode::p_discard_if, cond);
4413 ctx->block->kind |= block_kind_uses_discard_if;
4414 return;
4415 }
4416
4417 /* This condition is incorrect for uniformly branched discards in a loop
4418 * predicated by a divergent condition, but the above code catches that case
4419 * and the discard would end up turning into a discard_if.
4420 * For example:
4421 * if (divergent) {
4422 * while (...) {
4423 * if (uniform) {
4424 * discard;
4425 * }
4426 * }
4427 * }
4428 */
4429 if (!ctx->cf_info.parent_if.is_divergent) {
4430 /* program just ends here */
4431 ctx->block->kind |= block_kind_uniform;
4432 bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1),
4433 0 /* enabled mask */, 9 /* dest */,
4434 false /* compressed */, true/* done */, true /* valid mask */);
4435 bld.sopp(aco_opcode::s_endpgm);
4436       // TODO: this may be followed by a dead-code branch that only exists to sanitize NIR phis
4437 } else {
4438 ctx->block->kind |= block_kind_discard;
4439       /* branch and linear edge are added by visit_if() */
4440 }
4441 }
4442
4443 enum aco_descriptor_type {
4444 ACO_DESC_IMAGE,
4445 ACO_DESC_FMASK,
4446 ACO_DESC_SAMPLER,
4447 ACO_DESC_BUFFER,
4448 ACO_DESC_PLANE_0,
4449 ACO_DESC_PLANE_1,
4450 ACO_DESC_PLANE_2,
4451 };
4452
4453 static bool
4454 should_declare_array(isel_context *ctx, enum glsl_sampler_dim sampler_dim, bool is_array) {
4455 if (sampler_dim == GLSL_SAMPLER_DIM_BUF)
4456 return false;
4457 ac_image_dim dim = ac_get_sampler_dim(ctx->options->chip_class, sampler_dim, is_array);
4458 return dim == ac_image_cube ||
4459 dim == ac_image_1darray ||
4460 dim == ac_image_2darray ||
4461 dim == ac_image_2darraymsaa;
4462 }
4463
4464 Temp get_sampler_desc(isel_context *ctx, nir_deref_instr *deref_instr,
4465 enum aco_descriptor_type desc_type,
4466 const nir_tex_instr *tex_instr, bool image, bool write)
4467 {
4468 /* FIXME: we should lower the deref with some new nir_intrinsic_load_desc
4469 std::unordered_map<uint64_t, Temp>::iterator it = ctx->tex_desc.find((uint64_t) desc_type << 32 | deref_instr->dest.ssa.index);
4470 if (it != ctx->tex_desc.end())
4471 return it->second;
4472 */
4473 Temp index = Temp();
4474 bool index_set = false;
4475 unsigned constant_index = 0;
4476 unsigned descriptor_set;
4477 unsigned base_index;
4478 Builder bld(ctx->program, ctx->block);
4479
4480 if (!deref_instr) {
4481 assert(tex_instr && !image);
4482 descriptor_set = 0;
4483 base_index = tex_instr->sampler_index;
4484 } else {
4485 while(deref_instr->deref_type != nir_deref_type_var) {
4486 unsigned array_size = glsl_get_aoa_size(deref_instr->type);
4487 if (!array_size)
4488 array_size = 1;
4489
4490 assert(deref_instr->deref_type == nir_deref_type_array);
4491 nir_const_value *const_value = nir_src_as_const_value(deref_instr->arr.index);
4492 if (const_value) {
4493 constant_index += array_size * const_value->u32;
4494 } else {
4495 Temp indirect = get_ssa_temp(ctx, deref_instr->arr.index.ssa);
4496 if (indirect.type() == RegType::vgpr)
4497 indirect = bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), indirect);
4498
4499 if (array_size != 1)
4500 indirect = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(array_size), indirect);
4501
4502 if (!index_set) {
4503 index = indirect;
4504 index_set = true;
4505 } else {
4506 index = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), index, indirect);
4507 }
4508 }
4509
4510 deref_instr = nir_src_as_deref(deref_instr->parent);
4511 }
4512 descriptor_set = deref_instr->var->data.descriptor_set;
4513 base_index = deref_instr->var->data.binding;
4514 }
4515
4516 Temp list = load_desc_ptr(ctx, descriptor_set);
4517 list = convert_pointer_to_64_bit(ctx, list);
4518
4519 struct radv_descriptor_set_layout *layout = ctx->options->layout->set[descriptor_set].layout;
4520 struct radv_descriptor_set_binding_layout *binding = layout->binding + base_index;
4521 unsigned offset = binding->offset;
4522 unsigned stride = binding->size;
4523 aco_opcode opcode;
4524 RegClass type;
4525
4526 assert(base_index < layout->binding_count);
4527
4528 switch (desc_type) {
4529 case ACO_DESC_IMAGE:
4530 type = s8;
4531 opcode = aco_opcode::s_load_dwordx8;
4532 break;
4533 case ACO_DESC_FMASK:
4534 type = s8;
4535 opcode = aco_opcode::s_load_dwordx8;
4536 offset += 32;
4537 break;
4538 case ACO_DESC_SAMPLER:
4539 type = s4;
4540 opcode = aco_opcode::s_load_dwordx4;
4541 if (binding->type == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER)
4542 offset += radv_combined_image_descriptor_sampler_offset(binding);
4543 break;
4544 case ACO_DESC_BUFFER:
4545 type = s4;
4546 opcode = aco_opcode::s_load_dwordx4;
4547 break;
4548 case ACO_DESC_PLANE_0:
4549 case ACO_DESC_PLANE_1:
4550 type = s8;
4551 opcode = aco_opcode::s_load_dwordx8;
4552 offset += 32 * (desc_type - ACO_DESC_PLANE_0);
4553 break;
4554 case ACO_DESC_PLANE_2:
4555 type = s4;
4556 opcode = aco_opcode::s_load_dwordx4;
4557 offset += 64;
4558 break;
4559 default:
4560 unreachable("invalid desc_type\n");
4561 }
4562
4563 offset += constant_index * stride;
4564
4565 if (desc_type == ACO_DESC_SAMPLER && binding->immutable_samplers_offset &&
4566 (!index_set || binding->immutable_samplers_equal)) {
4567 if (binding->immutable_samplers_equal)
4568 constant_index = 0;
4569
4570 const uint32_t *samplers = radv_immutable_samplers(layout, binding);
4571 return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4),
4572 Operand(samplers[constant_index * 4 + 0]),
4573 Operand(samplers[constant_index * 4 + 1]),
4574 Operand(samplers[constant_index * 4 + 2]),
4575 Operand(samplers[constant_index * 4 + 3]));
4576 }
4577
4578 Operand off;
4579 if (!index_set) {
4580 off = bld.copy(bld.def(s1), Operand(offset));
4581 } else {
4582 off = Operand((Temp)bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), Operand(offset),
4583 bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(stride), index)));
4584 }
4585
4586 Temp res = bld.smem(opcode, bld.def(type), list, off);
4587
4588 if (desc_type == ACO_DESC_PLANE_2) {
4589 Temp components[8];
4590 for (unsigned i = 0; i < 8; i++)
4591 components[i] = bld.tmp(s1);
4592 bld.pseudo(aco_opcode::p_split_vector,
4593 Definition(components[0]),
4594 Definition(components[1]),
4595 Definition(components[2]),
4596 Definition(components[3]),
4597 res);
4598
4599 Temp desc2 = get_sampler_desc(ctx, deref_instr, ACO_DESC_PLANE_1, tex_instr, image, write);
4600 bld.pseudo(aco_opcode::p_split_vector,
4601 bld.def(s1), bld.def(s1), bld.def(s1), bld.def(s1),
4602 Definition(components[4]),
4603 Definition(components[5]),
4604 Definition(components[6]),
4605 Definition(components[7]),
4606 desc2);
4607
4608 res = bld.pseudo(aco_opcode::p_create_vector, bld.def(s8),
4609 components[0], components[1], components[2], components[3],
4610 components[4], components[5], components[6], components[7]);
4611 }
4612
4613 return res;
4614 }
4615
4616 static int image_type_to_components_count(enum glsl_sampler_dim dim, bool array)
4617 {
4618 switch (dim) {
4619 case GLSL_SAMPLER_DIM_BUF:
4620 return 1;
4621 case GLSL_SAMPLER_DIM_1D:
4622 return array ? 2 : 1;
4623 case GLSL_SAMPLER_DIM_2D:
4624 return array ? 3 : 2;
4625 case GLSL_SAMPLER_DIM_MS:
4626 return array ? 4 : 3;
4627 case GLSL_SAMPLER_DIM_3D:
4628 case GLSL_SAMPLER_DIM_CUBE:
4629 return 3;
4630 case GLSL_SAMPLER_DIM_RECT:
4631 case GLSL_SAMPLER_DIM_SUBPASS:
4632 return 2;
4633 case GLSL_SAMPLER_DIM_SUBPASS_MS:
4634 return 3;
4635 default:
4636 break;
4637 }
4638 return 0;
4639 }
4640
4641
4642 /* Adjust the sample index according to FMASK.
4643 *
4644 * For uncompressed MSAA surfaces, FMASK should return 0x76543210,
4645 * which is the identity mapping. Each nibble says which physical sample
4646 * should be fetched to get that sample.
4647 *
4648 * For example, 0x11111100 means there are only 2 samples stored and
4649 * the second sample covers 3/4 of the pixel. When reading samples 0
4650 * and 1, return physical sample 0 (determined by the first two 0s
4651 * in FMASK), otherwise return physical sample 1.
4652 *
4653 * The sample index should be adjusted as follows:
4654 * sample_index = (fmask >> (sample_index * 4)) & 0xF;
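 *
 * For example, with fmask == 0x11111100 and sample_index == 2 this gives
 * (0x11111100 >> 8) & 0xF == 1, i.e. physical sample 1, while sample
 * indices 0 and 1 map to physical sample 0.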
4655 */
4656 static Temp adjust_sample_index_using_fmask(isel_context *ctx, bool da, std::vector<Temp>& coords, Operand sample_index, Temp fmask_desc_ptr)
4657 {
4658 Builder bld(ctx->program, ctx->block);
4659 Temp fmask = bld.tmp(v1);
4660 unsigned dim = ctx->options->chip_class >= GFX10
4661 ? ac_get_sampler_dim(ctx->options->chip_class, GLSL_SAMPLER_DIM_2D, da)
4662 : 0;
4663
4664 Temp coord = da ? bld.pseudo(aco_opcode::p_create_vector, bld.def(v3), coords[0], coords[1], coords[2]) :
4665 bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), coords[0], coords[1]);
4666 aco_ptr<MIMG_instruction> load{create_instruction<MIMG_instruction>(aco_opcode::image_load, Format::MIMG, 3, 1)};
4667 load->operands[0] = Operand(fmask_desc_ptr);
4668 load->operands[1] = Operand(s4); /* no sampler */
4669 load->operands[2] = Operand(coord);
4670 load->definitions[0] = Definition(fmask);
4671 load->glc = false;
4672 load->dlc = false;
4673 load->dmask = 0x1;
4674 load->unrm = true;
4675 load->da = da;
4676 load->dim = dim;
4677 load->can_reorder = true; /* fmask images shouldn't be modified */
4678 ctx->block->instructions.emplace_back(std::move(load));
4679
4680 Operand sample_index4;
4681 if (sample_index.isConstant() && sample_index.constantValue() < 16) {
4682 sample_index4 = Operand(sample_index.constantValue() << 2);
4683 } else if (sample_index.regClass() == s1) {
4684 sample_index4 = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), sample_index, Operand(2u));
4685 } else {
4686 assert(sample_index.regClass() == v1);
4687 sample_index4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), sample_index);
4688 }
4689
4690 Temp final_sample;
4691 if (sample_index4.isConstant() && sample_index4.constantValue() == 0)
4692 final_sample = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(15u), fmask);
4693 else if (sample_index4.isConstant() && sample_index4.constantValue() == 28)
4694 final_sample = bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), Operand(28u), fmask);
4695 else
4696 final_sample = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), fmask, sample_index4, Operand(4u));
4697
4698    /* Don't rewrite the sample index if WORD1.DATA_FORMAT of the FMASK
4699     * resource descriptor is 0 (invalid).
4700     */
4701 Temp compare = bld.tmp(bld.lm);
4702 bld.vopc_e64(aco_opcode::v_cmp_lg_u32, Definition(compare),
4703 Operand(0u), emit_extract_vector(ctx, fmask_desc_ptr, 1, s1)).def(0).setHint(vcc);
4704
4705 Temp sample_index_v = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), sample_index);
4706
4707 /* Replace the MSAA sample index. */
4708 return bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), sample_index_v, final_sample, compare);
4709 }
4710
4711 static Temp get_image_coords(isel_context *ctx, const nir_intrinsic_instr *instr, const struct glsl_type *type)
4712 {
4713
4714 Temp src0 = get_ssa_temp(ctx, instr->src[1].ssa);
4715 enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);
4716 bool is_array = glsl_sampler_type_is_array(type);
4717 ASSERTED bool add_frag_pos = (dim == GLSL_SAMPLER_DIM_SUBPASS || dim == GLSL_SAMPLER_DIM_SUBPASS_MS);
4718 assert(!add_frag_pos && "Input attachments should be lowered.");
4719 bool is_ms = (dim == GLSL_SAMPLER_DIM_MS || dim == GLSL_SAMPLER_DIM_SUBPASS_MS);
4720 bool gfx9_1d = ctx->options->chip_class == GFX9 && dim == GLSL_SAMPLER_DIM_1D;
4721 int count = image_type_to_components_count(dim, is_array);
4722 std::vector<Temp> coords(count);
4723 Builder bld(ctx->program, ctx->block);
4724
4725 if (is_ms) {
4726 count--;
4727 Temp src2 = get_ssa_temp(ctx, instr->src[2].ssa);
4728 /* get sample index */
4729 if (instr->intrinsic == nir_intrinsic_image_deref_load) {
4730 nir_const_value *sample_cv = nir_src_as_const_value(instr->src[2]);
4731 Operand sample_index = sample_cv ? Operand(sample_cv->u32) : Operand(emit_extract_vector(ctx, src2, 0, v1));
4732 std::vector<Temp> fmask_load_address;
4733 for (unsigned i = 0; i < (is_array ? 3 : 2); i++)
4734 fmask_load_address.emplace_back(emit_extract_vector(ctx, src0, i, v1));
4735
4736 Temp fmask_desc_ptr = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_FMASK, nullptr, false, false);
4737 coords[count] = adjust_sample_index_using_fmask(ctx, is_array, fmask_load_address, sample_index, fmask_desc_ptr);
4738 } else {
4739 coords[count] = emit_extract_vector(ctx, src2, 0, v1);
4740 }
4741 }
4742
4743 if (gfx9_1d) {
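      /* GFX9 uses the 2D hardware layout for 1D images, so insert a zero for
       * the second coordinate and move the array layer to the third component */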
4744 coords[0] = emit_extract_vector(ctx, src0, 0, v1);
4745 coords.resize(coords.size() + 1);
4746 coords[1] = bld.copy(bld.def(v1), Operand(0u));
4747 if (is_array)
4748 coords[2] = emit_extract_vector(ctx, src0, 1, v1);
4749 } else {
4750 for (int i = 0; i < count; i++)
4751 coords[i] = emit_extract_vector(ctx, src0, i, v1);
4752 }
4753
4754 if (instr->intrinsic == nir_intrinsic_image_deref_load ||
4755 instr->intrinsic == nir_intrinsic_image_deref_store) {
4756 int lod_index = instr->intrinsic == nir_intrinsic_image_deref_load ? 3 : 4;
4757 bool level_zero = nir_src_is_const(instr->src[lod_index]) && nir_src_as_uint(instr->src[lod_index]) == 0;
4758
4759 if (!level_zero)
4760 coords.emplace_back(get_ssa_temp(ctx, instr->src[lod_index].ssa));
4761 }
4762
4763 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, coords.size(), 1)};
4764 for (unsigned i = 0; i < coords.size(); i++)
4765 vec->operands[i] = Operand(coords[i]);
4766 Temp res = {ctx->program->allocateId(), RegClass(RegType::vgpr, coords.size())};
4767 vec->definitions[0] = Definition(res);
4768 ctx->block->instructions.emplace_back(std::move(vec));
4769 return res;
4770 }
4771
4772
4773 void visit_image_load(isel_context *ctx, nir_intrinsic_instr *instr)
4774 {
4775 Builder bld(ctx->program, ctx->block);
4776 const nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
4777 const struct glsl_type *type = glsl_without_array(var->type);
4778 const enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);
4779 bool is_array = glsl_sampler_type_is_array(type);
4780 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
4781
4782 if (dim == GLSL_SAMPLER_DIM_BUF) {
4783 unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa);
4784 unsigned num_channels = util_last_bit(mask);
4785 Temp rsrc = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_BUFFER, nullptr, true, true);
4786 Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
4787
4788 aco_opcode opcode;
4789 switch (num_channels) {
4790 case 1:
4791 opcode = aco_opcode::buffer_load_format_x;
4792 break;
4793 case 2:
4794 opcode = aco_opcode::buffer_load_format_xy;
4795 break;
4796 case 3:
4797 opcode = aco_opcode::buffer_load_format_xyz;
4798 break;
4799 case 4:
4800 opcode = aco_opcode::buffer_load_format_xyzw;
4801 break;
4802 default:
4803 unreachable(">4 channel buffer image load");
4804 }
4805 aco_ptr<MUBUF_instruction> load{create_instruction<MUBUF_instruction>(opcode, Format::MUBUF, 3, 1)};
4806 load->operands[0] = Operand(rsrc);
4807 load->operands[1] = Operand(vindex);
4808 load->operands[2] = Operand((uint32_t) 0);
4809 Temp tmp;
4810 if (num_channels == instr->dest.ssa.num_components && dst.type() == RegType::vgpr)
4811 tmp = dst;
4812 else
4813 tmp = {ctx->program->allocateId(), RegClass(RegType::vgpr, num_channels)};
4814 load->definitions[0] = Definition(tmp);
4815 load->idxen = true;
4816 load->glc = var->data.access & (ACCESS_VOLATILE | ACCESS_COHERENT);
4817 load->dlc = load->glc && ctx->options->chip_class >= GFX10;
4818 load->barrier = barrier_image;
4819 ctx->block->instructions.emplace_back(std::move(load));
4820
4821 expand_vector(ctx, tmp, dst, instr->dest.ssa.num_components, (1 << num_channels) - 1);
4822 return;
4823 }
4824
4825 Temp coords = get_image_coords(ctx, instr, type);
4826 Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_IMAGE, nullptr, true, true);
4827
4828 unsigned dmask = nir_ssa_def_components_read(&instr->dest.ssa);
4829 unsigned num_components = util_bitcount(dmask);
4830 Temp tmp;
4831 if (num_components == instr->dest.ssa.num_components && dst.type() == RegType::vgpr)
4832 tmp = dst;
4833 else
4834 tmp = {ctx->program->allocateId(), RegClass(RegType::vgpr, num_components)};
4835
4836 bool level_zero = nir_src_is_const(instr->src[3]) && nir_src_as_uint(instr->src[3]) == 0;
4837 aco_opcode opcode = level_zero ? aco_opcode::image_load : aco_opcode::image_load_mip;
4838
4839 aco_ptr<MIMG_instruction> load{create_instruction<MIMG_instruction>(opcode, Format::MIMG, 3, 1)};
4840 load->operands[0] = Operand(resource);
4841 load->operands[1] = Operand(s4); /* no sampler */
4842 load->operands[2] = Operand(coords);
4843 load->definitions[0] = Definition(tmp);
4844 load->glc = var->data.access & (ACCESS_VOLATILE | ACCESS_COHERENT) ? 1 : 0;
4845 load->dlc = load->glc && ctx->options->chip_class >= GFX10;
4846 load->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);
4847 load->dmask = dmask;
4848 load->unrm = true;
4849 load->da = should_declare_array(ctx, dim, glsl_sampler_type_is_array(type));
4850 load->barrier = barrier_image;
4851 ctx->block->instructions.emplace_back(std::move(load));
4852
4853 expand_vector(ctx, tmp, dst, instr->dest.ssa.num_components, dmask);
4854 return;
4855 }
4856
4857 void visit_image_store(isel_context *ctx, nir_intrinsic_instr *instr)
4858 {
4859 const nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
4860 const struct glsl_type *type = glsl_without_array(var->type);
4861 const enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);
4862 bool is_array = glsl_sampler_type_is_array(type);
4863 Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[3].ssa));
4864
4865 bool glc = ctx->options->chip_class == GFX6 || var->data.access & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE) ? 1 : 0;
4866
4867 if (dim == GLSL_SAMPLER_DIM_BUF) {
4868 Temp rsrc = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_BUFFER, nullptr, true, true);
4869 Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
4870 aco_opcode opcode;
4871 switch (data.size()) {
4872 case 1:
4873 opcode = aco_opcode::buffer_store_format_x;
4874 break;
4875 case 2:
4876 opcode = aco_opcode::buffer_store_format_xy;
4877 break;
4878 case 3:
4879 opcode = aco_opcode::buffer_store_format_xyz;
4880 break;
4881 case 4:
4882 opcode = aco_opcode::buffer_store_format_xyzw;
4883 break;
4884 default:
4885 unreachable(">4 channel buffer image store");
4886 }
4887 aco_ptr<MUBUF_instruction> store{create_instruction<MUBUF_instruction>(opcode, Format::MUBUF, 4, 0)};
4888 store->operands[0] = Operand(rsrc);
4889 store->operands[1] = Operand(vindex);
4890 store->operands[2] = Operand((uint32_t) 0);
4891 store->operands[3] = Operand(data);
4892 store->idxen = true;
4893 store->glc = glc;
4894 store->dlc = false;
4895 store->disable_wqm = true;
4896 store->barrier = barrier_image;
4897 ctx->program->needs_exact = true;
4898 ctx->block->instructions.emplace_back(std::move(store));
4899 return;
4900 }
4901
4902 assert(data.type() == RegType::vgpr);
4903 Temp coords = get_image_coords(ctx, instr, type);
4904 Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_IMAGE, nullptr, true, true);
4905
4906 bool level_zero = nir_src_is_const(instr->src[4]) && nir_src_as_uint(instr->src[4]) == 0;
4907 aco_opcode opcode = level_zero ? aco_opcode::image_store : aco_opcode::image_store_mip;
4908
4909 aco_ptr<MIMG_instruction> store{create_instruction<MIMG_instruction>(opcode, Format::MIMG, 3, 0)};
4910 store->operands[0] = Operand(resource);
4911 store->operands[1] = Operand(data);
4912 store->operands[2] = Operand(coords);
4913 store->glc = glc;
4914 store->dlc = false;
4915 store->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);
4916 store->dmask = (1 << data.size()) - 1;
4917 store->unrm = true;
4918 store->da = should_declare_array(ctx, dim, glsl_sampler_type_is_array(type));
4919 store->disable_wqm = true;
4920 store->barrier = barrier_image;
4921 ctx->program->needs_exact = true;
4922 ctx->block->instructions.emplace_back(std::move(store));
4923 return;
4924 }
4925
4926 void visit_image_atomic(isel_context *ctx, nir_intrinsic_instr *instr)
4927 {
4928 /* return the previous value if dest is ever used */
4929 bool return_previous = false;
4930 nir_foreach_use_safe(use_src, &instr->dest.ssa) {
4931 return_previous = true;
4932 break;
4933 }
4934 nir_foreach_if_use_safe(use_src, &instr->dest.ssa) {
4935 return_previous = true;
4936 break;
4937 }
4938
4939 const nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
4940 const struct glsl_type *type = glsl_without_array(var->type);
4941 const enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);
4942 bool is_array = glsl_sampler_type_is_array(type);
4943 Builder bld(ctx->program, ctx->block);
4944
4945 Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[3].ssa));
4946    assert(data.size() == 1 && "64bit image atomics not yet implemented.");
4947
4948 if (instr->intrinsic == nir_intrinsic_image_deref_atomic_comp_swap)
4949 data = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), get_ssa_temp(ctx, instr->src[4].ssa), data);
4950
4951 aco_opcode buf_op, image_op;
4952 switch (instr->intrinsic) {
4953 case nir_intrinsic_image_deref_atomic_add:
4954 buf_op = aco_opcode::buffer_atomic_add;
4955 image_op = aco_opcode::image_atomic_add;
4956 break;
4957 case nir_intrinsic_image_deref_atomic_umin:
4958 buf_op = aco_opcode::buffer_atomic_umin;
4959 image_op = aco_opcode::image_atomic_umin;
4960 break;
4961 case nir_intrinsic_image_deref_atomic_imin:
4962 buf_op = aco_opcode::buffer_atomic_smin;
4963 image_op = aco_opcode::image_atomic_smin;
4964 break;
4965 case nir_intrinsic_image_deref_atomic_umax:
4966 buf_op = aco_opcode::buffer_atomic_umax;
4967 image_op = aco_opcode::image_atomic_umax;
4968 break;
4969 case nir_intrinsic_image_deref_atomic_imax:
4970 buf_op = aco_opcode::buffer_atomic_smax;
4971 image_op = aco_opcode::image_atomic_smax;
4972 break;
4973 case nir_intrinsic_image_deref_atomic_and:
4974 buf_op = aco_opcode::buffer_atomic_and;
4975 image_op = aco_opcode::image_atomic_and;
4976 break;
4977 case nir_intrinsic_image_deref_atomic_or:
4978 buf_op = aco_opcode::buffer_atomic_or;
4979 image_op = aco_opcode::image_atomic_or;
4980 break;
4981 case nir_intrinsic_image_deref_atomic_xor:
4982 buf_op = aco_opcode::buffer_atomic_xor;
4983 image_op = aco_opcode::image_atomic_xor;
4984 break;
4985 case nir_intrinsic_image_deref_atomic_exchange:
4986 buf_op = aco_opcode::buffer_atomic_swap;
4987 image_op = aco_opcode::image_atomic_swap;
4988 break;
4989 case nir_intrinsic_image_deref_atomic_comp_swap:
4990 buf_op = aco_opcode::buffer_atomic_cmpswap;
4991 image_op = aco_opcode::image_atomic_cmpswap;
4992 break;
4993 default:
4994 unreachable("visit_image_atomic should only be called with nir_intrinsic_image_deref_atomic_* instructions.");
4995 }
4996
4997 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
4998
4999 if (dim == GLSL_SAMPLER_DIM_BUF) {
5000 Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
5001 Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_BUFFER, nullptr, true, true);
5002 //assert(ctx->options->chip_class < GFX9 && "GFX9 stride size workaround not yet implemented.");
5003 aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(buf_op, Format::MUBUF, 4, return_previous ? 1 : 0)};
5004 mubuf->operands[0] = Operand(resource);
5005 mubuf->operands[1] = Operand(vindex);
5006 mubuf->operands[2] = Operand((uint32_t)0);
5007 mubuf->operands[3] = Operand(data);
5008 if (return_previous)
5009 mubuf->definitions[0] = Definition(dst);
5010 mubuf->offset = 0;
5011 mubuf->idxen = true;
5012 mubuf->glc = return_previous;
5013 mubuf->dlc = false; /* Not needed for atomics */
5014 mubuf->disable_wqm = true;
5015 mubuf->barrier = barrier_image;
5016 ctx->program->needs_exact = true;
5017 ctx->block->instructions.emplace_back(std::move(mubuf));
5018 return;
5019 }
5020
5021 Temp coords = get_image_coords(ctx, instr, type);
5022 Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_IMAGE, nullptr, true, true);
5023 aco_ptr<MIMG_instruction> mimg{create_instruction<MIMG_instruction>(image_op, Format::MIMG, 3, return_previous ? 1 : 0)};
5024 mimg->operands[0] = Operand(resource);
5025 mimg->operands[1] = Operand(data);
5026 mimg->operands[2] = Operand(coords);
5027 if (return_previous)
5028 mimg->definitions[0] = Definition(dst);
5029 mimg->glc = return_previous;
5030 mimg->dlc = false; /* Not needed for atomics */
5031 mimg->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);
5032 mimg->dmask = (1 << data.size()) - 1;
5033 mimg->unrm = true;
5034 mimg->da = should_declare_array(ctx, dim, glsl_sampler_type_is_array(type));
5035 mimg->disable_wqm = true;
5036 mimg->barrier = barrier_image;
5037 ctx->program->needs_exact = true;
5038 ctx->block->instructions.emplace_back(std::move(mimg));
5039 return;
5040 }
5041
5042 void get_buffer_size(isel_context *ctx, Temp desc, Temp dst, bool in_elements)
5043 {
5044 if (in_elements && ctx->options->chip_class == GFX8) {
5045 /* we only have to divide by 1, 2, 4, 8, 12 or 16 */
5046 Builder bld(ctx->program, ctx->block);
5047
5048 Temp size = emit_extract_vector(ctx, desc, 2, s1);
5049
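      /* stride is a power of two or 12: pre-divide the size by 3 with a
       * multiply-high (0xaaaaaaab == ceil(2^33 / 3), so (size * 0xaaaaaaab) >> 33
       * equals size / 3 for any 32-bit size), select that result only when
       * stride == 12, and let the shift by s_ff1(stride) below handle the
       * remaining power-of-two factor */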
5050 Temp size_div3 = bld.vop3(aco_opcode::v_mul_hi_u32, bld.def(v1), bld.copy(bld.def(v1), Operand(0xaaaaaaabu)), size);
5051 size_div3 = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.as_uniform(size_div3), Operand(1u));
5052
5053 Temp stride = emit_extract_vector(ctx, desc, 1, s1);
5054 stride = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), stride, Operand((5u << 16) | 16u));
5055
5056 Temp is12 = bld.sopc(aco_opcode::s_cmp_eq_i32, bld.def(s1, scc), stride, Operand(12u));
5057 size = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), size_div3, size, bld.scc(is12));
5058
5059 Temp shr_dst = dst.type() == RegType::vgpr ? bld.tmp(s1) : dst;
5060 bld.sop2(aco_opcode::s_lshr_b32, Definition(shr_dst), bld.def(s1, scc),
5061 size, bld.sop1(aco_opcode::s_ff1_i32_b32, bld.def(s1), stride));
5062 if (dst.type() == RegType::vgpr)
5063 bld.copy(Definition(dst), shr_dst);
5064
5065 /* TODO: we can probably calculate this faster with v_skip when stride != 12 */
5066 } else {
5067 emit_extract_vector(ctx, desc, 2, dst);
5068 }
5069 }
5070
5071 void visit_image_size(isel_context *ctx, nir_intrinsic_instr *instr)
5072 {
5073 const nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
5074 const struct glsl_type *type = glsl_without_array(var->type);
5075 const enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);
5076 bool is_array = glsl_sampler_type_is_array(type);
5077 Builder bld(ctx->program, ctx->block);
5078
5079 if (glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_BUF) {
5080 Temp desc = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_BUFFER, NULL, true, false);
5081 return get_buffer_size(ctx, desc, get_ssa_temp(ctx, &instr->dest.ssa), true);
5082 }
5083
5084 /* LOD */
5085 Temp lod = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0u));
5086
5087 /* Resource */
5088 Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_IMAGE, NULL, true, false);
5089
5090 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5091
5092 aco_ptr<MIMG_instruction> mimg{create_instruction<MIMG_instruction>(aco_opcode::image_get_resinfo, Format::MIMG, 3, 1)};
5093 mimg->operands[0] = Operand(resource);
5094 mimg->operands[1] = Operand(s4); /* no sampler */
5095 mimg->operands[2] = Operand(lod);
5096 uint8_t& dmask = mimg->dmask;
5097 mimg->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);
5098 mimg->dmask = (1 << instr->dest.ssa.num_components) - 1;
5099 mimg->da = glsl_sampler_type_is_array(type);
5100 mimg->can_reorder = true;
5101 Definition& def = mimg->definitions[0];
5102 ctx->block->instructions.emplace_back(std::move(mimg));
5103
5104 if (glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_CUBE &&
5105 glsl_sampler_type_is_array(type)) {
5106
5107 assert(instr->dest.ssa.num_components == 3);
5108 Temp tmp = {ctx->program->allocateId(), v3};
5109 def = Definition(tmp);
5110 emit_split_vector(ctx, tmp, 3);
5111
5112 /* divide 3rd value by 6 by multiplying with magic number */
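      /* 0x2AAAAAAB == (2^32 + 2) / 6, so v_mul_hi_i32(x, 0x2AAAAAAB) yields
       * x / 6 exactly for the non-negative layer counts seen here */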
5113 Temp c = bld.copy(bld.def(s1), Operand((uint32_t) 0x2AAAAAAB));
5114 Temp by_6 = bld.vop3(aco_opcode::v_mul_hi_i32, bld.def(v1), emit_extract_vector(ctx, tmp, 2, v1), c);
5115
5116 bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
5117 emit_extract_vector(ctx, tmp, 0, v1),
5118 emit_extract_vector(ctx, tmp, 1, v1),
5119 by_6);
5120
5121 } else if (ctx->options->chip_class == GFX9 &&
5122 glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_1D &&
5123 glsl_sampler_type_is_array(type)) {
5124 assert(instr->dest.ssa.num_components == 2);
5125 def = Definition(dst);
5126 dmask = 0x5;
5127 } else {
5128 def = Definition(dst);
5129 }
5130
5131 emit_split_vector(ctx, dst, instr->dest.ssa.num_components);
5132 }
5133
5134 void visit_load_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
5135 {
5136 Builder bld(ctx->program, ctx->block);
5137 unsigned num_components = instr->num_components;
5138
5139 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5140 Temp rsrc = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
5141 rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u));
5142
5143 bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT);
5144 load_buffer(ctx, num_components, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa), glc, false);
5145 }
5146
5147 void visit_store_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
5148 {
5149 Builder bld(ctx->program, ctx->block);
5150 Temp data = get_ssa_temp(ctx, instr->src[0].ssa);
5151 unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
5152 unsigned writemask = nir_intrinsic_write_mask(instr);
5153 Temp offset = get_ssa_temp(ctx, instr->src[2].ssa);
5154
5155 Temp rsrc = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
5156 rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u));
5157
5158 bool smem = !ctx->divergent_vals[instr->src[2].ssa->index] &&
5159 ctx->options->chip_class >= GFX8;
5160 if (smem)
5161 offset = bld.as_uniform(offset);
5162 bool smem_nonfs = smem && ctx->stage != fragment_fs;
5163
5164 while (writemask) {
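      /* emit one store per consecutive run of enabled writemask components */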
5165 int start, count;
5166 u_bit_scan_consecutive_range(&writemask, &start, &count);
5167 if (count == 3 && (smem || ctx->options->chip_class == GFX6)) {
5168 /* GFX6 doesn't support storing vec3, split it. */
5169 writemask |= 1u << (start + 2);
5170 count = 2;
5171 }
5172 int num_bytes = count * elem_size_bytes;
5173
5174 if (num_bytes > 16) {
5175 assert(elem_size_bytes == 8);
5176 writemask |= (((count - 2) << 1) - 1) << (start + 2);
5177 count = 2;
5178 num_bytes = 16;
5179 }
5180
5181 // TODO: check alignment of sub-dword stores
5182       // TODO: split 3-byte stores; there is no store instruction for that size
5183
5184 Temp write_data;
5185 if (count != instr->num_components) {
5186 emit_split_vector(ctx, data, instr->num_components);
5187 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
5188 for (int i = 0; i < count; i++) {
5189 Temp elem = emit_extract_vector(ctx, data, start + i, RegClass(data.type(), elem_size_bytes / 4));
5190 vec->operands[i] = Operand(smem_nonfs ? bld.as_uniform(elem) : elem);
5191 }
5192 write_data = bld.tmp(!smem ? RegType::vgpr : smem_nonfs ? RegType::sgpr : data.type(), count * elem_size_bytes / 4);
5193 vec->definitions[0] = Definition(write_data);
5194 ctx->block->instructions.emplace_back(std::move(vec));
5195 } else if (!smem && data.type() != RegType::vgpr) {
5196 assert(num_bytes % 4 == 0);
5197 write_data = bld.copy(bld.def(RegType::vgpr, num_bytes / 4), data);
5198 } else if (smem_nonfs && data.type() == RegType::vgpr) {
5199 assert(num_bytes % 4 == 0);
5200 write_data = bld.as_uniform(data);
5201 } else {
5202 write_data = data;
5203 }
5204
5205 aco_opcode vmem_op, smem_op;
5206 switch (num_bytes) {
5207 case 4:
5208 vmem_op = aco_opcode::buffer_store_dword;
5209 smem_op = aco_opcode::s_buffer_store_dword;
5210 break;
5211 case 8:
5212 vmem_op = aco_opcode::buffer_store_dwordx2;
5213 smem_op = aco_opcode::s_buffer_store_dwordx2;
5214 break;
5215 case 12:
5216 vmem_op = aco_opcode::buffer_store_dwordx3;
5217 smem_op = aco_opcode::last_opcode;
5218 assert(!smem && ctx->options->chip_class > GFX6);
5219 break;
5220 case 16:
5221 vmem_op = aco_opcode::buffer_store_dwordx4;
5222 smem_op = aco_opcode::s_buffer_store_dwordx4;
5223 break;
5224 default:
5225 unreachable("Store SSBO not implemented for this size.");
5226 }
5227 if (ctx->stage == fragment_fs)
5228 smem_op = aco_opcode::p_fs_buffer_store_smem;
5229
5230 if (smem) {
5231 aco_ptr<SMEM_instruction> store{create_instruction<SMEM_instruction>(smem_op, Format::SMEM, 3, 0)};
5232 store->operands[0] = Operand(rsrc);
5233 if (start) {
5234 Temp off = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
5235 offset, Operand(start * elem_size_bytes));
5236 store->operands[1] = Operand(off);
5237 } else {
5238 store->operands[1] = Operand(offset);
5239 }
5240 if (smem_op != aco_opcode::p_fs_buffer_store_smem)
5241 store->operands[1].setFixed(m0);
5242 store->operands[2] = Operand(write_data);
5243 store->glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
5244 store->dlc = false;
5245 store->disable_wqm = true;
5246 store->barrier = barrier_buffer;
5247 ctx->block->instructions.emplace_back(std::move(store));
5248 ctx->program->wb_smem_l1_on_end = true;
5249 if (smem_op == aco_opcode::p_fs_buffer_store_smem) {
5250 ctx->block->kind |= block_kind_needs_lowering;
5251 ctx->program->needs_exact = true;
5252 }
5253 } else {
5254 aco_ptr<MUBUF_instruction> store{create_instruction<MUBUF_instruction>(vmem_op, Format::MUBUF, 4, 0)};
5255 store->operands[0] = Operand(rsrc);
5256 store->operands[1] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
5257 store->operands[2] = offset.type() == RegType::sgpr ? Operand(offset) : Operand((uint32_t) 0);
5258 store->operands[3] = Operand(write_data);
5259 store->offset = start * elem_size_bytes;
5260 store->offen = (offset.type() == RegType::vgpr);
5261 store->glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
5262 store->dlc = false;
5263 store->disable_wqm = true;
5264 store->barrier = barrier_buffer;
5265 ctx->program->needs_exact = true;
5266 ctx->block->instructions.emplace_back(std::move(store));
5267 }
5268 }
5269 }
5270
5271 void visit_atomic_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
5272 {
5273 /* return the previous value if dest is ever used */
5274 bool return_previous = false;
5275 nir_foreach_use_safe(use_src, &instr->dest.ssa) {
5276 return_previous = true;
5277 break;
5278 }
5279 nir_foreach_if_use_safe(use_src, &instr->dest.ssa) {
5280 return_previous = true;
5281 break;
5282 }
5283
5284 Builder bld(ctx->program, ctx->block);
5285 Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[2].ssa));
5286
5287 if (instr->intrinsic == nir_intrinsic_ssbo_atomic_comp_swap)
5288 data = bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, data.size() * 2),
5289 get_ssa_temp(ctx, instr->src[3].ssa), data);
5290
5291 Temp offset = get_ssa_temp(ctx, instr->src[1].ssa);
5292 Temp rsrc = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
5293 rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u));
5294
5295 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5296
5297 aco_opcode op32, op64;
5298 switch (instr->intrinsic) {
5299 case nir_intrinsic_ssbo_atomic_add:
5300 op32 = aco_opcode::buffer_atomic_add;
5301 op64 = aco_opcode::buffer_atomic_add_x2;
5302 break;
5303 case nir_intrinsic_ssbo_atomic_imin:
5304 op32 = aco_opcode::buffer_atomic_smin;
5305 op64 = aco_opcode::buffer_atomic_smin_x2;
5306 break;
5307 case nir_intrinsic_ssbo_atomic_umin:
5308 op32 = aco_opcode::buffer_atomic_umin;
5309 op64 = aco_opcode::buffer_atomic_umin_x2;
5310 break;
5311 case nir_intrinsic_ssbo_atomic_imax:
5312 op32 = aco_opcode::buffer_atomic_smax;
5313 op64 = aco_opcode::buffer_atomic_smax_x2;
5314 break;
5315 case nir_intrinsic_ssbo_atomic_umax:
5316 op32 = aco_opcode::buffer_atomic_umax;
5317 op64 = aco_opcode::buffer_atomic_umax_x2;
5318 break;
5319 case nir_intrinsic_ssbo_atomic_and:
5320 op32 = aco_opcode::buffer_atomic_and;
5321 op64 = aco_opcode::buffer_atomic_and_x2;
5322 break;
5323 case nir_intrinsic_ssbo_atomic_or:
5324 op32 = aco_opcode::buffer_atomic_or;
5325 op64 = aco_opcode::buffer_atomic_or_x2;
5326 break;
5327 case nir_intrinsic_ssbo_atomic_xor:
5328 op32 = aco_opcode::buffer_atomic_xor;
5329 op64 = aco_opcode::buffer_atomic_xor_x2;
5330 break;
5331 case nir_intrinsic_ssbo_atomic_exchange:
5332 op32 = aco_opcode::buffer_atomic_swap;
5333 op64 = aco_opcode::buffer_atomic_swap_x2;
5334 break;
5335 case nir_intrinsic_ssbo_atomic_comp_swap:
5336 op32 = aco_opcode::buffer_atomic_cmpswap;
5337 op64 = aco_opcode::buffer_atomic_cmpswap_x2;
5338 break;
5339 default:
5340 unreachable("visit_atomic_ssbo should only be called with nir_intrinsic_ssbo_atomic_* instructions.");
5341 }
5342 aco_opcode op = instr->dest.ssa.bit_size == 32 ? op32 : op64;
5343 aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, return_previous ? 1 : 0)};
5344 mubuf->operands[0] = Operand(rsrc);
5345 mubuf->operands[1] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
5346 mubuf->operands[2] = offset.type() == RegType::sgpr ? Operand(offset) : Operand((uint32_t) 0);
5347 mubuf->operands[3] = Operand(data);
5348 if (return_previous)
5349 mubuf->definitions[0] = Definition(dst);
5350 mubuf->offset = 0;
5351 mubuf->offen = (offset.type() == RegType::vgpr);
5352 mubuf->glc = return_previous;
5353 mubuf->dlc = false; /* Not needed for atomics */
5354 mubuf->disable_wqm = true;
5355 mubuf->barrier = barrier_buffer;
5356 ctx->program->needs_exact = true;
5357 ctx->block->instructions.emplace_back(std::move(mubuf));
5358 }
5359
5360 void visit_get_buffer_size(isel_context *ctx, nir_intrinsic_instr *instr) {
5361
5362 Temp index = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
5363 Builder bld(ctx->program, ctx->block);
5364 Temp desc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), index, Operand(0u));
5365 get_buffer_size(ctx, desc, get_ssa_temp(ctx, &instr->dest.ssa), false);
5366 }
5367
5368 Temp get_gfx6_global_rsrc(Builder& bld, Temp addr)
5369 {
5370 uint32_t rsrc_conf = S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
5371 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
5372
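   /* with a VGPR address, the descriptor base stays zero and the address goes
    * through the MUBUF addr64 path; with an SGPR address, it becomes the
    * descriptor base. num_records is set to ~0u in both cases. */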
5373 if (addr.type() == RegType::vgpr)
5374 return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), Operand(0u), Operand(0u), Operand(-1u), Operand(rsrc_conf));
5375 return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), addr, Operand(-1u), Operand(rsrc_conf));
5376 }
5377
5378 void visit_load_global(isel_context *ctx, nir_intrinsic_instr *instr)
5379 {
5380 Builder bld(ctx->program, ctx->block);
5381 unsigned num_components = instr->num_components;
5382 unsigned num_bytes = num_components * instr->dest.ssa.bit_size / 8;
5383
5384 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5385 Temp addr = get_ssa_temp(ctx, instr->src[0].ssa);
5386
5387 bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT);
5388 bool dlc = glc && ctx->options->chip_class >= GFX10;
5389 aco_opcode op;
5390 if (dst.type() == RegType::vgpr || (glc && ctx->options->chip_class < GFX8)) {
5391 bool global = ctx->options->chip_class >= GFX9;
5392
5393 if (ctx->options->chip_class >= GFX7) {
5394 aco_opcode op;
5395 switch (num_bytes) {
5396 case 4:
5397 op = global ? aco_opcode::global_load_dword : aco_opcode::flat_load_dword;
5398 break;
5399 case 8:
5400 op = global ? aco_opcode::global_load_dwordx2 : aco_opcode::flat_load_dwordx2;
5401 break;
5402 case 12:
5403 op = global ? aco_opcode::global_load_dwordx3 : aco_opcode::flat_load_dwordx3;
5404 break;
5405 case 16:
5406 op = global ? aco_opcode::global_load_dwordx4 : aco_opcode::flat_load_dwordx4;
5407 break;
5408 default:
5409 unreachable("load_global not implemented for this size.");
5410 }
5411
5412 aco_ptr<FLAT_instruction> flat{create_instruction<FLAT_instruction>(op, global ? Format::GLOBAL : Format::FLAT, 2, 1)};
5413 flat->operands[0] = Operand(addr);
5414 flat->operands[1] = Operand(s1);
5415 flat->glc = glc;
5416 flat->dlc = dlc;
5417 flat->barrier = barrier_buffer;
5418
5419 if (dst.type() == RegType::sgpr) {
5420 Temp vec = bld.tmp(RegType::vgpr, dst.size());
5421 flat->definitions[0] = Definition(vec);
5422 ctx->block->instructions.emplace_back(std::move(flat));
5423 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec);
5424 } else {
5425 flat->definitions[0] = Definition(dst);
5426 ctx->block->instructions.emplace_back(std::move(flat));
5427 }
5428 emit_split_vector(ctx, dst, num_components);
5429 } else {
5430 assert(ctx->options->chip_class == GFX6);
5431
5432 /* GFX6 doesn't support loading vec3, expand to vec4. */
5433 num_bytes = num_bytes == 12 ? 16 : num_bytes;
5434
5435 aco_opcode op;
5436 switch (num_bytes) {
5437 case 4:
5438 op = aco_opcode::buffer_load_dword;
5439 break;
5440 case 8:
5441 op = aco_opcode::buffer_load_dwordx2;
5442 break;
5443 case 16:
5444 op = aco_opcode::buffer_load_dwordx4;
5445 break;
5446 default:
5447 unreachable("load_global not implemented for this size.");
5448 }
5449
5450 Temp rsrc = get_gfx6_global_rsrc(bld, addr);
5451
5452 aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)};
5453 mubuf->operands[0] = Operand(rsrc);
5454 mubuf->operands[1] = addr.type() == RegType::vgpr ? Operand(addr) : Operand(v1);
5455 mubuf->operands[2] = Operand(0u);
5456 mubuf->glc = glc;
5457 mubuf->dlc = false;
5458 mubuf->offset = 0;
5459 mubuf->addr64 = addr.type() == RegType::vgpr;
5460 mubuf->disable_wqm = false;
5461 mubuf->barrier = barrier_buffer;
5462 aco_ptr<Instruction> instr = std::move(mubuf);
5463
5464          /* trim the over-sized vec4 load back to vec3 */
5465 if (dst.size() == 3) {
5466 Temp vec = bld.tmp(v4);
5467 instr->definitions[0] = Definition(vec);
5468 bld.insert(std::move(instr));
5469 emit_split_vector(ctx, vec, 4);
5470
5471 instr.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, 3, 1));
5472 instr->operands[0] = Operand(emit_extract_vector(ctx, vec, 0, v1));
5473 instr->operands[1] = Operand(emit_extract_vector(ctx, vec, 1, v1));
5474 instr->operands[2] = Operand(emit_extract_vector(ctx, vec, 2, v1));
5475 }
5476
5477 if (dst.type() == RegType::sgpr) {
5478 Temp vec = bld.tmp(RegType::vgpr, dst.size());
5479 instr->definitions[0] = Definition(vec);
5480 bld.insert(std::move(instr));
5481 expand_vector(ctx, vec, dst, num_components, (1 << num_components) - 1);
5482 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec);
5483 } else {
5484 instr->definitions[0] = Definition(dst);
5485 bld.insert(std::move(instr));
5486 emit_split_vector(ctx, dst, num_components);
5487 }
5488 }
5489 } else {
5490 switch (num_bytes) {
5491 case 4:
5492 op = aco_opcode::s_load_dword;
5493 break;
5494 case 8:
5495 op = aco_opcode::s_load_dwordx2;
5496 break;
5497 case 12:
5498 case 16:
5499 op = aco_opcode::s_load_dwordx4;
5500 break;
5501 default:
5502 unreachable("load_global not implemented for this size.");
5503 }
5504 aco_ptr<SMEM_instruction> load{create_instruction<SMEM_instruction>(op, Format::SMEM, 2, 1)};
5505 load->operands[0] = Operand(addr);
5506 load->operands[1] = Operand(0u);
5507 load->definitions[0] = Definition(dst);
5508 load->glc = glc;
5509 load->dlc = dlc;
5510 load->barrier = barrier_buffer;
5511 assert(ctx->options->chip_class >= GFX8 || !glc);
5512
5513 if (dst.size() == 3) {
5514 /* trim vector */
5515 Temp vec = bld.tmp(s4);
5516 load->definitions[0] = Definition(vec);
5517 ctx->block->instructions.emplace_back(std::move(load));
5518 emit_split_vector(ctx, vec, 4);
5519
5520 bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
5521 emit_extract_vector(ctx, vec, 0, s1),
5522 emit_extract_vector(ctx, vec, 1, s1),
5523 emit_extract_vector(ctx, vec, 2, s1));
5524 } else {
5525 ctx->block->instructions.emplace_back(std::move(load));
5526 }
5527 }
5528 }
5529
5530 void visit_store_global(isel_context *ctx, nir_intrinsic_instr *instr)
5531 {
5532 Builder bld(ctx->program, ctx->block);
5533 unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
5534
5535 Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
5536 Temp addr = get_ssa_temp(ctx, instr->src[1].ssa);
5537
5538 if (ctx->options->chip_class >= GFX7)
5539 addr = as_vgpr(ctx, addr);
5540
5541 unsigned writemask = nir_intrinsic_write_mask(instr);
5542 while (writemask) {
5543 int start, count;
5544 u_bit_scan_consecutive_range(&writemask, &start, &count);
5545 if (count == 3 && ctx->options->chip_class == GFX6) {
5546 /* GFX6 doesn't support storing vec3, split it. */
5547 writemask |= 1u << (start + 2);
5548 count = 2;
5549 }
5550 unsigned num_bytes = count * elem_size_bytes;
5551
5552 Temp write_data = data;
5553 if (count != instr->num_components) {
5554 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
5555 for (int i = 0; i < count; i++)
5556 vec->operands[i] = Operand(emit_extract_vector(ctx, data, start + i, v1));
5557 write_data = bld.tmp(RegType::vgpr, count);
5558 vec->definitions[0] = Definition(write_data);
5559 ctx->block->instructions.emplace_back(std::move(vec));
5560 }
5561
5562 bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
5563 unsigned offset = start * elem_size_bytes;
5564
5565 if (ctx->options->chip_class >= GFX7) {
5566 if (offset > 0 && ctx->options->chip_class < GFX9) {
5567 Temp addr0 = bld.tmp(v1), addr1 = bld.tmp(v1);
5568 Temp new_addr0 = bld.tmp(v1), new_addr1 = bld.tmp(v1);
5569 Temp carry = bld.tmp(bld.lm);
5570 bld.pseudo(aco_opcode::p_split_vector, Definition(addr0), Definition(addr1), addr);
5571
5572 bld.vop2(aco_opcode::v_add_co_u32, Definition(new_addr0), bld.hint_vcc(Definition(carry)),
5573 Operand(offset), addr0);
5574 bld.vop2(aco_opcode::v_addc_co_u32, Definition(new_addr1), bld.def(bld.lm),
5575 Operand(0u), addr1,
5576 carry).def(1).setHint(vcc);
5577
5578 addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), new_addr0, new_addr1);
5579
5580 offset = 0;
5581 }
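 /* Sketch of the fix-up above: FLAT on GFX7/GFX8 has no usable immediate offset, so the
  * constant offset is folded into the 64-bit address:
  *   new_addr_lo = addr_lo + offset        (v_add_co_u32, carry out)
  *   new_addr_hi = addr_hi + 0 + carry     (v_addc_co_u32, carry in)
  * GFX9+ keeps the offset in the instruction encoding instead.
  */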
5582
5583 bool global = ctx->options->chip_class >= GFX9;
5584 aco_opcode op;
5585 switch (num_bytes) {
5586 case 4:
5587 op = global ? aco_opcode::global_store_dword : aco_opcode::flat_store_dword;
5588 break;
5589 case 8:
5590 op = global ? aco_opcode::global_store_dwordx2 : aco_opcode::flat_store_dwordx2;
5591 break;
5592 case 12:
5593 op = global ? aco_opcode::global_store_dwordx3 : aco_opcode::flat_store_dwordx3;
5594 break;
5595 case 16:
5596 op = global ? aco_opcode::global_store_dwordx4 : aco_opcode::flat_store_dwordx4;
5597 break;
5598 default:
5599 unreachable("store_global not implemented for this size.");
5600 }
5601
5602 aco_ptr<FLAT_instruction> flat{create_instruction<FLAT_instruction>(op, global ? Format::GLOBAL : Format::FLAT, 3, 0)};
5603 flat->operands[0] = Operand(addr);
5604 flat->operands[1] = Operand(s1);
5605 flat->operands[2] = Operand(data);
5606 flat->glc = glc;
5607 flat->dlc = false;
5608 flat->offset = offset;
5609 flat->disable_wqm = true;
5610 flat->barrier = barrier_buffer;
5611 ctx->program->needs_exact = true;
5612 ctx->block->instructions.emplace_back(std::move(flat));
5613 } else {
5614 assert(ctx->options->chip_class == GFX6);
5615
5616 aco_opcode op;
5617 switch (num_bytes) {
5618 case 4:
5619 op = aco_opcode::buffer_store_dword;
5620 break;
5621 case 8:
5622 op = aco_opcode::buffer_store_dwordx2;
5623 break;
5624 case 16:
5625 op = aco_opcode::buffer_store_dwordx4;
5626 break;
5627 default:
5628 unreachable("store_global not implemented for this size.");
5629 }
5630
5631 Temp rsrc = get_gfx6_global_rsrc(bld, addr);
5632
5633 aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, 0)};
5634 mubuf->operands[0] = Operand(rsrc);
5635 mubuf->operands[1] = addr.type() == RegType::vgpr ? Operand(addr) : Operand(v1);
5636 mubuf->operands[2] = Operand(0u);
5637 mubuf->operands[3] = Operand(write_data);
5638 mubuf->glc = glc;
5639 mubuf->dlc = false;
5640 mubuf->offset = offset;
5641 mubuf->addr64 = addr.type() == RegType::vgpr;
5642 mubuf->disable_wqm = true;
5643 mubuf->barrier = barrier_buffer;
5644 ctx->program->needs_exact = true;
5645 ctx->block->instructions.emplace_back(std::move(mubuf));
5646 }
5647 }
5648 }
5649
5650 void visit_global_atomic(isel_context *ctx, nir_intrinsic_instr *instr)
5651 {
5652 /* return the previous value if dest is ever used */
5653 bool return_previous = false;
5654 nir_foreach_use_safe(use_src, &instr->dest.ssa) {
5655 return_previous = true;
5656 break;
5657 }
5658 nir_foreach_if_use_safe(use_src, &instr->dest.ssa) {
5659 return_previous = true;
5660 break;
5661 }
5662
5663 Builder bld(ctx->program, ctx->block);
5664 Temp addr = get_ssa_temp(ctx, instr->src[0].ssa);
5665 Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
5666
5667 if (ctx->options->chip_class >= GFX7)
5668 addr = as_vgpr(ctx, addr);
5669
5670 if (instr->intrinsic == nir_intrinsic_global_atomic_comp_swap)
5671 data = bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, data.size() * 2),
5672 get_ssa_temp(ctx, instr->src[2].ssa), data);
5673
5674 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5675
5676 aco_opcode op32, op64;
5677
5678 if (ctx->options->chip_class >= GFX7) {
5679 bool global = ctx->options->chip_class >= GFX9;
5680 switch (instr->intrinsic) {
5681 case nir_intrinsic_global_atomic_add:
5682 op32 = global ? aco_opcode::global_atomic_add : aco_opcode::flat_atomic_add;
5683 op64 = global ? aco_opcode::global_atomic_add_x2 : aco_opcode::flat_atomic_add_x2;
5684 break;
5685 case nir_intrinsic_global_atomic_imin:
5686 op32 = global ? aco_opcode::global_atomic_smin : aco_opcode::flat_atomic_smin;
5687 op64 = global ? aco_opcode::global_atomic_smin_x2 : aco_opcode::flat_atomic_smin_x2;
5688 break;
5689 case nir_intrinsic_global_atomic_umin:
5690 op32 = global ? aco_opcode::global_atomic_umin : aco_opcode::flat_atomic_umin;
5691 op64 = global ? aco_opcode::global_atomic_umin_x2 : aco_opcode::flat_atomic_umin_x2;
5692 break;
5693 case nir_intrinsic_global_atomic_imax:
5694 op32 = global ? aco_opcode::global_atomic_smax : aco_opcode::flat_atomic_smax;
5695 op64 = global ? aco_opcode::global_atomic_smax_x2 : aco_opcode::flat_atomic_smax_x2;
5696 break;
5697 case nir_intrinsic_global_atomic_umax:
5698 op32 = global ? aco_opcode::global_atomic_umax : aco_opcode::flat_atomic_umax;
5699 op64 = global ? aco_opcode::global_atomic_umax_x2 : aco_opcode::flat_atomic_umax_x2;
5700 break;
5701 case nir_intrinsic_global_atomic_and:
5702 op32 = global ? aco_opcode::global_atomic_and : aco_opcode::flat_atomic_and;
5703 op64 = global ? aco_opcode::global_atomic_and_x2 : aco_opcode::flat_atomic_and_x2;
5704 break;
5705 case nir_intrinsic_global_atomic_or:
5706 op32 = global ? aco_opcode::global_atomic_or : aco_opcode::flat_atomic_or;
5707 op64 = global ? aco_opcode::global_atomic_or_x2 : aco_opcode::flat_atomic_or_x2;
5708 break;
5709 case nir_intrinsic_global_atomic_xor:
5710 op32 = global ? aco_opcode::global_atomic_xor : aco_opcode::flat_atomic_xor;
5711 op64 = global ? aco_opcode::global_atomic_xor_x2 : aco_opcode::flat_atomic_xor_x2;
5712 break;
5713 case nir_intrinsic_global_atomic_exchange:
5714 op32 = global ? aco_opcode::global_atomic_swap : aco_opcode::flat_atomic_swap;
5715 op64 = global ? aco_opcode::global_atomic_swap_x2 : aco_opcode::flat_atomic_swap_x2;
5716 break;
5717 case nir_intrinsic_global_atomic_comp_swap:
5718 op32 = global ? aco_opcode::global_atomic_cmpswap : aco_opcode::flat_atomic_cmpswap;
5719 op64 = global ? aco_opcode::global_atomic_cmpswap_x2 : aco_opcode::flat_atomic_cmpswap_x2;
5720 break;
5721 default:
5722 unreachable("visit_atomic_global should only be called with nir_intrinsic_global_atomic_* instructions.");
5723 }
5724
5725 aco_opcode op = instr->dest.ssa.bit_size == 32 ? op32 : op64;
5726 aco_ptr<FLAT_instruction> flat{create_instruction<FLAT_instruction>(op, global ? Format::GLOBAL : Format::FLAT, 3, return_previous ? 1 : 0)};
5727 flat->operands[0] = Operand(addr);
5728 flat->operands[1] = Operand(s1);
5729 flat->operands[2] = Operand(data);
5730 if (return_previous)
5731 flat->definitions[0] = Definition(dst);
5732 flat->glc = return_previous;
5733 flat->dlc = false; /* Not needed for atomics */
5734 flat->offset = 0;
5735 flat->disable_wqm = true;
5736 flat->barrier = barrier_buffer;
5737 ctx->program->needs_exact = true;
5738 ctx->block->instructions.emplace_back(std::move(flat));
5739 } else {
5740 assert(ctx->options->chip_class == GFX6);
5741
5742 switch (instr->intrinsic) {
5743 case nir_intrinsic_global_atomic_add:
5744 op32 = aco_opcode::buffer_atomic_add;
5745 op64 = aco_opcode::buffer_atomic_add_x2;
5746 break;
5747 case nir_intrinsic_global_atomic_imin:
5748 op32 = aco_opcode::buffer_atomic_smin;
5749 op64 = aco_opcode::buffer_atomic_smin_x2;
5750 break;
5751 case nir_intrinsic_global_atomic_umin:
5752 op32 = aco_opcode::buffer_atomic_umin;
5753 op64 = aco_opcode::buffer_atomic_umin_x2;
5754 break;
5755 case nir_intrinsic_global_atomic_imax:
5756 op32 = aco_opcode::buffer_atomic_smax;
5757 op64 = aco_opcode::buffer_atomic_smax_x2;
5758 break;
5759 case nir_intrinsic_global_atomic_umax:
5760 op32 = aco_opcode::buffer_atomic_umax;
5761 op64 = aco_opcode::buffer_atomic_umax_x2;
5762 break;
5763 case nir_intrinsic_global_atomic_and:
5764 op32 = aco_opcode::buffer_atomic_and;
5765 op64 = aco_opcode::buffer_atomic_and_x2;
5766 break;
5767 case nir_intrinsic_global_atomic_or:
5768 op32 = aco_opcode::buffer_atomic_or;
5769 op64 = aco_opcode::buffer_atomic_or_x2;
5770 break;
5771 case nir_intrinsic_global_atomic_xor:
5772 op32 = aco_opcode::buffer_atomic_xor;
5773 op64 = aco_opcode::buffer_atomic_xor_x2;
5774 break;
5775 case nir_intrinsic_global_atomic_exchange:
5776 op32 = aco_opcode::buffer_atomic_swap;
5777 op64 = aco_opcode::buffer_atomic_swap_x2;
5778 break;
5779 case nir_intrinsic_global_atomic_comp_swap:
5780 op32 = aco_opcode::buffer_atomic_cmpswap;
5781 op64 = aco_opcode::buffer_atomic_cmpswap_x2;
5782 break;
5783 default:
5784 unreachable("visit_atomic_global should only be called with nir_intrinsic_global_atomic_* instructions.");
5785 }
5786
5787 Temp rsrc = get_gfx6_global_rsrc(bld, addr);
5788
5789 aco_opcode op = instr->dest.ssa.bit_size == 32 ? op32 : op64;
5790
5791 aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, return_previous ? 1 : 0)};
5792 mubuf->operands[0] = Operand(rsrc);
5793 mubuf->operands[1] = addr.type() == RegType::vgpr ? Operand(addr) : Operand(v1);
5794 mubuf->operands[2] = Operand(0u);
5795 mubuf->operands[3] = Operand(data);
5796 if (return_previous)
5797 mubuf->definitions[0] = Definition(dst);
5798 mubuf->glc = return_previous;
5799 mubuf->dlc = false;
5800 mubuf->offset = 0;
5801 mubuf->addr64 = addr.type() == RegType::vgpr;
5802 mubuf->disable_wqm = true;
5803 mubuf->barrier = barrier_buffer;
5804 ctx->program->needs_exact = true;
5805 ctx->block->instructions.emplace_back(std::move(mubuf));
5806 }
5807 }
5808
5809 void emit_memory_barrier(isel_context *ctx, nir_intrinsic_instr *instr) {
5810 Builder bld(ctx->program, ctx->block);
5811 switch(instr->intrinsic) {
5812 case nir_intrinsic_group_memory_barrier:
5813 case nir_intrinsic_memory_barrier:
5814 bld.barrier(aco_opcode::p_memory_barrier_common);
5815 break;
5816 case nir_intrinsic_memory_barrier_buffer:
5817 bld.barrier(aco_opcode::p_memory_barrier_buffer);
5818 break;
5819 case nir_intrinsic_memory_barrier_image:
5820 bld.barrier(aco_opcode::p_memory_barrier_image);
5821 break;
5822 case nir_intrinsic_memory_barrier_tcs_patch:
5823 case nir_intrinsic_memory_barrier_shared:
5824 bld.barrier(aco_opcode::p_memory_barrier_shared);
5825 break;
5826 default:
5827 unreachable("Unimplemented memory barrier intrinsic");
5828 break;
5829 }
5830 }
5831
5832 void visit_load_shared(isel_context *ctx, nir_intrinsic_instr *instr)
5833 {
5834 // TODO: implement sparse reads using ds_read2_b32 and nir_ssa_def_components_read()
5835 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5836 assert(instr->dest.ssa.bit_size >= 32 && "Bitsize not supported in load_shared.");
5837 Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
5838 Builder bld(ctx->program, ctx->block);
5839
5840 unsigned elem_size_bytes = instr->dest.ssa.bit_size / 8;
5841 unsigned align = nir_intrinsic_align_mul(instr) ? nir_intrinsic_align(instr) : elem_size_bytes;
5842 load_lds(ctx, elem_size_bytes, dst, address, nir_intrinsic_base(instr), align);
5843 }
5844
5845 void visit_store_shared(isel_context *ctx, nir_intrinsic_instr *instr)
5846 {
5847 unsigned writemask = nir_intrinsic_write_mask(instr);
5848 Temp data = get_ssa_temp(ctx, instr->src[0].ssa);
5849 Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
5850 unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
5851 assert(elem_size_bytes >= 4 && "Only 32bit & 64bit store_shared currently supported.");
5852
5853 unsigned align = nir_intrinsic_align_mul(instr) ? nir_intrinsic_align(instr) : elem_size_bytes;
5854 store_lds(ctx, elem_size_bytes, data, writemask, address, nir_intrinsic_base(instr), align);
5855 }
5856
5857 void visit_shared_atomic(isel_context *ctx, nir_intrinsic_instr *instr)
5858 {
5859 unsigned offset = nir_intrinsic_base(instr);
5860 Operand m = load_lds_size_m0(ctx);
5861 Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
5862 Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
5863
5864 unsigned num_operands = 3;
5865 aco_opcode op32, op64, op32_rtn, op64_rtn;
5866 switch(instr->intrinsic) {
5867 case nir_intrinsic_shared_atomic_add:
5868 op32 = aco_opcode::ds_add_u32;
5869 op64 = aco_opcode::ds_add_u64;
5870 op32_rtn = aco_opcode::ds_add_rtn_u32;
5871 op64_rtn = aco_opcode::ds_add_rtn_u64;
5872 break;
5873 case nir_intrinsic_shared_atomic_imin:
5874 op32 = aco_opcode::ds_min_i32;
5875 op64 = aco_opcode::ds_min_i64;
5876 op32_rtn = aco_opcode::ds_min_rtn_i32;
5877 op64_rtn = aco_opcode::ds_min_rtn_i64;
5878 break;
5879 case nir_intrinsic_shared_atomic_umin:
5880 op32 = aco_opcode::ds_min_u32;
5881 op64 = aco_opcode::ds_min_u64;
5882 op32_rtn = aco_opcode::ds_min_rtn_u32;
5883 op64_rtn = aco_opcode::ds_min_rtn_u64;
5884 break;
5885 case nir_intrinsic_shared_atomic_imax:
5886 op32 = aco_opcode::ds_max_i32;
5887 op64 = aco_opcode::ds_max_i64;
5888 op32_rtn = aco_opcode::ds_max_rtn_i32;
5889 op64_rtn = aco_opcode::ds_max_rtn_i64;
5890 break;
5891 case nir_intrinsic_shared_atomic_umax:
5892 op32 = aco_opcode::ds_max_u32;
5893 op64 = aco_opcode::ds_max_u64;
5894 op32_rtn = aco_opcode::ds_max_rtn_u32;
5895 op64_rtn = aco_opcode::ds_max_rtn_u64;
5896 break;
5897 case nir_intrinsic_shared_atomic_and:
5898 op32 = aco_opcode::ds_and_b32;
5899 op64 = aco_opcode::ds_and_b64;
5900 op32_rtn = aco_opcode::ds_and_rtn_b32;
5901 op64_rtn = aco_opcode::ds_and_rtn_b64;
5902 break;
5903 case nir_intrinsic_shared_atomic_or:
5904 op32 = aco_opcode::ds_or_b32;
5905 op64 = aco_opcode::ds_or_b64;
5906 op32_rtn = aco_opcode::ds_or_rtn_b32;
5907 op64_rtn = aco_opcode::ds_or_rtn_b64;
5908 break;
5909 case nir_intrinsic_shared_atomic_xor:
5910 op32 = aco_opcode::ds_xor_b32;
5911 op64 = aco_opcode::ds_xor_b64;
5912 op32_rtn = aco_opcode::ds_xor_rtn_b32;
5913 op64_rtn = aco_opcode::ds_xor_rtn_b64;
5914 break;
5915 case nir_intrinsic_shared_atomic_exchange:
5916 op32 = aco_opcode::ds_write_b32;
5917 op64 = aco_opcode::ds_write_b64;
5918 op32_rtn = aco_opcode::ds_wrxchg_rtn_b32;
5919 op64_rtn = aco_opcode::ds_wrxchg_rtn_b64;
5920 break;
5921 case nir_intrinsic_shared_atomic_comp_swap:
5922 op32 = aco_opcode::ds_cmpst_b32;
5923 op64 = aco_opcode::ds_cmpst_b64;
5924 op32_rtn = aco_opcode::ds_cmpst_rtn_b32;
5925 op64_rtn = aco_opcode::ds_cmpst_rtn_b64;
5926 num_operands = 4;
5927 break;
5928 default:
5929 unreachable("Unhandled shared atomic intrinsic");
5930 }
5931
5932 /* return the previous value if dest is ever used */
5933 bool return_previous = false;
5934 nir_foreach_use_safe(use_src, &instr->dest.ssa) {
5935 return_previous = true;
5936 break;
5937 }
5938 nir_foreach_if_use_safe(use_src, &instr->dest.ssa) {
5939 return_previous = true;
5940 break;
5941 }
5942
5943 aco_opcode op;
5944 if (data.size() == 1) {
5945 assert(instr->dest.ssa.bit_size == 32);
5946 op = return_previous ? op32_rtn : op32;
5947 } else {
5948 assert(instr->dest.ssa.bit_size == 64);
5949 op = return_previous ? op64_rtn : op64;
5950 }
5951
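 /* The DS offset0 field is only 16 bits wide, so a larger constant base is folded into
  * the address VGPR first (e.g. base 0x12340 -> address += 0x12340, offset0 = 0).
  */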
5952 if (offset > 65535) {
5953 Builder bld(ctx->program, ctx->block);
5954 address = bld.vadd32(bld.def(v1), Operand(offset), address);
5955 offset = 0;
5956 }
5957
5958 aco_ptr<DS_instruction> ds;
5959 ds.reset(create_instruction<DS_instruction>(op, Format::DS, num_operands, return_previous ? 1 : 0));
5960 ds->operands[0] = Operand(address);
5961 ds->operands[1] = Operand(data);
5962 if (num_operands == 4)
5963 ds->operands[2] = Operand(get_ssa_temp(ctx, instr->src[2].ssa));
5964 ds->operands[num_operands - 1] = m;
5965 ds->offset0 = offset;
5966 if (return_previous)
5967 ds->definitions[0] = Definition(get_ssa_temp(ctx, &instr->dest.ssa));
5968 ctx->block->instructions.emplace_back(std::move(ds));
5969 }
5970
5971 Temp get_scratch_resource(isel_context *ctx)
5972 {
5973 Builder bld(ctx->program, ctx->block);
5974 Temp scratch_addr = ctx->program->private_segment_buffer;
5975 if (ctx->stage != compute_cs)
5976 scratch_addr = bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), scratch_addr, Operand(0u));
5977
5978 uint32_t rsrc_conf = S_008F0C_ADD_TID_ENABLE(1) |
5979 S_008F0C_INDEX_STRIDE(ctx->program->wave_size == 64 ? 3 : 2);
5980
5981 if (ctx->program->chip_class >= GFX10) {
5982 rsrc_conf |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
5983 S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) |
5984 S_008F0C_RESOURCE_LEVEL(1);
5985 } else if (ctx->program->chip_class <= GFX7) { /* dfmt modifies stride on GFX8/GFX9 when ADD_TID_EN=1 */
5986 rsrc_conf |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
5987 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
5988 }
5989
5990 /* Older generations need an element size of 16 bytes; the element size field was removed in GFX9. */
5991 if (ctx->program->chip_class <= GFX8)
5992 rsrc_conf |= S_008F0C_ELEMENT_SIZE(3);
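 /* Rough layout of the resulting scratch descriptor:
  *   dword 0-1: 64-bit scratch base address (from the private segment buffer)
  *   dword 2:   num_records = 0xffffffff
  *   dword 3:   rsrc_conf, i.e. ADD_TID_ENABLE plus the swizzling INDEX_STRIDE so that
  *              each lane's scratch slots are interleaved across the wave
  */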
5993
5994 return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), scratch_addr, Operand(-1u), Operand(rsrc_conf));
5995 }
5996
5997 void visit_load_scratch(isel_context *ctx, nir_intrinsic_instr *instr) {
5998 assert(instr->dest.ssa.bit_size == 32 || instr->dest.ssa.bit_size == 64);
5999 Builder bld(ctx->program, ctx->block);
6000 Temp rsrc = get_scratch_resource(ctx);
6001 Temp offset = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
6002 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6003
6004 aco_opcode op;
6005 switch (dst.size()) {
6006 case 1:
6007 op = aco_opcode::buffer_load_dword;
6008 break;
6009 case 2:
6010 op = aco_opcode::buffer_load_dwordx2;
6011 break;
6012 case 3:
6013 op = aco_opcode::buffer_load_dwordx3;
6014 break;
6015 case 4:
6016 op = aco_opcode::buffer_load_dwordx4;
6017 break;
6018 case 6:
6019 case 8: {
6020 std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
6021 Temp lower = bld.mubuf(aco_opcode::buffer_load_dwordx4,
6022 bld.def(v4), rsrc, offset,
6023 ctx->program->scratch_offset, 0, true);
6024 Temp upper = bld.mubuf(dst.size() == 6 ? aco_opcode::buffer_load_dwordx2 :
6025 aco_opcode::buffer_load_dwordx4,
6026 dst.size() == 6 ? bld.def(v2) : bld.def(v4),
6027 rsrc, offset, ctx->program->scratch_offset, 16, true);
6028 emit_split_vector(ctx, lower, 2);
6029 elems[0] = emit_extract_vector(ctx, lower, 0, v2);
6030 elems[1] = emit_extract_vector(ctx, lower, 1, v2);
6031 if (dst.size() == 8) {
6032 emit_split_vector(ctx, upper, 2);
6033 elems[2] = emit_extract_vector(ctx, upper, 0, v2);
6034 elems[3] = emit_extract_vector(ctx, upper, 1, v2);
6035 } else {
6036 elems[2] = upper;
6037 }
6038
6039 aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector,
6040 Format::PSEUDO, dst.size() / 2, 1)};
6041 for (unsigned i = 0; i < dst.size() / 2; i++)
6042 vec->operands[i] = Operand(elems[i]);
6043 vec->definitions[0] = Definition(dst);
6044 bld.insert(std::move(vec));
6045 ctx->allocated_vec.emplace(dst.id(), elems);
6046 return;
6047 }
6048 default:
6049 unreachable("Wrong dst size for nir_intrinsic_load_scratch");
6050 }
6051
6052 bld.mubuf(op, Definition(dst), rsrc, offset, ctx->program->scratch_offset, 0, true);
6053 emit_split_vector(ctx, dst, instr->num_components);
6054 }
6055
6056 void visit_store_scratch(isel_context *ctx, nir_intrinsic_instr *instr) {
6057 assert(instr->src[0].ssa->bit_size == 32 || instr->src[0].ssa->bit_size == 64);
6058 Builder bld(ctx->program, ctx->block);
6059 Temp rsrc = get_scratch_resource(ctx);
6060 Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
6061 Temp offset = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
6062
6063 unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
6064 unsigned writemask = nir_intrinsic_write_mask(instr);
6065
6066 while (writemask) {
6067 int start, count;
6068 u_bit_scan_consecutive_range(&writemask, &start, &count);
6069 int num_bytes = count * elem_size_bytes;
6070
6071 if (num_bytes > 16) {
6072 assert(elem_size_bytes == 8);
6073 writemask |= (((count - 2) << 1) - 1) << (start + 2);
6074 count = 2;
6075 num_bytes = 16;
6076 }
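 /* Example: for a 3-component 64-bit store, count == 3 and num_bytes == 24, so the code
  * above trims it to a dwordx4 store of the first two components and re-queues the third
  * component's writemask bit for the next loop iteration.
  */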
6077
6078 // TODO: check alignment of sub-dword stores
6079 // TODO: split 3 bytes. there is no store instruction for that
6080
6081 Temp write_data;
6082 if (count != instr->num_components) {
6083 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
6084 for (int i = 0; i < count; i++) {
6085 Temp elem = emit_extract_vector(ctx, data, start + i, RegClass(RegType::vgpr, elem_size_bytes / 4));
6086 vec->operands[i] = Operand(elem);
6087 }
6088 write_data = bld.tmp(RegClass(RegType::vgpr, count * elem_size_bytes / 4));
6089 vec->definitions[0] = Definition(write_data);
6090 ctx->block->instructions.emplace_back(std::move(vec));
6091 } else {
6092 write_data = data;
6093 }
6094
6095 aco_opcode op;
6096 switch (num_bytes) {
6097 case 4:
6098 op = aco_opcode::buffer_store_dword;
6099 break;
6100 case 8:
6101 op = aco_opcode::buffer_store_dwordx2;
6102 break;
6103 case 12:
6104 op = aco_opcode::buffer_store_dwordx3;
6105 break;
6106 case 16:
6107 op = aco_opcode::buffer_store_dwordx4;
6108 break;
6109 default:
6110 unreachable("Invalid data size for nir_intrinsic_store_scratch.");
6111 }
6112
6113 bld.mubuf(op, rsrc, offset, ctx->program->scratch_offset, write_data, start * elem_size_bytes, true);
6114 }
6115 }
6116
6117 void visit_load_sample_mask_in(isel_context *ctx, nir_intrinsic_instr *instr) {
6118 uint8_t log2_ps_iter_samples;
6119 if (ctx->program->info->ps.force_persample) {
6120 log2_ps_iter_samples =
6121 util_logbase2(ctx->options->key.fs.num_samples);
6122 } else {
6123 log2_ps_iter_samples = ctx->options->key.fs.log2_ps_iter_samples;
6124 }
6125
6126 /* The bit pattern matches that used by fixed function fragment
6127 * processing. */
6128 static const unsigned ps_iter_masks[] = {
6129 0xffff, /* not used */
6130 0x5555,
6131 0x1111,
6132 0x0101,
6133 0x0001,
6134 };
6135 assert(log2_ps_iter_samples < ARRAY_SIZE(ps_iter_masks));
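 /* Example: with log2_ps_iter_samples == 2 the mask is 0x1111 (one bit in every group of
  * four); for sample_id == 1 the shift below produces 0x2222, so the final AND keeps only
  * the coverage bits that belong to sample 1.
  */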
6136
6137 Builder bld(ctx->program, ctx->block);
6138
6139 Temp sample_id = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1),
6140 get_arg(ctx, ctx->args->ac.ancillary), Operand(8u), Operand(4u));
6141 Temp ps_iter_mask = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(ps_iter_masks[log2_ps_iter_samples]));
6142 Temp mask = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), sample_id, ps_iter_mask);
6143 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6144 bld.vop2(aco_opcode::v_and_b32, Definition(dst), mask, get_arg(ctx, ctx->args->ac.sample_coverage));
6145 }
6146
6147 void visit_emit_vertex_with_counter(isel_context *ctx, nir_intrinsic_instr *instr) {
6148 Builder bld(ctx->program, ctx->block);
6149
6150 unsigned stream = nir_intrinsic_stream_id(instr);
6151 Temp next_vertex = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
6152 next_vertex = bld.v_mul_imm(bld.def(v1), next_vertex, 4u);
6153 nir_const_value *next_vertex_cv = nir_src_as_const_value(instr->src[0]);
6154
6155 /* get GSVS ring */
6156 Temp gsvs_ring = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ctx->program->private_segment_buffer, Operand(RING_GSVS_GS * 16u));
6157
6158 unsigned num_components =
6159 ctx->program->info->gs.num_stream_output_components[stream];
6160 assert(num_components);
6161
6162 unsigned stride = 4u * num_components * ctx->shader->info.gs.vertices_out;
6163 unsigned stream_offset = 0;
6164 for (unsigned i = 0; i < stream; i++) {
6165 unsigned prev_stride = 4u * ctx->program->info->gs.num_stream_output_components[i] * ctx->shader->info.gs.vertices_out;
6166 stream_offset += prev_stride * ctx->program->wave_size;
6167 }
6168
6169 /* Limit on the stride field for <= GFX7. */
6170 assert(stride < (1 << 14));
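 /* Example numbers: with 4 output components on this stream and gs.vertices_out == 3,
  * stride = 4 * 4 * 3 = 48 bytes per lane, and stream_offset skips the complete per-wave
  * slices of all lower-numbered streams.
  */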
6171
6172 Temp gsvs_dwords[4];
6173 for (unsigned i = 0; i < 4; i++)
6174 gsvs_dwords[i] = bld.tmp(s1);
6175 bld.pseudo(aco_opcode::p_split_vector,
6176 Definition(gsvs_dwords[0]),
6177 Definition(gsvs_dwords[1]),
6178 Definition(gsvs_dwords[2]),
6179 Definition(gsvs_dwords[3]),
6180 gsvs_ring);
6181
6182 if (stream_offset) {
6183 Temp stream_offset_tmp = bld.copy(bld.def(s1), Operand(stream_offset));
6184
6185 Temp carry = bld.tmp(s1);
6186 gsvs_dwords[0] = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), gsvs_dwords[0], stream_offset_tmp);
6187 gsvs_dwords[1] = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), gsvs_dwords[1], Operand(0u), bld.scc(carry));
6188 }
6189
6190 gsvs_dwords[1] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), gsvs_dwords[1], Operand(S_008F04_STRIDE(stride)));
6191 gsvs_dwords[2] = bld.copy(bld.def(s1), Operand((uint32_t)ctx->program->wave_size));
6192
6193 gsvs_ring = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4),
6194 gsvs_dwords[0], gsvs_dwords[1], gsvs_dwords[2], gsvs_dwords[3]);
6195
6196 unsigned offset = 0;
6197 for (unsigned i = 0; i <= VARYING_SLOT_VAR31; i++) {
6198 if (ctx->program->info->gs.output_streams[i] != stream)
6199 continue;
6200
6201 for (unsigned j = 0; j < 4; j++) {
6202 if (!(ctx->program->info->gs.output_usage_mask[i] & (1 << j)))
6203 continue;
6204
6205 if (ctx->outputs.mask[i] & (1 << j)) {
6206 Operand vaddr_offset = next_vertex_cv ? Operand(v1) : Operand(next_vertex);
6207 unsigned const_offset = (offset + (next_vertex_cv ? next_vertex_cv->u32 : 0u)) * 4u;
6208 if (const_offset >= 4096u) {
6209 if (vaddr_offset.isUndefined())
6210 vaddr_offset = bld.copy(bld.def(v1), Operand(const_offset / 4096u * 4096u));
6211 else
6212 vaddr_offset = bld.vadd32(bld.def(v1), Operand(const_offset / 4096u * 4096u), vaddr_offset);
6213 const_offset %= 4096u;
6214 }
6215
6216 aco_ptr<MTBUF_instruction> mtbuf{create_instruction<MTBUF_instruction>(aco_opcode::tbuffer_store_format_x, Format::MTBUF, 4, 0)};
6217 mtbuf->operands[0] = Operand(gsvs_ring);
6218 mtbuf->operands[1] = vaddr_offset;
6219 mtbuf->operands[2] = Operand(get_arg(ctx, ctx->args->gs2vs_offset));
6220 mtbuf->operands[3] = Operand(ctx->outputs.outputs[i][j]);
6221 mtbuf->offen = !vaddr_offset.isUndefined();
6222 mtbuf->dfmt = V_008F0C_BUF_DATA_FORMAT_32;
6223 mtbuf->nfmt = V_008F0C_BUF_NUM_FORMAT_UINT;
6224 mtbuf->offset = const_offset;
6225 mtbuf->glc = true;
6226 mtbuf->slc = true;
6227 mtbuf->barrier = barrier_gs_data;
6228 mtbuf->can_reorder = true;
6229 bld.insert(std::move(mtbuf));
6230 }
6231
6232 offset += ctx->shader->info.gs.vertices_out;
6233 }
6234
6235 /* outputs for the next vertex are undefined and keeping them around can
6236 * create invalid IR with control flow */
6237 ctx->outputs.mask[i] = 0;
6238 }
6239
6240 bld.sopp(aco_opcode::s_sendmsg, bld.m0(ctx->gs_wave_id), -1, sendmsg_gs(false, true, stream));
6241 }
6242
6243 Temp emit_boolean_reduce(isel_context *ctx, nir_op op, unsigned cluster_size, Temp src)
6244 {
6245 Builder bld(ctx->program, ctx->block);
6246
6247 if (cluster_size == 1) {
6248 return src;
6249 } if (op == nir_op_iand && cluster_size == 4) {
6250 //subgroupClusteredAnd(val, 4) -> ~wqm(exec & ~val)
6251 Temp tmp = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src);
6252 return bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc),
6253 bld.sop1(Builder::s_wqm, bld.def(bld.lm), bld.def(s1, scc), tmp));
6254 } else if (op == nir_op_ior && cluster_size == 4) {
6255 //subgroupClusteredOr(val, 4) -> wqm(val & exec)
6256 return bld.sop1(Builder::s_wqm, bld.def(bld.lm), bld.def(s1, scc),
6257 bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm)));
6258 } else if (op == nir_op_iand && cluster_size == ctx->program->wave_size) {
6259 //subgroupAnd(val) -> (exec & ~val) == 0
6260 Temp tmp = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src).def(1).getTemp();
6261 Temp cond = bool_to_vector_condition(ctx, emit_wqm(ctx, tmp));
6262 return bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc), cond);
6263 } else if (op == nir_op_ior && cluster_size == ctx->program->wave_size) {
6264 //subgroupOr(val) -> (val & exec) != 0
6265 Temp tmp = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm)).def(1).getTemp();
6266 return bool_to_vector_condition(ctx, tmp);
6267 } else if (op == nir_op_ixor && cluster_size == ctx->program->wave_size) {
6268 //subgroupXor(val) -> s_bcnt1_i32_b64(val & exec) & 1
6269 Temp tmp = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
6270 tmp = bld.sop1(Builder::s_bcnt1_i32, bld.def(s1), bld.def(s1, scc), tmp);
6271 tmp = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), tmp, Operand(1u)).def(1).getTemp();
6272 return bool_to_vector_condition(ctx, tmp);
6273 } else {
6274 //subgroupClustered{And,Or,Xor}(val, n) ->
6275 //lane_id = v_mbcnt_hi_u32_b32(-1, v_mbcnt_lo_u32_b32(-1, 0)) ; just v_mbcnt_lo_u32_b32 on wave32
6276 //cluster_offset = ~(n - 1) & lane_id
6277 //cluster_mask = ((1 << n) - 1)
6278 //subgroupClusteredAnd():
6279 // return ((val | ~exec) >> cluster_offset) & cluster_mask == cluster_mask
6280 //subgroupClusteredOr():
6281 // return ((val & exec) >> cluster_offset) & cluster_mask != 0
6282 //subgroupClusteredXor():
6283 // return v_bcnt_u32_b32(((val & exec) >> cluster_offset) & cluster_mask, 0) & 1 != 0
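 //e.g. wave64, n = 8, lane_id = 13: cluster_offset = ~7 & 13 = 8, cluster_mask = 0xff,
 //so the lane inspects the ballot bits of lanes 8..15, i.e. exactly its own cluster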
6284 Temp lane_id = emit_mbcnt(ctx, bld.def(v1));
6285 Temp cluster_offset = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(~uint32_t(cluster_size - 1)), lane_id);
6286
6287 Temp tmp;
6288 if (op == nir_op_iand)
6289 tmp = bld.sop2(Builder::s_orn2, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
6290 else
6291 tmp = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
6292
6293 uint32_t cluster_mask = cluster_size == 32 ? -1 : (1u << cluster_size) - 1u;
6294
6295 if (ctx->program->chip_class <= GFX7)
6296 tmp = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), tmp, cluster_offset);
6297 else if (ctx->program->wave_size == 64)
6298 tmp = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), cluster_offset, tmp);
6299 else
6300 tmp = bld.vop2_e64(aco_opcode::v_lshrrev_b32, bld.def(v1), cluster_offset, tmp);
6301 tmp = emit_extract_vector(ctx, tmp, 0, v1);
6302 if (cluster_mask != 0xffffffff)
6303 tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(cluster_mask), tmp);
6304
6305 Definition cmp_def = Definition();
6306 if (op == nir_op_iand) {
6307 cmp_def = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm), Operand(cluster_mask), tmp).def(0);
6308 } else if (op == nir_op_ior) {
6309 cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u), tmp).def(0);
6310 } else if (op == nir_op_ixor) {
6311 tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(1u),
6312 bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), tmp, Operand(0u)));
6313 cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u), tmp).def(0);
6314 }
6315 cmp_def.setHint(vcc);
6316 return cmp_def.getTemp();
6317 }
6318 }
6319
6320 Temp emit_boolean_exclusive_scan(isel_context *ctx, nir_op op, Temp src)
6321 {
6322 Builder bld(ctx->program, ctx->block);
6323
6324 //subgroupExclusiveAnd(val) -> mbcnt(exec & ~val) == 0
6325 //subgroupExclusiveOr(val) -> mbcnt(val & exec) != 0
6326 //subgroupExclusiveXor(val) -> mbcnt(val & exec) & 1 != 0
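 //mbcnt() counts the set bits of the mask in lanes strictly below the current lane,
 //so "mbcnt(x) == 0" means no lower active lane contributed a set bit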
6327 Temp tmp;
6328 if (op == nir_op_iand)
6329 tmp = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src);
6330 else
6331 tmp = bld.sop2(Builder::s_and, bld.def(s2), bld.def(s1, scc), src, Operand(exec, bld.lm));
6332
6333 Builder::Result lohi = bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), bld.def(s1), tmp);
6334 Temp lo = lohi.def(0).getTemp();
6335 Temp hi = lohi.def(1).getTemp();
6336 Temp mbcnt = emit_mbcnt(ctx, bld.def(v1), Operand(lo), Operand(hi));
6337
6338 Definition cmp_def = Definition();
6339 if (op == nir_op_iand)
6340 cmp_def = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm), Operand(0u), mbcnt).def(0);
6341 else if (op == nir_op_ior)
6342 cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u), mbcnt).def(0);
6343 else if (op == nir_op_ixor)
6344 cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u),
6345 bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(1u), mbcnt)).def(0);
6346 cmp_def.setHint(vcc);
6347 return cmp_def.getTemp();
6348 }
6349
6350 Temp emit_boolean_inclusive_scan(isel_context *ctx, nir_op op, Temp src)
6351 {
6352 Builder bld(ctx->program, ctx->block);
6353
6354 //subgroupInclusiveAnd(val) -> subgroupExclusiveAnd(val) && val
6355 //subgroupInclusiveOr(val) -> subgroupExclusiveOr(val) || val
6356 //subgroupInclusiveXor(val) -> subgroupExclusiveXor(val) ^^ val
6357 Temp tmp = emit_boolean_exclusive_scan(ctx, op, src);
6358 if (op == nir_op_iand)
6359 return bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), tmp, src);
6360 else if (op == nir_op_ior)
6361 return bld.sop2(Builder::s_or, bld.def(bld.lm), bld.def(s1, scc), tmp, src);
6362 else if (op == nir_op_ixor)
6363 return bld.sop2(Builder::s_xor, bld.def(bld.lm), bld.def(s1, scc), tmp, src);
6364
6365 assert(false);
6366 return Temp();
6367 }
6368
6369 void emit_uniform_subgroup(isel_context *ctx, nir_intrinsic_instr *instr, Temp src)
6370 {
6371 Builder bld(ctx->program, ctx->block);
6372 Definition dst(get_ssa_temp(ctx, &instr->dest.ssa));
6373 if (src.regClass().type() == RegType::vgpr) {
6374 bld.pseudo(aco_opcode::p_as_uniform, dst, src);
6375 } else if (src.regClass() == s1) {
6376 bld.sop1(aco_opcode::s_mov_b32, dst, src);
6377 } else if (src.regClass() == s2) {
6378 bld.sop1(aco_opcode::s_mov_b64, dst, src);
6379 } else {
6380 fprintf(stderr, "Unimplemented NIR instr bit size: ");
6381 nir_print_instr(&instr->instr, stderr);
6382 fprintf(stderr, "\n");
6383 }
6384 }
6385
6386 void emit_interp_center(isel_context *ctx, Temp dst, Temp pos1, Temp pos2)
6387 {
6388 Builder bld(ctx->program, ctx->block);
6389 Temp persp_center = get_arg(ctx, ctx->args->ac.persp_center);
6390 Temp p1 = emit_extract_vector(ctx, persp_center, 0, v1);
6391 Temp p2 = emit_extract_vector(ctx, persp_center, 1, v1);
6392
6393 Temp ddx_1, ddx_2, ddy_1, ddy_2;
6394 uint32_t dpp_ctrl0 = dpp_quad_perm(0, 0, 0, 0);
6395 uint32_t dpp_ctrl1 = dpp_quad_perm(1, 1, 1, 1);
6396 uint32_t dpp_ctrl2 = dpp_quad_perm(2, 2, 2, 2);
6397
6398 /* Build DD X/Y */
6399 if (ctx->program->chip_class >= GFX8) {
6400 Temp tl_1 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), p1, dpp_ctrl0);
6401 ddx_1 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p1, tl_1, dpp_ctrl1);
6402 ddy_1 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p1, tl_1, dpp_ctrl2);
6403 Temp tl_2 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), p2, dpp_ctrl0);
6404 ddx_2 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p2, tl_2, dpp_ctrl1);
6405 ddy_2 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p2, tl_2, dpp_ctrl2);
6406 } else {
6407 Temp tl_1 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p1, (1 << 15) | dpp_ctrl0);
6408 ddx_1 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p1, (1 << 15) | dpp_ctrl1);
6409 ddx_1 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), ddx_1, tl_1);
6410 ddx_2 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p1, (1 << 15) | dpp_ctrl2);
6411 ddx_2 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), ddx_2, tl_1);
6412 Temp tl_2 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p2, (1 << 15) | dpp_ctrl0);
6413 ddy_1 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p2, (1 << 15) | dpp_ctrl1);
6414 ddy_1 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), ddy_1, tl_2);
6415 ddy_2 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p2, (1 << 15) | dpp_ctrl2);
6416 ddy_2 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), ddy_2, tl_2);
6417 }
6418
6419 /* res_k = p_k + ddx_k * pos1 + ddy_k * pos2 */
6420 Temp tmp1 = bld.vop3(aco_opcode::v_mad_f32, bld.def(v1), ddx_1, pos1, p1);
6421 Temp tmp2 = bld.vop3(aco_opcode::v_mad_f32, bld.def(v1), ddx_2, pos1, p2);
6422 tmp1 = bld.vop3(aco_opcode::v_mad_f32, bld.def(v1), ddy_1, pos2, tmp1);
6423 tmp2 = bld.vop3(aco_opcode::v_mad_f32, bld.def(v1), ddy_2, pos2, tmp2);
6424 Temp wqm1 = bld.tmp(v1);
6425 emit_wqm(ctx, tmp1, wqm1, true);
6426 Temp wqm2 = bld.tmp(v1);
6427 emit_wqm(ctx, tmp2, wqm2, true);
6428 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), wqm1, wqm2);
6429 return;
6430 }
6431
6432 void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
6433 {
6434 Builder bld(ctx->program, ctx->block);
6435 switch(instr->intrinsic) {
6436 case nir_intrinsic_load_barycentric_sample:
6437 case nir_intrinsic_load_barycentric_pixel:
6438 case nir_intrinsic_load_barycentric_centroid: {
6439 glsl_interp_mode mode = (glsl_interp_mode)nir_intrinsic_interp_mode(instr);
6440 Temp bary = Temp(0, s2);
6441 switch (mode) {
6442 case INTERP_MODE_SMOOTH:
6443 case INTERP_MODE_NONE:
6444 if (instr->intrinsic == nir_intrinsic_load_barycentric_pixel)
6445 bary = get_arg(ctx, ctx->args->ac.persp_center);
6446 else if (instr->intrinsic == nir_intrinsic_load_barycentric_centroid)
6447 bary = ctx->persp_centroid;
6448 else if (instr->intrinsic == nir_intrinsic_load_barycentric_sample)
6449 bary = get_arg(ctx, ctx->args->ac.persp_sample);
6450 break;
6451 case INTERP_MODE_NOPERSPECTIVE:
6452 if (instr->intrinsic == nir_intrinsic_load_barycentric_pixel)
6453 bary = get_arg(ctx, ctx->args->ac.linear_center);
6454 else if (instr->intrinsic == nir_intrinsic_load_barycentric_centroid)
6455 bary = ctx->linear_centroid;
6456 else if (instr->intrinsic == nir_intrinsic_load_barycentric_sample)
6457 bary = get_arg(ctx, ctx->args->ac.linear_sample);
6458 break;
6459 default:
6460 break;
6461 }
6462 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6463 Temp p1 = emit_extract_vector(ctx, bary, 0, v1);
6464 Temp p2 = emit_extract_vector(ctx, bary, 1, v1);
6465 bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
6466 Operand(p1), Operand(p2));
6467 emit_split_vector(ctx, dst, 2);
6468 break;
6469 }
6470 case nir_intrinsic_load_barycentric_model: {
6471 Temp model = get_arg(ctx, ctx->args->ac.pull_model);
6472
6473 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6474 Temp p1 = emit_extract_vector(ctx, model, 0, v1);
6475 Temp p2 = emit_extract_vector(ctx, model, 1, v1);
6476 Temp p3 = emit_extract_vector(ctx, model, 2, v1);
6477 bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
6478 Operand(p1), Operand(p2), Operand(p3));
6479 emit_split_vector(ctx, dst, 3);
6480 break;
6481 }
6482 case nir_intrinsic_load_barycentric_at_sample: {
6483 uint32_t sample_pos_offset = RING_PS_SAMPLE_POSITIONS * 16;
6484 switch (ctx->options->key.fs.num_samples) {
6485 case 2: sample_pos_offset += 1 << 3; break;
6486 case 4: sample_pos_offset += 3 << 3; break;
6487 case 8: sample_pos_offset += 7 << 3; break;
6488 default: break;
6489 }
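 /* The sample position ring appears to store the 1x, 2x, 4x and 8x tables back to back,
  * one 8-byte (two-float) entry per sample, so the bases above are 1, 3 and 7 entries in,
  * converted to bytes by the << 3.
  */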
6490 Temp sample_pos;
6491 Temp addr = get_ssa_temp(ctx, instr->src[0].ssa);
6492 nir_const_value* const_addr = nir_src_as_const_value(instr->src[0]);
6493 Temp private_segment_buffer = ctx->program->private_segment_buffer;
6494 if (addr.type() == RegType::sgpr) {
6495 Operand offset;
6496 if (const_addr) {
6497 sample_pos_offset += const_addr->u32 << 3;
6498 offset = Operand(sample_pos_offset);
6499 } else if (ctx->options->chip_class >= GFX9) {
6500 offset = bld.sop2(aco_opcode::s_lshl3_add_u32, bld.def(s1), bld.def(s1, scc), addr, Operand(sample_pos_offset));
6501 } else {
6502 offset = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), addr, Operand(3u));
6503 offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset, Operand(sample_pos_offset));
6504 }
6505
6506 Operand off = bld.copy(bld.def(s1), Operand(offset));
6507 sample_pos = bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), private_segment_buffer, off);
6508
6509 } else if (ctx->options->chip_class >= GFX9) {
6510 addr = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(3u), addr);
6511 sample_pos = bld.global(aco_opcode::global_load_dwordx2, bld.def(v2), addr, private_segment_buffer, sample_pos_offset);
6512 } else if (ctx->options->chip_class >= GFX7) {
6513 /* addr += private_segment_buffer + sample_pos_offset */
6514 Temp tmp0 = bld.tmp(s1);
6515 Temp tmp1 = bld.tmp(s1);
6516 bld.pseudo(aco_opcode::p_split_vector, Definition(tmp0), Definition(tmp1), private_segment_buffer);
6517 Definition scc_tmp = bld.def(s1, scc);
6518 tmp0 = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), scc_tmp, tmp0, Operand(sample_pos_offset));
6519 tmp1 = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), tmp1, Operand(0u), bld.scc(scc_tmp.getTemp()));
6520 addr = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(3u), addr);
6521 Temp pck0 = bld.tmp(v1);
6522 Temp carry = bld.vadd32(Definition(pck0), tmp0, addr, true).def(1).getTemp();
6523 tmp1 = as_vgpr(ctx, tmp1);
6524 Temp pck1 = bld.vop2_e64(aco_opcode::v_addc_co_u32, bld.def(v1), bld.hint_vcc(bld.def(bld.lm)), tmp1, Operand(0u), carry);
6525 addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), pck0, pck1);
6526
6527 /* sample_pos = flat_load_dwordx2 addr */
6528 sample_pos = bld.flat(aco_opcode::flat_load_dwordx2, bld.def(v2), addr, Operand(s1));
6529 } else {
6530 assert(ctx->options->chip_class == GFX6);
6531
6532 uint32_t rsrc_conf = S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
6533 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
6534 Temp rsrc = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), private_segment_buffer, Operand(0u), Operand(rsrc_conf));
6535
6536 addr = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(3u), addr);
6537 addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), addr, Operand(0u));
6538
6539 sample_pos = bld.tmp(v2);
6540
6541 aco_ptr<MUBUF_instruction> load{create_instruction<MUBUF_instruction>(aco_opcode::buffer_load_dwordx2, Format::MUBUF, 3, 1)};
6542 load->definitions[0] = Definition(sample_pos);
6543 load->operands[0] = Operand(rsrc);
6544 load->operands[1] = Operand(addr);
6545 load->operands[2] = Operand(0u);
6546 load->offset = sample_pos_offset;
6547 load->offen = 0;
6548 load->addr64 = true;
6549 load->glc = false;
6550 load->dlc = false;
6551 load->disable_wqm = false;
6552 load->barrier = barrier_none;
6553 load->can_reorder = true;
6554 ctx->block->instructions.emplace_back(std::move(load));
6555 }
6556
6557 /* sample_pos -= 0.5 */
6558 Temp pos1 = bld.tmp(RegClass(sample_pos.type(), 1));
6559 Temp pos2 = bld.tmp(RegClass(sample_pos.type(), 1));
6560 bld.pseudo(aco_opcode::p_split_vector, Definition(pos1), Definition(pos2), sample_pos);
6561 pos1 = bld.vop2_e64(aco_opcode::v_sub_f32, bld.def(v1), pos1, Operand(0x3f000000u));
6562 pos2 = bld.vop2_e64(aco_opcode::v_sub_f32, bld.def(v1), pos2, Operand(0x3f000000u));
6563
6564 emit_interp_center(ctx, get_ssa_temp(ctx, &instr->dest.ssa), pos1, pos2);
6565 break;
6566 }
6567 case nir_intrinsic_load_barycentric_at_offset: {
6568 Temp offset = get_ssa_temp(ctx, instr->src[0].ssa);
6569 RegClass rc = RegClass(offset.type(), 1);
6570 Temp pos1 = bld.tmp(rc), pos2 = bld.tmp(rc);
6571 bld.pseudo(aco_opcode::p_split_vector, Definition(pos1), Definition(pos2), offset);
6572 emit_interp_center(ctx, get_ssa_temp(ctx, &instr->dest.ssa), pos1, pos2);
6573 break;
6574 }
6575 case nir_intrinsic_load_front_face: {
6576 bld.vopc(aco_opcode::v_cmp_lg_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
6577 Operand(0u), get_arg(ctx, ctx->args->ac.front_face)).def(0).setHint(vcc);
6578 break;
6579 }
6580 case nir_intrinsic_load_view_index: {
6581 if (ctx->stage & (sw_vs | sw_gs | sw_tcs | sw_tes)) {
6582 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6583 bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->ac.view_index)));
6584 break;
6585 }
6586
6587 /* fallthrough */
6588 }
6589 case nir_intrinsic_load_layer_id: {
6590 unsigned idx = nir_intrinsic_base(instr);
6591 bld.vintrp(aco_opcode::v_interp_mov_f32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
6592 Operand(2u), bld.m0(get_arg(ctx, ctx->args->ac.prim_mask)), idx, 0);
6593 break;
6594 }
6595 case nir_intrinsic_load_frag_coord: {
6596 emit_load_frag_coord(ctx, get_ssa_temp(ctx, &instr->dest.ssa), 4);
6597 break;
6598 }
6599 case nir_intrinsic_load_sample_pos: {
6600 Temp posx = get_arg(ctx, ctx->args->ac.frag_pos[0]);
6601 Temp posy = get_arg(ctx, ctx->args->ac.frag_pos[1]);
6602 bld.pseudo(aco_opcode::p_create_vector, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
6603 posx.id() ? bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), posx) : Operand(0u),
6604 posy.id() ? bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), posy) : Operand(0u));
6605 break;
6606 }
6607 case nir_intrinsic_load_tess_coord:
6608 visit_load_tess_coord(ctx, instr);
6609 break;
6610 case nir_intrinsic_load_interpolated_input:
6611 visit_load_interpolated_input(ctx, instr);
6612 break;
6613 case nir_intrinsic_store_output:
6614 visit_store_output(ctx, instr);
6615 break;
6616 case nir_intrinsic_load_input:
6617 case nir_intrinsic_load_input_vertex:
6618 visit_load_input(ctx, instr);
6619 break;
6620 case nir_intrinsic_load_output:
6621 visit_load_output(ctx, instr);
6622 break;
6623 case nir_intrinsic_load_per_vertex_input:
6624 visit_load_per_vertex_input(ctx, instr);
6625 break;
6626 case nir_intrinsic_load_per_vertex_output:
6627 visit_load_per_vertex_output(ctx, instr);
6628 break;
6629 case nir_intrinsic_store_per_vertex_output:
6630 visit_store_per_vertex_output(ctx, instr);
6631 break;
6632 case nir_intrinsic_load_ubo:
6633 visit_load_ubo(ctx, instr);
6634 break;
6635 case nir_intrinsic_load_push_constant:
6636 visit_load_push_constant(ctx, instr);
6637 break;
6638 case nir_intrinsic_load_constant:
6639 visit_load_constant(ctx, instr);
6640 break;
6641 case nir_intrinsic_vulkan_resource_index:
6642 visit_load_resource(ctx, instr);
6643 break;
6644 case nir_intrinsic_discard:
6645 visit_discard(ctx, instr);
6646 break;
6647 case nir_intrinsic_discard_if:
6648 visit_discard_if(ctx, instr);
6649 break;
6650 case nir_intrinsic_load_shared:
6651 visit_load_shared(ctx, instr);
6652 break;
6653 case nir_intrinsic_store_shared:
6654 visit_store_shared(ctx, instr);
6655 break;
6656 case nir_intrinsic_shared_atomic_add:
6657 case nir_intrinsic_shared_atomic_imin:
6658 case nir_intrinsic_shared_atomic_umin:
6659 case nir_intrinsic_shared_atomic_imax:
6660 case nir_intrinsic_shared_atomic_umax:
6661 case nir_intrinsic_shared_atomic_and:
6662 case nir_intrinsic_shared_atomic_or:
6663 case nir_intrinsic_shared_atomic_xor:
6664 case nir_intrinsic_shared_atomic_exchange:
6665 case nir_intrinsic_shared_atomic_comp_swap:
6666 visit_shared_atomic(ctx, instr);
6667 break;
6668 case nir_intrinsic_image_deref_load:
6669 visit_image_load(ctx, instr);
6670 break;
6671 case nir_intrinsic_image_deref_store:
6672 visit_image_store(ctx, instr);
6673 break;
6674 case nir_intrinsic_image_deref_atomic_add:
6675 case nir_intrinsic_image_deref_atomic_umin:
6676 case nir_intrinsic_image_deref_atomic_imin:
6677 case nir_intrinsic_image_deref_atomic_umax:
6678 case nir_intrinsic_image_deref_atomic_imax:
6679 case nir_intrinsic_image_deref_atomic_and:
6680 case nir_intrinsic_image_deref_atomic_or:
6681 case nir_intrinsic_image_deref_atomic_xor:
6682 case nir_intrinsic_image_deref_atomic_exchange:
6683 case nir_intrinsic_image_deref_atomic_comp_swap:
6684 visit_image_atomic(ctx, instr);
6685 break;
6686 case nir_intrinsic_image_deref_size:
6687 visit_image_size(ctx, instr);
6688 break;
6689 case nir_intrinsic_load_ssbo:
6690 visit_load_ssbo(ctx, instr);
6691 break;
6692 case nir_intrinsic_store_ssbo:
6693 visit_store_ssbo(ctx, instr);
6694 break;
6695 case nir_intrinsic_load_global:
6696 visit_load_global(ctx, instr);
6697 break;
6698 case nir_intrinsic_store_global:
6699 visit_store_global(ctx, instr);
6700 break;
6701 case nir_intrinsic_global_atomic_add:
6702 case nir_intrinsic_global_atomic_imin:
6703 case nir_intrinsic_global_atomic_umin:
6704 case nir_intrinsic_global_atomic_imax:
6705 case nir_intrinsic_global_atomic_umax:
6706 case nir_intrinsic_global_atomic_and:
6707 case nir_intrinsic_global_atomic_or:
6708 case nir_intrinsic_global_atomic_xor:
6709 case nir_intrinsic_global_atomic_exchange:
6710 case nir_intrinsic_global_atomic_comp_swap:
6711 visit_global_atomic(ctx, instr);
6712 break;
6713 case nir_intrinsic_ssbo_atomic_add:
6714 case nir_intrinsic_ssbo_atomic_imin:
6715 case nir_intrinsic_ssbo_atomic_umin:
6716 case nir_intrinsic_ssbo_atomic_imax:
6717 case nir_intrinsic_ssbo_atomic_umax:
6718 case nir_intrinsic_ssbo_atomic_and:
6719 case nir_intrinsic_ssbo_atomic_or:
6720 case nir_intrinsic_ssbo_atomic_xor:
6721 case nir_intrinsic_ssbo_atomic_exchange:
6722 case nir_intrinsic_ssbo_atomic_comp_swap:
6723 visit_atomic_ssbo(ctx, instr);
6724 break;
6725 case nir_intrinsic_load_scratch:
6726 visit_load_scratch(ctx, instr);
6727 break;
6728 case nir_intrinsic_store_scratch:
6729 visit_store_scratch(ctx, instr);
6730 break;
6731 case nir_intrinsic_get_buffer_size:
6732 visit_get_buffer_size(ctx, instr);
6733 break;
6734 case nir_intrinsic_control_barrier: {
6735 if (ctx->program->chip_class == GFX6 && ctx->shader->info.stage == MESA_SHADER_TESS_CTRL) {
6736 /* GFX6 only (thanks to a hw bug workaround):
6737 * The real barrier instruction isn't needed, because an entire patch
6738 * always fits into a single wave.
6739 */
6740 break;
6741 }
6742
6743 if (ctx->shader->info.stage == MESA_SHADER_COMPUTE) {
6744 unsigned* bsize = ctx->program->info->cs.block_size;
6745 unsigned workgroup_size = bsize[0] * bsize[1] * bsize[2];
6746 if (workgroup_size > ctx->program->wave_size)
6747 bld.sopp(aco_opcode::s_barrier);
6748 } else if (ctx->shader->info.stage == MESA_SHADER_TESS_CTRL) {
6749 /* For each patch provided during rendering, n TCS shader invocations will be processed,
6750 * where n is the number of vertices in the output patch.
6751 */
6752 unsigned workgroup_size = ctx->tcs_num_patches * ctx->shader->info.tess.tcs_vertices_out;
6753 if (workgroup_size > ctx->program->wave_size)
6754 bld.sopp(aco_opcode::s_barrier);
6755 } else {
6756 /* We don't know the workgroup size, so always emit the s_barrier. */
6757 bld.sopp(aco_opcode::s_barrier);
6758 }
6759
6760 break;
6761 }
6762 case nir_intrinsic_memory_barrier_tcs_patch:
6763 case nir_intrinsic_group_memory_barrier:
6764 case nir_intrinsic_memory_barrier:
6765 case nir_intrinsic_memory_barrier_buffer:
6766 case nir_intrinsic_memory_barrier_image:
6767 case nir_intrinsic_memory_barrier_shared:
6768 emit_memory_barrier(ctx, instr);
6769 break;
6770 case nir_intrinsic_load_num_work_groups: {
6771 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6772 bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->ac.num_work_groups)));
6773 emit_split_vector(ctx, dst, 3);
6774 break;
6775 }
6776 case nir_intrinsic_load_local_invocation_id: {
6777 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6778 bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->ac.local_invocation_ids)));
6779 emit_split_vector(ctx, dst, 3);
6780 break;
6781 }
6782 case nir_intrinsic_load_work_group_id: {
6783 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6784 struct ac_arg *args = ctx->args->ac.workgroup_ids;
6785 bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
6786 args[0].used ? Operand(get_arg(ctx, args[0])) : Operand(0u),
6787 args[1].used ? Operand(get_arg(ctx, args[1])) : Operand(0u),
6788 args[2].used ? Operand(get_arg(ctx, args[2])) : Operand(0u));
6789 emit_split_vector(ctx, dst, 3);
6790 break;
6791 }
6792 case nir_intrinsic_load_local_invocation_index: {
6793 Temp id = emit_mbcnt(ctx, bld.def(v1));
6794
6795 /* The tg_size bits [6:11] contain the subgroup id,
6796 * we need this multiplied by the wave size, and then OR the thread id to it.
6797 */
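 /* Encoding note: the s_bfe_u32 source 0x6 | (0x6 << 16) used in the wave32 path below
  * means offset = 6, width = 6, i.e. it extracts tg_size bits [6:11] down to bit 0.
  */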
6798 if (ctx->program->wave_size == 64) {
6799 /* After the s_and the bits are already multiplied by 64 (left shifted by 6) so we can just feed that to v_or */
6800 Temp tg_num = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), Operand(0xfc0u),
6801 get_arg(ctx, ctx->args->ac.tg_size));
6802 bld.vop2(aco_opcode::v_or_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), tg_num, id);
6803 } else {
6804 /* Extract the bit field and multiply the result by 32 (left shift by 5), then do the OR */
6805 Temp tg_num = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
6806 get_arg(ctx, ctx->args->ac.tg_size), Operand(0x6u | (0x6u << 16)));
6807 bld.vop3(aco_opcode::v_lshl_or_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), tg_num, Operand(0x5u), id);
6808 }
6809 break;
6810 }
6811 case nir_intrinsic_load_subgroup_id: {
6812 if (ctx->stage == compute_cs) {
6813 bld.sop2(aco_opcode::s_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), bld.def(s1, scc),
6814 get_arg(ctx, ctx->args->ac.tg_size), Operand(0x6u | (0x6u << 16)));
6815 } else {
6816 bld.sop1(aco_opcode::s_mov_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), Operand(0x0u));
6817 }
6818 break;
6819 }
6820 case nir_intrinsic_load_subgroup_invocation: {
6821 emit_mbcnt(ctx, Definition(get_ssa_temp(ctx, &instr->dest.ssa)));
6822 break;
6823 }
6824 case nir_intrinsic_load_num_subgroups: {
6825 if (ctx->stage == compute_cs)
6826 bld.sop2(aco_opcode::s_and_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), bld.def(s1, scc), Operand(0x3fu),
6827 get_arg(ctx, ctx->args->ac.tg_size));
6828 else
6829 bld.sop1(aco_opcode::s_mov_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), Operand(0x1u));
6830 break;
6831 }
6832 case nir_intrinsic_ballot: {
6833 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
6834 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6835 Definition tmp = bld.def(dst.regClass());
6836 Definition lanemask_tmp = dst.size() == bld.lm.size() ? tmp : bld.def(src.regClass());
6837 if (instr->src[0].ssa->bit_size == 1) {
6838 assert(src.regClass() == bld.lm);
6839 bld.sop2(Builder::s_and, lanemask_tmp, bld.def(s1, scc), Operand(exec, bld.lm), src);
6840 } else if (instr->src[0].ssa->bit_size == 32 && src.regClass() == v1) {
6841 bld.vopc(aco_opcode::v_cmp_lg_u32, lanemask_tmp, Operand(0u), src);
6842 } else if (instr->src[0].ssa->bit_size == 64 && src.regClass() == v2) {
6843 bld.vopc(aco_opcode::v_cmp_lg_u64, lanemask_tmp, Operand(0u), src);
6844 } else {
6845 fprintf(stderr, "Unimplemented NIR instr bit size: ");
6846 nir_print_instr(&instr->instr, stderr);
6847 fprintf(stderr, "\n");
6848 }
6849 if (dst.size() != bld.lm.size()) {
6850 /* Wave32 with ballot size set to 64 */
6851 bld.pseudo(aco_opcode::p_create_vector, Definition(tmp), lanemask_tmp.getTemp(), Operand(0u));
6852 }
6853 emit_wqm(ctx, tmp.getTemp(), dst);
6854 break;
6855 }
6856 case nir_intrinsic_shuffle:
6857 case nir_intrinsic_read_invocation: {
6858 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
6859 if (!ctx->divergent_vals[instr->src[0].ssa->index]) {
6860 emit_uniform_subgroup(ctx, instr, src);
6861 } else {
6862 Temp tid = get_ssa_temp(ctx, instr->src[1].ssa);
6863 if (instr->intrinsic == nir_intrinsic_read_invocation || !ctx->divergent_vals[instr->src[1].ssa->index])
6864 tid = bld.as_uniform(tid);
6865 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6866 if (src.regClass() == v1) {
6867 emit_wqm(ctx, emit_bpermute(ctx, bld, tid, src), dst);
6868 } else if (src.regClass() == v2) {
6869 Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
6870 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
6871 lo = emit_wqm(ctx, emit_bpermute(ctx, bld, tid, lo));
6872 hi = emit_wqm(ctx, emit_bpermute(ctx, bld, tid, hi));
6873 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
6874 emit_split_vector(ctx, dst, 2);
6875 } else if (instr->dest.ssa.bit_size == 1 && tid.regClass() == s1) {
6876 assert(src.regClass() == bld.lm);
6877 Temp tmp = bld.sopc(Builder::s_bitcmp1, bld.def(s1, scc), src, tid);
6878 bool_to_vector_condition(ctx, emit_wqm(ctx, tmp), dst);
6879 } else if (instr->dest.ssa.bit_size == 1 && tid.regClass() == v1) {
6880 assert(src.regClass() == bld.lm);
6881 Temp tmp;
6882 if (ctx->program->chip_class <= GFX7)
6883 tmp = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), src, tid);
6884 else if (ctx->program->wave_size == 64)
6885 tmp = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), tid, src);
6886 else
6887 tmp = bld.vop2_e64(aco_opcode::v_lshrrev_b32, bld.def(v1), tid, src);
6888 tmp = emit_extract_vector(ctx, tmp, 0, v1);
6889 tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(1u), tmp);
6890 emit_wqm(ctx, bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u), tmp), dst);
6891 } else {
6892 fprintf(stderr, "Unimplemented NIR instr bit size: ");
6893 nir_print_instr(&instr->instr, stderr);
6894 fprintf(stderr, "\n");
6895 }
6896 }
6897 break;
6898 }
6899 case nir_intrinsic_load_sample_id: {
6900 bld.vop3(aco_opcode::v_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
6901 get_arg(ctx, ctx->args->ac.ancillary), Operand(8u), Operand(4u));
6902 break;
6903 }
6904 case nir_intrinsic_load_sample_mask_in: {
6905 visit_load_sample_mask_in(ctx, instr);
6906 break;
6907 }
6908 case nir_intrinsic_read_first_invocation: {
6909 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
6910 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6911 if (src.regClass() == v1) {
6912 emit_wqm(ctx,
6913 bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), src),
6914 dst);
6915 } else if (src.regClass() == v2) {
6916 Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
6917 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
6918 lo = emit_wqm(ctx, bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), lo));
6919 hi = emit_wqm(ctx, bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), hi));
6920 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
6921 emit_split_vector(ctx, dst, 2);
6922 } else if (instr->dest.ssa.bit_size == 1) {
6923 assert(src.regClass() == bld.lm);
6924 Temp tmp = bld.sopc(Builder::s_bitcmp1, bld.def(s1, scc), src,
6925 bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm)));
6926 bool_to_vector_condition(ctx, emit_wqm(ctx, tmp), dst);
6927 } else if (src.regClass() == s1) {
6928 bld.sop1(aco_opcode::s_mov_b32, Definition(dst), src);
6929 } else if (src.regClass() == s2) {
6930 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src);
6931 } else {
6932 fprintf(stderr, "Unimplemented NIR instr bit size: ");
6933 nir_print_instr(&instr->instr, stderr);
6934 fprintf(stderr, "\n");
6935 }
6936 break;
6937 }
6938 case nir_intrinsic_vote_all: {
6939 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
6940 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6941 assert(src.regClass() == bld.lm);
6942 assert(dst.regClass() == bld.lm);
6943
6944 Temp tmp = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src).def(1).getTemp();
6945 Temp cond = bool_to_vector_condition(ctx, emit_wqm(ctx, tmp));
6946 bld.sop1(Builder::s_not, Definition(dst), bld.def(s1, scc), cond);
6947 break;
6948 }
6949 case nir_intrinsic_vote_any: {
6950 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
6951 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6952 assert(src.regClass() == bld.lm);
6953 assert(dst.regClass() == bld.lm);
6954
6955 Temp tmp = bool_to_scalar_condition(ctx, src);
6956 bool_to_vector_condition(ctx, emit_wqm(ctx, tmp), dst);
6957 break;
6958 }
6959 case nir_intrinsic_reduce:
6960 case nir_intrinsic_inclusive_scan:
6961 case nir_intrinsic_exclusive_scan: {
6962 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
6963 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6964 nir_op op = (nir_op) nir_intrinsic_reduction_op(instr);
6965 unsigned cluster_size = instr->intrinsic == nir_intrinsic_reduce ?
6966 nir_intrinsic_cluster_size(instr) : 0;
6967 cluster_size = util_next_power_of_two(MIN2(cluster_size ? cluster_size : ctx->program->wave_size, ctx->program->wave_size));
6968
6969 if (!ctx->divergent_vals[instr->src[0].ssa->index] && (op == nir_op_ior || op == nir_op_iand)) {
6970 emit_uniform_subgroup(ctx, instr, src);
6971 } else if (instr->dest.ssa.bit_size == 1) {
6972 if (op == nir_op_imul || op == nir_op_umin || op == nir_op_imin)
6973 op = nir_op_iand;
6974 else if (op == nir_op_iadd)
6975 op = nir_op_ixor;
6976 else if (op == nir_op_umax || op == nir_op_imax)
6977 op = nir_op_ior;
6978 assert(op == nir_op_iand || op == nir_op_ior || op == nir_op_ixor);
6979
6980 switch (instr->intrinsic) {
6981 case nir_intrinsic_reduce:
6982 emit_wqm(ctx, emit_boolean_reduce(ctx, op, cluster_size, src), dst);
6983 break;
6984 case nir_intrinsic_exclusive_scan:
6985 emit_wqm(ctx, emit_boolean_exclusive_scan(ctx, op, src), dst);
6986 break;
6987 case nir_intrinsic_inclusive_scan:
6988 emit_wqm(ctx, emit_boolean_inclusive_scan(ctx, op, src), dst);
6989 break;
6990 default:
6991 assert(false);
6992 }
6993 } else if (cluster_size == 1) {
6994 bld.copy(Definition(dst), src);
6995 } else {
6996 src = as_vgpr(ctx, src);
6997
6998 ReduceOp reduce_op;
6999 switch (op) {
7000 #define CASE(name) case nir_op_##name: reduce_op = (src.regClass() == v1) ? name##32 : name##64; break;
7001 CASE(iadd)
7002 CASE(imul)
7003 CASE(fadd)
7004 CASE(fmul)
7005 CASE(imin)
7006 CASE(umin)
7007 CASE(fmin)
7008 CASE(imax)
7009 CASE(umax)
7010 CASE(fmax)
7011 CASE(iand)
7012 CASE(ior)
7013 CASE(ixor)
7014 default:
7015 unreachable("unknown reduction op");
7016 #undef CASE
7017 }
7018
7019 aco_opcode aco_op;
7020 switch (instr->intrinsic) {
7021 case nir_intrinsic_reduce: aco_op = aco_opcode::p_reduce; break;
7022 case nir_intrinsic_inclusive_scan: aco_op = aco_opcode::p_inclusive_scan; break;
7023 case nir_intrinsic_exclusive_scan: aco_op = aco_opcode::p_exclusive_scan; break;
7024 default:
7025 unreachable("unknown reduce intrinsic");
7026 }
7027
7028 aco_ptr<Pseudo_reduction_instruction> reduce{create_instruction<Pseudo_reduction_instruction>(aco_op, Format::PSEUDO_REDUCTION, 3, 5)};
7029 reduce->operands[0] = Operand(src);
7030 // filled in by aco_reduce_assign.cpp, used internally as part of the
7031 // reduce sequence
7032 assert(dst.size() == 1 || dst.size() == 2);
7033 reduce->operands[1] = Operand(RegClass(RegType::vgpr, dst.size()).as_linear());
7034 reduce->operands[2] = Operand(v1.as_linear());
7035
7036 Temp tmp_dst = bld.tmp(dst.regClass());
7037 reduce->definitions[0] = Definition(tmp_dst);
7038 reduce->definitions[1] = bld.def(ctx->program->lane_mask); // used internally
7039 reduce->definitions[2] = Definition();
7040 reduce->definitions[3] = Definition(scc, s1);
7041 reduce->definitions[4] = Definition();
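          /* Reading of the layout above (not separately documented): definitions[0] holds the NIR
           * result, definitions[1] is a lane-mask temporary and definitions[3] an scc clobber used
           * during lowering, and the remaining slots are left for aco_reduce_assign.cpp to fill in
           * if it needs them. */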
7042 reduce->reduce_op = reduce_op;
7043 reduce->cluster_size = cluster_size;
7044 ctx->block->instructions.emplace_back(std::move(reduce));
7045
7046 emit_wqm(ctx, tmp_dst, dst);
7047 }
7048 break;
7049 }
7050 case nir_intrinsic_quad_broadcast: {
7051 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
7052 if (!ctx->divergent_vals[instr->dest.ssa.index]) {
7053 emit_uniform_subgroup(ctx, instr, src);
7054 } else {
7055 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7056 unsigned lane = nir_src_as_const_value(instr->src[1])->u32;
7057 uint32_t dpp_ctrl = dpp_quad_perm(lane, lane, lane, lane);
7058
7059 if (instr->dest.ssa.bit_size == 1) {
7060 assert(src.regClass() == bld.lm);
7061 assert(dst.regClass() == bld.lm);
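             /* 0x11111111 sets one bit per nibble, i.e. per quad of lanes; shifting by `lane`
              * selects that lane's ballot bit in every quad, and the s_wqm below replicates the
              * surviving bit to all four lanes of its quad - a per-quad boolean broadcast. */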
7062 uint32_t half_mask = 0x11111111u << lane;
7063 Temp mask_tmp = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(half_mask), Operand(half_mask));
7064 Temp tmp = bld.tmp(bld.lm);
7065 bld.sop1(Builder::s_wqm, Definition(tmp),
7066 bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), mask_tmp,
7067 bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm))));
7068 emit_wqm(ctx, tmp, dst);
7069 } else if (instr->dest.ssa.bit_size == 32) {
7070 if (ctx->program->chip_class >= GFX8)
7071 emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl), dst);
7072 else
7073 emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl), dst);
7074 } else if (instr->dest.ssa.bit_size == 64) {
7075 Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
7076 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
7077 if (ctx->program->chip_class >= GFX8) {
7078 lo = emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), lo, dpp_ctrl));
7079 hi = emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), hi, dpp_ctrl));
7080 } else {
7081 lo = emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), lo, (1 << 15) | dpp_ctrl));
7082 hi = emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), hi, (1 << 15) | dpp_ctrl));
7083 }
7084 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
7085 emit_split_vector(ctx, dst, 2);
7086 } else {
7087 fprintf(stderr, "Unimplemented NIR instr bit size: ");
7088 nir_print_instr(&instr->instr, stderr);
7089 fprintf(stderr, "\n");
7090 }
7091 }
7092 break;
7093 }
7094 case nir_intrinsic_quad_swap_horizontal:
7095 case nir_intrinsic_quad_swap_vertical:
7096 case nir_intrinsic_quad_swap_diagonal:
7097 case nir_intrinsic_quad_swizzle_amd: {
7098 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
7099 if (!ctx->divergent_vals[instr->dest.ssa.index]) {
7100 emit_uniform_subgroup(ctx, instr, src);
7101 break;
7102 }
7103 uint16_t dpp_ctrl = 0;
7104 switch (instr->intrinsic) {
7105 case nir_intrinsic_quad_swap_horizontal:
7106 dpp_ctrl = dpp_quad_perm(1, 0, 3, 2);
7107 break;
7108 case nir_intrinsic_quad_swap_vertical:
7109 dpp_ctrl = dpp_quad_perm(2, 3, 0, 1);
7110 break;
7111 case nir_intrinsic_quad_swap_diagonal:
7112 dpp_ctrl = dpp_quad_perm(3, 2, 1, 0);
7113 break;
7114 case nir_intrinsic_quad_swizzle_amd:
7115 dpp_ctrl = nir_intrinsic_swizzle_mask(instr);
7116 break;
7117 default:
7118 break;
7119 }
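       /* Pre-GFX8 has no DPP, so the quad permute goes through ds_swizzle instead: with bit 15 of
        * the offset set, ds_swizzle operates in quad-permute mode and the low 8 bits carry the
        * same four 2-bit lane selects that dpp_quad_perm() encodes (assumption based on the
        * ds_swizzle offset encoding). */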
7120 if (ctx->program->chip_class < GFX8)
7121 dpp_ctrl |= (1 << 15);
7122
7123 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7124 if (instr->dest.ssa.bit_size == 1) {
7125 assert(src.regClass() == bld.lm);
7126 src = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), Operand((uint32_t)-1), src);
7127 if (ctx->program->chip_class >= GFX8)
7128 src = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl);
7129 else
7130 src = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, dpp_ctrl);
7131 Temp tmp = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u), src);
7132 emit_wqm(ctx, tmp, dst);
7133 } else if (instr->dest.ssa.bit_size == 32) {
7134 Temp tmp;
7135 if (ctx->program->chip_class >= GFX8)
7136 tmp = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl);
7137 else
7138 tmp = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, dpp_ctrl);
7139 emit_wqm(ctx, tmp, dst);
7140 } else if (instr->dest.ssa.bit_size == 64) {
7141 Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
7142 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
7143 if (ctx->program->chip_class >= GFX8) {
7144 lo = emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), lo, dpp_ctrl));
7145 hi = emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), hi, dpp_ctrl));
7146 } else {
7147 lo = emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), lo, dpp_ctrl));
7148 hi = emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), hi, dpp_ctrl));
7149 }
7150 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
7151 emit_split_vector(ctx, dst, 2);
7152 } else {
7153 fprintf(stderr, "Unimplemented NIR instr bit size: ");
7154 nir_print_instr(&instr->instr, stderr);
7155 fprintf(stderr, "\n");
7156 }
7157 break;
7158 }
7159 case nir_intrinsic_masked_swizzle_amd: {
7160 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
7161 if (!ctx->divergent_vals[instr->dest.ssa.index]) {
7162 emit_uniform_subgroup(ctx, instr, src);
7163 break;
7164 }
7165 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7166 uint32_t mask = nir_intrinsic_swizzle_mask(instr);
7167 if (dst.regClass() == v1) {
7168 emit_wqm(ctx,
7169 bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, mask, 0, false),
7170 dst);
7171 } else if (dst.regClass() == v2) {
7172 Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
7173 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
7174 lo = emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), lo, mask, 0, false));
7175 hi = emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), hi, mask, 0, false));
7176 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
7177 emit_split_vector(ctx, dst, 2);
7178 } else {
7179 fprintf(stderr, "Unimplemented NIR instr bit size: ");
7180 nir_print_instr(&instr->instr, stderr);
7181 fprintf(stderr, "\n");
7182 }
7183 break;
7184 }
7185 case nir_intrinsic_write_invocation_amd: {
7186 Temp src = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
7187 Temp val = bld.as_uniform(get_ssa_temp(ctx, instr->src[1].ssa));
7188 Temp lane = bld.as_uniform(get_ssa_temp(ctx, instr->src[2].ssa));
7189 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7190 if (dst.regClass() == v1) {
7191 /* src2 is ignored for writelane. RA assigns the same reg for dst */
7192 emit_wqm(ctx, bld.writelane(bld.def(v1), val, lane, src), dst);
7193 } else if (dst.regClass() == v2) {
7194 Temp src_lo = bld.tmp(v1), src_hi = bld.tmp(v1);
7195 Temp val_lo = bld.tmp(s1), val_hi = bld.tmp(s1);
7196 bld.pseudo(aco_opcode::p_split_vector, Definition(src_lo), Definition(src_hi), src);
7197 bld.pseudo(aco_opcode::p_split_vector, Definition(val_lo), Definition(val_hi), val);
7200 7198 Temp lo = emit_wqm(ctx, bld.writelane(bld.def(v1), val_lo, lane, src_lo));
7199 Temp hi = emit_wqm(ctx, bld.writelane(bld.def(v1), val_hi, lane, src_hi));
7200 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
7201 emit_split_vector(ctx, dst, 2);
7202 } else {
7203 fprintf(stderr, "Unimplemented NIR instr bit size: ");
7204 nir_print_instr(&instr->instr, stderr);
7205 fprintf(stderr, "\n");
7206 }
7207 break;
7208 }
7209 case nir_intrinsic_mbcnt_amd: {
7210 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
7211 RegClass rc = RegClass(src.type(), 1);
7212 Temp mask_lo = bld.tmp(rc), mask_hi = bld.tmp(rc);
7213 bld.pseudo(aco_opcode::p_split_vector, Definition(mask_lo), Definition(mask_hi), src);
7214 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7215 Temp wqm_tmp = emit_mbcnt(ctx, bld.def(v1), Operand(mask_lo), Operand(mask_hi));
7216 emit_wqm(ctx, wqm_tmp, dst);
7217 break;
7218 }
7219 case nir_intrinsic_load_helper_invocation: {
7220 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7221 bld.pseudo(aco_opcode::p_load_helper, Definition(dst));
7222 ctx->block->kind |= block_kind_needs_lowering;
7223 ctx->program->needs_exact = true;
7224 break;
7225 }
7226 case nir_intrinsic_is_helper_invocation: {
7227 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7228 bld.pseudo(aco_opcode::p_is_helper, Definition(dst));
7229 ctx->block->kind |= block_kind_needs_lowering;
7230 ctx->program->needs_exact = true;
7231 break;
7232 }
7233 case nir_intrinsic_demote:
7234 bld.pseudo(aco_opcode::p_demote_to_helper, Operand(-1u));
7235
7236 if (ctx->cf_info.loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
7237 ctx->cf_info.exec_potentially_empty_discard = true;
7238 ctx->block->kind |= block_kind_uses_demote;
7239 ctx->program->needs_exact = true;
7240 break;
7241 case nir_intrinsic_demote_if: {
7242 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
7243 assert(src.regClass() == bld.lm);
7244 Temp cond = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
7245 bld.pseudo(aco_opcode::p_demote_to_helper, cond);
7246
7247 if (ctx->cf_info.loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
7248 ctx->cf_info.exec_potentially_empty_discard = true;
7249 ctx->block->kind |= block_kind_uses_demote;
7250 ctx->program->needs_exact = true;
7251 break;
7252 }
7253 case nir_intrinsic_first_invocation: {
7254 emit_wqm(ctx, bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm)),
7255 get_ssa_temp(ctx, &instr->dest.ssa));
7256 break;
7257 }
7258 case nir_intrinsic_shader_clock:
7259 bld.smem(aco_opcode::s_memtime, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), false);
7260 emit_split_vector(ctx, get_ssa_temp(ctx, &instr->dest.ssa), 2);
7261 break;
7262 case nir_intrinsic_load_vertex_id_zero_base: {
7263 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7264 bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.vertex_id));
7265 break;
7266 }
7267 case nir_intrinsic_load_first_vertex: {
7268 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7269 bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.base_vertex));
7270 break;
7271 }
7272 case nir_intrinsic_load_base_instance: {
7273 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7274 bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.start_instance));
7275 break;
7276 }
7277 case nir_intrinsic_load_instance_id: {
7278 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7279 bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.instance_id));
7280 break;
7281 }
7282 case nir_intrinsic_load_draw_id: {
7283 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7284 bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.draw_id));
7285 break;
7286 }
7287 case nir_intrinsic_load_invocation_id: {
7288 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7289
7290 if (ctx->shader->info.stage == MESA_SHADER_GEOMETRY) {
7291 if (ctx->options->chip_class >= GFX10)
7292 bld.vop2_e64(aco_opcode::v_and_b32, Definition(dst), Operand(127u), get_arg(ctx, ctx->args->ac.gs_invocation_id));
7293 else
7294 bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.gs_invocation_id));
7295 } else if (ctx->shader->info.stage == MESA_SHADER_TESS_CTRL) {
7296 bld.vop3(aco_opcode::v_bfe_u32, Definition(dst),
7297 get_arg(ctx, ctx->args->ac.tcs_rel_ids), Operand(8u), Operand(5u));
7298 } else {
7299 unreachable("Unsupported stage for load_invocation_id");
7300 }
7301
7302 break;
7303 }
7304 case nir_intrinsic_load_primitive_id: {
7305 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7306
7307 switch (ctx->shader->info.stage) {
7308 case MESA_SHADER_GEOMETRY:
7309 bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.gs_prim_id));
7310 break;
7311 case MESA_SHADER_TESS_CTRL:
7312 bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.tcs_patch_id));
7313 break;
7314 case MESA_SHADER_TESS_EVAL:
7315 bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.tes_patch_id));
7316 break;
7317 default:
7318 unreachable("Unimplemented shader stage for nir_intrinsic_load_primitive_id");
7319 }
7320
7321 break;
7322 }
7323 case nir_intrinsic_load_patch_vertices_in: {
7324 assert(ctx->shader->info.stage == MESA_SHADER_TESS_CTRL ||
7325 ctx->shader->info.stage == MESA_SHADER_TESS_EVAL);
7326
7327 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7328 bld.copy(Definition(dst), Operand(ctx->args->options->key.tcs.input_vertices));
7329 break;
7330 }
7331 case nir_intrinsic_emit_vertex_with_counter: {
7332 visit_emit_vertex_with_counter(ctx, instr);
7333 break;
7334 }
7335 case nir_intrinsic_end_primitive_with_counter: {
7336 unsigned stream = nir_intrinsic_stream_id(instr);
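          /* sendmsg_gs(true, false, stream) builds the GS "cut" message (cut=true, emit=false,
           * with the stream id encoded in the immediate - assumed parameter order); it tells the
           * geometry engine that the current output primitive on this stream is finished, and m0
           * must hold the GS wave id so the message is routed to the right wave. */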
7337 bld.sopp(aco_opcode::s_sendmsg, bld.m0(ctx->gs_wave_id), -1, sendmsg_gs(true, false, stream));
7338 break;
7339 }
7340 case nir_intrinsic_set_vertex_count: {
7341 /* unused, the HW keeps track of this for us */
7342 break;
7343 }
7344 default:
7345 fprintf(stderr, "Unimplemented intrinsic instr: ");
7346 nir_print_instr(&instr->instr, stderr);
7347 fprintf(stderr, "\n");
7348 abort();
7349
7350 break;
7351 }
7352 }
7353
7354
7355 void tex_fetch_ptrs(isel_context *ctx, nir_tex_instr *instr,
7356 Temp *res_ptr, Temp *samp_ptr, Temp *fmask_ptr,
7357 enum glsl_base_type *stype)
7358 {
7359 nir_deref_instr *texture_deref_instr = NULL;
7360 nir_deref_instr *sampler_deref_instr = NULL;
7361 int plane = -1;
7362
7363 for (unsigned i = 0; i < instr->num_srcs; i++) {
7364 switch (instr->src[i].src_type) {
7365 case nir_tex_src_texture_deref:
7366 texture_deref_instr = nir_src_as_deref(instr->src[i].src);
7367 break;
7368 case nir_tex_src_sampler_deref:
7369 sampler_deref_instr = nir_src_as_deref(instr->src[i].src);
7370 break;
7371 case nir_tex_src_plane:
7372 plane = nir_src_as_int(instr->src[i].src);
7373 break;
7374 default:
7375 break;
7376 }
7377 }
7378
7379 *stype = glsl_get_sampler_result_type(texture_deref_instr->type);
7380
7381 if (!sampler_deref_instr)
7382 sampler_deref_instr = texture_deref_instr;
7383
7384 if (plane >= 0) {
7385 assert(instr->op != nir_texop_txf_ms &&
7386 instr->op != nir_texop_samples_identical);
7387 assert(instr->sampler_dim != GLSL_SAMPLER_DIM_BUF);
7388 *res_ptr = get_sampler_desc(ctx, texture_deref_instr, (aco_descriptor_type)(ACO_DESC_PLANE_0 + plane), instr, false, false);
7389 } else if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) {
7390 *res_ptr = get_sampler_desc(ctx, texture_deref_instr, ACO_DESC_BUFFER, instr, false, false);
7391 } else if (instr->op == nir_texop_fragment_mask_fetch) {
7392 *res_ptr = get_sampler_desc(ctx, texture_deref_instr, ACO_DESC_FMASK, instr, false, false);
7393 } else {
7394 *res_ptr = get_sampler_desc(ctx, texture_deref_instr, ACO_DESC_IMAGE, instr, false, false);
7395 }
7396 if (samp_ptr) {
7397 *samp_ptr = get_sampler_desc(ctx, sampler_deref_instr, ACO_DESC_SAMPLER, instr, false, false);
7398
7399 if (instr->sampler_dim < GLSL_SAMPLER_DIM_RECT && ctx->options->chip_class < GFX8) {
7400 /* fix sampler aniso on SI/CI: samp[0] = samp[0] & img[7] */
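             /* Background (same workaround as the LLVM path, as understood): on SI/CI the driver
              * writes a mask into img[7] that clears MAX_ANISO_RATIO when BASE_LEVEL == LAST_LEVEL,
              * so ANDing samp[0] with img[7] disables aniso for single-level images; GFX8+ handles
              * this via the ANISO_OVERRIDE sampler bit and skips this path. */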
7401 Builder bld(ctx->program, ctx->block);
7402
7403 /* to avoid unnecessary moves, we split and recombine sampler and image */
7404 Temp img[8] = {bld.tmp(s1), bld.tmp(s1), bld.tmp(s1), bld.tmp(s1),
7405 bld.tmp(s1), bld.tmp(s1), bld.tmp(s1), bld.tmp(s1)};
7406 Temp samp[4] = {bld.tmp(s1), bld.tmp(s1), bld.tmp(s1), bld.tmp(s1)};
7407 bld.pseudo(aco_opcode::p_split_vector, Definition(img[0]), Definition(img[1]),
7408 Definition(img[2]), Definition(img[3]), Definition(img[4]),
7409 Definition(img[5]), Definition(img[6]), Definition(img[7]), *res_ptr);
7410 bld.pseudo(aco_opcode::p_split_vector, Definition(samp[0]), Definition(samp[1]),
7411 Definition(samp[2]), Definition(samp[3]), *samp_ptr);
7412
7413 samp[0] = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), samp[0], img[7]);
7414 *res_ptr = bld.pseudo(aco_opcode::p_create_vector, bld.def(s8),
7415 img[0], img[1], img[2], img[3],
7416 img[4], img[5], img[6], img[7]);
7417 *samp_ptr = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4),
7418 samp[0], samp[1], samp[2], samp[3]);
7419 }
7420 }
7421 if (fmask_ptr && (instr->op == nir_texop_txf_ms ||
7422 instr->op == nir_texop_samples_identical))
7423 *fmask_ptr = get_sampler_desc(ctx, texture_deref_instr, ACO_DESC_FMASK, instr, false, false);
7424 }
7425
7426 void build_cube_select(isel_context *ctx, Temp ma, Temp id, Temp deriv,
7427 Temp *out_ma, Temp *out_sc, Temp *out_tc)
7428 {
7429 Builder bld(ctx->program, ctx->block);
7430
7431 Temp deriv_x = emit_extract_vector(ctx, deriv, 0, v1);
7432 Temp deriv_y = emit_extract_vector(ctx, deriv, 1, v1);
7433 Temp deriv_z = emit_extract_vector(ctx, deriv, 2, v1);
7434
7435 Operand neg_one(0xbf800000u);
7436 Operand one(0x3f800000u);
7437 Operand two(0x40000000u);
7438 Operand four(0x40800000u);
7439
7440 Temp is_ma_positive = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), ma);
7441 Temp sgn_ma = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), neg_one, one, is_ma_positive);
7442 Temp neg_sgn_ma = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), Operand(0u), sgn_ma);
7443
7444 Temp is_ma_z = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(bld.lm)), four, id);
7445 Temp is_ma_y = bld.vopc(aco_opcode::v_cmp_le_f32, bld.def(bld.lm), two, id);
7446 is_ma_y = bld.sop2(Builder::s_andn2, bld.hint_vcc(bld.def(bld.lm)), is_ma_y, is_ma_z);
7447 Temp is_not_ma_x = bld.sop2(aco_opcode::s_or_b64, bld.hint_vcc(bld.def(bld.lm)), bld.def(s1, scc), is_ma_z, is_ma_y);
7448
7449 // select sc
7450 Temp tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), deriv_z, deriv_x, is_not_ma_x);
7451 Temp sgn = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1),
7452 bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), neg_sgn_ma, sgn_ma, is_ma_z),
7453 one, is_ma_y);
7454 *out_sc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), tmp, sgn);
7455
7456 // select tc
7457 tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), deriv_y, deriv_z, is_ma_y);
7458 sgn = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), neg_one, sgn_ma, is_ma_y);
7459 *out_tc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), tmp, sgn);
7460
7461 // select ma
7462 tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
7463 bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), deriv_x, deriv_y, is_ma_y),
7464 deriv_z, is_ma_z);
7465 tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7fffffffu), tmp);
7466 *out_ma = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), two, tmp);
7467 }
7468
7469 void prepare_cube_coords(isel_context *ctx, std::vector<Temp>& coords, Temp* ddx, Temp* ddy, bool is_deriv, bool is_array)
7470 {
7471 Builder bld(ctx->program, ctx->block);
7472 Temp ma, tc, sc, id;
7473
7474 if (is_array) {
7475 coords[3] = bld.vop1(aco_opcode::v_rndne_f32, bld.def(v1), coords[3]);
7476
7477 // see comment in ac_prepare_cube_coords()
7478 if (ctx->options->chip_class <= GFX8)
7479 coords[3] = bld.vop2(aco_opcode::v_max_f32, bld.def(v1), Operand(0u), coords[3]);
7480 }
7481
7482 ma = bld.vop3(aco_opcode::v_cubema_f32, bld.def(v1), coords[0], coords[1], coords[2]);
7483
7484 aco_ptr<VOP3A_instruction> vop3a{create_instruction<VOP3A_instruction>(aco_opcode::v_rcp_f32, asVOP3(Format::VOP1), 1, 1)};
7485 vop3a->operands[0] = Operand(ma);
7486 vop3a->abs[0] = true;
7487 Temp invma = bld.tmp(v1);
7488 vop3a->definitions[0] = Definition(invma);
7489 ctx->block->instructions.emplace_back(std::move(vop3a));
7490
7491 sc = bld.vop3(aco_opcode::v_cubesc_f32, bld.def(v1), coords[0], coords[1], coords[2]);
7492 if (!is_deriv)
7493 sc = bld.vop2(aco_opcode::v_madak_f32, bld.def(v1), sc, invma, Operand(0x3fc00000u/*1.5*/));
7494
7495 tc = bld.vop3(aco_opcode::v_cubetc_f32, bld.def(v1), coords[0], coords[1], coords[2]);
7496 if (!is_deriv)
7497 tc = bld.vop2(aco_opcode::v_madak_f32, bld.def(v1), tc, invma, Operand(0x3fc00000u/*1.5*/));
7498
7499 id = bld.vop3(aco_opcode::v_cubeid_f32, bld.def(v1), coords[0], coords[1], coords[2]);
7500
7501 if (is_deriv) {
7502 sc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), sc, invma);
7503 tc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), tc, invma);
7504
7505 for (unsigned i = 0; i < 2; i++) {
7506 // see comment in ac_prepare_cube_coords()
7507 Temp deriv_ma;
7508 Temp deriv_sc, deriv_tc;
7509 build_cube_select(ctx, ma, id, i ? *ddy : *ddx,
7510 &deriv_ma, &deriv_sc, &deriv_tc);
7511
7512 deriv_ma = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_ma, invma);
7513
7514 Temp x = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1),
7515 bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_sc, invma),
7516 bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_ma, sc));
7517 Temp y = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1),
7518 bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_tc, invma),
7519 bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_ma, tc));
7520 *(i ? ddy : ddx) = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), x, y);
7521 }
7522
7523 sc = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), Operand(0x3fc00000u/*1.5*/), sc);
7524 tc = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), Operand(0x3fc00000u/*1.5*/), tc);
7525 }
7526
7527 if (is_array)
7528 id = bld.vop2(aco_opcode::v_madmk_f32, bld.def(v1), coords[3], id, Operand(0x41000000u/*8.0*/));
7529 coords.resize(3);
7530 coords[0] = sc;
7531 coords[1] = tc;
7532 coords[2] = id;
7533 }
7534
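 /* Collect, per component, the constant source of the vecN ALU instruction that produced `vec`,
  * leaving NULL for components that are not constant (or are swizzled). Used below to fold
  * constant texel offsets into a packed immediate. */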
7535 void get_const_vec(nir_ssa_def *vec, nir_const_value *cv[4])
7536 {
7537 if (vec->parent_instr->type != nir_instr_type_alu)
7538 return;
7539 nir_alu_instr *vec_instr = nir_instr_as_alu(vec->parent_instr);
7540 if (vec_instr->op != nir_op_vec(vec->num_components))
7541 return;
7542
7543 for (unsigned i = 0; i < vec->num_components; i++) {
7544 cv[i] = vec_instr->src[i].swizzle[0] == 0 ?
7545 nir_src_as_const_value(vec_instr->src[i].src) : NULL;
7546 }
7547 }
7548
7549 void visit_tex(isel_context *ctx, nir_tex_instr *instr)
7550 {
7551 Builder bld(ctx->program, ctx->block);
7552 bool has_bias = false, has_lod = false, level_zero = false, has_compare = false,
7553 has_offset = false, has_ddx = false, has_ddy = false, has_derivs = false, has_sample_index = false;
7554 Temp resource, sampler, fmask_ptr, bias = Temp(), compare = Temp(), sample_index = Temp(),
7555 lod = Temp(), offset = Temp(), ddx = Temp(), ddy = Temp();
7556 std::vector<Temp> coords;
7557 std::vector<Temp> derivs;
7558 nir_const_value *sample_index_cv = NULL;
7559 nir_const_value *const_offset[4] = {NULL, NULL, NULL, NULL};
7560 enum glsl_base_type stype;
7561 tex_fetch_ptrs(ctx, instr, &resource, &sampler, &fmask_ptr, &stype);
7562
7563 bool tg4_integer_workarounds = ctx->options->chip_class <= GFX8 && instr->op == nir_texop_tg4 &&
7564 (stype == GLSL_TYPE_UINT || stype == GLSL_TYPE_INT);
7565 bool tg4_integer_cube_workaround = tg4_integer_workarounds &&
7566 instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE;
7567
7568 for (unsigned i = 0; i < instr->num_srcs; i++) {
7569 switch (instr->src[i].src_type) {
7570 case nir_tex_src_coord: {
7571 Temp coord = get_ssa_temp(ctx, instr->src[i].src.ssa);
7572 for (unsigned i = 0; i < coord.size(); i++)
7573 coords.emplace_back(emit_extract_vector(ctx, coord, i, v1));
7574 break;
7575 }
7576 case nir_tex_src_bias:
7577 if (instr->op == nir_texop_txb) {
7578 bias = get_ssa_temp(ctx, instr->src[i].src.ssa);
7579 has_bias = true;
7580 }
7581 break;
7582 case nir_tex_src_lod: {
7583 nir_const_value *val = nir_src_as_const_value(instr->src[i].src);
7584
7585 if (val && val->f32 <= 0.0) {
7586 level_zero = true;
7587 } else {
7588 lod = get_ssa_temp(ctx, instr->src[i].src.ssa);
7589 has_lod = true;
7590 }
7591 break;
7592 }
7593 case nir_tex_src_comparator:
7594 if (instr->is_shadow) {
7595 compare = get_ssa_temp(ctx, instr->src[i].src.ssa);
7596 has_compare = true;
7597 }
7598 break;
7599 case nir_tex_src_offset:
7600 offset = get_ssa_temp(ctx, instr->src[i].src.ssa);
7601 get_const_vec(instr->src[i].src.ssa, const_offset);
7602 has_offset = true;
7603 break;
7604 case nir_tex_src_ddx:
7605 ddx = get_ssa_temp(ctx, instr->src[i].src.ssa);
7606 has_ddx = true;
7607 break;
7608 case nir_tex_src_ddy:
7609 ddy = get_ssa_temp(ctx, instr->src[i].src.ssa);
7610 has_ddy = true;
7611 break;
7612 case nir_tex_src_ms_index:
7613 sample_index = get_ssa_temp(ctx, instr->src[i].src.ssa);
7614 sample_index_cv = nir_src_as_const_value(instr->src[i].src);
7615 has_sample_index = true;
7616 break;
7617 case nir_tex_src_texture_offset:
7618 case nir_tex_src_sampler_offset:
7619 default:
7620 break;
7621 }
7622 }
7623
7624 if (instr->op == nir_texop_txs && instr->sampler_dim == GLSL_SAMPLER_DIM_BUF)
7625 return get_buffer_size(ctx, resource, get_ssa_temp(ctx, &instr->dest.ssa), true);
7626
7627 if (instr->op == nir_texop_texture_samples) {
7628 Temp dword3 = emit_extract_vector(ctx, resource, 3, s1);
7629
7630 Temp samples_log2 = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), dword3, Operand(16u | 4u<<16));
7631 Temp samples = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), Operand(1u), samples_log2);
7632 Temp type = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), dword3, Operand(28u | 4u<<16 /* offset=28, width=4 */));
7633 Temp is_msaa = bld.sopc(aco_opcode::s_cmp_ge_u32, bld.def(s1, scc), type, Operand(14u));
7634
7635 bld.sop2(aco_opcode::s_cselect_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
7636 samples, Operand(1u), bld.scc(is_msaa));
7637 return;
7638 }
7639
7640 if (has_offset && instr->op != nir_texop_txf && instr->op != nir_texop_txf_ms) {
7641 aco_ptr<Instruction> tmp_instr;
7642 Temp acc, pack = Temp();
7643
7644 uint32_t pack_const = 0;
7645 for (unsigned i = 0; i < offset.size(); i++) {
7646 if (!const_offset[i])
7647 continue;
7648 pack_const |= (const_offset[i]->u32 & 0x3Fu) << (8u * i);
7649 }
7650
7651 if (offset.type() == RegType::sgpr) {
7652 for (unsigned i = 0; i < offset.size(); i++) {
7653 if (const_offset[i])
7654 continue;
7655
7656 acc = emit_extract_vector(ctx, offset, i, s1);
7657 acc = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), acc, Operand(0x3Fu));
7658
7659 if (i) {
7660 acc = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), acc, Operand(8u * i));
7661 }
7662
7663 if (pack == Temp()) {
7664 pack = acc;
7665 } else {
7666 pack = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), pack, acc);
7667 }
7668 }
7669
7670 if (pack_const && pack != Temp())
7671 pack = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), Operand(pack_const), pack);
7672 } else {
7673 for (unsigned i = 0; i < offset.size(); i++) {
7674 if (const_offset[i])
7675 continue;
7676
7677 acc = emit_extract_vector(ctx, offset, i, v1);
7678 acc = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x3Fu), acc);
7679
7680 if (i) {
7681 acc = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(8u * i), acc);
7682 }
7683
7684 if (pack == Temp()) {
7685 pack = acc;
7686 } else {
7687 pack = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), pack, acc);
7688 }
7689 }
7690
7691 if (pack_const && pack != Temp())
7694 7692 pack = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand(pack_const), pack);
7693 }
7694 if (pack_const && pack == Temp())
7695 offset = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(pack_const));
7696 else if (pack == Temp())
7697 has_offset = false;
7698 else
7699 offset = pack;
7700 }
7701
7702 if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE && instr->coord_components)
7703 prepare_cube_coords(ctx, coords, &ddx, &ddy, instr->op == nir_texop_txd, instr->is_array && instr->op != nir_texop_lod);
7704
7705 /* pack derivatives */
7706 if (has_ddx || has_ddy) {
7707 if (instr->sampler_dim == GLSL_SAMPLER_DIM_1D && ctx->options->chip_class == GFX9) {
7708 assert(has_ddx && has_ddy && ddx.size() == 1 && ddy.size() == 1);
7709 Temp zero = bld.copy(bld.def(v1), Operand(0u));
7712 7710 derivs = {ddx, zero, ddy, zero};
7711 } else {
7712 for (unsigned i = 0; has_ddx && i < ddx.size(); i++)
7713 derivs.emplace_back(emit_extract_vector(ctx, ddx, i, v1));
7714 for (unsigned i = 0; has_ddy && i < ddy.size(); i++)
7715 derivs.emplace_back(emit_extract_vector(ctx, ddy, i, v1));
7716 }
7717 has_derivs = true;
7718 }
7719
7720 if (instr->coord_components > 1 &&
7721 instr->sampler_dim == GLSL_SAMPLER_DIM_1D &&
7722 instr->is_array &&
7723 instr->op != nir_texop_txf)
7724 coords[1] = bld.vop1(aco_opcode::v_rndne_f32, bld.def(v1), coords[1]);
7725
7726 if (instr->coord_components > 2 &&
7727 (instr->sampler_dim == GLSL_SAMPLER_DIM_2D ||
7728 instr->sampler_dim == GLSL_SAMPLER_DIM_MS ||
7729 instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS ||
7730 instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS) &&
7731 instr->is_array &&
7732 instr->op != nir_texop_txf &&
7733 instr->op != nir_texop_txf_ms &&
7734 instr->op != nir_texop_fragment_fetch &&
7735 instr->op != nir_texop_fragment_mask_fetch)
7736 coords[2] = bld.vop1(aco_opcode::v_rndne_f32, bld.def(v1), coords[2]);
7737
7738 if (ctx->options->chip_class == GFX9 &&
7739 instr->sampler_dim == GLSL_SAMPLER_DIM_1D &&
7740 instr->op != nir_texop_lod && instr->coord_components) {
7741 assert(coords.size() > 0 && coords.size() < 3);
7742
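       /* GFX9 has no true 1D images; they are addressed as 2D, so a constant second coordinate is
        * inserted: 0 for txf (integer texel coordinates) and 0.5 (0x3f000000) for filtered ops so
        * the sample lands in the middle of the single row. */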
7743 coords.insert(std::next(coords.begin()), bld.copy(bld.def(v1), instr->op == nir_texop_txf ?
7744 Operand((uint32_t) 0) :
7745 Operand((uint32_t) 0x3f000000)));
7746 }
7747
7748 bool da = should_declare_array(ctx, instr->sampler_dim, instr->is_array);
7749
7750 if (instr->op == nir_texop_samples_identical)
7751 resource = fmask_ptr;
7752
7753 else if ((instr->sampler_dim == GLSL_SAMPLER_DIM_MS ||
7754 instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS) &&
7755 instr->op != nir_texop_txs &&
7756 instr->op != nir_texop_fragment_fetch &&
7757 instr->op != nir_texop_fragment_mask_fetch) {
7758 assert(has_sample_index);
7759 Operand op(sample_index);
7760 if (sample_index_cv)
7761 op = Operand(sample_index_cv->u32);
7762 sample_index = adjust_sample_index_using_fmask(ctx, da, coords, op, fmask_ptr);
7763 }
7764
7765 if (has_offset && (instr->op == nir_texop_txf || instr->op == nir_texop_txf_ms)) {
7766 for (unsigned i = 0; i < std::min(offset.size(), instr->coord_components); i++) {
7767 Temp off = emit_extract_vector(ctx, offset, i, v1);
7768 coords[i] = bld.vadd32(bld.def(v1), coords[i], off);
7769 }
7770 has_offset = false;
7771 }
7772
7773 /* Build tex instruction */
7774 unsigned dmask = nir_ssa_def_components_read(&instr->dest.ssa);
7775 unsigned dim = ctx->options->chip_class >= GFX10 && instr->sampler_dim != GLSL_SAMPLER_DIM_BUF
7776 ? ac_get_sampler_dim(ctx->options->chip_class, instr->sampler_dim, instr->is_array)
7777 : 0;
7778 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7779 Temp tmp_dst = dst;
7780
7781 /* gather4 selects the component by dmask and always returns vec4 */
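       /* e.g. gathering component 1 of a non-shadow tg4 uses dmask = 0b0010; the hardware still
        * returns four dwords (the selected component of each of the four gathered texels), which
        * is why tmp_dst may need to be a fresh v4 below. */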
7782 if (instr->op == nir_texop_tg4) {
7783 assert(instr->dest.ssa.num_components == 4);
7784 if (instr->is_shadow)
7785 dmask = 1;
7786 else
7787 dmask = 1 << instr->component;
7788 if (tg4_integer_cube_workaround || dst.type() == RegType::sgpr)
7789 tmp_dst = bld.tmp(v4);
7790 } else if (instr->op == nir_texop_samples_identical) {
7791 tmp_dst = bld.tmp(v1);
7792 } else if (util_bitcount(dmask) != instr->dest.ssa.num_components || dst.type() == RegType::sgpr) {
7793 tmp_dst = bld.tmp(RegClass(RegType::vgpr, util_bitcount(dmask)));
7794 }
7795
7796 aco_ptr<MIMG_instruction> tex;
7797 if (instr->op == nir_texop_txs || instr->op == nir_texop_query_levels) {
7798 if (!has_lod)
7799 lod = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0u));
7800
7801 bool div_by_6 = instr->op == nir_texop_txs &&
7802 instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE &&
7803 instr->is_array &&
7804 (dmask & (1 << 2));
7805 if (tmp_dst.id() == dst.id() && div_by_6)
7806 tmp_dst = bld.tmp(tmp_dst.regClass());
7807
7808 tex.reset(create_instruction<MIMG_instruction>(aco_opcode::image_get_resinfo, Format::MIMG, 3, 1));
7809 tex->operands[0] = Operand(resource);
7810 tex->operands[1] = Operand(s4); /* no sampler */
7811 tex->operands[2] = Operand(as_vgpr(ctx,lod));
7812 if (ctx->options->chip_class == GFX9 &&
7813 instr->op == nir_texop_txs &&
7814 instr->sampler_dim == GLSL_SAMPLER_DIM_1D &&
7815 instr->is_array) {
7816 tex->dmask = (dmask & 0x1) | ((dmask & 0x2) << 1);
7817 } else if (instr->op == nir_texop_query_levels) {
7818 tex->dmask = 1 << 3;
7819 } else {
7820 tex->dmask = dmask;
7821 }
7822 tex->da = da;
7823 tex->definitions[0] = Definition(tmp_dst);
7824 tex->dim = dim;
7825 tex->can_reorder = true;
7826 ctx->block->instructions.emplace_back(std::move(tex));
7827
7828 if (div_by_6) {
7829 /* divide 3rd value by 6 by multiplying with magic number */
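             /* 0x2AAAAAAB = 715827883, i.e. 2^32/6 rounded up: v_mul_hi_i32(layers * 6, 0x2AAAAAAB)
              * keeps the high 32 bits of the product, which is floor((layers * 6) / 6) = layers for
              * any realistic layer count, so the cube array size comes out exact. */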
7830 emit_split_vector(ctx, tmp_dst, tmp_dst.size());
7831 Temp c = bld.copy(bld.def(s1), Operand((uint32_t) 0x2AAAAAAB));
7832 Temp by_6 = bld.vop3(aco_opcode::v_mul_hi_i32, bld.def(v1), emit_extract_vector(ctx, tmp_dst, 2, v1), c);
7833 assert(instr->dest.ssa.num_components == 3);
7834 Temp tmp = dst.type() == RegType::vgpr ? dst : bld.tmp(v3);
7835 tmp_dst = bld.pseudo(aco_opcode::p_create_vector, Definition(tmp),
7836 emit_extract_vector(ctx, tmp_dst, 0, v1),
7837 emit_extract_vector(ctx, tmp_dst, 1, v1),
7838 by_6);
7839
7840 }
7841
7842 expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, dmask);
7843 return;
7844 }
7845
7846 Temp tg4_compare_cube_wa64 = Temp();
7847
7848 if (tg4_integer_workarounds) {
7849 tex.reset(create_instruction<MIMG_instruction>(aco_opcode::image_get_resinfo, Format::MIMG, 3, 1));
7850 tex->operands[0] = Operand(resource);
7851 tex->operands[1] = Operand(s4); /* no sampler */
7852 tex->operands[2] = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0u));
7853 tex->dim = dim;
7854 tex->dmask = 0x3;
7855 tex->da = da;
7856 Temp size = bld.tmp(v2);
7857 tex->definitions[0] = Definition(size);
7858 tex->can_reorder = true;
7859 ctx->block->instructions.emplace_back(std::move(tex));
7860 emit_split_vector(ctx, size, size.size());
7861
7862 Temp half_texel[2];
7863 for (unsigned i = 0; i < 2; i++) {
7864 half_texel[i] = emit_extract_vector(ctx, size, i, v1);
7865 half_texel[i] = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), half_texel[i]);
7866 half_texel[i] = bld.vop1(aco_opcode::v_rcp_iflag_f32, bld.def(v1), half_texel[i]);
7867 half_texel[i] = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0xbf000000/*-0.5*/), half_texel[i]);
7868 }
7869
7870 Temp new_coords[2] = {
7871 bld.vop2(aco_opcode::v_add_f32, bld.def(v1), coords[0], half_texel[0]),
7872 bld.vop2(aco_opcode::v_add_f32, bld.def(v1), coords[1], half_texel[1])
7873 };
7874
7875 if (tg4_integer_cube_workaround) {
7876 // see comment in ac_nir_to_llvm.c's lower_gather4_integer()
7877 Temp desc[resource.size()];
7878 aco_ptr<Instruction> split{create_instruction<Pseudo_instruction>(aco_opcode::p_split_vector,
7879 Format::PSEUDO, 1, resource.size())};
7880 split->operands[0] = Operand(resource);
7881 for (unsigned i = 0; i < resource.size(); i++) {
7882 desc[i] = bld.tmp(s1);
7883 split->definitions[i] = Definition(desc[i]);
7884 }
7885 ctx->block->instructions.emplace_back(std::move(split));
7886
7887 Temp dfmt = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), desc[1], Operand(20u | (6u << 16)));
7888 Temp compare_cube_wa = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), dfmt,
7889 Operand((uint32_t)V_008F14_IMG_DATA_FORMAT_8_8_8_8));
7890
7891 Temp nfmt;
7892 if (stype == GLSL_TYPE_UINT) {
7893 nfmt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1),
7894 Operand((uint32_t)V_008F14_IMG_NUM_FORMAT_USCALED),
7895 Operand((uint32_t)V_008F14_IMG_NUM_FORMAT_UINT),
7896 bld.scc(compare_cube_wa));
7897 } else {
7898 nfmt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1),
7899 Operand((uint32_t)V_008F14_IMG_NUM_FORMAT_SSCALED),
7900 Operand((uint32_t)V_008F14_IMG_NUM_FORMAT_SINT),
7901 bld.scc(compare_cube_wa));
7902 }
7903 tg4_compare_cube_wa64 = bld.tmp(bld.lm);
7904 bool_to_vector_condition(ctx, compare_cube_wa, tg4_compare_cube_wa64);
7905
7906 nfmt = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), nfmt, Operand(26u));
7907
7908 desc[1] = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), desc[1],
7909 Operand((uint32_t)C_008F14_NUM_FORMAT));
7910 desc[1] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), desc[1], nfmt);
7911
7912 aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector,
7913 Format::PSEUDO, resource.size(), 1)};
7914 for (unsigned i = 0; i < resource.size(); i++)
7915 vec->operands[i] = Operand(desc[i]);
7916 resource = bld.tmp(resource.regClass());
7917 vec->definitions[0] = Definition(resource);
7918 ctx->block->instructions.emplace_back(std::move(vec));
7919
7920 new_coords[0] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
7921 new_coords[0], coords[0], tg4_compare_cube_wa64);
7922 new_coords[1] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
7923 new_coords[1], coords[1], tg4_compare_cube_wa64);
7924 }
7925 coords[0] = new_coords[0];
7926 coords[1] = new_coords[1];
7927 }
7928
7929 if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) {
7930 //FIXME: if (ctx->abi->gfx9_stride_size_workaround) return ac_build_buffer_load_format_gfx9_safe()
7931
7932 assert(coords.size() == 1);
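       /* util_last_bit gives the index of the highest component actually read, so e.g. reading
        * only .x and .z still loads xyz; unread components are dropped again by expand_vector at
        * the end. */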
7933 unsigned last_bit = util_last_bit(nir_ssa_def_components_read(&instr->dest.ssa));
7934 aco_opcode op;
7935 switch (last_bit) {
7936 case 1:
7937 op = aco_opcode::buffer_load_format_x; break;
7938 case 2:
7939 op = aco_opcode::buffer_load_format_xy; break;
7940 case 3:
7941 op = aco_opcode::buffer_load_format_xyz; break;
7942 case 4:
7943 op = aco_opcode::buffer_load_format_xyzw; break;
7944 default:
7945 unreachable("Tex instruction loads more than 4 components.");
7946 }
7947
7948 /* if the instruction return value matches exactly the nir dest ssa, we can use it directly */
7949 if (last_bit == instr->dest.ssa.num_components && dst.type() == RegType::vgpr)
7950 tmp_dst = dst;
7951 else
7952 tmp_dst = bld.tmp(RegType::vgpr, last_bit);
7953
7954 aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)};
7955 mubuf->operands[0] = Operand(resource);
7956 mubuf->operands[1] = Operand(coords[0]);
7957 mubuf->operands[2] = Operand((uint32_t) 0);
7958 mubuf->definitions[0] = Definition(tmp_dst);
7959 mubuf->idxen = true;
7960 mubuf->can_reorder = true;
7961 ctx->block->instructions.emplace_back(std::move(mubuf));
7962
7963 expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, (1 << last_bit) - 1);
7964 return;
7965 }
7966
7967 /* gather MIMG address components */
7968 std::vector<Temp> args;
7969 if (has_offset)
7970 args.emplace_back(offset);
7971 if (has_bias)
7972 args.emplace_back(bias);
7973 if (has_compare)
7974 args.emplace_back(compare);
7975 if (has_derivs)
7976 args.insert(args.end(), derivs.begin(), derivs.end());
7977
7978 args.insert(args.end(), coords.begin(), coords.end());
7979 if (has_sample_index)
7980 args.emplace_back(sample_index);
7981 if (has_lod)
7982 args.emplace_back(lod);
7983
7984 Temp arg = bld.tmp(RegClass(RegType::vgpr, args.size()));
7985 aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, args.size(), 1)};
7986 vec->definitions[0] = Definition(arg);
7987 for (unsigned i = 0; i < args.size(); i++)
7988 vec->operands[i] = Operand(args[i]);
7989 ctx->block->instructions.emplace_back(std::move(vec));
7990
7991
7992 if (instr->op == nir_texop_txf ||
7993 instr->op == nir_texop_txf_ms ||
7994 instr->op == nir_texop_samples_identical ||
7995 instr->op == nir_texop_fragment_fetch ||
7996 instr->op == nir_texop_fragment_mask_fetch) {
7997 aco_opcode op = level_zero || instr->sampler_dim == GLSL_SAMPLER_DIM_MS || instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS ? aco_opcode::image_load : aco_opcode::image_load_mip;
7998 tex.reset(create_instruction<MIMG_instruction>(op, Format::MIMG, 3, 1));
7999 tex->operands[0] = Operand(resource);
8000 tex->operands[1] = Operand(s4); /* no sampler */
8001 tex->operands[2] = Operand(arg);
8002 tex->dim = dim;
8003 tex->dmask = dmask;
8004 tex->unrm = true;
8005 tex->da = da;
8006 tex->definitions[0] = Definition(tmp_dst);
8007 tex->can_reorder = true;
8008 ctx->block->instructions.emplace_back(std::move(tex));
8009
8010 if (instr->op == nir_texop_samples_identical) {
8011 assert(dmask == 1 && dst.regClass() == v1);
8012 assert(dst.id() != tmp_dst.id());
8013
8014 Temp tmp = bld.tmp(bld.lm);
8015 bld.vopc(aco_opcode::v_cmp_eq_u32, Definition(tmp), Operand(0u), tmp_dst).def(0).setHint(vcc);
8016 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand((uint32_t)-1), tmp);
8017
8018 } else {
8019 expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, dmask);
8020 }
8021 return;
8022 }
8023
8024 // TODO: would be better to do this by adding offsets, but needs the opcodes ordered.
8025 aco_opcode opcode = aco_opcode::image_sample;
8026 if (has_offset) { /* image_sample_*_o */
8027 if (has_compare) {
8028 opcode = aco_opcode::image_sample_c_o;
8029 if (has_derivs)
8030 opcode = aco_opcode::image_sample_c_d_o;
8031 if (has_bias)
8032 opcode = aco_opcode::image_sample_c_b_o;
8033 if (level_zero)
8034 opcode = aco_opcode::image_sample_c_lz_o;
8035 if (has_lod)
8036 opcode = aco_opcode::image_sample_c_l_o;
8037 } else {
8038 opcode = aco_opcode::image_sample_o;
8039 if (has_derivs)
8040 opcode = aco_opcode::image_sample_d_o;
8041 if (has_bias)
8042 opcode = aco_opcode::image_sample_b_o;
8043 if (level_zero)
8044 opcode = aco_opcode::image_sample_lz_o;
8045 if (has_lod)
8046 opcode = aco_opcode::image_sample_l_o;
8047 }
8048 } else { /* no offset */
8049 if (has_compare) {
8050 opcode = aco_opcode::image_sample_c;
8051 if (has_derivs)
8052 opcode = aco_opcode::image_sample_c_d;
8053 if (has_bias)
8054 opcode = aco_opcode::image_sample_c_b;
8055 if (level_zero)
8056 opcode = aco_opcode::image_sample_c_lz;
8057 if (has_lod)
8058 opcode = aco_opcode::image_sample_c_l;
8059 } else {
8060 opcode = aco_opcode::image_sample;
8061 if (has_derivs)
8062 opcode = aco_opcode::image_sample_d;
8063 if (has_bias)
8064 opcode = aco_opcode::image_sample_b;
8065 if (level_zero)
8066 opcode = aco_opcode::image_sample_lz;
8067 if (has_lod)
8068 opcode = aco_opcode::image_sample_l;
8069 }
8070 }
8071
8072 if (instr->op == nir_texop_tg4) {
8073 if (has_offset) {
8074 opcode = aco_opcode::image_gather4_lz_o;
8075 if (has_compare)
8076 opcode = aco_opcode::image_gather4_c_lz_o;
8077 } else {
8078 opcode = aco_opcode::image_gather4_lz;
8079 if (has_compare)
8080 opcode = aco_opcode::image_gather4_c_lz;
8081 }
8082 } else if (instr->op == nir_texop_lod) {
8083 opcode = aco_opcode::image_get_lod;
8084 }
8085
8086 /* we don't need the bias, sample index, compare value or offset to be
8087 * computed in WQM but if the p_create_vector copies the coordinates, then it
8088 * needs to be in WQM */
8089 if (ctx->stage == fragment_fs &&
8090 !has_derivs && !has_lod && !level_zero &&
8091 instr->sampler_dim != GLSL_SAMPLER_DIM_MS &&
8092 instr->sampler_dim != GLSL_SAMPLER_DIM_SUBPASS_MS)
8093 arg = emit_wqm(ctx, arg, bld.tmp(arg.regClass()), true);
8094
8095 tex.reset(create_instruction<MIMG_instruction>(opcode, Format::MIMG, 3, 1));
8096 tex->operands[0] = Operand(resource);
8097 tex->operands[1] = Operand(sampler);
8098 tex->operands[2] = Operand(arg);
8099 tex->dim = dim;
8100 tex->dmask = dmask;
8101 tex->da = da;
8102 tex->definitions[0] = Definition(tmp_dst);
8103 tex->can_reorder = true;
8104 ctx->block->instructions.emplace_back(std::move(tex));
8105
8106 if (tg4_integer_cube_workaround) {
8107 assert(tmp_dst.id() != dst.id());
8108 assert(tmp_dst.size() == dst.size() && dst.size() == 4);
8109
8110 emit_split_vector(ctx, tmp_dst, tmp_dst.size());
8111 Temp val[4];
8112 for (unsigned i = 0; i < dst.size(); i++) {
8113 val[i] = emit_extract_vector(ctx, tmp_dst, i, v1);
8114 Temp cvt_val;
8115 if (stype == GLSL_TYPE_UINT)
8116 cvt_val = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), val[i]);
8117 else
8118 cvt_val = bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), val[i]);
8119 val[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), val[i], cvt_val, tg4_compare_cube_wa64);
8120 }
8121 Temp tmp = dst.regClass() == v4 ? dst : bld.tmp(v4);
8122 tmp_dst = bld.pseudo(aco_opcode::p_create_vector, Definition(tmp),
8123 val[0], val[1], val[2], val[3]);
8124 }
8125 unsigned mask = instr->op == nir_texop_tg4 ? 0xF : dmask;
8126 expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, mask);
8127
8128 }
8129
8130
8131 Operand get_phi_operand(isel_context *ctx, nir_ssa_def *ssa)
8132 {
8133 Temp tmp = get_ssa_temp(ctx, ssa);
8134 if (ssa->parent_instr->type == nir_instr_type_ssa_undef)
8135 return Operand(tmp.regClass());
8136 else
8137 return Operand(tmp);
8138 }
8139
8140 void visit_phi(isel_context *ctx, nir_phi_instr *instr)
8141 {
8142 aco_ptr<Pseudo_instruction> phi;
8143 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8144 assert(instr->dest.ssa.bit_size != 1 || dst.regClass() == ctx->program->lane_mask);
8145
8146 bool logical = !dst.is_linear() || ctx->divergent_vals[instr->dest.ssa.index];
8147 logical |= ctx->block->kind & block_kind_merge;
8148 aco_opcode opcode = logical ? aco_opcode::p_phi : aco_opcode::p_linear_phi;
8149
8150 /* we want a sorted list of sources, since the predecessor list is also sorted */
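    /* std::map is keyed by the NIR predecessor index and iterates in ascending order, which
     * matches the order in which the logical/linear predecessor vectors were built, so
     * operands[i] below lines up with preds[i]. */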
8151 std::map<unsigned, nir_ssa_def*> phi_src;
8152 nir_foreach_phi_src(src, instr)
8153 phi_src[src->pred->index] = src->src.ssa;
8154
8155 std::vector<unsigned>& preds = logical ? ctx->block->logical_preds : ctx->block->linear_preds;
8156 unsigned num_operands = 0;
8157 Operand operands[std::max(exec_list_length(&instr->srcs), (unsigned)preds.size())];
8158 unsigned num_defined = 0;
8159 unsigned cur_pred_idx = 0;
8160 for (std::pair<unsigned, nir_ssa_def *> src : phi_src) {
8161 if (cur_pred_idx < preds.size()) {
8162 /* handle missing preds (IF merges with discard/break) and extra preds (loop exit with discard) */
8163 unsigned block = ctx->cf_info.nir_to_aco[src.first];
8164 unsigned skipped = 0;
8165 while (cur_pred_idx + skipped < preds.size() && preds[cur_pred_idx + skipped] != block)
8166 skipped++;
8167 if (cur_pred_idx + skipped < preds.size()) {
8168 for (unsigned i = 0; i < skipped; i++)
8169 operands[num_operands++] = Operand(dst.regClass());
8170 cur_pred_idx += skipped;
8171 } else {
8172 continue;
8173 }
8174 }
8175 cur_pred_idx++;
8176 Operand op = get_phi_operand(ctx, src.second);
8177 operands[num_operands++] = op;
8178 num_defined += !op.isUndefined();
8179 }
8180 /* handle block_kind_continue_or_break at loop exit blocks */
8181 while (cur_pred_idx++ < preds.size())
8182 operands[num_operands++] = Operand(dst.regClass());
8183
8184 if (num_defined == 0) {
8185 Builder bld(ctx->program, ctx->block);
8186 if (dst.regClass() == s1) {
8187 bld.sop1(aco_opcode::s_mov_b32, Definition(dst), Operand(0u));
8188 } else if (dst.regClass() == v1) {
8189 bld.vop1(aco_opcode::v_mov_b32, Definition(dst), Operand(0u));
8190 } else {
8191 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
8192 for (unsigned i = 0; i < dst.size(); i++)
8193 vec->operands[i] = Operand(0u);
8194 vec->definitions[0] = Definition(dst);
8195 ctx->block->instructions.emplace_back(std::move(vec));
8196 }
8197 return;
8198 }
8199
8200 /* we can use a linear phi in some cases if one src is undef */
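    /* At a divergent merge where only one incoming value is defined, the phi is effectively a
     * copy of that value: it is emitted as a linear phi with the second operand undefined,
     * inserted at the top of the invert block (when the value comes from the then-side) or of
     * the merge block itself. */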
8201 if (dst.is_linear() && ctx->block->kind & block_kind_merge && num_defined == 1) {
8202 phi.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, num_operands, 1));
8203
8204 Block *linear_else = &ctx->program->blocks[ctx->block->linear_preds[1]];
8205 Block *invert = &ctx->program->blocks[linear_else->linear_preds[0]];
8206 assert(invert->kind & block_kind_invert);
8207
8208 unsigned then_block = invert->linear_preds[0];
8209
8210 Block* insert_block = NULL;
8211 for (unsigned i = 0; i < num_operands; i++) {
8212 Operand op = operands[i];
8213 if (op.isUndefined())
8214 continue;
8215 insert_block = ctx->block->logical_preds[i] == then_block ? invert : ctx->block;
8216 phi->operands[0] = op;
8217 break;
8218 }
8219 assert(insert_block); /* should be handled by the "num_defined == 0" case above */
8220 phi->operands[1] = Operand(dst.regClass());
8221 phi->definitions[0] = Definition(dst);
8222 insert_block->instructions.emplace(insert_block->instructions.begin(), std::move(phi));
8223 return;
8224 }
8225
8226 /* try to scalarize vector phis */
8227 if (instr->dest.ssa.bit_size != 1 && dst.size() > 1) {
8228 // TODO: scalarize linear phis on divergent ifs
8229 bool can_scalarize = (opcode == aco_opcode::p_phi || !(ctx->block->kind & block_kind_merge));
8230 std::array<Temp, NIR_MAX_VEC_COMPONENTS> new_vec;
8231 for (unsigned i = 0; can_scalarize && (i < num_operands); i++) {
8232 Operand src = operands[i];
8233 if (src.isTemp() && ctx->allocated_vec.find(src.tempId()) == ctx->allocated_vec.end())
8234 can_scalarize = false;
8235 }
8236 if (can_scalarize) {
8237 unsigned num_components = instr->dest.ssa.num_components;
8238 assert(dst.size() % num_components == 0);
8239 RegClass rc = RegClass(dst.type(), dst.size() / num_components);
8240
8241 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
8242 for (unsigned k = 0; k < num_components; k++) {
8243 phi.reset(create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, num_operands, 1));
8244 for (unsigned i = 0; i < num_operands; i++) {
8245 Operand src = operands[i];
8246 phi->operands[i] = src.isTemp() ? Operand(ctx->allocated_vec[src.tempId()][k]) : Operand(rc);
8247 }
8248 Temp phi_dst = {ctx->program->allocateId(), rc};
8249 phi->definitions[0] = Definition(phi_dst);
8250 ctx->block->instructions.emplace(ctx->block->instructions.begin(), std::move(phi));
8251 new_vec[k] = phi_dst;
8252 vec->operands[k] = Operand(phi_dst);
8253 }
8254 vec->definitions[0] = Definition(dst);
8255 ctx->block->instructions.emplace_back(std::move(vec));
8256 ctx->allocated_vec.emplace(dst.id(), new_vec);
8257 return;
8258 }
8259 }
8260
8261 phi.reset(create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, num_operands, 1));
8262 for (unsigned i = 0; i < num_operands; i++)
8263 phi->operands[i] = operands[i];
8264 phi->definitions[0] = Definition(dst);
8265 ctx->block->instructions.emplace(ctx->block->instructions.begin(), std::move(phi));
8266 }
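/* Illustrative sketch of the scalarization path above (temp names are made
 * up): assuming both sources of a 2-dword phi have entries in
 * ctx->allocated_vec, the single vector phi
 *    v2: %dst = p_phi v2: %a, v2: %b
 * is emitted as one phi per component plus a rebuild of the vector:
 *    v1: %dst0 = p_phi v1: %a0, v1: %b0
 *    v1: %dst1 = p_phi v1: %a1, v1: %b1
 *    v2: %dst  = p_create_vector %dst0, %dst1
 * The per-component phis are inserted at the top of the block, while the
 * p_create_vector is appended at the end, matching the code above. */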
8267
8268
8269 void visit_undef(isel_context *ctx, nir_ssa_undef_instr *instr)
8270 {
8271 Temp dst = get_ssa_temp(ctx, &instr->def);
8272
8273 assert(dst.type() == RegType::sgpr);
8274
8275 if (dst.size() == 1) {
8276 Builder(ctx->program, ctx->block).copy(Definition(dst), Operand(0u));
8277 } else {
8278 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
8279 for (unsigned i = 0; i < dst.size(); i++)
8280 vec->operands[i] = Operand(0u);
8281 vec->definitions[0] = Definition(dst);
8282 ctx->block->instructions.emplace_back(std::move(vec));
8283 }
8284 }
8285
8286 void visit_jump(isel_context *ctx, nir_jump_instr *instr)
8287 {
8288 Builder bld(ctx->program, ctx->block);
8289 Block *logical_target;
8290 append_logical_end(ctx->block);
8291 unsigned idx = ctx->block->index;
8292
8293 switch (instr->type) {
8294 case nir_jump_break:
8295 logical_target = ctx->cf_info.parent_loop.exit;
8296 add_logical_edge(idx, logical_target);
8297 ctx->block->kind |= block_kind_break;
8298
8299 if (!ctx->cf_info.parent_if.is_divergent &&
8300 !ctx->cf_info.parent_loop.has_divergent_continue) {
8301 /* uniform break - directly jump out of the loop */
8302 ctx->block->kind |= block_kind_uniform;
8303 ctx->cf_info.has_branch = true;
8304 bld.branch(aco_opcode::p_branch);
8305 add_linear_edge(idx, logical_target);
8306 return;
8307 }
8308 ctx->cf_info.parent_loop.has_divergent_branch = true;
8309 ctx->cf_info.nir_to_aco[instr->instr.block->index] = ctx->block->index;
8310 break;
8311 case nir_jump_continue:
8312 logical_target = &ctx->program->blocks[ctx->cf_info.parent_loop.header_idx];
8313 add_logical_edge(idx, logical_target);
8314 ctx->block->kind |= block_kind_continue;
8315
8316 if (ctx->cf_info.parent_if.is_divergent) {
8317 /* for potential uniform breaks after this continue,
8318 we must ensure that they are handled correctly */
8319 ctx->cf_info.parent_loop.has_divergent_continue = true;
8320 ctx->cf_info.parent_loop.has_divergent_branch = true;
8321 ctx->cf_info.nir_to_aco[instr->instr.block->index] = ctx->block->index;
8322 } else {
8323 /* uniform continue - directly jump to the loop header */
8324 ctx->block->kind |= block_kind_uniform;
8325 ctx->cf_info.has_branch = true;
8326 bld.branch(aco_opcode::p_branch);
8327 add_linear_edge(idx, logical_target);
8328 return;
8329 }
8330 break;
8331 default:
8332 fprintf(stderr, "Unknown NIR jump instr: ");
8333 nir_print_instr(&instr->instr, stderr);
8334 fprintf(stderr, "\n");
8335 abort();
8336 }
8337
8338 if (ctx->cf_info.parent_if.is_divergent && !ctx->cf_info.exec_potentially_empty_break) {
8339 ctx->cf_info.exec_potentially_empty_break = true;
8340 ctx->cf_info.exec_potentially_empty_break_depth = ctx->cf_info.loop_nest_depth;
8341 }
8342
8343 /* remove critical edges from linear CFG */
8344 bld.branch(aco_opcode::p_branch);
8345 Block* break_block = ctx->program->create_and_insert_block();
8346 break_block->loop_nest_depth = ctx->cf_info.loop_nest_depth;
8347 break_block->kind |= block_kind_uniform;
8348 add_linear_edge(idx, break_block);
8349 /* the loop_header pointer might be invalidated by this point */
8350 if (instr->type == nir_jump_continue)
8351 logical_target = &ctx->program->blocks[ctx->cf_info.parent_loop.header_idx];
8352 add_linear_edge(break_block->index, logical_target);
8353 bld.reset(break_block);
8354 bld.branch(aco_opcode::p_branch);
8355
8356 Block* continue_block = ctx->program->create_and_insert_block();
8357 continue_block->loop_nest_depth = ctx->cf_info.loop_nest_depth;
8358 add_linear_edge(idx, continue_block);
8359 append_logical_start(continue_block);
8360 ctx->block = continue_block;
8361 return;
8362 }
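/* Rough sketch of the linear CFG built above for a divergent break or
 * continue (block names are illustrative only):
 *
 *       current block (p_branch)
 *         /               \
 *    break_block       continue_block
 *         |                  |
 *    jump target       code emitted after the jump
 *   (loop exit or
 *    loop header)
 *
 * Both helper blocks exist only to keep the linear CFG free of critical
 * edges; the logical edge to the jump target was already added before the
 * branch was emitted. */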
8363
8364 void visit_block(isel_context *ctx, nir_block *block)
8365 {
8366 nir_foreach_instr(instr, block) {
8367 switch (instr->type) {
8368 case nir_instr_type_alu:
8369 visit_alu_instr(ctx, nir_instr_as_alu(instr));
8370 break;
8371 case nir_instr_type_load_const:
8372 visit_load_const(ctx, nir_instr_as_load_const(instr));
8373 break;
8374 case nir_instr_type_intrinsic:
8375 visit_intrinsic(ctx, nir_instr_as_intrinsic(instr));
8376 break;
8377 case nir_instr_type_tex:
8378 visit_tex(ctx, nir_instr_as_tex(instr));
8379 break;
8380 case nir_instr_type_phi:
8381 visit_phi(ctx, nir_instr_as_phi(instr));
8382 break;
8383 case nir_instr_type_ssa_undef:
8384 visit_undef(ctx, nir_instr_as_ssa_undef(instr));
8385 break;
8386 case nir_instr_type_deref:
8387 break;
8388 case nir_instr_type_jump:
8389 visit_jump(ctx, nir_instr_as_jump(instr));
8390 break;
8391 default:
8392 fprintf(stderr, "Unknown NIR instr type: ");
8393 nir_print_instr(instr, stderr);
8394 fprintf(stderr, "\n");
8395 //abort();
8396 }
8397 }
8398
8399 if (!ctx->cf_info.parent_loop.has_divergent_branch)
8400 ctx->cf_info.nir_to_aco[block->index] = ctx->block->index;
8401 }
8402
8403
8404
8405 static void visit_loop(isel_context *ctx, nir_loop *loop)
8406 {
8407 //TODO: we might want to wrap the loop around a branch if exec_potentially_empty=true
8408 append_logical_end(ctx->block);
8409 ctx->block->kind |= block_kind_loop_preheader | block_kind_uniform;
8410 Builder bld(ctx->program, ctx->block);
8411 bld.branch(aco_opcode::p_branch);
8412 unsigned loop_preheader_idx = ctx->block->index;
8413
8414 Block loop_exit = Block();
8415 loop_exit.loop_nest_depth = ctx->cf_info.loop_nest_depth;
8416 loop_exit.kind |= (block_kind_loop_exit | (ctx->block->kind & block_kind_top_level));
8417
8418 Block* loop_header = ctx->program->create_and_insert_block();
8419 loop_header->loop_nest_depth = ctx->cf_info.loop_nest_depth + 1;
8420 loop_header->kind |= block_kind_loop_header;
8421 add_edge(loop_preheader_idx, loop_header);
8422 ctx->block = loop_header;
8423
8424 /* emit loop body */
8425 unsigned loop_header_idx = loop_header->index;
8426 loop_info_RAII loop_raii(ctx, loop_header_idx, &loop_exit);
8427 append_logical_start(ctx->block);
8428 visit_cf_list(ctx, &loop->body);
8429
8430 //TODO: what if a loop ends with an unconditional or uniformly branched continue and this branch is never taken?
8431 if (!ctx->cf_info.has_branch) {
8432 append_logical_end(ctx->block);
8433 if (ctx->cf_info.exec_potentially_empty_discard || ctx->cf_info.exec_potentially_empty_break) {
8434 /* Discards can result in code running with an empty exec mask.
8435 * This would result in divergent breaks not ever being taken. As a
8436 * workaround, break the loop when the loop mask is empty instead of
8437 * always continuing. */
8438 ctx->block->kind |= (block_kind_continue_or_break | block_kind_uniform);
8439 unsigned block_idx = ctx->block->index;
8440
8441 /* create helper blocks to avoid critical edges */
8442 Block *break_block = ctx->program->create_and_insert_block();
8443 break_block->loop_nest_depth = ctx->cf_info.loop_nest_depth;
8444 break_block->kind = block_kind_uniform;
8445 bld.reset(break_block);
8446 bld.branch(aco_opcode::p_branch);
8447 add_linear_edge(block_idx, break_block);
8448 add_linear_edge(break_block->index, &loop_exit);
8449
8450 Block *continue_block = ctx->program->create_and_insert_block();
8451 continue_block->loop_nest_depth = ctx->cf_info.loop_nest_depth;
8452 continue_block->kind = block_kind_uniform;
8453 bld.reset(continue_block);
8454 bld.branch(aco_opcode::p_branch);
8455 add_linear_edge(block_idx, continue_block);
8456 add_linear_edge(continue_block->index, &ctx->program->blocks[loop_header_idx]);
8457
8458 if (!ctx->cf_info.parent_loop.has_divergent_branch)
8459 add_logical_edge(block_idx, &ctx->program->blocks[loop_header_idx]);
8460 ctx->block = &ctx->program->blocks[block_idx];
8461 } else {
8462 ctx->block->kind |= (block_kind_continue | block_kind_uniform);
8463 if (!ctx->cf_info.parent_loop.has_divergent_branch)
8464 add_edge(ctx->block->index, &ctx->program->blocks[loop_header_idx]);
8465 else
8466 add_linear_edge(ctx->block->index, &ctx->program->blocks[loop_header_idx]);
8467 }
8468
8469 bld.reset(ctx->block);
8470 bld.branch(aco_opcode::p_branch);
8471 }
8472
8473 /* fixup phis in loop header from unreachable blocks */
8474 if (ctx->cf_info.has_branch || ctx->cf_info.parent_loop.has_divergent_branch) {
8475 bool linear = ctx->cf_info.has_branch;
8476 bool logical = ctx->cf_info.has_branch || ctx->cf_info.parent_loop.has_divergent_branch;
8477 for (aco_ptr<Instruction>& instr : ctx->program->blocks[loop_header_idx].instructions) {
8478 if ((logical && instr->opcode == aco_opcode::p_phi) ||
8479 (linear && instr->opcode == aco_opcode::p_linear_phi)) {
8480 /* the last operand should be the one that needs to be removed */
8481 instr->operands.pop_back();
8482 } else if (!is_phi(instr)) {
8483 break;
8484 }
8485 }
8486 }
8487
8488 ctx->cf_info.has_branch = false;
8489
8490 // TODO: if the loop does not have a single exit, we must add one
8491 /* emit loop successor block */
8492 ctx->block = ctx->program->insert_block(std::move(loop_exit));
8493 append_logical_start(ctx->block);
8494
8495 #if 0
8496 // TODO: check if it is beneficial to not branch on continues
8497 /* trim linear phis in loop header */
8498 for (auto&& instr : loop_entry->instructions) {
8499 if (instr->opcode == aco_opcode::p_linear_phi) {
8500 aco_ptr<Pseudo_instruction> new_phi{create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, loop_entry->linear_predecessors.size(), 1)};
8501 new_phi->definitions[0] = instr->definitions[0];
8502 for (unsigned i = 0; i < new_phi->operands.size(); i++)
8503 new_phi->operands[i] = instr->operands[i];
8504 /* check that the remaining operands are all the same */
8505 for (unsigned i = new_phi->operands.size(); i < instr->operands.size(); i++)
8506 assert(instr->operands[i].tempId() == instr->operands.back().tempId());
8507 instr.swap(new_phi);
8508 } else if (instr->opcode == aco_opcode::p_phi) {
8509 continue;
8510 } else {
8511 break;
8512 }
8513 }
8514 #endif
8515 }
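/* Rough overview of the blocks created by visit_loop (illustrative):
 *
 *    preheader -> header -> ...body... -> final body block
 *                    ^                         |
 *                    +------ back edge --------+---> loop_exit
 *
 * Normally the final body block is a plain continue block with a back edge
 * to the header. If exec can be empty (discard or divergent break), it is
 * marked continue_or_break instead and two uniform helper blocks are added,
 * one branching back to the header and one branching to loop_exit, so no
 * critical edges appear in the linear CFG. loop_exit itself is only moved
 * into the program afterwards via insert_block(). */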
8516
8517 static void begin_divergent_if_then(isel_context *ctx, if_context *ic, Temp cond)
8518 {
8519 ic->cond = cond;
8520
8521 append_logical_end(ctx->block);
8522 ctx->block->kind |= block_kind_branch;
8523
8524 /* branch to linear then block */
8525 assert(cond.regClass() == ctx->program->lane_mask);
8526 aco_ptr<Pseudo_branch_instruction> branch;
8527 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_cbranch_z, Format::PSEUDO_BRANCH, 1, 0));
8528 branch->operands[0] = Operand(cond);
8529 ctx->block->instructions.push_back(std::move(branch));
8530
8531 ic->BB_if_idx = ctx->block->index;
8532 ic->BB_invert = Block();
8533 ic->BB_invert.loop_nest_depth = ctx->cf_info.loop_nest_depth;
8534 /* Invert blocks are intentionally not marked as top level because they
8535 * are not part of the logical cfg. */
8536 ic->BB_invert.kind |= block_kind_invert;
8537 ic->BB_endif = Block();
8538 ic->BB_endif.loop_nest_depth = ctx->cf_info.loop_nest_depth;
8539 ic->BB_endif.kind |= (block_kind_merge | (ctx->block->kind & block_kind_top_level));
8540
8541 ic->exec_potentially_empty_discard_old = ctx->cf_info.exec_potentially_empty_discard;
8542 ic->exec_potentially_empty_break_old = ctx->cf_info.exec_potentially_empty_break;
8543 ic->exec_potentially_empty_break_depth_old = ctx->cf_info.exec_potentially_empty_break_depth;
8544 ic->divergent_old = ctx->cf_info.parent_if.is_divergent;
8545 ctx->cf_info.parent_if.is_divergent = true;
8546
8547 /* divergent branches use cbranch_execz */
8548 ctx->cf_info.exec_potentially_empty_discard = false;
8549 ctx->cf_info.exec_potentially_empty_break = false;
8550 ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX;
8551
8552 /** emit logical then block */
8553 Block* BB_then_logical = ctx->program->create_and_insert_block();
8554 BB_then_logical->loop_nest_depth = ctx->cf_info.loop_nest_depth;
8555 add_edge(ic->BB_if_idx, BB_then_logical);
8556 ctx->block = BB_then_logical;
8557 append_logical_start(BB_then_logical);
8558 }
8559
8560 static void begin_divergent_if_else(isel_context *ctx, if_context *ic)
8561 {
8562 Block *BB_then_logical = ctx->block;
8563 append_logical_end(BB_then_logical);
8564 /* branch from logical then block to invert block */
8565 aco_ptr<Pseudo_branch_instruction> branch;
8566 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
8567 BB_then_logical->instructions.emplace_back(std::move(branch));
8568 add_linear_edge(BB_then_logical->index, &ic->BB_invert);
8569 if (!ctx->cf_info.parent_loop.has_divergent_branch)
8570 add_logical_edge(BB_then_logical->index, &ic->BB_endif);
8571 BB_then_logical->kind |= block_kind_uniform;
8572 assert(!ctx->cf_info.has_branch);
8573 ic->then_branch_divergent = ctx->cf_info.parent_loop.has_divergent_branch;
8574 ctx->cf_info.parent_loop.has_divergent_branch = false;
8575
8576 /** emit linear then block */
8577 Block* BB_then_linear = ctx->program->create_and_insert_block();
8578 BB_then_linear->loop_nest_depth = ctx->cf_info.loop_nest_depth;
8579 BB_then_linear->kind |= block_kind_uniform;
8580 add_linear_edge(ic->BB_if_idx, BB_then_linear);
8581 /* branch from linear then block to invert block */
8582 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
8583 BB_then_linear->instructions.emplace_back(std::move(branch));
8584 add_linear_edge(BB_then_linear->index, &ic->BB_invert);
8585
8586 /** emit invert merge block */
8587 ctx->block = ctx->program->insert_block(std::move(ic->BB_invert));
8588 ic->invert_idx = ctx->block->index;
8589
8590 /* branch to linear else block (skip else) */
8591 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_cbranch_nz, Format::PSEUDO_BRANCH, 1, 0));
8592 branch->operands[0] = Operand(ic->cond);
8593 ctx->block->instructions.push_back(std::move(branch));
8594
8595 ic->exec_potentially_empty_discard_old |= ctx->cf_info.exec_potentially_empty_discard;
8596 ic->exec_potentially_empty_break_old |= ctx->cf_info.exec_potentially_empty_break;
8597 ic->exec_potentially_empty_break_depth_old =
8598 std::min(ic->exec_potentially_empty_break_depth_old, ctx->cf_info.exec_potentially_empty_break_depth);
8599 /* divergent branches use cbranch_execz */
8600 ctx->cf_info.exec_potentially_empty_discard = false;
8601 ctx->cf_info.exec_potentially_empty_break = false;
8602 ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX;
8603
8604 /** emit logical else block */
8605 Block* BB_else_logical = ctx->program->create_and_insert_block();
8606 BB_else_logical->loop_nest_depth = ctx->cf_info.loop_nest_depth;
8607 add_logical_edge(ic->BB_if_idx, BB_else_logical);
8608 add_linear_edge(ic->invert_idx, BB_else_logical);
8609 ctx->block = BB_else_logical;
8610 append_logical_start(BB_else_logical);
8611 }
8612
8613 static void end_divergent_if(isel_context *ctx, if_context *ic)
8614 {
8615 Block *BB_else_logical = ctx->block;
8616 append_logical_end(BB_else_logical);
8617
8618 /* branch from logical else block to endif block */
8619 aco_ptr<Pseudo_branch_instruction> branch;
8620 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
8621 BB_else_logical->instructions.emplace_back(std::move(branch));
8622 add_linear_edge(BB_else_logical->index, &ic->BB_endif);
8623 if (!ctx->cf_info.parent_loop.has_divergent_branch)
8624 add_logical_edge(BB_else_logical->index, &ic->BB_endif);
8625 BB_else_logical->kind |= block_kind_uniform;
8626
8627 assert(!ctx->cf_info.has_branch);
8628 ctx->cf_info.parent_loop.has_divergent_branch &= ic->then_branch_divergent;
8629
8630
8631 /** emit linear else block */
8632 Block* BB_else_linear = ctx->program->create_and_insert_block();
8633 BB_else_linear->loop_nest_depth = ctx->cf_info.loop_nest_depth;
8634 BB_else_linear->kind |= block_kind_uniform;
8635 add_linear_edge(ic->invert_idx, BB_else_linear);
8636
8637 /* branch from linear else block to endif block */
8638 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
8639 BB_else_linear->instructions.emplace_back(std::move(branch));
8640 add_linear_edge(BB_else_linear->index, &ic->BB_endif);
8641
8642
8643 /** emit endif merge block */
8644 ctx->block = ctx->program->insert_block(std::move(ic->BB_endif));
8645 append_logical_start(ctx->block);
8646
8647
8648 ctx->cf_info.parent_if.is_divergent = ic->divergent_old;
8649 ctx->cf_info.exec_potentially_empty_discard |= ic->exec_potentially_empty_discard_old;
8650 ctx->cf_info.exec_potentially_empty_break |= ic->exec_potentially_empty_break_old;
8651 ctx->cf_info.exec_potentially_empty_break_depth =
8652 std::min(ic->exec_potentially_empty_break_depth_old, ctx->cf_info.exec_potentially_empty_break_depth);
8653 if (ctx->cf_info.loop_nest_depth == ctx->cf_info.exec_potentially_empty_break_depth &&
8654 !ctx->cf_info.parent_if.is_divergent) {
8655 ctx->cf_info.exec_potentially_empty_break = false;
8656 ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX;
8657 }
8658 /* uniform control flow never has an empty exec-mask */
8659 if (!ctx->cf_info.loop_nest_depth && !ctx->cf_info.parent_if.is_divergent) {
8660 ctx->cf_info.exec_potentially_empty_discard = false;
8661 ctx->cf_info.exec_potentially_empty_break = false;
8662 ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX;
8663 }
8664 }
8665
8666 static void visit_if(isel_context *ctx, nir_if *if_stmt)
8667 {
8668 Temp cond = get_ssa_temp(ctx, if_stmt->condition.ssa);
8669 Builder bld(ctx->program, ctx->block);
8670 aco_ptr<Pseudo_branch_instruction> branch;
8671
8672 if (!ctx->divergent_vals[if_stmt->condition.ssa->index]) { /* uniform condition */
8673 /**
8674 * Uniform conditionals are represented in the following way*) :
8675 *
8676 * The linear and logical CFG:
8677 * BB_IF
8678 * / \
8679 * BB_THEN (logical) BB_ELSE (logical)
8680 * \ /
8681 * BB_ENDIF
8682 *
8683 * *) Exceptions may be due to break and continue statements within loops
8684 * If a break/continue happens within uniform control flow, it branches
8685 * to the loop exit/entry block. Otherwise, it branches to the next
8686 * merge block.
8687 **/
8688 append_logical_end(ctx->block);
8689 ctx->block->kind |= block_kind_uniform;
8690
8691 /* emit branch */
8692 assert(cond.regClass() == bld.lm);
8693 // TODO: in a post-RA optimizer, we could check if the condition is in VCC and omit this instruction
8694 cond = bool_to_scalar_condition(ctx, cond);
8695
8696 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_cbranch_z, Format::PSEUDO_BRANCH, 1, 0));
8697 branch->operands[0] = Operand(cond);
8698 branch->operands[0].setFixed(scc);
8699 ctx->block->instructions.emplace_back(std::move(branch));
8700
8701 unsigned BB_if_idx = ctx->block->index;
8702 Block BB_endif = Block();
8703 BB_endif.loop_nest_depth = ctx->cf_info.loop_nest_depth;
8704 BB_endif.kind |= ctx->block->kind & block_kind_top_level;
8705
8706 /** emit then block */
8707 Block* BB_then = ctx->program->create_and_insert_block();
8708 BB_then->loop_nest_depth = ctx->cf_info.loop_nest_depth;
8709 add_edge(BB_if_idx, BB_then);
8710 append_logical_start(BB_then);
8711 ctx->block = BB_then;
8712 visit_cf_list(ctx, &if_stmt->then_list);
8713 BB_then = ctx->block;
8714 bool then_branch = ctx->cf_info.has_branch;
8715 bool then_branch_divergent = ctx->cf_info.parent_loop.has_divergent_branch;
8716
8717 if (!then_branch) {
8718 append_logical_end(BB_then);
8719 /* branch from then block to endif block */
8720 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
8721 BB_then->instructions.emplace_back(std::move(branch));
8722 add_linear_edge(BB_then->index, &BB_endif);
8723 if (!then_branch_divergent)
8724 add_logical_edge(BB_then->index, &BB_endif);
8725 BB_then->kind |= block_kind_uniform;
8726 }
8727
8728 ctx->cf_info.has_branch = false;
8729 ctx->cf_info.parent_loop.has_divergent_branch = false;
8730
8731 /** emit else block */
8732 Block* BB_else = ctx->program->create_and_insert_block();
8733 BB_else->loop_nest_depth = ctx->cf_info.loop_nest_depth;
8734 add_edge(BB_if_idx, BB_else);
8735 append_logical_start(BB_else);
8736 ctx->block = BB_else;
8737 visit_cf_list(ctx, &if_stmt->else_list);
8738 BB_else = ctx->block;
8739
8740 if (!ctx->cf_info.has_branch) {
8741 append_logical_end(BB_else);
8742 /* branch from else block to endif block */
8743 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
8744 BB_else->instructions.emplace_back(std::move(branch));
8745 add_linear_edge(BB_else->index, &BB_endif);
8746 if (!ctx->cf_info.parent_loop.has_divergent_branch)
8747 add_logical_edge(BB_else->index, &BB_endif);
8748 BB_else->kind |= block_kind_uniform;
8749 }
8750
8751 ctx->cf_info.has_branch &= then_branch;
8752 ctx->cf_info.parent_loop.has_divergent_branch &= then_branch_divergent;
8753
8754 /** emit endif merge block */
8755 if (!ctx->cf_info.has_branch) {
8756 ctx->block = ctx->program->insert_block(std::move(BB_endif));
8757 append_logical_start(ctx->block);
8758 }
8759 } else { /* non-uniform condition */
8760 /**
8761 * To maintain a logical and linear CFG without critical edges,
8762 * non-uniform conditionals are represented in the following way*) :
8763 *
8764 * The linear CFG:
8765 * BB_IF
8766 * / \
8767 * BB_THEN (logical) BB_THEN (linear)
8768 * \ /
8769 * BB_INVERT (linear)
8770 * / \
8771 * BB_ELSE (logical) BB_ELSE (linear)
8772 * \ /
8773 * BB_ENDIF
8774 *
8775 * The logical CFG:
8776 * BB_IF
8777 * / \
8778 * BB_THEN (logical) BB_ELSE (logical)
8779 * \ /
8780 * BB_ENDIF
8781 *
8782 * *) Exceptions may be due to break and continue statements within loops
8783 **/
8784
8785 if_context ic;
8786
8787 begin_divergent_if_then(ctx, &ic, cond);
8788 visit_cf_list(ctx, &if_stmt->then_list);
8789
8790 begin_divergent_if_else(ctx, &ic);
8791 visit_cf_list(ctx, &if_stmt->else_list);
8792
8793 end_divergent_if(ctx, &ic);
8794 }
8795 }
8796
8797 static void visit_cf_list(isel_context *ctx,
8798 struct exec_list *list)
8799 {
8800 foreach_list_typed(nir_cf_node, node, node, list) {
8801 switch (node->type) {
8802 case nir_cf_node_block:
8803 visit_block(ctx, nir_cf_node_as_block(node));
8804 break;
8805 case nir_cf_node_if:
8806 visit_if(ctx, nir_cf_node_as_if(node));
8807 break;
8808 case nir_cf_node_loop:
8809 visit_loop(ctx, nir_cf_node_as_loop(node));
8810 break;
8811 default:
8812 unreachable("unimplemented cf list type");
8813 }
8814 }
8815 }
8816
8817 static void export_vs_varying(isel_context *ctx, int slot, bool is_pos, int *next_pos)
8818 {
8819 assert(ctx->stage == vertex_vs ||
8820 ctx->stage == tess_eval_vs ||
8821 ctx->stage == gs_copy_vs);
8822
8823 int offset = ctx->stage == tess_eval_vs
8824 ? ctx->program->info->tes.outinfo.vs_output_param_offset[slot]
8825 : ctx->program->info->vs.outinfo.vs_output_param_offset[slot];
8826 uint64_t mask = ctx->outputs.mask[slot];
8827 if (!is_pos && !mask)
8828 return;
8829 if (!is_pos && offset == AC_EXP_PARAM_UNDEFINED)
8830 return;
8831 aco_ptr<Export_instruction> exp{create_instruction<Export_instruction>(aco_opcode::exp, Format::EXP, 4, 0)};
8832 exp->enabled_mask = mask;
8833 for (unsigned i = 0; i < 4; ++i) {
8834 if (mask & (1 << i))
8835 exp->operands[i] = Operand(ctx->outputs.outputs[slot][i]);
8836 else
8837 exp->operands[i] = Operand(v1);
8838 }
8839 /* Navi10-14 skip POS0 exports if EXEC=0 and DONE=0, causing a hang.
8840 * Setting valid_mask=1 prevents it and has no other effect.
8841 */
8842 exp->valid_mask = ctx->options->chip_class >= GFX10 && is_pos && *next_pos == 0;
8843 exp->done = false;
8844 exp->compressed = false;
8845 if (is_pos)
8846 exp->dest = V_008DFC_SQ_EXP_POS + (*next_pos)++;
8847 else
8848 exp->dest = V_008DFC_SQ_EXP_PARAM + offset;
8849 ctx->block->instructions.emplace_back(std::move(exp));
8850 }
8851
8852 static void export_vs_psiz_layer_viewport(isel_context *ctx, int *next_pos)
8853 {
8854 aco_ptr<Export_instruction> exp{create_instruction<Export_instruction>(aco_opcode::exp, Format::EXP, 4, 0)};
8855 exp->enabled_mask = 0;
8856 for (unsigned i = 0; i < 4; ++i)
8857 exp->operands[i] = Operand(v1);
8858 if (ctx->outputs.mask[VARYING_SLOT_PSIZ]) {
8859 exp->operands[0] = Operand(ctx->outputs.outputs[VARYING_SLOT_PSIZ][0]);
8860 exp->enabled_mask |= 0x1;
8861 }
8862 if (ctx->outputs.mask[VARYING_SLOT_LAYER]) {
8863 exp->operands[2] = Operand(ctx->outputs.outputs[VARYING_SLOT_LAYER][0]);
8864 exp->enabled_mask |= 0x4;
8865 }
8866 if (ctx->outputs.mask[VARYING_SLOT_VIEWPORT]) {
8867 if (ctx->options->chip_class < GFX9) {
8868 exp->operands[3] = Operand(ctx->outputs.outputs[VARYING_SLOT_VIEWPORT][0]);
8869 exp->enabled_mask |= 0x8;
8870 } else {
8871 Builder bld(ctx->program, ctx->block);
8872
8873 Temp out = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(16u),
8874 Operand(ctx->outputs.outputs[VARYING_SLOT_VIEWPORT][0]));
8875 if (exp->operands[2].isTemp())
8876 out = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand(out), exp->operands[2]);
8877
8878 exp->operands[2] = Operand(out);
8879 exp->enabled_mask |= 0x4;
8880 }
8881 }
8882 exp->valid_mask = ctx->options->chip_class >= GFX10 && *next_pos == 0;
8883 exp->done = false;
8884 exp->compressed = false;
8885 exp->dest = V_008DFC_SQ_EXP_POS + (*next_pos)++;
8886 ctx->block->instructions.emplace_back(std::move(exp));
8887 }
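/* Worked example for the GFX9+ path above (values are made up): with
 * layer = 1 and viewport index = 3, the viewport is shifted into the high
 * half and OR'd with the layer, so channel Z of the position export carries
 * (3 << 16) | 1 = 0x00030001 and is enabled via the 0x4 bit of enabled_mask. */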
8888
8889 static void create_vs_exports(isel_context *ctx)
8890 {
8891 assert(ctx->stage == vertex_vs ||
8892 ctx->stage == tess_eval_vs ||
8893 ctx->stage == gs_copy_vs);
8894
8895 radv_vs_output_info *outinfo = ctx->stage == tess_eval_vs
8896 ? &ctx->program->info->tes.outinfo
8897 : &ctx->program->info->vs.outinfo;
8898
8899 if (outinfo->export_prim_id) {
8900 ctx->outputs.mask[VARYING_SLOT_PRIMITIVE_ID] |= 0x1;
8901 ctx->outputs.outputs[VARYING_SLOT_PRIMITIVE_ID][0] = get_arg(ctx, ctx->args->vs_prim_id);
8902 }
8903
8904 if (ctx->options->key.has_multiview_view_index) {
8905 ctx->outputs.mask[VARYING_SLOT_LAYER] |= 0x1;
8906 ctx->outputs.outputs[VARYING_SLOT_LAYER][0] = as_vgpr(ctx, get_arg(ctx, ctx->args->ac.view_index));
8907 }
8908
8909 /* the order these position exports are created is important */
8910 int next_pos = 0;
8911 export_vs_varying(ctx, VARYING_SLOT_POS, true, &next_pos);
8912 if (outinfo->writes_pointsize || outinfo->writes_layer || outinfo->writes_viewport_index) {
8913 export_vs_psiz_layer_viewport(ctx, &next_pos);
8914 }
8915 if (ctx->num_clip_distances + ctx->num_cull_distances > 0)
8916 export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST0, true, &next_pos);
8917 if (ctx->num_clip_distances + ctx->num_cull_distances > 4)
8918 export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST1, true, &next_pos);
8919
8920 if (ctx->export_clip_dists) {
8921 if (ctx->num_clip_distances + ctx->num_cull_distances > 0)
8922 export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST0, false, &next_pos);
8923 if (ctx->num_clip_distances + ctx->num_cull_distances > 4)
8924 export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST1, false, &next_pos);
8925 }
8926
8927 for (unsigned i = 0; i <= VARYING_SLOT_VAR31; ++i) {
8928 if (i < VARYING_SLOT_VAR0 && i != VARYING_SLOT_LAYER &&
8929 i != VARYING_SLOT_PRIMITIVE_ID)
8930 continue;
8931
8932 export_vs_varying(ctx, i, false, NULL);
8933 }
8934 }
8935
8936 static void export_fs_mrt_z(isel_context *ctx)
8937 {
8938 Builder bld(ctx->program, ctx->block);
8939 unsigned enabled_channels = 0;
8940 bool compr = false;
8941 Operand values[4];
8942
8943 for (unsigned i = 0; i < 4; ++i) {
8944 values[i] = Operand(v1);
8945 }
8946
8947 /* Both stencil and sample mask only need 16 bits. */
8948 if (!ctx->program->info->ps.writes_z &&
8949 (ctx->program->info->ps.writes_stencil ||
8950 ctx->program->info->ps.writes_sample_mask)) {
8951 compr = true; /* COMPR flag */
8952
8953 if (ctx->program->info->ps.writes_stencil) {
8954 /* Stencil should be in X[23:16]. */
8955 values[0] = Operand(ctx->outputs.outputs[FRAG_RESULT_STENCIL][0]);
8956 values[0] = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(16u), values[0]);
8957 enabled_channels |= 0x3;
8958 }
8959
8960 if (ctx->program->info->ps.writes_sample_mask) {
8961 /* SampleMask should be in Y[15:0]. */
8962 values[1] = Operand(ctx->outputs.outputs[FRAG_RESULT_SAMPLE_MASK][0]);
8963 enabled_channels |= 0xc;
8964 }
8965 } else {
8966 if (ctx->program->info->ps.writes_z) {
8967 values[0] = Operand(ctx->outputs.outputs[FRAG_RESULT_DEPTH][0]);
8968 enabled_channels |= 0x1;
8969 }
8970
8971 if (ctx->program->info->ps.writes_stencil) {
8972 values[1] = Operand(ctx->outputs.outputs[FRAG_RESULT_STENCIL][0]);
8973 enabled_channels |= 0x2;
8974 }
8975
8976 if (ctx->program->info->ps.writes_sample_mask) {
8977 values[2] = Operand(ctx->outputs.outputs[FRAG_RESULT_SAMPLE_MASK][0]);
8978 enabled_channels |= 0x4;
8979 }
8980 }
8981
8982 /* GFX6 (except OLAND and HAINAN) has a bug where it only looks at the X
8983 * writemask component.
8984 */
8985 if (ctx->options->chip_class == GFX6 &&
8986 ctx->options->family != CHIP_OLAND &&
8987 ctx->options->family != CHIP_HAINAN) {
8988 enabled_channels |= 0x1;
8989 }
8990
8991 bld.exp(aco_opcode::exp, values[0], values[1], values[2], values[3],
8992 enabled_channels, V_008DFC_SQ_EXP_MRTZ, compr);
8993 }
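/* Worked example for the compressed (16-bit) MRTZ path above, with made-up
 * values: writing stencil 0x2A and sample mask 0xFF yields
 *    values[0] = 0x2A << 16 = 0x002A0000   (stencil in X[23:16], mask bits 0x3)
 *    values[1] = 0x000000FF                (sample mask in Y[15:0], mask bits 0xc)
 * and the export is emitted with COMPR=1. The uncompressed path instead uses
 * one full 32-bit channel per output (depth in X, stencil in Y, mask in Z). */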
8994
8995 static void export_fs_mrt_color(isel_context *ctx, int slot)
8996 {
8997 Builder bld(ctx->program, ctx->block);
8998 unsigned write_mask = ctx->outputs.mask[slot];
8999 Operand values[4];
9000
9001 for (unsigned i = 0; i < 4; ++i) {
9002 if (write_mask & (1 << i)) {
9003 values[i] = Operand(ctx->outputs.outputs[slot][i]);
9004 } else {
9005 values[i] = Operand(v1);
9006 }
9007 }
9008
9009 unsigned target, col_format;
9010 unsigned enabled_channels = 0;
9011 aco_opcode compr_op = (aco_opcode)0;
9012
9013 slot -= FRAG_RESULT_DATA0;
9014 target = V_008DFC_SQ_EXP_MRT + slot;
9015 col_format = (ctx->options->key.fs.col_format >> (4 * slot)) & 0xf;
9016
9017 bool is_int8 = (ctx->options->key.fs.is_int8 >> slot) & 1;
9018 bool is_int10 = (ctx->options->key.fs.is_int10 >> slot) & 1;
9019
9020 switch (col_format)
9021 {
9022 case V_028714_SPI_SHADER_ZERO:
9023 enabled_channels = 0; /* writemask */
9024 target = V_008DFC_SQ_EXP_NULL;
9025 break;
9026
9027 case V_028714_SPI_SHADER_32_R:
9028 enabled_channels = 1;
9029 break;
9030
9031 case V_028714_SPI_SHADER_32_GR:
9032 enabled_channels = 0x3;
9033 break;
9034
9035 case V_028714_SPI_SHADER_32_AR:
9036 if (ctx->options->chip_class >= GFX10) {
9037 /* Special case: on GFX10, the outputs are different for 32_AR */
9038 enabled_channels = 0x3;
9039 values[1] = values[3];
9040 values[3] = Operand(v1);
9041 } else {
9042 enabled_channels = 0x9;
9043 }
9044 break;
9045
9046 case V_028714_SPI_SHADER_FP16_ABGR:
9047 enabled_channels = 0x5;
9048 compr_op = aco_opcode::v_cvt_pkrtz_f16_f32;
9049 break;
9050
9051 case V_028714_SPI_SHADER_UNORM16_ABGR:
9052 enabled_channels = 0x5;
9053 compr_op = aco_opcode::v_cvt_pknorm_u16_f32;
9054 break;
9055
9056 case V_028714_SPI_SHADER_SNORM16_ABGR:
9057 enabled_channels = 0x5;
9058 compr_op = aco_opcode::v_cvt_pknorm_i16_f32;
9059 break;
9060
9061 case V_028714_SPI_SHADER_UINT16_ABGR: {
9062 enabled_channels = 0x5;
9063 compr_op = aco_opcode::v_cvt_pk_u16_u32;
9064 if (is_int8 || is_int10) {
9065 /* clamp */
9066 uint32_t max_rgb = is_int8 ? 255 : is_int10 ? 1023 : 0;
9067 Temp max_rgb_val = bld.copy(bld.def(s1), Operand(max_rgb));
9068
9069 for (unsigned i = 0; i < 4; i++) {
9070 if ((write_mask >> i) & 1) {
9071 values[i] = bld.vop2(aco_opcode::v_min_u32, bld.def(v1),
9072 i == 3 && is_int10 ? Operand(3u) : Operand(max_rgb_val),
9073 values[i]);
9074 }
9075 }
9076 }
9077 break;
9078 }
9079
9080 case V_028714_SPI_SHADER_SINT16_ABGR:
9081 enabled_channels = 0x5;
9082 compr_op = aco_opcode::v_cvt_pk_i16_i32;
9083 if (is_int8 || is_int10) {
9084 /* clamp */
9085 uint32_t max_rgb = is_int8 ? 127 : is_int10 ? 511 : 0;
9086 uint32_t min_rgb = is_int8 ? -128 : is_int10 ? -512 : 0;
9087 Temp max_rgb_val = bld.copy(bld.def(s1), Operand(max_rgb));
9088 Temp min_rgb_val = bld.copy(bld.def(s1), Operand(min_rgb));
9089
9090 for (unsigned i = 0; i < 4; i++) {
9091 if ((write_mask >> i) & 1) {
9092 values[i] = bld.vop2(aco_opcode::v_min_i32, bld.def(v1),
9093 i == 3 && is_int10 ? Operand(1u) : Operand(max_rgb_val),
9094 values[i]);
9095 values[i] = bld.vop2(aco_opcode::v_max_i32, bld.def(v1),
9096 i == 3 && is_int10 ? Operand(-2u) : Operand(min_rgb_val),
9097 values[i]);
9098 }
9099 }
9100 }
9101 break;
9102
9103 case V_028714_SPI_SHADER_32_ABGR:
9104 enabled_channels = 0xF;
9105 break;
9106
9107 default:
9108 break;
9109 }
9110
9111 if (target == V_008DFC_SQ_EXP_NULL)
9112 return;
9113
9114 if ((bool) compr_op) {
9115 for (int i = 0; i < 2; i++) {
9116 /* check if at least one of the values to be compressed is enabled */
9117 unsigned enabled = (write_mask >> (i*2) | write_mask >> (i*2+1)) & 0x1;
9118 if (enabled) {
9119 enabled_channels |= enabled << (i*2);
9120 values[i] = bld.vop3(compr_op, bld.def(v1),
9121 values[i*2].isUndefined() ? Operand(0u) : values[i*2],
9122 values[i*2+1].isUndefined() ? Operand(0u): values[i*2+1]);
9123 } else {
9124 values[i] = Operand(v1);
9125 }
9126 }
9127 values[2] = Operand(v1);
9128 values[3] = Operand(v1);
9129 } else {
9130 for (int i = 0; i < 4; i++)
9131 values[i] = enabled_channels & (1 << i) ? values[i] : Operand(v1);
9132 }
9133
9134 bld.exp(aco_opcode::exp, values[0], values[1], values[2], values[3],
9135 enabled_channels, target, (bool) compr_op);
9136 }
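/* Worked example of the compressed color path above (made-up numbers): for
 * V_028714_SPI_SHADER_FP16_ABGR with write_mask = 0xf, components are packed
 * in pairs:
 *    values[0] = v_cvt_pkrtz_f16_f32(R, G)
 *    values[1] = v_cvt_pkrtz_f16_f32(B, A)
 * enabled_channels stays 0x5 (one bit per packed 32-bit half) and channels
 * 2 and 3 of the export are left undefined. */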
9137
9138 static void create_fs_exports(isel_context *ctx)
9139 {
9140 /* Export depth, stencil and sample mask. */
9141 if (ctx->outputs.mask[FRAG_RESULT_DEPTH] ||
9142 ctx->outputs.mask[FRAG_RESULT_STENCIL] ||
9143 ctx->outputs.mask[FRAG_RESULT_SAMPLE_MASK]) {
9144 export_fs_mrt_z(ctx);
9145 }
9146
9147 /* Export all color render targets. */
9148 for (unsigned i = FRAG_RESULT_DATA0; i < FRAG_RESULT_DATA7 + 1; ++i) {
9149 if (ctx->outputs.mask[i])
9150 export_fs_mrt_color(ctx, i);
9151 }
9152 }
9153
9154 static void write_tcs_tess_factors(isel_context *ctx)
9155 {
9156 unsigned outer_comps;
9157 unsigned inner_comps;
9158
9159 switch (ctx->args->options->key.tcs.primitive_mode) {
9160 case GL_ISOLINES:
9161 outer_comps = 2;
9162 inner_comps = 0;
9163 break;
9164 case GL_TRIANGLES:
9165 outer_comps = 3;
9166 inner_comps = 1;
9167 break;
9168 case GL_QUADS:
9169 outer_comps = 4;
9170 inner_comps = 2;
9171 break;
9172 default:
9173 return;
9174 }
9175
9176 const unsigned tess_index_inner = shader_io_get_unique_index(VARYING_SLOT_TESS_LEVEL_INNER);
9177 const unsigned tess_index_outer = shader_io_get_unique_index(VARYING_SLOT_TESS_LEVEL_OUTER);
9178
9179 Builder bld(ctx->program, ctx->block);
9180
9181 bld.barrier(aco_opcode::p_memory_barrier_shared);
9182 unsigned workgroup_size = ctx->tcs_num_patches * ctx->shader->info.tess.tcs_vertices_out;
9183 if (unlikely(ctx->program->chip_class != GFX6 && workgroup_size > ctx->program->wave_size))
9184 bld.sopp(aco_opcode::s_barrier);
9185
9186 Temp tcs_rel_ids = get_arg(ctx, ctx->args->ac.tcs_rel_ids);
9187 Temp invocation_id = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), tcs_rel_ids, Operand(8u), Operand(5u));
9188
9189 Temp invocation_id_is_zero = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), invocation_id);
9190 if_context ic_invocation_id_is_zero;
9191 begin_divergent_if_then(ctx, &ic_invocation_id_is_zero, invocation_id_is_zero);
9192 bld.reset(ctx->block);
9193
9194 Temp hs_ring_tess_factor = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ctx->program->private_segment_buffer, Operand(RING_HS_TESS_FACTOR * 16u));
9195
9196 std::pair<Temp, unsigned> lds_base = get_tcs_output_lds_offset(ctx);
9197 unsigned stride = inner_comps + outer_comps;
9198 Temp inner[4];
9199 Temp outer[4];
9200 Temp out[6];
9201 assert(inner_comps <= (sizeof(inner) / sizeof(Temp)));
9202 assert(outer_comps <= (sizeof(outer) / sizeof(Temp)));
9203 assert(stride <= (sizeof(out) / sizeof(Temp)));
9204
9205 if (ctx->args->options->key.tcs.primitive_mode == GL_ISOLINES) {
9206 // LINES reversal: the two outer factors are written to the TF ring in swapped order
9207 outer[0] = out[1] = load_lds(ctx, 4, bld.tmp(v1), lds_base.first, lds_base.second + tess_index_outer * 16 + 0 * 4, 4);
9208 outer[1] = out[0] = load_lds(ctx, 4, bld.tmp(v1), lds_base.first, lds_base.second + tess_index_outer * 16 + 1 * 4, 4);
9209 } else {
9210 for (unsigned i = 0; i < outer_comps; ++i)
9211 outer[i] = out[i] = load_lds(ctx, 4, bld.tmp(v1), lds_base.first, lds_base.second + tess_index_outer * 16 + i * 4, 4);
9212
9213 for (unsigned i = 0; i < inner_comps; ++i)
9214 inner[i] = out[outer_comps + i] = load_lds(ctx, 4, bld.tmp(v1), lds_base.first, lds_base.second + tess_index_inner * 16 + i * 4, 4);
9215 }
9216
9217 Temp rel_patch_id = get_tess_rel_patch_id(ctx);
9218 Temp tf_base = get_arg(ctx, ctx->args->tess_factor_offset);
9219 Temp byte_offset = bld.v_mul_imm(bld.def(v1), rel_patch_id, stride * 4u);
9220 unsigned tf_const_offset = 0;
9221
9222 if (ctx->program->chip_class <= GFX8) {
9223 Temp rel_patch_id_is_zero = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), rel_patch_id);
9224 if_context ic_rel_patch_id_is_zero;
9225 begin_divergent_if_then(ctx, &ic_rel_patch_id_is_zero, rel_patch_id_is_zero);
9226 bld.reset(ctx->block);
9227
9228 /* Store the dynamic HS control word. */
9229 Temp control_word = bld.copy(bld.def(v1), Operand(0x80000000u));
9230 bld.mubuf(aco_opcode::buffer_store_dword,
9231 /* SRSRC */ hs_ring_tess_factor, /* VADDR */ Operand(v1), /* SOFFSET */ tf_base, /* VDATA */ control_word,
9232 /* immediate OFFSET */ 0, /* OFFEN */ false, /* idxen*/ false, /* addr64 */ false,
9233 /* disable_wqm */ false, /* glc */ true);
9234 tf_const_offset += 4;
9235
9236 begin_divergent_if_else(ctx, &ic_rel_patch_id_is_zero);
9237 end_divergent_if(ctx, &ic_rel_patch_id_is_zero);
9238 bld.reset(ctx->block);
9239 }
9240
9241 assert(stride == 2 || stride == 4 || stride == 6);
9242 Temp tf_vec = create_vec_from_array(ctx, out, stride, RegType::vgpr);
9243 store_vmem_mubuf(ctx, tf_vec, hs_ring_tess_factor, byte_offset, tf_base, tf_const_offset, 4, (1 << stride) - 1, true, false);
9244
9245 begin_divergent_if_else(ctx, &ic_invocation_id_is_zero);
9246 end_divergent_if(ctx, &ic_invocation_id_is_zero);
9247 }
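/* Sketch of the tess factor ring layout written above (sizes per patch, in
 * dwords): isolines store outer[2] (stride 2), triangles outer[3]+inner[1]
 * (stride 4), quads outer[4]+inner[2] (stride 6). Each patch writes its
 * factors at rel_patch_id * stride * 4 bytes; on GFX8 and earlier, patch 0
 * additionally stores the 0x80000000 control word first, which is why
 * tf_const_offset is bumped by 4 on that path. */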
9248
9249 static void emit_stream_output(isel_context *ctx,
9250 Temp const *so_buffers,
9251 Temp const *so_write_offset,
9252 const struct radv_stream_output *output)
9253 {
9254 unsigned num_comps = util_bitcount(output->component_mask);
9255 unsigned writemask = (1 << num_comps) - 1;
9256 unsigned loc = output->location;
9257 unsigned buf = output->buffer;
9258
9259 assert(num_comps && num_comps <= 4);
9260 if (!num_comps || num_comps > 4)
9261 return;
9262
9263 unsigned start = ffs(output->component_mask) - 1;
9264
9265 Temp out[4];
9266 bool all_undef = true;
9267 assert(ctx->stage == vertex_vs || ctx->stage == gs_copy_vs);
9268 for (unsigned i = 0; i < num_comps; i++) {
9269 out[i] = ctx->outputs.outputs[loc][start + i];
9270 all_undef = all_undef && !out[i].id();
9271 }
9272 if (all_undef)
9273 return;
9274
9275 while (writemask) {
9276 int start, count;
9277 u_bit_scan_consecutive_range(&writemask, &start, &count);
9278 if (count == 3 && ctx->options->chip_class == GFX6) {
9279 /* GFX6 doesn't support storing vec3, split it. */
9280 writemask |= 1u << (start + 2);
9281 count = 2;
9282 }
9283
9284 unsigned offset = output->offset + start * 4;
9285
9286 Temp write_data = {ctx->program->allocateId(), RegClass(RegType::vgpr, count)};
9287 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
9288 for (int i = 0; i < count; ++i)
9289 vec->operands[i] = (ctx->outputs.mask[loc] & 1 << (start + i)) ? Operand(out[start + i]) : Operand(0u);
9290 vec->definitions[0] = Definition(write_data);
9291 ctx->block->instructions.emplace_back(std::move(vec));
9292
9293 aco_opcode opcode;
9294 switch (count) {
9295 case 1:
9296 opcode = aco_opcode::buffer_store_dword;
9297 break;
9298 case 2:
9299 opcode = aco_opcode::buffer_store_dwordx2;
9300 break;
9301 case 3:
9302 opcode = aco_opcode::buffer_store_dwordx3;
9303 break;
9304 case 4:
9305 opcode = aco_opcode::buffer_store_dwordx4;
9306 break;
9307 default:
9308 unreachable("Unsupported dword count.");
9309 }
9310
9311 aco_ptr<MUBUF_instruction> store{create_instruction<MUBUF_instruction>(opcode, Format::MUBUF, 4, 0)};
9312 store->operands[0] = Operand(so_buffers[buf]);
9313 store->operands[1] = Operand(so_write_offset[buf]);
9314 store->operands[2] = Operand((uint32_t) 0);
9315 store->operands[3] = Operand(write_data);
9316 if (offset > 4095) {
9317 /* Don't think this can happen in RADV, but maybe GL? It's easy to do this anyway. */
9318 Builder bld(ctx->program, ctx->block);
9319 store->operands[0] = bld.vadd32(bld.def(v1), Operand(offset), Operand(so_write_offset[buf]));
9320 } else {
9321 store->offset = offset;
9322 }
9323 store->offen = true;
9324 store->glc = true;
9325 store->dlc = false;
9326 store->slc = true;
9327 store->can_reorder = true;
9328 ctx->block->instructions.emplace_back(std::move(store));
9329 }
9330 }
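#if 0
/* Minimal, compile-disabled sketch (the function name and mask value are
 * made up) of how emit_stream_output() derives the component range from
 * radv_stream_output::component_mask, here 0xc, i.e. only z and w written. */
static void stream_output_range_example()
{
   unsigned component_mask = 0xc;
   unsigned num_comps = util_bitcount(component_mask); /* 2 components */
   unsigned start = ffs(component_mask) - 1;           /* 2: out[] holds components z and w */
   unsigned writemask = (1u << num_comps) - 1;         /* 0x3: two consecutive dwords to store */
}
#endif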
9331
9332 static void emit_streamout(isel_context *ctx, unsigned stream)
9333 {
9334 Builder bld(ctx->program, ctx->block);
9335
9336 Temp so_buffers[4];
9337 Temp buf_ptr = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->streamout_buffers));
9338 for (unsigned i = 0; i < 4; i++) {
9339 unsigned stride = ctx->program->info->so.strides[i];
9340 if (!stride)
9341 continue;
9342
9343 Operand off = bld.copy(bld.def(s1), Operand(i * 16u));
9344 so_buffers[i] = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), buf_ptr, off);
9345 }
9346
9347 Temp so_vtx_count = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
9348 get_arg(ctx, ctx->args->streamout_config), Operand(0x70010u));
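/* Note: the s_bfe_u32 immediate is encoded as (width << 16) | offset, so
 * Operand(0x70010u) extracts 7 bits starting at bit 16 of streamout_config.
 * The result is compared against the wave-local thread id below to decide
 * which lanes are allowed to emit their streamout stores. */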
9349
9350 Temp tid = emit_mbcnt(ctx, bld.def(v1));
9351
9352 Temp can_emit = bld.vopc(aco_opcode::v_cmp_gt_i32, bld.def(bld.lm), so_vtx_count, tid);
9353
9354 if_context ic;
9355 begin_divergent_if_then(ctx, &ic, can_emit);
9356
9357 bld.reset(ctx->block);
9358
9359 Temp so_write_index = bld.vadd32(bld.def(v1), get_arg(ctx, ctx->args->streamout_write_idx), tid);
9360
9361 Temp so_write_offset[4];
9362
9363 for (unsigned i = 0; i < 4; i++) {
9364 unsigned stride = ctx->program->info->so.strides[i];
9365 if (!stride)
9366 continue;
9367
9368 if (stride == 1) {
9369 Temp offset = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
9370 get_arg(ctx, ctx->args->streamout_write_idx),
9371 get_arg(ctx, ctx->args->streamout_offset[i]));
9372 Temp new_offset = bld.vadd32(bld.def(v1), offset, tid);
9373
9374 so_write_offset[i] = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), new_offset);
9375 } else {
9376 Temp offset = bld.v_mul_imm(bld.def(v1), so_write_index, stride * 4u);
9377 Temp offset2 = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(4u),
9378 get_arg(ctx, ctx->args->streamout_offset[i]));
9379 so_write_offset[i] = bld.vadd32(bld.def(v1), offset, offset2);
9380 }
9381 }
9382
9383 for (unsigned i = 0; i < ctx->program->info->so.num_outputs; i++) {
9384 struct radv_stream_output *output =
9385 &ctx->program->info->so.outputs[i];
9386 if (stream != output->stream)
9387 continue;
9388
9389 emit_stream_output(ctx, so_buffers, so_write_offset, output);
9390 }
9391
9392 begin_divergent_if_else(ctx, &ic);
9393 end_divergent_if(ctx, &ic);
9394 }
9395
9396 } /* end namespace */
9397
9398 void fix_ls_vgpr_init_bug(isel_context *ctx, Pseudo_instruction *startpgm)
9399 {
9400 assert(ctx->shader->info.stage == MESA_SHADER_VERTEX);
9401 Builder bld(ctx->program, ctx->block);
9402 constexpr unsigned hs_idx = 1u;
9403 Builder::Result hs_thread_count = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
9404 get_arg(ctx, ctx->args->merged_wave_info),
9405 Operand((8u << 16) | (hs_idx * 8u)));
9406 Temp ls_has_nonzero_hs_threads = bool_to_vector_condition(ctx, hs_thread_count.def(1).getTemp());
9407
9408 /* If there are no HS threads, SPI mistakenly loads the LS VGPRs starting at VGPR 0. */
9409
9410 Temp instance_id = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
9411 get_arg(ctx, ctx->args->rel_auto_id),
9412 get_arg(ctx, ctx->args->ac.instance_id),
9413 ls_has_nonzero_hs_threads);
9414 Temp rel_auto_id = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
9415 get_arg(ctx, ctx->args->ac.tcs_rel_ids),
9416 get_arg(ctx, ctx->args->rel_auto_id),
9417 ls_has_nonzero_hs_threads);
9418 Temp vertex_id = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
9419 get_arg(ctx, ctx->args->ac.tcs_patch_id),
9420 get_arg(ctx, ctx->args->ac.vertex_id),
9421 ls_has_nonzero_hs_threads);
9422
9423 ctx->arg_temps[ctx->args->ac.instance_id.arg_index] = instance_id;
9424 ctx->arg_temps[ctx->args->rel_auto_id.arg_index] = rel_auto_id;
9425 ctx->arg_temps[ctx->args->ac.vertex_id.arg_index] = vertex_id;
9426 }
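/* Sketch of the workaround above (field layout assumed from the other uses
 * of merged_wave_info in this file): merged_wave_info packs one 8-bit thread
 * count per merged stage, so (8u << 16) | (hs_idx * 8u) asks s_bfe_u32 for
 * bits [15:8], the HS thread count. If that count is zero, the hardware
 * loaded the LS input VGPRs shifted down to VGPR 0, so each v_cndmask_b32
 * picks the shifted source (e.g. instance_id from the rel_auto_id slot)
 * instead of the regular argument. */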
9427
9428 void split_arguments(isel_context *ctx, Pseudo_instruction *startpgm)
9429 {
9430 /* Split all arguments except for the first (ring_offsets) and the last
9431 * (exec) so that the dead channels don't stay live throughout the program.
9432 */
9433 for (int i = 1; i < startpgm->definitions.size() - 1; i++) {
9434 if (startpgm->definitions[i].regClass().size() > 1) {
9435 emit_split_vector(ctx, startpgm->definitions[i].getTemp(),
9436 startpgm->definitions[i].regClass().size());
9437 }
9438 }
9439 }
9440
9441 void handle_bc_optimize(isel_context *ctx)
9442 {
9443 /* needed when SPI_PS_IN_CONTROL.BC_OPTIMIZE_DISABLE is set to 0 */
9444 Builder bld(ctx->program, ctx->block);
9445 uint32_t spi_ps_input_ena = ctx->program->config->spi_ps_input_ena;
9446 bool uses_center = G_0286CC_PERSP_CENTER_ENA(spi_ps_input_ena) || G_0286CC_LINEAR_CENTER_ENA(spi_ps_input_ena);
9447 bool uses_centroid = G_0286CC_PERSP_CENTROID_ENA(spi_ps_input_ena) || G_0286CC_LINEAR_CENTROID_ENA(spi_ps_input_ena);
9448 ctx->persp_centroid = get_arg(ctx, ctx->args->ac.persp_centroid);
9449 ctx->linear_centroid = get_arg(ctx, ctx->args->ac.linear_centroid);
9450 if (uses_center && uses_centroid) {
9451 Temp sel = bld.vopc_e64(aco_opcode::v_cmp_lt_i32, bld.hint_vcc(bld.def(bld.lm)),
9452 get_arg(ctx, ctx->args->ac.prim_mask), Operand(0u));
9453
9454 if (G_0286CC_PERSP_CENTROID_ENA(spi_ps_input_ena)) {
9455 Temp new_coord[2];
9456 for (unsigned i = 0; i < 2; i++) {
9457 Temp persp_centroid = emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.persp_centroid), i, v1);
9458 Temp persp_center = emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.persp_center), i, v1);
9459 new_coord[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
9460 persp_centroid, persp_center, sel);
9461 }
9462 ctx->persp_centroid = bld.tmp(v2);
9463 bld.pseudo(aco_opcode::p_create_vector, Definition(ctx->persp_centroid),
9464 Operand(new_coord[0]), Operand(new_coord[1]));
9465 emit_split_vector(ctx, ctx->persp_centroid, 2);
9466 }
9467
9468 if (G_0286CC_LINEAR_CENTROID_ENA(spi_ps_input_ena)) {
9469 Temp new_coord[2];
9470 for (unsigned i = 0; i < 2; i++) {
9471 Temp linear_centroid = emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.linear_centroid), i, v1);
9472 Temp linear_center = emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.linear_center), i, v1);
9473 new_coord[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
9474 linear_centroid, linear_center, sel);
9475 }
9476 ctx->linear_centroid = bld.tmp(v2);
9477 bld.pseudo(aco_opcode::p_create_vector, Definition(ctx->linear_centroid),
9478 Operand(new_coord[0]), Operand(new_coord[1]));
9479 emit_split_vector(ctx, ctx->linear_centroid, 2);
9480 }
9481 }
9482 }
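/* Background for the selection above (inferred from how the inputs are used
 * here rather than from the register spec): with BC_OPTIMIZE enabled the
 * hardware may skip the centroid computation when the wave only contains
 * fully covered quads and signals this by making prim_mask negative; the
 * v_cmp_lt_i32 result then substitutes the center barycentrics for the
 * centroid ones in both the perspective and linear cases. */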
9483
9484 void setup_fp_mode(isel_context *ctx, nir_shader *shader)
9485 {
9486 Program *program = ctx->program;
9487
9488 unsigned float_controls = shader->info.float_controls_execution_mode;
9489
9490 program->next_fp_mode.preserve_signed_zero_inf_nan32 =
9491 float_controls & FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP32;
9492 program->next_fp_mode.preserve_signed_zero_inf_nan16_64 =
9493 float_controls & (FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP16 |
9494 FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP64);
9495
9496 program->next_fp_mode.must_flush_denorms32 =
9497 float_controls & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP32;
9498 program->next_fp_mode.must_flush_denorms16_64 =
9499 float_controls & (FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16 |
9500 FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP64);
9501
9502 program->next_fp_mode.care_about_round32 =
9503 float_controls & (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32 | FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32);
9504
9505 program->next_fp_mode.care_about_round16_64 =
9506 float_controls & (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64 |
9507 FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64);
9508
9509 /* default to preserving fp16 and fp64 denorms, since it's free */
9510 if (program->next_fp_mode.must_flush_denorms16_64)
9511 program->next_fp_mode.denorm16_64 = 0;
9512 else
9513 program->next_fp_mode.denorm16_64 = fp_denorm_keep;
9514
9515 /* preserving fp32 denorms is expensive, so only do it if asked */
9516 if (float_controls & FLOAT_CONTROLS_DENORM_PRESERVE_FP32)
9517 program->next_fp_mode.denorm32 = fp_denorm_keep;
9518 else
9519 program->next_fp_mode.denorm32 = 0;
9520
9521 if (float_controls & FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32)
9522 program->next_fp_mode.round32 = fp_round_tz;
9523 else
9524 program->next_fp_mode.round32 = fp_round_ne;
9525
9526 if (float_controls & (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64))
9527 program->next_fp_mode.round16_64 = fp_round_tz;
9528 else
9529 program->next_fp_mode.round16_64 = fp_round_ne;
9530
9531 ctx->block->fp_mode = program->next_fp_mode;
9532 }
9533
9534 void cleanup_cfg(Program *program)
9535 {
9536 /* create linear_succs/logical_succs */
9537 for (Block& BB : program->blocks) {
9538 for (unsigned idx : BB.linear_preds)
9539 program->blocks[idx].linear_succs.emplace_back(BB.index);
9540 for (unsigned idx : BB.logical_preds)
9541 program->blocks[idx].logical_succs.emplace_back(BB.index);
9542 }
9543 }
9544
9545 void select_program(Program *program,
9546 unsigned shader_count,
9547 struct nir_shader *const *shaders,
9548 ac_shader_config* config,
9549 struct radv_shader_args *args)
9550 {
9551 isel_context ctx = setup_isel_context(program, shader_count, shaders, config, args, false);
9552
9553 for (unsigned i = 0; i < shader_count; i++) {
9554 nir_shader *nir = shaders[i];
9555 init_context(&ctx, nir);
9556
9557 setup_fp_mode(&ctx, nir);
9558
9559 if (!i) {
9560 /* needs to be after init_context() for FS */
9561 Pseudo_instruction *startpgm = add_startpgm(&ctx);
9562 append_logical_start(ctx.block);
9563
9564 if (unlikely(args->options->has_ls_vgpr_init_bug && ctx.stage == vertex_tess_control_hs))
9565 fix_ls_vgpr_init_bug(&ctx, startpgm);
9566
9567 split_arguments(&ctx, startpgm);
9568 }
9569
9570 if_context ic;
9571 if (shader_count >= 2) {
9572 Builder bld(ctx.program, ctx.block);
9573 Temp count = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), get_arg(&ctx, args->merged_wave_info), Operand((8u << 16) | (i * 8u)));
9574 Temp thread_id = emit_mbcnt(&ctx, bld.def(v1));
9575 Temp cond = bld.vopc(aco_opcode::v_cmp_gt_u32, bld.hint_vcc(bld.def(bld.lm)), count, thread_id);
9576
9577 begin_divergent_if_then(&ctx, &ic, cond);
9578 }
9579
9580 if (i) {
9581 Builder bld(ctx.program, ctx.block);
9582
9583 bld.barrier(aco_opcode::p_memory_barrier_shared);
9584 bld.sopp(aco_opcode::s_barrier);
9585
9586 if (ctx.stage == vertex_geometry_gs) {
9587 ctx.gs_wave_id = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1, m0), bld.def(s1, scc), get_arg(&ctx, args->merged_wave_info), Operand((8u << 16) | 16u));
9588 }
9589 } else if (ctx.stage == geometry_gs)
9590 ctx.gs_wave_id = get_arg(&ctx, args->gs_wave_id);
9591
9592 if (ctx.stage == fragment_fs)
9593 handle_bc_optimize(&ctx);
9594
9595 nir_function_impl *func = nir_shader_get_entrypoint(nir);
9596 visit_cf_list(&ctx, &func->body);
9597
9598 if (ctx.program->info->so.num_outputs && ctx.stage == vertex_vs)
9599 emit_streamout(&ctx, 0);
9600
9601 if (ctx.stage == vertex_vs || ctx.stage == tess_eval_vs) {
9602 create_vs_exports(&ctx);
9603 } else if (nir->info.stage == MESA_SHADER_GEOMETRY) {
9604 Builder bld(ctx.program, ctx.block);
9605 bld.barrier(aco_opcode::p_memory_barrier_gs_data);
9606 bld.sopp(aco_opcode::s_sendmsg, bld.m0(ctx.gs_wave_id), -1, sendmsg_gs_done(false, false, 0));
9607 } else if (nir->info.stage == MESA_SHADER_TESS_CTRL) {
9608 write_tcs_tess_factors(&ctx);
9609 }
9610
9611 if (ctx.stage == fragment_fs)
9612 create_fs_exports(&ctx);
9613
9614 if (shader_count >= 2) {
9615 begin_divergent_if_else(&ctx, &ic);
9616 end_divergent_if(&ctx, &ic);
9617 }
9618
9619 ralloc_free(ctx.divergent_vals);
9620 }
9621
9622 program->config->float_mode = program->blocks[0].fp_mode.val;
9623
9624 append_logical_end(ctx.block);
9625 ctx.block->kind |= block_kind_uniform | block_kind_export_end;
9626 Builder bld(ctx.program, ctx.block);
9627 if (ctx.program->wb_smem_l1_on_end)
9628 bld.smem(aco_opcode::s_dcache_wb, false);
9629 bld.sopp(aco_opcode::s_endpgm);
9630
9631 cleanup_cfg(program);
9632 }
9633
9634 void select_gs_copy_shader(Program *program, struct nir_shader *gs_shader,
9635 ac_shader_config* config,
9636 struct radv_shader_args *args)
9637 {
9638 isel_context ctx = setup_isel_context(program, 1, &gs_shader, config, args, true);
9639
9640 program->next_fp_mode.preserve_signed_zero_inf_nan32 = false;
9641 program->next_fp_mode.preserve_signed_zero_inf_nan16_64 = false;
9642 program->next_fp_mode.must_flush_denorms32 = false;
9643 program->next_fp_mode.must_flush_denorms16_64 = false;
9644 program->next_fp_mode.care_about_round32 = false;
9645 program->next_fp_mode.care_about_round16_64 = false;
9646 program->next_fp_mode.denorm16_64 = fp_denorm_keep;
9647 program->next_fp_mode.denorm32 = 0;
9648 program->next_fp_mode.round32 = fp_round_ne;
9649 program->next_fp_mode.round16_64 = fp_round_ne;
9650 ctx.block->fp_mode = program->next_fp_mode;
9651
9652 add_startpgm(&ctx);
9653 append_logical_start(ctx.block);
9654
9655 Builder bld(ctx.program, ctx.block);
9656
9657 Temp gsvs_ring = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), program->private_segment_buffer, Operand(RING_GSVS_VS * 16u));
9658
9659 Operand stream_id(0u);
9660 if (args->shader_info->so.num_outputs)
9661 stream_id = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
9662 get_arg(&ctx, ctx.args->streamout_config), Operand(0x20018u));
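/* As with the other s_bfe_u32 immediates in this file, 0x20018 encodes
 * (width << 16) | offset, i.e. a 2-bit field at bit 24 of streamout_config;
 * the loop below compares the result against stream indices 0..3. */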
9663
9664 Temp vtx_offset = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), get_arg(&ctx, ctx.args->ac.vertex_id));
9665
9666 std::stack<Block> endif_blocks;
9667
9668 for (unsigned stream = 0; stream < 4; stream++) {
9669 if (stream_id.isConstant() && stream != stream_id.constantValue())
9670 continue;
9671
9672 unsigned num_components = args->shader_info->gs.num_stream_output_components[stream];
9673 if (stream > 0 && (!num_components || !args->shader_info->so.num_outputs))
9674 continue;
9675
9676 memset(ctx.outputs.mask, 0, sizeof(ctx.outputs.mask));
9677
9678 unsigned BB_if_idx = ctx.block->index;
9679 Block BB_endif = Block();
9680 if (!stream_id.isConstant()) {
9681 /* begin IF */
9682 Temp cond = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), stream_id, Operand(stream));
9683 append_logical_end(ctx.block);
9684 ctx.block->kind |= block_kind_uniform;
9685 bld.branch(aco_opcode::p_cbranch_z, cond);
9686
9687 BB_endif.kind |= ctx.block->kind & block_kind_top_level;
9688
9689 ctx.block = ctx.program->create_and_insert_block();
9690 add_edge(BB_if_idx, ctx.block);
9691 bld.reset(ctx.block);
9692 append_logical_start(ctx.block);
9693 }
9694
9695 unsigned offset = 0;
9696 for (unsigned i = 0; i <= VARYING_SLOT_VAR31; ++i) {
9697 if (args->shader_info->gs.output_streams[i] != stream)
9698 continue;
9699
9700 unsigned output_usage_mask = args->shader_info->gs.output_usage_mask[i];
9701 unsigned length = util_last_bit(output_usage_mask);
9702 for (unsigned j = 0; j < length; ++j) {
9703 if (!(output_usage_mask & (1 << j)))
9704 continue;
9705
9706 unsigned const_offset = offset * args->shader_info->gs.vertices_out * 16 * 4;
9707 Temp voffset = vtx_offset;
9708 if (const_offset >= 4096u) {
9709 voffset = bld.vadd32(bld.def(v1), Operand(const_offset / 4096u * 4096u), voffset);
9710 const_offset %= 4096u;
9711 }
9712
9713 aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(aco_opcode::buffer_load_dword, Format::MUBUF, 3, 1)};
9714 mubuf->definitions[0] = bld.def(v1);
9715 mubuf->operands[0] = Operand(gsvs_ring);
9716 mubuf->operands[1] = Operand(voffset);
9717 mubuf->operands[2] = Operand(0u);
9718 mubuf->offen = true;
9719 mubuf->offset = const_offset;
9720 mubuf->glc = true;
9721 mubuf->slc = true;
9722 mubuf->dlc = args->options->chip_class >= GFX10;
9723 mubuf->barrier = barrier_none;
9724 mubuf->can_reorder = true;
9725
9726 ctx.outputs.mask[i] |= 1 << j;
9727 ctx.outputs.outputs[i][j] = mubuf->definitions[0].getTemp();
9728
9729 bld.insert(std::move(mubuf));
9730
9731 offset++;
9732 }
9733 }
9734
9735 if (args->shader_info->so.num_outputs) {
9736 emit_streamout(&ctx, stream);
9737 bld.reset(ctx.block);
9738 }
9739
9740 if (stream == 0) {
9741 create_vs_exports(&ctx);
9742 ctx.block->kind |= block_kind_export_end;
9743 }
9744
9745 if (!stream_id.isConstant()) {
9746 append_logical_end(ctx.block);
9747
9748 /* branch from then block to endif block */
9749 bld.branch(aco_opcode::p_branch);
9750 add_edge(ctx.block->index, &BB_endif);
9751 ctx.block->kind |= block_kind_uniform;
9752
9753 /* emit else block */
9754 ctx.block = ctx.program->create_and_insert_block();
9755 add_edge(BB_if_idx, ctx.block);
9756 bld.reset(ctx.block);
9757 append_logical_start(ctx.block);
9758
9759 endif_blocks.push(std::move(BB_endif));
9760 }
9761 }
9762
9763 while (!endif_blocks.empty()) {
9764 Block BB_endif = std::move(endif_blocks.top());
9765 endif_blocks.pop();
9766
9767 Block *BB_else = ctx.block;
9768
9769 append_logical_end(BB_else);
9770 /* branch from else block to endif block */
9771 bld.branch(aco_opcode::p_branch);
9772 add_edge(BB_else->index, &BB_endif);
9773 BB_else->kind |= block_kind_uniform;
9774
9775 /** emit endif merge block */
9776 ctx.block = program->insert_block(std::move(BB_endif));
9777 bld.reset(ctx.block);
9778 append_logical_start(ctx.block);
9779 }
9780
9781 program->config->float_mode = program->blocks[0].fp_mode.val;
9782
9783 append_logical_end(ctx.block);
9784 ctx.block->kind |= block_kind_uniform;
9785 bld.sopp(aco_opcode::s_endpgm);
9786
9787 cleanup_cfg(program);
9788 }
9789 }