aco: fix a couple of value numbering issues
[mesa.git] / src / amd / compiler / aco_instruction_selection.cpp
1 /*
2 * Copyright © 2018 Valve Corporation
3 * Copyright © 2018 Google
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22 * IN THE SOFTWARE.
23 *
24 */
25
26 #include <algorithm>
27 #include <array>
28 #include <map>
29
30 #include "ac_shader_util.h"
31 #include "aco_ir.h"
32 #include "aco_builder.h"
33 #include "aco_interface.h"
34 #include "aco_instruction_selection_setup.cpp"
35 #include "util/fast_idiv_by_const.h"
36
37 namespace aco {
38 namespace {
39
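/* RAII helper: saves the loop-related control-flow state of the isel_context on
 * construction, points it at the new loop header/exit, and restores the previous
 * state once the loop body has been visited. */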
40 class loop_info_RAII {
41 isel_context* ctx;
42 unsigned header_idx_old;
43 Block* exit_old;
44 bool divergent_cont_old;
45 bool divergent_branch_old;
46 bool divergent_if_old;
47
48 public:
49 loop_info_RAII(isel_context* ctx, unsigned loop_header_idx, Block* loop_exit)
50 : ctx(ctx),
51 header_idx_old(ctx->cf_info.parent_loop.header_idx), exit_old(ctx->cf_info.parent_loop.exit),
52 divergent_cont_old(ctx->cf_info.parent_loop.has_divergent_continue),
53 divergent_branch_old(ctx->cf_info.parent_loop.has_divergent_branch),
54 divergent_if_old(ctx->cf_info.parent_if.is_divergent)
55 {
56 ctx->cf_info.parent_loop.header_idx = loop_header_idx;
57 ctx->cf_info.parent_loop.exit = loop_exit;
58 ctx->cf_info.parent_loop.has_divergent_continue = false;
59 ctx->cf_info.parent_loop.has_divergent_branch = false;
60 ctx->cf_info.parent_if.is_divergent = false;
61 ctx->cf_info.loop_nest_depth = ctx->cf_info.loop_nest_depth + 1;
62 }
63
64 ~loop_info_RAII()
65 {
66 ctx->cf_info.parent_loop.header_idx = header_idx_old;
67 ctx->cf_info.parent_loop.exit = exit_old;
68 ctx->cf_info.parent_loop.has_divergent_continue = divergent_cont_old;
69 ctx->cf_info.parent_loop.has_divergent_branch = divergent_branch_old;
70 ctx->cf_info.parent_if.is_divergent = divergent_if_old;
71 ctx->cf_info.loop_nest_depth = ctx->cf_info.loop_nest_depth - 1;
72 if (!ctx->cf_info.loop_nest_depth && !ctx->cf_info.parent_if.is_divergent)
73 ctx->cf_info.exec_potentially_empty = false;
74 }
75 };
76
77 struct if_context {
78 Temp cond;
79
80 bool divergent_old;
81 bool exec_potentially_empty_old;
82
83 unsigned BB_if_idx;
84 unsigned invert_idx;
85 bool then_branch_divergent;
86 Block BB_invert;
87 Block BB_endif;
88 };
89
90 static void visit_cf_list(struct isel_context *ctx,
91 struct exec_list *list);
92
93 static void add_logical_edge(unsigned pred_idx, Block *succ)
94 {
95 succ->logical_preds.emplace_back(pred_idx);
96 }
97
98
99 static void add_linear_edge(unsigned pred_idx, Block *succ)
100 {
101 succ->linear_preds.emplace_back(pred_idx);
102 }
103
104 static void add_edge(unsigned pred_idx, Block *succ)
105 {
106 add_logical_edge(pred_idx, succ);
107 add_linear_edge(pred_idx, succ);
108 }
109
110 static void append_logical_start(Block *b)
111 {
112 Builder(NULL, b).pseudo(aco_opcode::p_logical_start);
113 }
114
115 static void append_logical_end(Block *b)
116 {
117 Builder(NULL, b).pseudo(aco_opcode::p_logical_end);
118 }
119
120 Temp get_ssa_temp(struct isel_context *ctx, nir_ssa_def *def)
121 {
122 assert(ctx->allocated[def->index].id());
123 return ctx->allocated[def->index];
124 }
125
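/* Copies src into dst (allocating dst when none is given). In fragment shaders the
 * copy is emitted as p_wqm so the value is computed in whole quad mode; other stages
 * just get a plain copy. */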
126 Temp emit_wqm(isel_context *ctx, Temp src, Temp dst=Temp(0, s1), bool program_needs_wqm = false)
127 {
128 Builder bld(ctx->program, ctx->block);
129
130 if (!dst.id())
131 dst = bld.tmp(src.regClass());
132
133 assert(src.size() == dst.size());
134
135 if (ctx->stage != fragment_fs) {
136 if (!dst.id())
137 return src;
138
139 bld.copy(Definition(dst), src);
140 return dst;
141 }
142
143 bld.pseudo(aco_opcode::p_wqm, Definition(dst), src);
144 ctx->program->needs_wqm |= program_needs_wqm;
145 return dst;
146 }
147
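/* Emits a wave-wide shuffle: returns 'data' from the lane selected by 'index'.
 * A uniform index uses v_readlane_b32, otherwise ds_bpermute_b32 with the index
 * pre-scaled to a byte offset. GFX10 wave64 falls back to the p_wave64_bpermute
 * sequence below because ds_bpermute only operates within a half-wave. */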
148 static Temp emit_bpermute(isel_context *ctx, Builder &bld, Temp index, Temp data)
149 {
150 if (index.regClass() == s1)
151 return bld.vop3(aco_opcode::v_readlane_b32, bld.def(s1), data, index);
152
153 Temp index_x4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), index);
154
155 /* Currently not implemented on GFX6-7 */
156 assert(ctx->options->chip_class >= GFX8);
157
158 if (ctx->options->chip_class <= GFX9 || ctx->program->wave_size == 32) {
159 return bld.ds(aco_opcode::ds_bpermute_b32, bld.def(v1), index_x4, data);
160 }
161
162 /* GFX10, wave64 mode:
163 * The bpermute instruction is limited to half-wave operation, which means that it can't
164 * properly support subgroup shuffle like older generations (or wave32 mode), so we
165 * emulate it here.
166 */
167 if (!ctx->has_gfx10_wave64_bpermute) {
168 ctx->has_gfx10_wave64_bpermute = true;
169 ctx->program->config->num_shared_vgprs = 8; /* Shared VGPRs are allocated in groups of 8 */
170 ctx->program->vgpr_limit -= 4; /* We allocate 8 shared VGPRs, so we'll have 4 fewer normal VGPRs */
171 }
172
173 Temp lane_id = bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, bld.def(v1), Operand((uint32_t) -1), Operand(0u));
174 lane_id = bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32, bld.def(v1), Operand((uint32_t) -1), lane_id);
175 Temp lane_is_hi = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x20u), lane_id);
176 Temp index_is_hi = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x20u), index);
177 Temp cmp = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(s2, vcc), lane_is_hi, index_is_hi);
178
179 return bld.reduction(aco_opcode::p_wave64_bpermute, bld.def(v1), bld.def(s2), bld.def(s1, scc),
180 bld.vcc(cmp), Operand(v2.as_linear()), index_x4, data, gfx10_wave64_bpermute);
181 }
182
183 Temp as_vgpr(isel_context *ctx, Temp val)
184 {
185 if (val.type() == RegType::sgpr) {
186 Builder bld(ctx->program, ctx->block);
187 return bld.copy(bld.def(RegType::vgpr, val.size()), val);
188 }
189 assert(val.type() == RegType::vgpr);
190 return val;
191 }
192
193 //assumes a != 0xffffffff
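/* Division by the constant b is lowered to a plain shift for powers of two and
 * otherwise to the sequence derived from util_compute_fast_udiv_info:
 * dst = (((a >> pre_shift) + increment) * multiplier >> 32) >> post_shift.
 * The increment step is presumably why a == 0xffffffff is not supported (the add
 * would wrap). */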
194 void emit_v_div_u32(isel_context *ctx, Temp dst, Temp a, uint32_t b)
195 {
196 assert(b != 0);
197 Builder bld(ctx->program, ctx->block);
198
199 if (util_is_power_of_two_or_zero(b)) {
200 bld.vop2(aco_opcode::v_lshrrev_b32, Definition(dst), Operand((uint32_t)util_logbase2(b)), a);
201 return;
202 }
203
204 util_fast_udiv_info info = util_compute_fast_udiv_info(b, 32, 32);
205
206 assert(info.multiplier <= 0xffffffff);
207
208 bool pre_shift = info.pre_shift != 0;
209 bool increment = info.increment != 0;
210 bool multiply = true;
211 bool post_shift = info.post_shift != 0;
212
213 if (!pre_shift && !increment && !multiply && !post_shift) {
214 bld.vop1(aco_opcode::v_mov_b32, Definition(dst), a);
215 return;
216 }
217
218 Temp pre_shift_dst = a;
219 if (pre_shift) {
220 pre_shift_dst = (increment || multiply || post_shift) ? bld.tmp(v1) : dst;
221 bld.vop2(aco_opcode::v_lshrrev_b32, Definition(pre_shift_dst), Operand((uint32_t)info.pre_shift), a);
222 }
223
224 Temp increment_dst = pre_shift_dst;
225 if (increment) {
226 increment_dst = (post_shift || multiply) ? bld.tmp(v1) : dst;
227 bld.vadd32(Definition(increment_dst), Operand((uint32_t) info.increment), pre_shift_dst);
228 }
229
230 Temp multiply_dst = increment_dst;
231 if (multiply) {
232 multiply_dst = post_shift ? bld.tmp(v1) : dst;
233 bld.vop3(aco_opcode::v_mul_hi_u32, Definition(multiply_dst), increment_dst,
234 bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand((uint32_t)info.multiplier)));
235 }
236
237 if (post_shift) {
238 bld.vop2(aco_opcode::v_lshrrev_b32, Definition(dst), Operand((uint32_t)info.post_shift), multiply_dst);
239 }
240 }
241
242 void emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, Temp dst)
243 {
244 Builder bld(ctx->program, ctx->block);
245 bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand(idx));
246 }
247
248
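/* Returns element idx of src as a temporary of class dst_rc. Vectors that were
 * already split are looked up in allocated_vec, so no new instruction is needed
 * for them. */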
249 Temp emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, RegClass dst_rc)
250 {
251 /* no need to extract the whole vector */
252 if (src.regClass() == dst_rc) {
253 assert(idx == 0);
254 return src;
255 }
256 assert(src.size() > idx);
257 Builder bld(ctx->program, ctx->block);
258 auto it = ctx->allocated_vec.find(src.id());
259 /* the size check needs to be early because elements other than 0 may be garbage */
260 if (it != ctx->allocated_vec.end() && it->second[0].size() == dst_rc.size()) {
261 if (it->second[idx].regClass() == dst_rc) {
262 return it->second[idx];
263 } else {
264 assert(dst_rc.size() == it->second[idx].regClass().size());
265 assert(dst_rc.type() == RegType::vgpr && it->second[idx].type() == RegType::sgpr);
266 return bld.copy(bld.def(dst_rc), it->second[idx]);
267 }
268 }
269
270 if (src.size() == dst_rc.size()) {
271 assert(idx == 0);
272 return bld.copy(bld.def(dst_rc), src);
273 } else {
274 Temp dst = bld.tmp(dst_rc);
275 emit_extract_vector(ctx, src, idx, dst);
276 return dst;
277 }
278 }
279
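/* Splits vec_src into num_components equally sized elements with p_split_vector
 * and caches them in allocated_vec, making later emit_extract_vector calls free. */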
280 void emit_split_vector(isel_context* ctx, Temp vec_src, unsigned num_components)
281 {
282 if (num_components == 1)
283 return;
284 if (ctx->allocated_vec.find(vec_src.id()) != ctx->allocated_vec.end())
285 return;
286 aco_ptr<Pseudo_instruction> split{create_instruction<Pseudo_instruction>(aco_opcode::p_split_vector, Format::PSEUDO, 1, num_components)};
287 split->operands[0] = Operand(vec_src);
288 std::array<Temp,4> elems;
289 for (unsigned i = 0; i < num_components; i++) {
290 elems[i] = {ctx->program->allocateId(), RegClass(vec_src.type(), vec_src.size() / num_components)};
291 split->definitions[i] = Definition(elems[i]);
292 }
293 ctx->block->instructions.emplace_back(std::move(split));
294 ctx->allocated_vec.emplace(vec_src.id(), elems);
295 }
296
297 /* This vector expansion uses a mask to determine which elements in the new vector
298 * come from the original vector. The remaining elements are zero-filled and should not be relied upon. */
297 /* This vector expansion uses a mask to determine which elements in the new vector
298 * come from the original vector. The remaining elements are zero-filled and should not be relied upon. */
299 void expand_vector(isel_context* ctx, Temp vec_src, Temp dst, unsigned num_components, unsigned mask)
300 {
301 emit_split_vector(ctx, vec_src, util_bitcount(mask));
302
303 if (vec_src == dst)
304 return;
305
306 Builder bld(ctx->program, ctx->block);
307 if (num_components == 1) {
308 if (dst.type() == RegType::sgpr)
309 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec_src);
310 else
311 bld.copy(Definition(dst), vec_src);
312 return;
313 }
314
315 unsigned component_size = dst.size() / num_components;
316 std::array<Temp,4> elems;
317
318 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
319 vec->definitions[0] = Definition(dst);
320 unsigned k = 0;
321 for (unsigned i = 0; i < num_components; i++) {
322 if (mask & (1 << i)) {
323 Temp src = emit_extract_vector(ctx, vec_src, k++, RegClass(vec_src.type(), component_size));
324 if (dst.type() == RegType::sgpr)
325 src = bld.as_uniform(src);
326 vec->operands[i] = Operand(src);
327 } else {
328 vec->operands[i] = Operand(0u);
329 }
330 elems[i] = vec->operands[i].getTemp();
331 }
332 ctx->block->instructions.emplace_back(std::move(vec));
333 ctx->allocated_vec.emplace(dst.id(), elems);
334 }
335
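/* Turns an SCC-style scalar boolean (s1, 0 or 1) into a per-lane mask (s2):
 * all ones if the boolean is set, zero otherwise. */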
336 Temp bool_to_vector_condition(isel_context *ctx, Temp val, Temp dst = Temp(0, s2))
337 {
338 Builder bld(ctx->program, ctx->block);
339 if (!dst.id())
340 dst = bld.tmp(s2);
341
342 assert(val.regClass() == s1);
343 assert(dst.regClass() == s2);
344
345 return bld.sop2(aco_opcode::s_cselect_b64, bld.hint_vcc(Definition(dst)), Operand((uint32_t) -1), Operand(0u), bld.scc(val));
346 }
347
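/* The reverse of bool_to_vector_condition: reduces a lane mask (s2) to a scalar
 * boolean by ANDing it with exec, so only active lanes count, and wraps the result
 * in WQM for fragment shaders. */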
348 Temp bool_to_scalar_condition(isel_context *ctx, Temp val, Temp dst = Temp(0, s1))
349 {
350 Builder bld(ctx->program, ctx->block);
351 if (!dst.id())
352 dst = bld.tmp(s1);
353
354 assert(val.regClass() == s2);
355 assert(dst.regClass() == s1);
356
357 /* if we're currently in WQM mode, ensure that the source is also computed in WQM */
358 Temp tmp = bld.tmp(s1);
359 bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.scc(Definition(tmp)), val, Operand(exec, s2));
360 return emit_wqm(ctx, tmp, dst);
361 }
362
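/* Resolves a NIR ALU source, including its swizzle, to a Temp with the requested
 * number of components. Identity swizzles reuse the existing temporary; anything
 * else is rebuilt from extracts with p_create_vector. */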
363 Temp get_alu_src(struct isel_context *ctx, nir_alu_src src, unsigned size=1)
364 {
365 if (src.src.ssa->num_components == 1 && src.swizzle[0] == 0 && size == 1)
366 return get_ssa_temp(ctx, src.src.ssa);
367
368 if (src.src.ssa->num_components == size) {
369 bool identity_swizzle = true;
370 for (unsigned i = 0; identity_swizzle && i < size; i++) {
371 if (src.swizzle[i] != i)
372 identity_swizzle = false;
373 }
374 if (identity_swizzle)
375 return get_ssa_temp(ctx, src.src.ssa);
376 }
377
378 Temp vec = get_ssa_temp(ctx, src.src.ssa);
379 unsigned elem_size = vec.size() / src.src.ssa->num_components;
380 assert(elem_size > 0); /* TODO: 8 and 16-bit vectors not supported */
381 assert(vec.size() % elem_size == 0);
382
383 RegClass elem_rc = RegClass(vec.type(), elem_size);
384 if (size == 1) {
385 return emit_extract_vector(ctx, vec, src.swizzle[0], elem_rc);
386 } else {
387 assert(size <= 4);
388 std::array<Temp,4> elems;
389 aco_ptr<Pseudo_instruction> vec_instr{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, size, 1)};
390 for (unsigned i = 0; i < size; ++i) {
391 elems[i] = emit_extract_vector(ctx, vec, src.swizzle[i], elem_rc);
392 vec_instr->operands[i] = Operand{elems[i]};
393 }
394 Temp dst{ctx->program->allocateId(), RegClass(vec.type(), elem_size * size)};
395 vec_instr->definitions[0] = Definition(dst);
396 ctx->block->instructions.emplace_back(std::move(vec_instr));
397 ctx->allocated_vec.emplace(dst.id(), elems);
398 return dst;
399 }
400 }
401
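/* Widens a 32-bit address to a 64-bit SGPR pair by appending address32_hi; VGPR
 * pointers are first made uniform with v_readfirstlane_b32. */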
402 Temp convert_pointer_to_64_bit(isel_context *ctx, Temp ptr)
403 {
404 if (ptr.size() == 2)
405 return ptr;
406 Builder bld(ctx->program, ctx->block);
407 if (ptr.type() == RegType::vgpr)
408 ptr = bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), ptr);
409 return bld.pseudo(aco_opcode::p_create_vector, bld.def(s2),
410 ptr, Operand((unsigned)ctx->options->address32_hi));
411 }
412
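/* Emits a SOP2 instruction for a two-source NIR ALU op, adding an extra SCC
 * definition when the opcode writes SCC. */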
413 void emit_sop2_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst, bool writes_scc)
414 {
415 aco_ptr<SOP2_instruction> sop2{create_instruction<SOP2_instruction>(op, Format::SOP2, 2, writes_scc ? 2 : 1)};
416 sop2->operands[0] = Operand(get_alu_src(ctx, instr->src[0]));
417 sop2->operands[1] = Operand(get_alu_src(ctx, instr->src[1]));
418 sop2->definitions[0] = Definition(dst);
419 if (writes_scc)
420 sop2->definitions[1] = Definition(ctx->program->allocateId(), scc, s1);
421 ctx->block->instructions.emplace_back(std::move(sop2));
422 }
423
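/* VOP2 only accepts an SGPR (or constant) in operand 0, so when src1 is an SGPR we
 * either swap commutative operands, promote to the VOP3-encoded form, or copy src1
 * into a VGPR (v_madmk/v_madak have no VOP3 encoding). */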
424 void emit_vop2_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst, bool commutative, bool swap_srcs=false)
425 {
426 Builder bld(ctx->program, ctx->block);
427 Temp src0 = get_alu_src(ctx, instr->src[swap_srcs ? 1 : 0]);
428 Temp src1 = get_alu_src(ctx, instr->src[swap_srcs ? 0 : 1]);
429 if (src1.type() == RegType::sgpr) {
430 if (commutative && src0.type() == RegType::vgpr) {
431 Temp t = src0;
432 src0 = src1;
433 src1 = t;
434 } else if (src0.type() == RegType::vgpr &&
435 op != aco_opcode::v_madmk_f32 &&
436 op != aco_opcode::v_madak_f32 &&
437 op != aco_opcode::v_madmk_f16 &&
438 op != aco_opcode::v_madak_f16) {
439 /* If the instruction is not commutative, we emit a VOP3A instruction */
440 bld.vop2_e64(op, Definition(dst), src0, src1);
441 return;
442 } else {
443 src1 = bld.copy(bld.def(RegType::vgpr, src1.size()), src1); //TODO: as_vgpr
444 }
445 }
446 bld.vop2(op, Definition(dst), src0, src1);
447 }
448
449 void emit_vop3a_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst)
450 {
451 Temp src0 = get_alu_src(ctx, instr->src[0]);
452 Temp src1 = get_alu_src(ctx, instr->src[1]);
453 Temp src2 = get_alu_src(ctx, instr->src[2]);
454
455 /* ensure that the instruction has at most 1 sgpr operand
456 * The optimizer will inline constants for us */
457 if (src0.type() == RegType::sgpr && src1.type() == RegType::sgpr)
458 src0 = as_vgpr(ctx, src0);
459 if (src1.type() == RegType::sgpr && src2.type() == RegType::sgpr)
460 src1 = as_vgpr(ctx, src1);
461 if (src2.type() == RegType::sgpr && src0.type() == RegType::sgpr)
462 src2 = as_vgpr(ctx, src2);
463
464 Builder bld(ctx->program, ctx->block);
465 bld.vop3(op, Definition(dst), src0, src1, src2);
466 }
467
468 void emit_vop1_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst)
469 {
470 Builder bld(ctx->program, ctx->block);
471 bld.vop1(op, Definition(dst), get_alu_src(ctx, instr->src[0]));
472 }
473
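/* VOPC likewise needs src1 in a VGPR: when only src0 is a VGPR, the operands are
 * swapped and the comparison opcode is mirrored (lt <-> gt, ge <-> le). */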
474 void emit_vopc_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst)
475 {
476 Temp src0 = get_alu_src(ctx, instr->src[0]);
477 Temp src1 = get_alu_src(ctx, instr->src[1]);
478 aco_ptr<Instruction> vopc;
479 if (src1.type() == RegType::sgpr) {
480 if (src0.type() == RegType::vgpr) {
481 /* to swap the operands, we might also have to change the opcode */
482 switch (op) {
483 case aco_opcode::v_cmp_lt_f32:
484 op = aco_opcode::v_cmp_gt_f32;
485 break;
486 case aco_opcode::v_cmp_ge_f32:
487 op = aco_opcode::v_cmp_le_f32;
488 break;
489 case aco_opcode::v_cmp_lt_i32:
490 op = aco_opcode::v_cmp_gt_i32;
491 break;
492 case aco_opcode::v_cmp_ge_i32:
493 op = aco_opcode::v_cmp_le_i32;
494 break;
495 case aco_opcode::v_cmp_lt_u32:
496 op = aco_opcode::v_cmp_gt_u32;
497 break;
498 case aco_opcode::v_cmp_ge_u32:
499 op = aco_opcode::v_cmp_le_u32;
500 break;
501 case aco_opcode::v_cmp_lt_f64:
502 op = aco_opcode::v_cmp_gt_f64;
503 break;
504 case aco_opcode::v_cmp_ge_f64:
505 op = aco_opcode::v_cmp_le_f64;
506 break;
507 case aco_opcode::v_cmp_lt_i64:
508 op = aco_opcode::v_cmp_gt_i64;
509 break;
510 case aco_opcode::v_cmp_ge_i64:
511 op = aco_opcode::v_cmp_le_i64;
512 break;
513 case aco_opcode::v_cmp_lt_u64:
514 op = aco_opcode::v_cmp_gt_u64;
515 break;
516 case aco_opcode::v_cmp_ge_u64:
517 op = aco_opcode::v_cmp_le_u64;
518 break;
519 default: /* eq and ne are commutative */
520 break;
521 }
522 Temp t = src0;
523 src0 = src1;
524 src1 = t;
525 } else {
526 src1 = as_vgpr(ctx, src1);
527 }
528 }
529
530 Builder bld(ctx->program, ctx->block);
531 bld.vopc(op, bld.hint_vcc(Definition(dst)), src0, src1);
532 }
533
534 void emit_sopc_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst)
535 {
536 Temp src0 = get_alu_src(ctx, instr->src[0]);
537 Temp src1 = get_alu_src(ctx, instr->src[1]);
538
539 assert(dst.regClass() == s2);
540 assert(src0.type() == RegType::sgpr);
541 assert(src1.type() == RegType::sgpr);
542
543 Builder bld(ctx->program, ctx->block);
544 /* Emit the SALU comparison instruction */
545 Temp cmp = bld.sopc(op, bld.scc(bld.def(s1)), src0, src1);
546 /* Turn the result into a per-lane bool */
547 bool_to_vector_condition(ctx, cmp, dst);
548 }
549
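/* Chooses between the scalar and vector form of a comparison: the SALU opcode is
 * only used when the result is uniform, both sources live in SGPRs and a scalar
 * opcode exists for this comparison. */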
550 void emit_comparison(isel_context *ctx, nir_alu_instr *instr, Temp dst,
551 aco_opcode v32_op, aco_opcode v64_op, aco_opcode s32_op = aco_opcode::last_opcode, aco_opcode s64_op = aco_opcode::last_opcode)
552 {
553 aco_opcode s_op = instr->src[0].src.ssa->bit_size == 64 ? s64_op : s32_op;
554 aco_opcode v_op = instr->src[0].src.ssa->bit_size == 64 ? v64_op : v32_op;
555 bool divergent_vals = ctx->divergent_vals[instr->dest.dest.ssa.index];
556 bool use_valu = s_op == aco_opcode::last_opcode ||
557 divergent_vals ||
558 ctx->allocated[instr->src[0].src.ssa->index].type() == RegType::vgpr ||
559 ctx->allocated[instr->src[1].src.ssa->index].type() == RegType::vgpr;
560 aco_opcode op = use_valu ? v_op : s_op;
561 assert(op != aco_opcode::last_opcode);
562
563 if (use_valu)
564 emit_vopc_instruction(ctx, instr, op, dst);
565 else
566 emit_sopc_instruction(ctx, instr, op, dst);
567 }
568
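/* 1-bit NIR booleans are represented as lane masks (s2), so boolean logic maps
 * directly onto the 64-bit SALU opcode. */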
569 void emit_boolean_logic(isel_context *ctx, nir_alu_instr *instr, aco_opcode op32, aco_opcode op64, Temp dst)
570 {
571 Builder bld(ctx->program, ctx->block);
572 Temp src0 = get_alu_src(ctx, instr->src[0]);
573 Temp src1 = get_alu_src(ctx, instr->src[1]);
574
575 assert(dst.regClass() == s2);
576 assert(src0.regClass() == s2);
577 assert(src1.regClass() == s2);
578
579 bld.sop2(op64, Definition(dst), bld.def(s1, scc), src0, src1);
580 }
581
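/* bcsel has three paths: v_cndmask_b32 for VGPR results, s_cselect for uniform
 * conditions and values, and the boolean AND/ANDN2/OR sequence below for divergent
 * 1-bit results. */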
582 void emit_bcsel(isel_context *ctx, nir_alu_instr *instr, Temp dst)
583 {
584 Builder bld(ctx->program, ctx->block);
585 Temp cond = get_alu_src(ctx, instr->src[0]);
586 Temp then = get_alu_src(ctx, instr->src[1]);
587 Temp els = get_alu_src(ctx, instr->src[2]);
588
589 assert(cond.regClass() == s2);
590
591 if (dst.type() == RegType::vgpr) {
592 aco_ptr<Instruction> bcsel;
593 if (dst.size() == 1) {
594 then = as_vgpr(ctx, then);
595 els = as_vgpr(ctx, els);
596
597 bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), els, then, cond);
598 } else if (dst.size() == 2) {
599 Temp then_lo = bld.tmp(v1), then_hi = bld.tmp(v1);
600 bld.pseudo(aco_opcode::p_split_vector, Definition(then_lo), Definition(then_hi), then);
601 Temp else_lo = bld.tmp(v1), else_hi = bld.tmp(v1);
602 bld.pseudo(aco_opcode::p_split_vector, Definition(else_lo), Definition(else_hi), els);
603
604 Temp dst0 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_lo, then_lo, cond);
605 Temp dst1 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_hi, then_hi, cond);
606
607 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
608 } else {
609 fprintf(stderr, "Unimplemented NIR instr bit size: ");
610 nir_print_instr(&instr->instr, stderr);
611 fprintf(stderr, "\n");
612 }
613 return;
614 }
615
616 if (instr->dest.dest.ssa.bit_size == 1) {
617 assert(dst.regClass() == s2);
618 assert(then.regClass() == s2);
619 assert(els.regClass() == s2);
620 }
621
622 if (!ctx->divergent_vals[instr->src[0].src.ssa->index]) { /* uniform condition and values in sgpr */
623 if (dst.regClass() == s1 || dst.regClass() == s2) {
624 assert((then.regClass() == s1 || then.regClass() == s2) && els.regClass() == then.regClass());
625 aco_opcode op = dst.regClass() == s1 ? aco_opcode::s_cselect_b32 : aco_opcode::s_cselect_b64;
626 bld.sop2(op, Definition(dst), then, els, bld.scc(bool_to_scalar_condition(ctx, cond)));
627 } else {
628 fprintf(stderr, "Unimplemented uniform bcsel bit size: ");
629 nir_print_instr(&instr->instr, stderr);
630 fprintf(stderr, "\n");
631 }
632 return;
633 }
634
635 /* divergent boolean bcsel
636 * this implements bcsel on bools: dst = s0 ? s1 : s2,
637 * which is equivalent to: dst = (s0 & s1) | (~s0 & s2) */
638 assert(instr->dest.dest.ssa.bit_size == 1);
639
640 if (cond.id() != then.id())
641 then = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), cond, then);
642
643 if (cond.id() == els.id())
644 bld.sop1(aco_opcode::s_mov_b64, Definition(dst), then);
645 else
646 bld.sop2(aco_opcode::s_or_b64, Definition(dst), bld.def(s1, scc), then,
647 bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.def(s1, scc), els, cond));
648 }
649
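/* The transcendental VALU ops flush denormal inputs, so when denormals have to be
 * preserved the value is scaled up by 2^24 first, the operation is applied and the
 * result is rescaled by 'undo'. v_cmp_class (positive/negative denormal) selects
 * between the scaled and the unscaled result. */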
650 void emit_scaled_op(isel_context *ctx, Builder& bld, Definition dst, Temp val,
651 aco_opcode op, uint32_t undo)
652 {
653 /* multiply by 16777216 to handle denormals */
654 Temp is_denormal = bld.vopc(aco_opcode::v_cmp_class_f32, bld.hint_vcc(bld.def(s2)),
655 as_vgpr(ctx, val), bld.copy(bld.def(v1), Operand((1u << 7) | (1u << 4))));
656 Temp scaled = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0x4b800000u), val);
657 scaled = bld.vop1(op, bld.def(v1), scaled);
658 scaled = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(undo), scaled);
659
660 Temp not_scaled = bld.vop1(op, bld.def(v1), val);
661
662 bld.vop2(aco_opcode::v_cndmask_b32, dst, not_scaled, scaled, is_denormal);
663 }
664
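/* Wrappers that use the plain VALU opcode when 32-bit denormals are flushed
 * (denorm32 == 0) and the scaled sequence above otherwise. */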
665 void emit_rcp(isel_context *ctx, Builder& bld, Definition dst, Temp val)
666 {
667 if (ctx->block->fp_mode.denorm32 == 0) {
668 bld.vop1(aco_opcode::v_rcp_f32, dst, val);
669 return;
670 }
671
672 emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_rcp_f32, 0x4b800000u);
673 }
674
675 void emit_rsq(isel_context *ctx, Builder& bld, Definition dst, Temp val)
676 {
677 if (ctx->block->fp_mode.denorm32 == 0) {
678 bld.vop1(aco_opcode::v_rsq_f32, dst, val);
679 return;
680 }
681
682 emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_rsq_f32, 0x45800000u);
683 }
684
685 void emit_sqrt(isel_context *ctx, Builder& bld, Definition dst, Temp val)
686 {
687 if (ctx->block->fp_mode.denorm32 == 0) {
688 bld.vop1(aco_opcode::v_sqrt_f32, dst, val);
689 return;
690 }
691
692 emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_sqrt_f32, 0x39800000u);
693 }
694
695 void emit_log2(isel_context *ctx, Builder& bld, Definition dst, Temp val)
696 {
697 if (ctx->block->fp_mode.denorm32 == 0) {
698 bld.vop1(aco_opcode::v_log_f32, dst, val);
699 return;
700 }
701
702 emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_log_f32, 0xc1c00000u);
703 }
704
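/* Translates a single NIR ALU instruction. Most cases dispatch to one of the
 * emit_*_instruction helpers based on the destination register class; unsupported
 * bit sizes fall through to the "Unimplemented" error path. */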
705 void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
706 {
707 if (!instr->dest.dest.is_ssa) {
708 fprintf(stderr, "nir alu dst not in ssa: ");
709 nir_print_instr(&instr->instr, stderr);
710 fprintf(stderr, "\n");
711 abort();
712 }
713 Builder bld(ctx->program, ctx->block);
714 Temp dst = get_ssa_temp(ctx, &instr->dest.dest.ssa);
715 switch(instr->op) {
716 case nir_op_vec2:
717 case nir_op_vec3:
718 case nir_op_vec4: {
719 std::array<Temp,4> elems;
720 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, instr->dest.dest.ssa.num_components, 1)};
721 for (unsigned i = 0; i < instr->dest.dest.ssa.num_components; ++i) {
722 elems[i] = get_alu_src(ctx, instr->src[i]);
723 vec->operands[i] = Operand{elems[i]};
724 }
725 vec->definitions[0] = Definition(dst);
726 ctx->block->instructions.emplace_back(std::move(vec));
727 ctx->allocated_vec.emplace(dst.id(), elems);
728 break;
729 }
730 case nir_op_mov: {
731 Temp src = get_alu_src(ctx, instr->src[0]);
732 aco_ptr<Instruction> mov;
733 if (dst.type() == RegType::sgpr) {
734 if (src.type() == RegType::vgpr)
735 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), src);
736 else if (src.regClass() == s1)
737 bld.sop1(aco_opcode::s_mov_b32, Definition(dst), src);
738 else if (src.regClass() == s2)
739 bld.sop1(aco_opcode::s_mov_b64, Definition(dst), src);
740 else
741 unreachable("wrong src register class for nir_op_mov");
742 } else if (dst.regClass() == v1) {
743 bld.vop1(aco_opcode::v_mov_b32, Definition(dst), src);
744 } else if (dst.regClass() == v2) {
745 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src);
746 } else {
747 nir_print_instr(&instr->instr, stderr);
748 unreachable("Should have been lowered to scalar.");
749 }
750 break;
751 }
752 case nir_op_inot: {
753 Temp src = get_alu_src(ctx, instr->src[0]);
754 if (instr->dest.dest.ssa.bit_size == 1) {
755 assert(src.regClass() == s2);
756 assert(dst.regClass() == s2);
757 bld.sop2(aco_opcode::s_andn2_b64, Definition(dst), bld.def(s1, scc), Operand(exec, s2), src);
758 } else if (dst.regClass() == v1) {
759 emit_vop1_instruction(ctx, instr, aco_opcode::v_not_b32, dst);
760 } else if (dst.type() == RegType::sgpr) {
761 aco_opcode opcode = dst.size() == 1 ? aco_opcode::s_not_b32 : aco_opcode::s_not_b64;
762 bld.sop1(opcode, Definition(dst), bld.def(s1, scc), src);
763 } else {
764 fprintf(stderr, "Unimplemented NIR instr bit size: ");
765 nir_print_instr(&instr->instr, stderr);
766 fprintf(stderr, "\n");
767 }
768 break;
769 }
770 case nir_op_ineg: {
771 Temp src = get_alu_src(ctx, instr->src[0]);
772 if (dst.regClass() == v1) {
773 bld.vsub32(Definition(dst), Operand(0u), Operand(src));
774 } else if (dst.regClass() == s1) {
775 bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand((uint32_t) -1), src);
776 } else if (dst.size() == 2) {
777 Temp src0 = bld.tmp(dst.type(), 1);
778 Temp src1 = bld.tmp(dst.type(), 1);
779 bld.pseudo(aco_opcode::p_split_vector, Definition(src0), Definition(src1), src);
780
781 if (dst.regClass() == s2) {
782 Temp carry = bld.tmp(s1);
783 Temp dst0 = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(carry)), Operand(0u), src0);
784 Temp dst1 = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), Operand(0u), src1, carry);
785 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
786 } else {
787 Temp lower = bld.tmp(v1);
788 Temp borrow = bld.vsub32(Definition(lower), Operand(0u), src0, true).def(1).getTemp();
789 Temp upper = bld.vsub32(bld.def(v1), Operand(0u), src1, false, borrow);
790 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
791 }
792 } else {
793 fprintf(stderr, "Unimplemented NIR instr bit size: ");
794 nir_print_instr(&instr->instr, stderr);
795 fprintf(stderr, "\n");
796 }
797 break;
798 }
799 case nir_op_iabs: {
800 if (dst.regClass() == s1) {
801 bld.sop1(aco_opcode::s_abs_i32, Definition(dst), bld.def(s1, scc), get_alu_src(ctx, instr->src[0]));
802 } else if (dst.regClass() == v1) {
803 Temp src = get_alu_src(ctx, instr->src[0]);
804 bld.vop2(aco_opcode::v_max_i32, Definition(dst), src, bld.vsub32(bld.def(v1), Operand(0u), src));
805 } else {
806 fprintf(stderr, "Unimplemented NIR instr bit size: ");
807 nir_print_instr(&instr->instr, stderr);
808 fprintf(stderr, "\n");
809 }
810 break;
811 }
812 case nir_op_isign: {
813 Temp src = get_alu_src(ctx, instr->src[0]);
814 if (dst.regClass() == s1) {
815 Temp tmp = bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), src, Operand(31u));
816 Temp gtz = bld.sopc(aco_opcode::s_cmp_gt_i32, bld.def(s1, scc), src, Operand(0u));
817 bld.sop2(aco_opcode::s_add_i32, Definition(dst), bld.def(s1, scc), gtz, tmp);
818 } else if (dst.regClass() == s2) {
819 Temp neg = bld.sop2(aco_opcode::s_ashr_i64, bld.def(s2), bld.def(s1, scc), src, Operand(63u));
820 Temp neqz = bld.sopc(aco_opcode::s_cmp_lg_u64, bld.def(s1, scc), src, Operand(0u));
821 bld.sop2(aco_opcode::s_or_b64, Definition(dst), bld.def(s1, scc), neg, neqz);
822 } else if (dst.regClass() == v1) {
823 Temp tmp = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), src);
824 Temp gtz = bld.vopc(aco_opcode::v_cmp_ge_i32, bld.hint_vcc(bld.def(s2)), Operand(0u), src);
825 bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand(1u), tmp, gtz);
826 } else if (dst.regClass() == v2) {
827 Temp upper = emit_extract_vector(ctx, src, 1, v1);
828 Temp neg = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), upper);
829 Temp gtz = bld.vopc(aco_opcode::v_cmp_ge_i64, bld.hint_vcc(bld.def(s2)), Operand(0u), src);
830 Temp lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(1u), neg, gtz);
831 upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), neg, gtz);
832 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
833 } else {
834 fprintf(stderr, "Unimplemented NIR instr bit size: ");
835 nir_print_instr(&instr->instr, stderr);
836 fprintf(stderr, "\n");
837 }
838 break;
839 }
840 case nir_op_imax: {
841 if (dst.regClass() == v1) {
842 emit_vop2_instruction(ctx, instr, aco_opcode::v_max_i32, dst, true);
843 } else if (dst.regClass() == s1) {
844 emit_sop2_instruction(ctx, instr, aco_opcode::s_max_i32, dst, true);
845 } else {
846 fprintf(stderr, "Unimplemented NIR instr bit size: ");
847 nir_print_instr(&instr->instr, stderr);
848 fprintf(stderr, "\n");
849 }
850 break;
851 }
852 case nir_op_umax: {
853 if (dst.regClass() == v1) {
854 emit_vop2_instruction(ctx, instr, aco_opcode::v_max_u32, dst, true);
855 } else if (dst.regClass() == s1) {
856 emit_sop2_instruction(ctx, instr, aco_opcode::s_max_u32, dst, true);
857 } else {
858 fprintf(stderr, "Unimplemented NIR instr bit size: ");
859 nir_print_instr(&instr->instr, stderr);
860 fprintf(stderr, "\n");
861 }
862 break;
863 }
864 case nir_op_imin: {
865 if (dst.regClass() == v1) {
866 emit_vop2_instruction(ctx, instr, aco_opcode::v_min_i32, dst, true);
867 } else if (dst.regClass() == s1) {
868 emit_sop2_instruction(ctx, instr, aco_opcode::s_min_i32, dst, true);
869 } else {
870 fprintf(stderr, "Unimplemented NIR instr bit size: ");
871 nir_print_instr(&instr->instr, stderr);
872 fprintf(stderr, "\n");
873 }
874 break;
875 }
876 case nir_op_umin: {
877 if (dst.regClass() == v1) {
878 emit_vop2_instruction(ctx, instr, aco_opcode::v_min_u32, dst, true);
879 } else if (dst.regClass() == s1) {
880 emit_sop2_instruction(ctx, instr, aco_opcode::s_min_u32, dst, true);
881 } else {
882 fprintf(stderr, "Unimplemented NIR instr bit size: ");
883 nir_print_instr(&instr->instr, stderr);
884 fprintf(stderr, "\n");
885 }
886 break;
887 }
888 case nir_op_ior: {
889 if (instr->dest.dest.ssa.bit_size == 1) {
890 emit_boolean_logic(ctx, instr, aco_opcode::s_or_b32, aco_opcode::s_or_b64, dst);
891 } else if (dst.regClass() == v1) {
892 emit_vop2_instruction(ctx, instr, aco_opcode::v_or_b32, dst, true);
893 } else if (dst.regClass() == s1) {
894 emit_sop2_instruction(ctx, instr, aco_opcode::s_or_b32, dst, true);
895 } else if (dst.regClass() == s2) {
896 emit_sop2_instruction(ctx, instr, aco_opcode::s_or_b64, dst, true);
897 } else {
898 fprintf(stderr, "Unimplemented NIR instr bit size: ");
899 nir_print_instr(&instr->instr, stderr);
900 fprintf(stderr, "\n");
901 }
902 break;
903 }
904 case nir_op_iand: {
905 if (instr->dest.dest.ssa.bit_size == 1) {
906 emit_boolean_logic(ctx, instr, aco_opcode::s_and_b32, aco_opcode::s_and_b64, dst);
907 } else if (dst.regClass() == v1) {
908 emit_vop2_instruction(ctx, instr, aco_opcode::v_and_b32, dst, true);
909 } else if (dst.regClass() == s1) {
910 emit_sop2_instruction(ctx, instr, aco_opcode::s_and_b32, dst, true);
911 } else if (dst.regClass() == s2) {
912 emit_sop2_instruction(ctx, instr, aco_opcode::s_and_b64, dst, true);
913 } else {
914 fprintf(stderr, "Unimplemented NIR instr bit size: ");
915 nir_print_instr(&instr->instr, stderr);
916 fprintf(stderr, "\n");
917 }
918 break;
919 }
920 case nir_op_ixor: {
921 if (instr->dest.dest.ssa.bit_size == 1) {
922 emit_boolean_logic(ctx, instr, aco_opcode::s_xor_b32, aco_opcode::s_xor_b64, dst);
923 } else if (dst.regClass() == v1) {
924 emit_vop2_instruction(ctx, instr, aco_opcode::v_xor_b32, dst, true);
925 } else if (dst.regClass() == s1) {
926 emit_sop2_instruction(ctx, instr, aco_opcode::s_xor_b32, dst, true);
927 } else if (dst.regClass() == s2) {
928 emit_sop2_instruction(ctx, instr, aco_opcode::s_xor_b64, dst, true);
929 } else {
930 fprintf(stderr, "Unimplemented NIR instr bit size: ");
931 nir_print_instr(&instr->instr, stderr);
932 fprintf(stderr, "\n");
933 }
934 break;
935 }
936 case nir_op_ushr: {
937 if (dst.regClass() == v1) {
938 emit_vop2_instruction(ctx, instr, aco_opcode::v_lshrrev_b32, dst, false, true);
939 } else if (dst.regClass() == v2) {
940 bld.vop3(aco_opcode::v_lshrrev_b64, Definition(dst),
941 get_alu_src(ctx, instr->src[1]), get_alu_src(ctx, instr->src[0]));
942 } else if (dst.regClass() == s2) {
943 emit_sop2_instruction(ctx, instr, aco_opcode::s_lshr_b64, dst, true);
944 } else if (dst.regClass() == s1) {
945 emit_sop2_instruction(ctx, instr, aco_opcode::s_lshr_b32, dst, true);
946 } else {
947 fprintf(stderr, "Unimplemented NIR instr bit size: ");
948 nir_print_instr(&instr->instr, stderr);
949 fprintf(stderr, "\n");
950 }
951 break;
952 }
953 case nir_op_ishl: {
954 if (dst.regClass() == v1) {
955 emit_vop2_instruction(ctx, instr, aco_opcode::v_lshlrev_b32, dst, false, true);
956 } else if (dst.regClass() == v2) {
957 bld.vop3(aco_opcode::v_lshlrev_b64, Definition(dst),
958 get_alu_src(ctx, instr->src[1]), get_alu_src(ctx, instr->src[0]));
959 } else if (dst.regClass() == s1) {
960 emit_sop2_instruction(ctx, instr, aco_opcode::s_lshl_b32, dst, true);
961 } else if (dst.regClass() == s2) {
962 emit_sop2_instruction(ctx, instr, aco_opcode::s_lshl_b64, dst, true);
963 } else {
964 fprintf(stderr, "Unimplemented NIR instr bit size: ");
965 nir_print_instr(&instr->instr, stderr);
966 fprintf(stderr, "\n");
967 }
968 break;
969 }
970 case nir_op_ishr: {
971 if (dst.regClass() == v1) {
972 emit_vop2_instruction(ctx, instr, aco_opcode::v_ashrrev_i32, dst, false, true);
973 } else if (dst.regClass() == v2) {
974 bld.vop3(aco_opcode::v_ashrrev_i64, Definition(dst),
975 get_alu_src(ctx, instr->src[1]), get_alu_src(ctx, instr->src[0]));
976 } else if (dst.regClass() == s1) {
977 emit_sop2_instruction(ctx, instr, aco_opcode::s_ashr_i32, dst, true);
978 } else if (dst.regClass() == s2) {
979 emit_sop2_instruction(ctx, instr, aco_opcode::s_ashr_i64, dst, true);
980 } else {
981 fprintf(stderr, "Unimplemented NIR instr bit size: ");
982 nir_print_instr(&instr->instr, stderr);
983 fprintf(stderr, "\n");
984 }
985 break;
986 }
987 case nir_op_find_lsb: {
988 Temp src = get_alu_src(ctx, instr->src[0]);
989 if (src.regClass() == s1) {
990 bld.sop1(aco_opcode::s_ff1_i32_b32, Definition(dst), src);
991 } else if (src.regClass() == v1) {
992 emit_vop1_instruction(ctx, instr, aco_opcode::v_ffbl_b32, dst);
993 } else if (src.regClass() == s2) {
994 bld.sop1(aco_opcode::s_ff1_i32_b64, Definition(dst), src);
995 } else {
996 fprintf(stderr, "Unimplemented NIR instr bit size: ");
997 nir_print_instr(&instr->instr, stderr);
998 fprintf(stderr, "\n");
999 }
1000 break;
1001 }
1002 case nir_op_ufind_msb:
1003 case nir_op_ifind_msb: {
1004 Temp src = get_alu_src(ctx, instr->src[0]);
1005 if (src.regClass() == s1 || src.regClass() == s2) {
1006 aco_opcode op = src.regClass() == s2 ?
1007 (instr->op == nir_op_ufind_msb ? aco_opcode::s_flbit_i32_b64 : aco_opcode::s_flbit_i32_i64) :
1008 (instr->op == nir_op_ufind_msb ? aco_opcode::s_flbit_i32_b32 : aco_opcode::s_flbit_i32);
1009 Temp msb_rev = bld.sop1(op, bld.def(s1), src);
1010
1011 Builder::Result sub = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc),
1012 Operand(src.size() * 32u - 1u), msb_rev);
1013 Temp msb = sub.def(0).getTemp();
1014 Temp carry = sub.def(1).getTemp();
1015
1016 bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand((uint32_t)-1), msb, carry);
1017 } else if (src.regClass() == v1) {
1018 aco_opcode op = instr->op == nir_op_ufind_msb ? aco_opcode::v_ffbh_u32 : aco_opcode::v_ffbh_i32;
1019 Temp msb_rev = bld.tmp(v1);
1020 emit_vop1_instruction(ctx, instr, op, msb_rev);
1021 Temp msb = bld.tmp(v1);
1022 Temp carry = bld.vsub32(Definition(msb), Operand(31u), Operand(msb_rev), true).def(1).getTemp();
1023 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), msb, Operand((uint32_t)-1), carry);
1024 } else {
1025 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1026 nir_print_instr(&instr->instr, stderr);
1027 fprintf(stderr, "\n");
1028 }
1029 break;
1030 }
1031 case nir_op_bitfield_reverse: {
1032 if (dst.regClass() == s1) {
1033 bld.sop1(aco_opcode::s_brev_b32, Definition(dst), get_alu_src(ctx, instr->src[0]));
1034 } else if (dst.regClass() == v1) {
1035 bld.vop1(aco_opcode::v_bfrev_b32, Definition(dst), get_alu_src(ctx, instr->src[0]));
1036 } else {
1037 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1038 nir_print_instr(&instr->instr, stderr);
1039 fprintf(stderr, "\n");
1040 }
1041 break;
1042 }
1043 case nir_op_iadd: {
1044 if (dst.regClass() == s1) {
1045 emit_sop2_instruction(ctx, instr, aco_opcode::s_add_u32, dst, true);
1046 break;
1047 }
1048
1049 Temp src0 = get_alu_src(ctx, instr->src[0]);
1050 Temp src1 = get_alu_src(ctx, instr->src[1]);
1051 if (dst.regClass() == v1) {
1052 bld.vadd32(Definition(dst), Operand(src0), Operand(src1));
1053 break;
1054 }
1055
1056 assert(src0.size() == 2 && src1.size() == 2);
1057 Temp src00 = bld.tmp(src0.type(), 1);
1058 Temp src01 = bld.tmp(dst.type(), 1);
1059 bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1060 Temp src10 = bld.tmp(src1.type(), 1);
1061 Temp src11 = bld.tmp(dst.type(), 1);
1062 bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1063
1064 if (dst.regClass() == s2) {
1065 Temp carry = bld.tmp(s1);
1066 Temp dst0 = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10);
1067 Temp dst1 = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), src01, src11, bld.scc(carry));
1068 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
1069 } else if (dst.regClass() == v2) {
1070 Temp dst0 = bld.tmp(v1);
1071 Temp carry = bld.vadd32(Definition(dst0), src00, src10, true).def(1).getTemp();
1072 Temp dst1 = bld.vadd32(bld.def(v1), src01, src11, false, carry);
1073 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
1074 } else {
1075 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1076 nir_print_instr(&instr->instr, stderr);
1077 fprintf(stderr, "\n");
1078 }
1079 break;
1080 }
1081 case nir_op_uadd_sat: {
1082 Temp src0 = get_alu_src(ctx, instr->src[0]);
1083 Temp src1 = get_alu_src(ctx, instr->src[1]);
1084 if (dst.regClass() == s1) {
1085 Temp tmp = bld.tmp(s1), carry = bld.tmp(s1);
1086 bld.sop2(aco_opcode::s_add_u32, Definition(tmp), bld.scc(Definition(carry)),
1087 src0, src1);
1088 bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand((uint32_t) -1), tmp, bld.scc(carry));
1089 } else if (dst.regClass() == v1) {
1090 if (ctx->options->chip_class >= GFX9) {
1091 aco_ptr<VOP3A_instruction> add{create_instruction<VOP3A_instruction>(aco_opcode::v_add_u32, asVOP3(Format::VOP2), 2, 1)};
1092 add->operands[0] = Operand(src0);
1093 add->operands[1] = Operand(src1);
1094 add->definitions[0] = Definition(dst);
1095 add->clamp = 1;
1096 ctx->block->instructions.emplace_back(std::move(add));
1097 } else {
1098 if (src1.regClass() != v1)
1099 std::swap(src0, src1);
1100 assert(src1.regClass() == v1);
1101 Temp tmp = bld.tmp(v1);
1102 Temp carry = bld.vadd32(Definition(tmp), src0, src1, true).def(1).getTemp();
1103 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), tmp, Operand((uint32_t) -1), carry);
1104 }
1105 } else {
1106 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1107 nir_print_instr(&instr->instr, stderr);
1108 fprintf(stderr, "\n");
1109 }
1110 break;
1111 }
1112 case nir_op_uadd_carry: {
1113 Temp src0 = get_alu_src(ctx, instr->src[0]);
1114 Temp src1 = get_alu_src(ctx, instr->src[1]);
1115 if (dst.regClass() == s1) {
1116 bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(dst)), src0, src1);
1117 break;
1118 }
1119 if (dst.regClass() == v1) {
1120 Temp carry = bld.vadd32(bld.def(v1), src0, src1, true).def(1).getTemp();
1121 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand(1u), carry);
1122 break;
1123 }
1124
1125 Temp src00 = bld.tmp(src0.type(), 1);
1126 Temp src01 = bld.tmp(dst.type(), 1);
1127 bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1128 Temp src10 = bld.tmp(src1.type(), 1);
1129 Temp src11 = bld.tmp(dst.type(), 1);
1130 bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1131 if (dst.regClass() == s2) {
1132 Temp carry = bld.tmp(s1);
1133 bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10);
1134 carry = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.scc(bld.def(s1)), src01, src11, bld.scc(carry)).def(1).getTemp();
1135 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), carry, Operand(0u));
1136 } else if (dst.regClass() == v2) {
1137 Temp carry = bld.vadd32(bld.def(v1), src00, src10, true).def(1).getTemp();
1138 carry = bld.vadd32(bld.def(v1), src01, src11, true, carry).def(1).getTemp();
1139 carry = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), Operand(1u), carry);
1140 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), carry, Operand(0u));
1141 } else {
1142 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1143 nir_print_instr(&instr->instr, stderr);
1144 fprintf(stderr, "\n");
1145 }
1146 break;
1147 }
1148 case nir_op_isub: {
1149 if (dst.regClass() == s1) {
1150 emit_sop2_instruction(ctx, instr, aco_opcode::s_sub_i32, dst, true);
1151 break;
1152 }
1153
1154 Temp src0 = get_alu_src(ctx, instr->src[0]);
1155 Temp src1 = get_alu_src(ctx, instr->src[1]);
1156 if (dst.regClass() == v1) {
1157 bld.vsub32(Definition(dst), src0, src1);
1158 break;
1159 }
1160
1161 Temp src00 = bld.tmp(src0.type(), 1);
1162 Temp src01 = bld.tmp(dst.type(), 1);
1163 bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1164 Temp src10 = bld.tmp(src1.type(), 1);
1165 Temp src11 = bld.tmp(dst.type(), 1);
1166 bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1167 if (dst.regClass() == s2) {
1168 Temp carry = bld.tmp(s1);
1169 Temp dst0 = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10);
1170 Temp dst1 = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), src01, src11, carry);
1171 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
1172 } else if (dst.regClass() == v2) {
1173 Temp lower = bld.tmp(v1);
1174 Temp borrow = bld.vsub32(Definition(lower), src00, src10, true).def(1).getTemp();
1175 Temp upper = bld.vsub32(bld.def(v1), src01, src11, false, borrow);
1176 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1177 } else {
1178 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1179 nir_print_instr(&instr->instr, stderr);
1180 fprintf(stderr, "\n");
1181 }
1182 break;
1183 }
1184 case nir_op_usub_borrow: {
1185 Temp src0 = get_alu_src(ctx, instr->src[0]);
1186 Temp src1 = get_alu_src(ctx, instr->src[1]);
1187 if (dst.regClass() == s1) {
1188 bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(dst)), src0, src1);
1189 break;
1190 } else if (dst.regClass() == v1) {
1191 Temp borrow = bld.vsub32(bld.def(v1), src0, src1, true).def(1).getTemp();
1192 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand(1u), borrow);
1193 break;
1194 }
1195
1196 Temp src00 = bld.tmp(src0.type(), 1);
1197 Temp src01 = bld.tmp(dst.type(), 1);
1198 bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1199 Temp src10 = bld.tmp(src1.type(), 1);
1200 Temp src11 = bld.tmp(dst.type(), 1);
1201 bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1202 if (dst.regClass() == s2) {
1203 Temp borrow = bld.tmp(s1);
1204 bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), src00, src10);
1205 borrow = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.scc(bld.def(s1)), src01, src11, bld.scc(borrow)).def(1).getTemp();
1206 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), borrow, Operand(0u));
1207 } else if (dst.regClass() == v2) {
1208 Temp borrow = bld.vsub32(bld.def(v1), src00, src10, true).def(1).getTemp();
1209 borrow = bld.vsub32(bld.def(v1), src01, src11, true, Operand(borrow)).def(1).getTemp();
1210 borrow = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), Operand(1u), borrow);
1211 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), borrow, Operand(0u));
1212 } else {
1213 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1214 nir_print_instr(&instr->instr, stderr);
1215 fprintf(stderr, "\n");
1216 }
1217 break;
1218 }
1219 case nir_op_imul: {
1220 if (dst.regClass() == v1) {
1221 bld.vop3(aco_opcode::v_mul_lo_u32, Definition(dst),
1222 get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1223 } else if (dst.regClass() == s1) {
1224 emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_i32, dst, false);
1225 } else {
1226 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1227 nir_print_instr(&instr->instr, stderr);
1228 fprintf(stderr, "\n");
1229 }
1230 break;
1231 }
1232 case nir_op_umul_high: {
1233 if (dst.regClass() == v1) {
1234 bld.vop3(aco_opcode::v_mul_hi_u32, Definition(dst), get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1235 } else if (dst.regClass() == s1 && ctx->options->chip_class >= GFX9) {
1236 bld.sop2(aco_opcode::s_mul_hi_u32, Definition(dst), get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1237 } else if (dst.regClass() == s1) {
1238 Temp tmp = bld.vop3(aco_opcode::v_mul_hi_u32, bld.def(v1), get_alu_src(ctx, instr->src[0]),
1239 as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
1240 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
1241 } else {
1242 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1243 nir_print_instr(&instr->instr, stderr);
1244 fprintf(stderr, "\n");
1245 }
1246 break;
1247 }
1248 case nir_op_imul_high: {
1249 if (dst.regClass() == v1) {
1250 bld.vop3(aco_opcode::v_mul_hi_i32, Definition(dst), get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1251 } else if (dst.regClass() == s1 && ctx->options->chip_class >= GFX9) {
1252 bld.sop2(aco_opcode::s_mul_hi_i32, Definition(dst), get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1253 } else if (dst.regClass() == s1) {
1254 Temp tmp = bld.vop3(aco_opcode::v_mul_hi_i32, bld.def(v1), get_alu_src(ctx, instr->src[0]),
1255 as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
1256 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
1257 } else {
1258 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1259 nir_print_instr(&instr->instr, stderr);
1260 fprintf(stderr, "\n");
1261 }
1262 break;
1263 }
1264 case nir_op_fmul: {
1265 if (dst.size() == 1) {
1266 emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_f32, dst, true);
1267 } else if (dst.size() == 2) {
1268 bld.vop3(aco_opcode::v_mul_f64, Definition(dst), get_alu_src(ctx, instr->src[0]),
1269 as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
1270 } else {
1271 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1272 nir_print_instr(&instr->instr, stderr);
1273 fprintf(stderr, "\n");
1274 }
1275 break;
1276 }
1277 case nir_op_fadd: {
1278 if (dst.size() == 1) {
1279 emit_vop2_instruction(ctx, instr, aco_opcode::v_add_f32, dst, true);
1280 } else if (dst.size() == 2) {
1281 bld.vop3(aco_opcode::v_add_f64, Definition(dst), get_alu_src(ctx, instr->src[0]),
1282 as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
1283 } else {
1284 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1285 nir_print_instr(&instr->instr, stderr);
1286 fprintf(stderr, "\n");
1287 }
1288 break;
1289 }
1290 case nir_op_fsub: {
1291 Temp src0 = get_alu_src(ctx, instr->src[0]);
1292 Temp src1 = get_alu_src(ctx, instr->src[1]);
1293 if (dst.size() == 1) {
1294 if (src1.type() == RegType::vgpr || src0.type() != RegType::vgpr)
1295 emit_vop2_instruction(ctx, instr, aco_opcode::v_sub_f32, dst, false);
1296 else
1297 emit_vop2_instruction(ctx, instr, aco_opcode::v_subrev_f32, dst, true);
1298 } else if (dst.size() == 2) {
1299 Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst),
1300 get_alu_src(ctx, instr->src[0]),
1301 as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
1302 VOP3A_instruction* sub = static_cast<VOP3A_instruction*>(add);
1303 sub->neg[1] = true;
1304 } else {
1305 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1306 nir_print_instr(&instr->instr, stderr);
1307 fprintf(stderr, "\n");
1308 }
1309 break;
1310 }
1311 case nir_op_fmax: {
1312 if (dst.size() == 1) {
1313 emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f32, dst, true);
1314 } else if (dst.size() == 2) {
1315 bld.vop3(aco_opcode::v_max_f64, Definition(dst),
1316 get_alu_src(ctx, instr->src[0]),
1317 as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
1318 } else {
1319 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1320 nir_print_instr(&instr->instr, stderr);
1321 fprintf(stderr, "\n");
1322 }
1323 break;
1324 }
1325 case nir_op_fmin: {
1326 if (dst.size() == 1) {
1327 emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f32, dst, true);
1328 } else if (dst.size() == 2) {
1329 bld.vop3(aco_opcode::v_min_f64, Definition(dst),
1330 get_alu_src(ctx, instr->src[0]),
1331 as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
1332 } else {
1333 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1334 nir_print_instr(&instr->instr, stderr);
1335 fprintf(stderr, "\n");
1336 }
1337 break;
1338 }
1339 case nir_op_fmax3: {
1340 if (dst.size() == 1) {
1341 emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_f32, dst);
1342 } else {
1343 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1344 nir_print_instr(&instr->instr, stderr);
1345 fprintf(stderr, "\n");
1346 }
1347 break;
1348 }
1349 case nir_op_fmin3: {
1350 if (dst.size() == 1) {
1351 emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_f32, dst);
1352 } else {
1353 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1354 nir_print_instr(&instr->instr, stderr);
1355 fprintf(stderr, "\n");
1356 }
1357 break;
1358 }
1359 case nir_op_fmed3: {
1360 if (dst.size() == 1) {
1361 emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_f32, dst);
1362 } else {
1363 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1364 nir_print_instr(&instr->instr, stderr);
1365 fprintf(stderr, "\n");
1366 }
1367 break;
1368 }
1369 case nir_op_umax3: {
1370 if (dst.size() == 1) {
1371 emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_u32, dst);
1372 } else {
1373 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1374 nir_print_instr(&instr->instr, stderr);
1375 fprintf(stderr, "\n");
1376 }
1377 break;
1378 }
1379 case nir_op_umin3: {
1380 if (dst.size() == 1) {
1381 emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_u32, dst);
1382 } else {
1383 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1384 nir_print_instr(&instr->instr, stderr);
1385 fprintf(stderr, "\n");
1386 }
1387 break;
1388 }
1389 case nir_op_umed3: {
1390 if (dst.size() == 1) {
1391 emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_u32, dst);
1392 } else {
1393 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1394 nir_print_instr(&instr->instr, stderr);
1395 fprintf(stderr, "\n");
1396 }
1397 break;
1398 }
1399 case nir_op_imax3: {
1400 if (dst.size() == 1) {
1401 emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_i32, dst);
1402 } else {
1403 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1404 nir_print_instr(&instr->instr, stderr);
1405 fprintf(stderr, "\n");
1406 }
1407 break;
1408 }
1409 case nir_op_imin3: {
1410 if (dst.size() == 1) {
1411 emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_i32, dst);
1412 } else {
1413 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1414 nir_print_instr(&instr->instr, stderr);
1415 fprintf(stderr, "\n");
1416 }
1417 break;
1418 }
1419 case nir_op_imed3: {
1420 if (dst.size() == 1) {
1421 emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_i32, dst);
1422 } else {
1423 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1424 nir_print_instr(&instr->instr, stderr);
1425 fprintf(stderr, "\n");
1426 }
1427 break;
1428 }
1429 case nir_op_cube_face_coord: {
1430 Temp in = get_alu_src(ctx, instr->src[0], 3);
1431 Temp src[3] = { emit_extract_vector(ctx, in, 0, v1),
1432 emit_extract_vector(ctx, in, 1, v1),
1433 emit_extract_vector(ctx, in, 2, v1) };
1434 Temp ma = bld.vop3(aco_opcode::v_cubema_f32, bld.def(v1), src[0], src[1], src[2]);
1435 ma = bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), ma);
1436 Temp sc = bld.vop3(aco_opcode::v_cubesc_f32, bld.def(v1), src[0], src[1], src[2]);
1437 Temp tc = bld.vop3(aco_opcode::v_cubetc_f32, bld.def(v1), src[0], src[1], src[2]);
1438 sc = bld.vop2(aco_opcode::v_madak_f32, bld.def(v1), sc, ma, Operand(0x3f000000u/*0.5*/));
1439 tc = bld.vop2(aco_opcode::v_madak_f32, bld.def(v1), tc, ma, Operand(0x3f000000u/*0.5*/));
1440 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), sc, tc);
1441 break;
1442 }
1443 case nir_op_cube_face_index: {
1444 Temp in = get_alu_src(ctx, instr->src[0], 3);
1445 Temp src[3] = { emit_extract_vector(ctx, in, 0, v1),
1446 emit_extract_vector(ctx, in, 1, v1),
1447 emit_extract_vector(ctx, in, 2, v1) };
1448 bld.vop3(aco_opcode::v_cubeid_f32, Definition(dst), src[0], src[1], src[2]);
1449 break;
1450 }
1451 case nir_op_bcsel: {
1452 emit_bcsel(ctx, instr, dst);
1453 break;
1454 }
1455 case nir_op_frsq: {
1456 if (dst.size() == 1) {
1457 emit_rsq(ctx, bld, Definition(dst), get_alu_src(ctx, instr->src[0]));
1458 } else if (dst.size() == 2) {
1459 emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f64, dst);
1460 } else {
1461 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1462 nir_print_instr(&instr->instr, stderr);
1463 fprintf(stderr, "\n");
1464 }
1465 break;
1466 }
1467 case nir_op_fneg: {
1468 Temp src = get_alu_src(ctx, instr->src[0]);
1469 if (dst.size() == 1) {
1470 if (ctx->block->fp_mode.must_flush_denorms32)
1471 src = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0x3f800000u), as_vgpr(ctx, src));
1472 bld.vop2(aco_opcode::v_xor_b32, Definition(dst), Operand(0x80000000u), as_vgpr(ctx, src));
1473 } else if (dst.size() == 2) {
1474 if (ctx->block->fp_mode.must_flush_denorms16_64)
1475 src = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), Operand(0x3FF0000000000000lu), as_vgpr(ctx, src));
1476 Temp upper = bld.tmp(v1), lower = bld.tmp(v1);
1477 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
1478 upper = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), Operand(0x80000000u), upper);
1479 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1480 } else {
1481 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1482 nir_print_instr(&instr->instr, stderr);
1483 fprintf(stderr, "\n");
1484 }
1485 break;
1486 }
1487 case nir_op_fabs: {
1488 Temp src = get_alu_src(ctx, instr->src[0]);
1489 if (dst.size() == 1) {
1490 if (ctx->block->fp_mode.must_flush_denorms32)
1491 src = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0x3f800000u), as_vgpr(ctx, src));
1492 bld.vop2(aco_opcode::v_and_b32, Definition(dst), Operand(0x7FFFFFFFu), as_vgpr(ctx, src));
1493 } else if (dst.size() == 2) {
1494 if (ctx->block->fp_mode.must_flush_denorms16_64)
1495 src = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), Operand(0x3FF0000000000000lu), as_vgpr(ctx, src));
1496 Temp upper = bld.tmp(v1), lower = bld.tmp(v1);
1497 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
1498 upper = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7FFFFFFFu), upper);
1499 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1500 } else {
1501 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1502 nir_print_instr(&instr->instr, stderr);
1503 fprintf(stderr, "\n");
1504 }
1505 break;
1506 }
1507 case nir_op_fsat: {
1508 Temp src = get_alu_src(ctx, instr->src[0]);
1509 if (dst.size() == 1) {
1510 bld.vop3(aco_opcode::v_med3_f32, Definition(dst), Operand(0u), Operand(0x3f800000u), src);
1511 } else if (dst.size() == 2) {
1512 Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst), src, Operand(0u));
1513 VOP3A_instruction* vop3 = static_cast<VOP3A_instruction*>(add);
1514 vop3->clamp = true;
1515 } else {
1516 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1517 nir_print_instr(&instr->instr, stderr);
1518 fprintf(stderr, "\n");
1519 }
1520 break;
1521 }
1522 case nir_op_flog2: {
1523 if (dst.size() == 1) {
1524 emit_log2(ctx, bld, Definition(dst), get_alu_src(ctx, instr->src[0]));
1525 } else {
1526 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1527 nir_print_instr(&instr->instr, stderr);
1528 fprintf(stderr, "\n");
1529 }
1530 break;
1531 }
1532 case nir_op_frcp: {
1533 if (dst.size() == 1) {
1534 emit_rcp(ctx, bld, Definition(dst), get_alu_src(ctx, instr->src[0]));
1535 } else if (dst.size() == 2) {
1536 emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f64, dst);
1537 } else {
1538 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1539 nir_print_instr(&instr->instr, stderr);
1540 fprintf(stderr, "\n");
1541 }
1542 break;
1543 }
1544 case nir_op_fexp2: {
1545 if (dst.size() == 1) {
1546 emit_vop1_instruction(ctx, instr, aco_opcode::v_exp_f32, dst);
1547 } else {
1548 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1549 nir_print_instr(&instr->instr, stderr);
1550 fprintf(stderr, "\n");
1551 }
1552 break;
1553 }
1554 case nir_op_fsqrt: {
1555 if (dst.size() == 1) {
1556 emit_sqrt(ctx, bld, Definition(dst), get_alu_src(ctx, instr->src[0]));
1557 } else if (dst.size() == 2) {
1558 emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f64, dst);
1559 } else {
1560 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1561 nir_print_instr(&instr->instr, stderr);
1562 fprintf(stderr, "\n");
1563 }
1564 break;
1565 }
1566 case nir_op_ffract: {
1567 if (dst.size() == 1) {
1568 emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f32, dst);
1569 } else if (dst.size() == 2) {
1570 emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f64, dst);
1571 } else {
1572 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1573 nir_print_instr(&instr->instr, stderr);
1574 fprintf(stderr, "\n");
1575 }
1576 break;
1577 }
1578 case nir_op_ffloor: {
1579 if (dst.size() == 1) {
1580 emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f32, dst);
1581 } else if (dst.size() == 2) {
1582 emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f64, dst);
1583 } else {
1584 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1585 nir_print_instr(&instr->instr, stderr);
1586 fprintf(stderr, "\n");
1587 }
1588 break;
1589 }
1590 case nir_op_fceil: {
1591 if (dst.size() == 1) {
1592 emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f32, dst);
1593 } else if (dst.size() == 2) {
1594 emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f64, dst);
1595 } else {
1596 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1597 nir_print_instr(&instr->instr, stderr);
1598 fprintf(stderr, "\n");
1599 }
1600 break;
1601 }
1602 case nir_op_ftrunc: {
1603 if (dst.size() == 1) {
1604 emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f32, dst);
1605 } else if (dst.size() == 2) {
1606 emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f64, dst);
1607 } else {
1608 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1609 nir_print_instr(&instr->instr, stderr);
1610 fprintf(stderr, "\n");
1611 }
1612 break;
1613 }
1614 case nir_op_fround_even: {
1615 if (dst.size() == 1) {
1616 emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f32, dst);
1617 } else if (dst.size() == 2) {
1618 emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f64, dst);
1619 } else {
1620 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1621 nir_print_instr(&instr->instr, stderr);
1622 fprintf(stderr, "\n");
1623 }
1624 break;
1625 }
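/* For fsin/fcos below: the constant 0x3e22f983 is ~0.15915494, i.e. 1/(2*pi)
 * (despite the local being named half_pi). v_sin_f32/v_cos_f32 take their input in
 * revolutions rather than radians, so the source is pre-scaled by 1/(2*pi); on
 * pre-GFX9 parts the scaled value is additionally wrapped with v_fract because of
 * the limited input domain. */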
1626 case nir_op_fsin:
1627 case nir_op_fcos: {
1628 Temp src = get_alu_src(ctx, instr->src[0]);
1629 aco_ptr<Instruction> norm;
1630 if (dst.size() == 1) {
1631 Temp tmp;
1632 Operand half_pi(0x3e22f983u);
1633 if (src.type() == RegType::sgpr)
1634 tmp = bld.vop2_e64(aco_opcode::v_mul_f32, bld.def(v1), half_pi, src);
1635 else
1636 tmp = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), half_pi, src);
1637
1638 /* before GFX9, v_sin_f32 and v_cos_f32 had a valid input domain of [-256, +256] */
1639 if (ctx->options->chip_class < GFX9)
1640 tmp = bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), tmp);
1641
1642 aco_opcode opcode = instr->op == nir_op_fsin ? aco_opcode::v_sin_f32 : aco_opcode::v_cos_f32;
1643 bld.vop1(opcode, Definition(dst), tmp);
1644 } else {
1645 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1646 nir_print_instr(&instr->instr, stderr);
1647 fprintf(stderr, "\n");
1648 }
1649 break;
1650 }
1651 case nir_op_ldexp: {
1652 if (dst.size() == 1) {
1653 bld.vop3(aco_opcode::v_ldexp_f32, Definition(dst),
1654 as_vgpr(ctx, get_alu_src(ctx, instr->src[0])),
1655 get_alu_src(ctx, instr->src[1]));
1656 } else if (dst.size() == 2) {
1657 bld.vop3(aco_opcode::v_ldexp_f64, Definition(dst),
1658 as_vgpr(ctx, get_alu_src(ctx, instr->src[0])),
1659 get_alu_src(ctx, instr->src[1]));
1660 } else {
1661 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1662 nir_print_instr(&instr->instr, stderr);
1663 fprintf(stderr, "\n");
1664 }
1665 break;
1666 }
1667 case nir_op_frexp_sig: {
1668 if (dst.size() == 1) {
1669 bld.vop1(aco_opcode::v_frexp_mant_f32, Definition(dst),
1670 get_alu_src(ctx, instr->src[0]));
1671 } else if (dst.size() == 2) {
1672 bld.vop1(aco_opcode::v_frexp_mant_f64, Definition(dst),
1673 get_alu_src(ctx, instr->src[0]));
1674 } else {
1675 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1676 nir_print_instr(&instr->instr, stderr);
1677 fprintf(stderr, "\n");
1678 }
1679 break;
1680 }
1681 case nir_op_frexp_exp: {
1682 if (instr->src[0].src.ssa->bit_size == 32) {
1683 bld.vop1(aco_opcode::v_frexp_exp_i32_f32, Definition(dst),
1684 get_alu_src(ctx, instr->src[0]));
1685 } else if (instr->src[0].src.ssa->bit_size == 64) {
1686 bld.vop1(aco_opcode::v_frexp_exp_i32_f64, Definition(dst),
1687 get_alu_src(ctx, instr->src[0]));
1688 } else {
1689 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1690 nir_print_instr(&instr->instr, stderr);
1691 fprintf(stderr, "\n");
1692 }
1693 break;
1694 }
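/* fsign implemented as two branchless selects. Roughly, for the 32-bit path:
 *   t   = (src > 0.0) ? 1.0 : src;    // first v_cndmask (condition is !(0 < src))
 *   dst = (0.0 <= t)  ? t   : -1.0;   // second v_cndmask
 * so positives give 1.0, negatives give -1.0, zeros keep their sign, and NaN ends up
 * as -1.0 (acceptable, since sign(NaN) is undefined). The 64-bit path only computes
 * the high dword this way and pairs it with a zero low dword. */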
1695 case nir_op_fsign: {
1696 Temp src = as_vgpr(ctx, get_alu_src(ctx, instr->src[0]));
1697 if (dst.size() == 1) {
1698 Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f32, bld.hint_vcc(bld.def(s2)), Operand(0u), src);
1699 src = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0x3f800000u), src, cond);
1700 cond = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(s2)), Operand(0u), src);
1701 bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0xbf800000u), src, cond);
1702 } else if (dst.size() == 2) {
1703 Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f64, bld.hint_vcc(bld.def(s2)), Operand(0u), src);
1704 Temp tmp = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0x3FF00000u));
1705 Temp upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp, emit_extract_vector(ctx, src, 1, v1), cond);
1706
1707 cond = bld.vopc(aco_opcode::v_cmp_le_f64, bld.hint_vcc(bld.def(s2)), Operand(0u), src);
1708 tmp = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0xBFF00000u));
1709 upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), tmp, upper, cond);
1710
1711 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand(0u), upper);
1712 } else {
1713 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1714 nir_print_instr(&instr->instr, stderr);
1715 fprintf(stderr, "\n");
1716 }
1717 break;
1718 }
1719 case nir_op_f2f32: {
1720 if (instr->src[0].src.ssa->bit_size == 64) {
1721 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f64, dst);
1722 } else {
1723 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1724 nir_print_instr(&instr->instr, stderr);
1725 fprintf(stderr, "\n");
1726 }
1727 break;
1728 }
1729 case nir_op_f2f64: {
1730 if (instr->src[0].src.ssa->bit_size == 32) {
1731 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f64_f32, dst);
1732 } else {
1733 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1734 nir_print_instr(&instr->instr, stderr);
1735 fprintf(stderr, "\n");
1736 }
1737 break;
1738 }
1739 case nir_op_i2f32: {
1740 assert(dst.size() == 1);
1741 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_i32, dst);
1742 break;
1743 }
1744 case nir_op_i2f64: {
1745 if (instr->src[0].src.ssa->bit_size == 32) {
1746 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f64_i32, dst);
1747 } else if (instr->src[0].src.ssa->bit_size == 64) {
1748 Temp src = get_alu_src(ctx, instr->src[0]);
1749 RegClass rc = RegClass(src.type(), 1);
1750 Temp lower = bld.tmp(rc), upper = bld.tmp(rc);
1751 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
1752 lower = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), lower);
1753 upper = bld.vop1(aco_opcode::v_cvt_f64_i32, bld.def(v2), upper);
1754 upper = bld.vop3(aco_opcode::v_ldexp_f64, bld.def(v2), upper, Operand(32u));
1755 bld.vop3(aco_opcode::v_add_f64, Definition(dst), lower, upper);
1756
1757 } else {
1758 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1759 nir_print_instr(&instr->instr, stderr);
1760 fprintf(stderr, "\n");
1761 }
1762 break;
1763 }
1764 case nir_op_u2f32: {
1765 assert(dst.size() == 1);
1766 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_u32, dst);
1767 break;
1768 }
1769 case nir_op_u2f64: {
1770 if (instr->src[0].src.ssa->bit_size == 32) {
1771 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f64_u32, dst);
1772 } else if (instr->src[0].src.ssa->bit_size == 64) {
1773 Temp src = get_alu_src(ctx, instr->src[0]);
1774 RegClass rc = RegClass(src.type(), 1);
1775 Temp lower = bld.tmp(rc), upper = bld.tmp(rc);
1776 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
1777 lower = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), lower);
1778 upper = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), upper);
1779 upper = bld.vop3(aco_opcode::v_ldexp_f64, bld.def(v2), upper, Operand(32u));
1780 bld.vop3(aco_opcode::v_add_f64, Definition(dst), lower, upper);
1781 } else {
1782 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1783 nir_print_instr(&instr->instr, stderr);
1784 fprintf(stderr, "\n");
1785 }
1786 break;
1787 }
1788 case nir_op_f2i32: {
1789 Temp src = get_alu_src(ctx, instr->src[0]);
1790 if (instr->src[0].src.ssa->bit_size == 32) {
1791 if (dst.type() == RegType::vgpr)
1792 bld.vop1(aco_opcode::v_cvt_i32_f32, Definition(dst), src);
1793 else
1794 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
1795 bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), src));
1796
1797 } else if (instr->src[0].src.ssa->bit_size == 64) {
1798 if (dst.type() == RegType::vgpr)
1799 bld.vop1(aco_opcode::v_cvt_i32_f64, Definition(dst), src);
1800 else
1801 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
1802 bld.vop1(aco_opcode::v_cvt_i32_f64, bld.def(v1), src));
1803
1804 } else {
1805 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1806 nir_print_instr(&instr->instr, stderr);
1807 fprintf(stderr, "\n");
1808 }
1809 break;
1810 }
1811 case nir_op_f2u32: {
1812 Temp src = get_alu_src(ctx, instr->src[0]);
1813 if (instr->src[0].src.ssa->bit_size == 32) {
1814 if (dst.type() == RegType::vgpr)
1815 bld.vop1(aco_opcode::v_cvt_u32_f32, Definition(dst), src);
1816 else
1817 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
1818 bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), src));
1819
1820 } else if (instr->src[0].src.ssa->bit_size == 64) {
1821 if (dst.type() == RegType::vgpr)
1822 bld.vop1(aco_opcode::v_cvt_u32_f64, Definition(dst), src);
1823 else
1824 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
1825 bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), src));
1826
1827 } else {
1828 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1829 nir_print_instr(&instr->instr, stderr);
1830 fprintf(stderr, "\n");
1831 }
1832 break;
1833 }
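/* f2i64 has no single hardware instruction, so it is done manually. A rough sketch of
 * the 32-bit-source VGPR path (my reading of the code, not authoritative):
 *   - v_frexp_exp gives the unbiased exponent, clamped to [0, 64] with v_med3;
 *   - the 24-bit mantissa (implicit leading one ORed in) is positioned high in a
 *     64-bit value and shifted right by 63 - exp, yielding trunc(|x|);
 *   - if exp >= 64 the subtraction 63 - exp borrows and the result saturates
 *     (v_bfrev_b32 of 0xfffffffe is 0x7fffffff, giving INT64_MAX with the low dword);
 *   - the sign (0 or ~0 from the arithmetic shift) is then applied as a two's
 *     complement negate: xor with the sign mask and subtract it with borrow.
 * The SGPR branch mirrors this with scalar ALU ops, and the 64-bit-source branch
 * splits the value via v_trunc/v_mul/v_fma into two separately convertible halves. */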
1834 case nir_op_f2i64: {
1835 Temp src = get_alu_src(ctx, instr->src[0]);
1836 if (instr->src[0].src.ssa->bit_size == 32 && dst.type() == RegType::vgpr) {
1837 Temp exponent = bld.vop1(aco_opcode::v_frexp_exp_i32_f32, bld.def(v1), src);
1838 exponent = bld.vop3(aco_opcode::v_med3_i32, bld.def(v1), Operand(0x0u), exponent, Operand(64u));
1839 Temp mantissa = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7fffffu), src);
1840 Temp sign = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), src);
1841 mantissa = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand(0x800000u), mantissa);
1842 mantissa = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(7u), mantissa);
1843 mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(0u), mantissa);
1844 Temp new_exponent = bld.tmp(v1);
1845 Temp borrow = bld.vsub32(Definition(new_exponent), Operand(63u), exponent, true).def(1).getTemp();
1846 mantissa = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), new_exponent, mantissa);
1847 Temp saturate = bld.vop1(aco_opcode::v_bfrev_b32, bld.def(v1), Operand(0xfffffffeu));
1848 Temp lower = bld.tmp(v1), upper = bld.tmp(v1);
1849 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
1850 lower = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), lower, Operand(0xffffffffu), borrow);
1851 upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), upper, saturate, borrow);
1852 lower = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), sign, lower);
1853 upper = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), sign, upper);
1854 Temp new_lower = bld.tmp(v1);
1855 borrow = bld.vsub32(Definition(new_lower), lower, sign, true).def(1).getTemp();
1856 Temp new_upper = bld.vsub32(bld.def(v1), upper, sign, false, borrow);
1857 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), new_lower, new_upper);
1858
1859 } else if (instr->src[0].src.ssa->bit_size == 32 && dst.type() == RegType::sgpr) {
1860 if (src.type() == RegType::vgpr)
1861 src = bld.as_uniform(src);
1862 Temp exponent = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), src, Operand(0x80017u));
1863 exponent = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), exponent, Operand(126u));
1864 exponent = bld.sop2(aco_opcode::s_max_u32, bld.def(s1), bld.def(s1, scc), Operand(0u), exponent);
1865 exponent = bld.sop2(aco_opcode::s_min_u32, bld.def(s1), bld.def(s1, scc), Operand(64u), exponent);
1866 Temp mantissa = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), Operand(0x7fffffu), src);
1867 Temp sign = bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), src, Operand(31u));
1868 mantissa = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), Operand(0x800000u), mantissa);
1869 mantissa = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), mantissa, Operand(7u));
1870 mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), mantissa);
1871 exponent = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), Operand(63u), exponent);
1872 mantissa = bld.sop2(aco_opcode::s_lshr_b64, bld.def(s2), bld.def(s1, scc), mantissa, exponent);
1873 Temp cond = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), exponent, Operand(0xffffffffu)); // exp >= 64
1874 Temp saturate = bld.sop1(aco_opcode::s_brev_b64, bld.def(s2), Operand(0xfffffffeu));
1875 mantissa = bld.sop2(aco_opcode::s_cselect_b64, bld.def(s2), saturate, mantissa, cond);
1876 Temp lower = bld.tmp(s1), upper = bld.tmp(s1);
1877 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
1878 lower = bld.sop2(aco_opcode::s_xor_b32, bld.def(s1), bld.def(s1, scc), sign, lower);
1879 upper = bld.sop2(aco_opcode::s_xor_b32, bld.def(s1), bld.def(s1, scc), sign, upper);
1880 Temp borrow = bld.tmp(s1);
1881 lower = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), lower, sign);
1882 upper = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), upper, sign, borrow);
1883 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1884
1885 } else if (instr->src[0].src.ssa->bit_size == 64) {
1886 Temp vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0x3df00000u));
1887 Temp trunc = bld.vop1(aco_opcode::v_trunc_f64, bld.def(v2), src);
1888 Temp mul = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), trunc, vec);
1889 vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0xc1f00000u));
1890 Temp floor = bld.vop1(aco_opcode::v_floor_f64, bld.def(v2), mul);
1891 Temp fma = bld.vop3(aco_opcode::v_fma_f64, bld.def(v2), floor, vec, trunc);
1892 Temp lower = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), fma);
1893 Temp upper = bld.vop1(aco_opcode::v_cvt_i32_f64, bld.def(v1), floor);
1894 if (dst.type() == RegType::sgpr) {
1895 lower = bld.as_uniform(lower);
1896 upper = bld.as_uniform(upper);
1897 }
1898 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1899
1900 } else {
1901 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1902 nir_print_instr(&instr->instr, stderr);
1903 fprintf(stderr, "\n");
1904 }
1905 break;
1906 }
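/* f2u64 reuses the exponent/mantissa scheme from f2i64 above, minus the sign handling:
 * small exponents take a plain right shift of the 24-bit mantissa, larger ones shift
 * it left inside a 64-bit value, and out-of-range exponents saturate to ~0ull.
 * Sketch only; see the f2i64 comment for the shared idea. */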
1907 case nir_op_f2u64: {
1908 Temp src = get_alu_src(ctx, instr->src[0]);
1909 if (instr->src[0].src.ssa->bit_size == 32 && dst.type() == RegType::vgpr) {
1910 Temp exponent = bld.vop1(aco_opcode::v_frexp_exp_i32_f32, bld.def(v1), src);
1911 Temp exponent_in_range = bld.vopc(aco_opcode::v_cmp_ge_i32, bld.hint_vcc(bld.def(s2)), Operand(64u), exponent);
1912 exponent = bld.vop2(aco_opcode::v_max_i32, bld.def(v1), Operand(0x0u), exponent);
1913 Temp mantissa = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7fffffu), src);
1914 mantissa = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand(0x800000u), mantissa);
1915 Temp exponent_small = bld.vsub32(bld.def(v1), Operand(24u), exponent);
1916 Temp small = bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), exponent_small, mantissa);
1917 mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(0u), mantissa);
1918 Temp new_exponent = bld.tmp(v1);
1919 Temp cond_small = bld.vsub32(Definition(new_exponent), exponent, Operand(24u), true).def(1).getTemp();
1920 mantissa = bld.vop3(aco_opcode::v_lshlrev_b64, bld.def(v2), new_exponent, mantissa);
1921 Temp lower = bld.tmp(v1), upper = bld.tmp(v1);
1922 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
1923 lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), lower, small, cond_small);
1924 upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), upper, Operand(0u), cond_small);
1925 lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0xffffffffu), lower, exponent_in_range);
1926 upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0xffffffffu), upper, exponent_in_range);
1927 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1928
1929 } else if (instr->src[0].src.ssa->bit_size == 32 && dst.type() == RegType::sgpr) {
1930 if (src.type() == RegType::vgpr)
1931 src = bld.as_uniform(src);
1932 Temp exponent = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), src, Operand(0x80017u));
1933 exponent = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), exponent, Operand(126u));
1934 exponent = bld.sop2(aco_opcode::s_max_u32, bld.def(s1), bld.def(s1, scc), Operand(0u), exponent);
1935 Temp mantissa = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), Operand(0x7fffffu), src);
1936 mantissa = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), Operand(0x800000u), mantissa);
1937 Temp exponent_small = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), Operand(24u), exponent);
1938 Temp small = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), mantissa, exponent_small);
1939 mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), mantissa);
1940 Temp exponent_large = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), exponent, Operand(24u));
1941 mantissa = bld.sop2(aco_opcode::s_lshl_b64, bld.def(s2), bld.def(s1, scc), mantissa, exponent_large);
1942 Temp cond = bld.sopc(aco_opcode::s_cmp_ge_i32, bld.def(s1, scc), Operand(64u), exponent);
1943 mantissa = bld.sop2(aco_opcode::s_cselect_b64, bld.def(s2), mantissa, Operand(0xffffffffu), cond);
1944 Temp lower = bld.tmp(s1), upper = bld.tmp(s1);
1945 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
1946 Temp cond_small = bld.sopc(aco_opcode::s_cmp_le_i32, bld.def(s1, scc), exponent, Operand(24u));
1947 lower = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), small, lower, cond_small);
1948 upper = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), Operand(0u), upper, cond_small);
1949 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1950
1951 } else if (instr->src[0].src.ssa->bit_size == 64) {
1952 Temp vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0x3df00000u));
1953 Temp trunc = bld.vop1(aco_opcode::v_trunc_f64, bld.def(v2), src);
1954 Temp mul = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), trunc, vec);
1955 vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0xc1f00000u));
1956 Temp floor = bld.vop1(aco_opcode::v_floor_f64, bld.def(v2), mul);
1957 Temp fma = bld.vop3(aco_opcode::v_fma_f64, bld.def(v2), floor, vec, trunc);
1958 Temp lower = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), fma);
1959 Temp upper = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), floor);
1960 if (dst.type() == RegType::sgpr) {
1961 lower = bld.as_uniform(lower);
1962 upper = bld.as_uniform(upper);
1963 }
1964 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1965
1966 } else {
1967 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1968 nir_print_instr(&instr->instr, stderr);
1969 fprintf(stderr, "\n");
1970 }
1971 break;
1972 }
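/* The boolean cases below rely on this backend's 1-bit value representation: a NIR
 * boolean is an s2 lane mask (one bit per invocation), hence the
 * assert(src.regClass() == s2). Converting it to a number is therefore a per-lane
 * v_cndmask_b32 for VGPR destinations, or a reduction to a scalar/scc condition
 * first for SGPR destinations. */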
1973 case nir_op_b2f32: {
1974 Temp src = get_alu_src(ctx, instr->src[0]);
1975 assert(src.regClass() == s2);
1976
1977 if (dst.regClass() == s1) {
1978 src = bool_to_scalar_condition(ctx, src);
1979 bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand(0x3f800000u), src);
1980 } else if (dst.regClass() == v1) {
1981 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand(0x3f800000u), src);
1982 } else {
1983 unreachable("Wrong destination register class for nir_op_b2f32.");
1984 }
1985 break;
1986 }
1987 case nir_op_b2f64: {
1988 Temp src = get_alu_src(ctx, instr->src[0]);
1989 assert(src.regClass() == s2);
1990
1991 if (dst.regClass() == s2) {
1992 src = bool_to_scalar_condition(ctx, src);
1993 bld.sop2(aco_opcode::s_cselect_b64, Definition(dst), Operand(0x3FF0000000000000lu), Operand(0u), bld.scc(src));
1994 } else if (dst.regClass() == v2) {
1995 Temp one = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0x3FF00000u));
1996 Temp upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), one, src);
1997 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand(0u), upper);
1998 } else {
1999 unreachable("Wrong destination register class for nir_op_b2f64.");
2000 }
2001 break;
2002 }
2003 case nir_op_i2i32: {
2004 Temp src = get_alu_src(ctx, instr->src[0]);
2005 if (instr->src[0].src.ssa->bit_size == 64) {
2006 /* we can actually just say dst = src, as it would map the lower register */
2007 emit_extract_vector(ctx, src, 0, dst);
2008 } else {
2009 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2010 nir_print_instr(&instr->instr, stderr);
2011 fprintf(stderr, "\n");
2012 }
2013 break;
2014 }
2015 case nir_op_u2u32: {
2016 Temp src = get_alu_src(ctx, instr->src[0]);
2017 if (instr->src[0].src.ssa->bit_size == 16) {
2018 if (dst.regClass() == s1) {
2019 bld.sop2(aco_opcode::s_and_b32, Definition(dst), bld.def(s1, scc), Operand(0xFFFFu), src);
2020 } else {
2021 // TODO: do better with SDWA
2022 bld.vop2(aco_opcode::v_and_b32, Definition(dst), Operand(0xFFFFu), src);
2023 }
2024 } else if (instr->src[0].src.ssa->bit_size == 64) {
2025 /* we can actually just say dst = src, as it would map the lower register */
2026 emit_extract_vector(ctx, src, 0, dst);
2027 } else {
2028 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2029 nir_print_instr(&instr->instr, stderr);
2030 fprintf(stderr, "\n");
2031 }
2032 break;
2033 }
2034 case nir_op_i2i64: {
2035 Temp src = get_alu_src(ctx, instr->src[0]);
2036 if (src.regClass() == s1) {
2037 Temp high = bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), src, Operand(31u));
2038 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src, high);
2039 } else if (src.regClass() == v1) {
2040 Temp high = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), src);
2041 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src, high);
2042 } else {
2043 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2044 nir_print_instr(&instr->instr, stderr);
2045 fprintf(stderr, "\n");
2046 }
2047 break;
2048 }
2049 case nir_op_u2u64: {
2050 Temp src = get_alu_src(ctx, instr->src[0]);
2051 if (instr->src[0].src.ssa->bit_size == 32) {
2052 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src, Operand(0u));
2053 } else {
2054 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2055 nir_print_instr(&instr->instr, stderr);
2056 fprintf(stderr, "\n");
2057 }
2058 break;
2059 }
2060 case nir_op_b2i32: {
2061 Temp src = get_alu_src(ctx, instr->src[0]);
2062 assert(src.regClass() == s2);
2063
2064 if (dst.regClass() == s1) {
2065 // TODO: in a post-RA optimization, we can check if src is in VCC, and directly use VCCNZ
2066 bool_to_scalar_condition(ctx, src, dst);
2067 } else if (dst.regClass() == v1) {
2068 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand(1u), src);
2069 } else {
2070 unreachable("Invalid register class for b2i32");
2071 }
2072 break;
2073 }
2074 case nir_op_i2b1: {
2075 Temp src = get_alu_src(ctx, instr->src[0]);
2076 assert(dst.regClass() == s2);
2077
2078 if (src.type() == RegType::vgpr) {
2079 assert(src.regClass() == v1 || src.regClass() == v2);
2080 bld.vopc(src.size() == 2 ? aco_opcode::v_cmp_lg_u64 : aco_opcode::v_cmp_lg_u32,
2081 Definition(dst), Operand(0u), src).def(0).setHint(vcc);
2082 } else {
2083 assert(src.regClass() == s1 || src.regClass() == s2);
2084 Temp tmp = bld.sopc(src.size() == 2 ? aco_opcode::s_cmp_lg_u64 : aco_opcode::s_cmp_lg_u32,
2085 bld.scc(bld.def(s1)), Operand(0u), src);
2086 bool_to_vector_condition(ctx, tmp, dst);
2087 }
2088 break;
2089 }
2090 case nir_op_pack_64_2x32_split: {
2091 Temp src0 = get_alu_src(ctx, instr->src[0]);
2092 Temp src1 = get_alu_src(ctx, instr->src[1]);
2093
2094 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src0, src1);
2095 break;
2096 }
2097 case nir_op_unpack_64_2x32_split_x:
2098 bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(dst.regClass()), get_alu_src(ctx, instr->src[0]));
2099 break;
2100 case nir_op_unpack_64_2x32_split_y:
2101 bld.pseudo(aco_opcode::p_split_vector, bld.def(dst.regClass()), Definition(dst), get_alu_src(ctx, instr->src[0]));
2102 break;
2103 case nir_op_pack_half_2x16: {
2104 Temp src = get_alu_src(ctx, instr->src[0], 2);
2105
2106 if (dst.regClass() == v1) {
2107 Temp src0 = bld.tmp(v1);
2108 Temp src1 = bld.tmp(v1);
2109 bld.pseudo(aco_opcode::p_split_vector, Definition(src0), Definition(src1), src);
2110 if (!ctx->block->fp_mode.care_about_round32 || ctx->block->fp_mode.round32 == fp_round_tz)
2111 bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32, Definition(dst), src0, src1);
2112 else
2113 bld.vop3(aco_opcode::v_cvt_pk_u16_u32, Definition(dst),
2114 bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v1), src0),
2115 bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v1), src1));
2116 } else {
2117 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2118 nir_print_instr(&instr->instr, stderr);
2119 fprintf(stderr, "\n");
2120 }
2121 break;
2122 }
2123 case nir_op_unpack_half_2x16_split_x: {
2124 if (dst.regClass() == v1) {
2125 Builder bld(ctx->program, ctx->block);
2126 bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst), get_alu_src(ctx, instr->src[0]));
2127 } else {
2128 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2129 nir_print_instr(&instr->instr, stderr);
2130 fprintf(stderr, "\n");
2131 }
2132 break;
2133 }
2134 case nir_op_unpack_half_2x16_split_y: {
2135 if (dst.regClass() == v1) {
2136 Builder bld(ctx->program, ctx->block);
2137 /* TODO: use SDWA here */
2138 bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst),
2139 bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), Operand(16u), as_vgpr(ctx, get_alu_src(ctx, instr->src[0]))));
2140 } else {
2141 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2142 nir_print_instr(&instr->instr, stderr);
2143 fprintf(stderr, "\n");
2144 }
2145 break;
2146 }
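/* fquantize2f16: round-trip the value through f16 (v_cvt_f16_f32, then back with
 * v_cvt_f32_f16) and use v_cmp_class_f16 to detect results that became f16 denormals;
 * those are flushed to zero, or to a sign-preserving zero (0.0 * src) when
 * signed-zero/inf/nan preservation is requested. */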
2147 case nir_op_fquantize2f16: {
2148 Temp src = get_alu_src(ctx, instr->src[0]);
2149 Temp f16 = bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v1), src);
2150
2151 Temp mask = bld.copy(bld.def(s1), Operand(0x36Fu)); /* 0x36F = every v_cmp_class category except negative/positive denormal */
2152
2153 Temp cmp_res = bld.tmp(s2);
2154 bld.vopc_e64(aco_opcode::v_cmp_class_f16, Definition(cmp_res), f16, mask).def(0).setHint(vcc);
2155
2156 Temp f32 = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), f16);
2157
2158 if (ctx->block->fp_mode.preserve_signed_zero_inf_nan32) {
2159 Temp copysign_0 = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0u), as_vgpr(ctx, src));
2160 bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), copysign_0, f32, cmp_res);
2161 } else {
2162 bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), f32, cmp_res);
2163 }
2164 break;
2165 }
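/* bfm builds a bitfield mask: dst = ((1 << bits) - 1) << offset, which is exactly
 * what s_bfm_b32 / v_bfm_b32 compute, so this is a single instruction either way. */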
2166 case nir_op_bfm: {
2167 Temp bits = get_alu_src(ctx, instr->src[0]);
2168 Temp offset = get_alu_src(ctx, instr->src[1]);
2169
2170 if (dst.regClass() == s1) {
2171 bld.sop2(aco_opcode::s_bfm_b32, Definition(dst), bits, offset);
2172 } else if (dst.regClass() == v1) {
2173 bld.vop3(aco_opcode::v_bfm_b32, Definition(dst), bits, offset);
2174 } else {
2175 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2176 nir_print_instr(&instr->instr, stderr);
2177 fprintf(stderr, "\n");
2178 }
2179 break;
2180 }
2181 case nir_op_bitfield_select: {
2182 /* (mask & insert) | (~mask & base) */
2183 Temp bitmask = get_alu_src(ctx, instr->src[0]);
2184 Temp insert = get_alu_src(ctx, instr->src[1]);
2185 Temp base = get_alu_src(ctx, instr->src[2]);
2186
2187 /* dst = (insert & bitmask) | (base & ~bitmask) */
2188 if (dst.regClass() == s1) {
2189 aco_ptr<Instruction> sop2;
2190 nir_const_value* const_bitmask = nir_src_as_const_value(instr->src[0].src);
2191 nir_const_value* const_insert = nir_src_as_const_value(instr->src[1].src);
2192 Operand lhs;
2193 if (const_insert && const_bitmask) {
2194 lhs = Operand(const_insert->u32 & const_bitmask->u32);
2195 } else {
2196 insert = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), insert, bitmask);
2197 lhs = Operand(insert);
2198 }
2199
2200 Operand rhs;
2201 nir_const_value* const_base = nir_src_as_const_value(instr->src[2].src);
2202 if (const_base && const_bitmask) {
2203 rhs = Operand(const_base->u32 & ~const_bitmask->u32);
2204 } else {
2205 base = bld.sop2(aco_opcode::s_andn2_b32, bld.def(s1), bld.def(s1, scc), base, bitmask);
2206 rhs = Operand(base);
2207 }
2208
2209 bld.sop2(aco_opcode::s_or_b32, Definition(dst), bld.def(s1, scc), rhs, lhs);
2210
2211 } else if (dst.regClass() == v1) {
2212 if (base.type() == RegType::sgpr && (bitmask.type() == RegType::sgpr || (insert.type() == RegType::sgpr)))
2213 base = as_vgpr(ctx, base);
2214 if (insert.type() == RegType::sgpr && bitmask.type() == RegType::sgpr)
2215 insert = as_vgpr(ctx, insert);
2216
2217 bld.vop3(aco_opcode::v_bfi_b32, Definition(dst), bitmask, insert, base);
2218
2219 } else {
2220 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2221 nir_print_instr(&instr->instr, stderr);
2222 fprintf(stderr, "\n");
2223 }
2224 break;
2225 }
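/* For the scalar path of ubfe/ibfe below, s_bfe_* takes a packed control operand:
 * the field offset in the low bits and the field width in bits [22:16], hence the
 * (bits << 16) | offset packing (folded to an immediate when both are constants). */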
2226 case nir_op_ubfe:
2227 case nir_op_ibfe: {
2228 Temp base = get_alu_src(ctx, instr->src[0]);
2229 Temp offset = get_alu_src(ctx, instr->src[1]);
2230 Temp bits = get_alu_src(ctx, instr->src[2]);
2231
2232 if (dst.type() == RegType::sgpr) {
2233 Operand extract;
2234 nir_const_value* const_offset = nir_src_as_const_value(instr->src[1].src);
2235 nir_const_value* const_bits = nir_src_as_const_value(instr->src[2].src);
2236 if (const_offset && const_bits) {
2237 uint32_t const_extract = (const_bits->u32 << 16) | const_offset->u32;
2238 extract = Operand(const_extract);
2239 } else {
2240 Operand width;
2241 if (const_bits) {
2242 width = Operand(const_bits->u32 << 16);
2243 } else {
2244 width = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), bits, Operand(16u));
2245 }
2246 extract = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), offset, width);
2247 }
2248
2249 aco_opcode opcode;
2250 if (dst.regClass() == s1) {
2251 if (instr->op == nir_op_ubfe)
2252 opcode = aco_opcode::s_bfe_u32;
2253 else
2254 opcode = aco_opcode::s_bfe_i32;
2255 } else if (dst.regClass() == s2) {
2256 if (instr->op == nir_op_ubfe)
2257 opcode = aco_opcode::s_bfe_u64;
2258 else
2259 opcode = aco_opcode::s_bfe_i64;
2260 } else {
2261 unreachable("Unsupported BFE bit size");
2262 }
2263
2264 bld.sop2(opcode, Definition(dst), bld.def(s1, scc), base, extract);
2265
2266 } else {
2267 aco_opcode opcode;
2268 if (dst.regClass() == v1) {
2269 if (instr->op == nir_op_ubfe)
2270 opcode = aco_opcode::v_bfe_u32;
2271 else
2272 opcode = aco_opcode::v_bfe_i32;
2273 } else {
2274 unreachable("Unsupported BFE bit size");
2275 }
2276
2277 emit_vop3a_instruction(ctx, instr, opcode, dst);
2278 }
2279 break;
2280 }
2281 case nir_op_bit_count: {
2282 Temp src = get_alu_src(ctx, instr->src[0]);
2283 if (src.regClass() == s1) {
2284 bld.sop1(aco_opcode::s_bcnt1_i32_b32, Definition(dst), bld.def(s1, scc), src);
2285 } else if (src.regClass() == v1) {
2286 bld.vop3(aco_opcode::v_bcnt_u32_b32, Definition(dst), src, Operand(0u));
2287 } else if (src.regClass() == v2) {
2288 bld.vop3(aco_opcode::v_bcnt_u32_b32, Definition(dst),
2289 emit_extract_vector(ctx, src, 1, v1),
2290 bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1),
2291 emit_extract_vector(ctx, src, 0, v1), Operand(0u)));
2292 } else if (src.regClass() == s2) {
2293 bld.sop1(aco_opcode::s_bcnt1_i32_b64, Definition(dst), bld.def(s1, scc), src);
2294 } else {
2295 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2296 nir_print_instr(&instr->instr, stderr);
2297 fprintf(stderr, "\n");
2298 }
2299 break;
2300 }
2301 case nir_op_flt: {
2302 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_f32, aco_opcode::v_cmp_lt_f64);
2303 break;
2304 }
2305 case nir_op_fge: {
2306 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_f32, aco_opcode::v_cmp_ge_f64);
2307 break;
2308 }
2309 case nir_op_feq: {
2310 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_eq_f32, aco_opcode::v_cmp_eq_f64);
2311 break;
2312 }
2313 case nir_op_fne: {
2314 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_neq_f32, aco_opcode::v_cmp_neq_f64);
2315 break;
2316 }
2317 case nir_op_ilt: {
2318 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_i32, aco_opcode::v_cmp_lt_i64, aco_opcode::s_cmp_lt_i32);
2319 break;
2320 }
2321 case nir_op_ige: {
2322 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_i32, aco_opcode::v_cmp_ge_i64, aco_opcode::s_cmp_ge_i32);
2323 break;
2324 }
2325 case nir_op_ieq: {
2326 if (instr->src[0].src.ssa->bit_size == 1)
2327 emit_boolean_logic(ctx, instr, aco_opcode::s_xnor_b32, aco_opcode::s_xnor_b64, dst);
2328 else
2329 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_eq_i32, aco_opcode::v_cmp_eq_i64, aco_opcode::s_cmp_eq_i32, aco_opcode::s_cmp_eq_u64);
2330 break;
2331 }
2332 case nir_op_ine: {
2333 if (instr->src[0].src.ssa->bit_size == 1)
2334 emit_boolean_logic(ctx, instr, aco_opcode::s_xor_b32, aco_opcode::s_xor_b64, dst);
2335 else
2336 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lg_i32, aco_opcode::v_cmp_lg_i64, aco_opcode::s_cmp_lg_i32, aco_opcode::s_cmp_lg_u64);
2337 break;
2338 }
2339 case nir_op_ult: {
2340 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_u32, aco_opcode::v_cmp_lt_u64, aco_opcode::s_cmp_lt_u32);
2341 break;
2342 }
2343 case nir_op_uge: {
2344 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_u32, aco_opcode::v_cmp_ge_u64, aco_opcode::s_cmp_ge_u32);
2345 break;
2346 }
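/* Screen-space derivatives via DPP quad permutes: the first v_mov_b32_dpp broadcasts
 * one pixel of each 2x2 quad into tl (left/top/top-left neighbour depending on the
 * variant), and the DPP control on the v_sub_f32 selects the other pixel, so every
 * lane computes neighbour - reference. emit_wqm wraps the result so helper lanes
 * contribute valid values. Rough description of the intent, not of the exact lanes. */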
2347 case nir_op_fddx:
2348 case nir_op_fddy:
2349 case nir_op_fddx_fine:
2350 case nir_op_fddy_fine:
2351 case nir_op_fddx_coarse:
2352 case nir_op_fddy_coarse: {
2353 Definition tl = bld.def(v1);
2354 uint16_t dpp_ctrl;
2355 if (instr->op == nir_op_fddx_fine) {
2356 bld.vop1_dpp(aco_opcode::v_mov_b32, tl, get_alu_src(ctx, instr->src[0]), dpp_quad_perm(0, 0, 2, 2));
2357 dpp_ctrl = dpp_quad_perm(1, 1, 3, 3);
2358 } else if (instr->op == nir_op_fddy_fine) {
2359 bld.vop1_dpp(aco_opcode::v_mov_b32, tl, get_alu_src(ctx, instr->src[0]), dpp_quad_perm(0, 1, 0, 1));
2360 dpp_ctrl = dpp_quad_perm(2, 3, 2, 3);
2361 } else {
2362 bld.vop1_dpp(aco_opcode::v_mov_b32, tl, get_alu_src(ctx, instr->src[0]), dpp_quad_perm(0, 0, 0, 0));
2363 if (instr->op == nir_op_fddx || instr->op == nir_op_fddx_coarse)
2364 dpp_ctrl = dpp_quad_perm(1, 1, 1, 1);
2365 else
2366 dpp_ctrl = dpp_quad_perm(2, 2, 2, 2);
2367 }
2368
2369 Definition tmp = bld.def(v1);
2370 bld.vop2_dpp(aco_opcode::v_sub_f32, tmp, get_alu_src(ctx, instr->src[0]), tl.getTemp(), dpp_ctrl);
2371 emit_wqm(ctx, tmp.getTemp(), dst, true);
2372 break;
2373 }
2374 default:
2375 fprintf(stderr, "Unknown NIR ALU instr: ");
2376 nir_print_instr(&instr->instr, stderr);
2377 fprintf(stderr, "\n");
2378 }
2379 }
2380
2381 void visit_load_const(isel_context *ctx, nir_load_const_instr *instr)
2382 {
2383 Temp dst = get_ssa_temp(ctx, &instr->def);
2384
2385 // TODO: we really want to know the resulting type, as that would allow 64-bit literals
2386 // (which currently get truncated: the LSBs are dropped for doubles and the MSBs for ints).
2387 // For now, we only use s_mov_b64 with 64-bit inline constants.
2388 assert(instr->def.num_components == 1 && "Vector load_const should be lowered to scalar.");
2389 assert(dst.type() == RegType::sgpr);
2390
2391 Builder bld(ctx->program, ctx->block);
2392
2393 if (instr->def.bit_size == 1) {
2394 assert(dst.regClass() == s2);
2395 bld.sop1(aco_opcode::s_mov_b64, Definition(dst), Operand((uint64_t)(instr->value[0].b ? -1 : 0)));
2396 } else if (dst.size() == 1) {
2397 bld.copy(Definition(dst), Operand(instr->value[0].u32));
2398 } else {
2399 assert(dst.size() != 1);
2400 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
2401 if (instr->def.bit_size == 64)
2402 for (unsigned i = 0; i < dst.size(); i++)
2403 vec->operands[i] = Operand{(uint32_t)(instr->value[0].u64 >> i * 32)};
2404 else {
2405 for (unsigned i = 0; i < dst.size(); i++)
2406 vec->operands[i] = Operand{instr->value[i].u32};
2407 }
2408 vec->definitions[0] = Definition(dst);
2409 ctx->block->instructions.emplace_back(std::move(vec));
2410 }
2411 }
2412
2413 uint32_t widen_mask(uint32_t mask, unsigned multiplier)
2414 {
2415 uint32_t new_mask = 0;
2416 for(unsigned i = 0; i < 32 && (1u << i) <= mask; ++i)
2417 if (mask & (1u << i))
2418 new_mask |= ((1u << multiplier) - 1u) << (i * multiplier);
2419 return new_mask;
2420 }
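/* widen_mask expands every set bit of a write mask into 'multiplier' consecutive bits,
 * e.g. widen_mask(0b101, 2) == 0b110011 (hypothetical values, just to illustrate).
 * It is used below to turn a per-64-bit-component mask into a per-32-bit-slot mask. */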
2421
2422 void visit_store_vs_output(isel_context *ctx, nir_intrinsic_instr *instr)
2423 {
2424 /* This wouldn't work inside control flow or with indirect offsets but
2425 * that doesn't happen because of nir_lower_io_to_temporaries(). */
2426
2427 unsigned write_mask = nir_intrinsic_write_mask(instr);
2428 unsigned component = nir_intrinsic_component(instr);
2429 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
2430 unsigned idx = nir_intrinsic_base(instr) + component;
2431
2432 nir_instr *off_instr = instr->src[1].ssa->parent_instr;
2433 if (off_instr->type != nir_instr_type_load_const) {
2434 fprintf(stderr, "Unimplemented nir_intrinsic_store_output offset\n");
2435 nir_print_instr(off_instr, stderr);
2436 fprintf(stderr, "\n");
2437 }
2438 idx += nir_instr_as_load_const(off_instr)->value[0].u32 * 4u;
2439
2440 if (instr->src[0].ssa->bit_size == 64)
2441 write_mask = widen_mask(write_mask, 2);
2442
2443 for (unsigned i = 0; i < 8; ++i) {
2444 if (write_mask & (1 << i)) {
2445 ctx->vs_output.mask[idx / 4u] |= 1 << (idx % 4u);
2446 ctx->vs_output.outputs[idx / 4u][idx % 4u] = emit_extract_vector(ctx, src, i, v1);
2447 }
2448 idx++;
2449 }
2450 }
2451
2452 void visit_store_fs_output(isel_context *ctx, nir_intrinsic_instr *instr)
2453 {
2454 unsigned write_mask = nir_intrinsic_write_mask(instr);
2455 Operand values[4];
2456 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
2457 for (unsigned i = 0; i < 4; ++i) {
2458 if (write_mask & (1 << i)) {
2459 Temp tmp = emit_extract_vector(ctx, src, i, v1);
2460 values[i] = Operand(tmp);
2461 } else {
2462 values[i] = Operand(v1);
2463 }
2464 }
2465
2466 unsigned index = nir_intrinsic_base(instr) / 4;
2467 unsigned target, col_format;
2468 unsigned enabled_channels = 0xF;
2469 aco_opcode compr_op = (aco_opcode)0;
2470
2471 nir_const_value* offset = nir_src_as_const_value(instr->src[1]);
2472 assert(offset && "Non-const offsets on exports not yet supported");
2473 index += offset->u32;
2474
2475 assert(index != FRAG_RESULT_COLOR);
2476
2477 /* Unlike vertex shader exports, it's fine to use multiple exports to
2478 * export separate channels of one target. So shaders which export both
2479 * FRAG_RESULT_SAMPLE_MASK and FRAG_RESULT_DEPTH should work fine.
2480 * TODO: combine the exports in those cases and create better code
2481 */
2482
2483 if (index == FRAG_RESULT_SAMPLE_MASK) {
2484
2485 if (ctx->program->info->ps.writes_z) {
2486 target = V_008DFC_SQ_EXP_MRTZ;
2487 enabled_channels = 0x4;
2488 col_format = (unsigned) -1;
2489
2490 values[2] = values[0];
2491 values[0] = Operand(v1);
2492 } else {
2493 aco_ptr<Export_instruction> exp{create_instruction<Export_instruction>(aco_opcode::exp, Format::EXP, 4, 0)};
2494 exp->valid_mask = false;
2495 exp->done = false;
2496 exp->compressed = true;
2497 exp->dest = V_008DFC_SQ_EXP_MRTZ;
2498 exp->enabled_mask = 0xc;
2499 for (int i = 0; i < 4; i++)
2500 exp->operands[i] = Operand(v1);
2501 exp->operands[1] = Operand(values[0]);
2502 ctx->block->instructions.emplace_back(std::move(exp));
2503 return;
2504 }
2505
2506 } else if (index == FRAG_RESULT_DEPTH) {
2507
2508 target = V_008DFC_SQ_EXP_MRTZ;
2509 enabled_channels = 0x1;
2510 col_format = (unsigned) -1;
2511
2512 } else if (index == FRAG_RESULT_STENCIL) {
2513
2514 if (ctx->program->info->ps.writes_z) {
2515 target = V_008DFC_SQ_EXP_MRTZ;
2516 enabled_channels = 0x2;
2517 col_format = (unsigned) -1;
2518
2519 values[1] = values[0];
2520 values[0] = Operand(v1);
2521 } else {
2522 aco_ptr<Instruction> shift{create_instruction<VOP2_instruction>(aco_opcode::v_lshlrev_b32, Format::VOP2, 2, 1)};
2523 shift->operands[0] = Operand((uint32_t) 16);
2524 shift->operands[1] = values[0];
2525 Temp tmp = {ctx->program->allocateId(), v1};
2526 shift->definitions[0] = Definition(tmp);
2527 ctx->block->instructions.emplace_back(std::move(shift));
2528
2529 aco_ptr<Export_instruction> exp{create_instruction<Export_instruction>(aco_opcode::exp, Format::EXP, 4, 0)};
2530 exp->valid_mask = false;
2531 exp->done = false;
2532 exp->compressed = true;
2533 exp->dest = V_008DFC_SQ_EXP_MRTZ;
2534 exp->enabled_mask = 0x3;
2535 exp->operands[0] = Operand(tmp);
2536 for (int i = 1; i < 4; i++)
2537 exp->operands[i] = Operand(v1);
2538 ctx->block->instructions.emplace_back(std::move(exp));
2539 return;
2540 }
2541
2542 } else {
2543 index -= FRAG_RESULT_DATA0;
2544 target = V_008DFC_SQ_EXP_MRT + index;
2545 col_format = (ctx->options->key.fs.col_format >> (4 * index)) & 0xf;
2546 }
2547 ASSERTED bool is_int8 = (ctx->options->key.fs.is_int8 >> index) & 1;
2548 ASSERTED bool is_int10 = (ctx->options->key.fs.is_int10 >> index) & 1;
2549 assert(!is_int8 && !is_int10);
2550
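/* The 16-bit col_formats below go through a "compressed" export: compr_op selects the
 * v_cvt_pk* instruction that packs two 32-bit channels into one dword, and the export
 * itself is later marked compressed (exp->compressed). Only two operands then carry
 * data, which, as far as I can tell, is what the 0x5 enable mask accounts for. */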
2551 switch (col_format)
2552 {
2553 case V_028714_SPI_SHADER_ZERO:
2554 enabled_channels = 0; /* writemask */
2555 target = V_008DFC_SQ_EXP_NULL;
2556 break;
2557
2558 case V_028714_SPI_SHADER_32_R:
2559 enabled_channels = 1;
2560 break;
2561
2562 case V_028714_SPI_SHADER_32_GR:
2563 enabled_channels = 0x3;
2564 break;
2565
2566 case V_028714_SPI_SHADER_32_AR:
2567 if (ctx->options->chip_class >= GFX10) {
2568 /* Special case: on GFX10, the outputs are different for 32_AR */
2569 enabled_channels = 0x3;
2570 values[1] = values[3];
2571 } else {
2572 enabled_channels = 0x9;
2573 }
2574 break;
2575
2576 case V_028714_SPI_SHADER_FP16_ABGR:
2577 enabled_channels = 0x5;
2578 compr_op = aco_opcode::v_cvt_pkrtz_f16_f32;
2579 break;
2580
2581 case V_028714_SPI_SHADER_UNORM16_ABGR:
2582 enabled_channels = 0x5;
2583 compr_op = aco_opcode::v_cvt_pknorm_u16_f32;
2584 break;
2585
2586 case V_028714_SPI_SHADER_SNORM16_ABGR:
2587 enabled_channels = 0x5;
2588 compr_op = aco_opcode::v_cvt_pknorm_i16_f32;
2589 break;
2590
2591 case V_028714_SPI_SHADER_UINT16_ABGR:
2592 enabled_channels = 0x5;
2593 compr_op = aco_opcode::v_cvt_pk_u16_u32;
2594 break;
2595
2596 case V_028714_SPI_SHADER_SINT16_ABGR:
2597 enabled_channels = 0x5;
2598 compr_op = aco_opcode::v_cvt_pk_i16_i32;
2599 break;
2600
2601 case V_028714_SPI_SHADER_32_ABGR:
2602 enabled_channels = 0xF;
2603 break;
2604
2605 default:
2606 break;
2607 }
2608
2609 if (target == V_008DFC_SQ_EXP_NULL)
2610 return;
2611
2612 if ((bool)compr_op)
2613 {
2614 for (int i = 0; i < 2; i++)
2615 {
2616 /* check if at least one of the values to be compressed is enabled */
2617 unsigned enabled = (write_mask >> (i*2) | write_mask >> (i*2+1)) & 0x1;
2618 if (enabled) {
2619 enabled_channels |= enabled << (i*2);
2620 aco_ptr<VOP3A_instruction> compr{create_instruction<VOP3A_instruction>(compr_op, Format::VOP3A, 2, 1)};
2621 Temp tmp{ctx->program->allocateId(), v1};
2622 compr->operands[0] = values[i*2].isUndefined() ? Operand(0u) : values[i*2];
2623 compr->operands[1] = values[i*2+1].isUndefined() ? Operand(0u): values[i*2+1];
2624 compr->definitions[0] = Definition(tmp);
2625 values[i] = Operand(tmp);
2626 ctx->block->instructions.emplace_back(std::move(compr));
2627 } else {
2628 values[i] = Operand(v1);
2629 }
2630 }
2631 }
2632
2633 aco_ptr<Export_instruction> exp{create_instruction<Export_instruction>(aco_opcode::exp, Format::EXP, 4, 0)};
2634 exp->valid_mask = false;
2635 exp->done = false;
2636 exp->compressed = (bool) compr_op;
2637 exp->dest = target;
2638 exp->enabled_mask = enabled_channels;
2639 if ((bool) compr_op) {
2640 for (int i = 0; i < 2; i++)
2641 exp->operands[i] = enabled_channels & (3 << (i * 2)) ? values[i] : Operand(v1);
2642 exp->operands[2] = Operand(v1);
2643 exp->operands[3] = Operand(v1);
2644 } else {
2645 for (int i = 0; i < 4; i++)
2646 exp->operands[i] = enabled_channels & (1 << i) ? values[i] : Operand(v1);
2647 }
2648
2649 ctx->block->instructions.emplace_back(std::move(exp));
2650 }
2651
2652 Operand load_lds_size_m0(isel_context *ctx)
2653 {
2654 /* TODO: m0 does not need to be initialized on GFX9+ */
2655 Builder bld(ctx->program, ctx->block);
2656 return bld.m0((Temp)bld.sopk(aco_opcode::s_movk_i32, bld.def(s1, m0), 0xffff));
2657 }
2658
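/* load_lds picks the widest DS read that the remaining size and the alignment allow
 * (ds_read_b128, ds_read2_b64, ds_read_b96, ds_read_b64, ds_read2_b32 or ds_read_b32)
 * per iteration and stitches the pieces back together with p_create_vector. When the
 * immediate offset field would overflow (65535 bytes, or 1019 for the read2 forms
 * whose two offsets are 8-bit element indices), the base offset is instead added to
 * the address with a VALU add. */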
2659 void load_lds(isel_context *ctx, unsigned elem_size_bytes, Temp dst,
2660 Temp address, unsigned base_offset, unsigned align)
2661 {
2662 assert(util_is_power_of_two_nonzero(align) && align >= 4);
2663
2664 Builder bld(ctx->program, ctx->block);
2665
2666 Operand m = load_lds_size_m0(ctx);
2667
2668 unsigned num_components = dst.size() * 4u / elem_size_bytes;
2669 unsigned bytes_read = 0;
2670 unsigned result_size = 0;
2671 unsigned total_bytes = num_components * elem_size_bytes;
2672 std::array<Temp, 4> result;
2673
2674 while (bytes_read < total_bytes) {
2675 unsigned todo = total_bytes - bytes_read;
2676 bool aligned8 = bytes_read % 8 == 0 && align % 8 == 0;
2677 bool aligned16 = bytes_read % 16 == 0 && align % 16 == 0;
2678
2679 aco_opcode op = aco_opcode::last_opcode;
2680 bool read2 = false;
2681 if (todo >= 16 && aligned16) {
2682 op = aco_opcode::ds_read_b128;
2683 todo = 16;
2684 } else if (todo >= 16 && aligned8) {
2685 op = aco_opcode::ds_read2_b64;
2686 read2 = true;
2687 todo = 16;
2688 } else if (todo >= 12 && aligned16) {
2689 op = aco_opcode::ds_read_b96;
2690 todo = 12;
2691 } else if (todo >= 8 && aligned8) {
2692 op = aco_opcode::ds_read_b64;
2693 todo = 8;
2694 } else if (todo >= 8) {
2695 op = aco_opcode::ds_read2_b32;
2696 read2 = true;
2697 todo = 8;
2698 } else if (todo >= 4) {
2699 op = aco_opcode::ds_read_b32;
2700 todo = 4;
2701 } else {
2702 assert(false);
2703 }
2704 assert(todo % elem_size_bytes == 0);
2705 unsigned num_elements = todo / elem_size_bytes;
2706 unsigned offset = base_offset + bytes_read;
2707 unsigned max_offset = read2 ? 1019 : 65535;
2708
2709 Temp address_offset = address;
2710 if (offset > max_offset) {
2711 address_offset = bld.vadd32(bld.def(v1), Operand(base_offset), address_offset);
2712 offset = bytes_read;
2713 }
2714 assert(offset <= max_offset); /* bytes_read shouldn't be large enough for this to happen */
2715
2716 Temp res;
2717 if (num_components == 1 && dst.type() == RegType::vgpr)
2718 res = dst;
2719 else
2720 res = bld.tmp(RegClass(RegType::vgpr, todo / 4));
2721
2722 if (read2)
2723 res = bld.ds(op, Definition(res), address_offset, m, offset >> 2, (offset >> 2) + 1);
2724 else
2725 res = bld.ds(op, Definition(res), address_offset, m, offset);
2726
2727 if (num_components == 1) {
2728 assert(todo == total_bytes);
2729 if (dst.type() == RegType::sgpr)
2730 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), res);
2731 return;
2732 }
2733
2734 if (dst.type() == RegType::sgpr)
2735 res = bld.as_uniform(res);
2736
2737 if (num_elements == 1) {
2738 result[result_size++] = res;
2739 } else {
2740 assert(res != dst && res.size() % num_elements == 0);
2741 aco_ptr<Pseudo_instruction> split{create_instruction<Pseudo_instruction>(aco_opcode::p_split_vector, Format::PSEUDO, 1, num_elements)};
2742 split->operands[0] = Operand(res);
2743 for (unsigned i = 0; i < num_elements; i++)
2744 split->definitions[i] = Definition(result[result_size++] = bld.tmp(res.type(), elem_size_bytes / 4));
2745 ctx->block->instructions.emplace_back(std::move(split));
2746 }
2747
2748 bytes_read += todo;
2749 }
2750
2751 assert(result_size == num_components && result_size > 1);
2752 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, result_size, 1)};
2753 for (unsigned i = 0; i < result_size; i++)
2754 vec->operands[i] = Operand(result[i]);
2755 vec->definitions[0] = Definition(dst);
2756 ctx->block->instructions.emplace_back(std::move(vec));
2757 ctx->allocated_vec.emplace(dst.id(), result);
2758 }
2759
2760 Temp extract_subvector(isel_context *ctx, Temp data, unsigned start, unsigned size, RegType type)
2761 {
2762 if (start == 0 && size == data.size())
2763 return type == RegType::vgpr ? as_vgpr(ctx, data) : data;
2764
2765 unsigned size_hint = 1;
2766 auto it = ctx->allocated_vec.find(data.id());
2767 if (it != ctx->allocated_vec.end())
2768 size_hint = it->second[0].size();
2769 if (size % size_hint || start % size_hint)
2770 size_hint = 1;
2771
2772 start /= size_hint;
2773 size /= size_hint;
2774
2775 Temp elems[size];
2776 for (unsigned i = 0; i < size; i++)
2777 elems[i] = emit_extract_vector(ctx, data, start + i, RegClass(type, size_hint));
2778
2779 if (size == 1)
2780 return type == RegType::vgpr ? as_vgpr(ctx, elems[0]) : elems[0];
2781
2782 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, size, 1)};
2783 for (unsigned i = 0; i < size; i++)
2784 vec->operands[i] = Operand(elems[i]);
2785 Temp res = {ctx->program->allocateId(), RegClass(type, size * size_hint)};
2786 vec->definitions[0] = Definition(res);
2787 ctx->block->instructions.emplace_back(std::move(vec));
2788 return res;
2789 }
2790
2791 void ds_write_helper(isel_context *ctx, Operand m, Temp address, Temp data, unsigned data_start, unsigned total_size, unsigned offset0, unsigned offset1, unsigned align)
2792 {
2793 Builder bld(ctx->program, ctx->block);
2794 unsigned bytes_written = 0;
2795 while (bytes_written < total_size * 4) {
2796 unsigned todo = total_size * 4 - bytes_written;
2797 bool aligned8 = bytes_written % 8 == 0 && align % 8 == 0;
2798 bool aligned16 = bytes_written % 16 == 0 && align % 16 == 0;
2799
2800 aco_opcode op = aco_opcode::last_opcode;
2801 bool write2 = false;
2802 unsigned size = 0;
2803 if (todo >= 16 && aligned16) {
2804 op = aco_opcode::ds_write_b128;
2805 size = 4;
2806 } else if (todo >= 16 && aligned8) {
2807 op = aco_opcode::ds_write2_b64;
2808 write2 = true;
2809 size = 4;
2810 } else if (todo >= 12 && aligned16) {
2811 op = aco_opcode::ds_write_b96;
2812 size = 3;
2813 } else if (todo >= 8 && aligned8) {
2814 op = aco_opcode::ds_write_b64;
2815 size = 2;
2816 } else if (todo >= 8) {
2817 op = aco_opcode::ds_write2_b32;
2818 write2 = true;
2819 size = 2;
2820 } else if (todo >= 4) {
2821 op = aco_opcode::ds_write_b32;
2822 size = 1;
2823 } else {
2824 assert(false);
2825 }
2826
2827 unsigned offset = offset0 + offset1 + bytes_written;
2828 unsigned max_offset = write2 ? 1020 : 65535;
2829 Temp address_offset = address;
2830 if (offset > max_offset) {
2831 address_offset = bld.vadd32(bld.def(v1), Operand(offset0), address_offset);
2832 offset = offset1 + bytes_written;
2833 }
2834 assert(offset <= max_offset); /* offset1 shouldn't be large enough for this to happen */
2835
2836 if (write2) {
2837 Temp val0 = extract_subvector(ctx, data, data_start + (bytes_written >> 2), size / 2, RegType::vgpr);
2838 Temp val1 = extract_subvector(ctx, data, data_start + (bytes_written >> 2) + 1, size / 2, RegType::vgpr);
2839 bld.ds(op, address_offset, val0, val1, m, offset >> 2, (offset >> 2) + 1);
2840 } else {
2841 Temp val = extract_subvector(ctx, data, data_start + (bytes_written >> 2), size, RegType::vgpr);
2842 bld.ds(op, address_offset, val, m, offset);
2843 }
2844
2845 bytes_written += size * 4;
2846 }
2847 }
2848
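/* store_lds scans the write mask into at most two consecutive component ranges.
 * When the two ranges have equal size (per the assert below, a single component each),
 * one ds_write2 covers both; otherwise each range goes through ds_write_helper, which
 * picks the widest aligned ds_write/ds_write2 per chunk, mirroring load_lds above. */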
2849 void store_lds(isel_context *ctx, unsigned elem_size_bytes, Temp data, uint32_t wrmask,
2850 Temp address, unsigned base_offset, unsigned align)
2851 {
2852 assert(util_is_power_of_two_nonzero(align) && align >= 4);
2853
2854 Operand m = load_lds_size_m0(ctx);
2855
2856 /* we need at most two stores for 32-bit variables */
2857 int start[2], count[2];
2858 u_bit_scan_consecutive_range(&wrmask, &start[0], &count[0]);
2859 u_bit_scan_consecutive_range(&wrmask, &start[1], &count[1]);
2860 assert(wrmask == 0);
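/* e.g. wrmask = 0b1011 yields start[0] = 0, count[0] = 2 and start[1] = 3,
 * count[1] = 1; a mask with a single consecutive range leaves count[1] = 0,
 * which is skipped below. */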
2861
2862 /* a single combined write2 store is sufficient if both ranges cover one element each */
2863 if (count[0] == count[1]) {
2864 Builder bld(ctx->program, ctx->block);
2865
2866 Temp address_offset = address;
2867 if ((base_offset >> 2) + start[1] > 255) {
2868 address_offset = bld.vadd32(bld.def(v1), Operand(base_offset), address_offset);
2869 base_offset = 0;
2870 }
2871
2872 assert(count[0] == 1);
2873 Temp val0 = emit_extract_vector(ctx, data, start[0], v1);
2874 Temp val1 = emit_extract_vector(ctx, data, start[1], v1);
2875 aco_opcode op = elem_size_bytes == 4 ? aco_opcode::ds_write2_b32 : aco_opcode::ds_write2_b64;
2876 base_offset = base_offset / elem_size_bytes;
2877 bld.ds(op, address_offset, val0, val1, m,
2878 base_offset + start[0], base_offset + start[1]);
2879 return;
2880 }
2881
2882 for (unsigned i = 0; i < 2; i++) {
2883 if (count[i] == 0)
2884 continue;
2885
2886 unsigned elem_size_words = elem_size_bytes / 4;
2887 ds_write_helper(ctx, m, address, data, start[i] * elem_size_words, count[i] * elem_size_words,
2888 base_offset, start[i] * elem_size_bytes, align);
2889 }
2890 return;
2891 }
2892
2893 void visit_store_output(isel_context *ctx, nir_intrinsic_instr *instr)
2894 {
2895 if (ctx->stage == vertex_vs) {
2896 visit_store_vs_output(ctx, instr);
2897 } else if (ctx->stage == fragment_fs) {
2898 visit_store_fs_output(ctx, instr);
2899 } else {
2900 unreachable("Shader stage not implemented");
2901 }
2902 }
2903
2904 void emit_interp_instr(isel_context *ctx, unsigned idx, unsigned component, Temp src, Temp dst, Temp prim_mask)
2905 {
2906 Temp coord1 = emit_extract_vector(ctx, src, 0, v1);
2907 Temp coord2 = emit_extract_vector(ctx, src, 1, v1);
2908
2909 Builder bld(ctx->program, ctx->block);
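/* Two-pass hardware interpolation: v_interp_p1_f32 computes P0 + i * P10 into
 * a temporary and v_interp_p2_f32 adds j * P20 to produce the final value. */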
2910 Temp tmp = bld.vintrp(aco_opcode::v_interp_p1_f32, bld.def(v1), coord1, bld.m0(prim_mask), idx, component);
2911 bld.vintrp(aco_opcode::v_interp_p2_f32, Definition(dst), coord2, bld.m0(prim_mask), tmp, idx, component);
2912 }
2913
2914 void emit_load_frag_coord(isel_context *ctx, Temp dst, unsigned num_components)
2915 {
2916 aco_ptr<Pseudo_instruction> vec(create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1));
2917 for (unsigned i = 0; i < num_components; i++)
2918 vec->operands[i] = Operand(get_arg(ctx, ctx->args->ac.frag_pos[i]));
2919 if (G_0286CC_POS_W_FLOAT_ENA(ctx->program->config->spi_ps_input_ena)) {
2920 assert(num_components == 4);
2921 Builder bld(ctx->program, ctx->block);
2922 vec->operands[3] = bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), get_arg(ctx, ctx->args->ac.frag_pos[3]));
2923 }
2924
2925 for (Operand& op : vec->operands)
2926 op = op.isUndefined() ? Operand(0u) : op;
2927
2928 vec->definitions[0] = Definition(dst);
2929 ctx->block->instructions.emplace_back(std::move(vec));
2930 emit_split_vector(ctx, dst, num_components);
2931 return;
2932 }
2933
2934 void visit_load_interpolated_input(isel_context *ctx, nir_intrinsic_instr *instr)
2935 {
2936 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
2937 Temp coords = get_ssa_temp(ctx, instr->src[0].ssa);
2938 unsigned idx = nir_intrinsic_base(instr);
2939 unsigned component = nir_intrinsic_component(instr);
2940 Temp prim_mask = get_arg(ctx, ctx->args->ac.prim_mask);
2941
2942 nir_const_value* offset = nir_src_as_const_value(instr->src[1]);
2943 if (offset) {
2944 assert(offset->u32 == 0);
2945 } else {
2946 /* the lower 15 bits of the prim_mask contain the offset into LDS
2947 * while the upper bits contain the number of prims */
2948 Temp offset_src = get_ssa_temp(ctx, instr->src[1].ssa);
2949 assert(offset_src.regClass() == s1 && "TODO: divergent offsets...");
2950 Builder bld(ctx->program, ctx->block);
2951 Temp stride = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), prim_mask, Operand(16u));
2952 stride = bld.sop1(aco_opcode::s_bcnt1_i32_b32, bld.def(s1), bld.def(s1, scc), stride);
2953 stride = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), stride, Operand(48u));
2954 offset_src = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), stride, offset_src);
2955 prim_mask = bld.sop2(aco_opcode::s_add_i32, bld.def(s1, m0), bld.def(s1, scc), offset_src, prim_mask);
2956 }
2957
2958 if (instr->dest.ssa.num_components == 1) {
2959 emit_interp_instr(ctx, idx, component, coords, dst, prim_mask);
2960 } else {
2961 aco_ptr<Pseudo_instruction> vec(create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, instr->dest.ssa.num_components, 1));
2962 for (unsigned i = 0; i < instr->dest.ssa.num_components; i++)
2963 {
2964 Temp tmp = {ctx->program->allocateId(), v1};
2965 emit_interp_instr(ctx, idx, component+i, coords, tmp, prim_mask);
2966 vec->operands[i] = Operand(tmp);
2967 }
2968 vec->definitions[0] = Definition(dst);
2969 ctx->block->instructions.emplace_back(std::move(vec));
2970 }
2971 }
2972
2973 unsigned get_num_channels_from_data_format(unsigned data_format)
2974 {
2975 switch (data_format) {
2976 case V_008F0C_BUF_DATA_FORMAT_8:
2977 case V_008F0C_BUF_DATA_FORMAT_16:
2978 case V_008F0C_BUF_DATA_FORMAT_32:
2979 return 1;
2980 case V_008F0C_BUF_DATA_FORMAT_8_8:
2981 case V_008F0C_BUF_DATA_FORMAT_16_16:
2982 case V_008F0C_BUF_DATA_FORMAT_32_32:
2983 return 2;
2984 case V_008F0C_BUF_DATA_FORMAT_10_11_11:
2985 case V_008F0C_BUF_DATA_FORMAT_11_11_10:
2986 case V_008F0C_BUF_DATA_FORMAT_32_32_32:
2987 return 3;
2988 case V_008F0C_BUF_DATA_FORMAT_8_8_8_8:
2989 case V_008F0C_BUF_DATA_FORMAT_10_10_10_2:
2990 case V_008F0C_BUF_DATA_FORMAT_2_10_10_10:
2991 case V_008F0C_BUF_DATA_FORMAT_16_16_16_16:
2992 case V_008F0C_BUF_DATA_FORMAT_32_32_32_32:
2993 return 4;
2994 default:
2995 break;
2996 }
2997
2998 return 4;
2999 }
3000
3001 /* For 2_10_10_10 formats the alpha is handled as unsigned by pre-Vega HW,
3002 * so we may need to fix it up. */
3003 Temp adjust_vertex_fetch_alpha(isel_context *ctx, unsigned adjustment, Temp alpha)
3004 {
3005 Builder bld(ctx->program, ctx->block);
3006
3007 if (adjustment == RADV_ALPHA_ADJUST_SSCALED)
3008 alpha = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), alpha);
3009
3010 /* For the integer-like cases, do a natural sign extension.
3011 *
3012 * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0
3013 * and happen to contain 0, 1, 2, 3 as the two LSBs of the
3014 * exponent.
3015 */
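/* Worked example: in the integer case the 2-bit alpha sits in bits [1:0], so a
 * value of 3 shifted left by 30 becomes 0xC0000000 and the arithmetic shift
 * right by 30 sign-extends it to -1. For SNORM, 1.0f = 0x3F800000 has exponent
 * 0x7F, whose two LSBs (bits [24:23]) are 3; shifting left by 7 moves them into
 * bits [31:30], so the same sign extension applies. */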
3016 alpha = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(adjustment == RADV_ALPHA_ADJUST_SNORM ? 7u : 30u), alpha);
3017 alpha = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(30u), alpha);
3018
3019 /* Convert back to the right type. */
3020 if (adjustment == RADV_ALPHA_ADJUST_SNORM) {
3021 alpha = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), alpha);
3022 Temp clamp = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(s2)), Operand(0xbf800000u), alpha);
3023 alpha = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0xbf800000u), alpha, clamp);
3024 } else if (adjustment == RADV_ALPHA_ADJUST_SSCALED) {
3025 alpha = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), alpha);
3026 }
3027
3028 return alpha;
3029 }
3030
3031 void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr)
3032 {
3033 Builder bld(ctx->program, ctx->block);
3034 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
3035 if (ctx->stage & sw_vs) {
3036
3037 nir_instr *off_instr = instr->src[0].ssa->parent_instr;
3038 if (off_instr->type != nir_instr_type_load_const) {
3039 fprintf(stderr, "Unimplemented nir_intrinsic_load_input offset\n");
3040 nir_print_instr(off_instr, stderr);
3041 fprintf(stderr, "\n");
3042 }
3043 uint32_t offset = nir_instr_as_load_const(off_instr)->value[0].u32;
3044
3045 Temp vertex_buffers = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->vertex_buffers));
3046
3047 unsigned location = nir_intrinsic_base(instr) / 4 - VERT_ATTRIB_GENERIC0 + offset;
3048 unsigned component = nir_intrinsic_component(instr);
3049 unsigned attrib_binding = ctx->options->key.vs.vertex_attribute_bindings[location];
3050 uint32_t attrib_offset = ctx->options->key.vs.vertex_attribute_offsets[location];
3051 uint32_t attrib_stride = ctx->options->key.vs.vertex_attribute_strides[location];
3052 unsigned attrib_format = ctx->options->key.vs.vertex_attribute_formats[location];
3053
3054 unsigned dfmt = attrib_format & 0xf;
3055
3056 unsigned nfmt = (attrib_format >> 4) & 0x7;
3057 unsigned num_dfmt_channels = get_num_channels_from_data_format(dfmt);
3058 unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa) << component;
3059 unsigned num_channels = MIN2(util_last_bit(mask), num_dfmt_channels);
3060 unsigned alpha_adjust = (ctx->options->key.vs.alpha_adjust >> (location * 2)) & 3;
3061 bool post_shuffle = ctx->options->key.vs.post_shuffle & (1 << location);
3062 if (post_shuffle)
3063 num_channels = MAX2(num_channels, 3);
3064
3065 Temp list = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), vertex_buffers, Operand(attrib_binding * 16u));
3066
3067 Temp index;
3068 if (ctx->options->key.vs.instance_rate_inputs & (1u << location)) {
3069 uint32_t divisor = ctx->options->key.vs.instance_rate_divisors[location];
3070 Temp start_instance = get_arg(ctx, ctx->args->ac.start_instance);
3071 if (divisor) {
3072 ctx->needs_instance_id = true;
3073 Temp instance_id = get_arg(ctx, ctx->args->ac.instance_id);
3074 if (divisor != 1) {
3075 Temp divided = bld.tmp(v1);
3076 emit_v_div_u32(ctx, divided, as_vgpr(ctx, instance_id), divisor);
3077 index = bld.vadd32(bld.def(v1), start_instance, divided);
3078 } else {
3079 index = bld.vadd32(bld.def(v1), start_instance, instance_id);
3080 }
3081 } else {
3082 index = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), start_instance);
3083 }
3084 } else {
3085 index = bld.vadd32(bld.def(v1),
3086 get_arg(ctx, ctx->args->ac.base_vertex),
3087 get_arg(ctx, ctx->args->ac.vertex_id));
3088 }
3089
3090 if (attrib_stride != 0 && attrib_offset > attrib_stride) {
3091 index = bld.vadd32(bld.def(v1), Operand(attrib_offset / attrib_stride), index);
3092 attrib_offset = attrib_offset % attrib_stride;
3093 }
3094
3095 Operand soffset(0u);
3096 if (attrib_offset >= 4096) {
3097 soffset = bld.copy(bld.def(s1), Operand(attrib_offset));
3098 attrib_offset = 0;
3099 }
3100
3101 aco_opcode opcode;
3102 switch (num_channels) {
3103 case 1:
3104 opcode = aco_opcode::tbuffer_load_format_x;
3105 break;
3106 case 2:
3107 opcode = aco_opcode::tbuffer_load_format_xy;
3108 break;
3109 case 3:
3110 opcode = aco_opcode::tbuffer_load_format_xyz;
3111 break;
3112 case 4:
3113 opcode = aco_opcode::tbuffer_load_format_xyzw;
3114 break;
3115 default:
3116 unreachable("Unimplemented load_input vector size");
3117 }
3118
3119 Temp tmp = post_shuffle || num_channels != dst.size() || alpha_adjust != RADV_ALPHA_ADJUST_NONE || component ? bld.tmp(RegType::vgpr, num_channels) : dst;
3120
3121 aco_ptr<MTBUF_instruction> mtbuf{create_instruction<MTBUF_instruction>(opcode, Format::MTBUF, 3, 1)};
3122 mtbuf->operands[0] = Operand(index);
3123 mtbuf->operands[1] = Operand(list);
3124 mtbuf->operands[2] = soffset;
3125 mtbuf->definitions[0] = Definition(tmp);
3126 mtbuf->idxen = true;
3127 mtbuf->can_reorder = true;
3128 mtbuf->dfmt = dfmt;
3129 mtbuf->nfmt = nfmt;
3130 assert(attrib_offset < 4096);
3131 mtbuf->offset = attrib_offset;
3132 ctx->block->instructions.emplace_back(std::move(mtbuf));
3133
3134 emit_split_vector(ctx, tmp, tmp.size());
3135
3136 if (tmp.id() != dst.id()) {
3137 bool is_float = nfmt != V_008F0C_BUF_NUM_FORMAT_UINT &&
3138 nfmt != V_008F0C_BUF_NUM_FORMAT_SINT;
3139
3140 static const unsigned swizzle_normal[4] = {0, 1, 2, 3};
3141 static const unsigned swizzle_post_shuffle[4] = {2, 1, 0, 3};
3142 const unsigned *swizzle = post_shuffle ? swizzle_post_shuffle : swizzle_normal;
3143
3144 aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
3145 for (unsigned i = 0; i < dst.size(); i++) {
3146 unsigned idx = i + component;
3147 if (idx == 3 && alpha_adjust != RADV_ALPHA_ADJUST_NONE && num_channels >= 4) {
3148 Temp alpha = emit_extract_vector(ctx, tmp, swizzle[3], v1);
3149 vec->operands[i] = Operand(adjust_vertex_fetch_alpha(ctx, alpha_adjust, alpha)); /* idx == 3, but i == 3 - component */
3150 } else if (idx < num_channels) {
3151 vec->operands[i] = Operand(emit_extract_vector(ctx, tmp, swizzle[idx], v1));
3152 } else if (is_float && idx == 3) {
3153 vec->operands[i] = Operand(0x3f800000u);
3154 } else if (!is_float && idx == 3) {
3155 vec->operands[i] = Operand(1u);
3156 } else {
3157 vec->operands[i] = Operand(0u);
3158 }
3159 }
3160 vec->definitions[0] = Definition(dst);
3161 ctx->block->instructions.emplace_back(std::move(vec));
3162 emit_split_vector(ctx, dst, dst.size());
3163 }
3164
3165 } else if (ctx->stage == fragment_fs) {
3166 nir_instr *off_instr = instr->src[0].ssa->parent_instr;
3167 if (off_instr->type != nir_instr_type_load_const ||
3168 nir_instr_as_load_const(off_instr)->value[0].u32 != 0) {
3169 fprintf(stderr, "Unimplemented nir_intrinsic_load_input offset\n");
3170 nir_print_instr(off_instr, stderr);
3171 fprintf(stderr, "\n");
3172 }
3173
3174 Temp prim_mask = get_arg(ctx, ctx->args->ac.prim_mask);
3175 nir_const_value* offset = nir_src_as_const_value(instr->src[0]);
3176 if (offset) {
3177 assert(offset->u32 == 0);
3178 } else {
3179 /* the lower 15 bits of the prim_mask contain the offset into LDS
3180 * while the upper bits contain the number of prims */
3181 Temp offset_src = get_ssa_temp(ctx, instr->src[0].ssa);
3182 assert(offset_src.regClass() == s1 && "TODO: divergent offsets...");
3183 Builder bld(ctx->program, ctx->block);
3184 Temp stride = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), prim_mask, Operand(16u));
3185 stride = bld.sop1(aco_opcode::s_bcnt1_i32_b32, bld.def(s1), bld.def(s1, scc), stride);
3186 stride = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), stride, Operand(48u));
3187 offset_src = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), stride, offset_src);
3188 prim_mask = bld.sop2(aco_opcode::s_add_i32, bld.def(s1, m0), bld.def(s1, scc), offset_src, prim_mask);
3189 }
3190
3191 unsigned idx = nir_intrinsic_base(instr);
3192 unsigned component = nir_intrinsic_component(instr);
3193
3194 if (dst.size() == 1) {
3195 bld.vintrp(aco_opcode::v_interp_mov_f32, Definition(dst), Operand(2u), bld.m0(prim_mask), idx, component);
3196 } else {
3197 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
3198 for (unsigned i = 0; i < dst.size(); i++)
3199 vec->operands[i] = bld.vintrp(aco_opcode::v_interp_mov_f32, bld.def(v1), Operand(2u), bld.m0(prim_mask), idx, component + i);
3200 vec->definitions[0] = Definition(dst);
3201 bld.insert(std::move(vec));
3202 }
3203
3204 } else {
3205 unreachable("Shader stage not implemented");
3206 }
3207 }
3208
3209 Temp load_desc_ptr(isel_context *ctx, unsigned desc_set)
3210 {
3211 if (ctx->program->info->need_indirect_descriptor_sets) {
3212 Builder bld(ctx->program, ctx->block);
3213 Temp ptr64 = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->descriptor_sets[0]));
3214 return bld.smem(aco_opcode::s_load_dword, bld.def(s1), ptr64, Operand(desc_set << 2));//, false, false, false);
3215 }
3216
3217 return get_arg(ctx, ctx->args->descriptor_sets[desc_set]);
3218 }
3219
3220
3221 void visit_load_resource(isel_context *ctx, nir_intrinsic_instr *instr)
3222 {
3223 Builder bld(ctx->program, ctx->block);
3224 Temp index = get_ssa_temp(ctx, instr->src[0].ssa);
3225 if (!ctx->divergent_vals[instr->dest.ssa.index])
3226 index = bld.as_uniform(index);
3227 unsigned desc_set = nir_intrinsic_desc_set(instr);
3228 unsigned binding = nir_intrinsic_binding(instr);
3229
3230 Temp desc_ptr;
3231 radv_pipeline_layout *pipeline_layout = ctx->options->layout;
3232 radv_descriptor_set_layout *layout = pipeline_layout->set[desc_set].layout;
3233 unsigned offset = layout->binding[binding].offset;
3234 unsigned stride;
3235 if (layout->binding[binding].type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC ||
3236 layout->binding[binding].type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC) {
3237 unsigned idx = pipeline_layout->set[desc_set].dynamic_offset_start + layout->binding[binding].dynamic_offset_offset;
3238 desc_ptr = get_arg(ctx, ctx->args->ac.push_constants);
3239 offset = pipeline_layout->push_constant_size + 16 * idx;
3240 stride = 16;
3241 } else {
3242 desc_ptr = load_desc_ptr(ctx, desc_set);
3243 stride = layout->binding[binding].size;
3244 }
3245
3246 nir_const_value* nir_const_index = nir_src_as_const_value(instr->src[0]);
3247 unsigned const_index = nir_const_index ? nir_const_index->u32 : 0;
3248 if (stride != 1) {
3249 if (nir_const_index) {
3250 const_index = const_index * stride;
3251 } else if (index.type() == RegType::vgpr) {
3252 bool index24bit = layout->binding[binding].array_size <= 0x1000000;
3253 index = bld.v_mul_imm(bld.def(v1), index, stride, index24bit);
3254 } else {
3255 index = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(stride), Operand(index));
3256 }
3257 }
3258 if (offset) {
3259 if (nir_const_index) {
3260 const_index = const_index + offset;
3261 } else if (index.type() == RegType::vgpr) {
3262 index = bld.vadd32(bld.def(v1), Operand(offset), index);
3263 } else {
3264 index = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), Operand(offset), Operand(index));
3265 }
3266 }
3267
3268 if (nir_const_index && const_index == 0) {
3269 index = desc_ptr;
3270 } else if (index.type() == RegType::vgpr) {
3271 index = bld.vadd32(bld.def(v1),
3272 nir_const_index ? Operand(const_index) : Operand(index),
3273 Operand(desc_ptr));
3274 } else {
3275 index = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
3276 nir_const_index ? Operand(const_index) : Operand(index),
3277 Operand(desc_ptr));
3278 }
3279
3280 bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), index);
3281 }
3282
3283 void load_buffer(isel_context *ctx, unsigned num_components, Temp dst,
3284 Temp rsrc, Temp offset, bool glc=false, bool readonly=true)
3285 {
3286 Builder bld(ctx->program, ctx->block);
3287
3288 unsigned num_bytes = dst.size() * 4;
3289 bool dlc = glc && ctx->options->chip_class >= GFX10;
3290
3291 aco_opcode op;
3292 if (dst.type() == RegType::vgpr || (glc && ctx->options->chip_class < GFX8)) {
3293 if (ctx->options->chip_class < GFX8)
3294 offset = as_vgpr(ctx, offset);
3295
3296 Operand vaddr = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
3297 Operand soffset = offset.type() == RegType::sgpr ? Operand(offset) : Operand((uint32_t) 0);
3298 unsigned const_offset = 0;
3299
3300 Temp lower = Temp();
3301 if (num_bytes > 16) {
3302 assert(num_components == 3 || num_components == 4);
3303 op = aco_opcode::buffer_load_dwordx4;
3304 lower = bld.tmp(v4);
3305 aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)};
3306 mubuf->definitions[0] = Definition(lower);
3307 mubuf->operands[0] = vaddr;
3308 mubuf->operands[1] = Operand(rsrc);
3309 mubuf->operands[2] = soffset;
3310 mubuf->offen = (offset.type() == RegType::vgpr);
3311 mubuf->glc = glc;
3312 mubuf->dlc = dlc;
3313 mubuf->barrier = readonly ? barrier_none : barrier_buffer;
3314 mubuf->can_reorder = readonly;
3315 bld.insert(std::move(mubuf));
3316 emit_split_vector(ctx, lower, 2);
3317 num_bytes -= 16;
3318 const_offset = 16;
3319 }
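/* e.g. a 24-byte load is split into the buffer_load_dwordx4 above for the
 * lower half and a buffer_load_dwordx2 with const_offset 16 below for the
 * remainder. */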
3320
3321 switch (num_bytes) {
3322 case 4:
3323 op = aco_opcode::buffer_load_dword;
3324 break;
3325 case 8:
3326 op = aco_opcode::buffer_load_dwordx2;
3327 break;
3328 case 12:
3329 op = aco_opcode::buffer_load_dwordx3;
3330 break;
3331 case 16:
3332 op = aco_opcode::buffer_load_dwordx4;
3333 break;
3334 default:
3335 unreachable("Load SSBO not implemented for this size.");
3336 }
3337 aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)};
3338 mubuf->operands[0] = vaddr;
3339 mubuf->operands[1] = Operand(rsrc);
3340 mubuf->operands[2] = soffset;
3341 mubuf->offen = (offset.type() == RegType::vgpr);
3342 mubuf->glc = glc;
3343 mubuf->dlc = dlc;
3344 mubuf->barrier = readonly ? barrier_none : barrier_buffer;
3345 mubuf->can_reorder = readonly;
3346 mubuf->offset = const_offset;
3347 aco_ptr<Instruction> instr = std::move(mubuf);
3348
3349 if (dst.size() > 4) {
3350 assert(lower != Temp());
3351 Temp upper = bld.tmp(RegType::vgpr, dst.size() - lower.size());
3352 instr->definitions[0] = Definition(upper);
3353 bld.insert(std::move(instr));
3354 if (dst.size() == 8)
3355 emit_split_vector(ctx, upper, 2);
3356 instr.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size() / 2, 1));
3357 instr->operands[0] = Operand(emit_extract_vector(ctx, lower, 0, v2));
3358 instr->operands[1] = Operand(emit_extract_vector(ctx, lower, 1, v2));
3359 instr->operands[2] = Operand(emit_extract_vector(ctx, upper, 0, v2));
3360 if (dst.size() == 8)
3361 instr->operands[3] = Operand(emit_extract_vector(ctx, upper, 1, v2));
3362 }
3363
3364 if (dst.type() == RegType::sgpr) {
3365 Temp vec = bld.tmp(RegType::vgpr, dst.size());
3366 instr->definitions[0] = Definition(vec);
3367 bld.insert(std::move(instr));
3368 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec);
3369 } else {
3370 instr->definitions[0] = Definition(dst);
3371 bld.insert(std::move(instr));
3372 }
3373 } else {
3374 switch (num_bytes) {
3375 case 4:
3376 op = aco_opcode::s_buffer_load_dword;
3377 break;
3378 case 8:
3379 op = aco_opcode::s_buffer_load_dwordx2;
3380 break;
3381 case 12:
3382 case 16:
3383 op = aco_opcode::s_buffer_load_dwordx4;
3384 break;
3385 case 24:
3386 case 32:
3387 op = aco_opcode::s_buffer_load_dwordx8;
3388 break;
3389 default:
3390 unreachable("Load SSBO not implemented for this size.");
3391 }
3392 aco_ptr<SMEM_instruction> load{create_instruction<SMEM_instruction>(op, Format::SMEM, 2, 1)};
3393 load->operands[0] = Operand(rsrc);
3394 load->operands[1] = Operand(bld.as_uniform(offset));
3395 assert(load->operands[1].getTemp().type() == RegType::sgpr);
3396 load->definitions[0] = Definition(dst);
3397 load->glc = glc;
3398 load->dlc = dlc;
3399 load->barrier = readonly ? barrier_none : barrier_buffer;
3400 load->can_reorder = false; // FIXME: currently, it doesn't seem beneficial due to how our scheduler works
3401 assert(ctx->options->chip_class >= GFX8 || !glc);
3402
3403 /* trim vector */
3404 if (dst.size() == 3) {
3405 Temp vec = bld.tmp(s4);
3406 load->definitions[0] = Definition(vec);
3407 bld.insert(std::move(load));
3408 emit_split_vector(ctx, vec, 4);
3409
3410 bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
3411 emit_extract_vector(ctx, vec, 0, s1),
3412 emit_extract_vector(ctx, vec, 1, s1),
3413 emit_extract_vector(ctx, vec, 2, s1));
3414 } else if (dst.size() == 6) {
3415 Temp vec = bld.tmp(s8);
3416 load->definitions[0] = Definition(vec);
3417 bld.insert(std::move(load));
3418 emit_split_vector(ctx, vec, 4);
3419
3420 bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
3421 emit_extract_vector(ctx, vec, 0, s2),
3422 emit_extract_vector(ctx, vec, 1, s2),
3423 emit_extract_vector(ctx, vec, 2, s2));
3424 } else {
3425 bld.insert(std::move(load));
3426 }
3427
3428 }
3429 emit_split_vector(ctx, dst, num_components);
3430 }
3431
3432 void visit_load_ubo(isel_context *ctx, nir_intrinsic_instr *instr)
3433 {
3434 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
3435 Temp rsrc = get_ssa_temp(ctx, instr->src[0].ssa);
3436
3437 Builder bld(ctx->program, ctx->block);
3438
3439 nir_intrinsic_instr* idx_instr = nir_instr_as_intrinsic(instr->src[0].ssa->parent_instr);
3440 unsigned desc_set = nir_intrinsic_desc_set(idx_instr);
3441 unsigned binding = nir_intrinsic_binding(idx_instr);
3442 radv_descriptor_set_layout *layout = ctx->options->layout->set[desc_set].layout;
3443
3444 if (layout->binding[binding].type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) {
3445 uint32_t desc_type = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
3446 S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
3447 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
3448 S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
3449 if (ctx->options->chip_class >= GFX10) {
3450 desc_type |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
3451 S_008F0C_OOB_SELECT(3) |
3452 S_008F0C_RESOURCE_LEVEL(1);
3453 } else {
3454 desc_type |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
3455 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
3456 }
3457 Temp upper_dwords = bld.pseudo(aco_opcode::p_create_vector, bld.def(s3),
3458 Operand(S_008F04_BASE_ADDRESS_HI(ctx->options->address32_hi)),
3459 Operand(0xFFFFFFFFu),
3460 Operand(desc_type));
3461 rsrc = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4),
3462 rsrc, upper_dwords);
3463 } else {
3464 rsrc = convert_pointer_to_64_bit(ctx, rsrc);
3465 rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u));
3466 }
3467
3468 load_buffer(ctx, instr->num_components, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa));
3469 }
3470
3471 void visit_load_push_constant(isel_context *ctx, nir_intrinsic_instr *instr)
3472 {
3473 Builder bld(ctx->program, ctx->block);
3474 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
3475
3476 unsigned offset = nir_intrinsic_base(instr);
3477 nir_const_value *index_cv = nir_src_as_const_value(instr->src[0]);
3478 if (index_cv && instr->dest.ssa.bit_size == 32) {
3479
3480 unsigned count = instr->dest.ssa.num_components;
3481 unsigned start = (offset + index_cv->u32) / 4u;
3482 start -= ctx->args->ac.base_inline_push_consts;
3483 if (start + count <= ctx->args->ac.num_inline_push_consts) {
3484 std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
3485 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
3486 for (unsigned i = 0; i < count; ++i) {
3487 elems[i] = get_arg(ctx, ctx->args->ac.inline_push_consts[start + i]);
3488 vec->operands[i] = Operand{elems[i]};
3489 }
3490 vec->definitions[0] = Definition(dst);
3491 ctx->block->instructions.emplace_back(std::move(vec));
3492 ctx->allocated_vec.emplace(dst.id(), elems);
3493 return;
3494 }
3495 }
3496
3497 Temp index = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
3498 if (offset != 0) // TODO check if index != 0 as well
3499 index = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), Operand(offset), index);
3500 Temp ptr = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->ac.push_constants));
3501 Temp vec = dst;
3502 bool trim = false;
3503 aco_opcode op;
3504
3505 switch (dst.size()) {
3506 case 1:
3507 op = aco_opcode::s_load_dword;
3508 break;
3509 case 2:
3510 op = aco_opcode::s_load_dwordx2;
3511 break;
3512 case 3:
3513 vec = bld.tmp(s4);
3514 trim = true; /* fallthrough */
3515 case 4:
3516 op = aco_opcode::s_load_dwordx4;
3517 break;
3518 case 6:
3519 vec = bld.tmp(s8);
3520 trim = true; /* fallthrough */
3521 case 8:
3522 op = aco_opcode::s_load_dwordx8;
3523 break;
3524 default:
3525 unreachable("unimplemented or forbidden load_push_constant.");
3526 }
3527
3528 bld.smem(op, Definition(vec), ptr, index);
3529
3530 if (trim) {
3531 emit_split_vector(ctx, vec, 4);
3532 RegClass rc = dst.size() == 3 ? s1 : s2;
3533 bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
3534 emit_extract_vector(ctx, vec, 0, rc),
3535 emit_extract_vector(ctx, vec, 1, rc),
3536 emit_extract_vector(ctx, vec, 2, rc));
3537
3538 }
3539 emit_split_vector(ctx, dst, instr->dest.ssa.num_components);
3540 }
3541
3542 void visit_load_constant(isel_context *ctx, nir_intrinsic_instr *instr)
3543 {
3544 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
3545
3546 Builder bld(ctx->program, ctx->block);
3547
3548 uint32_t desc_type = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
3549 S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
3550 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
3551 S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
3552 if (ctx->options->chip_class >= GFX10) {
3553 desc_type |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
3554 S_008F0C_OOB_SELECT(3) |
3555 S_008F0C_RESOURCE_LEVEL(1);
3556 } else {
3557 desc_type |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
3558 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
3559 }
3560
3561 unsigned base = nir_intrinsic_base(instr);
3562 unsigned range = nir_intrinsic_range(instr);
3563
3564 Temp offset = get_ssa_temp(ctx, instr->src[0].ssa);
3565 if (base && offset.type() == RegType::sgpr)
3566 offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset, Operand(base));
3567 else if (base && offset.type() == RegType::vgpr)
3568 offset = bld.vadd32(bld.def(v1), Operand(base), offset);
3569
3570 Temp rsrc = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4),
3571 bld.sop1(aco_opcode::p_constaddr, bld.def(s2), bld.def(s1, scc), Operand(ctx->constant_data_offset)),
3572 Operand(MIN2(base + range, ctx->shader->constant_data_size)),
3573 Operand(desc_type));
3574
3575 load_buffer(ctx, instr->num_components, dst, rsrc, offset);
3576 }
3577
3578 void visit_discard_if(isel_context *ctx, nir_intrinsic_instr *instr)
3579 {
3580 if (ctx->cf_info.loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
3581 ctx->cf_info.exec_potentially_empty = true;
3582
3583 ctx->program->needs_exact = true;
3584
3585 // TODO: optimize uniform conditions
3586 Builder bld(ctx->program, ctx->block);
3587 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
3588 assert(src.regClass() == s2);
3589 src = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2));
3590 bld.pseudo(aco_opcode::p_discard_if, src);
3591 ctx->block->kind |= block_kind_uses_discard_if;
3592 return;
3593 }
3594
3595 void visit_discard(isel_context* ctx, nir_intrinsic_instr *instr)
3596 {
3597 Builder bld(ctx->program, ctx->block);
3598
3599 if (ctx->cf_info.loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
3600 ctx->cf_info.exec_potentially_empty = true;
3601
3602 bool divergent = ctx->cf_info.parent_if.is_divergent ||
3603 ctx->cf_info.parent_loop.has_divergent_continue;
3604
3605 if (ctx->block->loop_nest_depth &&
3606 ((nir_instr_is_last(&instr->instr) && !divergent) || divergent)) {
3607 /* we handle discards the same way as jump instructions */
3608 append_logical_end(ctx->block);
3609
3610 /* in loops, discard behaves like break */
3611 Block *linear_target = ctx->cf_info.parent_loop.exit;
3612 ctx->block->kind |= block_kind_discard;
3613
3614 if (!divergent) {
3615 /* uniform discard - loop ends here */
3616 assert(nir_instr_is_last(&instr->instr));
3617 ctx->block->kind |= block_kind_uniform;
3618 ctx->cf_info.has_branch = true;
3619 bld.branch(aco_opcode::p_branch);
3620 add_linear_edge(ctx->block->index, linear_target);
3621 return;
3622 }
3623
3624 /* we add a break right behind the discard() instructions */
3625 ctx->block->kind |= block_kind_break;
3626 unsigned idx = ctx->block->index;
3627
3628 /* remove critical edges from linear CFG */
3629 bld.branch(aco_opcode::p_branch);
3630 Block* break_block = ctx->program->create_and_insert_block();
3631 break_block->loop_nest_depth = ctx->cf_info.loop_nest_depth;
3632 break_block->kind |= block_kind_uniform;
3633 add_linear_edge(idx, break_block);
3634 add_linear_edge(break_block->index, linear_target);
3635 bld.reset(break_block);
3636 bld.branch(aco_opcode::p_branch);
3637
3638 Block* continue_block = ctx->program->create_and_insert_block();
3639 continue_block->loop_nest_depth = ctx->cf_info.loop_nest_depth;
3640 add_linear_edge(idx, continue_block);
3641 append_logical_start(continue_block);
3642 ctx->block = continue_block;
3643
3644 return;
3645 }
3646
3647 /* it can currently happen that NIR doesn't remove the unreachable code */
3648 if (!nir_instr_is_last(&instr->instr)) {
3649 ctx->program->needs_exact = true;
3650 /* save exec somewhere temporarily so that it doesn't get
3651 * overwritten before the discard from outer exec masks */
3652 Temp cond = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), Operand(0xFFFFFFFF), Operand(exec, s2));
3653 bld.pseudo(aco_opcode::p_discard_if, cond);
3654 ctx->block->kind |= block_kind_uses_discard_if;
3655 return;
3656 }
3657
3658 /* This condition is incorrect for uniformly branched discards in a loop
3659 * predicated by a divergent condition, but the above code catches that case
3660 * and the discard would end up turning into a discard_if.
3661 * For example:
3662 * if (divergent) {
3663 * while (...) {
3664 * if (uniform) {
3665 * discard;
3666 * }
3667 * }
3668 * }
3669 */
3670 if (!ctx->cf_info.parent_if.is_divergent) {
3671 /* program just ends here */
3672 ctx->block->kind |= block_kind_uniform;
3673 bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1),
3674 0 /* enabled mask */, 9 /* dest */,
3675 false /* compressed */, true/* done */, true /* valid mask */);
3676 bld.sopp(aco_opcode::s_endpgm);
3677 // TODO: it will potentially be followed by a branch which is dead code to sanitize NIR phis
3678 } else {
3679 ctx->block->kind |= block_kind_discard;
3680 /* branch and linear edge is added by visit_if() */
3681 }
3682 }
3683
3684 enum aco_descriptor_type {
3685 ACO_DESC_IMAGE,
3686 ACO_DESC_FMASK,
3687 ACO_DESC_SAMPLER,
3688 ACO_DESC_BUFFER,
3689 ACO_DESC_PLANE_0,
3690 ACO_DESC_PLANE_1,
3691 ACO_DESC_PLANE_2,
3692 };
3693
3694 static bool
3695 should_declare_array(isel_context *ctx, enum glsl_sampler_dim sampler_dim, bool is_array) {
3696 if (sampler_dim == GLSL_SAMPLER_DIM_BUF)
3697 return false;
3698 ac_image_dim dim = ac_get_sampler_dim(ctx->options->chip_class, sampler_dim, is_array);
3699 return dim == ac_image_cube ||
3700 dim == ac_image_1darray ||
3701 dim == ac_image_2darray ||
3702 dim == ac_image_2darraymsaa;
3703 }
3704
3705 Temp get_sampler_desc(isel_context *ctx, nir_deref_instr *deref_instr,
3706 enum aco_descriptor_type desc_type,
3707 const nir_tex_instr *tex_instr, bool image, bool write)
3708 {
3709 /* FIXME: we should lower the deref with some new nir_intrinsic_load_desc
3710 std::unordered_map<uint64_t, Temp>::iterator it = ctx->tex_desc.find((uint64_t) desc_type << 32 | deref_instr->dest.ssa.index);
3711 if (it != ctx->tex_desc.end())
3712 return it->second;
3713 */
3714 Temp index = Temp();
3715 bool index_set = false;
3716 unsigned constant_index = 0;
3717 unsigned descriptor_set;
3718 unsigned base_index;
3719 Builder bld(ctx->program, ctx->block);
3720
3721 if (!deref_instr) {
3722 assert(tex_instr && !image);
3723 descriptor_set = 0;
3724 base_index = tex_instr->sampler_index;
3725 } else {
3726 while(deref_instr->deref_type != nir_deref_type_var) {
3727 unsigned array_size = glsl_get_aoa_size(deref_instr->type);
3728 if (!array_size)
3729 array_size = 1;
3730
3731 assert(deref_instr->deref_type == nir_deref_type_array);
3732 nir_const_value *const_value = nir_src_as_const_value(deref_instr->arr.index);
3733 if (const_value) {
3734 constant_index += array_size * const_value->u32;
3735 } else {
3736 Temp indirect = get_ssa_temp(ctx, deref_instr->arr.index.ssa);
3737 if (indirect.type() == RegType::vgpr)
3738 indirect = bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), indirect);
3739
3740 if (array_size != 1)
3741 indirect = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(array_size), indirect);
3742
3743 if (!index_set) {
3744 index = indirect;
3745 index_set = true;
3746 } else {
3747 index = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), index, indirect);
3748 }
3749 }
3750
3751 deref_instr = nir_src_as_deref(deref_instr->parent);
3752 }
3753 descriptor_set = deref_instr->var->data.descriptor_set;
3754 base_index = deref_instr->var->data.binding;
3755 }
3756
3757 Temp list = load_desc_ptr(ctx, descriptor_set);
3758 list = convert_pointer_to_64_bit(ctx, list);
3759
3760 struct radv_descriptor_set_layout *layout = ctx->options->layout->set[descriptor_set].layout;
3761 struct radv_descriptor_set_binding_layout *binding = layout->binding + base_index;
3762 unsigned offset = binding->offset;
3763 unsigned stride = binding->size;
3764 aco_opcode opcode;
3765 RegClass type;
3766
3767 assert(base_index < layout->binding_count);
3768
3769 switch (desc_type) {
3770 case ACO_DESC_IMAGE:
3771 type = s8;
3772 opcode = aco_opcode::s_load_dwordx8;
3773 break;
3774 case ACO_DESC_FMASK:
3775 type = s8;
3776 opcode = aco_opcode::s_load_dwordx8;
3777 offset += 32;
3778 break;
3779 case ACO_DESC_SAMPLER:
3780 type = s4;
3781 opcode = aco_opcode::s_load_dwordx4;
3782 if (binding->type == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER)
3783 offset += radv_combined_image_descriptor_sampler_offset(binding);
3784 break;
3785 case ACO_DESC_BUFFER:
3786 type = s4;
3787 opcode = aco_opcode::s_load_dwordx4;
3788 break;
3789 case ACO_DESC_PLANE_0:
3790 case ACO_DESC_PLANE_1:
3791 type = s8;
3792 opcode = aco_opcode::s_load_dwordx8;
3793 offset += 32 * (desc_type - ACO_DESC_PLANE_0);
3794 break;
3795 case ACO_DESC_PLANE_2:
3796 type = s4;
3797 opcode = aco_opcode::s_load_dwordx4;
3798 offset += 64;
3799 break;
3800 default:
3801 unreachable("invalid desc_type\n");
3802 }
3803
3804 offset += constant_index * stride;
3805
3806 if (desc_type == ACO_DESC_SAMPLER && binding->immutable_samplers_offset &&
3807 (!index_set || binding->immutable_samplers_equal)) {
3808 if (binding->immutable_samplers_equal)
3809 constant_index = 0;
3810
3811 const uint32_t *samplers = radv_immutable_samplers(layout, binding);
3812 return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4),
3813 Operand(samplers[constant_index * 4 + 0]),
3814 Operand(samplers[constant_index * 4 + 1]),
3815 Operand(samplers[constant_index * 4 + 2]),
3816 Operand(samplers[constant_index * 4 + 3]));
3817 }
3818
3819 Operand off;
3820 if (!index_set) {
3821 off = Operand(offset);
3822 } else {
3823 off = Operand((Temp)bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), Operand(offset),
3824 bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(stride), index)));
3825 }
3826
3827 Temp res = bld.smem(opcode, bld.def(type), list, off);
3828
3829 if (desc_type == ACO_DESC_PLANE_2) {
3830 Temp components[8];
3831 for (unsigned i = 0; i < 8; i++)
3832 components[i] = bld.tmp(s1);
3833 bld.pseudo(aco_opcode::p_split_vector,
3834 Definition(components[0]),
3835 Definition(components[1]),
3836 Definition(components[2]),
3837 Definition(components[3]),
3838 res);
3839
3840 Temp desc2 = get_sampler_desc(ctx, deref_instr, ACO_DESC_PLANE_1, tex_instr, image, write);
3841 bld.pseudo(aco_opcode::p_split_vector,
3842 bld.def(s1), bld.def(s1), bld.def(s1), bld.def(s1),
3843 Definition(components[4]),
3844 Definition(components[5]),
3845 Definition(components[6]),
3846 Definition(components[7]),
3847 desc2);
3848
3849 res = bld.pseudo(aco_opcode::p_create_vector, bld.def(s8),
3850 components[0], components[1], components[2], components[3],
3851 components[4], components[5], components[6], components[7]);
3852 }
3853
3854 return res;
3855 }
3856
3857 static int image_type_to_components_count(enum glsl_sampler_dim dim, bool array)
3858 {
3859 switch (dim) {
3860 case GLSL_SAMPLER_DIM_BUF:
3861 return 1;
3862 case GLSL_SAMPLER_DIM_1D:
3863 return array ? 2 : 1;
3864 case GLSL_SAMPLER_DIM_2D:
3865 return array ? 3 : 2;
3866 case GLSL_SAMPLER_DIM_MS:
3867 return array ? 4 : 3;
3868 case GLSL_SAMPLER_DIM_3D:
3869 case GLSL_SAMPLER_DIM_CUBE:
3870 return 3;
3871 case GLSL_SAMPLER_DIM_RECT:
3872 case GLSL_SAMPLER_DIM_SUBPASS:
3873 return 2;
3874 case GLSL_SAMPLER_DIM_SUBPASS_MS:
3875 return 3;
3876 default:
3877 break;
3878 }
3879 return 0;
3880 }
3881
3882
3883 /* Adjust the sample index according to FMASK.
3884 *
3885 * For uncompressed MSAA surfaces, FMASK should return 0x76543210,
3886 * which is the identity mapping. Each nibble says which physical sample
3887 * should be fetched to get that sample.
3888 *
3889 * For example, 0x11111100 means there are only 2 samples stored and
3890 * the second sample covers 3/4 of the pixel. When reading samples 0
3891 * and 1, return physical sample 0 (determined by the first two 0s
3892 * in FMASK), otherwise return physical sample 1.
3893 *
3894 * The sample index should be adjusted as follows:
3895 * sample_index = (fmask >> (sample_index * 4)) & 0xF;
3896 */
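/* With the example FMASK 0x11111100 above, reading sample 1 yields
 * (0x11111100 >> 4) & 0xF = 0 (physical sample 0) and reading sample 2 yields
 * (0x11111100 >> 8) & 0xF = 1. */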
3897 static Temp adjust_sample_index_using_fmask(isel_context *ctx, bool da, Temp coords, Operand sample_index, Temp fmask_desc_ptr)
3898 {
3899 Builder bld(ctx->program, ctx->block);
3900 Temp fmask = bld.tmp(v1);
3901 unsigned dim = ctx->options->chip_class >= GFX10
3902 ? ac_get_sampler_dim(ctx->options->chip_class, GLSL_SAMPLER_DIM_2D, da)
3903 : 0;
3904
3905 aco_ptr<MIMG_instruction> load{create_instruction<MIMG_instruction>(aco_opcode::image_load, Format::MIMG, 2, 1)};
3906 load->operands[0] = Operand(coords);
3907 load->operands[1] = Operand(fmask_desc_ptr);
3908 load->definitions[0] = Definition(fmask);
3909 load->glc = false;
3910 load->dlc = false;
3911 load->dmask = 0x1;
3912 load->unrm = true;
3913 load->da = da;
3914 load->dim = dim;
3915 load->can_reorder = true; /* fmask images shouldn't be modified */
3916 ctx->block->instructions.emplace_back(std::move(load));
3917
3918 Operand sample_index4;
3919 if (sample_index.isConstant() && sample_index.constantValue() < 16) {
3920 sample_index4 = Operand(sample_index.constantValue() << 2);
3921 } else if (sample_index.regClass() == s1) {
3922 sample_index4 = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), sample_index, Operand(2u));
3923 } else {
3924 assert(sample_index.regClass() == v1);
3925 sample_index4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), sample_index);
3926 }
3927
3928 Temp final_sample;
3929 if (sample_index4.isConstant() && sample_index4.constantValue() == 0)
3930 final_sample = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(15u), fmask);
3931 else if (sample_index4.isConstant() && sample_index4.constantValue() == 28)
3932 final_sample = bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), Operand(28u), fmask);
3933 else
3934 final_sample = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), fmask, sample_index4, Operand(4u));
3935
3936 /* Don't rewrite the sample index if WORD1.DATA_FORMAT of the FMASK
3937 * resource descriptor is 0 (invalid).
3938 */
3939 Temp compare = bld.tmp(s2);
3940 bld.vopc_e64(aco_opcode::v_cmp_lg_u32, Definition(compare),
3941 Operand(0u), emit_extract_vector(ctx, fmask_desc_ptr, 1, s1)).def(0).setHint(vcc);
3942
3943 Temp sample_index_v = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), sample_index);
3944
3945 /* Replace the MSAA sample index. */
3946 return bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), sample_index_v, final_sample, compare);
3947 }
3948
3949 static Temp get_image_coords(isel_context *ctx, const nir_intrinsic_instr *instr, const struct glsl_type *type)
3950 {
3951
3952 Temp src0 = get_ssa_temp(ctx, instr->src[1].ssa);
3953 enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);
3954 bool is_array = glsl_sampler_type_is_array(type);
3955 ASSERTED bool add_frag_pos = (dim == GLSL_SAMPLER_DIM_SUBPASS || dim == GLSL_SAMPLER_DIM_SUBPASS_MS);
3956 assert(!add_frag_pos && "Input attachments should be lowered.");
3957 bool is_ms = (dim == GLSL_SAMPLER_DIM_MS || dim == GLSL_SAMPLER_DIM_SUBPASS_MS);
3958 bool gfx9_1d = ctx->options->chip_class == GFX9 && dim == GLSL_SAMPLER_DIM_1D;
3959 int count = image_type_to_components_count(dim, is_array);
3960 std::vector<Operand> coords(count);
3961
3962 if (is_ms) {
3963 Operand sample_index;
3964 nir_const_value *sample_cv = nir_src_as_const_value(instr->src[2]);
3965 if (sample_cv)
3966 sample_index = Operand(sample_cv->u32);
3967 else
3968 sample_index = Operand(emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[2].ssa), 0, v1));
3969
3970 if (instr->intrinsic == nir_intrinsic_image_deref_load) {
3971 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, is_array ? 3 : 2, 1)};
3972 for (unsigned i = 0; i < vec->operands.size(); i++)
3973 vec->operands[i] = Operand(emit_extract_vector(ctx, src0, i, v1));
3974 Temp fmask_load_address = {ctx->program->allocateId(), is_array ? v3 : v2};
3975 vec->definitions[0] = Definition(fmask_load_address);
3976 ctx->block->instructions.emplace_back(std::move(vec));
3977
3978 Temp fmask_desc_ptr = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_FMASK, nullptr, false, false);
3979 sample_index = Operand(adjust_sample_index_using_fmask(ctx, is_array, fmask_load_address, sample_index, fmask_desc_ptr));
3980 }
3981 count--;
3982 coords[count] = sample_index;
3983 }
3984
3985 if (count == 1 && !gfx9_1d)
3986 return emit_extract_vector(ctx, src0, 0, v1);
3987
3988 if (gfx9_1d) {
3989 coords[0] = Operand(emit_extract_vector(ctx, src0, 0, v1));
3990 coords.resize(coords.size() + 1);
3991 coords[1] = Operand((uint32_t) 0);
3992 if (is_array)
3993 coords[2] = Operand(emit_extract_vector(ctx, src0, 1, v1));
3994 } else {
3995 for (int i = 0; i < count; i++)
3996 coords[i] = Operand(emit_extract_vector(ctx, src0, i, v1));
3997 }
3998
3999 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, coords.size(), 1)};
4000 for (unsigned i = 0; i < coords.size(); i++)
4001 vec->operands[i] = coords[i];
4002 Temp res = {ctx->program->allocateId(), RegClass(RegType::vgpr, coords.size())};
4003 vec->definitions[0] = Definition(res);
4004 ctx->block->instructions.emplace_back(std::move(vec));
4005 return res;
4006 }
4007
4008
4009 void visit_image_load(isel_context *ctx, nir_intrinsic_instr *instr)
4010 {
4011 Builder bld(ctx->program, ctx->block);
4012 const nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
4013 const struct glsl_type *type = glsl_without_array(var->type);
4014 const enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);
4015 bool is_array = glsl_sampler_type_is_array(type);
4016 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
4017
4018 if (dim == GLSL_SAMPLER_DIM_BUF) {
4019 unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa);
4020 unsigned num_channels = util_last_bit(mask);
4021 Temp rsrc = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_BUFFER, nullptr, true, true);
4022 Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
4023
4024 aco_opcode opcode;
4025 switch (num_channels) {
4026 case 1:
4027 opcode = aco_opcode::buffer_load_format_x;
4028 break;
4029 case 2:
4030 opcode = aco_opcode::buffer_load_format_xy;
4031 break;
4032 case 3:
4033 opcode = aco_opcode::buffer_load_format_xyz;
4034 break;
4035 case 4:
4036 opcode = aco_opcode::buffer_load_format_xyzw;
4037 break;
4038 default:
4039 unreachable(">4 channel buffer image load");
4040 }
4041 aco_ptr<MUBUF_instruction> load{create_instruction<MUBUF_instruction>(opcode, Format::MUBUF, 3, 1)};
4042 load->operands[0] = Operand(vindex);
4043 load->operands[1] = Operand(rsrc);
4044 load->operands[2] = Operand((uint32_t) 0);
4045 Temp tmp;
4046 if (num_channels == instr->dest.ssa.num_components && dst.type() == RegType::vgpr)
4047 tmp = dst;
4048 else
4049 tmp = {ctx->program->allocateId(), RegClass(RegType::vgpr, num_channels)};
4050 load->definitions[0] = Definition(tmp);
4051 load->idxen = true;
4052 load->glc = var->data.access & (ACCESS_VOLATILE | ACCESS_COHERENT);
4053 load->dlc = load->glc && ctx->options->chip_class >= GFX10;
4054 load->barrier = barrier_image;
4055 ctx->block->instructions.emplace_back(std::move(load));
4056
4057 expand_vector(ctx, tmp, dst, instr->dest.ssa.num_components, (1 << num_channels) - 1);
4058 return;
4059 }
4060
4061 Temp coords = get_image_coords(ctx, instr, type);
4062 Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_IMAGE, nullptr, true, true);
4063
4064 unsigned dmask = nir_ssa_def_components_read(&instr->dest.ssa);
4065 unsigned num_components = util_bitcount(dmask);
4066 Temp tmp;
4067 if (num_components == instr->dest.ssa.num_components && dst.type() == RegType::vgpr)
4068 tmp = dst;
4069 else
4070 tmp = {ctx->program->allocateId(), RegClass(RegType::vgpr, num_components)};
4071
4072 aco_ptr<MIMG_instruction> load{create_instruction<MIMG_instruction>(aco_opcode::image_load, Format::MIMG, 2, 1)};
4073 load->operands[0] = Operand(coords);
4074 load->operands[1] = Operand(resource);
4075 load->definitions[0] = Definition(tmp);
4076 load->glc = var->data.access & (ACCESS_VOLATILE | ACCESS_COHERENT) ? 1 : 0;
4077 load->dlc = load->glc && ctx->options->chip_class >= GFX10;
4078 load->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);
4079 load->dmask = dmask;
4080 load->unrm = true;
4081 load->da = should_declare_array(ctx, dim, glsl_sampler_type_is_array(type));
4082 load->barrier = barrier_image;
4083 ctx->block->instructions.emplace_back(std::move(load));
4084
4085 expand_vector(ctx, tmp, dst, instr->dest.ssa.num_components, dmask);
4086 return;
4087 }
4088
4089 void visit_image_store(isel_context *ctx, nir_intrinsic_instr *instr)
4090 {
4091 const nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
4092 const struct glsl_type *type = glsl_without_array(var->type);
4093 const enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);
4094 bool is_array = glsl_sampler_type_is_array(type);
4095 Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[3].ssa));
4096
4097 bool glc = ctx->options->chip_class == GFX6 || var->data.access & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE) ? 1 : 0;
4098
4099 if (dim == GLSL_SAMPLER_DIM_BUF) {
4100 Temp rsrc = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_BUFFER, nullptr, true, true);
4101 Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
4102 aco_opcode opcode;
4103 switch (data.size()) {
4104 case 1:
4105 opcode = aco_opcode::buffer_store_format_x;
4106 break;
4107 case 2:
4108 opcode = aco_opcode::buffer_store_format_xy;
4109 break;
4110 case 3:
4111 opcode = aco_opcode::buffer_store_format_xyz;
4112 break;
4113 case 4:
4114 opcode = aco_opcode::buffer_store_format_xyzw;
4115 break;
4116 default:
4117 unreachable(">4 channel buffer image store");
4118 }
4119 aco_ptr<MUBUF_instruction> store{create_instruction<MUBUF_instruction>(opcode, Format::MUBUF, 4, 0)};
4120 store->operands[0] = Operand(vindex);
4121 store->operands[1] = Operand(rsrc);
4122 store->operands[2] = Operand((uint32_t) 0);
4123 store->operands[3] = Operand(data);
4124 store->idxen = true;
4125 store->glc = glc;
4126 store->dlc = false;
4127 store->disable_wqm = true;
4128 store->barrier = barrier_image;
4129 ctx->program->needs_exact = true;
4130 ctx->block->instructions.emplace_back(std::move(store));
4131 return;
4132 }
4133
4134 assert(data.type() == RegType::vgpr);
4135 Temp coords = get_image_coords(ctx, instr, type);
4136 Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_IMAGE, nullptr, true, true);
4137
4138 aco_ptr<MIMG_instruction> store{create_instruction<MIMG_instruction>(aco_opcode::image_store, Format::MIMG, 4, 0)};
4139 store->operands[0] = Operand(coords);
4140 store->operands[1] = Operand(resource);
4141 store->operands[2] = Operand(s4);
4142 store->operands[3] = Operand(data);
4143 store->glc = glc;
4144 store->dlc = false;
4145 store->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);
4146 store->dmask = (1 << data.size()) - 1;
4147 store->unrm = true;
4148 store->da = should_declare_array(ctx, dim, glsl_sampler_type_is_array(type));
4149 store->disable_wqm = true;
4150 store->barrier = barrier_image;
4151 ctx->program->needs_exact = true;
4152 ctx->block->instructions.emplace_back(std::move(store));
4153 return;
4154 }
4155
4156 void visit_image_atomic(isel_context *ctx, nir_intrinsic_instr *instr)
4157 {
4158 /* return the previous value if dest is ever used */
4159 bool return_previous = false;
4160 nir_foreach_use_safe(use_src, &instr->dest.ssa) {
4161 return_previous = true;
4162 break;
4163 }
4164 nir_foreach_if_use_safe(use_src, &instr->dest.ssa) {
4165 return_previous = true;
4166 break;
4167 }
4168
4169 const nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
4170 const struct glsl_type *type = glsl_without_array(var->type);
4171 const enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);
4172 bool is_array = glsl_sampler_type_is_array(type);
4173 Builder bld(ctx->program, ctx->block);
4174
4175 Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[3].ssa));
4176 assert(data.size() == 1 && "64bit ssbo atomics not yet implemented.");
4177
4178 if (instr->intrinsic == nir_intrinsic_image_deref_atomic_comp_swap)
4179 data = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), get_ssa_temp(ctx, instr->src[4].ssa), data);
4180
4181 aco_opcode buf_op, image_op;
4182 switch (instr->intrinsic) {
4183 case nir_intrinsic_image_deref_atomic_add:
4184 buf_op = aco_opcode::buffer_atomic_add;
4185 image_op = aco_opcode::image_atomic_add;
4186 break;
4187 case nir_intrinsic_image_deref_atomic_umin:
4188 buf_op = aco_opcode::buffer_atomic_umin;
4189 image_op = aco_opcode::image_atomic_umin;
4190 break;
4191 case nir_intrinsic_image_deref_atomic_imin:
4192 buf_op = aco_opcode::buffer_atomic_smin;
4193 image_op = aco_opcode::image_atomic_smin;
4194 break;
4195 case nir_intrinsic_image_deref_atomic_umax:
4196 buf_op = aco_opcode::buffer_atomic_umax;
4197 image_op = aco_opcode::image_atomic_umax;
4198 break;
4199 case nir_intrinsic_image_deref_atomic_imax:
4200 buf_op = aco_opcode::buffer_atomic_smax;
4201 image_op = aco_opcode::image_atomic_smax;
4202 break;
4203 case nir_intrinsic_image_deref_atomic_and:
4204 buf_op = aco_opcode::buffer_atomic_and;
4205 image_op = aco_opcode::image_atomic_and;
4206 break;
4207 case nir_intrinsic_image_deref_atomic_or:
4208 buf_op = aco_opcode::buffer_atomic_or;
4209 image_op = aco_opcode::image_atomic_or;
4210 break;
4211 case nir_intrinsic_image_deref_atomic_xor:
4212 buf_op = aco_opcode::buffer_atomic_xor;
4213 image_op = aco_opcode::image_atomic_xor;
4214 break;
4215 case nir_intrinsic_image_deref_atomic_exchange:
4216 buf_op = aco_opcode::buffer_atomic_swap;
4217 image_op = aco_opcode::image_atomic_swap;
4218 break;
4219 case nir_intrinsic_image_deref_atomic_comp_swap:
4220 buf_op = aco_opcode::buffer_atomic_cmpswap;
4221 image_op = aco_opcode::image_atomic_cmpswap;
4222 break;
4223 default:
4224 unreachable("visit_image_atomic should only be called with nir_intrinsic_image_deref_atomic_* instructions.");
4225 }
4226
4227 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
4228
4229 if (dim == GLSL_SAMPLER_DIM_BUF) {
4230 Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
4231 Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_BUFFER, nullptr, true, true);
4232 //assert(ctx->options->chip_class < GFX9 && "GFX9 stride size workaround not yet implemented.");
4233 aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(buf_op, Format::MUBUF, 4, return_previous ? 1 : 0)};
4234 mubuf->operands[0] = Operand(vindex);
4235 mubuf->operands[1] = Operand(resource);
4236 mubuf->operands[2] = Operand((uint32_t)0);
4237 mubuf->operands[3] = Operand(data);
4238 if (return_previous)
4239 mubuf->definitions[0] = Definition(dst);
4240 mubuf->offset = 0;
4241 mubuf->idxen = true;
4242 mubuf->glc = return_previous;
4243 mubuf->dlc = false; /* Not needed for atomics */
4244 mubuf->disable_wqm = true;
4245 mubuf->barrier = barrier_image;
4246 ctx->program->needs_exact = true;
4247 ctx->block->instructions.emplace_back(std::move(mubuf));
4248 return;
4249 }
4250
4251 Temp coords = get_image_coords(ctx, instr, type);
4252 Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_IMAGE, nullptr, true, true);
4253 aco_ptr<MIMG_instruction> mimg{create_instruction<MIMG_instruction>(image_op, Format::MIMG, 4, return_previous ? 1 : 0)};
4254 mimg->operands[0] = Operand(coords);
4255 mimg->operands[1] = Operand(resource);
4256 mimg->operands[2] = Operand(s4); /* no sampler */
4257 mimg->operands[3] = Operand(data);
4258 if (return_previous)
4259 mimg->definitions[0] = Definition(dst);
4260 mimg->glc = return_previous;
4261 mimg->dlc = false; /* Not needed for atomics */
4262 mimg->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);
4263 mimg->dmask = (1 << data.size()) - 1;
4264 mimg->unrm = true;
4265 mimg->da = should_declare_array(ctx, dim, glsl_sampler_type_is_array(type));
4266 mimg->disable_wqm = true;
4267 mimg->barrier = barrier_image;
4268 ctx->program->needs_exact = true;
4269 ctx->block->instructions.emplace_back(std::move(mimg));
4270 return;
4271 }
4272
4273 void get_buffer_size(isel_context *ctx, Temp desc, Temp dst, bool in_elements)
4274 {
4275 if (in_elements && ctx->options->chip_class == GFX8) {
4276 Builder bld(ctx->program, ctx->block);
4277
4278 Temp stride = emit_extract_vector(ctx, desc, 1, s1);
4279 stride = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), stride, Operand((5u << 16) | 16u));
4280 stride = bld.vop1(aco_opcode::v_cvt_f32_ubyte0, bld.def(v1), stride);
4281 stride = bld.vop1(aco_opcode::v_rcp_iflag_f32, bld.def(v1), stride);
4282
4283 Temp size = emit_extract_vector(ctx, desc, 2, s1);
4284 size = bld.vop1(aco_opcode::v_cvt_f32_u32, bld.def(v1), size);
4285
4286 Temp res = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), size, stride);
4287 res = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), res);
4288 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), res);
4289
4290       // TODO: we can probably calculate this faster on the scalar unit, i.e. size / stride for stride in {1,2,4,8,12,16}
4291       /* idea
4292        * for strides 1,2,4,8,16, the result is just (size >> s_ff1_i32_b32(stride))
4293 * in case 12 (or 3?), we have to divide by 3:
4294 * set v_skip in case it's 12 (if we also have to take care of 3, shift first)
4295 * use v_mul_hi_u32 with magic number to divide
4296 * we need some pseudo merge opcode to overwrite the original SALU result with readfirstlane
4297 * disable v_skip
4298 * total: 6 SALU + 2 VALU instructions vs 1 SALU + 6 VALU instructions
4299 */
4300
4301 } else {
4302 emit_extract_vector(ctx, desc, 2, dst);
4303 }
4304 }
4305
4306 void visit_image_size(isel_context *ctx, nir_intrinsic_instr *instr)
4307 {
4308 const nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
4309 const struct glsl_type *type = glsl_without_array(var->type);
4310 const enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);
4311 bool is_array = glsl_sampler_type_is_array(type);
4312 Builder bld(ctx->program, ctx->block);
4313
4314 if (glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_BUF) {
4315 Temp desc = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_BUFFER, NULL, true, false);
4316 return get_buffer_size(ctx, desc, get_ssa_temp(ctx, &instr->dest.ssa), true);
4317 }
4318
4319 /* LOD */
4320 Temp lod = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0u));
4321
4322 /* Resource */
4323 Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_IMAGE, NULL, true, false);
4324
4325 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
4326
4327 aco_ptr<MIMG_instruction> mimg{create_instruction<MIMG_instruction>(aco_opcode::image_get_resinfo, Format::MIMG, 2, 1)};
4328 mimg->operands[0] = Operand(lod);
4329 mimg->operands[1] = Operand(resource);
4330 unsigned& dmask = mimg->dmask;
4331 mimg->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);
4332 mimg->dmask = (1 << instr->dest.ssa.num_components) - 1;
4333 mimg->da = glsl_sampler_type_is_array(type);
4334 mimg->can_reorder = true;
4335 Definition& def = mimg->definitions[0];
4336 ctx->block->instructions.emplace_back(std::move(mimg));
4337
4338 if (glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_CUBE &&
4339 glsl_sampler_type_is_array(type)) {
4340
4341 assert(instr->dest.ssa.num_components == 3);
4342 Temp tmp = {ctx->program->allocateId(), v3};
4343 def = Definition(tmp);
4344 emit_split_vector(ctx, tmp, 3);
4345
4346 /* divide 3rd value by 6 by multiplying with magic number */
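      /* 0x2AAAAAAB == ceil(2^32 / 6), so v_mul_hi_i32(x, 0x2AAAAAAB) == x / 6 exactly for
       * non-negative multiples of 6, which cube array layer-face counts always are. */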
4347 Temp c = bld.copy(bld.def(s1), Operand((uint32_t) 0x2AAAAAAB));
4348 Temp by_6 = bld.vop3(aco_opcode::v_mul_hi_i32, bld.def(v1), emit_extract_vector(ctx, tmp, 2, v1), c);
4349
4350 bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
4351 emit_extract_vector(ctx, tmp, 0, v1),
4352 emit_extract_vector(ctx, tmp, 1, v1),
4353 by_6);
4354
4355 } else if (ctx->options->chip_class == GFX9 &&
4356 glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_1D &&
4357 glsl_sampler_type_is_array(type)) {
4358 assert(instr->dest.ssa.num_components == 2);
4359 def = Definition(dst);
4360 dmask = 0x5;
4361 } else {
4362 def = Definition(dst);
4363 }
4364
4365 emit_split_vector(ctx, dst, instr->dest.ssa.num_components);
4366 }
4367
4368 void visit_load_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
4369 {
4370 Builder bld(ctx->program, ctx->block);
4371 unsigned num_components = instr->num_components;
4372
4373 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
4374 Temp rsrc = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
4375 rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u));
4376
4377 bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT);
4378 load_buffer(ctx, num_components, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa), glc, false);
4379 }
4380
4381 void visit_store_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
4382 {
4383 Builder bld(ctx->program, ctx->block);
4384 Temp data = get_ssa_temp(ctx, instr->src[0].ssa);
4385 unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
4386 unsigned writemask = nir_intrinsic_write_mask(instr);
4387
4388 Temp offset;
4389 if (ctx->options->chip_class < GFX8)
4390 offset = as_vgpr(ctx,get_ssa_temp(ctx, instr->src[2].ssa));
4391 else
4392 offset = get_ssa_temp(ctx, instr->src[2].ssa);
4393
4394 Temp rsrc = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
4395 rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u));
4396
4397 bool smem = !ctx->divergent_vals[instr->src[2].ssa->index] &&
4398 ctx->options->chip_class >= GFX8;
4399 if (smem)
4400 offset = bld.as_uniform(offset);
4401 bool smem_nonfs = smem && ctx->stage != fragment_fs;
4402
4403 while (writemask) {
4404 int start, count;
4405 u_bit_scan_consecutive_range(&writemask, &start, &count);
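      /* SMEM has no s_buffer_store_dwordx3: emit a dwordx2 now and push the bit for the
       * third dword back into the writemask so a later iteration stores it. */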
4406 if (count == 3 && smem) {
4407 writemask |= 1u << (start + 2);
4408 count = 2;
4409 }
4410 int num_bytes = count * elem_size_bytes;
4411
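      /* Buffer stores handle at most 16 bytes (dwordx4). With 64-bit components this caps
       * the store at two components and re-inserts the writemask bits of the remaining
       * (count - 2) components, e.g. count == 4 re-adds 0b11 << (start + 2). */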
4412 if (num_bytes > 16) {
4413 assert(elem_size_bytes == 8);
4414 writemask |= (((count - 2) << 1) - 1) << (start + 2);
4415 count = 2;
4416 num_bytes = 16;
4417 }
4418
4419 // TODO: check alignment of sub-dword stores
4420 // TODO: split 3 bytes. there is no store instruction for that
4421
4422 Temp write_data;
4423 if (count != instr->num_components) {
4424 emit_split_vector(ctx, data, instr->num_components);
4425 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
4426 for (int i = 0; i < count; i++) {
4427 Temp elem = emit_extract_vector(ctx, data, start + i, RegClass(data.type(), elem_size_bytes / 4));
4428 vec->operands[i] = Operand(smem_nonfs ? bld.as_uniform(elem) : elem);
4429 }
4430 write_data = bld.tmp(smem_nonfs ? RegType::sgpr : data.type(), count * elem_size_bytes / 4);
4431 vec->definitions[0] = Definition(write_data);
4432 ctx->block->instructions.emplace_back(std::move(vec));
4433 } else if (!smem && data.type() != RegType::vgpr) {
4434 assert(num_bytes % 4 == 0);
4435 write_data = bld.copy(bld.def(RegType::vgpr, num_bytes / 4), data);
4436 } else if (smem_nonfs && data.type() == RegType::vgpr) {
4437 assert(num_bytes % 4 == 0);
4438 write_data = bld.as_uniform(data);
4439 } else {
4440 write_data = data;
4441 }
4442
4443 aco_opcode vmem_op, smem_op;
4444 switch (num_bytes) {
4445 case 4:
4446 vmem_op = aco_opcode::buffer_store_dword;
4447 smem_op = aco_opcode::s_buffer_store_dword;
4448 break;
4449 case 8:
4450 vmem_op = aco_opcode::buffer_store_dwordx2;
4451 smem_op = aco_opcode::s_buffer_store_dwordx2;
4452 break;
4453 case 12:
4454 vmem_op = aco_opcode::buffer_store_dwordx3;
4455 smem_op = aco_opcode::last_opcode;
4456 assert(!smem);
4457 break;
4458 case 16:
4459 vmem_op = aco_opcode::buffer_store_dwordx4;
4460 smem_op = aco_opcode::s_buffer_store_dwordx4;
4461 break;
4462 default:
4463 unreachable("Store SSBO not implemented for this size.");
4464 }
4465 if (ctx->stage == fragment_fs)
4466 smem_op = aco_opcode::p_fs_buffer_store_smem;
4467
4468 if (smem) {
4469 aco_ptr<SMEM_instruction> store{create_instruction<SMEM_instruction>(smem_op, Format::SMEM, 3, 0)};
4470 store->operands[0] = Operand(rsrc);
4471 if (start) {
4472 Temp off = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
4473 offset, Operand(start * elem_size_bytes));
4474 store->operands[1] = Operand(off);
4475 } else {
4476 store->operands[1] = Operand(offset);
4477 }
4478 if (smem_op != aco_opcode::p_fs_buffer_store_smem)
4479 store->operands[1].setFixed(m0);
4480 store->operands[2] = Operand(write_data);
4481 store->glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
4482 store->dlc = false;
4483 store->disable_wqm = true;
4484 store->barrier = barrier_buffer;
4485 ctx->block->instructions.emplace_back(std::move(store));
4486 ctx->program->wb_smem_l1_on_end = true;
4487 if (smem_op == aco_opcode::p_fs_buffer_store_smem) {
4488 ctx->block->kind |= block_kind_needs_lowering;
4489 ctx->program->needs_exact = true;
4490 }
4491 } else {
4492 aco_ptr<MUBUF_instruction> store{create_instruction<MUBUF_instruction>(vmem_op, Format::MUBUF, 4, 0)};
4493 store->operands[0] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
4494 store->operands[1] = Operand(rsrc);
4495 store->operands[2] = offset.type() == RegType::sgpr ? Operand(offset) : Operand((uint32_t) 0);
4496 store->operands[3] = Operand(write_data);
4497 store->offset = start * elem_size_bytes;
4498 store->offen = (offset.type() == RegType::vgpr);
4499 store->glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
4500 store->dlc = false;
4501 store->disable_wqm = true;
4502 store->barrier = barrier_buffer;
4503 ctx->program->needs_exact = true;
4504 ctx->block->instructions.emplace_back(std::move(store));
4505 }
4506 }
4507 }
4508
4509 void visit_atomic_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
4510 {
4511 /* return the previous value if dest is ever used */
4512 bool return_previous = false;
4513 nir_foreach_use_safe(use_src, &instr->dest.ssa) {
4514 return_previous = true;
4515 break;
4516 }
4517 nir_foreach_if_use_safe(use_src, &instr->dest.ssa) {
4518 return_previous = true;
4519 break;
4520 }
4521
4522 Builder bld(ctx->program, ctx->block);
4523 Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[2].ssa));
4524
4525 if (instr->intrinsic == nir_intrinsic_ssbo_atomic_comp_swap)
4526 data = bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, data.size() * 2),
4527 get_ssa_temp(ctx, instr->src[3].ssa), data);
4528
4529 Temp offset;
4530 if (ctx->options->chip_class < GFX8)
4531 offset = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
4532 else
4533 offset = get_ssa_temp(ctx, instr->src[1].ssa);
4534
4535 Temp rsrc = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
4536 rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u));
4537
4538 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
4539
4540 aco_opcode op32, op64;
4541 switch (instr->intrinsic) {
4542 case nir_intrinsic_ssbo_atomic_add:
4543 op32 = aco_opcode::buffer_atomic_add;
4544 op64 = aco_opcode::buffer_atomic_add_x2;
4545 break;
4546 case nir_intrinsic_ssbo_atomic_imin:
4547 op32 = aco_opcode::buffer_atomic_smin;
4548 op64 = aco_opcode::buffer_atomic_smin_x2;
4549 break;
4550 case nir_intrinsic_ssbo_atomic_umin:
4551 op32 = aco_opcode::buffer_atomic_umin;
4552 op64 = aco_opcode::buffer_atomic_umin_x2;
4553 break;
4554 case nir_intrinsic_ssbo_atomic_imax:
4555 op32 = aco_opcode::buffer_atomic_smax;
4556 op64 = aco_opcode::buffer_atomic_smax_x2;
4557 break;
4558 case nir_intrinsic_ssbo_atomic_umax:
4559 op32 = aco_opcode::buffer_atomic_umax;
4560 op64 = aco_opcode::buffer_atomic_umax_x2;
4561 break;
4562 case nir_intrinsic_ssbo_atomic_and:
4563 op32 = aco_opcode::buffer_atomic_and;
4564 op64 = aco_opcode::buffer_atomic_and_x2;
4565 break;
4566 case nir_intrinsic_ssbo_atomic_or:
4567 op32 = aco_opcode::buffer_atomic_or;
4568 op64 = aco_opcode::buffer_atomic_or_x2;
4569 break;
4570 case nir_intrinsic_ssbo_atomic_xor:
4571 op32 = aco_opcode::buffer_atomic_xor;
4572 op64 = aco_opcode::buffer_atomic_xor_x2;
4573 break;
4574 case nir_intrinsic_ssbo_atomic_exchange:
4575 op32 = aco_opcode::buffer_atomic_swap;
4576 op64 = aco_opcode::buffer_atomic_swap_x2;
4577 break;
4578 case nir_intrinsic_ssbo_atomic_comp_swap:
4579 op32 = aco_opcode::buffer_atomic_cmpswap;
4580 op64 = aco_opcode::buffer_atomic_cmpswap_x2;
4581 break;
4582 default:
4583 unreachable("visit_atomic_ssbo should only be called with nir_intrinsic_ssbo_atomic_* instructions.");
4584 }
4585 aco_opcode op = instr->dest.ssa.bit_size == 32 ? op32 : op64;
4586 aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, return_previous ? 1 : 0)};
4587 mubuf->operands[0] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
4588 mubuf->operands[1] = Operand(rsrc);
4589 mubuf->operands[2] = offset.type() == RegType::sgpr ? Operand(offset) : Operand((uint32_t) 0);
4590 mubuf->operands[3] = Operand(data);
4591 if (return_previous)
4592 mubuf->definitions[0] = Definition(dst);
4593 mubuf->offset = 0;
4594 mubuf->offen = (offset.type() == RegType::vgpr);
4595 mubuf->glc = return_previous;
4596 mubuf->dlc = false; /* Not needed for atomics */
4597 mubuf->disable_wqm = true;
4598 mubuf->barrier = barrier_buffer;
4599 ctx->program->needs_exact = true;
4600 ctx->block->instructions.emplace_back(std::move(mubuf));
4601 }
4602
4603 void visit_get_buffer_size(isel_context *ctx, nir_intrinsic_instr *instr) {
4604
4605 Temp index = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
4606 Builder bld(ctx->program, ctx->block);
4607 Temp desc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), index, Operand(0u));
4608 get_buffer_size(ctx, desc, get_ssa_temp(ctx, &instr->dest.ssa), false);
4609 }
4610
4611 void visit_load_global(isel_context *ctx, nir_intrinsic_instr *instr)
4612 {
4613 Builder bld(ctx->program, ctx->block);
4614 unsigned num_components = instr->num_components;
4615 unsigned num_bytes = num_components * instr->dest.ssa.bit_size / 8;
4616
4617 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
4618 Temp addr = get_ssa_temp(ctx, instr->src[0].ssa);
4619
4620 bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT);
4621 bool dlc = glc && ctx->options->chip_class >= GFX10;
4622 aco_opcode op;
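   /* Use scalar (SMEM) loads only for uniform results; coherent/volatile loads before GFX8
    * also take the FLAT path since the SMEM path cannot honor glc there (see the assert in
    * the SMEM branch below). */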
4623 if (dst.type() == RegType::vgpr || (glc && ctx->options->chip_class < GFX8)) {
4624 bool global = ctx->options->chip_class >= GFX9;
4625 aco_opcode op;
4626 switch (num_bytes) {
4627 case 4:
4628 op = global ? aco_opcode::global_load_dword : aco_opcode::flat_load_dword;
4629 break;
4630 case 8:
4631 op = global ? aco_opcode::global_load_dwordx2 : aco_opcode::flat_load_dwordx2;
4632 break;
4633 case 12:
4634 op = global ? aco_opcode::global_load_dwordx3 : aco_opcode::flat_load_dwordx3;
4635 break;
4636 case 16:
4637 op = global ? aco_opcode::global_load_dwordx4 : aco_opcode::flat_load_dwordx4;
4638 break;
4639 default:
4640 unreachable("load_global not implemented for this size.");
4641 }
4642 aco_ptr<FLAT_instruction> flat{create_instruction<FLAT_instruction>(op, global ? Format::GLOBAL : Format::FLAT, 2, 1)};
4643 flat->operands[0] = Operand(addr);
4644 flat->operands[1] = Operand(s1);
4645 flat->glc = glc;
4646 flat->dlc = dlc;
4647 flat->barrier = barrier_buffer;
4648
4649 if (dst.type() == RegType::sgpr) {
4650 Temp vec = bld.tmp(RegType::vgpr, dst.size());
4651 flat->definitions[0] = Definition(vec);
4652 ctx->block->instructions.emplace_back(std::move(flat));
4653 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec);
4654 } else {
4655 flat->definitions[0] = Definition(dst);
4656 ctx->block->instructions.emplace_back(std::move(flat));
4657 }
4658 emit_split_vector(ctx, dst, num_components);
4659 } else {
4660 switch (num_bytes) {
4661 case 4:
4662 op = aco_opcode::s_load_dword;
4663 break;
4664 case 8:
4665 op = aco_opcode::s_load_dwordx2;
4666 break;
4667 case 12:
4668 case 16:
4669 op = aco_opcode::s_load_dwordx4;
4670 break;
4671 default:
4672 unreachable("load_global not implemented for this size.");
4673 }
4674 aco_ptr<SMEM_instruction> load{create_instruction<SMEM_instruction>(op, Format::SMEM, 2, 1)};
4675 load->operands[0] = Operand(addr);
4676 load->operands[1] = Operand(0u);
4677 load->definitions[0] = Definition(dst);
4678 load->glc = glc;
4679 load->dlc = dlc;
4680 load->barrier = barrier_buffer;
4681 assert(ctx->options->chip_class >= GFX8 || !glc);
4682
4683 if (dst.size() == 3) {
4684 /* trim vector */
4685 Temp vec = bld.tmp(s4);
4686 load->definitions[0] = Definition(vec);
4687 ctx->block->instructions.emplace_back(std::move(load));
4688 emit_split_vector(ctx, vec, 4);
4689
4690 bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
4691 emit_extract_vector(ctx, vec, 0, s1),
4692 emit_extract_vector(ctx, vec, 1, s1),
4693 emit_extract_vector(ctx, vec, 2, s1));
4694 } else {
4695 ctx->block->instructions.emplace_back(std::move(load));
4696 }
4697 }
4698 }
4699
4700 void visit_store_global(isel_context *ctx, nir_intrinsic_instr *instr)
4701 {
4702 Builder bld(ctx->program, ctx->block);
4703 unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
4704
4705 Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
4706 Temp addr = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
4707
4708 unsigned writemask = nir_intrinsic_write_mask(instr);
4709 while (writemask) {
4710 int start, count;
4711 u_bit_scan_consecutive_range(&writemask, &start, &count);
4712 unsigned num_bytes = count * elem_size_bytes;
4713
4714 Temp write_data = data;
4715 if (count != instr->num_components) {
4716 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
4717 for (int i = 0; i < count; i++)
4718 vec->operands[i] = Operand(emit_extract_vector(ctx, data, start + i, v1));
4719 write_data = bld.tmp(RegType::vgpr, count);
4720 vec->definitions[0] = Definition(write_data);
4721 ctx->block->instructions.emplace_back(std::move(vec));
4722 }
4723
4724 unsigned offset = start * elem_size_bytes;
4725 if (offset > 0 && ctx->options->chip_class < GFX9) {
4726 Temp addr0 = bld.tmp(v1), addr1 = bld.tmp(v1);
4727 Temp new_addr0 = bld.tmp(v1), new_addr1 = bld.tmp(v1);
4728 Temp carry = bld.tmp(s2);
4729 bld.pseudo(aco_opcode::p_split_vector, Definition(addr0), Definition(addr1), addr);
4730
4731 bld.vop2(aco_opcode::v_add_co_u32, Definition(new_addr0), bld.hint_vcc(Definition(carry)),
4732 Operand(offset), addr0);
4733 bld.vop2(aco_opcode::v_addc_co_u32, Definition(new_addr1), bld.def(s2),
4734 Operand(0u), addr1,
4735 carry).def(1).setHint(vcc);
4736
4737 addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), new_addr0, new_addr1);
4738
4739 offset = 0;
4740 }
4741
4742 bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
4743 bool global = ctx->options->chip_class >= GFX9;
4744 aco_opcode op;
4745 switch (num_bytes) {
4746 case 4:
4747 op = global ? aco_opcode::global_store_dword : aco_opcode::flat_store_dword;
4748 break;
4749 case 8:
4750 op = global ? aco_opcode::global_store_dwordx2 : aco_opcode::flat_store_dwordx2;
4751 break;
4752 case 12:
4753 op = global ? aco_opcode::global_store_dwordx3 : aco_opcode::flat_store_dwordx3;
4754 break;
4755 case 16:
4756 op = global ? aco_opcode::global_store_dwordx4 : aco_opcode::flat_store_dwordx4;
4757 break;
4758 default:
4759 unreachable("store_global not implemented for this size.");
4760 }
4761 aco_ptr<FLAT_instruction> flat{create_instruction<FLAT_instruction>(op, global ? Format::GLOBAL : Format::FLAT, 3, 0)};
4762 flat->operands[0] = Operand(addr);
4763 flat->operands[1] = Operand(s1);
4764 flat->operands[2] = Operand(data);
4765 flat->glc = glc;
4766 flat->dlc = false;
4767 flat->offset = offset;
4768 flat->disable_wqm = true;
4769 flat->barrier = barrier_buffer;
4770 ctx->program->needs_exact = true;
4771 ctx->block->instructions.emplace_back(std::move(flat));
4772 }
4773 }
4774
4775 void visit_global_atomic(isel_context *ctx, nir_intrinsic_instr *instr)
4776 {
4777 /* return the previous value if dest is ever used */
4778 bool return_previous = false;
4779 nir_foreach_use_safe(use_src, &instr->dest.ssa) {
4780 return_previous = true;
4781 break;
4782 }
4783 nir_foreach_if_use_safe(use_src, &instr->dest.ssa) {
4784 return_previous = true;
4785 break;
4786 }
4787
4788 Builder bld(ctx->program, ctx->block);
4789 Temp addr = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
4790 Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
4791
4792 if (instr->intrinsic == nir_intrinsic_global_atomic_comp_swap)
4793 data = bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, data.size() * 2),
4794 get_ssa_temp(ctx, instr->src[2].ssa), data);
4795
4796 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
4797
4798 bool global = ctx->options->chip_class >= GFX9;
4799 aco_opcode op32, op64;
4800 switch (instr->intrinsic) {
4801 case nir_intrinsic_global_atomic_add:
4802 op32 = global ? aco_opcode::global_atomic_add : aco_opcode::flat_atomic_add;
4803 op64 = global ? aco_opcode::global_atomic_add_x2 : aco_opcode::flat_atomic_add_x2;
4804 break;
4805 case nir_intrinsic_global_atomic_imin:
4806 op32 = global ? aco_opcode::global_atomic_smin : aco_opcode::flat_atomic_smin;
4807 op64 = global ? aco_opcode::global_atomic_smin_x2 : aco_opcode::flat_atomic_smin_x2;
4808 break;
4809 case nir_intrinsic_global_atomic_umin:
4810 op32 = global ? aco_opcode::global_atomic_umin : aco_opcode::flat_atomic_umin;
4811 op64 = global ? aco_opcode::global_atomic_umin_x2 : aco_opcode::flat_atomic_umin_x2;
4812 break;
4813 case nir_intrinsic_global_atomic_imax:
4814 op32 = global ? aco_opcode::global_atomic_smax : aco_opcode::flat_atomic_smax;
4815 op64 = global ? aco_opcode::global_atomic_smax_x2 : aco_opcode::flat_atomic_smax_x2;
4816 break;
4817 case nir_intrinsic_global_atomic_umax:
4818 op32 = global ? aco_opcode::global_atomic_umax : aco_opcode::flat_atomic_umax;
4819 op64 = global ? aco_opcode::global_atomic_umax_x2 : aco_opcode::flat_atomic_umax_x2;
4820 break;
4821 case nir_intrinsic_global_atomic_and:
4822 op32 = global ? aco_opcode::global_atomic_and : aco_opcode::flat_atomic_and;
4823 op64 = global ? aco_opcode::global_atomic_and_x2 : aco_opcode::flat_atomic_and_x2;
4824 break;
4825 case nir_intrinsic_global_atomic_or:
4826 op32 = global ? aco_opcode::global_atomic_or : aco_opcode::flat_atomic_or;
4827 op64 = global ? aco_opcode::global_atomic_or_x2 : aco_opcode::flat_atomic_or_x2;
4828 break;
4829 case nir_intrinsic_global_atomic_xor:
4830 op32 = global ? aco_opcode::global_atomic_xor : aco_opcode::flat_atomic_xor;
4831 op64 = global ? aco_opcode::global_atomic_xor_x2 : aco_opcode::flat_atomic_xor_x2;
4832 break;
4833 case nir_intrinsic_global_atomic_exchange:
4834 op32 = global ? aco_opcode::global_atomic_swap : aco_opcode::flat_atomic_swap;
4835 op64 = global ? aco_opcode::global_atomic_swap_x2 : aco_opcode::flat_atomic_swap_x2;
4836 break;
4837 case nir_intrinsic_global_atomic_comp_swap:
4838 op32 = global ? aco_opcode::global_atomic_cmpswap : aco_opcode::flat_atomic_cmpswap;
4839 op64 = global ? aco_opcode::global_atomic_cmpswap_x2 : aco_opcode::flat_atomic_cmpswap_x2;
4840 break;
4841 default:
4842       unreachable("visit_global_atomic should only be called with nir_intrinsic_global_atomic_* instructions.");
4843 }
4844 aco_opcode op = instr->dest.ssa.bit_size == 32 ? op32 : op64;
4845 aco_ptr<FLAT_instruction> flat{create_instruction<FLAT_instruction>(op, global ? Format::GLOBAL : Format::FLAT, 3, return_previous ? 1 : 0)};
4846 flat->operands[0] = Operand(addr);
4847 flat->operands[1] = Operand(s1);
4848 flat->operands[2] = Operand(data);
4849 if (return_previous)
4850 flat->definitions[0] = Definition(dst);
4851 flat->glc = return_previous;
4852 flat->dlc = false; /* Not needed for atomics */
4853 flat->offset = 0;
4854 flat->disable_wqm = true;
4855 flat->barrier = barrier_buffer;
4856 ctx->program->needs_exact = true;
4857 ctx->block->instructions.emplace_back(std::move(flat));
4858 }
4859
4860 void emit_memory_barrier(isel_context *ctx, nir_intrinsic_instr *instr) {
4861 Builder bld(ctx->program, ctx->block);
4862 switch(instr->intrinsic) {
4863 case nir_intrinsic_group_memory_barrier:
4864 case nir_intrinsic_memory_barrier:
4865 bld.barrier(aco_opcode::p_memory_barrier_all);
4866 break;
4867 case nir_intrinsic_memory_barrier_atomic_counter:
4868 bld.barrier(aco_opcode::p_memory_barrier_atomic);
4869 break;
4870 case nir_intrinsic_memory_barrier_buffer:
4871 bld.barrier(aco_opcode::p_memory_barrier_buffer);
4872 break;
4873 case nir_intrinsic_memory_barrier_image:
4874 bld.barrier(aco_opcode::p_memory_barrier_image);
4875 break;
4876 case nir_intrinsic_memory_barrier_shared:
4877 bld.barrier(aco_opcode::p_memory_barrier_shared);
4878 break;
4879 default:
4880 unreachable("Unimplemented memory barrier intrinsic");
4881 break;
4882 }
4883 }
4884
4885 void visit_load_shared(isel_context *ctx, nir_intrinsic_instr *instr)
4886 {
4887 // TODO: implement sparse reads using ds_read2_b32 and nir_ssa_def_components_read()
4888 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
4889 assert(instr->dest.ssa.bit_size >= 32 && "Bitsize not supported in load_shared.");
4890 Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
4891 Builder bld(ctx->program, ctx->block);
4892
4893 unsigned elem_size_bytes = instr->dest.ssa.bit_size / 8;
4894 unsigned align = nir_intrinsic_align_mul(instr) ? nir_intrinsic_align(instr) : elem_size_bytes;
4895 load_lds(ctx, elem_size_bytes, dst, address, nir_intrinsic_base(instr), align);
4896 }
4897
4898 void visit_store_shared(isel_context *ctx, nir_intrinsic_instr *instr)
4899 {
4900 unsigned writemask = nir_intrinsic_write_mask(instr);
4901 Temp data = get_ssa_temp(ctx, instr->src[0].ssa);
4902 Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
4903 unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
4904 assert(elem_size_bytes >= 4 && "Only 32bit & 64bit store_shared currently supported.");
4905
4906 unsigned align = nir_intrinsic_align_mul(instr) ? nir_intrinsic_align(instr) : elem_size_bytes;
4907 store_lds(ctx, elem_size_bytes, data, writemask, address, nir_intrinsic_base(instr), align);
4908 }
4909
4910 void visit_shared_atomic(isel_context *ctx, nir_intrinsic_instr *instr)
4911 {
4912 unsigned offset = nir_intrinsic_base(instr);
4913 Operand m = load_lds_size_m0(ctx);
4914 Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
4915 Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
4916
4917 unsigned num_operands = 3;
4918 aco_opcode op32, op64, op32_rtn, op64_rtn;
4919 switch(instr->intrinsic) {
4920 case nir_intrinsic_shared_atomic_add:
4921 op32 = aco_opcode::ds_add_u32;
4922 op64 = aco_opcode::ds_add_u64;
4923 op32_rtn = aco_opcode::ds_add_rtn_u32;
4924 op64_rtn = aco_opcode::ds_add_rtn_u64;
4925 break;
4926 case nir_intrinsic_shared_atomic_imin:
4927 op32 = aco_opcode::ds_min_i32;
4928 op64 = aco_opcode::ds_min_i64;
4929 op32_rtn = aco_opcode::ds_min_rtn_i32;
4930 op64_rtn = aco_opcode::ds_min_rtn_i64;
4931 break;
4932 case nir_intrinsic_shared_atomic_umin:
4933 op32 = aco_opcode::ds_min_u32;
4934 op64 = aco_opcode::ds_min_u64;
4935 op32_rtn = aco_opcode::ds_min_rtn_u32;
4936 op64_rtn = aco_opcode::ds_min_rtn_u64;
4937 break;
4938 case nir_intrinsic_shared_atomic_imax:
4939 op32 = aco_opcode::ds_max_i32;
4940 op64 = aco_opcode::ds_max_i64;
4941 op32_rtn = aco_opcode::ds_max_rtn_i32;
4942 op64_rtn = aco_opcode::ds_max_rtn_i64;
4943 break;
4944 case nir_intrinsic_shared_atomic_umax:
4945 op32 = aco_opcode::ds_max_u32;
4946 op64 = aco_opcode::ds_max_u64;
4947 op32_rtn = aco_opcode::ds_max_rtn_u32;
4948 op64_rtn = aco_opcode::ds_max_rtn_u64;
4949 break;
4950 case nir_intrinsic_shared_atomic_and:
4951 op32 = aco_opcode::ds_and_b32;
4952 op64 = aco_opcode::ds_and_b64;
4953 op32_rtn = aco_opcode::ds_and_rtn_b32;
4954 op64_rtn = aco_opcode::ds_and_rtn_b64;
4955 break;
4956 case nir_intrinsic_shared_atomic_or:
4957 op32 = aco_opcode::ds_or_b32;
4958 op64 = aco_opcode::ds_or_b64;
4959 op32_rtn = aco_opcode::ds_or_rtn_b32;
4960 op64_rtn = aco_opcode::ds_or_rtn_b64;
4961 break;
4962 case nir_intrinsic_shared_atomic_xor:
4963 op32 = aco_opcode::ds_xor_b32;
4964 op64 = aco_opcode::ds_xor_b64;
4965 op32_rtn = aco_opcode::ds_xor_rtn_b32;
4966 op64_rtn = aco_opcode::ds_xor_rtn_b64;
4967 break;
4968 case nir_intrinsic_shared_atomic_exchange:
4969 op32 = aco_opcode::ds_write_b32;
4970 op64 = aco_opcode::ds_write_b64;
4971 op32_rtn = aco_opcode::ds_wrxchg_rtn_b32;
4972       op64_rtn = aco_opcode::ds_wrxchg_rtn_b64;
4973 break;
4974 case nir_intrinsic_shared_atomic_comp_swap:
4975 op32 = aco_opcode::ds_cmpst_b32;
4976 op64 = aco_opcode::ds_cmpst_b64;
4977 op32_rtn = aco_opcode::ds_cmpst_rtn_b32;
4978 op64_rtn = aco_opcode::ds_cmpst_rtn_b64;
4979 num_operands = 4;
4980 break;
4981 default:
4982 unreachable("Unhandled shared atomic intrinsic");
4983 }
4984
4985 /* return the previous value if dest is ever used */
4986 bool return_previous = false;
4987 nir_foreach_use_safe(use_src, &instr->dest.ssa) {
4988 return_previous = true;
4989 break;
4990 }
4991 nir_foreach_if_use_safe(use_src, &instr->dest.ssa) {
4992 return_previous = true;
4993 break;
4994 }
4995
4996 aco_opcode op;
4997 if (data.size() == 1) {
4998 assert(instr->dest.ssa.bit_size == 32);
4999 op = return_previous ? op32_rtn : op32;
5000 } else {
5001 assert(instr->dest.ssa.bit_size == 64);
5002 op = return_previous ? op64_rtn : op64;
5003 }
5004
5005 if (offset > 65535) {
5006 Builder bld(ctx->program, ctx->block);
5007 address = bld.vadd32(bld.def(v1), Operand(offset), address);
5008 offset = 0;
5009 }
5010
5011 aco_ptr<DS_instruction> ds;
5012 ds.reset(create_instruction<DS_instruction>(op, Format::DS, num_operands, return_previous ? 1 : 0));
5013 ds->operands[0] = Operand(address);
5014 ds->operands[1] = Operand(data);
5015 if (num_operands == 4)
5016 ds->operands[2] = Operand(get_ssa_temp(ctx, instr->src[2].ssa));
5017 ds->operands[num_operands - 1] = m;
5018 ds->offset0 = offset;
5019 if (return_previous)
5020 ds->definitions[0] = Definition(get_ssa_temp(ctx, &instr->dest.ssa));
5021 ctx->block->instructions.emplace_back(std::move(ds));
5022 }
5023
5024 Temp get_scratch_resource(isel_context *ctx)
5025 {
5026 Builder bld(ctx->program, ctx->block);
5027 Temp scratch_addr = ctx->program->private_segment_buffer;
5028 if (ctx->stage != compute_cs)
5029 scratch_addr = bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), scratch_addr, Operand(0u));
5030
5031 uint32_t rsrc_conf = S_008F0C_ADD_TID_ENABLE(1) |
5032                        S_008F0C_INDEX_STRIDE(ctx->program->wave_size == 64 ? 3 : 2);
5033
5034 if (ctx->program->chip_class >= GFX10) {
5035 rsrc_conf |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
5036 S_008F0C_OOB_SELECT(3) |
5037 S_008F0C_RESOURCE_LEVEL(1);
5038 } else if (ctx->program->chip_class <= GFX7) { /* dfmt modifies stride on GFX8/GFX9 when ADD_TID_EN=1 */
5039 rsrc_conf |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
5040 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
5041 }
5042
5043 /* older generations need element size = 16 bytes. element size removed in GFX9 */
5044 if (ctx->program->chip_class <= GFX8)
5045 rsrc_conf |= S_008F0C_ELEMENT_SIZE(3);
5046
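   /* Roughly: dwords 0-1 hold the 64-bit scratch base, NUM_RECORDS is left unbounded (-1),
    * and the config dword enables swizzled per-lane addressing (ADD_TID_ENABLE with an
    * index stride of 64 or 32 lanes, plus a 16-byte element size before GFX9). */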
5047 return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), scratch_addr, Operand(-1u), Operand(rsrc_conf));
5048 }
5049
5050 void visit_load_scratch(isel_context *ctx, nir_intrinsic_instr *instr) {
5051 assert(instr->dest.ssa.bit_size == 32 || instr->dest.ssa.bit_size == 64);
5052 Builder bld(ctx->program, ctx->block);
5053 Temp rsrc = get_scratch_resource(ctx);
5054 Temp offset = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
5055 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5056
5057 aco_opcode op;
5058 switch (dst.size()) {
5059 case 1:
5060 op = aco_opcode::buffer_load_dword;
5061 break;
5062 case 2:
5063 op = aco_opcode::buffer_load_dwordx2;
5064 break;
5065 case 3:
5066 op = aco_opcode::buffer_load_dwordx3;
5067 break;
5068 case 4:
5069 op = aco_opcode::buffer_load_dwordx4;
5070 break;
5071 case 6:
5072 case 8: {
5073 std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
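      /* A single MUBUF load returns at most 4 dwords, so 6- and 8-dword results are
       * assembled from two loads 16 bytes apart and recombined into dst below. */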
5074 Temp lower = bld.mubuf(aco_opcode::buffer_load_dwordx4,
5075 bld.def(v4), offset, rsrc,
5076 ctx->program->scratch_offset, 0, true);
5077 Temp upper = bld.mubuf(dst.size() == 6 ? aco_opcode::buffer_load_dwordx2 :
5078 aco_opcode::buffer_load_dwordx4,
5079 dst.size() == 6 ? bld.def(v2) : bld.def(v4),
5080 offset, rsrc, ctx->program->scratch_offset, 16, true);
5081 emit_split_vector(ctx, lower, 2);
5082 elems[0] = emit_extract_vector(ctx, lower, 0, v2);
5083 elems[1] = emit_extract_vector(ctx, lower, 1, v2);
5084 if (dst.size() == 8) {
5085 emit_split_vector(ctx, upper, 2);
5086 elems[2] = emit_extract_vector(ctx, upper, 0, v2);
5087 elems[3] = emit_extract_vector(ctx, upper, 1, v2);
5088 } else {
5089 elems[2] = upper;
5090 }
5091
5092 aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector,
5093 Format::PSEUDO, dst.size() / 2, 1)};
5094 for (unsigned i = 0; i < dst.size() / 2; i++)
5095 vec->operands[i] = Operand(elems[i]);
5096 vec->definitions[0] = Definition(dst);
5097 bld.insert(std::move(vec));
5098 ctx->allocated_vec.emplace(dst.id(), elems);
5099 return;
5100 }
5101 default:
5102 unreachable("Wrong dst size for nir_intrinsic_load_scratch");
5103 }
5104
5105 bld.mubuf(op, Definition(dst), offset, rsrc, ctx->program->scratch_offset, 0, true);
5106 emit_split_vector(ctx, dst, instr->num_components);
5107 }
5108
5109 void visit_store_scratch(isel_context *ctx, nir_intrinsic_instr *instr) {
5110 assert(instr->src[0].ssa->bit_size == 32 || instr->src[0].ssa->bit_size == 64);
5111 Builder bld(ctx->program, ctx->block);
5112 Temp rsrc = get_scratch_resource(ctx);
5113 Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
5114 Temp offset = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
5115
5116 unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
5117 unsigned writemask = nir_intrinsic_write_mask(instr);
5118
5119 while (writemask) {
5120 int start, count;
5121 u_bit_scan_consecutive_range(&writemask, &start, &count);
5122 int num_bytes = count * elem_size_bytes;
5123
5124 if (num_bytes > 16) {
5125 assert(elem_size_bytes == 8);
5126 writemask |= (((count - 2) << 1) - 1) << (start + 2);
5127 count = 2;
5128 num_bytes = 16;
5129 }
5130
5131 // TODO: check alignment of sub-dword stores
5132 // TODO: split 3 bytes. there is no store instruction for that
5133
5134 Temp write_data;
5135 if (count != instr->num_components) {
5136 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
5137 for (int i = 0; i < count; i++) {
5138 Temp elem = emit_extract_vector(ctx, data, start + i, RegClass(RegType::vgpr, elem_size_bytes / 4));
5139 vec->operands[i] = Operand(elem);
5140 }
5141 write_data = bld.tmp(RegClass(RegType::vgpr, count * elem_size_bytes / 4));
5142 vec->definitions[0] = Definition(write_data);
5143 ctx->block->instructions.emplace_back(std::move(vec));
5144 } else {
5145 write_data = data;
5146 }
5147
5148 aco_opcode op;
5149 switch (num_bytes) {
5150 case 4:
5151 op = aco_opcode::buffer_store_dword;
5152 break;
5153 case 8:
5154 op = aco_opcode::buffer_store_dwordx2;
5155 break;
5156 case 12:
5157 op = aco_opcode::buffer_store_dwordx3;
5158 break;
5159 case 16:
5160 op = aco_opcode::buffer_store_dwordx4;
5161 break;
5162 default:
5163 unreachable("Invalid data size for nir_intrinsic_store_scratch.");
5164 }
5165
5166 bld.mubuf(op, offset, rsrc, ctx->program->scratch_offset, write_data, start * elem_size_bytes, true);
5167 }
5168 }
5169
5170 void visit_load_sample_mask_in(isel_context *ctx, nir_intrinsic_instr *instr) {
5171 uint8_t log2_ps_iter_samples;
5172 if (ctx->program->info->ps.force_persample) {
5173 log2_ps_iter_samples =
5174 util_logbase2(ctx->options->key.fs.num_samples);
5175 } else {
5176 log2_ps_iter_samples = ctx->options->key.fs.log2_ps_iter_samples;
5177 }
5178
5179 /* The bit pattern matches that used by fixed function fragment
5180 * processing. */
5181 static const unsigned ps_iter_masks[] = {
5182 0xffff, /* not used */
5183 0x5555,
5184 0x1111,
5185 0x0101,
5186 0x0001,
5187 };
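   /* ps_iter_masks[log2] has every 2^log2-th coverage bit set; shifted left by the sample
    * id below, it selects the coverage bits owned by this per-sample invocation
    * (e.g. log2 == 2, sample_id == 1 -> 0x2222). */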
5188 assert(log2_ps_iter_samples < ARRAY_SIZE(ps_iter_masks));
5189
5190 Builder bld(ctx->program, ctx->block);
5191
5192 Temp sample_id = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1),
5193 get_arg(ctx, ctx->args->ac.ancillary), Operand(8u), Operand(4u));
5194 Temp ps_iter_mask = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(ps_iter_masks[log2_ps_iter_samples]));
5195 Temp mask = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), sample_id, ps_iter_mask);
5196 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5197 bld.vop2(aco_opcode::v_and_b32, Definition(dst), mask, get_arg(ctx, ctx->args->ac.sample_coverage));
5198 }
5199
5200 Temp emit_boolean_reduce(isel_context *ctx, nir_op op, unsigned cluster_size, Temp src)
5201 {
5202 Builder bld(ctx->program, ctx->block);
5203
5204 if (cluster_size == 1) {
5205 return src;
5206    } else if (op == nir_op_iand && cluster_size == 4) {
5207 //subgroupClusteredAnd(val, 4) -> ~wqm(exec & ~val)
5208 Temp tmp = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.def(s1, scc), Operand(exec, s2), src);
5209 return bld.sop1(aco_opcode::s_not_b64, bld.def(s2), bld.def(s1, scc),
5210 bld.sop1(aco_opcode::s_wqm_b64, bld.def(s2), bld.def(s1, scc), tmp));
5211 } else if (op == nir_op_ior && cluster_size == 4) {
5212 //subgroupClusteredOr(val, 4) -> wqm(val & exec)
5213 return bld.sop1(aco_opcode::s_wqm_b64, bld.def(s2), bld.def(s1, scc),
5214 bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2)));
5215 } else if (op == nir_op_iand && cluster_size == 64) {
5216 //subgroupAnd(val) -> (exec & ~val) == 0
5217 Temp tmp = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.def(s1, scc), Operand(exec, s2), src).def(1).getTemp();
5218 return bld.sop2(aco_opcode::s_cselect_b64, bld.def(s2), Operand(0u), Operand(-1u), bld.scc(tmp));
5219 } else if (op == nir_op_ior && cluster_size == 64) {
5220 //subgroupOr(val) -> (val & exec) != 0
5221 Temp tmp = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2)).def(1).getTemp();
5222 return bool_to_vector_condition(ctx, tmp);
5223 } else if (op == nir_op_ixor && cluster_size == 64) {
5224 //subgroupXor(val) -> s_bcnt1_i32_b64(val & exec) & 1
5225 Temp tmp = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2));
5226 tmp = bld.sop1(aco_opcode::s_bcnt1_i32_b64, bld.def(s2), bld.def(s1, scc), tmp);
5227 tmp = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), tmp, Operand(1u)).def(1).getTemp();
5228 return bool_to_vector_condition(ctx, tmp);
5229 } else {
5230 //subgroupClustered{And,Or,Xor}(val, n) ->
5231 //lane_id = v_mbcnt_hi_u32_b32(-1, v_mbcnt_lo_u32_b32(-1, 0))
5232 //cluster_offset = ~(n - 1) & lane_id
5233 //cluster_mask = ((1 << n) - 1)
5234 //subgroupClusteredAnd():
5235 // return ((val | ~exec) >> cluster_offset) & cluster_mask == cluster_mask
5236 //subgroupClusteredOr():
5237 // return ((val & exec) >> cluster_offset) & cluster_mask != 0
5238 //subgroupClusteredXor():
5239       // return v_bcnt_u32_b32(((val & exec) >> cluster_offset) & cluster_mask, 0) & 1 != 0
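      //e.g. cluster_size == 8, lane_id == 13: cluster_offset == 8, cluster_mask == 0xff,
      //so bits 8..15 of the ballot-like value are tested below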
5240 Temp lane_id = bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32, bld.def(v1), Operand((uint32_t) -1),
5241 bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, bld.def(v1), Operand((uint32_t) -1), Operand(0u)));
5242 Temp cluster_offset = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(~uint32_t(cluster_size - 1)), lane_id);
5243
5244 Temp tmp;
5245 if (op == nir_op_iand)
5246 tmp = bld.sop2(aco_opcode::s_orn2_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2));
5247 else
5248 tmp = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2));
5249
5250 uint32_t cluster_mask = cluster_size == 32 ? -1 : (1u << cluster_size) - 1u;
5251 tmp = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), cluster_offset, tmp);
5252 tmp = emit_extract_vector(ctx, tmp, 0, v1);
5253 if (cluster_mask != 0xffffffff)
5254 tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(cluster_mask), tmp);
5255
5256 Definition cmp_def = Definition();
5257 if (op == nir_op_iand) {
5258 cmp_def = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(s2), Operand(cluster_mask), tmp).def(0);
5259 } else if (op == nir_op_ior) {
5260 cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(s2), Operand(0u), tmp).def(0);
5261 } else if (op == nir_op_ixor) {
5262 tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(1u),
5263 bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), tmp, Operand(0u)));
5264 cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(s2), Operand(0u), tmp).def(0);
5265 }
5266 cmp_def.setHint(vcc);
5267 return cmp_def.getTemp();
5268 }
5269 }
5270
5271 Temp emit_boolean_exclusive_scan(isel_context *ctx, nir_op op, Temp src)
5272 {
5273 Builder bld(ctx->program, ctx->block);
5274
5275 //subgroupExclusiveAnd(val) -> mbcnt(exec & ~val) == 0
5276 //subgroupExclusiveOr(val) -> mbcnt(val & exec) != 0
5277 //subgroupExclusiveXor(val) -> mbcnt(val & exec) & 1 != 0
5278 Temp tmp;
5279 if (op == nir_op_iand)
5280 tmp = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.def(s1, scc), Operand(exec, s2), src);
5281 else
5282 tmp = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2));
5283
5284 Builder::Result lohi = bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), bld.def(s1), tmp);
5285 Temp lo = lohi.def(0).getTemp();
5286 Temp hi = lohi.def(1).getTemp();
5287 Temp mbcnt = bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32, bld.def(v1), hi,
5288 bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, bld.def(v1), lo, Operand(0u)));
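   /* v_mbcnt_lo/hi count the set bits of tmp in lanes strictly below the current lane, so
    * mbcnt is the number of earlier lanes with the value set, i.e. an exclusive count. */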
5289
5290 Definition cmp_def = Definition();
5291 if (op == nir_op_iand)
5292 cmp_def = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(s2), Operand(0u), mbcnt).def(0);
5293 else if (op == nir_op_ior)
5294 cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(s2), Operand(0u), mbcnt).def(0);
5295 else if (op == nir_op_ixor)
5296 cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(s2), Operand(0u),
5297 bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(1u), mbcnt)).def(0);
5298 cmp_def.setHint(vcc);
5299 return cmp_def.getTemp();
5300 }
5301
5302 Temp emit_boolean_inclusive_scan(isel_context *ctx, nir_op op, Temp src)
5303 {
5304 Builder bld(ctx->program, ctx->block);
5305
5306 //subgroupInclusiveAnd(val) -> subgroupExclusiveAnd(val) && val
5307 //subgroupInclusiveOr(val) -> subgroupExclusiveOr(val) || val
5308 //subgroupInclusiveXor(val) -> subgroupExclusiveXor(val) ^^ val
5309 Temp tmp = emit_boolean_exclusive_scan(ctx, op, src);
5310 if (op == nir_op_iand)
5311 return bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), tmp, src);
5312 else if (op == nir_op_ior)
5313 return bld.sop2(aco_opcode::s_or_b64, bld.def(s2), bld.def(s1, scc), tmp, src);
5314 else if (op == nir_op_ixor)
5315 return bld.sop2(aco_opcode::s_xor_b64, bld.def(s2), bld.def(s1, scc), tmp, src);
5316
5317 assert(false);
5318 return Temp();
5319 }
5320
5321 void emit_uniform_subgroup(isel_context *ctx, nir_intrinsic_instr *instr, Temp src)
5322 {
5323 Builder bld(ctx->program, ctx->block);
5324 Definition dst(get_ssa_temp(ctx, &instr->dest.ssa));
5325 if (src.regClass().type() == RegType::vgpr) {
5326 bld.pseudo(aco_opcode::p_as_uniform, dst, src);
5327 } else if (src.regClass() == s1) {
5328 bld.sop1(aco_opcode::s_mov_b32, dst, src);
5329 } else if (src.regClass() == s2) {
5330 bld.sop1(aco_opcode::s_mov_b64, dst, src);
5331 } else {
5332 fprintf(stderr, "Unimplemented NIR instr bit size: ");
5333 nir_print_instr(&instr->instr, stderr);
5334 fprintf(stderr, "\n");
5335 }
5336 }
5337
5338 void emit_interp_center(isel_context *ctx, Temp dst, Temp pos1, Temp pos2)
5339 {
5340 Builder bld(ctx->program, ctx->block);
5341 Temp persp_center = get_arg(ctx, ctx->args->ac.persp_center);
5342 Temp p1 = emit_extract_vector(ctx, persp_center, 0, v1);
5343 Temp p2 = emit_extract_vector(ctx, persp_center, 1, v1);
5344
5345 /* Build DD X/Y */
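   /* dpp_quad_perm(0, 0, 0, 0) broadcasts lane 0 (the quad's top-left pixel) to all four
    * lanes; since DPP applies to the first source, the subtractions below compute
    * ddx = p(lane 1) - p(lane 0) and ddy = p(lane 2) - p(lane 0) within each 2x2 quad. */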
5346 Temp tl_1 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), p1, dpp_quad_perm(0, 0, 0, 0));
5347 Temp ddx_1 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p1, tl_1, dpp_quad_perm(1, 1, 1, 1));
5348 Temp ddy_1 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p1, tl_1, dpp_quad_perm(2, 2, 2, 2));
5349 Temp tl_2 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), p2, dpp_quad_perm(0, 0, 0, 0));
5350 Temp ddx_2 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p2, tl_2, dpp_quad_perm(1, 1, 1, 1));
5351 Temp ddy_2 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p2, tl_2, dpp_quad_perm(2, 2, 2, 2));
5352
5353 /* res_k = p_k + ddx_k * pos1 + ddy_k * pos2 */
5354 Temp tmp1 = bld.vop3(aco_opcode::v_mad_f32, bld.def(v1), ddx_1, pos1, p1);
5355 Temp tmp2 = bld.vop3(aco_opcode::v_mad_f32, bld.def(v1), ddx_2, pos1, p2);
5356 tmp1 = bld.vop3(aco_opcode::v_mad_f32, bld.def(v1), ddy_1, pos2, tmp1);
5357 tmp2 = bld.vop3(aco_opcode::v_mad_f32, bld.def(v1), ddy_2, pos2, tmp2);
5358 Temp wqm1 = bld.tmp(v1);
5359 emit_wqm(ctx, tmp1, wqm1, true);
5360 Temp wqm2 = bld.tmp(v1);
5361 emit_wqm(ctx, tmp2, wqm2, true);
5362 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), wqm1, wqm2);
5363 return;
5364 }
5365
5366 void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
5367 {
5368 Builder bld(ctx->program, ctx->block);
5369 switch(instr->intrinsic) {
5370 case nir_intrinsic_load_barycentric_sample:
5371 case nir_intrinsic_load_barycentric_pixel:
5372 case nir_intrinsic_load_barycentric_centroid: {
5373 glsl_interp_mode mode = (glsl_interp_mode)nir_intrinsic_interp_mode(instr);
5374 Temp bary = Temp(0, s2);
5375 switch (mode) {
5376 case INTERP_MODE_SMOOTH:
5377 case INTERP_MODE_NONE:
5378 if (instr->intrinsic == nir_intrinsic_load_barycentric_pixel)
5379 bary = get_arg(ctx, ctx->args->ac.persp_center);
5380 else if (instr->intrinsic == nir_intrinsic_load_barycentric_centroid)
5381 bary = ctx->persp_centroid;
5382 else if (instr->intrinsic == nir_intrinsic_load_barycentric_sample)
5383 bary = get_arg(ctx, ctx->args->ac.persp_sample);
5384 break;
5385 case INTERP_MODE_NOPERSPECTIVE:
5386 if (instr->intrinsic == nir_intrinsic_load_barycentric_pixel)
5387 bary = get_arg(ctx, ctx->args->ac.linear_center);
5388 else if (instr->intrinsic == nir_intrinsic_load_barycentric_centroid)
5389 bary = ctx->linear_centroid;
5390 else if (instr->intrinsic == nir_intrinsic_load_barycentric_sample)
5391 bary = get_arg(ctx, ctx->args->ac.linear_sample);
5392 break;
5393 default:
5394 break;
5395 }
5396 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5397 Temp p1 = emit_extract_vector(ctx, bary, 0, v1);
5398 Temp p2 = emit_extract_vector(ctx, bary, 1, v1);
5399 bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
5400 Operand(p1), Operand(p2));
5401 emit_split_vector(ctx, dst, 2);
5402 break;
5403 }
5404 case nir_intrinsic_load_barycentric_at_sample: {
5405 uint32_t sample_pos_offset = RING_PS_SAMPLE_POSITIONS * 16;
5406 switch (ctx->options->key.fs.num_samples) {
5407 case 2: sample_pos_offset += 1 << 3; break;
5408 case 4: sample_pos_offset += 3 << 3; break;
5409 case 8: sample_pos_offset += 7 << 3; break;
5410 default: break;
5411 }
5412 Temp sample_pos;
5413 Temp addr = get_ssa_temp(ctx, instr->src[0].ssa);
5414 nir_const_value* const_addr = nir_src_as_const_value(instr->src[0]);
5415 Temp private_segment_buffer = ctx->program->private_segment_buffer;
5416 if (addr.type() == RegType::sgpr) {
5417 Operand offset;
5418 if (const_addr) {
5419 sample_pos_offset += const_addr->u32 << 3;
5420 offset = Operand(sample_pos_offset);
5421 } else if (ctx->options->chip_class >= GFX9) {
5422 offset = bld.sop2(aco_opcode::s_lshl3_add_u32, bld.def(s1), bld.def(s1, scc), addr, Operand(sample_pos_offset));
5423 } else {
5424 offset = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), addr, Operand(3u));
5425                offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset, Operand(sample_pos_offset));
5426 }
5427 sample_pos = bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), private_segment_buffer, Operand(offset));
5428
5429 } else if (ctx->options->chip_class >= GFX9) {
5430 addr = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(3u), addr);
5431 sample_pos = bld.global(aco_opcode::global_load_dwordx2, bld.def(v2), addr, private_segment_buffer, sample_pos_offset);
5432 } else {
5433 /* addr += private_segment_buffer + sample_pos_offset */
5434 Temp tmp0 = bld.tmp(s1);
5435 Temp tmp1 = bld.tmp(s1);
5436 bld.pseudo(aco_opcode::p_split_vector, Definition(tmp0), Definition(tmp1), private_segment_buffer);
5437 Definition scc_tmp = bld.def(s1, scc);
5438 tmp0 = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), scc_tmp, tmp0, Operand(sample_pos_offset));
5439 tmp1 = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), tmp1, Operand(0u), bld.scc(scc_tmp.getTemp()));
5440 addr = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(3u), addr);
5441 Temp pck0 = bld.tmp(v1);
5442 Temp carry = bld.vadd32(Definition(pck0), tmp0, addr, true).def(1).getTemp();
5443 tmp1 = as_vgpr(ctx, tmp1);
5444 Temp pck1 = bld.vop2_e64(aco_opcode::v_addc_co_u32, bld.def(v1), bld.hint_vcc(bld.def(s2)), tmp1, Operand(0u), carry);
5445 addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), pck0, pck1);
5446
5447 /* sample_pos = flat_load_dwordx2 addr */
5448 sample_pos = bld.flat(aco_opcode::flat_load_dwordx2, bld.def(v2), addr, Operand(s1));
5449 }
5450
5451 /* sample_pos -= 0.5 */
5452 Temp pos1 = bld.tmp(RegClass(sample_pos.type(), 1));
5453 Temp pos2 = bld.tmp(RegClass(sample_pos.type(), 1));
5454 bld.pseudo(aco_opcode::p_split_vector, Definition(pos1), Definition(pos2), sample_pos);
5455 pos1 = bld.vop2_e64(aco_opcode::v_sub_f32, bld.def(v1), pos1, Operand(0x3f000000u));
5456 pos2 = bld.vop2_e64(aco_opcode::v_sub_f32, bld.def(v1), pos2, Operand(0x3f000000u));
5457
5458 emit_interp_center(ctx, get_ssa_temp(ctx, &instr->dest.ssa), pos1, pos2);
5459 break;
5460 }
5461 case nir_intrinsic_load_barycentric_at_offset: {
5462 Temp offset = get_ssa_temp(ctx, instr->src[0].ssa);
5463 RegClass rc = RegClass(offset.type(), 1);
5464 Temp pos1 = bld.tmp(rc), pos2 = bld.tmp(rc);
5465 bld.pseudo(aco_opcode::p_split_vector, Definition(pos1), Definition(pos2), offset);
5466 emit_interp_center(ctx, get_ssa_temp(ctx, &instr->dest.ssa), pos1, pos2);
5467 break;
5468 }
5469 case nir_intrinsic_load_front_face: {
5470 bld.vopc(aco_opcode::v_cmp_lg_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
5471 Operand(0u), get_arg(ctx, ctx->args->ac.front_face)).def(0).setHint(vcc);
5472 break;
5473 }
5474 case nir_intrinsic_load_view_index:
5475 case nir_intrinsic_load_layer_id: {
5476 if (instr->intrinsic == nir_intrinsic_load_view_index && (ctx->stage & sw_vs)) {
5477 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5478 bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->ac.view_index)));
5479 break;
5480 }
5481
5482 unsigned idx = nir_intrinsic_base(instr);
5483 bld.vintrp(aco_opcode::v_interp_mov_f32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
5484 Operand(2u), bld.m0(get_arg(ctx, ctx->args->ac.prim_mask)), idx, 0);
5485 break;
5486 }
5487 case nir_intrinsic_load_frag_coord: {
5488 emit_load_frag_coord(ctx, get_ssa_temp(ctx, &instr->dest.ssa), 4);
5489 break;
5490 }
5491 case nir_intrinsic_load_sample_pos: {
5492 Temp posx = get_arg(ctx, ctx->args->ac.frag_pos[0]);
5493 Temp posy = get_arg(ctx, ctx->args->ac.frag_pos[1]);
5494 bld.pseudo(aco_opcode::p_create_vector, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
5495 posx.id() ? bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), posx) : Operand(0u),
5496 posy.id() ? bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), posy) : Operand(0u));
5497 break;
5498 }
5499 case nir_intrinsic_load_interpolated_input:
5500 visit_load_interpolated_input(ctx, instr);
5501 break;
5502 case nir_intrinsic_store_output:
5503 visit_store_output(ctx, instr);
5504 break;
5505 case nir_intrinsic_load_input:
5506 visit_load_input(ctx, instr);
5507 break;
5508 case nir_intrinsic_load_ubo:
5509 visit_load_ubo(ctx, instr);
5510 break;
5511 case nir_intrinsic_load_push_constant:
5512 visit_load_push_constant(ctx, instr);
5513 break;
5514 case nir_intrinsic_load_constant:
5515 visit_load_constant(ctx, instr);
5516 break;
5517 case nir_intrinsic_vulkan_resource_index:
5518 visit_load_resource(ctx, instr);
5519 break;
5520 case nir_intrinsic_discard:
5521 visit_discard(ctx, instr);
5522 break;
5523 case nir_intrinsic_discard_if:
5524 visit_discard_if(ctx, instr);
5525 break;
5526 case nir_intrinsic_load_shared:
5527 visit_load_shared(ctx, instr);
5528 break;
5529 case nir_intrinsic_store_shared:
5530 visit_store_shared(ctx, instr);
5531 break;
5532 case nir_intrinsic_shared_atomic_add:
5533 case nir_intrinsic_shared_atomic_imin:
5534 case nir_intrinsic_shared_atomic_umin:
5535 case nir_intrinsic_shared_atomic_imax:
5536 case nir_intrinsic_shared_atomic_umax:
5537 case nir_intrinsic_shared_atomic_and:
5538 case nir_intrinsic_shared_atomic_or:
5539 case nir_intrinsic_shared_atomic_xor:
5540 case nir_intrinsic_shared_atomic_exchange:
5541 case nir_intrinsic_shared_atomic_comp_swap:
5542 visit_shared_atomic(ctx, instr);
5543 break;
5544 case nir_intrinsic_image_deref_load:
5545 visit_image_load(ctx, instr);
5546 break;
5547 case nir_intrinsic_image_deref_store:
5548 visit_image_store(ctx, instr);
5549 break;
5550 case nir_intrinsic_image_deref_atomic_add:
5551 case nir_intrinsic_image_deref_atomic_umin:
5552 case nir_intrinsic_image_deref_atomic_imin:
5553 case nir_intrinsic_image_deref_atomic_umax:
5554 case nir_intrinsic_image_deref_atomic_imax:
5555 case nir_intrinsic_image_deref_atomic_and:
5556 case nir_intrinsic_image_deref_atomic_or:
5557 case nir_intrinsic_image_deref_atomic_xor:
5558 case nir_intrinsic_image_deref_atomic_exchange:
5559 case nir_intrinsic_image_deref_atomic_comp_swap:
5560 visit_image_atomic(ctx, instr);
5561 break;
5562 case nir_intrinsic_image_deref_size:
5563 visit_image_size(ctx, instr);
5564 break;
5565 case nir_intrinsic_load_ssbo:
5566 visit_load_ssbo(ctx, instr);
5567 break;
5568 case nir_intrinsic_store_ssbo:
5569 visit_store_ssbo(ctx, instr);
5570 break;
5571 case nir_intrinsic_load_global:
5572 visit_load_global(ctx, instr);
5573 break;
5574 case nir_intrinsic_store_global:
5575 visit_store_global(ctx, instr);
5576 break;
5577 case nir_intrinsic_global_atomic_add:
5578 case nir_intrinsic_global_atomic_imin:
5579 case nir_intrinsic_global_atomic_umin:
5580 case nir_intrinsic_global_atomic_imax:
5581 case nir_intrinsic_global_atomic_umax:
5582 case nir_intrinsic_global_atomic_and:
5583 case nir_intrinsic_global_atomic_or:
5584 case nir_intrinsic_global_atomic_xor:
5585 case nir_intrinsic_global_atomic_exchange:
5586 case nir_intrinsic_global_atomic_comp_swap:
5587 visit_global_atomic(ctx, instr);
5588 break;
5589 case nir_intrinsic_ssbo_atomic_add:
5590 case nir_intrinsic_ssbo_atomic_imin:
5591 case nir_intrinsic_ssbo_atomic_umin:
5592 case nir_intrinsic_ssbo_atomic_imax:
5593 case nir_intrinsic_ssbo_atomic_umax:
5594 case nir_intrinsic_ssbo_atomic_and:
5595 case nir_intrinsic_ssbo_atomic_or:
5596 case nir_intrinsic_ssbo_atomic_xor:
5597 case nir_intrinsic_ssbo_atomic_exchange:
5598 case nir_intrinsic_ssbo_atomic_comp_swap:
5599 visit_atomic_ssbo(ctx, instr);
5600 break;
5601 case nir_intrinsic_load_scratch:
5602 visit_load_scratch(ctx, instr);
5603 break;
5604 case nir_intrinsic_store_scratch:
5605 visit_store_scratch(ctx, instr);
5606 break;
5607 case nir_intrinsic_get_buffer_size:
5608 visit_get_buffer_size(ctx, instr);
5609 break;
5610 case nir_intrinsic_barrier: {
5611 unsigned* bsize = ctx->program->info->cs.block_size;
5612 unsigned workgroup_size = bsize[0] * bsize[1] * bsize[2];
5613 if (workgroup_size > 64)
5614 bld.sopp(aco_opcode::s_barrier);
5615 break;
5616 }
5617 case nir_intrinsic_group_memory_barrier:
5618 case nir_intrinsic_memory_barrier:
5619 case nir_intrinsic_memory_barrier_atomic_counter:
5620 case nir_intrinsic_memory_barrier_buffer:
5621 case nir_intrinsic_memory_barrier_image:
5622 case nir_intrinsic_memory_barrier_shared:
5623 emit_memory_barrier(ctx, instr);
5624 break;
5625 case nir_intrinsic_load_num_work_groups: {
5626 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5627 bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->ac.num_work_groups)));
5628 emit_split_vector(ctx, dst, 3);
5629 break;
5630 }
5631 case nir_intrinsic_load_local_invocation_id: {
5632 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5633 bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->ac.local_invocation_ids)));
5634 emit_split_vector(ctx, dst, 3);
5635 break;
5636 }
5637 case nir_intrinsic_load_work_group_id: {
5638 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5639 struct ac_arg *args = ctx->args->ac.workgroup_ids;
5640 bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
5641 args[0].used ? Operand(get_arg(ctx, args[0])) : Operand(0u),
5642 args[1].used ? Operand(get_arg(ctx, args[1])) : Operand(0u),
5643 args[2].used ? Operand(get_arg(ctx, args[2])) : Operand(0u));
5644 emit_split_vector(ctx, dst, 3);
5645 break;
5646 }
5647 case nir_intrinsic_load_local_invocation_index: {
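/* Sketch of the computation below (assuming wave64 and that the tg_size
 * argument carries the wave id within the workgroup in bits [6:11]):
 *   lane  = v_mbcnt_hi(~0, v_mbcnt_lo(~0, 0))  // lane index within the wave, 0..63
 *   wave  = tg_size & 0xfc0                    // wave id, already shifted left by 6
 *   index = wave | lane                        // == wave_id * 64 + lane
 */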
5648 Temp id = bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32, bld.def(v1), Operand((uint32_t) -1),
5649 bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, bld.def(v1), Operand((uint32_t) -1), Operand(0u)));
5650 Temp tg_num = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), Operand(0xfc0u),
5651 get_arg(ctx, ctx->args->ac.tg_size));
5652 bld.vop2(aco_opcode::v_or_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), tg_num, id);
5653 break;
5654 }
5655 case nir_intrinsic_load_subgroup_id: {
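/* For compute shaders the subgroup id is the wave id from the same tg_size
 * bits used above, shifted back down by 6. For other stages the code assumes
 * a single subgroup per "workgroup" and returns 0. */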
5656 if (ctx->stage == compute_cs) {
5657 Temp tg_num = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), Operand(0xfc0u),
5658 get_arg(ctx, ctx->args->ac.tg_size));
5659 bld.sop2(aco_opcode::s_lshr_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), bld.def(s1, scc), tg_num, Operand(0x6u));
5660 } else {
5661 bld.sop1(aco_opcode::s_mov_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), Operand(0x0u));
5662 }
5663 break;
5664 }
5665 case nir_intrinsic_load_subgroup_invocation: {
5666 bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), Operand((uint32_t) -1),
5667 bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, bld.def(v1), Operand((uint32_t) -1), Operand(0u)));
5668 break;
5669 }
5670 case nir_intrinsic_load_num_subgroups: {
5671 if (ctx->stage == compute_cs)
5672 bld.sop2(aco_opcode::s_and_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), bld.def(s1, scc), Operand(0x3fu),
5673 get_arg(ctx, ctx->args->ac.tg_size));
5674 else
5675 bld.sop1(aco_opcode::s_mov_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), Operand(0x1u));
5676 break;
5677 }
5678 case nir_intrinsic_ballot: {
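/* A 1-bit (boolean) source is already a per-lane mask in an s2 register and
 * only needs to be restricted to the active lanes (AND with exec); numeric
 * sources are turned into a mask by comparing every lane against zero.
 * emit_wqm marks the result as needing whole-quad mode where required. */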
5679 Definition tmp = bld.def(s2);
5680 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
5681 if (instr->src[0].ssa->bit_size == 1) {
5682 assert(src.regClass() == s2);
5683 bld.sop2(aco_opcode::s_and_b64, tmp, bld.def(s1, scc), Operand(exec, s2), src);
5684 } else if (instr->src[0].ssa->bit_size == 32 && src.regClass() == v1) {
5685 bld.vopc(aco_opcode::v_cmp_lg_u32, tmp, Operand(0u), src);
5686 } else if (instr->src[0].ssa->bit_size == 64 && src.regClass() == v2) {
5687 bld.vopc(aco_opcode::v_cmp_lg_u64, tmp, Operand(0u), src);
5688 } else {
5689 fprintf(stderr, "Unimplemented NIR instr bit size: ");
5690 nir_print_instr(&instr->instr, stderr);
5691 fprintf(stderr, "\n");
5692 }
5693 emit_wqm(ctx, tmp.getTemp(), get_ssa_temp(ctx, &instr->dest.ssa));
5694 break;
5695 }
5696 case nir_intrinsic_shuffle:
5697 case nir_intrinsic_read_invocation: {
5698 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
5699 if (!ctx->divergent_vals[instr->src[0].ssa->index]) {
5700 emit_uniform_subgroup(ctx, instr, src);
5701 } else {
5702 Temp tid = get_ssa_temp(ctx, instr->src[1].ssa);
5703 if (instr->intrinsic == nir_intrinsic_read_invocation || !ctx->divergent_vals[instr->src[1].ssa->index])
5704 tid = bld.as_uniform(tid);
5705 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5706 if (src.regClass() == v1) {
5707 emit_wqm(ctx, emit_bpermute(ctx, bld, tid, src), dst);
5708 } else if (src.regClass() == v2) {
5709 Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
5710 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
5711 lo = emit_wqm(ctx, emit_bpermute(ctx, bld, tid, lo));
5712 hi = emit_wqm(ctx, emit_bpermute(ctx, bld, tid, hi));
5713 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
5714 emit_split_vector(ctx, dst, 2);
5715 } else if (instr->dest.ssa.bit_size == 1 && tid.regClass() == s1) {
5716 assert(src.regClass() == s2);
5717 Temp tmp = bld.sopc(aco_opcode::s_bitcmp1_b64, bld.def(s1, scc), src, tid);
5718 bool_to_vector_condition(ctx, emit_wqm(ctx, tmp), dst);
5719 } else if (instr->dest.ssa.bit_size == 1 && tid.regClass() == v1) {
5720 assert(src.regClass() == s2);
5721 Temp tmp = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), tid, src);
5722 tmp = emit_extract_vector(ctx, tmp, 0, v1);
5723 tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(1u), tmp);
5724 emit_wqm(ctx, bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(s2), Operand(0u), tmp), dst);
5725 } else {
5726 fprintf(stderr, "Unimplemented NIR instr bit size: ");
5727 nir_print_instr(&instr->instr, stderr);
5728 fprintf(stderr, "\n");
5729 }
5730 }
5731 break;
5732 }
5733 case nir_intrinsic_load_sample_id: {
5734 bld.vop3(aco_opcode::v_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
5735 get_arg(ctx, ctx->args->ac.ancillary), Operand(8u), Operand(4u));
5736 break;
5737 }
5738 case nir_intrinsic_load_sample_mask_in: {
5739 visit_load_sample_mask_in(ctx, instr);
5740 break;
5741 }
5742 case nir_intrinsic_read_first_invocation: {
5743 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
5744 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5745 if (src.regClass() == v1) {
5746 emit_wqm(ctx,
5747 bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), src),
5748 dst);
5749 } else if (src.regClass() == v2) {
5750 Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
5751 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
5752 lo = emit_wqm(ctx, bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), lo));
5753 hi = emit_wqm(ctx, bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), hi));
5754 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
5755 emit_split_vector(ctx, dst, 2);
5756 } else if (instr->dest.ssa.bit_size == 1) {
5757 assert(src.regClass() == s2);
5758 Temp tmp = bld.sopc(aco_opcode::s_bitcmp1_b64, bld.def(s1, scc), src,
5759 bld.sop1(aco_opcode::s_ff1_i32_b64, bld.def(s1), Operand(exec, s2)));
5760 bool_to_vector_condition(ctx, emit_wqm(ctx, tmp), dst);
5761 } else if (src.regClass() == s1) {
5762 bld.sop1(aco_opcode::s_mov_b32, Definition(dst), src);
5763 } else if (src.regClass() == s2) {
5764 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src);
5765 } else {
5766 fprintf(stderr, "Unimplemented NIR instr bit size: ");
5767 nir_print_instr(&instr->instr, stderr);
5768 fprintf(stderr, "\n");
5769 }
5770 break;
5771 }
5772 case nir_intrinsic_vote_all: {
5773 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
5774 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5775 assert(src.regClass() == s2);
5776 assert(dst.regClass() == s2);
5777
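/* vote_all is true iff no active lane carries a false bit, i.e. (exec & ~src) == 0:
 * s_andn2 sets SCC when that intersection is non-empty, and s_cselect then
 * produces all-zeros or all-ones accordingly. */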
5778 Temp tmp = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.def(s1, scc), Operand(exec, s2), src).def(1).getTemp();
5779 Temp val = bld.sop2(aco_opcode::s_cselect_b64, bld.def(s2), Operand(0u), Operand(-1u), bld.scc(tmp));
5780 emit_wqm(ctx, val, dst);
5781 break;
5782 }
5783 case nir_intrinsic_vote_any: {
5784 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
5785 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5786 assert(src.regClass() == s2);
5787 assert(dst.regClass() == s2);
5788
5789 Temp tmp = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), Operand(exec, s2), src).def(1).getTemp();
5790 Temp val = bld.sop2(aco_opcode::s_cselect_b64, bld.def(s2), Operand(-1u), Operand(0u), bld.scc(tmp));
5791 emit_wqm(ctx, val, dst);
5792 break;
5793 }
5794 case nir_intrinsic_reduce:
5795 case nir_intrinsic_inclusive_scan:
5796 case nir_intrinsic_exclusive_scan: {
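/* A cluster_size of 0 means "whole subgroup"; it is clamped to 64 and rounded
 * up to a power of two. Boolean sources are reduced directly on the lane mask
 * by remapping the op onto iand/ior/ixor. Everything else becomes a
 * p_reduce/p_*_scan pseudo instruction whose scratch operands and definitions
 * are filled in later by aco_reduce_assign.cpp. */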
5797 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
5798 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5799 nir_op op = (nir_op) nir_intrinsic_reduction_op(instr);
5800 unsigned cluster_size = instr->intrinsic == nir_intrinsic_reduce ?
5801 nir_intrinsic_cluster_size(instr) : 0;
5802 cluster_size = util_next_power_of_two(MIN2(cluster_size ? cluster_size : 64, 64));
5803
5804 if (!ctx->divergent_vals[instr->src[0].ssa->index] && (op == nir_op_ior || op == nir_op_iand)) {
5805 emit_uniform_subgroup(ctx, instr, src);
5806 } else if (instr->dest.ssa.bit_size == 1) {
5807 if (op == nir_op_imul || op == nir_op_umin || op == nir_op_imin)
5808 op = nir_op_iand;
5809 else if (op == nir_op_iadd)
5810 op = nir_op_ixor;
5811 else if (op == nir_op_umax || op == nir_op_imax)
5812 op = nir_op_ior;
5813 assert(op == nir_op_iand || op == nir_op_ior || op == nir_op_ixor);
5814
5815 switch (instr->intrinsic) {
5816 case nir_intrinsic_reduce:
5817 emit_wqm(ctx, emit_boolean_reduce(ctx, op, cluster_size, src), dst);
5818 break;
5819 case nir_intrinsic_exclusive_scan:
5820 emit_wqm(ctx, emit_boolean_exclusive_scan(ctx, op, src), dst);
5821 break;
5822 case nir_intrinsic_inclusive_scan:
5823 emit_wqm(ctx, emit_boolean_inclusive_scan(ctx, op, src), dst);
5824 break;
5825 default:
5826 assert(false);
5827 }
5828 } else if (cluster_size == 1) {
5829 bld.copy(Definition(dst), src);
5830 } else {
5831 src = as_vgpr(ctx, src);
5832
5833 ReduceOp reduce_op;
5834 switch (op) {
5835 #define CASE(name) case nir_op_##name: reduce_op = (src.regClass() == v1) ? name##32 : name##64; break;
5836 CASE(iadd)
5837 CASE(imul)
5838 CASE(fadd)
5839 CASE(fmul)
5840 CASE(imin)
5841 CASE(umin)
5842 CASE(fmin)
5843 CASE(imax)
5844 CASE(umax)
5845 CASE(fmax)
5846 CASE(iand)
5847 CASE(ior)
5848 CASE(ixor)
5849 default:
5850 unreachable("unknown reduction op");
5851 #undef CASE
5852 }
5853
5854 aco_opcode aco_op;
5855 switch (instr->intrinsic) {
5856 case nir_intrinsic_reduce: aco_op = aco_opcode::p_reduce; break;
5857 case nir_intrinsic_inclusive_scan: aco_op = aco_opcode::p_inclusive_scan; break;
5858 case nir_intrinsic_exclusive_scan: aco_op = aco_opcode::p_exclusive_scan; break;
5859 default:
5860 unreachable("unknown reduce intrinsic");
5861 }
5862
5863 aco_ptr<Pseudo_reduction_instruction> reduce{create_instruction<Pseudo_reduction_instruction>(aco_op, Format::PSEUDO_REDUCTION, 3, 5)};
5864 reduce->operands[0] = Operand(src);
5865 // filled in by aco_reduce_assign.cpp, used internally as part of the
5866 // reduce sequence
5867 assert(dst.size() == 1 || dst.size() == 2);
5868 reduce->operands[1] = Operand(RegClass(RegType::vgpr, dst.size()).as_linear());
5869 reduce->operands[2] = Operand(v1.as_linear());
5870
5871 Temp tmp_dst = bld.tmp(dst.regClass());
5872 reduce->definitions[0] = Definition(tmp_dst);
5873 reduce->definitions[1] = bld.def(s2); // used internally
5874 reduce->definitions[2] = Definition();
5875 reduce->definitions[3] = Definition(scc, s1);
5876 reduce->definitions[4] = Definition();
5877 reduce->reduce_op = reduce_op;
5878 reduce->cluster_size = cluster_size;
5879 ctx->block->instructions.emplace_back(std::move(reduce));
5880
5881 emit_wqm(ctx, tmp_dst, dst);
5882 }
5883 break;
5884 }
5885 case nir_intrinsic_quad_broadcast: {
5886 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
5887 if (!ctx->divergent_vals[instr->dest.ssa.index]) {
5888 emit_uniform_subgroup(ctx, instr, src);
5889 } else {
5890 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5891 unsigned lane = nir_src_as_const_value(instr->src[1])->u32;
5892 if (instr->dest.ssa.bit_size == 1) {
5893 assert(src.regClass() == s2);
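/* Boolean quad-broadcast on the lane mask: mask_tmp has the bit of lane 'lane'
 * set in every quad (half_mask duplicated into both 32-bit halves). AND-ing it
 * with (src & exec) keeps only that lane's value per quad, and s_wqm_b64 then
 * replicates any set bit to all four lanes of its quad. */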
5894 uint32_t half_mask = 0x11111111u << lane;
5895 Temp mask_tmp = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(half_mask), Operand(half_mask));
5896 Temp tmp = bld.tmp(s2);
5897 bld.sop1(aco_opcode::s_wqm_b64, Definition(tmp),
5898 bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), mask_tmp,
5899 bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2))));
5900 emit_wqm(ctx, tmp, dst);
5901 } else if (instr->dest.ssa.bit_size == 32) {
5902 emit_wqm(ctx,
5903 bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src,
5904 dpp_quad_perm(lane, lane, lane, lane)),
5905 dst);
5906 } else if (instr->dest.ssa.bit_size == 64) {
5907 Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
5908 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
5909 lo = emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), lo, dpp_quad_perm(lane, lane, lane, lane)));
5910 hi = emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), hi, dpp_quad_perm(lane, lane, lane, lane)));
5911 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
5912 emit_split_vector(ctx, dst, 2);
5913 } else {
5914 fprintf(stderr, "Unimplemented NIR instr bit size: ");
5915 nir_print_instr(&instr->instr, stderr);
5916 fprintf(stderr, "\n");
5917 }
5918 }
5919 break;
5920 }
5921 case nir_intrinsic_quad_swap_horizontal:
5922 case nir_intrinsic_quad_swap_vertical:
5923 case nir_intrinsic_quad_swap_diagonal:
5924 case nir_intrinsic_quad_swizzle_amd: {
5925 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
5926 if (!ctx->divergent_vals[instr->dest.ssa.index]) {
5927 emit_uniform_subgroup(ctx, instr, src);
5928 break;
5929 }
5930 uint16_t dpp_ctrl = 0;
5931 switch (instr->intrinsic) {
5932 case nir_intrinsic_quad_swap_horizontal:
5933 dpp_ctrl = dpp_quad_perm(1, 0, 3, 2);
5934 break;
5935 case nir_intrinsic_quad_swap_vertical:
5936 dpp_ctrl = dpp_quad_perm(2, 3, 0, 1);
5937 break;
5938 case nir_intrinsic_quad_swap_diagonal:
5939 dpp_ctrl = dpp_quad_perm(3, 2, 1, 0);
5940 break;
5941 case nir_intrinsic_quad_swizzle_amd: {
5942 dpp_ctrl = nir_intrinsic_swizzle_mask(instr);
5943 break;
5944 }
5945 default:
5946 break;
5947 }
5948
5949 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5950 if (instr->dest.ssa.bit_size == 1) {
5951 assert(src.regClass() == s2);
5952 src = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), Operand((uint32_t)-1), src);
5953 src = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl);
5954 Temp tmp = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(s2), Operand(0u), src);
5955 emit_wqm(ctx, tmp, dst);
5956 } else if (instr->dest.ssa.bit_size == 32) {
5957 Temp tmp = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl);
5958 emit_wqm(ctx, tmp, dst);
5959 } else if (instr->dest.ssa.bit_size == 64) {
5960 Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
5961 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
5962 lo = emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), lo, dpp_ctrl));
5963 hi = emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), hi, dpp_ctrl));
5964 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
5965 emit_split_vector(ctx, dst, 2);
5966 } else {
5967 fprintf(stderr, "Unimplemented NIR instr bit size: ");
5968 nir_print_instr(&instr->instr, stderr);
5969 fprintf(stderr, "\n");
5970 }
5971 break;
5972 }
5973 case nir_intrinsic_masked_swizzle_amd: {
5974 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
5975 if (!ctx->divergent_vals[instr->dest.ssa.index]) {
5976 emit_uniform_subgroup(ctx, instr, src);
5977 break;
5978 }
5979 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5980 uint32_t mask = nir_intrinsic_swizzle_mask(instr);
5981 if (dst.regClass() == v1) {
5982 emit_wqm(ctx,
5983 bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, mask, 0, false),
5984 dst);
5985 } else if (dst.regClass() == v2) {
5986 Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
5987 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
5988 lo = emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), lo, mask, 0, false));
5989 hi = emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), hi, mask, 0, false));
5990 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
5991 emit_split_vector(ctx, dst, 2);
5992 } else {
5993 fprintf(stderr, "Unimplemented NIR instr bit size: ");
5994 nir_print_instr(&instr->instr, stderr);
5995 fprintf(stderr, "\n");
5996 }
5997 break;
5998 }
5999 case nir_intrinsic_write_invocation_amd: {
6000 Temp src = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
6001 Temp val = bld.as_uniform(get_ssa_temp(ctx, instr->src[1].ssa));
6002 Temp lane = bld.as_uniform(get_ssa_temp(ctx, instr->src[2].ssa));
6003 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6004 if (dst.regClass() == v1) {
6005 /* src2 is ignored for writelane. RA assigns the same reg for dst */
6006 emit_wqm(ctx, bld.vop3(aco_opcode::v_writelane_b32, bld.def(v1), val, lane, src), dst);
6007 } else if (dst.regClass() == v2) {
6008 Temp src_lo = bld.tmp(v1), src_hi = bld.tmp(v1);
6009 Temp val_lo = bld.tmp(s1), val_hi = bld.tmp(s1);
6010 bld.pseudo(aco_opcode::p_split_vector, Definition(src_lo), Definition(src_hi), src);
6011 bld.pseudo(aco_opcode::p_split_vector, Definition(val_lo), Definition(val_hi), val);
6012 Temp lo = emit_wqm(ctx, bld.vop3(aco_opcode::v_writelane_b32, bld.def(v1), val_lo, lane, src_lo));
6013 Temp hi = emit_wqm(ctx, bld.vop3(aco_opcode::v_writelane_b32, bld.def(v1), val_hi, lane, src_hi));
6014 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
6015 emit_split_vector(ctx, dst, 2);
6016 } else {
6017 fprintf(stderr, "Unimplemented NIR instr bit size: ");
6018 nir_print_instr(&instr->instr, stderr);
6019 fprintf(stderr, "\n");
6020 }
6021 break;
6022 }
6023 case nir_intrinsic_mbcnt_amd: {
6024 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
6025 RegClass rc = RegClass(src.type(), 1);
6026 Temp mask_lo = bld.tmp(rc), mask_hi = bld.tmp(rc);
6027 bld.pseudo(aco_opcode::p_split_vector, Definition(mask_lo), Definition(mask_hi), src);
6028 Temp tmp = bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, bld.def(v1), mask_lo, Operand(0u));
6029 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6030 Temp wqm_tmp = bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32, bld.def(v1), mask_hi, tmp);
6031 emit_wqm(ctx, wqm_tmp, dst);
6032 break;
6033 }
6034 case nir_intrinsic_load_helper_invocation: {
6035 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6036 bld.pseudo(aco_opcode::p_load_helper, Definition(dst));
6037 ctx->block->kind |= block_kind_needs_lowering;
6038 ctx->program->needs_exact = true;
6039 break;
6040 }
6041 case nir_intrinsic_is_helper_invocation: {
6042 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6043 bld.pseudo(aco_opcode::p_is_helper, Definition(dst));
6044 ctx->block->kind |= block_kind_needs_lowering;
6045 ctx->program->needs_exact = true;
6046 break;
6047 }
6048 case nir_intrinsic_demote:
6049 bld.pseudo(aco_opcode::p_demote_to_helper);
6050 ctx->block->kind |= block_kind_uses_demote;
6051 ctx->program->needs_exact = true;
6052 break;
6053 case nir_intrinsic_demote_if: {
6054 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
6055 assert(src.regClass() == s2);
6056 Temp cond = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2));
6057 bld.pseudo(aco_opcode::p_demote_to_helper, cond);
6058 ctx->block->kind |= block_kind_uses_demote;
6059 ctx->program->needs_exact = true;
6060 break;
6061 }
6062 case nir_intrinsic_first_invocation: {
6063 emit_wqm(ctx, bld.sop1(aco_opcode::s_ff1_i32_b64, bld.def(s1), Operand(exec, s2)),
6064 get_ssa_temp(ctx, &instr->dest.ssa));
6065 break;
6066 }
6067 case nir_intrinsic_shader_clock:
6068 bld.smem(aco_opcode::s_memtime, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), false);
6069 emit_split_vector(ctx, get_ssa_temp(ctx, &instr->dest.ssa), 2);
6070 break;
6071 case nir_intrinsic_load_vertex_id_zero_base: {
6072 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6073 bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.vertex_id));
6074 break;
6075 }
6076 case nir_intrinsic_load_first_vertex: {
6077 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6078 bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.base_vertex));
6079 break;
6080 }
6081 case nir_intrinsic_load_base_instance: {
6082 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6083 bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.start_instance));
6084 break;
6085 }
6086 case nir_intrinsic_load_instance_id: {
6087 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6088 bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.instance_id));
6089 break;
6090 }
6091 case nir_intrinsic_load_draw_id: {
6092 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6093 bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.draw_id));
6094 break;
6095 }
6096 default:
6097 fprintf(stderr, "Unimplemented intrinsic instr: ");
6098 nir_print_instr(&instr->instr, stderr);
6099 fprintf(stderr, "\n");
6100 abort();
6101
6102 break;
6103 }
6104 }
6105
6106
6107 void tex_fetch_ptrs(isel_context *ctx, nir_tex_instr *instr,
6108 Temp *res_ptr, Temp *samp_ptr, Temp *fmask_ptr,
6109 enum glsl_base_type *stype)
6110 {
6111 nir_deref_instr *texture_deref_instr = NULL;
6112 nir_deref_instr *sampler_deref_instr = NULL;
6113 int plane = -1;
6114
6115 for (unsigned i = 0; i < instr->num_srcs; i++) {
6116 switch (instr->src[i].src_type) {
6117 case nir_tex_src_texture_deref:
6118 texture_deref_instr = nir_src_as_deref(instr->src[i].src);
6119 break;
6120 case nir_tex_src_sampler_deref:
6121 sampler_deref_instr = nir_src_as_deref(instr->src[i].src);
6122 break;
6123 case nir_tex_src_plane:
6124 plane = nir_src_as_int(instr->src[i].src);
6125 break;
6126 default:
6127 break;
6128 }
6129 }
6130
6131 *stype = glsl_get_sampler_result_type(texture_deref_instr->type);
6132
6133 if (!sampler_deref_instr)
6134 sampler_deref_instr = texture_deref_instr;
6135
6136 if (plane >= 0) {
6137 assert(instr->op != nir_texop_txf_ms &&
6138 instr->op != nir_texop_samples_identical);
6139 assert(instr->sampler_dim != GLSL_SAMPLER_DIM_BUF);
6140 *res_ptr = get_sampler_desc(ctx, texture_deref_instr, (aco_descriptor_type)(ACO_DESC_PLANE_0 + plane), instr, false, false);
6141 } else if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) {
6142 *res_ptr = get_sampler_desc(ctx, texture_deref_instr, ACO_DESC_BUFFER, instr, false, false);
6143 } else {
6144 *res_ptr = get_sampler_desc(ctx, texture_deref_instr, ACO_DESC_IMAGE, instr, false, false);
6145 }
6146 if (samp_ptr) {
6147 *samp_ptr = get_sampler_desc(ctx, sampler_deref_instr, ACO_DESC_SAMPLER, instr, false, false);
6148 if (instr->sampler_dim < GLSL_SAMPLER_DIM_RECT && ctx->options->chip_class < GFX8) {
6149 fprintf(stderr, "Unimplemented sampler descriptor: ");
6150 nir_print_instr(&instr->instr, stderr);
6151 fprintf(stderr, "\n");
6152 abort();
6153 // TODO: build samp_ptr = and(samp_ptr, res_ptr)
6154 }
6155 }
6156 if (fmask_ptr && (instr->op == nir_texop_txf_ms ||
6157 instr->op == nir_texop_samples_identical))
6158 *fmask_ptr = get_sampler_desc(ctx, texture_deref_instr, ACO_DESC_FMASK, instr, false, false);
6159 }
6160
6161 void build_cube_select(isel_context *ctx, Temp ma, Temp id, Temp deriv,
6162 Temp *out_ma, Temp *out_sc, Temp *out_tc)
6163 {
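/* Rough equivalent of the derivative selection in ac_prepare_cube_coords():
 * given the face id produced by v_cubeid_f32 (0/1 = ±X, 2/3 = ±Y, 4/5 = ±Z),
 * pick which derivative components map to the face's sc/tc/ma axes and apply
 * the matching sign flips. The comparisons against 2.0 and 4.0 below classify
 * the major axis from the face id. */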
6164 Builder bld(ctx->program, ctx->block);
6165
6166 Temp deriv_x = emit_extract_vector(ctx, deriv, 0, v1);
6167 Temp deriv_y = emit_extract_vector(ctx, deriv, 1, v1);
6168 Temp deriv_z = emit_extract_vector(ctx, deriv, 2, v1);
6169
6170 Operand neg_one(0xbf800000u);
6171 Operand one(0x3f800000u);
6172 Operand two(0x40000000u);
6173 Operand four(0x40800000u);
6174
6175 Temp is_ma_positive = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(s2)), Operand(0u), ma);
6176 Temp sgn_ma = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), neg_one, one, is_ma_positive);
6177 Temp neg_sgn_ma = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), Operand(0u), sgn_ma);
6178
6179 Temp is_ma_z = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(s2)), four, id);
6180 Temp is_ma_y = bld.vopc(aco_opcode::v_cmp_le_f32, bld.def(s2), two, id);
6181 is_ma_y = bld.sop2(aco_opcode::s_andn2_b64, bld.hint_vcc(bld.def(s2)), is_ma_y, is_ma_z);
6182 Temp is_not_ma_x = bld.sop2(aco_opcode::s_or_b64, bld.hint_vcc(bld.def(s2)), bld.def(s1, scc), is_ma_z, is_ma_y);
6183
6184 // select sc
6185 Temp tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), deriv_z, deriv_x, is_not_ma_x);
6186 Temp sgn = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1),
6187 bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), neg_sgn_ma, sgn_ma, is_ma_z),
6188 one, is_ma_y);
6189 *out_sc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), tmp, sgn);
6190
6191 // select tc
6192 tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), deriv_y, deriv_z, is_ma_y);
6193 sgn = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), neg_one, sgn_ma, is_ma_y);
6194 *out_tc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), tmp, sgn);
6195
6196 // select ma
6197 tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
6198 bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), deriv_x, deriv_y, is_ma_y),
6199 deriv_z, is_ma_z);
6200 tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7fffffffu), tmp);
6201 *out_ma = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), two, tmp);
6202 }
6203
6204 void prepare_cube_coords(isel_context *ctx, Temp* coords, Temp* ddx, Temp* ddy, bool is_deriv, bool is_array)
6205 {
6206 Builder bld(ctx->program, ctx->block);
6207 Temp coord_args[4], ma, tc, sc, id;
6208 for (unsigned i = 0; i < (is_array ? 4 : 3); i++)
6209 coord_args[i] = emit_extract_vector(ctx, *coords, i, v1);
6210
6211 if (is_array) {
6212 coord_args[3] = bld.vop1(aco_opcode::v_rndne_f32, bld.def(v1), coord_args[3]);
6213
6214 // see comment in ac_prepare_cube_coords()
6215 if (ctx->options->chip_class <= GFX8)
6216 coord_args[3] = bld.vop2(aco_opcode::v_max_f32, bld.def(v1), Operand(0u), coord_args[3]);
6217 }
6218
6219 ma = bld.vop3(aco_opcode::v_cubema_f32, bld.def(v1), coord_args[0], coord_args[1], coord_args[2]);
6220
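/* invma = 1 / |ma|: v_rcp_f32 is emitted through the VOP3 encoding so the
 * abs input modifier can be applied to its operand. */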
6221 aco_ptr<VOP3A_instruction> vop3a{create_instruction<VOP3A_instruction>(aco_opcode::v_rcp_f32, asVOP3(Format::VOP1), 1, 1)};
6222 vop3a->operands[0] = Operand(ma);
6223 vop3a->abs[0] = true;
6224 Temp invma = bld.tmp(v1);
6225 vop3a->definitions[0] = Definition(invma);
6226 ctx->block->instructions.emplace_back(std::move(vop3a));
6227
6228 sc = bld.vop3(aco_opcode::v_cubesc_f32, bld.def(v1), coord_args[0], coord_args[1], coord_args[2]);
6229 if (!is_deriv)
6230 sc = bld.vop2(aco_opcode::v_madak_f32, bld.def(v1), sc, invma, Operand(0x3fc00000u/*1.5*/));
6231
6232 tc = bld.vop3(aco_opcode::v_cubetc_f32, bld.def(v1), coord_args[0], coord_args[1], coord_args[2]);
6233 if (!is_deriv)
6234 tc = bld.vop2(aco_opcode::v_madak_f32, bld.def(v1), tc, invma, Operand(0x3fc00000u/*1.5*/));
6235
6236 id = bld.vop3(aco_opcode::v_cubeid_f32, bld.def(v1), coord_args[0], coord_args[1], coord_args[2]);
6237
6238 if (is_deriv) {
6239 sc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), sc, invma);
6240 tc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), tc, invma);
6241
6242 for (unsigned i = 0; i < 2; i++) {
6243 // see comment in ac_prepare_cube_coords()
6244 Temp deriv_ma;
6245 Temp deriv_sc, deriv_tc;
6246 build_cube_select(ctx, ma, id, i ? *ddy : *ddx,
6247 &deriv_ma, &deriv_sc, &deriv_tc);
6248
6249 deriv_ma = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_ma, invma);
6250
6251 Temp x = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1),
6252 bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_sc, invma),
6253 bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_ma, sc));
6254 Temp y = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1),
6255 bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_tc, invma),
6256 bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_ma, tc));
6257 *(i ? ddy : ddx) = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), x, y);
6258 }
6259
6260 sc = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), Operand(0x3fc00000u/*1.5*/), sc);
6261 tc = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), Operand(0x3fc00000u/*1.5*/), tc);
6262 }
6263
6264 if (is_array)
6265 id = bld.vop2(aco_opcode::v_madmk_f32, bld.def(v1), coord_args[3], id, Operand(0x41000000u/*8.0*/));
6266 *coords = bld.pseudo(aco_opcode::p_create_vector, bld.def(v3), sc, tc, id);
6267
6268 }
6269
6270 Temp apply_round_slice(isel_context *ctx, Temp coords, unsigned idx)
6271 {
6272 Temp coord_vec[3];
6273 for (unsigned i = 0; i < coords.size(); i++)
6274 coord_vec[i] = emit_extract_vector(ctx, coords, i, v1);
6275
6276 Builder bld(ctx->program, ctx->block);
6277 coord_vec[idx] = bld.vop1(aco_opcode::v_rndne_f32, bld.def(v1), coord_vec[idx]);
6278
6279 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, coords.size(), 1)};
6280 for (unsigned i = 0; i < coords.size(); i++)
6281 vec->operands[i] = Operand(coord_vec[i]);
6282 Temp res = bld.tmp(RegType::vgpr, coords.size());
6283 vec->definitions[0] = Definition(res);
6284 ctx->block->instructions.emplace_back(std::move(vec));
6285 return res;
6286 }
6287
6288 void get_const_vec(nir_ssa_def *vec, nir_const_value *cv[4])
6289 {
6290 if (vec->parent_instr->type != nir_instr_type_alu)
6291 return;
6292 nir_alu_instr *vec_instr = nir_instr_as_alu(vec->parent_instr);
6293 if (vec_instr->op != nir_op_vec(vec->num_components))
6294 return;
6295
6296 for (unsigned i = 0; i < vec->num_components; i++) {
6297 cv[i] = vec_instr->src[i].swizzle[0] == 0 ?
6298 nir_src_as_const_value(vec_instr->src[i].src) : NULL;
6299 }
6300 }
6301
6302 void visit_tex(isel_context *ctx, nir_tex_instr *instr)
6303 {
6304 Builder bld(ctx->program, ctx->block);
6305 bool has_bias = false, has_lod = false, level_zero = false, has_compare = false,
6306 has_offset = false, has_ddx = false, has_ddy = false, has_derivs = false, has_sample_index = false;
6307 Temp resource, sampler, fmask_ptr, bias = Temp(), coords, compare = Temp(), sample_index = Temp(),
6308 lod = Temp(), offset = Temp(), ddx = Temp(), ddy = Temp(), derivs = Temp();
6309 nir_const_value *sample_index_cv = NULL;
6310 nir_const_value *const_offset[4] = {NULL, NULL, NULL, NULL};
6311 enum glsl_base_type stype;
6312 tex_fetch_ptrs(ctx, instr, &resource, &sampler, &fmask_ptr, &stype);
6313
6314 bool tg4_integer_workarounds = ctx->options->chip_class <= GFX8 && instr->op == nir_texop_tg4 &&
6315 (stype == GLSL_TYPE_UINT || stype == GLSL_TYPE_INT);
6316 bool tg4_integer_cube_workaround = tg4_integer_workarounds &&
6317 instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE;
6318
6319 for (unsigned i = 0; i < instr->num_srcs; i++) {
6320 switch (instr->src[i].src_type) {
6321 case nir_tex_src_coord:
6322 coords = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[i].src.ssa));
6323 break;
6324 case nir_tex_src_bias:
6325 if (instr->op == nir_texop_txb) {
6326 bias = get_ssa_temp(ctx, instr->src[i].src.ssa);
6327 has_bias = true;
6328 }
6329 break;
6330 case nir_tex_src_lod: {
6331 nir_const_value *val = nir_src_as_const_value(instr->src[i].src);
6332
6333 if (val && val->f32 <= 0.0) {
6334 level_zero = true;
6335 } else {
6336 lod = get_ssa_temp(ctx, instr->src[i].src.ssa);
6337 has_lod = true;
6338 }
6339 break;
6340 }
6341 case nir_tex_src_comparator:
6342 if (instr->is_shadow) {
6343 compare = get_ssa_temp(ctx, instr->src[i].src.ssa);
6344 has_compare = true;
6345 }
6346 break;
6347 case nir_tex_src_offset:
6348 offset = get_ssa_temp(ctx, instr->src[i].src.ssa);
6349 get_const_vec(instr->src[i].src.ssa, const_offset);
6350 has_offset = true;
6351 break;
6352 case nir_tex_src_ddx:
6353 ddx = get_ssa_temp(ctx, instr->src[i].src.ssa);
6354 has_ddx = true;
6355 break;
6356 case nir_tex_src_ddy:
6357 ddy = get_ssa_temp(ctx, instr->src[i].src.ssa);
6358 has_ddy = true;
6359 break;
6360 case nir_tex_src_ms_index:
6361 sample_index = get_ssa_temp(ctx, instr->src[i].src.ssa);
6362 sample_index_cv = nir_src_as_const_value(instr->src[i].src);
6363 has_sample_index = true;
6364 break;
6365 case nir_tex_src_texture_offset:
6366 case nir_tex_src_sampler_offset:
6367 default:
6368 break;
6369 }
6370 }
6371 // TODO: all other cases: structure taken from ac_nir_to_llvm.c
6372 if (instr->op == nir_texop_txs && instr->sampler_dim == GLSL_SAMPLER_DIM_BUF)
6373 return get_buffer_size(ctx, resource, get_ssa_temp(ctx, &instr->dest.ssa), true);
6374
6375 if (instr->op == nir_texop_texture_samples) {
6376 Temp dword3 = emit_extract_vector(ctx, resource, 3, s1);
6377
6378 Temp samples_log2 = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), dword3, Operand(16u | 4u<<16));
6379 Temp samples = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), Operand(1u), samples_log2);
6380 Temp type = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), dword3, Operand(28u | 4u<<16 /* offset=28, width=4 */));
6381 Temp is_msaa = bld.sopc(aco_opcode::s_cmp_ge_u32, bld.def(s1, scc), type, Operand(14u));
6382
6383 bld.sop2(aco_opcode::s_cselect_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
6384 samples, Operand(1u), bld.scc(is_msaa));
6385 return;
6386 }
6387
6388 if (has_offset && instr->op != nir_texop_txf && instr->op != nir_texop_txf_ms) {
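/* For the image_sample_*_o opcodes the texel offsets are packed into a single
 * dword, 6 bits per component with component i at bit 8*i. Constant components
 * are folded into pack_const; the remaining ones are masked, shifted and OR'd
 * together, on the SALU or VALU depending on where the offset lives. */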
6389 aco_ptr<Instruction> tmp_instr;
6390 Temp acc, pack = Temp();
6391
6392 uint32_t pack_const = 0;
6393 for (unsigned i = 0; i < offset.size(); i++) {
6394 if (!const_offset[i])
6395 continue;
6396 pack_const |= (const_offset[i]->u32 & 0x3Fu) << (8u * i);
6397 }
6398
6399 if (offset.type() == RegType::sgpr) {
6400 for (unsigned i = 0; i < offset.size(); i++) {
6401 if (const_offset[i])
6402 continue;
6403
6404 acc = emit_extract_vector(ctx, offset, i, s1);
6405 acc = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), acc, Operand(0x3Fu));
6406
6407 if (i) {
6408 acc = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), acc, Operand(8u * i));
6409 }
6410
6411 if (pack == Temp()) {
6412 pack = acc;
6413 } else {
6414 pack = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), pack, acc);
6415 }
6416 }
6417
6418 if (pack_const && pack != Temp())
6419 pack = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), Operand(pack_const), pack);
6420 } else {
6421 for (unsigned i = 0; i < offset.size(); i++) {
6422 if (const_offset[i])
6423 continue;
6424
6425 acc = emit_extract_vector(ctx, offset, i, v1);
6426 acc = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x3Fu), acc);
6427
6428 if (i) {
6429 acc = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(8u * i), acc);
6430 }
6431
6432 if (pack == Temp()) {
6433 pack = acc;
6434 } else {
6435 pack = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), pack, acc);
6436 }
6437 }
6438
6439 if (pack_const && pack != Temp())
6440 pack = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand(pack_const), pack);
6441 }
6442 if (pack_const && pack == Temp())
6443 offset = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(pack_const));
6444 else if (pack == Temp())
6445 has_offset = false;
6446 else
6447 offset = pack;
6448 }
6449
6450 if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE && instr->coord_components)
6451 prepare_cube_coords(ctx, &coords, &ddx, &ddy, instr->op == nir_texop_txd, instr->is_array && instr->op != nir_texop_lod);
6452
6453 /* pack derivatives */
6454 if (has_ddx || has_ddy) {
6455 if (instr->sampler_dim == GLSL_SAMPLER_DIM_1D && ctx->options->chip_class == GFX9) {
6456 derivs = bld.pseudo(aco_opcode::p_create_vector, bld.def(v4),
6457 ddx, Operand(0u), ddy, Operand(0u));
6458 } else {
6459 derivs = bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, ddx.size() + ddy.size()), ddx, ddy);
6460 }
6461 has_derivs = true;
6462 }
6463
6464 if (instr->coord_components > 1 &&
6465 instr->sampler_dim == GLSL_SAMPLER_DIM_1D &&
6466 instr->is_array &&
6467 instr->op != nir_texop_txf)
6468 coords = apply_round_slice(ctx, coords, 1);
6469
6470 if (instr->coord_components > 2 &&
6471 (instr->sampler_dim == GLSL_SAMPLER_DIM_2D ||
6472 instr->sampler_dim == GLSL_SAMPLER_DIM_MS ||
6473 instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS ||
6474 instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS) &&
6475 instr->is_array &&
6476 instr->op != nir_texop_txf && instr->op != nir_texop_txf_ms)
6477 coords = apply_round_slice(ctx, coords, 2);
6478
6479 if (ctx->options->chip_class == GFX9 &&
6480 instr->sampler_dim == GLSL_SAMPLER_DIM_1D &&
6481 instr->op != nir_texop_lod && instr->coord_components) {
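/* GFX9 addresses 1D images as 2D, so an extra y coordinate is inserted:
 * 0.5 (the texel center) for filtered ops, 0 for texel fetches. */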
6482 assert(coords.size() > 0 && coords.size() < 3);
6483
6484 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, coords.size() + 1, 1)};
6485 vec->operands[0] = Operand(emit_extract_vector(ctx, coords, 0, v1));
6486 vec->operands[1] = instr->op == nir_texop_txf ? Operand((uint32_t) 0) : Operand((uint32_t) 0x3f000000);
6487 if (coords.size() > 1)
6488 vec->operands[2] = Operand(emit_extract_vector(ctx, coords, 1, v1));
6489 coords = bld.tmp(RegType::vgpr, coords.size() + 1);
6490 vec->definitions[0] = Definition(coords);
6491 ctx->block->instructions.emplace_back(std::move(vec));
6492 }
6493
6494 bool da = should_declare_array(ctx, instr->sampler_dim, instr->is_array);
6495
6496 if (instr->op == nir_texop_samples_identical)
6497 resource = fmask_ptr;
6498
6499 else if ((instr->sampler_dim == GLSL_SAMPLER_DIM_MS ||
6500 instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS) &&
6501 instr->op != nir_texop_txs) {
6502 assert(has_sample_index);
6503 Operand op(sample_index);
6504 if (sample_index_cv)
6505 op = Operand(sample_index_cv->u32);
6506 sample_index = adjust_sample_index_using_fmask(ctx, da, coords, op, fmask_ptr);
6507 }
6508
6509 if (has_offset && (instr->op == nir_texop_txf || instr->op == nir_texop_txf_ms)) {
6510 Temp split_coords[coords.size()];
6511 emit_split_vector(ctx, coords, coords.size());
6512 for (unsigned i = 0; i < coords.size(); i++)
6513 split_coords[i] = emit_extract_vector(ctx, coords, i, v1);
6514
6515 unsigned i = 0;
6516 for (; i < std::min(offset.size(), instr->coord_components); i++) {
6517 Temp off = emit_extract_vector(ctx, offset, i, v1);
6518 split_coords[i] = bld.vadd32(bld.def(v1), split_coords[i], off);
6519 }
6520
6521 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, coords.size(), 1)};
6522 for (unsigned i = 0; i < coords.size(); i++)
6523 vec->operands[i] = Operand(split_coords[i]);
6524 coords = bld.tmp(coords.regClass());
6525 vec->definitions[0] = Definition(coords);
6526 ctx->block->instructions.emplace_back(std::move(vec));
6527
6528 has_offset = false;
6529 }
6530
6531 /* Build tex instruction */
6532 unsigned dmask = nir_ssa_def_components_read(&instr->dest.ssa);
6533 unsigned dim = ctx->options->chip_class >= GFX10 && instr->sampler_dim != GLSL_SAMPLER_DIM_BUF
6534 ? ac_get_sampler_dim(ctx->options->chip_class, instr->sampler_dim, instr->is_array)
6535 : 0;
6536 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6537 Temp tmp_dst = dst;
6538
6539 /* gather4 selects the component by dmask and always returns vec4 */
6540 if (instr->op == nir_texop_tg4) {
6541 assert(instr->dest.ssa.num_components == 4);
6542 if (instr->is_shadow)
6543 dmask = 1;
6544 else
6545 dmask = 1 << instr->component;
6546 if (tg4_integer_cube_workaround || dst.type() == RegType::sgpr)
6547 tmp_dst = bld.tmp(v4);
6548 } else if (instr->op == nir_texop_samples_identical) {
6549 tmp_dst = bld.tmp(v1);
6550 } else if (util_bitcount(dmask) != instr->dest.ssa.num_components || dst.type() == RegType::sgpr) {
6551 tmp_dst = bld.tmp(RegClass(RegType::vgpr, util_bitcount(dmask)));
6552 }
6553
6554 aco_ptr<MIMG_instruction> tex;
6555 if (instr->op == nir_texop_txs || instr->op == nir_texop_query_levels) {
6556 if (!has_lod)
6557 lod = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0u));
6558
6559 bool div_by_6 = instr->op == nir_texop_txs &&
6560 instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE &&
6561 instr->is_array &&
6562 (dmask & (1 << 2));
6563 if (tmp_dst.id() == dst.id() && div_by_6)
6564 tmp_dst = bld.tmp(tmp_dst.regClass());
6565
6566 tex.reset(create_instruction<MIMG_instruction>(aco_opcode::image_get_resinfo, Format::MIMG, 2, 1));
6567 tex->operands[0] = Operand(as_vgpr(ctx,lod));
6568 tex->operands[1] = Operand(resource);
6569 if (ctx->options->chip_class == GFX9 &&
6570 instr->op == nir_texop_txs &&
6571 instr->sampler_dim == GLSL_SAMPLER_DIM_1D &&
6572 instr->is_array) {
6573 tex->dmask = (dmask & 0x1) | ((dmask & 0x2) << 1);
6574 } else if (instr->op == nir_texop_query_levels) {
6575 tex->dmask = 1 << 3;
6576 } else {
6577 tex->dmask = dmask;
6578 }
6579 tex->da = da;
6580 tex->definitions[0] = Definition(tmp_dst);
6581 tex->dim = dim;
6582 tex->can_reorder = true;
6583 ctx->block->instructions.emplace_back(std::move(tex));
6584
6585 if (div_by_6) {
6586 /* divide 3rd value by 6 by multiplying with magic number */
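/* 0x2AAAAAAB is roughly 2^32 / 6, so v_mul_hi_i32(x, 0x2AAAAAAB) yields x / 6
 * exactly for the small multiples of 6 returned here, e.g. (12 * 0x2AAAAAAB) >> 32 == 2. */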
6587 emit_split_vector(ctx, tmp_dst, tmp_dst.size());
6588 Temp c = bld.copy(bld.def(s1), Operand((uint32_t) 0x2AAAAAAB));
6589 Temp by_6 = bld.vop3(aco_opcode::v_mul_hi_i32, bld.def(v1), emit_extract_vector(ctx, tmp_dst, 2, v1), c);
6590 assert(instr->dest.ssa.num_components == 3);
6591 Temp tmp = dst.type() == RegType::vgpr ? dst : bld.tmp(v3);
6592 tmp_dst = bld.pseudo(aco_opcode::p_create_vector, Definition(tmp),
6593 emit_extract_vector(ctx, tmp_dst, 0, v1),
6594 emit_extract_vector(ctx, tmp_dst, 1, v1),
6595 by_6);
6596
6597 }
6598
6599 expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, dmask);
6600 return;
6601 }
6602
6603 Temp tg4_compare_cube_wa64 = Temp();
6604
6605 if (tg4_integer_workarounds) {
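/* GFX8 and older select the wrong texel for gather4 on integer formats. As in
 * ac_nir_to_llvm.c's lower_gather4_integer(), a resinfo query provides the
 * texture size so the coordinates can be offset by -0.5/size (half a texel),
 * and for cube maps the descriptor's num_format is additionally rewritten to a
 * *SCALED format, with the result converted back to an integer afterwards. */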
6606 tex.reset(create_instruction<MIMG_instruction>(aco_opcode::image_get_resinfo, Format::MIMG, 2, 1));
6607 tex->operands[0] = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0u));
6608 tex->operands[1] = Operand(resource);
6609 tex->dim = dim;
6610 tex->dmask = 0x3;
6611 tex->da = da;
6612 Temp size = bld.tmp(v2);
6613 tex->definitions[0] = Definition(size);
6614 tex->can_reorder = true;
6615 ctx->block->instructions.emplace_back(std::move(tex));
6616 emit_split_vector(ctx, size, size.size());
6617
6618 Temp half_texel[2];
6619 for (unsigned i = 0; i < 2; i++) {
6620 half_texel[i] = emit_extract_vector(ctx, size, i, v1);
6621 half_texel[i] = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), half_texel[i]);
6622 half_texel[i] = bld.vop1(aco_opcode::v_rcp_iflag_f32, bld.def(v1), half_texel[i]);
6623 half_texel[i] = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0xbf000000/*-0.5*/), half_texel[i]);
6624 }
6625
6626 Temp orig_coords[2] = {
6627 emit_extract_vector(ctx, coords, 0, v1),
6628 emit_extract_vector(ctx, coords, 1, v1)};
6629 Temp new_coords[2] = {
6630 bld.vop2(aco_opcode::v_add_f32, bld.def(v1), orig_coords[0], half_texel[0]),
6631 bld.vop2(aco_opcode::v_add_f32, bld.def(v1), orig_coords[1], half_texel[1])
6632 };
6633
6634 if (tg4_integer_cube_workaround) {
6635 // see comment in ac_nir_to_llvm.c's lower_gather4_integer()
6636 Temp desc[resource.size()];
6637 aco_ptr<Instruction> split{create_instruction<Pseudo_instruction>(aco_opcode::p_split_vector,
6638 Format::PSEUDO, 1, resource.size())};
6639 split->operands[0] = Operand(resource);
6640 for (unsigned i = 0; i < resource.size(); i++) {
6641 desc[i] = bld.tmp(s1);
6642 split->definitions[i] = Definition(desc[i]);
6643 }
6644 ctx->block->instructions.emplace_back(std::move(split));
6645
6646 Temp dfmt = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), desc[1], Operand(20u | (6u << 16)));
6647 Temp compare_cube_wa = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), dfmt,
6648 Operand((uint32_t)V_008F14_IMG_DATA_FORMAT_8_8_8_8));
6649
6650 Temp nfmt;
6651 if (stype == GLSL_TYPE_UINT) {
6652 nfmt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1),
6653 Operand((uint32_t)V_008F14_IMG_NUM_FORMAT_USCALED),
6654 Operand((uint32_t)V_008F14_IMG_NUM_FORMAT_UINT),
6655 bld.scc(compare_cube_wa));
6656 } else {
6657 nfmt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1),
6658 Operand((uint32_t)V_008F14_IMG_NUM_FORMAT_SSCALED),
6659 Operand((uint32_t)V_008F14_IMG_NUM_FORMAT_SINT),
6660 bld.scc(compare_cube_wa));
6661 }
6662 tg4_compare_cube_wa64 = bld.tmp(s2);
6663 bool_to_vector_condition(ctx, compare_cube_wa, tg4_compare_cube_wa64);
6664
6665 nfmt = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), nfmt, Operand(26u));
6666
6667 desc[1] = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), desc[1],
6668 Operand((uint32_t)C_008F14_NUM_FORMAT));
6669 desc[1] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), desc[1], nfmt);
6670
6671 aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector,
6672 Format::PSEUDO, resource.size(), 1)};
6673 for (unsigned i = 0; i < resource.size(); i++)
6674 vec->operands[i] = Operand(desc[i]);
6675 resource = bld.tmp(resource.regClass());
6676 vec->definitions[0] = Definition(resource);
6677 ctx->block->instructions.emplace_back(std::move(vec));
6678
6679 new_coords[0] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
6680 new_coords[0], orig_coords[0], tg4_compare_cube_wa64);
6681 new_coords[1] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
6682 new_coords[1], orig_coords[1], tg4_compare_cube_wa64);
6683 }
6684
6685 if (coords.size() == 3) {
6686 coords = bld.pseudo(aco_opcode::p_create_vector, bld.def(v3),
6687 new_coords[0], new_coords[1],
6688 emit_extract_vector(ctx, coords, 2, v1));
6689 } else {
6690 assert(coords.size() == 2);
6691 coords = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2),
6692 new_coords[0], new_coords[1]);
6693 }
6694 }
6695
6696 std::vector<Operand> args;
6697 if (has_offset)
6698 args.emplace_back(Operand(offset));
6699 if (has_bias)
6700 args.emplace_back(Operand(bias));
6701 if (has_compare)
6702 args.emplace_back(Operand(compare));
6703 if (has_derivs)
6704 args.emplace_back(Operand(derivs));
6705 args.emplace_back(Operand(coords));
6706 if (has_sample_index)
6707 args.emplace_back(Operand(sample_index));
6708 if (has_lod)
6709 args.emplace_back(lod);
6710
6711 Temp arg;
6712 if (args.size() > 1) {
6713 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, args.size(), 1)};
6714 unsigned size = 0;
6715 for (unsigned i = 0; i < args.size(); i++) {
6716 size += args[i].size();
6717 vec->operands[i] = args[i];
6718 }
6719 RegClass rc = RegClass(RegType::vgpr, size);
6720 Temp tmp = bld.tmp(rc);
6721 vec->definitions[0] = Definition(tmp);
6722 ctx->block->instructions.emplace_back(std::move(vec));
6723 arg = tmp;
6724 } else {
6725 assert(args[0].isTemp());
6726 arg = as_vgpr(ctx, args[0].getTemp());
6727 }
6728
6729 /* we don't need the bias, sample index, compare value or offset to be
6730 * computed in WQM but if the p_create_vector copies the coordinates, then it
6731 * needs to be in WQM */
6732 if (!(has_ddx && has_ddy) && !has_lod && !level_zero &&
6733 instr->sampler_dim != GLSL_SAMPLER_DIM_MS &&
6734 instr->sampler_dim != GLSL_SAMPLER_DIM_SUBPASS_MS)
6735 arg = emit_wqm(ctx, arg, bld.tmp(arg.regClass()), true);
6736
6737 if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) {
6738 //FIXME: if (ctx->abi->gfx9_stride_size_workaround) return ac_build_buffer_load_format_gfx9_safe()
6739
6740 assert(coords.size() == 1);
6741 unsigned last_bit = util_last_bit(nir_ssa_def_components_read(&instr->dest.ssa));
6742 aco_opcode op;
6743 switch (last_bit) {
6744 case 1:
6745 op = aco_opcode::buffer_load_format_x; break;
6746 case 2:
6747 op = aco_opcode::buffer_load_format_xy; break;
6748 case 3:
6749 op = aco_opcode::buffer_load_format_xyz; break;
6750 case 4:
6751 op = aco_opcode::buffer_load_format_xyzw; break;
6752 default:
6753 unreachable("Tex instruction loads more than 4 components.");
6754 }
6755
6756 /* if the instruction return value matches exactly the nir dest ssa, we can use it directly */
6757 if (last_bit == instr->dest.ssa.num_components && dst.type() == RegType::vgpr)
6758 tmp_dst = dst;
6759 else
6760 tmp_dst = bld.tmp(RegType::vgpr, last_bit);
6761
6762 aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)};
6763 mubuf->operands[0] = Operand(coords);
6764 mubuf->operands[1] = Operand(resource);
6765 mubuf->operands[2] = Operand((uint32_t) 0);
6766 mubuf->definitions[0] = Definition(tmp_dst);
6767 mubuf->idxen = true;
6768 mubuf->can_reorder = true;
6769 ctx->block->instructions.emplace_back(std::move(mubuf));
6770
6771 expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, (1 << last_bit) - 1);
6772 return;
6773 }
6774
6775
6776 if (instr->op == nir_texop_txf ||
6777 instr->op == nir_texop_txf_ms ||
6778 instr->op == nir_texop_samples_identical) {
6779 aco_opcode op = level_zero || instr->sampler_dim == GLSL_SAMPLER_DIM_MS ? aco_opcode::image_load : aco_opcode::image_load_mip;
6780 tex.reset(create_instruction<MIMG_instruction>(op, Format::MIMG, 2, 1));
6781 tex->operands[0] = Operand(arg);
6782 tex->operands[1] = Operand(resource);
6783 tex->dim = dim;
6784 tex->dmask = dmask;
6785 tex->unrm = true;
6786 tex->da = da;
6787 tex->definitions[0] = Definition(tmp_dst);
6788 tex->can_reorder = true;
6789 ctx->block->instructions.emplace_back(std::move(tex));
6790
6791 if (instr->op == nir_texop_samples_identical) {
6792 assert(dmask == 1 && dst.regClass() == v1);
6793 assert(dst.id() != tmp_dst.id());
6794
6795 Temp tmp = bld.tmp(s2);
6796 bld.vopc(aco_opcode::v_cmp_eq_u32, Definition(tmp), Operand(0u), tmp_dst).def(0).setHint(vcc);
6797 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand((uint32_t)-1), tmp);
6798
6799 } else {
6800 expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, dmask);
6801 }
6802 return;
6803 }
6804
6805 // TODO: would be better to do this by adding offsets, but needs the opcodes ordered.
6806 aco_opcode opcode = aco_opcode::image_sample;
6807 if (has_offset) { /* image_sample_*_o */
6808 if (has_compare) {
6809 opcode = aco_opcode::image_sample_c_o;
6810 if (has_derivs)
6811 opcode = aco_opcode::image_sample_c_d_o;
6812 if (has_bias)
6813 opcode = aco_opcode::image_sample_c_b_o;
6814 if (level_zero)
6815 opcode = aco_opcode::image_sample_c_lz_o;
6816 if (has_lod)
6817 opcode = aco_opcode::image_sample_c_l_o;
6818 } else {
6819 opcode = aco_opcode::image_sample_o;
6820 if (has_derivs)
6821 opcode = aco_opcode::image_sample_d_o;
6822 if (has_bias)
6823 opcode = aco_opcode::image_sample_b_o;
6824 if (level_zero)
6825 opcode = aco_opcode::image_sample_lz_o;
6826 if (has_lod)
6827 opcode = aco_opcode::image_sample_l_o;
6828 }
6829 } else { /* no offset */
6830 if (has_compare) {
6831 opcode = aco_opcode::image_sample_c;
6832 if (has_derivs)
6833 opcode = aco_opcode::image_sample_c_d;
6834 if (has_bias)
6835 opcode = aco_opcode::image_sample_c_b;
6836 if (level_zero)
6837 opcode = aco_opcode::image_sample_c_lz;
6838 if (has_lod)
6839 opcode = aco_opcode::image_sample_c_l;
6840 } else {
6841 opcode = aco_opcode::image_sample;
6842 if (has_derivs)
6843 opcode = aco_opcode::image_sample_d;
6844 if (has_bias)
6845 opcode = aco_opcode::image_sample_b;
6846 if (level_zero)
6847 opcode = aco_opcode::image_sample_lz;
6848 if (has_lod)
6849 opcode = aco_opcode::image_sample_l;
6850 }
6851 }
6852
6853 if (instr->op == nir_texop_tg4) {
6854 if (has_offset) {
6855 opcode = aco_opcode::image_gather4_lz_o;
6856 if (has_compare)
6857 opcode = aco_opcode::image_gather4_c_lz_o;
6858 } else {
6859 opcode = aco_opcode::image_gather4_lz;
6860 if (has_compare)
6861 opcode = aco_opcode::image_gather4_c_lz;
6862 }
6863 } else if (instr->op == nir_texop_lod) {
6864 opcode = aco_opcode::image_get_lod;
6865 }
6866
6867 tex.reset(create_instruction<MIMG_instruction>(opcode, Format::MIMG, 3, 1));
6868 tex->operands[0] = Operand(arg);
6869 tex->operands[1] = Operand(resource);
6870 tex->operands[2] = Operand(sampler);
6871 tex->dim = dim;
6872 tex->dmask = dmask;
6873 tex->da = da;
6874 tex->definitions[0] = Definition(tmp_dst);
6875 tex->can_reorder = true;
6876 ctx->block->instructions.emplace_back(std::move(tex));
6877
6878 if (tg4_integer_cube_workaround) {
6879 assert(tmp_dst.id() != dst.id());
6880 assert(tmp_dst.size() == dst.size() && dst.size() == 4);
6881
6882 emit_split_vector(ctx, tmp_dst, tmp_dst.size());
6883 Temp val[4];
6884 for (unsigned i = 0; i < dst.size(); i++) {
6885 val[i] = emit_extract_vector(ctx, tmp_dst, i, v1);
6886 Temp cvt_val;
6887 if (stype == GLSL_TYPE_UINT)
6888 cvt_val = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), val[i]);
6889 else
6890 cvt_val = bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), val[i]);
6891 val[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), val[i], cvt_val, tg4_compare_cube_wa64);
6892 }
6893 Temp tmp = dst.regClass() == v4 ? dst : bld.tmp(v4);
6894 tmp_dst = bld.pseudo(aco_opcode::p_create_vector, Definition(tmp),
6895 val[0], val[1], val[2], val[3]);
6896 }
6897 unsigned mask = instr->op == nir_texop_tg4 ? 0xF : dmask;
6898 expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, mask);
6899
6900 }
6901
6902
6903 Operand get_phi_operand(isel_context *ctx, nir_ssa_def *ssa)
6904 {
6905 Temp tmp = get_ssa_temp(ctx, ssa);
6906 if (ssa->parent_instr->type == nir_instr_type_ssa_undef)
6907 return Operand(tmp.regClass());
6908 else
6909 return Operand(tmp);
6910 }
6911
6912 void visit_phi(isel_context *ctx, nir_phi_instr *instr)
6913 {
6914 aco_ptr<Pseudo_instruction> phi;
6915 unsigned num_src = exec_list_length(&instr->srcs);
6916 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6917 assert(instr->dest.ssa.bit_size != 1 || dst.regClass() == s2);
6918
6919 aco_opcode opcode = !dst.is_linear() || ctx->divergent_vals[instr->dest.ssa.index] ? aco_opcode::p_phi : aco_opcode::p_linear_phi;
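/* Divergent values, and anything that is not a linear (SGPR) register class,
 * use a logical phi whose operands come from the logical predecessors;
 * uniform SGPR values use a linear phi over the linear CFG. */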
6920
6921 std::map<unsigned, nir_ssa_def*> phi_src;
6922 bool all_undef = true;
6923 nir_foreach_phi_src(src, instr) {
6924 phi_src[src->pred->index] = src->src.ssa;
6925 if (src->src.ssa->parent_instr->type != nir_instr_type_ssa_undef)
6926 all_undef = false;
6927 }
6928 if (all_undef) {
6929 Builder bld(ctx->program, ctx->block);
6930 if (dst.regClass() == s1) {
6931 bld.sop1(aco_opcode::s_mov_b32, Definition(dst), Operand(0u));
6932 } else if (dst.regClass() == v1) {
6933 bld.vop1(aco_opcode::v_mov_b32, Definition(dst), Operand(0u));
6934 } else {
6935 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
6936 for (unsigned i = 0; i < dst.size(); i++)
6937 vec->operands[i] = Operand(0u);
6938 vec->definitions[0] = Definition(dst);
6939 ctx->block->instructions.emplace_back(std::move(vec));
6940 }
6941 return;
6942 }
6943
6944 /* try to scalarize vector phis */
6945 if (instr->dest.ssa.bit_size != 1 && dst.size() > 1) {
6946 // TODO: scalarize linear phis on divergent ifs
6947 bool can_scalarize = (opcode == aco_opcode::p_phi || !(ctx->block->kind & block_kind_merge));
6948 std::array<Temp, 4> new_vec;
6949 for (std::pair<const unsigned, nir_ssa_def*>& pair : phi_src) {
6950 Operand src = get_phi_operand(ctx, pair.second);
6951 if (src.isTemp() && ctx->allocated_vec.find(src.tempId()) == ctx->allocated_vec.end()) {
6952 can_scalarize = false;
6953 break;
6954 }
6955 }
6956 if (can_scalarize) {
6957 unsigned num_components = instr->dest.ssa.num_components;
6958 assert(dst.size() % num_components == 0);
6959 RegClass rc = RegClass(dst.type(), dst.size() / num_components);
6960
6961 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
6962 for (unsigned k = 0; k < num_components; k++) {
6963 phi.reset(create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, num_src, 1));
6964 std::map<unsigned, nir_ssa_def*>::iterator it = phi_src.begin();
6965 for (unsigned i = 0; i < num_src; i++) {
6966 Operand src = get_phi_operand(ctx, it->second);
6967 phi->operands[i] = src.isTemp() ? Operand(ctx->allocated_vec[src.tempId()][k]) : Operand(rc);
6968 ++it;
6969 }
6970 Temp phi_dst = {ctx->program->allocateId(), rc};
6971 phi->definitions[0] = Definition(phi_dst);
6972 ctx->block->instructions.emplace(ctx->block->instructions.begin(), std::move(phi));
6973 new_vec[k] = phi_dst;
6974 vec->operands[k] = Operand(phi_dst);
6975 }
6976 vec->definitions[0] = Definition(dst);
6977 ctx->block->instructions.emplace_back(std::move(vec));
6978 ctx->allocated_vec.emplace(dst.id(), new_vec);
6979 return;
6980 }
6981 }
6982
6983 unsigned extra_src = 0;
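/* If the block before this loop exit was a combined continue-or-break block,
 * the exit has one more linear predecessor than NIR knows about; the linear
 * phi then gets an extra undef operand for that edge (appended below). */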
6984 if (opcode == aco_opcode::p_linear_phi && (ctx->block->kind & block_kind_loop_exit) &&
6985 ctx->program->blocks[ctx->block->index-2].kind & block_kind_continue_or_break) {
6986 extra_src++;
6987 }
6988
6989 phi.reset(create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, num_src + extra_src, 1));
6990
6991 /* if we have a linear phi on a divergent if, we know that one src is undef */
6992 if (opcode == aco_opcode::p_linear_phi && ctx->block->kind & block_kind_merge) {
6993 assert(extra_src == 0);
6994 Block* block;
6995 /* we place the phi either in the invert-block or in the current block */
6996 if (phi_src.begin()->second->parent_instr->type != nir_instr_type_ssa_undef) {
6997 assert((++phi_src.begin())->second->parent_instr->type == nir_instr_type_ssa_undef);
6998 Block& linear_else = ctx->program->blocks[ctx->block->linear_preds[1]];
6999 block = &ctx->program->blocks[linear_else.linear_preds[0]];
7000 assert(block->kind & block_kind_invert);
7001 phi->operands[0] = get_phi_operand(ctx, phi_src.begin()->second);
7002 } else {
7003 assert((++phi_src.begin())->second->parent_instr->type != nir_instr_type_ssa_undef);
7004 block = ctx->block;
7005 phi->operands[0] = get_phi_operand(ctx, (++phi_src.begin())->second);
7006 }
7007 phi->operands[1] = Operand(dst.regClass());
7008 phi->definitions[0] = Definition(dst);
7009 block->instructions.emplace(block->instructions.begin(), std::move(phi));
7010 return;
7011 }
7012
7013 std::map<unsigned, nir_ssa_def*>::iterator it = phi_src.begin();
7014 for (unsigned i = 0; i < num_src; i++) {
7015 phi->operands[i] = get_phi_operand(ctx, it->second);
7016 ++it;
7017 }
7018 for (unsigned i = 0; i < extra_src; i++)
7019 phi->operands[num_src + i] = Operand(dst.regClass());
7020 phi->definitions[0] = Definition(dst);
7021 ctx->block->instructions.emplace(ctx->block->instructions.begin(), std::move(phi));
7022 }
7023
7024
7025 void visit_undef(isel_context *ctx, nir_ssa_undef_instr *instr)
7026 {
7027 Temp dst = get_ssa_temp(ctx, &instr->def);
7028
7029 assert(dst.type() == RegType::sgpr);
7030
7031 if (dst.size() == 1) {
7032 Builder(ctx->program, ctx->block).copy(Definition(dst), Operand(0u));
7033 } else {
7034 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
7035 for (unsigned i = 0; i < dst.size(); i++)
7036 vec->operands[i] = Operand(0u);
7037 vec->definitions[0] = Definition(dst);
7038 ctx->block->instructions.emplace_back(std::move(vec));
7039 }
7040 }
7041
7042 void visit_jump(isel_context *ctx, nir_jump_instr *instr)
7043 {
7044 Builder bld(ctx->program, ctx->block);
7045 Block *logical_target;
7046 append_logical_end(ctx->block);
7047 unsigned idx = ctx->block->index;
7048
7049 switch (instr->type) {
7050 case nir_jump_break:
7051 logical_target = ctx->cf_info.parent_loop.exit;
7052 add_logical_edge(idx, logical_target);
7053 ctx->block->kind |= block_kind_break;
7054
7055 if (!ctx->cf_info.parent_if.is_divergent &&
7056 !ctx->cf_info.parent_loop.has_divergent_continue) {
7057 /* uniform break - directly jump out of the loop */
7058 ctx->block->kind |= block_kind_uniform;
7059 ctx->cf_info.has_branch = true;
7060 bld.branch(aco_opcode::p_branch);
7061 add_linear_edge(idx, logical_target);
7062 return;
7063 }
7064 ctx->cf_info.parent_loop.has_divergent_branch = true;
7065 break;
7066 case nir_jump_continue:
7067 logical_target = &ctx->program->blocks[ctx->cf_info.parent_loop.header_idx];
7068 add_logical_edge(idx, logical_target);
7069 ctx->block->kind |= block_kind_continue;
7070
7071 if (ctx->cf_info.parent_if.is_divergent) {
7072 /* for potential uniform breaks after this continue,
7073 we must ensure that they are handled correctly */
7074 ctx->cf_info.parent_loop.has_divergent_continue = true;
7075 ctx->cf_info.parent_loop.has_divergent_branch = true;
7076 } else {
7077 /* uniform continue - directly jump to the loop header */
7078 ctx->block->kind |= block_kind_uniform;
7079 ctx->cf_info.has_branch = true;
7080 bld.branch(aco_opcode::p_branch);
7081 add_linear_edge(idx, logical_target);
7082 return;
7083 }
7084 break;
7085 default:
7086 fprintf(stderr, "Unknown NIR jump instr: ");
7087 nir_print_instr(&instr->instr, stderr);
7088 fprintf(stderr, "\n");
7089 abort();
7090 }
7091
7092 /* remove critical edges from linear CFG */
7093 bld.branch(aco_opcode::p_branch);
7094 Block* break_block = ctx->program->create_and_insert_block();
7095 break_block->loop_nest_depth = ctx->cf_info.loop_nest_depth;
7096 break_block->kind |= block_kind_uniform;
7097 add_linear_edge(idx, break_block);
7098 /* creating the break_block may have reallocated the block vector and invalidated the loop header pointer */
7099 if (instr->type == nir_jump_continue)
7100 logical_target = &ctx->program->blocks[ctx->cf_info.parent_loop.header_idx];
7101 add_linear_edge(break_block->index, logical_target);
7102 bld.reset(break_block);
7103 bld.branch(aco_opcode::p_branch);
7104
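/* instruction selection continues in a fresh block that is only connected through the linear CFG, since the jump ends the logical path */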
7105 Block* continue_block = ctx->program->create_and_insert_block();
7106 continue_block->loop_nest_depth = ctx->cf_info.loop_nest_depth;
7107 add_linear_edge(idx, continue_block);
7108 append_logical_start(continue_block);
7109 ctx->block = continue_block;
7110 return;
7111 }
7112
7113 void visit_block(isel_context *ctx, nir_block *block)
7114 {
7115 nir_foreach_instr(instr, block) {
7116 switch (instr->type) {
7117 case nir_instr_type_alu:
7118 visit_alu_instr(ctx, nir_instr_as_alu(instr));
7119 break;
7120 case nir_instr_type_load_const:
7121 visit_load_const(ctx, nir_instr_as_load_const(instr));
7122 break;
7123 case nir_instr_type_intrinsic:
7124 visit_intrinsic(ctx, nir_instr_as_intrinsic(instr));
7125 break;
7126 case nir_instr_type_tex:
7127 visit_tex(ctx, nir_instr_as_tex(instr));
7128 break;
7129 case nir_instr_type_phi:
7130 visit_phi(ctx, nir_instr_as_phi(instr));
7131 break;
7132 case nir_instr_type_ssa_undef:
7133 visit_undef(ctx, nir_instr_as_ssa_undef(instr));
7134 break;
7135 case nir_instr_type_deref:
7136 break;
7137 case nir_instr_type_jump:
7138 visit_jump(ctx, nir_instr_as_jump(instr));
7139 break;
7140 default:
7141 fprintf(stderr, "Unknown NIR instr type: ");
7142 nir_print_instr(instr, stderr);
7143 fprintf(stderr, "\n");
7144 //abort();
7145 }
7146 }
7147 }
7148
7149
7150
7151 static void visit_loop(isel_context *ctx, nir_loop *loop)
7152 {
7153 append_logical_end(ctx->block);
7154 ctx->block->kind |= block_kind_loop_preheader | block_kind_uniform;
7155 Builder bld(ctx->program, ctx->block);
7156 bld.branch(aco_opcode::p_branch);
7157 unsigned loop_preheader_idx = ctx->block->index;
7158
7159 Block loop_exit = Block();
7160 loop_exit.loop_nest_depth = ctx->cf_info.loop_nest_depth;
7161 loop_exit.kind |= (block_kind_loop_exit | (ctx->block->kind & block_kind_top_level));
7162
7163 Block* loop_header = ctx->program->create_and_insert_block();
7164 loop_header->loop_nest_depth = ctx->cf_info.loop_nest_depth + 1;
7165 loop_header->kind |= block_kind_loop_header;
7166 add_edge(loop_preheader_idx, loop_header);
7167 ctx->block = loop_header;
7168
7169 /* emit loop body */
7170 unsigned loop_header_idx = loop_header->index;
7171 loop_info_RAII loop_raii(ctx, loop_header_idx, &loop_exit);
7172 append_logical_start(ctx->block);
7173 visit_cf_list(ctx, &loop->body);
7174
7175 //TODO: what if a loop ends with an unconditional or uniformly branched continue and this branch is never taken?
7176 if (!ctx->cf_info.has_branch) {
7177 append_logical_end(ctx->block);
7178 if (ctx->cf_info.exec_potentially_empty) {
7179 /* Discards can result in code running with an empty exec mask.
7180 * This would mean that divergent breaks are never taken. As a
7181 * workaround, break out of the loop when the exec mask is empty
7182 * instead of unconditionally continuing. */
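/* blocks marked continue_or_break get their real branch later: jump to the loop exit if exec is empty, otherwise continue to the header */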
7183 ctx->block->kind |= (block_kind_continue_or_break | block_kind_uniform);
7184 unsigned block_idx = ctx->block->index;
7185
7186 /* create helper blocks to avoid critical edges */
7187 Block *break_block = ctx->program->create_and_insert_block();
7188 break_block->loop_nest_depth = ctx->cf_info.loop_nest_depth;
7189 break_block->kind = block_kind_uniform;
7190 bld.reset(break_block);
7191 bld.branch(aco_opcode::p_branch);
7192 add_linear_edge(block_idx, break_block);
7193 add_linear_edge(break_block->index, &loop_exit);
7194
7195 Block *continue_block = ctx->program->create_and_insert_block();
7196 continue_block->loop_nest_depth = ctx->cf_info.loop_nest_depth;
7197 continue_block->kind = block_kind_uniform;
7198 bld.reset(continue_block);
7199 bld.branch(aco_opcode::p_branch);
7200 add_linear_edge(block_idx, continue_block);
7201 add_linear_edge(continue_block->index, &ctx->program->blocks[loop_header_idx]);
7202
7203 add_logical_edge(block_idx, &ctx->program->blocks[loop_header_idx]);
7204 ctx->block = &ctx->program->blocks[block_idx];
7205 } else {
7206 ctx->block->kind |= (block_kind_continue | block_kind_uniform);
7207 if (!ctx->cf_info.parent_loop.has_divergent_branch)
7208 add_edge(ctx->block->index, &ctx->program->blocks[loop_header_idx]);
7209 else
7210 add_linear_edge(ctx->block->index, &ctx->program->blocks[loop_header_idx]);
7211 }
7212
7213 bld.reset(ctx->block);
7214 bld.branch(aco_opcode::p_branch);
7215 }
7216
7217 /* fixup phis in loop header from unreachable blocks */
7218 if (ctx->cf_info.has_branch || ctx->cf_info.parent_loop.has_divergent_branch) {
7219 bool linear = ctx->cf_info.has_branch;
7220 bool logical = ctx->cf_info.has_branch || ctx->cf_info.parent_loop.has_divergent_branch;
7221 for (aco_ptr<Instruction>& instr : ctx->program->blocks[loop_header_idx].instructions) {
7222 if ((logical && instr->opcode == aco_opcode::p_phi) ||
7223 (linear && instr->opcode == aco_opcode::p_linear_phi)) {
7224 /* the back-edge was not created, so the last operand (the one for the back-edge) has to be removed */
7225 instr->operands.pop_back();
7226 } else if (!is_phi(instr)) {
7227 break;
7228 }
7229 }
7230 }
7231
7232 ctx->cf_info.has_branch = false;
7233
7234 // TODO: if the loop does not have a single exit, we must add one
7235 /* emit loop successor block */
7236 ctx->block = ctx->program->insert_block(std::move(loop_exit));
7237 append_logical_start(ctx->block);
7238
7239 #if 0
7240 // TODO: check if it is beneficial to not branch on continues
7241 /* trim linear phis in loop header */
7242 for (auto&& instr : loop_entry->instructions) {
7243 if (instr->opcode == aco_opcode::p_linear_phi) {
7244 aco_ptr<Pseudo_instruction> new_phi{create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, loop_entry->linear_predecessors.size(), 1)};
7245 new_phi->definitions[0] = instr->definitions[0];
7246 for (unsigned i = 0; i < new_phi->operands.size(); i++)
7247 new_phi->operands[i] = instr->operands[i];
7248 /* check that the remaining operands are all the same */
7249 for (unsigned i = new_phi->operands.size(); i < instr->operands.size(); i++)
7250 assert(instr->operands[i].tempId() == instr->operands.back().tempId());
7251 instr.swap(new_phi);
7252 } else if (instr->opcode == aco_opcode::p_phi) {
7253 continue;
7254 } else {
7255 break;
7256 }
7257 }
7258 #endif
7259 }
7260
7261 static void begin_divergent_if_then(isel_context *ctx, if_context *ic, Temp cond)
7262 {
7263 ic->cond = cond;
7264
7265 append_logical_end(ctx->block);
7266 ctx->block->kind |= block_kind_branch;
7267
7268 /* branch to linear then block */
7269 assert(cond.regClass() == s2);
7270 aco_ptr<Pseudo_branch_instruction> branch;
7271 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_cbranch_z, Format::PSEUDO_BRANCH, 1, 0));
7272 branch->operands[0] = Operand(cond);
7273 ctx->block->instructions.push_back(std::move(branch));
7274
7275 ic->BB_if_idx = ctx->block->index;
7276 ic->BB_invert = Block();
7277 ic->BB_invert.loop_nest_depth = ctx->cf_info.loop_nest_depth;
7278 /* Invert blocks are intentionally not marked as top level because they
7279 * are not part of the logical cfg. */
7280 ic->BB_invert.kind |= block_kind_invert;
7281 ic->BB_endif = Block();
7282 ic->BB_endif.loop_nest_depth = ctx->cf_info.loop_nest_depth;
7283 ic->BB_endif.kind |= (block_kind_merge | (ctx->block->kind & block_kind_top_level));
7284
7285 ic->exec_potentially_empty_old = ctx->cf_info.exec_potentially_empty;
7286 ic->divergent_old = ctx->cf_info.parent_if.is_divergent;
7287 ctx->cf_info.parent_if.is_divergent = true;
7288 ctx->cf_info.exec_potentially_empty = false; /* divergent branches use cbranch_execz */
7289
7290 /** emit logical then block */
7291 Block* BB_then_logical = ctx->program->create_and_insert_block();
7292 BB_then_logical->loop_nest_depth = ctx->cf_info.loop_nest_depth;
7293 add_edge(ic->BB_if_idx, BB_then_logical);
7294 ctx->block = BB_then_logical;
7295 append_logical_start(BB_then_logical);
7296 }
7297
7298 static void begin_divergent_if_else(isel_context *ctx, if_context *ic)
7299 {
7300 Block *BB_then_logical = ctx->block;
7301 append_logical_end(BB_then_logical);
7302 /* branch from logical then block to invert block */
7303 aco_ptr<Pseudo_branch_instruction> branch;
7304 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
7305 BB_then_logical->instructions.emplace_back(std::move(branch));
7306 add_linear_edge(BB_then_logical->index, &ic->BB_invert);
7307 if (!ctx->cf_info.parent_loop.has_divergent_branch)
7308 add_logical_edge(BB_then_logical->index, &ic->BB_endif);
7309 BB_then_logical->kind |= block_kind_uniform;
7310 assert(!ctx->cf_info.has_branch);
7311 ic->then_branch_divergent = ctx->cf_info.parent_loop.has_divergent_branch;
7312 ctx->cf_info.parent_loop.has_divergent_branch = false;
7313
7314 /** emit linear then block */
7315 Block* BB_then_linear = ctx->program->create_and_insert_block();
7316 BB_then_linear->loop_nest_depth = ctx->cf_info.loop_nest_depth;
7317 BB_then_linear->kind |= block_kind_uniform;
7318 add_linear_edge(ic->BB_if_idx, BB_then_linear);
7319 /* branch from linear then block to invert block */
7320 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
7321 BB_then_linear->instructions.emplace_back(std::move(branch));
7322 add_linear_edge(BB_then_linear->index, &ic->BB_invert);
7323
7324 /** emit invert merge block */
7325 ctx->block = ctx->program->insert_block(std::move(ic->BB_invert));
7326 ic->invert_idx = ctx->block->index;
7327
7328 /* branch to linear else block (skip else) */
7329 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_cbranch_nz, Format::PSEUDO_BRANCH, 1, 0));
7330 branch->operands[0] = Operand(ic->cond);
7331 ctx->block->instructions.push_back(std::move(branch));
7332
7333 ic->exec_potentially_empty_old |= ctx->cf_info.exec_potentially_empty;
7334 ctx->cf_info.exec_potentially_empty = false; /* divergent branches use cbranch_execz */
7335
7336 /** emit logical else block */
7337 Block* BB_else_logical = ctx->program->create_and_insert_block();
7338 BB_else_logical->loop_nest_depth = ctx->cf_info.loop_nest_depth;
7339 add_logical_edge(ic->BB_if_idx, BB_else_logical);
7340 add_linear_edge(ic->invert_idx, BB_else_logical);
7341 ctx->block = BB_else_logical;
7342 append_logical_start(BB_else_logical);
7343 }
7344
7345 static void end_divergent_if(isel_context *ctx, if_context *ic)
7346 {
7347 Block *BB_else_logical = ctx->block;
7348 append_logical_end(BB_else_logical);
7349
7350 /* branch from logical else block to endif block */
7351 aco_ptr<Pseudo_branch_instruction> branch;
7352 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
7353 BB_else_logical->instructions.emplace_back(std::move(branch));
7354 add_linear_edge(BB_else_logical->index, &ic->BB_endif);
7355 if (!ctx->cf_info.parent_loop.has_divergent_branch)
7356 add_logical_edge(BB_else_logical->index, &ic->BB_endif);
7357 BB_else_logical->kind |= block_kind_uniform;
7358
7359 assert(!ctx->cf_info.has_branch);
7360 ctx->cf_info.parent_loop.has_divergent_branch &= ic->then_branch_divergent;
7361
7362
7363 /** emit linear else block */
7364 Block* BB_else_linear = ctx->program->create_and_insert_block();
7365 BB_else_linear->loop_nest_depth = ctx->cf_info.loop_nest_depth;
7366 BB_else_linear->kind |= block_kind_uniform;
7367 add_linear_edge(ic->invert_idx, BB_else_linear);
7368
7369 /* branch from linear else block to endif block */
7370 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
7371 BB_else_linear->instructions.emplace_back(std::move(branch));
7372 add_linear_edge(BB_else_linear->index, &ic->BB_endif);
7373
7374
7375 /** emit endif merge block */
7376 ctx->block = ctx->program->insert_block(std::move(ic->BB_endif));
7377 append_logical_start(ctx->block);
7378
7379
7380 ctx->cf_info.parent_if.is_divergent = ic->divergent_old;
7381 ctx->cf_info.exec_potentially_empty |= ic->exec_potentially_empty_old;
7382 /* uniform control flow never has an empty exec-mask */
7383 if (!ctx->cf_info.loop_nest_depth && !ctx->cf_info.parent_if.is_divergent)
7384 ctx->cf_info.exec_potentially_empty = false;
7385 }
7386
7387 static void visit_if(isel_context *ctx, nir_if *if_stmt)
7388 {
7389 Temp cond = get_ssa_temp(ctx, if_stmt->condition.ssa);
7390 Builder bld(ctx->program, ctx->block);
7391 aco_ptr<Pseudo_branch_instruction> branch;
7392
7393 if (!ctx->divergent_vals[if_stmt->condition.ssa->index]) { /* uniform condition */
7394 /**
7395 * Uniform conditionals are represented in the following way*) :
7396 *
7397 * The linear and logical CFG:
7398 * BB_IF
7399 * / \
7400 * BB_THEN (logical) BB_ELSE (logical)
7401 * \ /
7402 * BB_ENDIF
7403 *
7404 * *) Exceptions may be due to break and continue statements within loops
7405 * If a break/continue happens within uniform control flow, it branches
7406 * to the loop exit/entry block. Otherwise, it branches to the next
7407 * merge block.
7408 **/
7409 append_logical_end(ctx->block);
7410 ctx->block->kind |= block_kind_uniform;
7411
7412 /* emit branch */
7413 assert(cond.regClass() == s2);
7414 // TODO: in a post-RA optimizer, we could check if the condition is in VCC and omit this instruction
7415 cond = bool_to_scalar_condition(ctx, cond);
7416
7417 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_cbranch_z, Format::PSEUDO_BRANCH, 1, 0));
7418 branch->operands[0] = Operand(cond);
7419 branch->operands[0].setFixed(scc);
7420 ctx->block->instructions.emplace_back(std::move(branch));
7421
7422 unsigned BB_if_idx = ctx->block->index;
7423 Block BB_endif = Block();
7424 BB_endif.loop_nest_depth = ctx->cf_info.loop_nest_depth;
7425 BB_endif.kind |= ctx->block->kind & block_kind_top_level;
7426
7427 /** emit then block */
7428 Block* BB_then = ctx->program->create_and_insert_block();
7429 BB_then->loop_nest_depth = ctx->cf_info.loop_nest_depth;
7430 add_edge(BB_if_idx, BB_then);
7431 append_logical_start(BB_then);
7432 ctx->block = BB_then;
7433 visit_cf_list(ctx, &if_stmt->then_list);
7434 BB_then = ctx->block;
7435 bool then_branch = ctx->cf_info.has_branch;
7436 bool then_branch_divergent = ctx->cf_info.parent_loop.has_divergent_branch;
7437
7438 if (!then_branch) {
7439 append_logical_end(BB_then);
7440 /* branch from then block to endif block */
7441 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
7442 BB_then->instructions.emplace_back(std::move(branch));
7443 add_linear_edge(BB_then->index, &BB_endif);
7444 if (!then_branch_divergent)
7445 add_logical_edge(BB_then->index, &BB_endif);
7446 BB_then->kind |= block_kind_uniform;
7447 }
7448
7449 ctx->cf_info.has_branch = false;
7450 ctx->cf_info.parent_loop.has_divergent_branch = false;
7451
7452 /** emit else block */
7453 Block* BB_else = ctx->program->create_and_insert_block();
7454 BB_else->loop_nest_depth = ctx->cf_info.loop_nest_depth;
7455 add_edge(BB_if_idx, BB_else);
7456 append_logical_start(BB_else);
7457 ctx->block = BB_else;
7458 visit_cf_list(ctx, &if_stmt->else_list);
7459 BB_else = ctx->block;
7460
7461 if (!ctx->cf_info.has_branch) {
7462 append_logical_end(BB_else);
7463 /* branch from else block to endif block */
7464 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
7465 BB_else->instructions.emplace_back(std::move(branch));
7466 add_linear_edge(BB_else->index, &BB_endif);
7467 if (!ctx->cf_info.parent_loop.has_divergent_branch)
7468 add_logical_edge(BB_else->index, &BB_endif);
7469 BB_else->kind |= block_kind_uniform;
7470 }
7471
7472 ctx->cf_info.has_branch &= then_branch;
7473 ctx->cf_info.parent_loop.has_divergent_branch &= then_branch_divergent;
7474
7475 /** emit endif merge block */
7476 if (!ctx->cf_info.has_branch) {
7477 ctx->block = ctx->program->insert_block(std::move(BB_endif));
7478 append_logical_start(ctx->block);
7479 }
7480 } else { /* non-uniform condition */
7481 /**
7482 * To maintain a logical and linear CFG without critical edges,
7483 * non-uniform conditionals are represented in the following way*) :
7484 *
7485 * The linear CFG:
7486 * BB_IF
7487 * / \
7488 * BB_THEN (logical) BB_THEN (linear)
7489 * \ /
7490 * BB_INVERT (linear)
7491 * / \
7492 * BB_ELSE (logical) BB_ELSE (linear)
7493 * \ /
7494 * BB_ENDIF
7495 *
7496 * The logical CFG:
7497 * BB_IF
7498 * / \
7499 * BB_THEN (logical) BB_ELSE (logical)
7500 * \ /
7501 * BB_ENDIF
7502 *
7503 * *) Exceptions may be due to break and continue statements within loops
7504 **/
7505
7506 if_context ic;
7507
7508 begin_divergent_if_then(ctx, &ic, cond);
7509 visit_cf_list(ctx, &if_stmt->then_list);
7510
7511 begin_divergent_if_else(ctx, &ic);
7512 visit_cf_list(ctx, &if_stmt->else_list);
7513
7514 end_divergent_if(ctx, &ic);
7515 }
7516 }
7517
7518 static void visit_cf_list(isel_context *ctx,
7519 struct exec_list *list)
7520 {
7521 foreach_list_typed(nir_cf_node, node, node, list) {
7522 switch (node->type) {
7523 case nir_cf_node_block:
7524 visit_block(ctx, nir_cf_node_as_block(node));
7525 break;
7526 case nir_cf_node_if:
7527 visit_if(ctx, nir_cf_node_as_if(node));
7528 break;
7529 case nir_cf_node_loop:
7530 visit_loop(ctx, nir_cf_node_as_loop(node));
7531 break;
7532 default:
7533 unreachable("unimplemented cf list type");
7534 }
7535 }
7536 }
7537
7538 static void export_vs_varying(isel_context *ctx, int slot, bool is_pos, int *next_pos)
7539 {
7540 int offset = ctx->program->info->vs.outinfo.vs_output_param_offset[slot];
7541 uint64_t mask = ctx->vs_output.mask[slot];
7542 if (!is_pos && !mask)
7543 return;
7544 if (!is_pos && offset == AC_EXP_PARAM_UNDEFINED)
7545 return;
7546 aco_ptr<Export_instruction> exp{create_instruction<Export_instruction>(aco_opcode::exp, Format::EXP, 4, 0)};
7547 exp->enabled_mask = mask;
7548 for (unsigned i = 0; i < 4; ++i) {
7549 if (mask & (1 << i))
7550 exp->operands[i] = Operand(ctx->vs_output.outputs[slot][i]);
7551 else
7552 exp->operands[i] = Operand(v1);
7553 }
7554 exp->valid_mask = false;
7555 exp->done = false;
7556 exp->compressed = false;
7557 if (is_pos)
7558 exp->dest = V_008DFC_SQ_EXP_POS + (*next_pos)++;
7559 else
7560 exp->dest = V_008DFC_SQ_EXP_PARAM + offset;
7561 ctx->block->instructions.emplace_back(std::move(exp));
7562 }
7563
7564 static void export_vs_psiz_layer_viewport(isel_context *ctx, int *next_pos)
7565 {
7566 aco_ptr<Export_instruction> exp{create_instruction<Export_instruction>(aco_opcode::exp, Format::EXP, 4, 0)};
7567 exp->enabled_mask = 0;
7568 for (unsigned i = 0; i < 4; ++i)
7569 exp->operands[i] = Operand(v1);
7570 if (ctx->vs_output.mask[VARYING_SLOT_PSIZ]) {
7571 exp->operands[0] = Operand(ctx->vs_output.outputs[VARYING_SLOT_PSIZ][0]);
7572 exp->enabled_mask |= 0x1;
7573 }
7574 if (ctx->vs_output.mask[VARYING_SLOT_LAYER]) {
7575 exp->operands[2] = Operand(ctx->vs_output.outputs[VARYING_SLOT_LAYER][0]);
7576 exp->enabled_mask |= 0x4;
7577 }
7578 if (ctx->vs_output.mask[VARYING_SLOT_VIEWPORT]) {
7579 if (ctx->options->chip_class < GFX9) {
7580 exp->operands[3] = Operand(ctx->vs_output.outputs[VARYING_SLOT_VIEWPORT][0]);
7581 exp->enabled_mask |= 0x8;
7582 } else {
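/* GFX9+ packs the viewport index into the upper 16 bits of the layer export component */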
7583 Builder bld(ctx->program, ctx->block);
7584
7585 Temp out = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(16u),
7586 Operand(ctx->vs_output.outputs[VARYING_SLOT_VIEWPORT][0]));
7587 if (exp->operands[2].isTemp())
7588 out = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand(out), exp->operands[2]);
7589
7590 exp->operands[2] = Operand(out);
7591 exp->enabled_mask |= 0x4;
7592 }
7593 }
7594 exp->valid_mask = false;
7595 exp->done = false;
7596 exp->compressed = false;
7597 exp->dest = V_008DFC_SQ_EXP_POS + (*next_pos)++;
7598 ctx->block->instructions.emplace_back(std::move(exp));
7599 }
7600
7601 static void create_vs_exports(isel_context *ctx)
7602 {
7603 radv_vs_output_info *outinfo = &ctx->program->info->vs.outinfo;
7604
7605 if (outinfo->export_prim_id) {
7606 ctx->vs_output.mask[VARYING_SLOT_PRIMITIVE_ID] |= 0x1;
7607 ctx->vs_output.outputs[VARYING_SLOT_PRIMITIVE_ID][0] = get_arg(ctx, ctx->args->vs_prim_id);
7608 }
7609
7610 if (ctx->options->key.has_multiview_view_index) {
7611 ctx->vs_output.mask[VARYING_SLOT_LAYER] |= 0x1;
7612 ctx->vs_output.outputs[VARYING_SLOT_LAYER][0] = as_vgpr(ctx, get_arg(ctx, ctx->args->ac.view_index));
7613 }
7614
7615 /* the order in which these position exports are created is important */
7616 int next_pos = 0;
7617 export_vs_varying(ctx, VARYING_SLOT_POS, true, &next_pos);
7618 if (outinfo->writes_pointsize || outinfo->writes_layer || outinfo->writes_viewport_index) {
7619 export_vs_psiz_layer_viewport(ctx, &next_pos);
7620 }
7621 if (ctx->num_clip_distances + ctx->num_cull_distances > 0)
7622 export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST0, true, &next_pos);
7623 if (ctx->num_clip_distances + ctx->num_cull_distances > 4)
7624 export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST1, true, &next_pos);
7625
7626 if (ctx->options->key.vs_common_out.export_clip_dists) {
7627 if (ctx->num_clip_distances + ctx->num_cull_distances > 0)
7628 export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST0, false, &next_pos);
7629 if (ctx->num_clip_distances + ctx->num_cull_distances > 4)
7630 export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST1, false, &next_pos);
7631 }
7632
7633 for (unsigned i = 0; i <= VARYING_SLOT_VAR31; ++i) {
7634 if (i < VARYING_SLOT_VAR0 && i != VARYING_SLOT_LAYER &&
7635 i != VARYING_SLOT_PRIMITIVE_ID)
7636 continue;
7637
7638 export_vs_varying(ctx, i, false, NULL);
7639 }
7640 }
7641
7642 static void emit_stream_output(isel_context *ctx,
7643 Temp const *so_buffers,
7644 Temp const *so_write_offset,
7645 const struct radv_stream_output *output)
7646 {
7647 unsigned num_comps = util_bitcount(output->component_mask);
7648 unsigned loc = output->location;
7649 unsigned buf = output->buffer;
7650 unsigned offset = output->offset;
7651
7652 assert(num_comps && num_comps <= 4);
7653 if (!num_comps || num_comps > 4)
7654 return;
7655
7656 unsigned start = ffs(output->component_mask) - 1;
7657
7658 Temp out[4];
7659 bool all_undef = true;
7660 assert(ctx->stage == vertex_vs);
7661 for (unsigned i = 0; i < num_comps; i++) {
7662 out[i] = ctx->vs_output.outputs[loc][start + i];
7663 all_undef = all_undef && !out[i].id();
7664 }
7665 if (all_undef)
7666 return;
7667
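/* pack the written components into one vector; components missing from the output mask are stored as zero */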
7668 Temp write_data = {ctx->program->allocateId(), RegClass(RegType::vgpr, num_comps)};
7669 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, num_comps, 1)};
7670 for (unsigned i = 0; i < num_comps; ++i)
7671 vec->operands[i] = (ctx->vs_output.mask[loc] & 1 << i) ? Operand(out[i]) : Operand(0u);
7672 vec->definitions[0] = Definition(write_data);
7673 ctx->block->instructions.emplace_back(std::move(vec));
7674
7675 aco_opcode opcode;
7676 switch (num_comps) {
7677 case 1:
7678 opcode = aco_opcode::buffer_store_dword;
7679 break;
7680 case 2:
7681 opcode = aco_opcode::buffer_store_dwordx2;
7682 break;
7683 case 3:
7684 opcode = aco_opcode::buffer_store_dwordx3;
7685 break;
7686 case 4:
7687 opcode = aco_opcode::buffer_store_dwordx4;
7688 break;
7689 }
7690
7691 aco_ptr<MUBUF_instruction> store{create_instruction<MUBUF_instruction>(opcode, Format::MUBUF, 4, 0)};
7692 store->operands[0] = Operand(so_write_offset[buf]);
7693 store->operands[1] = Operand(so_buffers[buf]);
7694 store->operands[2] = Operand((uint32_t) 0);
7695 store->operands[3] = Operand(write_data);
7696 if (offset > 4095) {
7697 /* This probably can't happen in RADV, but maybe in GL? It's easy to handle anyway. */
7698 Builder bld(ctx->program, ctx->block);
7699 store->operands[0] = bld.vadd32(bld.def(v1), Operand(offset), Operand(so_write_offset[buf]));
7700 } else {
7701 store->offset = offset;
7702 }
7703 store->offen = true;
7704 store->glc = true;
7705 store->dlc = false;
7706 store->slc = true;
7707 store->can_reorder = true;
7708 ctx->block->instructions.emplace_back(std::move(store));
7709 }
7710
7711 static void emit_streamout(isel_context *ctx, unsigned stream)
7712 {
7713 Builder bld(ctx->program, ctx->block);
7714
7715 Temp so_buffers[4];
7716 Temp buf_ptr = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->streamout_buffers));
7717 for (unsigned i = 0; i < 4; i++) {
7718 unsigned stride = ctx->program->info->so.strides[i];
7719 if (!stride)
7720 continue;
7721
7722 so_buffers[i] = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), buf_ptr, Operand(i * 16u));
7723 }
7724
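/* extract the number of vertices to emit: s_bfe with 0x70010 reads 7 bits starting at bit 16 of streamout_config */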
7725 Temp so_vtx_count = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
7726 get_arg(ctx, ctx->args->streamout_config), Operand(0x70010u));
7727
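/* tid = index of this lane within the wave (mbcnt of all lower lanes) */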
7728 Temp tid = bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32, bld.def(v1), Operand((uint32_t) -1),
7729 bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, bld.def(v1), Operand((uint32_t) -1), Operand(0u)));
7730
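/* only lanes with tid < so_vtx_count have a vertex to write */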
7731 Temp can_emit = bld.vopc(aco_opcode::v_cmp_gt_i32, bld.def(s2), so_vtx_count, tid);
7732
7733 if_context ic;
7734 begin_divergent_if_then(ctx, &ic, can_emit);
7735
7736 bld.reset(ctx->block);
7737
7738 Temp so_write_index = bld.vadd32(bld.def(v1), get_arg(ctx, ctx->args->streamout_write_idx), tid);
7739
7740 Temp so_write_offset[4];
7741
7742 for (unsigned i = 0; i < 4; i++) {
7743 unsigned stride = ctx->program->info->so.strides[i];
7744 if (!stride)
7745 continue;
7746
7747 if (stride == 1) {
7748 Temp offset = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
7749 get_arg(ctx, ctx->args->streamout_write_idx),
7750 get_arg(ctx, ctx->args->streamout_offset[i]));
7751 Temp new_offset = bld.vadd32(bld.def(v1), offset, tid);
7752
7753 so_write_offset[i] = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), new_offset);
7754 } else {
7755 Temp offset = bld.v_mul_imm(bld.def(v1), so_write_index, stride * 4u);
7756 Temp offset2 = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(4u),
7757 get_arg(ctx, ctx->args->streamout_offset[i]));
7758 so_write_offset[i] = bld.vadd32(bld.def(v1), offset, offset2);
7759 }
7760 }
7761
7762 for (unsigned i = 0; i < ctx->program->info->so.num_outputs; i++) {
7763 struct radv_stream_output *output =
7764 &ctx->program->info->so.outputs[i];
7765 if (stream != output->stream)
7766 continue;
7767
7768 emit_stream_output(ctx, so_buffers, so_write_offset, output);
7769 }
7770
7771 begin_divergent_if_else(ctx, &ic);
7772 end_divergent_if(ctx, &ic);
7773 }
7774
7775 } /* end namespace */
7776
7777 void split_arguments(isel_context *ctx, Pseudo_instruction *startpgm)
7778 {
7779 /* Split all arguments except for the first (ring_offsets) and the last
7780 * (exec) so that the dead channels don't stay live throughout the program.
7781 */
7782 for (unsigned i = 1; i < startpgm->definitions.size() - 1; i++) {
7783 if (startpgm->definitions[i].regClass().size() > 1) {
7784 emit_split_vector(ctx, startpgm->definitions[i].getTemp(),
7785 startpgm->definitions[i].regClass().size());
7786 }
7787 }
7788 }
7789
7790 void handle_bc_optimize(isel_context *ctx)
7791 {
7792 /* needed when SPI_PS_IN_CONTROL.BC_OPTIMIZE_DISABLE is set to 0 */
7793 Builder bld(ctx->program, ctx->block);
7794 uint32_t spi_ps_input_ena = ctx->program->config->spi_ps_input_ena;
7795 bool uses_center = G_0286CC_PERSP_CENTER_ENA(spi_ps_input_ena) || G_0286CC_LINEAR_CENTER_ENA(spi_ps_input_ena);
7796 bool uses_centroid = G_0286CC_PERSP_CENTROID_ENA(spi_ps_input_ena) || G_0286CC_LINEAR_CENTROID_ENA(spi_ps_input_ena);
7797 ctx->persp_centroid = get_arg(ctx, ctx->args->ac.persp_centroid);
7798 ctx->linear_centroid = get_arg(ctx, ctx->args->ac.linear_centroid);
7799 if (uses_center && uses_centroid) {
7800 Temp sel = bld.vopc_e64(aco_opcode::v_cmp_lt_i32, bld.hint_vcc(bld.def(s2)),
7801 get_arg(ctx, ctx->args->ac.prim_mask), Operand(0u));
7802
7803 if (G_0286CC_PERSP_CENTROID_ENA(spi_ps_input_ena)) {
7804 Temp new_coord[2];
7805 for (unsigned i = 0; i < 2; i++) {
7806 Temp persp_centroid = emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.persp_centroid), i, v1);
7807 Temp persp_center = emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.persp_center), i, v1);
7808 new_coord[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
7809 persp_centroid, persp_center, sel);
7810 }
7811 ctx->persp_centroid = bld.tmp(v2);
7812 bld.pseudo(aco_opcode::p_create_vector, Definition(ctx->persp_centroid),
7813 Operand(new_coord[0]), Operand(new_coord[1]));
7814 emit_split_vector(ctx, ctx->persp_centroid, 2);
7815 }
7816
7817 if (G_0286CC_LINEAR_CENTROID_ENA(spi_ps_input_ena)) {
7818 Temp new_coord[2];
7819 for (unsigned i = 0; i < 2; i++) {
7820 Temp linear_centroid = emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.linear_centroid), i, v1);
7821 Temp linear_center = emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.linear_center), i, v1);
7822 new_coord[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
7823 linear_centroid, linear_center, sel);
7824 }
7825 ctx->linear_centroid = bld.tmp(v2);
7826 bld.pseudo(aco_opcode::p_create_vector, Definition(ctx->linear_centroid),
7827 Operand(new_coord[0]), Operand(new_coord[1]));
7828 emit_split_vector(ctx, ctx->linear_centroid, 2);
7829 }
7830 }
7831 }
7832
7833 void setup_fp_mode(isel_context *ctx, nir_shader *shader)
7834 {
7835 Program *program = ctx->program;
7836
7837 unsigned float_controls = shader->info.float_controls_execution_mode;
7838
7839 program->next_fp_mode.preserve_signed_zero_inf_nan32 =
7840 float_controls & FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP32;
7841 program->next_fp_mode.preserve_signed_zero_inf_nan16_64 =
7842 float_controls & (FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP16 |
7843 FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP64);
7844
7845 program->next_fp_mode.must_flush_denorms32 =
7846 float_controls & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP32;
7847 program->next_fp_mode.must_flush_denorms16_64 =
7848 float_controls & (FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16 |
7849 FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP64);
7850
7851 program->next_fp_mode.care_about_round32 =
7852 float_controls & (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32 | FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32);
7853
7854 program->next_fp_mode.care_about_round16_64 =
7855 float_controls & (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64 |
7856 FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64);
7857
7858 /* default to preserving fp16 and fp64 denorms, since it's free */
7859 if (program->next_fp_mode.must_flush_denorms16_64)
7860 program->next_fp_mode.denorm16_64 = 0;
7861 else
7862 program->next_fp_mode.denorm16_64 = fp_denorm_keep;
7863
7864 /* preserving fp32 denorms is expensive, so only do it if asked */
7865 if (float_controls & FLOAT_CONTROLS_DENORM_PRESERVE_FP32)
7866 program->next_fp_mode.denorm32 = fp_denorm_keep;
7867 else
7868 program->next_fp_mode.denorm32 = 0;
7869
7870 if (float_controls & FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32)
7871 program->next_fp_mode.round32 = fp_round_tz;
7872 else
7873 program->next_fp_mode.round32 = fp_round_ne;
7874
7875 if (float_controls & (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64))
7876 program->next_fp_mode.round16_64 = fp_round_tz;
7877 else
7878 program->next_fp_mode.round16_64 = fp_round_ne;
7879
7880 ctx->block->fp_mode = program->next_fp_mode;
7881 }
7882
7883 void select_program(Program *program,
7884 unsigned shader_count,
7885 struct nir_shader *const *shaders,
7886 ac_shader_config* config,
7887 struct radv_shader_args *args)
7888 {
7889 isel_context ctx = setup_isel_context(program, shader_count, shaders, config, args);
7890
7891 for (unsigned i = 0; i < shader_count; i++) {
7892 nir_shader *nir = shaders[i];
7893 init_context(&ctx, nir);
7894
7895 setup_fp_mode(&ctx, nir);
7896
7897 if (!i) {
7898 /* needs to be after init_context() for FS */
7899 Pseudo_instruction *startpgm = add_startpgm(&ctx);
7900 append_logical_start(ctx.block);
7901 split_arguments(&ctx, startpgm);
7902 }
7903
7904 if_context ic;
7905 if (shader_count >= 2) {
7906 Builder bld(ctx.program, ctx.block);
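/* merged_wave_info packs an 8-bit thread count per merged stage: only lanes below the count for stage i execute this stage's code */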
7907 Temp count = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), ctx.merged_wave_info, Operand((8u << 16) | (i * 8u)));
7908 Temp thread_id = bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32, bld.def(v1), Operand((uint32_t) -1),
7909 bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, bld.def(v1), Operand((uint32_t) -1), Operand(0u)));
7910 Temp cond = bld.vopc(aco_opcode::v_cmp_gt_u32, bld.hint_vcc(bld.def(s2)), count, thread_id);
7911
7912 begin_divergent_if_then(&ctx, &ic, cond);
7913 }
7914
7915 if (i) {
7916 Builder bld(ctx.program, ctx.block);
7917 bld.barrier(aco_opcode::p_memory_barrier_shared); //TODO: different barriers are needed for different stages
7918 bld.sopp(aco_opcode::s_barrier);
7919 }
7920
7921 if (ctx.stage == fragment_fs)
7922 handle_bc_optimize(&ctx);
7923
7924 nir_function_impl *func = nir_shader_get_entrypoint(nir);
7925 visit_cf_list(&ctx, &func->body);
7926
7927 if (ctx.program->info->so.num_outputs /* && !ctx->is_gs_copy_shader */)
7928 emit_streamout(&ctx, 0);
7929
7930 if (ctx.stage == vertex_vs)
7931 create_vs_exports(&ctx);
7932
7933 if (shader_count >= 2) {
7934 begin_divergent_if_else(&ctx, &ic);
7935 end_divergent_if(&ctx, &ic);
7936 }
7937
7938 ralloc_free(ctx.divergent_vals);
7939 }
7940
7941 program->config->float_mode = program->blocks[0].fp_mode.val;
7942
7943 append_logical_end(ctx.block);
7944 ctx.block->kind |= block_kind_uniform;
7945 Builder bld(ctx.program, ctx.block);
7946 if (ctx.program->wb_smem_l1_on_end)
7947 bld.smem(aco_opcode::s_dcache_wb, false);
7948 bld.sopp(aco_opcode::s_endpgm);
7949
7950 /* cleanup CFG */
7951 for (Block& BB : program->blocks) {
7952 for (unsigned idx : BB.linear_preds)
7953 program->blocks[idx].linear_succs.emplace_back(BB.index);
7954 for (unsigned idx : BB.logical_preds)
7955 program->blocks[idx].logical_succs.emplace_back(BB.index);
7956 }
7957 }
7958 }