aco: Optimize load_subgroup_id to one bit field extract instruction.
[mesa.git] / src / amd / compiler / aco_instruction_selection.cpp
1 /*
2 * Copyright © 2018 Valve Corporation
3 * Copyright © 2018 Google
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22 * IN THE SOFTWARE.
23 *
24 */
25
26 #include <algorithm>
27 #include <array>
28 #include <map>
29
30 #include "ac_shader_util.h"
31 #include "aco_ir.h"
32 #include "aco_builder.h"
33 #include "aco_interface.h"
34 #include "aco_instruction_selection_setup.cpp"
35 #include "util/fast_idiv_by_const.h"
36
37 namespace aco {
38 namespace {
39
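/* RAII helper: on construction it saves the current loop-related control flow
 * state (parent loop header/exit and divergence flags) and installs the state
 * for the new loop; on destruction it restores the previous state and
 * decrements the loop nesting depth. */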
40 class loop_info_RAII {
41 isel_context* ctx;
42 unsigned header_idx_old;
43 Block* exit_old;
44 bool divergent_cont_old;
45 bool divergent_branch_old;
46 bool divergent_if_old;
47
48 public:
49 loop_info_RAII(isel_context* ctx, unsigned loop_header_idx, Block* loop_exit)
50 : ctx(ctx),
51 header_idx_old(ctx->cf_info.parent_loop.header_idx), exit_old(ctx->cf_info.parent_loop.exit),
52 divergent_cont_old(ctx->cf_info.parent_loop.has_divergent_continue),
53 divergent_branch_old(ctx->cf_info.parent_loop.has_divergent_branch),
54 divergent_if_old(ctx->cf_info.parent_if.is_divergent)
55 {
56 ctx->cf_info.parent_loop.header_idx = loop_header_idx;
57 ctx->cf_info.parent_loop.exit = loop_exit;
58 ctx->cf_info.parent_loop.has_divergent_continue = false;
59 ctx->cf_info.parent_loop.has_divergent_branch = false;
60 ctx->cf_info.parent_if.is_divergent = false;
61 ctx->cf_info.loop_nest_depth = ctx->cf_info.loop_nest_depth + 1;
62 }
63
64 ~loop_info_RAII()
65 {
66 ctx->cf_info.parent_loop.header_idx = header_idx_old;
67 ctx->cf_info.parent_loop.exit = exit_old;
68 ctx->cf_info.parent_loop.has_divergent_continue = divergent_cont_old;
69 ctx->cf_info.parent_loop.has_divergent_branch = divergent_branch_old;
70 ctx->cf_info.parent_if.is_divergent = divergent_if_old;
71 ctx->cf_info.loop_nest_depth = ctx->cf_info.loop_nest_depth - 1;
72 if (!ctx->cf_info.loop_nest_depth && !ctx->cf_info.parent_if.is_divergent)
73 ctx->cf_info.exec_potentially_empty = false;
74 }
75 };
76
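/* State shared by the helpers that emit an if: the branch condition, the saved
 * divergence and exec-potentially-empty flags, and the blocks/indices that make
 * up the if/invert/endif structure. */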
77 struct if_context {
78 Temp cond;
79
80 bool divergent_old;
81 bool exec_potentially_empty_old;
82
83 unsigned BB_if_idx;
84 unsigned invert_idx;
85 bool then_branch_divergent;
86 Block BB_invert;
87 Block BB_endif;
88 };
89
90 static void visit_cf_list(struct isel_context *ctx,
91 struct exec_list *list);
92
93 static void add_logical_edge(unsigned pred_idx, Block *succ)
94 {
95 succ->logical_preds.emplace_back(pred_idx);
96 }
97
98
99 static void add_linear_edge(unsigned pred_idx, Block *succ)
100 {
101 succ->linear_preds.emplace_back(pred_idx);
102 }
103
104 static void add_edge(unsigned pred_idx, Block *succ)
105 {
106 add_logical_edge(pred_idx, succ);
107 add_linear_edge(pred_idx, succ);
108 }
109
110 static void append_logical_start(Block *b)
111 {
112 Builder(NULL, b).pseudo(aco_opcode::p_logical_start);
113 }
114
115 static void append_logical_end(Block *b)
116 {
117 Builder(NULL, b).pseudo(aco_opcode::p_logical_end);
118 }
119
120 Temp get_ssa_temp(struct isel_context *ctx, nir_ssa_def *def)
121 {
122 assert(ctx->allocated[def->index].id());
123 return ctx->allocated[def->index];
124 }
125
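/* Returns src (or copies it into dst) unchanged for non-fragment stages; in
 * fragment shaders the copy is emitted as a p_wqm pseudo instruction so that
 * the value is computed in whole quad mode. */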
126 Temp emit_wqm(isel_context *ctx, Temp src, Temp dst=Temp(0, s1), bool program_needs_wqm = false)
127 {
128 Builder bld(ctx->program, ctx->block);
129
130 if (!dst.id())
131 dst = bld.tmp(src.regClass());
132
133 assert(src.size() == dst.size());
134
135 if (ctx->stage != fragment_fs) {
136 if (!dst.id())
137 return src;
138
139 bld.copy(Definition(dst), src);
140 return dst;
141 }
142
143 bld.pseudo(aco_opcode::p_wqm, Definition(dst), src);
144 ctx->program->needs_wqm |= program_needs_wqm;
145 return dst;
146 }
147
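/* Backwards permute (subgroup shuffle): each lane of the result reads `data`
 * from the lane selected by `index`. A uniform index becomes a v_readlane,
 * GFX8-GFX9 and wave32 use ds_bpermute_b32, and GFX10 wave64 uses the
 * emulation sequence below. */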
148 static Temp emit_bpermute(isel_context *ctx, Builder &bld, Temp index, Temp data)
149 {
150 if (index.regClass() == s1)
151 return bld.vop3(aco_opcode::v_readlane_b32, bld.def(s1), data, index);
152
153 Temp index_x4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), index);
154
155 /* Currently not implemented on GFX6-7 */
156 assert(ctx->options->chip_class >= GFX8);
157
158 if (ctx->options->chip_class <= GFX9 || ctx->program->wave_size == 32) {
159 return bld.ds(aco_opcode::ds_bpermute_b32, bld.def(v1), index_x4, data);
160 }
161
162 /* GFX10, wave64 mode:
163 * The bpermute instruction is limited to half-wave operation, which means that it can't
164 * properly support subgroup shuffle like older generations (or wave32 mode), so we
165 * emulate it here.
166 */
167 if (!ctx->has_gfx10_wave64_bpermute) {
168 ctx->has_gfx10_wave64_bpermute = true;
169 ctx->program->config->num_shared_vgprs = 8; /* Shared VGPRs are allocated in groups of 8 */
170 ctx->program->vgpr_limit -= 4; /* We allocate 8 shared VGPRs, so we'll have 4 fewer normal VGPRs */
171 }
172
173 Temp lane_id = bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, bld.def(v1), Operand((uint32_t) -1), Operand(0u));
174 lane_id = bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32, bld.def(v1), Operand((uint32_t) -1), lane_id);
175 Temp lane_is_hi = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x20u), lane_id);
176 Temp index_is_hi = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x20u), index);
177 Temp cmp = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(s2, vcc), lane_is_hi, index_is_hi);
178
179 return bld.reduction(aco_opcode::p_wave64_bpermute, bld.def(v1), bld.def(s2), bld.def(s1, scc),
180 bld.vcc(cmp), Operand(v2.as_linear()), index_x4, data, gfx10_wave64_bpermute);
181 }
182
183 Temp as_vgpr(isel_context *ctx, Temp val)
184 {
185 if (val.type() == RegType::sgpr) {
186 Builder bld(ctx->program, ctx->block);
187 return bld.copy(bld.def(RegType::vgpr, val.size()), val);
188 }
189 assert(val.type() == RegType::vgpr);
190 return val;
191 }
192
193 //assumes a != 0xffffffff
194 void emit_v_div_u32(isel_context *ctx, Temp dst, Temp a, uint32_t b)
195 {
196 assert(b != 0);
197 Builder bld(ctx->program, ctx->block);
198
199 if (util_is_power_of_two_or_zero(b)) {
200 bld.vop2(aco_opcode::v_lshrrev_b32, Definition(dst), Operand((uint32_t)util_logbase2(b)), a);
201 return;
202 }
203
204 util_fast_udiv_info info = util_compute_fast_udiv_info(b, 32, 32);
205
206 assert(info.multiplier <= 0xffffffff);
207
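/* The fast division below computes a / b as
 *   umul_hi((a >> pre_shift) + increment, multiplier) >> post_shift
 * and only emits the stages whose parameters are non-trivial. */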
208 bool pre_shift = info.pre_shift != 0;
209 bool increment = info.increment != 0;
210 bool multiply = true;
211 bool post_shift = info.post_shift != 0;
212
213 if (!pre_shift && !increment && !multiply && !post_shift) {
214 bld.vop1(aco_opcode::v_mov_b32, Definition(dst), a);
215 return;
216 }
217
218 Temp pre_shift_dst = a;
219 if (pre_shift) {
220 pre_shift_dst = (increment || multiply || post_shift) ? bld.tmp(v1) : dst;
221 bld.vop2(aco_opcode::v_lshrrev_b32, Definition(pre_shift_dst), Operand((uint32_t)info.pre_shift), a);
222 }
223
224 Temp increment_dst = pre_shift_dst;
225 if (increment) {
226 increment_dst = (post_shift || multiply) ? bld.tmp(v1) : dst;
227 bld.vadd32(Definition(increment_dst), Operand((uint32_t) info.increment), pre_shift_dst);
228 }
229
230 Temp multiply_dst = increment_dst;
231 if (multiply) {
232 multiply_dst = post_shift ? bld.tmp(v1) : dst;
233 bld.vop3(aco_opcode::v_mul_hi_u32, Definition(multiply_dst), increment_dst,
234 bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand((uint32_t)info.multiplier)));
235 }
236
237 if (post_shift) {
238 bld.vop2(aco_opcode::v_lshrrev_b32, Definition(dst), Operand((uint32_t)info.post_shift), multiply_dst);
239 }
240 }
241
242 void emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, Temp dst)
243 {
244 Builder bld(ctx->program, ctx->block);
245 bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand(idx));
246 }
247
248
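/* Returns element `idx` of `src` as a temporary of class dst_rc, reusing the
 * elements cached in allocated_vec by a previous split when possible. */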
249 Temp emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, RegClass dst_rc)
250 {
251 /* no need to extract the whole vector */
252 if (src.regClass() == dst_rc) {
253 assert(idx == 0);
254 return src;
255 }
256 assert(src.size() > idx);
257 Builder bld(ctx->program, ctx->block);
258 auto it = ctx->allocated_vec.find(src.id());
259 /* the size check needs to be early because elements other than 0 may be garbage */
260 if (it != ctx->allocated_vec.end() && it->second[0].size() == dst_rc.size()) {
261 if (it->second[idx].regClass() == dst_rc) {
262 return it->second[idx];
263 } else {
264 assert(dst_rc.size() == it->second[idx].regClass().size());
265 assert(dst_rc.type() == RegType::vgpr && it->second[idx].type() == RegType::sgpr);
266 return bld.copy(bld.def(dst_rc), it->second[idx]);
267 }
268 }
269
270 if (src.size() == dst_rc.size()) {
271 assert(idx == 0);
272 return bld.copy(bld.def(dst_rc), src);
273 } else {
274 Temp dst = bld.tmp(dst_rc);
275 emit_extract_vector(ctx, src, idx, dst);
276 return dst;
277 }
278 }
279
280 void emit_split_vector(isel_context* ctx, Temp vec_src, unsigned num_components)
281 {
282 if (num_components == 1)
283 return;
284 if (ctx->allocated_vec.find(vec_src.id()) != ctx->allocated_vec.end())
285 return;
286 aco_ptr<Pseudo_instruction> split{create_instruction<Pseudo_instruction>(aco_opcode::p_split_vector, Format::PSEUDO, 1, num_components)};
287 split->operands[0] = Operand(vec_src);
288 std::array<Temp,4> elems;
289 for (unsigned i = 0; i < num_components; i++) {
290 elems[i] = {ctx->program->allocateId(), RegClass(vec_src.type(), vec_src.size() / num_components)};
291 split->definitions[i] = Definition(elems[i]);
292 }
293 ctx->block->instructions.emplace_back(std::move(split));
294 ctx->allocated_vec.emplace(vec_src.id(), elems);
295 }
296
297 /* This vector expansion uses a mask to determine which elements in the new vector
298 * come from the original vector. The other elements are zero-filled and should be considered undefined. */
299 void expand_vector(isel_context* ctx, Temp vec_src, Temp dst, unsigned num_components, unsigned mask)
300 {
301 emit_split_vector(ctx, vec_src, util_bitcount(mask));
302
303 if (vec_src == dst)
304 return;
305
306 Builder bld(ctx->program, ctx->block);
307 if (num_components == 1) {
308 if (dst.type() == RegType::sgpr)
309 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec_src);
310 else
311 bld.copy(Definition(dst), vec_src);
312 return;
313 }
314
315 unsigned component_size = dst.size() / num_components;
316 std::array<Temp,4> elems;
317
318 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
319 vec->definitions[0] = Definition(dst);
320 unsigned k = 0;
321 for (unsigned i = 0; i < num_components; i++) {
322 if (mask & (1 << i)) {
323 Temp src = emit_extract_vector(ctx, vec_src, k++, RegClass(vec_src.type(), component_size));
324 if (dst.type() == RegType::sgpr)
325 src = bld.as_uniform(src);
326 vec->operands[i] = Operand(src);
327 } else {
328 vec->operands[i] = Operand(0u);
329 }
330 elems[i] = vec->operands[i].getTemp();
331 }
332 ctx->block->instructions.emplace_back(std::move(vec));
333 ctx->allocated_vec.emplace(dst.id(), elems);
334 }
335
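/* Converts a scalar (SCC-style) boolean into a per-lane s2 mask:
 * all ones if val is true, zero otherwise. */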
336 Temp bool_to_vector_condition(isel_context *ctx, Temp val, Temp dst = Temp(0, s2))
337 {
338 Builder bld(ctx->program, ctx->block);
339 if (!dst.id())
340 dst = bld.tmp(s2);
341
342 assert(val.regClass() == s1);
343 assert(dst.regClass() == s2);
344
345 return bld.sop2(aco_opcode::s_cselect_b64, bld.hint_vcc(Definition(dst)), Operand((uint32_t) -1), Operand(0u), bld.scc(val));
346 }
347
348 Temp bool_to_scalar_condition(isel_context *ctx, Temp val, Temp dst = Temp(0, s1))
349 {
350 Builder bld(ctx->program, ctx->block);
351 if (!dst.id())
352 dst = bld.tmp(s1);
353
354 assert(val.regClass() == s2);
355 assert(dst.regClass() == s1);
356
357 /* if we're currently in WQM mode, ensure that the source is also computed in WQM */
358 Temp tmp = bld.tmp(s1);
359 bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.scc(Definition(tmp)), val, Operand(exec, s2));
360 return emit_wqm(ctx, tmp, dst);
361 }
362
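/* Returns the NIR ALU source with its swizzle applied: identity swizzles are
 * returned directly, single components are extracted, and anything else is
 * rebuilt with p_create_vector from the swizzled elements. */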
363 Temp get_alu_src(struct isel_context *ctx, nir_alu_src src, unsigned size=1)
364 {
365 if (src.src.ssa->num_components == 1 && src.swizzle[0] == 0 && size == 1)
366 return get_ssa_temp(ctx, src.src.ssa);
367
368 if (src.src.ssa->num_components == size) {
369 bool identity_swizzle = true;
370 for (unsigned i = 0; identity_swizzle && i < size; i++) {
371 if (src.swizzle[i] != i)
372 identity_swizzle = false;
373 }
374 if (identity_swizzle)
375 return get_ssa_temp(ctx, src.src.ssa);
376 }
377
378 Temp vec = get_ssa_temp(ctx, src.src.ssa);
379 unsigned elem_size = vec.size() / src.src.ssa->num_components;
380 assert(elem_size > 0); /* TODO: 8 and 16-bit vectors not supported */
381 assert(vec.size() % elem_size == 0);
382
383 RegClass elem_rc = RegClass(vec.type(), elem_size);
384 if (size == 1) {
385 return emit_extract_vector(ctx, vec, src.swizzle[0], elem_rc);
386 } else {
387 assert(size <= 4);
388 std::array<Temp,4> elems;
389 aco_ptr<Pseudo_instruction> vec_instr{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, size, 1)};
390 for (unsigned i = 0; i < size; ++i) {
391 elems[i] = emit_extract_vector(ctx, vec, src.swizzle[i], elem_rc);
392 vec_instr->operands[i] = Operand{elems[i]};
393 }
394 Temp dst{ctx->program->allocateId(), RegClass(vec.type(), elem_size * size)};
395 vec_instr->definitions[0] = Definition(dst);
396 ctx->block->instructions.emplace_back(std::move(vec_instr));
397 ctx->allocated_vec.emplace(dst.id(), elems);
398 return dst;
399 }
400 }
401
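/* Extends a 32-bit pointer to a 64-bit address by appending address32_hi;
 * VGPR pointers are first made uniform with v_readfirstlane_b32. */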
402 Temp convert_pointer_to_64_bit(isel_context *ctx, Temp ptr)
403 {
404 if (ptr.size() == 2)
405 return ptr;
406 Builder bld(ctx->program, ctx->block);
407 if (ptr.type() == RegType::vgpr)
408 ptr = bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), ptr);
409 return bld.pseudo(aco_opcode::p_create_vector, bld.def(s2),
410 ptr, Operand((unsigned)ctx->options->address32_hi));
411 }
412
413 void emit_sop2_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst, bool writes_scc)
414 {
415 aco_ptr<SOP2_instruction> sop2{create_instruction<SOP2_instruction>(op, Format::SOP2, 2, writes_scc ? 2 : 1)};
416 sop2->operands[0] = Operand(get_alu_src(ctx, instr->src[0]));
417 sop2->operands[1] = Operand(get_alu_src(ctx, instr->src[1]));
418 sop2->definitions[0] = Definition(dst);
419 if (writes_scc)
420 sop2->definitions[1] = Definition(ctx->program->allocateId(), scc, s1);
421 ctx->block->instructions.emplace_back(std::move(sop2));
422 }
423
424 void emit_vop2_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst, bool commutative, bool swap_srcs=false)
425 {
426 Builder bld(ctx->program, ctx->block);
427 Temp src0 = get_alu_src(ctx, instr->src[swap_srcs ? 1 : 0]);
428 Temp src1 = get_alu_src(ctx, instr->src[swap_srcs ? 0 : 1]);
429 if (src1.type() == RegType::sgpr) {
430 if (commutative && src0.type() == RegType::vgpr) {
431 Temp t = src0;
432 src0 = src1;
433 src1 = t;
434 } else if (src0.type() == RegType::vgpr &&
435 op != aco_opcode::v_madmk_f32 &&
436 op != aco_opcode::v_madak_f32 &&
437 op != aco_opcode::v_madmk_f16 &&
438 op != aco_opcode::v_madak_f16) {
439 /* If the instruction is not commutative, we emit a VOP3A instruction */
440 bld.vop2_e64(op, Definition(dst), src0, src1);
441 return;
442 } else {
443 src1 = bld.copy(bld.def(RegType::vgpr, src1.size()), src1); //TODO: as_vgpr
444 }
445 }
446 bld.vop2(op, Definition(dst), src0, src1);
447 }
448
449 void emit_vop3a_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst)
450 {
451 Temp src0 = get_alu_src(ctx, instr->src[0]);
452 Temp src1 = get_alu_src(ctx, instr->src[1]);
453 Temp src2 = get_alu_src(ctx, instr->src[2]);
454
455 /* ensure that the instruction has at most 1 sgpr operand
456 * The optimizer will inline constants for us */
457 if (src0.type() == RegType::sgpr && src1.type() == RegType::sgpr)
458 src0 = as_vgpr(ctx, src0);
459 if (src1.type() == RegType::sgpr && src2.type() == RegType::sgpr)
460 src1 = as_vgpr(ctx, src1);
461 if (src2.type() == RegType::sgpr && src0.type() == RegType::sgpr)
462 src2 = as_vgpr(ctx, src2);
463
464 Builder bld(ctx->program, ctx->block);
465 bld.vop3(op, Definition(dst), src0, src1, src2);
466 }
467
468 void emit_vop1_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst)
469 {
470 Builder bld(ctx->program, ctx->block);
471 bld.vop1(op, Definition(dst), get_alu_src(ctx, instr->src[0]));
472 }
473
474 void emit_vopc_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst)
475 {
476 Temp src0 = get_alu_src(ctx, instr->src[0]);
477 Temp src1 = get_alu_src(ctx, instr->src[1]);
478 aco_ptr<Instruction> vopc;
479 if (src1.type() == RegType::sgpr) {
480 if (src0.type() == RegType::vgpr) {
481 /* to swap the operands, we might also have to change the opcode */
482 switch (op) {
483 case aco_opcode::v_cmp_lt_f32:
484 op = aco_opcode::v_cmp_gt_f32;
485 break;
486 case aco_opcode::v_cmp_ge_f32:
487 op = aco_opcode::v_cmp_le_f32;
488 break;
489 case aco_opcode::v_cmp_lt_i32:
490 op = aco_opcode::v_cmp_gt_i32;
491 break;
492 case aco_opcode::v_cmp_ge_i32:
493 op = aco_opcode::v_cmp_le_i32;
494 break;
495 case aco_opcode::v_cmp_lt_u32:
496 op = aco_opcode::v_cmp_gt_u32;
497 break;
498 case aco_opcode::v_cmp_ge_u32:
499 op = aco_opcode::v_cmp_le_u32;
500 break;
501 case aco_opcode::v_cmp_lt_f64:
502 op = aco_opcode::v_cmp_gt_f64;
503 break;
504 case aco_opcode::v_cmp_ge_f64:
505 op = aco_opcode::v_cmp_le_f64;
506 break;
507 case aco_opcode::v_cmp_lt_i64:
508 op = aco_opcode::v_cmp_gt_i64;
509 break;
510 case aco_opcode::v_cmp_ge_i64:
511 op = aco_opcode::v_cmp_le_i64;
512 break;
513 case aco_opcode::v_cmp_lt_u64:
514 op = aco_opcode::v_cmp_gt_u64;
515 break;
516 case aco_opcode::v_cmp_ge_u64:
517 op = aco_opcode::v_cmp_le_u64;
518 break;
519 default: /* eq and ne are commutative */
520 break;
521 }
522 Temp t = src0;
523 src0 = src1;
524 src1 = t;
525 } else {
526 src1 = as_vgpr(ctx, src1);
527 }
528 }
529
530 Builder bld(ctx->program, ctx->block);
531 bld.vopc(op, bld.hint_vcc(Definition(dst)), src0, src1);
532 }
533
534 void emit_sopc_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst)
535 {
536 Temp src0 = get_alu_src(ctx, instr->src[0]);
537 Temp src1 = get_alu_src(ctx, instr->src[1]);
538
539 assert(dst.regClass() == s2);
540 assert(src0.type() == RegType::sgpr);
541 assert(src1.type() == RegType::sgpr);
542
543 Builder bld(ctx->program, ctx->block);
544 /* Emit the SALU comparison instruction */
545 Temp cmp = bld.sopc(op, bld.scc(bld.def(s1)), src0, src1);
546 /* Turn the result into a per-lane bool */
547 bool_to_vector_condition(ctx, cmp, dst);
548 }
549
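/* Chooses between the SALU and VALU form of a comparison: the scalar opcode is
 * only used when it exists, the result is uniform and both sources live in SGPRs. */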
550 void emit_comparison(isel_context *ctx, nir_alu_instr *instr, Temp dst,
551 aco_opcode v32_op, aco_opcode v64_op, aco_opcode s32_op = aco_opcode::last_opcode, aco_opcode s64_op = aco_opcode::last_opcode)
552 {
553 aco_opcode s_op = instr->src[0].src.ssa->bit_size == 64 ? s64_op : s32_op;
554 aco_opcode v_op = instr->src[0].src.ssa->bit_size == 64 ? v64_op : v32_op;
555 bool divergent_vals = ctx->divergent_vals[instr->dest.dest.ssa.index];
556 bool use_valu = s_op == aco_opcode::last_opcode ||
557 divergent_vals ||
558 ctx->allocated[instr->src[0].src.ssa->index].type() == RegType::vgpr ||
559 ctx->allocated[instr->src[1].src.ssa->index].type() == RegType::vgpr;
560 aco_opcode op = use_valu ? v_op : s_op;
561 assert(op != aco_opcode::last_opcode);
562
563 if (use_valu)
564 emit_vopc_instruction(ctx, instr, op, dst);
565 else
566 emit_sopc_instruction(ctx, instr, op, dst);
567 }
568
569 void emit_boolean_logic(isel_context *ctx, nir_alu_instr *instr, aco_opcode op64, Temp dst)
570 {
571 Builder bld(ctx->program, ctx->block);
572 Temp src0 = get_alu_src(ctx, instr->src[0]);
573 Temp src1 = get_alu_src(ctx, instr->src[1]);
574
575 assert(dst.regClass() == s2);
576 assert(src0.regClass() == s2);
577 assert(src1.regClass() == s2);
578
579 bld.sop2(op64, Definition(dst), bld.def(s1, scc), src0, src1);
580 }
581
582 void emit_bcsel(isel_context *ctx, nir_alu_instr *instr, Temp dst)
583 {
584 Builder bld(ctx->program, ctx->block);
585 Temp cond = get_alu_src(ctx, instr->src[0]);
586 Temp then = get_alu_src(ctx, instr->src[1]);
587 Temp els = get_alu_src(ctx, instr->src[2]);
588
589 assert(cond.regClass() == s2);
590
591 if (dst.type() == RegType::vgpr) {
592 aco_ptr<Instruction> bcsel;
593 if (dst.size() == 1) {
594 then = as_vgpr(ctx, then);
595 els = as_vgpr(ctx, els);
596
597 bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), els, then, cond);
598 } else if (dst.size() == 2) {
599 Temp then_lo = bld.tmp(v1), then_hi = bld.tmp(v1);
600 bld.pseudo(aco_opcode::p_split_vector, Definition(then_lo), Definition(then_hi), then);
601 Temp else_lo = bld.tmp(v1), else_hi = bld.tmp(v1);
602 bld.pseudo(aco_opcode::p_split_vector, Definition(else_lo), Definition(else_hi), els);
603
604 Temp dst0 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_lo, then_lo, cond);
605 Temp dst1 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_hi, then_hi, cond);
606
607 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
608 } else {
609 fprintf(stderr, "Unimplemented NIR instr bit size: ");
610 nir_print_instr(&instr->instr, stderr);
611 fprintf(stderr, "\n");
612 }
613 return;
614 }
615
616 if (instr->dest.dest.ssa.bit_size == 1) {
617 assert(dst.regClass() == s2);
618 assert(then.regClass() == s2);
619 assert(els.regClass() == s2);
620 }
621
622 if (!ctx->divergent_vals[instr->src[0].src.ssa->index]) { /* uniform condition and values in sgpr */
623 if (dst.regClass() == s1 || dst.regClass() == s2) {
624 assert((then.regClass() == s1 || then.regClass() == s2) && els.regClass() == then.regClass());
625 aco_opcode op = dst.regClass() == s1 ? aco_opcode::s_cselect_b32 : aco_opcode::s_cselect_b64;
626 bld.sop2(op, Definition(dst), then, els, bld.scc(bool_to_scalar_condition(ctx, cond)));
627 } else {
628 fprintf(stderr, "Unimplemented uniform bcsel bit size: ");
629 nir_print_instr(&instr->instr, stderr);
630 fprintf(stderr, "\n");
631 }
632 return;
633 }
634
635 /* divergent boolean bcsel
636 * this implements bcsel on bools: dst = s0 ? s1 : s2
637 * which is computed as: dst = (s0 & s1) | (~s0 & s2) */
638 assert(instr->dest.dest.ssa.bit_size == 1);
639
640 if (cond.id() != then.id())
641 then = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), cond, then);
642
643 if (cond.id() == els.id())
644 bld.sop1(aco_opcode::s_mov_b64, Definition(dst), then);
645 else
646 bld.sop2(aco_opcode::s_or_b64, Definition(dst), bld.def(s1, scc), then,
647 bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.def(s1, scc), els, cond));
648 }
649
650 void emit_scaled_op(isel_context *ctx, Builder& bld, Definition dst, Temp val,
651 aco_opcode op, uint32_t undo)
652 {
653 /* multiply by 16777216 to handle denormals */
654 Temp is_denormal = bld.vopc(aco_opcode::v_cmp_class_f32, bld.hint_vcc(bld.def(s2)),
655 as_vgpr(ctx, val), bld.copy(bld.def(v1), Operand((1u << 7) | (1u << 4))));
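/* (1u << 4) | (1u << 7) selects the negative and positive denormal classes of v_cmp_class_f32 */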
656 Temp scaled = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0x4b800000u), val);
657 scaled = bld.vop1(op, bld.def(v1), scaled);
658 scaled = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(undo), scaled);
659
660 Temp not_scaled = bld.vop1(op, bld.def(v1), val);
661
662 bld.vop2(aco_opcode::v_cndmask_b32, dst, not_scaled, scaled, is_denormal);
663 }
664
665 void emit_rcp(isel_context *ctx, Builder& bld, Definition dst, Temp val)
666 {
667 if (ctx->block->fp_mode.denorm32 == 0) {
668 bld.vop1(aco_opcode::v_rcp_f32, dst, val);
669 return;
670 }
671
672 emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_rcp_f32, 0x4b800000u);
673 }
674
675 void emit_rsq(isel_context *ctx, Builder& bld, Definition dst, Temp val)
676 {
677 if (ctx->block->fp_mode.denorm32 == 0) {
678 bld.vop1(aco_opcode::v_rsq_f32, dst, val);
679 return;
680 }
681
682 emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_rsq_f32, 0x45800000u);
683 }
684
685 void emit_sqrt(isel_context *ctx, Builder& bld, Definition dst, Temp val)
686 {
687 if (ctx->block->fp_mode.denorm32 == 0) {
688 bld.vop1(aco_opcode::v_sqrt_f32, dst, val);
689 return;
690 }
691
692 emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_sqrt_f32, 0x39800000u);
693 }
694
695 void emit_log2(isel_context *ctx, Builder& bld, Definition dst, Temp val)
696 {
697 if (ctx->block->fp_mode.denorm32 == 0) {
698 bld.vop1(aco_opcode::v_log_f32, dst, val);
699 return;
700 }
701
702 emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_log_f32, 0xc1c00000u);
703 }
704
705 void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
706 {
707 if (!instr->dest.dest.is_ssa) {
708 fprintf(stderr, "nir alu dst not in ssa: ");
709 nir_print_instr(&instr->instr, stderr);
710 fprintf(stderr, "\n");
711 abort();
712 }
713 Builder bld(ctx->program, ctx->block);
714 Temp dst = get_ssa_temp(ctx, &instr->dest.dest.ssa);
715 switch(instr->op) {
716 case nir_op_vec2:
717 case nir_op_vec3:
718 case nir_op_vec4: {
719 std::array<Temp,4> elems;
720 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, instr->dest.dest.ssa.num_components, 1)};
721 for (unsigned i = 0; i < instr->dest.dest.ssa.num_components; ++i) {
722 elems[i] = get_alu_src(ctx, instr->src[i]);
723 vec->operands[i] = Operand{elems[i]};
724 }
725 vec->definitions[0] = Definition(dst);
726 ctx->block->instructions.emplace_back(std::move(vec));
727 ctx->allocated_vec.emplace(dst.id(), elems);
728 break;
729 }
730 case nir_op_mov: {
731 Temp src = get_alu_src(ctx, instr->src[0]);
732 aco_ptr<Instruction> mov;
733 if (dst.type() == RegType::sgpr) {
734 if (src.type() == RegType::vgpr)
735 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), src);
736 else if (src.regClass() == s1)
737 bld.sop1(aco_opcode::s_mov_b32, Definition(dst), src);
738 else if (src.regClass() == s2)
739 bld.sop1(aco_opcode::s_mov_b64, Definition(dst), src);
740 else
741 unreachable("wrong src register class for nir_op_mov");
742 } else if (dst.regClass() == v1) {
743 bld.vop1(aco_opcode::v_mov_b32, Definition(dst), src);
744 } else if (dst.regClass() == v2) {
745 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src);
746 } else {
747 nir_print_instr(&instr->instr, stderr);
748 unreachable("Should have been lowered to scalar.");
749 }
750 break;
751 }
752 case nir_op_inot: {
753 Temp src = get_alu_src(ctx, instr->src[0]);
754 if (instr->dest.dest.ssa.bit_size == 1) {
755 assert(src.regClass() == s2);
756 assert(dst.regClass() == s2);
757 bld.sop2(aco_opcode::s_andn2_b64, Definition(dst), bld.def(s1, scc), Operand(exec, s2), src);
758 } else if (dst.regClass() == v1) {
759 emit_vop1_instruction(ctx, instr, aco_opcode::v_not_b32, dst);
760 } else if (dst.type() == RegType::sgpr) {
761 aco_opcode opcode = dst.size() == 1 ? aco_opcode::s_not_b32 : aco_opcode::s_not_b64;
762 bld.sop1(opcode, Definition(dst), bld.def(s1, scc), src);
763 } else {
764 fprintf(stderr, "Unimplemented NIR instr bit size: ");
765 nir_print_instr(&instr->instr, stderr);
766 fprintf(stderr, "\n");
767 }
768 break;
769 }
770 case nir_op_ineg: {
771 Temp src = get_alu_src(ctx, instr->src[0]);
772 if (dst.regClass() == v1) {
773 bld.vsub32(Definition(dst), Operand(0u), Operand(src));
774 } else if (dst.regClass() == s1) {
775 bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand((uint32_t) -1), src);
776 } else if (dst.size() == 2) {
777 Temp src0 = bld.tmp(dst.type(), 1);
778 Temp src1 = bld.tmp(dst.type(), 1);
779 bld.pseudo(aco_opcode::p_split_vector, Definition(src0), Definition(src1), src);
780
781 if (dst.regClass() == s2) {
782 Temp carry = bld.tmp(s1);
783 Temp dst0 = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(carry)), Operand(0u), src0);
784 Temp dst1 = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), Operand(0u), src1, carry);
785 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
786 } else {
787 Temp lower = bld.tmp(v1);
788 Temp borrow = bld.vsub32(Definition(lower), Operand(0u), src0, true).def(1).getTemp();
789 Temp upper = bld.vsub32(bld.def(v1), Operand(0u), src1, false, borrow);
790 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
791 }
792 } else {
793 fprintf(stderr, "Unimplemented NIR instr bit size: ");
794 nir_print_instr(&instr->instr, stderr);
795 fprintf(stderr, "\n");
796 }
797 break;
798 }
799 case nir_op_iabs: {
800 if (dst.regClass() == s1) {
801 bld.sop1(aco_opcode::s_abs_i32, Definition(dst), bld.def(s1, scc), get_alu_src(ctx, instr->src[0]));
802 } else if (dst.regClass() == v1) {
803 Temp src = get_alu_src(ctx, instr->src[0]);
804 bld.vop2(aco_opcode::v_max_i32, Definition(dst), src, bld.vsub32(bld.def(v1), Operand(0u), src));
805 } else {
806 fprintf(stderr, "Unimplemented NIR instr bit size: ");
807 nir_print_instr(&instr->instr, stderr);
808 fprintf(stderr, "\n");
809 }
810 break;
811 }
812 case nir_op_isign: {
813 Temp src = get_alu_src(ctx, instr->src[0]);
814 if (dst.regClass() == s1) {
815 Temp tmp = bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), src, Operand(31u));
816 Temp gtz = bld.sopc(aco_opcode::s_cmp_gt_i32, bld.def(s1, scc), src, Operand(0u));
817 bld.sop2(aco_opcode::s_add_i32, Definition(dst), bld.def(s1, scc), gtz, tmp);
818 } else if (dst.regClass() == s2) {
819 Temp neg = bld.sop2(aco_opcode::s_ashr_i64, bld.def(s2), bld.def(s1, scc), src, Operand(63u));
820 Temp neqz = bld.sopc(aco_opcode::s_cmp_lg_u64, bld.def(s1, scc), src, Operand(0u));
821 bld.sop2(aco_opcode::s_or_b64, Definition(dst), bld.def(s1, scc), neg, neqz);
822 } else if (dst.regClass() == v1) {
823 Temp tmp = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), src);
824 Temp gtz = bld.vopc(aco_opcode::v_cmp_ge_i32, bld.hint_vcc(bld.def(s2)), Operand(0u), src);
825 bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand(1u), tmp, gtz);
826 } else if (dst.regClass() == v2) {
827 Temp upper = emit_extract_vector(ctx, src, 1, v1);
828 Temp neg = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), upper);
829 Temp gtz = bld.vopc(aco_opcode::v_cmp_ge_i64, bld.hint_vcc(bld.def(s2)), Operand(0u), src);
830 Temp lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(1u), neg, gtz);
831 upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), neg, gtz);
832 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
833 } else {
834 fprintf(stderr, "Unimplemented NIR instr bit size: ");
835 nir_print_instr(&instr->instr, stderr);
836 fprintf(stderr, "\n");
837 }
838 break;
839 }
840 case nir_op_imax: {
841 if (dst.regClass() == v1) {
842 emit_vop2_instruction(ctx, instr, aco_opcode::v_max_i32, dst, true);
843 } else if (dst.regClass() == s1) {
844 emit_sop2_instruction(ctx, instr, aco_opcode::s_max_i32, dst, true);
845 } else {
846 fprintf(stderr, "Unimplemented NIR instr bit size: ");
847 nir_print_instr(&instr->instr, stderr);
848 fprintf(stderr, "\n");
849 }
850 break;
851 }
852 case nir_op_umax: {
853 if (dst.regClass() == v1) {
854 emit_vop2_instruction(ctx, instr, aco_opcode::v_max_u32, dst, true);
855 } else if (dst.regClass() == s1) {
856 emit_sop2_instruction(ctx, instr, aco_opcode::s_max_u32, dst, true);
857 } else {
858 fprintf(stderr, "Unimplemented NIR instr bit size: ");
859 nir_print_instr(&instr->instr, stderr);
860 fprintf(stderr, "\n");
861 }
862 break;
863 }
864 case nir_op_imin: {
865 if (dst.regClass() == v1) {
866 emit_vop2_instruction(ctx, instr, aco_opcode::v_min_i32, dst, true);
867 } else if (dst.regClass() == s1) {
868 emit_sop2_instruction(ctx, instr, aco_opcode::s_min_i32, dst, true);
869 } else {
870 fprintf(stderr, "Unimplemented NIR instr bit size: ");
871 nir_print_instr(&instr->instr, stderr);
872 fprintf(stderr, "\n");
873 }
874 break;
875 }
876 case nir_op_umin: {
877 if (dst.regClass() == v1) {
878 emit_vop2_instruction(ctx, instr, aco_opcode::v_min_u32, dst, true);
879 } else if (dst.regClass() == s1) {
880 emit_sop2_instruction(ctx, instr, aco_opcode::s_min_u32, dst, true);
881 } else {
882 fprintf(stderr, "Unimplemented NIR instr bit size: ");
883 nir_print_instr(&instr->instr, stderr);
884 fprintf(stderr, "\n");
885 }
886 break;
887 }
888 case nir_op_ior: {
889 if (instr->dest.dest.ssa.bit_size == 1) {
890 emit_boolean_logic(ctx, instr, aco_opcode::s_or_b64, dst);
891 } else if (dst.regClass() == v1) {
892 emit_vop2_instruction(ctx, instr, aco_opcode::v_or_b32, dst, true);
893 } else if (dst.regClass() == s1) {
894 emit_sop2_instruction(ctx, instr, aco_opcode::s_or_b32, dst, true);
895 } else if (dst.regClass() == s2) {
896 emit_sop2_instruction(ctx, instr, aco_opcode::s_or_b64, dst, true);
897 } else {
898 fprintf(stderr, "Unimplemented NIR instr bit size: ");
899 nir_print_instr(&instr->instr, stderr);
900 fprintf(stderr, "\n");
901 }
902 break;
903 }
904 case nir_op_iand: {
905 if (instr->dest.dest.ssa.bit_size == 1) {
906 emit_boolean_logic(ctx, instr, aco_opcode::s_and_b64, dst);
907 } else if (dst.regClass() == v1) {
908 emit_vop2_instruction(ctx, instr, aco_opcode::v_and_b32, dst, true);
909 } else if (dst.regClass() == s1) {
910 emit_sop2_instruction(ctx, instr, aco_opcode::s_and_b32, dst, true);
911 } else if (dst.regClass() == s2) {
912 emit_sop2_instruction(ctx, instr, aco_opcode::s_and_b64, dst, true);
913 } else {
914 fprintf(stderr, "Unimplemented NIR instr bit size: ");
915 nir_print_instr(&instr->instr, stderr);
916 fprintf(stderr, "\n");
917 }
918 break;
919 }
920 case nir_op_ixor: {
921 if (instr->dest.dest.ssa.bit_size == 1) {
922 emit_boolean_logic(ctx, instr, aco_opcode::s_xor_b64, dst);
923 } else if (dst.regClass() == v1) {
924 emit_vop2_instruction(ctx, instr, aco_opcode::v_xor_b32, dst, true);
925 } else if (dst.regClass() == s1) {
926 emit_sop2_instruction(ctx, instr, aco_opcode::s_xor_b32, dst, true);
927 } else if (dst.regClass() == s2) {
928 emit_sop2_instruction(ctx, instr, aco_opcode::s_xor_b64, dst, true);
929 } else {
930 fprintf(stderr, "Unimplemented NIR instr bit size: ");
931 nir_print_instr(&instr->instr, stderr);
932 fprintf(stderr, "\n");
933 }
934 break;
935 }
936 case nir_op_ushr: {
937 if (dst.regClass() == v1) {
938 emit_vop2_instruction(ctx, instr, aco_opcode::v_lshrrev_b32, dst, false, true);
939 } else if (dst.regClass() == v2) {
940 bld.vop3(aco_opcode::v_lshrrev_b64, Definition(dst),
941 get_alu_src(ctx, instr->src[1]), get_alu_src(ctx, instr->src[0]));
942 } else if (dst.regClass() == s2) {
943 emit_sop2_instruction(ctx, instr, aco_opcode::s_lshr_b64, dst, true);
944 } else if (dst.regClass() == s1) {
945 emit_sop2_instruction(ctx, instr, aco_opcode::s_lshr_b32, dst, true);
946 } else {
947 fprintf(stderr, "Unimplemented NIR instr bit size: ");
948 nir_print_instr(&instr->instr, stderr);
949 fprintf(stderr, "\n");
950 }
951 break;
952 }
953 case nir_op_ishl: {
954 if (dst.regClass() == v1) {
955 emit_vop2_instruction(ctx, instr, aco_opcode::v_lshlrev_b32, dst, false, true);
956 } else if (dst.regClass() == v2) {
957 bld.vop3(aco_opcode::v_lshlrev_b64, Definition(dst),
958 get_alu_src(ctx, instr->src[1]), get_alu_src(ctx, instr->src[0]));
959 } else if (dst.regClass() == s1) {
960 emit_sop2_instruction(ctx, instr, aco_opcode::s_lshl_b32, dst, true);
961 } else if (dst.regClass() == s2) {
962 emit_sop2_instruction(ctx, instr, aco_opcode::s_lshl_b64, dst, true);
963 } else {
964 fprintf(stderr, "Unimplemented NIR instr bit size: ");
965 nir_print_instr(&instr->instr, stderr);
966 fprintf(stderr, "\n");
967 }
968 break;
969 }
970 case nir_op_ishr: {
971 if (dst.regClass() == v1) {
972 emit_vop2_instruction(ctx, instr, aco_opcode::v_ashrrev_i32, dst, false, true);
973 } else if (dst.regClass() == v2) {
974 bld.vop3(aco_opcode::v_ashrrev_i64, Definition(dst),
975 get_alu_src(ctx, instr->src[1]), get_alu_src(ctx, instr->src[0]));
976 } else if (dst.regClass() == s1) {
977 emit_sop2_instruction(ctx, instr, aco_opcode::s_ashr_i32, dst, true);
978 } else if (dst.regClass() == s2) {
979 emit_sop2_instruction(ctx, instr, aco_opcode::s_ashr_i64, dst, true);
980 } else {
981 fprintf(stderr, "Unimplemented NIR instr bit size: ");
982 nir_print_instr(&instr->instr, stderr);
983 fprintf(stderr, "\n");
984 }
985 break;
986 }
987 case nir_op_find_lsb: {
988 Temp src = get_alu_src(ctx, instr->src[0]);
989 if (src.regClass() == s1) {
990 bld.sop1(aco_opcode::s_ff1_i32_b32, Definition(dst), src);
991 } else if (src.regClass() == v1) {
992 emit_vop1_instruction(ctx, instr, aco_opcode::v_ffbl_b32, dst);
993 } else if (src.regClass() == s2) {
994 bld.sop1(aco_opcode::s_ff1_i32_b64, Definition(dst), src);
995 } else {
996 fprintf(stderr, "Unimplemented NIR instr bit size: ");
997 nir_print_instr(&instr->instr, stderr);
998 fprintf(stderr, "\n");
999 }
1000 break;
1001 }
1002 case nir_op_ufind_msb:
1003 case nir_op_ifind_msb: {
1004 Temp src = get_alu_src(ctx, instr->src[0]);
1005 if (src.regClass() == s1 || src.regClass() == s2) {
1006 aco_opcode op = src.regClass() == s2 ?
1007 (instr->op == nir_op_ufind_msb ? aco_opcode::s_flbit_i32_b64 : aco_opcode::s_flbit_i32_i64) :
1008 (instr->op == nir_op_ufind_msb ? aco_opcode::s_flbit_i32_b32 : aco_opcode::s_flbit_i32);
1009 Temp msb_rev = bld.sop1(op, bld.def(s1), src);
1010
1011 Builder::Result sub = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc),
1012 Operand(src.size() * 32u - 1u), msb_rev);
1013 Temp msb = sub.def(0).getTemp();
1014 Temp carry = sub.def(1).getTemp();
1015
1016 bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand((uint32_t)-1), msb, carry);
1017 } else if (src.regClass() == v1) {
1018 aco_opcode op = instr->op == nir_op_ufind_msb ? aco_opcode::v_ffbh_u32 : aco_opcode::v_ffbh_i32;
1019 Temp msb_rev = bld.tmp(v1);
1020 emit_vop1_instruction(ctx, instr, op, msb_rev);
1021 Temp msb = bld.tmp(v1);
1022 Temp carry = bld.vsub32(Definition(msb), Operand(31u), Operand(msb_rev), true).def(1).getTemp();
1023 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), msb, Operand((uint32_t)-1), carry);
1024 } else {
1025 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1026 nir_print_instr(&instr->instr, stderr);
1027 fprintf(stderr, "\n");
1028 }
1029 break;
1030 }
1031 case nir_op_bitfield_reverse: {
1032 if (dst.regClass() == s1) {
1033 bld.sop1(aco_opcode::s_brev_b32, Definition(dst), get_alu_src(ctx, instr->src[0]));
1034 } else if (dst.regClass() == v1) {
1035 bld.vop1(aco_opcode::v_bfrev_b32, Definition(dst), get_alu_src(ctx, instr->src[0]));
1036 } else {
1037 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1038 nir_print_instr(&instr->instr, stderr);
1039 fprintf(stderr, "\n");
1040 }
1041 break;
1042 }
1043 case nir_op_iadd: {
1044 if (dst.regClass() == s1) {
1045 emit_sop2_instruction(ctx, instr, aco_opcode::s_add_u32, dst, true);
1046 break;
1047 }
1048
1049 Temp src0 = get_alu_src(ctx, instr->src[0]);
1050 Temp src1 = get_alu_src(ctx, instr->src[1]);
1051 if (dst.regClass() == v1) {
1052 bld.vadd32(Definition(dst), Operand(src0), Operand(src1));
1053 break;
1054 }
1055
1056 assert(src0.size() == 2 && src1.size() == 2);
1057 Temp src00 = bld.tmp(src0.type(), 1);
1058 Temp src01 = bld.tmp(dst.type(), 1);
1059 bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1060 Temp src10 = bld.tmp(src1.type(), 1);
1061 Temp src11 = bld.tmp(dst.type(), 1);
1062 bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1063
1064 if (dst.regClass() == s2) {
1065 Temp carry = bld.tmp(s1);
1066 Temp dst0 = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10);
1067 Temp dst1 = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), src01, src11, bld.scc(carry));
1068 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
1069 } else if (dst.regClass() == v2) {
1070 Temp dst0 = bld.tmp(v1);
1071 Temp carry = bld.vadd32(Definition(dst0), src00, src10, true).def(1).getTemp();
1072 Temp dst1 = bld.vadd32(bld.def(v1), src01, src11, false, carry);
1073 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
1074 } else {
1075 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1076 nir_print_instr(&instr->instr, stderr);
1077 fprintf(stderr, "\n");
1078 }
1079 break;
1080 }
1081 case nir_op_uadd_sat: {
1082 Temp src0 = get_alu_src(ctx, instr->src[0]);
1083 Temp src1 = get_alu_src(ctx, instr->src[1]);
1084 if (dst.regClass() == s1) {
1085 Temp tmp = bld.tmp(s1), carry = bld.tmp(s1);
1086 bld.sop2(aco_opcode::s_add_u32, Definition(tmp), bld.scc(Definition(carry)),
1087 src0, src1);
1088 bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand((uint32_t) -1), tmp, bld.scc(carry));
1089 } else if (dst.regClass() == v1) {
1090 if (ctx->options->chip_class >= GFX9) {
1091 aco_ptr<VOP3A_instruction> add{create_instruction<VOP3A_instruction>(aco_opcode::v_add_u32, asVOP3(Format::VOP2), 2, 1)};
1092 add->operands[0] = Operand(src0);
1093 add->operands[1] = Operand(src1);
1094 add->definitions[0] = Definition(dst);
1095 add->clamp = 1;
1096 ctx->block->instructions.emplace_back(std::move(add));
1097 } else {
1098 if (src1.regClass() != v1)
1099 std::swap(src0, src1);
1100 assert(src1.regClass() == v1);
1101 Temp tmp = bld.tmp(v1);
1102 Temp carry = bld.vadd32(Definition(tmp), src0, src1, true).def(1).getTemp();
1103 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), tmp, Operand((uint32_t) -1), carry);
1104 }
1105 } else {
1106 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1107 nir_print_instr(&instr->instr, stderr);
1108 fprintf(stderr, "\n");
1109 }
1110 break;
1111 }
1112 case nir_op_uadd_carry: {
1113 Temp src0 = get_alu_src(ctx, instr->src[0]);
1114 Temp src1 = get_alu_src(ctx, instr->src[1]);
1115 if (dst.regClass() == s1) {
1116 bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(dst)), src0, src1);
1117 break;
1118 }
1119 if (dst.regClass() == v1) {
1120 Temp carry = bld.vadd32(bld.def(v1), src0, src1, true).def(1).getTemp();
1121 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand(1u), carry);
1122 break;
1123 }
1124
1125 Temp src00 = bld.tmp(src0.type(), 1);
1126 Temp src01 = bld.tmp(dst.type(), 1);
1127 bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1128 Temp src10 = bld.tmp(src1.type(), 1);
1129 Temp src11 = bld.tmp(dst.type(), 1);
1130 bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1131 if (dst.regClass() == s2) {
1132 Temp carry = bld.tmp(s1);
1133 bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10);
1134 carry = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.scc(bld.def(s1)), src01, src11, bld.scc(carry)).def(1).getTemp();
1135 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), carry, Operand(0u));
1136 } else if (dst.regClass() == v2) {
1137 Temp carry = bld.vadd32(bld.def(v1), src00, src10, true).def(1).getTemp();
1138 carry = bld.vadd32(bld.def(v1), src01, src11, true, carry).def(1).getTemp();
1139 carry = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), Operand(1u), carry);
1140 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), carry, Operand(0u));
1141 } else {
1142 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1143 nir_print_instr(&instr->instr, stderr);
1144 fprintf(stderr, "\n");
1145 }
1146 break;
1147 }
1148 case nir_op_isub: {
1149 if (dst.regClass() == s1) {
1150 emit_sop2_instruction(ctx, instr, aco_opcode::s_sub_i32, dst, true);
1151 break;
1152 }
1153
1154 Temp src0 = get_alu_src(ctx, instr->src[0]);
1155 Temp src1 = get_alu_src(ctx, instr->src[1]);
1156 if (dst.regClass() == v1) {
1157 bld.vsub32(Definition(dst), src0, src1);
1158 break;
1159 }
1160
1161 Temp src00 = bld.tmp(src0.type(), 1);
1162 Temp src01 = bld.tmp(dst.type(), 1);
1163 bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1164 Temp src10 = bld.tmp(src1.type(), 1);
1165 Temp src11 = bld.tmp(dst.type(), 1);
1166 bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1167 if (dst.regClass() == s2) {
1168 Temp carry = bld.tmp(s1);
1169 Temp dst0 = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10);
1170 Temp dst1 = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), src01, src11, carry);
1171 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
1172 } else if (dst.regClass() == v2) {
1173 Temp lower = bld.tmp(v1);
1174 Temp borrow = bld.vsub32(Definition(lower), src00, src10, true).def(1).getTemp();
1175 Temp upper = bld.vsub32(bld.def(v1), src01, src11, false, borrow);
1176 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1177 } else {
1178 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1179 nir_print_instr(&instr->instr, stderr);
1180 fprintf(stderr, "\n");
1181 }
1182 break;
1183 }
1184 case nir_op_usub_borrow: {
1185 Temp src0 = get_alu_src(ctx, instr->src[0]);
1186 Temp src1 = get_alu_src(ctx, instr->src[1]);
1187 if (dst.regClass() == s1) {
1188 bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(dst)), src0, src1);
1189 break;
1190 } else if (dst.regClass() == v1) {
1191 Temp borrow = bld.vsub32(bld.def(v1), src0, src1, true).def(1).getTemp();
1192 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand(1u), borrow);
1193 break;
1194 }
1195
1196 Temp src00 = bld.tmp(src0.type(), 1);
1197 Temp src01 = bld.tmp(dst.type(), 1);
1198 bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1199 Temp src10 = bld.tmp(src1.type(), 1);
1200 Temp src11 = bld.tmp(dst.type(), 1);
1201 bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1202 if (dst.regClass() == s2) {
1203 Temp borrow = bld.tmp(s1);
1204 bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), src00, src10);
1205 borrow = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.scc(bld.def(s1)), src01, src11, bld.scc(borrow)).def(1).getTemp();
1206 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), borrow, Operand(0u));
1207 } else if (dst.regClass() == v2) {
1208 Temp borrow = bld.vsub32(bld.def(v1), src00, src10, true).def(1).getTemp();
1209 borrow = bld.vsub32(bld.def(v1), src01, src11, true, Operand(borrow)).def(1).getTemp();
1210 borrow = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), Operand(1u), borrow);
1211 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), borrow, Operand(0u));
1212 } else {
1213 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1214 nir_print_instr(&instr->instr, stderr);
1215 fprintf(stderr, "\n");
1216 }
1217 break;
1218 }
1219 case nir_op_imul: {
1220 if (dst.regClass() == v1) {
1221 bld.vop3(aco_opcode::v_mul_lo_u32, Definition(dst),
1222 get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1223 } else if (dst.regClass() == s1) {
1224 emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_i32, dst, false);
1225 } else {
1226 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1227 nir_print_instr(&instr->instr, stderr);
1228 fprintf(stderr, "\n");
1229 }
1230 break;
1231 }
1232 case nir_op_umul_high: {
1233 if (dst.regClass() == v1) {
1234 bld.vop3(aco_opcode::v_mul_hi_u32, Definition(dst), get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1235 } else if (dst.regClass() == s1 && ctx->options->chip_class >= GFX9) {
1236 bld.sop2(aco_opcode::s_mul_hi_u32, Definition(dst), get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1237 } else if (dst.regClass() == s1) {
1238 Temp tmp = bld.vop3(aco_opcode::v_mul_hi_u32, bld.def(v1), get_alu_src(ctx, instr->src[0]),
1239 as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
1240 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
1241 } else {
1242 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1243 nir_print_instr(&instr->instr, stderr);
1244 fprintf(stderr, "\n");
1245 }
1246 break;
1247 }
1248 case nir_op_imul_high: {
1249 if (dst.regClass() == v1) {
1250 bld.vop3(aco_opcode::v_mul_hi_i32, Definition(dst), get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1251 } else if (dst.regClass() == s1 && ctx->options->chip_class >= GFX9) {
1252 bld.sop2(aco_opcode::s_mul_hi_i32, Definition(dst), get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1253 } else if (dst.regClass() == s1) {
1254 Temp tmp = bld.vop3(aco_opcode::v_mul_hi_i32, bld.def(v1), get_alu_src(ctx, instr->src[0]),
1255 as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
1256 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
1257 } else {
1258 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1259 nir_print_instr(&instr->instr, stderr);
1260 fprintf(stderr, "\n");
1261 }
1262 break;
1263 }
1264 case nir_op_fmul: {
1265 if (dst.size() == 1) {
1266 emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_f32, dst, true);
1267 } else if (dst.size() == 2) {
1268 bld.vop3(aco_opcode::v_mul_f64, Definition(dst), get_alu_src(ctx, instr->src[0]),
1269 as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
1270 } else {
1271 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1272 nir_print_instr(&instr->instr, stderr);
1273 fprintf(stderr, "\n");
1274 }
1275 break;
1276 }
1277 case nir_op_fadd: {
1278 if (dst.size() == 1) {
1279 emit_vop2_instruction(ctx, instr, aco_opcode::v_add_f32, dst, true);
1280 } else if (dst.size() == 2) {
1281 bld.vop3(aco_opcode::v_add_f64, Definition(dst), get_alu_src(ctx, instr->src[0]),
1282 as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
1283 } else {
1284 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1285 nir_print_instr(&instr->instr, stderr);
1286 fprintf(stderr, "\n");
1287 }
1288 break;
1289 }
1290 case nir_op_fsub: {
1291 Temp src0 = get_alu_src(ctx, instr->src[0]);
1292 Temp src1 = get_alu_src(ctx, instr->src[1]);
1293 if (dst.size() == 1) {
1294 if (src1.type() == RegType::vgpr || src0.type() != RegType::vgpr)
1295 emit_vop2_instruction(ctx, instr, aco_opcode::v_sub_f32, dst, false);
1296 else
1297 emit_vop2_instruction(ctx, instr, aco_opcode::v_subrev_f32, dst, true);
1298 } else if (dst.size() == 2) {
1299 Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst),
1300 get_alu_src(ctx, instr->src[0]),
1301 as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
1302 VOP3A_instruction* sub = static_cast<VOP3A_instruction*>(add);
1303 sub->neg[1] = true;
1304 } else {
1305 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1306 nir_print_instr(&instr->instr, stderr);
1307 fprintf(stderr, "\n");
1308 }
1309 break;
1310 }
1311 case nir_op_fmax: {
1312 if (dst.size() == 1) {
1313 emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f32, dst, true);
1314 } else if (dst.size() == 2) {
1315 bld.vop3(aco_opcode::v_max_f64, Definition(dst),
1316 get_alu_src(ctx, instr->src[0]),
1317 as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
1318 } else {
1319 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1320 nir_print_instr(&instr->instr, stderr);
1321 fprintf(stderr, "\n");
1322 }
1323 break;
1324 }
1325 case nir_op_fmin: {
1326 if (dst.size() == 1) {
1327 emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f32, dst, true);
1328 } else if (dst.size() == 2) {
1329 bld.vop3(aco_opcode::v_min_f64, Definition(dst),
1330 get_alu_src(ctx, instr->src[0]),
1331 as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
1332 } else {
1333 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1334 nir_print_instr(&instr->instr, stderr);
1335 fprintf(stderr, "\n");
1336 }
1337 break;
1338 }
1339 case nir_op_fmax3: {
1340 if (dst.size() == 1) {
1341 emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_f32, dst);
1342 } else {
1343 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1344 nir_print_instr(&instr->instr, stderr);
1345 fprintf(stderr, "\n");
1346 }
1347 break;
1348 }
1349 case nir_op_fmin3: {
1350 if (dst.size() == 1) {
1351 emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_f32, dst);
1352 } else {
1353 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1354 nir_print_instr(&instr->instr, stderr);
1355 fprintf(stderr, "\n");
1356 }
1357 break;
1358 }
1359 case nir_op_fmed3: {
1360 if (dst.size() == 1) {
1361 emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_f32, dst);
1362 } else {
1363 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1364 nir_print_instr(&instr->instr, stderr);
1365 fprintf(stderr, "\n");
1366 }
1367 break;
1368 }
1369 case nir_op_umax3: {
1370 if (dst.size() == 1) {
1371 emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_u32, dst);
1372 } else {
1373 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1374 nir_print_instr(&instr->instr, stderr);
1375 fprintf(stderr, "\n");
1376 }
1377 break;
1378 }
1379 case nir_op_umin3: {
1380 if (dst.size() == 1) {
1381 emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_u32, dst);
1382 } else {
1383 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1384 nir_print_instr(&instr->instr, stderr);
1385 fprintf(stderr, "\n");
1386 }
1387 break;
1388 }
1389 case nir_op_umed3: {
1390 if (dst.size() == 1) {
1391 emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_u32, dst);
1392 } else {
1393 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1394 nir_print_instr(&instr->instr, stderr);
1395 fprintf(stderr, "\n");
1396 }
1397 break;
1398 }
1399 case nir_op_imax3: {
1400 if (dst.size() == 1) {
1401 emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_i32, dst);
1402 } else {
1403 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1404 nir_print_instr(&instr->instr, stderr);
1405 fprintf(stderr, "\n");
1406 }
1407 break;
1408 }
1409 case nir_op_imin3: {
1410 if (dst.size() == 1) {
1411 emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_i32, dst);
1412 } else {
1413 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1414 nir_print_instr(&instr->instr, stderr);
1415 fprintf(stderr, "\n");
1416 }
1417 break;
1418 }
1419 case nir_op_imed3: {
1420 if (dst.size() == 1) {
1421 emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_i32, dst);
1422 } else {
1423 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1424 nir_print_instr(&instr->instr, stderr);
1425 fprintf(stderr, "\n");
1426 }
1427 break;
1428 }
1429 case nir_op_cube_face_coord: {
1430 Temp in = get_alu_src(ctx, instr->src[0], 3);
1431 Temp src[3] = { emit_extract_vector(ctx, in, 0, v1),
1432 emit_extract_vector(ctx, in, 1, v1),
1433 emit_extract_vector(ctx, in, 2, v1) };
1434 Temp ma = bld.vop3(aco_opcode::v_cubema_f32, bld.def(v1), src[0], src[1], src[2]);
1435 ma = bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), ma);
1436 Temp sc = bld.vop3(aco_opcode::v_cubesc_f32, bld.def(v1), src[0], src[1], src[2]);
1437 Temp tc = bld.vop3(aco_opcode::v_cubetc_f32, bld.def(v1), src[0], src[1], src[2]);
1438 sc = bld.vop2(aco_opcode::v_madak_f32, bld.def(v1), sc, ma, Operand(0x3f000000u/*0.5*/));
1439 tc = bld.vop2(aco_opcode::v_madak_f32, bld.def(v1), tc, ma, Operand(0x3f000000u/*0.5*/));
1440 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), sc, tc);
1441 break;
1442 }
1443 case nir_op_cube_face_index: {
1444 Temp in = get_alu_src(ctx, instr->src[0], 3);
1445 Temp src[3] = { emit_extract_vector(ctx, in, 0, v1),
1446 emit_extract_vector(ctx, in, 1, v1),
1447 emit_extract_vector(ctx, in, 2, v1) };
1448 bld.vop3(aco_opcode::v_cubeid_f32, Definition(dst), src[0], src[1], src[2]);
1449 break;
1450 }
1451 case nir_op_bcsel: {
1452 emit_bcsel(ctx, instr, dst);
1453 break;
1454 }
1455 case nir_op_frsq: {
1456 if (dst.size() == 1) {
1457 emit_rsq(ctx, bld, Definition(dst), get_alu_src(ctx, instr->src[0]));
1458 } else if (dst.size() == 2) {
1459 emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f64, dst);
1460 } else {
1461 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1462 nir_print_instr(&instr->instr, stderr);
1463 fprintf(stderr, "\n");
1464 }
1465 break;
1466 }
1467 case nir_op_fneg: {
1468 Temp src = get_alu_src(ctx, instr->src[0]);
1469 if (dst.size() == 1) {
1470 if (ctx->block->fp_mode.must_flush_denorms32)
1471 src = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0x3f800000u), as_vgpr(ctx, src));
1472 bld.vop2(aco_opcode::v_xor_b32, Definition(dst), Operand(0x80000000u), as_vgpr(ctx, src));
1473 } else if (dst.size() == 2) {
1474 if (ctx->block->fp_mode.must_flush_denorms16_64)
1475 src = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), Operand(0x3FF0000000000000lu), as_vgpr(ctx, src));
1476 Temp upper = bld.tmp(v1), lower = bld.tmp(v1);
1477 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
1478 upper = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), Operand(0x80000000u), upper);
1479 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1480 } else {
1481 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1482 nir_print_instr(&instr->instr, stderr);
1483 fprintf(stderr, "\n");
1484 }
1485 break;
1486 }
1487 case nir_op_fabs: {
1488 Temp src = get_alu_src(ctx, instr->src[0]);
1489 if (dst.size() == 1) {
1490 if (ctx->block->fp_mode.must_flush_denorms32)
1491 src = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0x3f800000u), as_vgpr(ctx, src));
1492 bld.vop2(aco_opcode::v_and_b32, Definition(dst), Operand(0x7FFFFFFFu), as_vgpr(ctx, src));
1493 } else if (dst.size() == 2) {
1494 if (ctx->block->fp_mode.must_flush_denorms16_64)
1495 src = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), Operand(0x3FF0000000000000lu), as_vgpr(ctx, src));
1496 Temp upper = bld.tmp(v1), lower = bld.tmp(v1);
1497 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
1498 upper = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7FFFFFFFu), upper);
1499 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1500 } else {
1501 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1502 nir_print_instr(&instr->instr, stderr);
1503 fprintf(stderr, "\n");
1504 }
1505 break;
1506 }
1507 case nir_op_fsat: {
1508 Temp src = get_alu_src(ctx, instr->src[0]);
1509 if (dst.size() == 1) {
1510 bld.vop3(aco_opcode::v_med3_f32, Definition(dst), Operand(0u), Operand(0x3f800000u), src);
1511 } else if (dst.size() == 2) {
1512 Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst), src, Operand(0u));
1513 VOP3A_instruction* vop3 = static_cast<VOP3A_instruction*>(add);
1514 vop3->clamp = true;
1515 } else {
1516 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1517 nir_print_instr(&instr->instr, stderr);
1518 fprintf(stderr, "\n");
1519 }
1520 break;
1521 }
1522 case nir_op_flog2: {
1523 if (dst.size() == 1) {
1524 emit_log2(ctx, bld, Definition(dst), get_alu_src(ctx, instr->src[0]));
1525 } else {
1526 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1527 nir_print_instr(&instr->instr, stderr);
1528 fprintf(stderr, "\n");
1529 }
1530 break;
1531 }
1532 case nir_op_frcp: {
1533 if (dst.size() == 1) {
1534 emit_rcp(ctx, bld, Definition(dst), get_alu_src(ctx, instr->src[0]));
1535 } else if (dst.size() == 2) {
1536 emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f64, dst);
1537 } else {
1538 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1539 nir_print_instr(&instr->instr, stderr);
1540 fprintf(stderr, "\n");
1541 }
1542 break;
1543 }
1544 case nir_op_fexp2: {
1545 if (dst.size() == 1) {
1546 emit_vop1_instruction(ctx, instr, aco_opcode::v_exp_f32, dst);
1547 } else {
1548 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1549 nir_print_instr(&instr->instr, stderr);
1550 fprintf(stderr, "\n");
1551 }
1552 break;
1553 }
1554 case nir_op_fsqrt: {
1555 if (dst.size() == 1) {
1556 emit_sqrt(ctx, bld, Definition(dst), get_alu_src(ctx, instr->src[0]));
1557 } else if (dst.size() == 2) {
1558 emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f64, dst);
1559 } else {
1560 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1561 nir_print_instr(&instr->instr, stderr);
1562 fprintf(stderr, "\n");
1563 }
1564 break;
1565 }
1566 case nir_op_ffract: {
1567 if (dst.size() == 1) {
1568 emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f32, dst);
1569 } else if (dst.size() == 2) {
1570 emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f64, dst);
1571 } else {
1572 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1573 nir_print_instr(&instr->instr, stderr);
1574 fprintf(stderr, "\n");
1575 }
1576 break;
1577 }
1578 case nir_op_ffloor: {
1579 if (dst.size() == 1) {
1580 emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f32, dst);
1581 } else if (dst.size() == 2) {
1582 emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f64, dst);
1583 } else {
1584 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1585 nir_print_instr(&instr->instr, stderr);
1586 fprintf(stderr, "\n");
1587 }
1588 break;
1589 }
1590 case nir_op_fceil: {
1591 if (dst.size() == 1) {
1592 emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f32, dst);
1593 } else if (dst.size() == 2) {
1594 emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f64, dst);
1595 } else {
1596 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1597 nir_print_instr(&instr->instr, stderr);
1598 fprintf(stderr, "\n");
1599 }
1600 break;
1601 }
1602 case nir_op_ftrunc: {
1603 if (dst.size() == 1) {
1604 emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f32, dst);
1605 } else if (dst.size() == 2) {
1606 emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f64, dst);
1607 } else {
1608 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1609 nir_print_instr(&instr->instr, stderr);
1610 fprintf(stderr, "\n");
1611 }
1612 break;
1613 }
1614 case nir_op_fround_even: {
1615 if (dst.size() == 1) {
1616 emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f32, dst);
1617 } else if (dst.size() == 2) {
1618 emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f64, dst);
1619 } else {
1620 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1621 nir_print_instr(&instr->instr, stderr);
1622 fprintf(stderr, "\n");
1623 }
1624 break;
1625 }
1626 case nir_op_fsin:
1627 case nir_op_fcos: {
1628 Temp src = get_alu_src(ctx, instr->src[0]);
1629 aco_ptr<Instruction> norm;
1630 if (dst.size() == 1) {
1631 Temp tmp;
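/* 0x3e22f983 is 1/(2*pi): v_sin_f32/v_cos_f32 take their input in revolutions */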
1632 Operand half_pi(0x3e22f983u);
1633 if (src.type() == RegType::sgpr)
1634 tmp = bld.vop2_e64(aco_opcode::v_mul_f32, bld.def(v1), half_pi, src);
1635 else
1636 tmp = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), half_pi, src);
1637
1638 /* before GFX9, v_sin_f32 and v_cos_f32 had a valid input domain of [-256, +256] */
1639 if (ctx->options->chip_class < GFX9)
1640 tmp = bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), tmp);
1641
1642 aco_opcode opcode = instr->op == nir_op_fsin ? aco_opcode::v_sin_f32 : aco_opcode::v_cos_f32;
1643 bld.vop1(opcode, Definition(dst), tmp);
1644 } else {
1645 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1646 nir_print_instr(&instr->instr, stderr);
1647 fprintf(stderr, "\n");
1648 }
1649 break;
1650 }
1651 case nir_op_ldexp: {
1652 if (dst.size() == 1) {
1653 bld.vop3(aco_opcode::v_ldexp_f32, Definition(dst),
1654 as_vgpr(ctx, get_alu_src(ctx, instr->src[0])),
1655 get_alu_src(ctx, instr->src[1]));
1656 } else if (dst.size() == 2) {
1657 bld.vop3(aco_opcode::v_ldexp_f64, Definition(dst),
1658 as_vgpr(ctx, get_alu_src(ctx, instr->src[0])),
1659 get_alu_src(ctx, instr->src[1]));
1660 } else {
1661 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1662 nir_print_instr(&instr->instr, stderr);
1663 fprintf(stderr, "\n");
1664 }
1665 break;
1666 }
1667 case nir_op_frexp_sig: {
1668 if (dst.size() == 1) {
1669 bld.vop1(aco_opcode::v_frexp_mant_f32, Definition(dst),
1670 get_alu_src(ctx, instr->src[0]));
1671 } else if (dst.size() == 2) {
1672 bld.vop1(aco_opcode::v_frexp_mant_f64, Definition(dst),
1673 get_alu_src(ctx, instr->src[0]));
1674 } else {
1675 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1676 nir_print_instr(&instr->instr, stderr);
1677 fprintf(stderr, "\n");
1678 }
1679 break;
1680 }
1681 case nir_op_frexp_exp: {
1682 if (instr->src[0].src.ssa->bit_size == 32) {
1683 bld.vop1(aco_opcode::v_frexp_exp_i32_f32, Definition(dst),
1684 get_alu_src(ctx, instr->src[0]));
1685 } else if (instr->src[0].src.ssa->bit_size == 64) {
1686 bld.vop1(aco_opcode::v_frexp_exp_i32_f64, Definition(dst),
1687 get_alu_src(ctx, instr->src[0]));
1688 } else {
1689 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1690 nir_print_instr(&instr->instr, stderr);
1691 fprintf(stderr, "\n");
1692 }
1693 break;
1694 }
1695 case nir_op_fsign: {
1696 Temp src = as_vgpr(ctx, get_alu_src(ctx, instr->src[0]));
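/* fsign with two selects: inputs greater than zero first become +1.0, then
 * anything still comparing below zero becomes -1.0; zero passes through. */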
1697 if (dst.size() == 1) {
1698 Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f32, bld.hint_vcc(bld.def(s2)), Operand(0u), src);
1699 src = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0x3f800000u), src, cond);
1700 cond = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(s2)), Operand(0u), src);
1701 bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0xbf800000u), src, cond);
1702 } else if (dst.size() == 2) {
1703 Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f64, bld.hint_vcc(bld.def(s2)), Operand(0u), src);
1704 Temp tmp = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0x3FF00000u));
1705 Temp upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp, emit_extract_vector(ctx, src, 1, v1), cond);
1706
1707 cond = bld.vopc(aco_opcode::v_cmp_le_f64, bld.hint_vcc(bld.def(s2)), Operand(0u), src);
1708 tmp = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0xBFF00000u));
1709 upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), tmp, upper, cond);
1710
1711 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand(0u), upper);
1712 } else {
1713 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1714 nir_print_instr(&instr->instr, stderr);
1715 fprintf(stderr, "\n");
1716 }
1717 break;
1718 }
1719 case nir_op_f2f32: {
1720 if (instr->src[0].src.ssa->bit_size == 64) {
1721 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f64, dst);
1722 } else {
1723 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1724 nir_print_instr(&instr->instr, stderr);
1725 fprintf(stderr, "\n");
1726 }
1727 break;
1728 }
1729 case nir_op_f2f64: {
1730 if (instr->src[0].src.ssa->bit_size == 32) {
1731 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f64_f32, dst);
1732 } else {
1733 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1734 nir_print_instr(&instr->instr, stderr);
1735 fprintf(stderr, "\n");
1736 }
1737 break;
1738 }
1739 case nir_op_i2f32: {
1740 assert(dst.size() == 1);
1741 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_i32, dst);
1742 break;
1743 }
1744 case nir_op_i2f64: {
1745 if (instr->src[0].src.ssa->bit_size == 32) {
1746 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f64_i32, dst);
1747 } else if (instr->src[0].src.ssa->bit_size == 64) {
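/* 64-bit int to f64: convert the low dword as unsigned and the high dword as
 * signed, scale the high half by 2^32 with v_ldexp_f64 and add the two parts. */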
1748 Temp src = get_alu_src(ctx, instr->src[0]);
1749 RegClass rc = RegClass(src.type(), 1);
1750 Temp lower = bld.tmp(rc), upper = bld.tmp(rc);
1751 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
1752 lower = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), lower);
1753 upper = bld.vop1(aco_opcode::v_cvt_f64_i32, bld.def(v2), upper);
1754 upper = bld.vop3(aco_opcode::v_ldexp_f64, bld.def(v2), upper, Operand(32u));
1755 bld.vop3(aco_opcode::v_add_f64, Definition(dst), lower, upper);
1756
1757 } else {
1758 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1759 nir_print_instr(&instr->instr, stderr);
1760 fprintf(stderr, "\n");
1761 }
1762 break;
1763 }
1764 case nir_op_u2f32: {
1765 assert(dst.size() == 1);
1766 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_u32, dst);
1767 break;
1768 }
1769 case nir_op_u2f64: {
1770 if (instr->src[0].src.ssa->bit_size == 32) {
1771 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f64_u32, dst);
1772 } else if (instr->src[0].src.ssa->bit_size == 64) {
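/* same as the 64-bit i2f64 path above, except the high dword is converted as unsigned too */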
1773 Temp src = get_alu_src(ctx, instr->src[0]);
1774 RegClass rc = RegClass(src.type(), 1);
1775 Temp lower = bld.tmp(rc), upper = bld.tmp(rc);
1776 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
1777 lower = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), lower);
1778 upper = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), upper);
1779 upper = bld.vop3(aco_opcode::v_ldexp_f64, bld.def(v2), upper, Operand(32u));
1780 bld.vop3(aco_opcode::v_add_f64, Definition(dst), lower, upper);
1781 } else {
1782 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1783 nir_print_instr(&instr->instr, stderr);
1784 fprintf(stderr, "\n");
1785 }
1786 break;
1787 }
1788 case nir_op_f2i32: {
1789 Temp src = get_alu_src(ctx, instr->src[0]);
1790 if (instr->src[0].src.ssa->bit_size == 32) {
1791 if (dst.type() == RegType::vgpr)
1792 bld.vop1(aco_opcode::v_cvt_i32_f32, Definition(dst), src);
1793 else
1794 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
1795 bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), src));
1796
1797 } else if (instr->src[0].src.ssa->bit_size == 64) {
1798 if (dst.type() == RegType::vgpr)
1799 bld.vop1(aco_opcode::v_cvt_i32_f64, Definition(dst), src);
1800 else
1801 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
1802 bld.vop1(aco_opcode::v_cvt_i32_f64, bld.def(v1), src));
1803
1804 } else {
1805 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1806 nir_print_instr(&instr->instr, stderr);
1807 fprintf(stderr, "\n");
1808 }
1809 break;
1810 }
1811 case nir_op_f2u32: {
1812 Temp src = get_alu_src(ctx, instr->src[0]);
1813 if (instr->src[0].src.ssa->bit_size == 32) {
1814 if (dst.type() == RegType::vgpr)
1815 bld.vop1(aco_opcode::v_cvt_u32_f32, Definition(dst), src);
1816 else
1817 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
1818 bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), src));
1819
1820 } else if (instr->src[0].src.ssa->bit_size == 64) {
1821 if (dst.type() == RegType::vgpr)
1822 bld.vop1(aco_opcode::v_cvt_u32_f64, Definition(dst), src);
1823 else
1824 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
1825 bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), src));
1826
1827 } else {
1828 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1829 nir_print_instr(&instr->instr, stderr);
1830 fprintf(stderr, "\n");
1831 }
1832 break;
1833 }
1834 case nir_op_f2i64: {
1835 Temp src = get_alu_src(ctx, instr->src[0]);
1836 if (instr->src[0].src.ssa->bit_size == 32 && dst.type() == RegType::vgpr) {
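/* manual f32 -> i64: clamp the unbiased exponent to [0, 64], widen the 24-bit
 * mantissa (with its implicit leading one) to 64 bits, shift it into integer
 * position, saturate on overflow and negate via xor/sub when the sign is set. */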
1837 Temp exponent = bld.vop1(aco_opcode::v_frexp_exp_i32_f32, bld.def(v1), src);
1838 exponent = bld.vop3(aco_opcode::v_med3_i32, bld.def(v1), Operand(0x0u), exponent, Operand(64u));
1839 Temp mantissa = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7fffffu), src);
1840 Temp sign = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), src);
1841 mantissa = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand(0x800000u), mantissa);
1842 mantissa = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(7u), mantissa);
1843 mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(0u), mantissa);
1844 Temp new_exponent = bld.tmp(v1);
1845 Temp borrow = bld.vsub32(Definition(new_exponent), Operand(63u), exponent, true).def(1).getTemp();
1846 mantissa = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), new_exponent, mantissa);
1847 Temp saturate = bld.vop1(aco_opcode::v_bfrev_b32, bld.def(v1), Operand(0xfffffffeu));
1848 Temp lower = bld.tmp(v1), upper = bld.tmp(v1);
1849 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
1850 lower = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), lower, Operand(0xffffffffu), borrow);
1851 upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), upper, saturate, borrow);
1852 lower = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), sign, lower);
1853 upper = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), sign, upper);
1854 Temp new_lower = bld.tmp(v1);
1855 borrow = bld.vsub32(Definition(new_lower), lower, sign, true).def(1).getTemp();
1856 Temp new_upper = bld.vsub32(bld.def(v1), upper, sign, false, borrow);
1857 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), new_lower, new_upper);
1858
1859 } else if (instr->src[0].src.ssa->bit_size == 32 && dst.type() == RegType::sgpr) {
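/* scalar variant of the conversion above: the exponent field is extracted with
 * s_bfe (offset 23, width 8) and un-biased by subtracting 126, then the same
 * shift/saturate/negate sequence runs on the SALU. */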
1860 if (src.type() == RegType::vgpr)
1861 src = bld.as_uniform(src);
1862 Temp exponent = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), src, Operand(0x80017u));
1863 exponent = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), exponent, Operand(126u));
1864 exponent = bld.sop2(aco_opcode::s_max_u32, bld.def(s1), bld.def(s1, scc), Operand(0u), exponent);
1865 exponent = bld.sop2(aco_opcode::s_min_u32, bld.def(s1), bld.def(s1, scc), Operand(64u), exponent);
1866 Temp mantissa = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), Operand(0x7fffffu), src);
1867 Temp sign = bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), src, Operand(31u));
1868 mantissa = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), Operand(0x800000u), mantissa);
1869 mantissa = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), mantissa, Operand(7u));
1870 mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), mantissa);
1871 exponent = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), Operand(63u), exponent);
1872 mantissa = bld.sop2(aco_opcode::s_lshr_b64, bld.def(s2), bld.def(s1, scc), mantissa, exponent);
1873 Temp cond = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), exponent, Operand(0xffffffffu)); // exp >= 64
1874 Temp saturate = bld.sop1(aco_opcode::s_brev_b64, bld.def(s2), Operand(0xfffffffeu));
1875 mantissa = bld.sop2(aco_opcode::s_cselect_b64, bld.def(s2), saturate, mantissa, cond);
1876 Temp lower = bld.tmp(s1), upper = bld.tmp(s1);
1877 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
1878 lower = bld.sop2(aco_opcode::s_xor_b32, bld.def(s1), bld.def(s1, scc), sign, lower);
1879 upper = bld.sop2(aco_opcode::s_xor_b32, bld.def(s1), bld.def(s1, scc), sign, upper);
1880 Temp borrow = bld.tmp(s1);
1881 lower = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), lower, sign);
1882 upper = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), upper, sign, borrow);
1883 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1884
1885 } else if (instr->src[0].src.ssa->bit_size == 64) {
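/* f64 -> i64 without 64-bit integer ops: trunc * 2^-32 (floored) gives the high
 * dword, an fma with -2^32 reconstructs the low dword, and each half is then
 * converted with a 32-bit conversion. */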
1886 Temp vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0x3df00000u));
1887 Temp trunc = bld.vop1(aco_opcode::v_trunc_f64, bld.def(v2), src);
1888 Temp mul = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), trunc, vec);
1889 vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0xc1f00000u));
1890 Temp floor = bld.vop1(aco_opcode::v_floor_f64, bld.def(v2), mul);
1891 Temp fma = bld.vop3(aco_opcode::v_fma_f64, bld.def(v2), floor, vec, trunc);
1892 Temp lower = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), fma);
1893 Temp upper = bld.vop1(aco_opcode::v_cvt_i32_f64, bld.def(v1), floor);
1894 if (dst.type() == RegType::sgpr) {
1895 lower = bld.as_uniform(lower);
1896 upper = bld.as_uniform(upper);
1897 }
1898 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1899
1900 } else {
1901 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1902 nir_print_instr(&instr->instr, stderr);
1903 fprintf(stderr, "\n");
1904 }
1905 break;
1906 }
1907 case nir_op_f2u64: {
1908 Temp src = get_alu_src(ctx, instr->src[0]);
1909 if (instr->src[0].src.ssa->bit_size == 32 && dst.type() == RegType::vgpr) {
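/* f32 -> u64: mantissas of small values (exponent < 24) take a 32-bit right
 * shift, larger ones a 64-bit shift; exponents above 64 saturate the result
 * to all ones. */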
1910 Temp exponent = bld.vop1(aco_opcode::v_frexp_exp_i32_f32, bld.def(v1), src);
1911 Temp exponent_in_range = bld.vopc(aco_opcode::v_cmp_ge_i32, bld.hint_vcc(bld.def(s2)), Operand(64u), exponent);
1912 exponent = bld.vop2(aco_opcode::v_max_i32, bld.def(v1), Operand(0x0u), exponent);
1913 Temp mantissa = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7fffffu), src);
1914 mantissa = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand(0x800000u), mantissa);
1915 Temp exponent_small = bld.vsub32(bld.def(v1), Operand(24u), exponent);
1916 Temp small = bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), exponent_small, mantissa);
1917 mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(0u), mantissa);
1918 Temp new_exponent = bld.tmp(v1);
1919 Temp cond_small = bld.vsub32(Definition(new_exponent), exponent, Operand(24u), true).def(1).getTemp();
1920 mantissa = bld.vop3(aco_opcode::v_lshlrev_b64, bld.def(v2), new_exponent, mantissa);
1921 Temp lower = bld.tmp(v1), upper = bld.tmp(v1);
1922 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
1923 lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), lower, small, cond_small);
1924 upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), upper, Operand(0u), cond_small);
1925 lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0xffffffffu), lower, exponent_in_range);
1926 upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0xffffffffu), upper, exponent_in_range);
1927 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1928
1929 } else if (instr->src[0].src.ssa->bit_size == 32 && dst.type() == RegType::sgpr) {
1930 if (src.type() == RegType::vgpr)
1931 src = bld.as_uniform(src);
1932 Temp exponent = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), src, Operand(0x80017u));
1933 exponent = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), exponent, Operand(126u));
1934 exponent = bld.sop2(aco_opcode::s_max_u32, bld.def(s1), bld.def(s1, scc), Operand(0u), exponent);
1935 Temp mantissa = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), Operand(0x7fffffu), src);
1936 mantissa = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), Operand(0x800000u), mantissa);
1937 Temp exponent_small = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), Operand(24u), exponent);
1938 Temp small = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), mantissa, exponent_small);
1939 mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), mantissa);
1940 Temp exponent_large = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), exponent, Operand(24u));
1941 mantissa = bld.sop2(aco_opcode::s_lshl_b64, bld.def(s2), bld.def(s1, scc), mantissa, exponent_large);
1942 Temp cond = bld.sopc(aco_opcode::s_cmp_ge_i32, bld.def(s1, scc), Operand(64u), exponent);
1943 mantissa = bld.sop2(aco_opcode::s_cselect_b64, bld.def(s2), mantissa, Operand(0xffffffffu), cond);
1944 Temp lower = bld.tmp(s1), upper = bld.tmp(s1);
1945 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
1946 Temp cond_small = bld.sopc(aco_opcode::s_cmp_le_i32, bld.def(s1, scc), exponent, Operand(24u));
1947 lower = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), small, lower, cond_small);
1948 upper = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), Operand(0u), upper, cond_small);
1949 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1950
1951 } else if (instr->src[0].src.ssa->bit_size == 64) {
1952 Temp vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0x3df00000u));
1953 Temp trunc = bld.vop1(aco_opcode::v_trunc_f64, bld.def(v2), src);
1954 Temp mul = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), trunc, vec);
1955 vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0xc1f00000u));
1956 Temp floor = bld.vop1(aco_opcode::v_floor_f64, bld.def(v2), mul);
1957 Temp fma = bld.vop3(aco_opcode::v_fma_f64, bld.def(v2), floor, vec, trunc);
1958 Temp lower = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), fma);
1959 Temp upper = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), floor);
1960 if (dst.type() == RegType::sgpr) {
1961 lower = bld.as_uniform(lower);
1962 upper = bld.as_uniform(upper);
1963 }
1964 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1965
1966 } else {
1967 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1968 nir_print_instr(&instr->instr, stderr);
1969 fprintf(stderr, "\n");
1970 }
1971 break;
1972 }
1973 case nir_op_b2f32: {
1974 Temp src = get_alu_src(ctx, instr->src[0]);
1975 assert(src.regClass() == s2);
1976
1977 if (dst.regClass() == s1) {
1978 src = bool_to_scalar_condition(ctx, src);
1979 bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand(0x3f800000u), src);
1980 } else if (dst.regClass() == v1) {
1981 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand(0x3f800000u), src);
1982 } else {
1983 unreachable("Wrong destination register class for nir_op_b2f32.");
1984 }
1985 break;
1986 }
1987 case nir_op_b2f64: {
1988 Temp src = get_alu_src(ctx, instr->src[0]);
1989 assert(src.regClass() == s2);
1990
1991 if (dst.regClass() == s2) {
1992 src = bool_to_scalar_condition(ctx, src);
1993 bld.sop2(aco_opcode::s_cselect_b64, Definition(dst), Operand(0x3f800000u), Operand(0u), bld.scc(src));
1994 } else if (dst.regClass() == v2) {
1995 Temp one = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0x3FF00000u));
1996 Temp upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), one, src);
1997 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand(0u), upper);
1998 } else {
1999 unreachable("Wrong destination register class for nir_op_b2f64.");
2000 }
2001 break;
2002 }
2003 case nir_op_i2i32: {
2004 Temp src = get_alu_src(ctx, instr->src[0]);
2005 if (instr->src[0].src.ssa->bit_size == 64) {
2006 /* truncation: dst just takes the lower dword of src */
2007 emit_extract_vector(ctx, src, 0, dst);
2008 } else {
2009 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2010 nir_print_instr(&instr->instr, stderr);
2011 fprintf(stderr, "\n");
2012 }
2013 break;
2014 }
2015 case nir_op_u2u32: {
2016 Temp src = get_alu_src(ctx, instr->src[0]);
2017 if (instr->src[0].src.ssa->bit_size == 16) {
2018 if (dst.regClass() == s1) {
2019 bld.sop2(aco_opcode::s_and_b32, Definition(dst), bld.def(s1, scc), Operand(0xFFFFu), src);
2020 } else {
2021 // TODO: do better with SDWA
2022 bld.vop2(aco_opcode::v_and_b32, Definition(dst), Operand(0xFFFFu), src);
2023 }
2024 } else if (instr->src[0].src.ssa->bit_size == 64) {
2025 /* truncation: dst just takes the lower dword of src */
2026 emit_extract_vector(ctx, src, 0, dst);
2027 } else {
2028 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2029 nir_print_instr(&instr->instr, stderr);
2030 fprintf(stderr, "\n");
2031 }
2032 break;
2033 }
2034 case nir_op_i2i64: {
2035 Temp src = get_alu_src(ctx, instr->src[0]);
2036 if (src.regClass() == s1) {
2037 Temp high = bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), src, Operand(31u));
2038 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src, high);
2039 } else if (src.regClass() == v1) {
2040 Temp high = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), src);
2041 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src, high);
2042 } else {
2043 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2044 nir_print_instr(&instr->instr, stderr);
2045 fprintf(stderr, "\n");
2046 }
2047 break;
2048 }
2049 case nir_op_u2u64: {
2050 Temp src = get_alu_src(ctx, instr->src[0]);
2051 if (instr->src[0].src.ssa->bit_size == 32) {
2052 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src, Operand(0u));
2053 } else {
2054 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2055 nir_print_instr(&instr->instr, stderr);
2056 fprintf(stderr, "\n");
2057 }
2058 break;
2059 }
2060 case nir_op_b2i32: {
2061 Temp src = get_alu_src(ctx, instr->src[0]);
2062 assert(src.regClass() == s2);
2063
2064 if (dst.regClass() == s1) {
2065 // TODO: in a post-RA optimization, we can check if src is in VCC, and directly use VCCNZ
2066 bool_to_scalar_condition(ctx, src, dst);
2067 } else if (dst.regClass() == v1) {
2068 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand(1u), src);
2069 } else {
2070 unreachable("Invalid register class for b2i32");
2071 }
2072 break;
2073 }
2074 case nir_op_i2b1: {
2075 Temp src = get_alu_src(ctx, instr->src[0]);
2076 assert(dst.regClass() == s2);
2077
2078 if (src.type() == RegType::vgpr) {
2079 assert(src.regClass() == v1 || src.regClass() == v2);
2080 bld.vopc(src.size() == 2 ? aco_opcode::v_cmp_lg_u64 : aco_opcode::v_cmp_lg_u32,
2081 Definition(dst), Operand(0u), src).def(0).setHint(vcc);
2082 } else {
2083 assert(src.regClass() == s1 || src.regClass() == s2);
2084 Temp tmp = bld.sopc(src.size() == 2 ? aco_opcode::s_cmp_lg_u64 : aco_opcode::s_cmp_lg_u32,
2085 bld.scc(bld.def(s1)), Operand(0u), src);
2086 bool_to_vector_condition(ctx, tmp, dst);
2087 }
2088 break;
2089 }
2090 case nir_op_pack_64_2x32_split: {
2091 Temp src0 = get_alu_src(ctx, instr->src[0]);
2092 Temp src1 = get_alu_src(ctx, instr->src[1]);
2093
2094 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src0, src1);
2095 break;
2096 }
2097 case nir_op_unpack_64_2x32_split_x:
2098 bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(dst.regClass()), get_alu_src(ctx, instr->src[0]));
2099 break;
2100 case nir_op_unpack_64_2x32_split_y:
2101 bld.pseudo(aco_opcode::p_split_vector, bld.def(dst.regClass()), Definition(dst), get_alu_src(ctx, instr->src[0]));
2102 break;
2103 case nir_op_pack_half_2x16: {
2104 Temp src = get_alu_src(ctx, instr->src[0], 2);
2105
2106 if (dst.regClass() == v1) {
2107 Temp src0 = bld.tmp(v1);
2108 Temp src1 = bld.tmp(v1);
2109 bld.pseudo(aco_opcode::p_split_vector, Definition(src0), Definition(src1), src);
2110 if (!ctx->block->fp_mode.care_about_round32 || ctx->block->fp_mode.round32 == fp_round_tz)
2111 bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32, Definition(dst), src0, src1);
2112 else
2113 bld.vop3(aco_opcode::v_cvt_pk_u16_u32, Definition(dst),
2114 bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v1), src0),
2115 bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v1), src1));
2116 } else {
2117 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2118 nir_print_instr(&instr->instr, stderr);
2119 fprintf(stderr, "\n");
2120 }
2121 break;
2122 }
2123 case nir_op_unpack_half_2x16_split_x: {
2124 if (dst.regClass() == v1) {
2125 Builder bld(ctx->program, ctx->block);
2126 bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst), get_alu_src(ctx, instr->src[0]));
2127 } else {
2128 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2129 nir_print_instr(&instr->instr, stderr);
2130 fprintf(stderr, "\n");
2131 }
2132 break;
2133 }
2134 case nir_op_unpack_half_2x16_split_y: {
2135 if (dst.regClass() == v1) {
2136 Builder bld(ctx->program, ctx->block);
2137 /* TODO: use SDWA here */
2138 bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst),
2139 bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), Operand(16u), as_vgpr(ctx, get_alu_src(ctx, instr->src[0]))));
2140 } else {
2141 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2142 nir_print_instr(&instr->instr, stderr);
2143 fprintf(stderr, "\n");
2144 }
2145 break;
2146 }
2147 case nir_op_fquantize2f16: {
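/* quantize by converting f32 -> f16 -> f32; v_cmp_class with the mask below
 * detects f16 denormals, which are flushed to (signed) zero. */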
2148 Temp src = get_alu_src(ctx, instr->src[0]);
2149 Temp f16 = bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v1), src);
2150
2151 Temp mask = bld.copy(bld.def(s1), Operand(0x36Fu)); /* 0x36F = every FP class except +/- denormal, i.e. true when the f16 value is not a denormal */
2152
2153 Temp cmp_res = bld.tmp(s2);
2154 bld.vopc_e64(aco_opcode::v_cmp_class_f16, Definition(cmp_res), f16, mask).def(0).setHint(vcc);
2155
2156 Temp f32 = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), f16);
2157
2158 if (ctx->block->fp_mode.preserve_signed_zero_inf_nan32) {
2159 Temp copysign_0 = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0u), as_vgpr(ctx, src));
2160 bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), copysign_0, f32, cmp_res);
2161 } else {
2162 bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), f32, cmp_res);
2163 }
2164 break;
2165 }
2166 case nir_op_bfm: {
2167 Temp bits = get_alu_src(ctx, instr->src[0]);
2168 Temp offset = get_alu_src(ctx, instr->src[1]);
2169
2170 if (dst.regClass() == s1) {
2171 bld.sop2(aco_opcode::s_bfm_b32, Definition(dst), bits, offset);
2172 } else if (dst.regClass() == v1) {
2173 bld.vop3(aco_opcode::v_bfm_b32, Definition(dst), bits, offset);
2174 } else {
2175 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2176 nir_print_instr(&instr->instr, stderr);
2177 fprintf(stderr, "\n");
2178 }
2179 break;
2180 }
2181 case nir_op_bitfield_select: {
2182 /* (mask & insert) | (~mask & base) */
2183 Temp bitmask = get_alu_src(ctx, instr->src[0]);
2184 Temp insert = get_alu_src(ctx, instr->src[1]);
2185 Temp base = get_alu_src(ctx, instr->src[2]);
2186
2187 /* dst = (insert & bitmask) | (base & ~bitmask) */
2188 if (dst.regClass() == s1) {
2189 aco_ptr<Instruction> sop2;
2190 nir_const_value* const_bitmask = nir_src_as_const_value(instr->src[0].src);
2191 nir_const_value* const_insert = nir_src_as_const_value(instr->src[1].src);
2192 Operand lhs;
2193 if (const_insert && const_bitmask) {
2194 lhs = Operand(const_insert->u32 & const_bitmask->u32);
2195 } else {
2196 insert = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), insert, bitmask);
2197 lhs = Operand(insert);
2198 }
2199
2200 Operand rhs;
2201 nir_const_value* const_base = nir_src_as_const_value(instr->src[2].src);
2202 if (const_base && const_bitmask) {
2203 rhs = Operand(const_base->u32 & ~const_bitmask->u32);
2204 } else {
2205 base = bld.sop2(aco_opcode::s_andn2_b32, bld.def(s1), bld.def(s1, scc), base, bitmask);
2206 rhs = Operand(base);
2207 }
2208
2209 bld.sop2(aco_opcode::s_or_b32, Definition(dst), bld.def(s1, scc), rhs, lhs);
2210
2211 } else if (dst.regClass() == v1) {
2212 if (base.type() == RegType::sgpr && (bitmask.type() == RegType::sgpr || (insert.type() == RegType::sgpr)))
2213 base = as_vgpr(ctx, base);
2214 if (insert.type() == RegType::sgpr && bitmask.type() == RegType::sgpr)
2215 insert = as_vgpr(ctx, insert);
2216
2217 bld.vop3(aco_opcode::v_bfi_b32, Definition(dst), bitmask, insert, base);
2218
2219 } else {
2220 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2221 nir_print_instr(&instr->instr, stderr);
2222 fprintf(stderr, "\n");
2223 }
2224 break;
2225 }
2226 case nir_op_ubfe:
2227 case nir_op_ibfe: {
2228 Temp base = get_alu_src(ctx, instr->src[0]);
2229 Temp offset = get_alu_src(ctx, instr->src[1]);
2230 Temp bits = get_alu_src(ctx, instr->src[2]);
2231
2232 if (dst.type() == RegType::sgpr) {
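/* s_bfe takes a packed extract operand: bits [5:0] hold the offset and
 * bits [22:16] the field width. */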
2233 Operand extract;
2234 nir_const_value* const_offset = nir_src_as_const_value(instr->src[1].src);
2235 nir_const_value* const_bits = nir_src_as_const_value(instr->src[2].src);
2236 if (const_offset && const_bits) {
2237 uint32_t const_extract = (const_bits->u32 << 16) | const_offset->u32;
2238 extract = Operand(const_extract);
2239 } else {
2240 Operand width;
2241 if (const_bits) {
2242 width = Operand(const_bits->u32 << 16);
2243 } else {
2244 width = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), bits, Operand(16u));
2245 }
2246 extract = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), offset, width);
2247 }
2248
2249 aco_opcode opcode;
2250 if (dst.regClass() == s1) {
2251 if (instr->op == nir_op_ubfe)
2252 opcode = aco_opcode::s_bfe_u32;
2253 else
2254 opcode = aco_opcode::s_bfe_i32;
2255 } else if (dst.regClass() == s2) {
2256 if (instr->op == nir_op_ubfe)
2257 opcode = aco_opcode::s_bfe_u64;
2258 else
2259 opcode = aco_opcode::s_bfe_i64;
2260 } else {
2261 unreachable("Unsupported BFE bit size");
2262 }
2263
2264 bld.sop2(opcode, Definition(dst), bld.def(s1, scc), base, extract);
2265
2266 } else {
2267 aco_opcode opcode;
2268 if (dst.regClass() == v1) {
2269 if (instr->op == nir_op_ubfe)
2270 opcode = aco_opcode::v_bfe_u32;
2271 else
2272 opcode = aco_opcode::v_bfe_i32;
2273 } else {
2274 unreachable("Unsupported BFE bit size");
2275 }
2276
2277 emit_vop3a_instruction(ctx, instr, opcode, dst);
2278 }
2279 break;
2280 }
2281 case nir_op_bit_count: {
2282 Temp src = get_alu_src(ctx, instr->src[0]);
2283 if (src.regClass() == s1) {
2284 bld.sop1(aco_opcode::s_bcnt1_i32_b32, Definition(dst), bld.def(s1, scc), src);
2285 } else if (src.regClass() == v1) {
2286 bld.vop3(aco_opcode::v_bcnt_u32_b32, Definition(dst), src, Operand(0u));
2287 } else if (src.regClass() == v2) {
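/* v_bcnt_u32_b32 adds its second source operand, so the popcount of the low
 * dword is simply chained into the count of the high dword. */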
2288 bld.vop3(aco_opcode::v_bcnt_u32_b32, Definition(dst),
2289 emit_extract_vector(ctx, src, 1, v1),
2290 bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1),
2291 emit_extract_vector(ctx, src, 0, v1), Operand(0u)));
2292 } else if (src.regClass() == s2) {
2293 bld.sop1(aco_opcode::s_bcnt1_i32_b64, Definition(dst), bld.def(s1, scc), src);
2294 } else {
2295 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2296 nir_print_instr(&instr->instr, stderr);
2297 fprintf(stderr, "\n");
2298 }
2299 break;
2300 }
2301 case nir_op_flt: {
2302 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_f32, aco_opcode::v_cmp_lt_f64);
2303 break;
2304 }
2305 case nir_op_fge: {
2306 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_f32, aco_opcode::v_cmp_ge_f64);
2307 break;
2308 }
2309 case nir_op_feq: {
2310 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_eq_f32, aco_opcode::v_cmp_eq_f64);
2311 break;
2312 }
2313 case nir_op_fne: {
2314 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_neq_f32, aco_opcode::v_cmp_neq_f64);
2315 break;
2316 }
2317 case nir_op_ilt: {
2318 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_i32, aco_opcode::v_cmp_lt_i64, aco_opcode::s_cmp_lt_i32);
2319 break;
2320 }
2321 case nir_op_ige: {
2322 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_i32, aco_opcode::v_cmp_ge_i64, aco_opcode::s_cmp_ge_i32);
2323 break;
2324 }
2325 case nir_op_ieq: {
2326 if (instr->src[0].src.ssa->bit_size == 1)
2327 emit_boolean_logic(ctx, instr, aco_opcode::s_xnor_b64, dst);
2328 else
2329 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_eq_i32, aco_opcode::v_cmp_eq_i64, aco_opcode::s_cmp_eq_i32, aco_opcode::s_cmp_eq_u64);
2330 break;
2331 }
2332 case nir_op_ine: {
2333 if (instr->src[0].src.ssa->bit_size == 1)
2334 emit_boolean_logic(ctx, instr, aco_opcode::s_xor_b64, dst);
2335 else
2336 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lg_i32, aco_opcode::v_cmp_lg_i64, aco_opcode::s_cmp_lg_i32, aco_opcode::s_cmp_lg_u64);
2337 break;
2338 }
2339 case nir_op_ult: {
2340 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_u32, aco_opcode::v_cmp_lt_u64, aco_opcode::s_cmp_lt_u32);
2341 break;
2342 }
2343 case nir_op_uge: {
2344 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_u32, aco_opcode::v_cmp_ge_u64, aco_opcode::s_cmp_ge_u32);
2345 break;
2346 }
2347 case nir_op_fddx:
2348 case nir_op_fddy:
2349 case nir_op_fddx_fine:
2350 case nir_op_fddy_fine:
2351 case nir_op_fddx_coarse:
2352 case nir_op_fddy_coarse: {
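/* screen-space derivatives via DPP quad permutes: one mov broadcasts the
 * reference lane of each quad into tl, then v_sub_f32 reads the neighbouring
 * lane selected by dpp_ctrl and subtracts; the result is wrapped in WQM so
 * helper lanes hold valid data. */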
2353 Definition tl = bld.def(v1);
2354 uint16_t dpp_ctrl;
2355 if (instr->op == nir_op_fddx_fine) {
2356 bld.vop1_dpp(aco_opcode::v_mov_b32, tl, get_alu_src(ctx, instr->src[0]), dpp_quad_perm(0, 0, 2, 2));
2357 dpp_ctrl = dpp_quad_perm(1, 1, 3, 3);
2358 } else if (instr->op == nir_op_fddy_fine) {
2359 bld.vop1_dpp(aco_opcode::v_mov_b32, tl, get_alu_src(ctx, instr->src[0]), dpp_quad_perm(0, 1, 0, 1));
2360 dpp_ctrl = dpp_quad_perm(2, 3, 2, 3);
2361 } else {
2362 bld.vop1_dpp(aco_opcode::v_mov_b32, tl, get_alu_src(ctx, instr->src[0]), dpp_quad_perm(0, 0, 0, 0));
2363 if (instr->op == nir_op_fddx || instr->op == nir_op_fddx_coarse)
2364 dpp_ctrl = dpp_quad_perm(1, 1, 1, 1);
2365 else
2366 dpp_ctrl = dpp_quad_perm(2, 2, 2, 2);
2367 }
2368
2369 Definition tmp = bld.def(v1);
2370 bld.vop2_dpp(aco_opcode::v_sub_f32, tmp, get_alu_src(ctx, instr->src[0]), tl.getTemp(), dpp_ctrl);
2371 emit_wqm(ctx, tmp.getTemp(), dst, true);
2372 break;
2373 }
2374 default:
2375 fprintf(stderr, "Unknown NIR ALU instr: ");
2376 nir_print_instr(&instr->instr, stderr);
2377 fprintf(stderr, "\n");
2378 }
2379 }
2380
2381 void visit_load_const(isel_context *ctx, nir_load_const_instr *instr)
2382 {
2383 Temp dst = get_ssa_temp(ctx, &instr->def);
2384
2385 // TODO: we really want to have the resulting type as this would allow for 64bit literals
2386 // which get truncated the lsb if double and msb if int
2387 // for now, we only use s_mov_b64 with 64bit inline constants
2388 assert(instr->def.num_components == 1 && "Vector load_const should be lowered to scalar.");
2389 assert(dst.type() == RegType::sgpr);
2390
2391 Builder bld(ctx->program, ctx->block);
2392
2393 if (instr->def.bit_size == 1) {
2394 assert(dst.regClass() == s2);
2395 bld.sop1(aco_opcode::s_mov_b64, Definition(dst), Operand((uint64_t)(instr->value[0].b ? -1 : 0)));
2396 } else if (dst.size() == 1) {
2397 bld.copy(Definition(dst), Operand(instr->value[0].u32));
2398 } else {
2399 assert(dst.size() != 1);
2400 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
2401 if (instr->def.bit_size == 64)
2402 for (unsigned i = 0; i < dst.size(); i++)
2403 vec->operands[i] = Operand{(uint32_t)(instr->value[0].u64 >> i * 32)};
2404 else {
2405 for (unsigned i = 0; i < dst.size(); i++)
2406 vec->operands[i] = Operand{instr->value[i].u32};
2407 }
2408 vec->definitions[0] = Definition(dst);
2409 ctx->block->instructions.emplace_back(std::move(vec));
2410 }
2411 }
2412
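/* Expand each set bit of 'mask' into 'multiplier' consecutive bits, e.g.
 * widen_mask(0b101, 2) == 0b110011; used to widen a writemask when each
 * component occupies several 32-bit slots (64-bit stores). */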
2413 uint32_t widen_mask(uint32_t mask, unsigned multiplier)
2414 {
2415 uint32_t new_mask = 0;
2416 for(unsigned i = 0; i < 32 && (1u << i) <= mask; ++i)
2417 if (mask & (1u << i))
2418 new_mask |= ((1u << multiplier) - 1u) << (i * multiplier);
2419 return new_mask;
2420 }
2421
2422 void visit_store_vs_output(isel_context *ctx, nir_intrinsic_instr *instr)
2423 {
2424 /* This wouldn't work inside control flow or with indirect offsets but
2425 * that doesn't happen because of nir_lower_io_to_temporaries(). */
2426
2427 unsigned write_mask = nir_intrinsic_write_mask(instr);
2428 unsigned component = nir_intrinsic_component(instr);
2429 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
2430 unsigned idx = nir_intrinsic_base(instr) + component;
2431
2432 nir_instr *off_instr = instr->src[1].ssa->parent_instr;
2433 if (off_instr->type != nir_instr_type_load_const) {
2434 fprintf(stderr, "Unimplemented nir_intrinsic_store_output offset\n");
2435 nir_print_instr(off_instr, stderr);
2436 fprintf(stderr, "\n");
2437 }
2438 idx += nir_instr_as_load_const(off_instr)->value[0].u32 * 4u;
2439
2440 if (instr->src[0].ssa->bit_size == 64)
2441 write_mask = widen_mask(write_mask, 2);
2442
2443 for (unsigned i = 0; i < 8; ++i) {
2444 if (write_mask & (1 << i)) {
2445 ctx->vs_output.mask[idx / 4u] |= 1 << (idx % 4u);
2446 ctx->vs_output.outputs[idx / 4u][idx % 4u] = emit_extract_vector(ctx, src, i, v1);
2447 }
2448 idx++;
2449 }
2450 }
2451
2452 void visit_store_fs_output(isel_context *ctx, nir_intrinsic_instr *instr)
2453 {
2454 unsigned write_mask = nir_intrinsic_write_mask(instr);
2455 Operand values[4];
2456 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
2457 for (unsigned i = 0; i < 4; ++i) {
2458 if (write_mask & (1 << i)) {
2459 Temp tmp = emit_extract_vector(ctx, src, i, v1);
2460 values[i] = Operand(tmp);
2461 } else {
2462 values[i] = Operand(v1);
2463 }
2464 }
2465
2466 unsigned index = nir_intrinsic_base(instr) / 4;
2467 unsigned target, col_format;
2468 unsigned enabled_channels = 0xF;
2469 aco_opcode compr_op = (aco_opcode)0;
2470
2471 nir_const_value* offset = nir_src_as_const_value(instr->src[1]);
2472 assert(offset && "Non-const offsets on exports not yet supported");
2473 index += offset->u32;
2474
2475 assert(index != FRAG_RESULT_COLOR);
2476
2477 /* Unlike vertex shader exports, it's fine to use multiple exports to
2478 * export separate channels of one target. So shaders which export both
2479 * FRAG_RESULT_SAMPLE_MASK and FRAG_RESULT_DEPTH should work fine.
2480 * TODO: combine the exports in those cases and create better code
2481 */
2482
2483 if (index == FRAG_RESULT_SAMPLE_MASK) {
2484
2485 if (ctx->program->info->ps.writes_z) {
2486 target = V_008DFC_SQ_EXP_MRTZ;
2487 enabled_channels = 0x4;
2488 col_format = (unsigned) -1;
2489
2490 values[2] = values[0];
2491 values[0] = Operand(v1);
2492 } else {
2493 aco_ptr<Export_instruction> exp{create_instruction<Export_instruction>(aco_opcode::exp, Format::EXP, 4, 0)};
2494 exp->valid_mask = false;
2495 exp->done = false;
2496 exp->compressed = true;
2497 exp->dest = V_008DFC_SQ_EXP_MRTZ;
2498 exp->enabled_mask = 0xc;
2499 for (int i = 0; i < 4; i++)
2500 exp->operands[i] = Operand(v1);
2501 exp->operands[1] = Operand(values[0]);
2502 ctx->block->instructions.emplace_back(std::move(exp));
2503 return;
2504 }
2505
2506 } else if (index == FRAG_RESULT_DEPTH) {
2507
2508 target = V_008DFC_SQ_EXP_MRTZ;
2509 enabled_channels = 0x1;
2510 col_format = (unsigned) -1;
2511
2512 } else if (index == FRAG_RESULT_STENCIL) {
2513
2514 if (ctx->program->info->ps.writes_z) {
2515 target = V_008DFC_SQ_EXP_MRTZ;
2516 enabled_channels = 0x2;
2517 col_format = (unsigned) -1;
2518
2519 values[1] = values[0];
2520 values[0] = Operand(v1);
2521 } else {
2522 aco_ptr<Instruction> shift{create_instruction<VOP2_instruction>(aco_opcode::v_lshlrev_b32, Format::VOP2, 2, 1)};
2523 shift->operands[0] = Operand((uint32_t) 16);
2524 shift->operands[1] = values[0];
2525 Temp tmp = {ctx->program->allocateId(), v1};
2526 shift->definitions[0] = Definition(tmp);
2527 ctx->block->instructions.emplace_back(std::move(shift));
2528
2529 aco_ptr<Export_instruction> exp{create_instruction<Export_instruction>(aco_opcode::exp, Format::EXP, 4, 0)};
2530 exp->valid_mask = false;
2531 exp->done = false;
2532 exp->compressed = true;
2533 exp->dest = V_008DFC_SQ_EXP_MRTZ;
2534 exp->enabled_mask = 0x3;
2535 exp->operands[0] = Operand(tmp);
2536 for (int i = 1; i < 4; i++)
2537 exp->operands[i] = Operand(v1);
2538 ctx->block->instructions.emplace_back(std::move(exp));
2539 return;
2540 }
2541
2542 } else {
2543 index -= FRAG_RESULT_DATA0;
2544 target = V_008DFC_SQ_EXP_MRT + index;
2545 col_format = (ctx->options->key.fs.col_format >> (4 * index)) & 0xf;
2546 }
2547 ASSERTED bool is_int8 = (ctx->options->key.fs.is_int8 >> index) & 1;
2548 ASSERTED bool is_int10 = (ctx->options->key.fs.is_int10 >> index) & 1;
2549 assert(!is_int8 && !is_int10);
2550
2551 switch (col_format)
2552 {
2553 case V_028714_SPI_SHADER_ZERO:
2554 enabled_channels = 0; /* writemask */
2555 target = V_008DFC_SQ_EXP_NULL;
2556 break;
2557
2558 case V_028714_SPI_SHADER_32_R:
2559 enabled_channels = 1;
2560 break;
2561
2562 case V_028714_SPI_SHADER_32_GR:
2563 enabled_channels = 0x3;
2564 break;
2565
2566 case V_028714_SPI_SHADER_32_AR:
2567 if (ctx->options->chip_class >= GFX10) {
2568 /* Special case: on GFX10, the outputs are different for 32_AR */
2569 enabled_channels = 0x3;
2570 values[1] = values[3];
2571 } else {
2572 enabled_channels = 0x9;
2573 }
2574 break;
2575
2576 case V_028714_SPI_SHADER_FP16_ABGR:
2577 enabled_channels = 0x5;
2578 compr_op = aco_opcode::v_cvt_pkrtz_f16_f32;
2579 break;
2580
2581 case V_028714_SPI_SHADER_UNORM16_ABGR:
2582 enabled_channels = 0x5;
2583 compr_op = aco_opcode::v_cvt_pknorm_u16_f32;
2584 break;
2585
2586 case V_028714_SPI_SHADER_SNORM16_ABGR:
2587 enabled_channels = 0x5;
2588 compr_op = aco_opcode::v_cvt_pknorm_i16_f32;
2589 break;
2590
2591 case V_028714_SPI_SHADER_UINT16_ABGR:
2592 enabled_channels = 0x5;
2593 compr_op = aco_opcode::v_cvt_pk_u16_u32;
2594 break;
2595
2596 case V_028714_SPI_SHADER_SINT16_ABGR:
2597 enabled_channels = 0x5;
2598 compr_op = aco_opcode::v_cvt_pk_i16_i32;
2599 break;
2600
2601 case V_028714_SPI_SHADER_32_ABGR:
2602 enabled_channels = 0xF;
2603 break;
2604
2605 default:
2606 break;
2607 }
2608
2609 if (target == V_008DFC_SQ_EXP_NULL)
2610 return;
2611
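/* compressed exports pack two 16-bit values per 32-bit operand; convert each
 * enabled pair of channels with the pk opcode selected above */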
2612 if ((bool)compr_op)
2613 {
2614 for (int i = 0; i < 2; i++)
2615 {
2616 /* check if at least one of the values to be compressed is enabled */
2617 unsigned enabled = (write_mask >> (i*2) | write_mask >> (i*2+1)) & 0x1;
2618 if (enabled) {
2619 enabled_channels |= enabled << (i*2);
2620 aco_ptr<VOP3A_instruction> compr{create_instruction<VOP3A_instruction>(compr_op, Format::VOP3A, 2, 1)};
2621 Temp tmp{ctx->program->allocateId(), v1};
2622 compr->operands[0] = values[i*2].isUndefined() ? Operand(0u) : values[i*2];
2623 compr->operands[1] = values[i*2+1].isUndefined() ? Operand(0u): values[i*2+1];
2624 compr->definitions[0] = Definition(tmp);
2625 values[i] = Operand(tmp);
2626 ctx->block->instructions.emplace_back(std::move(compr));
2627 } else {
2628 values[i] = Operand(v1);
2629 }
2630 }
2631 }
2632
2633 aco_ptr<Export_instruction> exp{create_instruction<Export_instruction>(aco_opcode::exp, Format::EXP, 4, 0)};
2634 exp->valid_mask = false;
2635 exp->done = false;
2636 exp->compressed = (bool) compr_op;
2637 exp->dest = target;
2638 exp->enabled_mask = enabled_channels;
2639 if ((bool) compr_op) {
2640 for (int i = 0; i < 2; i++)
2641 exp->operands[i] = enabled_channels & (3 << (i * 2)) ? values[i] : Operand(v1);
2642 exp->operands[2] = Operand(v1);
2643 exp->operands[3] = Operand(v1);
2644 } else {
2645 for (int i = 0; i < 4; i++)
2646 exp->operands[i] = enabled_channels & (1 << i) ? values[i] : Operand(v1);
2647 }
2648
2649 ctx->block->instructions.emplace_back(std::move(exp));
2650 }
2651
2652 Operand load_lds_size_m0(isel_context *ctx)
2653 {
2654 /* TODO: m0 does not need to be initialized on GFX9+ */
2655 Builder bld(ctx->program, ctx->block);
2656 return bld.m0((Temp)bld.sopk(aco_opcode::s_movk_i32, bld.def(s1, m0), 0xffff));
2657 }
2658
2659 void load_lds(isel_context *ctx, unsigned elem_size_bytes, Temp dst,
2660 Temp address, unsigned base_offset, unsigned align)
2661 {
2662 assert(util_is_power_of_two_nonzero(align) && align >= 4);
2663
2664 Builder bld(ctx->program, ctx->block);
2665
2666 Operand m = load_lds_size_m0(ctx);
2667
2668 unsigned num_components = dst.size() * 4u / elem_size_bytes;
2669 unsigned bytes_read = 0;
2670 unsigned result_size = 0;
2671 unsigned total_bytes = num_components * elem_size_bytes;
2672 std::array<Temp, 4> result;
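/* pick the widest DS read the remaining size and alignment allow (b128, b96,
 * b64, b32 or the read2 forms) and split the loaded data into components */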
2673
2674 while (bytes_read < total_bytes) {
2675 unsigned todo = total_bytes - bytes_read;
2676 bool aligned8 = bytes_read % 8 == 0 && align % 8 == 0;
2677 bool aligned16 = bytes_read % 16 == 0 && align % 16 == 0;
2678
2679 aco_opcode op = aco_opcode::last_opcode;
2680 bool read2 = false;
2681 if (todo >= 16 && aligned16) {
2682 op = aco_opcode::ds_read_b128;
2683 todo = 16;
2684 } else if (todo >= 16 && aligned8) {
2685 op = aco_opcode::ds_read2_b64;
2686 read2 = true;
2687 todo = 16;
2688 } else if (todo >= 12 && aligned16) {
2689 op = aco_opcode::ds_read_b96;
2690 todo = 12;
2691 } else if (todo >= 8 && aligned8) {
2692 op = aco_opcode::ds_read_b64;
2693 todo = 8;
2694 } else if (todo >= 8) {
2695 op = aco_opcode::ds_read2_b32;
2696 read2 = true;
2697 todo = 8;
2698 } else if (todo >= 4) {
2699 op = aco_opcode::ds_read_b32;
2700 todo = 4;
2701 } else {
2702 assert(false);
2703 }
2704 assert(todo % elem_size_bytes == 0);
2705 unsigned num_elements = todo / elem_size_bytes;
2706 unsigned offset = base_offset + bytes_read;
2707 unsigned max_offset = read2 ? 1019 : 65535;
2708
2709 Temp address_offset = address;
2710 if (offset > max_offset) {
2711 address_offset = bld.vadd32(bld.def(v1), Operand(base_offset), address_offset);
2712 offset = bytes_read;
2713 }
2714 assert(offset <= max_offset); /* bytes_read shouldn't be large enough for this to happen */
2715
2716 Temp res;
2717 if (num_components == 1 && dst.type() == RegType::vgpr)
2718 res = dst;
2719 else
2720 res = bld.tmp(RegClass(RegType::vgpr, todo / 4));
2721
2722 if (read2)
2723 res = bld.ds(op, Definition(res), address_offset, m, offset >> 2, (offset >> 2) + 1);
2724 else
2725 res = bld.ds(op, Definition(res), address_offset, m, offset);
2726
2727 if (num_components == 1) {
2728 assert(todo == total_bytes);
2729 if (dst.type() == RegType::sgpr)
2730 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), res);
2731 return;
2732 }
2733
2734 if (dst.type() == RegType::sgpr)
2735 res = bld.as_uniform(res);
2736
2737 if (num_elements == 1) {
2738 result[result_size++] = res;
2739 } else {
2740 assert(res != dst && res.size() % num_elements == 0);
2741 aco_ptr<Pseudo_instruction> split{create_instruction<Pseudo_instruction>(aco_opcode::p_split_vector, Format::PSEUDO, 1, num_elements)};
2742 split->operands[0] = Operand(res);
2743 for (unsigned i = 0; i < num_elements; i++)
2744 split->definitions[i] = Definition(result[result_size++] = bld.tmp(res.type(), elem_size_bytes / 4));
2745 ctx->block->instructions.emplace_back(std::move(split));
2746 }
2747
2748 bytes_read += todo;
2749 }
2750
2751 assert(result_size == num_components && result_size > 1);
2752 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, result_size, 1)};
2753 for (unsigned i = 0; i < result_size; i++)
2754 vec->operands[i] = Operand(result[i]);
2755 vec->definitions[0] = Definition(dst);
2756 ctx->block->instructions.emplace_back(std::move(vec));
2757 ctx->allocated_vec.emplace(dst.id(), result);
2758 }
2759
2760 Temp extract_subvector(isel_context *ctx, Temp data, unsigned start, unsigned size, RegType type)
2761 {
2762 if (start == 0 && size == data.size())
2763 return type == RegType::vgpr ? as_vgpr(ctx, data) : data;
2764
2765 unsigned size_hint = 1;
2766 auto it = ctx->allocated_vec.find(data.id());
2767 if (it != ctx->allocated_vec.end())
2768 size_hint = it->second[0].size();
2769 if (size % size_hint || start % size_hint)
2770 size_hint = 1;
2771
2772 start /= size_hint;
2773 size /= size_hint;
2774
2775 Temp elems[size];
2776 for (unsigned i = 0; i < size; i++)
2777 elems[i] = emit_extract_vector(ctx, data, start + i, RegClass(type, size_hint));
2778
2779 if (size == 1)
2780 return type == RegType::vgpr ? as_vgpr(ctx, elems[0]) : elems[0];
2781
2782 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, size, 1)};
2783 for (unsigned i = 0; i < size; i++)
2784 vec->operands[i] = Operand(elems[i]);
2785 Temp res = {ctx->program->allocateId(), RegClass(type, size * size_hint)};
2786 vec->definitions[0] = Definition(res);
2787 ctx->block->instructions.emplace_back(std::move(vec));
2788 return res;
2789 }
2790
2791 void ds_write_helper(isel_context *ctx, Operand m, Temp address, Temp data, unsigned data_start, unsigned total_size, unsigned offset0, unsigned offset1, unsigned align)
2792 {
2793 Builder bld(ctx->program, ctx->block);
2794 unsigned bytes_written = 0;
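/* pick the widest DS write the remaining size and alignment allow; the write2
 * forms store two values with independent element offsets */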
2795 while (bytes_written < total_size * 4) {
2796 unsigned todo = total_size * 4 - bytes_written;
2797 bool aligned8 = bytes_written % 8 == 0 && align % 8 == 0;
2798 bool aligned16 = bytes_written % 16 == 0 && align % 16 == 0;
2799
2800 aco_opcode op = aco_opcode::last_opcode;
2801 bool write2 = false;
2802 unsigned size = 0;
2803 if (todo >= 16 && aligned16) {
2804 op = aco_opcode::ds_write_b128;
2805 size = 4;
2806 } else if (todo >= 16 && aligned8) {
2807 op = aco_opcode::ds_write2_b64;
2808 write2 = true;
2809 size = 4;
2810 } else if (todo >= 12 && aligned16) {
2811 op = aco_opcode::ds_write_b96;
2812 size = 3;
2813 } else if (todo >= 8 && aligned8) {
2814 op = aco_opcode::ds_write_b64;
2815 size = 2;
2816 } else if (todo >= 8) {
2817 op = aco_opcode::ds_write2_b32;
2818 write2 = true;
2819 size = 2;
2820 } else if (todo >= 4) {
2821 op = aco_opcode::ds_write_b32;
2822 size = 1;
2823 } else {
2824 assert(false);
2825 }
2826
2827 unsigned offset = offset0 + offset1 + bytes_written;
2828 unsigned max_offset = write2 ? 1020 : 65535;
2829 Temp address_offset = address;
2830 if (offset > max_offset) {
2831 address_offset = bld.vadd32(bld.def(v1), Operand(offset0), address_offset);
2832 offset = offset1 + bytes_written;
2833 }
2834 assert(offset <= max_offset); /* offset1 shouldn't be large enough for this to happen */
2835
2836 if (write2) {
2837 Temp val0 = extract_subvector(ctx, data, data_start + (bytes_written >> 2), size / 2, RegType::vgpr);
2838 Temp val1 = extract_subvector(ctx, data, data_start + (bytes_written >> 2) + 1, size / 2, RegType::vgpr);
2839 bld.ds(op, address_offset, val0, val1, m, offset >> 2, (offset >> 2) + 1);
2840 } else {
2841 Temp val = extract_subvector(ctx, data, data_start + (bytes_written >> 2), size, RegType::vgpr);
2842 bld.ds(op, address_offset, val, m, offset);
2843 }
2844
2845 bytes_written += size * 4;
2846 }
2847 }
2848
2849 void store_lds(isel_context *ctx, unsigned elem_size_bytes, Temp data, uint32_t wrmask,
2850 Temp address, unsigned base_offset, unsigned align)
2851 {
2852 assert(util_is_power_of_two_nonzero(align) && align >= 4);
2853
2854 Operand m = load_lds_size_m0(ctx);
2855
2856 /* we need at most two stores for 32-bit variables */
2857 int start[2], count[2];
2858 u_bit_scan_consecutive_range(&wrmask, &start[0], &count[0]);
2859 u_bit_scan_consecutive_range(&wrmask, &start[1], &count[1]);
2860 assert(wrmask == 0);
2861
2862 /* one combined store is sufficient */
2863 if (count[0] == count[1]) {
2864 Builder bld(ctx->program, ctx->block);
2865
2866 Temp address_offset = address;
2867 if ((base_offset >> 2) + start[1] > 255) {
2868 address_offset = bld.vadd32(bld.def(v1), Operand(base_offset), address_offset);
2869 base_offset = 0;
2870 }
2871
2872 assert(count[0] == 1);
2873 Temp val0 = emit_extract_vector(ctx, data, start[0], v1);
2874 Temp val1 = emit_extract_vector(ctx, data, start[1], v1);
2875 aco_opcode op = elem_size_bytes == 4 ? aco_opcode::ds_write2_b32 : aco_opcode::ds_write2_b64;
2876 base_offset = base_offset / elem_size_bytes;
2877 bld.ds(op, address_offset, val0, val1, m,
2878 base_offset + start[0], base_offset + start[1]);
2879 return;
2880 }
2881
2882 for (unsigned i = 0; i < 2; i++) {
2883 if (count[i] == 0)
2884 continue;
2885
2886 unsigned elem_size_words = elem_size_bytes / 4;
2887 ds_write_helper(ctx, m, address, data, start[i] * elem_size_words, count[i] * elem_size_words,
2888 base_offset, start[i] * elem_size_bytes, align);
2889 }
2890 return;
2891 }
2892
2893 void visit_store_output(isel_context *ctx, nir_intrinsic_instr *instr)
2894 {
2895 if (ctx->stage == vertex_vs) {
2896 visit_store_vs_output(ctx, instr);
2897 } else if (ctx->stage == fragment_fs) {
2898 visit_store_fs_output(ctx, instr);
2899 } else {
2900 unreachable("Shader stage not implemented");
2901 }
2902 }
2903
2904 void emit_interp_instr(isel_context *ctx, unsigned idx, unsigned component, Temp src, Temp dst, Temp prim_mask)
2905 {
2906 Temp coord1 = emit_extract_vector(ctx, src, 0, v1);
2907 Temp coord2 = emit_extract_vector(ctx, src, 1, v1);
2908
2909 Builder bld(ctx->program, ctx->block);
2910 Temp tmp = bld.vintrp(aco_opcode::v_interp_p1_f32, bld.def(v1), coord1, bld.m0(prim_mask), idx, component);
2911 bld.vintrp(aco_opcode::v_interp_p2_f32, Definition(dst), coord2, bld.m0(prim_mask), tmp, idx, component);
2912 }
2913
2914 void emit_load_frag_coord(isel_context *ctx, Temp dst, unsigned num_components)
2915 {
2916 aco_ptr<Pseudo_instruction> vec(create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1));
2917 for (unsigned i = 0; i < num_components; i++)
2918 vec->operands[i] = Operand(get_arg(ctx, ctx->args->ac.frag_pos[i]));
2919 if (G_0286CC_POS_W_FLOAT_ENA(ctx->program->config->spi_ps_input_ena)) {
2920 assert(num_components == 4);
2921 Builder bld(ctx->program, ctx->block);
2922 vec->operands[3] = bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), get_arg(ctx, ctx->args->ac.frag_pos[3]));
2923 }
2924
2925 for (Operand& op : vec->operands)
2926 op = op.isUndefined() ? Operand(0u) : op;
2927
2928 vec->definitions[0] = Definition(dst);
2929 ctx->block->instructions.emplace_back(std::move(vec));
2930 emit_split_vector(ctx, dst, num_components);
2931 return;
2932 }
2933
2934 void visit_load_interpolated_input(isel_context *ctx, nir_intrinsic_instr *instr)
2935 {
2936 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
2937 Temp coords = get_ssa_temp(ctx, instr->src[0].ssa);
2938 unsigned idx = nir_intrinsic_base(instr);
2939 unsigned component = nir_intrinsic_component(instr);
2940 Temp prim_mask = get_arg(ctx, ctx->args->ac.prim_mask);
2941
2942 nir_const_value* offset = nir_src_as_const_value(instr->src[1]);
2943 if (offset) {
2944 assert(offset->u32 == 0);
2945 } else {
2946 /* the lower 15 bits of the prim_mask contain the offset into LDS
2947 * while the upper bits contain the number of prims */
2948 Temp offset_src = get_ssa_temp(ctx, instr->src[1].ssa);
2949 assert(offset_src.regClass() == s1 && "TODO: divergent offsets...");
2950 Builder bld(ctx->program, ctx->block);
2951 Temp stride = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), prim_mask, Operand(16u));
2952 stride = bld.sop1(aco_opcode::s_bcnt1_i32_b32, bld.def(s1), bld.def(s1, scc), stride);
2953 stride = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), stride, Operand(48u));
2954 offset_src = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), stride, offset_src);
2955 prim_mask = bld.sop2(aco_opcode::s_add_i32, bld.def(s1, m0), bld.def(s1, scc), offset_src, prim_mask);
2956 }
2957
2958 if (instr->dest.ssa.num_components == 1) {
2959 emit_interp_instr(ctx, idx, component, coords, dst, prim_mask);
2960 } else {
2961 aco_ptr<Pseudo_instruction> vec(create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, instr->dest.ssa.num_components, 1));
2962 for (unsigned i = 0; i < instr->dest.ssa.num_components; i++)
2963 {
2964 Temp tmp = {ctx->program->allocateId(), v1};
2965 emit_interp_instr(ctx, idx, component+i, coords, tmp, prim_mask);
2966 vec->operands[i] = Operand(tmp);
2967 }
2968 vec->definitions[0] = Definition(dst);
2969 ctx->block->instructions.emplace_back(std::move(vec));
2970 }
2971 }
2972
2973 unsigned get_num_channels_from_data_format(unsigned data_format)
2974 {
2975 switch (data_format) {
2976 case V_008F0C_BUF_DATA_FORMAT_8:
2977 case V_008F0C_BUF_DATA_FORMAT_16:
2978 case V_008F0C_BUF_DATA_FORMAT_32:
2979 return 1;
2980 case V_008F0C_BUF_DATA_FORMAT_8_8:
2981 case V_008F0C_BUF_DATA_FORMAT_16_16:
2982 case V_008F0C_BUF_DATA_FORMAT_32_32:
2983 return 2;
2984 case V_008F0C_BUF_DATA_FORMAT_10_11_11:
2985 case V_008F0C_BUF_DATA_FORMAT_11_11_10:
2986 case V_008F0C_BUF_DATA_FORMAT_32_32_32:
2987 return 3;
2988 case V_008F0C_BUF_DATA_FORMAT_8_8_8_8:
2989 case V_008F0C_BUF_DATA_FORMAT_10_10_10_2:
2990 case V_008F0C_BUF_DATA_FORMAT_2_10_10_10:
2991 case V_008F0C_BUF_DATA_FORMAT_16_16_16_16:
2992 case V_008F0C_BUF_DATA_FORMAT_32_32_32_32:
2993 return 4;
2994 default:
2995 break;
2996 }
2997
2998 return 4;
2999 }
3000
3001 /* For 2_10_10_10 formats the alpha is handled as unsigned by pre-Vega HW,
3002 * so we may need to fix it up. */
3003 Temp adjust_vertex_fetch_alpha(isel_context *ctx, unsigned adjustment, Temp alpha)
3004 {
3005 Builder bld(ctx->program, ctx->block);
3006
3007 if (adjustment == RADV_ALPHA_ADJUST_SSCALED)
3008 alpha = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), alpha);
3009
3010 /* For the integer-like cases, do a natural sign extension.
3011 *
3012 * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0
3013 * and happen to contain 0, 1, 2, 3 as the two LSBs of the
3014 * exponent.
3015 */
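   /* For SNORM the two exponent LSBs sit in bits 24:23, so shifting left by 7 moves them
    * to bits 31:30; for the integer-like cases the 2-bit value already sits in bits 1:0,
    * so shift by 30 instead. The arithmetic shift right by 30 then sign-extends it. */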
3016 alpha = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(adjustment == RADV_ALPHA_ADJUST_SNORM ? 7u : 30u), alpha);
3017 alpha = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(30u), alpha);
3018
3019 /* Convert back to the right type. */
3020 if (adjustment == RADV_ALPHA_ADJUST_SNORM) {
3021 alpha = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), alpha);
3022 Temp clamp = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(s2)), Operand(0xbf800000u), alpha);
3023 alpha = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0xbf800000u), alpha, clamp);
3024 } else if (adjustment == RADV_ALPHA_ADJUST_SSCALED) {
3025 alpha = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), alpha);
3026 }
3027
3028 return alpha;
3029 }
3030
3031 void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr)
3032 {
3033 Builder bld(ctx->program, ctx->block);
3034 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
3035 if (ctx->stage & sw_vs) {
3036
3037 nir_instr *off_instr = instr->src[0].ssa->parent_instr;
3038 if (off_instr->type != nir_instr_type_load_const) {
3039 fprintf(stderr, "Unimplemented nir_intrinsic_load_input offset\n");
3040 nir_print_instr(off_instr, stderr);
3041 fprintf(stderr, "\n");
3042 }
3043 uint32_t offset = nir_instr_as_load_const(off_instr)->value[0].u32;
3044
3045 Temp vertex_buffers = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->vertex_buffers));
3046
3047 unsigned location = nir_intrinsic_base(instr) / 4 - VERT_ATTRIB_GENERIC0 + offset;
3048 unsigned component = nir_intrinsic_component(instr);
3049 unsigned attrib_binding = ctx->options->key.vs.vertex_attribute_bindings[location];
3050 uint32_t attrib_offset = ctx->options->key.vs.vertex_attribute_offsets[location];
3051 uint32_t attrib_stride = ctx->options->key.vs.vertex_attribute_strides[location];
3052 unsigned attrib_format = ctx->options->key.vs.vertex_attribute_formats[location];
3053
3054 unsigned dfmt = attrib_format & 0xf;
3055
3056 unsigned nfmt = (attrib_format >> 4) & 0x7;
3057 unsigned num_dfmt_channels = get_num_channels_from_data_format(dfmt);
3058 unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa) << component;
3059 unsigned num_channels = MIN2(util_last_bit(mask), num_dfmt_channels);
3060 unsigned alpha_adjust = (ctx->options->key.vs.alpha_adjust >> (location * 2)) & 3;
3061 bool post_shuffle = ctx->options->key.vs.post_shuffle & (1 << location);
3062 if (post_shuffle)
3063 num_channels = MAX2(num_channels, 3);
3064
3065 Temp list = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), vertex_buffers, Operand(attrib_binding * 16u));
3066
3067 Temp index;
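      /* per-instance attributes use start_instance + instance_id / divisor (just
       * start_instance when the divisor is 0); per-vertex attributes use
       * base_vertex + vertex_id */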
3068 if (ctx->options->key.vs.instance_rate_inputs & (1u << location)) {
3069 uint32_t divisor = ctx->options->key.vs.instance_rate_divisors[location];
3070 Temp start_instance = get_arg(ctx, ctx->args->ac.start_instance);
3071 if (divisor) {
3072 ctx->needs_instance_id = true;
3073 Temp instance_id = get_arg(ctx, ctx->args->ac.instance_id);
3074 if (divisor != 1) {
3075 Temp divided = bld.tmp(v1);
3076 emit_v_div_u32(ctx, divided, as_vgpr(ctx, instance_id), divisor);
3077 index = bld.vadd32(bld.def(v1), start_instance, divided);
3078 } else {
3079 index = bld.vadd32(bld.def(v1), start_instance, instance_id);
3080 }
3081 } else {
3082 index = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), start_instance);
3083 }
3084 } else {
3085 index = bld.vadd32(bld.def(v1),
3086 get_arg(ctx, ctx->args->ac.base_vertex),
3087 get_arg(ctx, ctx->args->ac.vertex_id));
3088 }
3089
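      /* fold whole strides of the constant offset into the index so that the
       * remaining immediate offset stays smaller than the stride */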
3090 if (attrib_stride != 0 && attrib_offset > attrib_stride) {
3091 index = bld.vadd32(bld.def(v1), Operand(attrib_offset / attrib_stride), index);
3092 attrib_offset = attrib_offset % attrib_stride;
3093 }
3094
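      /* the MTBUF immediate offset field is only 12 bits, so larger constant
       * offsets are moved into an SGPR soffset instead */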
3095 Operand soffset(0u);
3096 if (attrib_offset >= 4096) {
3097 soffset = bld.copy(bld.def(s1), Operand(attrib_offset));
3098 attrib_offset = 0;
3099 }
3100
3101 aco_opcode opcode;
3102 switch (num_channels) {
3103 case 1:
3104 opcode = aco_opcode::tbuffer_load_format_x;
3105 break;
3106 case 2:
3107 opcode = aco_opcode::tbuffer_load_format_xy;
3108 break;
3109 case 3:
3110 opcode = aco_opcode::tbuffer_load_format_xyz;
3111 break;
3112 case 4:
3113 opcode = aco_opcode::tbuffer_load_format_xyzw;
3114 break;
3115 default:
3116 unreachable("Unimplemented load_input vector size");
3117 }
3118
3119 Temp tmp = post_shuffle || num_channels != dst.size() || alpha_adjust != RADV_ALPHA_ADJUST_NONE || component ? bld.tmp(RegType::vgpr, num_channels) : dst;
3120
3121 aco_ptr<MTBUF_instruction> mtbuf{create_instruction<MTBUF_instruction>(opcode, Format::MTBUF, 3, 1)};
3122 mtbuf->operands[0] = Operand(index);
3123 mtbuf->operands[1] = Operand(list);
3124 mtbuf->operands[2] = soffset;
3125 mtbuf->definitions[0] = Definition(tmp);
3126 mtbuf->idxen = true;
3127 mtbuf->can_reorder = true;
3128 mtbuf->dfmt = dfmt;
3129 mtbuf->nfmt = nfmt;
3130 assert(attrib_offset < 4096);
3131 mtbuf->offset = attrib_offset;
3132 ctx->block->instructions.emplace_back(std::move(mtbuf));
3133
3134 emit_split_vector(ctx, tmp, tmp.size());
3135
3136 if (tmp.id() != dst.id()) {
3137 bool is_float = nfmt != V_008F0C_BUF_NUM_FORMAT_UINT &&
3138 nfmt != V_008F0C_BUF_NUM_FORMAT_SINT;
3139
3140 static const unsigned swizzle_normal[4] = {0, 1, 2, 3};
3141 static const unsigned swizzle_post_shuffle[4] = {2, 1, 0, 3};
3142 const unsigned *swizzle = post_shuffle ? swizzle_post_shuffle : swizzle_normal;
3143
3144 aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
3145 for (unsigned i = 0; i < dst.size(); i++) {
3146 unsigned idx = i + component;
3147 if (idx == 3 && alpha_adjust != RADV_ALPHA_ADJUST_NONE && num_channels >= 4) {
3148 Temp alpha = emit_extract_vector(ctx, tmp, swizzle[3], v1);
3149 vec->operands[i] = Operand(adjust_vertex_fetch_alpha(ctx, alpha_adjust, alpha));
3150 } else if (idx < num_channels) {
3151 vec->operands[i] = Operand(emit_extract_vector(ctx, tmp, swizzle[idx], v1));
3152 } else if (is_float && idx == 3) {
3153 vec->operands[i] = Operand(0x3f800000u);
3154 } else if (!is_float && idx == 3) {
3155 vec->operands[i] = Operand(1u);
3156 } else {
3157 vec->operands[i] = Operand(0u);
3158 }
3159 }
3160 vec->definitions[0] = Definition(dst);
3161 ctx->block->instructions.emplace_back(std::move(vec));
3162 emit_split_vector(ctx, dst, dst.size());
3163 }
3164
3165 } else if (ctx->stage == fragment_fs) {
3166 nir_instr *off_instr = instr->src[0].ssa->parent_instr;
3167 if (off_instr->type != nir_instr_type_load_const ||
3168 nir_instr_as_load_const(off_instr)->value[0].u32 != 0) {
3169 fprintf(stderr, "Unimplemented nir_intrinsic_load_input offset\n");
3170 nir_print_instr(off_instr, stderr);
3171 fprintf(stderr, "\n");
3172 }
3173
3174 Temp prim_mask = get_arg(ctx, ctx->args->ac.prim_mask);
3175 nir_const_value* offset = nir_src_as_const_value(instr->src[0]);
3176 if (offset) {
3177 assert(offset->u32 == 0);
3178 } else {
3179 /* the lower 15 bits of the prim_mask contain the offset into LDS
3180 * while the upper bits contain the number of prims */
3181 Temp offset_src = get_ssa_temp(ctx, instr->src[0].ssa);
3182 assert(offset_src.regClass() == s1 && "TODO: divergent offsets...");
3183 Builder bld(ctx->program, ctx->block);
3184 Temp stride = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), prim_mask, Operand(16u));
3185 stride = bld.sop1(aco_opcode::s_bcnt1_i32_b32, bld.def(s1), bld.def(s1, scc), stride);
3186 stride = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), stride, Operand(48u));
3187 offset_src = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), stride, offset_src);
3188 prim_mask = bld.sop2(aco_opcode::s_add_i32, bld.def(s1, m0), bld.def(s1, scc), offset_src, prim_mask);
3189 }
3190
3191 unsigned idx = nir_intrinsic_base(instr);
3192 unsigned component = nir_intrinsic_component(instr);
3193
3194 if (dst.size() == 1) {
3195 bld.vintrp(aco_opcode::v_interp_mov_f32, Definition(dst), Operand(2u), bld.m0(prim_mask), idx, component);
3196 } else {
3197 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
3198 for (unsigned i = 0; i < dst.size(); i++)
3199 vec->operands[i] = bld.vintrp(aco_opcode::v_interp_mov_f32, bld.def(v1), Operand(2u), bld.m0(prim_mask), idx, component + i);
3200 vec->definitions[0] = Definition(dst);
3201 bld.insert(std::move(vec));
3202 }
3203
3204 } else {
3205 unreachable("Shader stage not implemented");
3206 }
3207 }
3208
3209 Temp load_desc_ptr(isel_context *ctx, unsigned desc_set)
3210 {
3211 if (ctx->program->info->need_indirect_descriptor_sets) {
3212 Builder bld(ctx->program, ctx->block);
3213 Temp ptr64 = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->descriptor_sets[0]));
3214 return bld.smem(aco_opcode::s_load_dword, bld.def(s1), ptr64, Operand(desc_set << 2));
3215 }
3216
3217 return get_arg(ctx, ctx->args->descriptor_sets[desc_set]);
3218 }
3219
3220
3221 void visit_load_resource(isel_context *ctx, nir_intrinsic_instr *instr)
3222 {
3223 Builder bld(ctx->program, ctx->block);
3224 Temp index = get_ssa_temp(ctx, instr->src[0].ssa);
3225 if (!ctx->divergent_vals[instr->dest.ssa.index])
3226 index = bld.as_uniform(index);
3227 unsigned desc_set = nir_intrinsic_desc_set(instr);
3228 unsigned binding = nir_intrinsic_binding(instr);
3229
3230 Temp desc_ptr;
3231 radv_pipeline_layout *pipeline_layout = ctx->options->layout;
3232 radv_descriptor_set_layout *layout = pipeline_layout->set[desc_set].layout;
3233 unsigned offset = layout->binding[binding].offset;
3234 unsigned stride;
3235 if (layout->binding[binding].type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC ||
3236 layout->binding[binding].type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC) {
3237 unsigned idx = pipeline_layout->set[desc_set].dynamic_offset_start + layout->binding[binding].dynamic_offset_offset;
3238 desc_ptr = get_arg(ctx, ctx->args->ac.push_constants);
3239 offset = pipeline_layout->push_constant_size + 16 * idx;
3240 stride = 16;
3241 } else {
3242 desc_ptr = load_desc_ptr(ctx, desc_set);
3243 stride = layout->binding[binding].size;
3244 }
3245
3246 nir_const_value* nir_const_index = nir_src_as_const_value(instr->src[0]);
3247 unsigned const_index = nir_const_index ? nir_const_index->u32 : 0;
3248 if (stride != 1) {
3249 if (nir_const_index) {
3250 const_index = const_index * stride;
3251 } else if (index.type() == RegType::vgpr) {
3252 bool index24bit = layout->binding[binding].array_size <= 0x1000000;
3253 index = bld.v_mul_imm(bld.def(v1), index, stride, index24bit);
3254 } else {
3255 index = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(stride), Operand(index));
3256 }
3257 }
3258 if (offset) {
3259 if (nir_const_index) {
3260 const_index = const_index + offset;
3261 } else if (index.type() == RegType::vgpr) {
3262 index = bld.vadd32(bld.def(v1), Operand(offset), index);
3263 } else {
3264 index = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), Operand(offset), Operand(index));
3265 }
3266 }
3267
3268 if (nir_const_index && const_index == 0) {
3269 index = desc_ptr;
3270 } else if (index.type() == RegType::vgpr) {
3271 index = bld.vadd32(bld.def(v1),
3272 nir_const_index ? Operand(const_index) : Operand(index),
3273 Operand(desc_ptr));
3274 } else {
3275 index = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
3276 nir_const_index ? Operand(const_index) : Operand(index),
3277 Operand(desc_ptr));
3278 }
3279
3280 bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), index);
3281 }
3282
3283 void load_buffer(isel_context *ctx, unsigned num_components, Temp dst,
3284 Temp rsrc, Temp offset, bool glc=false, bool readonly=true)
3285 {
3286 Builder bld(ctx->program, ctx->block);
3287
3288 unsigned num_bytes = dst.size() * 4;
3289 bool dlc = glc && ctx->options->chip_class >= GFX10;
3290
3291 aco_opcode op;
3292 if (dst.type() == RegType::vgpr || (glc && ctx->options->chip_class < GFX8)) {
3293 if (ctx->options->chip_class < GFX8)
3294 offset = as_vgpr(ctx, offset);
3295
3296 Operand vaddr = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
3297 Operand soffset = offset.type() == RegType::sgpr ? Operand(offset) : Operand((uint32_t) 0);
3298 unsigned const_offset = 0;
3299
3300 Temp lower = Temp();
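      /* a single MUBUF load returns at most 16 bytes (dwordx4), so larger results
       * are loaded in two parts and recombined into one vector below */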
3301 if (num_bytes > 16) {
3302 assert(num_components == 3 || num_components == 4);
3303 op = aco_opcode::buffer_load_dwordx4;
3304 lower = bld.tmp(v4);
3305 aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)};
3306 mubuf->definitions[0] = Definition(lower);
3307 mubuf->operands[0] = vaddr;
3308 mubuf->operands[1] = Operand(rsrc);
3309 mubuf->operands[2] = soffset;
3310 mubuf->offen = (offset.type() == RegType::vgpr);
3311 mubuf->glc = glc;
3312 mubuf->dlc = dlc;
3313 mubuf->barrier = readonly ? barrier_none : barrier_buffer;
3314 mubuf->can_reorder = readonly;
3315 bld.insert(std::move(mubuf));
3316 emit_split_vector(ctx, lower, 2);
3317 num_bytes -= 16;
3318 const_offset = 16;
3319 }
3320
3321 switch (num_bytes) {
3322 case 4:
3323 op = aco_opcode::buffer_load_dword;
3324 break;
3325 case 8:
3326 op = aco_opcode::buffer_load_dwordx2;
3327 break;
3328 case 12:
3329 op = aco_opcode::buffer_load_dwordx3;
3330 break;
3331 case 16:
3332 op = aco_opcode::buffer_load_dwordx4;
3333 break;
3334 default:
3335 unreachable("Load SSBO not implemented for this size.");
3336 }
3337 aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)};
3338 mubuf->operands[0] = vaddr;
3339 mubuf->operands[1] = Operand(rsrc);
3340 mubuf->operands[2] = soffset;
3341 mubuf->offen = (offset.type() == RegType::vgpr);
3342 mubuf->glc = glc;
3343 mubuf->dlc = dlc;
3344 mubuf->barrier = readonly ? barrier_none : barrier_buffer;
3345 mubuf->can_reorder = readonly;
3346 mubuf->offset = const_offset;
3347 aco_ptr<Instruction> instr = std::move(mubuf);
3348
3349 if (dst.size() > 4) {
3350 assert(lower != Temp());
3351 Temp upper = bld.tmp(RegType::vgpr, dst.size() - lower.size());
3352 instr->definitions[0] = Definition(upper);
3353 bld.insert(std::move(instr));
3354 if (dst.size() == 8)
3355 emit_split_vector(ctx, upper, 2);
3356 instr.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size() / 2, 1));
3357 instr->operands[0] = Operand(emit_extract_vector(ctx, lower, 0, v2));
3358 instr->operands[1] = Operand(emit_extract_vector(ctx, lower, 1, v2));
3359 instr->operands[2] = Operand(emit_extract_vector(ctx, upper, 0, v2));
3360 if (dst.size() == 8)
3361 instr->operands[3] = Operand(emit_extract_vector(ctx, upper, 1, v2));
3362 }
3363
3364 if (dst.type() == RegType::sgpr) {
3365 Temp vec = bld.tmp(RegType::vgpr, dst.size());
3366 instr->definitions[0] = Definition(vec);
3367 bld.insert(std::move(instr));
3368 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec);
3369 } else {
3370 instr->definitions[0] = Definition(dst);
3371 bld.insert(std::move(instr));
3372 }
3373 } else {
3374 switch (num_bytes) {
3375 case 4:
3376 op = aco_opcode::s_buffer_load_dword;
3377 break;
3378 case 8:
3379 op = aco_opcode::s_buffer_load_dwordx2;
3380 break;
3381 case 12:
3382 case 16:
3383 op = aco_opcode::s_buffer_load_dwordx4;
3384 break;
3385 case 24:
3386 case 32:
3387 op = aco_opcode::s_buffer_load_dwordx8;
3388 break;
3389 default:
3390 unreachable("Load SSBO not implemented for this size.");
3391 }
3392 aco_ptr<SMEM_instruction> load{create_instruction<SMEM_instruction>(op, Format::SMEM, 2, 1)};
3393 load->operands[0] = Operand(rsrc);
3394 load->operands[1] = Operand(bld.as_uniform(offset));
3395 assert(load->operands[1].getTemp().type() == RegType::sgpr);
3396 load->definitions[0] = Definition(dst);
3397 load->glc = glc;
3398 load->dlc = dlc;
3399 load->barrier = readonly ? barrier_none : barrier_buffer;
3400 load->can_reorder = false; // FIXME: currently, it doesn't seem beneficial due to how our scheduler works
3401 assert(ctx->options->chip_class >= GFX8 || !glc);
3402
3403 /* trim vector */
3404 if (dst.size() == 3) {
3405 Temp vec = bld.tmp(s4);
3406 load->definitions[0] = Definition(vec);
3407 bld.insert(std::move(load));
3408 emit_split_vector(ctx, vec, 4);
3409
3410 bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
3411 emit_extract_vector(ctx, vec, 0, s1),
3412 emit_extract_vector(ctx, vec, 1, s1),
3413 emit_extract_vector(ctx, vec, 2, s1));
3414 } else if (dst.size() == 6) {
3415 Temp vec = bld.tmp(s8);
3416 load->definitions[0] = Definition(vec);
3417 bld.insert(std::move(load));
3418 emit_split_vector(ctx, vec, 4);
3419
3420 bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
3421 emit_extract_vector(ctx, vec, 0, s2),
3422 emit_extract_vector(ctx, vec, 1, s2),
3423 emit_extract_vector(ctx, vec, 2, s2));
3424 } else {
3425 bld.insert(std::move(load));
3426 }
3427
3428 }
3429 emit_split_vector(ctx, dst, num_components);
3430 }
3431
3432 void visit_load_ubo(isel_context *ctx, nir_intrinsic_instr *instr)
3433 {
3434 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
3435 Temp rsrc = get_ssa_temp(ctx, instr->src[0].ssa);
3436
3437 Builder bld(ctx->program, ctx->block);
3438
3439 nir_intrinsic_instr* idx_instr = nir_instr_as_intrinsic(instr->src[0].ssa->parent_instr);
3440 unsigned desc_set = nir_intrinsic_desc_set(idx_instr);
3441 unsigned binding = nir_intrinsic_binding(idx_instr);
3442 radv_descriptor_set_layout *layout = ctx->options->layout->set[desc_set].layout;
3443
3444 if (layout->binding[binding].type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) {
3445 uint32_t desc_type = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
3446 S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
3447 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
3448 S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
3449 if (ctx->options->chip_class >= GFX10) {
3450 desc_type |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
3451 S_008F0C_OOB_SELECT(3) |
3452 S_008F0C_RESOURCE_LEVEL(1);
3453 } else {
3454 desc_type |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
3455 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
3456 }
3457 Temp upper_dwords = bld.pseudo(aco_opcode::p_create_vector, bld.def(s3),
3458 Operand(S_008F04_BASE_ADDRESS_HI(ctx->options->address32_hi)),
3459 Operand(0xFFFFFFFFu),
3460 Operand(desc_type));
3461 rsrc = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4),
3462 rsrc, upper_dwords);
3463 } else {
3464 rsrc = convert_pointer_to_64_bit(ctx, rsrc);
3465 rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u));
3466 }
3467
3468 load_buffer(ctx, instr->num_components, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa));
3469 }
3470
3471 void visit_load_push_constant(isel_context *ctx, nir_intrinsic_instr *instr)
3472 {
3473 Builder bld(ctx->program, ctx->block);
3474 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
3475
3476 unsigned offset = nir_intrinsic_base(instr);
3477 nir_const_value *index_cv = nir_src_as_const_value(instr->src[0]);
3478 if (index_cv && instr->dest.ssa.bit_size == 32) {
3479
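      /* fast path: if the whole range is covered by push constants inlined into
       * user SGPRs, assemble the result directly from those arguments instead of
       * loading from memory */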
3480 unsigned count = instr->dest.ssa.num_components;
3481 unsigned start = (offset + index_cv->u32) / 4u;
3482 start -= ctx->args->ac.base_inline_push_consts;
3483 if (start + count <= ctx->args->ac.num_inline_push_consts) {
3484 std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
3485 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
3486 for (unsigned i = 0; i < count; ++i) {
3487 elems[i] = get_arg(ctx, ctx->args->ac.inline_push_consts[start + i]);
3488 vec->operands[i] = Operand{elems[i]};
3489 }
3490 vec->definitions[0] = Definition(dst);
3491 ctx->block->instructions.emplace_back(std::move(vec));
3492 ctx->allocated_vec.emplace(dst.id(), elems);
3493 return;
3494 }
3495 }
3496
3497 Temp index = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
3498 if (offset != 0) // TODO check if index != 0 as well
3499 index = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), Operand(offset), index);
3500 Temp ptr = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->ac.push_constants));
3501 Temp vec = dst;
3502 bool trim = false;
3503 aco_opcode op;
3504
3505 switch (dst.size()) {
3506 case 1:
3507 op = aco_opcode::s_load_dword;
3508 break;
3509 case 2:
3510 op = aco_opcode::s_load_dwordx2;
3511 break;
3512 case 3:
3513 vec = bld.tmp(s4);
3514 trim = true;
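      /* fallthrough */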
3515 case 4:
3516 op = aco_opcode::s_load_dwordx4;
3517 break;
3518 case 6:
3519 vec = bld.tmp(s8);
3520 trim = true;
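      /* fallthrough */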
3521 case 8:
3522 op = aco_opcode::s_load_dwordx8;
3523 break;
3524 default:
3525 unreachable("unimplemented or forbidden load_push_constant.");
3526 }
3527
3528 bld.smem(op, Definition(vec), ptr, index);
3529
3530 if (trim) {
3531 emit_split_vector(ctx, vec, 4);
3532 RegClass rc = dst.size() == 3 ? s1 : s2;
3533 bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
3534 emit_extract_vector(ctx, vec, 0, rc),
3535 emit_extract_vector(ctx, vec, 1, rc),
3536 emit_extract_vector(ctx, vec, 2, rc));
3537
3538 }
3539 emit_split_vector(ctx, dst, instr->dest.ssa.num_components);
3540 }
3541
3542 void visit_load_constant(isel_context *ctx, nir_intrinsic_instr *instr)
3543 {
3544 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
3545
3546 Builder bld(ctx->program, ctx->block);
3547
3548 uint32_t desc_type = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
3549 S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
3550 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
3551 S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
3552 if (ctx->options->chip_class >= GFX10) {
3553 desc_type |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
3554 S_008F0C_OOB_SELECT(3) |
3555 S_008F0C_RESOURCE_LEVEL(1);
3556 } else {
3557 desc_type |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
3558 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
3559 }
3560
3561 unsigned base = nir_intrinsic_base(instr);
3562 unsigned range = nir_intrinsic_range(instr);
3563
3564 Temp offset = get_ssa_temp(ctx, instr->src[0].ssa);
3565 if (base && offset.type() == RegType::sgpr)
3566 offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset, Operand(base));
3567 else if (base && offset.type() == RegType::vgpr)
3568 offset = bld.vadd32(bld.def(v1), Operand(base), offset);
3569
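   /* build a buffer descriptor over the shader's embedded constant data:
    * p_constaddr yields its 64-bit address and the num_records dword clamps
    * reads to the constant data size */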
3570 Temp rsrc = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4),
3571 bld.sop1(aco_opcode::p_constaddr, bld.def(s2), bld.def(s1, scc), Operand(ctx->constant_data_offset)),
3572 Operand(MIN2(base + range, ctx->shader->constant_data_size)),
3573 Operand(desc_type));
3574
3575 load_buffer(ctx, instr->num_components, dst, rsrc, offset);
3576 }
3577
3578 void visit_discard_if(isel_context *ctx, nir_intrinsic_instr *instr)
3579 {
3580 if (ctx->cf_info.loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
3581 ctx->cf_info.exec_potentially_empty = true;
3582
3583 ctx->program->needs_exact = true;
3584
3585 // TODO: optimize uniform conditions
3586 Builder bld(ctx->program, ctx->block);
3587 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
3588 assert(src.regClass() == s2);
3589 src = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2));
3590 bld.pseudo(aco_opcode::p_discard_if, src);
3591 ctx->block->kind |= block_kind_uses_discard_if;
3592 return;
3593 }
3594
3595 void visit_discard(isel_context* ctx, nir_intrinsic_instr *instr)
3596 {
3597 Builder bld(ctx->program, ctx->block);
3598
3599 if (ctx->cf_info.loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
3600 ctx->cf_info.exec_potentially_empty = true;
3601
3602 bool divergent = ctx->cf_info.parent_if.is_divergent ||
3603 ctx->cf_info.parent_loop.has_divergent_continue;
3604
3605 if (ctx->block->loop_nest_depth &&
3606 ((nir_instr_is_last(&instr->instr) && !divergent) || divergent)) {
3607 /* we handle discards the same way as jump instructions */
3608 append_logical_end(ctx->block);
3609
3610 /* in loops, discard behaves like break */
3611 Block *linear_target = ctx->cf_info.parent_loop.exit;
3612 ctx->block->kind |= block_kind_discard;
3613
3614 if (!divergent) {
3615 /* uniform discard - loop ends here */
3616 assert(nir_instr_is_last(&instr->instr));
3617 ctx->block->kind |= block_kind_uniform;
3618 ctx->cf_info.has_branch = true;
3619 bld.branch(aco_opcode::p_branch);
3620 add_linear_edge(ctx->block->index, linear_target);
3621 return;
3622 }
3623
3624 /* we add a break right behind the discard() instructions */
3625 ctx->block->kind |= block_kind_break;
3626 unsigned idx = ctx->block->index;
3627
3628 /* remove critical edges from linear CFG */
3629 bld.branch(aco_opcode::p_branch);
3630 Block* break_block = ctx->program->create_and_insert_block();
3631 break_block->loop_nest_depth = ctx->cf_info.loop_nest_depth;
3632 break_block->kind |= block_kind_uniform;
3633 add_linear_edge(idx, break_block);
3634 add_linear_edge(break_block->index, linear_target);
3635 bld.reset(break_block);
3636 bld.branch(aco_opcode::p_branch);
3637
3638 Block* continue_block = ctx->program->create_and_insert_block();
3639 continue_block->loop_nest_depth = ctx->cf_info.loop_nest_depth;
3640 add_linear_edge(idx, continue_block);
3641 append_logical_start(continue_block);
3642 ctx->block = continue_block;
3643
3644 return;
3645 }
3646
3647 /* it can currently happen that NIR doesn't remove the unreachable code */
3648 if (!nir_instr_is_last(&instr->instr)) {
3649 ctx->program->needs_exact = true;
3650 /* save exec somewhere temporarily so that it doesn't get
3651 * overwritten before the discard from outer exec masks */
3652 Temp cond = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), Operand(0xFFFFFFFF), Operand(exec, s2));
3653 bld.pseudo(aco_opcode::p_discard_if, cond);
3654 ctx->block->kind |= block_kind_uses_discard_if;
3655 return;
3656 }
3657
3658 /* This condition is incorrect for uniformly branched discards in a loop
3659 * predicated by a divergent condition, but the above code catches that case
3660 * and the discard would end up turning into a discard_if.
3661 * For example:
3662 * if (divergent) {
3663 * while (...) {
3664 * if (uniform) {
3665 * discard;
3666 * }
3667 * }
3668 * }
3669 */
3670 if (!ctx->cf_info.parent_if.is_divergent) {
3671 /* program just ends here */
3672 ctx->block->kind |= block_kind_uniform;
3673 bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1),
3674 0 /* enabled mask */, 9 /* dest */,
3675 false /* compressed */, true/* done */, true /* valid mask */);
3676 bld.sopp(aco_opcode::s_endpgm);
3677 // TODO: it will potentially be followed by a branch which is dead code to sanitize NIR phis
3678 } else {
3679 ctx->block->kind |= block_kind_discard;
3680 /* branch and linear edge is added by visit_if() */
3681 }
3682 }
3683
3684 enum aco_descriptor_type {
3685 ACO_DESC_IMAGE,
3686 ACO_DESC_FMASK,
3687 ACO_DESC_SAMPLER,
3688 ACO_DESC_BUFFER,
3689 ACO_DESC_PLANE_0,
3690 ACO_DESC_PLANE_1,
3691 ACO_DESC_PLANE_2,
3692 };
3693
3694 static bool
3695 should_declare_array(isel_context *ctx, enum glsl_sampler_dim sampler_dim, bool is_array) {
3696 if (sampler_dim == GLSL_SAMPLER_DIM_BUF)
3697 return false;
3698 ac_image_dim dim = ac_get_sampler_dim(ctx->options->chip_class, sampler_dim, is_array);
3699 return dim == ac_image_cube ||
3700 dim == ac_image_1darray ||
3701 dim == ac_image_2darray ||
3702 dim == ac_image_2darraymsaa;
3703 }
3704
3705 Temp get_sampler_desc(isel_context *ctx, nir_deref_instr *deref_instr,
3706 enum aco_descriptor_type desc_type,
3707 const nir_tex_instr *tex_instr, bool image, bool write)
3708 {
3709 /* FIXME: we should lower the deref with some new nir_intrinsic_load_desc
3710 std::unordered_map<uint64_t, Temp>::iterator it = ctx->tex_desc.find((uint64_t) desc_type << 32 | deref_instr->dest.ssa.index);
3711 if (it != ctx->tex_desc.end())
3712 return it->second;
3713 */
3714 Temp index = Temp();
3715 bool index_set = false;
3716 unsigned constant_index = 0;
3717 unsigned descriptor_set;
3718 unsigned base_index;
3719 Builder bld(ctx->program, ctx->block);
3720
3721 if (!deref_instr) {
3722 assert(tex_instr && !image);
3723 descriptor_set = 0;
3724 base_index = tex_instr->sampler_index;
3725 } else {
3726 while(deref_instr->deref_type != nir_deref_type_var) {
3727 unsigned array_size = glsl_get_aoa_size(deref_instr->type);
3728 if (!array_size)
3729 array_size = 1;
3730
3731 assert(deref_instr->deref_type == nir_deref_type_array);
3732 nir_const_value *const_value = nir_src_as_const_value(deref_instr->arr.index);
3733 if (const_value) {
3734 constant_index += array_size * const_value->u32;
3735 } else {
3736 Temp indirect = get_ssa_temp(ctx, deref_instr->arr.index.ssa);
3737 if (indirect.type() == RegType::vgpr)
3738 indirect = bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), indirect);
3739
3740 if (array_size != 1)
3741 indirect = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(array_size), indirect);
3742
3743 if (!index_set) {
3744 index = indirect;
3745 index_set = true;
3746 } else {
3747 index = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), index, indirect);
3748 }
3749 }
3750
3751 deref_instr = nir_src_as_deref(deref_instr->parent);
3752 }
3753 descriptor_set = deref_instr->var->data.descriptor_set;
3754 base_index = deref_instr->var->data.binding;
3755 }
3756
3757 Temp list = load_desc_ptr(ctx, descriptor_set);
3758 list = convert_pointer_to_64_bit(ctx, list);
3759
3760 struct radv_descriptor_set_layout *layout = ctx->options->layout->set[descriptor_set].layout;
3761 struct radv_descriptor_set_binding_layout *binding = layout->binding + base_index;
3762 unsigned offset = binding->offset;
3763 unsigned stride = binding->size;
3764 aco_opcode opcode;
3765 RegClass type;
3766
3767 assert(base_index < layout->binding_count);
3768
3769 switch (desc_type) {
3770 case ACO_DESC_IMAGE:
3771 type = s8;
3772 opcode = aco_opcode::s_load_dwordx8;
3773 break;
3774 case ACO_DESC_FMASK:
3775 type = s8;
3776 opcode = aco_opcode::s_load_dwordx8;
3777 offset += 32;
3778 break;
3779 case ACO_DESC_SAMPLER:
3780 type = s4;
3781 opcode = aco_opcode::s_load_dwordx4;
3782 if (binding->type == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER)
3783 offset += radv_combined_image_descriptor_sampler_offset(binding);
3784 break;
3785 case ACO_DESC_BUFFER:
3786 type = s4;
3787 opcode = aco_opcode::s_load_dwordx4;
3788 break;
3789 case ACO_DESC_PLANE_0:
3790 case ACO_DESC_PLANE_1:
3791 type = s8;
3792 opcode = aco_opcode::s_load_dwordx8;
3793 offset += 32 * (desc_type - ACO_DESC_PLANE_0);
3794 break;
3795 case ACO_DESC_PLANE_2:
3796 type = s4;
3797 opcode = aco_opcode::s_load_dwordx4;
3798 offset += 64;
3799 break;
3800 default:
3801 unreachable("invalid desc_type\n");
3802 }
3803
3804 offset += constant_index * stride;
3805
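   /* immutable samplers are known at compile time, so the descriptor can be
    * assembled from constants instead of being loaded from the descriptor set */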
3806 if (desc_type == ACO_DESC_SAMPLER && binding->immutable_samplers_offset &&
3807 (!index_set || binding->immutable_samplers_equal)) {
3808 if (binding->immutable_samplers_equal)
3809 constant_index = 0;
3810
3811 const uint32_t *samplers = radv_immutable_samplers(layout, binding);
3812 return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4),
3813 Operand(samplers[constant_index * 4 + 0]),
3814 Operand(samplers[constant_index * 4 + 1]),
3815 Operand(samplers[constant_index * 4 + 2]),
3816 Operand(samplers[constant_index * 4 + 3]));
3817 }
3818
3819 Operand off;
3820 if (!index_set) {
3821 off = Operand(offset);
3822 } else {
3823 off = Operand((Temp)bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), Operand(offset),
3824 bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(stride), index)));
3825 }
3826
3827 Temp res = bld.smem(opcode, bld.def(type), list, off);
3828
3829 if (desc_type == ACO_DESC_PLANE_2) {
3830 Temp components[8];
3831 for (unsigned i = 0; i < 8; i++)
3832 components[i] = bld.tmp(s1);
3833 bld.pseudo(aco_opcode::p_split_vector,
3834 Definition(components[0]),
3835 Definition(components[1]),
3836 Definition(components[2]),
3837 Definition(components[3]),
3838 res);
3839
3840 Temp desc2 = get_sampler_desc(ctx, deref_instr, ACO_DESC_PLANE_1, tex_instr, image, write);
3841 bld.pseudo(aco_opcode::p_split_vector,
3842 bld.def(s1), bld.def(s1), bld.def(s1), bld.def(s1),
3843 Definition(components[4]),
3844 Definition(components[5]),
3845 Definition(components[6]),
3846 Definition(components[7]),
3847 desc2);
3848
3849 res = bld.pseudo(aco_opcode::p_create_vector, bld.def(s8),
3850 components[0], components[1], components[2], components[3],
3851 components[4], components[5], components[6], components[7]);
3852 }
3853
3854 return res;
3855 }
3856
3857 static int image_type_to_components_count(enum glsl_sampler_dim dim, bool array)
3858 {
3859 switch (dim) {
3860 case GLSL_SAMPLER_DIM_BUF:
3861 return 1;
3862 case GLSL_SAMPLER_DIM_1D:
3863 return array ? 2 : 1;
3864 case GLSL_SAMPLER_DIM_2D:
3865 return array ? 3 : 2;
3866 case GLSL_SAMPLER_DIM_MS:
3867 return array ? 4 : 3;
3868 case GLSL_SAMPLER_DIM_3D:
3869 case GLSL_SAMPLER_DIM_CUBE:
3870 return 3;
3871 case GLSL_SAMPLER_DIM_RECT:
3872 case GLSL_SAMPLER_DIM_SUBPASS:
3873 return 2;
3874 case GLSL_SAMPLER_DIM_SUBPASS_MS:
3875 return 3;
3876 default:
3877 break;
3878 }
3879 return 0;
3880 }
3881
3882
3883 /* Adjust the sample index according to FMASK.
3884 *
3885 * For uncompressed MSAA surfaces, FMASK should return 0x76543210,
3886 * which is the identity mapping. Each nibble says which physical sample
3887 * should be fetched to get that sample.
3888 *
3889 * For example, 0x11111100 means there are only 2 samples stored and
3890 * the second sample covers 3/4 of the pixel. When reading samples 0
3891 * and 1, return physical sample 0 (determined by the first two 0s
3892 * in FMASK), otherwise return physical sample 1.
3893 *
3894 * The sample index should be adjusted as follows:
3895 * sample_index = (fmask >> (sample_index * 4)) & 0xF;
3896 */
3897 static Temp adjust_sample_index_using_fmask(isel_context *ctx, bool da, Temp coords, Operand sample_index, Temp fmask_desc_ptr)
3898 {
3899 Builder bld(ctx->program, ctx->block);
3900 Temp fmask = bld.tmp(v1);
3901 unsigned dim = ctx->options->chip_class >= GFX10
3902 ? ac_get_sampler_dim(ctx->options->chip_class, GLSL_SAMPLER_DIM_2D, da)
3903 : 0;
3904
3905 aco_ptr<MIMG_instruction> load{create_instruction<MIMG_instruction>(aco_opcode::image_load, Format::MIMG, 2, 1)};
3906 load->operands[0] = Operand(coords);
3907 load->operands[1] = Operand(fmask_desc_ptr);
3908 load->definitions[0] = Definition(fmask);
3909 load->glc = false;
3910 load->dlc = false;
3911 load->dmask = 0x1;
3912 load->unrm = true;
3913 load->da = da;
3914 load->dim = dim;
3915 load->can_reorder = true; /* fmask images shouldn't be modified */
3916 ctx->block->instructions.emplace_back(std::move(load));
3917
3918 Operand sample_index4;
3919 if (sample_index.isConstant() && sample_index.constantValue() < 16) {
3920 sample_index4 = Operand(sample_index.constantValue() << 2);
3921 } else if (sample_index.regClass() == s1) {
3922 sample_index4 = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), sample_index, Operand(2u));
3923 } else {
3924 assert(sample_index.regClass() == v1);
3925 sample_index4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), sample_index);
3926 }
3927
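   /* extract the 4-bit physical sample index from the fmask value; constant shift
    * amounts of 0 and 28 allow cheaper VOP2 and/shift forms instead of the VOP3
    * v_bfe_u32 */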
3928 Temp final_sample;
3929 if (sample_index4.isConstant() && sample_index4.constantValue() == 0)
3930 final_sample = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(15u), fmask);
3931 else if (sample_index4.isConstant() && sample_index4.constantValue() == 28)
3932 final_sample = bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), Operand(28u), fmask);
3933 else
3934 final_sample = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), fmask, sample_index4, Operand(4u));
3935
3936 /* Don't rewrite the sample index if WORD1.DATA_FORMAT of the FMASK
3937 * resource descriptor is 0 (invalid),
3938 */
3939 Temp compare = bld.tmp(s2);
3940 bld.vopc_e64(aco_opcode::v_cmp_lg_u32, Definition(compare),
3941 Operand(0u), emit_extract_vector(ctx, fmask_desc_ptr, 1, s1)).def(0).setHint(vcc);
3942
3943 Temp sample_index_v = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), sample_index);
3944
3945 /* Replace the MSAA sample index. */
3946 return bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), sample_index_v, final_sample, compare);
3947 }
3948
3949 static Temp get_image_coords(isel_context *ctx, const nir_intrinsic_instr *instr, const struct glsl_type *type)
3950 {
3951
3952 Temp src0 = get_ssa_temp(ctx, instr->src[1].ssa);
3953 enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);
3954 bool is_array = glsl_sampler_type_is_array(type);
3955 ASSERTED bool add_frag_pos = (dim == GLSL_SAMPLER_DIM_SUBPASS || dim == GLSL_SAMPLER_DIM_SUBPASS_MS);
3956 assert(!add_frag_pos && "Input attachments should be lowered.");
3957 bool is_ms = (dim == GLSL_SAMPLER_DIM_MS || dim == GLSL_SAMPLER_DIM_SUBPASS_MS);
3958 bool gfx9_1d = ctx->options->chip_class == GFX9 && dim == GLSL_SAMPLER_DIM_1D;
3959 int count = image_type_to_components_count(dim, is_array);
3960 std::vector<Operand> coords(count);
3961
3962 if (is_ms) {
3963 Operand sample_index;
3964 nir_const_value *sample_cv = nir_src_as_const_value(instr->src[2]);
3965 if (sample_cv)
3966 sample_index = Operand(sample_cv->u32);
3967 else
3968 sample_index = Operand(emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[2].ssa), 0, v1));
3969
3970 if (instr->intrinsic == nir_intrinsic_image_deref_load) {
3971 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, is_array ? 3 : 2, 1)};
3972 for (unsigned i = 0; i < vec->operands.size(); i++)
3973 vec->operands[i] = Operand(emit_extract_vector(ctx, src0, i, v1));
3974 Temp fmask_load_address = {ctx->program->allocateId(), is_array ? v3 : v2};
3975 vec->definitions[0] = Definition(fmask_load_address);
3976 ctx->block->instructions.emplace_back(std::move(vec));
3977
3978 Temp fmask_desc_ptr = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_FMASK, nullptr, false, false);
3979 sample_index = Operand(adjust_sample_index_using_fmask(ctx, is_array, fmask_load_address, sample_index, fmask_desc_ptr));
3980 }
3981 count--;
3982 coords[count] = sample_index;
3983 }
3984
3985 if (count == 1 && !gfx9_1d)
3986 return emit_extract_vector(ctx, src0, 0, v1);
3987
3988 if (gfx9_1d) {
3989 coords[0] = Operand(emit_extract_vector(ctx, src0, 0, v1));
3990 coords.resize(coords.size() + 1);
3991 coords[1] = Operand((uint32_t) 0);
3992 if (is_array)
3993 coords[2] = Operand(emit_extract_vector(ctx, src0, 1, v1));
3994 } else {
3995 for (int i = 0; i < count; i++)
3996 coords[i] = Operand(emit_extract_vector(ctx, src0, i, v1));
3997 }
3998
3999 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, coords.size(), 1)};
4000 for (unsigned i = 0; i < coords.size(); i++)
4001 vec->operands[i] = coords[i];
4002 Temp res = {ctx->program->allocateId(), RegClass(RegType::vgpr, coords.size())};
4003 vec->definitions[0] = Definition(res);
4004 ctx->block->instructions.emplace_back(std::move(vec));
4005 return res;
4006 }
4007
4008
4009 void visit_image_load(isel_context *ctx, nir_intrinsic_instr *instr)
4010 {
4011 Builder bld(ctx->program, ctx->block);
4012 const nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
4013 const struct glsl_type *type = glsl_without_array(var->type);
4014 const enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);
4015 bool is_array = glsl_sampler_type_is_array(type);
4016 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
4017
4018 if (dim == GLSL_SAMPLER_DIM_BUF) {
4019 unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa);
4020 unsigned num_channels = util_last_bit(mask);
4021 Temp rsrc = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_BUFFER, nullptr, true, true);
4022 Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
4023
4024 aco_opcode opcode;
4025 switch (num_channels) {
4026 case 1:
4027 opcode = aco_opcode::buffer_load_format_x;
4028 break;
4029 case 2:
4030 opcode = aco_opcode::buffer_load_format_xy;
4031 break;
4032 case 3:
4033 opcode = aco_opcode::buffer_load_format_xyz;
4034 break;
4035 case 4:
4036 opcode = aco_opcode::buffer_load_format_xyzw;
4037 break;
4038 default:
4039 unreachable(">4 channel buffer image load");
4040 }
4041 aco_ptr<MUBUF_instruction> load{create_instruction<MUBUF_instruction>(opcode, Format::MUBUF, 3, 1)};
4042 load->operands[0] = Operand(vindex);
4043 load->operands[1] = Operand(rsrc);
4044 load->operands[2] = Operand((uint32_t) 0);
4045 Temp tmp;
4046 if (num_channels == instr->dest.ssa.num_components && dst.type() == RegType::vgpr)
4047 tmp = dst;
4048 else
4049 tmp = {ctx->program->allocateId(), RegClass(RegType::vgpr, num_channels)};
4050 load->definitions[0] = Definition(tmp);
4051 load->idxen = true;
4052 load->glc = var->data.access & (ACCESS_VOLATILE | ACCESS_COHERENT);
4053 load->dlc = load->glc && ctx->options->chip_class >= GFX10;
4054 load->barrier = barrier_image;
4055 ctx->block->instructions.emplace_back(std::move(load));
4056
4057 expand_vector(ctx, tmp, dst, instr->dest.ssa.num_components, (1 << num_channels) - 1);
4058 return;
4059 }
4060
4061 Temp coords = get_image_coords(ctx, instr, type);
4062 Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_IMAGE, nullptr, true, true);
4063
4064 unsigned dmask = nir_ssa_def_components_read(&instr->dest.ssa);
4065 unsigned num_components = util_bitcount(dmask);
4066 Temp tmp;
4067 if (num_components == instr->dest.ssa.num_components && dst.type() == RegType::vgpr)
4068 tmp = dst;
4069 else
4070 tmp = {ctx->program->allocateId(), RegClass(RegType::vgpr, num_components)};
4071
4072 aco_ptr<MIMG_instruction> load{create_instruction<MIMG_instruction>(aco_opcode::image_load, Format::MIMG, 2, 1)};
4073 load->operands[0] = Operand(coords);
4074 load->operands[1] = Operand(resource);
4075 load->definitions[0] = Definition(tmp);
4076 load->glc = var->data.access & (ACCESS_VOLATILE | ACCESS_COHERENT) ? 1 : 0;
4077 load->dlc = load->glc && ctx->options->chip_class >= GFX10;
4078 load->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);
4079 load->dmask = dmask;
4080 load->unrm = true;
4081 load->da = should_declare_array(ctx, dim, glsl_sampler_type_is_array(type));
4082 load->barrier = barrier_image;
4083 ctx->block->instructions.emplace_back(std::move(load));
4084
4085 expand_vector(ctx, tmp, dst, instr->dest.ssa.num_components, dmask);
4086 return;
4087 }
4088
4089 void visit_image_store(isel_context *ctx, nir_intrinsic_instr *instr)
4090 {
4091 const nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
4092 const struct glsl_type *type = glsl_without_array(var->type);
4093 const enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);
4094 bool is_array = glsl_sampler_type_is_array(type);
4095 Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[3].ssa));
4096
4097 bool glc = ctx->options->chip_class == GFX6 || var->data.access & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE) ? 1 : 0;
4098
4099 if (dim == GLSL_SAMPLER_DIM_BUF) {
4100 Temp rsrc = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_BUFFER, nullptr, true, true);
4101 Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
4102 aco_opcode opcode;
4103 switch (data.size()) {
4104 case 1:
4105 opcode = aco_opcode::buffer_store_format_x;
4106 break;
4107 case 2:
4108 opcode = aco_opcode::buffer_store_format_xy;
4109 break;
4110 case 3:
4111 opcode = aco_opcode::buffer_store_format_xyz;
4112 break;
4113 case 4:
4114 opcode = aco_opcode::buffer_store_format_xyzw;
4115 break;
4116 default:
4117 unreachable(">4 channel buffer image store");
4118 }
4119 aco_ptr<MUBUF_instruction> store{create_instruction<MUBUF_instruction>(opcode, Format::MUBUF, 4, 0)};
4120 store->operands[0] = Operand(vindex);
4121 store->operands[1] = Operand(rsrc);
4122 store->operands[2] = Operand((uint32_t) 0);
4123 store->operands[3] = Operand(data);
4124 store->idxen = true;
4125 store->glc = glc;
4126 store->dlc = false;
4127 store->disable_wqm = true;
4128 store->barrier = barrier_image;
4129 ctx->program->needs_exact = true;
4130 ctx->block->instructions.emplace_back(std::move(store));
4131 return;
4132 }
4133
4134 assert(data.type() == RegType::vgpr);
4135 Temp coords = get_image_coords(ctx, instr, type);
4136 Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_IMAGE, nullptr, true, true);
4137
4138 aco_ptr<MIMG_instruction> store{create_instruction<MIMG_instruction>(aco_opcode::image_store, Format::MIMG, 4, 0)};
4139 store->operands[0] = Operand(coords);
4140 store->operands[1] = Operand(resource);
4141 store->operands[2] = Operand(s4);
4142 store->operands[3] = Operand(data);
4143 store->glc = glc;
4144 store->dlc = false;
4145 store->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);
4146 store->dmask = (1 << data.size()) - 1;
4147 store->unrm = true;
4148 store->da = should_declare_array(ctx, dim, glsl_sampler_type_is_array(type));
4149 store->disable_wqm = true;
4150 store->barrier = barrier_image;
4151 ctx->program->needs_exact = true;
4152 ctx->block->instructions.emplace_back(std::move(store));
4153 return;
4154 }
4155
4156 void visit_image_atomic(isel_context *ctx, nir_intrinsic_instr *instr)
4157 {
4158 /* return the previous value if dest is ever used */
4159 bool return_previous = false;
4160 nir_foreach_use_safe(use_src, &instr->dest.ssa) {
4161 return_previous = true;
4162 break;
4163 }
4164 nir_foreach_if_use_safe(use_src, &instr->dest.ssa) {
4165 return_previous = true;
4166 break;
4167 }
4168
4169 const nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
4170 const struct glsl_type *type = glsl_without_array(var->type);
4171 const enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);
4172 bool is_array = glsl_sampler_type_is_array(type);
4173 Builder bld(ctx->program, ctx->block);
4174
4175 Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[3].ssa));
4176 assert(data.size() == 1 && "64bit ssbo atomics not yet implemented.");
4177
4178 if (instr->intrinsic == nir_intrinsic_image_deref_atomic_comp_swap)
4179 data = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), get_ssa_temp(ctx, instr->src[4].ssa), data);
4180
4181 aco_opcode buf_op, image_op;
4182 switch (instr->intrinsic) {
4183 case nir_intrinsic_image_deref_atomic_add:
4184 buf_op = aco_opcode::buffer_atomic_add;
4185 image_op = aco_opcode::image_atomic_add;
4186 break;
4187 case nir_intrinsic_image_deref_atomic_umin:
4188 buf_op = aco_opcode::buffer_atomic_umin;
4189 image_op = aco_opcode::image_atomic_umin;
4190 break;
4191 case nir_intrinsic_image_deref_atomic_imin:
4192 buf_op = aco_opcode::buffer_atomic_smin;
4193 image_op = aco_opcode::image_atomic_smin;
4194 break;
4195 case nir_intrinsic_image_deref_atomic_umax:
4196 buf_op = aco_opcode::buffer_atomic_umax;
4197 image_op = aco_opcode::image_atomic_umax;
4198 break;
4199 case nir_intrinsic_image_deref_atomic_imax:
4200 buf_op = aco_opcode::buffer_atomic_smax;
4201 image_op = aco_opcode::image_atomic_smax;
4202 break;
4203 case nir_intrinsic_image_deref_atomic_and:
4204 buf_op = aco_opcode::buffer_atomic_and;
4205 image_op = aco_opcode::image_atomic_and;
4206 break;
4207 case nir_intrinsic_image_deref_atomic_or:
4208 buf_op = aco_opcode::buffer_atomic_or;
4209 image_op = aco_opcode::image_atomic_or;
4210 break;
4211 case nir_intrinsic_image_deref_atomic_xor:
4212 buf_op = aco_opcode::buffer_atomic_xor;
4213 image_op = aco_opcode::image_atomic_xor;
4214 break;
4215 case nir_intrinsic_image_deref_atomic_exchange:
4216 buf_op = aco_opcode::buffer_atomic_swap;
4217 image_op = aco_opcode::image_atomic_swap;
4218 break;
4219 case nir_intrinsic_image_deref_atomic_comp_swap:
4220 buf_op = aco_opcode::buffer_atomic_cmpswap;
4221 image_op = aco_opcode::image_atomic_cmpswap;
4222 break;
4223 default:
4224 unreachable("visit_image_atomic should only be called with nir_intrinsic_image_deref_atomic_* instructions.");
4225 }
4226
4227 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
4228
4229 if (dim == GLSL_SAMPLER_DIM_BUF) {
4230 Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
4231 Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_BUFFER, nullptr, true, true);
4232 //assert(ctx->options->chip_class < GFX9 && "GFX9 stride size workaround not yet implemented.");
4233 aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(buf_op, Format::MUBUF, 4, return_previous ? 1 : 0)};
4234 mubuf->operands[0] = Operand(vindex);
4235 mubuf->operands[1] = Operand(resource);
4236 mubuf->operands[2] = Operand((uint32_t)0);
4237 mubuf->operands[3] = Operand(data);
4238 if (return_previous)
4239 mubuf->definitions[0] = Definition(dst);
4240 mubuf->offset = 0;
4241 mubuf->idxen = true;
4242 mubuf->glc = return_previous;
4243 mubuf->dlc = false; /* Not needed for atomics */
4244 mubuf->disable_wqm = true;
4245 mubuf->barrier = barrier_image;
4246 ctx->program->needs_exact = true;
4247 ctx->block->instructions.emplace_back(std::move(mubuf));
4248 return;
4249 }
4250
4251 Temp coords = get_image_coords(ctx, instr, type);
4252 Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_IMAGE, nullptr, true, true);
4253 aco_ptr<MIMG_instruction> mimg{create_instruction<MIMG_instruction>(image_op, Format::MIMG, 4, return_previous ? 1 : 0)};
4254 mimg->operands[0] = Operand(coords);
4255 mimg->operands[1] = Operand(resource);
4256 mimg->operands[2] = Operand(s4); /* no sampler */
4257 mimg->operands[3] = Operand(data);
4258 if (return_previous)
4259 mimg->definitions[0] = Definition(dst);
4260 mimg->glc = return_previous;
4261 mimg->dlc = false; /* Not needed for atomics */
4262 mimg->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);
4263 mimg->dmask = (1 << data.size()) - 1;
4264 mimg->unrm = true;
4265 mimg->da = should_declare_array(ctx, dim, glsl_sampler_type_is_array(type));
4266 mimg->disable_wqm = true;
4267 mimg->barrier = barrier_image;
4268 ctx->program->needs_exact = true;
4269 ctx->block->instructions.emplace_back(std::move(mimg));
4270 return;
4271 }
4272
4273 void get_buffer_size(isel_context *ctx, Temp desc, Temp dst, bool in_elements)
4274 {
4275 if (in_elements && ctx->options->chip_class == GFX8) {
4276 Builder bld(ctx->program, ctx->block);
4277
4278 Temp stride = emit_extract_vector(ctx, desc, 1, s1);
4279 stride = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), stride, Operand((5u << 16) | 16u));
4280 stride = bld.vop1(aco_opcode::v_cvt_f32_ubyte0, bld.def(v1), stride);
4281 stride = bld.vop1(aco_opcode::v_rcp_iflag_f32, bld.def(v1), stride);
4282
4283 Temp size = emit_extract_vector(ctx, desc, 2, s1);
4284 size = bld.vop1(aco_opcode::v_cvt_f32_u32, bld.def(v1), size);
4285
4286 Temp res = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), size, stride);
4287 res = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), res);
4288 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), res);
4289
4290 // TODO: we could probably compute size / stride faster on the scalar unit for strides of 1,2,4,8,12,16
4291 /* idea
4292 * for strides of 1,2,4,8,16, the result is just (size >> S_FF1_I32_B32(stride))
4293 * in case 12 (or 3?), we have to divide by 3:
4294 * set v_skip in case it's 12 (if we also have to take care of 3, shift first)
4295 * use v_mul_hi_u32 with magic number to divide
4296 * we need some pseudo merge opcode to overwrite the original SALU result with readfirstlane
4297 * disable v_skip
4298 * total: 6 SALU + 2 VALU instructions vs 1 SALU + 6 VALU instructions
4299 */
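      /* Rough illustrative sketch of the idea above (not wired up): for stride 12,
       * size / 12 == (size >> 2) / 3, and the division by 3 can use the usual
       * unsigned magic multiplier, e.g. v_mul_hi_u32(x, 0xAAAAAAABu) >> 1 == x / 3. */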
4300
4301 } else {
4302 emit_extract_vector(ctx, desc, 2, dst);
4303 }
4304 }
4305
4306 void visit_image_size(isel_context *ctx, nir_intrinsic_instr *instr)
4307 {
4308 const nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
4309 const struct glsl_type *type = glsl_without_array(var->type);
4310 const enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);
4311 bool is_array = glsl_sampler_type_is_array(type);
4312 Builder bld(ctx->program, ctx->block);
4313
4314 if (glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_BUF) {
4315 Temp desc = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_BUFFER, NULL, true, false);
4316 return get_buffer_size(ctx, desc, get_ssa_temp(ctx, &instr->dest.ssa), true);
4317 }
4318
4319 /* LOD */
4320 Temp lod = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0u));
4321
4322 /* Resource */
4323 Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_IMAGE, NULL, true, false);
4324
4325 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
4326
4327 aco_ptr<MIMG_instruction> mimg{create_instruction<MIMG_instruction>(aco_opcode::image_get_resinfo, Format::MIMG, 2, 1)};
4328 mimg->operands[0] = Operand(lod);
4329 mimg->operands[1] = Operand(resource);
4330 unsigned& dmask = mimg->dmask;
4331 mimg->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);
4332 mimg->dmask = (1 << instr->dest.ssa.num_components) - 1;
4333 mimg->da = glsl_sampler_type_is_array(type);
4334 mimg->can_reorder = true;
4335 Definition& def = mimg->definitions[0];
4336 ctx->block->instructions.emplace_back(std::move(mimg));
4337
4338 if (glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_CUBE &&
4339 glsl_sampler_type_is_array(type)) {
4340
4341 assert(instr->dest.ssa.num_components == 3);
4342 Temp tmp = {ctx->program->allocateId(), v3};
4343 def = Definition(tmp);
4344 emit_split_vector(ctx, tmp, 3);
4345
4346 /* divide 3rd value by 6 by multiplying with magic number */
4347 Temp c = bld.copy(bld.def(s1), Operand((uint32_t) 0x2AAAAAAB));
4348 Temp by_6 = bld.vop3(aco_opcode::v_mul_hi_i32, bld.def(v1), emit_extract_vector(ctx, tmp, 2, v1), c);
4349
4350 bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
4351 emit_extract_vector(ctx, tmp, 0, v1),
4352 emit_extract_vector(ctx, tmp, 1, v1),
4353 by_6);
4354
4355 } else if (ctx->options->chip_class == GFX9 &&
4356 glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_1D &&
4357 glsl_sampler_type_is_array(type)) {
4358 assert(instr->dest.ssa.num_components == 2);
4359 def = Definition(dst);
4360 dmask = 0x5;
4361 } else {
4362 def = Definition(dst);
4363 }
4364
4365 emit_split_vector(ctx, dst, instr->dest.ssa.num_components);
4366 }
4367
4368 void visit_load_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
4369 {
4370 Builder bld(ctx->program, ctx->block);
4371 unsigned num_components = instr->num_components;
4372
4373 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
4374 Temp rsrc = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
4375 rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u));
4376
4377 bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT);
4378 load_buffer(ctx, num_components, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa), glc, false);
4379 }
4380
4381 void visit_store_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
4382 {
4383 Builder bld(ctx->program, ctx->block);
4384 Temp data = get_ssa_temp(ctx, instr->src[0].ssa);
4385 unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
4386 unsigned writemask = nir_intrinsic_write_mask(instr);
4387
4388 Temp offset;
4389 if (ctx->options->chip_class < GFX8)
4390 offset = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[2].ssa));
4391 else
4392 offset = get_ssa_temp(ctx, instr->src[2].ssa);
4393
4394 Temp rsrc = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
4395 rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u));
4396
4397 bool smem = !ctx->divergent_vals[instr->src[2].ssa->index] &&
4398 ctx->options->chip_class >= GFX8;
4399 if (smem)
4400 offset = bld.as_uniform(offset);
4401 bool smem_nonfs = smem && ctx->stage != fragment_fs;
4402
4403 while (writemask) {
4404 int start, count;
4405 u_bit_scan_consecutive_range(&writemask, &start, &count);
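      /* SMEM has no 3-dword buffer store, so for a run of three components store the
       * first two now and put the third back into the writemask for a later iteration. */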
4406 if (count == 3 && smem) {
4407 writemask |= 1u << (start + 2);
4408 count = 2;
4409 }
4410 int num_bytes = count * elem_size_bytes;
4411
4412 if (num_bytes > 16) {
4413 assert(elem_size_bytes == 8);
4414 writemask |= (((count - 2) << 1) - 1) << (start + 2);
4415 count = 2;
4416 num_bytes = 16;
4417 }
4418
4419 // TODO: check alignment of sub-dword stores
4420 // TODO: split 3 bytes. there is no store instruction for that
4421
4422 Temp write_data;
4423 if (count != instr->num_components) {
4424 emit_split_vector(ctx, data, instr->num_components);
4425 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
4426 for (int i = 0; i < count; i++) {
4427 Temp elem = emit_extract_vector(ctx, data, start + i, RegClass(data.type(), elem_size_bytes / 4));
4428 vec->operands[i] = Operand(smem_nonfs ? bld.as_uniform(elem) : elem);
4429 }
4430 write_data = bld.tmp(smem_nonfs ? RegType::sgpr : data.type(), count * elem_size_bytes / 4);
4431 vec->definitions[0] = Definition(write_data);
4432 ctx->block->instructions.emplace_back(std::move(vec));
4433 } else if (!smem && data.type() != RegType::vgpr) {
4434 assert(num_bytes % 4 == 0);
4435 write_data = bld.copy(bld.def(RegType::vgpr, num_bytes / 4), data);
4436 } else if (smem_nonfs && data.type() == RegType::vgpr) {
4437 assert(num_bytes % 4 == 0);
4438 write_data = bld.as_uniform(data);
4439 } else {
4440 write_data = data;
4441 }
4442
4443 aco_opcode vmem_op, smem_op;
4444 switch (num_bytes) {
4445 case 4:
4446 vmem_op = aco_opcode::buffer_store_dword;
4447 smem_op = aco_opcode::s_buffer_store_dword;
4448 break;
4449 case 8:
4450 vmem_op = aco_opcode::buffer_store_dwordx2;
4451 smem_op = aco_opcode::s_buffer_store_dwordx2;
4452 break;
4453 case 12:
4454 vmem_op = aco_opcode::buffer_store_dwordx3;
4455 smem_op = aco_opcode::last_opcode;
4456 assert(!smem);
4457 break;
4458 case 16:
4459 vmem_op = aco_opcode::buffer_store_dwordx4;
4460 smem_op = aco_opcode::s_buffer_store_dwordx4;
4461 break;
4462 default:
4463 unreachable("Store SSBO not implemented for this size.");
4464 }
4465 if (ctx->stage == fragment_fs)
4466 smem_op = aco_opcode::p_fs_buffer_store_smem;
4467
4468 if (smem) {
4469 aco_ptr<SMEM_instruction> store{create_instruction<SMEM_instruction>(smem_op, Format::SMEM, 3, 0)};
4470 store->operands[0] = Operand(rsrc);
4471 if (start) {
4472 Temp off = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
4473 offset, Operand(start * elem_size_bytes));
4474 store->operands[1] = Operand(off);
4475 } else {
4476 store->operands[1] = Operand(offset);
4477 }
4478 if (smem_op != aco_opcode::p_fs_buffer_store_smem)
4479 store->operands[1].setFixed(m0);
4480 store->operands[2] = Operand(write_data);
4481 store->glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
4482 store->dlc = false;
4483 store->disable_wqm = true;
4484 store->barrier = barrier_buffer;
4485 ctx->block->instructions.emplace_back(std::move(store));
4486 ctx->program->wb_smem_l1_on_end = true;
4487 if (smem_op == aco_opcode::p_fs_buffer_store_smem) {
4488 ctx->block->kind |= block_kind_needs_lowering;
4489 ctx->program->needs_exact = true;
4490 }
4491 } else {
4492 aco_ptr<MUBUF_instruction> store{create_instruction<MUBUF_instruction>(vmem_op, Format::MUBUF, 4, 0)};
4493 store->operands[0] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
4494 store->operands[1] = Operand(rsrc);
4495 store->operands[2] = offset.type() == RegType::sgpr ? Operand(offset) : Operand((uint32_t) 0);
4496 store->operands[3] = Operand(write_data);
4497 store->offset = start * elem_size_bytes;
4498 store->offen = (offset.type() == RegType::vgpr);
4499 store->glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
4500 store->dlc = false;
4501 store->disable_wqm = true;
4502 store->barrier = barrier_buffer;
4503 ctx->program->needs_exact = true;
4504 ctx->block->instructions.emplace_back(std::move(store));
4505 }
4506 }
4507 }
4508
4509 void visit_atomic_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
4510 {
4511 /* return the previous value if dest is ever used */
4512 bool return_previous = false;
4513 nir_foreach_use_safe(use_src, &instr->dest.ssa) {
4514 return_previous = true;
4515 break;
4516 }
4517 nir_foreach_if_use_safe(use_src, &instr->dest.ssa) {
4518 return_previous = true;
4519 break;
4520 }
4521
4522 Builder bld(ctx->program, ctx->block);
4523 Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[2].ssa));
4524
4525 if (instr->intrinsic == nir_intrinsic_ssbo_atomic_comp_swap)
4526 data = bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, data.size() * 2),
4527 get_ssa_temp(ctx, instr->src[3].ssa), data);
4528
4529 Temp offset;
4530 if (ctx->options->chip_class < GFX8)
4531 offset = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
4532 else
4533 offset = get_ssa_temp(ctx, instr->src[1].ssa);
4534
4535 Temp rsrc = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
4536 rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u));
4537
4538 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
4539
4540 aco_opcode op32, op64;
4541 switch (instr->intrinsic) {
4542 case nir_intrinsic_ssbo_atomic_add:
4543 op32 = aco_opcode::buffer_atomic_add;
4544 op64 = aco_opcode::buffer_atomic_add_x2;
4545 break;
4546 case nir_intrinsic_ssbo_atomic_imin:
4547 op32 = aco_opcode::buffer_atomic_smin;
4548 op64 = aco_opcode::buffer_atomic_smin_x2;
4549 break;
4550 case nir_intrinsic_ssbo_atomic_umin:
4551 op32 = aco_opcode::buffer_atomic_umin;
4552 op64 = aco_opcode::buffer_atomic_umin_x2;
4553 break;
4554 case nir_intrinsic_ssbo_atomic_imax:
4555 op32 = aco_opcode::buffer_atomic_smax;
4556 op64 = aco_opcode::buffer_atomic_smax_x2;
4557 break;
4558 case nir_intrinsic_ssbo_atomic_umax:
4559 op32 = aco_opcode::buffer_atomic_umax;
4560 op64 = aco_opcode::buffer_atomic_umax_x2;
4561 break;
4562 case nir_intrinsic_ssbo_atomic_and:
4563 op32 = aco_opcode::buffer_atomic_and;
4564 op64 = aco_opcode::buffer_atomic_and_x2;
4565 break;
4566 case nir_intrinsic_ssbo_atomic_or:
4567 op32 = aco_opcode::buffer_atomic_or;
4568 op64 = aco_opcode::buffer_atomic_or_x2;
4569 break;
4570 case nir_intrinsic_ssbo_atomic_xor:
4571 op32 = aco_opcode::buffer_atomic_xor;
4572 op64 = aco_opcode::buffer_atomic_xor_x2;
4573 break;
4574 case nir_intrinsic_ssbo_atomic_exchange:
4575 op32 = aco_opcode::buffer_atomic_swap;
4576 op64 = aco_opcode::buffer_atomic_swap_x2;
4577 break;
4578 case nir_intrinsic_ssbo_atomic_comp_swap:
4579 op32 = aco_opcode::buffer_atomic_cmpswap;
4580 op64 = aco_opcode::buffer_atomic_cmpswap_x2;
4581 break;
4582 default:
4583 unreachable("visit_atomic_ssbo should only be called with nir_intrinsic_ssbo_atomic_* instructions.");
4584 }
4585 aco_opcode op = instr->dest.ssa.bit_size == 32 ? op32 : op64;
4586 aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, return_previous ? 1 : 0)};
4587 mubuf->operands[0] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
4588 mubuf->operands[1] = Operand(rsrc);
4589 mubuf->operands[2] = offset.type() == RegType::sgpr ? Operand(offset) : Operand((uint32_t) 0);
4590 mubuf->operands[3] = Operand(data);
4591 if (return_previous)
4592 mubuf->definitions[0] = Definition(dst);
4593 mubuf->offset = 0;
4594 mubuf->offen = (offset.type() == RegType::vgpr);
4595 mubuf->glc = return_previous;
4596 mubuf->dlc = false; /* Not needed for atomics */
4597 mubuf->disable_wqm = true;
4598 mubuf->barrier = barrier_buffer;
4599 ctx->program->needs_exact = true;
4600 ctx->block->instructions.emplace_back(std::move(mubuf));
4601 }
4602
4603 void visit_get_buffer_size(isel_context *ctx, nir_intrinsic_instr *instr) {
4604
4605 Temp index = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
4606 Builder bld(ctx->program, ctx->block);
4607 Temp desc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), index, Operand(0u));
4608 get_buffer_size(ctx, desc, get_ssa_temp(ctx, &instr->dest.ssa), false);
4609 }
4610
4611 void visit_load_global(isel_context *ctx, nir_intrinsic_instr *instr)
4612 {
4613 Builder bld(ctx->program, ctx->block);
4614 unsigned num_components = instr->num_components;
4615 unsigned num_bytes = num_components * instr->dest.ssa.bit_size / 8;
4616
4617 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
4618 Temp addr = get_ssa_temp(ctx, instr->src[0].ssa);
4619
4620 bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT);
4621 bool dlc = glc && ctx->options->chip_class >= GFX10;
4622 aco_opcode op;
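   /* Uniform (SGPR) destinations can use SMEM, but SMEM loads cannot be marked
    * coherent (glc) before GFX8, so such loads are forced through the FLAT path there. */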
4623 if (dst.type() == RegType::vgpr || (glc && ctx->options->chip_class < GFX8)) {
4624 bool global = ctx->options->chip_class >= GFX9;
4625 aco_opcode op;
4626 switch (num_bytes) {
4627 case 4:
4628 op = global ? aco_opcode::global_load_dword : aco_opcode::flat_load_dword;
4629 break;
4630 case 8:
4631 op = global ? aco_opcode::global_load_dwordx2 : aco_opcode::flat_load_dwordx2;
4632 break;
4633 case 12:
4634 op = global ? aco_opcode::global_load_dwordx3 : aco_opcode::flat_load_dwordx3;
4635 break;
4636 case 16:
4637 op = global ? aco_opcode::global_load_dwordx4 : aco_opcode::flat_load_dwordx4;
4638 break;
4639 default:
4640 unreachable("load_global not implemented for this size.");
4641 }
4642 aco_ptr<FLAT_instruction> flat{create_instruction<FLAT_instruction>(op, global ? Format::GLOBAL : Format::FLAT, 2, 1)};
4643 flat->operands[0] = Operand(addr);
4644 flat->operands[1] = Operand(s1);
4645 flat->glc = glc;
4646 flat->dlc = dlc;
4647 flat->barrier = barrier_buffer;
4648
4649 if (dst.type() == RegType::sgpr) {
4650 Temp vec = bld.tmp(RegType::vgpr, dst.size());
4651 flat->definitions[0] = Definition(vec);
4652 ctx->block->instructions.emplace_back(std::move(flat));
4653 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec);
4654 } else {
4655 flat->definitions[0] = Definition(dst);
4656 ctx->block->instructions.emplace_back(std::move(flat));
4657 }
4658 emit_split_vector(ctx, dst, num_components);
4659 } else {
4660 switch (num_bytes) {
4661 case 4:
4662 op = aco_opcode::s_load_dword;
4663 break;
4664 case 8:
4665 op = aco_opcode::s_load_dwordx2;
4666 break;
4667 case 12:
4668 case 16:
4669 op = aco_opcode::s_load_dwordx4;
4670 break;
4671 default:
4672 unreachable("load_global not implemented for this size.");
4673 }
4674 aco_ptr<SMEM_instruction> load{create_instruction<SMEM_instruction>(op, Format::SMEM, 2, 1)};
4675 load->operands[0] = Operand(addr);
4676 load->operands[1] = Operand(0u);
4677 load->definitions[0] = Definition(dst);
4678 load->glc = glc;
4679 load->dlc = dlc;
4680 load->barrier = barrier_buffer;
4681 assert(ctx->options->chip_class >= GFX8 || !glc);
4682
4683 if (dst.size() == 3) {
4684 /* trim vector */
4685 Temp vec = bld.tmp(s4);
4686 load->definitions[0] = Definition(vec);
4687 ctx->block->instructions.emplace_back(std::move(load));
4688 emit_split_vector(ctx, vec, 4);
4689
4690 bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
4691 emit_extract_vector(ctx, vec, 0, s1),
4692 emit_extract_vector(ctx, vec, 1, s1),
4693 emit_extract_vector(ctx, vec, 2, s1));
4694 } else {
4695 ctx->block->instructions.emplace_back(std::move(load));
4696 }
4697 }
4698 }
4699
4700 void visit_store_global(isel_context *ctx, nir_intrinsic_instr *instr)
4701 {
4702 Builder bld(ctx->program, ctx->block);
4703 unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
4704
4705 Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
4706 Temp addr = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
4707
4708 unsigned writemask = nir_intrinsic_write_mask(instr);
4709 while (writemask) {
4710 int start, count;
4711 u_bit_scan_consecutive_range(&writemask, &start, &count);
4712 unsigned num_bytes = count * elem_size_bytes;
4713
4714 Temp write_data = data;
4715 if (count != instr->num_components) {
4716 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
4717 for (int i = 0; i < count; i++)
4718 vec->operands[i] = Operand(emit_extract_vector(ctx, data, start + i, v1));
4719 write_data = bld.tmp(RegType::vgpr, count);
4720 vec->definitions[0] = Definition(write_data);
4721 ctx->block->instructions.emplace_back(std::move(vec));
4722 }
4723
4724 unsigned offset = start * elem_size_bytes;
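      /* FLAT stores before GFX9 have no immediate offset field, so fold the constant
       * offset into the 64-bit address with a carried VALU add. */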
4725 if (offset > 0 && ctx->options->chip_class < GFX9) {
4726 Temp addr0 = bld.tmp(v1), addr1 = bld.tmp(v1);
4727 Temp new_addr0 = bld.tmp(v1), new_addr1 = bld.tmp(v1);
4728 Temp carry = bld.tmp(s2);
4729 bld.pseudo(aco_opcode::p_split_vector, Definition(addr0), Definition(addr1), addr);
4730
4731 bld.vop2(aco_opcode::v_add_co_u32, Definition(new_addr0), bld.hint_vcc(Definition(carry)),
4732 Operand(offset), addr0);
4733 bld.vop2(aco_opcode::v_addc_co_u32, Definition(new_addr1), bld.def(s2),
4734 Operand(0u), addr1,
4735 carry).def(1).setHint(vcc);
4736
4737 addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), new_addr0, new_addr1);
4738
4739 offset = 0;
4740 }
4741
4742 bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
4743 bool global = ctx->options->chip_class >= GFX9;
4744 aco_opcode op;
4745 switch (num_bytes) {
4746 case 4:
4747 op = global ? aco_opcode::global_store_dword : aco_opcode::flat_store_dword;
4748 break;
4749 case 8:
4750 op = global ? aco_opcode::global_store_dwordx2 : aco_opcode::flat_store_dwordx2;
4751 break;
4752 case 12:
4753 op = global ? aco_opcode::global_store_dwordx3 : aco_opcode::flat_store_dwordx3;
4754 break;
4755 case 16:
4756 op = global ? aco_opcode::global_store_dwordx4 : aco_opcode::flat_store_dwordx4;
4757 break;
4758 default:
4759 unreachable("store_global not implemented for this size.");
4760 }
4761 aco_ptr<FLAT_instruction> flat{create_instruction<FLAT_instruction>(op, global ? Format::GLOBAL : Format::FLAT, 3, 0)};
4762 flat->operands[0] = Operand(addr);
4763 flat->operands[1] = Operand(s1);
4764 flat->operands[2] = Operand(data);
4765 flat->glc = glc;
4766 flat->dlc = false;
4767 flat->offset = offset;
4768 flat->disable_wqm = true;
4769 flat->barrier = barrier_buffer;
4770 ctx->program->needs_exact = true;
4771 ctx->block->instructions.emplace_back(std::move(flat));
4772 }
4773 }
4774
4775 void visit_global_atomic(isel_context *ctx, nir_intrinsic_instr *instr)
4776 {
4777 /* return the previous value if dest is ever used */
4778 bool return_previous = false;
4779 nir_foreach_use_safe(use_src, &instr->dest.ssa) {
4780 return_previous = true;
4781 break;
4782 }
4783 nir_foreach_if_use_safe(use_src, &instr->dest.ssa) {
4784 return_previous = true;
4785 break;
4786 }
4787
4788 Builder bld(ctx->program, ctx->block);
4789 Temp addr = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
4790 Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
4791
4792 if (instr->intrinsic == nir_intrinsic_global_atomic_comp_swap)
4793 data = bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, data.size() * 2),
4794 get_ssa_temp(ctx, instr->src[2].ssa), data);
4795
4796 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
4797
4798 bool global = ctx->options->chip_class >= GFX9;
4799 aco_opcode op32, op64;
4800 switch (instr->intrinsic) {
4801 case nir_intrinsic_global_atomic_add:
4802 op32 = global ? aco_opcode::global_atomic_add : aco_opcode::flat_atomic_add;
4803 op64 = global ? aco_opcode::global_atomic_add_x2 : aco_opcode::flat_atomic_add_x2;
4804 break;
4805 case nir_intrinsic_global_atomic_imin:
4806 op32 = global ? aco_opcode::global_atomic_smin : aco_opcode::flat_atomic_smin;
4807 op64 = global ? aco_opcode::global_atomic_smin_x2 : aco_opcode::flat_atomic_smin_x2;
4808 break;
4809 case nir_intrinsic_global_atomic_umin:
4810 op32 = global ? aco_opcode::global_atomic_umin : aco_opcode::flat_atomic_umin;
4811 op64 = global ? aco_opcode::global_atomic_umin_x2 : aco_opcode::flat_atomic_umin_x2;
4812 break;
4813 case nir_intrinsic_global_atomic_imax:
4814 op32 = global ? aco_opcode::global_atomic_smax : aco_opcode::flat_atomic_smax;
4815 op64 = global ? aco_opcode::global_atomic_smax_x2 : aco_opcode::flat_atomic_smax_x2;
4816 break;
4817 case nir_intrinsic_global_atomic_umax:
4818 op32 = global ? aco_opcode::global_atomic_umax : aco_opcode::flat_atomic_umax;
4819 op64 = global ? aco_opcode::global_atomic_umax_x2 : aco_opcode::flat_atomic_umax_x2;
4820 break;
4821 case nir_intrinsic_global_atomic_and:
4822 op32 = global ? aco_opcode::global_atomic_and : aco_opcode::flat_atomic_and;
4823 op64 = global ? aco_opcode::global_atomic_and_x2 : aco_opcode::flat_atomic_and_x2;
4824 break;
4825 case nir_intrinsic_global_atomic_or:
4826 op32 = global ? aco_opcode::global_atomic_or : aco_opcode::flat_atomic_or;
4827 op64 = global ? aco_opcode::global_atomic_or_x2 : aco_opcode::flat_atomic_or_x2;
4828 break;
4829 case nir_intrinsic_global_atomic_xor:
4830 op32 = global ? aco_opcode::global_atomic_xor : aco_opcode::flat_atomic_xor;
4831 op64 = global ? aco_opcode::global_atomic_xor_x2 : aco_opcode::flat_atomic_xor_x2;
4832 break;
4833 case nir_intrinsic_global_atomic_exchange:
4834 op32 = global ? aco_opcode::global_atomic_swap : aco_opcode::flat_atomic_swap;
4835 op64 = global ? aco_opcode::global_atomic_swap_x2 : aco_opcode::flat_atomic_swap_x2;
4836 break;
4837 case nir_intrinsic_global_atomic_comp_swap:
4838 op32 = global ? aco_opcode::global_atomic_cmpswap : aco_opcode::flat_atomic_cmpswap;
4839 op64 = global ? aco_opcode::global_atomic_cmpswap_x2 : aco_opcode::flat_atomic_cmpswap_x2;
4840 break;
4841 default:
4842 unreachable("visit_atomic_global should only be called with nir_intrinsic_global_atomic_* instructions.");
4843 }
4844 aco_opcode op = instr->dest.ssa.bit_size == 32 ? op32 : op64;
4845 aco_ptr<FLAT_instruction> flat{create_instruction<FLAT_instruction>(op, global ? Format::GLOBAL : Format::FLAT, 3, return_previous ? 1 : 0)};
4846 flat->operands[0] = Operand(addr);
4847 flat->operands[1] = Operand(s1);
4848 flat->operands[2] = Operand(data);
4849 if (return_previous)
4850 flat->definitions[0] = Definition(dst);
4851 flat->glc = return_previous;
4852 flat->dlc = false; /* Not needed for atomics */
4853 flat->offset = 0;
4854 flat->disable_wqm = true;
4855 flat->barrier = barrier_buffer;
4856 ctx->program->needs_exact = true;
4857 ctx->block->instructions.emplace_back(std::move(flat));
4858 }
4859
4860 void emit_memory_barrier(isel_context *ctx, nir_intrinsic_instr *instr) {
4861 Builder bld(ctx->program, ctx->block);
4862 switch(instr->intrinsic) {
4863 case nir_intrinsic_group_memory_barrier:
4864 case nir_intrinsic_memory_barrier:
4865 bld.barrier(aco_opcode::p_memory_barrier_all);
4866 break;
4867 case nir_intrinsic_memory_barrier_atomic_counter:
4868 bld.barrier(aco_opcode::p_memory_barrier_atomic);
4869 break;
4870 case nir_intrinsic_memory_barrier_buffer:
4871 bld.barrier(aco_opcode::p_memory_barrier_buffer);
4872 break;
4873 case nir_intrinsic_memory_barrier_image:
4874 bld.barrier(aco_opcode::p_memory_barrier_image);
4875 break;
4876 case nir_intrinsic_memory_barrier_shared:
4877 bld.barrier(aco_opcode::p_memory_barrier_shared);
4878 break;
4879 default:
4880 unreachable("Unimplemented memory barrier intrinsic");
4881 break;
4882 }
4883 }
4884
4885 void visit_load_shared(isel_context *ctx, nir_intrinsic_instr *instr)
4886 {
4887 // TODO: implement sparse reads using ds_read2_b32 and nir_ssa_def_components_read()
4888 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
4889 assert(instr->dest.ssa.bit_size >= 32 && "Bitsize not supported in load_shared.");
4890 Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
4891 Builder bld(ctx->program, ctx->block);
4892
4893 unsigned elem_size_bytes = instr->dest.ssa.bit_size / 8;
4894 unsigned align = nir_intrinsic_align_mul(instr) ? nir_intrinsic_align(instr) : elem_size_bytes;
4895 load_lds(ctx, elem_size_bytes, dst, address, nir_intrinsic_base(instr), align);
4896 }
4897
4898 void visit_store_shared(isel_context *ctx, nir_intrinsic_instr *instr)
4899 {
4900 unsigned writemask = nir_intrinsic_write_mask(instr);
4901 Temp data = get_ssa_temp(ctx, instr->src[0].ssa);
4902 Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
4903 unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
4904 assert(elem_size_bytes >= 4 && "Only 32bit & 64bit store_shared currently supported.");
4905
4906 unsigned align = nir_intrinsic_align_mul(instr) ? nir_intrinsic_align(instr) : elem_size_bytes;
4907 store_lds(ctx, elem_size_bytes, data, writemask, address, nir_intrinsic_base(instr), align);
4908 }
4909
4910 void visit_shared_atomic(isel_context *ctx, nir_intrinsic_instr *instr)
4911 {
4912 unsigned offset = nir_intrinsic_base(instr);
4913 Operand m = load_lds_size_m0(ctx);
4914 Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
4915 Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
4916
4917 unsigned num_operands = 3;
4918 aco_opcode op32, op64, op32_rtn, op64_rtn;
4919 switch(instr->intrinsic) {
4920 case nir_intrinsic_shared_atomic_add:
4921 op32 = aco_opcode::ds_add_u32;
4922 op64 = aco_opcode::ds_add_u64;
4923 op32_rtn = aco_opcode::ds_add_rtn_u32;
4924 op64_rtn = aco_opcode::ds_add_rtn_u64;
4925 break;
4926 case nir_intrinsic_shared_atomic_imin:
4927 op32 = aco_opcode::ds_min_i32;
4928 op64 = aco_opcode::ds_min_i64;
4929 op32_rtn = aco_opcode::ds_min_rtn_i32;
4930 op64_rtn = aco_opcode::ds_min_rtn_i64;
4931 break;
4932 case nir_intrinsic_shared_atomic_umin:
4933 op32 = aco_opcode::ds_min_u32;
4934 op64 = aco_opcode::ds_min_u64;
4935 op32_rtn = aco_opcode::ds_min_rtn_u32;
4936 op64_rtn = aco_opcode::ds_min_rtn_u64;
4937 break;
4938 case nir_intrinsic_shared_atomic_imax:
4939 op32 = aco_opcode::ds_max_i32;
4940 op64 = aco_opcode::ds_max_i64;
4941 op32_rtn = aco_opcode::ds_max_rtn_i32;
4942 op64_rtn = aco_opcode::ds_max_rtn_i64;
4943 break;
4944 case nir_intrinsic_shared_atomic_umax:
4945 op32 = aco_opcode::ds_max_u32;
4946 op64 = aco_opcode::ds_max_u64;
4947 op32_rtn = aco_opcode::ds_max_rtn_u32;
4948 op64_rtn = aco_opcode::ds_max_rtn_u64;
4949 break;
4950 case nir_intrinsic_shared_atomic_and:
4951 op32 = aco_opcode::ds_and_b32;
4952 op64 = aco_opcode::ds_and_b64;
4953 op32_rtn = aco_opcode::ds_and_rtn_b32;
4954 op64_rtn = aco_opcode::ds_and_rtn_b64;
4955 break;
4956 case nir_intrinsic_shared_atomic_or:
4957 op32 = aco_opcode::ds_or_b32;
4958 op64 = aco_opcode::ds_or_b64;
4959 op32_rtn = aco_opcode::ds_or_rtn_b32;
4960 op64_rtn = aco_opcode::ds_or_rtn_b64;
4961 break;
4962 case nir_intrinsic_shared_atomic_xor:
4963 op32 = aco_opcode::ds_xor_b32;
4964 op64 = aco_opcode::ds_xor_b64;
4965 op32_rtn = aco_opcode::ds_xor_rtn_b32;
4966 op64_rtn = aco_opcode::ds_xor_rtn_b64;
4967 break;
4968 case nir_intrinsic_shared_atomic_exchange:
4969 op32 = aco_opcode::ds_write_b32;
4970 op64 = aco_opcode::ds_write_b64;
4971 op32_rtn = aco_opcode::ds_wrxchg_rtn_b32;
4972 op64_rtn = aco_opcode::ds_wrxchg_rtn_b64;
4973 break;
4974 case nir_intrinsic_shared_atomic_comp_swap:
4975 op32 = aco_opcode::ds_cmpst_b32;
4976 op64 = aco_opcode::ds_cmpst_b64;
4977 op32_rtn = aco_opcode::ds_cmpst_rtn_b32;
4978 op64_rtn = aco_opcode::ds_cmpst_rtn_b64;
4979 num_operands = 4;
4980 break;
4981 default:
4982 unreachable("Unhandled shared atomic intrinsic");
4983 }
4984
4985 /* return the previous value if dest is ever used */
4986 bool return_previous = false;
4987 nir_foreach_use_safe(use_src, &instr->dest.ssa) {
4988 return_previous = true;
4989 break;
4990 }
4991 nir_foreach_if_use_safe(use_src, &instr->dest.ssa) {
4992 return_previous = true;
4993 break;
4994 }
4995
4996 aco_opcode op;
4997 if (data.size() == 1) {
4998 assert(instr->dest.ssa.bit_size == 32);
4999 op = return_previous ? op32_rtn : op32;
5000 } else {
5001 assert(instr->dest.ssa.bit_size == 64);
5002 op = return_previous ? op64_rtn : op64;
5003 }
5004
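   /* The DS immediate offset is only 16 bits wide, so fold larger constant offsets
    * into the address instead. */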
5005 if (offset > 65535) {
5006 Builder bld(ctx->program, ctx->block);
5007 address = bld.vadd32(bld.def(v1), Operand(offset), address);
5008 offset = 0;
5009 }
5010
5011 aco_ptr<DS_instruction> ds;
5012 ds.reset(create_instruction<DS_instruction>(op, Format::DS, num_operands, return_previous ? 1 : 0));
5013 ds->operands[0] = Operand(address);
5014 ds->operands[1] = Operand(data);
5015 if (num_operands == 4)
5016 ds->operands[2] = Operand(get_ssa_temp(ctx, instr->src[2].ssa));
5017 ds->operands[num_operands - 1] = m;
5018 ds->offset0 = offset;
5019 if (return_previous)
5020 ds->definitions[0] = Definition(get_ssa_temp(ctx, &instr->dest.ssa));
5021 ctx->block->instructions.emplace_back(std::move(ds));
5022 }
5023
5024 Temp get_scratch_resource(isel_context *ctx)
5025 {
5026 Builder bld(ctx->program, ctx->block);
5027 Temp scratch_addr = ctx->program->private_segment_buffer;
5028 if (ctx->stage != compute_cs)
5029 scratch_addr = bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), scratch_addr, Operand(0u));
5030
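   /* The scratch descriptor is set up for swizzled per-lane access: ADD_TID_ENABLE
    * folds the lane id into the buffer index and INDEX_STRIDE is chosen to match the
    * wave size (3 = 64 lanes, 2 = 32 lanes). */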
5031 uint32_t rsrc_conf = S_008F0C_ADD_TID_ENABLE(1) |
5032 S_008F0C_INDEX_STRIDE(ctx->program->wave_size == 64 ? 3 : 2);
5033
5034 if (ctx->program->chip_class >= GFX10) {
5035 rsrc_conf |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
5036 S_008F0C_OOB_SELECT(3) |
5037 S_008F0C_RESOURCE_LEVEL(1);
5038 } else if (ctx->program->chip_class <= GFX7) { /* dfmt modifies stride on GFX8/GFX9 when ADD_TID_EN=1 */
5039 rsrc_conf |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
5040 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
5041 }
5042
5043 /* older generations need element size = 16 bytes. element size removed in GFX9 */
5044 if (ctx->program->chip_class <= GFX8)
5045 rsrc_conf |= S_008F0C_ELEMENT_SIZE(3);
5046
5047 return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), scratch_addr, Operand(-1u), Operand(rsrc_conf));
5048 }
5049
5050 void visit_load_scratch(isel_context *ctx, nir_intrinsic_instr *instr) {
5051 assert(instr->dest.ssa.bit_size == 32 || instr->dest.ssa.bit_size == 64);
5052 Builder bld(ctx->program, ctx->block);
5053 Temp rsrc = get_scratch_resource(ctx);
5054 Temp offset = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
5055 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5056
5057 aco_opcode op;
5058 switch (dst.size()) {
5059 case 1:
5060 op = aco_opcode::buffer_load_dword;
5061 break;
5062 case 2:
5063 op = aco_opcode::buffer_load_dwordx2;
5064 break;
5065 case 3:
5066 op = aco_opcode::buffer_load_dwordx3;
5067 break;
5068 case 4:
5069 op = aco_opcode::buffer_load_dwordx4;
5070 break;
5071 case 6:
5072 case 8: {
5073 std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
5074 Temp lower = bld.mubuf(aco_opcode::buffer_load_dwordx4,
5075 bld.def(v4), offset, rsrc,
5076 ctx->program->scratch_offset, 0, true);
5077 Temp upper = bld.mubuf(dst.size() == 6 ? aco_opcode::buffer_load_dwordx2 :
5078 aco_opcode::buffer_load_dwordx4,
5079 dst.size() == 6 ? bld.def(v2) : bld.def(v4),
5080 offset, rsrc, ctx->program->scratch_offset, 16, true);
5081 emit_split_vector(ctx, lower, 2);
5082 elems[0] = emit_extract_vector(ctx, lower, 0, v2);
5083 elems[1] = emit_extract_vector(ctx, lower, 1, v2);
5084 if (dst.size() == 8) {
5085 emit_split_vector(ctx, upper, 2);
5086 elems[2] = emit_extract_vector(ctx, upper, 0, v2);
5087 elems[3] = emit_extract_vector(ctx, upper, 1, v2);
5088 } else {
5089 elems[2] = upper;
5090 }
5091
5092 aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector,
5093 Format::PSEUDO, dst.size() / 2, 1)};
5094 for (unsigned i = 0; i < dst.size() / 2; i++)
5095 vec->operands[i] = Operand(elems[i]);
5096 vec->definitions[0] = Definition(dst);
5097 bld.insert(std::move(vec));
5098 ctx->allocated_vec.emplace(dst.id(), elems);
5099 return;
5100 }
5101 default:
5102 unreachable("Wrong dst size for nir_intrinsic_load_scratch");
5103 }
5104
5105 bld.mubuf(op, Definition(dst), offset, rsrc, ctx->program->scratch_offset, 0, true);
5106 emit_split_vector(ctx, dst, instr->num_components);
5107 }
5108
5109 void visit_store_scratch(isel_context *ctx, nir_intrinsic_instr *instr) {
5110 assert(instr->src[0].ssa->bit_size == 32 || instr->src[0].ssa->bit_size == 64);
5111 Builder bld(ctx->program, ctx->block);
5112 Temp rsrc = get_scratch_resource(ctx);
5113 Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
5114 Temp offset = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
5115
5116 unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
5117 unsigned writemask = nir_intrinsic_write_mask(instr);
5118
5119 while (writemask) {
5120 int start, count;
5121 u_bit_scan_consecutive_range(&writemask, &start, &count);
5122 int num_bytes = count * elem_size_bytes;
5123
5124 if (num_bytes > 16) {
5125 assert(elem_size_bytes == 8);
5126 writemask |= (((count - 2) << 1) - 1) << (start + 2);
5127 count = 2;
5128 num_bytes = 16;
5129 }
5130
5131 // TODO: check alignment of sub-dword stores
5132 // TODO: split 3 bytes. there is no store instruction for that
5133
5134 Temp write_data;
5135 if (count != instr->num_components) {
5136 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
5137 for (int i = 0; i < count; i++) {
5138 Temp elem = emit_extract_vector(ctx, data, start + i, RegClass(RegType::vgpr, elem_size_bytes / 4));
5139 vec->operands[i] = Operand(elem);
5140 }
5141 write_data = bld.tmp(RegClass(RegType::vgpr, count * elem_size_bytes / 4));
5142 vec->definitions[0] = Definition(write_data);
5143 ctx->block->instructions.emplace_back(std::move(vec));
5144 } else {
5145 write_data = data;
5146 }
5147
5148 aco_opcode op;
5149 switch (num_bytes) {
5150 case 4:
5151 op = aco_opcode::buffer_store_dword;
5152 break;
5153 case 8:
5154 op = aco_opcode::buffer_store_dwordx2;
5155 break;
5156 case 12:
5157 op = aco_opcode::buffer_store_dwordx3;
5158 break;
5159 case 16:
5160 op = aco_opcode::buffer_store_dwordx4;
5161 break;
5162 default:
5163 unreachable("Invalid data size for nir_intrinsic_store_scratch.");
5164 }
5165
5166 bld.mubuf(op, offset, rsrc, ctx->program->scratch_offset, write_data, start * elem_size_bytes, true);
5167 }
5168 }
5169
5170 void visit_load_sample_mask_in(isel_context *ctx, nir_intrinsic_instr *instr) {
5171 uint8_t log2_ps_iter_samples;
5172 if (ctx->program->info->ps.force_persample) {
5173 log2_ps_iter_samples =
5174 util_logbase2(ctx->options->key.fs.num_samples);
5175 } else {
5176 log2_ps_iter_samples = ctx->options->key.fs.log2_ps_iter_samples;
5177 }
5178
5179 /* The bit pattern matches that used by fixed function fragment
5180 * processing. */
5181 static const unsigned ps_iter_masks[] = {
5182 0xffff, /* not used */
5183 0x5555,
5184 0x1111,
5185 0x0101,
5186 0x0001,
5187 };
5188 assert(log2_ps_iter_samples < ARRAY_SIZE(ps_iter_masks));
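   /* Illustrative example: with 4 samples per pixel (log2_ps_iter_samples == 2),
    * ps_iter_masks[2] == 0x1111 selects every fourth coverage bit; shifting it left by
    * the sample id extracted below yields the bits owned by this invocation's sample,
    * which is then ANDed with sample_coverage. */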
5189
5190 Builder bld(ctx->program, ctx->block);
5191
5192 Temp sample_id = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1),
5193 get_arg(ctx, ctx->args->ac.ancillary), Operand(8u), Operand(4u));
5194 Temp ps_iter_mask = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(ps_iter_masks[log2_ps_iter_samples]));
5195 Temp mask = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), sample_id, ps_iter_mask);
5196 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5197 bld.vop2(aco_opcode::v_and_b32, Definition(dst), mask, get_arg(ctx, ctx->args->ac.sample_coverage));
5198 }
5199
5200 Temp emit_boolean_reduce(isel_context *ctx, nir_op op, unsigned cluster_size, Temp src)
5201 {
5202 Builder bld(ctx->program, ctx->block);
5203
5204 if (cluster_size == 1) {
5205 return src;
5206 } else if (op == nir_op_iand && cluster_size == 4) {
5207 //subgroupClusteredAnd(val, 4) -> ~wqm(exec & ~val)
5208 Temp tmp = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.def(s1, scc), Operand(exec, s2), src);
5209 return bld.sop1(aco_opcode::s_not_b64, bld.def(s2), bld.def(s1, scc),
5210 bld.sop1(aco_opcode::s_wqm_b64, bld.def(s2), bld.def(s1, scc), tmp));
5211 } else if (op == nir_op_ior && cluster_size == 4) {
5212 //subgroupClusteredOr(val, 4) -> wqm(val & exec)
5213 return bld.sop1(aco_opcode::s_wqm_b64, bld.def(s2), bld.def(s1, scc),
5214 bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2)));
5215 } else if (op == nir_op_iand && cluster_size == 64) {
5216 //subgroupAnd(val) -> (exec & ~val) == 0
5217 Temp tmp = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.def(s1, scc), Operand(exec, s2), src).def(1).getTemp();
5218 return bld.sop2(aco_opcode::s_cselect_b64, bld.def(s2), Operand(0u), Operand(-1u), bld.scc(tmp));
5219 } else if (op == nir_op_ior && cluster_size == 64) {
5220 //subgroupOr(val) -> (val & exec) != 0
5221 Temp tmp = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2)).def(1).getTemp();
5222 return bool_to_vector_condition(ctx, tmp);
5223 } else if (op == nir_op_ixor && cluster_size == 64) {
5224 //subgroupXor(val) -> s_bcnt1_i32_b64(val & exec) & 1
5225 Temp tmp = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2));
5226 tmp = bld.sop1(aco_opcode::s_bcnt1_i32_b64, bld.def(s1), bld.def(s1, scc), tmp);
5227 tmp = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), tmp, Operand(1u)).def(1).getTemp();
5228 return bool_to_vector_condition(ctx, tmp);
5229 } else {
5230 //subgroupClustered{And,Or,Xor}(val, n) ->
5231 //lane_id = v_mbcnt_hi_u32_b32(-1, v_mbcnt_lo_u32_b32(-1, 0))
5232 //cluster_offset = ~(n - 1) & lane_id
5233 //cluster_mask = ((1 << n) - 1)
5234 //subgroupClusteredAnd():
5235 // return ((val | ~exec) >> cluster_offset) & cluster_mask == cluster_mask
5236 //subgroupClusteredOr():
5237 // return ((val & exec) >> cluster_offset) & cluster_mask != 0
5238 //subgroupClusteredXor():
5239 // return v_bcnt_u32_b32(((val & exec) >> cluster_offset) & cluster_mask, 0) & 1 != 0
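   // e.g. cluster_size == 8, lane_id == 13: cluster_offset = ~7 & 13 = 8 and
   // cluster_mask = 0xff, so the shift and mask below isolate ballot bits 8..15,
   // i.e. exactly the eight lanes of this invocation's cluster.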
5240 Temp lane_id = bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32, bld.def(v1), Operand((uint32_t) -1),
5241 bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, bld.def(v1), Operand((uint32_t) -1), Operand(0u)));
5242 Temp cluster_offset = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(~uint32_t(cluster_size - 1)), lane_id);
5243
5244 Temp tmp;
5245 if (op == nir_op_iand)
5246 tmp = bld.sop2(aco_opcode::s_orn2_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2));
5247 else
5248 tmp = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2));
5249
5250 uint32_t cluster_mask = cluster_size == 32 ? -1 : (1u << cluster_size) - 1u;
5251 tmp = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), cluster_offset, tmp);
5252 tmp = emit_extract_vector(ctx, tmp, 0, v1);
5253 if (cluster_mask != 0xffffffff)
5254 tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(cluster_mask), tmp);
5255
5256 Definition cmp_def = Definition();
5257 if (op == nir_op_iand) {
5258 cmp_def = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(s2), Operand(cluster_mask), tmp).def(0);
5259 } else if (op == nir_op_ior) {
5260 cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(s2), Operand(0u), tmp).def(0);
5261 } else if (op == nir_op_ixor) {
5262 tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(1u),
5263 bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), tmp, Operand(0u)));
5264 cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(s2), Operand(0u), tmp).def(0);
5265 }
5266 cmp_def.setHint(vcc);
5267 return cmp_def.getTemp();
5268 }
5269 }
5270
5271 Temp emit_boolean_exclusive_scan(isel_context *ctx, nir_op op, Temp src)
5272 {
5273 Builder bld(ctx->program, ctx->block);
5274
5275 //subgroupExclusiveAnd(val) -> mbcnt(exec & ~val) == 0
5276 //subgroupExclusiveOr(val) -> mbcnt(val & exec) != 0
5277 //subgroupExclusiveXor(val) -> mbcnt(val & exec) & 1 != 0
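   /* For lane i, mbcnt counts the set bits of tmp below lane i: exclusiveAnd is true
    * iff no lower active lane carried a false value (the count of exec & ~val is 0),
    * while exclusiveOr/Xor test the count of val & exec (resp. its lowest bit)
    * against zero. */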
5278 Temp tmp;
5279 if (op == nir_op_iand)
5280 tmp = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.def(s1, scc), Operand(exec, s2), src);
5281 else
5282 tmp = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2));
5283
5284 Builder::Result lohi = bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), bld.def(s1), tmp);
5285 Temp lo = lohi.def(0).getTemp();
5286 Temp hi = lohi.def(1).getTemp();
5287 Temp mbcnt = bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32, bld.def(v1), hi,
5288 bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, bld.def(v1), lo, Operand(0u)));
5289
5290 Definition cmp_def = Definition();
5291 if (op == nir_op_iand)
5292 cmp_def = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(s2), Operand(0u), mbcnt).def(0);
5293 else if (op == nir_op_ior)
5294 cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(s2), Operand(0u), mbcnt).def(0);
5295 else if (op == nir_op_ixor)
5296 cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(s2), Operand(0u),
5297 bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(1u), mbcnt)).def(0);
5298 cmp_def.setHint(vcc);
5299 return cmp_def.getTemp();
5300 }
5301
5302 Temp emit_boolean_inclusive_scan(isel_context *ctx, nir_op op, Temp src)
5303 {
5304 Builder bld(ctx->program, ctx->block);
5305
5306 //subgroupInclusiveAnd(val) -> subgroupExclusiveAnd(val) && val
5307 //subgroupInclusiveOr(val) -> subgroupExclusiveOr(val) || val
5308 //subgroupInclusiveXor(val) -> subgroupExclusiveXor(val) ^^ val
5309 Temp tmp = emit_boolean_exclusive_scan(ctx, op, src);
5310 if (op == nir_op_iand)
5311 return bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), tmp, src);
5312 else if (op == nir_op_ior)
5313 return bld.sop2(aco_opcode::s_or_b64, bld.def(s2), bld.def(s1, scc), tmp, src);
5314 else if (op == nir_op_ixor)
5315 return bld.sop2(aco_opcode::s_xor_b64, bld.def(s2), bld.def(s1, scc), tmp, src);
5316
5317 assert(false);
5318 return Temp();
5319 }
5320
5321 void emit_uniform_subgroup(isel_context *ctx, nir_intrinsic_instr *instr, Temp src)
5322 {
5323 Builder bld(ctx->program, ctx->block);
5324 Definition dst(get_ssa_temp(ctx, &instr->dest.ssa));
5325 if (src.regClass().type() == RegType::vgpr) {
5326 bld.pseudo(aco_opcode::p_as_uniform, dst, src);
5327 } else if (src.regClass() == s1) {
5328 bld.sop1(aco_opcode::s_mov_b32, dst, src);
5329 } else if (src.regClass() == s2) {
5330 bld.sop1(aco_opcode::s_mov_b64, dst, src);
5331 } else {
5332 fprintf(stderr, "Unimplemented NIR instr bit size: ");
5333 nir_print_instr(&instr->instr, stderr);
5334 fprintf(stderr, "\n");
5335 }
5336 }
5337
5338 void emit_interp_center(isel_context *ctx, Temp dst, Temp pos1, Temp pos2)
5339 {
5340 Builder bld(ctx->program, ctx->block);
5341 Temp persp_center = get_arg(ctx, ctx->args->ac.persp_center);
5342 Temp p1 = emit_extract_vector(ctx, persp_center, 0, v1);
5343 Temp p2 = emit_extract_vector(ctx, persp_center, 1, v1);
5344
5345 /* Build DD X/Y */
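   /* dpp_quad_perm(0, 0, 0, 0) broadcasts each quad's lane 0, so subtracting it from
    * the values read via quad_perm(1, 1, 1, 1) and quad_perm(2, 2, 2, 2) gives the
    * per-quad derivatives of the barycentrics in x and y. */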
5346 Temp tl_1 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), p1, dpp_quad_perm(0, 0, 0, 0));
5347 Temp ddx_1 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p1, tl_1, dpp_quad_perm(1, 1, 1, 1));
5348 Temp ddy_1 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p1, tl_1, dpp_quad_perm(2, 2, 2, 2));
5349 Temp tl_2 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), p2, dpp_quad_perm(0, 0, 0, 0));
5350 Temp ddx_2 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p2, tl_2, dpp_quad_perm(1, 1, 1, 1));
5351 Temp ddy_2 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p2, tl_2, dpp_quad_perm(2, 2, 2, 2));
5352
5353 /* res_k = p_k + ddx_k * pos1 + ddy_k * pos2 */
5354 Temp tmp1 = bld.vop3(aco_opcode::v_mad_f32, bld.def(v1), ddx_1, pos1, p1);
5355 Temp tmp2 = bld.vop3(aco_opcode::v_mad_f32, bld.def(v1), ddx_2, pos1, p2);
5356 tmp1 = bld.vop3(aco_opcode::v_mad_f32, bld.def(v1), ddy_1, pos2, tmp1);
5357 tmp2 = bld.vop3(aco_opcode::v_mad_f32, bld.def(v1), ddy_2, pos2, tmp2);
5358 Temp wqm1 = bld.tmp(v1);
5359 emit_wqm(ctx, tmp1, wqm1, true);
5360 Temp wqm2 = bld.tmp(v1);
5361 emit_wqm(ctx, tmp2, wqm2, true);
5362 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), wqm1, wqm2);
5363 return;
5364 }
5365
5366 void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
5367 {
5368 Builder bld(ctx->program, ctx->block);
5369 switch(instr->intrinsic) {
5370 case nir_intrinsic_load_barycentric_sample:
5371 case nir_intrinsic_load_barycentric_pixel:
5372 case nir_intrinsic_load_barycentric_centroid: {
5373 glsl_interp_mode mode = (glsl_interp_mode)nir_intrinsic_interp_mode(instr);
5374 Temp bary = Temp(0, s2);
5375 switch (mode) {
5376 case INTERP_MODE_SMOOTH:
5377 case INTERP_MODE_NONE:
5378 if (instr->intrinsic == nir_intrinsic_load_barycentric_pixel)
5379 bary = get_arg(ctx, ctx->args->ac.persp_center);
5380 else if (instr->intrinsic == nir_intrinsic_load_barycentric_centroid)
5381 bary = ctx->persp_centroid;
5382 else if (instr->intrinsic == nir_intrinsic_load_barycentric_sample)
5383 bary = get_arg(ctx, ctx->args->ac.persp_sample);
5384 break;
5385 case INTERP_MODE_NOPERSPECTIVE:
5386 if (instr->intrinsic == nir_intrinsic_load_barycentric_pixel)
5387 bary = get_arg(ctx, ctx->args->ac.linear_center);
5388 else if (instr->intrinsic == nir_intrinsic_load_barycentric_centroid)
5389 bary = ctx->linear_centroid;
5390 else if (instr->intrinsic == nir_intrinsic_load_barycentric_sample)
5391 bary = get_arg(ctx, ctx->args->ac.linear_sample);
5392 break;
5393 default:
5394 break;
5395 }
5396 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5397 Temp p1 = emit_extract_vector(ctx, bary, 0, v1);
5398 Temp p2 = emit_extract_vector(ctx, bary, 1, v1);
5399 bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
5400 Operand(p1), Operand(p2));
5401 emit_split_vector(ctx, dst, 2);
5402 break;
5403 }
5404 case nir_intrinsic_load_barycentric_at_sample: {
5405 uint32_t sample_pos_offset = RING_PS_SAMPLE_POSITIONS * 16;
5406 switch (ctx->options->key.fs.num_samples) {
5407 case 2: sample_pos_offset += 1 << 3; break;
5408 case 4: sample_pos_offset += 3 << 3; break;
5409 case 8: sample_pos_offset += 7 << 3; break;
5410 default: break;
5411 }
5412 Temp sample_pos;
5413 Temp addr = get_ssa_temp(ctx, instr->src[0].ssa);
5414 nir_const_value* const_addr = nir_src_as_const_value(instr->src[0]);
5415 Temp private_segment_buffer = ctx->program->private_segment_buffer;
5416 if (addr.type() == RegType::sgpr) {
5417 Operand offset;
5418 if (const_addr) {
5419 sample_pos_offset += const_addr->u32 << 3;
5420 offset = Operand(sample_pos_offset);
5421 } else if (ctx->options->chip_class >= GFX9) {
5422 offset = bld.sop2(aco_opcode::s_lshl3_add_u32, bld.def(s1), bld.def(s1, scc), addr, Operand(sample_pos_offset));
5423 } else {
5424 offset = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), addr, Operand(3u));
5425 offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset, Operand(sample_pos_offset));
5426 }
5427 sample_pos = bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), private_segment_buffer, Operand(offset));
5428
5429 } else if (ctx->options->chip_class >= GFX9) {
5430 addr = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(3u), addr);
5431 sample_pos = bld.global(aco_opcode::global_load_dwordx2, bld.def(v2), addr, private_segment_buffer, sample_pos_offset);
5432 } else {
5433 /* addr += private_segment_buffer + sample_pos_offset */
5434 Temp tmp0 = bld.tmp(s1);
5435 Temp tmp1 = bld.tmp(s1);
5436 bld.pseudo(aco_opcode::p_split_vector, Definition(tmp0), Definition(tmp1), private_segment_buffer);
5437 Definition scc_tmp = bld.def(s1, scc);
5438 tmp0 = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), scc_tmp, tmp0, Operand(sample_pos_offset));
5439 tmp1 = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), tmp1, Operand(0u), bld.scc(scc_tmp.getTemp()));
5440 addr = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(3u), addr);
5441 Temp pck0 = bld.tmp(v1);
5442 Temp carry = bld.vadd32(Definition(pck0), tmp0, addr, true).def(1).getTemp();
5443 tmp1 = as_vgpr(ctx, tmp1);
5444 Temp pck1 = bld.vop2_e64(aco_opcode::v_addc_co_u32, bld.def(v1), bld.hint_vcc(bld.def(s2)), tmp1, Operand(0u), carry);
5445 addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), pck0, pck1);
5446
5447 /* sample_pos = flat_load_dwordx2 addr */
5448 sample_pos = bld.flat(aco_opcode::flat_load_dwordx2, bld.def(v2), addr, Operand(s1));
5449 }
5450
5451 /* sample_pos -= 0.5 */
5452 Temp pos1 = bld.tmp(RegClass(sample_pos.type(), 1));
5453 Temp pos2 = bld.tmp(RegClass(sample_pos.type(), 1));
5454 bld.pseudo(aco_opcode::p_split_vector, Definition(pos1), Definition(pos2), sample_pos);
5455 pos1 = bld.vop2_e64(aco_opcode::v_sub_f32, bld.def(v1), pos1, Operand(0x3f000000u));
5456 pos2 = bld.vop2_e64(aco_opcode::v_sub_f32, bld.def(v1), pos2, Operand(0x3f000000u));
5457
5458 emit_interp_center(ctx, get_ssa_temp(ctx, &instr->dest.ssa), pos1, pos2);
5459 break;
5460 }
5461 case nir_intrinsic_load_barycentric_at_offset: {
5462 Temp offset = get_ssa_temp(ctx, instr->src[0].ssa);
5463 RegClass rc = RegClass(offset.type(), 1);
5464 Temp pos1 = bld.tmp(rc), pos2 = bld.tmp(rc);
5465 bld.pseudo(aco_opcode::p_split_vector, Definition(pos1), Definition(pos2), offset);
5466 emit_interp_center(ctx, get_ssa_temp(ctx, &instr->dest.ssa), pos1, pos2);
5467 break;
5468 }
5469 case nir_intrinsic_load_front_face: {
5470 bld.vopc(aco_opcode::v_cmp_lg_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
5471 Operand(0u), get_arg(ctx, ctx->args->ac.front_face)).def(0).setHint(vcc);
5472 break;
5473 }
5474 case nir_intrinsic_load_view_index:
5475 case nir_intrinsic_load_layer_id: {
5476 if (instr->intrinsic == nir_intrinsic_load_view_index && (ctx->stage & sw_vs)) {
5477 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5478 bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->ac.view_index)));
5479 break;
5480 }
5481
5482 unsigned idx = nir_intrinsic_base(instr);
5483 bld.vintrp(aco_opcode::v_interp_mov_f32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
5484 Operand(2u), bld.m0(get_arg(ctx, ctx->args->ac.prim_mask)), idx, 0);
5485 break;
5486 }
5487 case nir_intrinsic_load_frag_coord: {
5488 emit_load_frag_coord(ctx, get_ssa_temp(ctx, &instr->dest.ssa), 4);
5489 break;
5490 }
5491 case nir_intrinsic_load_sample_pos: {
5492 Temp posx = get_arg(ctx, ctx->args->ac.frag_pos[0]);
5493 Temp posy = get_arg(ctx, ctx->args->ac.frag_pos[1]);
5494 bld.pseudo(aco_opcode::p_create_vector, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
5495 posx.id() ? bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), posx) : Operand(0u),
5496 posy.id() ? bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), posy) : Operand(0u));
5497 break;
5498 }
5499 case nir_intrinsic_load_interpolated_input:
5500 visit_load_interpolated_input(ctx, instr);
5501 break;
5502 case nir_intrinsic_store_output:
5503 visit_store_output(ctx, instr);
5504 break;
5505 case nir_intrinsic_load_input:
5506 visit_load_input(ctx, instr);
5507 break;
5508 case nir_intrinsic_load_ubo:
5509 visit_load_ubo(ctx, instr);
5510 break;
5511 case nir_intrinsic_load_push_constant:
5512 visit_load_push_constant(ctx, instr);
5513 break;
5514 case nir_intrinsic_load_constant:
5515 visit_load_constant(ctx, instr);
5516 break;
5517 case nir_intrinsic_vulkan_resource_index:
5518 visit_load_resource(ctx, instr);
5519 break;
5520 case nir_intrinsic_discard:
5521 visit_discard(ctx, instr);
5522 break;
5523 case nir_intrinsic_discard_if:
5524 visit_discard_if(ctx, instr);
5525 break;
5526 case nir_intrinsic_load_shared:
5527 visit_load_shared(ctx, instr);
5528 break;
5529 case nir_intrinsic_store_shared:
5530 visit_store_shared(ctx, instr);
5531 break;
5532 case nir_intrinsic_shared_atomic_add:
5533 case nir_intrinsic_shared_atomic_imin:
5534 case nir_intrinsic_shared_atomic_umin:
5535 case nir_intrinsic_shared_atomic_imax:
5536 case nir_intrinsic_shared_atomic_umax:
5537 case nir_intrinsic_shared_atomic_and:
5538 case nir_intrinsic_shared_atomic_or:
5539 case nir_intrinsic_shared_atomic_xor:
5540 case nir_intrinsic_shared_atomic_exchange:
5541 case nir_intrinsic_shared_atomic_comp_swap:
5542 visit_shared_atomic(ctx, instr);
5543 break;
5544 case nir_intrinsic_image_deref_load:
5545 visit_image_load(ctx, instr);
5546 break;
5547 case nir_intrinsic_image_deref_store:
5548 visit_image_store(ctx, instr);
5549 break;
5550 case nir_intrinsic_image_deref_atomic_add:
5551 case nir_intrinsic_image_deref_atomic_umin:
5552 case nir_intrinsic_image_deref_atomic_imin:
5553 case nir_intrinsic_image_deref_atomic_umax:
5554 case nir_intrinsic_image_deref_atomic_imax:
5555 case nir_intrinsic_image_deref_atomic_and:
5556 case nir_intrinsic_image_deref_atomic_or:
5557 case nir_intrinsic_image_deref_atomic_xor:
5558 case nir_intrinsic_image_deref_atomic_exchange:
5559 case nir_intrinsic_image_deref_atomic_comp_swap:
5560 visit_image_atomic(ctx, instr);
5561 break;
5562 case nir_intrinsic_image_deref_size:
5563 visit_image_size(ctx, instr);
5564 break;
5565 case nir_intrinsic_load_ssbo:
5566 visit_load_ssbo(ctx, instr);
5567 break;
5568 case nir_intrinsic_store_ssbo:
5569 visit_store_ssbo(ctx, instr);
5570 break;
5571 case nir_intrinsic_load_global:
5572 visit_load_global(ctx, instr);
5573 break;
5574 case nir_intrinsic_store_global:
5575 visit_store_global(ctx, instr);
5576 break;
5577 case nir_intrinsic_global_atomic_add:
5578 case nir_intrinsic_global_atomic_imin:
5579 case nir_intrinsic_global_atomic_umin:
5580 case nir_intrinsic_global_atomic_imax:
5581 case nir_intrinsic_global_atomic_umax:
5582 case nir_intrinsic_global_atomic_and:
5583 case nir_intrinsic_global_atomic_or:
5584 case nir_intrinsic_global_atomic_xor:
5585 case nir_intrinsic_global_atomic_exchange:
5586 case nir_intrinsic_global_atomic_comp_swap:
5587 visit_global_atomic(ctx, instr);
5588 break;
5589 case nir_intrinsic_ssbo_atomic_add:
5590 case nir_intrinsic_ssbo_atomic_imin:
5591 case nir_intrinsic_ssbo_atomic_umin:
5592 case nir_intrinsic_ssbo_atomic_imax:
5593 case nir_intrinsic_ssbo_atomic_umax:
5594 case nir_intrinsic_ssbo_atomic_and:
5595 case nir_intrinsic_ssbo_atomic_or:
5596 case nir_intrinsic_ssbo_atomic_xor:
5597 case nir_intrinsic_ssbo_atomic_exchange:
5598 case nir_intrinsic_ssbo_atomic_comp_swap:
5599 visit_atomic_ssbo(ctx, instr);
5600 break;
5601 case nir_intrinsic_load_scratch:
5602 visit_load_scratch(ctx, instr);
5603 break;
5604 case nir_intrinsic_store_scratch:
5605 visit_store_scratch(ctx, instr);
5606 break;
5607 case nir_intrinsic_get_buffer_size:
5608 visit_get_buffer_size(ctx, instr);
5609 break;
5610 case nir_intrinsic_barrier: {
5611 unsigned* bsize = ctx->program->info->cs.block_size;
5612 unsigned workgroup_size = bsize[0] * bsize[1] * bsize[2];
5613 if (workgroup_size > 64)
5614 bld.sopp(aco_opcode::s_barrier);
5615 break;
5616 }
5617 case nir_intrinsic_group_memory_barrier:
5618 case nir_intrinsic_memory_barrier:
5619 case nir_intrinsic_memory_barrier_atomic_counter:
5620 case nir_intrinsic_memory_barrier_buffer:
5621 case nir_intrinsic_memory_barrier_image:
5622 case nir_intrinsic_memory_barrier_shared:
5623 emit_memory_barrier(ctx, instr);
5624 break;
5625 case nir_intrinsic_load_num_work_groups: {
5626 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5627 bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->ac.num_work_groups)));
5628 emit_split_vector(ctx, dst, 3);
5629 break;
5630 }
5631 case nir_intrinsic_load_local_invocation_id: {
5632 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5633 bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->ac.local_invocation_ids)));
5634 emit_split_vector(ctx, dst, 3);
5635 break;
5636 }
5637 case nir_intrinsic_load_work_group_id: {
5638 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5639 struct ac_arg *args = ctx->args->ac.workgroup_ids;
5640 bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
5641 args[0].used ? Operand(get_arg(ctx, args[0])) : Operand(0u),
5642 args[1].used ? Operand(get_arg(ctx, args[1])) : Operand(0u),
5643 args[2].used ? Operand(get_arg(ctx, args[2])) : Operand(0u));
5644 emit_split_vector(ctx, dst, 3);
5645 break;
5646 }
5647 case nir_intrinsic_load_local_invocation_index: {
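           /* flat index = wave_id_in_workgroup * 64 + lane_id: v_mbcnt_{lo,hi} of ~0 yields the
            * lane id, and tg_size bits [11:6] hold the wave id, so masking with 0xfc0 already
            * gives wave_id * 64 without an extra shift */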
5648 Temp id = bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32, bld.def(v1), Operand((uint32_t) -1),
5649 bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, bld.def(v1), Operand((uint32_t) -1), Operand(0u)));
5650 Temp tg_num = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), Operand(0xfc0u),
5651 get_arg(ctx, ctx->args->ac.tg_size));
5652 bld.vop2(aco_opcode::v_or_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), tg_num, id);
5653 break;
5654 }
5655 case nir_intrinsic_load_subgroup_id: {
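           /* the wave id within the workgroup lives in tg_size bits [11:6]; s_bfe_u32 packs the
            * field offset in the low bits of its second operand and the field width in bits
            * [22:16], so 0x6 | (0x6 << 16) extracts those 6 bits in a single instruction */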
5656 if (ctx->stage == compute_cs) {
5657 bld.sop2(aco_opcode::s_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), bld.def(s1, scc),
5658 get_arg(ctx, ctx->args->ac.tg_size), Operand(0x6u | (0x6u << 16)));
5659 } else {
5660 bld.sop1(aco_opcode::s_mov_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), Operand(0x0u));
5661 }
5662 break;
5663 }
5664 case nir_intrinsic_load_subgroup_invocation: {
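           /* counting the set bits of ~0 below the current lane with v_mbcnt_lo/hi gives the lane index */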
5665 bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), Operand((uint32_t) -1),
5666 bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, bld.def(v1), Operand((uint32_t) -1), Operand(0u)));
5667 break;
5668 }
5669 case nir_intrinsic_load_num_subgroups: {
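           /* tg_size bits [5:0] contain the number of waves in the workgroup */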
5670 if (ctx->stage == compute_cs)
5671 bld.sop2(aco_opcode::s_and_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), bld.def(s1, scc), Operand(0x3fu),
5672 get_arg(ctx, ctx->args->ac.tg_size));
5673 else
5674 bld.sop1(aco_opcode::s_mov_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), Operand(0x1u));
5675 break;
5676 }
5677 case nir_intrinsic_ballot: {
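           /* booleans are already per-lane masks and only need to be ANDed with exec;
            * 32/64-bit values are compared against zero to build the mask */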
5678 Definition tmp = bld.def(s2);
5679 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
5680 if (instr->src[0].ssa->bit_size == 1) {
5681 assert(src.regClass() == s2);
5682 bld.sop2(aco_opcode::s_and_b64, tmp, bld.def(s1, scc), Operand(exec, s2), src);
5683 } else if (instr->src[0].ssa->bit_size == 32 && src.regClass() == v1) {
5684 bld.vopc(aco_opcode::v_cmp_lg_u32, tmp, Operand(0u), src);
5685 } else if (instr->src[0].ssa->bit_size == 64 && src.regClass() == v2) {
5686 bld.vopc(aco_opcode::v_cmp_lg_u64, tmp, Operand(0u), src);
5687 } else {
5688 fprintf(stderr, "Unimplemented NIR instr bit size: ");
5689 nir_print_instr(&instr->instr, stderr);
5690 fprintf(stderr, "\n");
5691 }
5692 emit_wqm(ctx, tmp.getTemp(), get_ssa_temp(ctx, &instr->dest.ssa));
5693 break;
5694 }
5695 case nir_intrinsic_shuffle:
5696 case nir_intrinsic_read_invocation: {
5697 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
5698 if (!ctx->divergent_vals[instr->src[0].ssa->index]) {
5699 emit_uniform_subgroup(ctx, instr, src);
5700 } else {
5701 Temp tid = get_ssa_temp(ctx, instr->src[1].ssa);
5702 if (instr->intrinsic == nir_intrinsic_read_invocation || !ctx->divergent_vals[instr->src[1].ssa->index])
5703 tid = bld.as_uniform(tid);
5704 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5705 if (src.regClass() == v1) {
5706 emit_wqm(ctx, emit_bpermute(ctx, bld, tid, src), dst);
5707 } else if (src.regClass() == v2) {
5708 Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
5709 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
5710 lo = emit_wqm(ctx, emit_bpermute(ctx, bld, tid, lo));
5711 hi = emit_wqm(ctx, emit_bpermute(ctx, bld, tid, hi));
5712 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
5713 emit_split_vector(ctx, dst, 2);
5714 } else if (instr->dest.ssa.bit_size == 1 && tid.regClass() == s1) {
5715 assert(src.regClass() == s2);
5716 Temp tmp = bld.sopc(aco_opcode::s_bitcmp1_b64, bld.def(s1, scc), src, tid);
5717 bool_to_vector_condition(ctx, emit_wqm(ctx, tmp), dst);
5718 } else if (instr->dest.ssa.bit_size == 1 && tid.regClass() == v1) {
5719 assert(src.regClass() == s2);
5720 Temp tmp = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), tid, src);
5721 tmp = emit_extract_vector(ctx, tmp, 0, v1);
5722 tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(1u), tmp);
5723 emit_wqm(ctx, bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(s2), Operand(0u), tmp), dst);
5724 } else {
5725 fprintf(stderr, "Unimplemented NIR instr bit size: ");
5726 nir_print_instr(&instr->instr, stderr);
5727 fprintf(stderr, "\n");
5728 }
5729 }
5730 break;
5731 }
5732 case nir_intrinsic_load_sample_id: {
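           /* the sample id is stored in bits [11:8] of the ancillary VGPR */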
5733 bld.vop3(aco_opcode::v_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
5734 get_arg(ctx, ctx->args->ac.ancillary), Operand(8u), Operand(4u));
5735 break;
5736 }
5737 case nir_intrinsic_load_sample_mask_in: {
5738 visit_load_sample_mask_in(ctx, instr);
5739 break;
5740 }
5741 case nir_intrinsic_read_first_invocation: {
5742 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
5743 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5744 if (src.regClass() == v1) {
5745 emit_wqm(ctx,
5746 bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), src),
5747 dst);
5748 } else if (src.regClass() == v2) {
5749 Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
5750 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
5751 lo = emit_wqm(ctx, bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), lo));
5752 hi = emit_wqm(ctx, bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), hi));
5753 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
5754 emit_split_vector(ctx, dst, 2);
5755 } else if (instr->dest.ssa.bit_size == 1) {
5756 assert(src.regClass() == s2);
5757 Temp tmp = bld.sopc(aco_opcode::s_bitcmp1_b64, bld.def(s1, scc), src,
5758 bld.sop1(aco_opcode::s_ff1_i32_b64, bld.def(s1), Operand(exec, s2)));
5759 bool_to_vector_condition(ctx, emit_wqm(ctx, tmp), dst);
5760 } else if (src.regClass() == s1) {
5761 bld.sop1(aco_opcode::s_mov_b32, Definition(dst), src);
5762 } else if (src.regClass() == s2) {
5763 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src);
5764 } else {
5765 fprintf(stderr, "Unimplemented NIR instr bit size: ");
5766 nir_print_instr(&instr->instr, stderr);
5767 fprintf(stderr, "\n");
5768 }
5769 break;
5770 }
5771 case nir_intrinsic_vote_all: {
5772 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
5773 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5774 assert(src.regClass() == s2);
5775 assert(dst.regClass() == s2);
5776
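           /* all active lanes voted true iff (exec & ~src) == 0: s_andn2 sets scc when its
            * result is non-zero, so select -1 only when scc is clear */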
5777 Temp tmp = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.def(s1, scc), Operand(exec, s2), src).def(1).getTemp();
5778 Temp val = bld.sop2(aco_opcode::s_cselect_b64, bld.def(s2), Operand(0u), Operand(-1u), bld.scc(tmp));
5779 emit_wqm(ctx, val, dst);
5780 break;
5781 }
5782 case nir_intrinsic_vote_any: {
5783 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
5784 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5785 assert(src.regClass() == s2);
5786 assert(dst.regClass() == s2);
5787
5788 Temp tmp = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), Operand(exec, s2), src).def(1).getTemp();
5789 Temp val = bld.sop2(aco_opcode::s_cselect_b64, bld.def(s2), Operand(-1u), Operand(0u), bld.scc(tmp));
5790 emit_wqm(ctx, val, dst);
5791 break;
5792 }
5793 case nir_intrinsic_reduce:
5794 case nir_intrinsic_inclusive_scan:
5795 case nir_intrinsic_exclusive_scan: {
5796 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
5797 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5798 nir_op op = (nir_op) nir_intrinsic_reduction_op(instr);
5799 unsigned cluster_size = instr->intrinsic == nir_intrinsic_reduce ?
5800 nir_intrinsic_cluster_size(instr) : 0;
5801 cluster_size = util_next_power_of_two(MIN2(cluster_size ? cluster_size : 64, 64));
5802
5803 if (!ctx->divergent_vals[instr->src[0].ssa->index] && (op == nir_op_ior || op == nir_op_iand)) {
5804 emit_uniform_subgroup(ctx, instr, src);
5805 } else if (instr->dest.ssa.bit_size == 1) {
5806 if (op == nir_op_imul || op == nir_op_umin || op == nir_op_imin)
5807 op = nir_op_iand;
5808 else if (op == nir_op_iadd)
5809 op = nir_op_ixor;
5810 else if (op == nir_op_umax || op == nir_op_imax)
5811 op = nir_op_ior;
5812 assert(op == nir_op_iand || op == nir_op_ior || op == nir_op_ixor);
5813
5814 switch (instr->intrinsic) {
5815 case nir_intrinsic_reduce:
5816 emit_wqm(ctx, emit_boolean_reduce(ctx, op, cluster_size, src), dst);
5817 break;
5818 case nir_intrinsic_exclusive_scan:
5819 emit_wqm(ctx, emit_boolean_exclusive_scan(ctx, op, src), dst);
5820 break;
5821 case nir_intrinsic_inclusive_scan:
5822 emit_wqm(ctx, emit_boolean_inclusive_scan(ctx, op, src), dst);
5823 break;
5824 default:
5825 assert(false);
5826 }
5827 } else if (cluster_size == 1) {
5828 bld.copy(Definition(dst), src);
5829 } else {
5830 src = as_vgpr(ctx, src);
5831
5832 ReduceOp reduce_op;
5833 switch (op) {
5834 #define CASE(name) case nir_op_##name: reduce_op = (src.regClass() == v1) ? name##32 : name##64; break;
5835 CASE(iadd)
5836 CASE(imul)
5837 CASE(fadd)
5838 CASE(fmul)
5839 CASE(imin)
5840 CASE(umin)
5841 CASE(fmin)
5842 CASE(imax)
5843 CASE(umax)
5844 CASE(fmax)
5845 CASE(iand)
5846 CASE(ior)
5847 CASE(ixor)
5848 default:
5849 unreachable("unknown reduction op");
5850 #undef CASE
5851 }
5852
5853 aco_opcode aco_op;
5854 switch (instr->intrinsic) {
5855 case nir_intrinsic_reduce: aco_op = aco_opcode::p_reduce; break;
5856 case nir_intrinsic_inclusive_scan: aco_op = aco_opcode::p_inclusive_scan; break;
5857 case nir_intrinsic_exclusive_scan: aco_op = aco_opcode::p_exclusive_scan; break;
5858 default:
5859 unreachable("unknown reduce intrinsic");
5860 }
5861
5862 aco_ptr<Pseudo_reduction_instruction> reduce{create_instruction<Pseudo_reduction_instruction>(aco_op, Format::PSEUDO_REDUCTION, 3, 5)};
5863 reduce->operands[0] = Operand(src);
5864 // filled in by aco_reduce_assign.cpp, used internally as part of the
5865 // reduce sequence
5866 assert(dst.size() == 1 || dst.size() == 2);
5867 reduce->operands[1] = Operand(RegClass(RegType::vgpr, dst.size()).as_linear());
5868 reduce->operands[2] = Operand(v1.as_linear());
5869
5870 Temp tmp_dst = bld.tmp(dst.regClass());
5871 reduce->definitions[0] = Definition(tmp_dst);
5872 reduce->definitions[1] = bld.def(s2); // used internally
5873 reduce->definitions[2] = Definition();
5874 reduce->definitions[3] = Definition(scc, s1);
5875 reduce->definitions[4] = Definition();
5876 reduce->reduce_op = reduce_op;
5877 reduce->cluster_size = cluster_size;
5878 ctx->block->instructions.emplace_back(std::move(reduce));
5879
5880 emit_wqm(ctx, tmp_dst, dst);
5881 }
5882 break;
5883 }
5884 case nir_intrinsic_quad_broadcast: {
5885 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
5886 if (!ctx->divergent_vals[instr->dest.ssa.index]) {
5887 emit_uniform_subgroup(ctx, instr, src);
5888 } else {
5889 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5890 unsigned lane = nir_src_as_const_value(instr->src[1])->u32;
5891 if (instr->dest.ssa.bit_size == 1) {
5892 assert(src.regClass() == s2);
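               /* 0x11111111 << lane has one bit set per quad, at the requested lane; ANDing it
                * with (src & exec) picks that lane's boolean, and s_wqm_b64 then replicates each
                * quad's bit to all four lanes of the quad */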
5893 uint32_t half_mask = 0x11111111u << lane;
5894 Temp mask_tmp = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(half_mask), Operand(half_mask));
5895 Temp tmp = bld.tmp(s2);
5896 bld.sop1(aco_opcode::s_wqm_b64, Definition(tmp),
5897 bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), mask_tmp,
5898 bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2))));
5899 emit_wqm(ctx, tmp, dst);
5900 } else if (instr->dest.ssa.bit_size == 32) {
5901 emit_wqm(ctx,
5902 bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src,
5903 dpp_quad_perm(lane, lane, lane, lane)),
5904 dst);
5905 } else if (instr->dest.ssa.bit_size == 64) {
5906 Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
5907 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
5908 lo = emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), lo, dpp_quad_perm(lane, lane, lane, lane)));
5909 hi = emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), hi, dpp_quad_perm(lane, lane, lane, lane)));
5910 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
5911 emit_split_vector(ctx, dst, 2);
5912 } else {
5913 fprintf(stderr, "Unimplemented NIR instr bit size: ");
5914 nir_print_instr(&instr->instr, stderr);
5915 fprintf(stderr, "\n");
5916 }
5917 }
5918 break;
5919 }
5920 case nir_intrinsic_quad_swap_horizontal:
5921 case nir_intrinsic_quad_swap_vertical:
5922 case nir_intrinsic_quad_swap_diagonal:
5923 case nir_intrinsic_quad_swizzle_amd: {
5924 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
5925 if (!ctx->divergent_vals[instr->dest.ssa.index]) {
5926 emit_uniform_subgroup(ctx, instr, src);
5927 break;
5928 }
5929 uint16_t dpp_ctrl = 0;
5930 switch (instr->intrinsic) {
5931 case nir_intrinsic_quad_swap_horizontal:
5932 dpp_ctrl = dpp_quad_perm(1, 0, 3, 2);
5933 break;
5934 case nir_intrinsic_quad_swap_vertical:
5935 dpp_ctrl = dpp_quad_perm(2, 3, 0, 1);
5936 break;
5937 case nir_intrinsic_quad_swap_diagonal:
5938 dpp_ctrl = dpp_quad_perm(3, 2, 1, 0);
5939 break;
5940 case nir_intrinsic_quad_swizzle_amd: {
5941 dpp_ctrl = nir_intrinsic_swizzle_mask(instr);
5942 break;
5943 }
5944 default:
5945 break;
5946 }
5947
5948 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5949 if (instr->dest.ssa.bit_size == 1) {
5950 assert(src.regClass() == s2);
5951 src = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), Operand((uint32_t)-1), src);
5952 src = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl);
5953 Temp tmp = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(s2), Operand(0u), src);
5954 emit_wqm(ctx, tmp, dst);
5955 } else if (instr->dest.ssa.bit_size == 32) {
5956 Temp tmp = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl);
5957 emit_wqm(ctx, tmp, dst);
5958 } else if (instr->dest.ssa.bit_size == 64) {
5959 Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
5960 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
5961 lo = emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), lo, dpp_ctrl));
5962 hi = emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), hi, dpp_ctrl));
5963 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
5964 emit_split_vector(ctx, dst, 2);
5965 } else {
5966 fprintf(stderr, "Unimplemented NIR instr bit size: ");
5967 nir_print_instr(&instr->instr, stderr);
5968 fprintf(stderr, "\n");
5969 }
5970 break;
5971 }
5972 case nir_intrinsic_masked_swizzle_amd: {
5973 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
5974 if (!ctx->divergent_vals[instr->dest.ssa.index]) {
5975 emit_uniform_subgroup(ctx, instr, src);
5976 break;
5977 }
5978 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5979 uint32_t mask = nir_intrinsic_swizzle_mask(instr);
5980 if (dst.regClass() == v1) {
5981 emit_wqm(ctx,
5982 bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, mask, 0, false),
5983 dst);
5984 } else if (dst.regClass() == v2) {
5985 Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
5986 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
5987 lo = emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), lo, mask, 0, false));
5988 hi = emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), hi, mask, 0, false));
5989 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
5990 emit_split_vector(ctx, dst, 2);
5991 } else {
5992 fprintf(stderr, "Unimplemented NIR instr bit size: ");
5993 nir_print_instr(&instr->instr, stderr);
5994 fprintf(stderr, "\n");
5995 }
5996 break;
5997 }
5998 case nir_intrinsic_write_invocation_amd: {
5999 Temp src = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
6000 Temp val = bld.as_uniform(get_ssa_temp(ctx, instr->src[1].ssa));
6001 Temp lane = bld.as_uniform(get_ssa_temp(ctx, instr->src[2].ssa));
6002 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6003 if (dst.regClass() == v1) {
6004 /* src2 is ignored for writelane. RA assigns the same reg for dst */
6005 emit_wqm(ctx, bld.vop3(aco_opcode::v_writelane_b32, bld.def(v1), val, lane, src), dst);
6006 } else if (dst.regClass() == v2) {
6007 Temp src_lo = bld.tmp(v1), src_hi = bld.tmp(v1);
6008 Temp val_lo = bld.tmp(s1), val_hi = bld.tmp(s1);
6009 bld.pseudo(aco_opcode::p_split_vector, Definition(src_lo), Definition(src_hi), src);
6010 bld.pseudo(aco_opcode::p_split_vector, Definition(val_lo), Definition(val_hi), val);
6011          Temp lo = emit_wqm(ctx, bld.vop3(aco_opcode::v_writelane_b32, bld.def(v1), val_lo, lane, src_lo));

6012 Temp hi = emit_wqm(ctx, bld.vop3(aco_opcode::v_writelane_b32, bld.def(v1), val_hi, lane, src_hi));
6013 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
6014 emit_split_vector(ctx, dst, 2);
6015 } else {
6016 fprintf(stderr, "Unimplemented NIR instr bit size: ");
6017 nir_print_instr(&instr->instr, stderr);
6018 fprintf(stderr, "\n");
6019 }
6020 break;
6021 }
6022 case nir_intrinsic_mbcnt_amd: {
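           /* count the bits of the 64-bit mask below the current lane: v_mbcnt_lo handles the
            * low half and v_mbcnt_hi adds the high half on top of it */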
6023 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
6024 RegClass rc = RegClass(src.type(), 1);
6025 Temp mask_lo = bld.tmp(rc), mask_hi = bld.tmp(rc);
6026 bld.pseudo(aco_opcode::p_split_vector, Definition(mask_lo), Definition(mask_hi), src);
6027 Temp tmp = bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, bld.def(v1), mask_lo, Operand(0u));
6028 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6029 Temp wqm_tmp = bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32, bld.def(v1), mask_hi, tmp);
6030 emit_wqm(ctx, wqm_tmp, dst);
6031 break;
6032 }
6033 case nir_intrinsic_load_helper_invocation: {
6034 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6035 bld.pseudo(aco_opcode::p_load_helper, Definition(dst));
6036 ctx->block->kind |= block_kind_needs_lowering;
6037 ctx->program->needs_exact = true;
6038 break;
6039 }
6040 case nir_intrinsic_is_helper_invocation: {
6041 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6042 bld.pseudo(aco_opcode::p_is_helper, Definition(dst));
6043 ctx->block->kind |= block_kind_needs_lowering;
6044 ctx->program->needs_exact = true;
6045 break;
6046 }
6047 case nir_intrinsic_demote:
6048 bld.pseudo(aco_opcode::p_demote_to_helper);
6049 ctx->block->kind |= block_kind_uses_demote;
6050 ctx->program->needs_exact = true;
6051 break;
6052 case nir_intrinsic_demote_if: {
6053 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
6054 assert(src.regClass() == s2);
6055 Temp cond = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2));
6056 bld.pseudo(aco_opcode::p_demote_to_helper, cond);
6057 ctx->block->kind |= block_kind_uses_demote;
6058 ctx->program->needs_exact = true;
6059 break;
6060 }
6061 case nir_intrinsic_first_invocation: {
6062 emit_wqm(ctx, bld.sop1(aco_opcode::s_ff1_i32_b64, bld.def(s1), Operand(exec, s2)),
6063 get_ssa_temp(ctx, &instr->dest.ssa));
6064 break;
6065 }
6066 case nir_intrinsic_shader_clock:
6067 bld.smem(aco_opcode::s_memtime, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), false);
6068 emit_split_vector(ctx, get_ssa_temp(ctx, &instr->dest.ssa), 2);
6069 break;
6070 case nir_intrinsic_load_vertex_id_zero_base: {
6071 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6072 bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.vertex_id));
6073 break;
6074 }
6075 case nir_intrinsic_load_first_vertex: {
6076 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6077 bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.base_vertex));
6078 break;
6079 }
6080 case nir_intrinsic_load_base_instance: {
6081 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6082 bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.start_instance));
6083 break;
6084 }
6085 case nir_intrinsic_load_instance_id: {
6086 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6087 bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.instance_id));
6088 break;
6089 }
6090 case nir_intrinsic_load_draw_id: {
6091 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6092 bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.draw_id));
6093 break;
6094 }
6095 default:
6096 fprintf(stderr, "Unimplemented intrinsic instr: ");
6097 nir_print_instr(&instr->instr, stderr);
6098 fprintf(stderr, "\n");
6099 abort();
6100
6101 break;
6102 }
6103 }
6104
6105
6106 void tex_fetch_ptrs(isel_context *ctx, nir_tex_instr *instr,
6107 Temp *res_ptr, Temp *samp_ptr, Temp *fmask_ptr,
6108 enum glsl_base_type *stype)
6109 {
6110 nir_deref_instr *texture_deref_instr = NULL;
6111 nir_deref_instr *sampler_deref_instr = NULL;
6112 int plane = -1;
6113
6114 for (unsigned i = 0; i < instr->num_srcs; i++) {
6115 switch (instr->src[i].src_type) {
6116 case nir_tex_src_texture_deref:
6117 texture_deref_instr = nir_src_as_deref(instr->src[i].src);
6118 break;
6119 case nir_tex_src_sampler_deref:
6120 sampler_deref_instr = nir_src_as_deref(instr->src[i].src);
6121 break;
6122 case nir_tex_src_plane:
6123 plane = nir_src_as_int(instr->src[i].src);
6124 break;
6125 default:
6126 break;
6127 }
6128 }
6129
6130 *stype = glsl_get_sampler_result_type(texture_deref_instr->type);
6131
6132 if (!sampler_deref_instr)
6133 sampler_deref_instr = texture_deref_instr;
6134
6135 if (plane >= 0) {
6136 assert(instr->op != nir_texop_txf_ms &&
6137 instr->op != nir_texop_samples_identical);
6138 assert(instr->sampler_dim != GLSL_SAMPLER_DIM_BUF);
6139 *res_ptr = get_sampler_desc(ctx, texture_deref_instr, (aco_descriptor_type)(ACO_DESC_PLANE_0 + plane), instr, false, false);
6140 } else if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) {
6141 *res_ptr = get_sampler_desc(ctx, texture_deref_instr, ACO_DESC_BUFFER, instr, false, false);
6142 } else {
6143 *res_ptr = get_sampler_desc(ctx, texture_deref_instr, ACO_DESC_IMAGE, instr, false, false);
6144 }
6145 if (samp_ptr) {
6146 *samp_ptr = get_sampler_desc(ctx, sampler_deref_instr, ACO_DESC_SAMPLER, instr, false, false);
6147 if (instr->sampler_dim < GLSL_SAMPLER_DIM_RECT && ctx->options->chip_class < GFX8) {
6148 fprintf(stderr, "Unimplemented sampler descriptor: ");
6149 nir_print_instr(&instr->instr, stderr);
6150 fprintf(stderr, "\n");
6151 abort();
6152 // TODO: build samp_ptr = and(samp_ptr, res_ptr)
6153 }
6154 }
6155 if (fmask_ptr && (instr->op == nir_texop_txf_ms ||
6156 instr->op == nir_texop_samples_identical))
6157 *fmask_ptr = get_sampler_desc(ctx, texture_deref_instr, ACO_DESC_FMASK, instr, false, false);
6158 }
6159
6160 void build_cube_select(isel_context *ctx, Temp ma, Temp id, Temp deriv,
6161 Temp *out_ma, Temp *out_sc, Temp *out_tc)
6162 {
6163 Builder bld(ctx->program, ctx->block);
6164
6165 Temp deriv_x = emit_extract_vector(ctx, deriv, 0, v1);
6166 Temp deriv_y = emit_extract_vector(ctx, deriv, 1, v1);
6167 Temp deriv_z = emit_extract_vector(ctx, deriv, 2, v1);
6168
6169 Operand neg_one(0xbf800000u);
6170 Operand one(0x3f800000u);
6171 Operand two(0x40000000u);
6172 Operand four(0x40800000u);
6173
6174 Temp is_ma_positive = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(s2)), Operand(0u), ma);
6175 Temp sgn_ma = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), neg_one, one, is_ma_positive);
6176 Temp neg_sgn_ma = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), Operand(0u), sgn_ma);
6177
6178 Temp is_ma_z = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(s2)), four, id);
6179 Temp is_ma_y = bld.vopc(aco_opcode::v_cmp_le_f32, bld.def(s2), two, id);
6180 is_ma_y = bld.sop2(aco_opcode::s_andn2_b64, bld.hint_vcc(bld.def(s2)), is_ma_y, is_ma_z);
6181 Temp is_not_ma_x = bld.sop2(aco_opcode::s_or_b64, bld.hint_vcc(bld.def(s2)), bld.def(s1, scc), is_ma_z, is_ma_y);
6182
6183 // select sc
6184 Temp tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), deriv_z, deriv_x, is_not_ma_x);
6185 Temp sgn = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1),
6186 bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), neg_sgn_ma, sgn_ma, is_ma_z),
6187 one, is_ma_y);
6188 *out_sc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), tmp, sgn);
6189
6190 // select tc
6191 tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), deriv_y, deriv_z, is_ma_y);
6192 sgn = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), neg_one, sgn_ma, is_ma_y);
6193 *out_tc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), tmp, sgn);
6194
6195 // select ma
6196 tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
6197 bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), deriv_x, deriv_y, is_ma_y),
6198 deriv_z, is_ma_z);
6199 tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7fffffffu), tmp);
6200 *out_ma = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), two, tmp);
6201 }
6202
6203 void prepare_cube_coords(isel_context *ctx, Temp* coords, Temp* ddx, Temp* ddy, bool is_deriv, bool is_array)
6204 {
6205 Builder bld(ctx->program, ctx->block);
6206 Temp coord_args[4], ma, tc, sc, id;
6207 for (unsigned i = 0; i < (is_array ? 4 : 3); i++)
6208 coord_args[i] = emit_extract_vector(ctx, *coords, i, v1);
6209
6210 if (is_array) {
6211 coord_args[3] = bld.vop1(aco_opcode::v_rndne_f32, bld.def(v1), coord_args[3]);
6212
6213 // see comment in ac_prepare_cube_coords()
6214 if (ctx->options->chip_class <= GFX8)
6215 coord_args[3] = bld.vop2(aco_opcode::v_max_f32, bld.def(v1), Operand(0u), coord_args[3]);
6216 }
6217
6218 ma = bld.vop3(aco_opcode::v_cubema_f32, bld.def(v1), coord_args[0], coord_args[1], coord_args[2]);
6219
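      /* invma = 1 / |ma|: the VOP3 form of v_rcp_f32 is built by hand so the abs input
       * modifier can be set on the source */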
6220 aco_ptr<VOP3A_instruction> vop3a{create_instruction<VOP3A_instruction>(aco_opcode::v_rcp_f32, asVOP3(Format::VOP1), 1, 1)};
6221 vop3a->operands[0] = Operand(ma);
6222 vop3a->abs[0] = true;
6223 Temp invma = bld.tmp(v1);
6224 vop3a->definitions[0] = Definition(invma);
6225 ctx->block->instructions.emplace_back(std::move(vop3a));
6226
6227 sc = bld.vop3(aco_opcode::v_cubesc_f32, bld.def(v1), coord_args[0], coord_args[1], coord_args[2]);
6228 if (!is_deriv)
6229 sc = bld.vop2(aco_opcode::v_madak_f32, bld.def(v1), sc, invma, Operand(0x3fc00000u/*1.5*/));
6230
6231 tc = bld.vop3(aco_opcode::v_cubetc_f32, bld.def(v1), coord_args[0], coord_args[1], coord_args[2]);
6232 if (!is_deriv)
6233 tc = bld.vop2(aco_opcode::v_madak_f32, bld.def(v1), tc, invma, Operand(0x3fc00000u/*1.5*/));
6234
6235 id = bld.vop3(aco_opcode::v_cubeid_f32, bld.def(v1), coord_args[0], coord_args[1], coord_args[2]);
6236
6237 if (is_deriv) {
6238 sc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), sc, invma);
6239 tc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), tc, invma);
6240
6241 for (unsigned i = 0; i < 2; i++) {
6242 // see comment in ac_prepare_cube_coords()
6243 Temp deriv_ma;
6244 Temp deriv_sc, deriv_tc;
6245 build_cube_select(ctx, ma, id, i ? *ddy : *ddx,
6246 &deriv_ma, &deriv_sc, &deriv_tc);
6247
6248 deriv_ma = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_ma, invma);
6249
6250 Temp x = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1),
6251 bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_sc, invma),
6252 bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_ma, sc));
6253 Temp y = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1),
6254 bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_tc, invma),
6255 bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_ma, tc));
6256 *(i ? ddy : ddx) = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), x, y);
6257 }
6258
6259 sc = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), Operand(0x3fc00000u/*1.5*/), sc);
6260 tc = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), Operand(0x3fc00000u/*1.5*/), tc);
6261 }
6262
6263 if (is_array)
6264 id = bld.vop2(aco_opcode::v_madmk_f32, bld.def(v1), coord_args[3], id, Operand(0x41000000u/*8.0*/));
6265 *coords = bld.pseudo(aco_opcode::p_create_vector, bld.def(v3), sc, tc, id);
6266
6267 }
6268
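   /* Round the array-slice component 'idx' of a coordinate vector to the nearest integer
    * (v_rndne_f32) and rebuild the vector. */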
6269 Temp apply_round_slice(isel_context *ctx, Temp coords, unsigned idx)
6270 {
6271 Temp coord_vec[3];
6272 for (unsigned i = 0; i < coords.size(); i++)
6273 coord_vec[i] = emit_extract_vector(ctx, coords, i, v1);
6274
6275 Builder bld(ctx->program, ctx->block);
6276 coord_vec[idx] = bld.vop1(aco_opcode::v_rndne_f32, bld.def(v1), coord_vec[idx]);
6277
6278 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, coords.size(), 1)};
6279 for (unsigned i = 0; i < coords.size(); i++)
6280 vec->operands[i] = Operand(coord_vec[i]);
6281 Temp res = bld.tmp(RegType::vgpr, coords.size());
6282 vec->definitions[0] = Definition(res);
6283 ctx->block->instructions.emplace_back(std::move(vec));
6284 return res;
6285 }
6286
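   /* If 'vec' is a nir vecN whose sources are constants, store those constants in cv[]
    * (NULL for components that are not known constants). */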
6287 void get_const_vec(nir_ssa_def *vec, nir_const_value *cv[4])
6288 {
6289 if (vec->parent_instr->type != nir_instr_type_alu)
6290 return;
6291 nir_alu_instr *vec_instr = nir_instr_as_alu(vec->parent_instr);
6292 if (vec_instr->op != nir_op_vec(vec->num_components))
6293 return;
6294
6295 for (unsigned i = 0; i < vec->num_components; i++) {
6296 cv[i] = vec_instr->src[i].swizzle[0] == 0 ?
6297 nir_src_as_const_value(vec_instr->src[i].src) : NULL;
6298 }
6299 }
6300
6301 void visit_tex(isel_context *ctx, nir_tex_instr *instr)
6302 {
6303 Builder bld(ctx->program, ctx->block);
6304 bool has_bias = false, has_lod = false, level_zero = false, has_compare = false,
6305 has_offset = false, has_ddx = false, has_ddy = false, has_derivs = false, has_sample_index = false;
6306 Temp resource, sampler, fmask_ptr, bias = Temp(), coords, compare = Temp(), sample_index = Temp(),
6307 lod = Temp(), offset = Temp(), ddx = Temp(), ddy = Temp(), derivs = Temp();
6308 nir_const_value *sample_index_cv = NULL;
6309 nir_const_value *const_offset[4] = {NULL, NULL, NULL, NULL};
6310 enum glsl_base_type stype;
6311 tex_fetch_ptrs(ctx, instr, &resource, &sampler, &fmask_ptr, &stype);
6312
6313 bool tg4_integer_workarounds = ctx->options->chip_class <= GFX8 && instr->op == nir_texop_tg4 &&
6314 (stype == GLSL_TYPE_UINT || stype == GLSL_TYPE_INT);
6315 bool tg4_integer_cube_workaround = tg4_integer_workarounds &&
6316 instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE;
6317
6318 for (unsigned i = 0; i < instr->num_srcs; i++) {
6319 switch (instr->src[i].src_type) {
6320 case nir_tex_src_coord:
6321 coords = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[i].src.ssa));
6322 break;
6323 case nir_tex_src_bias:
6324 if (instr->op == nir_texop_txb) {
6325 bias = get_ssa_temp(ctx, instr->src[i].src.ssa);
6326 has_bias = true;
6327 }
6328 break;
6329 case nir_tex_src_lod: {
6330 nir_const_value *val = nir_src_as_const_value(instr->src[i].src);
6331
6332 if (val && val->f32 <= 0.0) {
6333 level_zero = true;
6334 } else {
6335 lod = get_ssa_temp(ctx, instr->src[i].src.ssa);
6336 has_lod = true;
6337 }
6338 break;
6339 }
6340 case nir_tex_src_comparator:
6341 if (instr->is_shadow) {
6342 compare = get_ssa_temp(ctx, instr->src[i].src.ssa);
6343 has_compare = true;
6344 }
6345 break;
6346 case nir_tex_src_offset:
6347 offset = get_ssa_temp(ctx, instr->src[i].src.ssa);
6348 get_const_vec(instr->src[i].src.ssa, const_offset);
6349 has_offset = true;
6350 break;
6351 case nir_tex_src_ddx:
6352 ddx = get_ssa_temp(ctx, instr->src[i].src.ssa);
6353 has_ddx = true;
6354 break;
6355 case nir_tex_src_ddy:
6356 ddy = get_ssa_temp(ctx, instr->src[i].src.ssa);
6357 has_ddy = true;
6358 break;
6359 case nir_tex_src_ms_index:
6360 sample_index = get_ssa_temp(ctx, instr->src[i].src.ssa);
6361 sample_index_cv = nir_src_as_const_value(instr->src[i].src);
6362 has_sample_index = true;
6363 break;
6364 case nir_tex_src_texture_offset:
6365 case nir_tex_src_sampler_offset:
6366 default:
6367 break;
6368 }
6369 }
6370 // TODO: all other cases: structure taken from ac_nir_to_llvm.c
6371 if (instr->op == nir_texop_txs && instr->sampler_dim == GLSL_SAMPLER_DIM_BUF)
6372 return get_buffer_size(ctx, resource, get_ssa_temp(ctx, &instr->dest.ssa), true);
6373
6374 if (instr->op == nir_texop_texture_samples) {
6375 Temp dword3 = emit_extract_vector(ctx, resource, 3, s1);
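         /* dword3 of the image descriptor: bits [19:16] hold log2(samples) for MSAA resources
          * and bits [31:28] hold the resource type; types >= 14 are the MSAA 2D (array) types,
          * for everything else the sample count is 1 */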
6376
6377 Temp samples_log2 = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), dword3, Operand(16u | 4u<<16));
6378 Temp samples = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), Operand(1u), samples_log2);
6379 Temp type = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), dword3, Operand(28u | 4u<<16 /* offset=28, width=4 */));
6380 Temp is_msaa = bld.sopc(aco_opcode::s_cmp_ge_u32, bld.def(s1, scc), type, Operand(14u));
6381
6382 bld.sop2(aco_opcode::s_cselect_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
6383 samples, Operand(1u), bld.scc(is_msaa));
6384 return;
6385 }
6386
6387 if (has_offset && instr->op != nir_texop_txf && instr->op != nir_texop_txf_ms) {
6388 aco_ptr<Instruction> tmp_instr;
6389 Temp acc, pack = Temp();
6390
6391 uint32_t pack_const = 0;
6392 for (unsigned i = 0; i < offset.size(); i++) {
6393 if (!const_offset[i])
6394 continue;
6395 pack_const |= (const_offset[i]->u32 & 0x3Fu) << (8u * i);
6396 }
6397
6398 if (offset.type() == RegType::sgpr) {
6399 for (unsigned i = 0; i < offset.size(); i++) {
6400 if (const_offset[i])
6401 continue;
6402
6403 acc = emit_extract_vector(ctx, offset, i, s1);
6404 acc = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), acc, Operand(0x3Fu));
6405
6406 if (i) {
6407 acc = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), acc, Operand(8u * i));
6408 }
6409
6410 if (pack == Temp()) {
6411 pack = acc;
6412 } else {
6413 pack = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), pack, acc);
6414 }
6415 }
6416
6417 if (pack_const && pack != Temp())
6418 pack = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), Operand(pack_const), pack);
6419 } else {
6420 for (unsigned i = 0; i < offset.size(); i++) {
6421 if (const_offset[i])
6422 continue;
6423
6424 acc = emit_extract_vector(ctx, offset, i, v1);
6425 acc = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x3Fu), acc);
6426
6427 if (i) {
6428 acc = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(8u * i), acc);
6429 }
6430
6431 if (pack == Temp()) {
6432 pack = acc;
6433 } else {
6434 pack = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), pack, acc);
6435 }
6436 }
6437
6438 if (pack_const && pack != Temp())
6439             pack = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand(pack_const), pack);
6440 }
6441 if (pack_const && pack == Temp())
6442 offset = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(pack_const));
6443 else if (pack == Temp())
6444 has_offset = false;
6445 else
6446 offset = pack;
6447 }
6448
6449 if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE && instr->coord_components)
6450 prepare_cube_coords(ctx, &coords, &ddx, &ddy, instr->op == nir_texop_txd, instr->is_array && instr->op != nir_texop_lod);
6451
6452 /* pack derivatives */
6453 if (has_ddx || has_ddy) {
6454 if (instr->sampler_dim == GLSL_SAMPLER_DIM_1D && ctx->options->chip_class == GFX9) {
6455 derivs = bld.pseudo(aco_opcode::p_create_vector, bld.def(v4),
6456 ddx, Operand(0u), ddy, Operand(0u));
6457 } else {
6458 derivs = bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, ddx.size() + ddy.size()), ddx, ddy);
6459 }
6460 has_derivs = true;
6461 }
6462
6463 if (instr->coord_components > 1 &&
6464 instr->sampler_dim == GLSL_SAMPLER_DIM_1D &&
6465 instr->is_array &&
6466 instr->op != nir_texop_txf)
6467 coords = apply_round_slice(ctx, coords, 1);
6468
6469 if (instr->coord_components > 2 &&
6470 (instr->sampler_dim == GLSL_SAMPLER_DIM_2D ||
6471 instr->sampler_dim == GLSL_SAMPLER_DIM_MS ||
6472 instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS ||
6473 instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS) &&
6474 instr->is_array &&
6475 instr->op != nir_texop_txf && instr->op != nir_texop_txf_ms)
6476 coords = apply_round_slice(ctx, coords, 2);
6477
6478 if (ctx->options->chip_class == GFX9 &&
6479 instr->sampler_dim == GLSL_SAMPLER_DIM_1D &&
6480 instr->op != nir_texop_lod && instr->coord_components) {
6481 assert(coords.size() > 0 && coords.size() < 3);
6482
6483 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, coords.size() + 1, 1)};
6484 vec->operands[0] = Operand(emit_extract_vector(ctx, coords, 0, v1));
6485 vec->operands[1] = instr->op == nir_texop_txf ? Operand((uint32_t) 0) : Operand((uint32_t) 0x3f000000);
6486 if (coords.size() > 1)
6487 vec->operands[2] = Operand(emit_extract_vector(ctx, coords, 1, v1));
6488 coords = bld.tmp(RegType::vgpr, coords.size() + 1);
6489 vec->definitions[0] = Definition(coords);
6490 ctx->block->instructions.emplace_back(std::move(vec));
6491 }
6492
6493 bool da = should_declare_array(ctx, instr->sampler_dim, instr->is_array);
6494
6495 if (instr->op == nir_texop_samples_identical)
6496 resource = fmask_ptr;
6497
6498 else if ((instr->sampler_dim == GLSL_SAMPLER_DIM_MS ||
6499 instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS) &&
6500 instr->op != nir_texop_txs) {
6501 assert(has_sample_index);
6502 Operand op(sample_index);
6503 if (sample_index_cv)
6504 op = Operand(sample_index_cv->u32);
6505 sample_index = adjust_sample_index_using_fmask(ctx, da, coords, op, fmask_ptr);
6506 }
6507
6508 if (has_offset && (instr->op == nir_texop_txf || instr->op == nir_texop_txf_ms)) {
6509 Temp split_coords[coords.size()];
6510 emit_split_vector(ctx, coords, coords.size());
6511 for (unsigned i = 0; i < coords.size(); i++)
6512 split_coords[i] = emit_extract_vector(ctx, coords, i, v1);
6513
6514 unsigned i = 0;
6515 for (; i < std::min(offset.size(), instr->coord_components); i++) {
6516 Temp off = emit_extract_vector(ctx, offset, i, v1);
6517 split_coords[i] = bld.vadd32(bld.def(v1), split_coords[i], off);
6518 }
6519
6520 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, coords.size(), 1)};
6521 for (unsigned i = 0; i < coords.size(); i++)
6522 vec->operands[i] = Operand(split_coords[i]);
6523 coords = bld.tmp(coords.regClass());
6524 vec->definitions[0] = Definition(coords);
6525 ctx->block->instructions.emplace_back(std::move(vec));
6526
6527 has_offset = false;
6528 }
6529
6530 /* Build tex instruction */
6531 unsigned dmask = nir_ssa_def_components_read(&instr->dest.ssa);
6532 unsigned dim = ctx->options->chip_class >= GFX10 && instr->sampler_dim != GLSL_SAMPLER_DIM_BUF
6533 ? ac_get_sampler_dim(ctx->options->chip_class, instr->sampler_dim, instr->is_array)
6534 : 0;
6535 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6536 Temp tmp_dst = dst;
6537
6538 /* gather4 selects the component by dmask and always returns vec4 */
6539 if (instr->op == nir_texop_tg4) {
6540 assert(instr->dest.ssa.num_components == 4);
6541 if (instr->is_shadow)
6542 dmask = 1;
6543 else
6544 dmask = 1 << instr->component;
6545 if (tg4_integer_cube_workaround || dst.type() == RegType::sgpr)
6546 tmp_dst = bld.tmp(v4);
6547 } else if (instr->op == nir_texop_samples_identical) {
6548 tmp_dst = bld.tmp(v1);
6549 } else if (util_bitcount(dmask) != instr->dest.ssa.num_components || dst.type() == RegType::sgpr) {
6550 tmp_dst = bld.tmp(RegClass(RegType::vgpr, util_bitcount(dmask)));
6551 }
6552
6553 aco_ptr<MIMG_instruction> tex;
6554 if (instr->op == nir_texop_txs || instr->op == nir_texop_query_levels) {
6555 if (!has_lod)
6556 lod = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0u));
6557
6558 bool div_by_6 = instr->op == nir_texop_txs &&
6559 instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE &&
6560 instr->is_array &&
6561 (dmask & (1 << 2));
6562 if (tmp_dst.id() == dst.id() && div_by_6)
6563 tmp_dst = bld.tmp(tmp_dst.regClass());
6564
6565 tex.reset(create_instruction<MIMG_instruction>(aco_opcode::image_get_resinfo, Format::MIMG, 2, 1));
6566 tex->operands[0] = Operand(as_vgpr(ctx,lod));
6567 tex->operands[1] = Operand(resource);
6568 if (ctx->options->chip_class == GFX9 &&
6569 instr->op == nir_texop_txs &&
6570 instr->sampler_dim == GLSL_SAMPLER_DIM_1D &&
6571 instr->is_array) {
6572 tex->dmask = (dmask & 0x1) | ((dmask & 0x2) << 1);
6573 } else if (instr->op == nir_texop_query_levels) {
6574 tex->dmask = 1 << 3;
6575 } else {
6576 tex->dmask = dmask;
6577 }
6578 tex->da = da;
6579 tex->definitions[0] = Definition(tmp_dst);
6580 tex->dim = dim;
6581 tex->can_reorder = true;
6582 ctx->block->instructions.emplace_back(std::move(tex));
6583
6584 if (div_by_6) {
6585 /* divide 3rd value by 6 by multiplying with magic number */
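            /* 0x2AAAAAAB is roughly 2^32 / 6, so v_mul_hi_i32 yields layers / 6
             * (a cube array reports layers * 6 faces in the descriptor) */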
6586 emit_split_vector(ctx, tmp_dst, tmp_dst.size());
6587 Temp c = bld.copy(bld.def(s1), Operand((uint32_t) 0x2AAAAAAB));
6588 Temp by_6 = bld.vop3(aco_opcode::v_mul_hi_i32, bld.def(v1), emit_extract_vector(ctx, tmp_dst, 2, v1), c);
6589 assert(instr->dest.ssa.num_components == 3);
6590 Temp tmp = dst.type() == RegType::vgpr ? dst : bld.tmp(v3);
6591 tmp_dst = bld.pseudo(aco_opcode::p_create_vector, Definition(tmp),
6592 emit_extract_vector(ctx, tmp_dst, 0, v1),
6593 emit_extract_vector(ctx, tmp_dst, 1, v1),
6594 by_6);
6595
6596 }
6597
6598 expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, dmask);
6599 return;
6600 }
6601
6602 Temp tg4_compare_cube_wa64 = Temp();
6603
6604 if (tg4_integer_workarounds) {
6605 tex.reset(create_instruction<MIMG_instruction>(aco_opcode::image_get_resinfo, Format::MIMG, 2, 1));
6606 tex->operands[0] = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0u));
6607 tex->operands[1] = Operand(resource);
6608 tex->dim = dim;
6609 tex->dmask = 0x3;
6610 tex->da = da;
6611 Temp size = bld.tmp(v2);
6612 tex->definitions[0] = Definition(size);
6613 tex->can_reorder = true;
6614 ctx->block->instructions.emplace_back(std::move(tex));
6615 emit_split_vector(ctx, size, size.size());
6616
6617 Temp half_texel[2];
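         /* half_texel[i] = -0.5 / size[i]: nudge the gather coordinates by half a texel to
          * work around incorrect rounding for integer formats on <= GFX8 (see the comment in
          * ac_nir_to_llvm.c's lower_gather4_integer()) */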
6618 for (unsigned i = 0; i < 2; i++) {
6619 half_texel[i] = emit_extract_vector(ctx, size, i, v1);
6620 half_texel[i] = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), half_texel[i]);
6621 half_texel[i] = bld.vop1(aco_opcode::v_rcp_iflag_f32, bld.def(v1), half_texel[i]);
6622 half_texel[i] = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0xbf000000/*-0.5*/), half_texel[i]);
6623 }
6624
6625 Temp orig_coords[2] = {
6626 emit_extract_vector(ctx, coords, 0, v1),
6627 emit_extract_vector(ctx, coords, 1, v1)};
6628 Temp new_coords[2] = {
6629 bld.vop2(aco_opcode::v_add_f32, bld.def(v1), orig_coords[0], half_texel[0]),
6630 bld.vop2(aco_opcode::v_add_f32, bld.def(v1), orig_coords[1], half_texel[1])
6631 };
6632
6633 if (tg4_integer_cube_workaround) {
6634 // see comment in ac_nir_to_llvm.c's lower_gather4_integer()
6635 Temp desc[resource.size()];
6636 aco_ptr<Instruction> split{create_instruction<Pseudo_instruction>(aco_opcode::p_split_vector,
6637 Format::PSEUDO, 1, resource.size())};
6638 split->operands[0] = Operand(resource);
6639 for (unsigned i = 0; i < resource.size(); i++) {
6640 desc[i] = bld.tmp(s1);
6641 split->definitions[i] = Definition(desc[i]);
6642 }
6643 ctx->block->instructions.emplace_back(std::move(split));
6644
6645 Temp dfmt = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), desc[1], Operand(20u | (6u << 16)));
6646 Temp compare_cube_wa = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), dfmt,
6647 Operand((uint32_t)V_008F14_IMG_DATA_FORMAT_8_8_8_8));
6648
6649 Temp nfmt;
6650 if (stype == GLSL_TYPE_UINT) {
6651 nfmt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1),
6652 Operand((uint32_t)V_008F14_IMG_NUM_FORMAT_USCALED),
6653 Operand((uint32_t)V_008F14_IMG_NUM_FORMAT_UINT),
6654 bld.scc(compare_cube_wa));
6655 } else {
6656 nfmt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1),
6657 Operand((uint32_t)V_008F14_IMG_NUM_FORMAT_SSCALED),
6658 Operand((uint32_t)V_008F14_IMG_NUM_FORMAT_SINT),
6659 bld.scc(compare_cube_wa));
6660 }
6661 tg4_compare_cube_wa64 = bld.tmp(s2);
6662 bool_to_vector_condition(ctx, compare_cube_wa, tg4_compare_cube_wa64);
6663
6664 nfmt = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), nfmt, Operand(26u));
6665
6666 desc[1] = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), desc[1],
6667 Operand((uint32_t)C_008F14_NUM_FORMAT));
6668 desc[1] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), desc[1], nfmt);
6669
6670 aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector,
6671 Format::PSEUDO, resource.size(), 1)};
6672 for (unsigned i = 0; i < resource.size(); i++)
6673 vec->operands[i] = Operand(desc[i]);
6674 resource = bld.tmp(resource.regClass());
6675 vec->definitions[0] = Definition(resource);
6676 ctx->block->instructions.emplace_back(std::move(vec));
6677
6678 new_coords[0] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
6679 new_coords[0], orig_coords[0], tg4_compare_cube_wa64);
6680 new_coords[1] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
6681 new_coords[1], orig_coords[1], tg4_compare_cube_wa64);
6682 }
6683
6684 if (coords.size() == 3) {
6685 coords = bld.pseudo(aco_opcode::p_create_vector, bld.def(v3),
6686 new_coords[0], new_coords[1],
6687 emit_extract_vector(ctx, coords, 2, v1));
6688 } else {
6689 assert(coords.size() == 2);
6690 coords = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2),
6691 new_coords[0], new_coords[1]);
6692 }
6693 }
6694
6695 std::vector<Operand> args;
6696 if (has_offset)
6697 args.emplace_back(Operand(offset));
6698 if (has_bias)
6699 args.emplace_back(Operand(bias));
6700 if (has_compare)
6701 args.emplace_back(Operand(compare));
6702 if (has_derivs)
6703 args.emplace_back(Operand(derivs));
6704 args.emplace_back(Operand(coords));
6705 if (has_sample_index)
6706 args.emplace_back(Operand(sample_index));
6707 if (has_lod)
6708 args.emplace_back(lod);
6709
6710 Temp arg;
6711 if (args.size() > 1) {
6712 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, args.size(), 1)};
6713 unsigned size = 0;
6714 for (unsigned i = 0; i < args.size(); i++) {
6715 size += args[i].size();
6716 vec->operands[i] = args[i];
6717 }
6718 RegClass rc = RegClass(RegType::vgpr, size);
6719 Temp tmp = bld.tmp(rc);
6720 vec->definitions[0] = Definition(tmp);
6721 ctx->block->instructions.emplace_back(std::move(vec));
6722 arg = tmp;
6723 } else {
6724 assert(args[0].isTemp());
6725 arg = as_vgpr(ctx, args[0].getTemp());
6726 }
6727
6728 /* we don't need the bias, sample index, compare value or offset to be
6729 * computed in WQM but if the p_create_vector copies the coordinates, then it
6730 * needs to be in WQM */
6731 if (!(has_ddx && has_ddy) && !has_lod && !level_zero &&
6732 instr->sampler_dim != GLSL_SAMPLER_DIM_MS &&
6733 instr->sampler_dim != GLSL_SAMPLER_DIM_SUBPASS_MS)
6734 arg = emit_wqm(ctx, arg, bld.tmp(arg.regClass()), true);
6735
6736 if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) {
6737 //FIXME: if (ctx->abi->gfx9_stride_size_workaround) return ac_build_buffer_load_format_gfx9_safe()
6738
6739 assert(coords.size() == 1);
6740 unsigned last_bit = util_last_bit(nir_ssa_def_components_read(&instr->dest.ssa));
6741 aco_opcode op;
6742 switch (last_bit) {
6743 case 1:
6744 op = aco_opcode::buffer_load_format_x; break;
6745 case 2:
6746 op = aco_opcode::buffer_load_format_xy; break;
6747 case 3:
6748 op = aco_opcode::buffer_load_format_xyz; break;
6749 case 4:
6750 op = aco_opcode::buffer_load_format_xyzw; break;
6751 default:
6752 unreachable("Tex instruction loads more than 4 components.");
6753 }
6754
6755 /* if the instruction return value matches exactly the nir dest ssa, we can use it directly */
6756 if (last_bit == instr->dest.ssa.num_components && dst.type() == RegType::vgpr)
6757 tmp_dst = dst;
6758 else
6759 tmp_dst = bld.tmp(RegType::vgpr, last_bit);
6760
6761 aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)};
6762 mubuf->operands[0] = Operand(coords);
6763 mubuf->operands[1] = Operand(resource);
6764 mubuf->operands[2] = Operand((uint32_t) 0);
6765 mubuf->definitions[0] = Definition(tmp_dst);
6766 mubuf->idxen = true;
6767 mubuf->can_reorder = true;
6768 ctx->block->instructions.emplace_back(std::move(mubuf));
6769
6770 expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, (1 << last_bit) - 1);
6771 return;
6772 }
6773
6774
6775 if (instr->op == nir_texop_txf ||
6776 instr->op == nir_texop_txf_ms ||
6777 instr->op == nir_texop_samples_identical) {
6778 aco_opcode op = level_zero || instr->sampler_dim == GLSL_SAMPLER_DIM_MS ? aco_opcode::image_load : aco_opcode::image_load_mip;
6779 tex.reset(create_instruction<MIMG_instruction>(op, Format::MIMG, 2, 1));
6780 tex->operands[0] = Operand(arg);
6781 tex->operands[1] = Operand(resource);
6782 tex->dim = dim;
6783 tex->dmask = dmask;
6784 tex->unrm = true;
6785 tex->da = da;
6786 tex->definitions[0] = Definition(tmp_dst);
6787 tex->can_reorder = true;
6788 ctx->block->instructions.emplace_back(std::move(tex));
6789
6790 if (instr->op == nir_texop_samples_identical) {
6791 assert(dmask == 1 && dst.regClass() == v1);
6792 assert(dst.id() != tmp_dst.id());
6793
6794 Temp tmp = bld.tmp(s2);
6795 bld.vopc(aco_opcode::v_cmp_eq_u32, Definition(tmp), Operand(0u), tmp_dst).def(0).setHint(vcc);
6796 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand((uint32_t)-1), tmp);
6797
6798 } else {
6799 expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, dmask);
6800 }
6801 return;
6802 }
6803
6804 // TODO: would be better to do this by adding offsets, but needs the opcodes ordered.
6805 aco_opcode opcode = aco_opcode::image_sample;
6806 if (has_offset) { /* image_sample_*_o */
6807 if (has_compare) {
6808 opcode = aco_opcode::image_sample_c_o;
6809 if (has_derivs)
6810 opcode = aco_opcode::image_sample_c_d_o;
6811 if (has_bias)
6812 opcode = aco_opcode::image_sample_c_b_o;
6813 if (level_zero)
6814 opcode = aco_opcode::image_sample_c_lz_o;
6815 if (has_lod)
6816 opcode = aco_opcode::image_sample_c_l_o;
6817 } else {
6818 opcode = aco_opcode::image_sample_o;
6819 if (has_derivs)
6820 opcode = aco_opcode::image_sample_d_o;
6821 if (has_bias)
6822 opcode = aco_opcode::image_sample_b_o;
6823 if (level_zero)
6824 opcode = aco_opcode::image_sample_lz_o;
6825 if (has_lod)
6826 opcode = aco_opcode::image_sample_l_o;
6827 }
6828 } else { /* no offset */
6829 if (has_compare) {
6830 opcode = aco_opcode::image_sample_c;
6831 if (has_derivs)
6832 opcode = aco_opcode::image_sample_c_d;
6833 if (has_bias)
6834 opcode = aco_opcode::image_sample_c_b;
6835 if (level_zero)
6836 opcode = aco_opcode::image_sample_c_lz;
6837 if (has_lod)
6838 opcode = aco_opcode::image_sample_c_l;
6839 } else {
6840 opcode = aco_opcode::image_sample;
6841 if (has_derivs)
6842 opcode = aco_opcode::image_sample_d;
6843 if (has_bias)
6844 opcode = aco_opcode::image_sample_b;
6845 if (level_zero)
6846 opcode = aco_opcode::image_sample_lz;
6847 if (has_lod)
6848 opcode = aco_opcode::image_sample_l;
6849 }
6850 }
6851
6852 if (instr->op == nir_texop_tg4) {
6853 if (has_offset) {
6854 opcode = aco_opcode::image_gather4_lz_o;
6855 if (has_compare)
6856 opcode = aco_opcode::image_gather4_c_lz_o;
6857 } else {
6858 opcode = aco_opcode::image_gather4_lz;
6859 if (has_compare)
6860 opcode = aco_opcode::image_gather4_c_lz;
6861 }
6862 } else if (instr->op == nir_texop_lod) {
6863 opcode = aco_opcode::image_get_lod;
6864 }
6865
6866 tex.reset(create_instruction<MIMG_instruction>(opcode, Format::MIMG, 3, 1));
6867 tex->operands[0] = Operand(arg);
6868 tex->operands[1] = Operand(resource);
6869 tex->operands[2] = Operand(sampler);
6870 tex->dim = dim;
6871 tex->dmask = dmask;
6872 tex->da = da;
6873 tex->definitions[0] = Definition(tmp_dst);
6874 tex->can_reorder = true;
6875 ctx->block->instructions.emplace_back(std::move(tex));
6876
6877 if (tg4_integer_cube_workaround) {
6878 assert(tmp_dst.id() != dst.id());
6879 assert(tmp_dst.size() == dst.size() && dst.size() == 4);
6880
6881 emit_split_vector(ctx, tmp_dst, tmp_dst.size());
6882 Temp val[4];
6883 for (unsigned i = 0; i < dst.size(); i++) {
6884 val[i] = emit_extract_vector(ctx, tmp_dst, i, v1);
6885 Temp cvt_val;
6886 if (stype == GLSL_TYPE_UINT)
6887 cvt_val = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), val[i]);
6888 else
6889 cvt_val = bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), val[i]);
6890 val[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), val[i], cvt_val, tg4_compare_cube_wa64);
6891 }
6892 Temp tmp = dst.regClass() == v4 ? dst : bld.tmp(v4);
6893 tmp_dst = bld.pseudo(aco_opcode::p_create_vector, Definition(tmp),
6894 val[0], val[1], val[2], val[3]);
6895 }
6896 unsigned mask = instr->op == nir_texop_tg4 ? 0xF : dmask;
6897 expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, mask);
6898
6899 }
6900
6901
6902 Operand get_phi_operand(isel_context *ctx, nir_ssa_def *ssa)
6903 {
6904 Temp tmp = get_ssa_temp(ctx, ssa);
6905 if (ssa->parent_instr->type == nir_instr_type_ssa_undef)
6906 return Operand(tmp.regClass());
6907 else
6908 return Operand(tmp);
6909 }
6910
6911 void visit_phi(isel_context *ctx, nir_phi_instr *instr)
6912 {
6913 aco_ptr<Pseudo_instruction> phi;
6914 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6915 assert(instr->dest.ssa.bit_size != 1 || dst.regClass() == s2);
6916
6917 bool logical = !dst.is_linear() || ctx->divergent_vals[instr->dest.ssa.index];
6918 logical |= ctx->block->kind & block_kind_merge;
6919 aco_opcode opcode = logical ? aco_opcode::p_phi : aco_opcode::p_linear_phi;
6920
6921 /* we want a sorted list of sources, since the predecessor list is also sorted */
6922 std::map<unsigned, nir_ssa_def*> phi_src;
6923 nir_foreach_phi_src(src, instr)
6924 phi_src[src->pred->index] = src->src.ssa;
6925
6926 std::vector<unsigned>& preds = logical ? ctx->block->logical_preds : ctx->block->linear_preds;
6927 unsigned num_operands = 0;
6928 Operand operands[std::max(exec_list_length(&instr->srcs), (unsigned)preds.size())];
6929 unsigned num_defined = 0;
6930 unsigned cur_pred_idx = 0;
6931 for (std::pair<unsigned, nir_ssa_def *> src : phi_src) {
6932 if (cur_pred_idx < preds.size()) {
6933 /* handle missing preds (IF merges with discard/break) and extra preds (loop exit with discard) */
6934 unsigned block = ctx->cf_info.nir_to_aco[src.first];
6935 unsigned skipped = 0;
6936 while (cur_pred_idx + skipped < preds.size() && preds[cur_pred_idx + skipped] != block)
6937 skipped++;
6938 if (cur_pred_idx + skipped < preds.size()) {
6939 for (unsigned i = 0; i < skipped; i++)
6940 operands[num_operands++] = Operand(dst.regClass());
6941 cur_pred_idx += skipped;
6942 } else {
6943 continue;
6944 }
6945 }
6946 cur_pred_idx++;
6947 Operand op = get_phi_operand(ctx, src.second);
6948 operands[num_operands++] = op;
6949 num_defined += !op.isUndefined();
6950 }
6951 /* handle block_kind_continue_or_break at loop exit blocks */
6952 while (cur_pred_idx++ < preds.size())
6953 operands[num_operands++] = Operand(dst.regClass());
6954
6955 if (num_defined == 0) {
6956 Builder bld(ctx->program, ctx->block);
6957 if (dst.regClass() == s1) {
6958 bld.sop1(aco_opcode::s_mov_b32, Definition(dst), Operand(0u));
6959 } else if (dst.regClass() == v1) {
6960 bld.vop1(aco_opcode::v_mov_b32, Definition(dst), Operand(0u));
6961 } else {
6962 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
6963 for (unsigned i = 0; i < dst.size(); i++)
6964 vec->operands[i] = Operand(0u);
6965 vec->definitions[0] = Definition(dst);
6966 ctx->block->instructions.emplace_back(std::move(vec));
6967 }
6968 return;
6969 }
6970
6971 /* we can use a linear phi in some cases if one src is undef */
6972 if (dst.is_linear() && ctx->block->kind & block_kind_merge && num_defined == 1) {
6973 phi.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, num_operands, 1));
6974
6975 Block *linear_else = &ctx->program->blocks[ctx->block->linear_preds[1]];
6976 Block *invert = &ctx->program->blocks[linear_else->linear_preds[0]];
6977 assert(invert->kind & block_kind_invert);
6978
6979 unsigned then_block = invert->linear_preds[0];
6980
6981 Block* insert_block = NULL;
6982 for (unsigned i = 0; i < num_operands; i++) {
6983 Operand op = operands[i];
6984 if (op.isUndefined())
6985 continue;
6986 insert_block = ctx->block->logical_preds[i] == then_block ? invert : ctx->block;
6987 phi->operands[0] = op;
6988 break;
6989 }
6990 assert(insert_block); /* should be handled by the "num_defined == 0" case above */
6991 phi->operands[1] = Operand(dst.regClass());
6992 phi->definitions[0] = Definition(dst);
6993 insert_block->instructions.emplace(insert_block->instructions.begin(), std::move(phi));
6994 return;
6995 }
6996
6997 /* try to scalarize vector phis */
6998 if (instr->dest.ssa.bit_size != 1 && dst.size() > 1) {
6999 // TODO: scalarize linear phis on divergent ifs
7000 bool can_scalarize = (opcode == aco_opcode::p_phi || !(ctx->block->kind & block_kind_merge));
7001 std::array<Temp, 4> new_vec;
7002 for (unsigned i = 0; can_scalarize && (i < num_operands); i++) {
7003 Operand src = operands[i];
7004 if (src.isTemp() && ctx->allocated_vec.find(src.tempId()) == ctx->allocated_vec.end())
7005 can_scalarize = false;
7006 }
7007 if (can_scalarize) {
7008 unsigned num_components = instr->dest.ssa.num_components;
7009 assert(dst.size() % num_components == 0);
7010 RegClass rc = RegClass(dst.type(), dst.size() / num_components);
7011
7012 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
7013 for (unsigned k = 0; k < num_components; k++) {
7014 phi.reset(create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, num_operands, 1));
7015 for (unsigned i = 0; i < num_operands; i++) {
7016 Operand src = operands[i];
7017 phi->operands[i] = src.isTemp() ? Operand(ctx->allocated_vec[src.tempId()][k]) : Operand(rc);
7018 }
7019 Temp phi_dst = {ctx->program->allocateId(), rc};
7020 phi->definitions[0] = Definition(phi_dst);
7021 ctx->block->instructions.emplace(ctx->block->instructions.begin(), std::move(phi));
7022 new_vec[k] = phi_dst;
7023 vec->operands[k] = Operand(phi_dst);
7024 }
7025 vec->definitions[0] = Definition(dst);
7026 ctx->block->instructions.emplace_back(std::move(vec));
7027 ctx->allocated_vec.emplace(dst.id(), new_vec);
7028 return;
7029 }
7030 }
7031
7032 phi.reset(create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, num_operands, 1));
7033 for (unsigned i = 0; i < num_operands; i++)
7034 phi->operands[i] = operands[i];
7035 phi->definitions[0] = Definition(dst);
7036 ctx->block->instructions.emplace(ctx->block->instructions.begin(), std::move(phi));
7037 }
7038
7039
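/* Undefined SSA values are materialized as zero constants; instruction
 * selection setup is expected to always assign them to SGPRs. */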
7040 void visit_undef(isel_context *ctx, nir_ssa_undef_instr *instr)
7041 {
7042 Temp dst = get_ssa_temp(ctx, &instr->def);
7043
7044 assert(dst.type() == RegType::sgpr);
7045
7046 if (dst.size() == 1) {
7047 Builder(ctx->program, ctx->block).copy(Definition(dst), Operand(0u));
7048 } else {
7049 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
7050 for (unsigned i = 0; i < dst.size(); i++)
7051 vec->operands[i] = Operand(0u);
7052 vec->definitions[0] = Definition(dst);
7053 ctx->block->instructions.emplace_back(std::move(vec));
7054 }
7055 }
7056
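/* Lower NIR break/continue jumps. Uniform jumps branch directly to the loop
 * exit/header. Divergent jumps only record the divergent branch and then
 * insert two helper blocks so that the linear CFG stays free of critical
 * edges. */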
7057 void visit_jump(isel_context *ctx, nir_jump_instr *instr)
7058 {
7059 Builder bld(ctx->program, ctx->block);
7060 Block *logical_target;
7061 append_logical_end(ctx->block);
7062 unsigned idx = ctx->block->index;
7063
7064 switch (instr->type) {
7065 case nir_jump_break:
7066 logical_target = ctx->cf_info.parent_loop.exit;
7067 add_logical_edge(idx, logical_target);
7068 ctx->block->kind |= block_kind_break;
7069
7070 if (!ctx->cf_info.parent_if.is_divergent &&
7071 !ctx->cf_info.parent_loop.has_divergent_continue) {
7072 /* uniform break - directly jump out of the loop */
7073 ctx->block->kind |= block_kind_uniform;
7074 ctx->cf_info.has_branch = true;
7075 bld.branch(aco_opcode::p_branch);
7076 add_linear_edge(idx, logical_target);
7077 return;
7078 }
7079 ctx->cf_info.parent_loop.has_divergent_branch = true;
7080 ctx->cf_info.nir_to_aco[instr->instr.block->index] = ctx->block->index;
7081 break;
7082 case nir_jump_continue:
7083 logical_target = &ctx->program->blocks[ctx->cf_info.parent_loop.header_idx];
7084 add_logical_edge(idx, logical_target);
7085 ctx->block->kind |= block_kind_continue;
7086
7087 if (ctx->cf_info.parent_if.is_divergent) {
7088 /* for potential uniform breaks after this continue,
7089 we must ensure that they are handled correctly */
7090 ctx->cf_info.parent_loop.has_divergent_continue = true;
7091 ctx->cf_info.parent_loop.has_divergent_branch = true;
7092 ctx->cf_info.nir_to_aco[instr->instr.block->index] = ctx->block->index;
7093 } else {
7094 /* uniform continue - directly jump to the loop header */
7095 ctx->block->kind |= block_kind_uniform;
7096 ctx->cf_info.has_branch = true;
7097 bld.branch(aco_opcode::p_branch);
7098 add_linear_edge(idx, logical_target);
7099 return;
7100 }
7101 break;
7102 default:
7103 fprintf(stderr, "Unknown NIR jump instr: ");
7104 nir_print_instr(&instr->instr, stderr);
7105 fprintf(stderr, "\n");
7106 abort();
7107 }
7108
7109 /* remove critical edges from linear CFG */
7110 bld.branch(aco_opcode::p_branch);
7111 Block* break_block = ctx->program->create_and_insert_block();
7112 break_block->loop_nest_depth = ctx->cf_info.loop_nest_depth;
7113 break_block->kind |= block_kind_uniform;
7114 add_linear_edge(idx, break_block);
7115 /* the loop_header pointer might be invalidated by this point */
7116 if (instr->type == nir_jump_continue)
7117 logical_target = &ctx->program->blocks[ctx->cf_info.parent_loop.header_idx];
7118 add_linear_edge(break_block->index, logical_target);
7119 bld.reset(break_block);
7120 bld.branch(aco_opcode::p_branch);
7121
7122 Block* continue_block = ctx->program->create_and_insert_block();
7123 continue_block->loop_nest_depth = ctx->cf_info.loop_nest_depth;
7124 add_linear_edge(idx, continue_block);
7125 append_logical_start(continue_block);
7126 ctx->block = continue_block;
7127 return;
7128 }
7129
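/* Emit ACO IR for every instruction of a NIR block and remember in which ACO
 * block the NIR block ends (needed when lowering phis). */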
7130 void visit_block(isel_context *ctx, nir_block *block)
7131 {
7132 nir_foreach_instr(instr, block) {
7133 switch (instr->type) {
7134 case nir_instr_type_alu:
7135 visit_alu_instr(ctx, nir_instr_as_alu(instr));
7136 break;
7137 case nir_instr_type_load_const:
7138 visit_load_const(ctx, nir_instr_as_load_const(instr));
7139 break;
7140 case nir_instr_type_intrinsic:
7141 visit_intrinsic(ctx, nir_instr_as_intrinsic(instr));
7142 break;
7143 case nir_instr_type_tex:
7144 visit_tex(ctx, nir_instr_as_tex(instr));
7145 break;
7146 case nir_instr_type_phi:
7147 visit_phi(ctx, nir_instr_as_phi(instr));
7148 break;
7149 case nir_instr_type_ssa_undef:
7150 visit_undef(ctx, nir_instr_as_ssa_undef(instr));
7151 break;
7152 case nir_instr_type_deref:
7153 break;
7154 case nir_instr_type_jump:
7155 visit_jump(ctx, nir_instr_as_jump(instr));
7156 break;
7157 default:
7158 fprintf(stderr, "Unknown NIR instr type: ");
7159 nir_print_instr(instr, stderr);
7160 fprintf(stderr, "\n");
7161 //abort();
7162 }
7163 }
7164
7165 if (!ctx->cf_info.parent_loop.has_divergent_branch)
7166 ctx->cf_info.nir_to_aco[block->index] = ctx->block->index;
7167 }
7168
7169
7170
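/* Build the loop CFG: a uniform preheader, the loop header, the lowered body
 * and a single loop exit block. Potentially-empty exec masks are handled with
 * continue_or_break helper blocks, and phis in the loop header lose their
 * last operand if the corresponding back-edge turns out to be unreachable. */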
7171 static void visit_loop(isel_context *ctx, nir_loop *loop)
7172 {
7173 append_logical_end(ctx->block);
7174 ctx->block->kind |= block_kind_loop_preheader | block_kind_uniform;
7175 Builder bld(ctx->program, ctx->block);
7176 bld.branch(aco_opcode::p_branch);
7177 unsigned loop_preheader_idx = ctx->block->index;
7178
7179 Block loop_exit = Block();
7180 loop_exit.loop_nest_depth = ctx->cf_info.loop_nest_depth;
7181 loop_exit.kind |= (block_kind_loop_exit | (ctx->block->kind & block_kind_top_level));
7182
7183 Block* loop_header = ctx->program->create_and_insert_block();
7184 loop_header->loop_nest_depth = ctx->cf_info.loop_nest_depth + 1;
7185 loop_header->kind |= block_kind_loop_header;
7186 add_edge(loop_preheader_idx, loop_header);
7187 ctx->block = loop_header;
7188
7189 /* emit loop body */
7190 unsigned loop_header_idx = loop_header->index;
7191 loop_info_RAII loop_raii(ctx, loop_header_idx, &loop_exit);
7192 append_logical_start(ctx->block);
7193 visit_cf_list(ctx, &loop->body);
7194
7195 //TODO: what if a loop ends with an unconditional or uniformly branched continue and this branch is never taken?
7196 if (!ctx->cf_info.has_branch) {
7197 append_logical_end(ctx->block);
7198 if (ctx->cf_info.exec_potentially_empty) {
7199 /* Discards can result in code running with an empty exec mask.
7200 * This would result in divergent breaks not ever being taken. As a
7201 * workaround, break the loop when the loop mask is empty instead of
7202 * always continuing. */
7203 ctx->block->kind |= (block_kind_continue_or_break | block_kind_uniform);
7204 unsigned block_idx = ctx->block->index;
7205
7206 /* create helper blocks to avoid critical edges */
7207 Block *break_block = ctx->program->create_and_insert_block();
7208 break_block->loop_nest_depth = ctx->cf_info.loop_nest_depth;
7209 break_block->kind = block_kind_uniform;
7210 bld.reset(break_block);
7211 bld.branch(aco_opcode::p_branch);
7212 add_linear_edge(block_idx, break_block);
7213 add_linear_edge(break_block->index, &loop_exit);
7214
7215 Block *continue_block = ctx->program->create_and_insert_block();
7216 continue_block->loop_nest_depth = ctx->cf_info.loop_nest_depth;
7217 continue_block->kind = block_kind_uniform;
7218 bld.reset(continue_block);
7219 bld.branch(aco_opcode::p_branch);
7220 add_linear_edge(block_idx, continue_block);
7221 add_linear_edge(continue_block->index, &ctx->program->blocks[loop_header_idx]);
7222
7223 add_logical_edge(block_idx, &ctx->program->blocks[loop_header_idx]);
7224 ctx->block = &ctx->program->blocks[block_idx];
7225 } else {
7226 ctx->block->kind |= (block_kind_continue | block_kind_uniform);
7227 if (!ctx->cf_info.parent_loop.has_divergent_branch)
7228 add_edge(ctx->block->index, &ctx->program->blocks[loop_header_idx]);
7229 else
7230 add_linear_edge(ctx->block->index, &ctx->program->blocks[loop_header_idx]);
7231 }
7232
7233 bld.reset(ctx->block);
7234 bld.branch(aco_opcode::p_branch);
7235 }
7236
7237 /* fixup phis in loop header from unreachable blocks */
7238 if (ctx->cf_info.has_branch || ctx->cf_info.parent_loop.has_divergent_branch) {
7239 bool linear = ctx->cf_info.has_branch;
7240 bool logical = ctx->cf_info.has_branch || ctx->cf_info.parent_loop.has_divergent_branch;
7241 for (aco_ptr<Instruction>& instr : ctx->program->blocks[loop_header_idx].instructions) {
7242 if ((logical && instr->opcode == aco_opcode::p_phi) ||
7243 (linear && instr->opcode == aco_opcode::p_linear_phi)) {
7244 /* the last operand should be the one that needs to be removed */
7245 instr->operands.pop_back();
7246 } else if (!is_phi(instr)) {
7247 break;
7248 }
7249 }
7250 }
7251
7252 ctx->cf_info.has_branch = false;
7253
7254 // TODO: if the loop does not have a single exit, we must add one
7255 /* emit loop successor block */
7256 ctx->block = ctx->program->insert_block(std::move(loop_exit));
7257 append_logical_start(ctx->block);
7258
7259 #if 0
7260 // TODO: check if it is beneficial to not branch on continues
7261 /* trim linear phis in loop header */
7262 for (auto&& instr : loop_entry->instructions) {
7263 if (instr->opcode == aco_opcode::p_linear_phi) {
7264 aco_ptr<Pseudo_instruction> new_phi{create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, loop_entry->linear_predecessors.size(), 1)};
7265 new_phi->definitions[0] = instr->definitions[0];
7266 for (unsigned i = 0; i < new_phi->operands.size(); i++)
7267 new_phi->operands[i] = instr->operands[i];
7268 /* check that the remaining operands are all the same */
7269 for (unsigned i = new_phi->operands.size(); i < instr->operands.size(); i++)
7270 assert(instr->operands[i].tempId() == instr->operands.back().tempId());
7271 instr.swap(new_phi);
7272 } else if (instr->opcode == aco_opcode::p_phi) {
7273 continue;
7274 } else {
7275 break;
7276 }
7277 }
7278 #endif
7279 }
7280
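/* Start lowering a divergent if: end the current block with p_cbranch_z on the
 * condition, set up the invert and endif blocks, and begin the logical then
 * block. */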
7281 static void begin_divergent_if_then(isel_context *ctx, if_context *ic, Temp cond)
7282 {
7283 ic->cond = cond;
7284
7285 append_logical_end(ctx->block);
7286 ctx->block->kind |= block_kind_branch;
7287
7288 /* branch to linear then block */
7289 assert(cond.regClass() == s2);
7290 aco_ptr<Pseudo_branch_instruction> branch;
7291 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_cbranch_z, Format::PSEUDO_BRANCH, 1, 0));
7292 branch->operands[0] = Operand(cond);
7293 ctx->block->instructions.push_back(std::move(branch));
7294
7295 ic->BB_if_idx = ctx->block->index;
7296 ic->BB_invert = Block();
7297 ic->BB_invert.loop_nest_depth = ctx->cf_info.loop_nest_depth;
7298 /* Invert blocks are intentionally not marked as top level because they
7299 * are not part of the logical cfg. */
7300 ic->BB_invert.kind |= block_kind_invert;
7301 ic->BB_endif = Block();
7302 ic->BB_endif.loop_nest_depth = ctx->cf_info.loop_nest_depth;
7303 ic->BB_endif.kind |= (block_kind_merge | (ctx->block->kind & block_kind_top_level));
7304
7305 ic->exec_potentially_empty_old = ctx->cf_info.exec_potentially_empty;
7306 ic->divergent_old = ctx->cf_info.parent_if.is_divergent;
7307 ctx->cf_info.parent_if.is_divergent = true;
7308 ctx->cf_info.exec_potentially_empty = false; /* divergent branches use cbranch_execz */
7309
7310 /** emit logical then block */
7311 Block* BB_then_logical = ctx->program->create_and_insert_block();
7312 BB_then_logical->loop_nest_depth = ctx->cf_info.loop_nest_depth;
7313 add_edge(ic->BB_if_idx, BB_then_logical);
7314 ctx->block = BB_then_logical;
7315 append_logical_start(BB_then_logical);
7316 }
7317
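/* Finish the then side: close the logical then block, emit the linear then
 * block and the invert block (which skips the else side with p_cbranch_nz),
 * then begin the logical else block. */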
7318 static void begin_divergent_if_else(isel_context *ctx, if_context *ic)
7319 {
7320 Block *BB_then_logical = ctx->block;
7321 append_logical_end(BB_then_logical);
7322 /* branch from logical then block to invert block */
7323 aco_ptr<Pseudo_branch_instruction> branch;
7324 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
7325 BB_then_logical->instructions.emplace_back(std::move(branch));
7326 add_linear_edge(BB_then_logical->index, &ic->BB_invert);
7327 if (!ctx->cf_info.parent_loop.has_divergent_branch)
7328 add_logical_edge(BB_then_logical->index, &ic->BB_endif);
7329 BB_then_logical->kind |= block_kind_uniform;
7330 assert(!ctx->cf_info.has_branch);
7331 ic->then_branch_divergent = ctx->cf_info.parent_loop.has_divergent_branch;
7332 ctx->cf_info.parent_loop.has_divergent_branch = false;
7333
7334 /** emit linear then block */
7335 Block* BB_then_linear = ctx->program->create_and_insert_block();
7336 BB_then_linear->loop_nest_depth = ctx->cf_info.loop_nest_depth;
7337 BB_then_linear->kind |= block_kind_uniform;
7338 add_linear_edge(ic->BB_if_idx, BB_then_linear);
7339 /* branch from linear then block to invert block */
7340 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
7341 BB_then_linear->instructions.emplace_back(std::move(branch));
7342 add_linear_edge(BB_then_linear->index, &ic->BB_invert);
7343
7344 /** emit invert merge block */
7345 ctx->block = ctx->program->insert_block(std::move(ic->BB_invert));
7346 ic->invert_idx = ctx->block->index;
7347
7348 /* branch to linear else block (skip else) */
7349 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_cbranch_nz, Format::PSEUDO_BRANCH, 1, 0));
7350 branch->operands[0] = Operand(ic->cond);
7351 ctx->block->instructions.push_back(std::move(branch));
7352
7353 ic->exec_potentially_empty_old |= ctx->cf_info.exec_potentially_empty;
7354 ctx->cf_info.exec_potentially_empty = false; /* divergent branches use cbranch_execz */
7355
7356 /** emit logical else block */
7357 Block* BB_else_logical = ctx->program->create_and_insert_block();
7358 BB_else_logical->loop_nest_depth = ctx->cf_info.loop_nest_depth;
7359 add_logical_edge(ic->BB_if_idx, BB_else_logical);
7360 add_linear_edge(ic->invert_idx, BB_else_logical);
7361 ctx->block = BB_else_logical;
7362 append_logical_start(BB_else_logical);
7363 }
7364
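/* Finish the else side: close the logical else block, emit the linear else
 * block, merge the control flow in the endif block and restore the saved
 * divergence and exec state. */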
7365 static void end_divergent_if(isel_context *ctx, if_context *ic)
7366 {
7367 Block *BB_else_logical = ctx->block;
7368 append_logical_end(BB_else_logical);
7369
7370 /* branch from logical else block to endif block */
7371 aco_ptr<Pseudo_branch_instruction> branch;
7372 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
7373 BB_else_logical->instructions.emplace_back(std::move(branch));
7374 add_linear_edge(BB_else_logical->index, &ic->BB_endif);
7375 if (!ctx->cf_info.parent_loop.has_divergent_branch)
7376 add_logical_edge(BB_else_logical->index, &ic->BB_endif);
7377 BB_else_logical->kind |= block_kind_uniform;
7378
7379 assert(!ctx->cf_info.has_branch);
7380 ctx->cf_info.parent_loop.has_divergent_branch &= ic->then_branch_divergent;
7381
7382
7383 /** emit linear else block */
7384 Block* BB_else_linear = ctx->program->create_and_insert_block();
7385 BB_else_linear->loop_nest_depth = ctx->cf_info.loop_nest_depth;
7386 BB_else_linear->kind |= block_kind_uniform;
7387 add_linear_edge(ic->invert_idx, BB_else_linear);
7388
7389 /* branch from linear else block to endif block */
7390 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
7391 BB_else_linear->instructions.emplace_back(std::move(branch));
7392 add_linear_edge(BB_else_linear->index, &ic->BB_endif);
7393
7394
7395 /** emit endif merge block */
7396 ctx->block = ctx->program->insert_block(std::move(ic->BB_endif));
7397 append_logical_start(ctx->block);
7398
7399
7400 ctx->cf_info.parent_if.is_divergent = ic->divergent_old;
7401 ctx->cf_info.exec_potentially_empty |= ic->exec_potentially_empty_old;
7402 /* uniform control flow never has an empty exec-mask */
7403 if (!ctx->cf_info.loop_nest_depth && !ctx->cf_info.parent_if.is_divergent)
7404 ctx->cf_info.exec_potentially_empty = false;
7405 }
7406
7407 static void visit_if(isel_context *ctx, nir_if *if_stmt)
7408 {
7409 Temp cond = get_ssa_temp(ctx, if_stmt->condition.ssa);
7410 Builder bld(ctx->program, ctx->block);
7411 aco_ptr<Pseudo_branch_instruction> branch;
7412
7413 if (!ctx->divergent_vals[if_stmt->condition.ssa->index]) { /* uniform condition */
7414 /**
7415 * Uniform conditionals are represented in the following way*) :
7416 *
7417 * The linear and logical CFG:
7418 * BB_IF
7419 * / \
7420 * BB_THEN (logical) BB_ELSE (logical)
7421 * \ /
7422 * BB_ENDIF
7423 *
7424 * *) Exceptions may be due to break and continue statements within loops
7425 * If a break/continue happens within uniform control flow, it branches
7426 * to the loop exit/entry block. Otherwise, it branches to the next
7427 * merge block.
7428 **/
7429 append_logical_end(ctx->block);
7430 ctx->block->kind |= block_kind_uniform;
7431
7432 /* emit branch */
7433 assert(cond.regClass() == s2);
7434 // TODO: in a post-RA optimizer, we could check if the condition is in VCC and omit this instruction
7435 cond = bool_to_scalar_condition(ctx, cond);
7436
7437 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_cbranch_z, Format::PSEUDO_BRANCH, 1, 0));
7438 branch->operands[0] = Operand(cond);
7439 branch->operands[0].setFixed(scc);
7440 ctx->block->instructions.emplace_back(std::move(branch));
7441
7442 unsigned BB_if_idx = ctx->block->index;
7443 Block BB_endif = Block();
7444 BB_endif.loop_nest_depth = ctx->cf_info.loop_nest_depth;
7445 BB_endif.kind |= ctx->block->kind & block_kind_top_level;
7446
7447 /** emit then block */
7448 Block* BB_then = ctx->program->create_and_insert_block();
7449 BB_then->loop_nest_depth = ctx->cf_info.loop_nest_depth;
7450 add_edge(BB_if_idx, BB_then);
7451 append_logical_start(BB_then);
7452 ctx->block = BB_then;
7453 visit_cf_list(ctx, &if_stmt->then_list);
7454 BB_then = ctx->block;
7455 bool then_branch = ctx->cf_info.has_branch;
7456 bool then_branch_divergent = ctx->cf_info.parent_loop.has_divergent_branch;
7457
7458 if (!then_branch) {
7459 append_logical_end(BB_then);
7460 /* branch from then block to endif block */
7461 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
7462 BB_then->instructions.emplace_back(std::move(branch));
7463 add_linear_edge(BB_then->index, &BB_endif);
7464 if (!then_branch_divergent)
7465 add_logical_edge(BB_then->index, &BB_endif);
7466 BB_then->kind |= block_kind_uniform;
7467 }
7468
7469 ctx->cf_info.has_branch = false;
7470 ctx->cf_info.parent_loop.has_divergent_branch = false;
7471
7472 /** emit else block */
7473 Block* BB_else = ctx->program->create_and_insert_block();
7474 BB_else->loop_nest_depth = ctx->cf_info.loop_nest_depth;
7475 add_edge(BB_if_idx, BB_else);
7476 append_logical_start(BB_else);
7477 ctx->block = BB_else;
7478 visit_cf_list(ctx, &if_stmt->else_list);
7479 BB_else = ctx->block;
7480
7481 if (!ctx->cf_info.has_branch) {
7482 append_logical_end(BB_else);
7483 /* branch from else block to endif block */
7484 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
7485 BB_else->instructions.emplace_back(std::move(branch));
7486 add_linear_edge(BB_else->index, &BB_endif);
7487 if (!ctx->cf_info.parent_loop.has_divergent_branch)
7488 add_logical_edge(BB_else->index, &BB_endif);
7489 BB_else->kind |= block_kind_uniform;
7490 }
7491
7492 ctx->cf_info.has_branch &= then_branch;
7493 ctx->cf_info.parent_loop.has_divergent_branch &= then_branch_divergent;
7494
7495 /** emit endif merge block */
7496 if (!ctx->cf_info.has_branch) {
7497 ctx->block = ctx->program->insert_block(std::move(BB_endif));
7498 append_logical_start(ctx->block);
7499 }
7500 } else { /* non-uniform condition */
7501 /**
7502 * To maintain a logical and linear CFG without critical edges,
7503 * non-uniform conditionals are represented in the following way*) :
7504 *
7505 * The linear CFG:
7506 * BB_IF
7507 * / \
7508 * BB_THEN (logical) BB_THEN (linear)
7509 * \ /
7510 * BB_INVERT (linear)
7511 * / \
7512 * BB_ELSE (logical) BB_ELSE (linear)
7513 * \ /
7514 * BB_ENDIF
7515 *
7516 * The logical CFG:
7517 * BB_IF
7518 * / \
7519 * BB_THEN (logical) BB_ELSE (logical)
7520 * \ /
7521 * BB_ENDIF
7522 *
7523 * *) Exceptions may be due to break and continue statements within loops
7524 **/
7525
7526 if_context ic;
7527
7528 begin_divergent_if_then(ctx, &ic, cond);
7529 visit_cf_list(ctx, &if_stmt->then_list);
7530
7531 begin_divergent_if_else(ctx, &ic);
7532 visit_cf_list(ctx, &if_stmt->else_list);
7533
7534 end_divergent_if(ctx, &ic);
7535 }
7536 }
7537
7538 static void visit_cf_list(isel_context *ctx,
7539 struct exec_list *list)
7540 {
7541 foreach_list_typed(nir_cf_node, node, node, list) {
7542 switch (node->type) {
7543 case nir_cf_node_block:
7544 visit_block(ctx, nir_cf_node_as_block(node));
7545 break;
7546 case nir_cf_node_if:
7547 visit_if(ctx, nir_cf_node_as_if(node));
7548 break;
7549 case nir_cf_node_loop:
7550 visit_loop(ctx, nir_cf_node_as_loop(node));
7551 break;
7552 default:
7553 unreachable("unimplemented cf list type");
7554 }
7555 }
7556 }
7557
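/* Emit a single export instruction for one VS output slot, either to a
 * position target (counted via next_pos) or to a parameter slot. */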
7558 static void export_vs_varying(isel_context *ctx, int slot, bool is_pos, int *next_pos)
7559 {
7560 int offset = ctx->program->info->vs.outinfo.vs_output_param_offset[slot];
7561 uint64_t mask = ctx->vs_output.mask[slot];
7562 if (!is_pos && !mask)
7563 return;
7564 if (!is_pos && offset == AC_EXP_PARAM_UNDEFINED)
7565 return;
7566 aco_ptr<Export_instruction> exp{create_instruction<Export_instruction>(aco_opcode::exp, Format::EXP, 4, 0)};
7567 exp->enabled_mask = mask;
7568 for (unsigned i = 0; i < 4; ++i) {
7569 if (mask & (1 << i))
7570 exp->operands[i] = Operand(ctx->vs_output.outputs[slot][i]);
7571 else
7572 exp->operands[i] = Operand(v1);
7573 }
7574 exp->valid_mask = false;
7575 exp->done = false;
7576 exp->compressed = false;
7577 if (is_pos)
7578 exp->dest = V_008DFC_SQ_EXP_POS + (*next_pos)++;
7579 else
7580 exp->dest = V_008DFC_SQ_EXP_PARAM + offset;
7581 ctx->block->instructions.emplace_back(std::move(exp));
7582 }
7583
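/* Pack point size, layer and viewport index into one position export. On GFX9+
 * the viewport index is packed into the high 16 bits of the layer channel
 * instead of occupying its own channel. */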
7584 static void export_vs_psiz_layer_viewport(isel_context *ctx, int *next_pos)
7585 {
7586 aco_ptr<Export_instruction> exp{create_instruction<Export_instruction>(aco_opcode::exp, Format::EXP, 4, 0)};
7587 exp->enabled_mask = 0;
7588 for (unsigned i = 0; i < 4; ++i)
7589 exp->operands[i] = Operand(v1);
7590 if (ctx->vs_output.mask[VARYING_SLOT_PSIZ]) {
7591 exp->operands[0] = Operand(ctx->vs_output.outputs[VARYING_SLOT_PSIZ][0]);
7592 exp->enabled_mask |= 0x1;
7593 }
7594 if (ctx->vs_output.mask[VARYING_SLOT_LAYER]) {
7595 exp->operands[2] = Operand(ctx->vs_output.outputs[VARYING_SLOT_LAYER][0]);
7596 exp->enabled_mask |= 0x4;
7597 }
7598 if (ctx->vs_output.mask[VARYING_SLOT_VIEWPORT]) {
7599 if (ctx->options->chip_class < GFX9) {
7600 exp->operands[3] = Operand(ctx->vs_output.outputs[VARYING_SLOT_VIEWPORT][0]);
7601 exp->enabled_mask |= 0x8;
7602 } else {
7603 Builder bld(ctx->program, ctx->block);
7604
7605 Temp out = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(16u),
7606 Operand(ctx->vs_output.outputs[VARYING_SLOT_VIEWPORT][0]));
7607 if (exp->operands[2].isTemp())
7608 out = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand(out), exp->operands[2]);
7609
7610 exp->operands[2] = Operand(out);
7611 exp->enabled_mask |= 0x4;
7612 }
7613 }
7614 exp->valid_mask = false;
7615 exp->done = false;
7616 exp->compressed = false;
7617 exp->dest = V_008DFC_SQ_EXP_POS + (*next_pos)++;
7618 ctx->block->instructions.emplace_back(std::move(exp));
7619 }
7620
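/* Emit all VS exports: the position exports first (their creation order
 * determines the export target), then the generic parameter exports. */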
7621 static void create_vs_exports(isel_context *ctx)
7622 {
7623 radv_vs_output_info *outinfo = &ctx->program->info->vs.outinfo;
7624
7625 if (outinfo->export_prim_id) {
7626 ctx->vs_output.mask[VARYING_SLOT_PRIMITIVE_ID] |= 0x1;
7627 ctx->vs_output.outputs[VARYING_SLOT_PRIMITIVE_ID][0] = get_arg(ctx, ctx->args->vs_prim_id);
7628 }
7629
7630 if (ctx->options->key.has_multiview_view_index) {
7631 ctx->vs_output.mask[VARYING_SLOT_LAYER] |= 0x1;
7632 ctx->vs_output.outputs[VARYING_SLOT_LAYER][0] = as_vgpr(ctx, get_arg(ctx, ctx->args->ac.view_index));
7633 }
7634
7635 /* the order these position exports are created is important */
7636 int next_pos = 0;
7637 export_vs_varying(ctx, VARYING_SLOT_POS, true, &next_pos);
7638 if (outinfo->writes_pointsize || outinfo->writes_layer || outinfo->writes_viewport_index) {
7639 export_vs_psiz_layer_viewport(ctx, &next_pos);
7640 }
7641 if (ctx->num_clip_distances + ctx->num_cull_distances > 0)
7642 export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST0, true, &next_pos);
7643 if (ctx->num_clip_distances + ctx->num_cull_distances > 4)
7644 export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST1, true, &next_pos);
7645
7646 if (ctx->options->key.vs_common_out.export_clip_dists) {
7647 if (ctx->num_clip_distances + ctx->num_cull_distances > 0)
7648 export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST0, false, &next_pos);
7649 if (ctx->num_clip_distances + ctx->num_cull_distances > 4)
7650 export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST1, false, &next_pos);
7651 }
7652
7653 for (unsigned i = 0; i <= VARYING_SLOT_VAR31; ++i) {
7654 if (i < VARYING_SLOT_VAR0 && i != VARYING_SLOT_LAYER &&
7655 i != VARYING_SLOT_PRIMITIVE_ID)
7656 continue;
7657
7658 export_vs_varying(ctx, i, false, NULL);
7659 }
7660 }
7661
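/* Store a single transform feedback output with a MUBUF buffer store. Offsets
 * that do not fit into the instruction's immediate offset field are added to
 * the per-thread write offset instead. */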
7662 static void emit_stream_output(isel_context *ctx,
7663 Temp const *so_buffers,
7664 Temp const *so_write_offset,
7665 const struct radv_stream_output *output)
7666 {
7667 unsigned num_comps = util_bitcount(output->component_mask);
7668 unsigned loc = output->location;
7669 unsigned buf = output->buffer;
7670 unsigned offset = output->offset;
7671
7672 assert(num_comps && num_comps <= 4);
7673 if (!num_comps || num_comps > 4)
7674 return;
7675
7676 unsigned start = ffs(output->component_mask) - 1;
7677
7678 Temp out[4];
7679 bool all_undef = true;
7680 assert(ctx->stage == vertex_vs);
7681 for (unsigned i = 0; i < num_comps; i++) {
7682 out[i] = ctx->vs_output.outputs[loc][start + i];
7683 all_undef = all_undef && !out[i].id();
7684 }
7685 if (all_undef)
7686 return;
7687
7688 Temp write_data = {ctx->program->allocateId(), RegClass(RegType::vgpr, num_comps)};
7689 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, num_comps, 1)};
7690 for (unsigned i = 0; i < num_comps; ++i)
7691 vec->operands[i] = (ctx->vs_output.mask[loc] & 1 << i) ? Operand(out[i]) : Operand(0u);
7692 vec->definitions[0] = Definition(write_data);
7693 ctx->block->instructions.emplace_back(std::move(vec));
7694
7695 aco_opcode opcode;
7696 switch (num_comps) {
7697 case 1:
7698 opcode = aco_opcode::buffer_store_dword;
7699 break;
7700 case 2:
7701 opcode = aco_opcode::buffer_store_dwordx2;
7702 break;
7703 case 3:
7704 opcode = aco_opcode::buffer_store_dwordx3;
7705 break;
7706 case 4:
7707 opcode = aco_opcode::buffer_store_dwordx4;
7708 break;
7709 }
7710
7711 aco_ptr<MUBUF_instruction> store{create_instruction<MUBUF_instruction>(opcode, Format::MUBUF, 4, 0)};
7712 store->operands[0] = Operand(so_write_offset[buf]);
7713 store->operands[1] = Operand(so_buffers[buf]);
7714 store->operands[2] = Operand((uint32_t) 0);
7715 store->operands[3] = Operand(write_data);
7716 if (offset > 4095) {
7717 /* This shouldn't happen in RADV, but it might with GL. It's easy to handle anyway. */
7718 Builder bld(ctx->program, ctx->block);
7719 store->operands[0] = bld.vadd32(bld.def(v1), Operand(offset), Operand(so_write_offset[buf]));
7720 } else {
7721 store->offset = offset;
7722 }
7723 store->offen = true;
7724 store->glc = true;
7725 store->dlc = false;
7726 store->slc = true;
7727 store->can_reorder = true;
7728 ctx->block->instructions.emplace_back(std::move(store));
7729 }
7730
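/* Emit the transform feedback stores for one stream, wrapped in a divergent if
 * so that only threads whose index is below the streamout vertex count write
 * to the buffers. */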
7731 static void emit_streamout(isel_context *ctx, unsigned stream)
7732 {
7733 Builder bld(ctx->program, ctx->block);
7734
7735 Temp so_buffers[4];
7736 Temp buf_ptr = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->streamout_buffers));
7737 for (unsigned i = 0; i < 4; i++) {
7738 unsigned stride = ctx->program->info->so.strides[i];
7739 if (!stride)
7740 continue;
7741
7742 so_buffers[i] = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), buf_ptr, Operand(i * 16u));
7743 }
7744
7745 Temp so_vtx_count = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
7746 get_arg(ctx, ctx->args->streamout_config), Operand(0x70010u));
7747
7748 Temp tid = bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32, bld.def(v1), Operand((uint32_t) -1),
7749 bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, bld.def(v1), Operand((uint32_t) -1), Operand(0u)));
7750
7751 Temp can_emit = bld.vopc(aco_opcode::v_cmp_gt_i32, bld.def(s2), so_vtx_count, tid);
7752
7753 if_context ic;
7754 begin_divergent_if_then(ctx, &ic, can_emit);
7755
7756 bld.reset(ctx->block);
7757
7758 Temp so_write_index = bld.vadd32(bld.def(v1), get_arg(ctx, ctx->args->streamout_write_idx), tid);
7759
7760 Temp so_write_offset[4];
7761
7762 for (unsigned i = 0; i < 4; i++) {
7763 unsigned stride = ctx->program->info->so.strides[i];
7764 if (!stride)
7765 continue;
7766
7767 if (stride == 1) {
7768 Temp offset = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
7769 get_arg(ctx, ctx->args->streamout_write_idx),
7770 get_arg(ctx, ctx->args->streamout_offset[i]));
7771 Temp new_offset = bld.vadd32(bld.def(v1), offset, tid);
7772
7773 so_write_offset[i] = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), new_offset);
7774 } else {
7775 Temp offset = bld.v_mul_imm(bld.def(v1), so_write_index, stride * 4u);
7776 Temp offset2 = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(4u),
7777 get_arg(ctx, ctx->args->streamout_offset[i]));
7778 so_write_offset[i] = bld.vadd32(bld.def(v1), offset, offset2);
7779 }
7780 }
7781
7782 for (unsigned i = 0; i < ctx->program->info->so.num_outputs; i++) {
7783 struct radv_stream_output *output =
7784 &ctx->program->info->so.outputs[i];
7785 if (stream != output->stream)
7786 continue;
7787
7788 emit_stream_output(ctx, so_buffers, so_write_offset, output);
7789 }
7790
7791 begin_divergent_if_else(ctx, &ic);
7792 end_divergent_if(ctx, &ic);
7793 }
7794
7795 } /* end namespace */
7796
7797 void split_arguments(isel_context *ctx, Pseudo_instruction *startpgm)
7798 {
7799 /* Split all arguments except for the first (ring_offsets) and the last
7800 * (exec) so that the dead channels don't stay live throughout the program.
7801 */
7802 for (unsigned i = 1; i < startpgm->definitions.size() - 1; i++) {
7803 if (startpgm->definitions[i].regClass().size() > 1) {
7804 emit_split_vector(ctx, startpgm->definitions[i].getTemp(),
7805 startpgm->definitions[i].regClass().size());
7806 }
7807 }
7808 }
7809
7810 void handle_bc_optimize(isel_context *ctx)
7811 {
7812 /* needed when SPI_PS_IN_CONTROL.BC_OPTIMIZE_DISABLE is set to 0 */
7813 Builder bld(ctx->program, ctx->block);
7814 uint32_t spi_ps_input_ena = ctx->program->config->spi_ps_input_ena;
7815 bool uses_center = G_0286CC_PERSP_CENTER_ENA(spi_ps_input_ena) || G_0286CC_LINEAR_CENTER_ENA(spi_ps_input_ena);
7816 bool uses_centroid = G_0286CC_PERSP_CENTROID_ENA(spi_ps_input_ena) || G_0286CC_LINEAR_CENTROID_ENA(spi_ps_input_ena);
7817 ctx->persp_centroid = get_arg(ctx, ctx->args->ac.persp_centroid);
7818 ctx->linear_centroid = get_arg(ctx, ctx->args->ac.linear_centroid);
7819 if (uses_center && uses_centroid) {
7820 Temp sel = bld.vopc_e64(aco_opcode::v_cmp_lt_i32, bld.hint_vcc(bld.def(s2)),
7821 get_arg(ctx, ctx->args->ac.prim_mask), Operand(0u));
7822
7823 if (G_0286CC_PERSP_CENTROID_ENA(spi_ps_input_ena)) {
7824 Temp new_coord[2];
7825 for (unsigned i = 0; i < 2; i++) {
7826 Temp persp_centroid = emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.persp_centroid), i, v1);
7827 Temp persp_center = emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.persp_center), i, v1);
7828 new_coord[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
7829 persp_centroid, persp_center, sel);
7830 }
7831 ctx->persp_centroid = bld.tmp(v2);
7832 bld.pseudo(aco_opcode::p_create_vector, Definition(ctx->persp_centroid),
7833 Operand(new_coord[0]), Operand(new_coord[1]));
7834 emit_split_vector(ctx, ctx->persp_centroid, 2);
7835 }
7836
7837 if (G_0286CC_LINEAR_CENTROID_ENA(spi_ps_input_ena)) {
7838 Temp new_coord[2];
7839 for (unsigned i = 0; i < 2; i++) {
7840 Temp linear_centroid = emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.linear_centroid), i, v1);
7841 Temp linear_center = emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.linear_center), i, v1);
7842 new_coord[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
7843 linear_centroid, linear_center, sel);
7844 }
7845 ctx->linear_centroid = bld.tmp(v2);
7846 bld.pseudo(aco_opcode::p_create_vector, Definition(ctx->linear_centroid),
7847 Operand(new_coord[0]), Operand(new_coord[1]));
7848 emit_split_vector(ctx, ctx->linear_centroid, 2);
7849 }
7850 }
7851 }
7852
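/* Derive the program's floating-point mode (denormal and rounding behaviour)
 * from the shader's float_controls execution modes. */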
7853 void setup_fp_mode(isel_context *ctx, nir_shader *shader)
7854 {
7855 Program *program = ctx->program;
7856
7857 unsigned float_controls = shader->info.float_controls_execution_mode;
7858
7859 program->next_fp_mode.preserve_signed_zero_inf_nan32 =
7860 float_controls & FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP32;
7861 program->next_fp_mode.preserve_signed_zero_inf_nan16_64 =
7862 float_controls & (FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP16 |
7863 FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP64);
7864
7865 program->next_fp_mode.must_flush_denorms32 =
7866 float_controls & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP32;
7867 program->next_fp_mode.must_flush_denorms16_64 =
7868 float_controls & (FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16 |
7869 FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP64);
7870
7871 program->next_fp_mode.care_about_round32 =
7872 float_controls & (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32 | FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32);
7873
7874 program->next_fp_mode.care_about_round16_64 =
7875 float_controls & (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64 |
7876 FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64);
7877
7878 /* default to preserving fp16 and fp64 denorms, since it's free */
7879 if (program->next_fp_mode.must_flush_denorms16_64)
7880 program->next_fp_mode.denorm16_64 = 0;
7881 else
7882 program->next_fp_mode.denorm16_64 = fp_denorm_keep;
7883
7884 /* preserving fp32 denorms is expensive, so only do it if asked */
7885 if (float_controls & FLOAT_CONTROLS_DENORM_PRESERVE_FP32)
7886 program->next_fp_mode.denorm32 = fp_denorm_keep;
7887 else
7888 program->next_fp_mode.denorm32 = 0;
7889
7890 if (float_controls & FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32)
7891 program->next_fp_mode.round32 = fp_round_tz;
7892 else
7893 program->next_fp_mode.round32 = fp_round_ne;
7894
7895 if (float_controls & (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64))
7896 program->next_fp_mode.round16_64 = fp_round_tz;
7897 else
7898 program->next_fp_mode.round16_64 = fp_round_ne;
7899
7900 ctx->block->fp_mode = program->next_fp_mode;
7901 }
7902
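/* Entry point of instruction selection: lowers each NIR shader into ACO IR.
 * With merged shader stages, each shader's code is wrapped in a divergent if
 * over merged_wave_info so that only the lanes belonging to that stage
 * execute it. */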
7903 void select_program(Program *program,
7904 unsigned shader_count,
7905 struct nir_shader *const *shaders,
7906 ac_shader_config* config,
7907 struct radv_shader_args *args)
7908 {
7909 isel_context ctx = setup_isel_context(program, shader_count, shaders, config, args);
7910
7911 for (unsigned i = 0; i < shader_count; i++) {
7912 nir_shader *nir = shaders[i];
7913 init_context(&ctx, nir);
7914
7915 setup_fp_mode(&ctx, nir);
7916
7917 if (!i) {
7918 /* needs to be after init_context() for FS */
7919 Pseudo_instruction *startpgm = add_startpgm(&ctx);
7920 append_logical_start(ctx.block);
7921 split_arguments(&ctx, startpgm);
7922 }
7923
7924 if_context ic;
7925 if (shader_count >= 2) {
7926 Builder bld(ctx.program, ctx.block);
7927 Temp count = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), ctx.merged_wave_info, Operand((8u << 16) | (i * 8u)));
7928 Temp thread_id = bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32, bld.def(v1), Operand((uint32_t) -1),
7929 bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, bld.def(v1), Operand((uint32_t) -1), Operand(0u)));
7930 Temp cond = bld.vopc(aco_opcode::v_cmp_gt_u32, bld.hint_vcc(bld.def(s2)), count, thread_id);
7931
7932 begin_divergent_if_then(&ctx, &ic, cond);
7933 }
7934
7935 if (i) {
7936 Builder bld(ctx.program, ctx.block);
7937 bld.barrier(aco_opcode::p_memory_barrier_shared); //TODO: different barriers are needed for different stages
7938 bld.sopp(aco_opcode::s_barrier);
7939 }
7940
7941 if (ctx.stage == fragment_fs)
7942 handle_bc_optimize(&ctx);
7943
7944 nir_function_impl *func = nir_shader_get_entrypoint(nir);
7945 visit_cf_list(&ctx, &func->body);
7946
7947 if (ctx.program->info->so.num_outputs/*&& !ctx->is_gs_copy_shader */)
7948 emit_streamout(&ctx, 0);
7949
7950 if (ctx.stage == vertex_vs)
7951 create_vs_exports(&ctx);
7952
7953 if (shader_count >= 2) {
7954 begin_divergent_if_else(&ctx, &ic);
7955 end_divergent_if(&ctx, &ic);
7956 }
7957
7958 ralloc_free(ctx.divergent_vals);
7959 }
7960
7961 program->config->float_mode = program->blocks[0].fp_mode.val;
7962
7963 append_logical_end(ctx.block);
7964 ctx.block->kind |= block_kind_uniform;
7965 Builder bld(ctx.program, ctx.block);
7966 if (ctx.program->wb_smem_l1_on_end)
7967 bld.smem(aco_opcode::s_dcache_wb, false);
7968 bld.sopp(aco_opcode::s_endpgm);
7969
7970 /* cleanup CFG */
7971 for (Block& BB : program->blocks) {
7972 for (unsigned idx : BB.linear_preds)
7973 program->blocks[idx].linear_succs.emplace_back(BB.index);
7974 for (unsigned idx : BB.logical_preds)
7975 program->blocks[idx].logical_succs.emplace_back(BB.index);
7976 }
7977 }
7978 }