aco: ensure that uniform booleans are computed in WQM if their uses happen in WQM
[mesa.git] / src / amd / compiler / aco_instruction_selection.cpp
1 /*
2 * Copyright © 2018 Valve Corporation
3 * Copyright © 2018 Google
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22 * IN THE SOFTWARE.
23 *
24 */
25
26 #include <algorithm>
27 #include <array>
28 #include <map>
29
30 #include "ac_shader_util.h"
31 #include "aco_ir.h"
32 #include "aco_builder.h"
33 #include "aco_interface.h"
34 #include "aco_instruction_selection_setup.cpp"
35 #include "util/fast_idiv_by_const.h"
36
37 namespace aco {
38 namespace {
39
40 class loop_info_RAII {
41 isel_context* ctx;
42 unsigned header_idx_old;
43 Block* exit_old;
44 bool divergent_cont_old;
45 bool divergent_branch_old;
46 bool divergent_if_old;
47
48 public:
49 loop_info_RAII(isel_context* ctx, unsigned loop_header_idx, Block* loop_exit)
50 : ctx(ctx),
51 header_idx_old(ctx->cf_info.parent_loop.header_idx), exit_old(ctx->cf_info.parent_loop.exit),
52 divergent_cont_old(ctx->cf_info.parent_loop.has_divergent_continue),
53 divergent_branch_old(ctx->cf_info.parent_loop.has_divergent_branch),
54 divergent_if_old(ctx->cf_info.parent_if.is_divergent)
55 {
56 ctx->cf_info.parent_loop.header_idx = loop_header_idx;
57 ctx->cf_info.parent_loop.exit = loop_exit;
58 ctx->cf_info.parent_loop.has_divergent_continue = false;
59 ctx->cf_info.parent_loop.has_divergent_branch = false;
60 ctx->cf_info.parent_if.is_divergent = false;
61 ctx->cf_info.loop_nest_depth = ctx->cf_info.loop_nest_depth + 1;
62 }
63
64 ~loop_info_RAII()
65 {
66 ctx->cf_info.parent_loop.header_idx = header_idx_old;
67 ctx->cf_info.parent_loop.exit = exit_old;
68 ctx->cf_info.parent_loop.has_divergent_continue = divergent_cont_old;
69 ctx->cf_info.parent_loop.has_divergent_branch = divergent_branch_old;
70 ctx->cf_info.parent_if.is_divergent = divergent_if_old;
71 ctx->cf_info.loop_nest_depth = ctx->cf_info.loop_nest_depth - 1;
72 if (!ctx->cf_info.loop_nest_depth && !ctx->cf_info.parent_if.is_divergent)
73 ctx->cf_info.exec_potentially_empty = false;
74 }
75 };
76
77 struct if_context {
78 Temp cond;
79
80 bool divergent_old;
81 bool exec_potentially_empty_old;
82
83 unsigned BB_if_idx;
84 unsigned invert_idx;
85 bool then_branch_divergent;
86 Block BB_invert;
87 Block BB_endif;
88 };
89
90 static void visit_cf_list(struct isel_context *ctx,
91 struct exec_list *list);
92
93 static void add_logical_edge(unsigned pred_idx, Block *succ)
94 {
95 succ->logical_preds.emplace_back(pred_idx);
96 }
97
98
99 static void add_linear_edge(unsigned pred_idx, Block *succ)
100 {
101 succ->linear_preds.emplace_back(pred_idx);
102 }
103
104 static void add_edge(unsigned pred_idx, Block *succ)
105 {
106 add_logical_edge(pred_idx, succ);
107 add_linear_edge(pred_idx, succ);
108 }
109
110 static void append_logical_start(Block *b)
111 {
112 Builder(NULL, b).pseudo(aco_opcode::p_logical_start);
113 }
114
115 static void append_logical_end(Block *b)
116 {
117 Builder(NULL, b).pseudo(aco_opcode::p_logical_end);
118 }
119
120 Temp get_ssa_temp(struct isel_context *ctx, nir_ssa_def *def)
121 {
122 assert(ctx->allocated[def->index].id());
123 return ctx->allocated[def->index];
124 }
125
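/* Copies src into dst (allocating dst when none is given). In fragment shaders the
 * copy is emitted as a p_wqm pseudo so the value is computed in whole quad mode and
 * program_needs_wqm can flag the program as requiring WQM; in every other stage this
 * is a plain copy. */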
126 Temp emit_wqm(isel_context *ctx, Temp src, Temp dst=Temp(0, s1), bool program_needs_wqm = false)
127 {
128 Builder bld(ctx->program, ctx->block);
129
130 if (!dst.id())
131 dst = bld.tmp(src.regClass());
132
133 if (ctx->stage != fragment_fs) {
134 if (!dst.id())
135 return src;
136
137 if (src.type() == RegType::vgpr || src.size() > 1)
138 bld.copy(Definition(dst), src);
139 else
140 bld.sop1(aco_opcode::s_mov_b32, Definition(dst), src);
141 return dst;
142 }
143
144 bld.pseudo(aco_opcode::p_wqm, Definition(dst), src);
145 ctx->program->needs_wqm |= program_needs_wqm;
146 return dst;
147 }
148
149 Temp as_vgpr(isel_context *ctx, Temp val)
150 {
151 if (val.type() == RegType::sgpr) {
152 Builder bld(ctx->program, ctx->block);
153 return bld.copy(bld.def(RegType::vgpr, val.size()), val);
154 }
155 assert(val.type() == RegType::vgpr);
156 return val;
157 }
158
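/* Lowers an unsigned division by the constant b using util_fast_udiv_info:
 * dst = (((a >> pre_shift) + increment) * multiplier >> 32) >> post_shift,
 * with each stage skipped when its parameter is zero; powers of two reduce to a
 * single shift. */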
159 //assumes a != 0xffffffff
160 void emit_v_div_u32(isel_context *ctx, Temp dst, Temp a, uint32_t b)
161 {
162 assert(b != 0);
163 Builder bld(ctx->program, ctx->block);
164
165 if (util_is_power_of_two_or_zero(b)) {
166 bld.vop2(aco_opcode::v_lshrrev_b32, Definition(dst), Operand((uint32_t)util_logbase2(b)), a);
167 return;
168 }
169
170 util_fast_udiv_info info = util_compute_fast_udiv_info(b, 32, 32);
171
172 assert(info.multiplier <= 0xffffffff);
173
174 bool pre_shift = info.pre_shift != 0;
175 bool increment = info.increment != 0;
176 bool multiply = true;
177 bool post_shift = info.post_shift != 0;
178
179 if (!pre_shift && !increment && !multiply && !post_shift) {
180 bld.vop1(aco_opcode::v_mov_b32, Definition(dst), a);
181 return;
182 }
183
184 Temp pre_shift_dst = a;
185 if (pre_shift) {
186 pre_shift_dst = (increment || multiply || post_shift) ? bld.tmp(v1) : dst;
187 bld.vop2(aco_opcode::v_lshrrev_b32, Definition(pre_shift_dst), Operand((uint32_t)info.pre_shift), a);
188 }
189
190 Temp increment_dst = pre_shift_dst;
191 if (increment) {
192 increment_dst = (post_shift || multiply) ? bld.tmp(v1) : dst;
193 bld.vadd32(Definition(increment_dst), Operand((uint32_t) info.increment), pre_shift_dst);
194 }
195
196 Temp multiply_dst = increment_dst;
197 if (multiply) {
198 multiply_dst = post_shift ? bld.tmp(v1) : dst;
199 bld.vop3(aco_opcode::v_mul_hi_u32, Definition(multiply_dst), increment_dst,
200 bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand((uint32_t)info.multiplier)));
201 }
202
203 if (post_shift) {
204 bld.vop2(aco_opcode::v_lshrrev_b32, Definition(dst), Operand((uint32_t)info.post_shift), multiply_dst);
205 }
206 }
207
208 void emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, Temp dst)
209 {
210 Builder bld(ctx->program, ctx->block);
211 bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand(idx));
212 }
213
214
215 Temp emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, RegClass dst_rc)
216 {
217 /* no need to extract the whole vector */
218 if (src.regClass() == dst_rc) {
219 assert(idx == 0);
220 return src;
221 }
222 assert(src.size() > idx);
223 Builder bld(ctx->program, ctx->block);
224 auto it = ctx->allocated_vec.find(src.id());
225 /* the size check needs to be early because elements other than 0 may be garbage */
226 if (it != ctx->allocated_vec.end() && it->second[0].size() == dst_rc.size()) {
227 if (it->second[idx].regClass() == dst_rc) {
228 return it->second[idx];
229 } else {
230 assert(dst_rc.size() == it->second[idx].regClass().size());
231 assert(dst_rc.type() == RegType::vgpr && it->second[idx].type() == RegType::sgpr);
232 return bld.copy(bld.def(dst_rc), it->second[idx]);
233 }
234 }
235
236 if (src.size() == dst_rc.size()) {
237 assert(idx == 0);
238 return bld.copy(bld.def(dst_rc), src);
239 } else {
240 Temp dst = bld.tmp(dst_rc);
241 emit_extract_vector(ctx, src, idx, dst);
242 return dst;
243 }
244 }
245
246 void emit_split_vector(isel_context* ctx, Temp vec_src, unsigned num_components)
247 {
248 if (num_components == 1)
249 return;
250 if (ctx->allocated_vec.find(vec_src.id()) != ctx->allocated_vec.end())
251 return;
252 aco_ptr<Pseudo_instruction> split{create_instruction<Pseudo_instruction>(aco_opcode::p_split_vector, Format::PSEUDO, 1, num_components)};
253 split->operands[0] = Operand(vec_src);
254 std::array<Temp,4> elems;
255 for (unsigned i = 0; i < num_components; i++) {
256 elems[i] = {ctx->program->allocateId(), RegClass(vec_src.type(), vec_src.size() / num_components)};
257 split->definitions[i] = Definition(elems[i]);
258 }
259 ctx->block->instructions.emplace_back(std::move(split));
260 ctx->allocated_vec.emplace(vec_src.id(), elems);
261 }
262
263 /* This vector expansion uses a mask to determine which elements in the new vector
264 * come from the original vector. The other elements are undefined. */
265 void expand_vector(isel_context* ctx, Temp vec_src, Temp dst, unsigned num_components, unsigned mask)
266 {
267 emit_split_vector(ctx, vec_src, util_bitcount(mask));
268
269 if (vec_src == dst)
270 return;
271
272 Builder bld(ctx->program, ctx->block);
273 if (num_components == 1) {
274 if (dst.type() == RegType::sgpr)
275 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec_src);
276 else
277 bld.copy(Definition(dst), vec_src);
278 return;
279 }
280
281 unsigned component_size = dst.size() / num_components;
282 std::array<Temp,4> elems;
283
284 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
285 vec->definitions[0] = Definition(dst);
286 unsigned k = 0;
287 for (unsigned i = 0; i < num_components; i++) {
288 if (mask & (1 << i)) {
289 Temp src = emit_extract_vector(ctx, vec_src, k++, RegClass(vec_src.type(), component_size));
290 if (dst.type() == RegType::sgpr)
291 src = bld.as_uniform(src);
292 vec->operands[i] = Operand(src);
293 } else {
294 vec->operands[i] = Operand(0u);
295 }
296 elems[i] = vec->operands[i].getTemp();
297 }
298 ctx->block->instructions.emplace_back(std::move(vec));
299 ctx->allocated_vec.emplace(dst.id(), elems);
300 }
301
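/* Converts a boolean to its divergent (s2 lane mask) representation: an s2 value is
 * returned as-is, while an s1/scc boolean is expanded to -1 or 0 with s_cselect_b64,
 * optionally hinting the result into vcc. */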
302 Temp as_divergent_bool(isel_context *ctx, Temp val, bool vcc_hint)
303 {
304 if (val.regClass() == s2) {
305 return val;
306 } else {
307 assert(val.regClass() == s1);
308 Builder bld(ctx->program, ctx->block);
309 Definition& def = bld.sop2(aco_opcode::s_cselect_b64, bld.def(s2),
310 Operand((uint32_t) -1), Operand(0u), bld.scc(val)).def(0);
311 if (vcc_hint)
312 def.setHint(vcc);
313 return def.getTemp();
314 }
315 }
316
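/* Converts a boolean to its uniform (s1/scc) representation: an s1 value is returned
 * as-is, while an s2 lane mask is first passed through emit_wqm (so that uniform
 * booleans used in WQM are also computed in WQM, see the commit subject) and then
 * compared against zero with s_cmp_lg_u64. */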
317 Temp as_uniform_bool(isel_context *ctx, Temp val)
318 {
319 if (val.regClass() == s1) {
320 return val;
321 } else {
322 assert(val.regClass() == s2);
323 Builder bld(ctx->program, ctx->block);
324 /* if we're currently in WQM mode, ensure that the source is also computed in WQM */
325 return bld.sopc(aco_opcode::s_cmp_lg_u64, bld.def(s1, scc), Operand(0u), emit_wqm(ctx, val));
326 }
327 }
328
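/* Returns the Temp for a NIR ALU source, honoring the swizzle: sources with an
 * identity swizzle are returned directly; otherwise the swizzled components are
 * extracted and, for size > 1, recombined with p_create_vector and cached in
 * allocated_vec. */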
329 Temp get_alu_src(struct isel_context *ctx, nir_alu_src src, unsigned size=1)
330 {
331 if (src.src.ssa->num_components == 1 && src.swizzle[0] == 0 && size == 1)
332 return get_ssa_temp(ctx, src.src.ssa);
333
334 if (src.src.ssa->num_components == size) {
335 bool identity_swizzle = true;
336 for (unsigned i = 0; identity_swizzle && i < size; i++) {
337 if (src.swizzle[i] != i)
338 identity_swizzle = false;
339 }
340 if (identity_swizzle)
341 return get_ssa_temp(ctx, src.src.ssa);
342 }
343
344 Temp vec = get_ssa_temp(ctx, src.src.ssa);
345 unsigned elem_size = vec.size() / src.src.ssa->num_components;
346 assert(elem_size > 0); /* TODO: 8 and 16-bit vectors not supported */
347 assert(vec.size() % elem_size == 0);
348
349 RegClass elem_rc = RegClass(vec.type(), elem_size);
350 if (size == 1) {
351 return emit_extract_vector(ctx, vec, src.swizzle[0], elem_rc);
352 } else {
353 assert(size <= 4);
354 std::array<Temp,4> elems;
355 aco_ptr<Pseudo_instruction> vec_instr{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, size, 1)};
356 for (unsigned i = 0; i < size; ++i) {
357 elems[i] = emit_extract_vector(ctx, vec, src.swizzle[i], elem_rc);
358 vec_instr->operands[i] = Operand{elems[i]};
359 }
360 Temp dst{ctx->program->allocateId(), RegClass(vec.type(), elem_size * size)};
361 vec_instr->definitions[0] = Definition(dst);
362 ctx->block->instructions.emplace_back(std::move(vec_instr));
363 ctx->allocated_vec.emplace(dst.id(), elems);
364 return dst;
365 }
366 }
367
368 Temp convert_pointer_to_64_bit(isel_context *ctx, Temp ptr)
369 {
370 if (ptr.size() == 2)
371 return ptr;
372 Builder bld(ctx->program, ctx->block);
373 if (ptr.type() == RegType::vgpr)
374 ptr = bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), ptr);
375 return bld.pseudo(aco_opcode::p_create_vector, bld.def(s2),
376 ptr, Operand((unsigned)ctx->options->address32_hi));
377 }
378
379 void emit_sop2_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst, bool writes_scc)
380 {
381 aco_ptr<SOP2_instruction> sop2{create_instruction<SOP2_instruction>(op, Format::SOP2, 2, writes_scc ? 2 : 1)};
382 sop2->operands[0] = Operand(get_alu_src(ctx, instr->src[0]));
383 sop2->operands[1] = Operand(get_alu_src(ctx, instr->src[1]));
384 sop2->definitions[0] = Definition(dst);
385 if (writes_scc)
386 sop2->definitions[1] = Definition(ctx->program->allocateId(), scc, s1);
387 ctx->block->instructions.emplace_back(std::move(sop2));
388 }
389
390 void emit_vop2_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst, bool commutative, bool swap_srcs=false)
391 {
392 Builder bld(ctx->program, ctx->block);
393 Temp src0 = get_alu_src(ctx, instr->src[swap_srcs ? 1 : 0]);
394 Temp src1 = get_alu_src(ctx, instr->src[swap_srcs ? 0 : 1]);
395 if (src1.type() == RegType::sgpr) {
396 if (commutative && src0.type() == RegType::vgpr) {
397 Temp t = src0;
398 src0 = src1;
399 src1 = t;
400 } else if (src0.type() == RegType::vgpr &&
401 op != aco_opcode::v_madmk_f32 &&
402 op != aco_opcode::v_madak_f32 &&
403 op != aco_opcode::v_madmk_f16 &&
404 op != aco_opcode::v_madak_f16) {
405 /* If the instruction is not commutative, we emit a VOP3A instruction */
406 bld.vop2_e64(op, Definition(dst), src0, src1);
407 return;
408 } else {
409 src1 = bld.copy(bld.def(RegType::vgpr, src1.size()), src1); //TODO: as_vgpr
410 }
411 }
412 bld.vop2(op, Definition(dst), src0, src1);
413 }
414
415 void emit_vop3a_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst)
416 {
417 Temp src0 = get_alu_src(ctx, instr->src[0]);
418 Temp src1 = get_alu_src(ctx, instr->src[1]);
419 Temp src2 = get_alu_src(ctx, instr->src[2]);
420
421 /* ensure that the instruction has at most 1 sgpr operand
422 * The optimizer will inline constants for us */
423 if (src0.type() == RegType::sgpr && src1.type() == RegType::sgpr)
424 src0 = as_vgpr(ctx, src0);
425 if (src1.type() == RegType::sgpr && src2.type() == RegType::sgpr)
426 src1 = as_vgpr(ctx, src1);
427 if (src2.type() == RegType::sgpr && src0.type() == RegType::sgpr)
428 src2 = as_vgpr(ctx, src2);
429
430 Builder bld(ctx->program, ctx->block);
431 bld.vop3(op, Definition(dst), src0, src1, src2);
432 }
433
434 void emit_vop1_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst)
435 {
436 Builder bld(ctx->program, ctx->block);
437 bld.vop1(op, Definition(dst), get_alu_src(ctx, instr->src[0]));
438 }
439
440 void emit_vopc_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst)
441 {
442 Temp src0 = get_alu_src(ctx, instr->src[0]);
443 Temp src1 = get_alu_src(ctx, instr->src[1]);
444 aco_ptr<Instruction> vopc;
445 if (src1.type() == RegType::sgpr) {
446 if (src0.type() == RegType::vgpr) {
447 /* to swap the operands, we might also have to change the opcode */
448 switch (op) {
449 case aco_opcode::v_cmp_lt_f32:
450 op = aco_opcode::v_cmp_gt_f32;
451 break;
452 case aco_opcode::v_cmp_ge_f32:
453 op = aco_opcode::v_cmp_le_f32;
454 break;
455 case aco_opcode::v_cmp_lt_i32:
456 op = aco_opcode::v_cmp_gt_i32;
457 break;
458 case aco_opcode::v_cmp_ge_i32:
459 op = aco_opcode::v_cmp_le_i32;
460 break;
461 case aco_opcode::v_cmp_lt_u32:
462 op = aco_opcode::v_cmp_gt_u32;
463 break;
464 case aco_opcode::v_cmp_ge_u32:
465 op = aco_opcode::v_cmp_le_u32;
466 break;
467 case aco_opcode::v_cmp_lt_f64:
468 op = aco_opcode::v_cmp_gt_f64;
469 break;
470 case aco_opcode::v_cmp_ge_f64:
471 op = aco_opcode::v_cmp_le_f64;
472 break;
473 case aco_opcode::v_cmp_lt_i64:
474 op = aco_opcode::v_cmp_gt_i64;
475 break;
476 case aco_opcode::v_cmp_ge_i64:
477 op = aco_opcode::v_cmp_le_i64;
478 break;
479 case aco_opcode::v_cmp_lt_u64:
480 op = aco_opcode::v_cmp_gt_u64;
481 break;
482 case aco_opcode::v_cmp_ge_u64:
483 op = aco_opcode::v_cmp_le_u64;
484 break;
485 default: /* eq and ne are commutative */
486 break;
487 }
488 Temp t = src0;
489 src0 = src1;
490 src1 = t;
491 } else {
492 src1 = as_vgpr(ctx, src1);
493 }
494 }
495 Builder bld(ctx->program, ctx->block);
496 bld.vopc(op, Definition(dst), src0, src1).def(0).setHint(vcc);
497 }
498
499 void emit_comparison(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst)
500 {
501 if (dst.regClass() == s2) {
502 emit_vopc_instruction(ctx, instr, op, dst);
503 if (!ctx->divergent_vals[instr->dest.dest.ssa.index])
504 emit_split_vector(ctx, dst, 2);
505 } else if (dst.regClass() == s1) {
506 Temp src0 = get_alu_src(ctx, instr->src[0]);
507 Temp src1 = get_alu_src(ctx, instr->src[1]);
508 assert(src0.type() == RegType::sgpr && src1.type() == RegType::sgpr);
509
510 Builder bld(ctx->program, ctx->block);
511 bld.sopc(op, bld.scc(Definition(dst)), src0, src1);
512
513 } else {
514 assert(false);
515 }
516 }
517
518 void emit_boolean_logic(isel_context *ctx, nir_alu_instr *instr, aco_opcode op32, aco_opcode op64, Temp dst)
519 {
520 Builder bld(ctx->program, ctx->block);
521 Temp src0 = get_alu_src(ctx, instr->src[0]);
522 Temp src1 = get_alu_src(ctx, instr->src[1]);
523 if (dst.regClass() == s2) {
524 bld.sop2(op64, Definition(dst), bld.def(s1, scc),
525 as_divergent_bool(ctx, src0, false), as_divergent_bool(ctx, src1, false));
526 } else {
527 assert(dst.regClass() == s1);
528 bld.sop2(op32, bld.def(s1), bld.scc(Definition(dst)),
529 as_uniform_bool(ctx, src0), as_uniform_bool(ctx, src1));
530 }
531 }
532
533
534 void emit_bcsel(isel_context *ctx, nir_alu_instr *instr, Temp dst)
535 {
536 Builder bld(ctx->program, ctx->block);
537 Temp cond = get_alu_src(ctx, instr->src[0]);
538 Temp then = get_alu_src(ctx, instr->src[1]);
539 Temp els = get_alu_src(ctx, instr->src[2]);
540
541 if (dst.type() == RegType::vgpr) {
542 cond = as_divergent_bool(ctx, cond, true);
543
544 aco_ptr<Instruction> bcsel;
545 if (dst.size() == 1) {
546 then = as_vgpr(ctx, then);
547 els = as_vgpr(ctx, els);
548
549 bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), els, then, cond);
550 } else if (dst.size() == 2) {
551 Temp then_lo = bld.tmp(v1), then_hi = bld.tmp(v1);
552 bld.pseudo(aco_opcode::p_split_vector, Definition(then_lo), Definition(then_hi), then);
553 Temp else_lo = bld.tmp(v1), else_hi = bld.tmp(v1);
554 bld.pseudo(aco_opcode::p_split_vector, Definition(else_lo), Definition(else_hi), els);
555
556 Temp dst0 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_lo, then_lo, cond);
557 Temp dst1 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_hi, then_hi, cond);
558
559 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
560 } else {
561 fprintf(stderr, "Unimplemented NIR instr bit size: ");
562 nir_print_instr(&instr->instr, stderr);
563 fprintf(stderr, "\n");
564 }
565 return;
566 }
567
568 if (instr->dest.dest.ssa.bit_size != 1) { /* uniform condition and values in sgpr */
569 if (dst.regClass() == s1 || dst.regClass() == s2) {
570 assert((then.regClass() == s1 || then.regClass() == s2) && els.regClass() == then.regClass());
571 aco_opcode op = dst.regClass() == s1 ? aco_opcode::s_cselect_b32 : aco_opcode::s_cselect_b64;
572 bld.sop2(op, Definition(dst), then, els, bld.scc(as_uniform_bool(ctx, cond)));
573 } else {
574 fprintf(stderr, "Unimplemented uniform bcsel bit size: ");
575 nir_print_instr(&instr->instr, stderr);
576 fprintf(stderr, "\n");
577 }
578 return;
579 }
580
581 /* boolean bcsel */
582 assert(instr->dest.dest.ssa.bit_size == 1);
583
584 if (dst.regClass() == s1)
585 cond = as_uniform_bool(ctx, cond);
586
587 if (cond.regClass() == s1) { /* uniform selection */
588 aco_opcode op;
589 if (dst.regClass() == s2) {
590 op = aco_opcode::s_cselect_b64;
591 then = as_divergent_bool(ctx, then, false);
592 els = as_divergent_bool(ctx, els, false);
593 } else {
594 assert(dst.regClass() == s1);
595 op = aco_opcode::s_cselect_b32;
596 then = as_uniform_bool(ctx, then);
597 els = as_uniform_bool(ctx, els);
598 }
599 bld.sop2(op, Definition(dst), then, els, bld.scc(cond));
600 return;
601 }
602
603 /* divergent boolean bcsel
604 * this implements bcsel on bools: dst = s0 ? s1 : s2
605 * which is lowered to: dst = (s0 & s1) | (~s0 & s2) */
606 assert (dst.regClass() == s2);
607 then = as_divergent_bool(ctx, then, false);
608 els = as_divergent_bool(ctx, els, false);
609
610 if (cond.id() != then.id())
611 then = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), cond, then);
612
613 if (cond.id() == els.id())
614 bld.sop1(aco_opcode::s_mov_b64, Definition(dst), then);
615 else
616 bld.sop2(aco_opcode::s_or_b64, Definition(dst), bld.def(s1, scc), then,
617 bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.def(s1, scc), els, cond));
618 }
619
620 void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
621 {
622 if (!instr->dest.dest.is_ssa) {
623 fprintf(stderr, "nir alu dst not in ssa: ");
624 nir_print_instr(&instr->instr, stderr);
625 fprintf(stderr, "\n");
626 abort();
627 }
628 Builder bld(ctx->program, ctx->block);
629 Temp dst = get_ssa_temp(ctx, &instr->dest.dest.ssa);
630 switch(instr->op) {
631 case nir_op_vec2:
632 case nir_op_vec3:
633 case nir_op_vec4: {
634 std::array<Temp,4> elems;
635 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, instr->dest.dest.ssa.num_components, 1)};
636 for (unsigned i = 0; i < instr->dest.dest.ssa.num_components; ++i) {
637 elems[i] = get_alu_src(ctx, instr->src[i]);
638 vec->operands[i] = Operand{elems[i]};
639 }
640 vec->definitions[0] = Definition(dst);
641 ctx->block->instructions.emplace_back(std::move(vec));
642 ctx->allocated_vec.emplace(dst.id(), elems);
643 break;
644 }
645 case nir_op_mov: {
646 Temp src = get_alu_src(ctx, instr->src[0]);
647 aco_ptr<Instruction> mov;
648 if (dst.type() == RegType::sgpr) {
649 if (src.type() == RegType::vgpr)
650 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), src);
651 else if (src.regClass() == s1)
652 bld.sop1(aco_opcode::s_mov_b32, Definition(dst), src);
653 else if (src.regClass() == s2)
654 bld.sop1(aco_opcode::s_mov_b64, Definition(dst), src);
655 else
656 unreachable("wrong src register class for nir_op_mov");
657 } else if (dst.regClass() == v1) {
658 bld.vop1(aco_opcode::v_mov_b32, Definition(dst), src);
659 } else if (dst.regClass() == v2) {
660 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src);
661 } else {
662 nir_print_instr(&instr->instr, stderr);
663 unreachable("Should have been lowered to scalar.");
664 }
665 break;
666 }
667 case nir_op_inot: {
668 Temp src = get_alu_src(ctx, instr->src[0]);
669 /* uniform booleans */
670 if (instr->dest.dest.ssa.bit_size == 1 && dst.regClass() == s1) {
671 if (src.regClass() == s1) {
672 /* in this case, src is either 1 or 0 */
673 bld.sop2(aco_opcode::s_xor_b32, bld.def(s1), bld.scc(Definition(dst)), Operand(1u), src);
674 } else {
675 /* src is either exec_mask or 0 */
676 assert(src.regClass() == s2);
677 bld.sopc(aco_opcode::s_cmp_eq_u64, bld.scc(Definition(dst)), Operand(0u), src);
678 }
679 } else if (dst.regClass() == v1) {
680 emit_vop1_instruction(ctx, instr, aco_opcode::v_not_b32, dst);
681 } else if (dst.type() == RegType::sgpr) {
682 aco_opcode opcode = dst.size() == 1 ? aco_opcode::s_not_b32 : aco_opcode::s_not_b64;
683 bld.sop1(opcode, Definition(dst), bld.def(s1, scc), src);
684 } else {
685 fprintf(stderr, "Unimplemented NIR instr bit size: ");
686 nir_print_instr(&instr->instr, stderr);
687 fprintf(stderr, "\n");
688 }
689 break;
690 }
691 case nir_op_ineg: {
692 Temp src = get_alu_src(ctx, instr->src[0]);
693 if (dst.regClass() == v1) {
694 bld.vsub32(Definition(dst), Operand(0u), Operand(src));
695 } else if (dst.regClass() == s1) {
696 bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand((uint32_t) -1), src);
697 } else if (dst.size() == 2) {
698 Temp src0 = bld.tmp(dst.type(), 1);
699 Temp src1 = bld.tmp(dst.type(), 1);
700 bld.pseudo(aco_opcode::p_split_vector, Definition(src0), Definition(src1), src);
701
702 if (dst.regClass() == s2) {
703 Temp carry = bld.tmp(s1);
704 Temp dst0 = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(carry)), Operand(0u), src0);
705 Temp dst1 = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), Operand(0u), src1, carry);
706 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
707 } else {
708 Temp lower = bld.tmp(v1);
709 Temp borrow = bld.vsub32(Definition(lower), Operand(0u), src0, true).def(1).getTemp();
710 Temp upper = bld.vsub32(bld.def(v1), Operand(0u), src1, false, borrow);
711 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
712 }
713 } else {
714 fprintf(stderr, "Unimplemented NIR instr bit size: ");
715 nir_print_instr(&instr->instr, stderr);
716 fprintf(stderr, "\n");
717 }
718 break;
719 }
720 case nir_op_iabs: {
721 if (dst.regClass() == s1) {
722 bld.sop1(aco_opcode::s_abs_i32, Definition(dst), bld.def(s1, scc), get_alu_src(ctx, instr->src[0]));
723 } else if (dst.regClass() == v1) {
724 Temp src = get_alu_src(ctx, instr->src[0]);
725 bld.vop2(aco_opcode::v_max_i32, Definition(dst), src, bld.vsub32(bld.def(v1), Operand(0u), src));
726 } else {
727 fprintf(stderr, "Unimplemented NIR instr bit size: ");
728 nir_print_instr(&instr->instr, stderr);
729 fprintf(stderr, "\n");
730 }
731 break;
732 }
733 case nir_op_isign: {
734 Temp src = get_alu_src(ctx, instr->src[0]);
735 if (dst.regClass() == s1) {
736 Temp tmp = bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), src, Operand(31u));
737 Temp gtz = bld.sopc(aco_opcode::s_cmp_gt_i32, bld.def(s1, scc), src, Operand(0u));
738 bld.sop2(aco_opcode::s_add_i32, Definition(dst), bld.def(s1, scc), gtz, tmp);
739 } else if (dst.regClass() == s2) {
740 Temp neg = bld.sop2(aco_opcode::s_ashr_i64, bld.def(s2), bld.def(s1, scc), src, Operand(63u));
741 Temp neqz = bld.sopc(aco_opcode::s_cmp_lg_u64, bld.def(s1, scc), src, Operand(0u));
742 bld.sop2(aco_opcode::s_or_b64, Definition(dst), bld.def(s1, scc), neg, neqz);
743 } else if (dst.regClass() == v1) {
744 Temp tmp = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), src);
745 Temp gtz = bld.vopc(aco_opcode::v_cmp_ge_i32, bld.hint_vcc(bld.def(s2)), Operand(0u), src);
746 bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand(1u), tmp, gtz);
747 } else if (dst.regClass() == v2) {
748 Temp upper = emit_extract_vector(ctx, src, 1, v1);
749 Temp neg = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), upper);
750 Temp gtz = bld.vopc(aco_opcode::v_cmp_ge_i64, bld.hint_vcc(bld.def(s2)), Operand(0u), src);
751 Temp lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(1u), neg, gtz);
752 upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), neg, gtz);
753 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
754 } else {
755 fprintf(stderr, "Unimplemented NIR instr bit size: ");
756 nir_print_instr(&instr->instr, stderr);
757 fprintf(stderr, "\n");
758 }
759 break;
760 }
761 case nir_op_imax: {
762 if (dst.regClass() == v1) {
763 emit_vop2_instruction(ctx, instr, aco_opcode::v_max_i32, dst, true);
764 } else if (dst.regClass() == s1) {
765 emit_sop2_instruction(ctx, instr, aco_opcode::s_max_i32, dst, true);
766 } else {
767 fprintf(stderr, "Unimplemented NIR instr bit size: ");
768 nir_print_instr(&instr->instr, stderr);
769 fprintf(stderr, "\n");
770 }
771 break;
772 }
773 case nir_op_umax: {
774 if (dst.regClass() == v1) {
775 emit_vop2_instruction(ctx, instr, aco_opcode::v_max_u32, dst, true);
776 } else if (dst.regClass() == s1) {
777 emit_sop2_instruction(ctx, instr, aco_opcode::s_max_u32, dst, true);
778 } else {
779 fprintf(stderr, "Unimplemented NIR instr bit size: ");
780 nir_print_instr(&instr->instr, stderr);
781 fprintf(stderr, "\n");
782 }
783 break;
784 }
785 case nir_op_imin: {
786 if (dst.regClass() == v1) {
787 emit_vop2_instruction(ctx, instr, aco_opcode::v_min_i32, dst, true);
788 } else if (dst.regClass() == s1) {
789 emit_sop2_instruction(ctx, instr, aco_opcode::s_min_i32, dst, true);
790 } else {
791 fprintf(stderr, "Unimplemented NIR instr bit size: ");
792 nir_print_instr(&instr->instr, stderr);
793 fprintf(stderr, "\n");
794 }
795 break;
796 }
797 case nir_op_umin: {
798 if (dst.regClass() == v1) {
799 emit_vop2_instruction(ctx, instr, aco_opcode::v_min_u32, dst, true);
800 } else if (dst.regClass() == s1) {
801 emit_sop2_instruction(ctx, instr, aco_opcode::s_min_u32, dst, true);
802 } else {
803 fprintf(stderr, "Unimplemented NIR instr bit size: ");
804 nir_print_instr(&instr->instr, stderr);
805 fprintf(stderr, "\n");
806 }
807 break;
808 }
809 case nir_op_ior: {
810 if (instr->dest.dest.ssa.bit_size == 1) {
811 emit_boolean_logic(ctx, instr, aco_opcode::s_or_b32, aco_opcode::s_or_b64, dst);
812 } else if (dst.regClass() == v1) {
813 emit_vop2_instruction(ctx, instr, aco_opcode::v_or_b32, dst, true);
814 } else if (dst.regClass() == s1) {
815 emit_sop2_instruction(ctx, instr, aco_opcode::s_or_b32, dst, true);
816 } else if (dst.regClass() == s2) {
817 emit_sop2_instruction(ctx, instr, aco_opcode::s_or_b64, dst, true);
818 } else {
819 fprintf(stderr, "Unimplemented NIR instr bit size: ");
820 nir_print_instr(&instr->instr, stderr);
821 fprintf(stderr, "\n");
822 }
823 break;
824 }
825 case nir_op_iand: {
826 if (instr->dest.dest.ssa.bit_size == 1) {
827 emit_boolean_logic(ctx, instr, aco_opcode::s_and_b32, aco_opcode::s_and_b64, dst);
828 } else if (dst.regClass() == v1) {
829 emit_vop2_instruction(ctx, instr, aco_opcode::v_and_b32, dst, true);
830 } else if (dst.regClass() == s1) {
831 emit_sop2_instruction(ctx, instr, aco_opcode::s_and_b32, dst, true);
832 } else if (dst.regClass() == s2) {
833 emit_sop2_instruction(ctx, instr, aco_opcode::s_and_b64, dst, true);
834 } else {
835 fprintf(stderr, "Unimplemented NIR instr bit size: ");
836 nir_print_instr(&instr->instr, stderr);
837 fprintf(stderr, "\n");
838 }
839 break;
840 }
841 case nir_op_ixor: {
842 if (instr->dest.dest.ssa.bit_size == 1) {
843 emit_boolean_logic(ctx, instr, aco_opcode::s_xor_b32, aco_opcode::s_xor_b64, dst);
844 } else if (dst.regClass() == v1) {
845 emit_vop2_instruction(ctx, instr, aco_opcode::v_xor_b32, dst, true);
846 } else if (dst.regClass() == s1) {
847 emit_sop2_instruction(ctx, instr, aco_opcode::s_xor_b32, dst, true);
848 } else if (dst.regClass() == s2) {
849 emit_sop2_instruction(ctx, instr, aco_opcode::s_xor_b64, dst, true);
850 } else {
851 fprintf(stderr, "Unimplemented NIR instr bit size: ");
852 nir_print_instr(&instr->instr, stderr);
853 fprintf(stderr, "\n");
854 }
855 break;
856 }
857 case nir_op_ushr: {
858 if (dst.regClass() == v1) {
859 emit_vop2_instruction(ctx, instr, aco_opcode::v_lshrrev_b32, dst, false, true);
860 } else if (dst.regClass() == v2) {
861 bld.vop3(aco_opcode::v_lshrrev_b64, Definition(dst),
862 get_alu_src(ctx, instr->src[1]), get_alu_src(ctx, instr->src[0]));
863 } else if (dst.regClass() == s2) {
864 emit_sop2_instruction(ctx, instr, aco_opcode::s_lshr_b64, dst, true);
865 } else if (dst.regClass() == s1) {
866 emit_sop2_instruction(ctx, instr, aco_opcode::s_lshr_b32, dst, true);
867 } else {
868 fprintf(stderr, "Unimplemented NIR instr bit size: ");
869 nir_print_instr(&instr->instr, stderr);
870 fprintf(stderr, "\n");
871 }
872 break;
873 }
874 case nir_op_ishl: {
875 if (dst.regClass() == v1) {
876 emit_vop2_instruction(ctx, instr, aco_opcode::v_lshlrev_b32, dst, false, true);
877 } else if (dst.regClass() == v2) {
878 bld.vop3(aco_opcode::v_lshlrev_b64, Definition(dst),
879 get_alu_src(ctx, instr->src[1]), get_alu_src(ctx, instr->src[0]));
880 } else if (dst.regClass() == s1) {
881 emit_sop2_instruction(ctx, instr, aco_opcode::s_lshl_b32, dst, true);
882 } else if (dst.regClass() == s2) {
883 emit_sop2_instruction(ctx, instr, aco_opcode::s_lshl_b64, dst, true);
884 } else {
885 fprintf(stderr, "Unimplemented NIR instr bit size: ");
886 nir_print_instr(&instr->instr, stderr);
887 fprintf(stderr, "\n");
888 }
889 break;
890 }
891 case nir_op_ishr: {
892 if (dst.regClass() == v1) {
893 emit_vop2_instruction(ctx, instr, aco_opcode::v_ashrrev_i32, dst, false, true);
894 } else if (dst.regClass() == v2) {
895 bld.vop3(aco_opcode::v_ashrrev_i64, Definition(dst),
896 get_alu_src(ctx, instr->src[1]), get_alu_src(ctx, instr->src[0]));
897 } else if (dst.regClass() == s1) {
898 emit_sop2_instruction(ctx, instr, aco_opcode::s_ashr_i32, dst, true);
899 } else if (dst.regClass() == s2) {
900 emit_sop2_instruction(ctx, instr, aco_opcode::s_ashr_i64, dst, true);
901 } else {
902 fprintf(stderr, "Unimplemented NIR instr bit size: ");
903 nir_print_instr(&instr->instr, stderr);
904 fprintf(stderr, "\n");
905 }
906 break;
907 }
908 case nir_op_find_lsb: {
909 Temp src = get_alu_src(ctx, instr->src[0]);
910 if (src.regClass() == s1) {
911 bld.sop1(aco_opcode::s_ff1_i32_b32, Definition(dst), src);
912 } else if (src.regClass() == v1) {
913 emit_vop1_instruction(ctx, instr, aco_opcode::v_ffbl_b32, dst);
914 } else if (src.regClass() == s2) {
915 bld.sop1(aco_opcode::s_ff1_i32_b64, Definition(dst), src);
916 } else {
917 fprintf(stderr, "Unimplemented NIR instr bit size: ");
918 nir_print_instr(&instr->instr, stderr);
919 fprintf(stderr, "\n");
920 }
921 break;
922 }
923 case nir_op_ufind_msb:
924 case nir_op_ifind_msb: {
925 Temp src = get_alu_src(ctx, instr->src[0]);
926 if (src.regClass() == s1 || src.regClass() == s2) {
927 aco_opcode op = src.regClass() == s2 ?
928 (instr->op == nir_op_ufind_msb ? aco_opcode::s_flbit_i32_b64 : aco_opcode::s_flbit_i32_i64) :
929 (instr->op == nir_op_ufind_msb ? aco_opcode::s_flbit_i32_b32 : aco_opcode::s_flbit_i32);
930 Temp msb_rev = bld.sop1(op, bld.def(s1), src);
931
932 Builder::Result sub = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc),
933 Operand(src.size() * 32u - 1u), msb_rev);
934 Temp msb = sub.def(0).getTemp();
935 Temp carry = sub.def(1).getTemp();
936
937 bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand((uint32_t)-1), msb, carry);
938 } else if (src.regClass() == v1) {
939 aco_opcode op = instr->op == nir_op_ufind_msb ? aco_opcode::v_ffbh_u32 : aco_opcode::v_ffbh_i32;
940 Temp msb_rev = bld.tmp(v1);
941 emit_vop1_instruction(ctx, instr, op, msb_rev);
942 Temp msb = bld.tmp(v1);
943 Temp carry = bld.vsub32(Definition(msb), Operand(31u), Operand(msb_rev), true).def(1).getTemp();
944 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), msb, Operand((uint32_t)-1), carry);
945 } else {
946 fprintf(stderr, "Unimplemented NIR instr bit size: ");
947 nir_print_instr(&instr->instr, stderr);
948 fprintf(stderr, "\n");
949 }
950 break;
951 }
952 case nir_op_bitfield_reverse: {
953 if (dst.regClass() == s1) {
954 bld.sop1(aco_opcode::s_brev_b32, Definition(dst), get_alu_src(ctx, instr->src[0]));
955 } else if (dst.regClass() == v1) {
956 bld.vop1(aco_opcode::v_bfrev_b32, Definition(dst), get_alu_src(ctx, instr->src[0]));
957 } else {
958 fprintf(stderr, "Unimplemented NIR instr bit size: ");
959 nir_print_instr(&instr->instr, stderr);
960 fprintf(stderr, "\n");
961 }
962 break;
963 }
964 case nir_op_iadd: {
965 if (dst.regClass() == s1) {
966 emit_sop2_instruction(ctx, instr, aco_opcode::s_add_u32, dst, true);
967 break;
968 }
969
970 Temp src0 = get_alu_src(ctx, instr->src[0]);
971 Temp src1 = get_alu_src(ctx, instr->src[1]);
972 if (dst.regClass() == v1) {
973 bld.vadd32(Definition(dst), Operand(src0), Operand(src1));
974 break;
975 }
976
977 assert(src0.size() == 2 && src1.size() == 2);
978 Temp src00 = bld.tmp(src0.type(), 1);
979 Temp src01 = bld.tmp(dst.type(), 1);
980 bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
981 Temp src10 = bld.tmp(src1.type(), 1);
982 Temp src11 = bld.tmp(dst.type(), 1);
983 bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
984
985 if (dst.regClass() == s2) {
986 Temp carry = bld.tmp(s1);
987 Temp dst0 = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10);
988 Temp dst1 = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), src01, src11, bld.scc(carry));
989 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
990 } else if (dst.regClass() == v2) {
991 Temp dst0 = bld.tmp(v1);
992 Temp carry = bld.vadd32(Definition(dst0), src00, src10, true).def(1).getTemp();
993 Temp dst1 = bld.vadd32(bld.def(v1), src01, src11, false, carry);
994 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
995 } else {
996 fprintf(stderr, "Unimplemented NIR instr bit size: ");
997 nir_print_instr(&instr->instr, stderr);
998 fprintf(stderr, "\n");
999 }
1000 break;
1001 }
1002 case nir_op_uadd_sat: {
1003 Temp src0 = get_alu_src(ctx, instr->src[0]);
1004 Temp src1 = get_alu_src(ctx, instr->src[1]);
1005 if (dst.regClass() == s1) {
1006 Temp tmp = bld.tmp(s1), carry = bld.tmp(s1);
1007 bld.sop2(aco_opcode::s_add_u32, Definition(tmp), bld.scc(Definition(carry)),
1008 src0, src1);
1009 bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand((uint32_t) -1), tmp, bld.scc(carry));
1010 } else if (dst.regClass() == v1) {
1011 if (ctx->options->chip_class >= GFX9) {
1012 aco_ptr<VOP3A_instruction> add{create_instruction<VOP3A_instruction>(aco_opcode::v_add_u32, asVOP3(Format::VOP2), 2, 1)};
1013 add->operands[0] = Operand(src0);
1014 add->operands[1] = Operand(src1);
1015 add->definitions[0] = Definition(dst);
1016 add->clamp = 1;
1017 ctx->block->instructions.emplace_back(std::move(add));
1018 } else {
1019 if (src1.regClass() != v1)
1020 std::swap(src0, src1);
1021 assert(src1.regClass() == v1);
1022 Temp tmp = bld.tmp(v1);
1023 Temp carry = bld.vadd32(Definition(tmp), src0, src1, true).def(1).getTemp();
1024 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), tmp, Operand((uint32_t) -1), carry);
1025 }
1026 } else {
1027 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1028 nir_print_instr(&instr->instr, stderr);
1029 fprintf(stderr, "\n");
1030 }
1031 break;
1032 }
1033 case nir_op_uadd_carry: {
1034 Temp src0 = get_alu_src(ctx, instr->src[0]);
1035 Temp src1 = get_alu_src(ctx, instr->src[1]);
1036 if (dst.regClass() == s1) {
1037 bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(dst)), src0, src1);
1038 break;
1039 }
1040 if (dst.regClass() == v1) {
1041 Temp carry = bld.vadd32(bld.def(v1), src0, src1, true).def(1).getTemp();
1042 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand(1u), carry);
1043 break;
1044 }
1045
1046 Temp src00 = bld.tmp(src0.type(), 1);
1047 Temp src01 = bld.tmp(dst.type(), 1);
1048 bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1049 Temp src10 = bld.tmp(src1.type(), 1);
1050 Temp src11 = bld.tmp(dst.type(), 1);
1051 bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1052 if (dst.regClass() == s2) {
1053 Temp carry = bld.tmp(s1);
1054 bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10);
1055 carry = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.scc(bld.def(s1)), src01, src11, bld.scc(carry)).def(1).getTemp();
1056 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), carry, Operand(0u));
1057 } else if (dst.regClass() == v2) {
1058 Temp carry = bld.vadd32(bld.def(v1), src00, src10, true).def(1).getTemp();
1059 carry = bld.vadd32(bld.def(v1), src01, src11, true, carry).def(1).getTemp();
1060 carry = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), Operand(1u), carry);
1061 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), carry, Operand(0u));
1062 } else {
1063 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1064 nir_print_instr(&instr->instr, stderr);
1065 fprintf(stderr, "\n");
1066 }
1067 break;
1068 }
1069 case nir_op_isub: {
1070 if (dst.regClass() == s1) {
1071 emit_sop2_instruction(ctx, instr, aco_opcode::s_sub_i32, dst, true);
1072 break;
1073 }
1074
1075 Temp src0 = get_alu_src(ctx, instr->src[0]);
1076 Temp src1 = get_alu_src(ctx, instr->src[1]);
1077 if (dst.regClass() == v1) {
1078 bld.vsub32(Definition(dst), src0, src1);
1079 break;
1080 }
1081
1082 Temp src00 = bld.tmp(src0.type(), 1);
1083 Temp src01 = bld.tmp(dst.type(), 1);
1084 bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1085 Temp src10 = bld.tmp(src1.type(), 1);
1086 Temp src11 = bld.tmp(dst.type(), 1);
1087 bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1088 if (dst.regClass() == s2) {
1089 Temp carry = bld.tmp(s1);
1090 Temp dst0 = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10);
1091 Temp dst1 = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), src01, src11, carry);
1092 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
1093 } else if (dst.regClass() == v2) {
1094 Temp lower = bld.tmp(v1);
1095 Temp borrow = bld.vsub32(Definition(lower), src00, src10, true).def(1).getTemp();
1096 Temp upper = bld.vsub32(bld.def(v1), src01, src11, false, borrow);
1097 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1098 } else {
1099 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1100 nir_print_instr(&instr->instr, stderr);
1101 fprintf(stderr, "\n");
1102 }
1103 break;
1104 }
1105 case nir_op_usub_borrow: {
1106 Temp src0 = get_alu_src(ctx, instr->src[0]);
1107 Temp src1 = get_alu_src(ctx, instr->src[1]);
1108 if (dst.regClass() == s1) {
1109 bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(dst)), src0, src1);
1110 break;
1111 } else if (dst.regClass() == v1) {
1112 Temp borrow = bld.vsub32(bld.def(v1), src0, src1, true).def(1).getTemp();
1113 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand(1u), borrow);
1114 break;
1115 }
1116
1117 Temp src00 = bld.tmp(src0.type(), 1);
1118 Temp src01 = bld.tmp(dst.type(), 1);
1119 bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1120 Temp src10 = bld.tmp(src1.type(), 1);
1121 Temp src11 = bld.tmp(dst.type(), 1);
1122 bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1123 if (dst.regClass() == s2) {
1124 Temp borrow = bld.tmp(s1);
1125 bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), src00, src10);
1126 borrow = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.scc(bld.def(s1)), src01, src11, bld.scc(borrow)).def(1).getTemp();
1127 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), borrow, Operand(0u));
1128 } else if (dst.regClass() == v2) {
1129 Temp borrow = bld.vsub32(bld.def(v1), src00, src10, true).def(1).getTemp();
1130 borrow = bld.vsub32(bld.def(v1), src01, src11, true, Operand(borrow)).def(1).getTemp();
1131 borrow = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), Operand(1u), borrow);
1132 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), borrow, Operand(0u));
1133 } else {
1134 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1135 nir_print_instr(&instr->instr, stderr);
1136 fprintf(stderr, "\n");
1137 }
1138 break;
1139 }
1140 case nir_op_imul: {
1141 if (dst.regClass() == v1) {
1142 bld.vop3(aco_opcode::v_mul_lo_u32, Definition(dst),
1143 get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1144 } else if (dst.regClass() == s1) {
1145 emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_i32, dst, false);
1146 } else {
1147 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1148 nir_print_instr(&instr->instr, stderr);
1149 fprintf(stderr, "\n");
1150 }
1151 break;
1152 }
1153 case nir_op_umul_high: {
1154 if (dst.regClass() == v1) {
1155 bld.vop3(aco_opcode::v_mul_hi_u32, Definition(dst), get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1156 } else if (dst.regClass() == s1 && ctx->options->chip_class >= GFX9) {
1157 bld.sop2(aco_opcode::s_mul_hi_u32, Definition(dst), get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1158 } else if (dst.regClass() == s1) {
1159 Temp tmp = bld.vop3(aco_opcode::v_mul_hi_u32, bld.def(v1), get_alu_src(ctx, instr->src[0]),
1160 as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
1161 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
1162 } else {
1163 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1164 nir_print_instr(&instr->instr, stderr);
1165 fprintf(stderr, "\n");
1166 }
1167 break;
1168 }
1169 case nir_op_imul_high: {
1170 if (dst.regClass() == v1) {
1171 bld.vop3(aco_opcode::v_mul_hi_i32, Definition(dst), get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1172 } else if (dst.regClass() == s1 && ctx->options->chip_class >= GFX9) {
1173 bld.sop2(aco_opcode::s_mul_hi_i32, Definition(dst), get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1174 } else if (dst.regClass() == s1) {
1175 Temp tmp = bld.vop3(aco_opcode::v_mul_hi_i32, bld.def(v1), get_alu_src(ctx, instr->src[0]),
1176 as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
1177 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
1178 } else {
1179 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1180 nir_print_instr(&instr->instr, stderr);
1181 fprintf(stderr, "\n");
1182 }
1183 break;
1184 }
1185 case nir_op_fmul: {
1186 if (dst.size() == 1) {
1187 emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_f32, dst, true);
1188 } else if (dst.size() == 2) {
1189 bld.vop3(aco_opcode::v_mul_f64, Definition(dst), get_alu_src(ctx, instr->src[0]),
1190 as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
1191 } else {
1192 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1193 nir_print_instr(&instr->instr, stderr);
1194 fprintf(stderr, "\n");
1195 }
1196 break;
1197 }
1198 case nir_op_fadd: {
1199 if (dst.size() == 1) {
1200 emit_vop2_instruction(ctx, instr, aco_opcode::v_add_f32, dst, true);
1201 } else if (dst.size() == 2) {
1202 bld.vop3(aco_opcode::v_add_f64, Definition(dst), get_alu_src(ctx, instr->src[0]),
1203 as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
1204 } else {
1205 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1206 nir_print_instr(&instr->instr, stderr);
1207 fprintf(stderr, "\n");
1208 }
1209 break;
1210 }
1211 case nir_op_fsub: {
1212 Temp src0 = get_alu_src(ctx, instr->src[0]);
1213 Temp src1 = get_alu_src(ctx, instr->src[1]);
1214 if (dst.size() == 1) {
1215 if (src1.type() == RegType::vgpr || src0.type() != RegType::vgpr)
1216 emit_vop2_instruction(ctx, instr, aco_opcode::v_sub_f32, dst, false);
1217 else
1218 emit_vop2_instruction(ctx, instr, aco_opcode::v_subrev_f32, dst, true);
1219 } else if (dst.size() == 2) {
1220 Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst),
1221 get_alu_src(ctx, instr->src[0]),
1222 as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
1223 VOP3A_instruction* sub = static_cast<VOP3A_instruction*>(add);
1224 sub->neg[1] = true;
1225 } else {
1226 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1227 nir_print_instr(&instr->instr, stderr);
1228 fprintf(stderr, "\n");
1229 }
1230 break;
1231 }
1232 case nir_op_fmax: {
1233 if (dst.size() == 1) {
1234 emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f32, dst, true);
1235 } else if (dst.size() == 2) {
1236 bld.vop3(aco_opcode::v_max_f64, Definition(dst),
1237 get_alu_src(ctx, instr->src[0]),
1238 as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
1239 } else {
1240 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1241 nir_print_instr(&instr->instr, stderr);
1242 fprintf(stderr, "\n");
1243 }
1244 break;
1245 }
1246 case nir_op_fmin: {
1247 if (dst.size() == 1) {
1248 emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f32, dst, true);
1249 } else if (dst.size() == 2) {
1250 bld.vop3(aco_opcode::v_min_f64, Definition(dst),
1251 get_alu_src(ctx, instr->src[0]),
1252 as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
1253 } else {
1254 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1255 nir_print_instr(&instr->instr, stderr);
1256 fprintf(stderr, "\n");
1257 }
1258 break;
1259 }
1260 case nir_op_fmax3: {
1261 if (dst.size() == 1) {
1262 emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_f32, dst);
1263 } else {
1264 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1265 nir_print_instr(&instr->instr, stderr);
1266 fprintf(stderr, "\n");
1267 }
1268 break;
1269 }
1270 case nir_op_fmin3: {
1271 if (dst.size() == 1) {
1272 emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_f32, dst);
1273 } else {
1274 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1275 nir_print_instr(&instr->instr, stderr);
1276 fprintf(stderr, "\n");
1277 }
1278 break;
1279 }
1280 case nir_op_fmed3: {
1281 if (dst.size() == 1) {
1282 emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_f32, dst);
1283 } else {
1284 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1285 nir_print_instr(&instr->instr, stderr);
1286 fprintf(stderr, "\n");
1287 }
1288 break;
1289 }
1290 case nir_op_umax3: {
1291 if (dst.size() == 1) {
1292 emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_u32, dst);
1293 } else {
1294 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1295 nir_print_instr(&instr->instr, stderr);
1296 fprintf(stderr, "\n");
1297 }
1298 break;
1299 }
1300 case nir_op_umin3: {
1301 if (dst.size() == 1) {
1302 emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_u32, dst);
1303 } else {
1304 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1305 nir_print_instr(&instr->instr, stderr);
1306 fprintf(stderr, "\n");
1307 }
1308 break;
1309 }
1310 case nir_op_umed3: {
1311 if (dst.size() == 1) {
1312 emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_u32, dst);
1313 } else {
1314 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1315 nir_print_instr(&instr->instr, stderr);
1316 fprintf(stderr, "\n");
1317 }
1318 break;
1319 }
1320 case nir_op_imax3: {
1321 if (dst.size() == 1) {
1322 emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_i32, dst);
1323 } else {
1324 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1325 nir_print_instr(&instr->instr, stderr);
1326 fprintf(stderr, "\n");
1327 }
1328 break;
1329 }
1330 case nir_op_imin3: {
1331 if (dst.size() == 1) {
1332 emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_i32, dst);
1333 } else {
1334 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1335 nir_print_instr(&instr->instr, stderr);
1336 fprintf(stderr, "\n");
1337 }
1338 break;
1339 }
1340 case nir_op_imed3: {
1341 if (dst.size() == 1) {
1342 emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_i32, dst);
1343 } else {
1344 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1345 nir_print_instr(&instr->instr, stderr);
1346 fprintf(stderr, "\n");
1347 }
1348 break;
1349 }
1350 case nir_op_cube_face_coord: {
1351 Temp in = get_alu_src(ctx, instr->src[0], 3);
1352 Temp src[3] = { emit_extract_vector(ctx, in, 0, v1),
1353 emit_extract_vector(ctx, in, 1, v1),
1354 emit_extract_vector(ctx, in, 2, v1) };
1355 Temp ma = bld.vop3(aco_opcode::v_cubema_f32, bld.def(v1), src[0], src[1], src[2]);
1356 ma = bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), ma);
1357 Temp sc = bld.vop3(aco_opcode::v_cubesc_f32, bld.def(v1), src[0], src[1], src[2]);
1358 Temp tc = bld.vop3(aco_opcode::v_cubetc_f32, bld.def(v1), src[0], src[1], src[2]);
1359 sc = bld.vop2(aco_opcode::v_madak_f32, bld.def(v1), sc, ma, Operand(0x3f000000u/*0.5*/));
1360 tc = bld.vop2(aco_opcode::v_madak_f32, bld.def(v1), tc, ma, Operand(0x3f000000u/*0.5*/));
1361 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), sc, tc);
1362 break;
1363 }
1364 case nir_op_cube_face_index: {
1365 Temp in = get_alu_src(ctx, instr->src[0], 3);
1366 Temp src[3] = { emit_extract_vector(ctx, in, 0, v1),
1367 emit_extract_vector(ctx, in, 1, v1),
1368 emit_extract_vector(ctx, in, 2, v1) };
1369 bld.vop3(aco_opcode::v_cubeid_f32, Definition(dst), src[0], src[1], src[2]);
1370 break;
1371 }
1372 case nir_op_bcsel: {
1373 emit_bcsel(ctx, instr, dst);
1374 break;
1375 }
1376 case nir_op_frsq: {
1377 if (dst.size() == 1) {
1378 emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f32, dst);
1379 } else if (dst.size() == 2) {
1380 emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f64, dst);
1381 } else {
1382 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1383 nir_print_instr(&instr->instr, stderr);
1384 fprintf(stderr, "\n");
1385 }
1386 break;
1387 }
1388 case nir_op_fneg: {
1389 Temp src = get_alu_src(ctx, instr->src[0]);
1390 if (dst.size() == 1) {
1391 bld.vop2(aco_opcode::v_xor_b32, Definition(dst), Operand(0x80000000u), as_vgpr(ctx, src));
1392 } else if (dst.size() == 2) {
1393 Temp upper = bld.tmp(v1), lower = bld.tmp(v1);
1394 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
1395 upper = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), Operand(0x80000000u), upper);
1396 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1397 } else {
1398 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1399 nir_print_instr(&instr->instr, stderr);
1400 fprintf(stderr, "\n");
1401 }
1402 break;
1403 }
1404 case nir_op_fabs: {
1405 Temp src = get_alu_src(ctx, instr->src[0]);
1406 if (dst.size() == 1) {
1407 bld.vop2(aco_opcode::v_and_b32, Definition(dst), Operand(0x7FFFFFFFu), as_vgpr(ctx, src));
1408 } else if (dst.size() == 2) {
1409 Temp upper = bld.tmp(v1), lower = bld.tmp(v1);
1410 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
1411 upper = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7FFFFFFFu), upper);
1412 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1413 } else {
1414 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1415 nir_print_instr(&instr->instr, stderr);
1416 fprintf(stderr, "\n");
1417 }
1418 break;
1419 }
1420 case nir_op_fsat: {
1421 Temp src = get_alu_src(ctx, instr->src[0]);
1422 if (dst.size() == 1) {
1423 bld.vop3(aco_opcode::v_med3_f32, Definition(dst), Operand(0u), Operand(0x3f800000u), src);
1424 } else if (dst.size() == 2) {
1425 Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst), src, Operand(0u));
1426 VOP3A_instruction* vop3 = static_cast<VOP3A_instruction*>(add);
1427 vop3->clamp = true;
1428 } else {
1429 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1430 nir_print_instr(&instr->instr, stderr);
1431 fprintf(stderr, "\n");
1432 }
1433 break;
1434 }
1435 case nir_op_flog2: {
1436 if (dst.size() == 1) {
1437 emit_vop1_instruction(ctx, instr, aco_opcode::v_log_f32, dst);
1438 } else {
1439 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1440 nir_print_instr(&instr->instr, stderr);
1441 fprintf(stderr, "\n");
1442 }
1443 break;
1444 }
1445 case nir_op_frcp: {
1446 if (dst.size() == 1) {
1447 emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f32, dst);
1448 } else if (dst.size() == 2) {
1449 emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f64, dst);
1450 } else {
1451 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1452 nir_print_instr(&instr->instr, stderr);
1453 fprintf(stderr, "\n");
1454 }
1455 break;
1456 }
1457 case nir_op_fexp2: {
1458 if (dst.size() == 1) {
1459 emit_vop1_instruction(ctx, instr, aco_opcode::v_exp_f32, dst);
1460 } else {
1461 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1462 nir_print_instr(&instr->instr, stderr);
1463 fprintf(stderr, "\n");
1464 }
1465 break;
1466 }
1467 case nir_op_fsqrt: {
1468 if (dst.size() == 1) {
1469 emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f32, dst);
1470 } else if (dst.size() == 2) {
1471 emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f64, dst);
1472 } else {
1473 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1474 nir_print_instr(&instr->instr, stderr);
1475 fprintf(stderr, "\n");
1476 }
1477 break;
1478 }
1479 case nir_op_ffract: {
1480 if (dst.size() == 1) {
1481 emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f32, dst);
1482 } else if (dst.size() == 2) {
1483 emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f64, dst);
1484 } else {
1485 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1486 nir_print_instr(&instr->instr, stderr);
1487 fprintf(stderr, "\n");
1488 }
1489 break;
1490 }
1491 case nir_op_ffloor: {
1492 if (dst.size() == 1) {
1493 emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f32, dst);
1494 } else if (dst.size() == 2) {
1495 emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f64, dst);
1496 } else {
1497 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1498 nir_print_instr(&instr->instr, stderr);
1499 fprintf(stderr, "\n");
1500 }
1501 break;
1502 }
1503 case nir_op_fceil: {
1504 if (dst.size() == 1) {
1505 emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f32, dst);
1506 } else if (dst.size() == 2) {
1507 emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f64, dst);
1508 } else {
1509 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1510 nir_print_instr(&instr->instr, stderr);
1511 fprintf(stderr, "\n");
1512 }
1513 break;
1514 }
1515 case nir_op_ftrunc: {
1516 if (dst.size() == 1) {
1517 emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f32, dst);
1518 } else if (dst.size() == 2) {
1519 emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f64, dst);
1520 } else {
1521 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1522 nir_print_instr(&instr->instr, stderr);
1523 fprintf(stderr, "\n");
1524 }
1525 break;
1526 }
1527 case nir_op_fround_even: {
1528 if (dst.size() == 1) {
1529 emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f32, dst);
1530 } else if (dst.size() == 2) {
1531 emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f64, dst);
1532 } else {
1533 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1534 nir_print_instr(&instr->instr, stderr);
1535 fprintf(stderr, "\n");
1536 }
1537 break;
1538 }
1539 case nir_op_fsin:
1540 case nir_op_fcos: {
1541 Temp src = get_alu_src(ctx, instr->src[0]);
1542 aco_ptr<Instruction> norm;
1543 if (dst.size() == 1) {
1544 Temp tmp;
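         /* Note (added): 0x3e22f983 is ~0.15915494, i.e. 1/(2*pi) despite the operand
          * being named half_pi below; v_sin_f32/v_cos_f32 take their input scaled to
          * fractions of a full period, so the angle is pre-multiplied by this constant. */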
1545 Operand half_pi(0x3e22f983u);
1546 if (src.type() == RegType::sgpr)
1547 tmp = bld.vop2_e64(aco_opcode::v_mul_f32, bld.def(v1), half_pi, src);
1548 else
1549 tmp = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), half_pi, src);
1550
1551 /* before GFX9, v_sin_f32 and v_cos_f32 had a valid input domain of [-256, +256] */
1552 if (ctx->options->chip_class < GFX9)
1553 tmp = bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), tmp);
1554
1555 aco_opcode opcode = instr->op == nir_op_fsin ? aco_opcode::v_sin_f32 : aco_opcode::v_cos_f32;
1556 bld.vop1(opcode, Definition(dst), tmp);
1557 } else {
1558 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1559 nir_print_instr(&instr->instr, stderr);
1560 fprintf(stderr, "\n");
1561 }
1562 break;
1563 }
1564 case nir_op_ldexp: {
1565 if (dst.size() == 1) {
1566 bld.vop3(aco_opcode::v_ldexp_f32, Definition(dst),
1567 as_vgpr(ctx, get_alu_src(ctx, instr->src[0])),
1568 get_alu_src(ctx, instr->src[1]));
1569 } else if (dst.size() == 2) {
1570 bld.vop3(aco_opcode::v_ldexp_f64, Definition(dst),
1571 as_vgpr(ctx, get_alu_src(ctx, instr->src[0])),
1572 get_alu_src(ctx, instr->src[1]));
1573 } else {
1574 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1575 nir_print_instr(&instr->instr, stderr);
1576 fprintf(stderr, "\n");
1577 }
1578 break;
1579 }
1580 case nir_op_frexp_sig: {
1581 if (dst.size() == 1) {
1582 bld.vop1(aco_opcode::v_frexp_mant_f32, Definition(dst),
1583 get_alu_src(ctx, instr->src[0]));
1584 } else if (dst.size() == 2) {
1585 bld.vop1(aco_opcode::v_frexp_mant_f64, Definition(dst),
1586 get_alu_src(ctx, instr->src[0]));
1587 } else {
1588 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1589 nir_print_instr(&instr->instr, stderr);
1590 fprintf(stderr, "\n");
1591 }
1592 break;
1593 }
1594 case nir_op_frexp_exp: {
1595 if (instr->src[0].src.ssa->bit_size == 32) {
1596 bld.vop1(aco_opcode::v_frexp_exp_i32_f32, Definition(dst),
1597 get_alu_src(ctx, instr->src[0]));
1598 } else if (instr->src[0].src.ssa->bit_size == 64) {
1599 bld.vop1(aco_opcode::v_frexp_exp_i32_f64, Definition(dst),
1600 get_alu_src(ctx, instr->src[0]));
1601 } else {
1602 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1603 nir_print_instr(&instr->instr, stderr);
1604 fprintf(stderr, "\n");
1605 }
1606 break;
1607 }
1608 case nir_op_fsign: {
1609 Temp src = as_vgpr(ctx, get_alu_src(ctx, instr->src[0]));
1610 if (dst.size() == 1) {
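         /* fsign via two cndmasks: values greater than zero are first replaced by +1.0,
          * then anything still not >= 0 is replaced by -1.0; +/-0.0 passes through unchanged. */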
1611 Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f32, bld.hint_vcc(bld.def(s2)), Operand(0u), src);
1612 src = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0x3f800000u), src, cond);
1613 cond = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(s2)), Operand(0u), src);
1614 bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0xbf800000u), src, cond);
1615 } else if (dst.size() == 2) {
1616 Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f64, bld.hint_vcc(bld.def(s2)), Operand(0u), src);
1617 Temp tmp = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0x3FF00000u));
1618          Temp upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp, emit_extract_vector(ctx, src, 1, v1), cond);
1619
1620 cond = bld.vopc(aco_opcode::v_cmp_le_f64, bld.hint_vcc(bld.def(s2)), Operand(0u), src);
1621 tmp = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0xBFF00000u));
1622 upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), tmp, upper, cond);
1623
1624 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand(0u), upper);
1625 } else {
1626 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1627 nir_print_instr(&instr->instr, stderr);
1628 fprintf(stderr, "\n");
1629 }
1630 break;
1631 }
1632 case nir_op_f2f32: {
1633 if (instr->src[0].src.ssa->bit_size == 64) {
1634 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f64, dst);
1635 } else {
1636 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1637 nir_print_instr(&instr->instr, stderr);
1638 fprintf(stderr, "\n");
1639 }
1640 break;
1641 }
1642 case nir_op_f2f64: {
1643 if (instr->src[0].src.ssa->bit_size == 32) {
1644 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f64_f32, dst);
1645 } else {
1646 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1647 nir_print_instr(&instr->instr, stderr);
1648 fprintf(stderr, "\n");
1649 }
1650 break;
1651 }
1652 case nir_op_i2f32: {
1653 assert(dst.size() == 1);
1654 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_i32, dst);
1655 break;
1656 }
1657 case nir_op_i2f64: {
1658 if (instr->src[0].src.ssa->bit_size == 32) {
1659 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f64_i32, dst);
1660 } else if (instr->src[0].src.ssa->bit_size == 64) {
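         /* 64-bit int to double: convert each 32-bit half separately (the low half as
          * unsigned, the high half as signed), scale the high half by 2^32 with
          * v_ldexp_f64 and add the two doubles. */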
1661 Temp src = get_alu_src(ctx, instr->src[0]);
1662 RegClass rc = RegClass(src.type(), 1);
1663 Temp lower = bld.tmp(rc), upper = bld.tmp(rc);
1664 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
1665 lower = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), lower);
1666 upper = bld.vop1(aco_opcode::v_cvt_f64_i32, bld.def(v2), upper);
1667 upper = bld.vop3(aco_opcode::v_ldexp_f64, bld.def(v2), upper, Operand(32u));
1668 bld.vop3(aco_opcode::v_add_f64, Definition(dst), lower, upper);
1669
1670 } else {
1671 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1672 nir_print_instr(&instr->instr, stderr);
1673 fprintf(stderr, "\n");
1674 }
1675 break;
1676 }
1677 case nir_op_u2f32: {
1678 assert(dst.size() == 1);
1679 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_u32, dst);
1680 break;
1681 }
1682 case nir_op_u2f64: {
1683 if (instr->src[0].src.ssa->bit_size == 32) {
1684 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f64_u32, dst);
1685 } else if (instr->src[0].src.ssa->bit_size == 64) {
1686 Temp src = get_alu_src(ctx, instr->src[0]);
1687 RegClass rc = RegClass(src.type(), 1);
1688 Temp lower = bld.tmp(rc), upper = bld.tmp(rc);
1689 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
1690 lower = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), lower);
1691 upper = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), upper);
1692 upper = bld.vop3(aco_opcode::v_ldexp_f64, bld.def(v2), upper, Operand(32u));
1693 bld.vop3(aco_opcode::v_add_f64, Definition(dst), lower, upper);
1694 } else {
1695 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1696 nir_print_instr(&instr->instr, stderr);
1697 fprintf(stderr, "\n");
1698 }
1699 break;
1700 }
1701 case nir_op_f2i32: {
1702 Temp src = get_alu_src(ctx, instr->src[0]);
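      /* The SALU has no float conversions, so the conversion always goes through a VALU
       * instruction; if the NIR destination is uniform, the result is copied back to an
       * SGPR with p_as_uniform. */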
1703 if (instr->src[0].src.ssa->bit_size == 32) {
1704 if (dst.type() == RegType::vgpr)
1705 bld.vop1(aco_opcode::v_cvt_i32_f32, Definition(dst), src);
1706 else
1707 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
1708 bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), src));
1709
1710 } else if (instr->src[0].src.ssa->bit_size == 64) {
1711 if (dst.type() == RegType::vgpr)
1712 bld.vop1(aco_opcode::v_cvt_i32_f64, Definition(dst), src);
1713 else
1714 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
1715 bld.vop1(aco_opcode::v_cvt_i32_f64, bld.def(v1), src));
1716
1717 } else {
1718 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1719 nir_print_instr(&instr->instr, stderr);
1720 fprintf(stderr, "\n");
1721 }
1722 break;
1723 }
1724 case nir_op_f2u32: {
1725 Temp src = get_alu_src(ctx, instr->src[0]);
1726 if (instr->src[0].src.ssa->bit_size == 32) {
1727 if (dst.type() == RegType::vgpr)
1728 bld.vop1(aco_opcode::v_cvt_u32_f32, Definition(dst), src);
1729 else
1730 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
1731 bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), src));
1732
1733 } else if (instr->src[0].src.ssa->bit_size == 64) {
1734 if (dst.type() == RegType::vgpr)
1735 bld.vop1(aco_opcode::v_cvt_u32_f64, Definition(dst), src);
1736 else
1737 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
1738 bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), src));
1739
1740 } else {
1741 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1742 nir_print_instr(&instr->instr, stderr);
1743 fprintf(stderr, "\n");
1744 }
1745 break;
1746 }
1747 case nir_op_f2i64: {
1748 Temp src = get_alu_src(ctx, instr->src[0]);
1749 if (instr->src[0].src.ssa->bit_size == 32 && dst.type() == RegType::vgpr) {
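         /* There is no direct f32->i64 conversion, so it is open-coded: rebuild the
          * 24-bit mantissa (implicit leading one included), place it in the upper dword
          * of a 64-bit value, shift it right by (63 - clamped exponent), saturate to
          * INT64_MAX when the exponent is >= 64 (signalled by the borrow of the
          * subtraction), and finally apply the sign by xor-ing with the sign mask and
          * subtracting it (two's-complement negation for negative inputs). */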
1750 Temp exponent = bld.vop1(aco_opcode::v_frexp_exp_i32_f32, bld.def(v1), src);
1751 exponent = bld.vop3(aco_opcode::v_med3_i32, bld.def(v1), Operand(0x0u), exponent, Operand(64u));
1752 Temp mantissa = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7fffffu), src);
1753 Temp sign = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), src);
1754 mantissa = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand(0x800000u), mantissa);
1755 mantissa = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(7u), mantissa);
1756 mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(0u), mantissa);
1757 Temp new_exponent = bld.tmp(v1);
1758 Temp borrow = bld.vsub32(Definition(new_exponent), Operand(63u), exponent, true).def(1).getTemp();
1759 mantissa = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), new_exponent, mantissa);
1760 Temp saturate = bld.vop1(aco_opcode::v_bfrev_b32, bld.def(v1), Operand(0xfffffffeu));
1761 Temp lower = bld.tmp(v1), upper = bld.tmp(v1);
1762 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
1763 lower = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), lower, Operand(0xffffffffu), borrow);
1764 upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), upper, saturate, borrow);
1765 lower = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), sign, lower);
1766 upper = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), sign, upper);
1767 Temp new_lower = bld.tmp(v1);
1768 borrow = bld.vsub32(Definition(new_lower), lower, sign, true).def(1).getTemp();
1769 Temp new_upper = bld.vsub32(bld.def(v1), upper, sign, false, borrow);
1770 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), new_lower, new_upper);
1771
1772 } else if (instr->src[0].src.ssa->bit_size == 32 && dst.type() == RegType::sgpr) {
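         /* Scalar counterpart of the VGPR path above: s_bfe_u32 with the packed operand
          * 0x80017 extracts 8 bits starting at bit 23 (the biased exponent), and the rest
          * mirrors the vector algorithm using SALU instructions and scc for carry/borrow. */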
1773 if (src.type() == RegType::vgpr)
1774 src = bld.as_uniform(src);
1775 Temp exponent = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), src, Operand(0x80017u));
1776 exponent = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), exponent, Operand(126u));
1777 exponent = bld.sop2(aco_opcode::s_max_u32, bld.def(s1), bld.def(s1, scc), Operand(0u), exponent);
1778 exponent = bld.sop2(aco_opcode::s_min_u32, bld.def(s1), bld.def(s1, scc), Operand(64u), exponent);
1779 Temp mantissa = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), Operand(0x7fffffu), src);
1780 Temp sign = bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), src, Operand(31u));
1781 mantissa = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), Operand(0x800000u), mantissa);
1782 mantissa = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), mantissa, Operand(7u));
1783 mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), mantissa);
1784 exponent = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), Operand(63u), exponent);
1785 mantissa = bld.sop2(aco_opcode::s_lshr_b64, bld.def(s2), bld.def(s1, scc), mantissa, exponent);
1786 Temp cond = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), exponent, Operand(0xffffffffu)); // exp >= 64
1787 Temp saturate = bld.sop1(aco_opcode::s_brev_b64, bld.def(s2), Operand(0xfffffffeu));
1788 mantissa = bld.sop2(aco_opcode::s_cselect_b64, bld.def(s2), saturate, mantissa, cond);
1789 Temp lower = bld.tmp(s1), upper = bld.tmp(s1);
1790 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
1791 lower = bld.sop2(aco_opcode::s_xor_b32, bld.def(s1), bld.def(s1, scc), sign, lower);
1792 upper = bld.sop2(aco_opcode::s_xor_b32, bld.def(s1), bld.def(s1, scc), sign, upper);
1793 Temp borrow = bld.tmp(s1);
1794 lower = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), lower, sign);
1795 upper = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), upper, sign, borrow);
1796 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1797
1798 } else if (instr->src[0].src.ssa->bit_size == 64) {
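         /* f64->i64 without a 64-bit conversion instruction: trunc(src) is split into
          * 32-bit halves by multiplying with 2^-32 (upper dword 0x3df00000), flooring to
          * get the high half, and using an fma with -2^32 (upper dword 0xc1f00000) to
          * recover the low half; both halves are then converted with the existing
          * f64->u32/i32 instructions. */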
1799 Temp vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0x3df00000u));
1800 Temp trunc = bld.vop1(aco_opcode::v_trunc_f64, bld.def(v2), src);
1801 Temp mul = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), trunc, vec);
1802 vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0xc1f00000u));
1803 Temp floor = bld.vop1(aco_opcode::v_floor_f64, bld.def(v2), mul);
1804 Temp fma = bld.vop3(aco_opcode::v_fma_f64, bld.def(v2), floor, vec, trunc);
1805 Temp lower = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), fma);
1806 Temp upper = bld.vop1(aco_opcode::v_cvt_i32_f64, bld.def(v1), floor);
1807 if (dst.type() == RegType::sgpr) {
1808 lower = bld.as_uniform(lower);
1809 upper = bld.as_uniform(upper);
1810 }
1811 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1812
1813 } else {
1814 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1815 nir_print_instr(&instr->instr, stderr);
1816 fprintf(stderr, "\n");
1817 }
1818 break;
1819 }
1820 case nir_op_f2u64: {
1821 Temp src = get_alu_src(ctx, instr->src[0]);
1822 if (instr->src[0].src.ssa->bit_size == 32 && dst.type() == RegType::vgpr) {
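         /* Open-coded f32->u64, analogous to the f2i64 path above but unsigned:
          * small exponents (< 24) use a plain right shift of the 24-bit mantissa,
          * larger ones shift the mantissa left as a 64-bit value, and exponents
          * above 64 saturate the result to UINT64_MAX. */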
1823 Temp exponent = bld.vop1(aco_opcode::v_frexp_exp_i32_f32, bld.def(v1), src);
1824 Temp exponent_in_range = bld.vopc(aco_opcode::v_cmp_ge_i32, bld.hint_vcc(bld.def(s2)), Operand(64u), exponent);
1825 exponent = bld.vop2(aco_opcode::v_max_i32, bld.def(v1), Operand(0x0u), exponent);
1826 Temp mantissa = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7fffffu), src);
1827 mantissa = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand(0x800000u), mantissa);
1828 Temp exponent_small = bld.vsub32(bld.def(v1), Operand(24u), exponent);
1829 Temp small = bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), exponent_small, mantissa);
1830 mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(0u), mantissa);
1831 Temp new_exponent = bld.tmp(v1);
1832 Temp cond_small = bld.vsub32(Definition(new_exponent), exponent, Operand(24u), true).def(1).getTemp();
1833 mantissa = bld.vop3(aco_opcode::v_lshlrev_b64, bld.def(v2), new_exponent, mantissa);
1834 Temp lower = bld.tmp(v1), upper = bld.tmp(v1);
1835 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
1836 lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), lower, small, cond_small);
1837 upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), upper, Operand(0u), cond_small);
1838 lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0xffffffffu), lower, exponent_in_range);
1839 upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0xffffffffu), upper, exponent_in_range);
1840 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1841
1842 } else if (instr->src[0].src.ssa->bit_size == 32 && dst.type() == RegType::sgpr) {
1843 if (src.type() == RegType::vgpr)
1844 src = bld.as_uniform(src);
1845 Temp exponent = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), src, Operand(0x80017u));
1846 exponent = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), exponent, Operand(126u));
1847 exponent = bld.sop2(aco_opcode::s_max_u32, bld.def(s1), bld.def(s1, scc), Operand(0u), exponent);
1848 Temp mantissa = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), Operand(0x7fffffu), src);
1849 mantissa = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), Operand(0x800000u), mantissa);
1850 Temp exponent_small = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), Operand(24u), exponent);
1851 Temp small = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), mantissa, exponent_small);
1852 mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), mantissa);
1853 Temp exponent_large = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), exponent, Operand(24u));
1854 mantissa = bld.sop2(aco_opcode::s_lshl_b64, bld.def(s2), bld.def(s1, scc), mantissa, exponent_large);
1855 Temp cond = bld.sopc(aco_opcode::s_cmp_ge_i32, bld.def(s1, scc), Operand(64u), exponent);
1856 mantissa = bld.sop2(aco_opcode::s_cselect_b64, bld.def(s2), mantissa, Operand(0xffffffffu), cond);
1857 Temp lower = bld.tmp(s1), upper = bld.tmp(s1);
1858 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
1859 Temp cond_small = bld.sopc(aco_opcode::s_cmp_le_i32, bld.def(s1, scc), exponent, Operand(24u));
1860 lower = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), small, lower, cond_small);
1861 upper = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), Operand(0u), upper, cond_small);
1862 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1863
1864 } else if (instr->src[0].src.ssa->bit_size == 64) {
1865 Temp vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0x3df00000u));
1866 Temp trunc = bld.vop1(aco_opcode::v_trunc_f64, bld.def(v2), src);
1867 Temp mul = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), trunc, vec);
1868 vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0xc1f00000u));
1869 Temp floor = bld.vop1(aco_opcode::v_floor_f64, bld.def(v2), mul);
1870 Temp fma = bld.vop3(aco_opcode::v_fma_f64, bld.def(v2), floor, vec, trunc);
1871 Temp lower = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), fma);
1872 Temp upper = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), floor);
1873 if (dst.type() == RegType::sgpr) {
1874 lower = bld.as_uniform(lower);
1875 upper = bld.as_uniform(upper);
1876 }
1877 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1878
1879 } else {
1880 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1881 nir_print_instr(&instr->instr, stderr);
1882 fprintf(stderr, "\n");
1883 }
1884 break;
1885 }
1886 case nir_op_b2f32: {
1887 Temp src = get_alu_src(ctx, instr->src[0]);
1888 if (dst.regClass() == s1) {
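      /* Uniform booleans are held as 0/1 in an SGPR, so multiplying by the bit pattern of
       * 1.0f yields 0.0f or 1.0f; divergent booleans are per-lane masks and go through
       * v_cndmask_b32 instead. */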
1889 src = as_uniform_bool(ctx, src);
1890 bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand(0x3f800000u), src);
1891 } else if (dst.regClass() == v1) {
1892 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand(0x3f800000u),
1893 as_divergent_bool(ctx, src, true));
1894 } else {
1895 unreachable("Wrong destination register class for nir_op_b2f32.");
1896 }
1897 break;
1898 }
1899 case nir_op_b2f64: {
1900 Temp src = get_alu_src(ctx, instr->src[0]);
1901 if (dst.regClass() == s2) {
1902 src = as_uniform_bool(ctx, src);
1903 bld.sop2(aco_opcode::s_cselect_b64, Definition(dst), Operand(0x3f800000u), Operand(0u), bld.scc(src));
1904 } else if (dst.regClass() == v2) {
1905          Temp one = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0x3FF00000u));
1906 Temp upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), one,
1907 as_divergent_bool(ctx, src, true));
1908 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand(0u), upper);
1909 } else {
1910 unreachable("Wrong destination register class for nir_op_b2f64.");
1911 }
1912 break;
1913 }
1914 case nir_op_i2i32: {
1915 Temp src = get_alu_src(ctx, instr->src[0]);
1916 if (instr->src[0].src.ssa->bit_size == 64) {
1917 /* we can actually just say dst = src, as it would map the lower register */
1918 emit_extract_vector(ctx, src, 0, dst);
1919 } else {
1920 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1921 nir_print_instr(&instr->instr, stderr);
1922 fprintf(stderr, "\n");
1923 }
1924 break;
1925 }
1926 case nir_op_u2u32: {
1927 Temp src = get_alu_src(ctx, instr->src[0]);
1928 if (instr->src[0].src.ssa->bit_size == 16) {
1929 if (dst.regClass() == s1) {
1930 bld.sop2(aco_opcode::s_and_b32, Definition(dst), bld.def(s1, scc), Operand(0xFFFFu), src);
1931 } else {
1932 // TODO: do better with SDWA
1933 bld.vop2(aco_opcode::v_and_b32, Definition(dst), Operand(0xFFFFu), src);
1934 }
1935 } else if (instr->src[0].src.ssa->bit_size == 64) {
1936 /* we can actually just say dst = src, as it would map the lower register */
1937 emit_extract_vector(ctx, src, 0, dst);
1938 } else {
1939 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1940 nir_print_instr(&instr->instr, stderr);
1941 fprintf(stderr, "\n");
1942 }
1943 break;
1944 }
1945 case nir_op_i2i64: {
1946 Temp src = get_alu_src(ctx, instr->src[0]);
1947 if (instr->src[0].src.ssa->bit_size == 32) {
1948 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src, Operand(0u));
1949 } else {
1950 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1951 nir_print_instr(&instr->instr, stderr);
1952 fprintf(stderr, "\n");
1953 }
1954 break;
1955 }
1956 case nir_op_u2u64: {
1957 Temp src = get_alu_src(ctx, instr->src[0]);
1958 if (instr->src[0].src.ssa->bit_size == 32) {
1959 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src, Operand(0u));
1960 } else {
1961 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1962 nir_print_instr(&instr->instr, stderr);
1963 fprintf(stderr, "\n");
1964 }
1965 break;
1966 }
1967 case nir_op_b2i32: {
1968 Temp src = get_alu_src(ctx, instr->src[0]);
1969 if (dst.regClass() == s1) {
1970 if (src.regClass() == s1) {
1971 bld.copy(Definition(dst), src);
1972 } else {
1973 // TODO: in a post-RA optimization, we can check if src is in VCC, and directly use VCCNZ
1974 assert(src.regClass() == s2);
1975 bld.sopc(aco_opcode::s_cmp_lg_u64, bld.scc(Definition(dst)), Operand(0u), src);
1976 }
1977 } else {
1978 assert(dst.regClass() == v1 && src.regClass() == s2);
1979 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand(1u), src);
1980 }
1981 break;
1982 }
1983 case nir_op_i2b1: {
1984 Temp src = get_alu_src(ctx, instr->src[0]);
1985 if (dst.regClass() == s2) {
1986 assert(src.regClass() == v1 || src.regClass() == v2);
1987 bld.vopc(src.size() == 2 ? aco_opcode::v_cmp_lg_u64 : aco_opcode::v_cmp_lg_u32,
1988 Definition(dst), Operand(0u), src).def(0).setHint(vcc);
1989 } else {
1990 assert(src.regClass() == s1 && dst.regClass() == s1);
1991 bld.sopc(aco_opcode::s_cmp_lg_u32, bld.scc(Definition(dst)), Operand(0u), src);
1992 }
1993 break;
1994 }
1995 case nir_op_pack_64_2x32_split: {
1996 Temp src0 = get_alu_src(ctx, instr->src[0]);
1997 Temp src1 = get_alu_src(ctx, instr->src[1]);
1998
1999 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src0, src1);
2000 break;
2001 }
2002 case nir_op_unpack_64_2x32_split_x:
2003 bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(dst.regClass()), get_alu_src(ctx, instr->src[0]));
2004 break;
2005 case nir_op_unpack_64_2x32_split_y:
2006 bld.pseudo(aco_opcode::p_split_vector, bld.def(dst.regClass()), Definition(dst), get_alu_src(ctx, instr->src[0]));
2007 break;
2008 case nir_op_pack_half_2x16: {
2009 Temp src = get_alu_src(ctx, instr->src[0], 2);
2010
2011 if (dst.regClass() == v1) {
2012 Temp src0 = bld.tmp(v1);
2013 Temp src1 = bld.tmp(v1);
2014 bld.pseudo(aco_opcode::p_split_vector, Definition(src0), Definition(src1), src);
2015 bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32, Definition(dst), src0, src1);
2016
2017 } else {
2018 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2019 nir_print_instr(&instr->instr, stderr);
2020 fprintf(stderr, "\n");
2021 }
2022 break;
2023 }
2024 case nir_op_unpack_half_2x16_split_x: {
2025 if (dst.regClass() == v1) {
2026 Builder bld(ctx->program, ctx->block);
2027 bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst), get_alu_src(ctx, instr->src[0]));
2028 } else {
2029 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2030 nir_print_instr(&instr->instr, stderr);
2031 fprintf(stderr, "\n");
2032 }
2033 break;
2034 }
2035 case nir_op_unpack_half_2x16_split_y: {
2036 if (dst.regClass() == v1) {
2037 Builder bld(ctx->program, ctx->block);
2038 /* TODO: use SDWA here */
2039 bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst),
2040 bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), Operand(16u), as_vgpr(ctx, get_alu_src(ctx, instr->src[0]))));
2041 } else {
2042 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2043 nir_print_instr(&instr->instr, stderr);
2044 fprintf(stderr, "\n");
2045 }
2046 break;
2047 }
2048 case nir_op_fquantize2f16: {
2049 Temp f16 = bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v1), get_alu_src(ctx, instr->src[0]));
2050
2051 Temp mask = bld.copy(bld.def(s1), Operand(0x36Fu)); /* value is NOT negative/positive denormal value */
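      /* v_cmp_class_f16 with this mask is true iff the converted value is not a denormal;
       * the cndmask at the end then flushes f16 denormals to zero, while everything else
       * gets the quantized f32->f16->f32 round-trip value. */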
2052
2053 Temp cmp_res = bld.tmp(s2);
2054 bld.vopc_e64(aco_opcode::v_cmp_class_f16, Definition(cmp_res), f16, mask).def(0).setHint(vcc);
2055
2056 Temp f32 = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), f16);
2057
2058 bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), f32, cmp_res);
2059 break;
2060 }
2061 case nir_op_bfm: {
2062 Temp bits = get_alu_src(ctx, instr->src[0]);
2063 Temp offset = get_alu_src(ctx, instr->src[1]);
2064
2065 if (dst.regClass() == s1) {
2066 bld.sop2(aco_opcode::s_bfm_b32, Definition(dst), bits, offset);
2067 } else if (dst.regClass() == v1) {
2068 bld.vop3(aco_opcode::v_bfm_b32, Definition(dst), bits, offset);
2069 } else {
2070 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2071 nir_print_instr(&instr->instr, stderr);
2072 fprintf(stderr, "\n");
2073 }
2074 break;
2075 }
2076 case nir_op_bitfield_select: {
2077 /* (mask & insert) | (~mask & base) */
2078 Temp bitmask = get_alu_src(ctx, instr->src[0]);
2079 Temp insert = get_alu_src(ctx, instr->src[1]);
2080 Temp base = get_alu_src(ctx, instr->src[2]);
2081
2082 /* dst = (insert & bitmask) | (base & ~bitmask) */
2083 if (dst.regClass() == s1) {
2084 aco_ptr<Instruction> sop2;
2085 nir_const_value* const_bitmask = nir_src_as_const_value(instr->src[0].src);
2086 nir_const_value* const_insert = nir_src_as_const_value(instr->src[1].src);
2087 Operand lhs;
2088 if (const_insert && const_bitmask) {
2089 lhs = Operand(const_insert->u32 & const_bitmask->u32);
2090 } else {
2091 insert = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), insert, bitmask);
2092 lhs = Operand(insert);
2093 }
2094
2095 Operand rhs;
2096 nir_const_value* const_base = nir_src_as_const_value(instr->src[2].src);
2097 if (const_base && const_bitmask) {
2098 rhs = Operand(const_base->u32 & ~const_bitmask->u32);
2099 } else {
2100 base = bld.sop2(aco_opcode::s_andn2_b32, bld.def(s1), bld.def(s1, scc), base, bitmask);
2101 rhs = Operand(base);
2102 }
2103
2104 bld.sop2(aco_opcode::s_or_b32, Definition(dst), bld.def(s1, scc), rhs, lhs);
2105
2106 } else if (dst.regClass() == v1) {
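         /* v_bfi_b32 is a VALU instruction and can only read a limited number of SGPR
          * operands (constant bus limit), so operands are moved to VGPRs until at most
          * one SGPR source remains. */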
2107 if (base.type() == RegType::sgpr && (bitmask.type() == RegType::sgpr || (insert.type() == RegType::sgpr)))
2108 base = as_vgpr(ctx, base);
2109 if (insert.type() == RegType::sgpr && bitmask.type() == RegType::sgpr)
2110 insert = as_vgpr(ctx, insert);
2111
2112 bld.vop3(aco_opcode::v_bfi_b32, Definition(dst), bitmask, insert, base);
2113
2114 } else {
2115 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2116 nir_print_instr(&instr->instr, stderr);
2117 fprintf(stderr, "\n");
2118 }
2119 break;
2120 }
2121 case nir_op_ubfe:
2122 case nir_op_ibfe: {
2123 Temp base = get_alu_src(ctx, instr->src[0]);
2124 Temp offset = get_alu_src(ctx, instr->src[1]);
2125 Temp bits = get_alu_src(ctx, instr->src[2]);
2126
2127 if (dst.type() == RegType::sgpr) {
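         /* s_bfe_* takes a packed extract operand: bits [4:0] are the offset and
          * bits [22:16] the width, so fold constants when possible and otherwise
          * assemble the operand with a shift and an or. */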
2128 Operand extract;
2129 nir_const_value* const_offset = nir_src_as_const_value(instr->src[1].src);
2130 nir_const_value* const_bits = nir_src_as_const_value(instr->src[2].src);
2131 if (const_offset && const_bits) {
2132 uint32_t const_extract = (const_bits->u32 << 16) | const_offset->u32;
2133 extract = Operand(const_extract);
2134 } else {
2135 Operand width;
2136 if (const_bits) {
2137 width = Operand(const_bits->u32 << 16);
2138 } else {
2139 width = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), bits, Operand(16u));
2140 }
2141 extract = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), offset, width);
2142 }
2143
2144 aco_opcode opcode;
2145 if (dst.regClass() == s1) {
2146 if (instr->op == nir_op_ubfe)
2147 opcode = aco_opcode::s_bfe_u32;
2148 else
2149 opcode = aco_opcode::s_bfe_i32;
2150 } else if (dst.regClass() == s2) {
2151 if (instr->op == nir_op_ubfe)
2152 opcode = aco_opcode::s_bfe_u64;
2153 else
2154 opcode = aco_opcode::s_bfe_i64;
2155 } else {
2156 unreachable("Unsupported BFE bit size");
2157 }
2158
2159 bld.sop2(opcode, Definition(dst), bld.def(s1, scc), base, extract);
2160
2161 } else {
2162 aco_opcode opcode;
2163 if (dst.regClass() == v1) {
2164 if (instr->op == nir_op_ubfe)
2165 opcode = aco_opcode::v_bfe_u32;
2166 else
2167 opcode = aco_opcode::v_bfe_i32;
2168 } else {
2169 unreachable("Unsupported BFE bit size");
2170 }
2171
2172 emit_vop3a_instruction(ctx, instr, opcode, dst);
2173 }
2174 break;
2175 }
2176 case nir_op_bit_count: {
2177 Temp src = get_alu_src(ctx, instr->src[0]);
2178 if (src.regClass() == s1) {
2179 bld.sop1(aco_opcode::s_bcnt1_i32_b32, Definition(dst), bld.def(s1, scc), src);
2180 } else if (src.regClass() == v1) {
2181 bld.vop3(aco_opcode::v_bcnt_u32_b32, Definition(dst), src, Operand(0u));
2182 } else if (src.regClass() == v2) {
2183 bld.vop3(aco_opcode::v_bcnt_u32_b32, Definition(dst),
2184 emit_extract_vector(ctx, src, 1, v1),
2185 bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1),
2186 emit_extract_vector(ctx, src, 0, v1), Operand(0u)));
2187 } else if (src.regClass() == s2) {
2188 bld.sop1(aco_opcode::s_bcnt1_i32_b64, Definition(dst), bld.def(s1, scc), src);
2189 } else {
2190 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2191 nir_print_instr(&instr->instr, stderr);
2192 fprintf(stderr, "\n");
2193 }
2194 break;
2195 }
2196 case nir_op_flt: {
2197 if (instr->src[0].src.ssa->bit_size == 32)
2198 emit_comparison(ctx, instr, aco_opcode::v_cmp_lt_f32, dst);
2199 else if (instr->src[0].src.ssa->bit_size == 64)
2200 emit_comparison(ctx, instr, aco_opcode::v_cmp_lt_f64, dst);
2201 break;
2202 }
2203 case nir_op_fge: {
2204 if (instr->src[0].src.ssa->bit_size == 32)
2205 emit_comparison(ctx, instr, aco_opcode::v_cmp_ge_f32, dst);
2206 else if (instr->src[0].src.ssa->bit_size == 64)
2207 emit_comparison(ctx, instr, aco_opcode::v_cmp_ge_f64, dst);
2208 break;
2209 }
2210 case nir_op_feq: {
2211 if (instr->src[0].src.ssa->bit_size == 32)
2212 emit_comparison(ctx, instr, aco_opcode::v_cmp_eq_f32, dst);
2213 else if (instr->src[0].src.ssa->bit_size == 64)
2214 emit_comparison(ctx, instr, aco_opcode::v_cmp_eq_f64, dst);
2215 break;
2216 }
2217 case nir_op_fne: {
2218 if (instr->src[0].src.ssa->bit_size == 32)
2219 emit_comparison(ctx, instr, aco_opcode::v_cmp_neq_f32, dst);
2220 else if (instr->src[0].src.ssa->bit_size == 64)
2221 emit_comparison(ctx, instr, aco_opcode::v_cmp_neq_f64, dst);
2222 break;
2223 }
2224 case nir_op_ilt: {
2225 if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 32)
2226 emit_comparison(ctx, instr, aco_opcode::v_cmp_lt_i32, dst);
2227 else if (dst.regClass() == s1 && instr->src[0].src.ssa->bit_size == 32)
2228 emit_comparison(ctx, instr, aco_opcode::s_cmp_lt_i32, dst);
2229 else if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 64)
2230 emit_comparison(ctx, instr, aco_opcode::v_cmp_lt_i64, dst);
2231 break;
2232 }
2233 case nir_op_ige: {
2234 if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 32)
2235 emit_comparison(ctx, instr, aco_opcode::v_cmp_ge_i32, dst);
2236 else if (dst.regClass() == s1 && instr->src[0].src.ssa->bit_size == 32)
2237 emit_comparison(ctx, instr, aco_opcode::s_cmp_ge_i32, dst);
2238 else if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 64)
2239 emit_comparison(ctx, instr, aco_opcode::v_cmp_ge_i64, dst);
2240 break;
2241 }
2242 case nir_op_ieq: {
2243 if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 32) {
2244 emit_comparison(ctx, instr, aco_opcode::v_cmp_eq_i32, dst);
2245 } else if (dst.regClass() == s1 && instr->src[0].src.ssa->bit_size == 32) {
2246 emit_comparison(ctx, instr, aco_opcode::s_cmp_eq_i32, dst);
2247 } else if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 64) {
2248 emit_comparison(ctx, instr, aco_opcode::v_cmp_eq_i64, dst);
2249 } else if (dst.regClass() == s1 && instr->src[0].src.ssa->bit_size == 64) {
2250 emit_comparison(ctx, instr, aco_opcode::s_cmp_eq_u64, dst);
2251 } else if (dst.regClass() == s1 && instr->src[0].src.ssa->bit_size == 1) {
2252 Temp src0 = get_alu_src(ctx, instr->src[0]);
2253 Temp src1 = get_alu_src(ctx, instr->src[1]);
2254 bld.sopc(aco_opcode::s_cmp_eq_i32, bld.scc(Definition(dst)),
2255 as_uniform_bool(ctx, src0), as_uniform_bool(ctx, src1));
2256 } else if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 1) {
2257 Temp src0 = get_alu_src(ctx, instr->src[0]);
2258 Temp src1 = get_alu_src(ctx, instr->src[1]);
2259 bld.sop2(aco_opcode::s_xnor_b64, Definition(dst), bld.def(s1, scc),
2260 as_divergent_bool(ctx, src0, false), as_divergent_bool(ctx, src1, false));
2261 } else {
2262 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2263 nir_print_instr(&instr->instr, stderr);
2264 fprintf(stderr, "\n");
2265 }
2266 break;
2267 }
2268 case nir_op_ine: {
2269 if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 32) {
2270 emit_comparison(ctx, instr, aco_opcode::v_cmp_lg_i32, dst);
2271 } else if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 64) {
2272 emit_comparison(ctx, instr, aco_opcode::v_cmp_lg_i64, dst);
2273 } else if (dst.regClass() == s1 && instr->src[0].src.ssa->bit_size == 32) {
2274 emit_comparison(ctx, instr, aco_opcode::s_cmp_lg_i32, dst);
2275 } else if (dst.regClass() == s1 && instr->src[0].src.ssa->bit_size == 64) {
2276 emit_comparison(ctx, instr, aco_opcode::s_cmp_lg_u64, dst);
2277 } else if (dst.regClass() == s1 && instr->src[0].src.ssa->bit_size == 1) {
2278 Temp src0 = get_alu_src(ctx, instr->src[0]);
2279 Temp src1 = get_alu_src(ctx, instr->src[1]);
2280 bld.sopc(aco_opcode::s_cmp_lg_i32, bld.scc(Definition(dst)),
2281 as_uniform_bool(ctx, src0), as_uniform_bool(ctx, src1));
2282 } else if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 1) {
2283 Temp src0 = get_alu_src(ctx, instr->src[0]);
2284 Temp src1 = get_alu_src(ctx, instr->src[1]);
2285 bld.sop2(aco_opcode::s_xor_b64, Definition(dst), bld.def(s1, scc),
2286 as_divergent_bool(ctx, src0, false), as_divergent_bool(ctx, src1, false));
2287 } else {
2288 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2289 nir_print_instr(&instr->instr, stderr);
2290 fprintf(stderr, "\n");
2291 }
2292 break;
2293 }
2294 case nir_op_ult: {
2295 if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 32)
2296 emit_comparison(ctx, instr, aco_opcode::v_cmp_lt_u32, dst);
2297 else if (dst.regClass() == s1 && instr->src[0].src.ssa->bit_size == 32)
2298 emit_comparison(ctx, instr, aco_opcode::s_cmp_lt_u32, dst);
2299 else if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 64)
2300 emit_comparison(ctx, instr, aco_opcode::v_cmp_lt_u64, dst);
2301 break;
2302 }
2303 case nir_op_uge: {
2304 if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 32)
2305 emit_comparison(ctx, instr, aco_opcode::v_cmp_ge_u32, dst);
2306 else if (dst.regClass() == s1 && instr->src[0].src.ssa->bit_size == 32)
2307 emit_comparison(ctx, instr, aco_opcode::s_cmp_ge_u32, dst);
2308 else if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 64)
2309 emit_comparison(ctx, instr, aco_opcode::v_cmp_ge_u64, dst);
2310 break;
2311 }
2312 case nir_op_fddx:
2313 case nir_op_fddy:
2314 case nir_op_fddx_fine:
2315 case nir_op_fddy_fine:
2316 case nir_op_fddx_coarse:
2317 case nir_op_fddy_coarse: {
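      /* Screen-space derivatives via DPP quad permutes: broadcast the "base" pixel of
       * each quad (per row, per column, or the top-left pixel, depending on fine/coarse)
       * into tl, then subtract it from the neighbouring pixel selected by dpp_ctrl.
       * The subtraction is wrapped in emit_wqm so helper invocations contribute valid values. */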
2318 Definition tl = bld.def(v1);
2319 uint16_t dpp_ctrl;
2320 if (instr->op == nir_op_fddx_fine) {
2321 bld.vop1_dpp(aco_opcode::v_mov_b32, tl, get_alu_src(ctx, instr->src[0]), dpp_quad_perm(0, 0, 2, 2));
2322 dpp_ctrl = dpp_quad_perm(1, 1, 3, 3);
2323 } else if (instr->op == nir_op_fddy_fine) {
2324 bld.vop1_dpp(aco_opcode::v_mov_b32, tl, get_alu_src(ctx, instr->src[0]), dpp_quad_perm(0, 1, 0, 1));
2325 dpp_ctrl = dpp_quad_perm(2, 3, 2, 3);
2326 } else {
2327 bld.vop1_dpp(aco_opcode::v_mov_b32, tl, get_alu_src(ctx, instr->src[0]), dpp_quad_perm(0, 0, 0, 0));
2328 if (instr->op == nir_op_fddx || instr->op == nir_op_fddx_coarse)
2329 dpp_ctrl = dpp_quad_perm(1, 1, 1, 1);
2330 else
2331 dpp_ctrl = dpp_quad_perm(2, 2, 2, 2);
2332 }
2333
2334 Definition tmp = bld.def(v1);
2335 bld.vop2_dpp(aco_opcode::v_sub_f32, tmp, get_alu_src(ctx, instr->src[0]), tl.getTemp(), dpp_ctrl);
2336 emit_wqm(ctx, tmp.getTemp(), dst, true);
2337 break;
2338 }
2339 default:
2340 fprintf(stderr, "Unknown NIR ALU instr: ");
2341 nir_print_instr(&instr->instr, stderr);
2342 fprintf(stderr, "\n");
2343 }
2344 }
2345
2346 void visit_load_const(isel_context *ctx, nir_load_const_instr *instr)
2347 {
2348 Temp dst = get_ssa_temp(ctx, &instr->def);
2349
2350    // TODO: we really want to have the resulting type, as this would allow for 64-bit literals
2351    // which get truncated: the lsb if double and the msb if int
2352    // for now, we only use s_mov_b64 with 64-bit inline constants
2353 assert(instr->def.num_components == 1 && "Vector load_const should be lowered to scalar.");
2354 assert(dst.type() == RegType::sgpr);
2355
2356 if (dst.size() == 1)
2357 {
2358 Builder(ctx->program, ctx->block).copy(Definition(dst), Operand(instr->value[0].u32));
2359 } else {
2360 assert(dst.size() != 1);
2361 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
2362 if (instr->def.bit_size == 64)
2363 for (unsigned i = 0; i < dst.size(); i++)
2364 vec->operands[i] = Operand{(uint32_t)(instr->value[0].u64 >> i * 32)};
2365 else {
2366 for (unsigned i = 0; i < dst.size(); i++)
2367 vec->operands[i] = Operand{instr->value[i].u32};
2368 }
2369 vec->definitions[0] = Definition(dst);
2370 ctx->block->instructions.emplace_back(std::move(vec));
2371 }
2372 }
2373
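/* Expands each set bit of `mask` into `multiplier` consecutive set bits, e.g. to turn a
 * per-64-bit-component write mask into the corresponding per-32-bit-component mask. */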
2374 uint32_t widen_mask(uint32_t mask, unsigned multiplier)
2375 {
2376 uint32_t new_mask = 0;
2377 for(unsigned i = 0; i < 32 && (1u << i) <= mask; ++i)
2378 if (mask & (1u << i))
2379 new_mask |= ((1u << multiplier) - 1u) << (i * multiplier);
2380 return new_mask;
2381 }
2382
2383 void visit_store_vs_output(isel_context *ctx, nir_intrinsic_instr *instr)
2384 {
2385 /* This wouldn't work inside control flow or with indirect offsets but
2386 * that doesn't happen because of nir_lower_io_to_temporaries(). */
2387
2388 unsigned write_mask = nir_intrinsic_write_mask(instr);
2389 unsigned component = nir_intrinsic_component(instr);
2390 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
2391 unsigned idx = nir_intrinsic_base(instr) + component;
2392
2393 nir_instr *off_instr = instr->src[1].ssa->parent_instr;
2394 if (off_instr->type != nir_instr_type_load_const) {
2395 fprintf(stderr, "Unimplemented nir_intrinsic_load_input offset\n");
2396 nir_print_instr(off_instr, stderr);
2397 fprintf(stderr, "\n");
2398 }
2399 idx += nir_instr_as_load_const(off_instr)->value[0].u32 * 4u;
2400
2401 if (instr->src[0].ssa->bit_size == 64)
2402 write_mask = widen_mask(write_mask, 2);
2403
2404 for (unsigned i = 0; i < 8; ++i) {
2405 if (write_mask & (1 << i)) {
2406 ctx->vs_output.mask[idx / 4u] |= 1 << (idx % 4u);
2407 ctx->vs_output.outputs[idx / 4u][idx % 4u] = emit_extract_vector(ctx, src, i, v1);
2408 }
2409 idx++;
2410 }
2411 }
2412
2413 void visit_store_fs_output(isel_context *ctx, nir_intrinsic_instr *instr)
2414 {
2415 unsigned write_mask = nir_intrinsic_write_mask(instr);
2416 Operand values[4];
2417 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
2418 for (unsigned i = 0; i < 4; ++i) {
2419 if (write_mask & (1 << i)) {
2420 Temp tmp = emit_extract_vector(ctx, src, i, v1);
2421 values[i] = Operand(tmp);
2422 } else {
2423 values[i] = Operand(v1);
2424 }
2425 }
2426
2427 unsigned index = nir_intrinsic_base(instr) / 4;
2428 unsigned target, col_format;
2429 unsigned enabled_channels = 0xF;
2430 aco_opcode compr_op = (aco_opcode)0;
2431
2432 nir_const_value* offset = nir_src_as_const_value(instr->src[1]);
2433 assert(offset && "Non-const offsets on exports not yet supported");
2434 index += offset->u32;
2435
2436 assert(index != FRAG_RESULT_COLOR);
2437
2438 /* Unlike vertex shader exports, it's fine to use multiple exports to
2439 * export separate channels of one target. So shaders which export both
2440 * FRAG_RESULT_SAMPLE_MASK and FRAG_RESULT_DEPTH should work fine.
2441 * TODO: combine the exports in those cases and create better code
2442 */
2443
2444 if (index == FRAG_RESULT_SAMPLE_MASK) {
2445
2446 if (ctx->program->info->ps.writes_z) {
2447 target = V_008DFC_SQ_EXP_MRTZ;
2448 enabled_channels = 0x4;
2449 col_format = (unsigned) -1;
2450
2451 values[2] = values[0];
2452 values[0] = Operand(v1);
2453 } else {
2454 aco_ptr<Export_instruction> exp{create_instruction<Export_instruction>(aco_opcode::exp, Format::EXP, 4, 0)};
2455 exp->valid_mask = false;
2456 exp->done = false;
2457 exp->compressed = true;
2458 exp->dest = V_008DFC_SQ_EXP_MRTZ;
2459 exp->enabled_mask = 0xc;
2460 for (int i = 0; i < 4; i++)
2461 exp->operands[i] = Operand(v1);
2462 exp->operands[1] = Operand(values[0]);
2463 ctx->block->instructions.emplace_back(std::move(exp));
2464 return;
2465 }
2466
2467 } else if (index == FRAG_RESULT_DEPTH) {
2468
2469 target = V_008DFC_SQ_EXP_MRTZ;
2470 enabled_channels = 0x1;
2471 col_format = (unsigned) -1;
2472
2473 } else if (index == FRAG_RESULT_STENCIL) {
2474
2475 if (ctx->program->info->ps.writes_z) {
2476 target = V_008DFC_SQ_EXP_MRTZ;
2477 enabled_channels = 0x2;
2478 col_format = (unsigned) -1;
2479
2480 values[1] = values[0];
2481 values[0] = Operand(v1);
2482 } else {
2483 aco_ptr<Instruction> shift{create_instruction<VOP2_instruction>(aco_opcode::v_lshlrev_b32, Format::VOP2, 2, 1)};
2484 shift->operands[0] = Operand((uint32_t) 16);
2485 shift->operands[1] = values[0];
2486 Temp tmp = {ctx->program->allocateId(), v1};
2487 shift->definitions[0] = Definition(tmp);
2488 ctx->block->instructions.emplace_back(std::move(shift));
2489
2490 aco_ptr<Export_instruction> exp{create_instruction<Export_instruction>(aco_opcode::exp, Format::EXP, 4, 0)};
2491 exp->valid_mask = false;
2492 exp->done = false;
2493 exp->compressed = true;
2494 exp->dest = V_008DFC_SQ_EXP_MRTZ;
2495 exp->enabled_mask = 0x3;
2496 exp->operands[0] = Operand(tmp);
2497 for (int i = 1; i < 4; i++)
2498 exp->operands[i] = Operand(v1);
2499 ctx->block->instructions.emplace_back(std::move(exp));
2500 return;
2501 }
2502
2503 } else {
2504 index -= FRAG_RESULT_DATA0;
2505 target = V_008DFC_SQ_EXP_MRT + index;
2506 col_format = (ctx->options->key.fs.col_format >> (4 * index)) & 0xf;
2507 }
2508 ASSERTED bool is_int8 = (ctx->options->key.fs.is_int8 >> index) & 1;
2509 ASSERTED bool is_int10 = (ctx->options->key.fs.is_int10 >> index) & 1;
2510 assert(!is_int8 && !is_int10);
2511
2512 switch (col_format)
2513 {
2514 case V_028714_SPI_SHADER_ZERO:
2515 enabled_channels = 0; /* writemask */
2516 target = V_008DFC_SQ_EXP_NULL;
2517 break;
2518
2519 case V_028714_SPI_SHADER_32_R:
2520 enabled_channels = 1;
2521 break;
2522
2523 case V_028714_SPI_SHADER_32_GR:
2524 enabled_channels = 0x3;
2525 break;
2526
2527 case V_028714_SPI_SHADER_32_AR:
2528 if (ctx->options->chip_class >= GFX10) {
2529 /* Special case: on GFX10, the outputs are different for 32_AR */
2530 enabled_channels = 0x3;
2531 values[1] = values[3];
2532 } else {
2533 enabled_channels = 0x9;
2534 }
2535 break;
2536
2537 case V_028714_SPI_SHADER_FP16_ABGR:
2538 enabled_channels = 0x5;
2539 compr_op = aco_opcode::v_cvt_pkrtz_f16_f32;
2540 break;
2541
2542 case V_028714_SPI_SHADER_UNORM16_ABGR:
2543 enabled_channels = 0x5;
2544 compr_op = aco_opcode::v_cvt_pknorm_u16_f32;
2545 break;
2546
2547 case V_028714_SPI_SHADER_SNORM16_ABGR:
2548 enabled_channels = 0x5;
2549 compr_op = aco_opcode::v_cvt_pknorm_i16_f32;
2550 break;
2551
2552 case V_028714_SPI_SHADER_UINT16_ABGR:
2553 enabled_channels = 0x5;
2554 compr_op = aco_opcode::v_cvt_pk_u16_u32;
2555 break;
2556
2557 case V_028714_SPI_SHADER_SINT16_ABGR:
2558 enabled_channels = 0x5;
2559 compr_op = aco_opcode::v_cvt_pk_i16_i32;
2560 break;
2561
2562 case V_028714_SPI_SHADER_32_ABGR:
2563 enabled_channels = 0xF;
2564 break;
2565
2566 default:
2567 break;
2568 }
2569
2570 if (target == V_008DFC_SQ_EXP_NULL)
2571 return;
2572
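   /* For the packed 16-bit col_formats selected above, pairs of 32-bit outputs are
    * compressed into a single dword with the chosen v_cvt_pk* opcode; the export is then
    * emitted as "compressed" with only the first two operands carrying data. */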
2573 if ((bool)compr_op)
2574 {
2575 for (int i = 0; i < 2; i++)
2576 {
2577 /* check if at least one of the values to be compressed is enabled */
2578 unsigned enabled = (write_mask >> (i*2) | write_mask >> (i*2+1)) & 0x1;
2579 if (enabled) {
2580 enabled_channels |= enabled << (i*2);
2581 aco_ptr<VOP3A_instruction> compr{create_instruction<VOP3A_instruction>(compr_op, Format::VOP3A, 2, 1)};
2582 Temp tmp{ctx->program->allocateId(), v1};
2583 compr->operands[0] = values[i*2].isUndefined() ? Operand(0u) : values[i*2];
2584 compr->operands[1] = values[i*2+1].isUndefined() ? Operand(0u): values[i*2+1];
2585 compr->definitions[0] = Definition(tmp);
2586 values[i] = Operand(tmp);
2587 ctx->block->instructions.emplace_back(std::move(compr));
2588 } else {
2589 values[i] = Operand(v1);
2590 }
2591 }
2592 }
2593
2594 aco_ptr<Export_instruction> exp{create_instruction<Export_instruction>(aco_opcode::exp, Format::EXP, 4, 0)};
2595 exp->valid_mask = false;
2596 exp->done = false;
2597 exp->compressed = (bool) compr_op;
2598 exp->dest = target;
2599 exp->enabled_mask = enabled_channels;
2600 if ((bool) compr_op) {
2601 for (int i = 0; i < 2; i++)
2602 exp->operands[i] = enabled_channels & (3 << (i * 2)) ? values[i] : Operand(v1);
2603 exp->operands[2] = Operand(v1);
2604 exp->operands[3] = Operand(v1);
2605 } else {
2606 for (int i = 0; i < 4; i++)
2607 exp->operands[i] = enabled_channels & (1 << i) ? values[i] : Operand(v1);
2608 }
2609
2610 ctx->block->instructions.emplace_back(std::move(exp));
2611 }
2612
2613 void visit_store_output(isel_context *ctx, nir_intrinsic_instr *instr)
2614 {
2615 if (ctx->stage == vertex_vs) {
2616 visit_store_vs_output(ctx, instr);
2617 } else if (ctx->stage == fragment_fs) {
2618 visit_store_fs_output(ctx, instr);
2619 } else {
2620 unreachable("Shader stage not implemented");
2621 }
2622 }
2623
2624 void emit_interp_instr(isel_context *ctx, unsigned idx, unsigned component, Temp src, Temp dst, Temp prim_mask)
2625 {
2626 Temp coord1 = emit_extract_vector(ctx, src, 0, v1);
2627 Temp coord2 = emit_extract_vector(ctx, src, 1, v1);
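   /* Two-step barycentric interpolation: v_interp_p1_f32 applies the attribute's
    * i-coordinate term, v_interp_p2_f32 adds the j term and produces the final value;
    * m0 holds the primitive mask needed for the LDS attribute access. */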
2628
2629 Builder bld(ctx->program, ctx->block);
2630 Temp tmp = bld.vintrp(aco_opcode::v_interp_p1_f32, bld.def(v1), coord1, bld.m0(prim_mask), idx, component);
2631 bld.vintrp(aco_opcode::v_interp_p2_f32, Definition(dst), coord2, bld.m0(prim_mask), tmp, idx, component);
2632 }
2633
2634 void emit_load_frag_coord(isel_context *ctx, Temp dst, unsigned num_components)
2635 {
2636 aco_ptr<Pseudo_instruction> vec(create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1));
2637 for (unsigned i = 0; i < num_components; i++)
2638 vec->operands[i] = Operand(ctx->fs_inputs[fs_input::frag_pos_0 + i]);
2639
2640 if (ctx->fs_vgpr_args[fs_input::frag_pos_3]) {
2641 assert(num_components == 4);
2642 Builder bld(ctx->program, ctx->block);
2643 vec->operands[3] = bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), ctx->fs_inputs[fs_input::frag_pos_3]);
2644 }
2645
2646 for (Operand& op : vec->operands)
2647 op = op.isUndefined() ? Operand(0u) : op;
2648
2649 vec->definitions[0] = Definition(dst);
2650 ctx->block->instructions.emplace_back(std::move(vec));
2651 emit_split_vector(ctx, dst, num_components);
2652 return;
2653 }
2654
2655 void visit_load_interpolated_input(isel_context *ctx, nir_intrinsic_instr *instr)
2656 {
2657 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
2658 Temp coords = get_ssa_temp(ctx, instr->src[0].ssa);
2659 unsigned idx = nir_intrinsic_base(instr);
2660 unsigned component = nir_intrinsic_component(instr);
2661 Temp prim_mask = ctx->prim_mask;
2662
2663 nir_const_value* offset = nir_src_as_const_value(instr->src[1]);
2664 if (offset) {
2665 assert(offset->u32 == 0);
2666 } else {
2667       /* the lower 15 bits of the prim_mask contain the offset into LDS
2668        * while the upper bits contain the number of prims */
2669 Temp offset_src = get_ssa_temp(ctx, instr->src[1].ssa);
2670 assert(offset_src.regClass() == s1 && "TODO: divergent offsets...");
2671 Builder bld(ctx->program, ctx->block);
2672 Temp stride = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), prim_mask, Operand(16u));
2673 stride = bld.sop1(aco_opcode::s_bcnt1_i32_b32, bld.def(s1), bld.def(s1, scc), stride);
2674 stride = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), stride, Operand(48u));
2675 offset_src = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), stride, offset_src);
2676 prim_mask = bld.sop2(aco_opcode::s_add_i32, bld.def(s1, m0), bld.def(s1, scc), offset_src, prim_mask);
2677 }
2678
2679 if (instr->dest.ssa.num_components == 1) {
2680 emit_interp_instr(ctx, idx, component, coords, dst, prim_mask);
2681 } else {
2682 aco_ptr<Pseudo_instruction> vec(create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, instr->dest.ssa.num_components, 1));
2683 for (unsigned i = 0; i < instr->dest.ssa.num_components; i++)
2684 {
2685 Temp tmp = {ctx->program->allocateId(), v1};
2686 emit_interp_instr(ctx, idx, component+i, coords, tmp, prim_mask);
2687 vec->operands[i] = Operand(tmp);
2688 }
2689 vec->definitions[0] = Definition(dst);
2690 ctx->block->instructions.emplace_back(std::move(vec));
2691 }
2692 }
2693
2694 unsigned get_num_channels_from_data_format(unsigned data_format)
2695 {
2696 switch (data_format) {
2697 case V_008F0C_BUF_DATA_FORMAT_8:
2698 case V_008F0C_BUF_DATA_FORMAT_16:
2699 case V_008F0C_BUF_DATA_FORMAT_32:
2700 return 1;
2701 case V_008F0C_BUF_DATA_FORMAT_8_8:
2702 case V_008F0C_BUF_DATA_FORMAT_16_16:
2703 case V_008F0C_BUF_DATA_FORMAT_32_32:
2704 return 2;
2705 case V_008F0C_BUF_DATA_FORMAT_10_11_11:
2706 case V_008F0C_BUF_DATA_FORMAT_11_11_10:
2707 case V_008F0C_BUF_DATA_FORMAT_32_32_32:
2708 return 3;
2709 case V_008F0C_BUF_DATA_FORMAT_8_8_8_8:
2710 case V_008F0C_BUF_DATA_FORMAT_10_10_10_2:
2711 case V_008F0C_BUF_DATA_FORMAT_2_10_10_10:
2712 case V_008F0C_BUF_DATA_FORMAT_16_16_16_16:
2713 case V_008F0C_BUF_DATA_FORMAT_32_32_32_32:
2714 return 4;
2715 default:
2716 break;
2717 }
2718
2719 return 4;
2720 }
2721
2722 /* For 2_10_10_10 formats the alpha is handled as unsigned by pre-Vega HW,
2723  * so we may need to fix it up. */
2724 Temp adjust_vertex_fetch_alpha(isel_context *ctx, unsigned adjustment, Temp alpha)
2725 {
2726 Builder bld(ctx->program, ctx->block);
2727
2728 if (adjustment == RADV_ALPHA_ADJUST_SSCALED)
2729 alpha = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), alpha);
2730
2731 /* For the integer-like cases, do a natural sign extension.
2732 *
2733 * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0
2734 * and happen to contain 0, 1, 2, 3 as the two LSBs of the
2735 * exponent.
2736 */
2737 alpha = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(adjustment == RADV_ALPHA_ADJUST_SNORM ? 7u : 30u), alpha);
2738 alpha = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(30u), alpha);
2739
2740 /* Convert back to the right type. */
2741 if (adjustment == RADV_ALPHA_ADJUST_SNORM) {
2742 alpha = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), alpha);
2743 Temp clamp = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(s2)), Operand(0xbf800000u), alpha);
2744 alpha = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0xbf800000u), alpha, clamp);
2745 } else if (adjustment == RADV_ALPHA_ADJUST_SSCALED) {
2746 alpha = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), alpha);
2747 }
2748
2749 return alpha;
2750 }
2751
2752 void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr)
2753 {
2754 Builder bld(ctx->program, ctx->block);
2755 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
2756 if (ctx->stage & sw_vs) {
2757
2758 nir_instr *off_instr = instr->src[0].ssa->parent_instr;
2759 if (off_instr->type != nir_instr_type_load_const) {
2760 fprintf(stderr, "Unimplemented nir_intrinsic_load_input offset\n");
2761 nir_print_instr(off_instr, stderr);
2762 fprintf(stderr, "\n");
2763 }
2764 uint32_t offset = nir_instr_as_load_const(off_instr)->value[0].u32;
2765
2766 Temp vertex_buffers = convert_pointer_to_64_bit(ctx, ctx->vertex_buffers);
2767
2768 unsigned location = nir_intrinsic_base(instr) / 4 - VERT_ATTRIB_GENERIC0 + offset;
2769 unsigned component = nir_intrinsic_component(instr);
2770 unsigned attrib_binding = ctx->options->key.vs.vertex_attribute_bindings[location];
2771 uint32_t attrib_offset = ctx->options->key.vs.vertex_attribute_offsets[location];
2772 uint32_t attrib_stride = ctx->options->key.vs.vertex_attribute_strides[location];
2773 unsigned attrib_format = ctx->options->key.vs.vertex_attribute_formats[location];
2774
2775 unsigned dfmt = attrib_format & 0xf;
2776
2777 unsigned nfmt = (attrib_format >> 4) & 0x7;
2778 unsigned num_dfmt_channels = get_num_channels_from_data_format(dfmt);
2779 unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa) << component;
2780 unsigned num_channels = MIN2(util_last_bit(mask), num_dfmt_channels);
2781 unsigned alpha_adjust = (ctx->options->key.vs.alpha_adjust >> (location * 2)) & 3;
2782 bool post_shuffle = ctx->options->key.vs.post_shuffle & (1 << location);
2783 if (post_shuffle)
2784 num_channels = MAX2(num_channels, 3);
2785
2786 Temp list = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), vertex_buffers, Operand(attrib_binding * 16u));
2787
2788 Temp index;
2789 if (ctx->options->key.vs.instance_rate_inputs & (1u << location)) {
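         /* Per-instance attribute: index = start_instance + instance_id / divisor.
          * A divisor of 0 means the attribute never advances, so only start_instance is
          * used; a divisor of 1 skips the integer division. */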
2790 uint32_t divisor = ctx->options->key.vs.instance_rate_divisors[location];
2791 if (divisor) {
2792 ctx->needs_instance_id = true;
2793
2794 if (divisor != 1) {
2795 Temp divided = bld.tmp(v1);
2796 emit_v_div_u32(ctx, divided, as_vgpr(ctx, ctx->instance_id), divisor);
2797 index = bld.vadd32(bld.def(v1), ctx->start_instance, divided);
2798 } else {
2799 index = bld.vadd32(bld.def(v1), ctx->start_instance, ctx->instance_id);
2800 }
2801 } else {
2802 index = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), ctx->start_instance);
2803 }
2804 } else {
2805 index = bld.vadd32(bld.def(v1), ctx->base_vertex, ctx->vertex_id);
2806 }
2807
2808 if (attrib_stride != 0 && attrib_offset > attrib_stride) {
2809 index = bld.vadd32(bld.def(v1), Operand(attrib_offset / attrib_stride), index);
2810 attrib_offset = attrib_offset % attrib_stride;
2811 }
2812
2813 Operand soffset(0u);
2814 if (attrib_offset >= 4096) {
2815 soffset = bld.copy(bld.def(s1), Operand(attrib_offset));
2816 attrib_offset = 0;
2817 }
2818
2819 aco_opcode opcode;
2820 switch (num_channels) {
2821 case 1:
2822 opcode = aco_opcode::tbuffer_load_format_x;
2823 break;
2824 case 2:
2825 opcode = aco_opcode::tbuffer_load_format_xy;
2826 break;
2827 case 3:
2828 opcode = aco_opcode::tbuffer_load_format_xyz;
2829 break;
2830 case 4:
2831 opcode = aco_opcode::tbuffer_load_format_xyzw;
2832 break;
2833 default:
2834 unreachable("Unimplemented load_input vector size");
2835 }
2836
2837 Temp tmp = post_shuffle || num_channels != dst.size() || alpha_adjust != RADV_ALPHA_ADJUST_NONE || component ? bld.tmp(RegType::vgpr, num_channels) : dst;
2838
2839 aco_ptr<MTBUF_instruction> mtbuf{create_instruction<MTBUF_instruction>(opcode, Format::MTBUF, 3, 1)};
2840 mtbuf->operands[0] = Operand(index);
2841 mtbuf->operands[1] = Operand(list);
2842 mtbuf->operands[2] = soffset;
2843 mtbuf->definitions[0] = Definition(tmp);
2844 mtbuf->idxen = true;
2845 mtbuf->can_reorder = true;
2846 mtbuf->dfmt = dfmt;
2847 mtbuf->nfmt = nfmt;
2848 assert(attrib_offset < 4096);
2849 mtbuf->offset = attrib_offset;
2850 ctx->block->instructions.emplace_back(std::move(mtbuf));
2851
2852 emit_split_vector(ctx, tmp, tmp.size());
2853
2854 if (tmp.id() != dst.id()) {
2855 bool is_float = nfmt != V_008F0C_BUF_NUM_FORMAT_UINT &&
2856 nfmt != V_008F0C_BUF_NUM_FORMAT_SINT;
2857
2858 static const unsigned swizzle_normal[4] = {0, 1, 2, 3};
2859 static const unsigned swizzle_post_shuffle[4] = {2, 1, 0, 3};
2860 const unsigned *swizzle = post_shuffle ? swizzle_post_shuffle : swizzle_normal;
2861
2862 aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
2863 for (unsigned i = 0; i < dst.size(); i++) {
2864 unsigned idx = i + component;
2865 if (idx == 3 && alpha_adjust != RADV_ALPHA_ADJUST_NONE && num_channels >= 4) {
2866 Temp alpha = emit_extract_vector(ctx, tmp, swizzle[3], v1);
2867 vec->operands[3] = Operand(adjust_vertex_fetch_alpha(ctx, alpha_adjust, alpha));
2868 } else if (idx < num_channels) {
2869 vec->operands[i] = Operand(emit_extract_vector(ctx, tmp, swizzle[idx], v1));
2870 } else if (is_float && idx == 3) {
2871 vec->operands[i] = Operand(0x3f800000u);
2872 } else if (!is_float && idx == 3) {
2873 vec->operands[i] = Operand(1u);
2874 } else {
2875 vec->operands[i] = Operand(0u);
2876 }
2877 }
2878 vec->definitions[0] = Definition(dst);
2879 ctx->block->instructions.emplace_back(std::move(vec));
2880 emit_split_vector(ctx, dst, dst.size());
2881 }
2882
2883 } else if (ctx->stage == fragment_fs) {
2884 nir_instr *off_instr = instr->src[0].ssa->parent_instr;
2885 if (off_instr->type != nir_instr_type_load_const ||
2886 nir_instr_as_load_const(off_instr)->value[0].u32 != 0) {
2887 fprintf(stderr, "Unimplemented nir_intrinsic_load_input offset\n");
2888 nir_print_instr(off_instr, stderr);
2889 fprintf(stderr, "\n");
2890 }
2891
2892 Temp prim_mask = ctx->prim_mask;
2893 nir_const_value* offset = nir_src_as_const_value(instr->src[0]);
2894 if (offset) {
2895 assert(offset->u32 == 0);
2896 } else {
2897 /* the lower 15 bits of the prim_mask contain the offset into LDS,
2898 * while the upper bits contain the number of prims */
2899 Temp offset_src = get_ssa_temp(ctx, instr->src[0].ssa);
2900 assert(offset_src.regClass() == s1 && "TODO: divergent offsets...");
2901 Builder bld(ctx->program, ctx->block);
2902 Temp stride = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), prim_mask, Operand(16u));
2903 stride = bld.sop1(aco_opcode::s_bcnt1_i32_b32, bld.def(s1), bld.def(s1, scc), stride);
2904 stride = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), stride, Operand(48u));
2905 offset_src = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), stride, offset_src);
2906 prim_mask = bld.sop2(aco_opcode::s_add_i32, bld.def(s1, m0), bld.def(s1, scc), offset_src, prim_mask);
2907 }
2908
2909 unsigned idx = nir_intrinsic_base(instr);
2910 unsigned component = nir_intrinsic_component(instr);
2911
2912 if (dst.size() == 1) {
2913 bld.vintrp(aco_opcode::v_interp_mov_f32, Definition(dst), Operand(2u), bld.m0(prim_mask), idx, component);
2914 } else {
2915 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
2916 for (unsigned i = 0; i < dst.size(); i++)
2917 vec->operands[i] = bld.vintrp(aco_opcode::v_interp_mov_f32, bld.def(v1), Operand(2u), bld.m0(prim_mask), idx, component + i);
2918 vec->definitions[0] = Definition(dst);
2919 bld.insert(std::move(vec));
2920 }
2921
2922 } else {
2923 unreachable("Shader stage not implemented");
2924 }
2925 }
2926
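/* Returns the pointer to a descriptor set: either the SGPR argument directly,
 * or, with indirect descriptor sets, a dword loaded from the table pointed to
 * by descriptor_sets[0]. */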
2927 Temp load_desc_ptr(isel_context *ctx, unsigned desc_set)
2928 {
2929 if (ctx->program->info->need_indirect_descriptor_sets) {
2930 Builder bld(ctx->program, ctx->block);
2931 Temp ptr64 = convert_pointer_to_64_bit(ctx, ctx->descriptor_sets[0]);
2932 return bld.smem(aco_opcode::s_load_dword, bld.def(s1), ptr64, Operand(desc_set << 2));//, false, false, false);
2933 }
2934
2935 return ctx->descriptor_sets[desc_set];
2936 }
2937
2938
2939 void visit_load_resource(isel_context *ctx, nir_intrinsic_instr *instr)
2940 {
2941 Builder bld(ctx->program, ctx->block);
2942 Temp index = get_ssa_temp(ctx, instr->src[0].ssa);
2943 if (!ctx->divergent_vals[instr->dest.ssa.index])
2944 index = bld.as_uniform(index);
2945 unsigned desc_set = nir_intrinsic_desc_set(instr);
2946 unsigned binding = nir_intrinsic_binding(instr);
2947
2948 Temp desc_ptr;
2949 radv_pipeline_layout *pipeline_layout = ctx->options->layout;
2950 radv_descriptor_set_layout *layout = pipeline_layout->set[desc_set].layout;
2951 unsigned offset = layout->binding[binding].offset;
2952 unsigned stride;
2953 if (layout->binding[binding].type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC ||
2954 layout->binding[binding].type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC) {
2955 unsigned idx = pipeline_layout->set[desc_set].dynamic_offset_start + layout->binding[binding].dynamic_offset_offset;
2956 desc_ptr = ctx->push_constants;
2957 offset = pipeline_layout->push_constant_size + 16 * idx;
2958 stride = 16;
2959 } else {
2960 desc_ptr = load_desc_ptr(ctx, desc_set);
2961 stride = layout->binding[binding].size;
2962 }
2963
2964 nir_const_value* nir_const_index = nir_src_as_const_value(instr->src[0]);
2965 unsigned const_index = nir_const_index ? nir_const_index->u32 : 0;
2966 if (stride != 1) {
2967 if (nir_const_index) {
2968 const_index = const_index * stride;
2969 } else if (index.type() == RegType::vgpr) {
2970 bool index24bit = layout->binding[binding].array_size <= 0x1000000;
2971 index = bld.v_mul_imm(bld.def(v1), index, stride, index24bit);
2972 } else {
2973 index = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(stride), Operand(index));
2974 }
2975 }
2976 if (offset) {
2977 if (nir_const_index) {
2978 const_index = const_index + offset;
2979 } else if (index.type() == RegType::vgpr) {
2980 index = bld.vadd32(bld.def(v1), Operand(offset), index);
2981 } else {
2982 index = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), Operand(offset), Operand(index));
2983 }
2984 }
2985
2986 if (nir_const_index && const_index == 0) {
2987 index = desc_ptr;
2988 } else if (index.type() == RegType::vgpr) {
2989 index = bld.vadd32(bld.def(v1),
2990 nir_const_index ? Operand(const_index) : Operand(index),
2991 Operand(desc_ptr));
2992 } else {
2993 index = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
2994 nir_const_index ? Operand(const_index) : Operand(index),
2995 Operand(desc_ptr));
2996 }
2997
2998 bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), index);
2999 }
3000
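/* Loads num_components dwords from a buffer resource. VGPR destinations (and
 * coherent loads on pre-GFX8 chips) go through MUBUF, splitting anything larger
 * than 16 bytes into two loads; SGPR destinations use SMEM s_buffer_load and
 * trim 3- and 6-dword results out of the wider loads.
 * For example, load_buffer(ctx, 4, dst, rsrc, offset) emits a single
 * buffer_load_dwordx4 or s_buffer_load_dwordx4. */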
3001 void load_buffer(isel_context *ctx, unsigned num_components, Temp dst, Temp rsrc, Temp offset, bool glc=false)
3002 {
3003 Builder bld(ctx->program, ctx->block);
3004
3005 unsigned num_bytes = dst.size() * 4;
3006 bool dlc = glc && ctx->options->chip_class >= GFX10;
3007
3008 aco_opcode op;
3009 if (dst.type() == RegType::vgpr || (glc && ctx->options->chip_class < GFX8)) {
3010 if (ctx->options->chip_class < GFX8)
3011 offset = as_vgpr(ctx, offset);
3012
3013 Operand vaddr = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
3014 Operand soffset = offset.type() == RegType::sgpr ? Operand(offset) : Operand((uint32_t) 0);
3015 unsigned const_offset = 0;
3016
3017 Temp lower = Temp();
3018 if (num_bytes > 16) {
3019 assert(num_components == 3 || num_components == 4);
3020 op = aco_opcode::buffer_load_dwordx4;
3021 lower = bld.tmp(v4);
3022 aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)};
3023 mubuf->definitions[0] = Definition(lower);
3024 mubuf->operands[0] = vaddr;
3025 mubuf->operands[1] = Operand(rsrc);
3026 mubuf->operands[2] = soffset;
3027 mubuf->offen = (offset.type() == RegType::vgpr);
3028 mubuf->glc = glc;
3029 mubuf->dlc = dlc;
3030 mubuf->barrier = barrier_buffer;
3031 bld.insert(std::move(mubuf));
3032 emit_split_vector(ctx, lower, 2);
3033 num_bytes -= 16;
3034 const_offset = 16;
3035 }
3036
3037 switch (num_bytes) {
3038 case 4:
3039 op = aco_opcode::buffer_load_dword;
3040 break;
3041 case 8:
3042 op = aco_opcode::buffer_load_dwordx2;
3043 break;
3044 case 12:
3045 op = aco_opcode::buffer_load_dwordx3;
3046 break;
3047 case 16:
3048 op = aco_opcode::buffer_load_dwordx4;
3049 break;
3050 default:
3051 unreachable("Load SSBO not implemented for this size.");
3052 }
3053 aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)};
3054 mubuf->operands[0] = vaddr;
3055 mubuf->operands[1] = Operand(rsrc);
3056 mubuf->operands[2] = soffset;
3057 mubuf->offen = (offset.type() == RegType::vgpr);
3058 mubuf->glc = glc;
3059 mubuf->dlc = dlc;
3060 mubuf->barrier = barrier_buffer;
3061 mubuf->offset = const_offset;
3062 aco_ptr<Instruction> instr = std::move(mubuf);
3063
3064 if (dst.size() > 4) {
3065 assert(lower != Temp());
3066 Temp upper = bld.tmp(RegType::vgpr, dst.size() - lower.size());
3067 instr->definitions[0] = Definition(upper);
3068 bld.insert(std::move(instr));
3069 if (dst.size() == 8)
3070 emit_split_vector(ctx, upper, 2);
3071 instr.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size() / 2, 1));
3072 instr->operands[0] = Operand(emit_extract_vector(ctx, lower, 0, v2));
3073 instr->operands[1] = Operand(emit_extract_vector(ctx, lower, 1, v2));
3074 instr->operands[2] = Operand(emit_extract_vector(ctx, upper, 0, v2));
3075 if (dst.size() == 8)
3076 instr->operands[3] = Operand(emit_extract_vector(ctx, upper, 1, v2));
3077 }
3078
3079 if (dst.type() == RegType::sgpr) {
3080 Temp vec = bld.tmp(RegType::vgpr, dst.size());
3081 instr->definitions[0] = Definition(vec);
3082 bld.insert(std::move(instr));
3083 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec);
3084 } else {
3085 instr->definitions[0] = Definition(dst);
3086 bld.insert(std::move(instr));
3087 }
3088 } else {
3089 switch (num_bytes) {
3090 case 4:
3091 op = aco_opcode::s_buffer_load_dword;
3092 break;
3093 case 8:
3094 op = aco_opcode::s_buffer_load_dwordx2;
3095 break;
3096 case 12:
3097 case 16:
3098 op = aco_opcode::s_buffer_load_dwordx4;
3099 break;
3100 case 24:
3101 case 32:
3102 op = aco_opcode::s_buffer_load_dwordx8;
3103 break;
3104 default:
3105 unreachable("Load SSBO not implemented for this size.");
3106 }
3107 aco_ptr<SMEM_instruction> load{create_instruction<SMEM_instruction>(op, Format::SMEM, 2, 1)};
3108 load->operands[0] = Operand(rsrc);
3109 load->operands[1] = Operand(bld.as_uniform(offset));
3110 assert(load->operands[1].getTemp().type() == RegType::sgpr);
3111 load->definitions[0] = Definition(dst);
3112 load->glc = glc;
3113 load->dlc = dlc;
3114 load->barrier = barrier_buffer;
3115 assert(ctx->options->chip_class >= GFX8 || !glc);
3116
3117 /* trim vector */
3118 if (dst.size() == 3) {
3119 Temp vec = bld.tmp(s4);
3120 load->definitions[0] = Definition(vec);
3121 bld.insert(std::move(load));
3122 emit_split_vector(ctx, vec, 4);
3123
3124 bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
3125 emit_extract_vector(ctx, vec, 0, s1),
3126 emit_extract_vector(ctx, vec, 1, s1),
3127 emit_extract_vector(ctx, vec, 2, s1));
3128 } else if (dst.size() == 6) {
3129 Temp vec = bld.tmp(s8);
3130 load->definitions[0] = Definition(vec);
3131 bld.insert(std::move(load));
3132 emit_split_vector(ctx, vec, 4);
3133
3134 bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
3135 emit_extract_vector(ctx, vec, 0, s2),
3136 emit_extract_vector(ctx, vec, 1, s2),
3137 emit_extract_vector(ctx, vec, 2, s2));
3138 } else {
3139 bld.insert(std::move(load));
3140 }
3141
3142 }
3143 emit_split_vector(ctx, dst, num_components);
3144 }
3145
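/* Lowers nir_intrinsic_load_ubo. Inline uniform blocks get a buffer descriptor
 * built on the fly around the 32-bit set pointer; all other bindings load their
 * 16-byte descriptor with s_load_dwordx4. The actual data load is done by
 * load_buffer(). */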
3146 void visit_load_ubo(isel_context *ctx, nir_intrinsic_instr *instr)
3147 {
3148 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
3149 Temp rsrc = get_ssa_temp(ctx, instr->src[0].ssa);
3150
3151 Builder bld(ctx->program, ctx->block);
3152
3153 nir_intrinsic_instr* idx_instr = nir_instr_as_intrinsic(instr->src[0].ssa->parent_instr);
3154 unsigned desc_set = nir_intrinsic_desc_set(idx_instr);
3155 unsigned binding = nir_intrinsic_binding(idx_instr);
3156 radv_descriptor_set_layout *layout = ctx->options->layout->set[desc_set].layout;
3157
3158 if (layout->binding[binding].type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) {
3159 uint32_t desc_type = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
3160 S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
3161 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
3162 S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
3163 if (ctx->options->chip_class >= GFX10) {
3164 desc_type |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
3165 S_008F0C_OOB_SELECT(3) |
3166 S_008F0C_RESOURCE_LEVEL(1);
3167 } else {
3168 desc_type |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
3169 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
3170 }
3171 Temp upper_dwords = bld.pseudo(aco_opcode::p_create_vector, bld.def(s3),
3172 Operand(S_008F04_BASE_ADDRESS_HI(ctx->options->address32_hi)),
3173 Operand(0xFFFFFFFFu),
3174 Operand(desc_type));
3175 rsrc = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4),
3176 rsrc, upper_dwords);
3177 } else {
3178 rsrc = convert_pointer_to_64_bit(ctx, rsrc);
3179 rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u));
3180 }
3181
3182 load_buffer(ctx, instr->num_components, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa));
3183 }
3184
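/* Lowers nir_intrinsic_load_push_constant. If the (constant) range is fully
 * covered by the push constants inlined into SGPR arguments, the result is just
 * a p_create_vector of those SGPRs; otherwise the data is fetched from the
 * push constant buffer with s_load_dword*. */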
3185 void visit_load_push_constant(isel_context *ctx, nir_intrinsic_instr *instr)
3186 {
3187 Builder bld(ctx->program, ctx->block);
3188 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
3189
3190 unsigned offset = nir_intrinsic_base(instr);
3191 nir_const_value *index_cv = nir_src_as_const_value(instr->src[0]);
3192 if (index_cv && instr->dest.ssa.bit_size == 32) {
3193
3194 unsigned count = instr->dest.ssa.num_components;
3195 unsigned start = (offset + index_cv->u32) / 4u;
3196 start -= ctx->base_inline_push_consts;
3197 if (start + count <= ctx->num_inline_push_consts) {
3198 std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
3199 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
3200 for (unsigned i = 0; i < count; ++i) {
3201 elems[i] = ctx->inline_push_consts[start + i];
3202 vec->operands[i] = Operand{elems[i]};
3203 }
3204 vec->definitions[0] = Definition(dst);
3205 ctx->block->instructions.emplace_back(std::move(vec));
3206 ctx->allocated_vec.emplace(dst.id(), elems);
3207 return;
3208 }
3209 }
3210
3211 Temp index = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
3212 if (offset != 0) // TODO check if index != 0 as well
3213 index = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), Operand(offset), index);
3214 Temp ptr = convert_pointer_to_64_bit(ctx, ctx->push_constants);
3215 Temp vec = dst;
3216 bool trim = false;
3217 aco_opcode op;
3218
3219 switch (dst.size()) {
3220 case 1:
3221 op = aco_opcode::s_load_dword;
3222 break;
3223 case 2:
3224 op = aco_opcode::s_load_dwordx2;
3225 break;
3226 case 3:
3227 vec = bld.tmp(s4);
3228 trim = true;
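/* fallthrough */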
3229 case 4:
3230 op = aco_opcode::s_load_dwordx4;
3231 break;
3232 case 6:
3233 vec = bld.tmp(s8);
3234 trim = true;
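/* fallthrough */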
3235 case 8:
3236 op = aco_opcode::s_load_dwordx8;
3237 break;
3238 default:
3239 unreachable("unimplemented or forbidden load_push_constant.");
3240 }
3241
3242 bld.smem(op, Definition(vec), ptr, index);
3243
3244 if (trim) {
3245 emit_split_vector(ctx, vec, 4);
3246 RegClass rc = dst.size() == 3 ? s1 : s2;
3247 bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
3248 emit_extract_vector(ctx, vec, 0, rc),
3249 emit_extract_vector(ctx, vec, 1, rc),
3250 emit_extract_vector(ctx, vec, 2, rc));
3251
3252 }
3253 emit_split_vector(ctx, dst, instr->dest.ssa.num_components);
3254 }
3255
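/* Lowers nir_intrinsic_load_constant: builds a buffer descriptor over the
 * shader's embedded constant data (base address from p_constaddr, size clamped
 * to constant_data_size) and loads from it through load_buffer(). */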
3256 void visit_load_constant(isel_context *ctx, nir_intrinsic_instr *instr)
3257 {
3258 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
3259
3260 Builder bld(ctx->program, ctx->block);
3261
3262 uint32_t desc_type = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
3263 S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
3264 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
3265 S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
3266 if (ctx->options->chip_class >= GFX10) {
3267 desc_type |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
3268 S_008F0C_OOB_SELECT(3) |
3269 S_008F0C_RESOURCE_LEVEL(1);
3270 } else {
3271 desc_type |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
3272 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
3273 }
3274
3275 unsigned base = nir_intrinsic_base(instr);
3276 unsigned range = nir_intrinsic_range(instr);
3277
3278 Temp offset = get_ssa_temp(ctx, instr->src[0].ssa);
3279 if (base && offset.type() == RegType::sgpr)
3280 offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset, Operand(base));
3281 else if (base && offset.type() == RegType::vgpr)
3282 offset = bld.vadd32(bld.def(v1), Operand(base), offset);
3283
3284 Temp rsrc = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4),
3285 bld.sop1(aco_opcode::p_constaddr, bld.def(s2), bld.def(s1, scc), Operand(ctx->constant_data_offset)),
3286 Operand(MIN2(base + range, ctx->shader->constant_data_size)),
3287 Operand(desc_type));
3288
3289 load_buffer(ctx, instr->num_components, dst, rsrc, offset);
3290 }
3291
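/* Lowers nir_intrinsic_discard_if: the per-lane condition is ANDed with exec
 * and handed to the p_discard_if pseudo instruction, which is lowered later. */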
3292 void visit_discard_if(isel_context *ctx, nir_intrinsic_instr *instr)
3293 {
3294 if (ctx->cf_info.loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
3295 ctx->cf_info.exec_potentially_empty = true;
3296
3297 ctx->program->needs_exact = true;
3298
3299 // TODO: optimize uniform conditions
3300 Builder bld(ctx->program, ctx->block);
3301 Temp src = as_divergent_bool(ctx, get_ssa_temp(ctx, instr->src[0].ssa), false);
3302 src = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2));
3303 bld.pseudo(aco_opcode::p_discard_if, src);
3304 ctx->block->kind |= block_kind_uses_discard_if;
3305 return;
3306 }
3307
3308 void visit_discard(isel_context* ctx, nir_intrinsic_instr *instr)
3309 {
3310 Builder bld(ctx->program, ctx->block);
3311
3312 if (ctx->cf_info.loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
3313 ctx->cf_info.exec_potentially_empty = true;
3314
3315 bool divergent = ctx->cf_info.parent_if.is_divergent ||
3316 ctx->cf_info.parent_loop.has_divergent_continue;
3317
3318 if (ctx->block->loop_nest_depth &&
3319 ((nir_instr_is_last(&instr->instr) && !divergent) || divergent)) {
3320 /* we handle discards the same way as jump instructions */
3321 append_logical_end(ctx->block);
3322
3323 /* in loops, discard behaves like break */
3324 Block *linear_target = ctx->cf_info.parent_loop.exit;
3325 ctx->block->kind |= block_kind_discard;
3326
3327 if (!divergent) {
3328 /* uniform discard - loop ends here */
3329 assert(nir_instr_is_last(&instr->instr));
3330 ctx->block->kind |= block_kind_uniform;
3331 ctx->cf_info.has_branch = true;
3332 bld.branch(aco_opcode::p_branch);
3333 add_linear_edge(ctx->block->index, linear_target);
3334 return;
3335 }
3336
3337 /* we add a break right behind the discard() instructions */
3338 ctx->block->kind |= block_kind_break;
3339 unsigned idx = ctx->block->index;
3340
3341 /* remove critical edges from linear CFG */
3342 bld.branch(aco_opcode::p_branch);
3343 Block* break_block = ctx->program->create_and_insert_block();
3344 break_block->loop_nest_depth = ctx->cf_info.loop_nest_depth;
3345 break_block->kind |= block_kind_uniform;
3346 add_linear_edge(idx, break_block);
3347 add_linear_edge(break_block->index, linear_target);
3348 bld.reset(break_block);
3349 bld.branch(aco_opcode::p_branch);
3350
3351 Block* continue_block = ctx->program->create_and_insert_block();
3352 continue_block->loop_nest_depth = ctx->cf_info.loop_nest_depth;
3353 add_linear_edge(idx, continue_block);
3354 append_logical_start(continue_block);
3355 ctx->block = continue_block;
3356
3357 return;
3358 }
3359
3360 /* it can currently happen that NIR doesn't remove the unreachable code */
3361 if (!nir_instr_is_last(&instr->instr)) {
3362 ctx->program->needs_exact = true;
3363 /* save the current exec mask somewhere temporarily so that the discard
3364 * still sees it after it has been overwritten by outer exec masks */
3365 Temp cond = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), Operand(0xFFFFFFFF), Operand(exec, s2));
3366 bld.pseudo(aco_opcode::p_discard_if, cond);
3367 ctx->block->kind |= block_kind_uses_discard_if;
3368 return;
3369 }
3370
3371 /* This condition is incorrect for uniformly branched discards in a loop
3372 * predicated by a divergent condition, but the above code catches that case
3373 * and the discard would end up turning into a discard_if.
3374 * For example:
3375 * if (divergent) {
3376 * while (...) {
3377 * if (uniform) {
3378 * discard;
3379 * }
3380 * }
3381 * }
3382 */
3383 if (!ctx->cf_info.parent_if.is_divergent) {
3384 /* program just ends here */
3385 ctx->block->kind |= block_kind_uniform;
3386 bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1),
3387 0 /* enabled mask */, 9 /* dest */,
3388 false /* compressed */, true/* done */, true /* valid mask */);
3389 bld.sopp(aco_opcode::s_endpgm);
3390 // TODO: it will potentially be followed by a branch which is dead code to sanitize NIR phis
3391 } else {
3392 ctx->block->kind |= block_kind_discard;
3393 /* branch and linear edge is added by visit_if() */
3394 }
3395 }
3396
3397 enum aco_descriptor_type {
3398 ACO_DESC_IMAGE,
3399 ACO_DESC_FMASK,
3400 ACO_DESC_SAMPLER,
3401 ACO_DESC_BUFFER,
3402 ACO_DESC_PLANE_0,
3403 ACO_DESC_PLANE_1,
3404 ACO_DESC_PLANE_2,
3405 };
3406
3407 static bool
3408 should_declare_array(isel_context *ctx, enum glsl_sampler_dim sampler_dim, bool is_array) {
3409 if (sampler_dim == GLSL_SAMPLER_DIM_BUF)
3410 return false;
3411 ac_image_dim dim = ac_get_sampler_dim(ctx->options->chip_class, sampler_dim, is_array);
3412 return dim == ac_image_cube ||
3413 dim == ac_image_1darray ||
3414 dim == ac_image_2darray ||
3415 dim == ac_image_2darraymsaa;
3416 }
3417
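/* Walks an image/sampler deref chain down to a set/binding plus a (possibly
 * dynamic) array index, then loads the descriptor with s_load_dwordx4/x8.
 * Immutable samplers are materialized as constants, and PLANE_2 descriptors
 * are combined with half of the PLANE_1 descriptor for multi-planar formats. */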
3418 Temp get_sampler_desc(isel_context *ctx, nir_deref_instr *deref_instr,
3419 enum aco_descriptor_type desc_type,
3420 const nir_tex_instr *tex_instr, bool image, bool write)
3421 {
3422 /* FIXME: we should lower the deref with some new nir_intrinsic_load_desc
3423 std::unordered_map<uint64_t, Temp>::iterator it = ctx->tex_desc.find((uint64_t) desc_type << 32 | deref_instr->dest.ssa.index);
3424 if (it != ctx->tex_desc.end())
3425 return it->second;
3426 */
3427 Temp index = Temp();
3428 bool index_set = false;
3429 unsigned constant_index = 0;
3430 unsigned descriptor_set;
3431 unsigned base_index;
3432 Builder bld(ctx->program, ctx->block);
3433
3434 if (!deref_instr) {
3435 assert(tex_instr && !image);
3436 descriptor_set = 0;
3437 base_index = tex_instr->sampler_index;
3438 } else {
3439 while(deref_instr->deref_type != nir_deref_type_var) {
3440 unsigned array_size = glsl_get_aoa_size(deref_instr->type);
3441 if (!array_size)
3442 array_size = 1;
3443
3444 assert(deref_instr->deref_type == nir_deref_type_array);
3445 nir_const_value *const_value = nir_src_as_const_value(deref_instr->arr.index);
3446 if (const_value) {
3447 constant_index += array_size * const_value->u32;
3448 } else {
3449 Temp indirect = get_ssa_temp(ctx, deref_instr->arr.index.ssa);
3450 if (indirect.type() == RegType::vgpr)
3451 indirect = bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), indirect);
3452
3453 if (array_size != 1)
3454 indirect = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(array_size), indirect);
3455
3456 if (!index_set) {
3457 index = indirect;
3458 index_set = true;
3459 } else {
3460 index = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), index, indirect);
3461 }
3462 }
3463
3464 deref_instr = nir_src_as_deref(deref_instr->parent);
3465 }
3466 descriptor_set = deref_instr->var->data.descriptor_set;
3467 base_index = deref_instr->var->data.binding;
3468 }
3469
3470 Temp list = load_desc_ptr(ctx, descriptor_set);
3471 list = convert_pointer_to_64_bit(ctx, list);
3472
3473 struct radv_descriptor_set_layout *layout = ctx->options->layout->set[descriptor_set].layout;
3474 struct radv_descriptor_set_binding_layout *binding = layout->binding + base_index;
3475 unsigned offset = binding->offset;
3476 unsigned stride = binding->size;
3477 aco_opcode opcode;
3478 RegClass type;
3479
3480 assert(base_index < layout->binding_count);
3481
3482 switch (desc_type) {
3483 case ACO_DESC_IMAGE:
3484 type = s8;
3485 opcode = aco_opcode::s_load_dwordx8;
3486 break;
3487 case ACO_DESC_FMASK:
3488 type = s8;
3489 opcode = aco_opcode::s_load_dwordx8;
3490 offset += 32;
3491 break;
3492 case ACO_DESC_SAMPLER:
3493 type = s4;
3494 opcode = aco_opcode::s_load_dwordx4;
3495 if (binding->type == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER)
3496 offset += radv_combined_image_descriptor_sampler_offset(binding);
3497 break;
3498 case ACO_DESC_BUFFER:
3499 type = s4;
3500 opcode = aco_opcode::s_load_dwordx4;
3501 break;
3502 case ACO_DESC_PLANE_0:
3503 case ACO_DESC_PLANE_1:
3504 type = s8;
3505 opcode = aco_opcode::s_load_dwordx8;
3506 offset += 32 * (desc_type - ACO_DESC_PLANE_0);
3507 break;
3508 case ACO_DESC_PLANE_2:
3509 type = s4;
3510 opcode = aco_opcode::s_load_dwordx4;
3511 offset += 64;
3512 break;
3513 default:
3514 unreachable("invalid desc_type\n");
3515 }
3516
3517 offset += constant_index * stride;
3518
3519 if (desc_type == ACO_DESC_SAMPLER && binding->immutable_samplers_offset &&
3520 (!index_set || binding->immutable_samplers_equal)) {
3521 if (binding->immutable_samplers_equal)
3522 constant_index = 0;
3523
3524 const uint32_t *samplers = radv_immutable_samplers(layout, binding);
3525 return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4),
3526 Operand(samplers[constant_index * 4 + 0]),
3527 Operand(samplers[constant_index * 4 + 1]),
3528 Operand(samplers[constant_index * 4 + 2]),
3529 Operand(samplers[constant_index * 4 + 3]));
3530 }
3531
3532 Operand off;
3533 if (!index_set) {
3534 off = Operand(offset);
3535 } else {
3536 off = Operand((Temp)bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), Operand(offset),
3537 bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(stride), index)));
3538 }
3539
3540 Temp res = bld.smem(opcode, bld.def(type), list, off);
3541
3542 if (desc_type == ACO_DESC_PLANE_2) {
3543 Temp components[8];
3544 for (unsigned i = 0; i < 8; i++)
3545 components[i] = bld.tmp(s1);
3546 bld.pseudo(aco_opcode::p_split_vector,
3547 Definition(components[0]),
3548 Definition(components[1]),
3549 Definition(components[2]),
3550 Definition(components[3]),
3551 res);
3552
3553 Temp desc2 = get_sampler_desc(ctx, deref_instr, ACO_DESC_PLANE_1, tex_instr, image, write);
3554 bld.pseudo(aco_opcode::p_split_vector,
3555 bld.def(s1), bld.def(s1), bld.def(s1), bld.def(s1),
3556 Definition(components[4]),
3557 Definition(components[5]),
3558 Definition(components[6]),
3559 Definition(components[7]),
3560 desc2);
3561
3562 res = bld.pseudo(aco_opcode::p_create_vector, bld.def(s8),
3563 components[0], components[1], components[2], components[3],
3564 components[4], components[5], components[6], components[7]);
3565 }
3566
3567 return res;
3568 }
3569
3570 static int image_type_to_components_count(enum glsl_sampler_dim dim, bool array)
3571 {
3572 switch (dim) {
3573 case GLSL_SAMPLER_DIM_BUF:
3574 return 1;
3575 case GLSL_SAMPLER_DIM_1D:
3576 return array ? 2 : 1;
3577 case GLSL_SAMPLER_DIM_2D:
3578 return array ? 3 : 2;
3579 case GLSL_SAMPLER_DIM_MS:
3580 return array ? 4 : 3;
3581 case GLSL_SAMPLER_DIM_3D:
3582 case GLSL_SAMPLER_DIM_CUBE:
3583 return 3;
3584 case GLSL_SAMPLER_DIM_RECT:
3585 case GLSL_SAMPLER_DIM_SUBPASS:
3586 return 2;
3587 case GLSL_SAMPLER_DIM_SUBPASS_MS:
3588 return 3;
3589 default:
3590 break;
3591 }
3592 return 0;
3593 }
3594
3595
3596 /* Adjust the sample index according to FMASK.
3597 *
3598 * For uncompressed MSAA surfaces, FMASK should return 0x76543210,
3599 * which is the identity mapping. Each nibble says which physical sample
3600 * should be fetched to get that sample.
3601 *
3602 * For example, 0x11111100 means there are only 2 samples stored and
3603 * the second sample covers 3/4 of the pixel. When reading samples 0
3604 * and 1, return physical sample 0 (determined by the first two 0s
3605 * in FMASK), otherwise return physical sample 1.
3606 *
3607 * The sample index should be adjusted as follows:
3608 * sample_index = (fmask >> (sample_index * 4)) & 0xF;
3609 */
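/* Worked example for the formula above: with fmask = 0x11111100 and
 * sample_index = 3, (0x11111100 >> 12) & 0xF = 1, so physical sample 1 is
 * fetched, matching the 2-sample layout described above. */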
3610 static Temp adjust_sample_index_using_fmask(isel_context *ctx, bool da, Temp coords, Operand sample_index, Temp fmask_desc_ptr)
3611 {
3612 Builder bld(ctx->program, ctx->block);
3613 Temp fmask = bld.tmp(v1);
3614 unsigned dim = ctx->options->chip_class >= GFX10
3615 ? ac_get_sampler_dim(ctx->options->chip_class, GLSL_SAMPLER_DIM_2D, da)
3616 : 0;
3617
3618 aco_ptr<MIMG_instruction> load{create_instruction<MIMG_instruction>(aco_opcode::image_load, Format::MIMG, 2, 1)};
3619 load->operands[0] = Operand(coords);
3620 load->operands[1] = Operand(fmask_desc_ptr);
3621 load->definitions[0] = Definition(fmask);
3622 load->glc = false;
3623 load->dlc = false;
3624 load->dmask = 0x1;
3625 load->unrm = true;
3626 load->da = da;
3627 load->dim = dim;
3628 load->can_reorder = true; /* fmask images shouldn't be modified */
3629 ctx->block->instructions.emplace_back(std::move(load));
3630
3631 Operand sample_index4;
3632 if (sample_index.isConstant() && sample_index.constantValue() < 16) {
3633 sample_index4 = Operand(sample_index.constantValue() << 2);
3634 } else if (sample_index.regClass() == s1) {
3635 sample_index4 = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), sample_index, Operand(2u));
3636 } else {
3637 assert(sample_index.regClass() == v1);
3638 sample_index4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), sample_index);
3639 }
3640
3641 Temp final_sample;
3642 if (sample_index4.isConstant() && sample_index4.constantValue() == 0)
3643 final_sample = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(15u), fmask);
3644 else if (sample_index4.isConstant() && sample_index4.constantValue() == 28)
3645 final_sample = bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), Operand(28u), fmask);
3646 else
3647 final_sample = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), fmask, sample_index4, Operand(4u));
3648
3649 /* Don't rewrite the sample index if WORD1.DATA_FORMAT of the FMASK
3650 * resource descriptor is 0 (invalid).
3651 */
3652 Temp compare = bld.tmp(s2);
3653 bld.vopc_e64(aco_opcode::v_cmp_lg_u32, Definition(compare),
3654 Operand(0u), emit_extract_vector(ctx, fmask_desc_ptr, 1, s1)).def(0).setHint(vcc);
3655
3656 Temp sample_index_v = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), sample_index);
3657
3658 /* Replace the MSAA sample index. */
3659 return bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), sample_index_v, final_sample, compare);
3660 }
3661
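/* Gathers the coordinate VGPRs of an image intrinsic into one vector. For MSAA
 * images the sample index is appended as the last coordinate (adjusted through
 * the FMASK for image loads), and GFX9 1D images get a zero y-coordinate since
 * they are addressed as 2D there. */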
3662 static Temp get_image_coords(isel_context *ctx, const nir_intrinsic_instr *instr, const struct glsl_type *type)
3663 {
3664
3665 Temp src0 = get_ssa_temp(ctx, instr->src[1].ssa);
3666 enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);
3667 bool is_array = glsl_sampler_type_is_array(type);
3668 ASSERTED bool add_frag_pos = (dim == GLSL_SAMPLER_DIM_SUBPASS || dim == GLSL_SAMPLER_DIM_SUBPASS_MS);
3669 assert(!add_frag_pos && "Input attachments should be lowered.");
3670 bool is_ms = (dim == GLSL_SAMPLER_DIM_MS || dim == GLSL_SAMPLER_DIM_SUBPASS_MS);
3671 bool gfx9_1d = ctx->options->chip_class == GFX9 && dim == GLSL_SAMPLER_DIM_1D;
3672 int count = image_type_to_components_count(dim, is_array);
3673 std::vector<Operand> coords(count);
3674
3675 if (is_ms) {
3676 Operand sample_index;
3677 nir_const_value *sample_cv = nir_src_as_const_value(instr->src[2]);
3678 if (sample_cv)
3679 sample_index = Operand(sample_cv->u32);
3680 else
3681 sample_index = Operand(emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[2].ssa), 0, v1));
3682
3683 if (instr->intrinsic == nir_intrinsic_image_deref_load) {
3684 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, is_array ? 3 : 2, 1)};
3685 for (unsigned i = 0; i < vec->operands.size(); i++)
3686 vec->operands[i] = Operand(emit_extract_vector(ctx, src0, i, v1));
3687 Temp fmask_load_address = {ctx->program->allocateId(), is_array ? v3 : v2};
3688 vec->definitions[0] = Definition(fmask_load_address);
3689 ctx->block->instructions.emplace_back(std::move(vec));
3690
3691 Temp fmask_desc_ptr = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_FMASK, nullptr, false, false);
3692 sample_index = Operand(adjust_sample_index_using_fmask(ctx, is_array, fmask_load_address, sample_index, fmask_desc_ptr));
3693 }
3694 count--;
3695 coords[count] = sample_index;
3696 }
3697
3698 if (count == 1 && !gfx9_1d)
3699 return emit_extract_vector(ctx, src0, 0, v1);
3700
3701 if (gfx9_1d) {
3702 coords[0] = Operand(emit_extract_vector(ctx, src0, 0, v1));
3703 coords.resize(coords.size() + 1);
3704 coords[1] = Operand((uint32_t) 0);
3705 if (is_array)
3706 coords[2] = Operand(emit_extract_vector(ctx, src0, 1, v1));
3707 } else {
3708 for (int i = 0; i < count; i++)
3709 coords[i] = Operand(emit_extract_vector(ctx, src0, i, v1));
3710 }
3711
3712 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, coords.size(), 1)};
3713 for (unsigned i = 0; i < coords.size(); i++)
3714 vec->operands[i] = coords[i];
3715 Temp res = {ctx->program->allocateId(), RegClass(RegType::vgpr, coords.size())};
3716 vec->definitions[0] = Definition(res);
3717 ctx->block->instructions.emplace_back(std::move(vec));
3718 return res;
3719 }
3720
3721
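/* Lowers nir_intrinsic_image_deref_load: buffer images use MUBUF
 * buffer_load_format_*, everything else uses MIMG image_load with the dmask
 * taken from the components that are actually read. */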
3722 void visit_image_load(isel_context *ctx, nir_intrinsic_instr *instr)
3723 {
3724 Builder bld(ctx->program, ctx->block);
3725 const nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
3726 const struct glsl_type *type = glsl_without_array(var->type);
3727 const enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);
3728 bool is_array = glsl_sampler_type_is_array(type);
3729 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
3730
3731 if (dim == GLSL_SAMPLER_DIM_BUF) {
3732 unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa);
3733 unsigned num_channels = util_last_bit(mask);
3734 Temp rsrc = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_BUFFER, nullptr, true, true);
3735 Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
3736
3737 aco_opcode opcode;
3738 switch (num_channels) {
3739 case 1:
3740 opcode = aco_opcode::buffer_load_format_x;
3741 break;
3742 case 2:
3743 opcode = aco_opcode::buffer_load_format_xy;
3744 break;
3745 case 3:
3746 opcode = aco_opcode::buffer_load_format_xyz;
3747 break;
3748 case 4:
3749 opcode = aco_opcode::buffer_load_format_xyzw;
3750 break;
3751 default:
3752 unreachable(">4 channel buffer image load");
3753 }
3754 aco_ptr<MUBUF_instruction> load{create_instruction<MUBUF_instruction>(opcode, Format::MUBUF, 3, 1)};
3755 load->operands[0] = Operand(vindex);
3756 load->operands[1] = Operand(rsrc);
3757 load->operands[2] = Operand((uint32_t) 0);
3758 Temp tmp;
3759 if (num_channels == instr->dest.ssa.num_components && dst.type() == RegType::vgpr)
3760 tmp = dst;
3761 else
3762 tmp = {ctx->program->allocateId(), RegClass(RegType::vgpr, num_channels)};
3763 load->definitions[0] = Definition(tmp);
3764 load->idxen = true;
3765 load->barrier = barrier_image;
3766 ctx->block->instructions.emplace_back(std::move(load));
3767
3768 expand_vector(ctx, tmp, dst, instr->dest.ssa.num_components, (1 << num_channels) - 1);
3769 return;
3770 }
3771
3772 Temp coords = get_image_coords(ctx, instr, type);
3773 Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_IMAGE, nullptr, true, true);
3774
3775 unsigned dmask = nir_ssa_def_components_read(&instr->dest.ssa);
3776 unsigned num_components = util_bitcount(dmask);
3777 Temp tmp;
3778 if (num_components == instr->dest.ssa.num_components && dst.type() == RegType::vgpr)
3779 tmp = dst;
3780 else
3781 tmp = {ctx->program->allocateId(), RegClass(RegType::vgpr, num_components)};
3782
3783 aco_ptr<MIMG_instruction> load{create_instruction<MIMG_instruction>(aco_opcode::image_load, Format::MIMG, 2, 1)};
3784 load->operands[0] = Operand(coords);
3785 load->operands[1] = Operand(resource);
3786 load->definitions[0] = Definition(tmp);
3787 load->glc = var->data.image.access & (ACCESS_VOLATILE | ACCESS_COHERENT) ? 1 : 0;
3788 load->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);
3789 load->dmask = dmask;
3790 load->unrm = true;
3791 load->da = should_declare_array(ctx, dim, glsl_sampler_type_is_array(type));
3792 load->barrier = barrier_image;
3793 ctx->block->instructions.emplace_back(std::move(load));
3794
3795 expand_vector(ctx, tmp, dst, instr->dest.ssa.num_components, dmask);
3796 return;
3797 }
3798
3799 void visit_image_store(isel_context *ctx, nir_intrinsic_instr *instr)
3800 {
3801 const nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
3802 const struct glsl_type *type = glsl_without_array(var->type);
3803 const enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);
3804 bool is_array = glsl_sampler_type_is_array(type);
3805 Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[3].ssa));
3806
3807 bool glc = ctx->options->chip_class == GFX6 || var->data.image.access & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE) ? 1 : 0;
3808
3809 if (dim == GLSL_SAMPLER_DIM_BUF) {
3810 Temp rsrc = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_BUFFER, nullptr, true, true);
3811 Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
3812 aco_opcode opcode;
3813 switch (data.size()) {
3814 case 1:
3815 opcode = aco_opcode::buffer_store_format_x;
3816 break;
3817 case 2:
3818 opcode = aco_opcode::buffer_store_format_xy;
3819 break;
3820 case 3:
3821 opcode = aco_opcode::buffer_store_format_xyz;
3822 break;
3823 case 4:
3824 opcode = aco_opcode::buffer_store_format_xyzw;
3825 break;
3826 default:
3827 unreachable(">4 channel buffer image store");
3828 }
3829 aco_ptr<MUBUF_instruction> store{create_instruction<MUBUF_instruction>(opcode, Format::MUBUF, 4, 0)};
3830 store->operands[0] = Operand(vindex);
3831 store->operands[1] = Operand(rsrc);
3832 store->operands[2] = Operand((uint32_t) 0);
3833 store->operands[3] = Operand(data);
3834 store->idxen = true;
3835 store->glc = glc;
3836 store->dlc = false;
3837 store->disable_wqm = true;
3838 store->barrier = barrier_image;
3839 ctx->program->needs_exact = true;
3840 ctx->block->instructions.emplace_back(std::move(store));
3841 return;
3842 }
3843
3844 assert(data.type() == RegType::vgpr);
3845 Temp coords = get_image_coords(ctx, instr, type);
3846 Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_IMAGE, nullptr, true, true);
3847
3848 aco_ptr<MIMG_instruction> store{create_instruction<MIMG_instruction>(aco_opcode::image_store, Format::MIMG, 4, 0)};
3849 store->operands[0] = Operand(coords);
3850 store->operands[1] = Operand(resource);
3851 store->operands[2] = Operand(s4);
3852 store->operands[3] = Operand(data);
3853 store->glc = glc;
3854 store->dlc = false;
3855 store->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);
3856 store->dmask = (1 << data.size()) - 1;
3857 store->unrm = true;
3858 store->da = should_declare_array(ctx, dim, glsl_sampler_type_is_array(type));
3859 store->disable_wqm = true;
3860 store->barrier = barrier_image;
3861 ctx->program->needs_exact = true;
3862 ctx->block->instructions.emplace_back(std::move(store));
3863 return;
3864 }
3865
3866 void visit_image_atomic(isel_context *ctx, nir_intrinsic_instr *instr)
3867 {
3868 /* return the previous value if dest is ever used */
3869 bool return_previous = false;
3870 nir_foreach_use_safe(use_src, &instr->dest.ssa) {
3871 return_previous = true;
3872 break;
3873 }
3874 nir_foreach_if_use_safe(use_src, &instr->dest.ssa) {
3875 return_previous = true;
3876 break;
3877 }
3878
3879 const nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
3880 const struct glsl_type *type = glsl_without_array(var->type);
3881 const enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);
3882 bool is_array = glsl_sampler_type_is_array(type);
3883 Builder bld(ctx->program, ctx->block);
3884
3885 Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[3].ssa));
3886 assert(data.size() == 1 && "64bit image atomics not yet implemented.");
3887
3888 if (instr->intrinsic == nir_intrinsic_image_deref_atomic_comp_swap)
3889 data = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), get_ssa_temp(ctx, instr->src[4].ssa), data);
3890
3891 aco_opcode buf_op, image_op;
3892 switch (instr->intrinsic) {
3893 case nir_intrinsic_image_deref_atomic_add:
3894 buf_op = aco_opcode::buffer_atomic_add;
3895 image_op = aco_opcode::image_atomic_add;
3896 break;
3897 case nir_intrinsic_image_deref_atomic_umin:
3898 buf_op = aco_opcode::buffer_atomic_umin;
3899 image_op = aco_opcode::image_atomic_umin;
3900 break;
3901 case nir_intrinsic_image_deref_atomic_imin:
3902 buf_op = aco_opcode::buffer_atomic_smin;
3903 image_op = aco_opcode::image_atomic_smin;
3904 break;
3905 case nir_intrinsic_image_deref_atomic_umax:
3906 buf_op = aco_opcode::buffer_atomic_umax;
3907 image_op = aco_opcode::image_atomic_umax;
3908 break;
3909 case nir_intrinsic_image_deref_atomic_imax:
3910 buf_op = aco_opcode::buffer_atomic_smax;
3911 image_op = aco_opcode::image_atomic_smax;
3912 break;
3913 case nir_intrinsic_image_deref_atomic_and:
3914 buf_op = aco_opcode::buffer_atomic_and;
3915 image_op = aco_opcode::image_atomic_and;
3916 break;
3917 case nir_intrinsic_image_deref_atomic_or:
3918 buf_op = aco_opcode::buffer_atomic_or;
3919 image_op = aco_opcode::image_atomic_or;
3920 break;
3921 case nir_intrinsic_image_deref_atomic_xor:
3922 buf_op = aco_opcode::buffer_atomic_xor;
3923 image_op = aco_opcode::image_atomic_xor;
3924 break;
3925 case nir_intrinsic_image_deref_atomic_exchange:
3926 buf_op = aco_opcode::buffer_atomic_swap;
3927 image_op = aco_opcode::image_atomic_swap;
3928 break;
3929 case nir_intrinsic_image_deref_atomic_comp_swap:
3930 buf_op = aco_opcode::buffer_atomic_cmpswap;
3931 image_op = aco_opcode::image_atomic_cmpswap;
3932 break;
3933 default:
3934 unreachable("visit_image_atomic should only be called with nir_intrinsic_image_deref_atomic_* instructions.");
3935 }
3936
3937 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
3938
3939 if (dim == GLSL_SAMPLER_DIM_BUF) {
3940 Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
3941 Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_BUFFER, nullptr, true, true);
3942 //assert(ctx->options->chip_class < GFX9 && "GFX9 stride size workaround not yet implemented.");
3943 aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(buf_op, Format::MUBUF, 4, return_previous ? 1 : 0)};
3944 mubuf->operands[0] = Operand(vindex);
3945 mubuf->operands[1] = Operand(resource);
3946 mubuf->operands[2] = Operand((uint32_t)0);
3947 mubuf->operands[3] = Operand(data);
3948 if (return_previous)
3949 mubuf->definitions[0] = Definition(dst);
3950 mubuf->offset = 0;
3951 mubuf->idxen = true;
3952 mubuf->glc = return_previous;
3953 mubuf->dlc = false; /* Not needed for atomics */
3954 mubuf->disable_wqm = true;
3955 mubuf->barrier = barrier_image;
3956 ctx->program->needs_exact = true;
3957 ctx->block->instructions.emplace_back(std::move(mubuf));
3958 return;
3959 }
3960
3961 Temp coords = get_image_coords(ctx, instr, type);
3962 Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_IMAGE, nullptr, true, true);
3963 aco_ptr<MIMG_instruction> mimg{create_instruction<MIMG_instruction>(image_op, Format::MIMG, 4, return_previous ? 1 : 0)};
3964 mimg->operands[0] = Operand(coords);
3965 mimg->operands[1] = Operand(resource);
3966 mimg->operands[2] = Operand(s4); /* no sampler */
3967 mimg->operands[3] = Operand(data);
3968 if (return_previous)
3969 mimg->definitions[0] = Definition(dst);
3970 mimg->glc = return_previous;
3971 mimg->dlc = false; /* Not needed for atomics */
3972 mimg->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);
3973 mimg->dmask = (1 << data.size()) - 1;
3974 mimg->unrm = true;
3975 mimg->da = should_declare_array(ctx, dim, glsl_sampler_type_is_array(type));
3976 mimg->disable_wqm = true;
3977 mimg->barrier = barrier_image;
3978 ctx->program->needs_exact = true;
3979 ctx->block->instructions.emplace_back(std::move(mimg));
3980 return;
3981 }
3982
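/* Writes the buffer size from a descriptor (dword 2, NUM_RECORDS) to dst. On
 * GFX8, when the size is needed in elements, it is divided by the stride using
 * a float reciprocal because there is no integer division instruction. */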
3983 void get_buffer_size(isel_context *ctx, Temp desc, Temp dst, bool in_elements)
3984 {
3985 if (in_elements && ctx->options->chip_class == GFX8) {
3986 Builder bld(ctx->program, ctx->block);
3987
3988 Temp stride = emit_extract_vector(ctx, desc, 1, s1);
3989 stride = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), stride, Operand((5u << 16) | 16u));
3990 stride = bld.vop1(aco_opcode::v_cvt_f32_ubyte0, bld.def(v1), stride);
3991 stride = bld.vop1(aco_opcode::v_rcp_iflag_f32, bld.def(v1), stride);
3992
3993 Temp size = emit_extract_vector(ctx, desc, 2, s1);
3994 size = bld.vop1(aco_opcode::v_cvt_f32_u32, bld.def(v1), size);
3995
3996 Temp res = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), size, stride);
3997 res = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), res);
3998 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), res);
3999
4000 // TODO: we can probably calculate this faster on the scalar unit: size / stride for stride in {1,2,4,8,12,16}
4001 /* idea
4002 * for strides 1,2,4,8,16, the result is just (size >> s_ff1_i32_b32(stride))
4003 * in case 12 (or 3?), we have to divide by 3:
4004 * set v_skip in case it's 12 (if we also have to take care of 3, shift first)
4005 * use v_mul_hi_u32 with magic number to divide
4006 * we need some pseudo merge opcode to overwrite the original SALU result with readfirstlane
4007 * disable v_skip
4008 * total: 6 SALU + 2 VALU instructions vs 1 SALU + 6 VALU instructions
4009 */
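/* Rough sketch of the division by 3 mentioned above (not implemented here):
 * with the usual magic-number reciprocal, n / 3 == (uint32_t)(((uint64_t)n * 0xAAAAAAABu) >> 33),
 * i.e. v_mul_hi_u32(n, 0xAAAAAAAB) followed by a right shift by 1, and
 * n / 12 == v_mul_hi_u32(n, 0xAAAAAAAB) >> 3. */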
4010
4011 } else {
4012 emit_extract_vector(ctx, desc, 2, dst);
4013 }
4014 }
4015
4016 void visit_image_size(isel_context *ctx, nir_intrinsic_instr *instr)
4017 {
4018 const nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
4019 const struct glsl_type *type = glsl_without_array(var->type);
4020 const enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);
4021 bool is_array = glsl_sampler_type_is_array(type);
4022 Builder bld(ctx->program, ctx->block);
4023
4024 if (glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_BUF) {
4025 Temp desc = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_BUFFER, NULL, true, false);
4026 return get_buffer_size(ctx, desc, get_ssa_temp(ctx, &instr->dest.ssa), true);
4027 }
4028
4029 /* LOD */
4030 Temp lod = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0u));
4031
4032 /* Resource */
4033 Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_IMAGE, NULL, true, false);
4034
4035 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
4036
4037 aco_ptr<MIMG_instruction> mimg{create_instruction<MIMG_instruction>(aco_opcode::image_get_resinfo, Format::MIMG, 2, 1)};
4038 mimg->operands[0] = Operand(lod);
4039 mimg->operands[1] = Operand(resource);
4040 unsigned& dmask = mimg->dmask;
4041 mimg->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);
4042 mimg->dmask = (1 << instr->dest.ssa.num_components) - 1;
4043 mimg->da = glsl_sampler_type_is_array(type);
4044 mimg->can_reorder = true;
4045 Definition& def = mimg->definitions[0];
4046 ctx->block->instructions.emplace_back(std::move(mimg));
4047
4048 if (glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_CUBE &&
4049 glsl_sampler_type_is_array(type)) {
4050
4051 assert(instr->dest.ssa.num_components == 3);
4052 Temp tmp = {ctx->program->allocateId(), v3};
4053 def = Definition(tmp);
4054 emit_split_vector(ctx, tmp, 3);
4055
4056 /* divide 3rd value by 6 by multiplying with magic number */
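/* 0x2AAAAAAB is approximately 2^32 / 6, so the high 32 bits of the product
 * below give the layer count divided by 6 (six faces per cube). */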
4057 Temp c = bld.copy(bld.def(s1), Operand((uint32_t) 0x2AAAAAAB));
4058 Temp by_6 = bld.vop3(aco_opcode::v_mul_hi_i32, bld.def(v1), emit_extract_vector(ctx, tmp, 2, v1), c);
4059
4060 bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
4061 emit_extract_vector(ctx, tmp, 0, v1),
4062 emit_extract_vector(ctx, tmp, 1, v1),
4063 by_6);
4064
4065 } else if (ctx->options->chip_class == GFX9 &&
4066 glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_1D &&
4067 glsl_sampler_type_is_array(type)) {
4068 assert(instr->dest.ssa.num_components == 2);
4069 def = Definition(dst);
4070 dmask = 0x5;
4071 } else {
4072 def = Definition(dst);
4073 }
4074
4075 emit_split_vector(ctx, dst, instr->dest.ssa.num_components);
4076 }
4077
4078 void visit_load_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
4079 {
4080 Builder bld(ctx->program, ctx->block);
4081 unsigned num_components = instr->num_components;
4082
4083 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
4084 Temp rsrc = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
4085 rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u));
4086
4087 bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT);
4088 load_buffer(ctx, num_components, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa), glc);
4089 }
4090
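/* Lowers nir_intrinsic_store_ssbo. The writemask is split into contiguous runs
 * and each run becomes either an SMEM s_buffer_store (uniform offset on GFX8+,
 * turned into p_fs_buffer_store_smem in fragment shaders) or a MUBUF
 * buffer_store_dword*. */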
4091 void visit_store_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
4092 {
4093 Builder bld(ctx->program, ctx->block);
4094 Temp data = get_ssa_temp(ctx, instr->src[0].ssa);
4095 unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
4096 unsigned writemask = nir_intrinsic_write_mask(instr);
4097
4098 Temp offset;
4099 if (ctx->options->chip_class < GFX8)
4100 offset = as_vgpr(ctx,get_ssa_temp(ctx, instr->src[2].ssa));
4101 else
4102 offset = get_ssa_temp(ctx, instr->src[2].ssa);
4103
4104 Temp rsrc = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
4105 rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u));
4106
4107 bool smem = !ctx->divergent_vals[instr->src[2].ssa->index] &&
4108 ctx->options->chip_class >= GFX8;
4109 if (smem)
4110 offset = bld.as_uniform(offset);
4111 bool smem_nonfs = smem && ctx->stage != fragment_fs;
4112
4113 while (writemask) {
4114 int start, count;
4115 u_bit_scan_consecutive_range(&writemask, &start, &count);
4116 if (count == 3 && smem) {
4117 writemask |= 1u << (start + 2);
4118 count = 2;
4119 }
4120 int num_bytes = count * elem_size_bytes;
4121
4122 if (num_bytes > 16) {
4123 assert(elem_size_bytes == 8);
4124 writemask |= (((count - 2) << 1) - 1) << (start + 2);
4125 count = 2;
4126 num_bytes = 16;
4127 }
4128
4129 // TODO: check alignment of sub-dword stores
4130 // TODO: split 3 bytes. there is no store instruction for that
4131
4132 Temp write_data;
4133 if (count != instr->num_components) {
4134 emit_split_vector(ctx, data, instr->num_components);
4135 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
4136 for (int i = 0; i < count; i++) {
4137 Temp elem = emit_extract_vector(ctx, data, start + i, RegClass(data.type(), elem_size_bytes / 4));
4138 vec->operands[i] = Operand(smem_nonfs ? bld.as_uniform(elem) : elem);
4139 }
4140 write_data = bld.tmp(smem_nonfs ? RegType::sgpr : data.type(), count * elem_size_bytes / 4);
4141 vec->definitions[0] = Definition(write_data);
4142 ctx->block->instructions.emplace_back(std::move(vec));
4143 } else if (!smem && data.type() != RegType::vgpr) {
4144 assert(num_bytes % 4 == 0);
4145 write_data = bld.copy(bld.def(RegType::vgpr, num_bytes / 4), data);
4146 } else if (smem_nonfs && data.type() == RegType::vgpr) {
4147 assert(num_bytes % 4 == 0);
4148 write_data = bld.as_uniform(data);
4149 } else {
4150 write_data = data;
4151 }
4152
4153 aco_opcode vmem_op, smem_op;
4154 switch (num_bytes) {
4155 case 4:
4156 vmem_op = aco_opcode::buffer_store_dword;
4157 smem_op = aco_opcode::s_buffer_store_dword;
4158 break;
4159 case 8:
4160 vmem_op = aco_opcode::buffer_store_dwordx2;
4161 smem_op = aco_opcode::s_buffer_store_dwordx2;
4162 break;
4163 case 12:
4164 vmem_op = aco_opcode::buffer_store_dwordx3;
4165 smem_op = aco_opcode::last_opcode;
4166 assert(!smem);
4167 break;
4168 case 16:
4169 vmem_op = aco_opcode::buffer_store_dwordx4;
4170 smem_op = aco_opcode::s_buffer_store_dwordx4;
4171 break;
4172 default:
4173 unreachable("Store SSBO not implemented for this size.");
4174 }
4175 if (ctx->stage == fragment_fs)
4176 smem_op = aco_opcode::p_fs_buffer_store_smem;
4177
4178 if (smem) {
4179 aco_ptr<SMEM_instruction> store{create_instruction<SMEM_instruction>(smem_op, Format::SMEM, 3, 0)};
4180 store->operands[0] = Operand(rsrc);
4181 if (start) {
4182 Temp off = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
4183 offset, Operand(start * elem_size_bytes));
4184 store->operands[1] = Operand(off);
4185 } else {
4186 store->operands[1] = Operand(offset);
4187 }
4188 if (smem_op != aco_opcode::p_fs_buffer_store_smem)
4189 store->operands[1].setFixed(m0);
4190 store->operands[2] = Operand(write_data);
4191 store->glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
4192 store->dlc = false;
4193 store->disable_wqm = true;
4194 store->barrier = barrier_buffer;
4195 ctx->block->instructions.emplace_back(std::move(store));
4196 ctx->program->wb_smem_l1_on_end = true;
4197 if (smem_op == aco_opcode::p_fs_buffer_store_smem) {
4198 ctx->block->kind |= block_kind_needs_lowering;
4199 ctx->program->needs_exact = true;
4200 }
4201 } else {
4202 aco_ptr<MUBUF_instruction> store{create_instruction<MUBUF_instruction>(vmem_op, Format::MUBUF, 4, 0)};
4203 store->operands[0] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
4204 store->operands[1] = Operand(rsrc);
4205 store->operands[2] = offset.type() == RegType::sgpr ? Operand(offset) : Operand((uint32_t) 0);
4206 store->operands[3] = Operand(write_data);
4207 store->offset = start * elem_size_bytes;
4208 store->offen = (offset.type() == RegType::vgpr);
4209 store->glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
4210 store->dlc = false;
4211 store->disable_wqm = true;
4212 store->barrier = barrier_buffer;
4213 ctx->program->needs_exact = true;
4214 ctx->block->instructions.emplace_back(std::move(store));
4215 }
4216 }
4217 }
4218
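/* Lowers the nir_intrinsic_ssbo_atomic_* family to MUBUF buffer_atomic_*
 * instructions; the previous value is only requested (glc=1, with a definition)
 * if the NIR destination has any uses. */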
4219 void visit_atomic_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
4220 {
4221 /* return the previous value if dest is ever used */
4222 bool return_previous = false;
4223 nir_foreach_use_safe(use_src, &instr->dest.ssa) {
4224 return_previous = true;
4225 break;
4226 }
4227 nir_foreach_if_use_safe(use_src, &instr->dest.ssa) {
4228 return_previous = true;
4229 break;
4230 }
4231
4232 Builder bld(ctx->program, ctx->block);
4233 Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[2].ssa));
4234
4235 if (instr->intrinsic == nir_intrinsic_ssbo_atomic_comp_swap)
4236 data = bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, data.size() * 2),
4237 get_ssa_temp(ctx, instr->src[3].ssa), data);
4238
4239 Temp offset;
4240 if (ctx->options->chip_class < GFX8)
4241 offset = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
4242 else
4243 offset = get_ssa_temp(ctx, instr->src[1].ssa);
4244
4245 Temp rsrc = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
4246 rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u));
4247
4248 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
4249
4250 aco_opcode op32, op64;
4251 switch (instr->intrinsic) {
4252 case nir_intrinsic_ssbo_atomic_add:
4253 op32 = aco_opcode::buffer_atomic_add;
4254 op64 = aco_opcode::buffer_atomic_add_x2;
4255 break;
4256 case nir_intrinsic_ssbo_atomic_imin:
4257 op32 = aco_opcode::buffer_atomic_smin;
4258 op64 = aco_opcode::buffer_atomic_smin_x2;
4259 break;
4260 case nir_intrinsic_ssbo_atomic_umin:
4261 op32 = aco_opcode::buffer_atomic_umin;
4262 op64 = aco_opcode::buffer_atomic_umin_x2;
4263 break;
4264 case nir_intrinsic_ssbo_atomic_imax:
4265 op32 = aco_opcode::buffer_atomic_smax;
4266 op64 = aco_opcode::buffer_atomic_smax_x2;
4267 break;
4268 case nir_intrinsic_ssbo_atomic_umax:
4269 op32 = aco_opcode::buffer_atomic_umax;
4270 op64 = aco_opcode::buffer_atomic_umax_x2;
4271 break;
4272 case nir_intrinsic_ssbo_atomic_and:
4273 op32 = aco_opcode::buffer_atomic_and;
4274 op64 = aco_opcode::buffer_atomic_and_x2;
4275 break;
4276 case nir_intrinsic_ssbo_atomic_or:
4277 op32 = aco_opcode::buffer_atomic_or;
4278 op64 = aco_opcode::buffer_atomic_or_x2;
4279 break;
4280 case nir_intrinsic_ssbo_atomic_xor:
4281 op32 = aco_opcode::buffer_atomic_xor;
4282 op64 = aco_opcode::buffer_atomic_xor_x2;
4283 break;
4284 case nir_intrinsic_ssbo_atomic_exchange:
4285 op32 = aco_opcode::buffer_atomic_swap;
4286 op64 = aco_opcode::buffer_atomic_swap_x2;
4287 break;
4288 case nir_intrinsic_ssbo_atomic_comp_swap:
4289 op32 = aco_opcode::buffer_atomic_cmpswap;
4290 op64 = aco_opcode::buffer_atomic_cmpswap_x2;
4291 break;
4292 default:
4293 unreachable("visit_atomic_ssbo should only be called with nir_intrinsic_ssbo_atomic_* instructions.");
4294 }
4295 aco_opcode op = instr->dest.ssa.bit_size == 32 ? op32 : op64;
4296 aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, return_previous ? 1 : 0)};
4297 mubuf->operands[0] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
4298 mubuf->operands[1] = Operand(rsrc);
4299 mubuf->operands[2] = offset.type() == RegType::sgpr ? Operand(offset) : Operand((uint32_t) 0);
4300 mubuf->operands[3] = Operand(data);
4301 if (return_previous)
4302 mubuf->definitions[0] = Definition(dst);
4303 mubuf->offset = 0;
4304 mubuf->offen = (offset.type() == RegType::vgpr);
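/* For buffer atomics, glc requests the pre-op value to be returned; only set it when the result is used. */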
4305 mubuf->glc = return_previous;
4306 mubuf->dlc = false; /* Not needed for atomics */
4307 mubuf->disable_wqm = true;
4308 mubuf->barrier = barrier_buffer;
4309 ctx->program->needs_exact = true;
4310 ctx->block->instructions.emplace_back(std::move(mubuf));
4311 }
4312
4313 void visit_get_buffer_size(isel_context *ctx, nir_intrinsic_instr *instr) {
4314
4315 Temp index = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
4316 Builder bld(ctx->program, ctx->block);
4317 Temp desc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), index, Operand(0u));
4318 get_buffer_size(ctx, desc, get_ssa_temp(ctx, &instr->dest.ssa), false);
4319 }
4320
4321 void visit_load_global(isel_context *ctx, nir_intrinsic_instr *instr)
4322 {
4323 Builder bld(ctx->program, ctx->block);
4324 unsigned num_components = instr->num_components;
4325 unsigned num_bytes = num_components * instr->dest.ssa.bit_size / 8;
4326
4327 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
4328 Temp addr = get_ssa_temp(ctx, instr->src[0].ssa);
4329
4330 bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT);
4331 bool dlc = glc && ctx->options->chip_class >= GFX10;
4332 aco_opcode op;
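/* Use a FLAT/GLOBAL load if the result lives in a VGPR, or if a coherent load is required on chips
 * where SMEM cannot bypass the scalar cache (glc on SMEM needs GFX8+, see the assert below). */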
4333 if (dst.type() == RegType::vgpr || (glc && ctx->options->chip_class < GFX8)) {
4334 bool global = ctx->options->chip_class >= GFX9;
4335 aco_opcode op;
4336 switch (num_bytes) {
4337 case 4:
4338 op = global ? aco_opcode::global_load_dword : aco_opcode::flat_load_dword;
4339 break;
4340 case 8:
4341 op = global ? aco_opcode::global_load_dwordx2 : aco_opcode::flat_load_dwordx2;
4342 break;
4343 case 12:
4344 op = global ? aco_opcode::global_load_dwordx3 : aco_opcode::flat_load_dwordx3;
4345 break;
4346 case 16:
4347 op = global ? aco_opcode::global_load_dwordx4 : aco_opcode::flat_load_dwordx4;
4348 break;
4349 default:
4350 unreachable("load_global not implemented for this size.");
4351 }
4352 aco_ptr<FLAT_instruction> flat{create_instruction<FLAT_instruction>(op, global ? Format::GLOBAL : Format::FLAT, 2, 1)};
4353 flat->operands[0] = Operand(addr);
4354 flat->operands[1] = Operand(s1);
4355 flat->glc = glc;
4356 flat->dlc = dlc;
4357
4358 if (dst.type() == RegType::sgpr) {
4359 Temp vec = bld.tmp(RegType::vgpr, dst.size());
4360 flat->definitions[0] = Definition(vec);
4361 ctx->block->instructions.emplace_back(std::move(flat));
4362 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec);
4363 } else {
4364 flat->definitions[0] = Definition(dst);
4365 ctx->block->instructions.emplace_back(std::move(flat));
4366 }
4367 emit_split_vector(ctx, dst, num_components);
4368 } else {
4369 switch (num_bytes) {
4370 case 4:
4371 op = aco_opcode::s_load_dword;
4372 break;
4373 case 8:
4374 op = aco_opcode::s_load_dwordx2;
4375 break;
4376 case 12:
4377 case 16:
4378 op = aco_opcode::s_load_dwordx4;
4379 break;
4380 default:
4381 unreachable("load_global not implemented for this size.");
4382 }
4383 aco_ptr<SMEM_instruction> load{create_instruction<SMEM_instruction>(op, Format::SMEM, 2, 1)};
4384 load->operands[0] = Operand(addr);
4385 load->operands[1] = Operand(0u);
4386 load->definitions[0] = Definition(dst);
4387 load->glc = glc;
4388 load->dlc = dlc;
4389 load->barrier = barrier_buffer;
4390 assert(ctx->options->chip_class >= GFX8 || !glc);
4391
4392 if (dst.size() == 3) {
4393 /* trim vector */
4394 Temp vec = bld.tmp(s4);
4395 load->definitions[0] = Definition(vec);
4396 ctx->block->instructions.emplace_back(std::move(load));
4397 emit_split_vector(ctx, vec, 4);
4398
4399 bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
4400 emit_extract_vector(ctx, vec, 0, s1),
4401 emit_extract_vector(ctx, vec, 1, s1),
4402 emit_extract_vector(ctx, vec, 2, s1));
4403 } else {
4404 ctx->block->instructions.emplace_back(std::move(load));
4405 }
4406 }
4407 }
4408
4409 void visit_store_global(isel_context *ctx, nir_intrinsic_instr *instr)
4410 {
4411 Builder bld(ctx->program, ctx->block);
4412 unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
4413
4414 Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
4415 Temp addr = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
4416
4417 unsigned writemask = nir_intrinsic_write_mask(instr);
4418 while (writemask) {
4419 int start, count;
4420 u_bit_scan_consecutive_range(&writemask, &start, &count);
4421 unsigned num_bytes = count * elem_size_bytes;
4422
4423 Temp write_data = data;
4424 if (count != instr->num_components) {
4425 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
4426 for (int i = 0; i < count; i++)
4427 vec->operands[i] = Operand(emit_extract_vector(ctx, data, start + i, v1));
4428 write_data = bld.tmp(RegType::vgpr, count);
4429 vec->definitions[0] = Definition(write_data);
4430 ctx->block->instructions.emplace_back(std::move(vec));
4431 }
4432
4433 unsigned offset = start * elem_size_bytes;
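/* FLAT instructions before GFX9 have no immediate offset, so fold the constant offset
 * into the 64-bit address with an add-with-carry. */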
4434 if (offset > 0 && ctx->options->chip_class < GFX9) {
4435 Temp addr0 = bld.tmp(v1), addr1 = bld.tmp(v1);
4436 Temp new_addr0 = bld.tmp(v1), new_addr1 = bld.tmp(v1);
4437 Temp carry = bld.tmp(s2);
4438 bld.pseudo(aco_opcode::p_split_vector, Definition(addr0), Definition(addr1), addr);
4439
4440 bld.vop2(aco_opcode::v_add_co_u32, Definition(new_addr0), bld.hint_vcc(Definition(carry)),
4441 Operand(offset), addr0);
4442 bld.vop2(aco_opcode::v_addc_co_u32, Definition(new_addr1), bld.def(s2),
4443 Operand(0u), addr1,
4444 carry).def(1).setHint(vcc);
4445
4446 addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), new_addr0, new_addr1);
4447
4448 offset = 0;
4449 }
4450
4451 bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
4452 bool global = ctx->options->chip_class >= GFX9;
4453 aco_opcode op;
4454 switch (num_bytes) {
4455 case 4:
4456 op = global ? aco_opcode::global_store_dword : aco_opcode::flat_store_dword;
4457 break;
4458 case 8:
4459 op = global ? aco_opcode::global_store_dwordx2 : aco_opcode::flat_store_dwordx2;
4460 break;
4461 case 12:
4462 op = global ? aco_opcode::global_store_dwordx3 : aco_opcode::flat_store_dwordx3;
4463 break;
4464 case 16:
4465 op = global ? aco_opcode::global_store_dwordx4 : aco_opcode::flat_store_dwordx4;
4466 break;
4467 default:
4468 unreachable("store_global not implemented for this size.");
4469 }
4470 aco_ptr<FLAT_instruction> flat{create_instruction<FLAT_instruction>(op, global ? Format::GLOBAL : Format::FLAT, 3, 0)};
4471 flat->operands[0] = Operand(addr);
4472 flat->operands[1] = Operand(s1);
4473 flat->operands[2] = Operand(data);
4474 flat->glc = glc;
4475 flat->dlc = false;
4476 flat->offset = offset;
4477 ctx->block->instructions.emplace_back(std::move(flat));
4478 }
4479 }
4480
4481 void emit_memory_barrier(isel_context *ctx, nir_intrinsic_instr *instr) {
4482 Builder bld(ctx->program, ctx->block);
4483 switch(instr->intrinsic) {
4484 case nir_intrinsic_group_memory_barrier:
4485 case nir_intrinsic_memory_barrier:
4486 bld.barrier(aco_opcode::p_memory_barrier_all);
4487 break;
4488 case nir_intrinsic_memory_barrier_atomic_counter:
4489 bld.barrier(aco_opcode::p_memory_barrier_atomic);
4490 break;
4491 case nir_intrinsic_memory_barrier_buffer:
4492 bld.barrier(aco_opcode::p_memory_barrier_buffer);
4493 break;
4494 case nir_intrinsic_memory_barrier_image:
4495 bld.barrier(aco_opcode::p_memory_barrier_image);
4496 break;
4497 case nir_intrinsic_memory_barrier_shared:
4498 bld.barrier(aco_opcode::p_memory_barrier_shared);
4499 break;
4500 default:
4501 unreachable("Unimplemented memory barrier intrinsic");
4502 break;
4503 }
4504 }
4505
4506 Operand load_lds_size_m0(isel_context *ctx)
4507 {
4508 /* TODO: m0 does not need to be initialized on GFX9+ */
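/* On older chips m0 limits the addressable LDS range for DS instructions, so set it to the maximum. */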
4509 Builder bld(ctx->program, ctx->block);
4510 return bld.m0((Temp)bld.sopk(aco_opcode::s_movk_i32, bld.def(s1, m0), 0xffff));
4511 }
4512
4513
4514 void visit_load_shared(isel_context *ctx, nir_intrinsic_instr *instr)
4515 {
4516 // TODO: implement sparse reads using ds_read2_b32 and nir_ssa_def_components_read()
4517 Operand m = load_lds_size_m0(ctx);
4518 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
4519 assert(instr->dest.ssa.bit_size >= 32 && "Bitsize not supported in load_shared.");
4520 Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
4521 Builder bld(ctx->program, ctx->block);
4522
4523 unsigned elem_size_bytes = instr->dest.ssa.bit_size / 8;
4524 unsigned bytes_read = 0;
4525 unsigned result_size = 0;
4526 unsigned total_bytes = instr->num_components * elem_size_bytes;
4527 unsigned align = nir_intrinsic_align_mul(instr) ? nir_intrinsic_align(instr) : instr->dest.ssa.bit_size / 8;
4528 std::array<Temp, 4> result;
4529
4530 while (bytes_read < total_bytes) {
4531 unsigned todo = total_bytes - bytes_read;
4532 bool aligned8 = bytes_read % 8 == 0 && align % 8 == 0;
4533 bool aligned16 = bytes_read % 16 == 0 && align % 16 == 0;
4534
4535 aco_opcode op = aco_opcode::last_opcode;
4536 if (todo >= 16 && aligned16) {
4537 op = aco_opcode::ds_read_b128;
4538 todo = 16;
4539 } else if (todo >= 12 && aligned16) {
4540 op = aco_opcode::ds_read_b96;
4541 todo = 12;
4542 } else if (todo >= 8) {
4543 op = aligned8 ? aco_opcode::ds_read_b64 : aco_opcode::ds_read2_b32;
4544 todo = 8;
4545 } else if (todo >= 4) {
4546 op = aco_opcode::ds_read_b32;
4547 todo = 4;
4548 } else {
4549 assert(false);
4550 }
4551 assert(todo % elem_size_bytes == 0);
4552 unsigned num_elements = todo / elem_size_bytes;
4553 unsigned offset = nir_intrinsic_base(instr) + bytes_read;
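/* ds_read2_b32 encodes two 8-bit dword offsets ((offset >> 2) and (offset >> 2) + 1),
 * so the byte offset must stay below 1020; the other DS reads take a 16-bit byte offset. */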
4554 unsigned max_offset = op == aco_opcode::ds_read2_b32 ? 1019 : 65535;
4555
4556 Temp address_offset = address;
4557 if (offset > max_offset) {
4558 address_offset = bld.vadd32(bld.def(v1), Operand((uint32_t)nir_intrinsic_base(instr)), address_offset);
4559 offset = bytes_read;
4560 }
4561 assert(offset <= max_offset); /* bytes_read shouldn't be large enough for this to happen */
4562
4563 Temp res;
4564 if (instr->num_components == 1 && dst.type() == RegType::vgpr)
4565 res = dst;
4566 else
4567 res = bld.tmp(RegClass(RegType::vgpr, todo / 4));
4568
4569 if (op == aco_opcode::ds_read2_b32)
4570 res = bld.ds(op, Definition(res), address_offset, m, offset >> 2, (offset >> 2) + 1);
4571 else
4572 res = bld.ds(op, Definition(res), address_offset, m, offset);
4573
4574 if (instr->num_components == 1) {
4575 assert(todo == total_bytes);
4576 if (dst.type() == RegType::sgpr)
4577 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), res);
4578 return;
4579 }
4580
4581 if (dst.type() == RegType::sgpr)
4582 res = bld.as_uniform(res);
4583
4584 if (num_elements == 1) {
4585 result[result_size++] = res;
4586 } else {
4587 assert(res != dst && res.size() % num_elements == 0);
4588 aco_ptr<Pseudo_instruction> split{create_instruction<Pseudo_instruction>(aco_opcode::p_split_vector, Format::PSEUDO, 1, num_elements)};
4589 split->operands[0] = Operand(res);
4590 for (unsigned i = 0; i < num_elements; i++)
4591 split->definitions[i] = Definition(result[result_size++] = bld.tmp(res.type(), elem_size_bytes / 4));
4592 ctx->block->instructions.emplace_back(std::move(split));
4593 }
4594
4595 bytes_read += todo;
4596 }
4597
4598 assert(result_size == instr->num_components && result_size > 1);
4599 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, result_size, 1)};
4600 for (unsigned i = 0; i < result_size; i++)
4601 vec->operands[i] = Operand(result[i]);
4602 vec->definitions[0] = Definition(dst);
4603 ctx->block->instructions.emplace_back(std::move(vec));
4604 ctx->allocated_vec.emplace(dst.id(), result);
4605 }
4606
4607 void ds_write_helper(isel_context *ctx, Operand m, Temp address, Temp data, unsigned offset0, unsigned offset1, unsigned align)
4608 {
4609 Builder bld(ctx->program, ctx->block);
4610 unsigned bytes_written = 0;
4611 while (bytes_written < data.size() * 4) {
4612 unsigned todo = data.size() * 4 - bytes_written;
4613 bool aligned8 = bytes_written % 8 == 0 && align % 8 == 0;
4614 bool aligned16 = bytes_written % 16 == 0 && align % 16 == 0;
4615
4616 aco_opcode op = aco_opcode::last_opcode;
4617 unsigned size = 0;
4618 if (todo >= 16 && aligned16) {
4619 op = aco_opcode::ds_write_b128;
4620 size = 4;
4621 } else if (todo >= 12 && aligned16) {
4622 op = aco_opcode::ds_write_b96;
4623 size = 3;
4624 } else if (todo >= 8) {
4625 op = aligned8 ? aco_opcode::ds_write_b64 : aco_opcode::ds_write2_b32;
4626 size = 2;
4627 } else if (todo >= 4) {
4628 op = aco_opcode::ds_write_b32;
4629 size = 1;
4630 } else {
4631 assert(false);
4632 }
4633
4634 bool write2 = op == aco_opcode::ds_write2_b32;
4635 unsigned offset = offset0 + offset1 + bytes_written;
4636 unsigned max_offset = write2 ? 1020 : 65535;
4637 Temp address_offset = address;
4638 if (offset > max_offset) {
4639 address_offset = bld.vadd32(bld.def(v1), Operand(offset0), address_offset);
4640 offset = offset1 + bytes_written;
4641 }
4642 assert(offset <= max_offset); /* offset1 shouldn't be large enough for this to happen */
4643
4644 if (write2) {
4645 Temp val0 = emit_extract_vector(ctx, data, bytes_written >> 2, v1);
4646 Temp val1 = emit_extract_vector(ctx, data, (bytes_written >> 2) + 1, v1);
4647 bld.ds(op, address_offset, val0, val1, m, offset >> 2, (offset >> 2) + 1);
4648 } else {
4649 Temp val = emit_extract_vector(ctx, data, bytes_written >> 2, RegClass(RegType::vgpr, size));
4650 bld.ds(op, address_offset, val, m, offset);
4651 }
4652
4653 bytes_written += size * 4;
4654 }
4655 }
4656
4657 void visit_store_shared(isel_context *ctx, nir_intrinsic_instr *instr)
4658 {
4659 unsigned offset = nir_intrinsic_base(instr);
4660 unsigned writemask = nir_intrinsic_write_mask(instr);
4661 Operand m = load_lds_size_m0(ctx);
4662 Temp data = get_ssa_temp(ctx, instr->src[0].ssa);
4663 Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
4664 unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
4665 assert(elem_size_bytes >= 4 && "Only 32bit & 64bit store_shared currently supported.");
4666
4667 /* we need at most two stores for 32bit variables */
4668 int start[2], count[2];
4669 u_bit_scan_consecutive_range(&writemask, &start[0], &count[0]);
4670 u_bit_scan_consecutive_range(&writemask, &start[1], &count[1]);
4671 assert(writemask == 0);
4672
4673 /* one combined store is sufficient */
4674 if (count[0] == count[1]) {
4675 Builder bld(ctx->program, ctx->block);
4676
4677 Temp address_offset = address;
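/* The two ds_write2 offsets are only 8 bits wide; if they would overflow,
 * add the base offset into the address instead. */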
4678 if ((offset >> 2) + start[1] > 255) {
4679 address_offset = bld.vadd32(bld.def(v1), Operand(offset), address_offset);
4680 offset = 0;
4681 }
4682
4683 assert(count[0] == 1);
4684 Temp val0 = emit_extract_vector(ctx, data, start[0], v1);
4685 Temp val1 = emit_extract_vector(ctx, data, start[1], v1);
4686 aco_opcode op = elem_size_bytes == 4 ? aco_opcode::ds_write2_b32 : aco_opcode::ds_write2_b64;
4687 offset = offset / elem_size_bytes;
4688 bld.ds(op, address_offset, val0, val1, m,
4689 offset + start[0], offset + start[1]);
4690 return;
4691 }
4692
4693 unsigned align = nir_intrinsic_align_mul(instr) ? nir_intrinsic_align(instr) : elem_size_bytes;
4694 for (unsigned i = 0; i < 2; i++) {
4695 if (count[i] == 0)
4696 continue;
4697
4698 Temp write_data = emit_extract_vector(ctx, data, start[i], RegClass(RegType::vgpr, count[i] * elem_size_bytes / 4));
4699 ds_write_helper(ctx, m, address, write_data, offset, start[i] * elem_size_bytes, align);
4700 }
4701 return;
4702 }
4703
4704 void visit_shared_atomic(isel_context *ctx, nir_intrinsic_instr *instr)
4705 {
4706 unsigned offset = nir_intrinsic_base(instr);
4707 Operand m = load_lds_size_m0(ctx);
4708 Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
4709 Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
4710
4711 unsigned num_operands = 3;
4712 aco_opcode op32, op64, op32_rtn, op64_rtn;
4713 switch(instr->intrinsic) {
4714 case nir_intrinsic_shared_atomic_add:
4715 op32 = aco_opcode::ds_add_u32;
4716 op64 = aco_opcode::ds_add_u64;
4717 op32_rtn = aco_opcode::ds_add_rtn_u32;
4718 op64_rtn = aco_opcode::ds_add_rtn_u64;
4719 break;
4720 case nir_intrinsic_shared_atomic_imin:
4721 op32 = aco_opcode::ds_min_i32;
4722 op64 = aco_opcode::ds_min_i64;
4723 op32_rtn = aco_opcode::ds_min_rtn_i32;
4724 op64_rtn = aco_opcode::ds_min_rtn_i64;
4725 break;
4726 case nir_intrinsic_shared_atomic_umin:
4727 op32 = aco_opcode::ds_min_u32;
4728 op64 = aco_opcode::ds_min_u64;
4729 op32_rtn = aco_opcode::ds_min_rtn_u32;
4730 op64_rtn = aco_opcode::ds_min_rtn_u64;
4731 break;
4732 case nir_intrinsic_shared_atomic_imax:
4733 op32 = aco_opcode::ds_max_i32;
4734 op64 = aco_opcode::ds_max_i64;
4735 op32_rtn = aco_opcode::ds_max_rtn_i32;
4736 op64_rtn = aco_opcode::ds_max_rtn_i64;
4737 break;
4738 case nir_intrinsic_shared_atomic_umax:
4739 op32 = aco_opcode::ds_max_u32;
4740 op64 = aco_opcode::ds_max_u64;
4741 op32_rtn = aco_opcode::ds_max_rtn_u32;
4742 op64_rtn = aco_opcode::ds_max_rtn_u64;
4743 break;
4744 case nir_intrinsic_shared_atomic_and:
4745 op32 = aco_opcode::ds_and_b32;
4746 op64 = aco_opcode::ds_and_b64;
4747 op32_rtn = aco_opcode::ds_and_rtn_b32;
4748 op64_rtn = aco_opcode::ds_and_rtn_b64;
4749 break;
4750 case nir_intrinsic_shared_atomic_or:
4751 op32 = aco_opcode::ds_or_b32;
4752 op64 = aco_opcode::ds_or_b64;
4753 op32_rtn = aco_opcode::ds_or_rtn_b32;
4754 op64_rtn = aco_opcode::ds_or_rtn_b64;
4755 break;
4756 case nir_intrinsic_shared_atomic_xor:
4757 op32 = aco_opcode::ds_xor_b32;
4758 op64 = aco_opcode::ds_xor_b64;
4759 op32_rtn = aco_opcode::ds_xor_rtn_b32;
4760 op64_rtn = aco_opcode::ds_xor_rtn_b64;
4761 break;
4762 case nir_intrinsic_shared_atomic_exchange:
4763 op32 = aco_opcode::ds_write_b32;
4764 op64 = aco_opcode::ds_write_b64;
4765 op32_rtn = aco_opcode::ds_wrxchg_rtn_b32;
4766 op64_rtn = aco_opcode::ds_wrxchg_rtn_b64;
4767 break;
4768 case nir_intrinsic_shared_atomic_comp_swap:
4769 op32 = aco_opcode::ds_cmpst_b32;
4770 op64 = aco_opcode::ds_cmpst_b64;
4771 op32_rtn = aco_opcode::ds_cmpst_rtn_b32;
4772 op64_rtn = aco_opcode::ds_cmpst_rtn_b64;
4773 num_operands = 4;
4774 break;
4775 default:
4776 unreachable("Unhandled shared atomic intrinsic");
4777 }
4778
4779 /* return the previous value if dest is ever used */
4780 bool return_previous = false;
4781 nir_foreach_use_safe(use_src, &instr->dest.ssa) {
4782 return_previous = true;
4783 break;
4784 }
4785 nir_foreach_if_use_safe(use_src, &instr->dest.ssa) {
4786 return_previous = true;
4787 break;
4788 }
4789
4790 aco_opcode op;
4791 if (data.size() == 1) {
4792 assert(instr->dest.ssa.bit_size == 32);
4793 op = return_previous ? op32_rtn : op32;
4794 } else {
4795 assert(instr->dest.ssa.bit_size == 64);
4796 op = return_previous ? op64_rtn : op64;
4797 }
4798
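/* The DS immediate offset is 16 bits; fold larger bases into the address. */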
4799 if (offset > 65535) {
4800 Builder bld(ctx->program, ctx->block);
4801 address = bld.vadd32(bld.def(v1), Operand(offset), address);
4802 offset = 0;
4803 }
4804
4805 aco_ptr<DS_instruction> ds;
4806 ds.reset(create_instruction<DS_instruction>(op, Format::DS, num_operands, return_previous ? 1 : 0));
4807 ds->operands[0] = Operand(address);
4808 ds->operands[1] = Operand(data);
4809 if (num_operands == 4)
4810 ds->operands[2] = Operand(get_ssa_temp(ctx, instr->src[2].ssa));
4811 ds->operands[num_operands - 1] = m;
4812 ds->offset0 = offset;
4813 if (return_previous)
4814 ds->definitions[0] = Definition(get_ssa_temp(ctx, &instr->dest.ssa));
4815 ctx->block->instructions.emplace_back(std::move(ds));
4816 }
4817
4818 Temp get_scratch_resource(isel_context *ctx)
4819 {
4820 Builder bld(ctx->program, ctx->block);
4821 Temp scratch_addr = ctx->private_segment_buffer;
4822 if (ctx->stage != compute_cs)
4823 scratch_addr = bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), ctx->private_segment_buffer, Operand(0u));
4824
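/* Build a swizzled buffer descriptor: ADD_TID_ENABLE with an index stride equal to the wave size
 * interleaves the per-lane scratch allocations. */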
4825 uint32_t rsrc_conf = S_008F0C_ADD_TID_ENABLE(1) |
4826 S_008F0C_INDEX_STRIDE(ctx->options->wave_size == 64 ? 3 : 2);
4827
4828 if (ctx->program->chip_class >= GFX10) {
4829 rsrc_conf |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
4830 S_008F0C_OOB_SELECT(3) |
4831 S_008F0C_RESOURCE_LEVEL(1);
4832 } else if (ctx->program->chip_class <= GFX7) { /* dfmt modifies stride on GFX8/GFX9 when ADD_TID_EN=1 */
4833 rsrc_conf |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
4834 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
4835 }
4836
4837 /* older generations need element size = 16 bytes. element size removed in GFX9 */
4838 if (ctx->program->chip_class <= GFX8)
4839 rsrc_conf |= S_008F0C_ELEMENT_SIZE(3);
4840
4841 return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), scratch_addr, Operand(-1u), Operand(rsrc_conf));
4842 }
4843
4844 void visit_load_scratch(isel_context *ctx, nir_intrinsic_instr *instr) {
4845 assert(instr->dest.ssa.bit_size == 32 || instr->dest.ssa.bit_size == 64);
4846 Builder bld(ctx->program, ctx->block);
4847 Temp rsrc = get_scratch_resource(ctx);
4848 Temp offset = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
4849 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
4850
4851 aco_opcode op;
4852 switch (dst.size()) {
4853 case 1:
4854 op = aco_opcode::buffer_load_dword;
4855 break;
4856 case 2:
4857 op = aco_opcode::buffer_load_dwordx2;
4858 break;
4859 case 3:
4860 op = aco_opcode::buffer_load_dwordx3;
4861 break;
4862 case 4:
4863 op = aco_opcode::buffer_load_dwordx4;
4864 break;
4865 case 6:
4866 case 8: {
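/* MUBUF loads return at most 4 dwords, so 6- and 8-dword results are assembled from two loads. */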
4867 std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
4868 Temp lower = bld.mubuf(aco_opcode::buffer_load_dwordx4,
4869 bld.def(v4), offset, rsrc,
4870 ctx->scratch_offset, 0, true);
4871 Temp upper = bld.mubuf(dst.size() == 6 ? aco_opcode::buffer_load_dwordx2 :
4872 aco_opcode::buffer_load_dwordx4,
4873 dst.size() == 6 ? bld.def(v2) : bld.def(v4),
4874 offset, rsrc, ctx->scratch_offset, 16, true);
4875 emit_split_vector(ctx, lower, 2);
4876 elems[0] = emit_extract_vector(ctx, lower, 0, v2);
4877 elems[1] = emit_extract_vector(ctx, lower, 1, v2);
4878 if (dst.size() == 8) {
4879 emit_split_vector(ctx, upper, 2);
4880 elems[2] = emit_extract_vector(ctx, upper, 0, v2);
4881 elems[3] = emit_extract_vector(ctx, upper, 1, v2);
4882 } else {
4883 elems[2] = upper;
4884 }
4885
4886 aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector,
4887 Format::PSEUDO, dst.size() / 2, 1)};
4888 for (unsigned i = 0; i < dst.size() / 2; i++)
4889 vec->operands[i] = Operand(elems[i]);
4890 vec->definitions[0] = Definition(dst);
4891 bld.insert(std::move(vec));
4892 ctx->allocated_vec.emplace(dst.id(), elems);
4893 return;
4894 }
4895 default:
4896 unreachable("Wrong dst size for nir_intrinsic_load_scratch");
4897 }
4898
4899 bld.mubuf(op, Definition(dst), offset, rsrc, ctx->scratch_offset, 0, true);
4900 emit_split_vector(ctx, dst, instr->num_components);
4901 }
4902
4903 void visit_store_scratch(isel_context *ctx, nir_intrinsic_instr *instr) {
4904 assert(instr->src[0].ssa->bit_size == 32 || instr->src[0].ssa->bit_size == 64);
4905 Builder bld(ctx->program, ctx->block);
4906 Temp rsrc = get_scratch_resource(ctx);
4907 Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
4908 Temp offset = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
4909
4910 unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
4911 unsigned writemask = nir_intrinsic_write_mask(instr);
4912
4913 while (writemask) {
4914 int start, count;
4915 u_bit_scan_consecutive_range(&writemask, &start, &count);
4916 int num_bytes = count * elem_size_bytes;
4917
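/* MUBUF stores at most 4 dwords at once; put the remaining 64-bit components back
 * into the writemask for the next iteration. */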
4918 if (num_bytes > 16) {
4919 assert(elem_size_bytes == 8);
4920 writemask |= (((count - 2) << 1) - 1) << (start + 2);
4921 count = 2;
4922 num_bytes = 16;
4923 }
4924
4925 // TODO: check alignment of sub-dword stores
4926 // TODO: split 3 bytes. there is no store instruction for that
4927
4928 Temp write_data;
4929 if (count != instr->num_components) {
4930 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
4931 for (int i = 0; i < count; i++) {
4932 Temp elem = emit_extract_vector(ctx, data, start + i, RegClass(RegType::vgpr, elem_size_bytes / 4));
4933 vec->operands[i] = Operand(elem);
4934 }
4935 write_data = bld.tmp(RegClass(RegType::vgpr, count * elem_size_bytes / 4));
4936 vec->definitions[0] = Definition(write_data);
4937 ctx->block->instructions.emplace_back(std::move(vec));
4938 } else {
4939 write_data = data;
4940 }
4941
4942 aco_opcode op;
4943 switch (num_bytes) {
4944 case 4:
4945 op = aco_opcode::buffer_store_dword;
4946 break;
4947 case 8:
4948 op = aco_opcode::buffer_store_dwordx2;
4949 break;
4950 case 12:
4951 op = aco_opcode::buffer_store_dwordx3;
4952 break;
4953 case 16:
4954 op = aco_opcode::buffer_store_dwordx4;
4955 break;
4956 default:
4957 unreachable("Invalid data size for nir_intrinsic_store_scratch.");
4958 }
4959
4960 bld.mubuf(op, offset, rsrc, ctx->scratch_offset, write_data, start * elem_size_bytes, true);
4961 }
4962 }
4963
4964 void visit_load_sample_mask_in(isel_context *ctx, nir_intrinsic_instr *instr) {
4965 uint8_t log2_ps_iter_samples;
4966 if (ctx->program->info->ps.force_persample) {
4967 log2_ps_iter_samples =
4968 util_logbase2(ctx->options->key.fs.num_samples);
4969 } else {
4970 log2_ps_iter_samples = ctx->options->key.fs.log2_ps_iter_samples;
4971 }
4972
4973 /* The bit pattern matches that used by fixed function fragment
4974 * processing. */
4975 static const unsigned ps_iter_masks[] = {
4976 0xffff, /* not used */
4977 0x5555,
4978 0x1111,
4979 0x0101,
4980 0x0001,
4981 };
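/* Keep only the coverage bits that belong to the sample(s) shaded by this invocation. */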
4982 assert(log2_ps_iter_samples < ARRAY_SIZE(ps_iter_masks));
4983
4984 Builder bld(ctx->program, ctx->block);
4985
4986 Temp sample_id = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), ctx->fs_inputs[fs_input::ancillary], Operand(8u), Operand(4u));
4987 Temp ps_iter_mask = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(ps_iter_masks[log2_ps_iter_samples]));
4988 Temp mask = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), sample_id, ps_iter_mask);
4989 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
4990 bld.vop2(aco_opcode::v_and_b32, Definition(dst), mask, ctx->fs_inputs[fs_input::sample_coverage]);
4991 }
4992
4993 Temp emit_boolean_reduce(isel_context *ctx, nir_op op, unsigned cluster_size, Temp src)
4994 {
4995 Builder bld(ctx->program, ctx->block);
4996
4997 if (cluster_size == 1) {
4998 return src;
4999 } else if (op == nir_op_iand && cluster_size == 4) {
5000 //subgroupClusteredAnd(val, 4) -> ~wqm(exec & ~val)
5001 Temp tmp = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.def(s1, scc), Operand(exec, s2), src);
5002 return bld.sop1(aco_opcode::s_not_b64, bld.def(s2), bld.def(s1, scc),
5003 bld.sop1(aco_opcode::s_wqm_b64, bld.def(s2), bld.def(s1, scc), tmp));
5004 } else if (op == nir_op_ior && cluster_size == 4) {
5005 //subgroupClusteredOr(val, 4) -> wqm(val & exec)
5006 return bld.sop1(aco_opcode::s_wqm_b64, bld.def(s2), bld.def(s1, scc),
5007 bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2)));
5008 } else if (op == nir_op_iand && cluster_size == 64) {
5009 //subgroupAnd(val) -> (exec & ~val) == 0
5010 Temp tmp = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.def(s1, scc), Operand(exec, s2), src).def(1).getTemp();
5011 return bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), tmp, Operand(0u));
5012 } else if (op == nir_op_ior && cluster_size == 64) {
5013 //subgroupOr(val) -> (val & exec) != 0
5014 return bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2)).def(1).getTemp();
5015 } else if (op == nir_op_ixor && cluster_size == 64) {
5016 //subgroupXor(val) -> s_bcnt1_i32_b64(val & exec) & 1
5017 Temp tmp = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2));
5018 tmp = bld.sop1(aco_opcode::s_bcnt1_i32_b64, bld.def(s2), bld.def(s1, scc), tmp);
5019 return bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), tmp, Operand(1u)).def(1).getTemp();
5020 } else {
5021 //subgroupClustered{And,Or,Xor}(val, n) ->
5022 //lane_id = v_mbcnt_hi_u32_b32(-1, v_mbcnt_lo_u32_b32(-1, 0))
5023 //cluster_offset = ~(n - 1) & lane_id
5024 //cluster_mask = ((1 << n) - 1)
5025 //subgroupClusteredAnd():
5026 // return ((val | ~exec) >> cluster_offset) & cluster_mask == cluster_mask
5027 //subgroupClusteredOr():
5028 // return ((val & exec) >> cluster_offset) & cluster_mask != 0
5029 //subgroupClusteredXor():
5030 // return v_bcnt_u32_b32(((val & exec) >> cluster_offset) & cluster_mask, 0) & 1 != 0
5031 Temp lane_id = bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32, bld.def(v1), Operand((uint32_t) -1),
5032 bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, bld.def(v1), Operand((uint32_t) -1), Operand(0u)));
5033 Temp cluster_offset = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(~uint32_t(cluster_size - 1)), lane_id);
5034
5035 Temp tmp;
5036 if (op == nir_op_iand)
5037 tmp = bld.sop2(aco_opcode::s_orn2_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2));
5038 else
5039 tmp = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2));
5040
5041 uint32_t cluster_mask = cluster_size == 32 ? -1 : (1u << cluster_size) - 1u;
5042 tmp = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), cluster_offset, tmp);
5043 tmp = emit_extract_vector(ctx, tmp, 0, v1);
5044 if (cluster_mask != 0xffffffff)
5045 tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(cluster_mask), tmp);
5046
5047 Definition cmp_def = Definition();
5048 if (op == nir_op_iand) {
5049 cmp_def = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(s2), Operand(cluster_mask), tmp).def(0);
5050 } else if (op == nir_op_ior) {
5051 cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(s2), Operand(0u), tmp).def(0);
5052 } else if (op == nir_op_ixor) {
5053 tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(1u),
5054 bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), tmp, Operand(0u)));
5055 cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(s2), Operand(0u), tmp).def(0);
5056 }
5057 cmp_def.setHint(vcc);
5058 return cmp_def.getTemp();
5059 }
5060 }
5061
5062 Temp emit_boolean_exclusive_scan(isel_context *ctx, nir_op op, Temp src)
5063 {
5064 Builder bld(ctx->program, ctx->block);
5065
5066 //subgroupExclusiveAnd(val) -> mbcnt(exec & ~val) == 0
5067 //subgroupExclusiveOr(val) -> mbcnt(val & exec) != 0
5068 //subgroupExclusiveXor(val) -> mbcnt(val & exec) & 1 != 0
5069 Temp tmp;
5070 if (op == nir_op_iand)
5071 tmp = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.def(s1, scc), Operand(exec, s2), src);
5072 else
5073 tmp = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2));
5074
5075 Builder::Result lohi = bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), bld.def(s1), tmp);
5076 Temp lo = lohi.def(0).getTemp();
5077 Temp hi = lohi.def(1).getTemp();
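/* mbcnt counts the set bits below the current lane, i.e. the number of preceding
 * active lanes for which the value is set. */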
5078 Temp mbcnt = bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32, bld.def(v1), hi,
5079 bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, bld.def(v1), lo, Operand(0u)));
5080
5081 Definition cmp_def = Definition();
5082 if (op == nir_op_iand)
5083 cmp_def = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(s2), Operand(0u), mbcnt).def(0);
5084 else if (op == nir_op_ior)
5085 cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(s2), Operand(0u), mbcnt).def(0);
5086 else if (op == nir_op_ixor)
5087 cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(s2), Operand(0u),
5088 bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(1u), mbcnt)).def(0);
5089 cmp_def.setHint(vcc);
5090 return cmp_def.getTemp();
5091 }
5092
5093 Temp emit_boolean_inclusive_scan(isel_context *ctx, nir_op op, Temp src)
5094 {
5095 Builder bld(ctx->program, ctx->block);
5096
5097 //subgroupInclusiveAnd(val) -> subgroupExclusiveAnd(val) && val
5098 //subgroupInclusiveOr(val) -> subgroupExclusiveOr(val) || val
5099 //subgroupInclusiveXor(val) -> subgroupExclusiveXor(val) ^^ val
5100 Temp tmp = emit_boolean_exclusive_scan(ctx, op, src);
5101 if (op == nir_op_iand)
5102 return bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), tmp, src);
5103 else if (op == nir_op_ior)
5104 return bld.sop2(aco_opcode::s_or_b64, bld.def(s2), bld.def(s1, scc), tmp, src);
5105 else if (op == nir_op_ixor)
5106 return bld.sop2(aco_opcode::s_xor_b64, bld.def(s2), bld.def(s1, scc), tmp, src);
5107
5108 assert(false);
5109 return Temp();
5110 }
5111
5112 void emit_uniform_subgroup(isel_context *ctx, nir_intrinsic_instr *instr, Temp src)
5113 {
5114 Builder bld(ctx->program, ctx->block);
5115 Definition dst(get_ssa_temp(ctx, &instr->dest.ssa));
5116 if (src.regClass().type() == RegType::vgpr) {
5117 bld.pseudo(aco_opcode::p_as_uniform, dst, src);
5118 } else if (instr->dest.ssa.bit_size == 1 && src.regClass() == s2) {
5119 bld.sopc(aco_opcode::s_cmp_lg_u64, bld.scc(dst), Operand(0u), Operand(src));
5120 } else if (src.regClass() == s1) {
5121 bld.sop1(aco_opcode::s_mov_b32, dst, src);
5122 } else if (src.regClass() == s2) {
5123 bld.sop1(aco_opcode::s_mov_b64, dst, src);
5124 } else {
5125 fprintf(stderr, "Unimplemented NIR instr bit size: ");
5126 nir_print_instr(&instr->instr, stderr);
5127 fprintf(stderr, "\n");
5128 }
5129 }
5130
5131 void emit_interp_center(isel_context *ctx, Temp dst, Temp pos1, Temp pos2)
5132 {
5133 Builder bld(ctx->program, ctx->block);
5134 Temp p1 = ctx->fs_inputs[fs_input::persp_center_p1];
5135 Temp p2 = ctx->fs_inputs[fs_input::persp_center_p2];
5136
5137 /* Build DD X/Y */
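/* quad_perm(0,0,0,0) broadcasts the top-left lane of each quad; subtracting it from
 * lanes 1 and 2 yields the per-quad derivatives along x and y. */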
5138 Temp tl_1 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), p1, dpp_quad_perm(0, 0, 0, 0));
5139 Temp ddx_1 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p1, tl_1, dpp_quad_perm(1, 1, 1, 1));
5140 Temp ddy_1 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p1, tl_1, dpp_quad_perm(2, 2, 2, 2));
5141 Temp tl_2 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), p2, dpp_quad_perm(0, 0, 0, 0));
5142 Temp ddx_2 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p2, tl_2, dpp_quad_perm(1, 1, 1, 1));
5143 Temp ddy_2 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p2, tl_2, dpp_quad_perm(2, 2, 2, 2));
5144
5145 /* res_k = p_k + ddx_k * pos1 + ddy_k * pos2 */
5146 Temp tmp1 = bld.vop3(aco_opcode::v_mad_f32, bld.def(v1), ddx_1, pos1, p1);
5147 Temp tmp2 = bld.vop3(aco_opcode::v_mad_f32, bld.def(v1), ddx_2, pos1, p2);
5148 tmp1 = bld.vop3(aco_opcode::v_mad_f32, bld.def(v1), ddy_1, pos2, tmp1);
5149 tmp2 = bld.vop3(aco_opcode::v_mad_f32, bld.def(v1), ddy_2, pos2, tmp2);
5150 Temp wqm1 = bld.tmp(v1);
5151 emit_wqm(ctx, tmp1, wqm1, true);
5152 Temp wqm2 = bld.tmp(v1);
5153 emit_wqm(ctx, tmp2, wqm2, true);
5154 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), wqm1, wqm2);
5155 return;
5156 }
5157
5158 void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
5159 {
5160 Builder bld(ctx->program, ctx->block);
5161 switch(instr->intrinsic) {
5162 case nir_intrinsic_load_barycentric_sample:
5163 case nir_intrinsic_load_barycentric_pixel:
5164 case nir_intrinsic_load_barycentric_centroid: {
5165 glsl_interp_mode mode = (glsl_interp_mode)nir_intrinsic_interp_mode(instr);
5166 fs_input input = get_interp_input(instr->intrinsic, mode);
5167
5168 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5169 if (input == fs_input::max_inputs) {
5170 bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
5171 Operand(0u), Operand(0u));
5172 } else {
5173 bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
5174 ctx->fs_inputs[input],
5175 ctx->fs_inputs[input + 1]);
5176 }
5177 emit_split_vector(ctx, dst, 2);
5178 break;
5179 }
5180 case nir_intrinsic_load_barycentric_at_sample: {
5181 uint32_t sample_pos_offset = RING_PS_SAMPLE_POSITIONS * 16;
5182 switch (ctx->options->key.fs.num_samples) {
5183 case 2: sample_pos_offset += 1 << 3; break;
5184 case 4: sample_pos_offset += 3 << 3; break;
5185 case 8: sample_pos_offset += 7 << 3; break;
5186 default: break;
5187 }
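/* The 1x/2x/4x/8x sample position tables are stored back to back (8 bytes per entry),
 * so the table for N samples starts N-1 entries into the ring. */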
5188 Temp sample_pos;
5189 Temp addr = get_ssa_temp(ctx, instr->src[0].ssa);
5190 nir_const_value* const_addr = nir_src_as_const_value(instr->src[0]);
5191 if (addr.type() == RegType::sgpr) {
5192 Operand offset;
5193 if (const_addr) {
5194 sample_pos_offset += const_addr->u32 << 3;
5195 offset = Operand(sample_pos_offset);
5196 } else if (ctx->options->chip_class >= GFX9) {
5197 offset = bld.sop2(aco_opcode::s_lshl3_add_u32, bld.def(s1), bld.def(s1, scc), addr, Operand(sample_pos_offset));
5198 } else {
5199 offset = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), addr, Operand(3u));
5200 offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset, Operand(sample_pos_offset));
5201 }
5202 addr = ctx->private_segment_buffer;
5203 sample_pos = bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), addr, Operand(offset));
5204
5205 } else if (ctx->options->chip_class >= GFX9) {
5206 addr = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(3u), addr);
5207 sample_pos = bld.global(aco_opcode::global_load_dwordx2, bld.def(v2), addr, ctx->private_segment_buffer, sample_pos_offset);
5208 } else {
5209 /* addr += ctx->private_segment_buffer + sample_pos_offset */
5210 Temp tmp0 = bld.tmp(s1);
5211 Temp tmp1 = bld.tmp(s1);
5212 bld.pseudo(aco_opcode::p_split_vector, Definition(tmp0), Definition(tmp1), ctx->private_segment_buffer);
5213 Definition scc_tmp = bld.def(s1, scc);
5214 tmp0 = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), scc_tmp, tmp0, Operand(sample_pos_offset));
5215 tmp1 = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), tmp1, Operand(0u), scc_tmp.getTemp());
5216 addr = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(3u), addr);
5217 Temp pck0 = bld.tmp(v1);
5218 Temp carry = bld.vadd32(Definition(pck0), tmp0, addr, true).def(1).getTemp();
5219 tmp1 = as_vgpr(ctx, tmp1);
5220 Temp pck1 = bld.vop2_e64(aco_opcode::v_addc_co_u32, bld.def(v1), bld.hint_vcc(bld.def(s2)), tmp1, Operand(0u), carry);
5221 addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), pck0, pck1);
5222
5223 /* sample_pos = flat_load_dwordx2 addr */
5224 sample_pos = bld.flat(aco_opcode::flat_load_dwordx2, bld.def(v2), addr, Operand(s1));
5225 }
5226
5227 /* sample_pos -= 0.5 */
5228 Temp pos1 = bld.tmp(RegClass(sample_pos.type(), 1));
5229 Temp pos2 = bld.tmp(RegClass(sample_pos.type(), 1));
5230 bld.pseudo(aco_opcode::p_split_vector, Definition(pos1), Definition(pos2), sample_pos);
5231 pos1 = bld.vop2_e64(aco_opcode::v_sub_f32, bld.def(v1), pos1, Operand(0x3f000000u));
5232 pos2 = bld.vop2_e64(aco_opcode::v_sub_f32, bld.def(v1), pos2, Operand(0x3f000000u));
5233
5234 emit_interp_center(ctx, get_ssa_temp(ctx, &instr->dest.ssa), pos1, pos2);
5235 break;
5236 }
5237 case nir_intrinsic_load_barycentric_at_offset: {
5238 Temp offset = get_ssa_temp(ctx, instr->src[0].ssa);
5239 RegClass rc = RegClass(offset.type(), 1);
5240 Temp pos1 = bld.tmp(rc), pos2 = bld.tmp(rc);
5241 bld.pseudo(aco_opcode::p_split_vector, Definition(pos1), Definition(pos2), offset);
5242 emit_interp_center(ctx, get_ssa_temp(ctx, &instr->dest.ssa), pos1, pos2);
5243 break;
5244 }
5245 case nir_intrinsic_load_front_face: {
5246 bld.vopc(aco_opcode::v_cmp_lg_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
5247 Operand(0u), ctx->fs_inputs[fs_input::front_face]).def(0).setHint(vcc);
5248 break;
5249 }
5250 case nir_intrinsic_load_view_index:
5251 case nir_intrinsic_load_layer_id: {
5252 if (instr->intrinsic == nir_intrinsic_load_view_index && (ctx->stage & sw_vs)) {
5253 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5254 bld.copy(Definition(dst), Operand(ctx->view_index));
5255 break;
5256 }
5257
5258 unsigned idx = nir_intrinsic_base(instr);
5259 bld.vintrp(aco_opcode::v_interp_mov_f32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
5260 Operand(2u), bld.m0(ctx->prim_mask), idx, 0);
5261 break;
5262 }
5263 case nir_intrinsic_load_frag_coord: {
5264 emit_load_frag_coord(ctx, get_ssa_temp(ctx, &instr->dest.ssa), 4);
5265 break;
5266 }
5267 case nir_intrinsic_load_sample_pos: {
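/* The sample position within the pixel is the fractional part of the per-sample fragment position. */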
5268 Temp posx = ctx->fs_inputs[fs_input::frag_pos_0];
5269 Temp posy = ctx->fs_inputs[fs_input::frag_pos_1];
5270 bld.pseudo(aco_opcode::p_create_vector, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
5271 posx.id() ? bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), posx) : Operand(0u),
5272 posy.id() ? bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), posy) : Operand(0u));
5273 break;
5274 }
5275 case nir_intrinsic_load_interpolated_input:
5276 visit_load_interpolated_input(ctx, instr);
5277 break;
5278 case nir_intrinsic_store_output:
5279 visit_store_output(ctx, instr);
5280 break;
5281 case nir_intrinsic_load_input:
5282 visit_load_input(ctx, instr);
5283 break;
5284 case nir_intrinsic_load_ubo:
5285 visit_load_ubo(ctx, instr);
5286 break;
5287 case nir_intrinsic_load_push_constant:
5288 visit_load_push_constant(ctx, instr);
5289 break;
5290 case nir_intrinsic_load_constant:
5291 visit_load_constant(ctx, instr);
5292 break;
5293 case nir_intrinsic_vulkan_resource_index:
5294 visit_load_resource(ctx, instr);
5295 break;
5296 case nir_intrinsic_discard:
5297 visit_discard(ctx, instr);
5298 break;
5299 case nir_intrinsic_discard_if:
5300 visit_discard_if(ctx, instr);
5301 break;
5302 case nir_intrinsic_load_shared:
5303 visit_load_shared(ctx, instr);
5304 break;
5305 case nir_intrinsic_store_shared:
5306 visit_store_shared(ctx, instr);
5307 break;
5308 case nir_intrinsic_shared_atomic_add:
5309 case nir_intrinsic_shared_atomic_imin:
5310 case nir_intrinsic_shared_atomic_umin:
5311 case nir_intrinsic_shared_atomic_imax:
5312 case nir_intrinsic_shared_atomic_umax:
5313 case nir_intrinsic_shared_atomic_and:
5314 case nir_intrinsic_shared_atomic_or:
5315 case nir_intrinsic_shared_atomic_xor:
5316 case nir_intrinsic_shared_atomic_exchange:
5317 case nir_intrinsic_shared_atomic_comp_swap:
5318 visit_shared_atomic(ctx, instr);
5319 break;
5320 case nir_intrinsic_image_deref_load:
5321 visit_image_load(ctx, instr);
5322 break;
5323 case nir_intrinsic_image_deref_store:
5324 visit_image_store(ctx, instr);
5325 break;
5326 case nir_intrinsic_image_deref_atomic_add:
5327 case nir_intrinsic_image_deref_atomic_umin:
5328 case nir_intrinsic_image_deref_atomic_imin:
5329 case nir_intrinsic_image_deref_atomic_umax:
5330 case nir_intrinsic_image_deref_atomic_imax:
5331 case nir_intrinsic_image_deref_atomic_and:
5332 case nir_intrinsic_image_deref_atomic_or:
5333 case nir_intrinsic_image_deref_atomic_xor:
5334 case nir_intrinsic_image_deref_atomic_exchange:
5335 case nir_intrinsic_image_deref_atomic_comp_swap:
5336 visit_image_atomic(ctx, instr);
5337 break;
5338 case nir_intrinsic_image_deref_size:
5339 visit_image_size(ctx, instr);
5340 break;
5341 case nir_intrinsic_load_ssbo:
5342 visit_load_ssbo(ctx, instr);
5343 break;
5344 case nir_intrinsic_store_ssbo:
5345 visit_store_ssbo(ctx, instr);
5346 break;
5347 case nir_intrinsic_load_global:
5348 visit_load_global(ctx, instr);
5349 break;
5350 case nir_intrinsic_store_global:
5351 visit_store_global(ctx, instr);
5352 break;
5353 case nir_intrinsic_ssbo_atomic_add:
5354 case nir_intrinsic_ssbo_atomic_imin:
5355 case nir_intrinsic_ssbo_atomic_umin:
5356 case nir_intrinsic_ssbo_atomic_imax:
5357 case nir_intrinsic_ssbo_atomic_umax:
5358 case nir_intrinsic_ssbo_atomic_and:
5359 case nir_intrinsic_ssbo_atomic_or:
5360 case nir_intrinsic_ssbo_atomic_xor:
5361 case nir_intrinsic_ssbo_atomic_exchange:
5362 case nir_intrinsic_ssbo_atomic_comp_swap:
5363 visit_atomic_ssbo(ctx, instr);
5364 break;
5365 case nir_intrinsic_load_scratch:
5366 visit_load_scratch(ctx, instr);
5367 break;
5368 case nir_intrinsic_store_scratch:
5369 visit_store_scratch(ctx, instr);
5370 break;
5371 case nir_intrinsic_get_buffer_size:
5372 visit_get_buffer_size(ctx, instr);
5373 break;
5374 case nir_intrinsic_barrier: {
5375 unsigned* bsize = ctx->program->info->cs.block_size;
5376 unsigned workgroup_size = bsize[0] * bsize[1] * bsize[2];
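/* A workgroup that fits into a single wave runs in lock-step and needs no barrier. */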
5377 if (workgroup_size > 64)
5378 bld.sopp(aco_opcode::s_barrier);
5379 break;
5380 }
5381 case nir_intrinsic_group_memory_barrier:
5382 case nir_intrinsic_memory_barrier:
5383 case nir_intrinsic_memory_barrier_atomic_counter:
5384 case nir_intrinsic_memory_barrier_buffer:
5385 case nir_intrinsic_memory_barrier_image:
5386 case nir_intrinsic_memory_barrier_shared:
5387 emit_memory_barrier(ctx, instr);
5388 break;
5389 case nir_intrinsic_load_num_work_groups:
5390 case nir_intrinsic_load_work_group_id:
5391 case nir_intrinsic_load_local_invocation_id: {
5392 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5393 Temp* ids;
5394 if (instr->intrinsic == nir_intrinsic_load_num_work_groups)
5395 ids = ctx->num_workgroups;
5396 else if (instr->intrinsic == nir_intrinsic_load_work_group_id)
5397 ids = ctx->workgroup_ids;
5398 else
5399 ids = ctx->local_invocation_ids;
5400 bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
5401 ids[0].id() ? Operand(ids[0]) : Operand(1u),
5402 ids[1].id() ? Operand(ids[1]) : Operand(1u),
5403 ids[2].id() ? Operand(ids[2]) : Operand(1u));
5404 emit_split_vector(ctx, dst, 3);
5405 break;
5406 }
5407 case nir_intrinsic_load_local_invocation_index: {
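/* flat index = (wave id within the workgroup, kept pre-shifted in tg_size bits [11:6]) | lane id */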
5408 Temp id = bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32, bld.def(v1), Operand((uint32_t) -1),
5409 bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, bld.def(v1), Operand((uint32_t) -1), Operand(0u)));
5410 Temp tg_num = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), Operand(0xfc0u), ctx->tg_size);
5411 bld.vop2(aco_opcode::v_or_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), tg_num, id);
5412 break;
5413 }
5414 case nir_intrinsic_load_subgroup_id: {
5415 if (ctx->stage == compute_cs) {
5416 Temp tg_num = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), Operand(0xfc0u), ctx->tg_size);
5417 bld.sop2(aco_opcode::s_lshr_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), bld.def(s1, scc), tg_num, Operand(0x6u));
5418 } else {
5419 bld.sop1(aco_opcode::s_mov_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), Operand(0x0u));
5420 }
5421 break;
5422 }
5423 case nir_intrinsic_load_subgroup_invocation: {
5424 bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), Operand((uint32_t) -1),
5425 bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, bld.def(v1), Operand((uint32_t) -1), Operand(0u)));
5426 break;
5427 }
5428 case nir_intrinsic_load_num_subgroups: {
5429 if (ctx->stage == compute_cs)
5430 bld.sop2(aco_opcode::s_and_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), bld.def(s1, scc), Operand(0x3fu), ctx->tg_size);
5431 else
5432 bld.sop1(aco_opcode::s_mov_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), Operand(0x1u));
5433 break;
5434 }
5435 case nir_intrinsic_ballot: {
5436 Definition tmp = bld.def(s2);
5437 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
5438 if (instr->src[0].ssa->bit_size == 1 && src.regClass() == s2) {
5439 bld.sop2(aco_opcode::s_and_b64, tmp, bld.def(s1, scc), Operand(exec, s2), src);
5440 } else if (instr->src[0].ssa->bit_size == 1 && src.regClass() == s1) {
5441 bld.sop2(aco_opcode::s_cselect_b64, tmp, Operand(exec, s2), Operand(0u), bld.scc(src));
5442 } else if (instr->src[0].ssa->bit_size == 32 && src.regClass() == v1) {
5443 bld.vopc(aco_opcode::v_cmp_lg_u32, tmp, Operand(0u), src);
5444 } else if (instr->src[0].ssa->bit_size == 64 && src.regClass() == v2) {
5445 bld.vopc(aco_opcode::v_cmp_lg_u64, tmp, Operand(0u), src);
5446 } else {
5447 fprintf(stderr, "Unimplemented NIR instr bit size: ");
5448 nir_print_instr(&instr->instr, stderr);
5449 fprintf(stderr, "\n");
5450 }
5451 emit_wqm(ctx, tmp.getTemp(), get_ssa_temp(ctx, &instr->dest.ssa));
5452 break;
5453 }
5454 case nir_intrinsic_shuffle: {
5455 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
5456 if (!ctx->divergent_vals[instr->dest.ssa.index]) {
5457 emit_uniform_subgroup(ctx, instr, src);
5458 } else {
5459 Temp tid = get_ssa_temp(ctx, instr->src[1].ssa);
5460 assert(tid.regClass() == v1);
5461 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5462 if (src.regClass() == v1) {
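/* ds_bpermute_b32 addresses lanes in bytes, hence the lane index is shifted left by 2. */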
5463 tid = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), tid);
5464 emit_wqm(ctx, bld.ds(aco_opcode::ds_bpermute_b32, bld.def(v1), tid, src), dst);
5465 } else if (src.regClass() == v2) {
5466 tid = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), tid);
5467
5468 Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
5469 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
5470 lo = emit_wqm(ctx, bld.ds(aco_opcode::ds_bpermute_b32, bld.def(v1), tid, lo));
5471 hi = emit_wqm(ctx, bld.ds(aco_opcode::ds_bpermute_b32, bld.def(v1), tid, hi));
5472 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
5473 emit_split_vector(ctx, dst, 2);
5474 } else if (instr->dest.ssa.bit_size == 1 && src.regClass() == s2) {
5475 Temp tmp = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), tid, src);
5476 tmp = emit_extract_vector(ctx, tmp, 0, v1);
5477 tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(1u), tmp);
5478 emit_wqm(ctx, bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(s2), Operand(0u), tmp), dst);
5479 } else {
5480 fprintf(stderr, "Unimplemented NIR instr bit size: ");
5481 nir_print_instr(&instr->instr, stderr);
5482 fprintf(stderr, "\n");
5483 }
5484 }
5485 break;
5486 }
5487 case nir_intrinsic_load_sample_id: {
5488 bld.vop3(aco_opcode::v_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
5489 ctx->fs_inputs[ancillary], Operand(8u), Operand(4u));
5490 break;
5491 }
5492 case nir_intrinsic_load_sample_mask_in: {
5493 visit_load_sample_mask_in(ctx, instr);
5494 break;
5495 }
5496 case nir_intrinsic_read_first_invocation: {
5497 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
5498 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5499 if (src.regClass() == v1) {
5500 emit_wqm(ctx,
5501 bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), src),
5502 dst);
5503 } else if (src.regClass() == v2) {
5504 Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
5505 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
5506 lo = emit_wqm(ctx, bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), lo));
5507 hi = emit_wqm(ctx, bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), hi));
5508 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
5509 emit_split_vector(ctx, dst, 2);
5510 } else if (instr->dest.ssa.bit_size == 1 && src.regClass() == s2) {
5511 emit_wqm(ctx,
5512 bld.sopc(aco_opcode::s_bitcmp1_b64, bld.def(s1, scc), src,
5513 bld.sop1(aco_opcode::s_ff1_i32_b64, bld.def(s1), Operand(exec, s2))),
5514 dst);
5515 } else if (src.regClass() == s1) {
5516 bld.sop1(aco_opcode::s_mov_b32, Definition(dst), src);
5517 } else if (src.regClass() == s2) {
5518 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src);
5519 } else {
5520 fprintf(stderr, "Unimplemented NIR instr bit size: ");
5521 nir_print_instr(&instr->instr, stderr);
5522 fprintf(stderr, "\n");
5523 }
5524 break;
5525 }
5526 case nir_intrinsic_read_invocation: {
5527 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
5528 Temp lane = get_ssa_temp(ctx, instr->src[1].ssa);
5529 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5530 assert(lane.regClass() == s1);
5531 if (src.regClass() == v1) {
5532 emit_wqm(ctx, bld.vop3(aco_opcode::v_readlane_b32, bld.def(s1), src, lane), dst);
5533 } else if (src.regClass() == v2) {
5534 Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
5535 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
5536 lo = emit_wqm(ctx, bld.vop3(aco_opcode::v_readlane_b32, bld.def(s1), lo, lane));
5537 hi = emit_wqm(ctx, bld.vop3(aco_opcode::v_readlane_b32, bld.def(s1), hi, lane));
5538 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
5539 emit_split_vector(ctx, dst, 2);
5540 } else if (instr->dest.ssa.bit_size == 1 && src.regClass() == s2) {
5541 emit_wqm(ctx, bld.sopc(aco_opcode::s_bitcmp1_b64, bld.def(s1, scc), src, lane), dst);
5542 } else if (src.regClass() == s1) {
5543 bld.sop1(aco_opcode::s_mov_b32, Definition(dst), src);
5544 } else if (src.regClass() == s2) {
5545 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src);
5546 } else {
5547 fprintf(stderr, "Unimplemented NIR instr bit size: ");
5548 nir_print_instr(&instr->instr, stderr);
5549 fprintf(stderr, "\n");
5550 }
5551 break;
5552 }
5553 case nir_intrinsic_vote_all: {
5554 Temp src = as_divergent_bool(ctx, get_ssa_temp(ctx, instr->src[0].ssa), false);
5555 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5556 assert(src.regClass() == s2);
5557 assert(dst.regClass() == s1);
5558
5559 Definition tmp = bld.def(s1);
5560 bld.sopc(aco_opcode::s_cmp_eq_u64, bld.scc(tmp),
5561 bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2)),
5562 Operand(exec, s2));
5563 emit_wqm(ctx, tmp.getTemp(), dst);
5564 break;
5565 }
5566 case nir_intrinsic_vote_any: {
5567 Temp src = as_divergent_bool(ctx, get_ssa_temp(ctx, instr->src[0].ssa), false);
5568 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5569 assert(src.regClass() == s2);
5570 assert(dst.regClass() == s1);
5571
5572 Definition tmp = bld.def(s1);
5573 bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.scc(tmp), src, Operand(exec, s2));
5574 emit_wqm(ctx, tmp.getTemp(), dst);
5575 break;
5576 }
5577 case nir_intrinsic_reduce:
5578 case nir_intrinsic_inclusive_scan:
5579 case nir_intrinsic_exclusive_scan: {
5580 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
5581 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5582 nir_op op = (nir_op) nir_intrinsic_reduction_op(instr);
5583 unsigned cluster_size = instr->intrinsic == nir_intrinsic_reduce ?
5584 nir_intrinsic_cluster_size(instr) : 0;
5585 cluster_size = util_next_power_of_two(MIN2(cluster_size ? cluster_size : 64, 64));
5586
5587 if (!ctx->divergent_vals[instr->src[0].ssa->index] && (op == nir_op_ior || op == nir_op_iand)) {
5588 emit_uniform_subgroup(ctx, instr, src);
5589 } else if (instr->dest.ssa.bit_size == 1) {
5590 if (op == nir_op_imul || op == nir_op_umin || op == nir_op_imin)
5591 op = nir_op_iand;
5592 else if (op == nir_op_iadd)
5593 op = nir_op_ixor;
5594 else if (op == nir_op_umax || op == nir_op_imax)
5595 op = nir_op_ior;
5596 assert(op == nir_op_iand || op == nir_op_ior || op == nir_op_ixor);
5597
5598 switch (instr->intrinsic) {
5599 case nir_intrinsic_reduce:
5600 emit_wqm(ctx, emit_boolean_reduce(ctx, op, cluster_size, src), dst);
5601 break;
5602 case nir_intrinsic_exclusive_scan:
5603 emit_wqm(ctx, emit_boolean_exclusive_scan(ctx, op, src), dst);
5604 break;
5605 case nir_intrinsic_inclusive_scan:
5606 emit_wqm(ctx, emit_boolean_inclusive_scan(ctx, op, src), dst);
5607 break;
5608 default:
5609 assert(false);
5610 }
5611 } else if (cluster_size == 1) {
5612 bld.copy(Definition(dst), src);
5613 } else {
5614 src = as_vgpr(ctx, src);
5615
5616 ReduceOp reduce_op;
5617 switch (op) {
5618 #define CASE(name) case nir_op_##name: reduce_op = (src.regClass() == v1) ? name##32 : name##64; break;
5619 CASE(iadd)
5620 CASE(imul)
5621 CASE(fadd)
5622 CASE(fmul)
5623 CASE(imin)
5624 CASE(umin)
5625 CASE(fmin)
5626 CASE(imax)
5627 CASE(umax)
5628 CASE(fmax)
5629 CASE(iand)
5630 CASE(ior)
5631 CASE(ixor)
5632 default:
5633 unreachable("unknown reduction op");
5634 #undef CASE
5635 }
5636
5637 aco_opcode aco_op;
5638 switch (instr->intrinsic) {
5639 case nir_intrinsic_reduce: aco_op = aco_opcode::p_reduce; break;
5640 case nir_intrinsic_inclusive_scan: aco_op = aco_opcode::p_inclusive_scan; break;
5641 case nir_intrinsic_exclusive_scan: aco_op = aco_opcode::p_exclusive_scan; break;
5642 default:
5643 unreachable("unknown reduce intrinsic");
5644 }
5645
5646 aco_ptr<Pseudo_reduction_instruction> reduce{create_instruction<Pseudo_reduction_instruction>(aco_op, Format::PSEUDO_REDUCTION, 3, 5)};
5647 reduce->operands[0] = Operand(src);
5648 // filled in by aco_reduce_assign.cpp, used internally as part of the
5649 // reduce sequence
5650 assert(dst.size() == 1 || dst.size() == 2);
5651 reduce->operands[1] = Operand(RegClass(RegType::vgpr, dst.size()).as_linear());
5652 reduce->operands[2] = Operand(v1.as_linear());
5653
5654 Temp tmp_dst = bld.tmp(dst.regClass());
5655 reduce->definitions[0] = Definition(tmp_dst);
5656 reduce->definitions[1] = bld.def(s2); // used internally
5657 reduce->definitions[2] = Definition();
5658 reduce->definitions[3] = Definition(scc, s1);
5659 reduce->definitions[4] = Definition();
5660 reduce->reduce_op = reduce_op;
5661 reduce->cluster_size = cluster_size;
5662 ctx->block->instructions.emplace_back(std::move(reduce));
5663
5664 emit_wqm(ctx, tmp_dst, dst);
5665 }
5666 break;
5667 }
5668 case nir_intrinsic_quad_broadcast: {
5669 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
5670 if (!ctx->divergent_vals[instr->dest.ssa.index]) {
5671 emit_uniform_subgroup(ctx, instr, src);
5672 } else {
5673 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5674 unsigned lane = nir_src_as_const_value(instr->src[1])->u32;
5675 if (instr->dest.ssa.bit_size == 1 && src.regClass() == s2) {
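            /* Booleans: half_mask selects bit <lane> of every quad. AND it with the
             * active source mask, then s_wqm_b64 replicates each surviving bit to
             * all four lanes of its quad. */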
5676 uint32_t half_mask = 0x11111111u << lane;
5677 Temp mask_tmp = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(half_mask), Operand(half_mask));
5678 Temp tmp = bld.tmp(s2);
5679 bld.sop1(aco_opcode::s_wqm_b64, Definition(tmp),
5680 bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), mask_tmp,
5681 bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2))));
5682 emit_wqm(ctx, tmp, dst);
5683 } else if (instr->dest.ssa.bit_size == 32) {
5684 emit_wqm(ctx,
5685 bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src,
5686 dpp_quad_perm(lane, lane, lane, lane)),
5687 dst);
5688 } else if (instr->dest.ssa.bit_size == 64) {
5689 Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
5690 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
5691 lo = emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), lo, dpp_quad_perm(lane, lane, lane, lane)));
5692 hi = emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), hi, dpp_quad_perm(lane, lane, lane, lane)));
5693 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
5694 emit_split_vector(ctx, dst, 2);
5695 } else {
5696 fprintf(stderr, "Unimplemented NIR instr bit size: ");
5697 nir_print_instr(&instr->instr, stderr);
5698 fprintf(stderr, "\n");
5699 }
5700 }
5701 break;
5702 }
5703 case nir_intrinsic_quad_swap_horizontal:
5704 case nir_intrinsic_quad_swap_vertical:
5705 case nir_intrinsic_quad_swap_diagonal:
5706 case nir_intrinsic_quad_swizzle_amd: {
5707 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
5708 if (!ctx->divergent_vals[instr->dest.ssa.index]) {
5709 emit_uniform_subgroup(ctx, instr, src);
5710 break;
5711 }
5712 uint16_t dpp_ctrl = 0;
5713 switch (instr->intrinsic) {
5714 case nir_intrinsic_quad_swap_horizontal:
5715 dpp_ctrl = dpp_quad_perm(1, 0, 3, 2);
5716 break;
5717 case nir_intrinsic_quad_swap_vertical:
5718 dpp_ctrl = dpp_quad_perm(2, 3, 0, 1);
5719 break;
5720 case nir_intrinsic_quad_swap_diagonal:
5721 dpp_ctrl = dpp_quad_perm(3, 2, 1, 0);
5722 break;
5723 case nir_intrinsic_quad_swizzle_amd: {
5724 dpp_ctrl = nir_intrinsic_swizzle_mask(instr);
5725 break;
5726 }
5727 default:
5728 break;
5729 }
5730
5731 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5732 if (instr->dest.ssa.bit_size == 1 && src.regClass() == s2) {
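         /* Booleans: expand the mask to a 0/-1 value per lane, apply the DPP swizzle,
          * then compare against zero to rebuild an s2 mask. */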
5733 src = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), Operand((uint32_t)-1), src);
5734 src = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl);
5735 Temp tmp = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(s2), Operand(0u), src);
5736 emit_wqm(ctx, tmp, dst);
5737 } else if (instr->dest.ssa.bit_size == 32) {
5738 Temp tmp = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl);
5739 emit_wqm(ctx, tmp, dst);
5740 } else if (instr->dest.ssa.bit_size == 64) {
5741 Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
5742 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
5743 lo = emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), lo, dpp_ctrl));
5744 hi = emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), hi, dpp_ctrl));
5745 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
5746 emit_split_vector(ctx, dst, 2);
5747 } else {
5748 fprintf(stderr, "Unimplemented NIR instr bit size: ");
5749 nir_print_instr(&instr->instr, stderr);
5750 fprintf(stderr, "\n");
5751 }
5752 break;
5753 }
5754 case nir_intrinsic_masked_swizzle_amd: {
5755 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
5756 if (!ctx->divergent_vals[instr->dest.ssa.index]) {
5757 emit_uniform_subgroup(ctx, instr, src);
5758 break;
5759 }
5760 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5761 uint32_t mask = nir_intrinsic_swizzle_mask(instr);
5762 if (dst.regClass() == v1) {
5763 emit_wqm(ctx,
5764 bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, mask, 0, false),
5765 dst);
5766 } else if (dst.regClass() == v2) {
5767 Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
5768 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
5769 lo = emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), lo, mask, 0, false));
5770 hi = emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), hi, mask, 0, false));
5771 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
5772 emit_split_vector(ctx, dst, 2);
5773 } else {
5774 fprintf(stderr, "Unimplemented NIR instr bit size: ");
5775 nir_print_instr(&instr->instr, stderr);
5776 fprintf(stderr, "\n");
5777 }
5778 break;
5779 }
5780 case nir_intrinsic_write_invocation_amd: {
5781 Temp src = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
5782 Temp val = bld.as_uniform(get_ssa_temp(ctx, instr->src[1].ssa));
5783 Temp lane = bld.as_uniform(get_ssa_temp(ctx, instr->src[2].ssa));
5784 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5785 if (dst.regClass() == v1) {
5786 /* src2 is ignored for writelane. RA assigns the same reg for dst */
5787 emit_wqm(ctx, bld.vop3(aco_opcode::v_writelane_b32, bld.def(v1), val, lane, src), dst);
5788 } else if (dst.regClass() == v2) {
5789 Temp src_lo = bld.tmp(v1), src_hi = bld.tmp(v1);
5790 Temp val_lo = bld.tmp(s1), val_hi = bld.tmp(s1);
5791 bld.pseudo(aco_opcode::p_split_vector, Definition(src_lo), Definition(src_hi), src);
5792 bld.pseudo(aco_opcode::p_split_vector, Definition(val_lo), Definition(val_hi), val);
5793 Temp lo = emit_wqm(ctx, bld.vop3(aco_opcode::v_writelane_b32, bld.def(v1), val_lo, lane, src_lo));
5794 Temp hi = emit_wqm(ctx, bld.vop3(aco_opcode::v_writelane_b32, bld.def(v1), val_hi, lane, src_hi));
5795 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
5796 emit_split_vector(ctx, dst, 2);
5797 } else {
5798 fprintf(stderr, "Unimplemented NIR instr bit size: ");
5799 nir_print_instr(&instr->instr, stderr);
5800 fprintf(stderr, "\n");
5801 }
5802 break;
5803 }
5804 case nir_intrinsic_mbcnt_amd: {
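      /* mbcnt: count the set bits of the mask that belong to lanes below the current
       * one (low half first, then v_mbcnt_hi accumulates the high half on top). */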
5805 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
5806 RegClass rc = RegClass(src.type(), 1);
5807 Temp mask_lo = bld.tmp(rc), mask_hi = bld.tmp(rc);
5808 bld.pseudo(aco_opcode::p_split_vector, Definition(mask_lo), Definition(mask_hi), src);
5809 Temp tmp = bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, bld.def(v1), mask_lo, Operand(0u));
5810 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5811 Temp wqm_tmp = bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32, bld.def(v1), mask_hi, tmp);
5812 emit_wqm(ctx, wqm_tmp, dst);
5813 break;
5814 }
5815 case nir_intrinsic_load_helper_invocation: {
5816 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5817 bld.pseudo(aco_opcode::p_load_helper, Definition(dst));
5818 ctx->block->kind |= block_kind_needs_lowering;
5819 ctx->program->needs_exact = true;
5820 break;
5821 }
5822 case nir_intrinsic_is_helper_invocation: {
5823 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5824 bld.pseudo(aco_opcode::p_is_helper, Definition(dst));
5825 ctx->block->kind |= block_kind_needs_lowering;
5826 ctx->program->needs_exact = true;
5827 break;
5828 }
5829 case nir_intrinsic_demote:
5830 bld.pseudo(aco_opcode::p_demote_to_helper);
5831 ctx->block->kind |= block_kind_uses_demote;
5832 ctx->program->needs_exact = true;
5833 break;
5834 case nir_intrinsic_demote_if: {
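      /* Only lanes that are currently active may be demoted, so AND the condition
       * with exec before passing it to p_demote_to_helper. */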
5835 Temp cond = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc),
5836 as_divergent_bool(ctx, get_ssa_temp(ctx, instr->src[0].ssa), false),
5837 Operand(exec, s2));
5838 bld.pseudo(aco_opcode::p_demote_to_helper, cond);
5839 ctx->block->kind |= block_kind_uses_demote;
5840 ctx->program->needs_exact = true;
5841 break;
5842 }
5843 case nir_intrinsic_first_invocation: {
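      /* s_ff1_i32_b64 on exec returns the index of the lowest active lane. */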
5844 emit_wqm(ctx, bld.sop1(aco_opcode::s_ff1_i32_b64, bld.def(s1), Operand(exec, s2)),
5845 get_ssa_temp(ctx, &instr->dest.ssa));
5846 break;
5847 }
5848 case nir_intrinsic_shader_clock:
5849 bld.smem(aco_opcode::s_memtime, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), false);
5850 emit_split_vector(ctx, get_ssa_temp(ctx, &instr->dest.ssa), 2);
5851 break;
5852 case nir_intrinsic_load_vertex_id_zero_base: {
5853 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5854 bld.copy(Definition(dst), ctx->vertex_id);
5855 break;
5856 }
5857 case nir_intrinsic_load_first_vertex: {
5858 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5859 bld.copy(Definition(dst), ctx->base_vertex);
5860 break;
5861 }
5862 case nir_intrinsic_load_base_instance: {
5863 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5864 bld.copy(Definition(dst), ctx->start_instance);
5865 break;
5866 }
5867 case nir_intrinsic_load_instance_id: {
5868 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5869 bld.copy(Definition(dst), ctx->instance_id);
5870 break;
5871 }
5872 case nir_intrinsic_load_draw_id: {
5873 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5874 bld.copy(Definition(dst), ctx->draw_id);
5875 break;
5876 }
5877 default:
5878 fprintf(stderr, "Unimplemented intrinsic instr: ");
5879 nir_print_instr(&instr->instr, stderr);
5880 fprintf(stderr, "\n");
5881 abort();
5882
5883 break;
5884 }
5885 }
5886
5887
5888 void tex_fetch_ptrs(isel_context *ctx, nir_tex_instr *instr,
5889 Temp *res_ptr, Temp *samp_ptr, Temp *fmask_ptr,
5890 enum glsl_base_type *stype)
5891 {
5892 nir_deref_instr *texture_deref_instr = NULL;
5893 nir_deref_instr *sampler_deref_instr = NULL;
5894 int plane = -1;
5895
5896 for (unsigned i = 0; i < instr->num_srcs; i++) {
5897 switch (instr->src[i].src_type) {
5898 case nir_tex_src_texture_deref:
5899 texture_deref_instr = nir_src_as_deref(instr->src[i].src);
5900 break;
5901 case nir_tex_src_sampler_deref:
5902 sampler_deref_instr = nir_src_as_deref(instr->src[i].src);
5903 break;
5904 case nir_tex_src_plane:
5905 plane = nir_src_as_int(instr->src[i].src);
5906 break;
5907 default:
5908 break;
5909 }
5910 }
5911
5912 *stype = glsl_get_sampler_result_type(texture_deref_instr->type);
5913
5914 if (!sampler_deref_instr)
5915 sampler_deref_instr = texture_deref_instr;
5916
5917 if (plane >= 0) {
5918 assert(instr->op != nir_texop_txf_ms &&
5919 instr->op != nir_texop_samples_identical);
5920 assert(instr->sampler_dim != GLSL_SAMPLER_DIM_BUF);
5921 *res_ptr = get_sampler_desc(ctx, texture_deref_instr, (aco_descriptor_type)(ACO_DESC_PLANE_0 + plane), instr, false, false);
5922 } else if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) {
5923 *res_ptr = get_sampler_desc(ctx, texture_deref_instr, ACO_DESC_BUFFER, instr, false, false);
5924 } else {
5925 *res_ptr = get_sampler_desc(ctx, texture_deref_instr, ACO_DESC_IMAGE, instr, false, false);
5926 }
5927 if (samp_ptr) {
5928 *samp_ptr = get_sampler_desc(ctx, sampler_deref_instr, ACO_DESC_SAMPLER, instr, false, false);
5929 if (instr->sampler_dim < GLSL_SAMPLER_DIM_RECT && ctx->options->chip_class < GFX8) {
5930 fprintf(stderr, "Unimplemented sampler descriptor: ");
5931 nir_print_instr(&instr->instr, stderr);
5932 fprintf(stderr, "\n");
5933 abort();
5934 // TODO: build samp_ptr = and(samp_ptr, res_ptr)
5935 }
5936 }
5937 if (fmask_ptr && (instr->op == nir_texop_txf_ms ||
5938 instr->op == nir_texop_samples_identical))
5939 *fmask_ptr = get_sampler_desc(ctx, texture_deref_instr, ACO_DESC_FMASK, instr, false, false);
5940 }
5941
5942 void build_cube_select(isel_context *ctx, Temp ma, Temp id, Temp deriv,
5943 Temp *out_ma, Temp *out_sc, Temp *out_tc)
5944 {
5945 Builder bld(ctx->program, ctx->block);
5946
5947 Temp deriv_x = emit_extract_vector(ctx, deriv, 0, v1);
5948 Temp deriv_y = emit_extract_vector(ctx, deriv, 1, v1);
5949 Temp deriv_z = emit_extract_vector(ctx, deriv, 2, v1);
5950
5951 Operand neg_one(0xbf800000u);
5952 Operand one(0x3f800000u);
5953 Operand two(0x40000000u);
5954 Operand four(0x40800000u);
5955
5956 Temp is_ma_positive = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(s2)), Operand(0u), ma);
5957 Temp sgn_ma = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), neg_one, one, is_ma_positive);
5958 Temp neg_sgn_ma = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), Operand(0u), sgn_ma);
5959
5960 Temp is_ma_z = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(s2)), four, id);
5961 Temp is_ma_y = bld.vopc(aco_opcode::v_cmp_le_f32, bld.def(s2), two, id);
5962 is_ma_y = bld.sop2(aco_opcode::s_andn2_b64, bld.hint_vcc(bld.def(s2)), is_ma_y, is_ma_z);
5963 Temp is_not_ma_x = bld.sop2(aco_opcode::s_or_b64, bld.hint_vcc(bld.def(s2)), bld.def(s1, scc), is_ma_z, is_ma_y);
5964
5965 // select sc
5966 Temp tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), deriv_z, deriv_x, is_not_ma_x);
5967 Temp sgn = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1),
5968 bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), neg_sgn_ma, sgn_ma, is_ma_z),
5969 one, is_ma_y);
5970 *out_sc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), tmp, sgn);
5971
5972 // select tc
5973 tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), deriv_y, deriv_z, is_ma_y);
5974 sgn = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), neg_one, sgn_ma, is_ma_y);
5975 *out_tc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), tmp, sgn);
5976
5977 // select ma
5978 tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
5979 bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), deriv_x, deriv_y, is_ma_y),
5980 deriv_z, is_ma_z);
5981 tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7fffffffu), tmp);
5982 *out_ma = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), two, tmp);
5983 }
5984
5985 void prepare_cube_coords(isel_context *ctx, Temp* coords, Temp* ddx, Temp* ddy, bool is_deriv, bool is_array)
5986 {
5987 Builder bld(ctx->program, ctx->block);
5988 Temp coord_args[4], ma, tc, sc, id;
5989 for (unsigned i = 0; i < (is_array ? 4 : 3); i++)
5990 coord_args[i] = emit_extract_vector(ctx, *coords, i, v1);
5991
5992 if (is_array) {
5993 coord_args[3] = bld.vop1(aco_opcode::v_rndne_f32, bld.def(v1), coord_args[3]);
5994
5995 // see comment in ac_prepare_cube_coords()
5996 if (ctx->options->chip_class <= GFX8)
5997 coord_args[3] = bld.vop2(aco_opcode::v_max_f32, bld.def(v1), Operand(0u), coord_args[3]);
5998 }
5999
6000 ma = bld.vop3(aco_opcode::v_cubema_f32, bld.def(v1), coord_args[0], coord_args[1], coord_args[2]);
6001
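    /* invma = 1.0 / |ma|: build the v_rcp_f32 as VOP3 by hand so the abs
     * input modifier can be set on the operand. */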
6002 aco_ptr<VOP3A_instruction> vop3a{create_instruction<VOP3A_instruction>(aco_opcode::v_rcp_f32, asVOP3(Format::VOP1), 1, 1)};
6003 vop3a->operands[0] = Operand(ma);
6004 vop3a->abs[0] = true;
6005 Temp invma = bld.tmp(v1);
6006 vop3a->definitions[0] = Definition(invma);
6007 ctx->block->instructions.emplace_back(std::move(vop3a));
6008
6009 sc = bld.vop3(aco_opcode::v_cubesc_f32, bld.def(v1), coord_args[0], coord_args[1], coord_args[2]);
6010 if (!is_deriv)
6011 sc = bld.vop2(aco_opcode::v_madak_f32, bld.def(v1), sc, invma, Operand(0x3fc00000u/*1.5*/));
6012
6013 tc = bld.vop3(aco_opcode::v_cubetc_f32, bld.def(v1), coord_args[0], coord_args[1], coord_args[2]);
6014 if (!is_deriv)
6015 tc = bld.vop2(aco_opcode::v_madak_f32, bld.def(v1), tc, invma, Operand(0x3fc00000u/*1.5*/));
6016
6017 id = bld.vop3(aco_opcode::v_cubeid_f32, bld.def(v1), coord_args[0], coord_args[1], coord_args[2]);
6018
6019 if (is_deriv) {
6020 sc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), sc, invma);
6021 tc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), tc, invma);
6022
6023 for (unsigned i = 0; i < 2; i++) {
6024 // see comment in ac_prepare_cube_coords()
6025 Temp deriv_ma;
6026 Temp deriv_sc, deriv_tc;
6027 build_cube_select(ctx, ma, id, i ? *ddy : *ddx,
6028 &deriv_ma, &deriv_sc, &deriv_tc);
6029
6030 deriv_ma = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_ma, invma);
6031
6032 Temp x = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1),
6033 bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_sc, invma),
6034 bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_ma, sc));
6035 Temp y = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1),
6036 bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_tc, invma),
6037 bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_ma, tc));
6038 *(i ? ddy : ddx) = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), x, y);
6039 }
6040
6041 sc = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), Operand(0x3fc00000u/*1.5*/), sc);
6042 tc = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), Operand(0x3fc00000u/*1.5*/), tc);
6043 }
6044
6045 if (is_array)
6046 id = bld.vop2(aco_opcode::v_madmk_f32, bld.def(v1), coord_args[3], id, Operand(0x41000000u/*8.0*/));
6047 *coords = bld.pseudo(aco_opcode::p_create_vector, bld.def(v3), sc, tc, id);
6048
6049 }
6050
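 /* Round the given coordinate component (the array layer) to the nearest integer
  * and rebuild the coordinate vector. */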
6051 Temp apply_round_slice(isel_context *ctx, Temp coords, unsigned idx)
6052 {
6053 Temp coord_vec[3];
6054 for (unsigned i = 0; i < coords.size(); i++)
6055 coord_vec[i] = emit_extract_vector(ctx, coords, i, v1);
6056
6057 Builder bld(ctx->program, ctx->block);
6058 coord_vec[idx] = bld.vop1(aco_opcode::v_rndne_f32, bld.def(v1), coord_vec[idx]);
6059
6060 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, coords.size(), 1)};
6061 for (unsigned i = 0; i < coords.size(); i++)
6062 vec->operands[i] = Operand(coord_vec[i]);
6063 Temp res = bld.tmp(RegType::vgpr, coords.size());
6064 vec->definitions[0] = Definition(res);
6065 ctx->block->instructions.emplace_back(std::move(vec));
6066 return res;
6067 }
6068
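 /* If the SSA def comes from a vecN ALU instruction, collect each component's
  * constant value (NULL where a component is swizzled or not a constant). */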
6069 void get_const_vec(nir_ssa_def *vec, nir_const_value *cv[4])
6070 {
6071 if (vec->parent_instr->type != nir_instr_type_alu)
6072 return;
6073 nir_alu_instr *vec_instr = nir_instr_as_alu(vec->parent_instr);
6074 if (vec_instr->op != nir_op_vec(vec->num_components))
6075 return;
6076
6077 for (unsigned i = 0; i < vec->num_components; i++) {
6078 cv[i] = vec_instr->src[i].swizzle[0] == 0 ?
6079 nir_src_as_const_value(vec_instr->src[i].src) : NULL;
6080 }
6081 }
6082
6083 void visit_tex(isel_context *ctx, nir_tex_instr *instr)
6084 {
6085 Builder bld(ctx->program, ctx->block);
6086 bool has_bias = false, has_lod = false, level_zero = false, has_compare = false,
6087 has_offset = false, has_ddx = false, has_ddy = false, has_derivs = false, has_sample_index = false;
6088 Temp resource, sampler, fmask_ptr, bias = Temp(), coords, compare = Temp(), sample_index = Temp(),
6089 lod = Temp(), offset = Temp(), ddx = Temp(), ddy = Temp(), derivs = Temp();
6090 nir_const_value *sample_index_cv = NULL;
6091 nir_const_value *const_offset[4] = {NULL, NULL, NULL, NULL};
6092 enum glsl_base_type stype;
6093 tex_fetch_ptrs(ctx, instr, &resource, &sampler, &fmask_ptr, &stype);
6094
6095 bool tg4_integer_workarounds = ctx->options->chip_class <= GFX8 && instr->op == nir_texop_tg4 &&
6096 (stype == GLSL_TYPE_UINT || stype == GLSL_TYPE_INT);
6097 bool tg4_integer_cube_workaround = tg4_integer_workarounds &&
6098 instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE;
6099
6100 for (unsigned i = 0; i < instr->num_srcs; i++) {
6101 switch (instr->src[i].src_type) {
6102 case nir_tex_src_coord:
6103 coords = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[i].src.ssa));
6104 break;
6105 case nir_tex_src_bias:
6106 if (instr->op == nir_texop_txb) {
6107 bias = get_ssa_temp(ctx, instr->src[i].src.ssa);
6108 has_bias = true;
6109 }
6110 break;
6111 case nir_tex_src_lod: {
6112 nir_const_value *val = nir_src_as_const_value(instr->src[i].src);
6113
6114 if (val && val->f32 <= 0.0) {
6115 level_zero = true;
6116 } else {
6117 lod = get_ssa_temp(ctx, instr->src[i].src.ssa);
6118 has_lod = true;
6119 }
6120 break;
6121 }
6122 case nir_tex_src_comparator:
6123 if (instr->is_shadow) {
6124 compare = get_ssa_temp(ctx, instr->src[i].src.ssa);
6125 has_compare = true;
6126 }
6127 break;
6128 case nir_tex_src_offset:
6129 offset = get_ssa_temp(ctx, instr->src[i].src.ssa);
6130 get_const_vec(instr->src[i].src.ssa, const_offset);
6131 has_offset = true;
6132 break;
6133 case nir_tex_src_ddx:
6134 ddx = get_ssa_temp(ctx, instr->src[i].src.ssa);
6135 has_ddx = true;
6136 break;
6137 case nir_tex_src_ddy:
6138 ddy = get_ssa_temp(ctx, instr->src[i].src.ssa);
6139 has_ddy = true;
6140 break;
6141 case nir_tex_src_ms_index:
6142 sample_index = get_ssa_temp(ctx, instr->src[i].src.ssa);
6143 sample_index_cv = nir_src_as_const_value(instr->src[i].src);
6144 has_sample_index = true;
6145 break;
6146 case nir_tex_src_texture_offset:
6147 case nir_tex_src_sampler_offset:
6148 default:
6149 break;
6150 }
6151 }
6152 // TODO: all other cases: structure taken from ac_nir_to_llvm.c
6153 if (instr->op == nir_texop_txs && instr->sampler_dim == GLSL_SAMPLER_DIM_BUF)
6154 return get_buffer_size(ctx, resource, get_ssa_temp(ctx, &instr->dest.ssa), true);
6155
6156 if (instr->op == nir_texop_texture_samples) {
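       /* The sample count is encoded in dword3 of the image descriptor: bits [19:16]
        * hold log2(samples) and bits [31:28] the resource type; only MSAA types
        * (>= 14) actually carry more than one sample. */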
6157 Temp dword3 = emit_extract_vector(ctx, resource, 3, s1);
6158
6159 Temp samples_log2 = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), dword3, Operand(16u | 4u<<16));
6160 Temp samples = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), Operand(1u), samples_log2);
6161 Temp type = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), dword3, Operand(28u | 4u<<16 /* offset=28, width=4 */));
6162 Temp is_msaa = bld.sopc(aco_opcode::s_cmp_ge_u32, bld.def(s1, scc), type, Operand(14u));
6163
6164 bld.sop2(aco_opcode::s_cselect_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
6165 samples, Operand(1u), bld.scc(is_msaa));
6166 return;
6167 }
6168
6169 if (has_offset && instr->op != nir_texop_txf && instr->op != nir_texop_txf_ms) {
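    /* For non-txf ops, pack the texel offsets into a single dword: 6 bits per
     * component placed at byte i, with compile-time constant components folded
     * into pack_const. */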
6170 aco_ptr<Instruction> tmp_instr;
6171 Temp acc, pack = Temp();
6172
6173 uint32_t pack_const = 0;
6174 for (unsigned i = 0; i < offset.size(); i++) {
6175 if (!const_offset[i])
6176 continue;
6177 pack_const |= (const_offset[i]->u32 & 0x3Fu) << (8u * i);
6178 }
6179
6180 if (offset.type() == RegType::sgpr) {
6181 for (unsigned i = 0; i < offset.size(); i++) {
6182 if (const_offset[i])
6183 continue;
6184
6185 acc = emit_extract_vector(ctx, offset, i, s1);
6186 acc = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), acc, Operand(0x3Fu));
6187
6188 if (i) {
6189 acc = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), acc, Operand(8u * i));
6190 }
6191
6192 if (pack == Temp()) {
6193 pack = acc;
6194 } else {
6195 pack = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), pack, acc);
6196 }
6197 }
6198
6199 if (pack_const && pack != Temp())
6200 pack = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), Operand(pack_const), pack);
6201 } else {
6202 for (unsigned i = 0; i < offset.size(); i++) {
6203 if (const_offset[i])
6204 continue;
6205
6206 acc = emit_extract_vector(ctx, offset, i, v1);
6207 acc = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x3Fu), acc);
6208
6209 if (i) {
6210 acc = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(8u * i), acc);
6211 }
6212
6213 if (pack == Temp()) {
6214 pack = acc;
6215 } else {
6216 pack = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), pack, acc);
6217 }
6218 }
6219
6220 if (pack_const && pack != Temp())
6221 pack = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand(pack_const), pack);
6222 }
6223 if (pack_const && pack == Temp())
6224 offset = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(pack_const));
6225 else if (pack == Temp())
6226 has_offset = false;
6227 else
6228 offset = pack;
6229 }
6230
6231 if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE && instr->coord_components)
6232 prepare_cube_coords(ctx, &coords, &ddx, &ddy, instr->op == nir_texop_txd, instr->is_array && instr->op != nir_texop_lod);
6233
6234 /* pack derivatives */
6235 if (has_ddx || has_ddy) {
6236 if (instr->sampler_dim == GLSL_SAMPLER_DIM_1D && ctx->options->chip_class == GFX9) {
6237 derivs = bld.pseudo(aco_opcode::p_create_vector, bld.def(v4),
6238 ddx, Operand(0u), ddy, Operand(0u));
6239 } else {
6240 derivs = bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, ddx.size() + ddy.size()), ddx, ddy);
6241 }
6242 has_derivs = true;
6243 }
6244
6245 if (instr->coord_components > 1 &&
6246 instr->sampler_dim == GLSL_SAMPLER_DIM_1D &&
6247 instr->is_array &&
6248 instr->op != nir_texop_txf)
6249 coords = apply_round_slice(ctx, coords, 1);
6250
6251 if (instr->coord_components > 2 &&
6252 (instr->sampler_dim == GLSL_SAMPLER_DIM_2D ||
6253 instr->sampler_dim == GLSL_SAMPLER_DIM_MS ||
6254 instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS ||
6255 instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS) &&
6256 instr->is_array &&
6257 instr->op != nir_texop_txf && instr->op != nir_texop_txf_ms)
6258 coords = apply_round_slice(ctx, coords, 2);
6259
6260 if (ctx->options->chip_class == GFX9 &&
6261 instr->sampler_dim == GLSL_SAMPLER_DIM_1D &&
6262 instr->op != nir_texop_lod && instr->coord_components) {
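       /* GFX9 addresses 1D textures as 2D: insert a dummy y-coordinate (0 for txf,
        * 0.5 for filtered ops) between x and the layer index. */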
6263 assert(coords.size() > 0 && coords.size() < 3);
6264
6265 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, coords.size() + 1, 1)};
6266 vec->operands[0] = Operand(emit_extract_vector(ctx, coords, 0, v1));
6267 vec->operands[1] = instr->op == nir_texop_txf ? Operand((uint32_t) 0) : Operand((uint32_t) 0x3f000000);
6268 if (coords.size() > 1)
6269 vec->operands[2] = Operand(emit_extract_vector(ctx, coords, 1, v1));
6270 coords = bld.tmp(RegType::vgpr, coords.size() + 1);
6271 vec->definitions[0] = Definition(coords);
6272 ctx->block->instructions.emplace_back(std::move(vec));
6273 }
6274
6275 bool da = should_declare_array(ctx, instr->sampler_dim, instr->is_array);
6276
6277 if (instr->op == nir_texop_samples_identical)
6278 resource = fmask_ptr;
6279
6280 else if ((instr->sampler_dim == GLSL_SAMPLER_DIM_MS ||
6281 instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS) &&
6282 instr->op != nir_texop_txs) {
6283 assert(has_sample_index);
6284 Operand op(sample_index);
6285 if (sample_index_cv)
6286 op = Operand(sample_index_cv->u32);
6287 sample_index = adjust_sample_index_using_fmask(ctx, da, coords, op, fmask_ptr);
6288 }
6289
6290 if (has_offset && (instr->op == nir_texop_txf || instr->op == nir_texop_txf_ms)) {
6291 Temp split_coords[coords.size()];
6292 emit_split_vector(ctx, coords, coords.size());
6293 for (unsigned i = 0; i < coords.size(); i++)
6294 split_coords[i] = emit_extract_vector(ctx, coords, i, v1);
6295
6296 unsigned i = 0;
6297 for (; i < std::min(offset.size(), instr->coord_components); i++) {
6298 Temp off = emit_extract_vector(ctx, offset, i, v1);
6299 split_coords[i] = bld.vadd32(bld.def(v1), split_coords[i], off);
6300 }
6301
6302 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, coords.size(), 1)};
6303 for (unsigned i = 0; i < coords.size(); i++)
6304 vec->operands[i] = Operand(split_coords[i]);
6305 coords = bld.tmp(coords.regClass());
6306 vec->definitions[0] = Definition(coords);
6307 ctx->block->instructions.emplace_back(std::move(vec));
6308
6309 has_offset = false;
6310 }
6311
6312 /* Build tex instruction */
6313 unsigned dmask = nir_ssa_def_components_read(&instr->dest.ssa);
6314 unsigned dim = ctx->options->chip_class >= GFX10 && instr->sampler_dim != GLSL_SAMPLER_DIM_BUF
6315 ? ac_get_sampler_dim(ctx->options->chip_class, instr->sampler_dim, instr->is_array)
6316 : 0;
6317 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6318 Temp tmp_dst = dst;
6319
6320 /* gather4 selects the component by dmask and always returns vec4 */
6321 if (instr->op == nir_texop_tg4) {
6322 assert(instr->dest.ssa.num_components == 4);
6323 if (instr->is_shadow)
6324 dmask = 1;
6325 else
6326 dmask = 1 << instr->component;
6327 if (tg4_integer_cube_workaround || dst.type() == RegType::sgpr)
6328 tmp_dst = bld.tmp(v4);
6329 } else if (instr->op == nir_texop_samples_identical) {
6330 tmp_dst = bld.tmp(v1);
6331 } else if (util_bitcount(dmask) != instr->dest.ssa.num_components || dst.type() == RegType::sgpr) {
6332 tmp_dst = bld.tmp(RegClass(RegType::vgpr, util_bitcount(dmask)));
6333 }
6334
6335 aco_ptr<MIMG_instruction> tex;
6336 if (instr->op == nir_texop_txs || instr->op == nir_texop_query_levels) {
6337 if (!has_lod)
6338 lod = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0u));
6339
6340 bool div_by_6 = instr->op == nir_texop_txs &&
6341 instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE &&
6342 instr->is_array &&
6343 (dmask & (1 << 2));
6344 if (tmp_dst.id() == dst.id() && div_by_6)
6345 tmp_dst = bld.tmp(tmp_dst.regClass());
6346
6347 tex.reset(create_instruction<MIMG_instruction>(aco_opcode::image_get_resinfo, Format::MIMG, 2, 1));
6348 tex->operands[0] = Operand(as_vgpr(ctx,lod));
6349 tex->operands[1] = Operand(resource);
6350 if (ctx->options->chip_class == GFX9 &&
6351 instr->op == nir_texop_txs &&
6352 instr->sampler_dim == GLSL_SAMPLER_DIM_1D &&
6353 instr->is_array) {
6354 tex->dmask = (dmask & 0x1) | ((dmask & 0x2) << 1);
6355 } else if (instr->op == nir_texop_query_levels) {
6356 tex->dmask = 1 << 3;
6357 } else {
6358 tex->dmask = dmask;
6359 }
6360 tex->da = da;
6361 tex->definitions[0] = Definition(tmp_dst);
6362 tex->dim = dim;
6363 tex->can_reorder = true;
6364 ctx->block->instructions.emplace_back(std::move(tex));
6365
6366 if (div_by_6) {
6367 /* divide 3rd value by 6 by multiplying with magic number */
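          /* 0x2AAAAAAB == (2^32 + 2) / 6, so v_mul_hi_i32 with it yields x / 6 for
           * the small, non-negative cube array sizes seen here. */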
6368 emit_split_vector(ctx, tmp_dst, tmp_dst.size());
6369 Temp c = bld.copy(bld.def(s1), Operand((uint32_t) 0x2AAAAAAB));
6370 Temp by_6 = bld.vop3(aco_opcode::v_mul_hi_i32, bld.def(v1), emit_extract_vector(ctx, tmp_dst, 2, v1), c);
6371 assert(instr->dest.ssa.num_components == 3);
6372 Temp tmp = dst.type() == RegType::vgpr ? dst : bld.tmp(v3);
6373 tmp_dst = bld.pseudo(aco_opcode::p_create_vector, Definition(tmp),
6374 emit_extract_vector(ctx, tmp_dst, 0, v1),
6375 emit_extract_vector(ctx, tmp_dst, 1, v1),
6376 by_6);
6377
6378 }
6379
6380 expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, dmask);
6381 return;
6382 }
6383
6384 Temp tg4_compare_cube_wa64 = Temp();
6385
6386 if (tg4_integer_workarounds) {
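       /* On GFX8 and earlier, tg4 on integer formats needs the coordinates shifted
        * by half a texel: query the level-0 size and add -0.5 / size to each
        * coordinate to compensate. */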
6387 tex.reset(create_instruction<MIMG_instruction>(aco_opcode::image_get_resinfo, Format::MIMG, 2, 1));
6388 tex->operands[0] = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0u));
6389 tex->operands[1] = Operand(resource);
6390 tex->dim = dim;
6391 tex->dmask = 0x3;
6392 tex->da = da;
6393 Temp size = bld.tmp(v2);
6394 tex->definitions[0] = Definition(size);
6395 tex->can_reorder = true;
6396 ctx->block->instructions.emplace_back(std::move(tex));
6397 emit_split_vector(ctx, size, size.size());
6398
6399 Temp half_texel[2];
6400 for (unsigned i = 0; i < 2; i++) {
6401 half_texel[i] = emit_extract_vector(ctx, size, i, v1);
6402 half_texel[i] = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), half_texel[i]);
6403 half_texel[i] = bld.vop1(aco_opcode::v_rcp_iflag_f32, bld.def(v1), half_texel[i]);
6404 half_texel[i] = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0xbf000000/*-0.5*/), half_texel[i]);
6405 }
6406
6407 Temp orig_coords[2] = {
6408 emit_extract_vector(ctx, coords, 0, v1),
6409 emit_extract_vector(ctx, coords, 1, v1)};
6410 Temp new_coords[2] = {
6411 bld.vop2(aco_opcode::v_add_f32, bld.def(v1), orig_coords[0], half_texel[0]),
6412 bld.vop2(aco_opcode::v_add_f32, bld.def(v1), orig_coords[1], half_texel[1])
6413 };
6414
6415 if (tg4_integer_cube_workaround) {
6416 // see comment in ac_nir_to_llvm.c's lower_gather4_integer()
6417 Temp desc[resource.size()];
6418 aco_ptr<Instruction> split{create_instruction<Pseudo_instruction>(aco_opcode::p_split_vector,
6419 Format::PSEUDO, 1, resource.size())};
6420 split->operands[0] = Operand(resource);
6421 for (unsigned i = 0; i < resource.size(); i++) {
6422 desc[i] = bld.tmp(s1);
6423 split->definitions[i] = Definition(desc[i]);
6424 }
6425 ctx->block->instructions.emplace_back(std::move(split));
6426
6427 Temp dfmt = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), desc[1], Operand(20u | (6u << 16)));
6428 Temp compare_cube_wa = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), dfmt,
6429 Operand((uint32_t)V_008F14_IMG_DATA_FORMAT_8_8_8_8));
6430
6431 Temp nfmt;
6432 if (stype == GLSL_TYPE_UINT) {
6433 nfmt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1),
6434 Operand((uint32_t)V_008F14_IMG_NUM_FORMAT_USCALED),
6435 Operand((uint32_t)V_008F14_IMG_NUM_FORMAT_UINT),
6436 bld.scc(compare_cube_wa));
6437 } else {
6438 nfmt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1),
6439 Operand((uint32_t)V_008F14_IMG_NUM_FORMAT_SSCALED),
6440 Operand((uint32_t)V_008F14_IMG_NUM_FORMAT_SINT),
6441 bld.scc(compare_cube_wa));
6442 }
6443 tg4_compare_cube_wa64 = as_divergent_bool(ctx, compare_cube_wa, true);
6444 nfmt = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), nfmt, Operand(26u));
6445
6446 desc[1] = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), desc[1],
6447 Operand((uint32_t)C_008F14_NUM_FORMAT));
6448 desc[1] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), desc[1], nfmt);
6449
6450 aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector,
6451 Format::PSEUDO, resource.size(), 1)};
6452 for (unsigned i = 0; i < resource.size(); i++)
6453 vec->operands[i] = Operand(desc[i]);
6454 resource = bld.tmp(resource.regClass());
6455 vec->definitions[0] = Definition(resource);
6456 ctx->block->instructions.emplace_back(std::move(vec));
6457
6458 new_coords[0] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
6459 new_coords[0], orig_coords[0], tg4_compare_cube_wa64);
6460 new_coords[1] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
6461 new_coords[1], orig_coords[1], tg4_compare_cube_wa64);
6462 }
6463
6464 if (coords.size() == 3) {
6465 coords = bld.pseudo(aco_opcode::p_create_vector, bld.def(v3),
6466 new_coords[0], new_coords[1],
6467 emit_extract_vector(ctx, coords, 2, v1));
6468 } else {
6469 assert(coords.size() == 2);
6470 coords = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2),
6471 new_coords[0], new_coords[1]);
6472 }
6473 }
6474
6475 if (!(has_ddx && has_ddy) && !has_lod && !level_zero &&
6476 instr->sampler_dim != GLSL_SAMPLER_DIM_MS &&
6477 instr->sampler_dim != GLSL_SAMPLER_DIM_SUBPASS_MS)
6478 coords = emit_wqm(ctx, coords, bld.tmp(coords.regClass()), true);
6479
6480 std::vector<Operand> args;
6481 if (has_offset)
6482 args.emplace_back(Operand(offset));
6483 if (has_bias)
6484 args.emplace_back(Operand(bias));
6485 if (has_compare)
6486 args.emplace_back(Operand(compare));
6487 if (has_derivs)
6488 args.emplace_back(Operand(derivs));
6489 args.emplace_back(Operand(coords));
6490 if (has_sample_index)
6491 args.emplace_back(Operand(sample_index));
6492 if (has_lod)
6493 args.emplace_back(lod);
6494
6495 Operand arg;
6496 if (args.size() > 1) {
6497 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, args.size(), 1)};
6498 unsigned size = 0;
6499 for (unsigned i = 0; i < args.size(); i++) {
6500 size += args[i].size();
6501 vec->operands[i] = args[i];
6502 }
6503 RegClass rc = RegClass(RegType::vgpr, size);
6504 Temp tmp = bld.tmp(rc);
6505 vec->definitions[0] = Definition(tmp);
6506 ctx->block->instructions.emplace_back(std::move(vec));
6507 arg = Operand(tmp);
6508 } else {
6509 assert(args[0].isTemp());
6510 arg = Operand(as_vgpr(ctx, args[0].getTemp()));
6511 }
6512
6513 if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) {
6514 //FIXME: if (ctx->abi->gfx9_stride_size_workaround) return ac_build_buffer_load_format_gfx9_safe()
6515
6516 assert(coords.size() == 1);
6517 unsigned last_bit = util_last_bit(nir_ssa_def_components_read(&instr->dest.ssa));
6518 aco_opcode op;
6519 switch (last_bit) {
6520 case 1:
6521 op = aco_opcode::buffer_load_format_x; break;
6522 case 2:
6523 op = aco_opcode::buffer_load_format_xy; break;
6524 case 3:
6525 op = aco_opcode::buffer_load_format_xyz; break;
6526 case 4:
6527 op = aco_opcode::buffer_load_format_xyzw; break;
6528 default:
6529 unreachable("Tex instruction loads more than 4 components.");
6530 }
6531
6532 /* if the instruction return value matches exactly the nir dest ssa, we can use it directly */
6533 if (last_bit == instr->dest.ssa.num_components && dst.type() == RegType::vgpr)
6534 tmp_dst = dst;
6535 else
6536 tmp_dst = bld.tmp(RegType::vgpr, last_bit);
6537
6538 aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)};
6539 mubuf->operands[0] = Operand(coords);
6540 mubuf->operands[1] = Operand(resource);
6541 mubuf->operands[2] = Operand((uint32_t) 0);
6542 mubuf->definitions[0] = Definition(tmp_dst);
6543 mubuf->idxen = true;
6544 mubuf->can_reorder = true;
6545 ctx->block->instructions.emplace_back(std::move(mubuf));
6546
6547 expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, (1 << last_bit) - 1);
6548 return;
6549 }
6550
6551
6552 if (instr->op == nir_texop_txf ||
6553 instr->op == nir_texop_txf_ms ||
6554 instr->op == nir_texop_samples_identical) {
6555 aco_opcode op = level_zero || instr->sampler_dim == GLSL_SAMPLER_DIM_MS ? aco_opcode::image_load : aco_opcode::image_load_mip;
6556 tex.reset(create_instruction<MIMG_instruction>(op, Format::MIMG, 2, 1));
6557 tex->operands[0] = Operand(arg);
6558 tex->operands[1] = Operand(resource);
6559 tex->dim = dim;
6560 tex->dmask = dmask;
6561 tex->unrm = true;
6562 tex->da = da;
6563 tex->definitions[0] = Definition(tmp_dst);
6564 tex->can_reorder = true;
6565 ctx->block->instructions.emplace_back(std::move(tex));
6566
6567 if (instr->op == nir_texop_samples_identical) {
6568 assert(dmask == 1 && dst.regClass() == v1);
6569 assert(dst.id() != tmp_dst.id());
6570
6571 Temp tmp = bld.tmp(s2);
6572 bld.vopc(aco_opcode::v_cmp_eq_u32, Definition(tmp), Operand(0u), tmp_dst).def(0).setHint(vcc);
6573 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand((uint32_t)-1), tmp);
6574
6575 } else {
6576 expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, dmask);
6577 }
6578 return;
6579 }
6580
6581 // TODO: would be better to do this by adding offsets, but needs the opcodes ordered.
6582 aco_opcode opcode = aco_opcode::image_sample;
6583 if (has_offset) { /* image_sample_*_o */
6584 if (has_compare) {
6585 opcode = aco_opcode::image_sample_c_o;
6586 if (has_derivs)
6587 opcode = aco_opcode::image_sample_c_d_o;
6588 if (has_bias)
6589 opcode = aco_opcode::image_sample_c_b_o;
6590 if (level_zero)
6591 opcode = aco_opcode::image_sample_c_lz_o;
6592 if (has_lod)
6593 opcode = aco_opcode::image_sample_c_l_o;
6594 } else {
6595 opcode = aco_opcode::image_sample_o;
6596 if (has_derivs)
6597 opcode = aco_opcode::image_sample_d_o;
6598 if (has_bias)
6599 opcode = aco_opcode::image_sample_b_o;
6600 if (level_zero)
6601 opcode = aco_opcode::image_sample_lz_o;
6602 if (has_lod)
6603 opcode = aco_opcode::image_sample_l_o;
6604 }
6605 } else { /* no offset */
6606 if (has_compare) {
6607 opcode = aco_opcode::image_sample_c;
6608 if (has_derivs)
6609 opcode = aco_opcode::image_sample_c_d;
6610 if (has_bias)
6611 opcode = aco_opcode::image_sample_c_b;
6612 if (level_zero)
6613 opcode = aco_opcode::image_sample_c_lz;
6614 if (has_lod)
6615 opcode = aco_opcode::image_sample_c_l;
6616 } else {
6617 opcode = aco_opcode::image_sample;
6618 if (has_derivs)
6619 opcode = aco_opcode::image_sample_d;
6620 if (has_bias)
6621 opcode = aco_opcode::image_sample_b;
6622 if (level_zero)
6623 opcode = aco_opcode::image_sample_lz;
6624 if (has_lod)
6625 opcode = aco_opcode::image_sample_l;
6626 }
6627 }
6628
6629 if (instr->op == nir_texop_tg4) {
6630 if (has_offset) {
6631 opcode = aco_opcode::image_gather4_lz_o;
6632 if (has_compare)
6633 opcode = aco_opcode::image_gather4_c_lz_o;
6634 } else {
6635 opcode = aco_opcode::image_gather4_lz;
6636 if (has_compare)
6637 opcode = aco_opcode::image_gather4_c_lz;
6638 }
6639 } else if (instr->op == nir_texop_lod) {
6640 opcode = aco_opcode::image_get_lod;
6641 }
6642
6643 tex.reset(create_instruction<MIMG_instruction>(opcode, Format::MIMG, 3, 1));
6644 tex->operands[0] = arg;
6645 tex->operands[1] = Operand(resource);
6646 tex->operands[2] = Operand(sampler);
6647 tex->dim = dim;
6648 tex->dmask = dmask;
6649 tex->da = da;
6650 tex->definitions[0] = Definition(tmp_dst);
6651 tex->can_reorder = true;
6652 ctx->block->instructions.emplace_back(std::move(tex));
6653
6654 if (tg4_integer_cube_workaround) {
6655 assert(tmp_dst.id() != dst.id());
6656 assert(tmp_dst.size() == dst.size() && dst.size() == 4);
6657
6658 emit_split_vector(ctx, tmp_dst, tmp_dst.size());
6659 Temp val[4];
6660 for (unsigned i = 0; i < dst.size(); i++) {
6661 val[i] = emit_extract_vector(ctx, tmp_dst, i, v1);
6662 Temp cvt_val;
6663 if (stype == GLSL_TYPE_UINT)
6664 cvt_val = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), val[i]);
6665 else
6666 cvt_val = bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), val[i]);
6667 val[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), val[i], cvt_val, tg4_compare_cube_wa64);
6668 }
6669 Temp tmp = dst.regClass() == v4 ? dst : bld.tmp(v4);
6670 tmp_dst = bld.pseudo(aco_opcode::p_create_vector, Definition(tmp),
6671 val[0], val[1], val[2], val[3]);
6672 }
6673 unsigned mask = instr->op == nir_texop_tg4 ? 0xF : dmask;
6674 expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, mask);
6675
6676 }
6677
6678
6679 Operand get_phi_operand(isel_context *ctx, nir_ssa_def *ssa)
6680 {
6681 Temp tmp = get_ssa_temp(ctx, ssa);
6682 if (ssa->parent_instr->type == nir_instr_type_ssa_undef)
6683 return Operand(tmp.regClass());
6684 else
6685 return Operand(tmp);
6686 }
6687
6688 void visit_phi(isel_context *ctx, nir_phi_instr *instr)
6689 {
6690 aco_ptr<Pseudo_instruction> phi;
6691 unsigned num_src = exec_list_length(&instr->srcs);
6692 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6693
6694 aco_opcode opcode = !dst.is_linear() || ctx->divergent_vals[instr->dest.ssa.index] ? aco_opcode::p_phi : aco_opcode::p_linear_phi;
6695
6696 std::map<unsigned, nir_ssa_def*> phi_src;
6697 bool all_undef = true;
6698 nir_foreach_phi_src(src, instr) {
6699 phi_src[src->pred->index] = src->src.ssa;
6700 if (src->src.ssa->parent_instr->type != nir_instr_type_ssa_undef)
6701 all_undef = false;
6702 }
6703 if (all_undef) {
6704 Builder bld(ctx->program, ctx->block);
6705 if (dst.regClass() == s1) {
6706 bld.sop1(aco_opcode::s_mov_b32, Definition(dst), Operand(0u));
6707 } else if (dst.regClass() == v1) {
6708 bld.vop1(aco_opcode::v_mov_b32, Definition(dst), Operand(0u));
6709 } else {
6710 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
6711 for (unsigned i = 0; i < dst.size(); i++)
6712 vec->operands[i] = Operand(0u);
6713 vec->definitions[0] = Definition(dst);
6714 ctx->block->instructions.emplace_back(std::move(vec));
6715 }
6716 return;
6717 }
6718
6719 /* try to scalarize vector phis */
6720 if (dst.size() > 1) {
6721 // TODO: scalarize linear phis on divergent ifs
6722 bool can_scalarize = (opcode == aco_opcode::p_phi || !(ctx->block->kind & block_kind_merge));
6723 std::array<Temp, 4> new_vec;
6724 for (std::pair<const unsigned, nir_ssa_def*>& pair : phi_src) {
6725 Operand src = get_phi_operand(ctx, pair.second);
6726 if (src.isTemp() && ctx->allocated_vec.find(src.tempId()) == ctx->allocated_vec.end()) {
6727 can_scalarize = false;
6728 break;
6729 }
6730 }
6731 if (can_scalarize) {
6732 unsigned num_components = instr->dest.ssa.num_components;
6733 assert(dst.size() % num_components == 0);
6734 RegClass rc = RegClass(dst.type(), dst.size() / num_components);
6735
6736 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
6737 for (unsigned k = 0; k < num_components; k++) {
6738 phi.reset(create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, num_src, 1));
6739 std::map<unsigned, nir_ssa_def*>::iterator it = phi_src.begin();
6740 for (unsigned i = 0; i < num_src; i++) {
6741 Operand src = get_phi_operand(ctx, it->second);
6742 phi->operands[i] = src.isTemp() ? Operand(ctx->allocated_vec[src.tempId()][k]) : Operand(rc);
6743 ++it;
6744 }
6745 Temp phi_dst = {ctx->program->allocateId(), rc};
6746 phi->definitions[0] = Definition(phi_dst);
6747 ctx->block->instructions.emplace(ctx->block->instructions.begin(), std::move(phi));
6748 new_vec[k] = phi_dst;
6749 vec->operands[k] = Operand(phi_dst);
6750 }
6751 vec->definitions[0] = Definition(dst);
6752 ctx->block->instructions.emplace_back(std::move(vec));
6753 ctx->allocated_vec.emplace(dst.id(), new_vec);
6754 return;
6755 }
6756 }
6757
6758 unsigned extra_src = 0;
6759 if (opcode == aco_opcode::p_linear_phi && (ctx->block->kind & block_kind_loop_exit) &&
6760 ctx->program->blocks[ctx->block->index-2].kind & block_kind_continue_or_break) {
6761 extra_src++;
6762 }
6763
6764 phi.reset(create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, num_src + extra_src, 1));
6765
6766 /* if we have a linear phi on a divergent if, we know that one src is undef */
6767 if (opcode == aco_opcode::p_linear_phi && ctx->block->kind & block_kind_merge) {
6768 assert(extra_src == 0);
6769 Block* block;
6770 /* we place the phi either in the invert-block or in the current block */
6771 if (phi_src.begin()->second->parent_instr->type != nir_instr_type_ssa_undef) {
6772 assert((++phi_src.begin())->second->parent_instr->type == nir_instr_type_ssa_undef);
6773 Block& linear_else = ctx->program->blocks[ctx->block->linear_preds[1]];
6774 block = &ctx->program->blocks[linear_else.linear_preds[0]];
6775 assert(block->kind & block_kind_invert);
6776 phi->operands[0] = get_phi_operand(ctx, phi_src.begin()->second);
6777 } else {
6778 assert((++phi_src.begin())->second->parent_instr->type != nir_instr_type_ssa_undef);
6779 block = ctx->block;
6780 phi->operands[0] = get_phi_operand(ctx, (++phi_src.begin())->second);
6781 }
6782 phi->operands[1] = Operand(dst.regClass());
6783 phi->definitions[0] = Definition(dst);
6784 block->instructions.emplace(block->instructions.begin(), std::move(phi));
6785 return;
6786 }
6787
6788 std::map<unsigned, nir_ssa_def*>::iterator it = phi_src.begin();
6789 for (unsigned i = 0; i < num_src; i++) {
6790 phi->operands[i] = get_phi_operand(ctx, it->second);
6791 ++it;
6792 }
6793 for (unsigned i = 0; i < extra_src; i++)
6794 phi->operands[num_src + i] = Operand(dst.regClass());
6795 phi->definitions[0] = Definition(dst);
6796 ctx->block->instructions.emplace(ctx->block->instructions.begin(), std::move(phi));
6797 }
6798
6799
6800 void visit_undef(isel_context *ctx, nir_ssa_undef_instr *instr)
6801 {
6802 Temp dst = get_ssa_temp(ctx, &instr->def);
6803
6804 assert(dst.type() == RegType::sgpr);
6805
6806 if (dst.size() == 1) {
6807 Builder(ctx->program, ctx->block).copy(Definition(dst), Operand(0u));
6808 } else {
6809 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
6810 for (unsigned i = 0; i < dst.size(); i++)
6811 vec->operands[i] = Operand(0u);
6812 vec->definitions[0] = Definition(dst);
6813 ctx->block->instructions.emplace_back(std::move(vec));
6814 }
6815 }
6816
6817 void visit_jump(isel_context *ctx, nir_jump_instr *instr)
6818 {
6819 Builder bld(ctx->program, ctx->block);
6820 Block *logical_target;
6821 append_logical_end(ctx->block);
6822 unsigned idx = ctx->block->index;
6823
6824 switch (instr->type) {
6825 case nir_jump_break:
6826 logical_target = ctx->cf_info.parent_loop.exit;
6827 add_logical_edge(idx, logical_target);
6828 ctx->block->kind |= block_kind_break;
6829
6830 if (!ctx->cf_info.parent_if.is_divergent &&
6831 !ctx->cf_info.parent_loop.has_divergent_continue) {
6832 /* uniform break - directly jump out of the loop */
6833 ctx->block->kind |= block_kind_uniform;
6834 ctx->cf_info.has_branch = true;
6835 bld.branch(aco_opcode::p_branch);
6836 add_linear_edge(idx, logical_target);
6837 return;
6838 }
6839 ctx->cf_info.parent_loop.has_divergent_branch = true;
6840 break;
6841 case nir_jump_continue:
6842 logical_target = &ctx->program->blocks[ctx->cf_info.parent_loop.header_idx];
6843 add_logical_edge(idx, logical_target);
6844 ctx->block->kind |= block_kind_continue;
6845
6846 if (ctx->cf_info.parent_if.is_divergent) {
6847 /* for potential uniform breaks after this continue,
6848 we must ensure that they are handled correctly */
6849 ctx->cf_info.parent_loop.has_divergent_continue = true;
6850 ctx->cf_info.parent_loop.has_divergent_branch = true;
6851 } else {
6852 /* uniform continue - directly jump to the loop header */
6853 ctx->block->kind |= block_kind_uniform;
6854 ctx->cf_info.has_branch = true;
6855 bld.branch(aco_opcode::p_branch);
6856 add_linear_edge(idx, logical_target);
6857 return;
6858 }
6859 break;
6860 default:
6861 fprintf(stderr, "Unknown NIR jump instr: ");
6862 nir_print_instr(&instr->instr, stderr);
6863 fprintf(stderr, "\n");
6864 abort();
6865 }
6866
6867 /* remove critical edges from linear CFG */
6868 bld.branch(aco_opcode::p_branch);
6869 Block* break_block = ctx->program->create_and_insert_block();
6870 break_block->loop_nest_depth = ctx->cf_info.loop_nest_depth;
6871 break_block->kind |= block_kind_uniform;
6872 add_linear_edge(idx, break_block);
6873 /* create_and_insert_block() may have reallocated ctx->program->blocks, invalidating the loop header pointer */
6874 if (instr->type == nir_jump_continue)
6875 logical_target = &ctx->program->blocks[ctx->cf_info.parent_loop.header_idx];
6876 add_linear_edge(break_block->index, logical_target);
6877 bld.reset(break_block);
6878 bld.branch(aco_opcode::p_branch);
6879
6880 Block* continue_block = ctx->program->create_and_insert_block();
6881 continue_block->loop_nest_depth = ctx->cf_info.loop_nest_depth;
6882 add_linear_edge(idx, continue_block);
6883 append_logical_start(continue_block);
6884 ctx->block = continue_block;
6885 return;
6886 }
6887
6888 void visit_block(isel_context *ctx, nir_block *block)
6889 {
6890 nir_foreach_instr(instr, block) {
6891 switch (instr->type) {
6892 case nir_instr_type_alu:
6893 visit_alu_instr(ctx, nir_instr_as_alu(instr));
6894 break;
6895 case nir_instr_type_load_const:
6896 visit_load_const(ctx, nir_instr_as_load_const(instr));
6897 break;
6898 case nir_instr_type_intrinsic:
6899 visit_intrinsic(ctx, nir_instr_as_intrinsic(instr));
6900 break;
6901 case nir_instr_type_tex:
6902 visit_tex(ctx, nir_instr_as_tex(instr));
6903 break;
6904 case nir_instr_type_phi:
6905 visit_phi(ctx, nir_instr_as_phi(instr));
6906 break;
6907 case nir_instr_type_ssa_undef:
6908 visit_undef(ctx, nir_instr_as_ssa_undef(instr));
6909 break;
6910 case nir_instr_type_deref:
6911 break;
6912 case nir_instr_type_jump:
6913 visit_jump(ctx, nir_instr_as_jump(instr));
6914 break;
6915 default:
6916 fprintf(stderr, "Unknown NIR instr type: ");
6917 nir_print_instr(instr, stderr);
6918 fprintf(stderr, "\n");
6919 //abort();
6920 }
6921 }
6922 }
6923
6924
6925
6926 static void visit_loop(isel_context *ctx, nir_loop *loop)
6927 {
6928 append_logical_end(ctx->block);
6929 ctx->block->kind |= block_kind_loop_preheader | block_kind_uniform;
6930 Builder bld(ctx->program, ctx->block);
6931 bld.branch(aco_opcode::p_branch);
6932 unsigned loop_preheader_idx = ctx->block->index;
6933
6934 Block loop_exit = Block();
6935 loop_exit.loop_nest_depth = ctx->cf_info.loop_nest_depth;
6936 loop_exit.kind |= (block_kind_loop_exit | (ctx->block->kind & block_kind_top_level));
6937
6938 Block* loop_header = ctx->program->create_and_insert_block();
6939 loop_header->loop_nest_depth = ctx->cf_info.loop_nest_depth + 1;
6940 loop_header->kind |= block_kind_loop_header;
6941 add_edge(loop_preheader_idx, loop_header);
6942 ctx->block = loop_header;
6943
6944 /* emit loop body */
6945 unsigned loop_header_idx = loop_header->index;
6946 loop_info_RAII loop_raii(ctx, loop_header_idx, &loop_exit);
6947 append_logical_start(ctx->block);
6948 visit_cf_list(ctx, &loop->body);
6949
6950 //TODO: what if a loop ends with an unconditional or uniformly branched continue and this branch is never taken?
6951 if (!ctx->cf_info.has_branch) {
6952 append_logical_end(ctx->block);
6953 if (ctx->cf_info.exec_potentially_empty) {
6954 /* Discards can result in code running with an empty exec mask.
6955 * This would result in divergent breaks not ever being taken. As a
6956 * workaround, break the loop when the loop mask is empty instead of
6957 * always continuing. */
6958 ctx->block->kind |= (block_kind_continue_or_break | block_kind_uniform);
6959
6960 /* create "loop_almost_exit" to avoid critical edges */
6961 unsigned block_idx = ctx->block->index;
6962 Block *loop_almost_exit = ctx->program->create_and_insert_block();
6963 loop_almost_exit->loop_nest_depth = ctx->cf_info.loop_nest_depth;
6964 loop_almost_exit->kind = block_kind_uniform;
6965 bld.reset(loop_almost_exit);
6966 bld.branch(aco_opcode::p_branch);
6967
6968 add_linear_edge(block_idx, loop_almost_exit);
6969 add_linear_edge(loop_almost_exit->index, &loop_exit);
6970
6971 ctx->block = &ctx->program->blocks[block_idx];
6972 } else {
6973 ctx->block->kind |= (block_kind_continue | block_kind_uniform);
6974 }
6975 if (!ctx->cf_info.parent_loop.has_divergent_branch)
6976 add_edge(ctx->block->index, &ctx->program->blocks[loop_header_idx]);
6977 else
6978 add_linear_edge(ctx->block->index, &ctx->program->blocks[loop_header_idx]);
6979 bld.reset(ctx->block);
6980 bld.branch(aco_opcode::p_branch);
6981 }
6982
6983 /* fixup phis in loop header from unreachable blocks */
6984 if (ctx->cf_info.has_branch || ctx->cf_info.parent_loop.has_divergent_branch) {
6985 bool linear = ctx->cf_info.has_branch;
6986 bool logical = ctx->cf_info.has_branch || ctx->cf_info.parent_loop.has_divergent_branch;
6987 for (aco_ptr<Instruction>& instr : ctx->program->blocks[loop_header_idx].instructions) {
6988 if ((logical && instr->opcode == aco_opcode::p_phi) ||
6989 (linear && instr->opcode == aco_opcode::p_linear_phi)) {
6990 /* the last operand should be the one that needs to be removed */
6991 instr->operands.pop_back();
6992 } else if (!is_phi(instr)) {
6993 break;
6994 }
6995 }
6996 }
6997
6998 ctx->cf_info.has_branch = false;
6999
7000 // TODO: if the loop does not have a single exit, we must add one
7001 /* emit loop successor block */
7002 ctx->block = ctx->program->insert_block(std::move(loop_exit));
7003 append_logical_start(ctx->block);
7004
7005 #if 0
7006 // TODO: check if it is beneficial to not branch on continues
7007 /* trim linear phis in loop header */
7008 for (auto&& instr : loop_entry->instructions) {
7009 if (instr->opcode == aco_opcode::p_linear_phi) {
7010 aco_ptr<Pseudo_instruction> new_phi{create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, loop_entry->linear_predecessors.size(), 1)};
7011 new_phi->definitions[0] = instr->definitions[0];
7012 for (unsigned i = 0; i < new_phi->operands.size(); i++)
7013 new_phi->operands[i] = instr->operands[i];
7014 /* check that the remaining operands are all the same */
7015 for (unsigned i = new_phi->operands.size(); i < instr->operands.size(); i++)
7016 assert(instr->operands[i].tempId() == instr->operands.back().tempId());
7017 instr.swap(new_phi);
7018 } else if (instr->opcode == aco_opcode::p_phi) {
7019 continue;
7020 } else {
7021 break;
7022 }
7023 }
7024 #endif
7025 }
7026
7027 static void begin_divergent_if_then(isel_context *ctx, if_context *ic, Temp cond)
7028 {
7029 ic->cond = cond;
7030
7031 append_logical_end(ctx->block);
7032 ctx->block->kind |= block_kind_branch;
7033
7034 /* branch to linear then block */
7035 assert(cond.regClass() == s2);
7036 aco_ptr<Pseudo_branch_instruction> branch;
7037 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_cbranch_z, Format::PSEUDO_BRANCH, 1, 0));
7038 branch->operands[0] = Operand(cond);
7039 ctx->block->instructions.push_back(std::move(branch));
7040
7041 ic->BB_if_idx = ctx->block->index;
7042 ic->BB_invert = Block();
7043 ic->BB_invert.loop_nest_depth = ctx->cf_info.loop_nest_depth;
7044 /* Invert blocks are intentionally not marked as top level because they
7045 * are not part of the logical cfg. */
7046 ic->BB_invert.kind |= block_kind_invert;
7047 ic->BB_endif = Block();
7048 ic->BB_endif.loop_nest_depth = ctx->cf_info.loop_nest_depth;
7049 ic->BB_endif.kind |= (block_kind_merge | (ctx->block->kind & block_kind_top_level));
7050
7051 ic->exec_potentially_empty_old = ctx->cf_info.exec_potentially_empty;
7052 ic->divergent_old = ctx->cf_info.parent_if.is_divergent;
7053 ctx->cf_info.parent_if.is_divergent = true;
7054 ctx->cf_info.exec_potentially_empty = false; /* divergent branches use cbranch_execz */
7055
7056 /** emit logical then block */
7057 Block* BB_then_logical = ctx->program->create_and_insert_block();
7058 BB_then_logical->loop_nest_depth = ctx->cf_info.loop_nest_depth;
7059 add_edge(ic->BB_if_idx, BB_then_logical);
7060 ctx->block = BB_then_logical;
7061 append_logical_start(BB_then_logical);
7062 }
7063
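/* Switches from the then side to the else side: closes the logical then
 * block, emits the (empty) linear then block, and creates the invert merge
 * block, which is marked block_kind_invert so that a later pass inverts the
 * exec mask there, before opening the logical else block. */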
7064 static void begin_divergent_if_else(isel_context *ctx, if_context *ic)
7065 {
7066 Block *BB_then_logical = ctx->block;
7067 append_logical_end(BB_then_logical);
7068 /* branch from logical then block to invert block */
7069 aco_ptr<Pseudo_branch_instruction> branch;
7070 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
7071 BB_then_logical->instructions.emplace_back(std::move(branch));
7072 add_linear_edge(BB_then_logical->index, &ic->BB_invert);
7073 if (!ctx->cf_info.parent_loop.has_divergent_branch)
7074 add_logical_edge(BB_then_logical->index, &ic->BB_endif);
7075 BB_then_logical->kind |= block_kind_uniform;
7076 assert(!ctx->cf_info.has_branch);
7077 ic->then_branch_divergent = ctx->cf_info.parent_loop.has_divergent_branch;
7078 ctx->cf_info.parent_loop.has_divergent_branch = false;
7079
7080 /** emit linear then block */
7081 Block* BB_then_linear = ctx->program->create_and_insert_block();
7082 BB_then_linear->loop_nest_depth = ctx->cf_info.loop_nest_depth;
7083 BB_then_linear->kind |= block_kind_uniform;
7084 add_linear_edge(ic->BB_if_idx, BB_then_linear);
7085 /* branch from linear then block to invert block */
7086 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
7087 BB_then_linear->instructions.emplace_back(std::move(branch));
7088 add_linear_edge(BB_then_linear->index, &ic->BB_invert);
7089
7090 /** emit invert merge block */
7091 ctx->block = ctx->program->insert_block(std::move(ic->BB_invert));
7092 ic->invert_idx = ctx->block->index;
7093
7094 /* branch to linear else block (skip else) */
7095 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_cbranch_nz, Format::PSEUDO_BRANCH, 1, 0));
7096 branch->operands[0] = Operand(ic->cond);
7097 ctx->block->instructions.push_back(std::move(branch));
7098
7099 ic->exec_potentially_empty_old |= ctx->cf_info.exec_potentially_empty;
7100 ctx->cf_info.exec_potentially_empty = false; /* divergent branches use cbranch_execz */
7101
7102 /** emit logical else block */
7103 Block* BB_else_logical = ctx->program->create_and_insert_block();
7104 BB_else_logical->loop_nest_depth = ctx->cf_info.loop_nest_depth;
7105 add_logical_edge(ic->BB_if_idx, BB_else_logical);
7106 add_linear_edge(ic->invert_idx, BB_else_logical);
7107 ctx->block = BB_else_logical;
7108 append_logical_start(BB_else_logical);
7109 }
7110
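/* Finishes a divergent if: closes the logical else block, emits the (empty)
 * linear else block, merges both sides in the endif block, and restores the
 * parent_if/exec_potentially_empty state saved in the if_context. */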
7111 static void end_divergent_if(isel_context *ctx, if_context *ic)
7112 {
7113 Block *BB_else_logical = ctx->block;
7114 append_logical_end(BB_else_logical);
7115
7116 /* branch from logical else block to endif block */
7117 aco_ptr<Pseudo_branch_instruction> branch;
7118 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
7119 BB_else_logical->instructions.emplace_back(std::move(branch));
7120 add_linear_edge(BB_else_logical->index, &ic->BB_endif);
7121 if (!ctx->cf_info.parent_loop.has_divergent_branch)
7122 add_logical_edge(BB_else_logical->index, &ic->BB_endif);
7123 BB_else_logical->kind |= block_kind_uniform;
7124
7125 assert(!ctx->cf_info.has_branch);
7126 ctx->cf_info.parent_loop.has_divergent_branch &= ic->then_branch_divergent;
7127
7128
7129 /** emit linear else block */
7130 Block* BB_else_linear = ctx->program->create_and_insert_block();
7131 BB_else_linear->loop_nest_depth = ctx->cf_info.loop_nest_depth;
7132 BB_else_linear->kind |= block_kind_uniform;
7133 add_linear_edge(ic->invert_idx, BB_else_linear);
7134
7135 /* branch from linear else block to endif block */
7136 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
7137 BB_else_linear->instructions.emplace_back(std::move(branch));
7138 add_linear_edge(BB_else_linear->index, &ic->BB_endif);
7139
7140
7141 /** emit endif merge block */
7142 ctx->block = ctx->program->insert_block(std::move(ic->BB_endif));
7143 append_logical_start(ctx->block);
7144
7145
7146 ctx->cf_info.parent_if.is_divergent = ic->divergent_old;
7147 ctx->cf_info.exec_potentially_empty |= ic->exec_potentially_empty_old;
7148 /* uniform control flow never has an empty exec-mask */
7149 if (!ctx->cf_info.loop_nest_depth && !ctx->cf_info.parent_if.is_divergent)
7150 ctx->cf_info.exec_potentially_empty = false;
7151 }
7152
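/* Translates a NIR if: a uniform condition becomes a simple diamond that
 * branches on SCC, while a divergent condition uses the if_context machinery
 * above. */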
7153 static void visit_if(isel_context *ctx, nir_if *if_stmt)
7154 {
7155 Temp cond = get_ssa_temp(ctx, if_stmt->condition.ssa);
7156 Builder bld(ctx->program, ctx->block);
7157 aco_ptr<Pseudo_branch_instruction> branch;
7158
7159 if (!ctx->divergent_vals[if_stmt->condition.ssa->index]) { /* uniform condition */
7160 /**
7161 * Uniform conditionals are represented in the following way*) :
7162 *
7163 * The linear and logical CFG:
7164 * BB_IF
7165 * / \
7166 * BB_THEN (logical) BB_ELSE (logical)
7167 * \ /
7168 * BB_ENDIF
7169 *
7170 * *) Exceptions may be due to break and continue statements within loops:
7171 * If a break/continue happens within uniform control flow, it branches
7172 * to the loop exit/entry block. Otherwise, it branches to the next
7173 * merge block.
7174 **/
7175 append_logical_end(ctx->block);
7176 ctx->block->kind |= block_kind_uniform;
7177
7178 /* emit branch */
7179 if (cond.regClass() == s2) {
7180 // TODO: in a post-RA optimizer, we could check if the condition is in VCC and omit this instruction
7181 cond = as_uniform_bool(ctx, cond);
7182 }
7183 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_cbranch_z, Format::PSEUDO_BRANCH, 1, 0));
7184 branch->operands[0] = Operand(cond);
7185 branch->operands[0].setFixed(scc);
7186 ctx->block->instructions.emplace_back(std::move(branch));
7187
7188 unsigned BB_if_idx = ctx->block->index;
7189 Block BB_endif = Block();
7190 BB_endif.loop_nest_depth = ctx->cf_info.loop_nest_depth;
7191 BB_endif.kind |= ctx->block->kind & block_kind_top_level;
7192
7193 /** emit then block */
7194 Block* BB_then = ctx->program->create_and_insert_block();
7195 BB_then->loop_nest_depth = ctx->cf_info.loop_nest_depth;
7196 add_edge(BB_if_idx, BB_then);
7197 append_logical_start(BB_then);
7198 ctx->block = BB_then;
7199 visit_cf_list(ctx, &if_stmt->then_list);
7200 BB_then = ctx->block;
7201 bool then_branch = ctx->cf_info.has_branch;
7202 bool then_branch_divergent = ctx->cf_info.parent_loop.has_divergent_branch;
7203
7204 if (!then_branch) {
7205 append_logical_end(BB_then);
7206 /* branch from then block to endif block */
7207 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
7208 BB_then->instructions.emplace_back(std::move(branch));
7209 add_linear_edge(BB_then->index, &BB_endif);
7210 if (!then_branch_divergent)
7211 add_logical_edge(BB_then->index, &BB_endif);
7212 BB_then->kind |= block_kind_uniform;
7213 }
7214
7215 ctx->cf_info.has_branch = false;
7216 ctx->cf_info.parent_loop.has_divergent_branch = false;
7217
7218 /** emit else block */
7219 Block* BB_else = ctx->program->create_and_insert_block();
7220 BB_else->loop_nest_depth = ctx->cf_info.loop_nest_depth;
7221 add_edge(BB_if_idx, BB_else);
7222 append_logical_start(BB_else);
7223 ctx->block = BB_else;
7224 visit_cf_list(ctx, &if_stmt->else_list);
7225 BB_else = ctx->block;
7226
7227 if (!ctx->cf_info.has_branch) {
7228 append_logical_end(BB_else);
7229 /* branch from else block to endif block */
7230 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
7231 BB_else->instructions.emplace_back(std::move(branch));
7232 add_linear_edge(BB_else->index, &BB_endif);
7233 if (!ctx->cf_info.parent_loop.has_divergent_branch)
7234 add_logical_edge(BB_else->index, &BB_endif);
7235 BB_else->kind |= block_kind_uniform;
7236 }
7237
7238 ctx->cf_info.has_branch &= then_branch;
7239 ctx->cf_info.parent_loop.has_divergent_branch &= then_branch_divergent;
7240
7241 /** emit endif merge block */
7242 if (!ctx->cf_info.has_branch) {
7243 ctx->block = ctx->program->insert_block(std::move(BB_endif));
7244 append_logical_start(ctx->block);
7245 }
7246 } else { /* non-uniform condition */
7247 /**
7248 * To maintain a logical and linear CFG without critical edges,
7249 * non-uniform conditionals are represented in the following way*) :
7250 *
7251 * The linear CFG:
7252 * BB_IF
7253 * / \
7254 * BB_THEN (logical) BB_THEN (linear)
7255 * \ /
7256 * BB_INVERT (linear)
7257 * / \
7258 * BB_ELSE (logical) BB_ELSE (linear)
7259 * \ /
7260 * BB_ENDIF
7261 *
7262 * The logical CFG:
7263 * BB_IF
7264 * / \
7265 * BB_THEN (logical) BB_ELSE (logical)
7266 * \ /
7267 * BB_ENDIF
7268 *
7269 * *) Exceptions may be due to break and continue statements within loops
7270 **/
7271
7272 if_context ic;
7273
7274 begin_divergent_if_then(ctx, &ic, cond);
7275 visit_cf_list(ctx, &if_stmt->then_list);
7276
7277 begin_divergent_if_else(ctx, &ic);
7278 visit_cf_list(ctx, &if_stmt->else_list);
7279
7280 end_divergent_if(ctx, &ic);
7281 }
7282 }
7283
7284 static void visit_cf_list(isel_context *ctx,
7285 struct exec_list *list)
7286 {
7287 foreach_list_typed(nir_cf_node, node, node, list) {
7288 switch (node->type) {
7289 case nir_cf_node_block:
7290 visit_block(ctx, nir_cf_node_as_block(node));
7291 break;
7292 case nir_cf_node_if:
7293 visit_if(ctx, nir_cf_node_as_if(node));
7294 break;
7295 case nir_cf_node_loop:
7296 visit_loop(ctx, nir_cf_node_as_loop(node));
7297 break;
7298 default:
7299 unreachable("unimplemented cf list type");
7300 }
7301 }
7302 }
7303
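/* Emits a single exp instruction for one VS output slot, either to the next
 * position target (incrementing *next_pos) or to the parameter target at the
 * driver-assigned offset. Parameter exports with no written components or an
 * undefined offset are skipped. */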
7304 static void export_vs_varying(isel_context *ctx, int slot, bool is_pos, int *next_pos)
7305 {
7306 int offset = ctx->program->info->vs.outinfo.vs_output_param_offset[slot];
7307 uint64_t mask = ctx->vs_output.mask[slot];
7308 if (!is_pos && !mask)
7309 return;
7310 if (!is_pos && offset == AC_EXP_PARAM_UNDEFINED)
7311 return;
7312 aco_ptr<Export_instruction> exp{create_instruction<Export_instruction>(aco_opcode::exp, Format::EXP, 4, 0)};
7313 exp->enabled_mask = mask;
7314 for (unsigned i = 0; i < 4; ++i) {
7315 if (mask & (1 << i))
7316 exp->operands[i] = Operand(ctx->vs_output.outputs[slot][i]);
7317 else
7318 exp->operands[i] = Operand(v1);
7319 }
7320 exp->valid_mask = false;
7321 exp->done = false;
7322 exp->compressed = false;
7323 if (is_pos)
7324 exp->dest = V_008DFC_SQ_EXP_POS + (*next_pos)++;
7325 else
7326 exp->dest = V_008DFC_SQ_EXP_PARAM + offset;
7327 ctx->block->instructions.emplace_back(std::move(exp));
7328 }
7329
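/* Packs point size, layer and viewport index into a single position export.
 * On GFX9+ the viewport index is packed into the upper 16 bits of the layer
 * channel instead of occupying its own channel. */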
7330 static void export_vs_psiz_layer_viewport(isel_context *ctx, int *next_pos)
7331 {
7332 aco_ptr<Export_instruction> exp{create_instruction<Export_instruction>(aco_opcode::exp, Format::EXP, 4, 0)};
7333 exp->enabled_mask = 0;
7334 for (unsigned i = 0; i < 4; ++i)
7335 exp->operands[i] = Operand(v1);
7336 if (ctx->vs_output.mask[VARYING_SLOT_PSIZ]) {
7337 exp->operands[0] = Operand(ctx->vs_output.outputs[VARYING_SLOT_PSIZ][0]);
7338 exp->enabled_mask |= 0x1;
7339 }
7340 if (ctx->vs_output.mask[VARYING_SLOT_LAYER]) {
7341 exp->operands[2] = Operand(ctx->vs_output.outputs[VARYING_SLOT_LAYER][0]);
7342 exp->enabled_mask |= 0x4;
7343 }
7344 if (ctx->vs_output.mask[VARYING_SLOT_VIEWPORT]) {
7345 if (ctx->options->chip_class < GFX9) {
7346 exp->operands[3] = Operand(ctx->vs_output.outputs[VARYING_SLOT_VIEWPORT][0]);
7347 exp->enabled_mask |= 0x8;
7348 } else {
7349 Builder bld(ctx->program, ctx->block);
7350
7351 Temp out = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(16u),
7352 Operand(ctx->vs_output.outputs[VARYING_SLOT_VIEWPORT][0]));
7353 if (exp->operands[2].isTemp())
7354 out = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand(out), exp->operands[2]);
7355
7356 exp->operands[2] = Operand(out);
7357 exp->enabled_mask |= 0x4;
7358 }
7359 }
7360 exp->valid_mask = false;
7361 exp->done = false;
7362 exp->compressed = false;
7363 exp->dest = V_008DFC_SQ_EXP_POS + (*next_pos)++;
7364 ctx->block->instructions.emplace_back(std::move(exp));
7365 }
7366
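/* Emits all VS exports: the position exports (position, psiz/layer/viewport
 * and clip/cull distances) in their required order first, followed by the
 * generic parameter exports consumed by the fragment shader. */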
7367 static void create_vs_exports(isel_context *ctx)
7368 {
7369 radv_vs_output_info *outinfo = &ctx->program->info->vs.outinfo;
7370
7371 if (outinfo->export_prim_id) {
7372 ctx->vs_output.mask[VARYING_SLOT_PRIMITIVE_ID] |= 0x1;
7373 ctx->vs_output.outputs[VARYING_SLOT_PRIMITIVE_ID][0] = ctx->vs_prim_id;
7374 }
7375
7376 if (ctx->options->key.has_multiview_view_index) {
7377 ctx->vs_output.mask[VARYING_SLOT_LAYER] |= 0x1;
7378 ctx->vs_output.outputs[VARYING_SLOT_LAYER][0] = as_vgpr(ctx, ctx->view_index);
7379 }
7380
7381 /* the order these position exports are created is important */
7382 int next_pos = 0;
7383 export_vs_varying(ctx, VARYING_SLOT_POS, true, &next_pos);
7384 if (outinfo->writes_pointsize || outinfo->writes_layer || outinfo->writes_viewport_index) {
7385 export_vs_psiz_layer_viewport(ctx, &next_pos);
7386 }
7387 if (ctx->num_clip_distances + ctx->num_cull_distances > 0)
7388 export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST0, true, &next_pos);
7389 if (ctx->num_clip_distances + ctx->num_cull_distances > 4)
7390 export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST1, true, &next_pos);
7391
7392 if (ctx->options->key.vs_common_out.export_clip_dists) {
7393 if (ctx->num_clip_distances + ctx->num_cull_distances > 0)
7394 export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST0, false, &next_pos);
7395 if (ctx->num_clip_distances + ctx->num_cull_distances > 4)
7396 export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST1, false, &next_pos);
7397 }
7398
7399 for (unsigned i = 0; i <= VARYING_SLOT_VAR31; ++i) {
7400 if (i < VARYING_SLOT_VAR0 && i != VARYING_SLOT_LAYER &&
7401 i != VARYING_SLOT_PRIMITIVE_ID)
7402 continue;
7403
7404 export_vs_varying(ctx, i, false, NULL);
7405 }
7406 }
7407
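/* Stores one transform feedback output to its streamout buffer with a MUBUF
 * store sized to the written component count. Offsets that do not fit into
 * the 12-bit immediate offset field are folded into the per-lane address
 * instead. */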
7408 static void emit_stream_output(isel_context *ctx,
7409 Temp const *so_buffers,
7410 Temp const *so_write_offset,
7411 const struct radv_stream_output *output)
7412 {
7413 unsigned num_comps = util_bitcount(output->component_mask);
7414 unsigned loc = output->location;
7415 unsigned buf = output->buffer;
7416 unsigned offset = output->offset;
7417
7418 assert(num_comps && num_comps <= 4);
7419 if (!num_comps || num_comps > 4)
7420 return;
7421
7422 unsigned start = ffs(output->component_mask) - 1;
7423
7424 Temp out[4];
7425 bool all_undef = true;
7426 assert(ctx->stage == vertex_vs);
7427 for (unsigned i = 0; i < num_comps; i++) {
7428 out[i] = ctx->vs_output.outputs[loc][start + i];
7429 all_undef = all_undef && !out[i].id();
7430 }
7431 if (all_undef)
7432 return;
7433
7434 Temp write_data = {ctx->program->allocateId(), RegClass(RegType::vgpr, num_comps)};
7435 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, num_comps, 1)};
7436 for (unsigned i = 0; i < num_comps; ++i)
7437 vec->operands[i] = (ctx->vs_output.mask[loc] & 1 << i) ? Operand(out[i]) : Operand(0u);
7438 vec->definitions[0] = Definition(write_data);
7439 ctx->block->instructions.emplace_back(std::move(vec));
7440
7441 aco_opcode opcode;
7442 switch (num_comps) {
7443 case 1:
7444 opcode = aco_opcode::buffer_store_dword;
7445 break;
7446 case 2:
7447 opcode = aco_opcode::buffer_store_dwordx2;
7448 break;
7449 case 3:
7450 opcode = aco_opcode::buffer_store_dwordx3;
7451 break;
7452 case 4:
7453 opcode = aco_opcode::buffer_store_dwordx4;
7454 break;
7455 }
7456
7457 aco_ptr<MUBUF_instruction> store{create_instruction<MUBUF_instruction>(opcode, Format::MUBUF, 4, 0)};
7458 store->operands[0] = Operand(so_write_offset[buf]);
7459 store->operands[1] = Operand(so_buffers[buf]);
7460 store->operands[2] = Operand((uint32_t) 0);
7461 store->operands[3] = Operand(write_data);
7462 if (offset > 4095) {
7463 /* This shouldn't happen in RADV, but it might with GL. It's easy to handle anyway. */
7464 Builder bld(ctx->program, ctx->block);
7465 store->operands[0] = bld.vadd32(bld.def(v1), Operand(offset), Operand(so_write_offset[buf]));
7466 } else {
7467 store->offset = offset;
7468 }
7469 store->offen = true;
7470 store->glc = true;
7471 store->dlc = false;
7472 store->slc = true;
7473 store->can_reorder = true;
7474 ctx->block->instructions.emplace_back(std::move(store));
7475 }
7476
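/* Emits all transform feedback stores for the given stream: loads the buffer
 * descriptors, computes per-lane write offsets from the streamout config and
 * buffer strides, and wraps the stores in a divergent if on tid < so_vtx_count
 * so that only lanes holding a valid vertex write. */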
7477 static void emit_streamout(isel_context *ctx, unsigned stream)
7478 {
7479 Builder bld(ctx->program, ctx->block);
7480
7481 Temp so_buffers[4];
7482 Temp buf_ptr = convert_pointer_to_64_bit(ctx, ctx->streamout_buffers);
7483 for (unsigned i = 0; i < 4; i++) {
7484 unsigned stride = ctx->program->info->so.strides[i];
7485 if (!stride)
7486 continue;
7487
7488 so_buffers[i] = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), buf_ptr, Operand(i * 16u));
7489 }
7490
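   /* src1 = 0x70010 means width 7, offset 16 for s_bfe_u32: bits [22:16] of
    * the streamout config hold the number of vertices that may still be
    * written to the streamout buffers. */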
7491 Temp so_vtx_count = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
7492 ctx->streamout_config, Operand(0x70010u));
7493
7494 Temp tid = bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32, bld.def(v1), Operand((uint32_t) -1),
7495 bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, bld.def(v1), Operand((uint32_t) -1), Operand(0u)));
7496
7497 Temp can_emit = bld.vopc(aco_opcode::v_cmp_gt_i32, bld.def(s2), so_vtx_count, tid);
7498
7499 if_context ic;
7500 begin_divergent_if_then(ctx, &ic, can_emit);
7501
7502 bld.reset(ctx->block);
7503
7504 Temp so_write_index = bld.vadd32(bld.def(v1), ctx->streamout_write_idx, tid);
7505
7506 Temp so_write_offset[4];
7507
7508 for (unsigned i = 0; i < 4; i++) {
7509 unsigned stride = ctx->program->info->so.strides[i];
7510 if (!stride)
7511 continue;
7512
7513 if (stride == 1) {
7514 Temp offset = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
7515 ctx->streamout_write_idx, ctx->streamout_offset[i]);
7516 Temp new_offset = bld.vadd32(bld.def(v1), offset, tid);
7517
7518 so_write_offset[i] = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), new_offset);
7519 } else {
7520 Temp offset = bld.v_mul_imm(bld.def(v1), so_write_index, stride * 4u);
7521 Temp offset2 = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(4u), ctx->streamout_offset[i]);
7522 so_write_offset[i] = bld.vadd32(bld.def(v1), offset, offset2);
7523 }
7524 }
7525
7526 for (unsigned i = 0; i < ctx->program->info->so.num_outputs; i++) {
7527 struct radv_stream_output *output =
7528 &ctx->program->info->so.outputs[i];
7529 if (stream != output->stream)
7530 continue;
7531
7532 emit_stream_output(ctx, so_buffers, so_write_offset, output);
7533 }
7534
7535 begin_divergent_if_else(ctx, &ic);
7536 end_divergent_if(ctx, &ic);
7537 }
7538
7539 } /* end namespace */
7540
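/* If BC_OPTIMIZE is enabled, the hardware sets the MSB of the PRIM_MASK input
 * instead of computing the centroid barycentrics (e.g. for fully covered
 * quads); in that case the centroid inputs are replaced by the center ones
 * with a per-lane select. */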
7541 void handle_bc_optimize(isel_context *ctx)
7542 {
7543 /* needed when SPI_PS_IN_CONTROL.BC_OPTIMIZE_DISABLE is set to 0 */
7544 Builder bld(ctx->program, ctx->block);
7545 uint32_t spi_ps_input_ena = ctx->program->config->spi_ps_input_ena;
7546 bool uses_center = G_0286CC_PERSP_CENTER_ENA(spi_ps_input_ena) || G_0286CC_LINEAR_CENTER_ENA(spi_ps_input_ena);
7547 bool uses_centroid = G_0286CC_PERSP_CENTROID_ENA(spi_ps_input_ena) || G_0286CC_LINEAR_CENTROID_ENA(spi_ps_input_ena);
7548 if (uses_center && uses_centroid) {
7549 Temp sel = bld.vopc_e64(aco_opcode::v_cmp_lt_i32, bld.hint_vcc(bld.def(s2)), ctx->prim_mask, Operand(0u));
7550
7551 if (G_0286CC_PERSP_CENTROID_ENA(spi_ps_input_ena)) {
7552 for (unsigned i = 0; i < 2; i++) {
7553 Temp new_coord = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
7554 ctx->fs_inputs[fs_input::persp_centroid_p1 + i],
7555 ctx->fs_inputs[fs_input::persp_center_p1 + i],
7556 sel);
7557 ctx->fs_inputs[fs_input::persp_centroid_p1 + i] = new_coord;
7558 }
7559 }
7560
7561 if (G_0286CC_LINEAR_CENTROID_ENA(spi_ps_input_ena)) {
7562 for (unsigned i = 0; i < 2; i++) {
7563 Temp new_coord = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
7564 ctx->fs_inputs[fs_input::linear_centroid_p1 + i],
7565 ctx->fs_inputs[fs_input::linear_center_p1 + i],
7566 sel);
7567 ctx->fs_inputs[fs_input::linear_centroid_p1 + i] = new_coord;
7568 }
7569 }
7570 }
7571 }
7572
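/* Entry point of instruction selection: translates each NIR shader into ACO
 * IR. When several stages are merged into one hardware stage (GFX9+), each
 * stage's code is wrapped in a divergent if on its lane count from
 * merged_wave_info, with a barrier between consecutive stages. */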
7573 void select_program(Program *program,
7574 unsigned shader_count,
7575 struct nir_shader *const *shaders,
7576 ac_shader_config* config,
7577 struct radv_shader_info *info,
7578 struct radv_nir_compiler_options *options)
7579 {
7580 isel_context ctx = setup_isel_context(program, shader_count, shaders, config, info, options);
7581
7582 for (unsigned i = 0; i < shader_count; i++) {
7583 nir_shader *nir = shaders[i];
7584 init_context(&ctx, nir);
7585
7586 if (!i) {
7587 add_startpgm(&ctx); /* needs to be after init_context() for FS */
7588 append_logical_start(ctx.block);
7589 }
7590
7591 if_context ic;
7592 if (shader_count >= 2) {
7593 Builder bld(ctx.program, ctx.block);
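         /* s_bfe_u32 with src1 = (width << 16) | offset extracts the 8-bit
          * lane count of stage i from merged_wave_info */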
7594 Temp count = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), ctx.merged_wave_info, Operand((8u << 16) | (i * 8u)));
7595 Temp thread_id = bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32, bld.def(v1), Operand((uint32_t) -1),
7596 bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, bld.def(v1), Operand((uint32_t) -1), Operand(0u)));
7597 Temp cond = bld.vopc(aco_opcode::v_cmp_gt_u32, bld.hint_vcc(bld.def(s2)), count, thread_id);
7598
7599 begin_divergent_if_then(&ctx, &ic, cond);
7600 }
7601
7602 if (i) {
7603 Builder bld(ctx.program, ctx.block);
7604 bld.barrier(aco_opcode::p_memory_barrier_shared); //TODO: different barriers are needed for different stages
7605 bld.sopp(aco_opcode::s_barrier);
7606 }
7607
7608 if (ctx.stage == fragment_fs)
7609 handle_bc_optimize(&ctx);
7610
7611 nir_function_impl *func = nir_shader_get_entrypoint(nir);
7612 visit_cf_list(&ctx, &func->body);
7613
7614 if (ctx.program->info->so.num_outputs /* && !ctx->is_gs_copy_shader */)
7615 emit_streamout(&ctx, 0);
7616
7617 if (ctx.stage == vertex_vs)
7618 create_vs_exports(&ctx);
7619
7620 if (shader_count >= 2) {
7621 begin_divergent_if_else(&ctx, &ic);
7622 end_divergent_if(&ctx, &ic);
7623 }
7624
7625 ralloc_free(ctx.divergent_vals);
7626 }
7627
7628 append_logical_end(ctx.block);
7629 ctx.block->kind |= block_kind_uniform;
7630 Builder bld(ctx.program, ctx.block);
7631 if (ctx.program->wb_smem_l1_on_end)
7632 bld.smem(aco_opcode::s_dcache_wb, false);
7633 bld.sopp(aco_opcode::s_endpgm);
7634
7635 /* cleanup CFG: successor lists are not filled during isel, so derive them from the predecessor lists */
7636 for (Block& BB : program->blocks) {
7637 for (unsigned idx : BB.linear_preds)
7638 program->blocks[idx].linear_succs.emplace_back(BB.index);
7639 for (unsigned idx : BB.logical_preds)
7640 program->blocks[idx].logical_succs.emplace_back(BB.index);
7641 }
7642 }
7643 }