aco: use ds_read2_b64/ds_write2_b64 (mesa.git: src/amd/compiler/aco_instruction_selection.cpp)
1 /*
2 * Copyright © 2018 Valve Corporation
3 * Copyright © 2018 Google
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22 * IN THE SOFTWARE.
23 *
24 */
25
26 #include <algorithm>
27 #include <array>
28 #include <map>
29
30 #include "ac_shader_util.h"
31 #include "aco_ir.h"
32 #include "aco_builder.h"
33 #include "aco_interface.h"
34 #include "aco_instruction_selection_setup.cpp"
35 #include "util/fast_idiv_by_const.h"
36
37 namespace aco {
38 namespace {
39
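/* RAII helper: installs the control-flow state for a loop (header index, exit block,
 * divergence flags, nest depth) and restores the previous state on destruction. */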
40 class loop_info_RAII {
41 isel_context* ctx;
42 unsigned header_idx_old;
43 Block* exit_old;
44 bool divergent_cont_old;
45 bool divergent_branch_old;
46 bool divergent_if_old;
47
48 public:
49 loop_info_RAII(isel_context* ctx, unsigned loop_header_idx, Block* loop_exit)
50 : ctx(ctx),
51 header_idx_old(ctx->cf_info.parent_loop.header_idx), exit_old(ctx->cf_info.parent_loop.exit),
52 divergent_cont_old(ctx->cf_info.parent_loop.has_divergent_continue),
53 divergent_branch_old(ctx->cf_info.parent_loop.has_divergent_branch),
54 divergent_if_old(ctx->cf_info.parent_if.is_divergent)
55 {
56 ctx->cf_info.parent_loop.header_idx = loop_header_idx;
57 ctx->cf_info.parent_loop.exit = loop_exit;
58 ctx->cf_info.parent_loop.has_divergent_continue = false;
59 ctx->cf_info.parent_loop.has_divergent_branch = false;
60 ctx->cf_info.parent_if.is_divergent = false;
61 ctx->cf_info.loop_nest_depth = ctx->cf_info.loop_nest_depth + 1;
62 }
63
64 ~loop_info_RAII()
65 {
66 ctx->cf_info.parent_loop.header_idx = header_idx_old;
67 ctx->cf_info.parent_loop.exit = exit_old;
68 ctx->cf_info.parent_loop.has_divergent_continue = divergent_cont_old;
69 ctx->cf_info.parent_loop.has_divergent_branch = divergent_branch_old;
70 ctx->cf_info.parent_if.is_divergent = divergent_if_old;
71 ctx->cf_info.loop_nest_depth = ctx->cf_info.loop_nest_depth - 1;
72 if (!ctx->cf_info.loop_nest_depth && !ctx->cf_info.parent_if.is_divergent)
73 ctx->cf_info.exec_potentially_empty = false;
74 }
75 };
76
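/* Bookkeeping for emitting an if/else: the branch condition, the saved divergence and
 * exec state, and the blocks/indices used to stitch the CFG back together. */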
77 struct if_context {
78 Temp cond;
79
80 bool divergent_old;
81 bool exec_potentially_empty_old;
82
83 unsigned BB_if_idx;
84 unsigned invert_idx;
85 bool then_branch_divergent;
86 Block BB_invert;
87 Block BB_endif;
88 };
89
90 static void visit_cf_list(struct isel_context *ctx,
91 struct exec_list *list);
92
93 static void add_logical_edge(unsigned pred_idx, Block *succ)
94 {
95 succ->logical_preds.emplace_back(pred_idx);
96 }
97
98
99 static void add_linear_edge(unsigned pred_idx, Block *succ)
100 {
101 succ->linear_preds.emplace_back(pred_idx);
102 }
103
104 static void add_edge(unsigned pred_idx, Block *succ)
105 {
106 add_logical_edge(pred_idx, succ);
107 add_linear_edge(pred_idx, succ);
108 }
109
110 static void append_logical_start(Block *b)
111 {
112 Builder(NULL, b).pseudo(aco_opcode::p_logical_start);
113 }
114
115 static void append_logical_end(Block *b)
116 {
117 Builder(NULL, b).pseudo(aco_opcode::p_logical_end);
118 }
119
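/* Returns the Temp that was pre-allocated for the given NIR SSA def. */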
120 Temp get_ssa_temp(struct isel_context *ctx, nir_ssa_def *def)
121 {
122 assert(ctx->allocated[def->index].id());
123 return ctx->allocated[def->index];
124 }
125
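/* Copies src into dst; in fragment shaders the copy is emitted as p_wqm so the value is
 * computed in whole-quad mode, optionally flagging the program as needing WQM. */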
126 Temp emit_wqm(isel_context *ctx, Temp src, Temp dst=Temp(0, s1), bool program_needs_wqm = false)
127 {
128 Builder bld(ctx->program, ctx->block);
129
130 if (!dst.id())
131 dst = bld.tmp(src.regClass());
132
133 if (ctx->stage != fragment_fs) {
134 if (!dst.id())
135 return src;
136
137 if (src.type() == RegType::vgpr || src.size() > 1)
138 bld.copy(Definition(dst), src);
139 else
140 bld.sop1(aco_opcode::s_mov_b32, Definition(dst), src);
141 return dst;
142 }
143
144 bld.pseudo(aco_opcode::p_wqm, Definition(dst), src);
145 ctx->program->needs_wqm |= program_needs_wqm;
146 return dst;
147 }
148
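/* Ensures a value lives in VGPRs: SGPR values are copied, VGPR values are returned as-is. */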
149 Temp as_vgpr(isel_context *ctx, Temp val)
150 {
151 if (val.type() == RegType::sgpr) {
152 Builder bld(ctx->program, ctx->block);
153 return bld.copy(bld.def(RegType::vgpr, val.size()), val);
154 }
155 assert(val.type() == RegType::vgpr);
156 return val;
157 }
158
159 //assumes a != 0xffffffff
160 void emit_v_div_u32(isel_context *ctx, Temp dst, Temp a, uint32_t b)
161 {
162 assert(b != 0);
163 Builder bld(ctx->program, ctx->block);
164
165 if (util_is_power_of_two_or_zero(b)) {
166 bld.vop2(aco_opcode::v_lshrrev_b32, Definition(dst), Operand((uint32_t)util_logbase2(b)), a);
167 return;
168 }
169
170 util_fast_udiv_info info = util_compute_fast_udiv_info(b, 32, 32);
171
172 assert(info.multiplier <= 0xffffffff);
173
174 bool pre_shift = info.pre_shift != 0;
175 bool increment = info.increment != 0;
176 bool multiply = true;
177 bool post_shift = info.post_shift != 0;
178
179 if (!pre_shift && !increment && !multiply && !post_shift) {
180 bld.vop1(aco_opcode::v_mov_b32, Definition(dst), a);
181 return;
182 }
183
184 Temp pre_shift_dst = a;
185 if (pre_shift) {
186 pre_shift_dst = (increment || multiply || post_shift) ? bld.tmp(v1) : dst;
187 bld.vop2(aco_opcode::v_lshrrev_b32, Definition(pre_shift_dst), Operand((uint32_t)info.pre_shift), a);
188 }
189
190 Temp increment_dst = pre_shift_dst;
191 if (increment) {
192 increment_dst = (post_shift || multiply) ? bld.tmp(v1) : dst;
193 bld.vadd32(Definition(increment_dst), Operand((uint32_t) info.increment), pre_shift_dst);
194 }
195
196 Temp multiply_dst = increment_dst;
197 if (multiply) {
198 multiply_dst = post_shift ? bld.tmp(v1) : dst;
199 bld.vop3(aco_opcode::v_mul_hi_u32, Definition(multiply_dst), increment_dst,
200 bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand((uint32_t)info.multiplier)));
201 }
202
203 if (post_shift) {
204 bld.vop2(aco_opcode::v_lshrrev_b32, Definition(dst), Operand((uint32_t)info.post_shift), multiply_dst);
205 }
206 }
207
208 void emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, Temp dst)
209 {
210 Builder bld(ctx->program, ctx->block);
211 bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand(idx));
212 }
213
214
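/* Returns element idx of src with register class dst_rc, reusing elements cached by
 * emit_split_vector when possible. */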
215 Temp emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, RegClass dst_rc)
216 {
217 /* no need to extract the whole vector */
218 if (src.regClass() == dst_rc) {
219 assert(idx == 0);
220 return src;
221 }
222 assert(src.size() > idx);
223 Builder bld(ctx->program, ctx->block);
224 auto it = ctx->allocated_vec.find(src.id());
225 /* the size check needs to be early because elements other than 0 may be garbage */
226 if (it != ctx->allocated_vec.end() && it->second[0].size() == dst_rc.size()) {
227 if (it->second[idx].regClass() == dst_rc) {
228 return it->second[idx];
229 } else {
230 assert(dst_rc.size() == it->second[idx].regClass().size());
231 assert(dst_rc.type() == RegType::vgpr && it->second[idx].type() == RegType::sgpr);
232 return bld.copy(bld.def(dst_rc), it->second[idx]);
233 }
234 }
235
236 if (src.size() == dst_rc.size()) {
237 assert(idx == 0);
238 return bld.copy(bld.def(dst_rc), src);
239 } else {
240 Temp dst = bld.tmp(dst_rc);
241 emit_extract_vector(ctx, src, idx, dst);
242 return dst;
243 }
244 }
245
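/* Splits vec_src into num_components elements and caches them in allocated_vec so that
 * later extracts can reuse them. */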
246 void emit_split_vector(isel_context* ctx, Temp vec_src, unsigned num_components)
247 {
248 if (num_components == 1)
249 return;
250 if (ctx->allocated_vec.find(vec_src.id()) != ctx->allocated_vec.end())
251 return;
252 aco_ptr<Pseudo_instruction> split{create_instruction<Pseudo_instruction>(aco_opcode::p_split_vector, Format::PSEUDO, 1, num_components)};
253 split->operands[0] = Operand(vec_src);
254 std::array<Temp,4> elems;
255 for (unsigned i = 0; i < num_components; i++) {
256 elems[i] = {ctx->program->allocateId(), RegClass(vec_src.type(), vec_src.size() / num_components)};
257 split->definitions[i] = Definition(elems[i]);
258 }
259 ctx->block->instructions.emplace_back(std::move(split));
260 ctx->allocated_vec.emplace(vec_src.id(), elems);
261 }
262
263 /* This vector expansion uses a mask to determine which elements in the new vector
264 * come from the original vector. The other elements are undefined. */
265 void expand_vector(isel_context* ctx, Temp vec_src, Temp dst, unsigned num_components, unsigned mask)
266 {
267 emit_split_vector(ctx, vec_src, util_bitcount(mask));
268
269 if (vec_src == dst)
270 return;
271
272 Builder bld(ctx->program, ctx->block);
273 if (num_components == 1) {
274 if (dst.type() == RegType::sgpr)
275 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec_src);
276 else
277 bld.copy(Definition(dst), vec_src);
278 return;
279 }
280
281 unsigned component_size = dst.size() / num_components;
282 std::array<Temp,4> elems;
283
284 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
285 vec->definitions[0] = Definition(dst);
286 unsigned k = 0;
287 for (unsigned i = 0; i < num_components; i++) {
288 if (mask & (1 << i)) {
289 Temp src = emit_extract_vector(ctx, vec_src, k++, RegClass(vec_src.type(), component_size));
290 if (dst.type() == RegType::sgpr)
291 src = bld.as_uniform(src);
292 vec->operands[i] = Operand(src);
293 } else {
294 vec->operands[i] = Operand(0u);
295 }
296 elems[i] = vec->operands[i].getTemp();
297 }
298 ctx->block->instructions.emplace_back(std::move(vec));
299 ctx->allocated_vec.emplace(dst.id(), elems);
300 }
301
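/* Converts a uniform boolean (s1, valid in SCC) into a lane mask (s2) with s_cselect_b64,
 * optionally hinting the result to VCC; s2 values are passed through. */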
302 Temp as_divergent_bool(isel_context *ctx, Temp val, bool vcc_hint)
303 {
304 if (val.regClass() == s2) {
305 return val;
306 } else {
307 assert(val.regClass() == s1);
308 Builder bld(ctx->program, ctx->block);
309 Definition& def = bld.sop2(aco_opcode::s_cselect_b64, bld.def(s2),
310 Operand((uint32_t) -1), Operand(0u), bld.scc(val)).def(0);
311 if (vcc_hint)
312 def.setHint(vcc);
313 return def.getTemp();
314 }
315 }
316
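/* Converts a lane-mask boolean (s2) into a uniform scalar boolean (s1/SCC) by comparing
 * it against zero. */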
317 Temp as_uniform_bool(isel_context *ctx, Temp val)
318 {
319 if (val.regClass() == s1) {
320 return val;
321 } else {
322 assert(val.regClass() == s2);
323 Builder bld(ctx->program, ctx->block);
324 /* if we're currently in WQM mode, ensure that the source is also computed in WQM */
325 return bld.sopc(aco_opcode::s_cmp_lg_u64, bld.def(s1, scc), Operand(0u), emit_wqm(ctx, val));
326 }
327 }
328
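/* Returns the given ALU source as a Temp of 'size' components, applying the NIR swizzle
 * by extracting and recombining vector elements where necessary. */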
329 Temp get_alu_src(struct isel_context *ctx, nir_alu_src src, unsigned size=1)
330 {
331 if (src.src.ssa->num_components == 1 && src.swizzle[0] == 0 && size == 1)
332 return get_ssa_temp(ctx, src.src.ssa);
333
334 if (src.src.ssa->num_components == size) {
335 bool identity_swizzle = true;
336 for (unsigned i = 0; identity_swizzle && i < size; i++) {
337 if (src.swizzle[i] != i)
338 identity_swizzle = false;
339 }
340 if (identity_swizzle)
341 return get_ssa_temp(ctx, src.src.ssa);
342 }
343
344 Temp vec = get_ssa_temp(ctx, src.src.ssa);
345 unsigned elem_size = vec.size() / src.src.ssa->num_components;
346 assert(elem_size > 0); /* TODO: 8 and 16-bit vectors not supported */
347 assert(vec.size() % elem_size == 0);
348
349 RegClass elem_rc = RegClass(vec.type(), elem_size);
350 if (size == 1) {
351 return emit_extract_vector(ctx, vec, src.swizzle[0], elem_rc);
352 } else {
353 assert(size <= 4);
354 std::array<Temp,4> elems;
355 aco_ptr<Pseudo_instruction> vec_instr{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, size, 1)};
356 for (unsigned i = 0; i < size; ++i) {
357 elems[i] = emit_extract_vector(ctx, vec, src.swizzle[i], elem_rc);
358 vec_instr->operands[i] = Operand{elems[i]};
359 }
360 Temp dst{ctx->program->allocateId(), RegClass(vec.type(), elem_size * size)};
361 vec_instr->definitions[0] = Definition(dst);
362 ctx->block->instructions.emplace_back(std::move(vec_instr));
363 ctx->allocated_vec.emplace(dst.id(), elems);
364 return dst;
365 }
366 }
367
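/* Widens a 32-bit address to a 64-bit pointer by appending the high address bits;
 * VGPR pointers are first made uniform with v_readfirstlane_b32. */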
368 Temp convert_pointer_to_64_bit(isel_context *ctx, Temp ptr)
369 {
370 if (ptr.size() == 2)
371 return ptr;
372 Builder bld(ctx->program, ctx->block);
373 if (ptr.type() == RegType::vgpr)
374 ptr = bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), ptr);
375 return bld.pseudo(aco_opcode::p_create_vector, bld.def(s2),
376 ptr, Operand((unsigned)ctx->options->address32_hi));
377 }
378
379 void emit_sop2_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst, bool writes_scc)
380 {
381 aco_ptr<SOP2_instruction> sop2{create_instruction<SOP2_instruction>(op, Format::SOP2, 2, writes_scc ? 2 : 1)};
382 sop2->operands[0] = Operand(get_alu_src(ctx, instr->src[0]));
383 sop2->operands[1] = Operand(get_alu_src(ctx, instr->src[1]));
384 sop2->definitions[0] = Definition(dst);
385 if (writes_scc)
386 sop2->definitions[1] = Definition(ctx->program->allocateId(), scc, s1);
387 ctx->block->instructions.emplace_back(std::move(sop2));
388 }
389
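/* Emits a VOP2 instruction; an SGPR in src1 is legalized by swapping commutative operands,
 * falling back to the VOP3-encoded form, or copying the operand to a VGPR. */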
390 void emit_vop2_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst, bool commutative, bool swap_srcs=false)
391 {
392 Builder bld(ctx->program, ctx->block);
393 Temp src0 = get_alu_src(ctx, instr->src[swap_srcs ? 1 : 0]);
394 Temp src1 = get_alu_src(ctx, instr->src[swap_srcs ? 0 : 1]);
395 if (src1.type() == RegType::sgpr) {
396 if (commutative && src0.type() == RegType::vgpr) {
397 Temp t = src0;
398 src0 = src1;
399 src1 = t;
400 } else if (src0.type() == RegType::vgpr &&
401 op != aco_opcode::v_madmk_f32 &&
402 op != aco_opcode::v_madak_f32 &&
403 op != aco_opcode::v_madmk_f16 &&
404 op != aco_opcode::v_madak_f16) {
405 /* If the instruction is not commutative, we emit a VOP3A instruction */
406 bld.vop2_e64(op, Definition(dst), src0, src1);
407 return;
408 } else {
409 src1 = bld.copy(bld.def(RegType::vgpr, src1.size()), src1); //TODO: as_vgpr
410 }
411 }
412 bld.vop2(op, Definition(dst), src0, src1);
413 }
414
415 void emit_vop3a_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst)
416 {
417 Temp src0 = get_alu_src(ctx, instr->src[0]);
418 Temp src1 = get_alu_src(ctx, instr->src[1]);
419 Temp src2 = get_alu_src(ctx, instr->src[2]);
420
 421 /* ensure that the instruction has at most 1 sgpr operand;
 422 * the optimizer will inline constants for us */
423 if (src0.type() == RegType::sgpr && src1.type() == RegType::sgpr)
424 src0 = as_vgpr(ctx, src0);
425 if (src1.type() == RegType::sgpr && src2.type() == RegType::sgpr)
426 src1 = as_vgpr(ctx, src1);
427 if (src2.type() == RegType::sgpr && src0.type() == RegType::sgpr)
428 src2 = as_vgpr(ctx, src2);
429
430 Builder bld(ctx->program, ctx->block);
431 bld.vop3(op, Definition(dst), src0, src1, src2);
432 }
433
434 void emit_vop1_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst)
435 {
436 Builder bld(ctx->program, ctx->block);
437 bld.vop1(op, Definition(dst), get_alu_src(ctx, instr->src[0]));
438 }
439
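/* Emits a VOPC comparison; if src1 is an SGPR, the operands are swapped and the compare
 * opcode is mirrored (or src1 is copied to a VGPR). */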
440 void emit_vopc_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst)
441 {
442 Temp src0 = get_alu_src(ctx, instr->src[0]);
443 Temp src1 = get_alu_src(ctx, instr->src[1]);
444 aco_ptr<Instruction> vopc;
445 if (src1.type() == RegType::sgpr) {
446 if (src0.type() == RegType::vgpr) {
447 /* to swap the operands, we might also have to change the opcode */
448 switch (op) {
449 case aco_opcode::v_cmp_lt_f32:
450 op = aco_opcode::v_cmp_gt_f32;
451 break;
452 case aco_opcode::v_cmp_ge_f32:
453 op = aco_opcode::v_cmp_le_f32;
454 break;
455 case aco_opcode::v_cmp_lt_i32:
456 op = aco_opcode::v_cmp_gt_i32;
457 break;
458 case aco_opcode::v_cmp_ge_i32:
459 op = aco_opcode::v_cmp_le_i32;
460 break;
461 case aco_opcode::v_cmp_lt_u32:
462 op = aco_opcode::v_cmp_gt_u32;
463 break;
464 case aco_opcode::v_cmp_ge_u32:
465 op = aco_opcode::v_cmp_le_u32;
466 break;
467 case aco_opcode::v_cmp_lt_f64:
468 op = aco_opcode::v_cmp_gt_f64;
469 break;
470 case aco_opcode::v_cmp_ge_f64:
471 op = aco_opcode::v_cmp_le_f64;
472 break;
473 case aco_opcode::v_cmp_lt_i64:
474 op = aco_opcode::v_cmp_gt_i64;
475 break;
476 case aco_opcode::v_cmp_ge_i64:
477 op = aco_opcode::v_cmp_le_i64;
478 break;
479 case aco_opcode::v_cmp_lt_u64:
480 op = aco_opcode::v_cmp_gt_u64;
481 break;
482 case aco_opcode::v_cmp_ge_u64:
483 op = aco_opcode::v_cmp_le_u64;
484 break;
485 default: /* eq and ne are commutative */
486 break;
487 }
488 Temp t = src0;
489 src0 = src1;
490 src1 = t;
491 } else {
492 src1 = as_vgpr(ctx, src1);
493 }
494 }
495 Builder bld(ctx->program, ctx->block);
496 bld.vopc(op, Definition(dst), src0, src1).def(0).setHint(vcc);
497 }
498
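/* Emits a comparison: VOPC producing a lane mask for divergent (s2) results,
 * SOPC producing SCC for uniform (s1) results. */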
499 void emit_comparison(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst)
500 {
501 if (dst.regClass() == s2) {
502 emit_vopc_instruction(ctx, instr, op, dst);
503 if (!ctx->divergent_vals[instr->dest.dest.ssa.index])
504 emit_split_vector(ctx, dst, 2);
505 } else if (dst.regClass() == s1) {
506 Temp src0 = get_alu_src(ctx, instr->src[0]);
507 Temp src1 = get_alu_src(ctx, instr->src[1]);
508 assert(src0.type() == RegType::sgpr && src1.type() == RegType::sgpr);
509
510 Builder bld(ctx->program, ctx->block);
511 bld.sopc(op, bld.scc(Definition(dst)), src0, src1);
512
513 } else {
514 assert(false);
515 }
516 }
517
518 void emit_boolean_logic(isel_context *ctx, nir_alu_instr *instr, aco_opcode op32, aco_opcode op64, Temp dst)
519 {
520 Builder bld(ctx->program, ctx->block);
521 Temp src0 = get_alu_src(ctx, instr->src[0]);
522 Temp src1 = get_alu_src(ctx, instr->src[1]);
523 if (dst.regClass() == s2) {
524 bld.sop2(op64, Definition(dst), bld.def(s1, scc),
525 as_divergent_bool(ctx, src0, false), as_divergent_bool(ctx, src1, false));
526 } else {
527 assert(dst.regClass() == s1);
528 bld.sop2(op32, bld.def(s1), bld.scc(Definition(dst)),
529 as_uniform_bool(ctx, src0), as_uniform_bool(ctx, src1));
530 }
531 }
532
533
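/* Lowers nir_op_bcsel: v_cndmask_b32 for VGPR results, s_cselect for uniform values,
 * and mask arithmetic for divergent boolean results. */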
534 void emit_bcsel(isel_context *ctx, nir_alu_instr *instr, Temp dst)
535 {
536 Builder bld(ctx->program, ctx->block);
537 Temp cond = get_alu_src(ctx, instr->src[0]);
538 Temp then = get_alu_src(ctx, instr->src[1]);
539 Temp els = get_alu_src(ctx, instr->src[2]);
540
541 if (dst.type() == RegType::vgpr) {
542 cond = as_divergent_bool(ctx, cond, true);
543
544 aco_ptr<Instruction> bcsel;
545 if (dst.size() == 1) {
546 then = as_vgpr(ctx, then);
547 els = as_vgpr(ctx, els);
548
549 bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), els, then, cond);
550 } else if (dst.size() == 2) {
551 Temp then_lo = bld.tmp(v1), then_hi = bld.tmp(v1);
552 bld.pseudo(aco_opcode::p_split_vector, Definition(then_lo), Definition(then_hi), then);
553 Temp else_lo = bld.tmp(v1), else_hi = bld.tmp(v1);
554 bld.pseudo(aco_opcode::p_split_vector, Definition(else_lo), Definition(else_hi), els);
555
556 Temp dst0 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_lo, then_lo, cond);
557 Temp dst1 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_hi, then_hi, cond);
558
559 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
560 } else {
561 fprintf(stderr, "Unimplemented NIR instr bit size: ");
562 nir_print_instr(&instr->instr, stderr);
563 fprintf(stderr, "\n");
564 }
565 return;
566 }
567
568 if (instr->dest.dest.ssa.bit_size != 1) { /* uniform condition and values in sgpr */
569 if (dst.regClass() == s1 || dst.regClass() == s2) {
570 assert((then.regClass() == s1 || then.regClass() == s2) && els.regClass() == then.regClass());
571 aco_opcode op = dst.regClass() == s1 ? aco_opcode::s_cselect_b32 : aco_opcode::s_cselect_b64;
572 bld.sop2(op, Definition(dst), then, els, bld.scc(as_uniform_bool(ctx, cond)));
573 } else {
574 fprintf(stderr, "Unimplemented uniform bcsel bit size: ");
575 nir_print_instr(&instr->instr, stderr);
576 fprintf(stderr, "\n");
577 }
578 return;
579 }
580
581 /* boolean bcsel */
582 assert(instr->dest.dest.ssa.bit_size == 1);
583
584 if (dst.regClass() == s1)
585 cond = as_uniform_bool(ctx, cond);
586
587 if (cond.regClass() == s1) { /* uniform selection */
588 aco_opcode op;
589 if (dst.regClass() == s2) {
590 op = aco_opcode::s_cselect_b64;
591 then = as_divergent_bool(ctx, then, false);
592 els = as_divergent_bool(ctx, els, false);
593 } else {
594 assert(dst.regClass() == s1);
595 op = aco_opcode::s_cselect_b32;
596 then = as_uniform_bool(ctx, then);
597 els = as_uniform_bool(ctx, els);
598 }
599 bld.sop2(op, Definition(dst), then, els, bld.scc(cond));
600 return;
601 }
602
603 /* divergent boolean bcsel
 604 * this implements bcsel on bools: dst = s0 ? s1 : s2,
 605 * which is lowered to: dst = (s0 & s1) | (~s0 & s2) */
606 assert (dst.regClass() == s2);
607 then = as_divergent_bool(ctx, then, false);
608 els = as_divergent_bool(ctx, els, false);
609
610 if (cond.id() != then.id())
611 then = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), cond, then);
612
613 if (cond.id() == els.id())
614 bld.sop1(aco_opcode::s_mov_b64, Definition(dst), then);
615 else
616 bld.sop2(aco_opcode::s_or_b64, Definition(dst), bld.def(s1, scc), then,
617 bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.def(s1, scc), els, cond));
618 }
619
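/* Translates a single NIR ALU instruction into ACO IR, dispatching on the opcode and the
 * destination register class. */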
620 void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
621 {
622 if (!instr->dest.dest.is_ssa) {
623 fprintf(stderr, "nir alu dst not in ssa: ");
624 nir_print_instr(&instr->instr, stderr);
625 fprintf(stderr, "\n");
626 abort();
627 }
628 Builder bld(ctx->program, ctx->block);
629 Temp dst = get_ssa_temp(ctx, &instr->dest.dest.ssa);
630 switch(instr->op) {
631 case nir_op_vec2:
632 case nir_op_vec3:
633 case nir_op_vec4: {
634 std::array<Temp,4> elems;
635 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, instr->dest.dest.ssa.num_components, 1)};
636 for (unsigned i = 0; i < instr->dest.dest.ssa.num_components; ++i) {
637 elems[i] = get_alu_src(ctx, instr->src[i]);
638 vec->operands[i] = Operand{elems[i]};
639 }
640 vec->definitions[0] = Definition(dst);
641 ctx->block->instructions.emplace_back(std::move(vec));
642 ctx->allocated_vec.emplace(dst.id(), elems);
643 break;
644 }
645 case nir_op_mov: {
646 Temp src = get_alu_src(ctx, instr->src[0]);
647 aco_ptr<Instruction> mov;
648 if (dst.type() == RegType::sgpr) {
649 if (src.type() == RegType::vgpr)
650 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), src);
651 else if (src.regClass() == s1)
652 bld.sop1(aco_opcode::s_mov_b32, Definition(dst), src);
653 else if (src.regClass() == s2)
654 bld.sop1(aco_opcode::s_mov_b64, Definition(dst), src);
655 else
 656 unreachable("wrong src register class for nir_op_mov");
657 } else if (dst.regClass() == v1) {
658 bld.vop1(aco_opcode::v_mov_b32, Definition(dst), src);
659 } else if (dst.regClass() == v2) {
660 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src);
661 } else {
662 nir_print_instr(&instr->instr, stderr);
663 unreachable("Should have been lowered to scalar.");
664 }
665 break;
666 }
667 case nir_op_inot: {
668 Temp src = get_alu_src(ctx, instr->src[0]);
669 /* uniform booleans */
670 if (instr->dest.dest.ssa.bit_size == 1 && dst.regClass() == s1) {
671 if (src.regClass() == s1) {
672 /* in this case, src is either 1 or 0 */
673 bld.sop2(aco_opcode::s_xor_b32, bld.def(s1), bld.scc(Definition(dst)), Operand(1u), src);
674 } else {
675 /* src is either exec_mask or 0 */
676 assert(src.regClass() == s2);
677 bld.sopc(aco_opcode::s_cmp_eq_u64, bld.scc(Definition(dst)), Operand(0u), src);
678 }
679 } else if (dst.regClass() == v1) {
680 emit_vop1_instruction(ctx, instr, aco_opcode::v_not_b32, dst);
681 } else if (dst.type() == RegType::sgpr) {
682 aco_opcode opcode = dst.size() == 1 ? aco_opcode::s_not_b32 : aco_opcode::s_not_b64;
683 bld.sop1(opcode, Definition(dst), bld.def(s1, scc), src);
684 } else {
685 fprintf(stderr, "Unimplemented NIR instr bit size: ");
686 nir_print_instr(&instr->instr, stderr);
687 fprintf(stderr, "\n");
688 }
689 break;
690 }
691 case nir_op_ineg: {
692 Temp src = get_alu_src(ctx, instr->src[0]);
693 if (dst.regClass() == v1) {
694 bld.vsub32(Definition(dst), Operand(0u), Operand(src));
695 } else if (dst.regClass() == s1) {
696 bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand((uint32_t) -1), src);
697 } else if (dst.size() == 2) {
698 Temp src0 = bld.tmp(dst.type(), 1);
699 Temp src1 = bld.tmp(dst.type(), 1);
700 bld.pseudo(aco_opcode::p_split_vector, Definition(src0), Definition(src1), src);
701
702 if (dst.regClass() == s2) {
703 Temp carry = bld.tmp(s1);
704 Temp dst0 = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(carry)), Operand(0u), src0);
705 Temp dst1 = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), Operand(0u), src1, carry);
706 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
707 } else {
708 Temp lower = bld.tmp(v1);
709 Temp borrow = bld.vsub32(Definition(lower), Operand(0u), src0, true).def(1).getTemp();
710 Temp upper = bld.vsub32(bld.def(v1), Operand(0u), src1, false, borrow);
711 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
712 }
713 } else {
714 fprintf(stderr, "Unimplemented NIR instr bit size: ");
715 nir_print_instr(&instr->instr, stderr);
716 fprintf(stderr, "\n");
717 }
718 break;
719 }
720 case nir_op_iabs: {
721 if (dst.regClass() == s1) {
722 bld.sop1(aco_opcode::s_abs_i32, Definition(dst), bld.def(s1, scc), get_alu_src(ctx, instr->src[0]));
723 } else if (dst.regClass() == v1) {
724 Temp src = get_alu_src(ctx, instr->src[0]);
725 bld.vop2(aco_opcode::v_max_i32, Definition(dst), src, bld.vsub32(bld.def(v1), Operand(0u), src));
726 } else {
727 fprintf(stderr, "Unimplemented NIR instr bit size: ");
728 nir_print_instr(&instr->instr, stderr);
729 fprintf(stderr, "\n");
730 }
731 break;
732 }
733 case nir_op_isign: {
734 Temp src = get_alu_src(ctx, instr->src[0]);
735 if (dst.regClass() == s1) {
736 Temp tmp = bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), src, Operand(31u));
737 Temp gtz = bld.sopc(aco_opcode::s_cmp_gt_i32, bld.def(s1, scc), src, Operand(0u));
738 bld.sop2(aco_opcode::s_add_i32, Definition(dst), bld.def(s1, scc), gtz, tmp);
739 } else if (dst.regClass() == s2) {
740 Temp neg = bld.sop2(aco_opcode::s_ashr_i64, bld.def(s2), bld.def(s1, scc), src, Operand(63u));
741 Temp neqz = bld.sopc(aco_opcode::s_cmp_lg_u64, bld.def(s1, scc), src, Operand(0u));
742 bld.sop2(aco_opcode::s_or_b64, Definition(dst), bld.def(s1, scc), neg, neqz);
743 } else if (dst.regClass() == v1) {
744 Temp tmp = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), src);
745 Temp gtz = bld.vopc(aco_opcode::v_cmp_ge_i32, bld.hint_vcc(bld.def(s2)), Operand(0u), src);
746 bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand(1u), tmp, gtz);
747 } else if (dst.regClass() == v2) {
748 Temp upper = emit_extract_vector(ctx, src, 1, v1);
749 Temp neg = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), upper);
750 Temp gtz = bld.vopc(aco_opcode::v_cmp_ge_i64, bld.hint_vcc(bld.def(s2)), Operand(0u), src);
751 Temp lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(1u), neg, gtz);
752 upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), neg, gtz);
753 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
754 } else {
755 fprintf(stderr, "Unimplemented NIR instr bit size: ");
756 nir_print_instr(&instr->instr, stderr);
757 fprintf(stderr, "\n");
758 }
759 break;
760 }
761 case nir_op_imax: {
762 if (dst.regClass() == v1) {
763 emit_vop2_instruction(ctx, instr, aco_opcode::v_max_i32, dst, true);
764 } else if (dst.regClass() == s1) {
765 emit_sop2_instruction(ctx, instr, aco_opcode::s_max_i32, dst, true);
766 } else {
767 fprintf(stderr, "Unimplemented NIR instr bit size: ");
768 nir_print_instr(&instr->instr, stderr);
769 fprintf(stderr, "\n");
770 }
771 break;
772 }
773 case nir_op_umax: {
774 if (dst.regClass() == v1) {
775 emit_vop2_instruction(ctx, instr, aco_opcode::v_max_u32, dst, true);
776 } else if (dst.regClass() == s1) {
777 emit_sop2_instruction(ctx, instr, aco_opcode::s_max_u32, dst, true);
778 } else {
779 fprintf(stderr, "Unimplemented NIR instr bit size: ");
780 nir_print_instr(&instr->instr, stderr);
781 fprintf(stderr, "\n");
782 }
783 break;
784 }
785 case nir_op_imin: {
786 if (dst.regClass() == v1) {
787 emit_vop2_instruction(ctx, instr, aco_opcode::v_min_i32, dst, true);
788 } else if (dst.regClass() == s1) {
789 emit_sop2_instruction(ctx, instr, aco_opcode::s_min_i32, dst, true);
790 } else {
791 fprintf(stderr, "Unimplemented NIR instr bit size: ");
792 nir_print_instr(&instr->instr, stderr);
793 fprintf(stderr, "\n");
794 }
795 break;
796 }
797 case nir_op_umin: {
798 if (dst.regClass() == v1) {
799 emit_vop2_instruction(ctx, instr, aco_opcode::v_min_u32, dst, true);
800 } else if (dst.regClass() == s1) {
801 emit_sop2_instruction(ctx, instr, aco_opcode::s_min_u32, dst, true);
802 } else {
803 fprintf(stderr, "Unimplemented NIR instr bit size: ");
804 nir_print_instr(&instr->instr, stderr);
805 fprintf(stderr, "\n");
806 }
807 break;
808 }
809 case nir_op_ior: {
810 if (instr->dest.dest.ssa.bit_size == 1) {
811 emit_boolean_logic(ctx, instr, aco_opcode::s_or_b32, aco_opcode::s_or_b64, dst);
812 } else if (dst.regClass() == v1) {
813 emit_vop2_instruction(ctx, instr, aco_opcode::v_or_b32, dst, true);
814 } else if (dst.regClass() == s1) {
815 emit_sop2_instruction(ctx, instr, aco_opcode::s_or_b32, dst, true);
816 } else if (dst.regClass() == s2) {
817 emit_sop2_instruction(ctx, instr, aco_opcode::s_or_b64, dst, true);
818 } else {
819 fprintf(stderr, "Unimplemented NIR instr bit size: ");
820 nir_print_instr(&instr->instr, stderr);
821 fprintf(stderr, "\n");
822 }
823 break;
824 }
825 case nir_op_iand: {
826 if (instr->dest.dest.ssa.bit_size == 1) {
827 emit_boolean_logic(ctx, instr, aco_opcode::s_and_b32, aco_opcode::s_and_b64, dst);
828 } else if (dst.regClass() == v1) {
829 emit_vop2_instruction(ctx, instr, aco_opcode::v_and_b32, dst, true);
830 } else if (dst.regClass() == s1) {
831 emit_sop2_instruction(ctx, instr, aco_opcode::s_and_b32, dst, true);
832 } else if (dst.regClass() == s2) {
833 emit_sop2_instruction(ctx, instr, aco_opcode::s_and_b64, dst, true);
834 } else {
835 fprintf(stderr, "Unimplemented NIR instr bit size: ");
836 nir_print_instr(&instr->instr, stderr);
837 fprintf(stderr, "\n");
838 }
839 break;
840 }
841 case nir_op_ixor: {
842 if (instr->dest.dest.ssa.bit_size == 1) {
843 emit_boolean_logic(ctx, instr, aco_opcode::s_xor_b32, aco_opcode::s_xor_b64, dst);
844 } else if (dst.regClass() == v1) {
845 emit_vop2_instruction(ctx, instr, aco_opcode::v_xor_b32, dst, true);
846 } else if (dst.regClass() == s1) {
847 emit_sop2_instruction(ctx, instr, aco_opcode::s_xor_b32, dst, true);
848 } else if (dst.regClass() == s2) {
849 emit_sop2_instruction(ctx, instr, aco_opcode::s_xor_b64, dst, true);
850 } else {
851 fprintf(stderr, "Unimplemented NIR instr bit size: ");
852 nir_print_instr(&instr->instr, stderr);
853 fprintf(stderr, "\n");
854 }
855 break;
856 }
857 case nir_op_ushr: {
858 if (dst.regClass() == v1) {
859 emit_vop2_instruction(ctx, instr, aco_opcode::v_lshrrev_b32, dst, false, true);
860 } else if (dst.regClass() == v2) {
861 bld.vop3(aco_opcode::v_lshrrev_b64, Definition(dst),
862 get_alu_src(ctx, instr->src[1]), get_alu_src(ctx, instr->src[0]));
863 } else if (dst.regClass() == s2) {
864 emit_sop2_instruction(ctx, instr, aco_opcode::s_lshr_b64, dst, true);
865 } else if (dst.regClass() == s1) {
866 emit_sop2_instruction(ctx, instr, aco_opcode::s_lshr_b32, dst, true);
867 } else {
868 fprintf(stderr, "Unimplemented NIR instr bit size: ");
869 nir_print_instr(&instr->instr, stderr);
870 fprintf(stderr, "\n");
871 }
872 break;
873 }
874 case nir_op_ishl: {
875 if (dst.regClass() == v1) {
876 emit_vop2_instruction(ctx, instr, aco_opcode::v_lshlrev_b32, dst, false, true);
877 } else if (dst.regClass() == v2) {
878 bld.vop3(aco_opcode::v_lshlrev_b64, Definition(dst),
879 get_alu_src(ctx, instr->src[1]), get_alu_src(ctx, instr->src[0]));
880 } else if (dst.regClass() == s1) {
881 emit_sop2_instruction(ctx, instr, aco_opcode::s_lshl_b32, dst, true);
882 } else if (dst.regClass() == s2) {
883 emit_sop2_instruction(ctx, instr, aco_opcode::s_lshl_b64, dst, true);
884 } else {
885 fprintf(stderr, "Unimplemented NIR instr bit size: ");
886 nir_print_instr(&instr->instr, stderr);
887 fprintf(stderr, "\n");
888 }
889 break;
890 }
891 case nir_op_ishr: {
892 if (dst.regClass() == v1) {
893 emit_vop2_instruction(ctx, instr, aco_opcode::v_ashrrev_i32, dst, false, true);
894 } else if (dst.regClass() == v2) {
895 bld.vop3(aco_opcode::v_ashrrev_i64, Definition(dst),
896 get_alu_src(ctx, instr->src[1]), get_alu_src(ctx, instr->src[0]));
897 } else if (dst.regClass() == s1) {
898 emit_sop2_instruction(ctx, instr, aco_opcode::s_ashr_i32, dst, true);
899 } else if (dst.regClass() == s2) {
900 emit_sop2_instruction(ctx, instr, aco_opcode::s_ashr_i64, dst, true);
901 } else {
902 fprintf(stderr, "Unimplemented NIR instr bit size: ");
903 nir_print_instr(&instr->instr, stderr);
904 fprintf(stderr, "\n");
905 }
906 break;
907 }
908 case nir_op_find_lsb: {
909 Temp src = get_alu_src(ctx, instr->src[0]);
910 if (src.regClass() == s1) {
911 bld.sop1(aco_opcode::s_ff1_i32_b32, Definition(dst), src);
912 } else if (src.regClass() == v1) {
913 emit_vop1_instruction(ctx, instr, aco_opcode::v_ffbl_b32, dst);
914 } else if (src.regClass() == s2) {
915 bld.sop1(aco_opcode::s_ff1_i32_b64, Definition(dst), src);
916 } else {
917 fprintf(stderr, "Unimplemented NIR instr bit size: ");
918 nir_print_instr(&instr->instr, stderr);
919 fprintf(stderr, "\n");
920 }
921 break;
922 }
923 case nir_op_ufind_msb:
924 case nir_op_ifind_msb: {
925 Temp src = get_alu_src(ctx, instr->src[0]);
926 if (src.regClass() == s1 || src.regClass() == s2) {
927 aco_opcode op = src.regClass() == s2 ?
928 (instr->op == nir_op_ufind_msb ? aco_opcode::s_flbit_i32_b64 : aco_opcode::s_flbit_i32_i64) :
929 (instr->op == nir_op_ufind_msb ? aco_opcode::s_flbit_i32_b32 : aco_opcode::s_flbit_i32);
930 Temp msb_rev = bld.sop1(op, bld.def(s1), src);
931
932 Builder::Result sub = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc),
933 Operand(src.size() * 32u - 1u), msb_rev);
934 Temp msb = sub.def(0).getTemp();
935 Temp carry = sub.def(1).getTemp();
936
937 bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand((uint32_t)-1), msb, carry);
938 } else if (src.regClass() == v1) {
939 aco_opcode op = instr->op == nir_op_ufind_msb ? aco_opcode::v_ffbh_u32 : aco_opcode::v_ffbh_i32;
940 Temp msb_rev = bld.tmp(v1);
941 emit_vop1_instruction(ctx, instr, op, msb_rev);
942 Temp msb = bld.tmp(v1);
943 Temp carry = bld.vsub32(Definition(msb), Operand(31u), Operand(msb_rev), true).def(1).getTemp();
944 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), msb, Operand((uint32_t)-1), carry);
945 } else {
946 fprintf(stderr, "Unimplemented NIR instr bit size: ");
947 nir_print_instr(&instr->instr, stderr);
948 fprintf(stderr, "\n");
949 }
950 break;
951 }
952 case nir_op_bitfield_reverse: {
953 if (dst.regClass() == s1) {
954 bld.sop1(aco_opcode::s_brev_b32, Definition(dst), get_alu_src(ctx, instr->src[0]));
955 } else if (dst.regClass() == v1) {
956 bld.vop1(aco_opcode::v_bfrev_b32, Definition(dst), get_alu_src(ctx, instr->src[0]));
957 } else {
958 fprintf(stderr, "Unimplemented NIR instr bit size: ");
959 nir_print_instr(&instr->instr, stderr);
960 fprintf(stderr, "\n");
961 }
962 break;
963 }
964 case nir_op_iadd: {
965 if (dst.regClass() == s1) {
966 emit_sop2_instruction(ctx, instr, aco_opcode::s_add_u32, dst, true);
967 break;
968 }
969
970 Temp src0 = get_alu_src(ctx, instr->src[0]);
971 Temp src1 = get_alu_src(ctx, instr->src[1]);
972 if (dst.regClass() == v1) {
973 bld.vadd32(Definition(dst), Operand(src0), Operand(src1));
974 break;
975 }
976
977 assert(src0.size() == 2 && src1.size() == 2);
978 Temp src00 = bld.tmp(src0.type(), 1);
979 Temp src01 = bld.tmp(dst.type(), 1);
980 bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
981 Temp src10 = bld.tmp(src1.type(), 1);
982 Temp src11 = bld.tmp(dst.type(), 1);
983 bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
984
985 if (dst.regClass() == s2) {
986 Temp carry = bld.tmp(s1);
987 Temp dst0 = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10);
988 Temp dst1 = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), src01, src11, bld.scc(carry));
989 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
990 } else if (dst.regClass() == v2) {
991 Temp dst0 = bld.tmp(v1);
992 Temp carry = bld.vadd32(Definition(dst0), src00, src10, true).def(1).getTemp();
993 Temp dst1 = bld.vadd32(bld.def(v1), src01, src11, false, carry);
994 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
995 } else {
996 fprintf(stderr, "Unimplemented NIR instr bit size: ");
997 nir_print_instr(&instr->instr, stderr);
998 fprintf(stderr, "\n");
999 }
1000 break;
1001 }
1002 case nir_op_uadd_sat: {
1003 Temp src0 = get_alu_src(ctx, instr->src[0]);
1004 Temp src1 = get_alu_src(ctx, instr->src[1]);
1005 if (dst.regClass() == s1) {
1006 Temp tmp = bld.tmp(s1), carry = bld.tmp(s1);
1007 bld.sop2(aco_opcode::s_add_u32, Definition(tmp), bld.scc(Definition(carry)),
1008 src0, src1);
1009 bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand((uint32_t) -1), tmp, bld.scc(carry));
1010 } else if (dst.regClass() == v1) {
1011 if (ctx->options->chip_class >= GFX9) {
1012 aco_ptr<VOP3A_instruction> add{create_instruction<VOP3A_instruction>(aco_opcode::v_add_u32, asVOP3(Format::VOP2), 2, 1)};
1013 add->operands[0] = Operand(src0);
1014 add->operands[1] = Operand(src1);
1015 add->definitions[0] = Definition(dst);
1016 add->clamp = 1;
1017 ctx->block->instructions.emplace_back(std::move(add));
1018 } else {
1019 if (src1.regClass() != v1)
1020 std::swap(src0, src1);
1021 assert(src1.regClass() == v1);
1022 Temp tmp = bld.tmp(v1);
1023 Temp carry = bld.vadd32(Definition(tmp), src0, src1, true).def(1).getTemp();
1024 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), tmp, Operand((uint32_t) -1), carry);
1025 }
1026 } else {
1027 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1028 nir_print_instr(&instr->instr, stderr);
1029 fprintf(stderr, "\n");
1030 }
1031 break;
1032 }
1033 case nir_op_uadd_carry: {
1034 Temp src0 = get_alu_src(ctx, instr->src[0]);
1035 Temp src1 = get_alu_src(ctx, instr->src[1]);
1036 if (dst.regClass() == s1) {
1037 bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(dst)), src0, src1);
1038 break;
1039 }
1040 if (dst.regClass() == v1) {
1041 Temp carry = bld.vadd32(bld.def(v1), src0, src1, true).def(1).getTemp();
1042 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand(1u), carry);
1043 break;
1044 }
1045
1046 Temp src00 = bld.tmp(src0.type(), 1);
1047 Temp src01 = bld.tmp(dst.type(), 1);
1048 bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1049 Temp src10 = bld.tmp(src1.type(), 1);
1050 Temp src11 = bld.tmp(dst.type(), 1);
1051 bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1052 if (dst.regClass() == s2) {
1053 Temp carry = bld.tmp(s1);
1054 bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10);
1055 carry = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.scc(bld.def(s1)), src01, src11, bld.scc(carry)).def(1).getTemp();
1056 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), carry, Operand(0u));
1057 } else if (dst.regClass() == v2) {
1058 Temp carry = bld.vadd32(bld.def(v1), src00, src10, true).def(1).getTemp();
1059 carry = bld.vadd32(bld.def(v1), src01, src11, true, carry).def(1).getTemp();
1060 carry = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), Operand(1u), carry);
1061 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), carry, Operand(0u));
1062 } else {
1063 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1064 nir_print_instr(&instr->instr, stderr);
1065 fprintf(stderr, "\n");
1066 }
1067 break;
1068 }
1069 case nir_op_isub: {
1070 if (dst.regClass() == s1) {
1071 emit_sop2_instruction(ctx, instr, aco_opcode::s_sub_i32, dst, true);
1072 break;
1073 }
1074
1075 Temp src0 = get_alu_src(ctx, instr->src[0]);
1076 Temp src1 = get_alu_src(ctx, instr->src[1]);
1077 if (dst.regClass() == v1) {
1078 bld.vsub32(Definition(dst), src0, src1);
1079 break;
1080 }
1081
1082 Temp src00 = bld.tmp(src0.type(), 1);
1083 Temp src01 = bld.tmp(dst.type(), 1);
1084 bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1085 Temp src10 = bld.tmp(src1.type(), 1);
1086 Temp src11 = bld.tmp(dst.type(), 1);
1087 bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1088 if (dst.regClass() == s2) {
1089 Temp carry = bld.tmp(s1);
1090 Temp dst0 = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10);
1091 Temp dst1 = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), src01, src11, carry);
1092 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
1093 } else if (dst.regClass() == v2) {
1094 Temp lower = bld.tmp(v1);
1095 Temp borrow = bld.vsub32(Definition(lower), src00, src10, true).def(1).getTemp();
1096 Temp upper = bld.vsub32(bld.def(v1), src01, src11, false, borrow);
1097 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1098 } else {
1099 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1100 nir_print_instr(&instr->instr, stderr);
1101 fprintf(stderr, "\n");
1102 }
1103 break;
1104 }
1105 case nir_op_usub_borrow: {
1106 Temp src0 = get_alu_src(ctx, instr->src[0]);
1107 Temp src1 = get_alu_src(ctx, instr->src[1]);
1108 if (dst.regClass() == s1) {
1109 bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(dst)), src0, src1);
1110 break;
1111 } else if (dst.regClass() == v1) {
1112 Temp borrow = bld.vsub32(bld.def(v1), src0, src1, true).def(1).getTemp();
1113 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand(1u), borrow);
1114 break;
1115 }
1116
1117 Temp src00 = bld.tmp(src0.type(), 1);
1118 Temp src01 = bld.tmp(dst.type(), 1);
1119 bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1120 Temp src10 = bld.tmp(src1.type(), 1);
1121 Temp src11 = bld.tmp(dst.type(), 1);
1122 bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1123 if (dst.regClass() == s2) {
1124 Temp borrow = bld.tmp(s1);
1125 bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), src00, src10);
1126 borrow = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.scc(bld.def(s1)), src01, src11, bld.scc(borrow)).def(1).getTemp();
1127 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), borrow, Operand(0u));
1128 } else if (dst.regClass() == v2) {
1129 Temp borrow = bld.vsub32(bld.def(v1), src00, src10, true).def(1).getTemp();
1130 borrow = bld.vsub32(bld.def(v1), src01, src11, true, Operand(borrow)).def(1).getTemp();
1131 borrow = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), Operand(1u), borrow);
1132 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), borrow, Operand(0u));
1133 } else {
1134 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1135 nir_print_instr(&instr->instr, stderr);
1136 fprintf(stderr, "\n");
1137 }
1138 break;
1139 }
1140 case nir_op_imul: {
1141 if (dst.regClass() == v1) {
1142 bld.vop3(aco_opcode::v_mul_lo_u32, Definition(dst),
1143 get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1144 } else if (dst.regClass() == s1) {
1145 emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_i32, dst, false);
1146 } else {
1147 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1148 nir_print_instr(&instr->instr, stderr);
1149 fprintf(stderr, "\n");
1150 }
1151 break;
1152 }
1153 case nir_op_umul_high: {
1154 if (dst.regClass() == v1) {
1155 bld.vop3(aco_opcode::v_mul_hi_u32, Definition(dst), get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1156 } else if (dst.regClass() == s1 && ctx->options->chip_class >= GFX9) {
1157 bld.sop2(aco_opcode::s_mul_hi_u32, Definition(dst), get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1158 } else if (dst.regClass() == s1) {
1159 Temp tmp = bld.vop3(aco_opcode::v_mul_hi_u32, bld.def(v1), get_alu_src(ctx, instr->src[0]),
1160 as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
1161 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
1162 } else {
1163 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1164 nir_print_instr(&instr->instr, stderr);
1165 fprintf(stderr, "\n");
1166 }
1167 break;
1168 }
1169 case nir_op_imul_high: {
1170 if (dst.regClass() == v1) {
1171 bld.vop3(aco_opcode::v_mul_hi_i32, Definition(dst), get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1172 } else if (dst.regClass() == s1 && ctx->options->chip_class >= GFX9) {
1173 bld.sop2(aco_opcode::s_mul_hi_i32, Definition(dst), get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1174 } else if (dst.regClass() == s1) {
1175 Temp tmp = bld.vop3(aco_opcode::v_mul_hi_i32, bld.def(v1), get_alu_src(ctx, instr->src[0]),
1176 as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
1177 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
1178 } else {
1179 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1180 nir_print_instr(&instr->instr, stderr);
1181 fprintf(stderr, "\n");
1182 }
1183 break;
1184 }
1185 case nir_op_fmul: {
1186 if (dst.size() == 1) {
1187 emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_f32, dst, true);
1188 } else if (dst.size() == 2) {
1189 bld.vop3(aco_opcode::v_mul_f64, Definition(dst), get_alu_src(ctx, instr->src[0]),
1190 as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
1191 } else {
1192 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1193 nir_print_instr(&instr->instr, stderr);
1194 fprintf(stderr, "\n");
1195 }
1196 break;
1197 }
1198 case nir_op_fadd: {
1199 if (dst.size() == 1) {
1200 emit_vop2_instruction(ctx, instr, aco_opcode::v_add_f32, dst, true);
1201 } else if (dst.size() == 2) {
1202 bld.vop3(aco_opcode::v_add_f64, Definition(dst), get_alu_src(ctx, instr->src[0]),
1203 as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
1204 } else {
1205 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1206 nir_print_instr(&instr->instr, stderr);
1207 fprintf(stderr, "\n");
1208 }
1209 break;
1210 }
1211 case nir_op_fsub: {
1212 Temp src0 = get_alu_src(ctx, instr->src[0]);
1213 Temp src1 = get_alu_src(ctx, instr->src[1]);
1214 if (dst.size() == 1) {
1215 if (src1.type() == RegType::vgpr || src0.type() != RegType::vgpr)
1216 emit_vop2_instruction(ctx, instr, aco_opcode::v_sub_f32, dst, false);
1217 else
1218 emit_vop2_instruction(ctx, instr, aco_opcode::v_subrev_f32, dst, true);
1219 } else if (dst.size() == 2) {
1220 Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst),
1221 get_alu_src(ctx, instr->src[0]),
1222 as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
1223 VOP3A_instruction* sub = static_cast<VOP3A_instruction*>(add);
1224 sub->neg[1] = true;
1225 } else {
1226 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1227 nir_print_instr(&instr->instr, stderr);
1228 fprintf(stderr, "\n");
1229 }
1230 break;
1231 }
1232 case nir_op_fmax: {
1233 if (dst.size() == 1) {
1234 emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f32, dst, true);
1235 } else if (dst.size() == 2) {
1236 bld.vop3(aco_opcode::v_max_f64, Definition(dst),
1237 get_alu_src(ctx, instr->src[0]),
1238 as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
1239 } else {
1240 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1241 nir_print_instr(&instr->instr, stderr);
1242 fprintf(stderr, "\n");
1243 }
1244 break;
1245 }
1246 case nir_op_fmin: {
1247 if (dst.size() == 1) {
1248 emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f32, dst, true);
1249 } else if (dst.size() == 2) {
1250 bld.vop3(aco_opcode::v_min_f64, Definition(dst),
1251 get_alu_src(ctx, instr->src[0]),
1252 as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
1253 } else {
1254 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1255 nir_print_instr(&instr->instr, stderr);
1256 fprintf(stderr, "\n");
1257 }
1258 break;
1259 }
1260 case nir_op_fmax3: {
1261 if (dst.size() == 1) {
1262 emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_f32, dst);
1263 } else {
1264 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1265 nir_print_instr(&instr->instr, stderr);
1266 fprintf(stderr, "\n");
1267 }
1268 break;
1269 }
1270 case nir_op_fmin3: {
1271 if (dst.size() == 1) {
1272 emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_f32, dst);
1273 } else {
1274 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1275 nir_print_instr(&instr->instr, stderr);
1276 fprintf(stderr, "\n");
1277 }
1278 break;
1279 }
1280 case nir_op_fmed3: {
1281 if (dst.size() == 1) {
1282 emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_f32, dst);
1283 } else {
1284 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1285 nir_print_instr(&instr->instr, stderr);
1286 fprintf(stderr, "\n");
1287 }
1288 break;
1289 }
1290 case nir_op_umax3: {
1291 if (dst.size() == 1) {
1292 emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_u32, dst);
1293 } else {
1294 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1295 nir_print_instr(&instr->instr, stderr);
1296 fprintf(stderr, "\n");
1297 }
1298 break;
1299 }
1300 case nir_op_umin3: {
1301 if (dst.size() == 1) {
1302 emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_u32, dst);
1303 } else {
1304 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1305 nir_print_instr(&instr->instr, stderr);
1306 fprintf(stderr, "\n");
1307 }
1308 break;
1309 }
1310 case nir_op_umed3: {
1311 if (dst.size() == 1) {
1312 emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_u32, dst);
1313 } else {
1314 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1315 nir_print_instr(&instr->instr, stderr);
1316 fprintf(stderr, "\n");
1317 }
1318 break;
1319 }
1320 case nir_op_imax3: {
1321 if (dst.size() == 1) {
1322 emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_i32, dst);
1323 } else {
1324 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1325 nir_print_instr(&instr->instr, stderr);
1326 fprintf(stderr, "\n");
1327 }
1328 break;
1329 }
1330 case nir_op_imin3: {
1331 if (dst.size() == 1) {
1332 emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_i32, dst);
1333 } else {
1334 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1335 nir_print_instr(&instr->instr, stderr);
1336 fprintf(stderr, "\n");
1337 }
1338 break;
1339 }
1340 case nir_op_imed3: {
1341 if (dst.size() == 1) {
1342 emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_i32, dst);
1343 } else {
1344 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1345 nir_print_instr(&instr->instr, stderr);
1346 fprintf(stderr, "\n");
1347 }
1348 break;
1349 }
1350 case nir_op_cube_face_coord: {
1351 Temp in = get_alu_src(ctx, instr->src[0], 3);
1352 Temp src[3] = { emit_extract_vector(ctx, in, 0, v1),
1353 emit_extract_vector(ctx, in, 1, v1),
1354 emit_extract_vector(ctx, in, 2, v1) };
1355 Temp ma = bld.vop3(aco_opcode::v_cubema_f32, bld.def(v1), src[0], src[1], src[2]);
1356 ma = bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), ma);
1357 Temp sc = bld.vop3(aco_opcode::v_cubesc_f32, bld.def(v1), src[0], src[1], src[2]);
1358 Temp tc = bld.vop3(aco_opcode::v_cubetc_f32, bld.def(v1), src[0], src[1], src[2]);
1359 sc = bld.vop2(aco_opcode::v_madak_f32, bld.def(v1), sc, ma, Operand(0x3f000000u/*0.5*/));
1360 tc = bld.vop2(aco_opcode::v_madak_f32, bld.def(v1), tc, ma, Operand(0x3f000000u/*0.5*/));
1361 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), sc, tc);
1362 break;
1363 }
1364 case nir_op_cube_face_index: {
1365 Temp in = get_alu_src(ctx, instr->src[0], 3);
1366 Temp src[3] = { emit_extract_vector(ctx, in, 0, v1),
1367 emit_extract_vector(ctx, in, 1, v1),
1368 emit_extract_vector(ctx, in, 2, v1) };
1369 bld.vop3(aco_opcode::v_cubeid_f32, Definition(dst), src[0], src[1], src[2]);
1370 break;
1371 }
1372 case nir_op_bcsel: {
1373 emit_bcsel(ctx, instr, dst);
1374 break;
1375 }
1376 case nir_op_frsq: {
1377 if (dst.size() == 1) {
1378 emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f32, dst);
1379 } else if (dst.size() == 2) {
1380 emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f64, dst);
1381 } else {
1382 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1383 nir_print_instr(&instr->instr, stderr);
1384 fprintf(stderr, "\n");
1385 }
1386 break;
1387 }
1388 case nir_op_fneg: {
1389 Temp src = get_alu_src(ctx, instr->src[0]);
1390 if (dst.size() == 1) {
1391 bld.vop2(aco_opcode::v_xor_b32, Definition(dst), Operand(0x80000000u), as_vgpr(ctx, src));
1392 } else if (dst.size() == 2) {
1393 Temp upper = bld.tmp(v1), lower = bld.tmp(v1);
1394 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
1395 upper = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), Operand(0x80000000u), upper);
1396 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1397 } else {
1398 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1399 nir_print_instr(&instr->instr, stderr);
1400 fprintf(stderr, "\n");
1401 }
1402 break;
1403 }
1404 case nir_op_fabs: {
1405 Temp src = get_alu_src(ctx, instr->src[0]);
1406 if (dst.size() == 1) {
1407 bld.vop2(aco_opcode::v_and_b32, Definition(dst), Operand(0x7FFFFFFFu), as_vgpr(ctx, src));
1408 } else if (dst.size() == 2) {
1409 Temp upper = bld.tmp(v1), lower = bld.tmp(v1);
1410 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
1411 upper = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7FFFFFFFu), upper);
1412 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1413 } else {
1414 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1415 nir_print_instr(&instr->instr, stderr);
1416 fprintf(stderr, "\n");
1417 }
1418 break;
1419 }
1420 case nir_op_fsat: {
1421 Temp src = get_alu_src(ctx, instr->src[0]);
1422 if (dst.size() == 1) {
1423 bld.vop3(aco_opcode::v_med3_f32, Definition(dst), Operand(0u), Operand(0x3f800000u), src);
1424 } else if (dst.size() == 2) {
1425 Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst), src, Operand(0u));
1426 VOP3A_instruction* vop3 = static_cast<VOP3A_instruction*>(add);
1427 vop3->clamp = true;
1428 } else {
1429 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1430 nir_print_instr(&instr->instr, stderr);
1431 fprintf(stderr, "\n");
1432 }
1433 break;
1434 }
1435 case nir_op_flog2: {
1436 if (dst.size() == 1) {
1437 emit_vop1_instruction(ctx, instr, aco_opcode::v_log_f32, dst);
1438 } else {
1439 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1440 nir_print_instr(&instr->instr, stderr);
1441 fprintf(stderr, "\n");
1442 }
1443 break;
1444 }
1445 case nir_op_frcp: {
1446 if (dst.size() == 1) {
1447 emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f32, dst);
1448 } else if (dst.size() == 2) {
1449 emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f64, dst);
1450 } else {
1451 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1452 nir_print_instr(&instr->instr, stderr);
1453 fprintf(stderr, "\n");
1454 }
1455 break;
1456 }
1457 case nir_op_fexp2: {
1458 if (dst.size() == 1) {
1459 emit_vop1_instruction(ctx, instr, aco_opcode::v_exp_f32, dst);
1460 } else {
1461 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1462 nir_print_instr(&instr->instr, stderr);
1463 fprintf(stderr, "\n");
1464 }
1465 break;
1466 }
1467 case nir_op_fsqrt: {
1468 if (dst.size() == 1) {
1469 emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f32, dst);
1470 } else if (dst.size() == 2) {
1471 emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f64, dst);
1472 } else {
1473 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1474 nir_print_instr(&instr->instr, stderr);
1475 fprintf(stderr, "\n");
1476 }
1477 break;
1478 }
1479 case nir_op_ffract: {
1480 if (dst.size() == 1) {
1481 emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f32, dst);
1482 } else if (dst.size() == 2) {
1483 emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f64, dst);
1484 } else {
1485 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1486 nir_print_instr(&instr->instr, stderr);
1487 fprintf(stderr, "\n");
1488 }
1489 break;
1490 }
1491 case nir_op_ffloor: {
1492 if (dst.size() == 1) {
1493 emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f32, dst);
1494 } else if (dst.size() == 2) {
1495 emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f64, dst);
1496 } else {
1497 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1498 nir_print_instr(&instr->instr, stderr);
1499 fprintf(stderr, "\n");
1500 }
1501 break;
1502 }
1503 case nir_op_fceil: {
1504 if (dst.size() == 1) {
1505 emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f32, dst);
1506 } else if (dst.size() == 2) {
1507 emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f64, dst);
1508 } else {
1509 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1510 nir_print_instr(&instr->instr, stderr);
1511 fprintf(stderr, "\n");
1512 }
1513 break;
1514 }
1515 case nir_op_ftrunc: {
1516 if (dst.size() == 1) {
1517 emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f32, dst);
1518 } else if (dst.size() == 2) {
1519 emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f64, dst);
1520 } else {
1521 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1522 nir_print_instr(&instr->instr, stderr);
1523 fprintf(stderr, "\n");
1524 }
1525 break;
1526 }
1527 case nir_op_fround_even: {
1528 if (dst.size() == 1) {
1529 emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f32, dst);
1530 } else if (dst.size() == 2) {
1531 emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f64, dst);
1532 } else {
1533 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1534 nir_print_instr(&instr->instr, stderr);
1535 fprintf(stderr, "\n");
1536 }
1537 break;
1538 }
1539 case nir_op_fsin:
1540 case nir_op_fcos: {
1541 Temp src = get_alu_src(ctx, instr->src[0]);
1542 aco_ptr<Instruction> norm;
1543 if (dst.size() == 1) {
1544 Temp tmp;
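/* 0x3e22f983 is 1/(2*PI): v_sin_f32/v_cos_f32 expect the angle pre-scaled into revolutions */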
1545 Operand half_pi(0x3e22f983u);
1546 if (src.type() == RegType::sgpr)
1547 tmp = bld.vop2_e64(aco_opcode::v_mul_f32, bld.def(v1), half_pi, src);
1548 else
1549 tmp = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), half_pi, src);
1550
1551 /* before GFX9, v_sin_f32 and v_cos_f32 had a valid input domain of [-256, +256] */
1552 if (ctx->options->chip_class < GFX9)
1553 tmp = bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), tmp);
1554
1555 aco_opcode opcode = instr->op == nir_op_fsin ? aco_opcode::v_sin_f32 : aco_opcode::v_cos_f32;
1556 bld.vop1(opcode, Definition(dst), tmp);
1557 } else {
1558 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1559 nir_print_instr(&instr->instr, stderr);
1560 fprintf(stderr, "\n");
1561 }
1562 break;
1563 }
1564 case nir_op_ldexp: {
1565 if (dst.size() == 1) {
1566 bld.vop3(aco_opcode::v_ldexp_f32, Definition(dst),
1567 as_vgpr(ctx, get_alu_src(ctx, instr->src[0])),
1568 get_alu_src(ctx, instr->src[1]));
1569 } else if (dst.size() == 2) {
1570 bld.vop3(aco_opcode::v_ldexp_f64, Definition(dst),
1571 as_vgpr(ctx, get_alu_src(ctx, instr->src[0])),
1572 get_alu_src(ctx, instr->src[1]));
1573 } else {
1574 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1575 nir_print_instr(&instr->instr, stderr);
1576 fprintf(stderr, "\n");
1577 }
1578 break;
1579 }
1580 case nir_op_frexp_sig: {
1581 if (dst.size() == 1) {
1582 bld.vop1(aco_opcode::v_frexp_mant_f32, Definition(dst),
1583 get_alu_src(ctx, instr->src[0]));
1584 } else if (dst.size() == 2) {
1585 bld.vop1(aco_opcode::v_frexp_mant_f64, Definition(dst),
1586 get_alu_src(ctx, instr->src[0]));
1587 } else {
1588 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1589 nir_print_instr(&instr->instr, stderr);
1590 fprintf(stderr, "\n");
1591 }
1592 break;
1593 }
1594 case nir_op_frexp_exp: {
1595 if (instr->src[0].src.ssa->bit_size == 32) {
1596 bld.vop1(aco_opcode::v_frexp_exp_i32_f32, Definition(dst),
1597 get_alu_src(ctx, instr->src[0]));
1598 } else if (instr->src[0].src.ssa->bit_size == 64) {
1599 bld.vop1(aco_opcode::v_frexp_exp_i32_f64, Definition(dst),
1600 get_alu_src(ctx, instr->src[0]));
1601 } else {
1602 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1603 nir_print_instr(&instr->instr, stderr);
1604 fprintf(stderr, "\n");
1605 }
1606 break;
1607 }
1608 case nir_op_fsign: {
1609 Temp src = as_vgpr(ctx, get_alu_src(ctx, instr->src[0]));
1610 if (dst.size() == 1) {
1611 Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f32, bld.hint_vcc(bld.def(s2)), Operand(0u), src);
1612 src = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0x3f800000u), src, cond);
1613 cond = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(s2)), Operand(0u), src);
1614 bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0xbf800000u), src, cond);
1615 } else if (dst.size() == 2) {
1616 Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f64, bld.hint_vcc(bld.def(s2)), Operand(0u), src);
1617 Temp tmp = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0x3FF00000u));
1618 Temp upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp, src, cond);
1619
1620 cond = bld.vopc(aco_opcode::v_cmp_le_f64, bld.hint_vcc(bld.def(s2)), Operand(0u), src);
1621 tmp = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0xBFF00000u));
1622 upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), tmp, upper, cond);
1623
1624 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand(0u), upper);
1625 } else {
1626 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1627 nir_print_instr(&instr->instr, stderr);
1628 fprintf(stderr, "\n");
1629 }
1630 break;
1631 }
1632 case nir_op_f2f32: {
1633 if (instr->src[0].src.ssa->bit_size == 64) {
1634 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f64, dst);
1635 } else {
1636 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1637 nir_print_instr(&instr->instr, stderr);
1638 fprintf(stderr, "\n");
1639 }
1640 break;
1641 }
1642 case nir_op_f2f64: {
1643 if (instr->src[0].src.ssa->bit_size == 32) {
1644 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f64_f32, dst);
1645 } else {
1646 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1647 nir_print_instr(&instr->instr, stderr);
1648 fprintf(stderr, "\n");
1649 }
1650 break;
1651 }
1652 case nir_op_i2f32: {
1653 assert(dst.size() == 1);
1654 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_i32, dst);
1655 break;
1656 }
1657 case nir_op_i2f64: {
1658 if (instr->src[0].src.ssa->bit_size == 32) {
1659 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f64_i32, dst);
1660 } else if (instr->src[0].src.ssa->bit_size == 64) {
1661 Temp src = get_alu_src(ctx, instr->src[0]);
1662 RegClass rc = RegClass(src.type(), 1);
1663 Temp lower = bld.tmp(rc), upper = bld.tmp(rc);
1664 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
1665 lower = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), lower);
1666 upper = bld.vop1(aco_opcode::v_cvt_f64_i32, bld.def(v2), upper);
1667 upper = bld.vop3(aco_opcode::v_ldexp_f64, bld.def(v2), upper, Operand(32u));
1668 bld.vop3(aco_opcode::v_add_f64, Definition(dst), lower, upper);
1669
1670 } else {
1671 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1672 nir_print_instr(&instr->instr, stderr);
1673 fprintf(stderr, "\n");
1674 }
1675 break;
1676 }
1677 case nir_op_u2f32: {
1678 assert(dst.size() == 1);
1679 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_u32, dst);
1680 break;
1681 }
1682 case nir_op_u2f64: {
1683 if (instr->src[0].src.ssa->bit_size == 32) {
1684 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f64_u32, dst);
1685 } else if (instr->src[0].src.ssa->bit_size == 64) {
1686 Temp src = get_alu_src(ctx, instr->src[0]);
1687 RegClass rc = RegClass(src.type(), 1);
1688 Temp lower = bld.tmp(rc), upper = bld.tmp(rc);
1689 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
1690 lower = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), lower);
1691 upper = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), upper);
1692 upper = bld.vop3(aco_opcode::v_ldexp_f64, bld.def(v2), upper, Operand(32u));
1693 bld.vop3(aco_opcode::v_add_f64, Definition(dst), lower, upper);
1694 } else {
1695 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1696 nir_print_instr(&instr->instr, stderr);
1697 fprintf(stderr, "\n");
1698 }
1699 break;
1700 }
1701 case nir_op_f2i32: {
1702 Temp src = get_alu_src(ctx, instr->src[0]);
1703 if (instr->src[0].src.ssa->bit_size == 32) {
1704 if (dst.type() == RegType::vgpr)
1705 bld.vop1(aco_opcode::v_cvt_i32_f32, Definition(dst), src);
1706 else
1707 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
1708 bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), src));
1709
1710 } else if (instr->src[0].src.ssa->bit_size == 64) {
1711 if (dst.type() == RegType::vgpr)
1712 bld.vop1(aco_opcode::v_cvt_i32_f64, Definition(dst), src);
1713 else
1714 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
1715 bld.vop1(aco_opcode::v_cvt_i32_f64, bld.def(v1), src));
1716
1717 } else {
1718 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1719 nir_print_instr(&instr->instr, stderr);
1720 fprintf(stderr, "\n");
1721 }
1722 break;
1723 }
1724 case nir_op_f2u32: {
1725 Temp src = get_alu_src(ctx, instr->src[0]);
1726 if (instr->src[0].src.ssa->bit_size == 32) {
1727 if (dst.type() == RegType::vgpr)
1728 bld.vop1(aco_opcode::v_cvt_u32_f32, Definition(dst), src);
1729 else
1730 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
1731 bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), src));
1732
1733 } else if (instr->src[0].src.ssa->bit_size == 64) {
1734 if (dst.type() == RegType::vgpr)
1735 bld.vop1(aco_opcode::v_cvt_u32_f64, Definition(dst), src);
1736 else
1737 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
1738 bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), src));
1739
1740 } else {
1741 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1742 nir_print_instr(&instr->instr, stderr);
1743 fprintf(stderr, "\n");
1744 }
1745 break;
1746 }
1747 case nir_op_f2i64: {
1748 Temp src = get_alu_src(ctx, instr->src[0]);
1749 if (instr->src[0].src.ssa->bit_size == 32 && dst.type() == RegType::vgpr) {
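/* Manual float->int64 lowering: take the exponent from v_frexp_exp (clamped to [0, 64]),
 * rebuild the 24-bit mantissa with its implicit bit, place it in the high dword and shift it
 * right by (63 - exponent), saturating when the exponent does not fit. The sign is applied
 * at the end by xor-ing with and subtracting the broadcast sign word (a conditional negate). */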
1750 Temp exponent = bld.vop1(aco_opcode::v_frexp_exp_i32_f32, bld.def(v1), src);
1751 exponent = bld.vop3(aco_opcode::v_med3_i32, bld.def(v1), Operand(0x0u), exponent, Operand(64u));
1752 Temp mantissa = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7fffffu), src);
1753 Temp sign = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), src);
1754 mantissa = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand(0x800000u), mantissa);
1755 mantissa = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(7u), mantissa);
1756 mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(0u), mantissa);
1757 Temp new_exponent = bld.tmp(v1);
1758 Temp borrow = bld.vsub32(Definition(new_exponent), Operand(63u), exponent, true).def(1).getTemp();
1759 mantissa = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), new_exponent, mantissa);
1760 Temp saturate = bld.vop1(aco_opcode::v_bfrev_b32, bld.def(v1), Operand(0xfffffffeu));
1761 Temp lower = bld.tmp(v1), upper = bld.tmp(v1);
1762 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
1763 lower = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), lower, Operand(0xffffffffu), borrow);
1764 upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), upper, saturate, borrow);
1765 lower = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), sign, lower);
1766 upper = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), sign, upper);
1767 Temp new_lower = bld.tmp(v1);
1768 borrow = bld.vsub32(Definition(new_lower), lower, sign, true).def(1).getTemp();
1769 Temp new_upper = bld.vsub32(bld.def(v1), upper, sign, false, borrow);
1770 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), new_lower, new_upper);
1771
1772 } else if (instr->src[0].src.ssa->bit_size == 32 && dst.type() == RegType::sgpr) {
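/* Same lowering as the VGPR path above, done with scalar instructions: the exponent is read
 * straight out of the float bit pattern with s_bfe and the saturate/negate steps go through SCC. */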
1773 if (src.type() == RegType::vgpr)
1774 src = bld.as_uniform(src);
1775 Temp exponent = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), src, Operand(0x80017u));
1776 exponent = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), exponent, Operand(126u));
1777 exponent = bld.sop2(aco_opcode::s_max_u32, bld.def(s1), bld.def(s1, scc), Operand(0u), exponent);
1778 exponent = bld.sop2(aco_opcode::s_min_u32, bld.def(s1), bld.def(s1, scc), Operand(64u), exponent);
1779 Temp mantissa = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), Operand(0x7fffffu), src);
1780 Temp sign = bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), src, Operand(31u));
1781 mantissa = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), Operand(0x800000u), mantissa);
1782 mantissa = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), mantissa, Operand(7u));
1783 mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), mantissa);
1784 exponent = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), Operand(63u), exponent);
1785 mantissa = bld.sop2(aco_opcode::s_lshr_b64, bld.def(s2), bld.def(s1, scc), mantissa, exponent);
1786 Temp cond = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), exponent, Operand(0xffffffffu)); // exp >= 64
1787 Temp saturate = bld.sop1(aco_opcode::s_brev_b64, bld.def(s2), Operand(0xfffffffeu));
1788 mantissa = bld.sop2(aco_opcode::s_cselect_b64, bld.def(s2), saturate, mantissa, cond);
1789 Temp lower = bld.tmp(s1), upper = bld.tmp(s1);
1790 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
1791 lower = bld.sop2(aco_opcode::s_xor_b32, bld.def(s1), bld.def(s1, scc), sign, lower);
1792 upper = bld.sop2(aco_opcode::s_xor_b32, bld.def(s1), bld.def(s1, scc), sign, upper);
1793 Temp borrow = bld.tmp(s1);
1794 lower = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), lower, sign);
1795 upper = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), upper, sign, borrow);
1796 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1797
1798 } else if (instr->src[0].src.ssa->bit_size == 64) {
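/* 0x3DF00000 and 0xC1F00000 are the high dwords of 2^-32 and -2^32: the truncated double is
 * split into its upper 32 bits (floor(trunc * 2^-32)) and the remaining lower 32 bits via fma,
 * and the two halves are converted separately. */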
1799 Temp vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0x3df00000u));
1800 Temp trunc = bld.vop1(aco_opcode::v_trunc_f64, bld.def(v2), src);
1801 Temp mul = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), trunc, vec);
1802 vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0xc1f00000u));
1803 Temp floor = bld.vop1(aco_opcode::v_floor_f64, bld.def(v2), mul);
1804 Temp fma = bld.vop3(aco_opcode::v_fma_f64, bld.def(v2), floor, vec, trunc);
1805 Temp lower = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), fma);
1806 Temp upper = bld.vop1(aco_opcode::v_cvt_i32_f64, bld.def(v1), floor);
1807 if (dst.type() == RegType::sgpr) {
1808 lower = bld.as_uniform(lower);
1809 upper = bld.as_uniform(upper);
1810 }
1811 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1812
1813 } else {
1814 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1815 nir_print_instr(&instr->instr, stderr);
1816 fprintf(stderr, "\n");
1817 }
1818 break;
1819 }
1820 case nir_op_f2u64: {
1821 Temp src = get_alu_src(ctx, instr->src[0]);
1822 if (instr->src[0].src.ssa->bit_size == 32 && dst.type() == RegType::vgpr) {
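/* Unsigned variant of the f2i64 lowering above: the 24-bit mantissa (implicit bit included)
 * is placed in the low dword and shifted left by (exponent - 24); exponents below 24 take the
 * 32-bit right-shift path instead, and exponents above 64 saturate the result to all ones. */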
1823 Temp exponent = bld.vop1(aco_opcode::v_frexp_exp_i32_f32, bld.def(v1), src);
1824 Temp exponent_in_range = bld.vopc(aco_opcode::v_cmp_ge_i32, bld.hint_vcc(bld.def(s2)), Operand(64u), exponent);
1825 exponent = bld.vop2(aco_opcode::v_max_i32, bld.def(v1), Operand(0x0u), exponent);
1826 Temp mantissa = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7fffffu), src);
1827 mantissa = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand(0x800000u), mantissa);
1828 Temp exponent_small = bld.vsub32(bld.def(v1), Operand(24u), exponent);
1829 Temp small = bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), exponent_small, mantissa);
1830 mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), mantissa, Operand(0u));
1831 Temp new_exponent = bld.tmp(v1);
1832 Temp cond_small = bld.vsub32(Definition(new_exponent), exponent, Operand(24u), true).def(1).getTemp();
1833 mantissa = bld.vop3(aco_opcode::v_lshlrev_b64, bld.def(v2), new_exponent, mantissa);
1834 Temp lower = bld.tmp(v1), upper = bld.tmp(v1);
1835 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
1836 lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), lower, small, cond_small);
1837 upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), upper, Operand(0u), cond_small);
1838 lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0xffffffffu), lower, exponent_in_range);
1839 upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0xffffffffu), upper, exponent_in_range);
1840 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1841
1842 } else if (instr->src[0].src.ssa->bit_size == 32 && dst.type() == RegType::sgpr) {
1843 if (src.type() == RegType::vgpr)
1844 src = bld.as_uniform(src);
1845 Temp exponent = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), src, Operand(0x80017u));
1846 exponent = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), exponent, Operand(126u));
1847 exponent = bld.sop2(aco_opcode::s_max_u32, bld.def(s1), bld.def(s1, scc), Operand(0u), exponent);
1848 Temp mantissa = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), Operand(0x7fffffu), src);
1849 mantissa = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), Operand(0x800000u), mantissa);
1850 Temp exponent_small = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), Operand(24u), exponent);
1851 Temp small = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), mantissa, exponent_small);
1852 mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), mantissa, Operand(0u));
1853 Temp exponent_large = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), exponent, Operand(24u));
1854 mantissa = bld.sop2(aco_opcode::s_lshl_b64, bld.def(s2), bld.def(s1, scc), mantissa, exponent_large);
1855 Temp cond = bld.sopc(aco_opcode::s_cmp_ge_i32, bld.def(s1, scc), Operand(64u), exponent);
1856 mantissa = bld.sop2(aco_opcode::s_cselect_b64, bld.def(s2), mantissa, Operand(0xffffffffu), cond);
1857 Temp lower = bld.tmp(s1), upper = bld.tmp(s1);
1858 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
1859 Temp cond_small = bld.sopc(aco_opcode::s_cmp_le_i32, bld.def(s1, scc), exponent, Operand(24u));
1860 lower = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), small, lower, cond_small);
1861 upper = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), Operand(0u), upper, cond_small);
1862 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1863
1864 } else if (instr->src[0].src.ssa->bit_size == 64) {
1865 Temp vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0x3df00000u));
1866 Temp trunc = bld.vop1(aco_opcode::v_trunc_f64, bld.def(v2), src);
1867 Temp mul = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), trunc, vec);
1868 vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0xc1f00000u));
1869 Temp floor = bld.vop1(aco_opcode::v_floor_f64, bld.def(v2), mul);
1870 Temp fma = bld.vop3(aco_opcode::v_fma_f64, bld.def(v2), floor, vec, trunc);
1871 Temp lower = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), fma);
1872 Temp upper = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), floor);
1873 if (dst.type() == RegType::sgpr) {
1874 lower = bld.as_uniform(lower);
1875 upper = bld.as_uniform(upper);
1876 }
1877 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1878
1879 } else {
1880 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1881 nir_print_instr(&instr->instr, stderr);
1882 fprintf(stderr, "\n");
1883 }
1884 break;
1885 }
1886 case nir_op_b2f32: {
1887 Temp src = get_alu_src(ctx, instr->src[0]);
1888 if (dst.regClass() == s1) {
1889 src = as_uniform_bool(ctx, src);
1890 bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand(0x3f800000u), src);
1891 } else if (dst.regClass() == v1) {
1892 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand(0x3f800000u),
1893 as_divergent_bool(ctx, src, true));
1894 } else {
1895 unreachable("Wrong destination register class for nir_op_b2f32.");
1896 }
1897 break;
1898 }
1899 case nir_op_b2f64: {
1900 Temp src = get_alu_src(ctx, instr->src[0]);
1901 if (dst.regClass() == s2) {
1902 src = as_uniform_bool(ctx, src);
/* double 1.0 has 0x3FF00000 in its high dword: select that and pack it with a zero low dword */
1903 Temp upper = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), Operand(0x3FF00000u), Operand(0u), bld.scc(src));
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand(0u), upper);
1904 } else if (dst.regClass() == v2) {
1905 Temp one = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0x3FF00000u));
1906 Temp upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), one,
1907 as_divergent_bool(ctx, src, true));
1908 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand(0u), upper);
1909 } else {
1910 unreachable("Wrong destination register class for nir_op_b2f64.");
1911 }
1912 break;
1913 }
1914 case nir_op_i2i32: {
1915 Temp src = get_alu_src(ctx, instr->src[0]);
1916 if (instr->src[0].src.ssa->bit_size == 64) {
1917 /* we can actually just say dst = src, as it would map the lower register */
1918 emit_extract_vector(ctx, src, 0, dst);
1919 } else {
1920 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1921 nir_print_instr(&instr->instr, stderr);
1922 fprintf(stderr, "\n");
1923 }
1924 break;
1925 }
1926 case nir_op_u2u32: {
1927 Temp src = get_alu_src(ctx, instr->src[0]);
1928 if (instr->src[0].src.ssa->bit_size == 16) {
1929 if (dst.regClass() == s1) {
1930 bld.sop2(aco_opcode::s_and_b32, Definition(dst), bld.def(s1, scc), Operand(0xFFFFu), src);
1931 } else {
1932 // TODO: do better with SDWA
1933 bld.vop2(aco_opcode::v_and_b32, Definition(dst), Operand(0xFFFFu), src);
1934 }
1935 } else if (instr->src[0].src.ssa->bit_size == 64) {
1936 /* we can actually just say dst = src, as it would map the lower register */
1937 emit_extract_vector(ctx, src, 0, dst);
1938 } else {
1939 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1940 nir_print_instr(&instr->instr, stderr);
1941 fprintf(stderr, "\n");
1942 }
1943 break;
1944 }
1945 case nir_op_i2i64: {
1946 Temp src = get_alu_src(ctx, instr->src[0]);
1947 if (instr->src[0].src.ssa->bit_size == 32) {
1948 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src, Operand(0u));
1949 } else {
1950 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1951 nir_print_instr(&instr->instr, stderr);
1952 fprintf(stderr, "\n");
1953 }
1954 break;
1955 }
1956 case nir_op_u2u64: {
1957 Temp src = get_alu_src(ctx, instr->src[0]);
1958 if (instr->src[0].src.ssa->bit_size == 32) {
1959 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src, Operand(0u));
1960 } else {
1961 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1962 nir_print_instr(&instr->instr, stderr);
1963 fprintf(stderr, "\n");
1964 }
1965 break;
1966 }
1967 case nir_op_b2i32: {
1968 Temp src = get_alu_src(ctx, instr->src[0]);
1969 if (dst.regClass() == s1) {
1970 if (src.regClass() == s1) {
1971 bld.copy(Definition(dst), src);
1972 } else {
1973 // TODO: in a post-RA optimization, we can check if src is in VCC, and directly use VCCNZ
1974 assert(src.regClass() == s2);
1975 bld.sopc(aco_opcode::s_cmp_lg_u64, bld.scc(Definition(dst)), Operand(0u), src);
1976 }
1977 } else {
1978 assert(dst.regClass() == v1 && src.regClass() == s2);
1979 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand(1u), src);
1980 }
1981 break;
1982 }
1983 case nir_op_i2b1: {
1984 Temp src = get_alu_src(ctx, instr->src[0]);
1985 if (dst.regClass() == s2) {
1986 assert(src.regClass() == v1 || src.regClass() == v2);
1987 bld.vopc(src.size() == 2 ? aco_opcode::v_cmp_lg_u64 : aco_opcode::v_cmp_lg_u32,
1988 Definition(dst), Operand(0u), src).def(0).setHint(vcc);
1989 } else {
1990 assert(src.regClass() == s1 && dst.regClass() == s1);
1991 bld.sopc(aco_opcode::s_cmp_lg_u32, bld.scc(Definition(dst)), Operand(0u), src);
1992 }
1993 break;
1994 }
1995 case nir_op_pack_64_2x32_split: {
1996 Temp src0 = get_alu_src(ctx, instr->src[0]);
1997 Temp src1 = get_alu_src(ctx, instr->src[1]);
1998
1999 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src0, src1);
2000 break;
2001 }
2002 case nir_op_unpack_64_2x32_split_x:
2003 bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(dst.regClass()), get_alu_src(ctx, instr->src[0]));
2004 break;
2005 case nir_op_unpack_64_2x32_split_y:
2006 bld.pseudo(aco_opcode::p_split_vector, bld.def(dst.regClass()), Definition(dst), get_alu_src(ctx, instr->src[0]));
2007 break;
2008 case nir_op_pack_half_2x16: {
2009 Temp src = get_alu_src(ctx, instr->src[0], 2);
2010
2011 if (dst.regClass() == v1) {
2012 Temp src0 = bld.tmp(v1);
2013 Temp src1 = bld.tmp(v1);
2014 bld.pseudo(aco_opcode::p_split_vector, Definition(src0), Definition(src1), src);
2015 bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32, Definition(dst), src0, src1);
2016
2017 } else {
2018 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2019 nir_print_instr(&instr->instr, stderr);
2020 fprintf(stderr, "\n");
2021 }
2022 break;
2023 }
2024 case nir_op_unpack_half_2x16_split_x: {
2025 if (dst.regClass() == v1) {
2026 Builder bld(ctx->program, ctx->block);
2027 bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst), get_alu_src(ctx, instr->src[0]));
2028 } else {
2029 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2030 nir_print_instr(&instr->instr, stderr);
2031 fprintf(stderr, "\n");
2032 }
2033 break;
2034 }
2035 case nir_op_unpack_half_2x16_split_y: {
2036 if (dst.regClass() == v1) {
2037 Builder bld(ctx->program, ctx->block);
2038 /* TODO: use SDWA here */
2039 bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst),
2040 bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), Operand(16u), as_vgpr(ctx, get_alu_src(ctx, instr->src[0]))));
2041 } else {
2042 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2043 nir_print_instr(&instr->instr, stderr);
2044 fprintf(stderr, "\n");
2045 }
2046 break;
2047 }
2048 case nir_op_fquantize2f16: {
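/* Round to f16 and back to f32; v_cmp_class with mask 0x36F matches every class except
 * positive/negative denormal, so denormal f16 results are flushed to zero. */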
2049 Temp f16 = bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v1), get_alu_src(ctx, instr->src[0]));
2050
2051 Temp mask = bld.copy(bld.def(s1), Operand(0x36Fu)); /* value is NOT negative/positive denormal value */
2052
2053 Temp cmp_res = bld.tmp(s2);
2054 bld.vopc_e64(aco_opcode::v_cmp_class_f16, Definition(cmp_res), f16, mask).def(0).setHint(vcc);
2055
2056 Temp f32 = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), f16);
2057
2058 bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), f32, cmp_res);
2059 break;
2060 }
2061 case nir_op_bfm: {
2062 Temp bits = get_alu_src(ctx, instr->src[0]);
2063 Temp offset = get_alu_src(ctx, instr->src[1]);
2064
2065 if (dst.regClass() == s1) {
2066 bld.sop2(aco_opcode::s_bfm_b32, Definition(dst), bits, offset);
2067 } else if (dst.regClass() == v1) {
2068 bld.vop3(aco_opcode::v_bfm_b32, Definition(dst), bits, offset);
2069 } else {
2070 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2071 nir_print_instr(&instr->instr, stderr);
2072 fprintf(stderr, "\n");
2073 }
2074 break;
2075 }
2076 case nir_op_bitfield_select: {
2077 /* (mask & insert) | (~mask & base) */
2078 Temp bitmask = get_alu_src(ctx, instr->src[0]);
2079 Temp insert = get_alu_src(ctx, instr->src[1]);
2080 Temp base = get_alu_src(ctx, instr->src[2]);
2081
2082 /* dst = (insert & bitmask) | (base & ~bitmask) */
2083 if (dst.regClass() == s1) {
2084 aco_ptr<Instruction> sop2;
2085 nir_const_value* const_bitmask = nir_src_as_const_value(instr->src[0].src);
2086 nir_const_value* const_insert = nir_src_as_const_value(instr->src[1].src);
2087 Operand lhs;
2088 if (const_insert && const_bitmask) {
2089 lhs = Operand(const_insert->u32 & const_bitmask->u32);
2090 } else {
2091 insert = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), insert, bitmask);
2092 lhs = Operand(insert);
2093 }
2094
2095 Operand rhs;
2096 nir_const_value* const_base = nir_src_as_const_value(instr->src[2].src);
2097 if (const_base && const_bitmask) {
2098 rhs = Operand(const_base->u32 & ~const_bitmask->u32);
2099 } else {
2100 base = bld.sop2(aco_opcode::s_andn2_b32, bld.def(s1), bld.def(s1, scc), base, bitmask);
2101 rhs = Operand(base);
2102 }
2103
2104 bld.sop2(aco_opcode::s_or_b32, Definition(dst), bld.def(s1, scc), rhs, lhs);
2105
2106 } else if (dst.regClass() == v1) {
2107 if (base.type() == RegType::sgpr && (bitmask.type() == RegType::sgpr || (insert.type() == RegType::sgpr)))
2108 base = as_vgpr(ctx, base);
2109 if (insert.type() == RegType::sgpr && bitmask.type() == RegType::sgpr)
2110 insert = as_vgpr(ctx, insert);
2111
2112 bld.vop3(aco_opcode::v_bfi_b32, Definition(dst), bitmask, insert, base);
2113
2114 } else {
2115 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2116 nir_print_instr(&instr->instr, stderr);
2117 fprintf(stderr, "\n");
2118 }
2119 break;
2120 }
2121 case nir_op_ubfe:
2122 case nir_op_ibfe: {
2123 Temp base = get_alu_src(ctx, instr->src[0]);
2124 Temp offset = get_alu_src(ctx, instr->src[1]);
2125 Temp bits = get_alu_src(ctx, instr->src[2]);
2126
2127 if (dst.type() == RegType::sgpr) {
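/* s_bfe packs the field description into its second source: the offset in the low bits and
 * the field width in bits [22:16] */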
2128 Operand extract;
2129 nir_const_value* const_offset = nir_src_as_const_value(instr->src[1].src);
2130 nir_const_value* const_bits = nir_src_as_const_value(instr->src[2].src);
2131 if (const_offset && const_bits) {
2132 uint32_t const_extract = (const_bits->u32 << 16) | const_offset->u32;
2133 extract = Operand(const_extract);
2134 } else {
2135 Operand width;
2136 if (const_bits) {
2137 width = Operand(const_bits->u32 << 16);
2138 } else {
2139 width = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), bits, Operand(16u));
2140 }
2141 extract = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), offset, width);
2142 }
2143
2144 aco_opcode opcode;
2145 if (dst.regClass() == s1) {
2146 if (instr->op == nir_op_ubfe)
2147 opcode = aco_opcode::s_bfe_u32;
2148 else
2149 opcode = aco_opcode::s_bfe_i32;
2150 } else if (dst.regClass() == s2) {
2151 if (instr->op == nir_op_ubfe)
2152 opcode = aco_opcode::s_bfe_u64;
2153 else
2154 opcode = aco_opcode::s_bfe_i64;
2155 } else {
2156 unreachable("Unsupported BFE bit size");
2157 }
2158
2159 bld.sop2(opcode, Definition(dst), bld.def(s1, scc), base, extract);
2160
2161 } else {
2162 aco_opcode opcode;
2163 if (dst.regClass() == v1) {
2164 if (instr->op == nir_op_ubfe)
2165 opcode = aco_opcode::v_bfe_u32;
2166 else
2167 opcode = aco_opcode::v_bfe_i32;
2168 } else {
2169 unreachable("Unsupported BFE bit size");
2170 }
2171
2172 emit_vop3a_instruction(ctx, instr, opcode, dst);
2173 }
2174 break;
2175 }
2176 case nir_op_bit_count: {
2177 Temp src = get_alu_src(ctx, instr->src[0]);
2178 if (src.regClass() == s1) {
2179 bld.sop1(aco_opcode::s_bcnt1_i32_b32, Definition(dst), bld.def(s1, scc), src);
2180 } else if (src.regClass() == v1) {
2181 bld.vop3(aco_opcode::v_bcnt_u32_b32, Definition(dst), src, Operand(0u));
2182 } else if (src.regClass() == v2) {
2183 bld.vop3(aco_opcode::v_bcnt_u32_b32, Definition(dst),
2184 emit_extract_vector(ctx, src, 1, v1),
2185 bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1),
2186 emit_extract_vector(ctx, src, 0, v1), Operand(0u)));
2187 } else if (src.regClass() == s2) {
2188 bld.sop1(aco_opcode::s_bcnt1_i32_b64, Definition(dst), bld.def(s1, scc), src);
2189 } else {
2190 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2191 nir_print_instr(&instr->instr, stderr);
2192 fprintf(stderr, "\n");
2193 }
2194 break;
2195 }
2196 case nir_op_flt: {
2197 if (instr->src[0].src.ssa->bit_size == 32)
2198 emit_comparison(ctx, instr, aco_opcode::v_cmp_lt_f32, dst);
2199 else if (instr->src[0].src.ssa->bit_size == 64)
2200 emit_comparison(ctx, instr, aco_opcode::v_cmp_lt_f64, dst);
2201 break;
2202 }
2203 case nir_op_fge: {
2204 if (instr->src[0].src.ssa->bit_size == 32)
2205 emit_comparison(ctx, instr, aco_opcode::v_cmp_ge_f32, dst);
2206 else if (instr->src[0].src.ssa->bit_size == 64)
2207 emit_comparison(ctx, instr, aco_opcode::v_cmp_ge_f64, dst);
2208 break;
2209 }
2210 case nir_op_feq: {
2211 if (instr->src[0].src.ssa->bit_size == 32)
2212 emit_comparison(ctx, instr, aco_opcode::v_cmp_eq_f32, dst);
2213 else if (instr->src[0].src.ssa->bit_size == 64)
2214 emit_comparison(ctx, instr, aco_opcode::v_cmp_eq_f64, dst);
2215 break;
2216 }
2217 case nir_op_fne: {
2218 if (instr->src[0].src.ssa->bit_size == 32)
2219 emit_comparison(ctx, instr, aco_opcode::v_cmp_neq_f32, dst);
2220 else if (instr->src[0].src.ssa->bit_size == 64)
2221 emit_comparison(ctx, instr, aco_opcode::v_cmp_neq_f64, dst);
2222 break;
2223 }
2224 case nir_op_ilt: {
2225 if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 32)
2226 emit_comparison(ctx, instr, aco_opcode::v_cmp_lt_i32, dst);
2227 else if (dst.regClass() == s1 && instr->src[0].src.ssa->bit_size == 32)
2228 emit_comparison(ctx, instr, aco_opcode::s_cmp_lt_i32, dst);
2229 else if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 64)
2230 emit_comparison(ctx, instr, aco_opcode::v_cmp_lt_i64, dst);
2231 break;
2232 }
2233 case nir_op_ige: {
2234 if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 32)
2235 emit_comparison(ctx, instr, aco_opcode::v_cmp_ge_i32, dst);
2236 else if (dst.regClass() == s1 && instr->src[0].src.ssa->bit_size == 32)
2237 emit_comparison(ctx, instr, aco_opcode::s_cmp_ge_i32, dst);
2238 else if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 64)
2239 emit_comparison(ctx, instr, aco_opcode::v_cmp_ge_i64, dst);
2240 break;
2241 }
2242 case nir_op_ieq: {
2243 if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 32) {
2244 emit_comparison(ctx, instr, aco_opcode::v_cmp_eq_i32, dst);
2245 } else if (dst.regClass() == s1 && instr->src[0].src.ssa->bit_size == 32) {
2246 emit_comparison(ctx, instr, aco_opcode::s_cmp_eq_i32, dst);
2247 } else if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 64) {
2248 emit_comparison(ctx, instr, aco_opcode::v_cmp_eq_i64, dst);
2249 } else if (dst.regClass() == s1 && instr->src[0].src.ssa->bit_size == 64) {
2250 emit_comparison(ctx, instr, aco_opcode::s_cmp_eq_u64, dst);
2251 } else if (dst.regClass() == s1 && instr->src[0].src.ssa->bit_size == 1) {
2252 Temp src0 = get_alu_src(ctx, instr->src[0]);
2253 Temp src1 = get_alu_src(ctx, instr->src[1]);
2254 bld.sopc(aco_opcode::s_cmp_eq_i32, bld.scc(Definition(dst)),
2255 as_uniform_bool(ctx, src0), as_uniform_bool(ctx, src1));
2256 } else if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 1) {
2257 Temp src0 = get_alu_src(ctx, instr->src[0]);
2258 Temp src1 = get_alu_src(ctx, instr->src[1]);
2259 bld.sop2(aco_opcode::s_xnor_b64, Definition(dst), bld.def(s1, scc),
2260 as_divergent_bool(ctx, src0, false), as_divergent_bool(ctx, src1, false));
2261 } else {
2262 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2263 nir_print_instr(&instr->instr, stderr);
2264 fprintf(stderr, "\n");
2265 }
2266 break;
2267 }
2268 case nir_op_ine: {
2269 if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 32) {
2270 emit_comparison(ctx, instr, aco_opcode::v_cmp_lg_i32, dst);
2271 } else if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 64) {
2272 emit_comparison(ctx, instr, aco_opcode::v_cmp_lg_i64, dst);
2273 } else if (dst.regClass() == s1 && instr->src[0].src.ssa->bit_size == 32) {
2274 emit_comparison(ctx, instr, aco_opcode::s_cmp_lg_i32, dst);
2275 } else if (dst.regClass() == s1 && instr->src[0].src.ssa->bit_size == 64) {
2276 emit_comparison(ctx, instr, aco_opcode::s_cmp_lg_u64, dst);
2277 } else if (dst.regClass() == s1 && instr->src[0].src.ssa->bit_size == 1) {
2278 Temp src0 = get_alu_src(ctx, instr->src[0]);
2279 Temp src1 = get_alu_src(ctx, instr->src[1]);
2280 bld.sopc(aco_opcode::s_cmp_lg_i32, bld.scc(Definition(dst)),
2281 as_uniform_bool(ctx, src0), as_uniform_bool(ctx, src1));
2282 } else if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 1) {
2283 Temp src0 = get_alu_src(ctx, instr->src[0]);
2284 Temp src1 = get_alu_src(ctx, instr->src[1]);
2285 bld.sop2(aco_opcode::s_xor_b64, Definition(dst), bld.def(s1, scc),
2286 as_divergent_bool(ctx, src0, false), as_divergent_bool(ctx, src1, false));
2287 } else {
2288 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2289 nir_print_instr(&instr->instr, stderr);
2290 fprintf(stderr, "\n");
2291 }
2292 break;
2293 }
2294 case nir_op_ult: {
2295 if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 32)
2296 emit_comparison(ctx, instr, aco_opcode::v_cmp_lt_u32, dst);
2297 else if (dst.regClass() == s1 && instr->src[0].src.ssa->bit_size == 32)
2298 emit_comparison(ctx, instr, aco_opcode::s_cmp_lt_u32, dst);
2299 else if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 64)
2300 emit_comparison(ctx, instr, aco_opcode::v_cmp_lt_u64, dst);
2301 break;
2302 }
2303 case nir_op_uge: {
2304 if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 32)
2305 emit_comparison(ctx, instr, aco_opcode::v_cmp_ge_u32, dst);
2306 else if (dst.regClass() == s1 && instr->src[0].src.ssa->bit_size == 32)
2307 emit_comparison(ctx, instr, aco_opcode::s_cmp_ge_u32, dst);
2308 else if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 64)
2309 emit_comparison(ctx, instr, aco_opcode::v_cmp_ge_u64, dst);
2310 break;
2311 }
2312 case nir_op_fddx:
2313 case nir_op_fddy:
2314 case nir_op_fddx_fine:
2315 case nir_op_fddy_fine:
2316 case nir_op_fddx_coarse:
2317 case nir_op_fddy_coarse: {
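/* Derivatives are computed with DPP quad permutes: tl is broadcast from the reference lane of
 * each 2x2 quad, and the per-lane difference against it is the derivative. The result goes
 * through emit_wqm since the quad swizzle reads neighbouring lanes. */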
2318 Definition tl = bld.def(v1);
2319 uint16_t dpp_ctrl;
2320 if (instr->op == nir_op_fddx_fine) {
2321 bld.vop1_dpp(aco_opcode::v_mov_b32, tl, get_alu_src(ctx, instr->src[0]), dpp_quad_perm(0, 0, 2, 2));
2322 dpp_ctrl = dpp_quad_perm(1, 1, 3, 3);
2323 } else if (instr->op == nir_op_fddy_fine) {
2324 bld.vop1_dpp(aco_opcode::v_mov_b32, tl, get_alu_src(ctx, instr->src[0]), dpp_quad_perm(0, 1, 0, 1));
2325 dpp_ctrl = dpp_quad_perm(2, 3, 2, 3);
2326 } else {
2327 bld.vop1_dpp(aco_opcode::v_mov_b32, tl, get_alu_src(ctx, instr->src[0]), dpp_quad_perm(0, 0, 0, 0));
2328 if (instr->op == nir_op_fddx || instr->op == nir_op_fddx_coarse)
2329 dpp_ctrl = dpp_quad_perm(1, 1, 1, 1);
2330 else
2331 dpp_ctrl = dpp_quad_perm(2, 2, 2, 2);
2332 }
2333
2334 Definition tmp = bld.def(v1);
2335 bld.vop2_dpp(aco_opcode::v_sub_f32, tmp, get_alu_src(ctx, instr->src[0]), tl.getTemp(), dpp_ctrl);
2336 emit_wqm(ctx, tmp.getTemp(), dst, true);
2337 break;
2338 }
2339 default:
2340 fprintf(stderr, "Unknown NIR ALU instr: ");
2341 nir_print_instr(&instr->instr, stderr);
2342 fprintf(stderr, "\n");
2343 }
2344 }
2345
2346 void visit_load_const(isel_context *ctx, nir_load_const_instr *instr)
2347 {
2348 Temp dst = get_ssa_temp(ctx, &instr->def);
2349
2350 // TODO: we really want to have the resulting type here, as this would allow for 64-bit literals
2351 // which would be truncated, dropping the LSBs for doubles and the MSBs for ints
2352 // for now, we only use s_mov_b64 with 64-bit inline constants
2353 assert(instr->def.num_components == 1 && "Vector load_const should be lowered to scalar.");
2354 assert(dst.type() == RegType::sgpr);
2355
2356 if (dst.size() == 1)
2357 {
2358 Builder(ctx->program, ctx->block).copy(Definition(dst), Operand(instr->value[0].u32));
2359 } else {
2360 assert(dst.size() != 1);
2361 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
2362 if (instr->def.bit_size == 64)
2363 for (unsigned i = 0; i < dst.size(); i++)
2364 vec->operands[i] = Operand{(uint32_t)(instr->value[0].u64 >> i * 32)};
2365 else {
2366 for (unsigned i = 0; i < dst.size(); i++)
2367 vec->operands[i] = Operand{instr->value[i].u32};
2368 }
2369 vec->definitions[0] = Definition(dst);
2370 ctx->block->instructions.emplace_back(std::move(vec));
2371 }
2372 }
2373
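/* Replicates each bit of a write mask 'multiplier' times, e.g. widen_mask(0b0101, 2) == 0b00110011.
 * Used below to turn a per-64-bit-component mask into a per-dword mask. */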
2374 uint32_t widen_mask(uint32_t mask, unsigned multiplier)
2375 {
2376 uint32_t new_mask = 0;
2377 for(unsigned i = 0; i < 32 && (1u << i) <= mask; ++i)
2378 if (mask & (1u << i))
2379 new_mask |= ((1u << multiplier) - 1u) << (i * multiplier);
2380 return new_mask;
2381 }
2382
2383 void visit_store_vs_output(isel_context *ctx, nir_intrinsic_instr *instr)
2384 {
2385 /* This wouldn't work inside control flow or with indirect offsets but
2386 * that doesn't happen because of nir_lower_io_to_temporaries(). */
2387
2388 unsigned write_mask = nir_intrinsic_write_mask(instr);
2389 unsigned component = nir_intrinsic_component(instr);
2390 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
2391 unsigned idx = nir_intrinsic_base(instr) + component;
2392
2393 nir_instr *off_instr = instr->src[1].ssa->parent_instr;
2394 if (off_instr->type != nir_instr_type_load_const) {
2395 fprintf(stderr, "Unimplemented nir_intrinsic_load_input offset\n");
2396 nir_print_instr(off_instr, stderr);
2397 fprintf(stderr, "\n");
2398 }
2399 idx += nir_instr_as_load_const(off_instr)->value[0].u32 * 4u;
2400
2401 if (instr->src[0].ssa->bit_size == 64)
2402 write_mask = widen_mask(write_mask, 2);
2403
2404 for (unsigned i = 0; i < 8; ++i) {
2405 if (write_mask & (1 << i)) {
2406 ctx->vs_output.mask[idx / 4u] |= 1 << (idx % 4u);
2407 ctx->vs_output.outputs[idx / 4u][idx % 4u] = emit_extract_vector(ctx, src, i, v1);
2408 }
2409 idx++;
2410 }
2411 }
2412
2413 void visit_store_fs_output(isel_context *ctx, nir_intrinsic_instr *instr)
2414 {
2415 unsigned write_mask = nir_intrinsic_write_mask(instr);
2416 Operand values[4];
2417 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
2418 for (unsigned i = 0; i < 4; ++i) {
2419 if (write_mask & (1 << i)) {
2420 Temp tmp = emit_extract_vector(ctx, src, i, v1);
2421 values[i] = Operand(tmp);
2422 } else {
2423 values[i] = Operand(v1);
2424 }
2425 }
2426
2427 unsigned index = nir_intrinsic_base(instr) / 4;
2428 unsigned target, col_format;
2429 unsigned enabled_channels = 0xF;
2430 aco_opcode compr_op = (aco_opcode)0;
2431
2432 nir_const_value* offset = nir_src_as_const_value(instr->src[1]);
2433 assert(offset && "Non-const offsets on exports not yet supported");
2434 index += offset->u32;
2435
2436 assert(index != FRAG_RESULT_COLOR);
2437
2438 /* Unlike vertex shader exports, it's fine to use multiple exports to
2439 * export separate channels of one target. So shaders which export both
2440 * FRAG_RESULT_SAMPLE_MASK and FRAG_RESULT_DEPTH should work fine.
2441 * TODO: combine the exports in those cases and create better code
2442 */
2443
2444 if (index == FRAG_RESULT_SAMPLE_MASK) {
2445
2446 if (ctx->program->info->ps.writes_z) {
2447 target = V_008DFC_SQ_EXP_MRTZ;
2448 enabled_channels = 0x4;
2449 col_format = (unsigned) -1;
2450
2451 values[2] = values[0];
2452 values[0] = Operand(v1);
2453 } else {
2454 aco_ptr<Export_instruction> exp{create_instruction<Export_instruction>(aco_opcode::exp, Format::EXP, 4, 0)};
2455 exp->valid_mask = false;
2456 exp->done = false;
2457 exp->compressed = true;
2458 exp->dest = V_008DFC_SQ_EXP_MRTZ;
2459 exp->enabled_mask = 0xc;
2460 for (int i = 0; i < 4; i++)
2461 exp->operands[i] = Operand(v1);
2462 exp->operands[1] = Operand(values[0]);
2463 ctx->block->instructions.emplace_back(std::move(exp));
2464 return;
2465 }
2466
2467 } else if (index == FRAG_RESULT_DEPTH) {
2468
2469 target = V_008DFC_SQ_EXP_MRTZ;
2470 enabled_channels = 0x1;
2471 col_format = (unsigned) -1;
2472
2473 } else if (index == FRAG_RESULT_STENCIL) {
2474
2475 if (ctx->program->info->ps.writes_z) {
2476 target = V_008DFC_SQ_EXP_MRTZ;
2477 enabled_channels = 0x2;
2478 col_format = (unsigned) -1;
2479
2480 values[1] = values[0];
2481 values[0] = Operand(v1);
2482 } else {
2483 aco_ptr<Instruction> shift{create_instruction<VOP2_instruction>(aco_opcode::v_lshlrev_b32, Format::VOP2, 2, 1)};
2484 shift->operands[0] = Operand((uint32_t) 16);
2485 shift->operands[1] = values[0];
2486 Temp tmp = {ctx->program->allocateId(), v1};
2487 shift->definitions[0] = Definition(tmp);
2488 ctx->block->instructions.emplace_back(std::move(shift));
2489
2490 aco_ptr<Export_instruction> exp{create_instruction<Export_instruction>(aco_opcode::exp, Format::EXP, 4, 0)};
2491 exp->valid_mask = false;
2492 exp->done = false;
2493 exp->compressed = true;
2494 exp->dest = V_008DFC_SQ_EXP_MRTZ;
2495 exp->enabled_mask = 0x3;
2496 exp->operands[0] = Operand(tmp);
2497 for (int i = 1; i < 4; i++)
2498 exp->operands[i] = Operand(v1);
2499 ctx->block->instructions.emplace_back(std::move(exp));
2500 return;
2501 }
2502
2503 } else {
2504 index -= FRAG_RESULT_DATA0;
2505 target = V_008DFC_SQ_EXP_MRT + index;
2506 col_format = (ctx->options->key.fs.col_format >> (4 * index)) & 0xf;
2507 }
2508 ASSERTED bool is_int8 = (ctx->options->key.fs.is_int8 >> index) & 1;
2509 ASSERTED bool is_int10 = (ctx->options->key.fs.is_int10 >> index) & 1;
2510 assert(!is_int8 && !is_int10);
2511
2512 switch (col_format)
2513 {
2514 case V_028714_SPI_SHADER_ZERO:
2515 enabled_channels = 0; /* writemask */
2516 target = V_008DFC_SQ_EXP_NULL;
2517 break;
2518
2519 case V_028714_SPI_SHADER_32_R:
2520 enabled_channels = 1;
2521 break;
2522
2523 case V_028714_SPI_SHADER_32_GR:
2524 enabled_channels = 0x3;
2525 break;
2526
2527 case V_028714_SPI_SHADER_32_AR:
2528 if (ctx->options->chip_class >= GFX10) {
2529 /* Special case: on GFX10, the outputs are different for 32_AR */
2530 enabled_channels = 0x3;
2531 values[1] = values[3];
2532 } else {
2533 enabled_channels = 0x9;
2534 }
2535 break;
2536
2537 case V_028714_SPI_SHADER_FP16_ABGR:
2538 enabled_channels = 0x5;
2539 compr_op = aco_opcode::v_cvt_pkrtz_f16_f32;
2540 break;
2541
2542 case V_028714_SPI_SHADER_UNORM16_ABGR:
2543 enabled_channels = 0x5;
2544 compr_op = aco_opcode::v_cvt_pknorm_u16_f32;
2545 break;
2546
2547 case V_028714_SPI_SHADER_SNORM16_ABGR:
2548 enabled_channels = 0x5;
2549 compr_op = aco_opcode::v_cvt_pknorm_i16_f32;
2550 break;
2551
2552 case V_028714_SPI_SHADER_UINT16_ABGR:
2553 enabled_channels = 0x5;
2554 compr_op = aco_opcode::v_cvt_pk_u16_u32;
2555 break;
2556
2557 case V_028714_SPI_SHADER_SINT16_ABGR:
2558 enabled_channels = 0x5;
2559 compr_op = aco_opcode::v_cvt_pk_i16_i32;
2560 break;
2561
2562 case V_028714_SPI_SHADER_32_ABGR:
2563 enabled_channels = 0xF;
2564 break;
2565
2566 default:
2567 break;
2568 }
2569
2570 if (target == V_008DFC_SQ_EXP_NULL)
2571 return;
2572
2573 if ((bool)compr_op)
2574 {
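/* compressed exports pack two 16-bit channels per dword, so pairs of source values are
 * combined with the packing conversion chosen above */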
2575 for (int i = 0; i < 2; i++)
2576 {
2577 /* check if at least one of the values to be compressed is enabled */
2578 unsigned enabled = (write_mask >> (i*2) | write_mask >> (i*2+1)) & 0x1;
2579 if (enabled) {
2580 enabled_channels |= enabled << (i*2);
2581 aco_ptr<VOP3A_instruction> compr{create_instruction<VOP3A_instruction>(compr_op, Format::VOP3A, 2, 1)};
2582 Temp tmp{ctx->program->allocateId(), v1};
2583 compr->operands[0] = values[i*2].isUndefined() ? Operand(0u) : values[i*2];
2584 compr->operands[1] = values[i*2+1].isUndefined() ? Operand(0u) : values[i*2+1];
2585 compr->definitions[0] = Definition(tmp);
2586 values[i] = Operand(tmp);
2587 ctx->block->instructions.emplace_back(std::move(compr));
2588 } else {
2589 values[i] = Operand(v1);
2590 }
2591 }
2592 }
2593
2594 aco_ptr<Export_instruction> exp{create_instruction<Export_instruction>(aco_opcode::exp, Format::EXP, 4, 0)};
2595 exp->valid_mask = false;
2596 exp->done = false;
2597 exp->compressed = (bool) compr_op;
2598 exp->dest = target;
2599 exp->enabled_mask = enabled_channels;
2600 if ((bool) compr_op) {
2601 for (int i = 0; i < 2; i++)
2602 exp->operands[i] = enabled_channels & (3 << (i * 2)) ? values[i] : Operand(v1);
2603 exp->operands[2] = Operand(v1);
2604 exp->operands[3] = Operand(v1);
2605 } else {
2606 for (int i = 0; i < 4; i++)
2607 exp->operands[i] = enabled_channels & (1 << i) ? values[i] : Operand(v1);
2608 }
2609
2610 ctx->block->instructions.emplace_back(std::move(exp));
2611 }
2612
2613 Operand load_lds_size_m0(isel_context *ctx)
2614 {
2615 /* TODO: m0 does not need to be initialized on GFX9+ */
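/* DS address calculations are bounded by m0 on older hardware, so use the maximum range */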
2616 Builder bld(ctx->program, ctx->block);
2617 return bld.m0((Temp)bld.sopk(aco_opcode::s_movk_i32, bld.def(s1, m0), 0xffff));
2618 }
2619
2620 void load_lds(isel_context *ctx, unsigned elem_size_bytes, Temp dst,
2621 Temp address, unsigned base_offset, unsigned align)
2622 {
2623 assert(util_is_power_of_two_nonzero(align) && align >= 4);
2624
2625 Builder bld(ctx->program, ctx->block);
2626
2627 Operand m = load_lds_size_m0(ctx);
2628
2629 unsigned num_components = dst.size() * 4u / elem_size_bytes;
2630 unsigned bytes_read = 0;
2631 unsigned result_size = 0;
2632 unsigned total_bytes = num_components * elem_size_bytes;
2633 std::array<Temp, 4> result;
2634
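/* Split the load into the widest DS instructions the remaining size and alignment allow:
 * 16-byte chunks use ds_read_b128 when 16-byte aligned or ds_read2_b64 when only 8-byte
 * aligned, 8-byte chunks use ds_read_b64 or ds_read2_b32 when only dword aligned, and
 * anything smaller falls back to ds_read_b32. */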
2635 while (bytes_read < total_bytes) {
2636 unsigned todo = total_bytes - bytes_read;
2637 bool aligned8 = bytes_read % 8 == 0 && align % 8 == 0;
2638 bool aligned16 = bytes_read % 16 == 0 && align % 16 == 0;
2639
2640 aco_opcode op = aco_opcode::last_opcode;
2641 bool read2 = false;
2642 if (todo >= 16 && aligned16) {
2643 op = aco_opcode::ds_read_b128;
2644 todo = 16;
2645 } else if (todo >= 16 && aligned8) {
2646 op = aco_opcode::ds_read2_b64;
2647 read2 = true;
2648 todo = 16;
2649 } else if (todo >= 12 && aligned16) {
2650 op = aco_opcode::ds_read_b96;
2651 todo = 12;
2652 } else if (todo >= 8 && aligned8) {
2653 op = aco_opcode::ds_read_b64;
2654 todo = 8;
2655 } else if (todo >= 8) {
2656 op = aco_opcode::ds_read2_b32;
2657 read2 = true;
2658 todo = 8;
2659 } else if (todo >= 4) {
2660 op = aco_opcode::ds_read_b32;
2661 todo = 4;
2662 } else {
2663 assert(false);
2664 }
2665 assert(todo % elem_size_bytes == 0);
2666 unsigned num_elements = todo / elem_size_bytes;
2667 unsigned offset = base_offset + bytes_read;
2668 unsigned max_offset = read2 ? 1019 : 65535;
2669
2670 Temp address_offset = address;
2671 if (offset > max_offset) {
2672 address_offset = bld.vadd32(bld.def(v1), Operand(base_offset), address_offset);
2673 offset = bytes_read;
2674 }
2675 assert(offset <= max_offset); /* bytes_read shouldn't be large enough for this to happen */
2676
2677 Temp res;
2678 if (num_components == 1 && dst.type() == RegType::vgpr)
2679 res = dst;
2680 else
2681 res = bld.tmp(RegClass(RegType::vgpr, todo / 4));
2682
2683 if (read2)
2684 res = bld.ds(op, Definition(res), address_offset, m, offset >> 2, (offset >> 2) + 1);
2685 else
2686 res = bld.ds(op, Definition(res), address_offset, m, offset);
2687
2688 if (num_components == 1) {
2689 assert(todo == total_bytes);
2690 if (dst.type() == RegType::sgpr)
2691 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), res);
2692 return;
2693 }
2694
2695 if (dst.type() == RegType::sgpr)
2696 res = bld.as_uniform(res);
2697
2698 if (num_elements == 1) {
2699 result[result_size++] = res;
2700 } else {
2701 assert(res != dst && res.size() % num_elements == 0);
2702 aco_ptr<Pseudo_instruction> split{create_instruction<Pseudo_instruction>(aco_opcode::p_split_vector, Format::PSEUDO, 1, num_elements)};
2703 split->operands[0] = Operand(res);
2704 for (unsigned i = 0; i < num_elements; i++)
2705 split->definitions[i] = Definition(result[result_size++] = bld.tmp(res.type(), elem_size_bytes / 4));
2706 ctx->block->instructions.emplace_back(std::move(split));
2707 }
2708
2709 bytes_read += todo;
2710 }
2711
2712 assert(result_size == num_components && result_size > 1);
2713 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, result_size, 1)};
2714 for (unsigned i = 0; i < result_size; i++)
2715 vec->operands[i] = Operand(result[i]);
2716 vec->definitions[0] = Definition(dst);
2717 ctx->block->instructions.emplace_back(std::move(vec));
2718 ctx->allocated_vec.emplace(dst.id(), result);
2719 }
2720
2721 Temp extract_subvector(isel_context *ctx, Temp data, unsigned start, unsigned size, RegType type)
2722 {
2723 if (start == 0 && size == data.size())
2724 return type == RegType::vgpr ? as_vgpr(ctx, data) : data;
2725
2726 unsigned size_hint = 1;
2727 auto it = ctx->allocated_vec.find(data.id());
2728 if (it != ctx->allocated_vec.end())
2729 size_hint = it->second[0].size();
2730 if (size % size_hint || start % size_hint)
2731 size_hint = 1;
2732
2733 start /= size_hint;
2734 size /= size_hint;
2735
2736 Temp elems[size];
2737 for (unsigned i = 0; i < size; i++)
2738 elems[i] = emit_extract_vector(ctx, data, start + i, RegClass(type, size_hint));
2739
2740 if (size == 1)
2741 return type == RegType::vgpr ? as_vgpr(ctx, elems[0]) : elems[0];
2742
2743 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, size, 1)};
2744 for (unsigned i = 0; i < size; i++)
2745 vec->operands[i] = Operand(elems[i]);
2746 Temp res = {ctx->program->allocateId(), RegClass(type, size * size_hint)};
2747 vec->definitions[0] = Definition(res);
2748 ctx->block->instructions.emplace_back(std::move(vec));
2749 return res;
2750 }
2751
2752 void ds_write_helper(isel_context *ctx, Operand m, Temp address, Temp data, unsigned data_start, unsigned total_size, unsigned offset0, unsigned offset1, unsigned align)
2753 {
2754 Builder bld(ctx->program, ctx->block);
2755 unsigned bytes_written = 0;
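/* Mirrors the load path above: pick the widest DS write the remaining size and alignment
 * allow, using the two-address ds_write2_b64/ds_write2_b32 forms for chunks that are only
 * 8- or 4-byte aligned. */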
2756 while (bytes_written < total_size * 4) {
2757 unsigned todo = total_size * 4 - bytes_written;
2758 bool aligned8 = bytes_written % 8 == 0 && align % 8 == 0;
2759 bool aligned16 = bytes_written % 16 == 0 && align % 16 == 0;
2760
2761 aco_opcode op = aco_opcode::last_opcode;
2762 bool write2 = false;
2763 unsigned size = 0;
2764 if (todo >= 16 && aligned16) {
2765 op = aco_opcode::ds_write_b128;
2766 size = 4;
2767 } else if (todo >= 16 && aligned8) {
2768 op = aco_opcode::ds_write2_b64;
2769 write2 = true;
2770 size = 4;
2771 } else if (todo >= 12 && aligned16) {
2772 op = aco_opcode::ds_write_b96;
2773 size = 3;
2774 } else if (todo >= 8 && aligned8) {
2775 op = aco_opcode::ds_write_b64;
2776 size = 2;
2777 } else if (todo >= 8) {
2778 op = aco_opcode::ds_write2_b32;
2779 write2 = true;
2780 size = 2;
2781 } else if (todo >= 4) {
2782 op = aco_opcode::ds_write_b32;
2783 size = 1;
2784 } else {
2785 assert(false);
2786 }
2787
2788 unsigned offset = offset0 + offset1 + bytes_written;
2789 unsigned max_offset = write2 ? 1020 : 65535;
2790 Temp address_offset = address;
2791 if (offset > max_offset) {
2792 address_offset = bld.vadd32(bld.def(v1), Operand(offset0), address_offset);
2793 offset = offset1 + bytes_written;
2794 }
2795 assert(offset <= max_offset); /* offset1 shouldn't be large enough for this to happen */
2796
2797 if (write2) {
2798 Temp val0 = extract_subvector(ctx, data, data_start + (bytes_written >> 2), size / 2, RegType::vgpr);
2799 Temp val1 = extract_subvector(ctx, data, data_start + (bytes_written >> 2) + 1, size / 2, RegType::vgpr);
2800 bld.ds(op, address_offset, val0, val1, m, offset >> 2, (offset >> 2) + 1);
2801 } else {
2802 Temp val = extract_subvector(ctx, data, data_start + (bytes_written >> 2), size, RegType::vgpr);
2803 bld.ds(op, address_offset, val, m, offset);
2804 }
2805
2806 bytes_written += size * 4;
2807 }
2808 }
2809
2810 void store_lds(isel_context *ctx, unsigned elem_size_bytes, Temp data, uint32_t wrmask,
2811 Temp address, unsigned base_offset, unsigned align)
2812 {
2813 assert(util_is_power_of_two_nonzero(align) && align >= 4);
2814
2815 Operand m = load_lds_size_m0(ctx);
2816
2817 /* we need at most two stores for 32bit variables */
2818 int start[2], count[2];
2819 u_bit_scan_consecutive_range(&wrmask, &start[0], &count[0]);
2820 u_bit_scan_consecutive_range(&wrmask, &start[1], &count[1]);
2821 assert(wrmask == 0);
2822
2823 /* one combined store is sufficient */
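/* the assert below relies on each range writing exactly one element; the two values are
 * emitted as a single ds_write2_b32/ds_write2_b64 with offsets in units of the element size */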
2824 if (count[0] == count[1]) {
2825 Builder bld(ctx->program, ctx->block);
2826
2827 Temp address_offset = address;
2828 if ((base_offset >> 2) + start[1] > 255) {
2829 address_offset = bld.vadd32(bld.def(v1), Operand(base_offset), address_offset);
2830 base_offset = 0;
2831 }
2832
2833 assert(count[0] == 1);
2834 Temp val0 = emit_extract_vector(ctx, data, start[0], v1);
2835 Temp val1 = emit_extract_vector(ctx, data, start[1], v1);
2836 aco_opcode op = elem_size_bytes == 4 ? aco_opcode::ds_write2_b32 : aco_opcode::ds_write2_b64;
2837 base_offset = base_offset / elem_size_bytes;
2838 bld.ds(op, address_offset, val0, val1, m,
2839 base_offset + start[0], base_offset + start[1]);
2840 return;
2841 }
2842
2843 for (unsigned i = 0; i < 2; i++) {
2844 if (count[i] == 0)
2845 continue;
2846
2847 unsigned elem_size_words = elem_size_bytes / 4;
2848 ds_write_helper(ctx, m, address, data, start[i] * elem_size_words, count[i] * elem_size_words,
2849 base_offset, start[i] * elem_size_bytes, align);
2850 }
2851 return;
2852 }
2853
2854 void visit_store_output(isel_context *ctx, nir_intrinsic_instr *instr)
2855 {
2856 if (ctx->stage == vertex_vs) {
2857 visit_store_vs_output(ctx, instr);
2858 } else if (ctx->stage == fragment_fs) {
2859 visit_store_fs_output(ctx, instr);
2860 } else {
2861 unreachable("Shader stage not implemented");
2862 }
2863 }
2864
2865 void emit_interp_instr(isel_context *ctx, unsigned idx, unsigned component, Temp src, Temp dst, Temp prim_mask)
2866 {
2867 Temp coord1 = emit_extract_vector(ctx, src, 0, v1);
2868 Temp coord2 = emit_extract_vector(ctx, src, 1, v1);
2869
2870 Builder bld(ctx->program, ctx->block);
2871 Temp tmp = bld.vintrp(aco_opcode::v_interp_p1_f32, bld.def(v1), coord1, bld.m0(prim_mask), idx, component);
2872 bld.vintrp(aco_opcode::v_interp_p2_f32, Definition(dst), coord2, bld.m0(prim_mask), tmp, idx, component);
2873 }
2874
2875 void emit_load_frag_coord(isel_context *ctx, Temp dst, unsigned num_components)
2876 {
2877 aco_ptr<Pseudo_instruction> vec(create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1));
2878 for (unsigned i = 0; i < num_components; i++)
2879 vec->operands[i] = Operand(ctx->fs_inputs[fs_input::frag_pos_0 + i]);
2880
2881 if (ctx->fs_vgpr_args[fs_input::frag_pos_3]) {
2882 assert(num_components == 4);
2883 Builder bld(ctx->program, ctx->block);
2884 vec->operands[3] = bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), ctx->fs_inputs[fs_input::frag_pos_3]);
2885 }
2886
2887 for (Operand& op : vec->operands)
2888 op = op.isUndefined() ? Operand(0u) : op;
2889
2890 vec->definitions[0] = Definition(dst);
2891 ctx->block->instructions.emplace_back(std::move(vec));
2892 emit_split_vector(ctx, dst, num_components);
2893 return;
2894 }
2895
2896 void visit_load_interpolated_input(isel_context *ctx, nir_intrinsic_instr *instr)
2897 {
2898 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
2899 Temp coords = get_ssa_temp(ctx, instr->src[0].ssa);
2900 unsigned idx = nir_intrinsic_base(instr);
2901 unsigned component = nir_intrinsic_component(instr);
2902 Temp prim_mask = ctx->prim_mask;
2903
2904 nir_const_value* offset = nir_src_as_const_value(instr->src[1]);
2905 if (offset) {
2906 assert(offset->u32 == 0);
2907 } else {
2908 /* the lower 15 bits of the prim_mask contain the offset into LDS
2909 * while the upper bits contain the number of prims */
2910 Temp offset_src = get_ssa_temp(ctx, instr->src[1].ssa);
2911 assert(offset_src.regClass() == s1 && "TODO: divergent offsets...");
2912 Builder bld(ctx->program, ctx->block);
2913 Temp stride = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), prim_mask, Operand(16u));
2914 stride = bld.sop1(aco_opcode::s_bcnt1_i32_b32, bld.def(s1), bld.def(s1, scc), stride);
2915 stride = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), stride, Operand(48u));
2916 offset_src = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), stride, offset_src);
2917 prim_mask = bld.sop2(aco_opcode::s_add_i32, bld.def(s1, m0), bld.def(s1, scc), offset_src, prim_mask);
2918 }
2919
2920 if (instr->dest.ssa.num_components == 1) {
2921 emit_interp_instr(ctx, idx, component, coords, dst, prim_mask);
2922 } else {
2923 aco_ptr<Pseudo_instruction> vec(create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, instr->dest.ssa.num_components, 1));
2924 for (unsigned i = 0; i < instr->dest.ssa.num_components; i++)
2925 {
2926 Temp tmp = {ctx->program->allocateId(), v1};
2927 emit_interp_instr(ctx, idx, component+i, coords, tmp, prim_mask);
2928 vec->operands[i] = Operand(tmp);
2929 }
2930 vec->definitions[0] = Definition(dst);
2931 ctx->block->instructions.emplace_back(std::move(vec));
2932 }
2933 }
2934
2935 unsigned get_num_channels_from_data_format(unsigned data_format)
2936 {
2937 switch (data_format) {
2938 case V_008F0C_BUF_DATA_FORMAT_8:
2939 case V_008F0C_BUF_DATA_FORMAT_16:
2940 case V_008F0C_BUF_DATA_FORMAT_32:
2941 return 1;
2942 case V_008F0C_BUF_DATA_FORMAT_8_8:
2943 case V_008F0C_BUF_DATA_FORMAT_16_16:
2944 case V_008F0C_BUF_DATA_FORMAT_32_32:
2945 return 2;
2946 case V_008F0C_BUF_DATA_FORMAT_10_11_11:
2947 case V_008F0C_BUF_DATA_FORMAT_11_11_10:
2948 case V_008F0C_BUF_DATA_FORMAT_32_32_32:
2949 return 3;
2950 case V_008F0C_BUF_DATA_FORMAT_8_8_8_8:
2951 case V_008F0C_BUF_DATA_FORMAT_10_10_10_2:
2952 case V_008F0C_BUF_DATA_FORMAT_2_10_10_10:
2953 case V_008F0C_BUF_DATA_FORMAT_16_16_16_16:
2954 case V_008F0C_BUF_DATA_FORMAT_32_32_32_32:
2955 return 4;
2956 default:
2957 break;
2958 }
2959
2960 return 4;
2961 }
2962
2963 /* For 2_10_10_10 formats the alpha is handled as unsigned by pre-Vega HW,
2964 * so we may need to fix it up. */
2965 Temp adjust_vertex_fetch_alpha(isel_context *ctx, unsigned adjustment, Temp alpha)
2966 {
2967 Builder bld(ctx->program, ctx->block);
2968
2969 if (adjustment == RADV_ALPHA_ADJUST_SSCALED)
2970 alpha = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), alpha);
2971
2972 /* For the integer-like cases, do a natural sign extension.
2973 *
2974 * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0
2975 * and happen to contain 0, 1, 2, 3 as the two LSBs of the
2976 * exponent.
2977 */
2978 alpha = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(adjustment == RADV_ALPHA_ADJUST_SNORM ? 7u : 30u), alpha);
2979 alpha = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(30u), alpha);
2980
2981 /* Convert back to the right type. */
2982 if (adjustment == RADV_ALPHA_ADJUST_SNORM) {
2983 alpha = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), alpha);
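/* clamp to -1.0: the sign-extended 2-bit value -2 has no SNORM representation */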
2984 Temp clamp = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(s2)), Operand(0xbf800000u), alpha);
2985 alpha = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0xbf800000u), alpha, clamp);
2986 } else if (adjustment == RADV_ALPHA_ADJUST_SSCALED) {
2987 alpha = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), alpha);
2988 }
2989
2990 return alpha;
2991 }
2992
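/* load_input: for vertex shaders this is a vertex fetch with a typed buffer
 * load (MTBUF), using the attribute binding/offset/stride/format from the
 * pipeline key; for fragment shaders it reads a flat input via
 * v_interp_mov_f32. */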
2993 void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr)
2994 {
2995 Builder bld(ctx->program, ctx->block);
2996 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
2997 if (ctx->stage & sw_vs) {
2998
2999 nir_instr *off_instr = instr->src[0].ssa->parent_instr;
3000 if (off_instr->type != nir_instr_type_load_const) {
3001 fprintf(stderr, "Unimplemented nir_intrinsic_load_input offset\n");
3002 nir_print_instr(off_instr, stderr);
3003 fprintf(stderr, "\n");
3004 }
3005 uint32_t offset = nir_instr_as_load_const(off_instr)->value[0].u32;
3006
3007 Temp vertex_buffers = convert_pointer_to_64_bit(ctx, ctx->vertex_buffers);
3008
3009 unsigned location = nir_intrinsic_base(instr) / 4 - VERT_ATTRIB_GENERIC0 + offset;
3010 unsigned component = nir_intrinsic_component(instr);
3011 unsigned attrib_binding = ctx->options->key.vs.vertex_attribute_bindings[location];
3012 uint32_t attrib_offset = ctx->options->key.vs.vertex_attribute_offsets[location];
3013 uint32_t attrib_stride = ctx->options->key.vs.vertex_attribute_strides[location];
3014 unsigned attrib_format = ctx->options->key.vs.vertex_attribute_formats[location];
3015
3016 unsigned dfmt = attrib_format & 0xf;
3017
3018 unsigned nfmt = (attrib_format >> 4) & 0x7;
3019 unsigned num_dfmt_channels = get_num_channels_from_data_format(dfmt);
3020 unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa) << component;
3021 unsigned num_channels = MIN2(util_last_bit(mask), num_dfmt_channels);
3022 unsigned alpha_adjust = (ctx->options->key.vs.alpha_adjust >> (location * 2)) & 3;
3023 bool post_shuffle = ctx->options->key.vs.post_shuffle & (1 << location);
3024 if (post_shuffle)
3025 num_channels = MAX2(num_channels, 3);
3026
3027 Temp list = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), vertex_buffers, Operand(attrib_binding * 16u));
3028
3029 Temp index;
3030 if (ctx->options->key.vs.instance_rate_inputs & (1u << location)) {
3031 uint32_t divisor = ctx->options->key.vs.instance_rate_divisors[location];
3032 if (divisor) {
3033 ctx->needs_instance_id = true;
3034
3035 if (divisor != 1) {
3036 Temp divided = bld.tmp(v1);
3037 emit_v_div_u32(ctx, divided, as_vgpr(ctx, ctx->instance_id), divisor);
3038 index = bld.vadd32(bld.def(v1), ctx->start_instance, divided);
3039 } else {
3040 index = bld.vadd32(bld.def(v1), ctx->start_instance, ctx->instance_id);
3041 }
3042 } else {
3043 index = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), ctx->start_instance);
3044 }
3045 } else {
3046 index = bld.vadd32(bld.def(v1), ctx->base_vertex, ctx->vertex_id);
3047 }
3048
3049 if (attrib_stride != 0 && attrib_offset > attrib_stride) {
3050 index = bld.vadd32(bld.def(v1), Operand(attrib_offset / attrib_stride), index);
3051 attrib_offset = attrib_offset % attrib_stride;
3052 }
3053
3054 Operand soffset(0u);
3055 if (attrib_offset >= 4096) {
3056 soffset = bld.copy(bld.def(s1), Operand(attrib_offset));
3057 attrib_offset = 0;
3058 }
3059
3060 aco_opcode opcode;
3061 switch (num_channels) {
3062 case 1:
3063 opcode = aco_opcode::tbuffer_load_format_x;
3064 break;
3065 case 2:
3066 opcode = aco_opcode::tbuffer_load_format_xy;
3067 break;
3068 case 3:
3069 opcode = aco_opcode::tbuffer_load_format_xyz;
3070 break;
3071 case 4:
3072 opcode = aco_opcode::tbuffer_load_format_xyzw;
3073 break;
3074 default:
3075 unreachable("Unimplemented load_input vector size");
3076 }
3077
3078 Temp tmp = post_shuffle || num_channels != dst.size() || alpha_adjust != RADV_ALPHA_ADJUST_NONE || component ? bld.tmp(RegType::vgpr, num_channels) : dst;
3079
3080 aco_ptr<MTBUF_instruction> mtbuf{create_instruction<MTBUF_instruction>(opcode, Format::MTBUF, 3, 1)};
3081 mtbuf->operands[0] = Operand(index);
3082 mtbuf->operands[1] = Operand(list);
3083 mtbuf->operands[2] = soffset;
3084 mtbuf->definitions[0] = Definition(tmp);
3085 mtbuf->idxen = true;
3086 mtbuf->can_reorder = true;
3087 mtbuf->dfmt = dfmt;
3088 mtbuf->nfmt = nfmt;
3089 assert(attrib_offset < 4096);
3090 mtbuf->offset = attrib_offset;
3091 ctx->block->instructions.emplace_back(std::move(mtbuf));
3092
3093 emit_split_vector(ctx, tmp, tmp.size());
3094
3095 if (tmp.id() != dst.id()) {
3096 bool is_float = nfmt != V_008F0C_BUF_NUM_FORMAT_UINT &&
3097 nfmt != V_008F0C_BUF_NUM_FORMAT_SINT;
3098
3099 static const unsigned swizzle_normal[4] = {0, 1, 2, 3};
3100 static const unsigned swizzle_post_shuffle[4] = {2, 1, 0, 3};
3101 const unsigned *swizzle = post_shuffle ? swizzle_post_shuffle : swizzle_normal;
3102
3103 aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
3104 for (unsigned i = 0; i < dst.size(); i++) {
3105 unsigned idx = i + component;
3106 if (idx == 3 && alpha_adjust != RADV_ALPHA_ADJUST_NONE && num_channels >= 4) {
3107 Temp alpha = emit_extract_vector(ctx, tmp, swizzle[3], v1);
3108 vec->operands[3] = Operand(adjust_vertex_fetch_alpha(ctx, alpha_adjust, alpha));
3109 } else if (idx < num_channels) {
3110 vec->operands[i] = Operand(emit_extract_vector(ctx, tmp, swizzle[idx], v1));
3111 } else if (is_float && idx == 3) {
3112 vec->operands[i] = Operand(0x3f800000u);
3113 } else if (!is_float && idx == 3) {
3114 vec->operands[i] = Operand(1u);
3115 } else {
3116 vec->operands[i] = Operand(0u);
3117 }
3118 }
3119 vec->definitions[0] = Definition(dst);
3120 ctx->block->instructions.emplace_back(std::move(vec));
3121 emit_split_vector(ctx, dst, dst.size());
3122 }
3123
3124 } else if (ctx->stage == fragment_fs) {
3125 nir_instr *off_instr = instr->src[0].ssa->parent_instr;
3126 if (off_instr->type != nir_instr_type_load_const ||
3127 nir_instr_as_load_const(off_instr)->value[0].u32 != 0) {
3128 fprintf(stderr, "Unimplemented nir_intrinsic_load_input offset\n");
3129 nir_print_instr(off_instr, stderr);
3130 fprintf(stderr, "\n");
3131 }
3132
3133 Temp prim_mask = ctx->prim_mask;
3134 nir_const_value* offset = nir_src_as_const_value(instr->src[0]);
3135 if (offset) {
3136 assert(offset->u32 == 0);
3137 } else {
3138 /* the lower 15 bits of the prim_mask contain the offset into LDS
3139 * while the upper bits contain the number of prims */
3140 Temp offset_src = get_ssa_temp(ctx, instr->src[0].ssa);
3141 assert(offset_src.regClass() == s1 && "TODO: divergent offsets...");
3142 Builder bld(ctx->program, ctx->block);
3143 Temp stride = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), prim_mask, Operand(16u));
3144 stride = bld.sop1(aco_opcode::s_bcnt1_i32_b32, bld.def(s1), bld.def(s1, scc), stride);
3145 stride = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), stride, Operand(48u));
3146 offset_src = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), stride, offset_src);
3147 prim_mask = bld.sop2(aco_opcode::s_add_i32, bld.def(s1, m0), bld.def(s1, scc), offset_src, prim_mask);
3148 }
3149
3150 unsigned idx = nir_intrinsic_base(instr);
3151 unsigned component = nir_intrinsic_component(instr);
3152
3153 if (dst.size() == 1) {
3154 bld.vintrp(aco_opcode::v_interp_mov_f32, Definition(dst), Operand(2u), bld.m0(prim_mask), idx, component);
3155 } else {
3156 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
3157 for (unsigned i = 0; i < dst.size(); i++)
3158 vec->operands[i] = bld.vintrp(aco_opcode::v_interp_mov_f32, bld.def(v1), Operand(2u), bld.m0(prim_mask), idx, component + i);
3159 vec->definitions[0] = Definition(dst);
3160 bld.insert(std::move(vec));
3161 }
3162
3163 } else {
3164 unreachable("Shader stage not implemented");
3165 }
3166 }
3167
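/* Return a pointer to the given descriptor set, either directly from the user
 * SGPRs or loaded from the indirect descriptor buffer. */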
3168 Temp load_desc_ptr(isel_context *ctx, unsigned desc_set)
3169 {
3170 if (ctx->program->info->need_indirect_descriptor_sets) {
3171 Builder bld(ctx->program, ctx->block);
3172 Temp ptr64 = convert_pointer_to_64_bit(ctx, ctx->descriptor_sets[0]);
3173 return bld.smem(aco_opcode::s_load_dword, bld.def(s1), ptr64, Operand(desc_set << 2));//, false, false, false);
3174 }
3175
3176 return ctx->descriptor_sets[desc_set];
3177 }
3178
3179
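/* vulkan_resource_index: compute the descriptor address as
 * set pointer + binding offset + array index * binding stride. Dynamic
 * uniform/storage buffers are addressed relative to the push constant area. */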
3180 void visit_load_resource(isel_context *ctx, nir_intrinsic_instr *instr)
3181 {
3182 Builder bld(ctx->program, ctx->block);
3183 Temp index = get_ssa_temp(ctx, instr->src[0].ssa);
3184 if (!ctx->divergent_vals[instr->dest.ssa.index])
3185 index = bld.as_uniform(index);
3186 unsigned desc_set = nir_intrinsic_desc_set(instr);
3187 unsigned binding = nir_intrinsic_binding(instr);
3188
3189 Temp desc_ptr;
3190 radv_pipeline_layout *pipeline_layout = ctx->options->layout;
3191 radv_descriptor_set_layout *layout = pipeline_layout->set[desc_set].layout;
3192 unsigned offset = layout->binding[binding].offset;
3193 unsigned stride;
3194 if (layout->binding[binding].type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC ||
3195 layout->binding[binding].type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC) {
3196 unsigned idx = pipeline_layout->set[desc_set].dynamic_offset_start + layout->binding[binding].dynamic_offset_offset;
3197 desc_ptr = ctx->push_constants;
3198 offset = pipeline_layout->push_constant_size + 16 * idx;
3199 stride = 16;
3200 } else {
3201 desc_ptr = load_desc_ptr(ctx, desc_set);
3202 stride = layout->binding[binding].size;
3203 }
3204
3205 nir_const_value* nir_const_index = nir_src_as_const_value(instr->src[0]);
3206 unsigned const_index = nir_const_index ? nir_const_index->u32 : 0;
3207 if (stride != 1) {
3208 if (nir_const_index) {
3209 const_index = const_index * stride;
3210 } else if (index.type() == RegType::vgpr) {
3211 bool index24bit = layout->binding[binding].array_size <= 0x1000000;
3212 index = bld.v_mul_imm(bld.def(v1), index, stride, index24bit);
3213 } else {
3214 index = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(stride), Operand(index));
3215 }
3216 }
3217 if (offset) {
3218 if (nir_const_index) {
3219 const_index = const_index + offset;
3220 } else if (index.type() == RegType::vgpr) {
3221 index = bld.vadd32(bld.def(v1), Operand(offset), index);
3222 } else {
3223 index = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), Operand(offset), Operand(index));
3224 }
3225 }
3226
3227 if (nir_const_index && const_index == 0) {
3228 index = desc_ptr;
3229 } else if (index.type() == RegType::vgpr) {
3230 index = bld.vadd32(bld.def(v1),
3231 nir_const_index ? Operand(const_index) : Operand(index),
3232 Operand(desc_ptr));
3233 } else {
3234 index = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
3235 nir_const_index ? Operand(const_index) : Operand(index),
3236 Operand(desc_ptr));
3237 }
3238
3239 bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), index);
3240 }
3241
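/* Load 'num_components' dwords from a buffer resource. SGPR destinations use
 * SMEM s_buffer_load_* (except coherent loads before GFX8); everything else
 * uses MUBUF, split into two loads if more than 16 bytes are needed. */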
3242 void load_buffer(isel_context *ctx, unsigned num_components, Temp dst, Temp rsrc, Temp offset, bool glc=false)
3243 {
3244 Builder bld(ctx->program, ctx->block);
3245
3246 unsigned num_bytes = dst.size() * 4;
3247 bool dlc = glc && ctx->options->chip_class >= GFX10;
3248
3249 aco_opcode op;
3250 if (dst.type() == RegType::vgpr || (glc && ctx->options->chip_class < GFX8)) {
3251 if (ctx->options->chip_class < GFX8)
3252 offset = as_vgpr(ctx, offset);
3253
3254 Operand vaddr = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
3255 Operand soffset = offset.type() == RegType::sgpr ? Operand(offset) : Operand((uint32_t) 0);
3256 unsigned const_offset = 0;
3257
3258 Temp lower = Temp();
3259 if (num_bytes > 16) {
3260 assert(num_components == 3 || num_components == 4);
3261 op = aco_opcode::buffer_load_dwordx4;
3262 lower = bld.tmp(v4);
3263 aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)};
3264 mubuf->definitions[0] = Definition(lower);
3265 mubuf->operands[0] = vaddr;
3266 mubuf->operands[1] = Operand(rsrc);
3267 mubuf->operands[2] = soffset;
3268 mubuf->offen = (offset.type() == RegType::vgpr);
3269 mubuf->glc = glc;
3270 mubuf->dlc = dlc;
3271 mubuf->barrier = barrier_buffer;
3272 bld.insert(std::move(mubuf));
3273 emit_split_vector(ctx, lower, 2);
3274 num_bytes -= 16;
3275 const_offset = 16;
3276 }
3277
3278 switch (num_bytes) {
3279 case 4:
3280 op = aco_opcode::buffer_load_dword;
3281 break;
3282 case 8:
3283 op = aco_opcode::buffer_load_dwordx2;
3284 break;
3285 case 12:
3286 op = aco_opcode::buffer_load_dwordx3;
3287 break;
3288 case 16:
3289 op = aco_opcode::buffer_load_dwordx4;
3290 break;
3291 default:
3292 unreachable("Load SSBO not implemented for this size.");
3293 }
3294 aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)};
3295 mubuf->operands[0] = vaddr;
3296 mubuf->operands[1] = Operand(rsrc);
3297 mubuf->operands[2] = soffset;
3298 mubuf->offen = (offset.type() == RegType::vgpr);
3299 mubuf->glc = glc;
3300 mubuf->dlc = dlc;
3301 mubuf->barrier = barrier_buffer;
3302 mubuf->offset = const_offset;
3303 aco_ptr<Instruction> instr = std::move(mubuf);
3304
3305 if (dst.size() > 4) {
3306 assert(lower != Temp());
3307 Temp upper = bld.tmp(RegType::vgpr, dst.size() - lower.size());
3308 instr->definitions[0] = Definition(upper);
3309 bld.insert(std::move(instr));
3310 if (dst.size() == 8)
3311 emit_split_vector(ctx, upper, 2);
3312 instr.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size() / 2, 1));
3313 instr->operands[0] = Operand(emit_extract_vector(ctx, lower, 0, v2));
3314 instr->operands[1] = Operand(emit_extract_vector(ctx, lower, 1, v2));
3315 instr->operands[2] = Operand(emit_extract_vector(ctx, upper, 0, v2));
3316 if (dst.size() == 8)
3317 instr->operands[3] = Operand(emit_extract_vector(ctx, upper, 1, v2));
3318 }
3319
3320 if (dst.type() == RegType::sgpr) {
3321 Temp vec = bld.tmp(RegType::vgpr, dst.size());
3322 instr->definitions[0] = Definition(vec);
3323 bld.insert(std::move(instr));
3324 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec);
3325 } else {
3326 instr->definitions[0] = Definition(dst);
3327 bld.insert(std::move(instr));
3328 }
3329 } else {
3330 switch (num_bytes) {
3331 case 4:
3332 op = aco_opcode::s_buffer_load_dword;
3333 break;
3334 case 8:
3335 op = aco_opcode::s_buffer_load_dwordx2;
3336 break;
3337 case 12:
3338 case 16:
3339 op = aco_opcode::s_buffer_load_dwordx4;
3340 break;
3341 case 24:
3342 case 32:
3343 op = aco_opcode::s_buffer_load_dwordx8;
3344 break;
3345 default:
3346 unreachable("Load SSBO not implemented for this size.");
3347 }
3348 aco_ptr<SMEM_instruction> load{create_instruction<SMEM_instruction>(op, Format::SMEM, 2, 1)};
3349 load->operands[0] = Operand(rsrc);
3350 load->operands[1] = Operand(bld.as_uniform(offset));
3351 assert(load->operands[1].getTemp().type() == RegType::sgpr);
3352 load->definitions[0] = Definition(dst);
3353 load->glc = glc;
3354 load->dlc = dlc;
3355 load->barrier = barrier_buffer;
3356 assert(ctx->options->chip_class >= GFX8 || !glc);
3357
3358 /* trim vector */
3359 if (dst.size() == 3) {
3360 Temp vec = bld.tmp(s4);
3361 load->definitions[0] = Definition(vec);
3362 bld.insert(std::move(load));
3363 emit_split_vector(ctx, vec, 4);
3364
3365 bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
3366 emit_extract_vector(ctx, vec, 0, s1),
3367 emit_extract_vector(ctx, vec, 1, s1),
3368 emit_extract_vector(ctx, vec, 2, s1));
3369 } else if (dst.size() == 6) {
3370 Temp vec = bld.tmp(s8);
3371 load->definitions[0] = Definition(vec);
3372 bld.insert(std::move(load));
3373 emit_split_vector(ctx, vec, 4);
3374
3375 bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
3376 emit_extract_vector(ctx, vec, 0, s2),
3377 emit_extract_vector(ctx, vec, 1, s2),
3378 emit_extract_vector(ctx, vec, 2, s2));
3379 } else {
3380 bld.insert(std::move(load));
3381 }
3382
3383 }
3384 emit_split_vector(ctx, dst, num_components);
3385 }
3386
3387 void visit_load_ubo(isel_context *ctx, nir_intrinsic_instr *instr)
3388 {
3389 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
3390 Temp rsrc = get_ssa_temp(ctx, instr->src[0].ssa);
3391
3392 Builder bld(ctx->program, ctx->block);
3393
3394 nir_intrinsic_instr* idx_instr = nir_instr_as_intrinsic(instr->src[0].ssa->parent_instr);
3395 unsigned desc_set = nir_intrinsic_desc_set(idx_instr);
3396 unsigned binding = nir_intrinsic_binding(idx_instr);
3397 radv_descriptor_set_layout *layout = ctx->options->layout->set[desc_set].layout;
3398
3399 if (layout->binding[binding].type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) {
3400 uint32_t desc_type = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
3401 S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
3402 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
3403 S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
3404 if (ctx->options->chip_class >= GFX10) {
3405 desc_type |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
3406 S_008F0C_OOB_SELECT(3) |
3407 S_008F0C_RESOURCE_LEVEL(1);
3408 } else {
3409 desc_type |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
3410 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
3411 }
3412 Temp upper_dwords = bld.pseudo(aco_opcode::p_create_vector, bld.def(s3),
3413 Operand(S_008F04_BASE_ADDRESS_HI(ctx->options->address32_hi)),
3414 Operand(0xFFFFFFFFu),
3415 Operand(desc_type));
3416 rsrc = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4),
3417 rsrc, upper_dwords);
3418 } else {
3419 rsrc = convert_pointer_to_64_bit(ctx, rsrc);
3420 rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u));
3421 }
3422
3423 load_buffer(ctx, instr->num_components, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa));
3424 }
3425
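/* load_push_constant: constants promoted to user SGPRs are returned directly,
 * anything else is loaded from the push constant buffer with SMEM. */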
3426 void visit_load_push_constant(isel_context *ctx, nir_intrinsic_instr *instr)
3427 {
3428 Builder bld(ctx->program, ctx->block);
3429 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
3430
3431 unsigned offset = nir_intrinsic_base(instr);
3432 nir_const_value *index_cv = nir_src_as_const_value(instr->src[0]);
3433 if (index_cv && instr->dest.ssa.bit_size == 32) {
3434
3435 unsigned count = instr->dest.ssa.num_components;
3436 unsigned start = (offset + index_cv->u32) / 4u;
3437 start -= ctx->base_inline_push_consts;
3438 if (start + count <= ctx->num_inline_push_consts) {
3439 std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
3440 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
3441 for (unsigned i = 0; i < count; ++i) {
3442 elems[i] = ctx->inline_push_consts[start + i];
3443 vec->operands[i] = Operand{elems[i]};
3444 }
3445 vec->definitions[0] = Definition(dst);
3446 ctx->block->instructions.emplace_back(std::move(vec));
3447 ctx->allocated_vec.emplace(dst.id(), elems);
3448 return;
3449 }
3450 }
3451
3452 Temp index = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
3453 if (offset != 0) // TODO check if index != 0 as well
3454 index = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), Operand(offset), index);
3455 Temp ptr = convert_pointer_to_64_bit(ctx, ctx->push_constants);
3456 Temp vec = dst;
3457 bool trim = false;
3458 aco_opcode op;
3459
3460 switch (dst.size()) {
3461 case 1:
3462 op = aco_opcode::s_load_dword;
3463 break;
3464 case 2:
3465 op = aco_opcode::s_load_dwordx2;
3466 break;
3467 case 3:
3468 vec = bld.tmp(s4);
3469 trim = true;
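/* fallthrough */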
3470 case 4:
3471 op = aco_opcode::s_load_dwordx4;
3472 break;
3473 case 6:
3474 vec = bld.tmp(s8);
3475 trim = true;
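/* fallthrough */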
3476 case 8:
3477 op = aco_opcode::s_load_dwordx8;
3478 break;
3479 default:
3480 unreachable("unimplemented or forbidden load_push_constant.");
3481 }
3482
3483 bld.smem(op, Definition(vec), ptr, index);
3484
3485 if (trim) {
3486 emit_split_vector(ctx, vec, 4);
3487 RegClass rc = dst.size() == 3 ? s1 : s2;
3488 bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
3489 emit_extract_vector(ctx, vec, 0, rc),
3490 emit_extract_vector(ctx, vec, 1, rc),
3491 emit_extract_vector(ctx, vec, 2, rc));
3492
3493 }
3494 emit_split_vector(ctx, dst, instr->dest.ssa.num_components);
3495 }
3496
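/* load_constant: read from the shader's embedded constant data through a
 * buffer descriptor built from p_constaddr and the constant data size. */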
3497 void visit_load_constant(isel_context *ctx, nir_intrinsic_instr *instr)
3498 {
3499 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
3500
3501 Builder bld(ctx->program, ctx->block);
3502
3503 uint32_t desc_type = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
3504 S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
3505 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
3506 S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
3507 if (ctx->options->chip_class >= GFX10) {
3508 desc_type |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
3509 S_008F0C_OOB_SELECT(3) |
3510 S_008F0C_RESOURCE_LEVEL(1);
3511 } else {
3512 desc_type |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
3513 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
3514 }
3515
3516 unsigned base = nir_intrinsic_base(instr);
3517 unsigned range = nir_intrinsic_range(instr);
3518
3519 Temp offset = get_ssa_temp(ctx, instr->src[0].ssa);
3520 if (base && offset.type() == RegType::sgpr)
3521 offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset, Operand(base));
3522 else if (base && offset.type() == RegType::vgpr)
3523 offset = bld.vadd32(bld.def(v1), Operand(base), offset);
3524
3525 Temp rsrc = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4),
3526 bld.sop1(aco_opcode::p_constaddr, bld.def(s2), bld.def(s1, scc), Operand(ctx->constant_data_offset)),
3527 Operand(MIN2(base + range, ctx->shader->constant_data_size)),
3528 Operand(desc_type));
3529
3530 load_buffer(ctx, instr->num_components, dst, rsrc, offset);
3531 }
3532
3533 void visit_discard_if(isel_context *ctx, nir_intrinsic_instr *instr)
3534 {
3535 if (ctx->cf_info.loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
3536 ctx->cf_info.exec_potentially_empty = true;
3537
3538 ctx->program->needs_exact = true;
3539
3540 // TODO: optimize uniform conditions
3541 Builder bld(ctx->program, ctx->block);
3542 Temp src = as_divergent_bool(ctx, get_ssa_temp(ctx, instr->src[0].ssa), false);
3543 src = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2));
3544 bld.pseudo(aco_opcode::p_discard_if, src);
3545 ctx->block->kind |= block_kind_uses_discard_if;
3546 return;
3547 }
3548
3549 void visit_discard(isel_context* ctx, nir_intrinsic_instr *instr)
3550 {
3551 Builder bld(ctx->program, ctx->block);
3552
3553 if (ctx->cf_info.loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
3554 ctx->cf_info.exec_potentially_empty = true;
3555
3556 bool divergent = ctx->cf_info.parent_if.is_divergent ||
3557 ctx->cf_info.parent_loop.has_divergent_continue;
3558
3559 if (ctx->block->loop_nest_depth &&
3560 ((nir_instr_is_last(&instr->instr) && !divergent) || divergent)) {
3561 /* we handle discards the same way as jump instructions */
3562 append_logical_end(ctx->block);
3563
3564 /* in loops, discard behaves like break */
3565 Block *linear_target = ctx->cf_info.parent_loop.exit;
3566 ctx->block->kind |= block_kind_discard;
3567
3568 if (!divergent) {
3569 /* uniform discard - loop ends here */
3570 assert(nir_instr_is_last(&instr->instr));
3571 ctx->block->kind |= block_kind_uniform;
3572 ctx->cf_info.has_branch = true;
3573 bld.branch(aco_opcode::p_branch);
3574 add_linear_edge(ctx->block->index, linear_target);
3575 return;
3576 }
3577
3578 /* we add a break right after the discard instruction(s) */
3579 ctx->block->kind |= block_kind_break;
3580 unsigned idx = ctx->block->index;
3581
3582 /* remove critical edges from linear CFG */
3583 bld.branch(aco_opcode::p_branch);
3584 Block* break_block = ctx->program->create_and_insert_block();
3585 break_block->loop_nest_depth = ctx->cf_info.loop_nest_depth;
3586 break_block->kind |= block_kind_uniform;
3587 add_linear_edge(idx, break_block);
3588 add_linear_edge(break_block->index, linear_target);
3589 bld.reset(break_block);
3590 bld.branch(aco_opcode::p_branch);
3591
3592 Block* continue_block = ctx->program->create_and_insert_block();
3593 continue_block->loop_nest_depth = ctx->cf_info.loop_nest_depth;
3594 add_linear_edge(idx, continue_block);
3595 append_logical_start(continue_block);
3596 ctx->block = continue_block;
3597
3598 return;
3599 }
3600
3601 /* it can currently happen that NIR doesn't remove the unreachable code */
3602 if (!nir_instr_is_last(&instr->instr)) {
3603 ctx->program->needs_exact = true;
3604 /* save exec somewhere temporarily so that it doesn't get
3605 * overwritten before the discard from outer exec masks */
3606 Temp cond = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), Operand(0xFFFFFFFF), Operand(exec, s2));
3607 bld.pseudo(aco_opcode::p_discard_if, cond);
3608 ctx->block->kind |= block_kind_uses_discard_if;
3609 return;
3610 }
3611
3612 /* This condition is incorrect for uniformly branched discards in a loop
3613 * predicated by a divergent condition, but the above code catches that case
3614 * and the discard would end up turning into a discard_if.
3615 * For example:
3616 * if (divergent) {
3617 * while (...) {
3618 * if (uniform) {
3619 * discard;
3620 * }
3621 * }
3622 * }
3623 */
3624 if (!ctx->cf_info.parent_if.is_divergent) {
3625 /* program just ends here */
3626 ctx->block->kind |= block_kind_uniform;
3627 bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1),
3628 0 /* enabled mask */, 9 /* dest */,
3629 false /* compressed */, true/* done */, true /* valid mask */);
3630 bld.sopp(aco_opcode::s_endpgm);
3631 // TODO: it will potentially be followed by a branch which is dead code to sanitize NIR phis
3632 } else {
3633 ctx->block->kind |= block_kind_discard;
3634 /* branch and linear edge is added by visit_if() */
3635 }
3636 }
3637
3638 enum aco_descriptor_type {
3639 ACO_DESC_IMAGE,
3640 ACO_DESC_FMASK,
3641 ACO_DESC_SAMPLER,
3642 ACO_DESC_BUFFER,
3643 ACO_DESC_PLANE_0,
3644 ACO_DESC_PLANE_1,
3645 ACO_DESC_PLANE_2,
3646 };
3647
3648 static bool
3649 should_declare_array(isel_context *ctx, enum glsl_sampler_dim sampler_dim, bool is_array) {
3650 if (sampler_dim == GLSL_SAMPLER_DIM_BUF)
3651 return false;
3652 ac_image_dim dim = ac_get_sampler_dim(ctx->options->chip_class, sampler_dim, is_array);
3653 return dim == ac_image_cube ||
3654 dim == ac_image_1darray ||
3655 dim == ac_image_2darray ||
3656 dim == ac_image_2darraymsaa;
3657 }
3658
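/* Walk an image/sampler deref chain and load the requested descriptor type
 * (image, fmask, sampler, buffer or plane) from its set. Constant array
 * indices are folded into the offset, divergent indices are made uniform with
 * v_readfirstlane_b32, and immutable samplers are materialized as constants. */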
3659 Temp get_sampler_desc(isel_context *ctx, nir_deref_instr *deref_instr,
3660 enum aco_descriptor_type desc_type,
3661 const nir_tex_instr *tex_instr, bool image, bool write)
3662 {
3663 /* FIXME: we should lower the deref with some new nir_intrinsic_load_desc
3664 std::unordered_map<uint64_t, Temp>::iterator it = ctx->tex_desc.find((uint64_t) desc_type << 32 | deref_instr->dest.ssa.index);
3665 if (it != ctx->tex_desc.end())
3666 return it->second;
3667 */
3668 Temp index = Temp();
3669 bool index_set = false;
3670 unsigned constant_index = 0;
3671 unsigned descriptor_set;
3672 unsigned base_index;
3673 Builder bld(ctx->program, ctx->block);
3674
3675 if (!deref_instr) {
3676 assert(tex_instr && !image);
3677 descriptor_set = 0;
3678 base_index = tex_instr->sampler_index;
3679 } else {
3680 while(deref_instr->deref_type != nir_deref_type_var) {
3681 unsigned array_size = glsl_get_aoa_size(deref_instr->type);
3682 if (!array_size)
3683 array_size = 1;
3684
3685 assert(deref_instr->deref_type == nir_deref_type_array);
3686 nir_const_value *const_value = nir_src_as_const_value(deref_instr->arr.index);
3687 if (const_value) {
3688 constant_index += array_size * const_value->u32;
3689 } else {
3690 Temp indirect = get_ssa_temp(ctx, deref_instr->arr.index.ssa);
3691 if (indirect.type() == RegType::vgpr)
3692 indirect = bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), indirect);
3693
3694 if (array_size != 1)
3695 indirect = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(array_size), indirect);
3696
3697 if (!index_set) {
3698 index = indirect;
3699 index_set = true;
3700 } else {
3701 index = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), index, indirect);
3702 }
3703 }
3704
3705 deref_instr = nir_src_as_deref(deref_instr->parent);
3706 }
3707 descriptor_set = deref_instr->var->data.descriptor_set;
3708 base_index = deref_instr->var->data.binding;
3709 }
3710
3711 Temp list = load_desc_ptr(ctx, descriptor_set);
3712 list = convert_pointer_to_64_bit(ctx, list);
3713
3714 struct radv_descriptor_set_layout *layout = ctx->options->layout->set[descriptor_set].layout;
3715 struct radv_descriptor_set_binding_layout *binding = layout->binding + base_index;
3716 unsigned offset = binding->offset;
3717 unsigned stride = binding->size;
3718 aco_opcode opcode;
3719 RegClass type;
3720
3721 assert(base_index < layout->binding_count);
3722
3723 switch (desc_type) {
3724 case ACO_DESC_IMAGE:
3725 type = s8;
3726 opcode = aco_opcode::s_load_dwordx8;
3727 break;
3728 case ACO_DESC_FMASK:
3729 type = s8;
3730 opcode = aco_opcode::s_load_dwordx8;
3731 offset += 32;
3732 break;
3733 case ACO_DESC_SAMPLER:
3734 type = s4;
3735 opcode = aco_opcode::s_load_dwordx4;
3736 if (binding->type == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER)
3737 offset += radv_combined_image_descriptor_sampler_offset(binding);
3738 break;
3739 case ACO_DESC_BUFFER:
3740 type = s4;
3741 opcode = aco_opcode::s_load_dwordx4;
3742 break;
3743 case ACO_DESC_PLANE_0:
3744 case ACO_DESC_PLANE_1:
3745 type = s8;
3746 opcode = aco_opcode::s_load_dwordx8;
3747 offset += 32 * (desc_type - ACO_DESC_PLANE_0);
3748 break;
3749 case ACO_DESC_PLANE_2:
3750 type = s4;
3751 opcode = aco_opcode::s_load_dwordx4;
3752 offset += 64;
3753 break;
3754 default:
3755 unreachable("invalid desc_type\n");
3756 }
3757
3758 offset += constant_index * stride;
3759
3760 if (desc_type == ACO_DESC_SAMPLER && binding->immutable_samplers_offset &&
3761 (!index_set || binding->immutable_samplers_equal)) {
3762 if (binding->immutable_samplers_equal)
3763 constant_index = 0;
3764
3765 const uint32_t *samplers = radv_immutable_samplers(layout, binding);
3766 return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4),
3767 Operand(samplers[constant_index * 4 + 0]),
3768 Operand(samplers[constant_index * 4 + 1]),
3769 Operand(samplers[constant_index * 4 + 2]),
3770 Operand(samplers[constant_index * 4 + 3]));
3771 }
3772
3773 Operand off;
3774 if (!index_set) {
3775 off = Operand(offset);
3776 } else {
3777 off = Operand((Temp)bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), Operand(offset),
3778 bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(stride), index)));
3779 }
3780
3781 Temp res = bld.smem(opcode, bld.def(type), list, off);
3782
3783 if (desc_type == ACO_DESC_PLANE_2) {
3784 Temp components[8];
3785 for (unsigned i = 0; i < 8; i++)
3786 components[i] = bld.tmp(s1);
3787 bld.pseudo(aco_opcode::p_split_vector,
3788 Definition(components[0]),
3789 Definition(components[1]),
3790 Definition(components[2]),
3791 Definition(components[3]),
3792 res);
3793
3794 Temp desc2 = get_sampler_desc(ctx, deref_instr, ACO_DESC_PLANE_1, tex_instr, image, write);
3795 bld.pseudo(aco_opcode::p_split_vector,
3796 bld.def(s1), bld.def(s1), bld.def(s1), bld.def(s1),
3797 Definition(components[4]),
3798 Definition(components[5]),
3799 Definition(components[6]),
3800 Definition(components[7]),
3801 desc2);
3802
3803 res = bld.pseudo(aco_opcode::p_create_vector, bld.def(s8),
3804 components[0], components[1], components[2], components[3],
3805 components[4], components[5], components[6], components[7]);
3806 }
3807
3808 return res;
3809 }
3810
3811 static int image_type_to_components_count(enum glsl_sampler_dim dim, bool array)
3812 {
3813 switch (dim) {
3814 case GLSL_SAMPLER_DIM_BUF:
3815 return 1;
3816 case GLSL_SAMPLER_DIM_1D:
3817 return array ? 2 : 1;
3818 case GLSL_SAMPLER_DIM_2D:
3819 return array ? 3 : 2;
3820 case GLSL_SAMPLER_DIM_MS:
3821 return array ? 4 : 3;
3822 case GLSL_SAMPLER_DIM_3D:
3823 case GLSL_SAMPLER_DIM_CUBE:
3824 return 3;
3825 case GLSL_SAMPLER_DIM_RECT:
3826 case GLSL_SAMPLER_DIM_SUBPASS:
3827 return 2;
3828 case GLSL_SAMPLER_DIM_SUBPASS_MS:
3829 return 3;
3830 default:
3831 break;
3832 }
3833 return 0;
3834 }
3835
3836
3837 /* Adjust the sample index according to FMASK.
3838 *
3839 * For uncompressed MSAA surfaces, FMASK should return 0x76543210,
3840 * which is the identity mapping. Each nibble says which physical sample
3841 * should be fetched to get that sample.
3842 *
3843 * For example, 0x11111100 means there are only 2 samples stored and
3844 * the second sample covers 3/4 of the pixel. When reading samples 0
3845 * and 1, return physical sample 0 (determined by the first two 0s
3846 * in FMASK), otherwise return physical sample 1.
3847 *
3848 * The sample index should be adjusted as follows:
3849 * sample_index = (fmask >> (sample_index * 4)) & 0xF;
3850 */
3851 static Temp adjust_sample_index_using_fmask(isel_context *ctx, bool da, Temp coords, Operand sample_index, Temp fmask_desc_ptr)
3852 {
3853 Builder bld(ctx->program, ctx->block);
3854 Temp fmask = bld.tmp(v1);
3855 unsigned dim = ctx->options->chip_class >= GFX10
3856 ? ac_get_sampler_dim(ctx->options->chip_class, GLSL_SAMPLER_DIM_2D, da)
3857 : 0;
3858
3859 aco_ptr<MIMG_instruction> load{create_instruction<MIMG_instruction>(aco_opcode::image_load, Format::MIMG, 2, 1)};
3860 load->operands[0] = Operand(coords);
3861 load->operands[1] = Operand(fmask_desc_ptr);
3862 load->definitions[0] = Definition(fmask);
3863 load->glc = false;
3864 load->dlc = false;
3865 load->dmask = 0x1;
3866 load->unrm = true;
3867 load->da = da;
3868 load->dim = dim;
3869 load->can_reorder = true; /* fmask images shouldn't be modified */
3870 ctx->block->instructions.emplace_back(std::move(load));
3871
3872 Operand sample_index4;
3873 if (sample_index.isConstant() && sample_index.constantValue() < 16) {
3874 sample_index4 = Operand(sample_index.constantValue() << 2);
3875 } else if (sample_index.regClass() == s1) {
3876 sample_index4 = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), sample_index, Operand(2u));
3877 } else {
3878 assert(sample_index.regClass() == v1);
3879 sample_index4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), sample_index);
3880 }
3881
3882 Temp final_sample;
3883 if (sample_index4.isConstant() && sample_index4.constantValue() == 0)
3884 final_sample = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(15u), fmask);
3885 else if (sample_index4.isConstant() && sample_index4.constantValue() == 28)
3886 final_sample = bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), Operand(28u), fmask);
3887 else
3888 final_sample = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), fmask, sample_index4, Operand(4u));
3889
3890 /* Don't rewrite the sample index if WORD1.DATA_FORMAT of the FMASK
3891 * resource descriptor is 0 (invalid).
3892 */
3893 Temp compare = bld.tmp(s2);
3894 bld.vopc_e64(aco_opcode::v_cmp_lg_u32, Definition(compare),
3895 Operand(0u), emit_extract_vector(ctx, fmask_desc_ptr, 1, s1)).def(0).setHint(vcc);
3896
3897 Temp sample_index_v = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), sample_index);
3898
3899 /* Replace the MSAA sample index. */
3900 return bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), sample_index_v, final_sample, compare);
3901 }
3902
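/* Build the coordinate vector for an image instruction: pack the required
 * components, append the (FMASK-adjusted) sample index for MSAA images and add
 * the zero y coordinate that 1D images need on GFX9. */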
3903 static Temp get_image_coords(isel_context *ctx, const nir_intrinsic_instr *instr, const struct glsl_type *type)
3904 {
3905
3906 Temp src0 = get_ssa_temp(ctx, instr->src[1].ssa);
3907 enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);
3908 bool is_array = glsl_sampler_type_is_array(type);
3909 ASSERTED bool add_frag_pos = (dim == GLSL_SAMPLER_DIM_SUBPASS || dim == GLSL_SAMPLER_DIM_SUBPASS_MS);
3910 assert(!add_frag_pos && "Input attachments should be lowered.");
3911 bool is_ms = (dim == GLSL_SAMPLER_DIM_MS || dim == GLSL_SAMPLER_DIM_SUBPASS_MS);
3912 bool gfx9_1d = ctx->options->chip_class == GFX9 && dim == GLSL_SAMPLER_DIM_1D;
3913 int count = image_type_to_components_count(dim, is_array);
3914 std::vector<Operand> coords(count);
3915
3916 if (is_ms) {
3917 Operand sample_index;
3918 nir_const_value *sample_cv = nir_src_as_const_value(instr->src[2]);
3919 if (sample_cv)
3920 sample_index = Operand(sample_cv->u32);
3921 else
3922 sample_index = Operand(emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[2].ssa), 0, v1));
3923
3924 if (instr->intrinsic == nir_intrinsic_image_deref_load) {
3925 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, is_array ? 3 : 2, 1)};
3926 for (unsigned i = 0; i < vec->operands.size(); i++)
3927 vec->operands[i] = Operand(emit_extract_vector(ctx, src0, i, v1));
3928 Temp fmask_load_address = {ctx->program->allocateId(), is_array ? v3 : v2};
3929 vec->definitions[0] = Definition(fmask_load_address);
3930 ctx->block->instructions.emplace_back(std::move(vec));
3931
3932 Temp fmask_desc_ptr = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_FMASK, nullptr, false, false);
3933 sample_index = Operand(adjust_sample_index_using_fmask(ctx, is_array, fmask_load_address, sample_index, fmask_desc_ptr));
3934 }
3935 count--;
3936 coords[count] = sample_index;
3937 }
3938
3939 if (count == 1 && !gfx9_1d)
3940 return emit_extract_vector(ctx, src0, 0, v1);
3941
3942 if (gfx9_1d) {
3943 coords[0] = Operand(emit_extract_vector(ctx, src0, 0, v1));
3944 coords.resize(coords.size() + 1);
3945 coords[1] = Operand((uint32_t) 0);
3946 if (is_array)
3947 coords[2] = Operand(emit_extract_vector(ctx, src0, 1, v1));
3948 } else {
3949 for (int i = 0; i < count; i++)
3950 coords[i] = Operand(emit_extract_vector(ctx, src0, i, v1));
3951 }
3952
3953 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, coords.size(), 1)};
3954 for (unsigned i = 0; i < coords.size(); i++)
3955 vec->operands[i] = coords[i];
3956 Temp res = {ctx->program->allocateId(), RegClass(RegType::vgpr, coords.size())};
3957 vec->definitions[0] = Definition(res);
3958 ctx->block->instructions.emplace_back(std::move(vec));
3959 return res;
3960 }
3961
3962
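/* image_deref_load: buffer images use MUBUF buffer_load_format_*, all other
 * dimensions use MIMG image_load with the dmask limited to the components that
 * are actually read. */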
3963 void visit_image_load(isel_context *ctx, nir_intrinsic_instr *instr)
3964 {
3965 Builder bld(ctx->program, ctx->block);
3966 const nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
3967 const struct glsl_type *type = glsl_without_array(var->type);
3968 const enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);
3969 bool is_array = glsl_sampler_type_is_array(type);
3970 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
3971
3972 if (dim == GLSL_SAMPLER_DIM_BUF) {
3973 unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa);
3974 unsigned num_channels = util_last_bit(mask);
3975 Temp rsrc = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_BUFFER, nullptr, true, true);
3976 Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
3977
3978 aco_opcode opcode;
3979 switch (num_channels) {
3980 case 1:
3981 opcode = aco_opcode::buffer_load_format_x;
3982 break;
3983 case 2:
3984 opcode = aco_opcode::buffer_load_format_xy;
3985 break;
3986 case 3:
3987 opcode = aco_opcode::buffer_load_format_xyz;
3988 break;
3989 case 4:
3990 opcode = aco_opcode::buffer_load_format_xyzw;
3991 break;
3992 default:
3993 unreachable(">4 channel buffer image load");
3994 }
3995 aco_ptr<MUBUF_instruction> load{create_instruction<MUBUF_instruction>(opcode, Format::MUBUF, 3, 1)};
3996 load->operands[0] = Operand(vindex);
3997 load->operands[1] = Operand(rsrc);
3998 load->operands[2] = Operand((uint32_t) 0);
3999 Temp tmp;
4000 if (num_channels == instr->dest.ssa.num_components && dst.type() == RegType::vgpr)
4001 tmp = dst;
4002 else
4003 tmp = {ctx->program->allocateId(), RegClass(RegType::vgpr, num_channels)};
4004 load->definitions[0] = Definition(tmp);
4005 load->idxen = true;
4006 load->barrier = barrier_image;
4007 ctx->block->instructions.emplace_back(std::move(load));
4008
4009 expand_vector(ctx, tmp, dst, instr->dest.ssa.num_components, (1 << num_channels) - 1);
4010 return;
4011 }
4012
4013 Temp coords = get_image_coords(ctx, instr, type);
4014 Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_IMAGE, nullptr, true, true);
4015
4016 unsigned dmask = nir_ssa_def_components_read(&instr->dest.ssa);
4017 unsigned num_components = util_bitcount(dmask);
4018 Temp tmp;
4019 if (num_components == instr->dest.ssa.num_components && dst.type() == RegType::vgpr)
4020 tmp = dst;
4021 else
4022 tmp = {ctx->program->allocateId(), RegClass(RegType::vgpr, num_components)};
4023
4024 aco_ptr<MIMG_instruction> load{create_instruction<MIMG_instruction>(aco_opcode::image_load, Format::MIMG, 2, 1)};
4025 load->operands[0] = Operand(coords);
4026 load->operands[1] = Operand(resource);
4027 load->definitions[0] = Definition(tmp);
4028 load->glc = var->data.image.access & (ACCESS_VOLATILE | ACCESS_COHERENT) ? 1 : 0;
4029 load->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);
4030 load->dmask = dmask;
4031 load->unrm = true;
4032 load->da = should_declare_array(ctx, dim, glsl_sampler_type_is_array(type));
4033 load->barrier = barrier_image;
4034 ctx->block->instructions.emplace_back(std::move(load));
4035
4036 expand_vector(ctx, tmp, dst, instr->dest.ssa.num_components, dmask);
4037 return;
4038 }
4039
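/* image_deref_store: buffer images use MUBUF buffer_store_format_*, all other
 * dimensions use MIMG image_store. Stores disable WQM and mark the program
 * exact so they don't execute for helper invocations. */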
4040 void visit_image_store(isel_context *ctx, nir_intrinsic_instr *instr)
4041 {
4042 const nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
4043 const struct glsl_type *type = glsl_without_array(var->type);
4044 const enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);
4045 bool is_array = glsl_sampler_type_is_array(type);
4046 Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[3].ssa));
4047
4048 bool glc = ctx->options->chip_class == GFX6 || (var->data.image.access & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE));
4049
4050 if (dim == GLSL_SAMPLER_DIM_BUF) {
4051 Temp rsrc = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_BUFFER, nullptr, true, true);
4052 Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
4053 aco_opcode opcode;
4054 switch (data.size()) {
4055 case 1:
4056 opcode = aco_opcode::buffer_store_format_x;
4057 break;
4058 case 2:
4059 opcode = aco_opcode::buffer_store_format_xy;
4060 break;
4061 case 3:
4062 opcode = aco_opcode::buffer_store_format_xyz;
4063 break;
4064 case 4:
4065 opcode = aco_opcode::buffer_store_format_xyzw;
4066 break;
4067 default:
4068 unreachable(">4 channel buffer image store");
4069 }
4070 aco_ptr<MUBUF_instruction> store{create_instruction<MUBUF_instruction>(opcode, Format::MUBUF, 4, 0)};
4071 store->operands[0] = Operand(vindex);
4072 store->operands[1] = Operand(rsrc);
4073 store->operands[2] = Operand((uint32_t) 0);
4074 store->operands[3] = Operand(data);
4075 store->idxen = true;
4076 store->glc = glc;
4077 store->dlc = false;
4078 store->disable_wqm = true;
4079 store->barrier = barrier_image;
4080 ctx->program->needs_exact = true;
4081 ctx->block->instructions.emplace_back(std::move(store));
4082 return;
4083 }
4084
4085 assert(data.type() == RegType::vgpr);
4086 Temp coords = get_image_coords(ctx, instr, type);
4087 Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_IMAGE, nullptr, true, true);
4088
4089 aco_ptr<MIMG_instruction> store{create_instruction<MIMG_instruction>(aco_opcode::image_store, Format::MIMG, 4, 0)};
4090 store->operands[0] = Operand(coords);
4091 store->operands[1] = Operand(resource);
4092 store->operands[2] = Operand(s4);
4093 store->operands[3] = Operand(data);
4094 store->glc = glc;
4095 store->dlc = false;
4096 store->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);
4097 store->dmask = (1 << data.size()) - 1;
4098 store->unrm = true;
4099 store->da = should_declare_array(ctx, dim, glsl_sampler_type_is_array(type));
4100 store->disable_wqm = true;
4101 store->barrier = barrier_image;
4102 ctx->program->needs_exact = true;
4103 ctx->block->instructions.emplace_back(std::move(store));
4104 return;
4105 }
4106
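/* image_deref_atomic_*: map the NIR atomic to the corresponding MUBUF/MIMG
 * atomic opcode. The previous value is only returned (glc=1) when the NIR
 * destination is actually used. */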
4107 void visit_image_atomic(isel_context *ctx, nir_intrinsic_instr *instr)
4108 {
4109 /* return the previous value if dest is ever used */
4110 bool return_previous = false;
4111 nir_foreach_use_safe(use_src, &instr->dest.ssa) {
4112 return_previous = true;
4113 break;
4114 }
4115 nir_foreach_if_use_safe(use_src, &instr->dest.ssa) {
4116 return_previous = true;
4117 break;
4118 }
4119
4120 const nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
4121 const struct glsl_type *type = glsl_without_array(var->type);
4122 const enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);
4123 bool is_array = glsl_sampler_type_is_array(type);
4124 Builder bld(ctx->program, ctx->block);
4125
4126 Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[3].ssa));
4127 assert(data.size() == 1 && "64-bit image atomics not yet implemented.");
4128
4129 if (instr->intrinsic == nir_intrinsic_image_deref_atomic_comp_swap)
4130 data = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), get_ssa_temp(ctx, instr->src[4].ssa), data);
4131
4132 aco_opcode buf_op, image_op;
4133 switch (instr->intrinsic) {
4134 case nir_intrinsic_image_deref_atomic_add:
4135 buf_op = aco_opcode::buffer_atomic_add;
4136 image_op = aco_opcode::image_atomic_add;
4137 break;
4138 case nir_intrinsic_image_deref_atomic_umin:
4139 buf_op = aco_opcode::buffer_atomic_umin;
4140 image_op = aco_opcode::image_atomic_umin;
4141 break;
4142 case nir_intrinsic_image_deref_atomic_imin:
4143 buf_op = aco_opcode::buffer_atomic_smin;
4144 image_op = aco_opcode::image_atomic_smin;
4145 break;
4146 case nir_intrinsic_image_deref_atomic_umax:
4147 buf_op = aco_opcode::buffer_atomic_umax;
4148 image_op = aco_opcode::image_atomic_umax;
4149 break;
4150 case nir_intrinsic_image_deref_atomic_imax:
4151 buf_op = aco_opcode::buffer_atomic_smax;
4152 image_op = aco_opcode::image_atomic_smax;
4153 break;
4154 case nir_intrinsic_image_deref_atomic_and:
4155 buf_op = aco_opcode::buffer_atomic_and;
4156 image_op = aco_opcode::image_atomic_and;
4157 break;
4158 case nir_intrinsic_image_deref_atomic_or:
4159 buf_op = aco_opcode::buffer_atomic_or;
4160 image_op = aco_opcode::image_atomic_or;
4161 break;
4162 case nir_intrinsic_image_deref_atomic_xor:
4163 buf_op = aco_opcode::buffer_atomic_xor;
4164 image_op = aco_opcode::image_atomic_xor;
4165 break;
4166 case nir_intrinsic_image_deref_atomic_exchange:
4167 buf_op = aco_opcode::buffer_atomic_swap;
4168 image_op = aco_opcode::image_atomic_swap;
4169 break;
4170 case nir_intrinsic_image_deref_atomic_comp_swap:
4171 buf_op = aco_opcode::buffer_atomic_cmpswap;
4172 image_op = aco_opcode::image_atomic_cmpswap;
4173 break;
4174 default:
4175 unreachable("visit_image_atomic should only be called with nir_intrinsic_image_deref_atomic_* instructions.");
4176 }
4177
4178 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
4179
4180 if (dim == GLSL_SAMPLER_DIM_BUF) {
4181 Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
4182 Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_BUFFER, nullptr, true, true);
4183 //assert(ctx->options->chip_class < GFX9 && "GFX9 stride size workaround not yet implemented.");
4184 aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(buf_op, Format::MUBUF, 4, return_previous ? 1 : 0)};
4185 mubuf->operands[0] = Operand(vindex);
4186 mubuf->operands[1] = Operand(resource);
4187 mubuf->operands[2] = Operand((uint32_t)0);
4188 mubuf->operands[3] = Operand(data);
4189 if (return_previous)
4190 mubuf->definitions[0] = Definition(dst);
4191 mubuf->offset = 0;
4192 mubuf->idxen = true;
4193 mubuf->glc = return_previous;
4194 mubuf->dlc = false; /* Not needed for atomics */
4195 mubuf->disable_wqm = true;
4196 mubuf->barrier = barrier_image;
4197 ctx->program->needs_exact = true;
4198 ctx->block->instructions.emplace_back(std::move(mubuf));
4199 return;
4200 }
4201
4202 Temp coords = get_image_coords(ctx, instr, type);
4203 Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_IMAGE, nullptr, true, true);
4204 aco_ptr<MIMG_instruction> mimg{create_instruction<MIMG_instruction>(image_op, Format::MIMG, 4, return_previous ? 1 : 0)};
4205 mimg->operands[0] = Operand(coords);
4206 mimg->operands[1] = Operand(resource);
4207 mimg->operands[2] = Operand(s4); /* no sampler */
4208 mimg->operands[3] = Operand(data);
4209 if (return_previous)
4210 mimg->definitions[0] = Definition(dst);
4211 mimg->glc = return_previous;
4212 mimg->dlc = false; /* Not needed for atomics */
4213 mimg->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);
4214 mimg->dmask = (1 << data.size()) - 1;
4215 mimg->unrm = true;
4216 mimg->da = should_declare_array(ctx, dim, glsl_sampler_type_is_array(type));
4217 mimg->disable_wqm = true;
4218 mimg->barrier = barrier_image;
4219 ctx->program->needs_exact = true;
4220 ctx->block->instructions.emplace_back(std::move(mimg));
4221 return;
4222 }
4223
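/* Return the size of a buffer descriptor. For element counts on GFX8 the size
 * field has to be divided by the descriptor's stride, done with
 * v_rcp_iflag_f32 and float multiplies since there is no integer division;
 * otherwise the size field is returned as-is. */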
4224 void get_buffer_size(isel_context *ctx, Temp desc, Temp dst, bool in_elements)
4225 {
4226 if (in_elements && ctx->options->chip_class == GFX8) {
4227 Builder bld(ctx->program, ctx->block);
4228
4229 Temp stride = emit_extract_vector(ctx, desc, 1, s1);
4230 stride = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), stride, Operand((5u << 16) | 16u));
4231 stride = bld.vop1(aco_opcode::v_cvt_f32_ubyte0, bld.def(v1), stride);
4232 stride = bld.vop1(aco_opcode::v_rcp_iflag_f32, bld.def(v1), stride);
4233
4234 Temp size = emit_extract_vector(ctx, desc, 2, s1);
4235 size = bld.vop1(aco_opcode::v_cvt_f32_u32, bld.def(v1), size);
4236
4237 Temp res = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), size, stride);
4238 res = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), res);
4239 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), res);
4240
4241 // TODO: we can probably calculate this faster on the scalar unit to do: size / stride{1,2,4,8,12,16}
4242 /* idea
4243 * for 1,2,4,8,16, the result is just (stride >> S_FF1_I32_B32)
4244 * in case 12 (or 3?), we have to divide by 3:
4245 * set v_skip in case it's 12 (if we also have to take care of 3, shift first)
4246 * use v_mul_hi_u32 with magic number to divide
4247 * we need some pseudo merge opcode to overwrite the original SALU result with readfirstlane
4248 * disable v_skip
4249 * total: 6 SALU + 2 VALU instructions vs 1 SALU + 6 VALU instructions
4250 */
4251
4252 } else {
4253 emit_extract_vector(ctx, desc, 2, dst);
4254 }
4255 }
4256
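/* image_deref_size: buffer images take the size from the descriptor, other
 * images use image_get_resinfo. Cube arrays divide the layer count by 6, and
 * GFX9 1D arrays use dmask 0x5 to fetch width and layer count. */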
4257 void visit_image_size(isel_context *ctx, nir_intrinsic_instr *instr)
4258 {
4259 const nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
4260 const struct glsl_type *type = glsl_without_array(var->type);
4261 const enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);
4262 bool is_array = glsl_sampler_type_is_array(type);
4263 Builder bld(ctx->program, ctx->block);
4264
4265 if (glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_BUF) {
4266 Temp desc = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_BUFFER, NULL, true, false);
4267 return get_buffer_size(ctx, desc, get_ssa_temp(ctx, &instr->dest.ssa), true);
4268 }
4269
4270 /* LOD */
4271 Temp lod = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0u));
4272
4273 /* Resource */
4274 Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_IMAGE, NULL, true, false);
4275
4276 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
4277
4278 aco_ptr<MIMG_instruction> mimg{create_instruction<MIMG_instruction>(aco_opcode::image_get_resinfo, Format::MIMG, 2, 1)};
4279 mimg->operands[0] = Operand(lod);
4280 mimg->operands[1] = Operand(resource);
4281 unsigned& dmask = mimg->dmask;
4282 mimg->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);
4283 mimg->dmask = (1 << instr->dest.ssa.num_components) - 1;
4284 mimg->da = glsl_sampler_type_is_array(type);
4285 mimg->can_reorder = true;
4286 Definition& def = mimg->definitions[0];
4287 ctx->block->instructions.emplace_back(std::move(mimg));
4288
4289 if (glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_CUBE &&
4290 glsl_sampler_type_is_array(type)) {
4291
4292 assert(instr->dest.ssa.num_components == 3);
4293 Temp tmp = {ctx->program->allocateId(), v3};
4294 def = Definition(tmp);
4295 emit_split_vector(ctx, tmp, 3);
4296
4297 /* divide 3rd value by 6 by multiplying with magic number */
4298 Temp c = bld.copy(bld.def(s1), Operand((uint32_t) 0x2AAAAAAB));
4299 Temp by_6 = bld.vop3(aco_opcode::v_mul_hi_i32, bld.def(v1), emit_extract_vector(ctx, tmp, 2, v1), c);
4300
4301 bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
4302 emit_extract_vector(ctx, tmp, 0, v1),
4303 emit_extract_vector(ctx, tmp, 1, v1),
4304 by_6);
4305
4306 } else if (ctx->options->chip_class == GFX9 &&
4307 glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_1D &&
4308 glsl_sampler_type_is_array(type)) {
4309 assert(instr->dest.ssa.num_components == 2);
4310 def = Definition(dst);
4311 dmask = 0x5;
4312 } else {
4313 def = Definition(dst);
4314 }
4315
4316 emit_split_vector(ctx, dst, instr->dest.ssa.num_components);
4317 }
4318
4319 void visit_load_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
4320 {
4321 Builder bld(ctx->program, ctx->block);
4322 unsigned num_components = instr->num_components;
4323
4324 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
4325 Temp rsrc = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
4326 rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u));
4327
4328 bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT);
4329 load_buffer(ctx, num_components, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa), glc);
4330 }
4331
4332 void visit_store_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
4333 {
4334 Builder bld(ctx->program, ctx->block);
4335 Temp data = get_ssa_temp(ctx, instr->src[0].ssa);
4336 unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
4337 unsigned writemask = nir_intrinsic_write_mask(instr);
4338
4339 Temp offset;
4340 if (ctx->options->chip_class < GFX8)
4341 offset = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[2].ssa));
4342 else
4343 offset = get_ssa_temp(ctx, instr->src[2].ssa);
4344
4345 Temp rsrc = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
4346 rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u));
4347
4348 bool smem = !ctx->divergent_vals[instr->src[2].ssa->index] &&
4349 ctx->options->chip_class >= GFX8;
4350 if (smem)
4351 offset = bld.as_uniform(offset);
4352 bool smem_nonfs = smem && ctx->stage != fragment_fs;
4353
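/* Split the store into runs of consecutive enabled components; each run is
 * emitted as one buffer store of at most four dwords. */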
4354 while (writemask) {
4355 int start, count;
4356 u_bit_scan_consecutive_range(&writemask, &start, &count);
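/* There is no s_buffer_store_dwordx3: store two dwords now and put the third
 * component back into the writemask for the next iteration. */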
4357 if (count == 3 && smem) {
4358 writemask |= 1u << (start + 2);
4359 count = 2;
4360 }
4361 int num_bytes = count * elem_size_bytes;
4362
4363 if (num_bytes > 16) {
4364 assert(elem_size_bytes == 8);
4365 writemask |= (((count - 2) << 1) - 1) << (start + 2);
4366 count = 2;
4367 num_bytes = 16;
4368 }
4369
4370 // TODO: check alignment of sub-dword stores
4371 // TODO: split 3 bytes. there is no store instruction for that
4372
4373 Temp write_data;
4374 if (count != instr->num_components) {
4375 emit_split_vector(ctx, data, instr->num_components);
4376 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
4377 for (int i = 0; i < count; i++) {
4378 Temp elem = emit_extract_vector(ctx, data, start + i, RegClass(data.type(), elem_size_bytes / 4));
4379 vec->operands[i] = Operand(smem_nonfs ? bld.as_uniform(elem) : elem);
4380 }
4381 write_data = bld.tmp(smem_nonfs ? RegType::sgpr : data.type(), count * elem_size_bytes / 4);
4382 vec->definitions[0] = Definition(write_data);
4383 ctx->block->instructions.emplace_back(std::move(vec));
4384 } else if (!smem && data.type() != RegType::vgpr) {
4385 assert(num_bytes % 4 == 0);
4386 write_data = bld.copy(bld.def(RegType::vgpr, num_bytes / 4), data);
4387 } else if (smem_nonfs && data.type() == RegType::vgpr) {
4388 assert(num_bytes % 4 == 0);
4389 write_data = bld.as_uniform(data);
4390 } else {
4391 write_data = data;
4392 }
4393
4394 aco_opcode vmem_op, smem_op;
4395 switch (num_bytes) {
4396 case 4:
4397 vmem_op = aco_opcode::buffer_store_dword;
4398 smem_op = aco_opcode::s_buffer_store_dword;
4399 break;
4400 case 8:
4401 vmem_op = aco_opcode::buffer_store_dwordx2;
4402 smem_op = aco_opcode::s_buffer_store_dwordx2;
4403 break;
4404 case 12:
4405 vmem_op = aco_opcode::buffer_store_dwordx3;
4406 smem_op = aco_opcode::last_opcode;
4407 assert(!smem);
4408 break;
4409 case 16:
4410 vmem_op = aco_opcode::buffer_store_dwordx4;
4411 smem_op = aco_opcode::s_buffer_store_dwordx4;
4412 break;
4413 default:
4414 unreachable("Store SSBO not implemented for this size.");
4415 }
4416 if (ctx->stage == fragment_fs)
4417 smem_op = aco_opcode::p_fs_buffer_store_smem;
4418
4419 if (smem) {
4420 aco_ptr<SMEM_instruction> store{create_instruction<SMEM_instruction>(smem_op, Format::SMEM, 3, 0)};
4421 store->operands[0] = Operand(rsrc);
4422 if (start) {
4423 Temp off = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
4424 offset, Operand(start * elem_size_bytes));
4425 store->operands[1] = Operand(off);
4426 } else {
4427 store->operands[1] = Operand(offset);
4428 }
4429 if (smem_op != aco_opcode::p_fs_buffer_store_smem)
4430 store->operands[1].setFixed(m0);
4431 store->operands[2] = Operand(write_data);
4432 store->glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
4433 store->dlc = false;
4434 store->disable_wqm = true;
4435 store->barrier = barrier_buffer;
4436 ctx->block->instructions.emplace_back(std::move(store));
4437 ctx->program->wb_smem_l1_on_end = true;
4438 if (smem_op == aco_opcode::p_fs_buffer_store_smem) {
4439 ctx->block->kind |= block_kind_needs_lowering;
4440 ctx->program->needs_exact = true;
4441 }
4442 } else {
4443 aco_ptr<MUBUF_instruction> store{create_instruction<MUBUF_instruction>(vmem_op, Format::MUBUF, 4, 0)};
4444 store->operands[0] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
4445 store->operands[1] = Operand(rsrc);
4446 store->operands[2] = offset.type() == RegType::sgpr ? Operand(offset) : Operand((uint32_t) 0);
4447 store->operands[3] = Operand(write_data);
4448 store->offset = start * elem_size_bytes;
4449 store->offen = (offset.type() == RegType::vgpr);
4450 store->glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
4451 store->dlc = false;
4452 store->disable_wqm = true;
4453 store->barrier = barrier_buffer;
4454 ctx->program->needs_exact = true;
4455 ctx->block->instructions.emplace_back(std::move(store));
4456 }
4457 }
4458 }
4459
4460 void visit_atomic_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
4461 {
4462 /* return the previous value if dest is ever used */
4463 bool return_previous = false;
4464 nir_foreach_use_safe(use_src, &instr->dest.ssa) {
4465 return_previous = true;
4466 break;
4467 }
4468 nir_foreach_if_use_safe(use_src, &instr->dest.ssa) {
4469 return_previous = true;
4470 break;
4471 }
4472
4473 Builder bld(ctx->program, ctx->block);
4474 Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[2].ssa));
4475
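/* For compare-and-swap, pack the compare value and the swap value into one
 * VGPR vector of twice the data size. */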
4476 if (instr->intrinsic == nir_intrinsic_ssbo_atomic_comp_swap)
4477 data = bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, data.size() * 2),
4478 get_ssa_temp(ctx, instr->src[3].ssa), data);
4479
4480 Temp offset;
4481 if (ctx->options->chip_class < GFX8)
4482 offset = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
4483 else
4484 offset = get_ssa_temp(ctx, instr->src[1].ssa);
4485
4486 Temp rsrc = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
4487 rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u));
4488
4489 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
4490
4491 aco_opcode op32, op64;
4492 switch (instr->intrinsic) {
4493 case nir_intrinsic_ssbo_atomic_add:
4494 op32 = aco_opcode::buffer_atomic_add;
4495 op64 = aco_opcode::buffer_atomic_add_x2;
4496 break;
4497 case nir_intrinsic_ssbo_atomic_imin:
4498 op32 = aco_opcode::buffer_atomic_smin;
4499 op64 = aco_opcode::buffer_atomic_smin_x2;
4500 break;
4501 case nir_intrinsic_ssbo_atomic_umin:
4502 op32 = aco_opcode::buffer_atomic_umin;
4503 op64 = aco_opcode::buffer_atomic_umin_x2;
4504 break;
4505 case nir_intrinsic_ssbo_atomic_imax:
4506 op32 = aco_opcode::buffer_atomic_smax;
4507 op64 = aco_opcode::buffer_atomic_smax_x2;
4508 break;
4509 case nir_intrinsic_ssbo_atomic_umax:
4510 op32 = aco_opcode::buffer_atomic_umax;
4511 op64 = aco_opcode::buffer_atomic_umax_x2;
4512 break;
4513 case nir_intrinsic_ssbo_atomic_and:
4514 op32 = aco_opcode::buffer_atomic_and;
4515 op64 = aco_opcode::buffer_atomic_and_x2;
4516 break;
4517 case nir_intrinsic_ssbo_atomic_or:
4518 op32 = aco_opcode::buffer_atomic_or;
4519 op64 = aco_opcode::buffer_atomic_or_x2;
4520 break;
4521 case nir_intrinsic_ssbo_atomic_xor:
4522 op32 = aco_opcode::buffer_atomic_xor;
4523 op64 = aco_opcode::buffer_atomic_xor_x2;
4524 break;
4525 case nir_intrinsic_ssbo_atomic_exchange:
4526 op32 = aco_opcode::buffer_atomic_swap;
4527 op64 = aco_opcode::buffer_atomic_swap_x2;
4528 break;
4529 case nir_intrinsic_ssbo_atomic_comp_swap:
4530 op32 = aco_opcode::buffer_atomic_cmpswap;
4531 op64 = aco_opcode::buffer_atomic_cmpswap_x2;
4532 break;
4533 default:
4534 unreachable("visit_atomic_ssbo should only be called with nir_intrinsic_ssbo_atomic_* instructions.");
4535 }
4536 aco_opcode op = instr->dest.ssa.bit_size == 32 ? op32 : op64;
4537 aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, return_previous ? 1 : 0)};
4538 mubuf->operands[0] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
4539 mubuf->operands[1] = Operand(rsrc);
4540 mubuf->operands[2] = offset.type() == RegType::sgpr ? Operand(offset) : Operand((uint32_t) 0);
4541 mubuf->operands[3] = Operand(data);
4542 if (return_previous)
4543 mubuf->definitions[0] = Definition(dst);
4544 mubuf->offset = 0;
4545 mubuf->offen = (offset.type() == RegType::vgpr);
4546 mubuf->glc = return_previous;
4547 mubuf->dlc = false; /* Not needed for atomics */
4548 mubuf->disable_wqm = true;
4549 mubuf->barrier = barrier_buffer;
4550 ctx->program->needs_exact = true;
4551 ctx->block->instructions.emplace_back(std::move(mubuf));
4552 }
4553
4554 void visit_get_buffer_size(isel_context *ctx, nir_intrinsic_instr *instr) {
4555
4556 Temp index = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
4557 Builder bld(ctx->program, ctx->block);
4558 Temp desc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), index, Operand(0u));
4559 get_buffer_size(ctx, desc, get_ssa_temp(ctx, &instr->dest.ssa), false);
4560 }
4561
4562 void visit_load_global(isel_context *ctx, nir_intrinsic_instr *instr)
4563 {
4564 Builder bld(ctx->program, ctx->block);
4565 unsigned num_components = instr->num_components;
4566 unsigned num_bytes = num_components * instr->dest.ssa.bit_size / 8;
4567
4568 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
4569 Temp addr = get_ssa_temp(ctx, instr->src[0].ssa);
4570
4571 bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT);
4572 bool dlc = glc && ctx->options->chip_class >= GFX10;
4573 aco_opcode op;
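/* Use FLAT/GLOBAL when the result is divergent or when a coherent load is
 * needed on GFX6/7 (SMEM only supports GLC on GFX8+); otherwise use SMEM. */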
4574 if (dst.type() == RegType::vgpr || (glc && ctx->options->chip_class < GFX8)) {
4575 bool global = ctx->options->chip_class >= GFX9;
4576 aco_opcode op;
4577 switch (num_bytes) {
4578 case 4:
4579 op = global ? aco_opcode::global_load_dword : aco_opcode::flat_load_dword;
4580 break;
4581 case 8:
4582 op = global ? aco_opcode::global_load_dwordx2 : aco_opcode::flat_load_dwordx2;
4583 break;
4584 case 12:
4585 op = global ? aco_opcode::global_load_dwordx3 : aco_opcode::flat_load_dwordx3;
4586 break;
4587 case 16:
4588 op = global ? aco_opcode::global_load_dwordx4 : aco_opcode::flat_load_dwordx4;
4589 break;
4590 default:
4591 unreachable("load_global not implemented for this size.");
4592 }
4593 aco_ptr<FLAT_instruction> flat{create_instruction<FLAT_instruction>(op, global ? Format::GLOBAL : Format::FLAT, 2, 1)};
4594 flat->operands[0] = Operand(addr);
4595 flat->operands[1] = Operand(s1);
4596 flat->glc = glc;
4597 flat->dlc = dlc;
4598
4599 if (dst.type() == RegType::sgpr) {
4600 Temp vec = bld.tmp(RegType::vgpr, dst.size());
4601 flat->definitions[0] = Definition(vec);
4602 ctx->block->instructions.emplace_back(std::move(flat));
4603 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec);
4604 } else {
4605 flat->definitions[0] = Definition(dst);
4606 ctx->block->instructions.emplace_back(std::move(flat));
4607 }
4608 emit_split_vector(ctx, dst, num_components);
4609 } else {
4610 switch (num_bytes) {
4611 case 4:
4612 op = aco_opcode::s_load_dword;
4613 break;
4614 case 8:
4615 op = aco_opcode::s_load_dwordx2;
4616 break;
4617 case 12:
4618 case 16:
4619 op = aco_opcode::s_load_dwordx4;
4620 break;
4621 default:
4622 unreachable("load_global not implemented for this size.");
4623 }
4624 aco_ptr<SMEM_instruction> load{create_instruction<SMEM_instruction>(op, Format::SMEM, 2, 1)};
4625 load->operands[0] = Operand(addr);
4626 load->operands[1] = Operand(0u);
4627 load->definitions[0] = Definition(dst);
4628 load->glc = glc;
4629 load->dlc = dlc;
4630 load->barrier = barrier_buffer;
4631 assert(ctx->options->chip_class >= GFX8 || !glc);
4632
4633 if (dst.size() == 3) {
4634 /* trim vector */
4635 Temp vec = bld.tmp(s4);
4636 load->definitions[0] = Definition(vec);
4637 ctx->block->instructions.emplace_back(std::move(load));
4638 emit_split_vector(ctx, vec, 4);
4639
4640 bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
4641 emit_extract_vector(ctx, vec, 0, s1),
4642 emit_extract_vector(ctx, vec, 1, s1),
4643 emit_extract_vector(ctx, vec, 2, s1));
4644 } else {
4645 ctx->block->instructions.emplace_back(std::move(load));
4646 }
4647 }
4648 }
4649
4650 void visit_store_global(isel_context *ctx, nir_intrinsic_instr *instr)
4651 {
4652 Builder bld(ctx->program, ctx->block);
4653 unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
4654
4655 Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
4656 Temp addr = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
4657
4658 unsigned writemask = nir_intrinsic_write_mask(instr);
4659 while (writemask) {
4660 int start, count;
4661 u_bit_scan_consecutive_range(&writemask, &start, &count);
4662 unsigned num_bytes = count * elem_size_bytes;
4663
4664 Temp write_data = data;
4665 if (count != instr->num_components) {
4666 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
4667 for (int i = 0; i < count; i++)
4668 vec->operands[i] = Operand(emit_extract_vector(ctx, data, start + i, v1));
4669 write_data = bld.tmp(RegType::vgpr, count);
4670 vec->definitions[0] = Definition(write_data);
4671 ctx->block->instructions.emplace_back(std::move(vec));
4672 }
4673
4674 unsigned offset = start * elem_size_bytes;
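/* FLAT before GFX9 has no immediate offset, so fold the offset into the
 * 64-bit address with an add-with-carry. */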
4675 if (offset > 0 && ctx->options->chip_class < GFX9) {
4676 Temp addr0 = bld.tmp(v1), addr1 = bld.tmp(v1);
4677 Temp new_addr0 = bld.tmp(v1), new_addr1 = bld.tmp(v1);
4678 Temp carry = bld.tmp(s2);
4679 bld.pseudo(aco_opcode::p_split_vector, Definition(addr0), Definition(addr1), addr);
4680
4681 bld.vop2(aco_opcode::v_add_co_u32, Definition(new_addr0), bld.hint_vcc(Definition(carry)),
4682 Operand(offset), addr0);
4683 bld.vop2(aco_opcode::v_addc_co_u32, Definition(new_addr1), bld.def(s2),
4684 Operand(0u), addr1,
4685 carry).def(1).setHint(vcc);
4686
4687 addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), new_addr0, new_addr1);
4688
4689 offset = 0;
4690 }
4691
4692 bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
4693 bool global = ctx->options->chip_class >= GFX9;
4694 aco_opcode op;
4695 switch (num_bytes) {
4696 case 4:
4697 op = global ? aco_opcode::global_store_dword : aco_opcode::flat_store_dword;
4698 break;
4699 case 8:
4700 op = global ? aco_opcode::global_store_dwordx2 : aco_opcode::flat_store_dwordx2;
4701 break;
4702 case 12:
4703 op = global ? aco_opcode::global_store_dwordx3 : aco_opcode::flat_store_dwordx3;
4704 break;
4705 case 16:
4706 op = global ? aco_opcode::global_store_dwordx4 : aco_opcode::flat_store_dwordx4;
4707 break;
4708 default:
4709 unreachable("store_global not implemented for this size.");
4710 }
4711 aco_ptr<FLAT_instruction> flat{create_instruction<FLAT_instruction>(op, global ? Format::GLOBAL : Format::FLAT, 3, 0)};
4712 flat->operands[0] = Operand(addr);
4713 flat->operands[1] = Operand(s1);
4714 flat->operands[2] = Operand(data);
4715 flat->glc = glc;
4716 flat->dlc = false;
4717 flat->offset = offset;
4718 ctx->block->instructions.emplace_back(std::move(flat));
4719 }
4720 }
4721
4722 void emit_memory_barrier(isel_context *ctx, nir_intrinsic_instr *instr) {
4723 Builder bld(ctx->program, ctx->block);
4724 switch(instr->intrinsic) {
4725 case nir_intrinsic_group_memory_barrier:
4726 case nir_intrinsic_memory_barrier:
4727 bld.barrier(aco_opcode::p_memory_barrier_all);
4728 break;
4729 case nir_intrinsic_memory_barrier_atomic_counter:
4730 bld.barrier(aco_opcode::p_memory_barrier_atomic);
4731 break;
4732 case nir_intrinsic_memory_barrier_buffer:
4733 bld.barrier(aco_opcode::p_memory_barrier_buffer);
4734 break;
4735 case nir_intrinsic_memory_barrier_image:
4736 bld.barrier(aco_opcode::p_memory_barrier_image);
4737 break;
4738 case nir_intrinsic_memory_barrier_shared:
4739 bld.barrier(aco_opcode::p_memory_barrier_shared);
4740 break;
4741 default:
4742 unreachable("Unimplemented memory barrier intrinsic");
4743 break;
4744 }
4745 }
4746
4747 void visit_load_shared(isel_context *ctx, nir_intrinsic_instr *instr)
4748 {
4749 // TODO: implement sparse reads using ds_read2_b32 and nir_ssa_def_components_read()
4750 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
4751 assert(instr->dest.ssa.bit_size >= 32 && "Bitsize not supported in load_shared.");
4752 Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
4753 Builder bld(ctx->program, ctx->block);
4754
4755 unsigned elem_size_bytes = instr->dest.ssa.bit_size / 8;
4756 unsigned align = nir_intrinsic_align_mul(instr) ? nir_intrinsic_align(instr) : elem_size_bytes;
4757 load_lds(ctx, elem_size_bytes, dst, address, nir_intrinsic_base(instr), align);
4758 }
4759
4760 void visit_store_shared(isel_context *ctx, nir_intrinsic_instr *instr)
4761 {
4762 unsigned writemask = nir_intrinsic_write_mask(instr);
4763 Temp data = get_ssa_temp(ctx, instr->src[0].ssa);
4764 Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
4765 unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
4766 assert(elem_size_bytes >= 4 && "Only 32bit & 64bit store_shared currently supported.");
4767
4768 unsigned align = nir_intrinsic_align_mul(instr) ? nir_intrinsic_align(instr) : elem_size_bytes;
4769 store_lds(ctx, elem_size_bytes, data, writemask, address, nir_intrinsic_base(instr), align);
4770 }
4771
4772 void visit_shared_atomic(isel_context *ctx, nir_intrinsic_instr *instr)
4773 {
4774 unsigned offset = nir_intrinsic_base(instr);
4775 Operand m = load_lds_size_m0(ctx);
4776 Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
4777 Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
4778
4779 unsigned num_operands = 3;
4780 aco_opcode op32, op64, op32_rtn, op64_rtn;
4781 switch(instr->intrinsic) {
4782 case nir_intrinsic_shared_atomic_add:
4783 op32 = aco_opcode::ds_add_u32;
4784 op64 = aco_opcode::ds_add_u64;
4785 op32_rtn = aco_opcode::ds_add_rtn_u32;
4786 op64_rtn = aco_opcode::ds_add_rtn_u64;
4787 break;
4788 case nir_intrinsic_shared_atomic_imin:
4789 op32 = aco_opcode::ds_min_i32;
4790 op64 = aco_opcode::ds_min_i64;
4791 op32_rtn = aco_opcode::ds_min_rtn_i32;
4792 op64_rtn = aco_opcode::ds_min_rtn_i64;
4793 break;
4794 case nir_intrinsic_shared_atomic_umin:
4795 op32 = aco_opcode::ds_min_u32;
4796 op64 = aco_opcode::ds_min_u64;
4797 op32_rtn = aco_opcode::ds_min_rtn_u32;
4798 op64_rtn = aco_opcode::ds_min_rtn_u64;
4799 break;
4800 case nir_intrinsic_shared_atomic_imax:
4801 op32 = aco_opcode::ds_max_i32;
4802 op64 = aco_opcode::ds_max_i64;
4803 op32_rtn = aco_opcode::ds_max_rtn_i32;
4804 op64_rtn = aco_opcode::ds_max_rtn_i64;
4805 break;
4806 case nir_intrinsic_shared_atomic_umax:
4807 op32 = aco_opcode::ds_max_u32;
4808 op64 = aco_opcode::ds_max_u64;
4809 op32_rtn = aco_opcode::ds_max_rtn_u32;
4810 op64_rtn = aco_opcode::ds_max_rtn_u64;
4811 break;
4812 case nir_intrinsic_shared_atomic_and:
4813 op32 = aco_opcode::ds_and_b32;
4814 op64 = aco_opcode::ds_and_b64;
4815 op32_rtn = aco_opcode::ds_and_rtn_b32;
4816 op64_rtn = aco_opcode::ds_and_rtn_b64;
4817 break;
4818 case nir_intrinsic_shared_atomic_or:
4819 op32 = aco_opcode::ds_or_b32;
4820 op64 = aco_opcode::ds_or_b64;
4821 op32_rtn = aco_opcode::ds_or_rtn_b32;
4822 op64_rtn = aco_opcode::ds_or_rtn_b64;
4823 break;
4824 case nir_intrinsic_shared_atomic_xor:
4825 op32 = aco_opcode::ds_xor_b32;
4826 op64 = aco_opcode::ds_xor_b64;
4827 op32_rtn = aco_opcode::ds_xor_rtn_b32;
4828 op64_rtn = aco_opcode::ds_xor_rtn_b64;
4829 break;
4830 case nir_intrinsic_shared_atomic_exchange:
4831 op32 = aco_opcode::ds_write_b32;
4832 op64 = aco_opcode::ds_write_b64;
4833 op32_rtn = aco_opcode::ds_wrxchg_rtn_b32;
4834 op64_rtn = aco_opcode::ds_wrxchg_rtn_b64;
4835 break;
4836 case nir_intrinsic_shared_atomic_comp_swap:
4837 op32 = aco_opcode::ds_cmpst_b32;
4838 op64 = aco_opcode::ds_cmpst_b64;
4839 op32_rtn = aco_opcode::ds_cmpst_rtn_b32;
4840 op64_rtn = aco_opcode::ds_cmpst_rtn_b64;
4841 num_operands = 4;
4842 break;
4843 default:
4844 unreachable("Unhandled shared atomic intrinsic");
4845 }
4846
4847 /* return the previous value if dest is ever used */
4848 bool return_previous = false;
4849 nir_foreach_use_safe(use_src, &instr->dest.ssa) {
4850 return_previous = true;
4851 break;
4852 }
4853 nir_foreach_if_use_safe(use_src, &instr->dest.ssa) {
4854 return_previous = true;
4855 break;
4856 }
4857
4858 aco_opcode op;
4859 if (data.size() == 1) {
4860 assert(instr->dest.ssa.bit_size == 32);
4861 op = return_previous ? op32_rtn : op32;
4862 } else {
4863 assert(instr->dest.ssa.bit_size == 64);
4864 op = return_previous ? op64_rtn : op64;
4865 }
4866
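/* The DS immediate offset field is only 16 bits wide, so add larger bases to
 * the address instead. */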
4867 if (offset > 65535) {
4868 Builder bld(ctx->program, ctx->block);
4869 address = bld.vadd32(bld.def(v1), Operand(offset), address);
4870 offset = 0;
4871 }
4872
4873 aco_ptr<DS_instruction> ds;
4874 ds.reset(create_instruction<DS_instruction>(op, Format::DS, num_operands, return_previous ? 1 : 0));
4875 ds->operands[0] = Operand(address);
4876 ds->operands[1] = Operand(data);
4877 if (num_operands == 4)
4878 ds->operands[2] = Operand(get_ssa_temp(ctx, instr->src[2].ssa));
4879 ds->operands[num_operands - 1] = m;
4880 ds->offset0 = offset;
4881 if (return_previous)
4882 ds->definitions[0] = Definition(get_ssa_temp(ctx, &instr->dest.ssa));
4883 ctx->block->instructions.emplace_back(std::move(ds));
4884 }
4885
4886 Temp get_scratch_resource(isel_context *ctx)
4887 {
4888 Builder bld(ctx->program, ctx->block);
4889 Temp scratch_addr = ctx->private_segment_buffer;
4890 if (ctx->stage != compute_cs)
4891 scratch_addr = bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), ctx->private_segment_buffer, Operand(0u));
4892
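/* Scratch uses a swizzled buffer resource: ADD_TID_ENABLE makes the hardware
 * add the lane id to the index and INDEX_STRIDE matches the wave size. */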
4893 uint32_t rsrc_conf = S_008F0C_ADD_TID_ENABLE(1) |
4894 S_008F0C_INDEX_STRIDE(ctx->options->wave_size == 64 ? 3 : 2);
4895
4896 if (ctx->program->chip_class >= GFX10) {
4897 rsrc_conf |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
4898 S_008F0C_OOB_SELECT(3) |
4899 S_008F0C_RESOURCE_LEVEL(1);
4900 } else if (ctx->program->chip_class <= GFX7) { /* dfmt modifies stride on GFX8/GFX9 when ADD_TID_EN=1 */
4901 rsrc_conf |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
4902 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
4903 }
4904
4905 /* older generations need element size = 16 bytes. element size removed in GFX9 */
4906 if (ctx->program->chip_class <= GFX8)
4907 rsrc_conf |= S_008F0C_ELEMENT_SIZE(3);
4908
4909 return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), scratch_addr, Operand(-1u), Operand(rsrc_conf));
4910 }
4911
4912 void visit_load_scratch(isel_context *ctx, nir_intrinsic_instr *instr) {
4913 assert(instr->dest.ssa.bit_size == 32 || instr->dest.ssa.bit_size == 64);
4914 Builder bld(ctx->program, ctx->block);
4915 Temp rsrc = get_scratch_resource(ctx);
4916 Temp offset = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
4917 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
4918
4919 aco_opcode op;
4920 switch (dst.size()) {
4921 case 1:
4922 op = aco_opcode::buffer_load_dword;
4923 break;
4924 case 2:
4925 op = aco_opcode::buffer_load_dwordx2;
4926 break;
4927 case 3:
4928 op = aco_opcode::buffer_load_dwordx3;
4929 break;
4930 case 4:
4931 op = aco_opcode::buffer_load_dwordx4;
4932 break;
4933 case 6:
4934 case 8: {
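/* A single MUBUF load returns at most four dwords, so issue two loads and
 * recombine the halves into the destination vector. */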
4935 std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
4936 Temp lower = bld.mubuf(aco_opcode::buffer_load_dwordx4,
4937 bld.def(v4), offset, rsrc,
4938 ctx->scratch_offset, 0, true);
4939 Temp upper = bld.mubuf(dst.size() == 6 ? aco_opcode::buffer_load_dwordx2 :
4940 aco_opcode::buffer_load_dwordx4,
4941 dst.size() == 6 ? bld.def(v2) : bld.def(v4),
4942 offset, rsrc, ctx->scratch_offset, 16, true);
4943 emit_split_vector(ctx, lower, 2);
4944 elems[0] = emit_extract_vector(ctx, lower, 0, v2);
4945 elems[1] = emit_extract_vector(ctx, lower, 1, v2);
4946 if (dst.size() == 8) {
4947 emit_split_vector(ctx, upper, 2);
4948 elems[2] = emit_extract_vector(ctx, upper, 0, v2);
4949 elems[3] = emit_extract_vector(ctx, upper, 1, v2);
4950 } else {
4951 elems[2] = upper;
4952 }
4953
4954 aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector,
4955 Format::PSEUDO, dst.size() / 2, 1)};
4956 for (unsigned i = 0; i < dst.size() / 2; i++)
4957 vec->operands[i] = Operand(elems[i]);
4958 vec->definitions[0] = Definition(dst);
4959 bld.insert(std::move(vec));
4960 ctx->allocated_vec.emplace(dst.id(), elems);
4961 return;
4962 }
4963 default:
4964 unreachable("Wrong dst size for nir_intrinsic_load_scratch");
4965 }
4966
4967 bld.mubuf(op, Definition(dst), offset, rsrc, ctx->scratch_offset, 0, true);
4968 emit_split_vector(ctx, dst, instr->num_components);
4969 }
4970
4971 void visit_store_scratch(isel_context *ctx, nir_intrinsic_instr *instr) {
4972 assert(instr->src[0].ssa->bit_size == 32 || instr->src[0].ssa->bit_size == 64);
4973 Builder bld(ctx->program, ctx->block);
4974 Temp rsrc = get_scratch_resource(ctx);
4975 Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
4976 Temp offset = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
4977
4978 unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
4979 unsigned writemask = nir_intrinsic_write_mask(instr);
4980
4981 while (writemask) {
4982 int start, count;
4983 u_bit_scan_consecutive_range(&writemask, &start, &count);
4984 int num_bytes = count * elem_size_bytes;
4985
4986 if (num_bytes > 16) {
4987 assert(elem_size_bytes == 8);
4988 writemask |= (((count - 2) << 1) - 1) << (start + 2);
4989 count = 2;
4990 num_bytes = 16;
4991 }
4992
4993 // TODO: check alignment of sub-dword stores
4994 // TODO: split 3 bytes. there is no store instruction for that
4995
4996 Temp write_data;
4997 if (count != instr->num_components) {
4998 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
4999 for (int i = 0; i < count; i++) {
5000 Temp elem = emit_extract_vector(ctx, data, start + i, RegClass(RegType::vgpr, elem_size_bytes / 4));
5001 vec->operands[i] = Operand(elem);
5002 }
5003 write_data = bld.tmp(RegClass(RegType::vgpr, count * elem_size_bytes / 4));
5004 vec->definitions[0] = Definition(write_data);
5005 ctx->block->instructions.emplace_back(std::move(vec));
5006 } else {
5007 write_data = data;
5008 }
5009
5010 aco_opcode op;
5011 switch (num_bytes) {
5012 case 4:
5013 op = aco_opcode::buffer_store_dword;
5014 break;
5015 case 8:
5016 op = aco_opcode::buffer_store_dwordx2;
5017 break;
5018 case 12:
5019 op = aco_opcode::buffer_store_dwordx3;
5020 break;
5021 case 16:
5022 op = aco_opcode::buffer_store_dwordx4;
5023 break;
5024 default:
5025 unreachable("Invalid data size for nir_intrinsic_store_scratch.");
5026 }
5027
5028 bld.mubuf(op, offset, rsrc, ctx->scratch_offset, write_data, start * elem_size_bytes, true);
5029 }
5030 }
5031
5032 void visit_load_sample_mask_in(isel_context *ctx, nir_intrinsic_instr *instr) {
5033 uint8_t log2_ps_iter_samples;
5034 if (ctx->program->info->ps.force_persample) {
5035 log2_ps_iter_samples =
5036 util_logbase2(ctx->options->key.fs.num_samples);
5037 } else {
5038 log2_ps_iter_samples = ctx->options->key.fs.log2_ps_iter_samples;
5039 }
5040
5041 /* The bit pattern matches that used by fixed function fragment
5042 * processing. */
5043 static const unsigned ps_iter_masks[] = {
5044 0xffff, /* not used */
5045 0x5555,
5046 0x1111,
5047 0x0101,
5048 0x0001,
5049 };
5050 assert(log2_ps_iter_samples < ARRAY_SIZE(ps_iter_masks));
5051
5052 Builder bld(ctx->program, ctx->block);
5053
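/* gl_SampleMaskIn for this invocation: sample_coverage & (ps_iter_mask << sample_id) */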
5054 Temp sample_id = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), ctx->fs_inputs[fs_input::ancillary], Operand(8u), Operand(4u));
5055 Temp ps_iter_mask = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(ps_iter_masks[log2_ps_iter_samples]));
5056 Temp mask = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), sample_id, ps_iter_mask);
5057 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5058 bld.vop2(aco_opcode::v_and_b32, Definition(dst), mask, ctx->fs_inputs[fs_input::sample_coverage]);
5059 }
5060
5061 Temp emit_boolean_reduce(isel_context *ctx, nir_op op, unsigned cluster_size, Temp src)
5062 {
5063 Builder bld(ctx->program, ctx->block);
5064
5065 if (cluster_size == 1) {
5066 return src;
5067 } else if (op == nir_op_iand && cluster_size == 4) {
5068 //subgroupClusteredAnd(val, 4) -> ~wqm(exec & ~val)
5069 Temp tmp = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.def(s1, scc), Operand(exec, s2), src);
5070 return bld.sop1(aco_opcode::s_not_b64, bld.def(s2), bld.def(s1, scc),
5071 bld.sop1(aco_opcode::s_wqm_b64, bld.def(s2), bld.def(s1, scc), tmp));
5072 } else if (op == nir_op_ior && cluster_size == 4) {
5073 //subgroupClusteredOr(val, 4) -> wqm(val & exec)
5074 return bld.sop1(aco_opcode::s_wqm_b64, bld.def(s2), bld.def(s1, scc),
5075 bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2)));
5076 } else if (op == nir_op_iand && cluster_size == 64) {
5077 //subgroupAnd(val) -> (exec & ~val) == 0
5078 Temp tmp = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.def(s1, scc), Operand(exec, s2), src).def(1).getTemp();
5079 return bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), tmp, Operand(0u));
5080 } else if (op == nir_op_ior && cluster_size == 64) {
5081 //subgroupOr(val) -> (val & exec) != 0
5082 return bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2)).def(1).getTemp();
5083 } else if (op == nir_op_ixor && cluster_size == 64) {
5084 //subgroupXor(val) -> s_bcnt1_i32_b64(val & exec) & 1
5085 Temp tmp = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2));
5086 tmp = bld.sop1(aco_opcode::s_bcnt1_i32_b64, bld.def(s2), bld.def(s1, scc), tmp);
5087 return bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), tmp, Operand(1u)).def(1).getTemp();
5088 } else {
5089 //subgroupClustered{And,Or,Xor}(val, n) ->
5090 //lane_id = v_mbcnt_hi_u32_b32(-1, v_mbcnt_lo_u32_b32(-1, 0))
5091 //cluster_offset = ~(n - 1) & lane_id
5092 //cluster_mask = ((1 << n) - 1)
5093 //subgroupClusteredAnd():
5094 // return ((val | ~exec) >> cluster_offset) & cluster_mask == cluster_mask
5095 //subgroupClusteredOr():
5096 // return ((val & exec) >> cluster_offset) & cluster_mask != 0
5097 //subgroupClusteredXor():
5098 // return v_bcnt_u32_b32(((val & exec) >> cluster_offset) & cluster_mask, 0) & 1 != 0
5099 Temp lane_id = bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32, bld.def(v1), Operand((uint32_t) -1),
5100 bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, bld.def(v1), Operand((uint32_t) -1), Operand(0u)));
5101 Temp cluster_offset = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(~uint32_t(cluster_size - 1)), lane_id);
5102
5103 Temp tmp;
5104 if (op == nir_op_iand)
5105 tmp = bld.sop2(aco_opcode::s_orn2_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2));
5106 else
5107 tmp = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2));
5108
5109 uint32_t cluster_mask = cluster_size == 32 ? -1 : (1u << cluster_size) - 1u;
5110 tmp = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), cluster_offset, tmp);
5111 tmp = emit_extract_vector(ctx, tmp, 0, v1);
5112 if (cluster_mask != 0xffffffff)
5113 tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(cluster_mask), tmp);
5114
5115 Definition cmp_def = Definition();
5116 if (op == nir_op_iand) {
5117 cmp_def = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(s2), Operand(cluster_mask), tmp).def(0);
5118 } else if (op == nir_op_ior) {
5119 cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(s2), Operand(0u), tmp).def(0);
5120 } else if (op == nir_op_ixor) {
5121 tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(1u),
5122 bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), tmp, Operand(0u)));
5123 cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(s2), Operand(0u), tmp).def(0);
5124 }
5125 cmp_def.setHint(vcc);
5126 return cmp_def.getTemp();
5127 }
5128 }
5129
5130 Temp emit_boolean_exclusive_scan(isel_context *ctx, nir_op op, Temp src)
5131 {
5132 Builder bld(ctx->program, ctx->block);
5133
5134 //subgroupExclusiveAnd(val) -> mbcnt(exec & ~val) == 0
5135 //subgroupExclusiveOr(val) -> mbcnt(val & exec) != 0
5136 //subgroupExclusiveXor(val) -> mbcnt(val & exec) & 1 != 0
5137 Temp tmp;
5138 if (op == nir_op_iand)
5139 tmp = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.def(s1, scc), Operand(exec, s2), src);
5140 else
5141 tmp = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2));
5142
5143 Builder::Result lohi = bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), bld.def(s1), tmp);
5144 Temp lo = lohi.def(0).getTemp();
5145 Temp hi = lohi.def(1).getTemp();
5146 Temp mbcnt = bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32, bld.def(v1), hi,
5147 bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, bld.def(v1), lo, Operand(0u)));
5148
5149 Definition cmp_def = Definition();
5150 if (op == nir_op_iand)
5151 cmp_def = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(s2), Operand(0u), mbcnt).def(0);
5152 else if (op == nir_op_ior)
5153 cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(s2), Operand(0u), mbcnt).def(0);
5154 else if (op == nir_op_ixor)
5155 cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(s2), Operand(0u),
5156 bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(1u), mbcnt)).def(0);
5157 cmp_def.setHint(vcc);
5158 return cmp_def.getTemp();
5159 }
5160
5161 Temp emit_boolean_inclusive_scan(isel_context *ctx, nir_op op, Temp src)
5162 {
5163 Builder bld(ctx->program, ctx->block);
5164
5165 //subgroupInclusiveAnd(val) -> subgroupExclusiveAnd(val) && val
5166 //subgroupInclusiveOr(val) -> subgroupExclusiveOr(val) || val
5167 //subgroupInclusiveXor(val) -> subgroupExclusiveXor(val) ^^ val
5168 Temp tmp = emit_boolean_exclusive_scan(ctx, op, src);
5169 if (op == nir_op_iand)
5170 return bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), tmp, src);
5171 else if (op == nir_op_ior)
5172 return bld.sop2(aco_opcode::s_or_b64, bld.def(s2), bld.def(s1, scc), tmp, src);
5173 else if (op == nir_op_ixor)
5174 return bld.sop2(aco_opcode::s_xor_b64, bld.def(s2), bld.def(s1, scc), tmp, src);
5175
5176 assert(false);
5177 return Temp();
5178 }
5179
5180 void emit_uniform_subgroup(isel_context *ctx, nir_intrinsic_instr *instr, Temp src)
5181 {
5182 Builder bld(ctx->program, ctx->block);
5183 Definition dst(get_ssa_temp(ctx, &instr->dest.ssa));
5184 if (src.regClass().type() == RegType::vgpr) {
5185 bld.pseudo(aco_opcode::p_as_uniform, dst, src);
5186 } else if (instr->dest.ssa.bit_size == 1 && src.regClass() == s2) {
5187 bld.sopc(aco_opcode::s_cmp_lg_u64, bld.scc(dst), Operand(0u), Operand(src));
5188 } else if (src.regClass() == s1) {
5189 bld.sop1(aco_opcode::s_mov_b32, dst, src);
5190 } else if (src.regClass() == s2) {
5191 bld.sop1(aco_opcode::s_mov_b64, dst, src);
5192 } else {
5193 fprintf(stderr, "Unimplemented NIR instr bit size: ");
5194 nir_print_instr(&instr->instr, stderr);
5195 fprintf(stderr, "\n");
5196 }
5197 }
5198
5199 void emit_interp_center(isel_context *ctx, Temp dst, Temp pos1, Temp pos2)
5200 {
5201 Builder bld(ctx->program, ctx->block);
5202 Temp p1 = ctx->fs_inputs[fs_input::persp_center_p1];
5203 Temp p2 = ctx->fs_inputs[fs_input::persp_center_p2];
5204
5205 /* Build DD X/Y */
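/* quad_perm(0,0,0,0) broadcasts each quad's upper-left value; subtracting it
 * from the neighboring lanes yields the per-quad derivatives of I/J. */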
5206 Temp tl_1 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), p1, dpp_quad_perm(0, 0, 0, 0));
5207 Temp ddx_1 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p1, tl_1, dpp_quad_perm(1, 1, 1, 1));
5208 Temp ddy_1 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p1, tl_1, dpp_quad_perm(2, 2, 2, 2));
5209 Temp tl_2 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), p2, dpp_quad_perm(0, 0, 0, 0));
5210 Temp ddx_2 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p2, tl_2, dpp_quad_perm(1, 1, 1, 1));
5211 Temp ddy_2 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p2, tl_2, dpp_quad_perm(2, 2, 2, 2));
5212
5213 /* res_k = p_k + ddx_k * pos1 + ddy_k * pos2 */
5214 Temp tmp1 = bld.vop3(aco_opcode::v_mad_f32, bld.def(v1), ddx_1, pos1, p1);
5215 Temp tmp2 = bld.vop3(aco_opcode::v_mad_f32, bld.def(v1), ddx_2, pos1, p2);
5216 tmp1 = bld.vop3(aco_opcode::v_mad_f32, bld.def(v1), ddy_1, pos2, tmp1);
5217 tmp2 = bld.vop3(aco_opcode::v_mad_f32, bld.def(v1), ddy_2, pos2, tmp2);
5218 Temp wqm1 = bld.tmp(v1);
5219 emit_wqm(ctx, tmp1, wqm1, true);
5220 Temp wqm2 = bld.tmp(v1);
5221 emit_wqm(ctx, tmp2, wqm2, true);
5222 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), wqm1, wqm2);
5223 return;
5224 }
5225
5226 void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
5227 {
5228 Builder bld(ctx->program, ctx->block);
5229 switch(instr->intrinsic) {
5230 case nir_intrinsic_load_barycentric_sample:
5231 case nir_intrinsic_load_barycentric_pixel:
5232 case nir_intrinsic_load_barycentric_centroid: {
5233 glsl_interp_mode mode = (glsl_interp_mode)nir_intrinsic_interp_mode(instr);
5234 fs_input input = get_interp_input(instr->intrinsic, mode);
5235
5236 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5237 if (input == fs_input::max_inputs) {
5238 bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
5239 Operand(0u), Operand(0u));
5240 } else {
5241 bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
5242 ctx->fs_inputs[input],
5243 ctx->fs_inputs[input + 1]);
5244 }
5245 emit_split_vector(ctx, dst, 2);
5246 break;
5247 }
5248 case nir_intrinsic_load_barycentric_at_sample: {
5249 uint32_t sample_pos_offset = RING_PS_SAMPLE_POSITIONS * 16;
5250 switch (ctx->options->key.fs.num_samples) {
5251 case 2: sample_pos_offset += 1 << 3; break;
5252 case 4: sample_pos_offset += 3 << 3; break;
5253 case 8: sample_pos_offset += 7 << 3; break;
5254 default: break;
5255 }
5256 Temp sample_pos;
5257 Temp addr = get_ssa_temp(ctx, instr->src[0].ssa);
5258 nir_const_value* const_addr = nir_src_as_const_value(instr->src[0]);
5259 if (addr.type() == RegType::sgpr) {
5260 Operand offset;
5261 if (const_addr) {
5262 sample_pos_offset += const_addr->u32 << 3;
5263 offset = Operand(sample_pos_offset);
5264 } else if (ctx->options->chip_class >= GFX9) {
5265 offset = bld.sop2(aco_opcode::s_lshl3_add_u32, bld.def(s1), bld.def(s1, scc), addr, Operand(sample_pos_offset));
5266 } else {
5267 offset = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), addr, Operand(3u));
5268 offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset, Operand(sample_pos_offset));
5269 }
5270 addr = ctx->private_segment_buffer;
5271 sample_pos = bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), addr, Operand(offset));
5272
5273 } else if (ctx->options->chip_class >= GFX9) {
5274 addr = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(3u), addr);
5275 sample_pos = bld.global(aco_opcode::global_load_dwordx2, bld.def(v2), addr, ctx->private_segment_buffer, sample_pos_offset);
5276 } else {
5277 /* addr += ctx->private_segment_buffer + sample_pos_offset */
5278 Temp tmp0 = bld.tmp(s1);
5279 Temp tmp1 = bld.tmp(s1);
5280 bld.pseudo(aco_opcode::p_split_vector, Definition(tmp0), Definition(tmp1), ctx->private_segment_buffer);
5281 Definition scc_tmp = bld.def(s1, scc);
5282 tmp0 = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), scc_tmp, tmp0, Operand(sample_pos_offset));
5283 tmp1 = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), tmp1, Operand(0u), scc_tmp.getTemp());
5284 addr = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(3u), addr);
5285 Temp pck0 = bld.tmp(v1);
5286 Temp carry = bld.vadd32(Definition(pck0), tmp0, addr, true).def(1).getTemp();
5287 tmp1 = as_vgpr(ctx, tmp1);
5288 Temp pck1 = bld.vop2_e64(aco_opcode::v_addc_co_u32, bld.def(v1), bld.hint_vcc(bld.def(s2)), tmp1, Operand(0u), carry);
5289 addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), pck0, pck1);
5290
5291 /* sample_pos = flat_load_dwordx2 addr */
5292 sample_pos = bld.flat(aco_opcode::flat_load_dwordx2, bld.def(v2), addr, Operand(s1));
5293 }
5294
5295 /* sample_pos -= 0.5 */
5296 Temp pos1 = bld.tmp(RegClass(sample_pos.type(), 1));
5297 Temp pos2 = bld.tmp(RegClass(sample_pos.type(), 1));
5298 bld.pseudo(aco_opcode::p_split_vector, Definition(pos1), Definition(pos2), sample_pos);
5299 pos1 = bld.vop2_e64(aco_opcode::v_sub_f32, bld.def(v1), pos1, Operand(0x3f000000u));
5300 pos2 = bld.vop2_e64(aco_opcode::v_sub_f32, bld.def(v1), pos2, Operand(0x3f000000u));
5301
5302 emit_interp_center(ctx, get_ssa_temp(ctx, &instr->dest.ssa), pos1, pos2);
5303 break;
5304 }
5305 case nir_intrinsic_load_barycentric_at_offset: {
5306 Temp offset = get_ssa_temp(ctx, instr->src[0].ssa);
5307 RegClass rc = RegClass(offset.type(), 1);
5308 Temp pos1 = bld.tmp(rc), pos2 = bld.tmp(rc);
5309 bld.pseudo(aco_opcode::p_split_vector, Definition(pos1), Definition(pos2), offset);
5310 emit_interp_center(ctx, get_ssa_temp(ctx, &instr->dest.ssa), pos1, pos2);
5311 break;
5312 }
5313 case nir_intrinsic_load_front_face: {
5314 bld.vopc(aco_opcode::v_cmp_lg_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
5315 Operand(0u), ctx->fs_inputs[fs_input::front_face]).def(0).setHint(vcc);
5316 break;
5317 }
5318 case nir_intrinsic_load_view_index:
5319 case nir_intrinsic_load_layer_id: {
5320 if (instr->intrinsic == nir_intrinsic_load_view_index && (ctx->stage & sw_vs)) {
5321 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5322 bld.copy(Definition(dst), Operand(ctx->view_index));
5323 break;
5324 }
5325
5326 unsigned idx = nir_intrinsic_base(instr);
5327 bld.vintrp(aco_opcode::v_interp_mov_f32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
5328 Operand(2u), bld.m0(ctx->prim_mask), idx, 0);
5329 break;
5330 }
5331 case nir_intrinsic_load_frag_coord: {
5332 emit_load_frag_coord(ctx, get_ssa_temp(ctx, &instr->dest.ssa), 4);
5333 break;
5334 }
5335 case nir_intrinsic_load_sample_pos: {
5336 Temp posx = ctx->fs_inputs[fs_input::frag_pos_0];
5337 Temp posy = ctx->fs_inputs[fs_input::frag_pos_1];
5338 bld.pseudo(aco_opcode::p_create_vector, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
5339 posx.id() ? bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), posx) : Operand(0u),
5340 posy.id() ? bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), posy) : Operand(0u));
5341 break;
5342 }
5343 case nir_intrinsic_load_interpolated_input:
5344 visit_load_interpolated_input(ctx, instr);
5345 break;
5346 case nir_intrinsic_store_output:
5347 visit_store_output(ctx, instr);
5348 break;
5349 case nir_intrinsic_load_input:
5350 visit_load_input(ctx, instr);
5351 break;
5352 case nir_intrinsic_load_ubo:
5353 visit_load_ubo(ctx, instr);
5354 break;
5355 case nir_intrinsic_load_push_constant:
5356 visit_load_push_constant(ctx, instr);
5357 break;
5358 case nir_intrinsic_load_constant:
5359 visit_load_constant(ctx, instr);
5360 break;
5361 case nir_intrinsic_vulkan_resource_index:
5362 visit_load_resource(ctx, instr);
5363 break;
5364 case nir_intrinsic_discard:
5365 visit_discard(ctx, instr);
5366 break;
5367 case nir_intrinsic_discard_if:
5368 visit_discard_if(ctx, instr);
5369 break;
5370 case nir_intrinsic_load_shared:
5371 visit_load_shared(ctx, instr);
5372 break;
5373 case nir_intrinsic_store_shared:
5374 visit_store_shared(ctx, instr);
5375 break;
5376 case nir_intrinsic_shared_atomic_add:
5377 case nir_intrinsic_shared_atomic_imin:
5378 case nir_intrinsic_shared_atomic_umin:
5379 case nir_intrinsic_shared_atomic_imax:
5380 case nir_intrinsic_shared_atomic_umax:
5381 case nir_intrinsic_shared_atomic_and:
5382 case nir_intrinsic_shared_atomic_or:
5383 case nir_intrinsic_shared_atomic_xor:
5384 case nir_intrinsic_shared_atomic_exchange:
5385 case nir_intrinsic_shared_atomic_comp_swap:
5386 visit_shared_atomic(ctx, instr);
5387 break;
5388 case nir_intrinsic_image_deref_load:
5389 visit_image_load(ctx, instr);
5390 break;
5391 case nir_intrinsic_image_deref_store:
5392 visit_image_store(ctx, instr);
5393 break;
5394 case nir_intrinsic_image_deref_atomic_add:
5395 case nir_intrinsic_image_deref_atomic_umin:
5396 case nir_intrinsic_image_deref_atomic_imin:
5397 case nir_intrinsic_image_deref_atomic_umax:
5398 case nir_intrinsic_image_deref_atomic_imax:
5399 case nir_intrinsic_image_deref_atomic_and:
5400 case nir_intrinsic_image_deref_atomic_or:
5401 case nir_intrinsic_image_deref_atomic_xor:
5402 case nir_intrinsic_image_deref_atomic_exchange:
5403 case nir_intrinsic_image_deref_atomic_comp_swap:
5404 visit_image_atomic(ctx, instr);
5405 break;
5406 case nir_intrinsic_image_deref_size:
5407 visit_image_size(ctx, instr);
5408 break;
5409 case nir_intrinsic_load_ssbo:
5410 visit_load_ssbo(ctx, instr);
5411 break;
5412 case nir_intrinsic_store_ssbo:
5413 visit_store_ssbo(ctx, instr);
5414 break;
5415 case nir_intrinsic_load_global:
5416 visit_load_global(ctx, instr);
5417 break;
5418 case nir_intrinsic_store_global:
5419 visit_store_global(ctx, instr);
5420 break;
5421 case nir_intrinsic_ssbo_atomic_add:
5422 case nir_intrinsic_ssbo_atomic_imin:
5423 case nir_intrinsic_ssbo_atomic_umin:
5424 case nir_intrinsic_ssbo_atomic_imax:
5425 case nir_intrinsic_ssbo_atomic_umax:
5426 case nir_intrinsic_ssbo_atomic_and:
5427 case nir_intrinsic_ssbo_atomic_or:
5428 case nir_intrinsic_ssbo_atomic_xor:
5429 case nir_intrinsic_ssbo_atomic_exchange:
5430 case nir_intrinsic_ssbo_atomic_comp_swap:
5431 visit_atomic_ssbo(ctx, instr);
5432 break;
5433 case nir_intrinsic_load_scratch:
5434 visit_load_scratch(ctx, instr);
5435 break;
5436 case nir_intrinsic_store_scratch:
5437 visit_store_scratch(ctx, instr);
5438 break;
5439 case nir_intrinsic_get_buffer_size:
5440 visit_get_buffer_size(ctx, instr);
5441 break;
5442 case nir_intrinsic_barrier: {
5443 unsigned* bsize = ctx->program->info->cs.block_size;
5444 unsigned workgroup_size = bsize[0] * bsize[1] * bsize[2];
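/* Workgroups of at most 64 invocations fit in a single (wave64) wave and run
 * in lock-step, so the barrier can be skipped. */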
5445 if (workgroup_size > 64)
5446 bld.sopp(aco_opcode::s_barrier);
5447 break;
5448 }
5449 case nir_intrinsic_group_memory_barrier:
5450 case nir_intrinsic_memory_barrier:
5451 case nir_intrinsic_memory_barrier_atomic_counter:
5452 case nir_intrinsic_memory_barrier_buffer:
5453 case nir_intrinsic_memory_barrier_image:
5454 case nir_intrinsic_memory_barrier_shared:
5455 emit_memory_barrier(ctx, instr);
5456 break;
5457 case nir_intrinsic_load_num_work_groups:
5458 case nir_intrinsic_load_work_group_id:
5459 case nir_intrinsic_load_local_invocation_id: {
5460 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5461 Temp* ids;
5462 if (instr->intrinsic == nir_intrinsic_load_num_work_groups)
5463 ids = ctx->num_workgroups;
5464 else if (instr->intrinsic == nir_intrinsic_load_work_group_id)
5465 ids = ctx->workgroup_ids;
5466 else
5467 ids = ctx->local_invocation_ids;
5468 bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
5469 ids[0].id() ? Operand(ids[0]) : Operand(1u),
5470 ids[1].id() ? Operand(ids[1]) : Operand(1u),
5471 ids[2].id() ? Operand(ids[2]) : Operand(1u));
5472 emit_split_vector(ctx, dst, 3);
5473 break;
5474 }
5475 case nir_intrinsic_load_local_invocation_index: {
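/* index = wave_id_in_workgroup * 64 + lane_id; tg_size bits [11:6] already
 * hold the wave id shifted into place. */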
5476 Temp id = bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32, bld.def(v1), Operand((uint32_t) -1),
5477 bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, bld.def(v1), Operand((uint32_t) -1), Operand(0u)));
5478 Temp tg_num = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), Operand(0xfc0u), ctx->tg_size);
5479 bld.vop2(aco_opcode::v_or_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), tg_num, id);
5480 break;
5481 }
5482 case nir_intrinsic_load_subgroup_id: {
5483 if (ctx->stage == compute_cs) {
5484 Temp tg_num = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), Operand(0xfc0u), ctx->tg_size);
5485 bld.sop2(aco_opcode::s_lshr_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), bld.def(s1, scc), tg_num, Operand(0x6u));
5486 } else {
5487 bld.sop1(aco_opcode::s_mov_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), Operand(0x0u));
5488 }
5489 break;
5490 }
5491 case nir_intrinsic_load_subgroup_invocation: {
5492 bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), Operand((uint32_t) -1),
5493 bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, bld.def(v1), Operand((uint32_t) -1), Operand(0u)));
5494 break;
5495 }
5496 case nir_intrinsic_load_num_subgroups: {
5497 if (ctx->stage == compute_cs)
5498 bld.sop2(aco_opcode::s_and_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), bld.def(s1, scc), Operand(0x3fu), ctx->tg_size);
5499 else
5500 bld.sop1(aco_opcode::s_mov_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), Operand(0x1u));
5501 break;
5502 }
5503 case nir_intrinsic_ballot: {
5504 Definition tmp = bld.def(s2);
5505 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
5506 if (instr->src[0].ssa->bit_size == 1 && src.regClass() == s2) {
5507 bld.sop2(aco_opcode::s_and_b64, tmp, bld.def(s1, scc), Operand(exec, s2), src);
5508 } else if (instr->src[0].ssa->bit_size == 1 && src.regClass() == s1) {
5509 bld.sop2(aco_opcode::s_cselect_b64, tmp, Operand(exec, s2), Operand(0u), bld.scc(src));
5510 } else if (instr->src[0].ssa->bit_size == 32 && src.regClass() == v1) {
5511 bld.vopc(aco_opcode::v_cmp_lg_u32, tmp, Operand(0u), src);
5512 } else if (instr->src[0].ssa->bit_size == 64 && src.regClass() == v2) {
5513 bld.vopc(aco_opcode::v_cmp_lg_u64, tmp, Operand(0u), src);
5514 } else {
5515 fprintf(stderr, "Unimplemented NIR instr bit size: ");
5516 nir_print_instr(&instr->instr, stderr);
5517 fprintf(stderr, "\n");
5518 }
5519 emit_wqm(ctx, tmp.getTemp(), get_ssa_temp(ctx, &instr->dest.ssa));
5520 break;
5521 }
5522 case nir_intrinsic_shuffle: {
5523 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
5524 if (!ctx->divergent_vals[instr->dest.ssa.index]) {
5525 emit_uniform_subgroup(ctx, instr, src);
5526 } else {
5527 Temp tid = get_ssa_temp(ctx, instr->src[1].ssa);
5528 assert(tid.regClass() == v1);
5529 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5530 if (src.regClass() == v1) {
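/* ds_bpermute_b32 addresses lanes in bytes, so shift the lane index left by 2. */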
5531 tid = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), tid);
5532 emit_wqm(ctx, bld.ds(aco_opcode::ds_bpermute_b32, bld.def(v1), tid, src), dst);
5533 } else if (src.regClass() == v2) {
5534 tid = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), tid);
5535
5536 Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
5537 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
5538 lo = emit_wqm(ctx, bld.ds(aco_opcode::ds_bpermute_b32, bld.def(v1), tid, lo));
5539 hi = emit_wqm(ctx, bld.ds(aco_opcode::ds_bpermute_b32, bld.def(v1), tid, hi));
5540 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
5541 emit_split_vector(ctx, dst, 2);
5542 } else if (instr->dest.ssa.bit_size == 1 && src.regClass() == s2) {
5543 Temp tmp = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), tid, src);
5544 tmp = emit_extract_vector(ctx, tmp, 0, v1);
5545 tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(1u), tmp);
5546 emit_wqm(ctx, bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(s2), Operand(0u), tmp), dst);
5547 } else {
5548 fprintf(stderr, "Unimplemented NIR instr bit size: ");
5549 nir_print_instr(&instr->instr, stderr);
5550 fprintf(stderr, "\n");
5551 }
5552 }
5553 break;
5554 }
5555 case nir_intrinsic_load_sample_id: {
5556 bld.vop3(aco_opcode::v_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
5557 ctx->fs_inputs[ancillary], Operand(8u), Operand(4u));
5558 break;
5559 }
5560 case nir_intrinsic_load_sample_mask_in: {
5561 visit_load_sample_mask_in(ctx, instr);
5562 break;
5563 }
5564 case nir_intrinsic_read_first_invocation: {
5565 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
5566 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5567 if (src.regClass() == v1) {
5568 emit_wqm(ctx,
5569 bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), src),
5570 dst);
5571 } else if (src.regClass() == v2) {
5572 Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
5573 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
5574 lo = emit_wqm(ctx, bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), lo));
5575 hi = emit_wqm(ctx, bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), hi));
5576 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
5577 emit_split_vector(ctx, dst, 2);
5578 } else if (instr->dest.ssa.bit_size == 1 && src.regClass() == s2) {
5579 emit_wqm(ctx,
5580 bld.sopc(aco_opcode::s_bitcmp1_b64, bld.def(s1, scc), src,
5581 bld.sop1(aco_opcode::s_ff1_i32_b64, bld.def(s1), Operand(exec, s2))),
5582 dst);
5583 } else if (src.regClass() == s1) {
5584 bld.sop1(aco_opcode::s_mov_b32, Definition(dst), src);
5585 } else if (src.regClass() == s2) {
5586 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src);
5587 } else {
5588 fprintf(stderr, "Unimplemented NIR instr bit size: ");
5589 nir_print_instr(&instr->instr, stderr);
5590 fprintf(stderr, "\n");
5591 }
5592 break;
5593 }
5594 case nir_intrinsic_read_invocation: {
5595 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
5596 Temp lane = get_ssa_temp(ctx, instr->src[1].ssa);
5597 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5598 assert(lane.regClass() == s1);
5599 if (src.regClass() == v1) {
5600 emit_wqm(ctx, bld.vop3(aco_opcode::v_readlane_b32, bld.def(s1), src, lane), dst);
5601 } else if (src.regClass() == v2) {
5602 Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
5603 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
5604 lo = emit_wqm(ctx, bld.vop3(aco_opcode::v_readlane_b32, bld.def(s1), lo, lane));
5605 hi = emit_wqm(ctx, bld.vop3(aco_opcode::v_readlane_b32, bld.def(s1), hi, lane));
5606 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
5607 emit_split_vector(ctx, dst, 2);
5608 } else if (instr->dest.ssa.bit_size == 1 && src.regClass() == s2) {
5609 emit_wqm(ctx, bld.sopc(aco_opcode::s_bitcmp1_b64, bld.def(s1, scc), src, lane), dst);
5610 } else if (src.regClass() == s1) {
5611 bld.sop1(aco_opcode::s_mov_b32, Definition(dst), src);
5612 } else if (src.regClass() == s2) {
5613 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src);
5614 } else {
5615 fprintf(stderr, "Unimplemented NIR instr bit size: ");
5616 nir_print_instr(&instr->instr, stderr);
5617 fprintf(stderr, "\n");
5618 }
5619 break;
5620 }
5621 case nir_intrinsic_vote_all: {
5622 Temp src = as_divergent_bool(ctx, get_ssa_temp(ctx, instr->src[0].ssa), false);
5623 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5624 assert(src.regClass() == s2);
5625 assert(dst.regClass() == s1);
5626
5627 Definition tmp = bld.def(s1);
5628 bld.sopc(aco_opcode::s_cmp_eq_u64, bld.scc(tmp),
5629 bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2)),
5630 Operand(exec, s2));
5631 emit_wqm(ctx, tmp.getTemp(), dst);
5632 break;
5633 }
5634 case nir_intrinsic_vote_any: {
5635 Temp src = as_divergent_bool(ctx, get_ssa_temp(ctx, instr->src[0].ssa), false);
5636 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5637 assert(src.regClass() == s2);
5638 assert(dst.regClass() == s1);
5639
5640 Definition tmp = bld.def(s1);
5641 bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.scc(tmp), src, Operand(exec, s2));
5642 emit_wqm(ctx, tmp.getTemp(), dst);
5643 break;
5644 }
5645 case nir_intrinsic_reduce:
5646 case nir_intrinsic_inclusive_scan:
5647 case nir_intrinsic_exclusive_scan: {
5648 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
5649 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5650 nir_op op = (nir_op) nir_intrinsic_reduction_op(instr);
5651 unsigned cluster_size = instr->intrinsic == nir_intrinsic_reduce ?
5652 nir_intrinsic_cluster_size(instr) : 0;
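/* cluster_size == 0 means the whole subgroup; clamp to the wave size (64) and round up to a power of two */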
5653 cluster_size = util_next_power_of_two(MIN2(cluster_size ? cluster_size : 64, 64));
5654
5655 if (!ctx->divergent_vals[instr->src[0].ssa->index] && (op == nir_op_ior || op == nir_op_iand)) {
5656 emit_uniform_subgroup(ctx, instr, src);
5657 } else if (instr->dest.ssa.bit_size == 1) {
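/* for 1-bit values these reductions reduce to bitwise ops: multiply/min act as AND, add (mod 2) as XOR, max as OR */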
5658 if (op == nir_op_imul || op == nir_op_umin || op == nir_op_imin)
5659 op = nir_op_iand;
5660 else if (op == nir_op_iadd)
5661 op = nir_op_ixor;
5662 else if (op == nir_op_umax || op == nir_op_imax)
5663 op = nir_op_ior;
5664 assert(op == nir_op_iand || op == nir_op_ior || op == nir_op_ixor);
5665
5666 switch (instr->intrinsic) {
5667 case nir_intrinsic_reduce:
5668 emit_wqm(ctx, emit_boolean_reduce(ctx, op, cluster_size, src), dst);
5669 break;
5670 case nir_intrinsic_exclusive_scan:
5671 emit_wqm(ctx, emit_boolean_exclusive_scan(ctx, op, src), dst);
5672 break;
5673 case nir_intrinsic_inclusive_scan:
5674 emit_wqm(ctx, emit_boolean_inclusive_scan(ctx, op, src), dst);
5675 break;
5676 default:
5677 assert(false);
5678 }
5679 } else if (cluster_size == 1) {
5680 bld.copy(Definition(dst), src);
5681 } else {
5682 src = as_vgpr(ctx, src);
5683
5684 ReduceOp reduce_op;
5685 switch (op) {
5686 #define CASE(name) case nir_op_##name: reduce_op = (src.regClass() == v1) ? name##32 : name##64; break;
5687 CASE(iadd)
5688 CASE(imul)
5689 CASE(fadd)
5690 CASE(fmul)
5691 CASE(imin)
5692 CASE(umin)
5693 CASE(fmin)
5694 CASE(imax)
5695 CASE(umax)
5696 CASE(fmax)
5697 CASE(iand)
5698 CASE(ior)
5699 CASE(ixor)
5700 default:
5701 unreachable("unknown reduction op");
5702 #undef CASE
5703 }
5704
5705 aco_opcode aco_op;
5706 switch (instr->intrinsic) {
5707 case nir_intrinsic_reduce: aco_op = aco_opcode::p_reduce; break;
5708 case nir_intrinsic_inclusive_scan: aco_op = aco_opcode::p_inclusive_scan; break;
5709 case nir_intrinsic_exclusive_scan: aco_op = aco_opcode::p_exclusive_scan; break;
5710 default:
5711 unreachable("unknown reduce intrinsic");
5712 }
5713
5714 aco_ptr<Pseudo_reduction_instruction> reduce{create_instruction<Pseudo_reduction_instruction>(aco_op, Format::PSEUDO_REDUCTION, 3, 5)};
5715 reduce->operands[0] = Operand(src);
5716 // filled in by aco_reduce_assign.cpp, used internally as part of the
5717 // reduce sequence
5718 assert(dst.size() == 1 || dst.size() == 2);
5719 reduce->operands[1] = Operand(RegClass(RegType::vgpr, dst.size()).as_linear());
5720 reduce->operands[2] = Operand(v1.as_linear());
5721
5722 Temp tmp_dst = bld.tmp(dst.regClass());
5723 reduce->definitions[0] = Definition(tmp_dst);
5724 reduce->definitions[1] = bld.def(s2); // used internally
5725 reduce->definitions[2] = Definition();
5726 reduce->definitions[3] = Definition(scc, s1);
5727 reduce->definitions[4] = Definition();
5728 reduce->reduce_op = reduce_op;
5729 reduce->cluster_size = cluster_size;
5730 ctx->block->instructions.emplace_back(std::move(reduce));
5731
5732 emit_wqm(ctx, tmp_dst, dst);
5733 }
5734 break;
5735 }
5736 case nir_intrinsic_quad_broadcast: {
5737 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
5738 if (!ctx->divergent_vals[instr->dest.ssa.index]) {
5739 emit_uniform_subgroup(ctx, instr, src);
5740 } else {
5741 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5742 unsigned lane = nir_src_as_const_value(instr->src[1])->u32;
5743 if (instr->dest.ssa.bit_size == 1 && src.regClass() == s2) {
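/* select the chosen lane's bit in every quad, AND with the active source bits, then let s_wqm broadcast each quad's result to all four lanes */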
5744 uint32_t half_mask = 0x11111111u << lane;
5745 Temp mask_tmp = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(half_mask), Operand(half_mask));
5746 Temp tmp = bld.tmp(s2);
5747 bld.sop1(aco_opcode::s_wqm_b64, Definition(tmp),
5748 bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), mask_tmp,
5749 bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2))));
5750 emit_wqm(ctx, tmp, dst);
5751 } else if (instr->dest.ssa.bit_size == 32) {
5752 emit_wqm(ctx,
5753 bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src,
5754 dpp_quad_perm(lane, lane, lane, lane)),
5755 dst);
5756 } else if (instr->dest.ssa.bit_size == 64) {
5757 Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
5758 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
5759 lo = emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), lo, dpp_quad_perm(lane, lane, lane, lane)));
5760 hi = emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), hi, dpp_quad_perm(lane, lane, lane, lane)));
5761 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
5762 emit_split_vector(ctx, dst, 2);
5763 } else {
5764 fprintf(stderr, "Unimplemented NIR instr bit size: ");
5765 nir_print_instr(&instr->instr, stderr);
5766 fprintf(stderr, "\n");
5767 }
5768 }
5769 break;
5770 }
5771 case nir_intrinsic_quad_swap_horizontal:
5772 case nir_intrinsic_quad_swap_vertical:
5773 case nir_intrinsic_quad_swap_diagonal:
5774 case nir_intrinsic_quad_swizzle_amd: {
5775 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
5776 if (!ctx->divergent_vals[instr->dest.ssa.index]) {
5777 emit_uniform_subgroup(ctx, instr, src);
5778 break;
5779 }
5780 uint16_t dpp_ctrl = 0;
5781 switch (instr->intrinsic) {
5782 case nir_intrinsic_quad_swap_horizontal:
5783 dpp_ctrl = dpp_quad_perm(1, 0, 3, 2);
5784 break;
5785 case nir_intrinsic_quad_swap_vertical:
5786 dpp_ctrl = dpp_quad_perm(2, 3, 0, 1);
5787 break;
5788 case nir_intrinsic_quad_swap_diagonal:
5789 dpp_ctrl = dpp_quad_perm(3, 2, 1, 0);
5790 break;
5791 case nir_intrinsic_quad_swizzle_amd: {
5792 dpp_ctrl = nir_intrinsic_swizzle_mask(instr);
5793 break;
5794 }
5795 default:
5796 break;
5797 }
5798
5799 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5800 if (instr->dest.ssa.bit_size == 1 && src.regClass() == s2) {
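/* materialize the mask as 0/-1 per lane, permute within the quad via DPP, then compare back into a mask */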
5801 src = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), Operand((uint32_t)-1), src);
5802 src = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl);
5803 Temp tmp = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(s2), Operand(0u), src);
5804 emit_wqm(ctx, tmp, dst);
5805 } else if (instr->dest.ssa.bit_size == 32) {
5806 Temp tmp = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl);
5807 emit_wqm(ctx, tmp, dst);
5808 } else if (instr->dest.ssa.bit_size == 64) {
5809 Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
5810 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
5811 lo = emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), lo, dpp_ctrl));
5812 hi = emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), hi, dpp_ctrl));
5813 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
5814 emit_split_vector(ctx, dst, 2);
5815 } else {
5816 fprintf(stderr, "Unimplemented NIR instr bit size: ");
5817 nir_print_instr(&instr->instr, stderr);
5818 fprintf(stderr, "\n");
5819 }
5820 break;
5821 }
5822 case nir_intrinsic_masked_swizzle_amd: {
5823 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
5824 if (!ctx->divergent_vals[instr->dest.ssa.index]) {
5825 emit_uniform_subgroup(ctx, instr, src);
5826 break;
5827 }
5828 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5829 uint32_t mask = nir_intrinsic_swizzle_mask(instr);
5830 if (dst.regClass() == v1) {
5831 emit_wqm(ctx,
5832 bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, mask, 0, false),
5833 dst);
5834 } else if (dst.regClass() == v2) {
5835 Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
5836 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
5837 lo = emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), lo, mask, 0, false));
5838 hi = emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), hi, mask, 0, false));
5839 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
5840 emit_split_vector(ctx, dst, 2);
5841 } else {
5842 fprintf(stderr, "Unimplemented NIR instr bit size: ");
5843 nir_print_instr(&instr->instr, stderr);
5844 fprintf(stderr, "\n");
5845 }
5846 break;
5847 }
5848 case nir_intrinsic_write_invocation_amd: {
5849 Temp src = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
5850 Temp val = bld.as_uniform(get_ssa_temp(ctx, instr->src[1].ssa));
5851 Temp lane = bld.as_uniform(get_ssa_temp(ctx, instr->src[2].ssa));
5852 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5853 if (dst.regClass() == v1) {
5854 /* the hardware ignores src2 for writelane; it exists so RA assigns dst the same register and the other lanes keep their old value */
5855 emit_wqm(ctx, bld.vop3(aco_opcode::v_writelane_b32, bld.def(v1), val, lane, src), dst);
5856 } else if (dst.regClass() == v2) {
5857 Temp src_lo = bld.tmp(v1), src_hi = bld.tmp(v1);
5858 Temp val_lo = bld.tmp(s1), val_hi = bld.tmp(s1);
5859 bld.pseudo(aco_opcode::p_split_vector, Definition(src_lo), Definition(src_hi), src);
5860 bld.pseudo(aco_opcode::p_split_vector, Definition(val_lo), Definition(val_hi), val);
5861 Temp lo = emit_wqm(ctx, bld.vop3(aco_opcode::v_writelane_b32, bld.def(v1), val_lo, lane, src_lo));
5862 Temp hi = emit_wqm(ctx, bld.vop3(aco_opcode::v_writelane_b32, bld.def(v1), val_hi, lane, src_hi));
5863 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
5864 emit_split_vector(ctx, dst, 2);
5865 } else {
5866 fprintf(stderr, "Unimplemented NIR instr bit size: ");
5867 nir_print_instr(&instr->instr, stderr);
5868 fprintf(stderr, "\n");
5869 }
5870 break;
5871 }
5872 case nir_intrinsic_mbcnt_amd: {
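/* mbcnt yields, per lane, the number of bits of the 64-bit mask set in lower lanes (lo counts bits 0-31, hi adds bits 32-63) */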
5873 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
5874 RegClass rc = RegClass(src.type(), 1);
5875 Temp mask_lo = bld.tmp(rc), mask_hi = bld.tmp(rc);
5876 bld.pseudo(aco_opcode::p_split_vector, Definition(mask_lo), Definition(mask_hi), src);
5877 Temp tmp = bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, bld.def(v1), mask_lo, Operand(0u));
5878 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5879 Temp wqm_tmp = bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32, bld.def(v1), mask_hi, tmp);
5880 emit_wqm(ctx, wqm_tmp, dst);
5881 break;
5882 }
5883 case nir_intrinsic_load_helper_invocation: {
5884 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5885 bld.pseudo(aco_opcode::p_load_helper, Definition(dst));
5886 ctx->block->kind |= block_kind_needs_lowering;
5887 ctx->program->needs_exact = true;
5888 break;
5889 }
5890 case nir_intrinsic_is_helper_invocation: {
5891 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5892 bld.pseudo(aco_opcode::p_is_helper, Definition(dst));
5893 ctx->block->kind |= block_kind_needs_lowering;
5894 ctx->program->needs_exact = true;
5895 break;
5896 }
5897 case nir_intrinsic_demote:
5898 bld.pseudo(aco_opcode::p_demote_to_helper);
5899 ctx->block->kind |= block_kind_uses_demote;
5900 ctx->program->needs_exact = true;
5901 break;
5902 case nir_intrinsic_demote_if: {
5903 Temp cond = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc),
5904 as_divergent_bool(ctx, get_ssa_temp(ctx, instr->src[0].ssa), false),
5905 Operand(exec, s2));
5906 bld.pseudo(aco_opcode::p_demote_to_helper, cond);
5907 ctx->block->kind |= block_kind_uses_demote;
5908 ctx->program->needs_exact = true;
5909 break;
5910 }
5911 case nir_intrinsic_first_invocation: {
5912 emit_wqm(ctx, bld.sop1(aco_opcode::s_ff1_i32_b64, bld.def(s1), Operand(exec, s2)),
5913 get_ssa_temp(ctx, &instr->dest.ssa));
5914 break;
5915 }
5916 case nir_intrinsic_shader_clock:
5917 bld.smem(aco_opcode::s_memtime, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), false);
5918 emit_split_vector(ctx, get_ssa_temp(ctx, &instr->dest.ssa), 2);
5919 break;
5920 case nir_intrinsic_load_vertex_id_zero_base: {
5921 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5922 bld.copy(Definition(dst), ctx->vertex_id);
5923 break;
5924 }
5925 case nir_intrinsic_load_first_vertex: {
5926 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5927 bld.copy(Definition(dst), ctx->base_vertex);
5928 break;
5929 }
5930 case nir_intrinsic_load_base_instance: {
5931 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5932 bld.copy(Definition(dst), ctx->start_instance);
5933 break;
5934 }
5935 case nir_intrinsic_load_instance_id: {
5936 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5937 bld.copy(Definition(dst), ctx->instance_id);
5938 break;
5939 }
5940 case nir_intrinsic_load_draw_id: {
5941 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5942 bld.copy(Definition(dst), ctx->draw_id);
5943 break;
5944 }
5945 default:
5946 fprintf(stderr, "Unimplemented intrinsic instr: ");
5947 nir_print_instr(&instr->instr, stderr);
5948 fprintf(stderr, "\n");
5949 abort();
5950
5951 break;
5952 }
5953 }
5954
5955
5956 void tex_fetch_ptrs(isel_context *ctx, nir_tex_instr *instr,
5957 Temp *res_ptr, Temp *samp_ptr, Temp *fmask_ptr,
5958 enum glsl_base_type *stype)
5959 {
5960 nir_deref_instr *texture_deref_instr = NULL;
5961 nir_deref_instr *sampler_deref_instr = NULL;
5962 int plane = -1;
5963
5964 for (unsigned i = 0; i < instr->num_srcs; i++) {
5965 switch (instr->src[i].src_type) {
5966 case nir_tex_src_texture_deref:
5967 texture_deref_instr = nir_src_as_deref(instr->src[i].src);
5968 break;
5969 case nir_tex_src_sampler_deref:
5970 sampler_deref_instr = nir_src_as_deref(instr->src[i].src);
5971 break;
5972 case nir_tex_src_plane:
5973 plane = nir_src_as_int(instr->src[i].src);
5974 break;
5975 default:
5976 break;
5977 }
5978 }
5979
5980 *stype = glsl_get_sampler_result_type(texture_deref_instr->type);
5981
5982 if (!sampler_deref_instr)
5983 sampler_deref_instr = texture_deref_instr;
5984
5985 if (plane >= 0) {
5986 assert(instr->op != nir_texop_txf_ms &&
5987 instr->op != nir_texop_samples_identical);
5988 assert(instr->sampler_dim != GLSL_SAMPLER_DIM_BUF);
5989 *res_ptr = get_sampler_desc(ctx, texture_deref_instr, (aco_descriptor_type)(ACO_DESC_PLANE_0 + plane), instr, false, false);
5990 } else if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) {
5991 *res_ptr = get_sampler_desc(ctx, texture_deref_instr, ACO_DESC_BUFFER, instr, false, false);
5992 } else {
5993 *res_ptr = get_sampler_desc(ctx, texture_deref_instr, ACO_DESC_IMAGE, instr, false, false);
5994 }
5995 if (samp_ptr) {
5996 *samp_ptr = get_sampler_desc(ctx, sampler_deref_instr, ACO_DESC_SAMPLER, instr, false, false);
5997 if (instr->sampler_dim < GLSL_SAMPLER_DIM_RECT && ctx->options->chip_class < GFX8) {
5998 fprintf(stderr, "Unimplemented sampler descriptor: ");
5999 nir_print_instr(&instr->instr, stderr);
6000 fprintf(stderr, "\n");
6001 abort();
6002 // TODO: build samp_ptr = and(samp_ptr, res_ptr)
6003 }
6004 }
6005 if (fmask_ptr && (instr->op == nir_texop_txf_ms ||
6006 instr->op == nir_texop_samples_identical))
6007 *fmask_ptr = get_sampler_desc(ctx, texture_deref_instr, ACO_DESC_FMASK, instr, false, false);
6008 }
6009
6010 void build_cube_select(isel_context *ctx, Temp ma, Temp id, Temp deriv,
6011 Temp *out_ma, Temp *out_sc, Temp *out_tc)
6012 {
6013 Builder bld(ctx->program, ctx->block);
6014
6015 Temp deriv_x = emit_extract_vector(ctx, deriv, 0, v1);
6016 Temp deriv_y = emit_extract_vector(ctx, deriv, 1, v1);
6017 Temp deriv_z = emit_extract_vector(ctx, deriv, 2, v1);
6018
6019 Operand neg_one(0xbf800000u);
6020 Operand one(0x3f800000u);
6021 Operand two(0x40000000u);
6022 Operand four(0x40800000u);
6023
6024 Temp is_ma_positive = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(s2)), Operand(0u), ma);
6025 Temp sgn_ma = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), neg_one, one, is_ma_positive);
6026 Temp neg_sgn_ma = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), Operand(0u), sgn_ma);
6027
6028 Temp is_ma_z = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(s2)), four, id);
6029 Temp is_ma_y = bld.vopc(aco_opcode::v_cmp_le_f32, bld.def(s2), two, id);
6030 is_ma_y = bld.sop2(aco_opcode::s_andn2_b64, bld.hint_vcc(bld.def(s2)), bld.def(s1, scc), is_ma_y, is_ma_z);
6031 Temp is_not_ma_x = bld.sop2(aco_opcode::s_or_b64, bld.hint_vcc(bld.def(s2)), bld.def(s1, scc), is_ma_z, is_ma_y);
6032
6033 // select sc
6034 Temp tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), deriv_z, deriv_x, is_not_ma_x);
6035 Temp sgn = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1),
6036 bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), neg_sgn_ma, sgn_ma, is_ma_z),
6037 one, is_ma_y);
6038 *out_sc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), tmp, sgn);
6039
6040 // select tc
6041 tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), deriv_y, deriv_z, is_ma_y);
6042 sgn = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), neg_one, sgn_ma, is_ma_y);
6043 *out_tc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), tmp, sgn);
6044
6045 // select ma
6046 tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
6047 bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), deriv_x, deriv_y, is_ma_y),
6048 deriv_z, is_ma_z);
6049 tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7fffffffu), tmp);
6050 *out_ma = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), two, tmp);
6051 }
6052
6053 void prepare_cube_coords(isel_context *ctx, Temp* coords, Temp* ddx, Temp* ddy, bool is_deriv, bool is_array)
6054 {
6055 Builder bld(ctx->program, ctx->block);
6056 Temp coord_args[4], ma, tc, sc, id;
6057 for (unsigned i = 0; i < (is_array ? 4 : 3); i++)
6058 coord_args[i] = emit_extract_vector(ctx, *coords, i, v1);
6059
6060 if (is_array) {
6061 coord_args[3] = bld.vop1(aco_opcode::v_rndne_f32, bld.def(v1), coord_args[3]);
6062
6063 // see comment in ac_prepare_cube_coords()
6064 if (ctx->options->chip_class <= GFX8)
6065 coord_args[3] = bld.vop2(aco_opcode::v_max_f32, bld.def(v1), Operand(0u), coord_args[3]);
6066 }
6067
6068 ma = bld.vop3(aco_opcode::v_cubema_f32, bld.def(v1), coord_args[0], coord_args[1], coord_args[2]);
6069
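/* invma = 1 / |ma|: build v_rcp_f32 as VOP3 manually so the abs modifier can be set on the operand */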
6070 aco_ptr<VOP3A_instruction> vop3a{create_instruction<VOP3A_instruction>(aco_opcode::v_rcp_f32, asVOP3(Format::VOP1), 1, 1)};
6071 vop3a->operands[0] = Operand(ma);
6072 vop3a->abs[0] = true;
6073 Temp invma = bld.tmp(v1);
6074 vop3a->definitions[0] = Definition(invma);
6075 ctx->block->instructions.emplace_back(std::move(vop3a));
6076
6077 sc = bld.vop3(aco_opcode::v_cubesc_f32, bld.def(v1), coord_args[0], coord_args[1], coord_args[2]);
6078 if (!is_deriv)
6079 sc = bld.vop2(aco_opcode::v_madak_f32, bld.def(v1), sc, invma, Operand(0x3fc00000u/*1.5*/));
6080
6081 tc = bld.vop3(aco_opcode::v_cubetc_f32, bld.def(v1), coord_args[0], coord_args[1], coord_args[2]);
6082 if (!is_deriv)
6083 tc = bld.vop2(aco_opcode::v_madak_f32, bld.def(v1), tc, invma, Operand(0x3fc00000u/*1.5*/));
6084
6085 id = bld.vop3(aco_opcode::v_cubeid_f32, bld.def(v1), coord_args[0], coord_args[1], coord_args[2]);
6086
6087 if (is_deriv) {
6088 sc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), sc, invma);
6089 tc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), tc, invma);
6090
6091 for (unsigned i = 0; i < 2; i++) {
6092 // see comment in ac_prepare_cube_coords()
6093 Temp deriv_ma;
6094 Temp deriv_sc, deriv_tc;
6095 build_cube_select(ctx, ma, id, i ? *ddy : *ddx,
6096 &deriv_ma, &deriv_sc, &deriv_tc);
6097
6098 deriv_ma = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_ma, invma);
6099
6100 Temp x = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1),
6101 bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_sc, invma),
6102 bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_ma, sc));
6103 Temp y = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1),
6104 bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_tc, invma),
6105 bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_ma, tc));
6106 *(i ? ddy : ddx) = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), x, y);
6107 }
6108
6109 sc = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), Operand(0x3fc00000u/*1.5*/), sc);
6110 tc = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), Operand(0x3fc00000u/*1.5*/), tc);
6111 }
6112
6113 if (is_array)
6114 id = bld.vop2(aco_opcode::v_madmk_f32, bld.def(v1), coord_args[3], id, Operand(0x41000000u/*8.0*/));
6115 *coords = bld.pseudo(aco_opcode::p_create_vector, bld.def(v3), sc, tc, id);
6116
6117 }
6118
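/* Rounds the given coordinate component (the array slice) to the nearest integer before it is used for layer selection. */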
6119 Temp apply_round_slice(isel_context *ctx, Temp coords, unsigned idx)
6120 {
6121 Temp coord_vec[3];
6122 for (unsigned i = 0; i < coords.size(); i++)
6123 coord_vec[i] = emit_extract_vector(ctx, coords, i, v1);
6124
6125 Builder bld(ctx->program, ctx->block);
6126 coord_vec[idx] = bld.vop1(aco_opcode::v_rndne_f32, bld.def(v1), coord_vec[idx]);
6127
6128 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, coords.size(), 1)};
6129 for (unsigned i = 0; i < coords.size(); i++)
6130 vec->operands[i] = Operand(coord_vec[i]);
6131 Temp res = bld.tmp(RegType::vgpr, coords.size());
6132 vec->definitions[0] = Definition(res);
6133 ctx->block->instructions.emplace_back(std::move(vec));
6134 return res;
6135 }
6136
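/* If vec comes from a nir vecN instruction, collect each component's constant value (NULL for components that are not unswizzled constants). */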
6137 void get_const_vec(nir_ssa_def *vec, nir_const_value *cv[4])
6138 {
6139 if (vec->parent_instr->type != nir_instr_type_alu)
6140 return;
6141 nir_alu_instr *vec_instr = nir_instr_as_alu(vec->parent_instr);
6142 if (vec_instr->op != nir_op_vec(vec->num_components))
6143 return;
6144
6145 for (unsigned i = 0; i < vec->num_components; i++) {
6146 cv[i] = vec_instr->src[i].swizzle[0] == 0 ?
6147 nir_src_as_const_value(vec_instr->src[i].src) : NULL;
6148 }
6149 }
6150
6151 void visit_tex(isel_context *ctx, nir_tex_instr *instr)
6152 {
6153 Builder bld(ctx->program, ctx->block);
6154 bool has_bias = false, has_lod = false, level_zero = false, has_compare = false,
6155 has_offset = false, has_ddx = false, has_ddy = false, has_derivs = false, has_sample_index = false;
6156 Temp resource, sampler, fmask_ptr, bias = Temp(), coords, compare = Temp(), sample_index = Temp(),
6157 lod = Temp(), offset = Temp(), ddx = Temp(), ddy = Temp(), derivs = Temp();
6158 nir_const_value *sample_index_cv = NULL;
6159 nir_const_value *const_offset[4] = {NULL, NULL, NULL, NULL};
6160 enum glsl_base_type stype;
6161 tex_fetch_ptrs(ctx, instr, &resource, &sampler, &fmask_ptr, &stype);
6162
6163 bool tg4_integer_workarounds = ctx->options->chip_class <= GFX8 && instr->op == nir_texop_tg4 &&
6164 (stype == GLSL_TYPE_UINT || stype == GLSL_TYPE_INT);
6165 bool tg4_integer_cube_workaround = tg4_integer_workarounds &&
6166 instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE;
6167
6168 for (unsigned i = 0; i < instr->num_srcs; i++) {
6169 switch (instr->src[i].src_type) {
6170 case nir_tex_src_coord:
6171 coords = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[i].src.ssa));
6172 break;
6173 case nir_tex_src_bias:
6174 if (instr->op == nir_texop_txb) {
6175 bias = get_ssa_temp(ctx, instr->src[i].src.ssa);
6176 has_bias = true;
6177 }
6178 break;
6179 case nir_tex_src_lod: {
6180 nir_const_value *val = nir_src_as_const_value(instr->src[i].src);
6181
6182 if (val && val->f32 <= 0.0) {
6183 level_zero = true;
6184 } else {
6185 lod = get_ssa_temp(ctx, instr->src[i].src.ssa);
6186 has_lod = true;
6187 }
6188 break;
6189 }
6190 case nir_tex_src_comparator:
6191 if (instr->is_shadow) {
6192 compare = get_ssa_temp(ctx, instr->src[i].src.ssa);
6193 has_compare = true;
6194 }
6195 break;
6196 case nir_tex_src_offset:
6197 offset = get_ssa_temp(ctx, instr->src[i].src.ssa);
6198 get_const_vec(instr->src[i].src.ssa, const_offset);
6199 has_offset = true;
6200 break;
6201 case nir_tex_src_ddx:
6202 ddx = get_ssa_temp(ctx, instr->src[i].src.ssa);
6203 has_ddx = true;
6204 break;
6205 case nir_tex_src_ddy:
6206 ddy = get_ssa_temp(ctx, instr->src[i].src.ssa);
6207 has_ddy = true;
6208 break;
6209 case nir_tex_src_ms_index:
6210 sample_index = get_ssa_temp(ctx, instr->src[i].src.ssa);
6211 sample_index_cv = nir_src_as_const_value(instr->src[i].src);
6212 has_sample_index = true;
6213 break;
6214 case nir_tex_src_texture_offset:
6215 case nir_tex_src_sampler_offset:
6216 default:
6217 break;
6218 }
6219 }
6220 // TODO: all other cases: structure taken from ac_nir_to_llvm.c
6221 if (instr->op == nir_texop_txs && instr->sampler_dim == GLSL_SAMPLER_DIM_BUF)
6222 return get_buffer_size(ctx, resource, get_ssa_temp(ctx, &instr->dest.ssa), true);
6223
6224 if (instr->op == nir_texop_texture_samples) {
6225 Temp dword3 = emit_extract_vector(ctx, resource, 3, s1);
6226
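/* resource dword3 (SI image descriptor layout): log2(samples) in bits [19:16], resource type in bits [31:28]; types >= 14 are the MSAA types */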
6227 Temp samples_log2 = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), dword3, Operand(16u | 4u<<16));
6228 Temp samples = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), Operand(1u), samples_log2);
6229 Temp type = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), dword3, Operand(28u | 4u<<16 /* offset=28, width=4 */));
6230 Temp is_msaa = bld.sopc(aco_opcode::s_cmp_ge_u32, bld.def(s1, scc), type, Operand(14u));
6231
6232 bld.sop2(aco_opcode::s_cselect_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
6233 samples, Operand(1u), bld.scc(is_msaa));
6234 return;
6235 }
6236
6237 if (has_offset && instr->op != nir_texop_txf && instr->op != nir_texop_txf_ms) {
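/* pack the immediate texel offsets into one dword as 6-bit fields, one per byte (x in [5:0], y in [13:8], z in [21:16]), as the *_o sample opcodes expect */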
6238 aco_ptr<Instruction> tmp_instr;
6239 Temp acc, pack = Temp();
6240
6241 uint32_t pack_const = 0;
6242 for (unsigned i = 0; i < offset.size(); i++) {
6243 if (!const_offset[i])
6244 continue;
6245 pack_const |= (const_offset[i]->u32 & 0x3Fu) << (8u * i);
6246 }
6247
6248 if (offset.type() == RegType::sgpr) {
6249 for (unsigned i = 0; i < offset.size(); i++) {
6250 if (const_offset[i])
6251 continue;
6252
6253 acc = emit_extract_vector(ctx, offset, i, s1);
6254 acc = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), acc, Operand(0x3Fu));
6255
6256 if (i) {
6257 acc = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), acc, Operand(8u * i));
6258 }
6259
6260 if (pack == Temp()) {
6261 pack = acc;
6262 } else {
6263 pack = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), pack, acc);
6264 }
6265 }
6266
6267 if (pack_const && pack != Temp())
6268 pack = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), Operand(pack_const), pack);
6269 } else {
6270 for (unsigned i = 0; i < offset.size(); i++) {
6271 if (const_offset[i])
6272 continue;
6273
6274 acc = emit_extract_vector(ctx, offset, i, v1);
6275 acc = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x3Fu), acc);
6276
6277 if (i) {
6278 acc = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(8u * i), acc);
6279 }
6280
6281 if (pack == Temp()) {
6282 pack = acc;
6283 } else {
6284 pack = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), pack, acc);
6285 }
6286 }
6287
6288 if (pack_const && pack != Temp())
6289 pack = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand(pack_const), pack);
6290 }
6291 if (pack_const && pack == Temp())
6292 offset = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(pack_const));
6293 else if (pack == Temp())
6294 has_offset = false;
6295 else
6296 offset = pack;
6297 }
6298
6299 if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE && instr->coord_components)
6300 prepare_cube_coords(ctx, &coords, &ddx, &ddy, instr->op == nir_texop_txd, instr->is_array && instr->op != nir_texop_lod);
6301
6302 /* pack derivatives */
6303 if (has_ddx || has_ddy) {
6304 if (instr->sampler_dim == GLSL_SAMPLER_DIM_1D && ctx->options->chip_class == GFX9) {
6305 derivs = bld.pseudo(aco_opcode::p_create_vector, bld.def(v4),
6306 ddx, Operand(0u), ddy, Operand(0u));
6307 } else {
6308 derivs = bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, ddx.size() + ddy.size()), ddx, ddy);
6309 }
6310 has_derivs = true;
6311 }
6312
6313 if (instr->coord_components > 1 &&
6314 instr->sampler_dim == GLSL_SAMPLER_DIM_1D &&
6315 instr->is_array &&
6316 instr->op != nir_texop_txf)
6317 coords = apply_round_slice(ctx, coords, 1);
6318
6319 if (instr->coord_components > 2 &&
6320 (instr->sampler_dim == GLSL_SAMPLER_DIM_2D ||
6321 instr->sampler_dim == GLSL_SAMPLER_DIM_MS ||
6322 instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS ||
6323 instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS) &&
6324 instr->is_array &&
6325 instr->op != nir_texop_txf && instr->op != nir_texop_txf_ms)
6326 coords = apply_round_slice(ctx, coords, 2);
6327
6328 if (ctx->options->chip_class == GFX9 &&
6329 instr->sampler_dim == GLSL_SAMPLER_DIM_1D &&
6330 instr->op != nir_texop_lod && instr->coord_components) {
6331 assert(coords.size() > 0 && coords.size() < 3);
6332
6333 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, coords.size() + 1, 1)};
6334 vec->operands[0] = Operand(emit_extract_vector(ctx, coords, 0, v1));
6335 vec->operands[1] = instr->op == nir_texop_txf ? Operand((uint32_t) 0) : Operand((uint32_t) 0x3f000000);
6336 if (coords.size() > 1)
6337 vec->operands[2] = Operand(emit_extract_vector(ctx, coords, 1, v1));
6338 coords = bld.tmp(RegType::vgpr, coords.size() + 1);
6339 vec->definitions[0] = Definition(coords);
6340 ctx->block->instructions.emplace_back(std::move(vec));
6341 }
6342
6343 bool da = should_declare_array(ctx, instr->sampler_dim, instr->is_array);
6344
6345 if (instr->op == nir_texop_samples_identical)
6346 resource = fmask_ptr;
6347
6348 else if ((instr->sampler_dim == GLSL_SAMPLER_DIM_MS ||
6349 instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS) &&
6350 instr->op != nir_texop_txs) {
6351 assert(has_sample_index);
6352 Operand op(sample_index);
6353 if (sample_index_cv)
6354 op = Operand(sample_index_cv->u32);
6355 sample_index = adjust_sample_index_using_fmask(ctx, da, coords, op, fmask_ptr);
6356 }
6357
6358 if (has_offset && (instr->op == nir_texop_txf || instr->op == nir_texop_txf_ms)) {
6359 Temp split_coords[coords.size()];
6360 emit_split_vector(ctx, coords, coords.size());
6361 for (unsigned i = 0; i < coords.size(); i++)
6362 split_coords[i] = emit_extract_vector(ctx, coords, i, v1);
6363
6364 unsigned i = 0;
6365 for (; i < std::min(offset.size(), instr->coord_components); i++) {
6366 Temp off = emit_extract_vector(ctx, offset, i, v1);
6367 split_coords[i] = bld.vadd32(bld.def(v1), split_coords[i], off);
6368 }
6369
6370 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, coords.size(), 1)};
6371 for (unsigned i = 0; i < coords.size(); i++)
6372 vec->operands[i] = Operand(split_coords[i]);
6373 coords = bld.tmp(coords.regClass());
6374 vec->definitions[0] = Definition(coords);
6375 ctx->block->instructions.emplace_back(std::move(vec));
6376
6377 has_offset = false;
6378 }
6379
6380 /* Build tex instruction */
6381 unsigned dmask = nir_ssa_def_components_read(&instr->dest.ssa);
6382 unsigned dim = ctx->options->chip_class >= GFX10 && instr->sampler_dim != GLSL_SAMPLER_DIM_BUF
6383 ? ac_get_sampler_dim(ctx->options->chip_class, instr->sampler_dim, instr->is_array)
6384 : 0;
6385 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6386 Temp tmp_dst = dst;
6387
6388 /* gather4 selects the component by dmask and always returns vec4 */
6389 if (instr->op == nir_texop_tg4) {
6390 assert(instr->dest.ssa.num_components == 4);
6391 if (instr->is_shadow)
6392 dmask = 1;
6393 else
6394 dmask = 1 << instr->component;
6395 if (tg4_integer_cube_workaround || dst.type() == RegType::sgpr)
6396 tmp_dst = bld.tmp(v4);
6397 } else if (instr->op == nir_texop_samples_identical) {
6398 tmp_dst = bld.tmp(v1);
6399 } else if (util_bitcount(dmask) != instr->dest.ssa.num_components || dst.type() == RegType::sgpr) {
6400 tmp_dst = bld.tmp(RegClass(RegType::vgpr, util_bitcount(dmask)));
6401 }
6402
6403 aco_ptr<MIMG_instruction> tex;
6404 if (instr->op == nir_texop_txs || instr->op == nir_texop_query_levels) {
6405 if (!has_lod)
6406 lod = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0u));
6407
6408 bool div_by_6 = instr->op == nir_texop_txs &&
6409 instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE &&
6410 instr->is_array &&
6411 (dmask & (1 << 2));
6412 if (tmp_dst.id() == dst.id() && div_by_6)
6413 tmp_dst = bld.tmp(tmp_dst.regClass());
6414
6415 tex.reset(create_instruction<MIMG_instruction>(aco_opcode::image_get_resinfo, Format::MIMG, 2, 1));
6416 tex->operands[0] = Operand(as_vgpr(ctx,lod));
6417 tex->operands[1] = Operand(resource);
6418 if (ctx->options->chip_class == GFX9 &&
6419 instr->op == nir_texop_txs &&
6420 instr->sampler_dim == GLSL_SAMPLER_DIM_1D &&
6421 instr->is_array) {
6422 tex->dmask = (dmask & 0x1) | ((dmask & 0x2) << 1);
6423 } else if (instr->op == nir_texop_query_levels) {
6424 tex->dmask = 1 << 3;
6425 } else {
6426 tex->dmask = dmask;
6427 }
6428 tex->da = da;
6429 tex->definitions[0] = Definition(tmp_dst);
6430 tex->dim = dim;
6431 tex->can_reorder = true;
6432 ctx->block->instructions.emplace_back(std::move(tex));
6433
6434 if (div_by_6) {
6435 /* divide 3rd value by 6 by multiplying with magic number */
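/* (for cube array txs the hardware reports layers = 6 * cubes, while GLSL expects the number of cubes) */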
6436 emit_split_vector(ctx, tmp_dst, tmp_dst.size());
6437 Temp c = bld.copy(bld.def(s1), Operand((uint32_t) 0x2AAAAAAB));
6438 Temp by_6 = bld.vop3(aco_opcode::v_mul_hi_i32, bld.def(v1), emit_extract_vector(ctx, tmp_dst, 2, v1), c);
6439 assert(instr->dest.ssa.num_components == 3);
6440 Temp tmp = dst.type() == RegType::vgpr ? dst : bld.tmp(v3);
6441 tmp_dst = bld.pseudo(aco_opcode::p_create_vector, Definition(tmp),
6442 emit_extract_vector(ctx, tmp_dst, 0, v1),
6443 emit_extract_vector(ctx, tmp_dst, 1, v1),
6444 by_6);
6445
6446 }
6447
6448 expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, dmask);
6449 return;
6450 }
6451
6452 Temp tg4_compare_cube_wa64 = Temp();
6453
6454 if (tg4_integer_workarounds) {
6455 tex.reset(create_instruction<MIMG_instruction>(aco_opcode::image_get_resinfo, Format::MIMG, 2, 1));
6456 tex->operands[0] = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0u));
6457 tex->operands[1] = Operand(resource);
6458 tex->dim = dim;
6459 tex->dmask = 0x3;
6460 tex->da = da;
6461 Temp size = bld.tmp(v2);
6462 tex->definitions[0] = Definition(size);
6463 tex->can_reorder = true;
6464 ctx->block->instructions.emplace_back(std::move(tex));
6465 emit_split_vector(ctx, size, size.size());
6466
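/* gather4 on integer formats fetches wrongly rounded texels on GFX8 and older; compute -0.5 / size so the coordinates can be shifted back by half a texel (see lower_gather4_integer() in ac_nir_to_llvm.c) */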
6467 Temp half_texel[2];
6468 for (unsigned i = 0; i < 2; i++) {
6469 half_texel[i] = emit_extract_vector(ctx, size, i, v1);
6470 half_texel[i] = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), half_texel[i]);
6471 half_texel[i] = bld.vop1(aco_opcode::v_rcp_iflag_f32, bld.def(v1), half_texel[i]);
6472 half_texel[i] = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0xbf000000/*-0.5*/), half_texel[i]);
6473 }
6474
6475 Temp orig_coords[2] = {
6476 emit_extract_vector(ctx, coords, 0, v1),
6477 emit_extract_vector(ctx, coords, 1, v1)};
6478 Temp new_coords[2] = {
6479 bld.vop2(aco_opcode::v_add_f32, bld.def(v1), orig_coords[0], half_texel[0]),
6480 bld.vop2(aco_opcode::v_add_f32, bld.def(v1), orig_coords[1], half_texel[1])
6481 };
6482
6483 if (tg4_integer_cube_workaround) {
6484 // see comment in ac_nir_to_llvm.c's lower_gather4_integer()
6485 Temp desc[resource.size()];
6486 aco_ptr<Instruction> split{create_instruction<Pseudo_instruction>(aco_opcode::p_split_vector,
6487 Format::PSEUDO, 1, resource.size())};
6488 split->operands[0] = Operand(resource);
6489 for (unsigned i = 0; i < resource.size(); i++) {
6490 desc[i] = bld.tmp(s1);
6491 split->definitions[i] = Definition(desc[i]);
6492 }
6493 ctx->block->instructions.emplace_back(std::move(split));
6494
6495 Temp dfmt = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), desc[1], Operand(20u | (6u << 16)));
6496 Temp compare_cube_wa = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), dfmt,
6497 Operand((uint32_t)V_008F14_IMG_DATA_FORMAT_8_8_8_8));
6498
6499 Temp nfmt;
6500 if (stype == GLSL_TYPE_UINT) {
6501 nfmt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1),
6502 Operand((uint32_t)V_008F14_IMG_NUM_FORMAT_USCALED),
6503 Operand((uint32_t)V_008F14_IMG_NUM_FORMAT_UINT),
6504 bld.scc(compare_cube_wa));
6505 } else {
6506 nfmt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1),
6507 Operand((uint32_t)V_008F14_IMG_NUM_FORMAT_SSCALED),
6508 Operand((uint32_t)V_008F14_IMG_NUM_FORMAT_SINT),
6509 bld.scc(compare_cube_wa));
6510 }
6511 tg4_compare_cube_wa64 = as_divergent_bool(ctx, compare_cube_wa, true);
6512 nfmt = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), nfmt, Operand(26u));
6513
6514 desc[1] = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), desc[1],
6515 Operand((uint32_t)C_008F14_NUM_FORMAT));
6516 desc[1] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), desc[1], nfmt);
6517
6518 aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector,
6519 Format::PSEUDO, resource.size(), 1)};
6520 for (unsigned i = 0; i < resource.size(); i++)
6521 vec->operands[i] = Operand(desc[i]);
6522 resource = bld.tmp(resource.regClass());
6523 vec->definitions[0] = Definition(resource);
6524 ctx->block->instructions.emplace_back(std::move(vec));
6525
6526 new_coords[0] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
6527 new_coords[0], orig_coords[0], tg4_compare_cube_wa64);
6528 new_coords[1] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
6529 new_coords[1], orig_coords[1], tg4_compare_cube_wa64);
6530 }
6531
6532 if (coords.size() == 3) {
6533 coords = bld.pseudo(aco_opcode::p_create_vector, bld.def(v3),
6534 new_coords[0], new_coords[1],
6535 emit_extract_vector(ctx, coords, 2, v1));
6536 } else {
6537 assert(coords.size() == 2);
6538 coords = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2),
6539 new_coords[0], new_coords[1]);
6540 }
6541 }
6542
6543 if (!(has_ddx && has_ddy) && !has_lod && !level_zero &&
6544 instr->sampler_dim != GLSL_SAMPLER_DIM_MS &&
6545 instr->sampler_dim != GLSL_SAMPLER_DIM_SUBPASS_MS)
6546 coords = emit_wqm(ctx, coords, bld.tmp(coords.regClass()), true);
6547
6548 std::vector<Operand> args;
6549 if (has_offset)
6550 args.emplace_back(Operand(offset));
6551 if (has_bias)
6552 args.emplace_back(Operand(bias));
6553 if (has_compare)
6554 args.emplace_back(Operand(compare));
6555 if (has_derivs)
6556 args.emplace_back(Operand(derivs));
6557 args.emplace_back(Operand(coords));
6558 if (has_sample_index)
6559 args.emplace_back(Operand(sample_index));
6560 if (has_lod)
6561 args.emplace_back(lod);
6562
6563 Operand arg;
6564 if (args.size() > 1) {
6565 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, args.size(), 1)};
6566 unsigned size = 0;
6567 for (unsigned i = 0; i < args.size(); i++) {
6568 size += args[i].size();
6569 vec->operands[i] = args[i];
6570 }
6571 RegClass rc = RegClass(RegType::vgpr, size);
6572 Temp tmp = bld.tmp(rc);
6573 vec->definitions[0] = Definition(tmp);
6574 ctx->block->instructions.emplace_back(std::move(vec));
6575 arg = Operand(tmp);
6576 } else {
6577 assert(args[0].isTemp());
6578 arg = Operand(as_vgpr(ctx, args[0].getTemp()));
6579 }
6580
6581 if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) {
6582 //FIXME: if (ctx->abi->gfx9_stride_size_workaround) return ac_build_buffer_load_format_gfx9_safe()
6583
6584 assert(coords.size() == 1);
6585 unsigned last_bit = util_last_bit(nir_ssa_def_components_read(&instr->dest.ssa));
6586 aco_opcode op;
6587 switch (last_bit) {
6588 case 1:
6589 op = aco_opcode::buffer_load_format_x; break;
6590 case 2:
6591 op = aco_opcode::buffer_load_format_xy; break;
6592 case 3:
6593 op = aco_opcode::buffer_load_format_xyz; break;
6594 case 4:
6595 op = aco_opcode::buffer_load_format_xyzw; break;
6596 default:
6597 unreachable("Tex instruction loads more than 4 components.");
6598 }
6599
6600 /* if the instruction return value matches exactly the nir dest ssa, we can use it directly */
6601 if (last_bit == instr->dest.ssa.num_components && dst.type() == RegType::vgpr)
6602 tmp_dst = dst;
6603 else
6604 tmp_dst = bld.tmp(RegType::vgpr, last_bit);
6605
6606 aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)};
6607 mubuf->operands[0] = Operand(coords);
6608 mubuf->operands[1] = Operand(resource);
6609 mubuf->operands[2] = Operand((uint32_t) 0);
6610 mubuf->definitions[0] = Definition(tmp_dst);
6611 mubuf->idxen = true;
6612 mubuf->can_reorder = true;
6613 ctx->block->instructions.emplace_back(std::move(mubuf));
6614
6615 expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, (1 << last_bit) - 1);
6616 return;
6617 }
6618
6619
6620 if (instr->op == nir_texop_txf ||
6621 instr->op == nir_texop_txf_ms ||
6622 instr->op == nir_texop_samples_identical) {
6623 aco_opcode op = level_zero || instr->sampler_dim == GLSL_SAMPLER_DIM_MS ? aco_opcode::image_load : aco_opcode::image_load_mip;
6624 tex.reset(create_instruction<MIMG_instruction>(op, Format::MIMG, 2, 1));
6625 tex->operands[0] = Operand(arg);
6626 tex->operands[1] = Operand(resource);
6627 tex->dim = dim;
6628 tex->dmask = dmask;
6629 tex->unrm = true;
6630 tex->da = da;
6631 tex->definitions[0] = Definition(tmp_dst);
6632 tex->can_reorder = true;
6633 ctx->block->instructions.emplace_back(std::move(tex));
6634
6635 if (instr->op == nir_texop_samples_identical) {
6636 assert(dmask == 1 && dst.regClass() == v1);
6637 assert(dst.id() != tmp_dst.id());
6638
6639 Temp tmp = bld.tmp(s2);
6640 bld.vopc(aco_opcode::v_cmp_eq_u32, Definition(tmp), Operand(0u), tmp_dst).def(0).setHint(vcc);
6641 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand((uint32_t)-1), tmp);
6642
6643 } else {
6644 expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, dmask);
6645 }
6646 return;
6647 }
6648
6649 // TODO: it would be better to do this by adding offsets, but that requires the opcodes to be ordered accordingly.
6650 aco_opcode opcode = aco_opcode::image_sample;
6651 if (has_offset) { /* image_sample_*_o */
6652 if (has_compare) {
6653 opcode = aco_opcode::image_sample_c_o;
6654 if (has_derivs)
6655 opcode = aco_opcode::image_sample_c_d_o;
6656 if (has_bias)
6657 opcode = aco_opcode::image_sample_c_b_o;
6658 if (level_zero)
6659 opcode = aco_opcode::image_sample_c_lz_o;
6660 if (has_lod)
6661 opcode = aco_opcode::image_sample_c_l_o;
6662 } else {
6663 opcode = aco_opcode::image_sample_o;
6664 if (has_derivs)
6665 opcode = aco_opcode::image_sample_d_o;
6666 if (has_bias)
6667 opcode = aco_opcode::image_sample_b_o;
6668 if (level_zero)
6669 opcode = aco_opcode::image_sample_lz_o;
6670 if (has_lod)
6671 opcode = aco_opcode::image_sample_l_o;
6672 }
6673 } else { /* no offset */
6674 if (has_compare) {
6675 opcode = aco_opcode::image_sample_c;
6676 if (has_derivs)
6677 opcode = aco_opcode::image_sample_c_d;
6678 if (has_bias)
6679 opcode = aco_opcode::image_sample_c_b;
6680 if (level_zero)
6681 opcode = aco_opcode::image_sample_c_lz;
6682 if (has_lod)
6683 opcode = aco_opcode::image_sample_c_l;
6684 } else {
6685 opcode = aco_opcode::image_sample;
6686 if (has_derivs)
6687 opcode = aco_opcode::image_sample_d;
6688 if (has_bias)
6689 opcode = aco_opcode::image_sample_b;
6690 if (level_zero)
6691 opcode = aco_opcode::image_sample_lz;
6692 if (has_lod)
6693 opcode = aco_opcode::image_sample_l;
6694 }
6695 }
6696
6697 if (instr->op == nir_texop_tg4) {
6698 if (has_offset) {
6699 opcode = aco_opcode::image_gather4_lz_o;
6700 if (has_compare)
6701 opcode = aco_opcode::image_gather4_c_lz_o;
6702 } else {
6703 opcode = aco_opcode::image_gather4_lz;
6704 if (has_compare)
6705 opcode = aco_opcode::image_gather4_c_lz;
6706 }
6707 } else if (instr->op == nir_texop_lod) {
6708 opcode = aco_opcode::image_get_lod;
6709 }
6710
6711 tex.reset(create_instruction<MIMG_instruction>(opcode, Format::MIMG, 3, 1));
6712 tex->operands[0] = arg;
6713 tex->operands[1] = Operand(resource);
6714 tex->operands[2] = Operand(sampler);
6715 tex->dim = dim;
6716 tex->dmask = dmask;
6717 tex->da = da;
6718 tex->definitions[0] = Definition(tmp_dst);
6719 tex->can_reorder = true;
6720 ctx->block->instructions.emplace_back(std::move(tex));
6721
6722 if (tg4_integer_cube_workaround) {
6723 assert(tmp_dst.id() != dst.id());
6724 assert(tmp_dst.size() == dst.size() && dst.size() == 4);
6725
6726 emit_split_vector(ctx, tmp_dst, tmp_dst.size());
6727 Temp val[4];
6728 for (unsigned i = 0; i < dst.size(); i++) {
6729 val[i] = emit_extract_vector(ctx, tmp_dst, i, v1);
6730 Temp cvt_val;
6731 if (stype == GLSL_TYPE_UINT)
6732 cvt_val = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), val[i]);
6733 else
6734 cvt_val = bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), val[i]);
6735 val[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), val[i], cvt_val, tg4_compare_cube_wa64);
6736 }
6737 Temp tmp = dst.regClass() == v4 ? dst : bld.tmp(v4);
6738 tmp_dst = bld.pseudo(aco_opcode::p_create_vector, Definition(tmp),
6739 val[0], val[1], val[2], val[3]);
6740 }
6741 unsigned mask = instr->op == nir_texop_tg4 ? 0xF : dmask;
6742 expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, mask);
6743
6744 }
6745
6746
6747 Operand get_phi_operand(isel_context *ctx, nir_ssa_def *ssa)
6748 {
6749 Temp tmp = get_ssa_temp(ctx, ssa);
6750 if (ssa->parent_instr->type == nir_instr_type_ssa_undef)
6751 return Operand(tmp.regClass());
6752 else
6753 return Operand(tmp);
6754 }
6755
6756 void visit_phi(isel_context *ctx, nir_phi_instr *instr)
6757 {
6758 aco_ptr<Pseudo_instruction> phi;
6759 unsigned num_src = exec_list_length(&instr->srcs);
6760 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6761
6762 aco_opcode opcode = !dst.is_linear() || ctx->divergent_vals[instr->dest.ssa.index] ? aco_opcode::p_phi : aco_opcode::p_linear_phi;
6763
6764 std::map<unsigned, nir_ssa_def*> phi_src;
6765 bool all_undef = true;
6766 nir_foreach_phi_src(src, instr) {
6767 phi_src[src->pred->index] = src->src.ssa;
6768 if (src->src.ssa->parent_instr->type != nir_instr_type_ssa_undef)
6769 all_undef = false;
6770 }
6771 if (all_undef) {
6772 Builder bld(ctx->program, ctx->block);
6773 if (dst.regClass() == s1) {
6774 bld.sop1(aco_opcode::s_mov_b32, Definition(dst), Operand(0u));
6775 } else if (dst.regClass() == v1) {
6776 bld.vop1(aco_opcode::v_mov_b32, Definition(dst), Operand(0u));
6777 } else {
6778 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
6779 for (unsigned i = 0; i < dst.size(); i++)
6780 vec->operands[i] = Operand(0u);
6781 vec->definitions[0] = Definition(dst);
6782 ctx->block->instructions.emplace_back(std::move(vec));
6783 }
6784 return;
6785 }
6786
6787 /* try to scalarize vector phis */
6788 if (dst.size() > 1) {
6789 // TODO: scalarize linear phis on divergent ifs
6790 bool can_scalarize = (opcode == aco_opcode::p_phi || !(ctx->block->kind & block_kind_merge));
6791 std::array<Temp, 4> new_vec;
6792 for (std::pair<const unsigned, nir_ssa_def*>& pair : phi_src) {
6793 Operand src = get_phi_operand(ctx, pair.second);
6794 if (src.isTemp() && ctx->allocated_vec.find(src.tempId()) == ctx->allocated_vec.end()) {
6795 can_scalarize = false;
6796 break;
6797 }
6798 }
6799 if (can_scalarize) {
6800 unsigned num_components = instr->dest.ssa.num_components;
6801 assert(dst.size() % num_components == 0);
6802 RegClass rc = RegClass(dst.type(), dst.size() / num_components);
6803
6804 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
6805 for (unsigned k = 0; k < num_components; k++) {
6806 phi.reset(create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, num_src, 1));
6807 std::map<unsigned, nir_ssa_def*>::iterator it = phi_src.begin();
6808 for (unsigned i = 0; i < num_src; i++) {
6809 Operand src = get_phi_operand(ctx, it->second);
6810 phi->operands[i] = src.isTemp() ? Operand(ctx->allocated_vec[src.tempId()][k]) : Operand(rc);
6811 ++it;
6812 }
6813 Temp phi_dst = {ctx->program->allocateId(), rc};
6814 phi->definitions[0] = Definition(phi_dst);
6815 ctx->block->instructions.emplace(ctx->block->instructions.begin(), std::move(phi));
6816 new_vec[k] = phi_dst;
6817 vec->operands[k] = Operand(phi_dst);
6818 }
6819 vec->definitions[0] = Definition(dst);
6820 ctx->block->instructions.emplace_back(std::move(vec));
6821 ctx->allocated_vec.emplace(dst.id(), new_vec);
6822 return;
6823 }
6824 }
6825
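/* loops lowered with a continue_or_break block (see visit_loop) give the loop exit an extra linear predecessor that carries no value, so linear phis need one additional undef operand */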
6826 unsigned extra_src = 0;
6827 if (opcode == aco_opcode::p_linear_phi && (ctx->block->kind & block_kind_loop_exit) &&
6828 ctx->program->blocks[ctx->block->index-2].kind & block_kind_continue_or_break) {
6829 extra_src++;
6830 }
6831
6832 phi.reset(create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, num_src + extra_src, 1));
6833
6834 /* if we have a linear phi on a divergent if, we know that one src is undef */
6835 if (opcode == aco_opcode::p_linear_phi && ctx->block->kind & block_kind_merge) {
6836 assert(extra_src == 0);
6837 Block* block;
6838 /* we place the phi either in the invert-block or in the current block */
6839 if (phi_src.begin()->second->parent_instr->type != nir_instr_type_ssa_undef) {
6840 assert((++phi_src.begin())->second->parent_instr->type == nir_instr_type_ssa_undef);
6841 Block& linear_else = ctx->program->blocks[ctx->block->linear_preds[1]];
6842 block = &ctx->program->blocks[linear_else.linear_preds[0]];
6843 assert(block->kind & block_kind_invert);
6844 phi->operands[0] = get_phi_operand(ctx, phi_src.begin()->second);
6845 } else {
6846 assert((++phi_src.begin())->second->parent_instr->type != nir_instr_type_ssa_undef);
6847 block = ctx->block;
6848 phi->operands[0] = get_phi_operand(ctx, (++phi_src.begin())->second);
6849 }
6850 phi->operands[1] = Operand(dst.regClass());
6851 phi->definitions[0] = Definition(dst);
6852 block->instructions.emplace(block->instructions.begin(), std::move(phi));
6853 return;
6854 }
6855
6856 std::map<unsigned, nir_ssa_def*>::iterator it = phi_src.begin();
6857 for (unsigned i = 0; i < num_src; i++) {
6858 phi->operands[i] = get_phi_operand(ctx, it->second);
6859 ++it;
6860 }
6861 for (unsigned i = 0; i < extra_src; i++)
6862 phi->operands[num_src + i] = Operand(dst.regClass());
6863 phi->definitions[0] = Definition(dst);
6864 ctx->block->instructions.emplace(ctx->block->instructions.begin(), std::move(phi));
6865 }
6866
6867
6868 void visit_undef(isel_context *ctx, nir_ssa_undef_instr *instr)
6869 {
6870 Temp dst = get_ssa_temp(ctx, &instr->def);
6871
6872 assert(dst.type() == RegType::sgpr);
6873
6874 if (dst.size() == 1) {
6875 Builder(ctx->program, ctx->block).copy(Definition(dst), Operand(0u));
6876 } else {
6877 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
6878 for (unsigned i = 0; i < dst.size(); i++)
6879 vec->operands[i] = Operand(0u);
6880 vec->definitions[0] = Definition(dst);
6881 ctx->block->instructions.emplace_back(std::move(vec));
6882 }
6883 }
6884
6885 void visit_jump(isel_context *ctx, nir_jump_instr *instr)
6886 {
6887 Builder bld(ctx->program, ctx->block);
6888 Block *logical_target;
6889 append_logical_end(ctx->block);
6890 unsigned idx = ctx->block->index;
6891
6892 switch (instr->type) {
6893 case nir_jump_break:
6894 logical_target = ctx->cf_info.parent_loop.exit;
6895 add_logical_edge(idx, logical_target);
6896 ctx->block->kind |= block_kind_break;
6897
6898 if (!ctx->cf_info.parent_if.is_divergent &&
6899 !ctx->cf_info.parent_loop.has_divergent_continue) {
6900 /* uniform break - directly jump out of the loop */
6901 ctx->block->kind |= block_kind_uniform;
6902 ctx->cf_info.has_branch = true;
6903 bld.branch(aco_opcode::p_branch);
6904 add_linear_edge(idx, logical_target);
6905 return;
6906 }
6907 ctx->cf_info.parent_loop.has_divergent_branch = true;
6908 break;
6909 case nir_jump_continue:
6910 logical_target = &ctx->program->blocks[ctx->cf_info.parent_loop.header_idx];
6911 add_logical_edge(idx, logical_target);
6912 ctx->block->kind |= block_kind_continue;
6913
6914 if (ctx->cf_info.parent_if.is_divergent) {
6915 /* for potential uniform breaks after this continue,
6916 we must ensure that they are handled correctly */
6917 ctx->cf_info.parent_loop.has_divergent_continue = true;
6918 ctx->cf_info.parent_loop.has_divergent_branch = true;
6919 } else {
6920 /* uniform continue - directly jump to the loop header */
6921 ctx->block->kind |= block_kind_uniform;
6922 ctx->cf_info.has_branch = true;
6923 bld.branch(aco_opcode::p_branch);
6924 add_linear_edge(idx, logical_target);
6925 return;
6926 }
6927 break;
6928 default:
6929 fprintf(stderr, "Unknown NIR jump instr: ");
6930 nir_print_instr(&instr->instr, stderr);
6931 fprintf(stderr, "\n");
6932 abort();
6933 }
6934
6935 /* remove critical edges from linear CFG */
6936 bld.branch(aco_opcode::p_branch);
6937 Block* break_block = ctx->program->create_and_insert_block();
6938 break_block->loop_nest_depth = ctx->cf_info.loop_nest_depth;
6939 break_block->kind |= block_kind_uniform;
6940 add_linear_edge(idx, break_block);
6941 /* the loop_header pointer might be invalidated by this point */
6942 if (instr->type == nir_jump_continue)
6943 logical_target = &ctx->program->blocks[ctx->cf_info.parent_loop.header_idx];
6944 add_linear_edge(break_block->index, logical_target);
6945 bld.reset(break_block);
6946 bld.branch(aco_opcode::p_branch);
6947
6948 Block* continue_block = ctx->program->create_and_insert_block();
6949 continue_block->loop_nest_depth = ctx->cf_info.loop_nest_depth;
6950 add_linear_edge(idx, continue_block);
6951 append_logical_start(continue_block);
6952 ctx->block = continue_block;
6953 return;
6954 }
6955
6956 void visit_block(isel_context *ctx, nir_block *block)
6957 {
6958 nir_foreach_instr(instr, block) {
6959 switch (instr->type) {
6960 case nir_instr_type_alu:
6961 visit_alu_instr(ctx, nir_instr_as_alu(instr));
6962 break;
6963 case nir_instr_type_load_const:
6964 visit_load_const(ctx, nir_instr_as_load_const(instr));
6965 break;
6966 case nir_instr_type_intrinsic:
6967 visit_intrinsic(ctx, nir_instr_as_intrinsic(instr));
6968 break;
6969 case nir_instr_type_tex:
6970 visit_tex(ctx, nir_instr_as_tex(instr));
6971 break;
6972 case nir_instr_type_phi:
6973 visit_phi(ctx, nir_instr_as_phi(instr));
6974 break;
6975 case nir_instr_type_ssa_undef:
6976 visit_undef(ctx, nir_instr_as_ssa_undef(instr));
6977 break;
6978 case nir_instr_type_deref:
6979 break;
6980 case nir_instr_type_jump:
6981 visit_jump(ctx, nir_instr_as_jump(instr));
6982 break;
6983 default:
6984 fprintf(stderr, "Unknown NIR instr type: ");
6985 nir_print_instr(instr, stderr);
6986 fprintf(stderr, "\n");
6987 //abort();
6988 }
6989 }
6990 }
6991
6992
6993
6994 static void visit_loop(isel_context *ctx, nir_loop *loop)
6995 {
6996 append_logical_end(ctx->block);
6997 ctx->block->kind |= block_kind_loop_preheader | block_kind_uniform;
6998 Builder bld(ctx->program, ctx->block);
6999 bld.branch(aco_opcode::p_branch);
7000 unsigned loop_preheader_idx = ctx->block->index;
7001
7002 Block loop_exit = Block();
7003 loop_exit.loop_nest_depth = ctx->cf_info.loop_nest_depth;
7004 loop_exit.kind |= (block_kind_loop_exit | (ctx->block->kind & block_kind_top_level));
7005
7006 Block* loop_header = ctx->program->create_and_insert_block();
7007 loop_header->loop_nest_depth = ctx->cf_info.loop_nest_depth + 1;
7008 loop_header->kind |= block_kind_loop_header;
7009 add_edge(loop_preheader_idx, loop_header);
7010 ctx->block = loop_header;
7011
7012 /* emit loop body */
7013 unsigned loop_header_idx = loop_header->index;
7014 loop_info_RAII loop_raii(ctx, loop_header_idx, &loop_exit);
7015 append_logical_start(ctx->block);
7016 visit_cf_list(ctx, &loop->body);
7017
7018 //TODO: what if a loop ends with an unconditional or uniformly branched continue and this branch is never taken?
7019 if (!ctx->cf_info.has_branch) {
7020 append_logical_end(ctx->block);
7021 if (ctx->cf_info.exec_potentially_empty) {
7022 /* Discards can result in code running with an empty exec mask.
7023 * This would result in divergent breaks not ever being taken. As a
7024 * workaround, break the loop when the loop mask is empty instead of
7025 * always continuing. */
7026 ctx->block->kind |= (block_kind_continue_or_break | block_kind_uniform);
7027
7028 /* create "loop_almost_exit" to avoid critical edges */
7029 unsigned block_idx = ctx->block->index;
7030 Block *loop_almost_exit = ctx->program->create_and_insert_block();
7031 loop_almost_exit->loop_nest_depth = ctx->cf_info.loop_nest_depth;
7032 loop_almost_exit->kind = block_kind_uniform;
7033 bld.reset(loop_almost_exit);
7034 bld.branch(aco_opcode::p_branch);
7035
7036 add_linear_edge(block_idx, loop_almost_exit);
7037 add_linear_edge(loop_almost_exit->index, &loop_exit);
7038
7039 ctx->block = &ctx->program->blocks[block_idx];
7040 } else {
7041 ctx->block->kind |= (block_kind_continue | block_kind_uniform);
7042 }
7043 if (!ctx->cf_info.parent_loop.has_divergent_branch)
7044 add_edge(ctx->block->index, &ctx->program->blocks[loop_header_idx]);
7045 else
7046 add_linear_edge(ctx->block->index, &ctx->program->blocks[loop_header_idx]);
7047 bld.reset(ctx->block);
7048 bld.branch(aco_opcode::p_branch);
7049 }
7050
7051 /* fixup phis in loop header from unreachable blocks */
7052 if (ctx->cf_info.has_branch || ctx->cf_info.parent_loop.has_divergent_branch) {
7053 bool linear = ctx->cf_info.has_branch;
7054 bool logical = ctx->cf_info.has_branch || ctx->cf_info.parent_loop.has_divergent_branch;
7055 for (aco_ptr<Instruction>& instr : ctx->program->blocks[loop_header_idx].instructions) {
7056 if ((logical && instr->opcode == aco_opcode::p_phi) ||
7057 (linear && instr->opcode == aco_opcode::p_linear_phi)) {
7058 /* the last operand should be the one that needs to be removed */
7059 instr->operands.pop_back();
7060 } else if (!is_phi(instr)) {
7061 break;
7062 }
7063 }
7064 }
7065
7066 ctx->cf_info.has_branch = false;
7067
7068 // TODO: if the loop does not have a single exit block, we need to create one
7069 /* emit loop successor block */
7070 ctx->block = ctx->program->insert_block(std::move(loop_exit));
7071 append_logical_start(ctx->block);
7072
7073 #if 0
7074 // TODO: check if it is beneficial to not branch on continues
7075 /* trim linear phis in loop header */
7076 for (auto&& instr : loop_entry->instructions) {
7077 if (instr->opcode == aco_opcode::p_linear_phi) {
7078 aco_ptr<Pseudo_instruction> new_phi{create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, loop_entry->linear_predecessors.size(), 1)};
7079 new_phi->definitions[0] = instr->definitions[0];
7080 for (unsigned i = 0; i < new_phi->operands.size(); i++)
7081 new_phi->operands[i] = instr->operands[i];
7082 /* check that the remaining operands are all the same */
7083 for (unsigned i = new_phi->operands.size(); i < instr->operands.size(); i++)
7084 assert(instr->operands[i].tempId() == instr->operands.back().tempId());
7085 instr.swap(new_phi);
7086 } else if (instr->opcode == aco_opcode::p_phi) {
7087 continue;
7088 } else {
7089 break;
7090 }
7091 }
7092 #endif
7093 }
7094
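/* The following three helpers bracket divergent control flow. A minimal
 * usage sketch (this mirrors how visit_if() and emit_streamout() below use
 * them):
 *
 *    if_context ic;
 *    begin_divergent_if_then(ctx, &ic, cond);
 *    ... emit the then-side instructions ...
 *    begin_divergent_if_else(ctx, &ic);
 *    ... emit the else-side instructions ...
 *    end_divergent_if(ctx, &ic);
 */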
7095 static void begin_divergent_if_then(isel_context *ctx, if_context *ic, Temp cond)
7096 {
7097 ic->cond = cond;
7098
7099 append_logical_end(ctx->block);
7100 ctx->block->kind |= block_kind_branch;
7101
7102 /* branch to linear then block */
7103 assert(cond.regClass() == s2);
7104 aco_ptr<Pseudo_branch_instruction> branch;
7105 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_cbranch_z, Format::PSEUDO_BRANCH, 1, 0));
7106 branch->operands[0] = Operand(cond);
7107 ctx->block->instructions.push_back(std::move(branch));
7108
7109 ic->BB_if_idx = ctx->block->index;
7110 ic->BB_invert = Block();
7111 ic->BB_invert.loop_nest_depth = ctx->cf_info.loop_nest_depth;
7112 /* Invert blocks are intentionally not marked as top level because they
7113 * are not part of the logical cfg. */
7114 ic->BB_invert.kind |= block_kind_invert;
7115 ic->BB_endif = Block();
7116 ic->BB_endif.loop_nest_depth = ctx->cf_info.loop_nest_depth;
7117 ic->BB_endif.kind |= (block_kind_merge | (ctx->block->kind & block_kind_top_level));
7118
7119 ic->exec_potentially_empty_old = ctx->cf_info.exec_potentially_empty;
7120 ic->divergent_old = ctx->cf_info.parent_if.is_divergent;
7121 ctx->cf_info.parent_if.is_divergent = true;
7122 ctx->cf_info.exec_potentially_empty = false; /* divergent branches use cbranch_execz */
7123
7124 /** emit logical then block */
7125 Block* BB_then_logical = ctx->program->create_and_insert_block();
7126 BB_then_logical->loop_nest_depth = ctx->cf_info.loop_nest_depth;
7127 add_edge(ic->BB_if_idx, BB_then_logical);
7128 ctx->block = BB_then_logical;
7129 append_logical_start(BB_then_logical);
7130 }
7131
7132 static void begin_divergent_if_else(isel_context *ctx, if_context *ic)
7133 {
7134 Block *BB_then_logical = ctx->block;
7135 append_logical_end(BB_then_logical);
7136 /* branch from logical then block to invert block */
7137 aco_ptr<Pseudo_branch_instruction> branch;
7138 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
7139 BB_then_logical->instructions.emplace_back(std::move(branch));
7140 add_linear_edge(BB_then_logical->index, &ic->BB_invert);
7141 if (!ctx->cf_info.parent_loop.has_divergent_branch)
7142 add_logical_edge(BB_then_logical->index, &ic->BB_endif);
7143 BB_then_logical->kind |= block_kind_uniform;
7144 assert(!ctx->cf_info.has_branch);
7145 ic->then_branch_divergent = ctx->cf_info.parent_loop.has_divergent_branch;
7146 ctx->cf_info.parent_loop.has_divergent_branch = false;
7147
7148 /** emit linear then block */
7149 Block* BB_then_linear = ctx->program->create_and_insert_block();
7150 BB_then_linear->loop_nest_depth = ctx->cf_info.loop_nest_depth;
7151 BB_then_linear->kind |= block_kind_uniform;
7152 add_linear_edge(ic->BB_if_idx, BB_then_linear);
7153 /* branch from linear then block to invert block */
7154 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
7155 BB_then_linear->instructions.emplace_back(std::move(branch));
7156 add_linear_edge(BB_then_linear->index, &ic->BB_invert);
7157
7158 /** emit invert merge block */
7159 ctx->block = ctx->program->insert_block(std::move(ic->BB_invert));
7160 ic->invert_idx = ctx->block->index;
7161
7162 /* branch to linear else block (skip else) */
7163 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_cbranch_nz, Format::PSEUDO_BRANCH, 1, 0));
7164 branch->operands[0] = Operand(ic->cond);
7165 ctx->block->instructions.push_back(std::move(branch));
7166
7167 ic->exec_potentially_empty_old |= ctx->cf_info.exec_potentially_empty;
7168 ctx->cf_info.exec_potentially_empty = false; /* divergent branches use cbranch_execz */
7169
7170 /** emit logical else block */
7171 Block* BB_else_logical = ctx->program->create_and_insert_block();
7172 BB_else_logical->loop_nest_depth = ctx->cf_info.loop_nest_depth;
7173 add_logical_edge(ic->BB_if_idx, BB_else_logical);
7174 add_linear_edge(ic->invert_idx, BB_else_logical);
7175 ctx->block = BB_else_logical;
7176 append_logical_start(BB_else_logical);
7177 }
7178
7179 static void end_divergent_if(isel_context *ctx, if_context *ic)
7180 {
7181 Block *BB_else_logical = ctx->block;
7182 append_logical_end(BB_else_logical);
7183
7184 /* branch from logical else block to endif block */
7185 aco_ptr<Pseudo_branch_instruction> branch;
7186 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
7187 BB_else_logical->instructions.emplace_back(std::move(branch));
7188 add_linear_edge(BB_else_logical->index, &ic->BB_endif);
7189 if (!ctx->cf_info.parent_loop.has_divergent_branch)
7190 add_logical_edge(BB_else_logical->index, &ic->BB_endif);
7191 BB_else_logical->kind |= block_kind_uniform;
7192
7193 assert(!ctx->cf_info.has_branch);
7194 ctx->cf_info.parent_loop.has_divergent_branch &= ic->then_branch_divergent;
7195
7196
7197 /** emit linear else block */
7198 Block* BB_else_linear = ctx->program->create_and_insert_block();
7199 BB_else_linear->loop_nest_depth = ctx->cf_info.loop_nest_depth;
7200 BB_else_linear->kind |= block_kind_uniform;
7201 add_linear_edge(ic->invert_idx, BB_else_linear);
7202
7203 /* branch from linear else block to endif block */
7204 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
7205 BB_else_linear->instructions.emplace_back(std::move(branch));
7206 add_linear_edge(BB_else_linear->index, &ic->BB_endif);
7207
7208
7209 /** emit endif merge block */
7210 ctx->block = ctx->program->insert_block(std::move(ic->BB_endif));
7211 append_logical_start(ctx->block);
7212
7213
7214 ctx->cf_info.parent_if.is_divergent = ic->divergent_old;
7215 ctx->cf_info.exec_potentially_empty |= ic->exec_potentially_empty_old;
7216 /* uniform control flow never has an empty exec-mask */
7217 if (!ctx->cf_info.loop_nest_depth && !ctx->cf_info.parent_if.is_divergent)
7218 ctx->cf_info.exec_potentially_empty = false;
7219 }
7220
7221 static void visit_if(isel_context *ctx, nir_if *if_stmt)
7222 {
7223 Temp cond = get_ssa_temp(ctx, if_stmt->condition.ssa);
7224 Builder bld(ctx->program, ctx->block);
7225 aco_ptr<Pseudo_branch_instruction> branch;
7226
7227 if (!ctx->divergent_vals[if_stmt->condition.ssa->index]) { /* uniform condition */
7228 /**
7229 * Uniform conditionals are represented in the following way*) :
7230 *
7231 * The linear and logical CFG:
7232 * BB_IF
7233 * / \
7234 * BB_THEN (logical) BB_ELSE (logical)
7235 * \ /
7236 * BB_ENDIF
7237 *
7238 * *) Exceptions may be due to break and continue statements within loops
7239 * If a break/continue happens within uniform control flow, it branches
7240 * to the loop exit/entry block. Otherwise, it branches to the next
7241 * merge block.
7242 **/
7243 append_logical_end(ctx->block);
7244 ctx->block->kind |= block_kind_uniform;
7245
7246 /* emit branch */
7247 if (cond.regClass() == s2) {
7248 // TODO: in a post-RA optimizer, we could check if the condition is in VCC and omit this instruction
7249 cond = as_uniform_bool(ctx, cond);
7250 }
7251 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_cbranch_z, Format::PSEUDO_BRANCH, 1, 0));
7252 branch->operands[0] = Operand(cond);
7253 branch->operands[0].setFixed(scc);
7254 ctx->block->instructions.emplace_back(std::move(branch));
7255
7256 unsigned BB_if_idx = ctx->block->index;
7257 Block BB_endif = Block();
7258 BB_endif.loop_nest_depth = ctx->cf_info.loop_nest_depth;
7259 BB_endif.kind |= ctx->block->kind & block_kind_top_level;
7260
7261 /** emit then block */
7262 Block* BB_then = ctx->program->create_and_insert_block();
7263 BB_then->loop_nest_depth = ctx->cf_info.loop_nest_depth;
7264 add_edge(BB_if_idx, BB_then);
7265 append_logical_start(BB_then);
7266 ctx->block = BB_then;
7267 visit_cf_list(ctx, &if_stmt->then_list);
7268 BB_then = ctx->block;
7269 bool then_branch = ctx->cf_info.has_branch;
7270 bool then_branch_divergent = ctx->cf_info.parent_loop.has_divergent_branch;
7271
7272 if (!then_branch) {
7273 append_logical_end(BB_then);
7274 /* branch from then block to endif block */
7275 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
7276 BB_then->instructions.emplace_back(std::move(branch));
7277 add_linear_edge(BB_then->index, &BB_endif);
7278 if (!then_branch_divergent)
7279 add_logical_edge(BB_then->index, &BB_endif);
7280 BB_then->kind |= block_kind_uniform;
7281 }
7282
7283 ctx->cf_info.has_branch = false;
7284 ctx->cf_info.parent_loop.has_divergent_branch = false;
7285
7286 /** emit else block */
7287 Block* BB_else = ctx->program->create_and_insert_block();
7288 BB_else->loop_nest_depth = ctx->cf_info.loop_nest_depth;
7289 add_edge(BB_if_idx, BB_else);
7290 append_logical_start(BB_else);
7291 ctx->block = BB_else;
7292 visit_cf_list(ctx, &if_stmt->else_list);
7293 BB_else = ctx->block;
7294
7295 if (!ctx->cf_info.has_branch) {
7296 append_logical_end(BB_else);
7297 /* branch from else block to endif block */
7298 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
7299 BB_else->instructions.emplace_back(std::move(branch));
7300 add_linear_edge(BB_else->index, &BB_endif);
7301 if (!ctx->cf_info.parent_loop.has_divergent_branch)
7302 add_logical_edge(BB_else->index, &BB_endif);
7303 BB_else->kind |= block_kind_uniform;
7304 }
7305
7306 ctx->cf_info.has_branch &= then_branch;
7307 ctx->cf_info.parent_loop.has_divergent_branch &= then_branch_divergent;
7308
7309 /** emit endif merge block */
7310 if (!ctx->cf_info.has_branch) {
7311 ctx->block = ctx->program->insert_block(std::move(BB_endif));
7312 append_logical_start(ctx->block);
7313 }
7314 } else { /* non-uniform condition */
7315 /**
7316 * To maintain a logical and linear CFG without critical edges,
7317 * non-uniform conditionals are represented in the following way*) :
7318 *
7319 * The linear CFG:
7320 * BB_IF
7321 * / \
7322 * BB_THEN (logical) BB_THEN (linear)
7323 * \ /
7324 * BB_INVERT (linear)
7325 * / \
7326 * BB_ELSE (logical) BB_ELSE (linear)
7327 * \ /
7328 * BB_ENDIF
7329 *
7330 * The logical CFG:
7331 * BB_IF
7332 * / \
7333 * BB_THEN (logical) BB_ELSE (logical)
7334 * \ /
7335 * BB_ENDIF
7336 *
7337 * *) Exceptions may be due to break and continue statements within loops
7338 **/
7339
7340 if_context ic;
7341
7342 begin_divergent_if_then(ctx, &ic, cond);
7343 visit_cf_list(ctx, &if_stmt->then_list);
7344
7345 begin_divergent_if_else(ctx, &ic);
7346 visit_cf_list(ctx, &if_stmt->else_list);
7347
7348 end_divergent_if(ctx, &ic);
7349 }
7350 }
7351
7352 static void visit_cf_list(isel_context *ctx,
7353 struct exec_list *list)
7354 {
7355 foreach_list_typed(nir_cf_node, node, node, list) {
7356 switch (node->type) {
7357 case nir_cf_node_block:
7358 visit_block(ctx, nir_cf_node_as_block(node));
7359 break;
7360 case nir_cf_node_if:
7361 visit_if(ctx, nir_cf_node_as_if(node));
7362 break;
7363 case nir_cf_node_loop:
7364 visit_loop(ctx, nir_cf_node_as_loop(node));
7365 break;
7366 default:
7367 unreachable("unimplemented cf list type");
7368 }
7369 }
7370 }
7371
7372 static void export_vs_varying(isel_context *ctx, int slot, bool is_pos, int *next_pos)
7373 {
7374 int offset = ctx->program->info->vs.outinfo.vs_output_param_offset[slot];
7375 uint64_t mask = ctx->vs_output.mask[slot];
7376 if (!is_pos && !mask)
7377 return;
7378 if (!is_pos && offset == AC_EXP_PARAM_UNDEFINED)
7379 return;
7380 aco_ptr<Export_instruction> exp{create_instruction<Export_instruction>(aco_opcode::exp, Format::EXP, 4, 0)};
7381 exp->enabled_mask = mask;
7382 for (unsigned i = 0; i < 4; ++i) {
7383 if (mask & (1 << i))
7384 exp->operands[i] = Operand(ctx->vs_output.outputs[slot][i]);
7385 else
7386 exp->operands[i] = Operand(v1);
7387 }
7388 exp->valid_mask = false;
7389 exp->done = false;
7390 exp->compressed = false;
7391 if (is_pos)
7392 exp->dest = V_008DFC_SQ_EXP_POS + (*next_pos)++;
7393 else
7394 exp->dest = V_008DFC_SQ_EXP_PARAM + offset;
7395 ctx->block->instructions.emplace_back(std::move(exp));
7396 }
7397
7398 static void export_vs_psiz_layer_viewport(isel_context *ctx, int *next_pos)
7399 {
7400 aco_ptr<Export_instruction> exp{create_instruction<Export_instruction>(aco_opcode::exp, Format::EXP, 4, 0)};
7401 exp->enabled_mask = 0;
7402 for (unsigned i = 0; i < 4; ++i)
7403 exp->operands[i] = Operand(v1);
7404 if (ctx->vs_output.mask[VARYING_SLOT_PSIZ]) {
7405 exp->operands[0] = Operand(ctx->vs_output.outputs[VARYING_SLOT_PSIZ][0]);
7406 exp->enabled_mask |= 0x1;
7407 }
7408 if (ctx->vs_output.mask[VARYING_SLOT_LAYER]) {
7409 exp->operands[2] = Operand(ctx->vs_output.outputs[VARYING_SLOT_LAYER][0]);
7410 exp->enabled_mask |= 0x4;
7411 }
7412 if (ctx->vs_output.mask[VARYING_SLOT_VIEWPORT]) {
7413 if (ctx->options->chip_class < GFX9) {
7414 exp->operands[3] = Operand(ctx->vs_output.outputs[VARYING_SLOT_VIEWPORT][0]);
7415 exp->enabled_mask |= 0x8;
7416 } else {
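/* GFX9+ expects the viewport index packed into bits [31:16] of the layer
 * export channel instead of a separate channel. */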
7417 Builder bld(ctx->program, ctx->block);
7418
7419 Temp out = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(16u),
7420 Operand(ctx->vs_output.outputs[VARYING_SLOT_VIEWPORT][0]));
7421 if (exp->operands[2].isTemp())
7422 out = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand(out), exp->operands[2]);
7423
7424 exp->operands[2] = Operand(out);
7425 exp->enabled_mask |= 0x4;
7426 }
7427 }
7428 exp->valid_mask = false;
7429 exp->done = false;
7430 exp->compressed = false;
7431 exp->dest = V_008DFC_SQ_EXP_POS + (*next_pos)++;
7432 ctx->block->instructions.emplace_back(std::move(exp));
7433 }
7434
7435 static void create_vs_exports(isel_context *ctx)
7436 {
7437 radv_vs_output_info *outinfo = &ctx->program->info->vs.outinfo;
7438
7439 if (outinfo->export_prim_id) {
7440 ctx->vs_output.mask[VARYING_SLOT_PRIMITIVE_ID] |= 0x1;
7441 ctx->vs_output.outputs[VARYING_SLOT_PRIMITIVE_ID][0] = ctx->vs_prim_id;
7442 }
7443
7444 if (ctx->options->key.has_multiview_view_index) {
7445 ctx->vs_output.mask[VARYING_SLOT_LAYER] |= 0x1;
7446 ctx->vs_output.outputs[VARYING_SLOT_LAYER][0] = as_vgpr(ctx, ctx->view_index);
7447 }
7448
7449 /* the order in which these position exports are created is important */
7450 int next_pos = 0;
7451 export_vs_varying(ctx, VARYING_SLOT_POS, true, &next_pos);
7452 if (outinfo->writes_pointsize || outinfo->writes_layer || outinfo->writes_viewport_index) {
7453 export_vs_psiz_layer_viewport(ctx, &next_pos);
7454 }
7455 if (ctx->num_clip_distances + ctx->num_cull_distances > 0)
7456 export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST0, true, &next_pos);
7457 if (ctx->num_clip_distances + ctx->num_cull_distances > 4)
7458 export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST1, true, &next_pos);
7459
7460 if (ctx->options->key.vs_common_out.export_clip_dists) {
7461 if (ctx->num_clip_distances + ctx->num_cull_distances > 0)
7462 export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST0, false, &next_pos);
7463 if (ctx->num_clip_distances + ctx->num_cull_distances > 4)
7464 export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST1, false, &next_pos);
7465 }
7466
7467 for (unsigned i = 0; i <= VARYING_SLOT_VAR31; ++i) {
7468 if (i < VARYING_SLOT_VAR0 && i != VARYING_SLOT_LAYER &&
7469 i != VARYING_SLOT_PRIMITIVE_ID)
7470 continue;
7471
7472 export_vs_varying(ctx, i, false, NULL);
7473 }
7474 }
7475
7476 static void emit_stream_output(isel_context *ctx,
7477 Temp const *so_buffers,
7478 Temp const *so_write_offset,
7479 const struct radv_stream_output *output)
7480 {
7481 unsigned num_comps = util_bitcount(output->component_mask);
7482 unsigned loc = output->location;
7483 unsigned buf = output->buffer;
7484 unsigned offset = output->offset;
7485
7486 assert(num_comps && num_comps <= 4);
7487 if (!num_comps || num_comps > 4)
7488 return;
7489
7490 unsigned start = ffs(output->component_mask) - 1;
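/* e.g. component_mask = 0b1110 -> start = 1, num_comps = 3 (y, z and w are written) */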
7491
7492 Temp out[4];
7493 bool all_undef = true;
7494 assert(ctx->stage == vertex_vs);
7495 for (unsigned i = 0; i < num_comps; i++) {
7496 out[i] = ctx->vs_output.outputs[loc][start + i];
7497 all_undef = all_undef && !out[i].id();
7498 }
7499 if (all_undef)
7500 return;
7501
7502 Temp write_data = {ctx->program->allocateId(), RegClass(RegType::vgpr, num_comps)};
7503 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, num_comps, 1)};
7504 for (unsigned i = 0; i < num_comps; ++i)
7505 vec->operands[i] = (ctx->vs_output.mask[loc] & 1 << i) ? Operand(out[i]) : Operand(0u);
7506 vec->definitions[0] = Definition(write_data);
7507 ctx->block->instructions.emplace_back(std::move(vec));
7508
7509 aco_opcode opcode;
7510 switch (num_comps) {
7511 case 1:
7512 opcode = aco_opcode::buffer_store_dword;
7513 break;
7514 case 2:
7515 opcode = aco_opcode::buffer_store_dwordx2;
7516 break;
7517 case 3:
7518 opcode = aco_opcode::buffer_store_dwordx3;
7519 break;
7520 case 4:
7521 opcode = aco_opcode::buffer_store_dwordx4;
7522 break;
7523 }
7524
7525 aco_ptr<MUBUF_instruction> store{create_instruction<MUBUF_instruction>(opcode, Format::MUBUF, 4, 0)};
7526 store->operands[0] = Operand(so_write_offset[buf]);
7527 store->operands[1] = Operand(so_buffers[buf]);
7528 store->operands[2] = Operand((uint32_t) 0);
7529 store->operands[3] = Operand(write_data);
7530 if (offset > 4095) {
7531 /* The 12-bit MUBUF offset field can't hold this. It shouldn't happen with RADV, but it might with GL; handling it is easy anyway. */
7532 Builder bld(ctx->program, ctx->block);
7533 store->operands[0] = bld.vadd32(bld.def(v1), Operand(offset), Operand(so_write_offset[buf]));
7534 } else {
7535 store->offset = offset;
7536 }
7537 store->offen = true;
7538 store->glc = true;
7539 store->dlc = false;
7540 store->slc = true;
7541 store->can_reorder = true;
7542 ctx->block->instructions.emplace_back(std::move(store));
7543 }
7544
7545 static void emit_streamout(isel_context *ctx, unsigned stream)
7546 {
7547 Builder bld(ctx->program, ctx->block);
7548
7549 Temp so_buffers[4];
7550 Temp buf_ptr = convert_pointer_to_64_bit(ctx, ctx->streamout_buffers);
7551 for (unsigned i = 0; i < 4; i++) {
7552 unsigned stride = ctx->program->info->so.strides[i];
7553 if (!stride)
7554 continue;
7555
7556 so_buffers[i] = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), buf_ptr, Operand(i * 16u));
7557 }
7558
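/* s_bfe_u32: src1 = (width << 16) | offset, so 0x70010 extracts the 7 bits
 * starting at bit 16 of the streamout config, i.e. the stream-out vertex
 * count. */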
7559 Temp so_vtx_count = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
7560 ctx->streamout_config, Operand(0x70010u));
7561
7562 Temp tid = bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32, bld.def(v1), Operand((uint32_t) -1),
7563 bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, bld.def(v1), Operand((uint32_t) -1), Operand(0u)));
7564
7565 Temp can_emit = bld.vopc(aco_opcode::v_cmp_gt_i32, bld.def(s2), so_vtx_count, tid);
7566
7567 if_context ic;
7568 begin_divergent_if_then(ctx, &ic, can_emit);
7569
7570 bld.reset(ctx->block);
7571
7572 Temp so_write_index = bld.vadd32(bld.def(v1), ctx->streamout_write_idx, tid);
7573
7574 Temp so_write_offset[4];
7575
7576 for (unsigned i = 0; i < 4; i++) {
7577 unsigned stride = ctx->program->info->so.strides[i];
7578 if (!stride)
7579 continue;
7580
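/* Strides and ctx->streamout_offset are counted in dwords; the byte offset
 * computed below is (so_write_index * stride + streamout_offset[i]) * 4. */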
7581 if (stride == 1) {
7582 Temp offset = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
7583 ctx->streamout_write_idx, ctx->streamout_offset[i]);
7584 Temp new_offset = bld.vadd32(bld.def(v1), offset, tid);
7585
7586 so_write_offset[i] = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), new_offset);
7587 } else {
7588 Temp offset = bld.v_mul_imm(bld.def(v1), so_write_index, stride * 4u);
7589 Temp offset2 = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(4u), ctx->streamout_offset[i]);
7590 so_write_offset[i] = bld.vadd32(bld.def(v1), offset, offset2);
7591 }
7592 }
7593
7594 for (unsigned i = 0; i < ctx->program->info->so.num_outputs; i++) {
7595 struct radv_stream_output *output =
7596 &ctx->program->info->so.outputs[i];
7597 if (stream != output->stream)
7598 continue;
7599
7600 emit_stream_output(ctx, so_buffers, so_write_offset, output);
7601 }
7602
7603 begin_divergent_if_else(ctx, &ic);
7604 end_divergent_if(ctx, &ic);
7605 }
7606
7607 } /* end namespace */
7608
7609 void handle_bc_optimize(isel_context *ctx)
7610 {
7611 /* needed when SPI_PS_IN_CONTROL.BC_OPTIMIZE_DISABLE is set to 0 */
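/* When both center and centroid interpolation are requested and BC_OPTIMIZE
 * is enabled, the hardware sets the high bit of PRIM_MASK (presumably when
 * centroid and center coincide, e.g. fully covered pixels) and then doesn't
 * provide valid centroid i/j; in that case the center values are copied into
 * the centroid inputs below. */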
7612 Builder bld(ctx->program, ctx->block);
7613 uint32_t spi_ps_input_ena = ctx->program->config->spi_ps_input_ena;
7614 bool uses_center = G_0286CC_PERSP_CENTER_ENA(spi_ps_input_ena) || G_0286CC_LINEAR_CENTER_ENA(spi_ps_input_ena);
7615 bool uses_centroid = G_0286CC_PERSP_CENTROID_ENA(spi_ps_input_ena) || G_0286CC_LINEAR_CENTROID_ENA(spi_ps_input_ena);
7616 if (uses_center && uses_centroid) {
7617 Temp sel = bld.vopc_e64(aco_opcode::v_cmp_lt_i32, bld.hint_vcc(bld.def(s2)), ctx->prim_mask, Operand(0u));
7618
7619 if (G_0286CC_PERSP_CENTROID_ENA(spi_ps_input_ena)) {
7620 for (unsigned i = 0; i < 2; i++) {
7621 Temp new_coord = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
7622 ctx->fs_inputs[fs_input::persp_centroid_p1 + i],
7623 ctx->fs_inputs[fs_input::persp_center_p1 + i],
7624 sel);
7625 ctx->fs_inputs[fs_input::persp_centroid_p1 + i] = new_coord;
7626 }
7627 }
7628
7629 if (G_0286CC_LINEAR_CENTROID_ENA(spi_ps_input_ena)) {
7630 for (unsigned i = 0; i < 2; i++) {
7631 Temp new_coord = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
7632 ctx->fs_inputs[fs_input::linear_centroid_p1 + i],
7633 ctx->fs_inputs[fs_input::linear_center_p1 + i],
7634 sel);
7635 ctx->fs_inputs[fs_input::linear_centroid_p1 + i] = new_coord;
7636 }
7637 }
7638 }
7639 }
7640
7641 void select_program(Program *program,
7642 unsigned shader_count,
7643 struct nir_shader *const *shaders,
7644 ac_shader_config* config,
7645 struct radv_shader_info *info,
7646 struct radv_nir_compiler_options *options)
7647 {
7648 isel_context ctx = setup_isel_context(program, shader_count, shaders, config, info, options);
7649
7650 for (unsigned i = 0; i < shader_count; i++) {
7651 nir_shader *nir = shaders[i];
7652 init_context(&ctx, nir);
7653
7654 if (!i) {
7655 add_startpgm(&ctx); /* needs to be after init_context() for FS */
7656 append_logical_start(ctx.block);
7657 }
7658
7659 if_context ic;
7660 if (shader_count >= 2) {
7661 Builder bld(ctx.program, ctx.block);
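/* merged_wave_info packs an 8-bit thread count per merged stage at bit
 * position i * 8 (s_bfe src1 = (width << 16) | offset); only lanes with
 * thread_id < count execute this stage's code. */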
7662 Temp count = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), ctx.merged_wave_info, Operand((8u << 16) | (i * 8u)));
7663 Temp thread_id = bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32, bld.def(v1), Operand((uint32_t) -1),
7664 bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, bld.def(v1), Operand((uint32_t) -1), Operand(0u)));
7665 Temp cond = bld.vopc(aco_opcode::v_cmp_gt_u32, bld.hint_vcc(bld.def(s2)), count, thread_id);
7666
7667 begin_divergent_if_then(&ctx, &ic, cond);
7668 }
7669
7670 if (i) {
7671 Builder bld(ctx.program, ctx.block);
7672 bld.barrier(aco_opcode::p_memory_barrier_shared); //TODO: different barriers are needed for different stages
7673 bld.sopp(aco_opcode::s_barrier);
7674 }
7675
7676 if (ctx.stage == fragment_fs)
7677 handle_bc_optimize(&ctx);
7678
7679 nir_function_impl *func = nir_shader_get_entrypoint(nir);
7680 visit_cf_list(&ctx, &func->body);
7681
7682 if (ctx.program->info->so.num_outputs /* && !ctx->is_gs_copy_shader */)
7683 emit_streamout(&ctx, 0);
7684
7685 if (ctx.stage == vertex_vs)
7686 create_vs_exports(&ctx);
7687
7688 if (shader_count >= 2) {
7689 begin_divergent_if_else(&ctx, &ic);
7690 end_divergent_if(&ctx, &ic);
7691 }
7692
7693 ralloc_free(ctx.divergent_vals);
7694 }
7695
7696 append_logical_end(ctx.block);
7697 ctx.block->kind |= block_kind_uniform;
7698 Builder bld(ctx.program, ctx.block);
7699 if (ctx.program->wb_smem_l1_on_end)
7700 bld.smem(aco_opcode::s_dcache_wb, false);
7701 bld.sopp(aco_opcode::s_endpgm);
7702
7703 /* finish the CFG: fill in the successor lists from the predecessor lists collected during selection */
7704 for (Block& BB : program->blocks) {
7705 for (unsigned idx : BB.linear_preds)
7706 program->blocks[idx].linear_succs.emplace_back(BB.index);
7707 for (unsigned idx : BB.logical_preds)
7708 program->blocks[idx].logical_succs.emplace_back(BB.index);
7709 }
7710 }
7711 }