1 /*
2 * Copyright © 2018 Valve Corporation
3 * Copyright © 2018 Google
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22 * IN THE SOFTWARE.
23 *
24 */
25
26 #include <algorithm>
27 #include <array>
28 #include <stack>
29 #include <map>
30
31 #include "ac_shader_util.h"
32 #include "aco_ir.h"
33 #include "aco_builder.h"
34 #include "aco_interface.h"
35 #include "aco_instruction_selection_setup.cpp"
36 #include "util/fast_idiv_by_const.h"
37
38 namespace aco {
39 namespace {
40
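         /* RAII helper that saves the loop-related control-flow state (parent loop
          * header/exit, divergence flags, nesting depth) on entry and restores it on
          * exit, so nested loops each see their own state while being translated. */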
41 class loop_info_RAII {
42 isel_context* ctx;
43 unsigned header_idx_old;
44 Block* exit_old;
45 bool divergent_cont_old;
46 bool divergent_branch_old;
47 bool divergent_if_old;
48
49 public:
50 loop_info_RAII(isel_context* ctx, unsigned loop_header_idx, Block* loop_exit)
51 : ctx(ctx),
52 header_idx_old(ctx->cf_info.parent_loop.header_idx), exit_old(ctx->cf_info.parent_loop.exit),
53 divergent_cont_old(ctx->cf_info.parent_loop.has_divergent_continue),
54 divergent_branch_old(ctx->cf_info.parent_loop.has_divergent_branch),
55 divergent_if_old(ctx->cf_info.parent_if.is_divergent)
56 {
57 ctx->cf_info.parent_loop.header_idx = loop_header_idx;
58 ctx->cf_info.parent_loop.exit = loop_exit;
59 ctx->cf_info.parent_loop.has_divergent_continue = false;
60 ctx->cf_info.parent_loop.has_divergent_branch = false;
61 ctx->cf_info.parent_if.is_divergent = false;
62 ctx->cf_info.loop_nest_depth = ctx->cf_info.loop_nest_depth + 1;
63 }
64
65 ~loop_info_RAII()
66 {
67 ctx->cf_info.parent_loop.header_idx = header_idx_old;
68 ctx->cf_info.parent_loop.exit = exit_old;
69 ctx->cf_info.parent_loop.has_divergent_continue = divergent_cont_old;
70 ctx->cf_info.parent_loop.has_divergent_branch = divergent_branch_old;
71 ctx->cf_info.parent_if.is_divergent = divergent_if_old;
72 ctx->cf_info.loop_nest_depth = ctx->cf_info.loop_nest_depth - 1;
73 if (!ctx->cf_info.loop_nest_depth && !ctx->cf_info.parent_if.is_divergent)
74 ctx->cf_info.exec_potentially_empty_discard = false;
75 }
76 };
77
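         /* State carried between begin/end of an if-construct: the condition, the saved
          * divergence and exec-potentially-empty flags, and the block indices needed to
          * stitch the then/invert/else/endif blocks together. */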
78 struct if_context {
79 Temp cond;
80
81 bool divergent_old;
82 bool exec_potentially_empty_discard_old;
83 bool exec_potentially_empty_break_old;
84 uint16_t exec_potentially_empty_break_depth_old;
85
86 unsigned BB_if_idx;
87 unsigned invert_idx;
88 bool uniform_has_then_branch;
89 bool then_branch_divergent;
90 Block BB_invert;
91 Block BB_endif;
92 };
93
94 static bool visit_cf_list(struct isel_context *ctx,
95 struct exec_list *list);
96
97 static void add_logical_edge(unsigned pred_idx, Block *succ)
98 {
99 succ->logical_preds.emplace_back(pred_idx);
100 }
101
102
103 static void add_linear_edge(unsigned pred_idx, Block *succ)
104 {
105 succ->linear_preds.emplace_back(pred_idx);
106 }
107
108 static void add_edge(unsigned pred_idx, Block *succ)
109 {
110 add_logical_edge(pred_idx, succ);
111 add_linear_edge(pred_idx, succ);
112 }
113
114 static void append_logical_start(Block *b)
115 {
116 Builder(NULL, b).pseudo(aco_opcode::p_logical_start);
117 }
118
119 static void append_logical_end(Block *b)
120 {
121 Builder(NULL, b).pseudo(aco_opcode::p_logical_end);
122 }
123
124 Temp get_ssa_temp(struct isel_context *ctx, nir_ssa_def *def)
125 {
126 assert(ctx->allocated[def->index].id());
127 return ctx->allocated[def->index];
128 }
129
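         /* Emits v_mbcnt_lo(+hi) to count the set mask bits below the current lane;
          * with the default all-ones masks this yields the lane index within the wave. */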
130 Temp emit_mbcnt(isel_context *ctx, Definition dst,
131 Operand mask_lo = Operand((uint32_t) -1), Operand mask_hi = Operand((uint32_t) -1))
132 {
133 Builder bld(ctx->program, ctx->block);
134 Definition lo_def = ctx->program->wave_size == 32 ? dst : bld.def(v1);
135 Temp thread_id_lo = bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, lo_def, mask_lo, Operand(0u));
136
137 if (ctx->program->wave_size == 32) {
138 return thread_id_lo;
139 } else if (ctx->program->chip_class <= GFX7) {
140 Temp thread_id_hi = bld.vop2(aco_opcode::v_mbcnt_hi_u32_b32, dst, mask_hi, thread_id_lo);
141 return thread_id_hi;
142 } else {
143 Temp thread_id_hi = bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32_e64, dst, mask_hi, thread_id_lo);
144 return thread_id_hi;
145 }
146 }
147
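         /* Copies src into dst through p_wqm when compiling a fragment shader, so the
          * value is computed in whole quad mode; for other stages this is a plain copy.
          * program_needs_wqm additionally flags the whole program as requiring WQM. */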
148 Temp emit_wqm(isel_context *ctx, Temp src, Temp dst=Temp(0, s1), bool program_needs_wqm = false)
149 {
150 Builder bld(ctx->program, ctx->block);
151
152 if (!dst.id())
153 dst = bld.tmp(src.regClass());
154
155 assert(src.size() == dst.size());
156
157 if (ctx->stage != fragment_fs) {
158 if (!dst.id())
159 return src;
160
161 bld.copy(Definition(dst), src);
162 return dst;
163 }
164
165 bld.pseudo(aco_opcode::p_wqm, Definition(dst), src);
166 ctx->program->needs_wqm |= program_needs_wqm;
167 return dst;
168 }
169
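         /* Wave-wide shuffle: for each lane, returns the value of 'data' from the lane
          * selected by 'index'. Uses readlane for uniform indices, ds_bpermute_b32 where
          * the hardware supports it, and p_bpermute pseudo-instructions (lowered later)
          * for GFX6-7 and for GFX10 wave64, where bpermute only works within 32 lanes. */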
170 static Temp emit_bpermute(isel_context *ctx, Builder &bld, Temp index, Temp data)
171 {
172 if (index.regClass() == s1)
173 return bld.readlane(bld.def(s1), data, index);
174
175 if (ctx->options->chip_class <= GFX7) {
176 /* GFX6-7: there is no bpermute instruction */
177 Operand index_op(index);
178 Operand input_data(data);
179 index_op.setLateKill(true);
180 input_data.setLateKill(true);
181
182 return bld.pseudo(aco_opcode::p_bpermute, bld.def(v1), bld.def(bld.lm), bld.def(bld.lm, vcc), index_op, input_data);
183 } else if (ctx->options->chip_class >= GFX10 && ctx->program->wave_size == 64) {
184 /* GFX10 wave64 mode: emulate full-wave bpermute */
185 if (!ctx->has_gfx10_wave64_bpermute) {
186 ctx->has_gfx10_wave64_bpermute = true;
187 ctx->program->config->num_shared_vgprs = 8; /* Shared VGPRs are allocated in groups of 8 */
188 ctx->program->vgpr_limit -= 4; /* We allocate 8 shared VGPRs, so we'll have 4 fewer normal VGPRs */
189 }
190
191 Temp index_is_lo = bld.vopc(aco_opcode::v_cmp_ge_u32, bld.def(bld.lm), Operand(31u), index);
192 Builder::Result index_is_lo_split = bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), bld.def(s1), index_is_lo);
193 Temp index_is_lo_n1 = bld.sop1(aco_opcode::s_not_b32, bld.def(s1), bld.def(s1, scc), index_is_lo_split.def(1).getTemp());
194 Operand same_half = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), index_is_lo_split.def(0).getTemp(), index_is_lo_n1);
195 Operand index_x4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), index);
196 Operand input_data(data);
197
198 index_x4.setLateKill(true);
199 input_data.setLateKill(true);
200 same_half.setLateKill(true);
201
202 return bld.pseudo(aco_opcode::p_bpermute, bld.def(v1), bld.def(s2), bld.def(s1, scc), index_x4, input_data, same_half);
203 } else {
204 /* GFX8-9 or GFX10 wave32: bpermute works normally */
205 Temp index_x4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), index);
206 return bld.ds(aco_opcode::ds_bpermute_b32, bld.def(v1), index_x4, data);
207 }
208 }
209
210 Temp as_vgpr(isel_context *ctx, Temp val)
211 {
212 if (val.type() == RegType::sgpr) {
213 Builder bld(ctx->program, ctx->block);
214 return bld.copy(bld.def(RegType::vgpr, val.size()), val);
215 }
216 assert(val.type() == RegType::vgpr);
217 return val;
218 }
219
  220    // assumes a != 0xffffffff
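         // Division by the constant b via a precomputed reciprocal (util_compute_fast_udiv_info):
         // roughly dst = (((a >> pre_shift) + increment) * multiplier) >> 32 >> post_shift,
         // where the optional increment is why a == 0xffffffff is excluded above.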
221 void emit_v_div_u32(isel_context *ctx, Temp dst, Temp a, uint32_t b)
222 {
223 assert(b != 0);
224 Builder bld(ctx->program, ctx->block);
225
226 if (util_is_power_of_two_or_zero(b)) {
227 bld.vop2(aco_opcode::v_lshrrev_b32, Definition(dst), Operand((uint32_t)util_logbase2(b)), a);
228 return;
229 }
230
231 util_fast_udiv_info info = util_compute_fast_udiv_info(b, 32, 32);
232
233 assert(info.multiplier <= 0xffffffff);
234
235 bool pre_shift = info.pre_shift != 0;
236 bool increment = info.increment != 0;
237 bool multiply = true;
238 bool post_shift = info.post_shift != 0;
239
240 if (!pre_shift && !increment && !multiply && !post_shift) {
241 bld.vop1(aco_opcode::v_mov_b32, Definition(dst), a);
242 return;
243 }
244
245 Temp pre_shift_dst = a;
246 if (pre_shift) {
247 pre_shift_dst = (increment || multiply || post_shift) ? bld.tmp(v1) : dst;
248 bld.vop2(aco_opcode::v_lshrrev_b32, Definition(pre_shift_dst), Operand((uint32_t)info.pre_shift), a);
249 }
250
251 Temp increment_dst = pre_shift_dst;
252 if (increment) {
253 increment_dst = (post_shift || multiply) ? bld.tmp(v1) : dst;
254 bld.vadd32(Definition(increment_dst), Operand((uint32_t) info.increment), pre_shift_dst);
255 }
256
257 Temp multiply_dst = increment_dst;
258 if (multiply) {
259 multiply_dst = post_shift ? bld.tmp(v1) : dst;
260 bld.vop3(aco_opcode::v_mul_hi_u32, Definition(multiply_dst), increment_dst,
261 bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand((uint32_t)info.multiplier)));
262 }
263
264 if (post_shift) {
265 bld.vop2(aco_opcode::v_lshrrev_b32, Definition(dst), Operand((uint32_t)info.post_shift), multiply_dst);
266 }
267 }
268
269 void emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, Temp dst)
270 {
271 Builder bld(ctx->program, ctx->block);
272 bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand(idx));
273 }
274
275
276 Temp emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, RegClass dst_rc)
277 {
278 /* no need to extract the whole vector */
279 if (src.regClass() == dst_rc) {
280 assert(idx == 0);
281 return src;
282 }
283
284 assert(src.bytes() > (idx * dst_rc.bytes()));
285 Builder bld(ctx->program, ctx->block);
286 auto it = ctx->allocated_vec.find(src.id());
287 if (it != ctx->allocated_vec.end() && dst_rc.bytes() == it->second[idx].regClass().bytes()) {
288 if (it->second[idx].regClass() == dst_rc) {
289 return it->second[idx];
290 } else {
291 assert(!dst_rc.is_subdword());
292 assert(dst_rc.type() == RegType::vgpr && it->second[idx].type() == RegType::sgpr);
293 return bld.copy(bld.def(dst_rc), it->second[idx]);
294 }
295 }
296
297 if (dst_rc.is_subdword())
298 src = as_vgpr(ctx, src);
299
300 if (src.bytes() == dst_rc.bytes()) {
301 assert(idx == 0);
302 return bld.copy(bld.def(dst_rc), src);
303 } else {
304 Temp dst = bld.tmp(dst_rc);
305 emit_extract_vector(ctx, src, idx, dst);
306 return dst;
307 }
308 }
309
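         /* Splits vec_src into num_components equally-sized temporaries with
          * p_split_vector and caches them in ctx->allocated_vec, so that later
          * emit_extract_vector() calls on the same vector are free. */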
310 void emit_split_vector(isel_context* ctx, Temp vec_src, unsigned num_components)
311 {
312 if (num_components == 1)
313 return;
314 if (ctx->allocated_vec.find(vec_src.id()) != ctx->allocated_vec.end())
315 return;
316 RegClass rc;
317 if (num_components > vec_src.size()) {
318 if (vec_src.type() == RegType::sgpr) {
319 /* should still help get_alu_src() */
320 emit_split_vector(ctx, vec_src, vec_src.size());
321 return;
322 }
323 /* sub-dword split */
324 rc = RegClass(RegType::vgpr, vec_src.bytes() / num_components).as_subdword();
325 } else {
326 rc = RegClass(vec_src.type(), vec_src.size() / num_components);
327 }
328 aco_ptr<Pseudo_instruction> split{create_instruction<Pseudo_instruction>(aco_opcode::p_split_vector, Format::PSEUDO, 1, num_components)};
329 split->operands[0] = Operand(vec_src);
330 std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
331 for (unsigned i = 0; i < num_components; i++) {
332 elems[i] = {ctx->program->allocateId(), rc};
333 split->definitions[i] = Definition(elems[i]);
334 }
335 ctx->block->instructions.emplace_back(std::move(split));
336 ctx->allocated_vec.emplace(vec_src.id(), elems);
337 }
338
339 /* This vector expansion uses a mask to determine which elements in the new vector
340 * come from the original vector. The other elements are undefined. */
341 void expand_vector(isel_context* ctx, Temp vec_src, Temp dst, unsigned num_components, unsigned mask)
342 {
343 emit_split_vector(ctx, vec_src, util_bitcount(mask));
344
345 if (vec_src == dst)
346 return;
347
348 Builder bld(ctx->program, ctx->block);
349 if (num_components == 1) {
350 if (dst.type() == RegType::sgpr)
351 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec_src);
352 else
353 bld.copy(Definition(dst), vec_src);
354 return;
355 }
356
357 unsigned component_size = dst.size() / num_components;
358 std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
359
360 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
361 vec->definitions[0] = Definition(dst);
362 unsigned k = 0;
363 for (unsigned i = 0; i < num_components; i++) {
364 if (mask & (1 << i)) {
365 Temp src = emit_extract_vector(ctx, vec_src, k++, RegClass(vec_src.type(), component_size));
366 if (dst.type() == RegType::sgpr)
367 src = bld.as_uniform(src);
368 vec->operands[i] = Operand(src);
369 } else {
370 vec->operands[i] = Operand(0u);
371 }
372 elems[i] = vec->operands[i].getTemp();
373 }
374 ctx->block->instructions.emplace_back(std::move(vec));
375 ctx->allocated_vec.emplace(dst.id(), elems);
376 }
377
378 /* adjust misaligned small bit size loads */
379 void byte_align_scalar(isel_context *ctx, Temp vec, Operand offset, Temp dst)
380 {
381 Builder bld(ctx->program, ctx->block);
382 Operand shift;
383 Temp select = Temp();
384 if (offset.isConstant()) {
385 assert(offset.constantValue() && offset.constantValue() < 4);
386 shift = Operand(offset.constantValue() * 8);
387 } else {
388 /* bit_offset = 8 * (offset & 0x3) */
389 Temp tmp = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), offset, Operand(3u));
390 select = bld.tmp(s1);
391 shift = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.scc(Definition(select)), tmp, Operand(3u));
392 }
393
394 if (vec.size() == 1) {
395 bld.sop2(aco_opcode::s_lshr_b32, Definition(dst), bld.def(s1, scc), vec, shift);
396 } else if (vec.size() == 2) {
397 Temp tmp = dst.size() == 2 ? dst : bld.tmp(s2);
398 bld.sop2(aco_opcode::s_lshr_b64, Definition(tmp), bld.def(s1, scc), vec, shift);
399 if (tmp == dst)
400 emit_split_vector(ctx, dst, 2);
401 else
402 emit_extract_vector(ctx, tmp, 0, dst);
403 } else if (vec.size() == 4) {
404 Temp lo = bld.tmp(s2), hi = bld.tmp(s2);
405 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), vec);
406 hi = bld.pseudo(aco_opcode::p_extract_vector, bld.def(s1), hi, Operand(0u));
407 if (select != Temp())
408 hi = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), hi, Operand(0u), bld.scc(select));
409 lo = bld.sop2(aco_opcode::s_lshr_b64, bld.def(s2), bld.def(s1, scc), lo, shift);
410 Temp mid = bld.tmp(s1);
411 lo = bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), Definition(mid), lo);
412 hi = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), hi, shift);
413 mid = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), hi, mid);
414 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, mid);
415 emit_split_vector(ctx, dst, 2);
416 }
417 }
418
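         /* Re-aligns a vector that was loaded at a byte offset: a non-constant offset is
          * handled with v_alignbyte_b32 across the dwords, a constant offset by extracting
          * the needed sub-dword components (falling back to byte_align_scalar for SGPR
          * destinations that still need shifting). */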
419 void byte_align_vector(isel_context *ctx, Temp vec, Operand offset, Temp dst, unsigned component_size)
420 {
421 Builder bld(ctx->program, ctx->block);
422 if (offset.isTemp()) {
423 Temp tmp[4] = {vec, vec, vec, vec};
424
425 if (vec.size() == 4) {
426 tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = bld.tmp(v1), tmp[3] = bld.tmp(v1);
427 bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]), Definition(tmp[2]), Definition(tmp[3]), vec);
428 } else if (vec.size() == 3) {
429 tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = bld.tmp(v1);
430 bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]), Definition(tmp[2]), vec);
431 } else if (vec.size() == 2) {
432 tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = tmp[1];
433 bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]), vec);
434 }
435 for (unsigned i = 0; i < dst.size(); i++)
436 tmp[i] = bld.vop3(aco_opcode::v_alignbyte_b32, bld.def(v1), tmp[i + 1], tmp[i], offset);
437
438 vec = tmp[0];
439 if (dst.size() == 2)
440 vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), tmp[0], tmp[1]);
441
442 offset = Operand(0u);
443 }
444
445 unsigned num_components = dst.bytes() / component_size;
446 if (vec.regClass() == dst.regClass()) {
447 assert(offset.constantValue() == 0);
448 bld.copy(Definition(dst), vec);
449 emit_split_vector(ctx, dst, num_components);
450 return;
451 }
452
453 emit_split_vector(ctx, vec, vec.bytes() / component_size);
454 std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
455 RegClass rc = RegClass(RegType::vgpr, component_size).as_subdword();
456
457 assert(offset.constantValue() % component_size == 0);
458 unsigned skip = offset.constantValue() / component_size;
459 for (unsigned i = 0; i < num_components; i++)
460 elems[i] = emit_extract_vector(ctx, vec, i + skip, rc);
461
  462       /* if dst is vgpr - recombine the extracted components into a new, shrunk vector. */
463 if (dst.type() == RegType::vgpr) {
464 aco_ptr<Pseudo_instruction> create_vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
465 for (unsigned i = 0; i < num_components; i++)
466 create_vec->operands[i] = Operand(elems[i]);
467 create_vec->definitions[0] = Definition(dst);
468 bld.insert(std::move(create_vec));
469
470 /* if dst is sgpr - split the src, but move the original to sgpr. */
471 } else if (skip) {
472 vec = bld.pseudo(aco_opcode::p_as_uniform, bld.def(RegClass(RegType::sgpr, vec.size())), vec);
473 byte_align_scalar(ctx, vec, offset, dst);
474 } else {
475 assert(dst.size() == vec.size());
476 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec);
477 }
478
479 ctx->allocated_vec.emplace(dst.id(), elems);
480 }
481
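         /* Booleans exist in two forms here: uniform booleans as an s1 value read through
          * SCC, and divergent booleans as a lane mask (bld.lm). This converts the SCC form
          * into a lane mask via s_cselect. */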
482 Temp bool_to_vector_condition(isel_context *ctx, Temp val, Temp dst = Temp(0, s2))
483 {
484 Builder bld(ctx->program, ctx->block);
485 if (!dst.id())
486 dst = bld.tmp(bld.lm);
487
488 assert(val.regClass() == s1);
489 assert(dst.regClass() == bld.lm);
490
491 return bld.sop2(Builder::s_cselect, Definition(dst), Operand((uint32_t) -1), Operand(0u), bld.scc(val));
492 }
493
494 Temp bool_to_scalar_condition(isel_context *ctx, Temp val, Temp dst = Temp(0, s1))
495 {
496 Builder bld(ctx->program, ctx->block);
497 if (!dst.id())
498 dst = bld.tmp(s1);
499
500 assert(val.regClass() == bld.lm);
501 assert(dst.regClass() == s1);
502
503 /* if we're currently in WQM mode, ensure that the source is also computed in WQM */
504 Temp tmp = bld.tmp(s1);
505 bld.sop2(Builder::s_and, bld.def(bld.lm), bld.scc(Definition(tmp)), val, Operand(exec, bld.lm));
506 return emit_wqm(ctx, tmp, dst);
507 }
508
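         /* Resolves a NIR ALU source to a Temp, honouring the swizzle: identity swizzles
          * return the SSA temp directly, a single swizzled component is extracted (using
          * s_bfe_u32 for packed sub-dword SGPR values), and anything else is rebuilt with
          * p_create_vector from the extracted components. */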
509 Temp get_alu_src(struct isel_context *ctx, nir_alu_src src, unsigned size=1)
510 {
511 if (src.src.ssa->num_components == 1 && src.swizzle[0] == 0 && size == 1)
512 return get_ssa_temp(ctx, src.src.ssa);
513
514 if (src.src.ssa->num_components == size) {
515 bool identity_swizzle = true;
516 for (unsigned i = 0; identity_swizzle && i < size; i++) {
517 if (src.swizzle[i] != i)
518 identity_swizzle = false;
519 }
520 if (identity_swizzle)
521 return get_ssa_temp(ctx, src.src.ssa);
522 }
523
524 Temp vec = get_ssa_temp(ctx, src.src.ssa);
525 unsigned elem_size = vec.bytes() / src.src.ssa->num_components;
526 assert(elem_size > 0);
527 assert(vec.bytes() % elem_size == 0);
528
529 if (elem_size < 4 && vec.type() == RegType::sgpr) {
530 assert(src.src.ssa->bit_size == 8 || src.src.ssa->bit_size == 16);
531 assert(size == 1);
532 unsigned swizzle = src.swizzle[0];
533 if (vec.size() > 1) {
534 assert(src.src.ssa->bit_size == 16);
535 vec = emit_extract_vector(ctx, vec, swizzle / 2, s1);
536 swizzle = swizzle & 1;
537 }
538 if (swizzle == 0)
539 return vec;
540
541 Temp dst{ctx->program->allocateId(), s1};
542 aco_ptr<SOP2_instruction> bfe{create_instruction<SOP2_instruction>(aco_opcode::s_bfe_u32, Format::SOP2, 2, 2)};
543 bfe->operands[0] = Operand(vec);
544 bfe->operands[1] = Operand(uint32_t((src.src.ssa->bit_size << 16) | (src.src.ssa->bit_size * swizzle)));
545 bfe->definitions[0] = Definition(dst);
546 bfe->definitions[1] = Definition(ctx->program->allocateId(), scc, s1);
547 ctx->block->instructions.emplace_back(std::move(bfe));
548 return dst;
549 }
550
551 RegClass elem_rc = elem_size < 4 ? RegClass(vec.type(), elem_size).as_subdword() : RegClass(vec.type(), elem_size / 4);
552 if (size == 1) {
553 return emit_extract_vector(ctx, vec, src.swizzle[0], elem_rc);
554 } else {
555 assert(size <= 4);
556 std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
557 aco_ptr<Pseudo_instruction> vec_instr{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, size, 1)};
558 for (unsigned i = 0; i < size; ++i) {
559 elems[i] = emit_extract_vector(ctx, vec, src.swizzle[i], elem_rc);
560 vec_instr->operands[i] = Operand{elems[i]};
561 }
562 Temp dst{ctx->program->allocateId(), RegClass(vec.type(), elem_size * size / 4)};
563 vec_instr->definitions[0] = Definition(dst);
564 ctx->block->instructions.emplace_back(std::move(vec_instr));
565 ctx->allocated_vec.emplace(dst.id(), elems);
566 return dst;
567 }
568 }
569
570 Temp convert_pointer_to_64_bit(isel_context *ctx, Temp ptr)
571 {
572 if (ptr.size() == 2)
573 return ptr;
574 Builder bld(ctx->program, ctx->block);
575 if (ptr.type() == RegType::vgpr)
576 ptr = bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), ptr);
577 return bld.pseudo(aco_opcode::p_create_vector, bld.def(s2),
578 ptr, Operand((unsigned)ctx->options->address32_hi));
579 }
580
581 void emit_sop2_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst, bool writes_scc)
582 {
583 aco_ptr<SOP2_instruction> sop2{create_instruction<SOP2_instruction>(op, Format::SOP2, 2, writes_scc ? 2 : 1)};
584 sop2->operands[0] = Operand(get_alu_src(ctx, instr->src[0]));
585 sop2->operands[1] = Operand(get_alu_src(ctx, instr->src[1]));
586 sop2->definitions[0] = Definition(dst);
587 if (writes_scc)
588 sop2->definitions[1] = Definition(ctx->program->allocateId(), scc, s1);
589 ctx->block->instructions.emplace_back(std::move(sop2));
590 }
591
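         /* VOP2 encodings only accept an SGPR or constant in src0, so an SGPR second
          * source is either swapped into src0 (for commutative ops with a VGPR src0)
          * or copied to a VGPR first. */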
592 void emit_vop2_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst,
593 bool commutative, bool swap_srcs=false, bool flush_denorms = false)
594 {
595 Builder bld(ctx->program, ctx->block);
596 bld.is_precise = instr->exact;
597
598 Temp src0 = get_alu_src(ctx, instr->src[swap_srcs ? 1 : 0]);
599 Temp src1 = get_alu_src(ctx, instr->src[swap_srcs ? 0 : 1]);
600 if (src1.type() == RegType::sgpr) {
601 if (commutative && src0.type() == RegType::vgpr) {
602 Temp t = src0;
603 src0 = src1;
604 src1 = t;
605 } else {
606 src1 = as_vgpr(ctx, src1);
607 }
608 }
609
610 if (flush_denorms && ctx->program->chip_class < GFX9) {
611 assert(dst.size() == 1);
612 Temp tmp = bld.vop2(op, bld.def(v1), src0, src1);
613 bld.vop2(aco_opcode::v_mul_f32, Definition(dst), Operand(0x3f800000u), tmp);
614 } else {
615 bld.vop2(op, Definition(dst), src0, src1);
616 }
617 }
618
619 void emit_vop2_instruction_logic64(isel_context *ctx, nir_alu_instr *instr,
620 aco_opcode op, Temp dst)
621 {
622 Builder bld(ctx->program, ctx->block);
623 bld.is_precise = instr->exact;
624
625 Temp src0 = get_alu_src(ctx, instr->src[0]);
626 Temp src1 = get_alu_src(ctx, instr->src[1]);
627
628 if (src1.type() == RegType::sgpr) {
629 assert(src0.type() == RegType::vgpr);
630 std::swap(src0, src1);
631 }
632
633 Temp src00 = bld.tmp(src0.type(), 1);
634 Temp src01 = bld.tmp(src0.type(), 1);
635 bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
636 Temp src10 = bld.tmp(v1);
637 Temp src11 = bld.tmp(v1);
638 bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
639 Temp lo = bld.vop2(op, bld.def(v1), src00, src10);
640 Temp hi = bld.vop2(op, bld.def(v1), src01, src11);
641 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
642 }
643
644 void emit_vop3a_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst,
645 bool flush_denorms = false)
646 {
647 Temp src0 = get_alu_src(ctx, instr->src[0]);
648 Temp src1 = get_alu_src(ctx, instr->src[1]);
649 Temp src2 = get_alu_src(ctx, instr->src[2]);
650
651 /* ensure that the instruction has at most 1 sgpr operand
652 * The optimizer will inline constants for us */
653 if (src0.type() == RegType::sgpr && src1.type() == RegType::sgpr)
654 src0 = as_vgpr(ctx, src0);
655 if (src1.type() == RegType::sgpr && src2.type() == RegType::sgpr)
656 src1 = as_vgpr(ctx, src1);
657 if (src2.type() == RegType::sgpr && src0.type() == RegType::sgpr)
658 src2 = as_vgpr(ctx, src2);
659
660 Builder bld(ctx->program, ctx->block);
661 bld.is_precise = instr->exact;
662 if (flush_denorms && ctx->program->chip_class < GFX9) {
663 assert(dst.size() == 1);
664 Temp tmp = bld.vop3(op, Definition(dst), src0, src1, src2);
665 bld.vop2(aco_opcode::v_mul_f32, Definition(dst), Operand(0x3f800000u), tmp);
666 } else {
667 bld.vop3(op, Definition(dst), src0, src1, src2);
668 }
669 }
670
671 void emit_vop1_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst)
672 {
673 Builder bld(ctx->program, ctx->block);
674 bld.is_precise = instr->exact;
675 if (dst.type() == RegType::sgpr)
676 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
677 bld.vop1(op, bld.def(RegType::vgpr, dst.size()), get_alu_src(ctx, instr->src[0])));
678 else
679 bld.vop1(op, Definition(dst), get_alu_src(ctx, instr->src[0]));
680 }
681
682 void emit_vopc_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst)
683 {
684 Temp src0 = get_alu_src(ctx, instr->src[0]);
685 Temp src1 = get_alu_src(ctx, instr->src[1]);
686 assert(src0.size() == src1.size());
687
688 aco_ptr<Instruction> vopc;
689 if (src1.type() == RegType::sgpr) {
690 if (src0.type() == RegType::vgpr) {
691 /* to swap the operands, we might also have to change the opcode */
692 switch (op) {
693 case aco_opcode::v_cmp_lt_f16:
694 op = aco_opcode::v_cmp_gt_f16;
695 break;
696 case aco_opcode::v_cmp_ge_f16:
697 op = aco_opcode::v_cmp_le_f16;
698 break;
699 case aco_opcode::v_cmp_lt_i16:
700 op = aco_opcode::v_cmp_gt_i16;
701 break;
702 case aco_opcode::v_cmp_ge_i16:
703 op = aco_opcode::v_cmp_le_i16;
704 break;
705 case aco_opcode::v_cmp_lt_u16:
706 op = aco_opcode::v_cmp_gt_u16;
707 break;
708 case aco_opcode::v_cmp_ge_u16:
709 op = aco_opcode::v_cmp_le_u16;
710 break;
711 case aco_opcode::v_cmp_lt_f32:
712 op = aco_opcode::v_cmp_gt_f32;
713 break;
714 case aco_opcode::v_cmp_ge_f32:
715 op = aco_opcode::v_cmp_le_f32;
716 break;
717 case aco_opcode::v_cmp_lt_i32:
718 op = aco_opcode::v_cmp_gt_i32;
719 break;
720 case aco_opcode::v_cmp_ge_i32:
721 op = aco_opcode::v_cmp_le_i32;
722 break;
723 case aco_opcode::v_cmp_lt_u32:
724 op = aco_opcode::v_cmp_gt_u32;
725 break;
726 case aco_opcode::v_cmp_ge_u32:
727 op = aco_opcode::v_cmp_le_u32;
728 break;
729 case aco_opcode::v_cmp_lt_f64:
730 op = aco_opcode::v_cmp_gt_f64;
731 break;
732 case aco_opcode::v_cmp_ge_f64:
733 op = aco_opcode::v_cmp_le_f64;
734 break;
735 case aco_opcode::v_cmp_lt_i64:
736 op = aco_opcode::v_cmp_gt_i64;
737 break;
738 case aco_opcode::v_cmp_ge_i64:
739 op = aco_opcode::v_cmp_le_i64;
740 break;
741 case aco_opcode::v_cmp_lt_u64:
742 op = aco_opcode::v_cmp_gt_u64;
743 break;
744 case aco_opcode::v_cmp_ge_u64:
745 op = aco_opcode::v_cmp_le_u64;
746 break;
747 default: /* eq and ne are commutative */
748 break;
749 }
750 Temp t = src0;
751 src0 = src1;
752 src1 = t;
753 } else {
754 src1 = as_vgpr(ctx, src1);
755 }
756 }
757
758 Builder bld(ctx->program, ctx->block);
759 bld.vopc(op, bld.hint_vcc(Definition(dst)), src0, src1);
760 }
761
762 void emit_sopc_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst)
763 {
764 Temp src0 = get_alu_src(ctx, instr->src[0]);
765 Temp src1 = get_alu_src(ctx, instr->src[1]);
766 Builder bld(ctx->program, ctx->block);
767
768 assert(dst.regClass() == bld.lm);
769 assert(src0.type() == RegType::sgpr);
770 assert(src1.type() == RegType::sgpr);
771 assert(src0.regClass() == src1.regClass());
772
773 /* Emit the SALU comparison instruction */
774 Temp cmp = bld.sopc(op, bld.scc(bld.def(s1)), src0, src1);
775 /* Turn the result into a per-lane bool */
776 bool_to_vector_condition(ctx, cmp, dst);
777 }
778
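         /* Picks the SALU or VALU form of a comparison: the scalar opcode is only used
          * when one exists for this bit size, the result is uniform and both sources are
          * allocated in SGPRs; otherwise the VOPC form producing a lane mask is emitted. */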
779 void emit_comparison(isel_context *ctx, nir_alu_instr *instr, Temp dst,
780 aco_opcode v16_op, aco_opcode v32_op, aco_opcode v64_op, aco_opcode s32_op = aco_opcode::num_opcodes, aco_opcode s64_op = aco_opcode::num_opcodes)
781 {
782 aco_opcode s_op = instr->src[0].src.ssa->bit_size == 64 ? s64_op : instr->src[0].src.ssa->bit_size == 32 ? s32_op : aco_opcode::num_opcodes;
783 aco_opcode v_op = instr->src[0].src.ssa->bit_size == 64 ? v64_op : instr->src[0].src.ssa->bit_size == 32 ? v32_op : v16_op;
784 bool use_valu = s_op == aco_opcode::num_opcodes ||
785 nir_dest_is_divergent(instr->dest.dest) ||
786 ctx->allocated[instr->src[0].src.ssa->index].type() == RegType::vgpr ||
787 ctx->allocated[instr->src[1].src.ssa->index].type() == RegType::vgpr;
788 aco_opcode op = use_valu ? v_op : s_op;
789 assert(op != aco_opcode::num_opcodes);
790 assert(dst.regClass() == ctx->program->lane_mask);
791
792 if (use_valu)
793 emit_vopc_instruction(ctx, instr, op, dst);
794 else
795 emit_sopc_instruction(ctx, instr, op, dst);
796 }
797
798 void emit_boolean_logic(isel_context *ctx, nir_alu_instr *instr, Builder::WaveSpecificOpcode op, Temp dst)
799 {
800 Builder bld(ctx->program, ctx->block);
801 Temp src0 = get_alu_src(ctx, instr->src[0]);
802 Temp src1 = get_alu_src(ctx, instr->src[1]);
803
804 assert(dst.regClass() == bld.lm);
805 assert(src0.regClass() == bld.lm);
806 assert(src1.regClass() == bld.lm);
807
808 bld.sop2(op, Definition(dst), bld.def(s1, scc), src0, src1);
809 }
810
811 void emit_bcsel(isel_context *ctx, nir_alu_instr *instr, Temp dst)
812 {
813 Builder bld(ctx->program, ctx->block);
814 Temp cond = get_alu_src(ctx, instr->src[0]);
815 Temp then = get_alu_src(ctx, instr->src[1]);
816 Temp els = get_alu_src(ctx, instr->src[2]);
817
818 assert(cond.regClass() == bld.lm);
819
820 if (dst.type() == RegType::vgpr) {
821 aco_ptr<Instruction> bcsel;
822 if (dst.size() == 1) {
823 then = as_vgpr(ctx, then);
824 els = as_vgpr(ctx, els);
825
826 bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), els, then, cond);
827 } else if (dst.size() == 2) {
828 Temp then_lo = bld.tmp(v1), then_hi = bld.tmp(v1);
829 bld.pseudo(aco_opcode::p_split_vector, Definition(then_lo), Definition(then_hi), then);
830 Temp else_lo = bld.tmp(v1), else_hi = bld.tmp(v1);
831 bld.pseudo(aco_opcode::p_split_vector, Definition(else_lo), Definition(else_hi), els);
832
833 Temp dst0 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_lo, then_lo, cond);
834 Temp dst1 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_hi, then_hi, cond);
835
836 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
837 } else {
838 fprintf(stderr, "Unimplemented NIR instr bit size: ");
839 nir_print_instr(&instr->instr, stderr);
840 fprintf(stderr, "\n");
841 }
842 return;
843 }
844
845 if (instr->dest.dest.ssa.bit_size == 1) {
846 assert(dst.regClass() == bld.lm);
847 assert(then.regClass() == bld.lm);
848 assert(els.regClass() == bld.lm);
849 }
850
851 if (!nir_src_is_divergent(instr->src[0].src)) { /* uniform condition and values in sgpr */
852 if (dst.regClass() == s1 || dst.regClass() == s2) {
853 assert((then.regClass() == s1 || then.regClass() == s2) && els.regClass() == then.regClass());
854 assert(dst.size() == then.size());
855 aco_opcode op = dst.regClass() == s1 ? aco_opcode::s_cselect_b32 : aco_opcode::s_cselect_b64;
856 bld.sop2(op, Definition(dst), then, els, bld.scc(bool_to_scalar_condition(ctx, cond)));
857 } else {
858 fprintf(stderr, "Unimplemented uniform bcsel bit size: ");
859 nir_print_instr(&instr->instr, stderr);
860 fprintf(stderr, "\n");
861 }
862 return;
863 }
864
  865    /* divergent boolean bcsel
  866     * this implements bcsel on bools: dst = s0 ? s1 : s2,
  867     * which is lowered to: dst = (s0 & s1) | (~s0 & s2) */
868 assert(instr->dest.dest.ssa.bit_size == 1);
869
870 if (cond.id() != then.id())
871 then = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), cond, then);
872
873 if (cond.id() == els.id())
874 bld.sop1(Builder::s_mov, Definition(dst), then);
875 else
876 bld.sop2(Builder::s_or, Definition(dst), bld.def(s1, scc), then,
877 bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), els, cond));
878 }
879
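         /* Helper for f32 ops whose hardware implementation flushes denormal inputs:
          * denormals are pre-scaled by 2^24 (0x4b800000), the operation is applied, and
          * the result is corrected with 'undo'; all other inputs keep the unscaled
          * result via v_cndmask. */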
880 void emit_scaled_op(isel_context *ctx, Builder& bld, Definition dst, Temp val,
881 aco_opcode op, uint32_t undo)
882 {
883 /* multiply by 16777216 to handle denormals */
884 Temp is_denormal = bld.vopc(aco_opcode::v_cmp_class_f32, bld.hint_vcc(bld.def(bld.lm)),
885 as_vgpr(ctx, val), bld.copy(bld.def(v1), Operand((1u << 7) | (1u << 4))));
886 Temp scaled = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0x4b800000u), val);
887 scaled = bld.vop1(op, bld.def(v1), scaled);
888 scaled = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(undo), scaled);
889
890 Temp not_scaled = bld.vop1(op, bld.def(v1), val);
891
892 bld.vop2(aco_opcode::v_cndmask_b32, dst, not_scaled, scaled, is_denormal);
893 }
894
895 void emit_rcp(isel_context *ctx, Builder& bld, Definition dst, Temp val)
896 {
897 if (ctx->block->fp_mode.denorm32 == 0) {
898 bld.vop1(aco_opcode::v_rcp_f32, dst, val);
899 return;
900 }
901
902 emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_rcp_f32, 0x4b800000u);
903 }
904
905 void emit_rsq(isel_context *ctx, Builder& bld, Definition dst, Temp val)
906 {
907 if (ctx->block->fp_mode.denorm32 == 0) {
908 bld.vop1(aco_opcode::v_rsq_f32, dst, val);
909 return;
910 }
911
912 emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_rsq_f32, 0x45800000u);
913 }
914
915 void emit_sqrt(isel_context *ctx, Builder& bld, Definition dst, Temp val)
916 {
917 if (ctx->block->fp_mode.denorm32 == 0) {
918 bld.vop1(aco_opcode::v_sqrt_f32, dst, val);
919 return;
920 }
921
922 emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_sqrt_f32, 0x39800000u);
923 }
924
925 void emit_log2(isel_context *ctx, Builder& bld, Definition dst, Temp val)
926 {
927 if (ctx->block->fp_mode.denorm32 == 0) {
928 bld.vop1(aco_opcode::v_log_f32, dst, val);
929 return;
930 }
931
932 emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_log_f32, 0xc1c00000u);
933 }
934
935 Temp emit_trunc_f64(isel_context *ctx, Builder& bld, Definition dst, Temp val)
936 {
937 if (ctx->options->chip_class >= GFX7)
938 return bld.vop1(aco_opcode::v_trunc_f64, Definition(dst), val);
939
940 /* GFX6 doesn't support V_TRUNC_F64, lower it. */
941 /* TODO: create more efficient code! */
942 if (val.type() == RegType::sgpr)
943 val = as_vgpr(ctx, val);
944
945 /* Split the input value. */
946 Temp val_lo = bld.tmp(v1), val_hi = bld.tmp(v1);
947 bld.pseudo(aco_opcode::p_split_vector, Definition(val_lo), Definition(val_hi), val);
948
949 /* Extract the exponent and compute the unbiased value. */
950 Temp exponent = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), val_hi, Operand(20u), Operand(11u));
951 exponent = bld.vsub32(bld.def(v1), exponent, Operand(1023u));
952
953 /* Extract the fractional part. */
954 Temp fract_mask = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(-1u), Operand(0x000fffffu));
955 fract_mask = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), fract_mask, exponent);
956
957 Temp fract_mask_lo = bld.tmp(v1), fract_mask_hi = bld.tmp(v1);
958 bld.pseudo(aco_opcode::p_split_vector, Definition(fract_mask_lo), Definition(fract_mask_hi), fract_mask);
959
960 Temp fract_lo = bld.tmp(v1), fract_hi = bld.tmp(v1);
961 Temp tmp = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), fract_mask_lo);
962 fract_lo = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), val_lo, tmp);
963 tmp = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), fract_mask_hi);
964 fract_hi = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), val_hi, tmp);
965
966 /* Get the sign bit. */
967 Temp sign = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x80000000u), val_hi);
968
969 /* Decide the operation to apply depending on the unbiased exponent. */
970 Temp exp_lt0 = bld.vopc_e64(aco_opcode::v_cmp_lt_i32, bld.hint_vcc(bld.def(bld.lm)), exponent, Operand(0u));
971 Temp dst_lo = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), fract_lo, bld.copy(bld.def(v1), Operand(0u)), exp_lt0);
972 Temp dst_hi = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), fract_hi, sign, exp_lt0);
973 Temp exp_gt51 = bld.vopc_e64(aco_opcode::v_cmp_gt_i32, bld.def(s2), exponent, Operand(51u));
974 dst_lo = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), dst_lo, val_lo, exp_gt51);
975 dst_hi = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), dst_hi, val_hi, exp_gt51);
976
977 return bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst_lo, dst_hi);
978 }
979
980 Temp emit_floor_f64(isel_context *ctx, Builder& bld, Definition dst, Temp val)
981 {
982 if (ctx->options->chip_class >= GFX7)
983 return bld.vop1(aco_opcode::v_floor_f64, Definition(dst), val);
984
985 /* GFX6 doesn't support V_FLOOR_F64, lower it (note that it's actually
986 * lowered at NIR level for precision reasons). */
987 Temp src0 = as_vgpr(ctx, val);
988
989 Temp mask = bld.copy(bld.def(s1), Operand(3u)); /* isnan */
990 Temp min_val = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(-1u), Operand(0x3fefffffu));
991
992 Temp isnan = bld.vopc_e64(aco_opcode::v_cmp_class_f64, bld.hint_vcc(bld.def(bld.lm)), src0, mask);
993 Temp fract = bld.vop1(aco_opcode::v_fract_f64, bld.def(v2), src0);
994 Temp min = bld.vop3(aco_opcode::v_min_f64, bld.def(v2), fract, min_val);
995
996 Temp then_lo = bld.tmp(v1), then_hi = bld.tmp(v1);
997 bld.pseudo(aco_opcode::p_split_vector, Definition(then_lo), Definition(then_hi), src0);
998 Temp else_lo = bld.tmp(v1), else_hi = bld.tmp(v1);
999 bld.pseudo(aco_opcode::p_split_vector, Definition(else_lo), Definition(else_hi), min);
1000
1001 Temp dst0 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_lo, then_lo, isnan);
1002 Temp dst1 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_hi, then_hi, isnan);
1003
1004 Temp v = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), dst0, dst1);
1005
1006 Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst), src0, v);
1007 static_cast<VOP3A_instruction*>(add)->neg[1] = true;
1008
1009 return add->definitions[0].getTemp();
1010 }
1011
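         /* Converts an integer temporary between bit widths: narrowing extracts the low
          * bits, widening sign- or zero-extends (s_sext/s_and for SGPRs, SDWA moves on
          * GFX8+, v_bfe on GFX6-7), and 64-bit destinations get their high dword filled
          * with the sign or with zero afterwards. */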
1012 Temp convert_int(isel_context *ctx, Builder& bld, Temp src, unsigned src_bits, unsigned dst_bits, bool is_signed, Temp dst=Temp()) {
1013 if (!dst.id()) {
1014 if (dst_bits % 32 == 0 || src.type() == RegType::sgpr)
1015 dst = bld.tmp(src.type(), DIV_ROUND_UP(dst_bits, 32u));
1016 else
1017 dst = bld.tmp(RegClass(RegType::vgpr, dst_bits / 8u).as_subdword());
1018 }
1019
1020 if (dst.bytes() == src.bytes() && dst_bits < src_bits)
1021 return bld.copy(Definition(dst), src);
1022 else if (dst.bytes() < src.bytes())
1023 return bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand(0u));
1024
1025 Temp tmp = dst;
1026 if (dst_bits == 64)
1027 tmp = src_bits == 32 ? src : bld.tmp(src.type(), 1);
1028
1029 if (tmp == src) {
1030 } else if (src.regClass() == s1) {
1031 if (is_signed)
1032 bld.sop1(src_bits == 8 ? aco_opcode::s_sext_i32_i8 : aco_opcode::s_sext_i32_i16, Definition(tmp), src);
1033 else
1034 bld.sop2(aco_opcode::s_and_b32, Definition(tmp), bld.def(s1, scc), Operand(src_bits == 8 ? 0xFFu : 0xFFFFu), src);
1035 } else if (ctx->options->chip_class >= GFX8) {
1036 assert(src_bits != 8 || src.regClass() == v1b);
1037 assert(src_bits != 16 || src.regClass() == v2b);
1038 aco_ptr<SDWA_instruction> sdwa{create_instruction<SDWA_instruction>(aco_opcode::v_mov_b32, asSDWA(Format::VOP1), 1, 1)};
1039 sdwa->operands[0] = Operand(src);
1040 sdwa->definitions[0] = Definition(tmp);
1041 if (is_signed)
1042 sdwa->sel[0] = src_bits == 8 ? sdwa_sbyte : sdwa_sword;
1043 else
1044 sdwa->sel[0] = src_bits == 8 ? sdwa_ubyte : sdwa_uword;
1045 sdwa->dst_sel = tmp.bytes() == 2 ? sdwa_uword : sdwa_udword;
1046 bld.insert(std::move(sdwa));
1047 } else {
1048 assert(ctx->options->chip_class == GFX6 || ctx->options->chip_class == GFX7);
1049 aco_opcode opcode = is_signed ? aco_opcode::v_bfe_i32 : aco_opcode::v_bfe_u32;
1050 bld.vop3(opcode, Definition(tmp), src, Operand(0u), Operand(src_bits == 8 ? 8u : 16u));
1051 }
1052
1053 if (dst_bits == 64) {
1054 if (is_signed && dst.regClass() == s2) {
1055 Temp high = bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), tmp, Operand(31u));
1056 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, high);
1057 } else if (is_signed && dst.regClass() == v2) {
1058 Temp high = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), tmp);
1059 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, high);
1060 } else {
1061 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, Operand(0u));
1062 }
1063 }
1064
1065 return dst;
1066 }
1067
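         /* Translates one NIR ALU instruction. The destination register class chosen
          * during setup (based on divergence) selects between the scalar and vector
          * variant of each operation; unsupported bit sizes currently just print an
          * "Unimplemented" diagnostic. */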
1068 void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
1069 {
1070 if (!instr->dest.dest.is_ssa) {
1071 fprintf(stderr, "nir alu dst not in ssa: ");
1072 nir_print_instr(&instr->instr, stderr);
1073 fprintf(stderr, "\n");
1074 abort();
1075 }
1076 Builder bld(ctx->program, ctx->block);
1077 bld.is_precise = instr->exact;
1078 Temp dst = get_ssa_temp(ctx, &instr->dest.dest.ssa);
1079 switch(instr->op) {
1080 case nir_op_vec2:
1081 case nir_op_vec3:
1082 case nir_op_vec4: {
1083 std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
1084 unsigned num = instr->dest.dest.ssa.num_components;
1085 for (unsigned i = 0; i < num; ++i)
1086 elems[i] = get_alu_src(ctx, instr->src[i]);
1087
1088 if (instr->dest.dest.ssa.bit_size >= 32 || dst.type() == RegType::vgpr) {
1089 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, instr->dest.dest.ssa.num_components, 1)};
1090 RegClass elem_rc = RegClass::get(RegType::vgpr, instr->dest.dest.ssa.bit_size / 8u);
1091 for (unsigned i = 0; i < num; ++i) {
1092 if (elems[i].type() == RegType::sgpr && elem_rc.is_subdword())
1093 vec->operands[i] = Operand(emit_extract_vector(ctx, elems[i], 0, elem_rc));
1094 else
1095 vec->operands[i] = Operand{elems[i]};
1096 }
1097 vec->definitions[0] = Definition(dst);
1098 ctx->block->instructions.emplace_back(std::move(vec));
1099 ctx->allocated_vec.emplace(dst.id(), elems);
1100 } else {
1101 // TODO: that is a bit suboptimal..
1102 Temp mask = bld.copy(bld.def(s1), Operand((1u << instr->dest.dest.ssa.bit_size) - 1));
1103 for (unsigned i = 0; i < num - 1; ++i)
1104 if (((i+1) * instr->dest.dest.ssa.bit_size) % 32)
1105 elems[i] = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), elems[i], mask);
1106 for (unsigned i = 0; i < num; ++i) {
1107 unsigned bit = i * instr->dest.dest.ssa.bit_size;
1108 if (bit % 32 == 0) {
1109 elems[bit / 32] = elems[i];
1110 } else {
1111 elems[i] = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc),
1112 elems[i], Operand((i * instr->dest.dest.ssa.bit_size) % 32));
1113 elems[bit / 32] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), elems[bit / 32], elems[i]);
1114 }
1115 }
1116 if (dst.size() == 1)
1117 bld.copy(Definition(dst), elems[0]);
1118 else
1119 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), elems[0], elems[1]);
1120 }
1121 break;
1122 }
1123 case nir_op_mov: {
1124 Temp src = get_alu_src(ctx, instr->src[0]);
1125 aco_ptr<Instruction> mov;
1126 if (dst.type() == RegType::sgpr) {
1127 if (src.type() == RegType::vgpr)
1128 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), src);
1129 else if (src.regClass() == s1)
1130 bld.sop1(aco_opcode::s_mov_b32, Definition(dst), src);
1131 else if (src.regClass() == s2)
1132 bld.sop1(aco_opcode::s_mov_b64, Definition(dst), src);
1133 else
1134 unreachable("wrong src register class for nir_op_imov");
1135 } else {
1136 if (dst.regClass() == v1)
1137 bld.vop1(aco_opcode::v_mov_b32, Definition(dst), src);
1138 else if (dst.regClass() == v1b ||
1139 dst.regClass() == v2b ||
1140 dst.regClass() == v2)
1141 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src);
1142 else
1143 unreachable("wrong src register class for nir_op_imov");
1144 }
1145 break;
1146 }
1147 case nir_op_inot: {
1148 Temp src = get_alu_src(ctx, instr->src[0]);
1149 if (instr->dest.dest.ssa.bit_size == 1) {
1150 assert(src.regClass() == bld.lm);
1151 assert(dst.regClass() == bld.lm);
1152 /* Don't use s_andn2 here, this allows the optimizer to make a better decision */
1153 Temp tmp = bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc), src);
1154 bld.sop2(Builder::s_and, Definition(dst), bld.def(s1, scc), tmp, Operand(exec, bld.lm));
1155 } else if (dst.regClass() == v1) {
1156 emit_vop1_instruction(ctx, instr, aco_opcode::v_not_b32, dst);
1157 } else if (dst.regClass() == v2) {
1158 Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
1159 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
1160 lo = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), lo);
1161 hi = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), hi);
1162 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
1163 } else if (dst.type() == RegType::sgpr) {
1164 aco_opcode opcode = dst.size() == 1 ? aco_opcode::s_not_b32 : aco_opcode::s_not_b64;
1165 bld.sop1(opcode, Definition(dst), bld.def(s1, scc), src);
1166 } else {
1167 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1168 nir_print_instr(&instr->instr, stderr);
1169 fprintf(stderr, "\n");
1170 }
1171 break;
1172 }
1173 case nir_op_ineg: {
1174 Temp src = get_alu_src(ctx, instr->src[0]);
1175 if (dst.regClass() == v1) {
1176 bld.vsub32(Definition(dst), Operand(0u), Operand(src));
1177 } else if (dst.regClass() == s1) {
1178 bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand((uint32_t) -1), src);
1179 } else if (dst.size() == 2) {
1180 Temp src0 = bld.tmp(dst.type(), 1);
1181 Temp src1 = bld.tmp(dst.type(), 1);
1182 bld.pseudo(aco_opcode::p_split_vector, Definition(src0), Definition(src1), src);
1183
1184 if (dst.regClass() == s2) {
1185 Temp carry = bld.tmp(s1);
1186 Temp dst0 = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(carry)), Operand(0u), src0);
1187 Temp dst1 = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), Operand(0u), src1, carry);
1188 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
1189 } else {
1190 Temp lower = bld.tmp(v1);
1191 Temp borrow = bld.vsub32(Definition(lower), Operand(0u), src0, true).def(1).getTemp();
1192 Temp upper = bld.vsub32(bld.def(v1), Operand(0u), src1, false, borrow);
1193 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1194 }
1195 } else {
1196 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1197 nir_print_instr(&instr->instr, stderr);
1198 fprintf(stderr, "\n");
1199 }
1200 break;
1201 }
1202 case nir_op_iabs: {
1203 if (dst.regClass() == s1) {
1204 bld.sop1(aco_opcode::s_abs_i32, Definition(dst), bld.def(s1, scc), get_alu_src(ctx, instr->src[0]));
1205 } else if (dst.regClass() == v1) {
1206 Temp src = get_alu_src(ctx, instr->src[0]);
1207 bld.vop2(aco_opcode::v_max_i32, Definition(dst), src, bld.vsub32(bld.def(v1), Operand(0u), src));
1208 } else {
1209 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1210 nir_print_instr(&instr->instr, stderr);
1211 fprintf(stderr, "\n");
1212 }
1213 break;
1214 }
1215 case nir_op_isign: {
1216 Temp src = get_alu_src(ctx, instr->src[0]);
1217 if (dst.regClass() == s1) {
1218 Temp tmp = bld.sop2(aco_opcode::s_max_i32, bld.def(s1), bld.def(s1, scc), src, Operand((uint32_t)-1));
1219 bld.sop2(aco_opcode::s_min_i32, Definition(dst), bld.def(s1, scc), tmp, Operand(1u));
1220 } else if (dst.regClass() == s2) {
1221 Temp neg = bld.sop2(aco_opcode::s_ashr_i64, bld.def(s2), bld.def(s1, scc), src, Operand(63u));
1222 Temp neqz;
1223 if (ctx->program->chip_class >= GFX8)
1224 neqz = bld.sopc(aco_opcode::s_cmp_lg_u64, bld.def(s1, scc), src, Operand(0u));
1225 else
1226 neqz = bld.sop2(aco_opcode::s_or_b64, bld.def(s2), bld.def(s1, scc), src, Operand(0u)).def(1).getTemp();
1227 /* SCC gets zero-extended to 64 bit */
1228 bld.sop2(aco_opcode::s_or_b64, Definition(dst), bld.def(s1, scc), neg, bld.scc(neqz));
1229 } else if (dst.regClass() == v1) {
1230 bld.vop3(aco_opcode::v_med3_i32, Definition(dst), Operand((uint32_t)-1), src, Operand(1u));
1231 } else if (dst.regClass() == v2) {
1232 Temp upper = emit_extract_vector(ctx, src, 1, v1);
1233 Temp neg = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), upper);
1234 Temp gtz = bld.vopc(aco_opcode::v_cmp_ge_i64, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
1235 Temp lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(1u), neg, gtz);
1236 upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), neg, gtz);
1237 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1238 } else {
1239 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1240 nir_print_instr(&instr->instr, stderr);
1241 fprintf(stderr, "\n");
1242 }
1243 break;
1244 }
1245 case nir_op_imax: {
1246 if (dst.regClass() == v1) {
1247 emit_vop2_instruction(ctx, instr, aco_opcode::v_max_i32, dst, true);
1248 } else if (dst.regClass() == s1) {
1249 emit_sop2_instruction(ctx, instr, aco_opcode::s_max_i32, dst, true);
1250 } else {
1251 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1252 nir_print_instr(&instr->instr, stderr);
1253 fprintf(stderr, "\n");
1254 }
1255 break;
1256 }
1257 case nir_op_umax: {
1258 if (dst.regClass() == v1) {
1259 emit_vop2_instruction(ctx, instr, aco_opcode::v_max_u32, dst, true);
1260 } else if (dst.regClass() == s1) {
1261 emit_sop2_instruction(ctx, instr, aco_opcode::s_max_u32, dst, true);
1262 } else {
1263 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1264 nir_print_instr(&instr->instr, stderr);
1265 fprintf(stderr, "\n");
1266 }
1267 break;
1268 }
1269 case nir_op_imin: {
1270 if (dst.regClass() == v1) {
1271 emit_vop2_instruction(ctx, instr, aco_opcode::v_min_i32, dst, true);
1272 } else if (dst.regClass() == s1) {
1273 emit_sop2_instruction(ctx, instr, aco_opcode::s_min_i32, dst, true);
1274 } else {
1275 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1276 nir_print_instr(&instr->instr, stderr);
1277 fprintf(stderr, "\n");
1278 }
1279 break;
1280 }
1281 case nir_op_umin: {
1282 if (dst.regClass() == v1) {
1283 emit_vop2_instruction(ctx, instr, aco_opcode::v_min_u32, dst, true);
1284 } else if (dst.regClass() == s1) {
1285 emit_sop2_instruction(ctx, instr, aco_opcode::s_min_u32, dst, true);
1286 } else {
1287 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1288 nir_print_instr(&instr->instr, stderr);
1289 fprintf(stderr, "\n");
1290 }
1291 break;
1292 }
1293 case nir_op_ior: {
1294 if (instr->dest.dest.ssa.bit_size == 1) {
1295 emit_boolean_logic(ctx, instr, Builder::s_or, dst);
1296 } else if (dst.regClass() == v1) {
1297 emit_vop2_instruction(ctx, instr, aco_opcode::v_or_b32, dst, true);
1298 } else if (dst.regClass() == v2) {
1299 emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_or_b32, dst);
1300 } else if (dst.regClass() == s1) {
1301 emit_sop2_instruction(ctx, instr, aco_opcode::s_or_b32, dst, true);
1302 } else if (dst.regClass() == s2) {
1303 emit_sop2_instruction(ctx, instr, aco_opcode::s_or_b64, dst, true);
1304 } else {
1305 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1306 nir_print_instr(&instr->instr, stderr);
1307 fprintf(stderr, "\n");
1308 }
1309 break;
1310 }
1311 case nir_op_iand: {
1312 if (instr->dest.dest.ssa.bit_size == 1) {
1313 emit_boolean_logic(ctx, instr, Builder::s_and, dst);
1314 } else if (dst.regClass() == v1) {
1315 emit_vop2_instruction(ctx, instr, aco_opcode::v_and_b32, dst, true);
1316 } else if (dst.regClass() == v2) {
1317 emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_and_b32, dst);
1318 } else if (dst.regClass() == s1) {
1319 emit_sop2_instruction(ctx, instr, aco_opcode::s_and_b32, dst, true);
1320 } else if (dst.regClass() == s2) {
1321 emit_sop2_instruction(ctx, instr, aco_opcode::s_and_b64, dst, true);
1322 } else {
1323 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1324 nir_print_instr(&instr->instr, stderr);
1325 fprintf(stderr, "\n");
1326 }
1327 break;
1328 }
1329 case nir_op_ixor: {
1330 if (instr->dest.dest.ssa.bit_size == 1) {
1331 emit_boolean_logic(ctx, instr, Builder::s_xor, dst);
1332 } else if (dst.regClass() == v1) {
1333 emit_vop2_instruction(ctx, instr, aco_opcode::v_xor_b32, dst, true);
1334 } else if (dst.regClass() == v2) {
1335 emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_xor_b32, dst);
1336 } else if (dst.regClass() == s1) {
1337 emit_sop2_instruction(ctx, instr, aco_opcode::s_xor_b32, dst, true);
1338 } else if (dst.regClass() == s2) {
1339 emit_sop2_instruction(ctx, instr, aco_opcode::s_xor_b64, dst, true);
1340 } else {
1341 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1342 nir_print_instr(&instr->instr, stderr);
1343 fprintf(stderr, "\n");
1344 }
1345 break;
1346 }
1347 case nir_op_ushr: {
1348 if (dst.regClass() == v1) {
1349 emit_vop2_instruction(ctx, instr, aco_opcode::v_lshrrev_b32, dst, false, true);
1350 } else if (dst.regClass() == v2 && ctx->program->chip_class >= GFX8) {
1351 bld.vop3(aco_opcode::v_lshrrev_b64, Definition(dst),
1352 get_alu_src(ctx, instr->src[1]), get_alu_src(ctx, instr->src[0]));
1353 } else if (dst.regClass() == v2) {
1354 bld.vop3(aco_opcode::v_lshr_b64, Definition(dst),
1355 get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1356 } else if (dst.regClass() == s2) {
1357 emit_sop2_instruction(ctx, instr, aco_opcode::s_lshr_b64, dst, true);
1358 } else if (dst.regClass() == s1) {
1359 emit_sop2_instruction(ctx, instr, aco_opcode::s_lshr_b32, dst, true);
1360 } else {
1361 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1362 nir_print_instr(&instr->instr, stderr);
1363 fprintf(stderr, "\n");
1364 }
1365 break;
1366 }
1367 case nir_op_ishl: {
1368 if (dst.regClass() == v1) {
1369 emit_vop2_instruction(ctx, instr, aco_opcode::v_lshlrev_b32, dst, false, true);
1370 } else if (dst.regClass() == v2 && ctx->program->chip_class >= GFX8) {
1371 bld.vop3(aco_opcode::v_lshlrev_b64, Definition(dst),
1372 get_alu_src(ctx, instr->src[1]), get_alu_src(ctx, instr->src[0]));
1373 } else if (dst.regClass() == v2) {
1374 bld.vop3(aco_opcode::v_lshl_b64, Definition(dst),
1375 get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1376 } else if (dst.regClass() == s1) {
1377 emit_sop2_instruction(ctx, instr, aco_opcode::s_lshl_b32, dst, true);
1378 } else if (dst.regClass() == s2) {
1379 emit_sop2_instruction(ctx, instr, aco_opcode::s_lshl_b64, dst, true);
1380 } else {
1381 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1382 nir_print_instr(&instr->instr, stderr);
1383 fprintf(stderr, "\n");
1384 }
1385 break;
1386 }
1387 case nir_op_ishr: {
1388 if (dst.regClass() == v1) {
1389 emit_vop2_instruction(ctx, instr, aco_opcode::v_ashrrev_i32, dst, false, true);
1390 } else if (dst.regClass() == v2 && ctx->program->chip_class >= GFX8) {
1391 bld.vop3(aco_opcode::v_ashrrev_i64, Definition(dst),
1392 get_alu_src(ctx, instr->src[1]), get_alu_src(ctx, instr->src[0]));
1393 } else if (dst.regClass() == v2) {
1394 bld.vop3(aco_opcode::v_ashr_i64, Definition(dst),
1395 get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1396 } else if (dst.regClass() == s1) {
1397 emit_sop2_instruction(ctx, instr, aco_opcode::s_ashr_i32, dst, true);
1398 } else if (dst.regClass() == s2) {
1399 emit_sop2_instruction(ctx, instr, aco_opcode::s_ashr_i64, dst, true);
1400 } else {
1401 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1402 nir_print_instr(&instr->instr, stderr);
1403 fprintf(stderr, "\n");
1404 }
1405 break;
1406 }
1407 case nir_op_find_lsb: {
1408 Temp src = get_alu_src(ctx, instr->src[0]);
1409 if (src.regClass() == s1) {
1410 bld.sop1(aco_opcode::s_ff1_i32_b32, Definition(dst), src);
1411 } else if (src.regClass() == v1) {
1412 emit_vop1_instruction(ctx, instr, aco_opcode::v_ffbl_b32, dst);
1413 } else if (src.regClass() == s2) {
1414 bld.sop1(aco_opcode::s_ff1_i32_b64, Definition(dst), src);
1415 } else {
1416 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1417 nir_print_instr(&instr->instr, stderr);
1418 fprintf(stderr, "\n");
1419 }
1420 break;
1421 }
1422 case nir_op_ufind_msb:
1423 case nir_op_ifind_msb: {
1424 Temp src = get_alu_src(ctx, instr->src[0]);
1425 if (src.regClass() == s1 || src.regClass() == s2) {
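                /* s_flbit_* counts from the MSB side (how many leading zero/sign bits there
                 * are) and returns -1 if no qualifying bit exists.  The bit index counted
                 * from bit 0 is therefore (bits - 1) - msb_rev; the borrow of that
                 * subtraction only occurs for the -1 case, which the s_cselect below maps
                 * back to -1 (e.g. for a 32-bit source: msb = 31 - flbit(x)). */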
1426 aco_opcode op = src.regClass() == s2 ?
1427 (instr->op == nir_op_ufind_msb ? aco_opcode::s_flbit_i32_b64 : aco_opcode::s_flbit_i32_i64) :
1428 (instr->op == nir_op_ufind_msb ? aco_opcode::s_flbit_i32_b32 : aco_opcode::s_flbit_i32);
1429 Temp msb_rev = bld.sop1(op, bld.def(s1), src);
1430
1431 Builder::Result sub = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc),
1432 Operand(src.size() * 32u - 1u), msb_rev);
1433 Temp msb = sub.def(0).getTemp();
1434 Temp carry = sub.def(1).getTemp();
1435
1436 bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand((uint32_t)-1), msb, bld.scc(carry));
1437 } else if (src.regClass() == v1) {
1438 aco_opcode op = instr->op == nir_op_ufind_msb ? aco_opcode::v_ffbh_u32 : aco_opcode::v_ffbh_i32;
1439 Temp msb_rev = bld.tmp(v1);
1440 emit_vop1_instruction(ctx, instr, op, msb_rev);
1441 Temp msb = bld.tmp(v1);
1442 Temp carry = bld.vsub32(Definition(msb), Operand(31u), Operand(msb_rev), true).def(1).getTemp();
1443 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), msb, Operand((uint32_t)-1), carry);
1444 } else {
1445 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1446 nir_print_instr(&instr->instr, stderr);
1447 fprintf(stderr, "\n");
1448 }
1449 break;
1450 }
1451 case nir_op_bitfield_reverse: {
1452 if (dst.regClass() == s1) {
1453 bld.sop1(aco_opcode::s_brev_b32, Definition(dst), get_alu_src(ctx, instr->src[0]));
1454 } else if (dst.regClass() == v1) {
1455 bld.vop1(aco_opcode::v_bfrev_b32, Definition(dst), get_alu_src(ctx, instr->src[0]));
1456 } else {
1457 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1458 nir_print_instr(&instr->instr, stderr);
1459 fprintf(stderr, "\n");
1460 }
1461 break;
1462 }
1463 case nir_op_iadd: {
1464 if (dst.regClass() == s1) {
1465 emit_sop2_instruction(ctx, instr, aco_opcode::s_add_u32, dst, true);
1466 break;
1467 }
1468
1469 Temp src0 = get_alu_src(ctx, instr->src[0]);
1470 Temp src1 = get_alu_src(ctx, instr->src[1]);
1471 if (dst.regClass() == v1) {
1472 bld.vadd32(Definition(dst), Operand(src0), Operand(src1));
1473 break;
1474 }
1475
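            /* 64-bit addition: split both operands into dwords, add the low halves with a
             * carry-out, feed that carry into the high-half add (s_addc_u32 / vadd32 with
             * carry-in) and recombine the two dwords. */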
1476 assert(src0.size() == 2 && src1.size() == 2);
1477 Temp src00 = bld.tmp(src0.type(), 1);
1478 Temp src01 = bld.tmp(dst.type(), 1);
1479 bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1480 Temp src10 = bld.tmp(src1.type(), 1);
1481 Temp src11 = bld.tmp(dst.type(), 1);
1482 bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1483
1484 if (dst.regClass() == s2) {
1485 Temp carry = bld.tmp(s1);
1486 Temp dst0 = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10);
1487 Temp dst1 = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), src01, src11, bld.scc(carry));
1488 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
1489 } else if (dst.regClass() == v2) {
1490 Temp dst0 = bld.tmp(v1);
1491 Temp carry = bld.vadd32(Definition(dst0), src00, src10, true).def(1).getTemp();
1492 Temp dst1 = bld.vadd32(bld.def(v1), src01, src11, false, carry);
1493 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
1494 } else {
1495 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1496 nir_print_instr(&instr->instr, stderr);
1497 fprintf(stderr, "\n");
1498 }
1499 break;
1500 }
1501 case nir_op_uadd_sat: {
1502 Temp src0 = get_alu_src(ctx, instr->src[0]);
1503 Temp src1 = get_alu_src(ctx, instr->src[1]);
1504 if (dst.regClass() == s1) {
1505 Temp tmp = bld.tmp(s1), carry = bld.tmp(s1);
1506 bld.sop2(aco_opcode::s_add_u32, Definition(tmp), bld.scc(Definition(carry)),
1507 src0, src1);
1508 bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand((uint32_t) -1), tmp, bld.scc(carry));
1509 } else if (dst.regClass() == v1) {
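               /* GFX9+ supports the clamp bit on VOP3 integer adds, which saturates the
                * result to UINT32_MAX directly; older chips emulate the saturation by
                * selecting UINT32_MAX whenever the add produces a carry. */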
1510 if (ctx->options->chip_class >= GFX9) {
1511 aco_ptr<VOP3A_instruction> add{create_instruction<VOP3A_instruction>(aco_opcode::v_add_u32, asVOP3(Format::VOP2), 2, 1)};
1512 add->operands[0] = Operand(src0);
1513 add->operands[1] = Operand(src1);
1514 add->definitions[0] = Definition(dst);
1515 add->clamp = 1;
1516 ctx->block->instructions.emplace_back(std::move(add));
1517 } else {
1518 if (src1.regClass() != v1)
1519 std::swap(src0, src1);
1520 assert(src1.regClass() == v1);
1521 Temp tmp = bld.tmp(v1);
1522 Temp carry = bld.vadd32(Definition(tmp), src0, src1, true).def(1).getTemp();
1523 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), tmp, Operand((uint32_t) -1), carry);
1524 }
1525 } else {
1526 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1527 nir_print_instr(&instr->instr, stderr);
1528 fprintf(stderr, "\n");
1529 }
1530 break;
1531 }
1532 case nir_op_uadd_carry: {
1533 Temp src0 = get_alu_src(ctx, instr->src[0]);
1534 Temp src1 = get_alu_src(ctx, instr->src[1]);
1535 if (dst.regClass() == s1) {
1536 bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(dst)), src0, src1);
1537 break;
1538 }
1539 if (dst.regClass() == v1) {
1540 Temp carry = bld.vadd32(bld.def(v1), src0, src1, true).def(1).getTemp();
1541 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand(1u), carry);
1542 break;
1543 }
1544
1545 Temp src00 = bld.tmp(src0.type(), 1);
1546 Temp src01 = bld.tmp(dst.type(), 1);
1547 bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1548 Temp src10 = bld.tmp(src1.type(), 1);
1549 Temp src11 = bld.tmp(dst.type(), 1);
1550 bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1551 if (dst.regClass() == s2) {
1552 Temp carry = bld.tmp(s1);
1553 bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10);
1554 carry = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.scc(bld.def(s1)), src01, src11, bld.scc(carry)).def(1).getTemp();
1555 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), carry, Operand(0u));
1556 } else if (dst.regClass() == v2) {
1557 Temp carry = bld.vadd32(bld.def(v1), src00, src10, true).def(1).getTemp();
1558 carry = bld.vadd32(bld.def(v1), src01, src11, true, carry).def(1).getTemp();
1559 carry = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), Operand(1u), carry);
1560 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), carry, Operand(0u));
1561 } else {
1562 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1563 nir_print_instr(&instr->instr, stderr);
1564 fprintf(stderr, "\n");
1565 }
1566 break;
1567 }
1568 case nir_op_isub: {
1569 if (dst.regClass() == s1) {
1570 emit_sop2_instruction(ctx, instr, aco_opcode::s_sub_i32, dst, true);
1571 break;
1572 }
1573
1574 Temp src0 = get_alu_src(ctx, instr->src[0]);
1575 Temp src1 = get_alu_src(ctx, instr->src[1]);
1576 if (dst.regClass() == v1) {
1577 bld.vsub32(Definition(dst), src0, src1);
1578 break;
1579 }
1580
1581 Temp src00 = bld.tmp(src0.type(), 1);
1582 Temp src01 = bld.tmp(dst.type(), 1);
1583 bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1584 Temp src10 = bld.tmp(src1.type(), 1);
1585 Temp src11 = bld.tmp(dst.type(), 1);
1586 bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1587 if (dst.regClass() == s2) {
1588 Temp carry = bld.tmp(s1);
1589 Temp dst0 = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10);
1590 Temp dst1 = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), src01, src11, carry);
1591 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
1592 } else if (dst.regClass() == v2) {
1593 Temp lower = bld.tmp(v1);
1594 Temp borrow = bld.vsub32(Definition(lower), src00, src10, true).def(1).getTemp();
1595 Temp upper = bld.vsub32(bld.def(v1), src01, src11, false, borrow);
1596 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1597 } else {
1598 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1599 nir_print_instr(&instr->instr, stderr);
1600 fprintf(stderr, "\n");
1601 }
1602 break;
1603 }
1604 case nir_op_usub_borrow: {
1605 Temp src0 = get_alu_src(ctx, instr->src[0]);
1606 Temp src1 = get_alu_src(ctx, instr->src[1]);
1607 if (dst.regClass() == s1) {
1608 bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(dst)), src0, src1);
1609 break;
1610 } else if (dst.regClass() == v1) {
1611 Temp borrow = bld.vsub32(bld.def(v1), src0, src1, true).def(1).getTemp();
1612 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand(1u), borrow);
1613 break;
1614 }
1615
1616 Temp src00 = bld.tmp(src0.type(), 1);
1617 Temp src01 = bld.tmp(dst.type(), 1);
1618 bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1619 Temp src10 = bld.tmp(src1.type(), 1);
1620 Temp src11 = bld.tmp(dst.type(), 1);
1621 bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1622 if (dst.regClass() == s2) {
1623 Temp borrow = bld.tmp(s1);
1624 bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), src00, src10);
1625 borrow = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.scc(bld.def(s1)), src01, src11, bld.scc(borrow)).def(1).getTemp();
1626 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), borrow, Operand(0u));
1627 } else if (dst.regClass() == v2) {
1628 Temp borrow = bld.vsub32(bld.def(v1), src00, src10, true).def(1).getTemp();
1629 borrow = bld.vsub32(bld.def(v1), src01, src11, true, Operand(borrow)).def(1).getTemp();
1630 borrow = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), Operand(1u), borrow);
1631 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), borrow, Operand(0u));
1632 } else {
1633 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1634 nir_print_instr(&instr->instr, stderr);
1635 fprintf(stderr, "\n");
1636 }
1637 break;
1638 }
1639 case nir_op_imul: {
1640 if (dst.regClass() == v1) {
1641 bld.vop3(aco_opcode::v_mul_lo_u32, Definition(dst),
1642 get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1643 } else if (dst.regClass() == s1) {
1644 emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_i32, dst, false);
1645 } else {
1646 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1647 nir_print_instr(&instr->instr, stderr);
1648 fprintf(stderr, "\n");
1649 }
1650 break;
1651 }
1652 case nir_op_umul_high: {
1653 if (dst.regClass() == v1) {
1654 bld.vop3(aco_opcode::v_mul_hi_u32, Definition(dst), get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1655 } else if (dst.regClass() == s1 && ctx->options->chip_class >= GFX9) {
1656 bld.sop2(aco_opcode::s_mul_hi_u32, Definition(dst), get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1657 } else if (dst.regClass() == s1) {
1658 Temp tmp = bld.vop3(aco_opcode::v_mul_hi_u32, bld.def(v1), get_alu_src(ctx, instr->src[0]),
1659 as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
1660 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
1661 } else {
1662 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1663 nir_print_instr(&instr->instr, stderr);
1664 fprintf(stderr, "\n");
1665 }
1666 break;
1667 }
1668 case nir_op_imul_high: {
1669 if (dst.regClass() == v1) {
1670 bld.vop3(aco_opcode::v_mul_hi_i32, Definition(dst), get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1671 } else if (dst.regClass() == s1 && ctx->options->chip_class >= GFX9) {
1672 bld.sop2(aco_opcode::s_mul_hi_i32, Definition(dst), get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1673 } else if (dst.regClass() == s1) {
1674 Temp tmp = bld.vop3(aco_opcode::v_mul_hi_i32, bld.def(v1), get_alu_src(ctx, instr->src[0]),
1675 as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
1676 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
1677 } else {
1678 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1679 nir_print_instr(&instr->instr, stderr);
1680 fprintf(stderr, "\n");
1681 }
1682 break;
1683 }
1684 case nir_op_fmul: {
1685 Temp src0 = get_alu_src(ctx, instr->src[0]);
1686 Temp src1 = as_vgpr(ctx, get_alu_src(ctx, instr->src[1]));
1687 if (dst.regClass() == v2b) {
1688 emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_f16, dst, true);
1689 } else if (dst.regClass() == v1) {
1690 emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_f32, dst, true);
1691 } else if (dst.regClass() == v2) {
1692 bld.vop3(aco_opcode::v_mul_f64, Definition(dst), src0, src1);
1693 } else {
1694 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1695 nir_print_instr(&instr->instr, stderr);
1696 fprintf(stderr, "\n");
1697 }
1698 break;
1699 }
1700 case nir_op_fadd: {
1701 Temp src0 = get_alu_src(ctx, instr->src[0]);
1702 Temp src1 = as_vgpr(ctx, get_alu_src(ctx, instr->src[1]));
1703 if (dst.regClass() == v2b) {
1704 emit_vop2_instruction(ctx, instr, aco_opcode::v_add_f16, dst, true);
1705 } else if (dst.regClass() == v1) {
1706 emit_vop2_instruction(ctx, instr, aco_opcode::v_add_f32, dst, true);
1707 } else if (dst.regClass() == v2) {
1708 bld.vop3(aco_opcode::v_add_f64, Definition(dst), src0, src1);
1709 } else {
1710 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1711 nir_print_instr(&instr->instr, stderr);
1712 fprintf(stderr, "\n");
1713 }
1714 break;
1715 }
1716 case nir_op_fsub: {
1717 Temp src0 = get_alu_src(ctx, instr->src[0]);
1718 Temp src1 = get_alu_src(ctx, instr->src[1]);
1719 if (dst.regClass() == v2b) {
1720 if (src1.type() == RegType::vgpr || src0.type() != RegType::vgpr)
1721 emit_vop2_instruction(ctx, instr, aco_opcode::v_sub_f16, dst, false);
1722 else
1723 emit_vop2_instruction(ctx, instr, aco_opcode::v_subrev_f16, dst, true);
1724 } else if (dst.regClass() == v1) {
1725 if (src1.type() == RegType::vgpr || src0.type() != RegType::vgpr)
1726 emit_vop2_instruction(ctx, instr, aco_opcode::v_sub_f32, dst, false);
1727 else
1728 emit_vop2_instruction(ctx, instr, aco_opcode::v_subrev_f32, dst, true);
1729 } else if (dst.regClass() == v2) {
1730 Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst),
1731 as_vgpr(ctx, src0), as_vgpr(ctx, src1));
1732 VOP3A_instruction* sub = static_cast<VOP3A_instruction*>(add);
1733 sub->neg[1] = true;
1734 } else {
1735 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1736 nir_print_instr(&instr->instr, stderr);
1737 fprintf(stderr, "\n");
1738 }
1739 break;
1740 }
1741 case nir_op_fmax: {
1742 Temp src0 = get_alu_src(ctx, instr->src[0]);
1743 Temp src1 = as_vgpr(ctx, get_alu_src(ctx, instr->src[1]));
1744 if (dst.regClass() == v2b) {
1745 // TODO: check fp_mode.must_flush_denorms16_64
1746 emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f16, dst, true);
1747 } else if (dst.regClass() == v1) {
1748 emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f32, dst, true, false, ctx->block->fp_mode.must_flush_denorms32);
1749 } else if (dst.regClass() == v2) {
1750 if (ctx->block->fp_mode.must_flush_denorms16_64 && ctx->program->chip_class < GFX9) {
1751 Temp tmp = bld.vop3(aco_opcode::v_max_f64, bld.def(v2), src0, src1);
1752 bld.vop3(aco_opcode::v_mul_f64, Definition(dst), Operand(0x3FF0000000000000lu), tmp);
1753 } else {
1754 bld.vop3(aco_opcode::v_max_f64, Definition(dst), src0, src1);
1755 }
1756 } else {
1757 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1758 nir_print_instr(&instr->instr, stderr);
1759 fprintf(stderr, "\n");
1760 }
1761 break;
1762 }
1763 case nir_op_fmin: {
1764 Temp src0 = get_alu_src(ctx, instr->src[0]);
1765 Temp src1 = as_vgpr(ctx, get_alu_src(ctx, instr->src[1]));
1766 if (dst.regClass() == v2b) {
1767 // TODO: check fp_mode.must_flush_denorms16_64
1768 emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f16, dst, true);
1769 } else if (dst.regClass() == v1) {
1770 emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f32, dst, true, false, ctx->block->fp_mode.must_flush_denorms32);
1771 } else if (dst.regClass() == v2) {
1772 if (ctx->block->fp_mode.must_flush_denorms16_64 && ctx->program->chip_class < GFX9) {
1773 Temp tmp = bld.vop3(aco_opcode::v_min_f64, bld.def(v2), src0, src1);
1774 bld.vop3(aco_opcode::v_mul_f64, Definition(dst), Operand(0x3FF0000000000000lu), tmp);
1775 } else {
1776 bld.vop3(aco_opcode::v_min_f64, Definition(dst), src0, src1);
1777 }
1778 } else {
1779 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1780 nir_print_instr(&instr->instr, stderr);
1781 fprintf(stderr, "\n");
1782 }
1783 break;
1784 }
1785 case nir_op_fmax3: {
1786 if (dst.regClass() == v2b) {
1787 emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_f16, dst, false);
1788 } else if (dst.regClass() == v1) {
1789 emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_f32, dst, ctx->block->fp_mode.must_flush_denorms32);
1790 } else {
1791 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1792 nir_print_instr(&instr->instr, stderr);
1793 fprintf(stderr, "\n");
1794 }
1795 break;
1796 }
1797 case nir_op_fmin3: {
1798 if (dst.regClass() == v2b) {
1799 emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_f16, dst, false);
1800 } else if (dst.regClass() == v1) {
1801 emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_f32, dst, ctx->block->fp_mode.must_flush_denorms32);
1802 } else {
1803 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1804 nir_print_instr(&instr->instr, stderr);
1805 fprintf(stderr, "\n");
1806 }
1807 break;
1808 }
1809 case nir_op_fmed3: {
1810 if (dst.regClass() == v2b) {
1811 emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_f16, dst, false);
1812 } else if (dst.regClass() == v1) {
1813 emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_f32, dst, ctx->block->fp_mode.must_flush_denorms32);
1814 } else {
1815 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1816 nir_print_instr(&instr->instr, stderr);
1817 fprintf(stderr, "\n");
1818 }
1819 break;
1820 }
1821 case nir_op_umax3: {
1822 if (dst.size() == 1) {
1823 emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_u32, dst);
1824 } else {
1825 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1826 nir_print_instr(&instr->instr, stderr);
1827 fprintf(stderr, "\n");
1828 }
1829 break;
1830 }
1831 case nir_op_umin3: {
1832 if (dst.size() == 1) {
1833 emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_u32, dst);
1834 } else {
1835 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1836 nir_print_instr(&instr->instr, stderr);
1837 fprintf(stderr, "\n");
1838 }
1839 break;
1840 }
1841 case nir_op_umed3: {
1842 if (dst.size() == 1) {
1843 emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_u32, dst);
1844 } else {
1845 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1846 nir_print_instr(&instr->instr, stderr);
1847 fprintf(stderr, "\n");
1848 }
1849 break;
1850 }
1851 case nir_op_imax3: {
1852 if (dst.size() == 1) {
1853 emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_i32, dst);
1854 } else {
1855 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1856 nir_print_instr(&instr->instr, stderr);
1857 fprintf(stderr, "\n");
1858 }
1859 break;
1860 }
1861 case nir_op_imin3: {
1862 if (dst.size() == 1) {
1863 emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_i32, dst);
1864 } else {
1865 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1866 nir_print_instr(&instr->instr, stderr);
1867 fprintf(stderr, "\n");
1868 }
1869 break;
1870 }
1871 case nir_op_imed3: {
1872 if (dst.size() == 1) {
1873 emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_i32, dst);
1874 } else {
1875 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1876 nir_print_instr(&instr->instr, stderr);
1877 fprintf(stderr, "\n");
1878 }
1879 break;
1880 }
1881 case nir_op_cube_face_coord: {
1882 Temp in = get_alu_src(ctx, instr->src[0], 3);
1883 Temp src[3] = { emit_extract_vector(ctx, in, 0, v1),
1884 emit_extract_vector(ctx, in, 1, v1),
1885 emit_extract_vector(ctx, in, 2, v1) };
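         /* v_cubesc/v_cubetc/v_cubema produce the per-face S/T coordinates and (twice) the
          * major axis; multiplying by rcp(ma) and adding 0.5 via v_madak remaps the
          * coordinates into the [0, 1] range used for cube face addressing. */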
1886 Temp ma = bld.vop3(aco_opcode::v_cubema_f32, bld.def(v1), src[0], src[1], src[2]);
1887 ma = bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), ma);
1888 Temp sc = bld.vop3(aco_opcode::v_cubesc_f32, bld.def(v1), src[0], src[1], src[2]);
1889 Temp tc = bld.vop3(aco_opcode::v_cubetc_f32, bld.def(v1), src[0], src[1], src[2]);
1890 sc = bld.vop2(aco_opcode::v_madak_f32, bld.def(v1), sc, ma, Operand(0x3f000000u/*0.5*/));
1891 tc = bld.vop2(aco_opcode::v_madak_f32, bld.def(v1), tc, ma, Operand(0x3f000000u/*0.5*/));
1892 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), sc, tc);
1893 break;
1894 }
1895 case nir_op_cube_face_index: {
1896 Temp in = get_alu_src(ctx, instr->src[0], 3);
1897 Temp src[3] = { emit_extract_vector(ctx, in, 0, v1),
1898 emit_extract_vector(ctx, in, 1, v1),
1899 emit_extract_vector(ctx, in, 2, v1) };
1900 bld.vop3(aco_opcode::v_cubeid_f32, Definition(dst), src[0], src[1], src[2]);
1901 break;
1902 }
1903 case nir_op_bcsel: {
1904 emit_bcsel(ctx, instr, dst);
1905 break;
1906 }
1907 case nir_op_frsq: {
1908 Temp src = get_alu_src(ctx, instr->src[0]);
1909 if (dst.regClass() == v2b) {
1910 emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f16, dst);
1911 } else if (dst.regClass() == v1) {
1912 emit_rsq(ctx, bld, Definition(dst), src);
1913 } else if (dst.regClass() == v2) {
1914 /* Lowered at NIR level for precision reasons. */
1915 emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f64, dst);
1916 } else {
1917 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1918 nir_print_instr(&instr->instr, stderr);
1919 fprintf(stderr, "\n");
1920 }
1921 break;
1922 }
1923 case nir_op_fneg: {
1924 Temp src = get_alu_src(ctx, instr->src[0]);
1925 if (dst.regClass() == v2b) {
1926 bld.vop2(aco_opcode::v_xor_b32, Definition(dst), Operand(0x8000u), as_vgpr(ctx, src));
1927 } else if (dst.regClass() == v1) {
1928 if (ctx->block->fp_mode.must_flush_denorms32)
1929 src = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0x3f800000u), as_vgpr(ctx, src));
1930 bld.vop2(aco_opcode::v_xor_b32, Definition(dst), Operand(0x80000000u), as_vgpr(ctx, src));
1931 } else if (dst.regClass() == v2) {
1932 if (ctx->block->fp_mode.must_flush_denorms16_64)
1933 src = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), Operand(0x3FF0000000000000lu), as_vgpr(ctx, src));
1934 Temp upper = bld.tmp(v1), lower = bld.tmp(v1);
1935 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
1936 upper = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), Operand(0x80000000u), upper);
1937 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1938 } else {
1939 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1940 nir_print_instr(&instr->instr, stderr);
1941 fprintf(stderr, "\n");
1942 }
1943 break;
1944 }
1945 case nir_op_fabs: {
1946 Temp src = get_alu_src(ctx, instr->src[0]);
1947 if (dst.regClass() == v2b) {
1948 bld.vop2(aco_opcode::v_and_b32, Definition(dst), Operand(0x7FFFu), as_vgpr(ctx, src));
1949 } else if (dst.regClass() == v1) {
1950 if (ctx->block->fp_mode.must_flush_denorms32)
1951 src = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0x3f800000u), as_vgpr(ctx, src));
1952 bld.vop2(aco_opcode::v_and_b32, Definition(dst), Operand(0x7FFFFFFFu), as_vgpr(ctx, src));
1953 } else if (dst.regClass() == v2) {
1954 if (ctx->block->fp_mode.must_flush_denorms16_64)
1955 src = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), Operand(0x3FF0000000000000lu), as_vgpr(ctx, src));
1956 Temp upper = bld.tmp(v1), lower = bld.tmp(v1);
1957 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
1958 upper = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7FFFFFFFu), upper);
1959 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1960 } else {
1961 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1962 nir_print_instr(&instr->instr, stderr);
1963 fprintf(stderr, "\n");
1964 }
1965 break;
1966 }
1967 case nir_op_fsat: {
1968 Temp src = get_alu_src(ctx, instr->src[0]);
1969 if (dst.regClass() == v2b) {
1970 bld.vop3(aco_opcode::v_med3_f16, Definition(dst), Operand((uint16_t)0u), Operand((uint16_t)0x3c00), src);
1971 } else if (dst.regClass() == v1) {
1972 bld.vop3(aco_opcode::v_med3_f32, Definition(dst), Operand(0u), Operand(0x3f800000u), src);
1973 /* apparently, it is not necessary to flush denorms if this instruction is used with these operands */
1974 // TODO: confirm that this holds under any circumstances
1975 } else if (dst.regClass() == v2) {
1976 Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst), src, Operand(0u));
1977 VOP3A_instruction* vop3 = static_cast<VOP3A_instruction*>(add);
1978 vop3->clamp = true;
1979 } else {
1980 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1981 nir_print_instr(&instr->instr, stderr);
1982 fprintf(stderr, "\n");
1983 }
1984 break;
1985 }
1986 case nir_op_flog2: {
1987 Temp src = get_alu_src(ctx, instr->src[0]);
1988 if (dst.regClass() == v2b) {
1989 emit_vop1_instruction(ctx, instr, aco_opcode::v_log_f16, dst);
1990 } else if (dst.regClass() == v1) {
1991 emit_log2(ctx, bld, Definition(dst), src);
1992 } else {
1993 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1994 nir_print_instr(&instr->instr, stderr);
1995 fprintf(stderr, "\n");
1996 }
1997 break;
1998 }
1999 case nir_op_frcp: {
2000 Temp src = get_alu_src(ctx, instr->src[0]);
2001 if (dst.regClass() == v2b) {
2002 emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f16, dst);
2003 } else if (dst.regClass() == v1) {
2004 emit_rcp(ctx, bld, Definition(dst), src);
2005 } else if (dst.regClass() == v2) {
2006 /* Lowered at NIR level for precision reasons. */
2007 emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f64, dst);
2008 } else {
2009 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2010 nir_print_instr(&instr->instr, stderr);
2011 fprintf(stderr, "\n");
2012 }
2013 break;
2014 }
2015 case nir_op_fexp2: {
2016 if (dst.regClass() == v2b) {
2017 emit_vop1_instruction(ctx, instr, aco_opcode::v_exp_f16, dst);
2018 } else if (dst.regClass() == v1) {
2019 emit_vop1_instruction(ctx, instr, aco_opcode::v_exp_f32, dst);
2020 } else {
2021 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2022 nir_print_instr(&instr->instr, stderr);
2023 fprintf(stderr, "\n");
2024 }
2025 break;
2026 }
2027 case nir_op_fsqrt: {
2028 Temp src = get_alu_src(ctx, instr->src[0]);
2029 if (dst.regClass() == v2b) {
2030 emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f16, dst);
2031 } else if (dst.regClass() == v1) {
2032 emit_sqrt(ctx, bld, Definition(dst), src);
2033 } else if (dst.regClass() == v2) {
2034 /* Lowered at NIR level for precision reasons. */
2035 emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f64, dst);
2036 } else {
2037 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2038 nir_print_instr(&instr->instr, stderr);
2039 fprintf(stderr, "\n");
2040 }
2041 break;
2042 }
2043 case nir_op_ffract: {
2044 if (dst.regClass() == v2b) {
2045 emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f16, dst);
2046 } else if (dst.regClass() == v1) {
2047 emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f32, dst);
2048 } else if (dst.regClass() == v2) {
2049 emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f64, dst);
2050 } else {
2051 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2052 nir_print_instr(&instr->instr, stderr);
2053 fprintf(stderr, "\n");
2054 }
2055 break;
2056 }
2057 case nir_op_ffloor: {
2058 Temp src = get_alu_src(ctx, instr->src[0]);
2059 if (dst.regClass() == v2b) {
2060 emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f16, dst);
2061 } else if (dst.regClass() == v1) {
2062 emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f32, dst);
2063 } else if (dst.regClass() == v2) {
2064 emit_floor_f64(ctx, bld, Definition(dst), src);
2065 } else {
2066 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2067 nir_print_instr(&instr->instr, stderr);
2068 fprintf(stderr, "\n");
2069 }
2070 break;
2071 }
2072 case nir_op_fceil: {
2073 Temp src0 = get_alu_src(ctx, instr->src[0]);
2074 if (dst.regClass() == v2b) {
2075 emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f16, dst);
2076 } else if (dst.regClass() == v1) {
2077 emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f32, dst);
2078 } else if (dst.regClass() == v2) {
2079 if (ctx->options->chip_class >= GFX7) {
2080 emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f64, dst);
2081 } else {
2082 /* GFX6 doesn't support V_CEIL_F64, lower it. */
2083 /* trunc = trunc(src0)
2084 * if (src0 > 0.0 && src0 != trunc)
2085 * trunc += 1.0
2086 */
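               /* The conditional +1.0 is built as a 64-bit value by selecting either
                * 0x3ff00000 (the high dword of 1.0) or 0 for the high half and pairing it
                * with a zero low dword before the final v_add_f64. */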
2087 Temp trunc = emit_trunc_f64(ctx, bld, bld.def(v2), src0);
2088 Temp tmp0 = bld.vopc_e64(aco_opcode::v_cmp_gt_f64, bld.def(bld.lm), src0, Operand(0u));
2089 Temp tmp1 = bld.vopc(aco_opcode::v_cmp_lg_f64, bld.hint_vcc(bld.def(bld.lm)), src0, trunc);
2090 Temp cond = bld.sop2(aco_opcode::s_and_b64, bld.hint_vcc(bld.def(s2)), bld.def(s1, scc), tmp0, tmp1);
2091 Temp add = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), bld.copy(bld.def(v1), Operand(0u)), bld.copy(bld.def(v1), Operand(0x3ff00000u)), cond);
2092 add = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), bld.copy(bld.def(v1), Operand(0u)), add);
2093 bld.vop3(aco_opcode::v_add_f64, Definition(dst), trunc, add);
2094 }
2095 } else {
2096 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2097 nir_print_instr(&instr->instr, stderr);
2098 fprintf(stderr, "\n");
2099 }
2100 break;
2101 }
2102 case nir_op_ftrunc: {
2103 Temp src = get_alu_src(ctx, instr->src[0]);
2104 if (dst.regClass() == v2b) {
2105 emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f16, dst);
2106 } else if (dst.regClass() == v1) {
2107 emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f32, dst);
2108 } else if (dst.regClass() == v2) {
2109 emit_trunc_f64(ctx, bld, Definition(dst), src);
2110 } else {
2111 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2112 nir_print_instr(&instr->instr, stderr);
2113 fprintf(stderr, "\n");
2114 }
2115 break;
2116 }
2117 case nir_op_fround_even: {
2118 Temp src0 = get_alu_src(ctx, instr->src[0]);
2119 if (dst.regClass() == v2b) {
2120 emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f16, dst);
2121 } else if (dst.regClass() == v1) {
2122 emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f32, dst);
2123 } else if (dst.regClass() == v2) {
2124 if (ctx->options->chip_class >= GFX7) {
2125 emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f64, dst);
2126 } else {
2127 /* GFX6 doesn't support V_RNDNE_F64, lower it. */
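               /* Classic round-to-nearest-even trick: adding and then subtracting 2^52
                * (high dword 0x43300000, combined with src0's sign via v_bfi) makes the FPU
                * round away the fractional bits.  Inputs whose magnitude is already >= 2^52
                * (the 0x432fffff... threshold below) are integral and are selected
                * unchanged. */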
2128 Temp src0_lo = bld.tmp(v1), src0_hi = bld.tmp(v1);
2129 bld.pseudo(aco_opcode::p_split_vector, Definition(src0_lo), Definition(src0_hi), src0);
2130
2131 Temp bitmask = bld.sop1(aco_opcode::s_brev_b32, bld.def(s1), bld.copy(bld.def(s1), Operand(-2u)));
2132 Temp bfi = bld.vop3(aco_opcode::v_bfi_b32, bld.def(v1), bitmask, bld.copy(bld.def(v1), Operand(0x43300000u)), as_vgpr(ctx, src0_hi));
2133 Temp tmp = bld.vop3(aco_opcode::v_add_f64, bld.def(v2), src0, bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(0u), bfi));
2134 Instruction *sub = bld.vop3(aco_opcode::v_add_f64, bld.def(v2), tmp, bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(0u), bfi));
2135 static_cast<VOP3A_instruction*>(sub)->neg[1] = true;
2136 tmp = sub->definitions[0].getTemp();
2137
2138 Temp v = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(-1u), Operand(0x432fffffu));
2139 Instruction* vop3 = bld.vopc_e64(aco_opcode::v_cmp_gt_f64, bld.hint_vcc(bld.def(bld.lm)), src0, v);
2140 static_cast<VOP3A_instruction*>(vop3)->abs[0] = true;
2141 Temp cond = vop3->definitions[0].getTemp();
2142
2143 Temp tmp_lo = bld.tmp(v1), tmp_hi = bld.tmp(v1);
2144 bld.pseudo(aco_opcode::p_split_vector, Definition(tmp_lo), Definition(tmp_hi), tmp);
2145 Temp dst0 = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp_lo, as_vgpr(ctx, src0_lo), cond);
2146 Temp dst1 = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp_hi, as_vgpr(ctx, src0_hi), cond);
2147
2148 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
2149 }
2150 } else {
2151 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2152 nir_print_instr(&instr->instr, stderr);
2153 fprintf(stderr, "\n");
2154 }
2155 break;
2156 }
2157 case nir_op_fsin:
2158 case nir_op_fcos: {
2159 Temp src = as_vgpr(ctx, get_alu_src(ctx, instr->src[0]));
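      /* Despite the variable name below, the scale constants (0x3118 for f16, 0x3e22f983
       * for f32) are 1/(2*pi): v_sin/v_cos expect their input in revolutions, not radians. */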
2161 if (dst.regClass() == v2b) {
2162 Temp half_pi = bld.copy(bld.def(s1), Operand(0x3118u));
2163 Temp tmp = bld.vop2(aco_opcode::v_mul_f16, bld.def(v1), half_pi, src);
2164 aco_opcode opcode = instr->op == nir_op_fsin ? aco_opcode::v_sin_f16 : aco_opcode::v_cos_f16;
2165 bld.vop1(opcode, Definition(dst), tmp);
2166 } else if (dst.regClass() == v1) {
2167 Temp half_pi = bld.copy(bld.def(s1), Operand(0x3e22f983u));
2168 Temp tmp = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), half_pi, src);
2169
2170 /* before GFX9, v_sin_f32 and v_cos_f32 had a valid input domain of [-256, +256] */
2171 if (ctx->options->chip_class < GFX9)
2172 tmp = bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), tmp);
2173
2174 aco_opcode opcode = instr->op == nir_op_fsin ? aco_opcode::v_sin_f32 : aco_opcode::v_cos_f32;
2175 bld.vop1(opcode, Definition(dst), tmp);
2176 } else {
2177 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2178 nir_print_instr(&instr->instr, stderr);
2179 fprintf(stderr, "\n");
2180 }
2181 break;
2182 }
2183 case nir_op_ldexp: {
2184 Temp src0 = get_alu_src(ctx, instr->src[0]);
2185 Temp src1 = get_alu_src(ctx, instr->src[1]);
2186 if (dst.regClass() == v2b) {
2187 emit_vop2_instruction(ctx, instr, aco_opcode::v_ldexp_f16, dst, false);
2188 } else if (dst.regClass() == v1) {
2189 bld.vop3(aco_opcode::v_ldexp_f32, Definition(dst), as_vgpr(ctx, src0), src1);
2190 } else if (dst.regClass() == v2) {
2191 bld.vop3(aco_opcode::v_ldexp_f64, Definition(dst), as_vgpr(ctx, src0), src1);
2192 } else {
2193 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2194 nir_print_instr(&instr->instr, stderr);
2195 fprintf(stderr, "\n");
2196 }
2197 break;
2198 }
2199 case nir_op_frexp_sig: {
2200 Temp src = get_alu_src(ctx, instr->src[0]);
2201 if (dst.regClass() == v2b) {
2202 bld.vop1(aco_opcode::v_frexp_mant_f16, Definition(dst), src);
2203 } else if (dst.regClass() == v1) {
2204 bld.vop1(aco_opcode::v_frexp_mant_f32, Definition(dst), src);
2205 } else if (dst.regClass() == v2) {
2206 bld.vop1(aco_opcode::v_frexp_mant_f64, Definition(dst), src);
2207 } else {
2208 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2209 nir_print_instr(&instr->instr, stderr);
2210 fprintf(stderr, "\n");
2211 }
2212 break;
2213 }
2214 case nir_op_frexp_exp: {
2215 Temp src = get_alu_src(ctx, instr->src[0]);
2216 if (instr->src[0].src.ssa->bit_size == 16) {
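         /* The exponent of an f16 always fits in a signed byte, so only the low byte of the
          * v_frexp_exp_i16_f16 result is kept and sign-extended from 8 to 32 bits. */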
2217 Temp tmp = bld.vop1(aco_opcode::v_frexp_exp_i16_f16, bld.def(v1), src);
2218 tmp = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v1b), tmp, Operand(0u));
2219 convert_int(ctx, bld, tmp, 8, 32, true, dst);
2220 } else if (instr->src[0].src.ssa->bit_size == 32) {
2221 bld.vop1(aco_opcode::v_frexp_exp_i32_f32, Definition(dst), src);
2222 } else if (instr->src[0].src.ssa->bit_size == 64) {
2223 bld.vop1(aco_opcode::v_frexp_exp_i32_f64, Definition(dst), src);
2224 } else {
2225 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2226 nir_print_instr(&instr->instr, stderr);
2227 fprintf(stderr, "\n");
2228 }
2229 break;
2230 }
2231 case nir_op_fsign: {
2232 Temp src = as_vgpr(ctx, get_alu_src(ctx, instr->src[0]));
2233 if (dst.regClass() == v2b) {
2234 Temp one = bld.copy(bld.def(v1), Operand(0x3c00u));
2235 Temp minus_one = bld.copy(bld.def(v1), Operand(0xbc00u));
2236 Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f16, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
2237 src = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), one, src, cond);
2238 cond = bld.vopc(aco_opcode::v_cmp_le_f16, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
2239 bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), minus_one, src, cond);
2240 } else if (dst.regClass() == v1) {
2241 Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f32, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
2242 src = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0x3f800000u), src, cond);
2243 cond = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
2244 bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0xbf800000u), src, cond);
2245 } else if (dst.regClass() == v2) {
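         /* Only the high dword needs selecting: the possible results (+1.0, -1.0, +/-0.0)
          * all have a zero low dword, so the destination is assembled as (0, upper). */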
2246 Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f64, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
2247 Temp tmp = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0x3FF00000u));
2248 Temp upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp, emit_extract_vector(ctx, src, 1, v1), cond);
2249
2250 cond = bld.vopc(aco_opcode::v_cmp_le_f64, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
2251 tmp = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0xBFF00000u));
2252 upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), tmp, upper, cond);
2253
2254 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand(0u), upper);
2255 } else {
2256 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2257 nir_print_instr(&instr->instr, stderr);
2258 fprintf(stderr, "\n");
2259 }
2260 break;
2261 }
2262 case nir_op_f2f16:
2263 case nir_op_f2f16_rtne: {
2264 Temp src = get_alu_src(ctx, instr->src[0]);
2265 if (instr->src[0].src.ssa->bit_size == 64)
2266 src = bld.vop1(aco_opcode::v_cvt_f32_f64, bld.def(v1), src);
2267 bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), src);
2268 break;
2269 }
2270 case nir_op_f2f16_rtz: {
2271 Temp src = get_alu_src(ctx, instr->src[0]);
2272 if (instr->src[0].src.ssa->bit_size == 64)
2273 src = bld.vop1(aco_opcode::v_cvt_f32_f64, bld.def(v1), src);
2274 bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32, Definition(dst), src, Operand(0u));
2275 break;
2276 }
2277 case nir_op_f2f32: {
2278 if (instr->src[0].src.ssa->bit_size == 16) {
2279 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f16, dst);
2280 } else if (instr->src[0].src.ssa->bit_size == 64) {
2281 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f64, dst);
2282 } else {
2283 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2284 nir_print_instr(&instr->instr, stderr);
2285 fprintf(stderr, "\n");
2286 }
2287 break;
2288 }
2289 case nir_op_f2f64: {
2290 Temp src = get_alu_src(ctx, instr->src[0]);
2291 if (instr->src[0].src.ssa->bit_size == 16)
2292 src = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src);
2293 bld.vop1(aco_opcode::v_cvt_f64_f32, Definition(dst), src);
2294 break;
2295 }
2296 case nir_op_i2f16: {
2297 assert(dst.regClass() == v2b);
2298 Temp src = get_alu_src(ctx, instr->src[0]);
2299 if (instr->src[0].src.ssa->bit_size == 8)
2300 src = convert_int(ctx, bld, src, 8, 16, true);
2301 else if (instr->src[0].src.ssa->bit_size == 64)
2302 src = convert_int(ctx, bld, src, 64, 32, false);
2303 bld.vop1(aco_opcode::v_cvt_f16_i16, Definition(dst), src);
2304 break;
2305 }
2306 case nir_op_i2f32: {
2307 assert(dst.size() == 1);
2308 Temp src = get_alu_src(ctx, instr->src[0]);
2309 if (instr->src[0].src.ssa->bit_size <= 16)
2310 src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, true);
2311 bld.vop1(aco_opcode::v_cvt_f32_i32, Definition(dst), src);
2312 break;
2313 }
2314 case nir_op_i2f64: {
2315 if (instr->src[0].src.ssa->bit_size <= 32) {
2316 Temp src = get_alu_src(ctx, instr->src[0]);
2317 if (instr->src[0].src.ssa->bit_size <= 16)
2318 src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, true);
2319 bld.vop1(aco_opcode::v_cvt_f64_i32, Definition(dst), src);
2320 } else if (instr->src[0].src.ssa->bit_size == 64) {
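            /* 64-bit integer to double: convert the low dword as unsigned and the high
             * dword as signed, scale the high part by 2^32 with v_ldexp_f64 and add the
             * two halves (only the final add can round). */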
2321 Temp src = get_alu_src(ctx, instr->src[0]);
2322 RegClass rc = RegClass(src.type(), 1);
2323 Temp lower = bld.tmp(rc), upper = bld.tmp(rc);
2324 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
2325 lower = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), lower);
2326 upper = bld.vop1(aco_opcode::v_cvt_f64_i32, bld.def(v2), upper);
2327 upper = bld.vop3(aco_opcode::v_ldexp_f64, bld.def(v2), upper, Operand(32u));
2328 bld.vop3(aco_opcode::v_add_f64, Definition(dst), lower, upper);
2329
2330 } else {
2331 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2332 nir_print_instr(&instr->instr, stderr);
2333 fprintf(stderr, "\n");
2334 }
2335 break;
2336 }
2337 case nir_op_u2f16: {
2338 assert(dst.regClass() == v2b);
2339 Temp src = get_alu_src(ctx, instr->src[0]);
2340 if (instr->src[0].src.ssa->bit_size == 8)
2341 src = convert_int(ctx, bld, src, 8, 16, false);
2342 else if (instr->src[0].src.ssa->bit_size == 64)
2343 src = convert_int(ctx, bld, src, 64, 32, false);
2344 bld.vop1(aco_opcode::v_cvt_f16_u16, Definition(dst), src);
2345 break;
2346 }
2347 case nir_op_u2f32: {
2348 assert(dst.size() == 1);
2349 Temp src = get_alu_src(ctx, instr->src[0]);
2350 if (instr->src[0].src.ssa->bit_size == 8) {
2351 bld.vop1(aco_opcode::v_cvt_f32_ubyte0, Definition(dst), src);
2352 } else {
2353 if (instr->src[0].src.ssa->bit_size == 16)
2354 src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, true);
2355 bld.vop1(aco_opcode::v_cvt_f32_u32, Definition(dst), src);
2356 }
2357 break;
2358 }
2359 case nir_op_u2f64: {
2360 if (instr->src[0].src.ssa->bit_size <= 32) {
2361 Temp src = get_alu_src(ctx, instr->src[0]);
2362 if (instr->src[0].src.ssa->bit_size <= 16)
2363 src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, false);
2364 bld.vop1(aco_opcode::v_cvt_f64_u32, Definition(dst), src);
2365 } else if (instr->src[0].src.ssa->bit_size == 64) {
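            /* Same split-and-scale approach as i2f64 above, but with both halves converted
             * as unsigned. */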
2366 Temp src = get_alu_src(ctx, instr->src[0]);
2367 RegClass rc = RegClass(src.type(), 1);
2368 Temp lower = bld.tmp(rc), upper = bld.tmp(rc);
2369 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
2370 lower = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), lower);
2371 upper = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), upper);
2372 upper = bld.vop3(aco_opcode::v_ldexp_f64, bld.def(v2), upper, Operand(32u));
2373 bld.vop3(aco_opcode::v_add_f64, Definition(dst), lower, upper);
2374 } else {
2375 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2376 nir_print_instr(&instr->instr, stderr);
2377 fprintf(stderr, "\n");
2378 }
2379 break;
2380 }
2381 case nir_op_f2i8:
2382 case nir_op_f2i16: {
2383 if (instr->src[0].src.ssa->bit_size == 16)
2384 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i16_f16, dst);
2385 else if (instr->src[0].src.ssa->bit_size == 32)
2386 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f32, dst);
2387 else
2388 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f64, dst);
2389 break;
2390 }
2391 case nir_op_f2u8:
2392 case nir_op_f2u16: {
2393 if (instr->src[0].src.ssa->bit_size == 16)
2394 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u16_f16, dst);
2395 else if (instr->src[0].src.ssa->bit_size == 32)
2396 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f32, dst);
2397 else
2398 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f64, dst);
2399 break;
2400 }
2401 case nir_op_f2i32: {
2402 Temp src = get_alu_src(ctx, instr->src[0]);
2403 if (instr->src[0].src.ssa->bit_size == 16) {
2404 Temp tmp = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src);
2405 if (dst.type() == RegType::vgpr) {
2406 bld.vop1(aco_opcode::v_cvt_i32_f32, Definition(dst), tmp);
2407 } else {
2408 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
2409 bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), tmp));
2410 }
2411 } else if (instr->src[0].src.ssa->bit_size == 32) {
2412 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f32, dst);
2413 } else if (instr->src[0].src.ssa->bit_size == 64) {
2414 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f64, dst);
2415 } else {
2416 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2417 nir_print_instr(&instr->instr, stderr);
2418 fprintf(stderr, "\n");
2419 }
2420 break;
2421 }
2422 case nir_op_f2u32: {
2423 Temp src = get_alu_src(ctx, instr->src[0]);
2424 if (instr->src[0].src.ssa->bit_size == 16) {
2425 Temp tmp = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src);
2426 if (dst.type() == RegType::vgpr) {
2427 bld.vop1(aco_opcode::v_cvt_u32_f32, Definition(dst), tmp);
2428 } else {
2429 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
2430 bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), tmp));
2431 }
2432 } else if (instr->src[0].src.ssa->bit_size == 32) {
2433 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f32, dst);
2434 } else if (instr->src[0].src.ssa->bit_size == 64) {
2435 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f64, dst);
2436 } else {
2437 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2438 nir_print_instr(&instr->instr, stderr);
2439 fprintf(stderr, "\n");
2440 }
2441 break;
2442 }
2443 case nir_op_f2i64: {
2444 Temp src = get_alu_src(ctx, instr->src[0]);
2445 if (instr->src[0].src.ssa->bit_size == 16)
2446 src = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src);
2447
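      /* No native f32 -> i64 conversion exists, so the float is decomposed manually: the
       * (frexp) exponent is clamped to [0, 64], the 24-bit mantissa (with its implicit
       * leading one) is placed in a 64-bit temporary and shifted right by 63 - exponent,
       * and the sign is applied afterwards by xor-ing with and then subtracting the sign
       * mask (a two's-complement negation when the sign is all ones).  When the clamped
       * exponent reaches 64 the 63 - exponent subtraction borrows, which selects the
       * saturation pattern instead. */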
2448 if (instr->src[0].src.ssa->bit_size <= 32 && dst.type() == RegType::vgpr) {
2449 Temp exponent = bld.vop1(aco_opcode::v_frexp_exp_i32_f32, bld.def(v1), src);
2450 exponent = bld.vop3(aco_opcode::v_med3_i32, bld.def(v1), Operand(0x0u), exponent, Operand(64u));
2451 Temp mantissa = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7fffffu), src);
2452 Temp sign = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), src);
2453 mantissa = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand(0x800000u), mantissa);
2454 mantissa = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(7u), mantissa);
2455 mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(0u), mantissa);
2456 Temp new_exponent = bld.tmp(v1);
2457 Temp borrow = bld.vsub32(Definition(new_exponent), Operand(63u), exponent, true).def(1).getTemp();
2458 if (ctx->program->chip_class >= GFX8)
2459 mantissa = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), new_exponent, mantissa);
2460 else
2461 mantissa = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), mantissa, new_exponent);
2462 Temp saturate = bld.vop1(aco_opcode::v_bfrev_b32, bld.def(v1), Operand(0xfffffffeu));
2463 Temp lower = bld.tmp(v1), upper = bld.tmp(v1);
2464 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
2465 lower = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), lower, Operand(0xffffffffu), borrow);
2466 upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), upper, saturate, borrow);
2467 lower = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), sign, lower);
2468 upper = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), sign, upper);
2469 Temp new_lower = bld.tmp(v1);
2470 borrow = bld.vsub32(Definition(new_lower), lower, sign, true).def(1).getTemp();
2471 Temp new_upper = bld.vsub32(bld.def(v1), upper, sign, false, borrow);
2472 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), new_lower, new_upper);
2473
2474 } else if (instr->src[0].src.ssa->bit_size <= 32 && dst.type() == RegType::sgpr) {
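         /* Scalar version of the same decomposition, using s_bfe/s_lshr_b64/s_cselect so
          * the conversion stays entirely on the SALU. */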
2475 if (src.type() == RegType::vgpr)
2476 src = bld.as_uniform(src);
2477 Temp exponent = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), src, Operand(0x80017u));
2478 exponent = bld.sop2(aco_opcode::s_sub_i32, bld.def(s1), bld.def(s1, scc), exponent, Operand(126u));
2479 exponent = bld.sop2(aco_opcode::s_max_i32, bld.def(s1), bld.def(s1, scc), Operand(0u), exponent);
2480 exponent = bld.sop2(aco_opcode::s_min_i32, bld.def(s1), bld.def(s1, scc), Operand(64u), exponent);
2481 Temp mantissa = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), Operand(0x7fffffu), src);
2482 Temp sign = bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), src, Operand(31u));
2483 mantissa = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), Operand(0x800000u), mantissa);
2484 mantissa = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), mantissa, Operand(7u));
2485 mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), mantissa);
2486 exponent = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), Operand(63u), exponent);
2487 mantissa = bld.sop2(aco_opcode::s_lshr_b64, bld.def(s2), bld.def(s1, scc), mantissa, exponent);
2488 Temp cond = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), exponent, Operand(0xffffffffu)); // exp >= 64
2489 Temp saturate = bld.sop1(aco_opcode::s_brev_b64, bld.def(s2), Operand(0xfffffffeu));
2490 mantissa = bld.sop2(aco_opcode::s_cselect_b64, bld.def(s2), saturate, mantissa, cond);
2491 Temp lower = bld.tmp(s1), upper = bld.tmp(s1);
2492 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
2493 lower = bld.sop2(aco_opcode::s_xor_b32, bld.def(s1), bld.def(s1, scc), sign, lower);
2494 upper = bld.sop2(aco_opcode::s_xor_b32, bld.def(s1), bld.def(s1, scc), sign, upper);
2495 Temp borrow = bld.tmp(s1);
2496 lower = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), lower, sign);
2497 upper = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), upper, sign, borrow);
2498 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
2499
2500 } else if (instr->src[0].src.ssa->bit_size == 64) {
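         /* For a double source the value is split arithmetically: multiply the truncated
          * input by 2^-32 (high dword 0x3df00000) and floor it to get the upper 32 bits,
          * then fma(floor, -2^32, trunc) (high dword 0xc1f00000) recovers the lower 32
          * bits; each half then fits the native 32-bit conversions. */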
2501 Temp vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0x3df00000u));
2502 Temp trunc = emit_trunc_f64(ctx, bld, bld.def(v2), src);
2503 Temp mul = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), trunc, vec);
2504 vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0xc1f00000u));
2505 Temp floor = emit_floor_f64(ctx, bld, bld.def(v2), mul);
2506 Temp fma = bld.vop3(aco_opcode::v_fma_f64, bld.def(v2), floor, vec, trunc);
2507 Temp lower = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), fma);
2508 Temp upper = bld.vop1(aco_opcode::v_cvt_i32_f64, bld.def(v1), floor);
2509 if (dst.type() == RegType::sgpr) {
2510 lower = bld.as_uniform(lower);
2511 upper = bld.as_uniform(upper);
2512 }
2513 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
2514
2515 } else {
2516 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2517 nir_print_instr(&instr->instr, stderr);
2518 fprintf(stderr, "\n");
2519 }
2520 break;
2521 }
2522 case nir_op_f2u64: {
2523 Temp src = get_alu_src(ctx, instr->src[0]);
2524 if (instr->src[0].src.ssa->bit_size == 16)
2525 src = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src);
2526
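      /* Unsigned variant of the f2i64 lowering above: exponents below 24 use a plain
       * 32-bit right shift of the mantissa (the "small" path), larger exponents shift the
       * mantissa left within 64 bits, and exponents above 64 saturate both dwords to
       * 0xffffffff. */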
2527 if (instr->src[0].src.ssa->bit_size <= 32 && dst.type() == RegType::vgpr) {
2528 Temp exponent = bld.vop1(aco_opcode::v_frexp_exp_i32_f32, bld.def(v1), src);
2529 Temp exponent_in_range = bld.vopc(aco_opcode::v_cmp_ge_i32, bld.hint_vcc(bld.def(bld.lm)), Operand(64u), exponent);
2530 exponent = bld.vop2(aco_opcode::v_max_i32, bld.def(v1), Operand(0x0u), exponent);
2531 Temp mantissa = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7fffffu), src);
2532 mantissa = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand(0x800000u), mantissa);
2533 Temp exponent_small = bld.vsub32(bld.def(v1), Operand(24u), exponent);
2534 Temp small = bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), exponent_small, mantissa);
2535 mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(0u), mantissa);
2536 Temp new_exponent = bld.tmp(v1);
2537 Temp cond_small = bld.vsub32(Definition(new_exponent), exponent, Operand(24u), true).def(1).getTemp();
2538 if (ctx->program->chip_class >= GFX8)
2539 mantissa = bld.vop3(aco_opcode::v_lshlrev_b64, bld.def(v2), new_exponent, mantissa);
2540 else
2541 mantissa = bld.vop3(aco_opcode::v_lshl_b64, bld.def(v2), mantissa, new_exponent);
2542 Temp lower = bld.tmp(v1), upper = bld.tmp(v1);
2543 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
2544 lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), lower, small, cond_small);
2545 upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), upper, Operand(0u), cond_small);
2546 lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0xffffffffu), lower, exponent_in_range);
2547 upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0xffffffffu), upper, exponent_in_range);
2548 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
2549
2550 } else if (instr->src[0].src.ssa->bit_size <= 32 && dst.type() == RegType::sgpr) {
2551 if (src.type() == RegType::vgpr)
2552 src = bld.as_uniform(src);
2553 Temp exponent = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), src, Operand(0x80017u));
2554 exponent = bld.sop2(aco_opcode::s_sub_i32, bld.def(s1), bld.def(s1, scc), exponent, Operand(126u));
2555 exponent = bld.sop2(aco_opcode::s_max_i32, bld.def(s1), bld.def(s1, scc), Operand(0u), exponent);
2556 Temp mantissa = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), Operand(0x7fffffu), src);
2557 mantissa = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), Operand(0x800000u), mantissa);
2558 Temp exponent_small = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), Operand(24u), exponent);
2559 Temp small = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), mantissa, exponent_small);
2560 mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), mantissa);
2561 Temp exponent_large = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), exponent, Operand(24u));
2562 mantissa = bld.sop2(aco_opcode::s_lshl_b64, bld.def(s2), bld.def(s1, scc), mantissa, exponent_large);
2563 Temp cond = bld.sopc(aco_opcode::s_cmp_ge_i32, bld.def(s1, scc), Operand(64u), exponent);
2564 mantissa = bld.sop2(aco_opcode::s_cselect_b64, bld.def(s2), mantissa, Operand(0xffffffffu), cond);
2565 Temp lower = bld.tmp(s1), upper = bld.tmp(s1);
2566 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
2567 Temp cond_small = bld.sopc(aco_opcode::s_cmp_le_i32, bld.def(s1, scc), exponent, Operand(24u));
2568 lower = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), small, lower, cond_small);
2569 upper = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), Operand(0u), upper, cond_small);
2570 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
2571
2572 } else if (instr->src[0].src.ssa->bit_size == 64) {
2573 Temp vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0x3df00000u));
2574 Temp trunc = emit_trunc_f64(ctx, bld, bld.def(v2), src);
2575 Temp mul = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), trunc, vec);
2576 vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0xc1f00000u));
2577 Temp floor = emit_floor_f64(ctx, bld, bld.def(v2), mul);
2578 Temp fma = bld.vop3(aco_opcode::v_fma_f64, bld.def(v2), floor, vec, trunc);
2579 Temp lower = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), fma);
2580 Temp upper = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), floor);
2581 if (dst.type() == RegType::sgpr) {
2582 lower = bld.as_uniform(lower);
2583 upper = bld.as_uniform(upper);
2584 }
2585 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
2586
2587 } else {
2588 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2589 nir_print_instr(&instr->instr, stderr);
2590 fprintf(stderr, "\n");
2591 }
2592 break;
2593 }
2594 case nir_op_b2f16: {
2595 Temp src = get_alu_src(ctx, instr->src[0]);
2596 assert(src.regClass() == bld.lm);
2597
2598 if (dst.regClass() == s1) {
2599 src = bool_to_scalar_condition(ctx, src);
2600 bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand(0x3c00u), src);
2601 } else if (dst.regClass() == v2b) {
2602 Temp one = bld.copy(bld.def(v1), Operand(0x3c00u));
2603 bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), one, src);
2604 } else {
2605 unreachable("Wrong destination register class for nir_op_b2f16.");
2606 }
2607 break;
2608 }
2609 case nir_op_b2f32: {
2610 Temp src = get_alu_src(ctx, instr->src[0]);
2611 assert(src.regClass() == bld.lm);
2612
2613 if (dst.regClass() == s1) {
2614 src = bool_to_scalar_condition(ctx, src);
2615 bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand(0x3f800000u), src);
2616 } else if (dst.regClass() == v1) {
2617 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand(0x3f800000u), src);
2618 } else {
2619 unreachable("Wrong destination register class for nir_op_b2f32.");
2620 }
2621 break;
2622 }
2623 case nir_op_b2f64: {
2624 Temp src = get_alu_src(ctx, instr->src[0]);
2625 assert(src.regClass() == bld.lm);
2626
2627 if (dst.regClass() == s2) {
2628 src = bool_to_scalar_condition(ctx, src);
2629 bld.sop2(aco_opcode::s_cselect_b64, Definition(dst), Operand(0x3f800000u), Operand(0u), bld.scc(src));
2630 } else if (dst.regClass() == v2) {
2631             Temp one = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0x3FF00000u));
2632 Temp upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), one, src);
2633 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand(0u), upper);
2634 } else {
2635 unreachable("Wrong destination register class for nir_op_b2f64.");
2636 }
2637 break;
2638 }
2639 case nir_op_i2i8:
2640 case nir_op_i2i16:
2641 case nir_op_i2i32:
2642 case nir_op_i2i64: {
2643 convert_int(ctx, bld, get_alu_src(ctx, instr->src[0]),
2644 instr->src[0].src.ssa->bit_size, instr->dest.dest.ssa.bit_size, true, dst);
2645 break;
2646 }
2647 case nir_op_u2u8:
2648 case nir_op_u2u16:
2649 case nir_op_u2u32:
2650 case nir_op_u2u64: {
2651 convert_int(ctx, bld, get_alu_src(ctx, instr->src[0]),
2652 instr->src[0].src.ssa->bit_size, instr->dest.dest.ssa.bit_size, false, dst);
2653 break;
2654 }
2655 case nir_op_b2b32:
2656 case nir_op_b2i32: {
2657 Temp src = get_alu_src(ctx, instr->src[0]);
2658 assert(src.regClass() == bld.lm);
2659
2660 if (dst.regClass() == s1) {
2661 // TODO: in a post-RA optimization, we can check if src is in VCC, and directly use VCCNZ
2662 bool_to_scalar_condition(ctx, src, dst);
2663 } else if (dst.regClass() == v1) {
2664 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand(1u), src);
2665 } else {
2666 unreachable("Invalid register class for b2i32");
2667 }
2668 break;
2669 }
2670 case nir_op_b2b1:
2671 case nir_op_i2b1: {
2672 Temp src = get_alu_src(ctx, instr->src[0]);
2673 assert(dst.regClass() == bld.lm);
2674
2675 if (src.type() == RegType::vgpr) {
2676 assert(src.regClass() == v1 || src.regClass() == v2);
2677 assert(dst.regClass() == bld.lm);
2678 bld.vopc(src.size() == 2 ? aco_opcode::v_cmp_lg_u64 : aco_opcode::v_cmp_lg_u32,
2679 Definition(dst), Operand(0u), src).def(0).setHint(vcc);
2680 } else {
2681 assert(src.regClass() == s1 || src.regClass() == s2);
2682 Temp tmp;
2683 if (src.regClass() == s2 && ctx->program->chip_class <= GFX7) {
2684 tmp = bld.sop2(aco_opcode::s_or_b64, bld.def(s2), bld.def(s1, scc), Operand(0u), src).def(1).getTemp();
2685 } else {
2686 tmp = bld.sopc(src.size() == 2 ? aco_opcode::s_cmp_lg_u64 : aco_opcode::s_cmp_lg_u32,
2687 bld.scc(bld.def(s1)), Operand(0u), src);
2688 }
2689 bool_to_vector_condition(ctx, tmp, dst);
2690 }
2691 break;
2692 }
2693 case nir_op_pack_64_2x32_split: {
2694 Temp src0 = get_alu_src(ctx, instr->src[0]);
2695 Temp src1 = get_alu_src(ctx, instr->src[1]);
2696
2697 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src0, src1);
2698 break;
2699 }
2700 case nir_op_unpack_64_2x32_split_x:
2701 bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(dst.regClass()), get_alu_src(ctx, instr->src[0]));
2702 break;
2703 case nir_op_unpack_64_2x32_split_y:
2704 bld.pseudo(aco_opcode::p_split_vector, bld.def(dst.regClass()), Definition(dst), get_alu_src(ctx, instr->src[0]));
2705 break;
2706 case nir_op_unpack_32_2x16_split_x:
2707 if (dst.type() == RegType::vgpr) {
2708 bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(dst.regClass()), get_alu_src(ctx, instr->src[0]));
2709 } else {
2710 bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0]));
2711 }
2712 break;
2713 case nir_op_unpack_32_2x16_split_y:
2714 if (dst.type() == RegType::vgpr) {
2715 bld.pseudo(aco_opcode::p_split_vector, bld.def(dst.regClass()), Definition(dst), get_alu_src(ctx, instr->src[0]));
2716 } else {
2717 bld.sop2(aco_opcode::s_bfe_u32, Definition(dst), bld.def(s1, scc), get_alu_src(ctx, instr->src[0]), Operand(uint32_t(16 << 16 | 16)));
2718 }
2719 break;
2720 case nir_op_pack_32_2x16_split: {
2721 Temp src0 = get_alu_src(ctx, instr->src[0]);
2722 Temp src1 = get_alu_src(ctx, instr->src[1]);
2723 if (dst.regClass() == v1) {
2724 src0 = emit_extract_vector(ctx, src0, 0, v2b);
2725 src1 = emit_extract_vector(ctx, src1, 0, v2b);
2726 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src0, src1);
2727 } else {
2728 src0 = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), src0, Operand(0xFFFFu));
2729 src1 = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), src1, Operand(16u));
2730 bld.sop2(aco_opcode::s_or_b32, Definition(dst), bld.def(s1, scc), src0, src1);
2731 }
2732 break;
2733 }
2734 case nir_op_pack_half_2x16: {
2735 Temp src = get_alu_src(ctx, instr->src[0], 2);
2736
2737 if (dst.regClass() == v1) {
2738 Temp src0 = bld.tmp(v1);
2739 Temp src1 = bld.tmp(v1);
2740 bld.pseudo(aco_opcode::p_split_vector, Definition(src0), Definition(src1), src);
2741 if (!ctx->block->fp_mode.care_about_round32 || ctx->block->fp_mode.round32 == fp_round_tz)
2742 bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32, Definition(dst), src0, src1);
2743 else
2744 bld.vop3(aco_opcode::v_cvt_pk_u16_u32, Definition(dst),
2745 bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v1), src0),
2746 bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v1), src1));
2747 } else {
2748 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2749 nir_print_instr(&instr->instr, stderr);
2750 fprintf(stderr, "\n");
2751 }
2752 break;
2753 }
2754 case nir_op_unpack_half_2x16_split_x: {
2755 if (dst.regClass() == v1) {
2756 bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst), get_alu_src(ctx, instr->src[0]));
2757 } else {
2758 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2759 nir_print_instr(&instr->instr, stderr);
2760 fprintf(stderr, "\n");
2761 }
2762 break;
2763 }
2764 case nir_op_unpack_half_2x16_split_y: {
2765 if (dst.regClass() == v1) {
2766 /* TODO: use SDWA here */
2767 bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst),
2768 bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), Operand(16u), as_vgpr(ctx, get_alu_src(ctx, instr->src[0]))));
2769 } else {
2770 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2771 nir_print_instr(&instr->instr, stderr);
2772 fprintf(stderr, "\n");
2773 }
2774 break;
2775 }
2776 case nir_op_fquantize2f16: {
2777 Temp src = get_alu_src(ctx, instr->src[0]);
2778 Temp f16 = bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v1), src);
2779 Temp f32, cmp_res;
2780
2781 if (ctx->program->chip_class >= GFX8) {
2782 Temp mask = bld.copy(bld.def(s1), Operand(0x36Fu)); /* value is NOT negative/positive denormal value */
2783 cmp_res = bld.vopc_e64(aco_opcode::v_cmp_class_f16, bld.hint_vcc(bld.def(bld.lm)), f16, mask);
2784 f32 = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), f16);
2785 } else {
2786 /* 0x38800000 is smallest half float value (2^-14) in 32-bit float,
2787 * so compare the result and flush to 0 if it's smaller.
2788 */
2789 f32 = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), f16);
2790 Temp smallest = bld.copy(bld.def(s1), Operand(0x38800000u));
2791 Instruction* vop3 = bld.vopc_e64(aco_opcode::v_cmp_nlt_f32, bld.hint_vcc(bld.def(bld.lm)), f32, smallest);
2792 static_cast<VOP3A_instruction*>(vop3)->abs[0] = true;
2793 cmp_res = vop3->definitions[0].getTemp();
2794 }
2795
2796 if (ctx->block->fp_mode.preserve_signed_zero_inf_nan32 || ctx->program->chip_class < GFX8) {
2797 Temp copysign_0 = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0u), as_vgpr(ctx, src));
2798 bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), copysign_0, f32, cmp_res);
2799 } else {
2800 bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), f32, cmp_res);
2801 }
2802 break;
2803 }
2804 case nir_op_bfm: {
2805 Temp bits = get_alu_src(ctx, instr->src[0]);
2806 Temp offset = get_alu_src(ctx, instr->src[1]);
2807
2808 if (dst.regClass() == s1) {
2809 bld.sop2(aco_opcode::s_bfm_b32, Definition(dst), bits, offset);
2810 } else if (dst.regClass() == v1) {
2811 bld.vop3(aco_opcode::v_bfm_b32, Definition(dst), bits, offset);
2812 } else {
2813 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2814 nir_print_instr(&instr->instr, stderr);
2815 fprintf(stderr, "\n");
2816 }
2817 break;
2818 }
2819 case nir_op_bitfield_select: {
2820 /* (mask & insert) | (~mask & base) */
2821 Temp bitmask = get_alu_src(ctx, instr->src[0]);
2822 Temp insert = get_alu_src(ctx, instr->src[1]);
2823 Temp base = get_alu_src(ctx, instr->src[2]);
2824
2825 /* dst = (insert & bitmask) | (base & ~bitmask) */
2826 if (dst.regClass() == s1) {
2827 aco_ptr<Instruction> sop2;
2828 nir_const_value* const_bitmask = nir_src_as_const_value(instr->src[0].src);
2829 nir_const_value* const_insert = nir_src_as_const_value(instr->src[1].src);
2830 Operand lhs;
2831 if (const_insert && const_bitmask) {
2832 lhs = Operand(const_insert->u32 & const_bitmask->u32);
2833 } else {
2834 insert = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), insert, bitmask);
2835 lhs = Operand(insert);
2836 }
2837
2838 Operand rhs;
2839 nir_const_value* const_base = nir_src_as_const_value(instr->src[2].src);
2840 if (const_base && const_bitmask) {
2841 rhs = Operand(const_base->u32 & ~const_bitmask->u32);
2842 } else {
2843 base = bld.sop2(aco_opcode::s_andn2_b32, bld.def(s1), bld.def(s1, scc), base, bitmask);
2844 rhs = Operand(base);
2845 }
2846
2847 bld.sop2(aco_opcode::s_or_b32, Definition(dst), bld.def(s1, scc), rhs, lhs);
2848
2849 } else if (dst.regClass() == v1) {
2850 if (base.type() == RegType::sgpr && (bitmask.type() == RegType::sgpr || (insert.type() == RegType::sgpr)))
2851 base = as_vgpr(ctx, base);
2852 if (insert.type() == RegType::sgpr && bitmask.type() == RegType::sgpr)
2853 insert = as_vgpr(ctx, insert);
2854
2855 bld.vop3(aco_opcode::v_bfi_b32, Definition(dst), bitmask, insert, base);
2856
2857 } else {
2858 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2859 nir_print_instr(&instr->instr, stderr);
2860 fprintf(stderr, "\n");
2861 }
2862 break;
2863 }
2864 case nir_op_ubfe:
2865 case nir_op_ibfe: {
2866 Temp base = get_alu_src(ctx, instr->src[0]);
2867 Temp offset = get_alu_src(ctx, instr->src[1]);
2868 Temp bits = get_alu_src(ctx, instr->src[2]);
2869
2870 if (dst.type() == RegType::sgpr) {
2871 Operand extract;
2872 nir_const_value* const_offset = nir_src_as_const_value(instr->src[1].src);
2873 nir_const_value* const_bits = nir_src_as_const_value(instr->src[2].src);
2874 if (const_offset && const_bits) {
2875 uint32_t const_extract = (const_bits->u32 << 16) | const_offset->u32;
2876 extract = Operand(const_extract);
2877 } else {
2878 Operand width;
2879 if (const_bits) {
2880 width = Operand(const_bits->u32 << 16);
2881 } else {
2882 width = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), bits, Operand(16u));
2883 }
2884 extract = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), offset, width);
2885 }
2886
2887 aco_opcode opcode;
2888 if (dst.regClass() == s1) {
2889 if (instr->op == nir_op_ubfe)
2890 opcode = aco_opcode::s_bfe_u32;
2891 else
2892 opcode = aco_opcode::s_bfe_i32;
2893 } else if (dst.regClass() == s2) {
2894 if (instr->op == nir_op_ubfe)
2895 opcode = aco_opcode::s_bfe_u64;
2896 else
2897 opcode = aco_opcode::s_bfe_i64;
2898 } else {
2899 unreachable("Unsupported BFE bit size");
2900 }
2901
2902 bld.sop2(opcode, Definition(dst), bld.def(s1, scc), base, extract);
2903
2904 } else {
2905 aco_opcode opcode;
2906 if (dst.regClass() == v1) {
2907 if (instr->op == nir_op_ubfe)
2908 opcode = aco_opcode::v_bfe_u32;
2909 else
2910 opcode = aco_opcode::v_bfe_i32;
2911 } else {
2912 unreachable("Unsupported BFE bit size");
2913 }
2914
2915 emit_vop3a_instruction(ctx, instr, opcode, dst);
2916 }
2917 break;
2918 }
2919 case nir_op_bit_count: {
2920 Temp src = get_alu_src(ctx, instr->src[0]);
2921 if (src.regClass() == s1) {
2922 bld.sop1(aco_opcode::s_bcnt1_i32_b32, Definition(dst), bld.def(s1, scc), src);
2923 } else if (src.regClass() == v1) {
2924 bld.vop3(aco_opcode::v_bcnt_u32_b32, Definition(dst), src, Operand(0u));
2925 } else if (src.regClass() == v2) {
2926 bld.vop3(aco_opcode::v_bcnt_u32_b32, Definition(dst),
2927 emit_extract_vector(ctx, src, 1, v1),
2928 bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1),
2929 emit_extract_vector(ctx, src, 0, v1), Operand(0u)));
2930 } else if (src.regClass() == s2) {
2931 bld.sop1(aco_opcode::s_bcnt1_i32_b64, Definition(dst), bld.def(s1, scc), src);
2932 } else {
2933 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2934 nir_print_instr(&instr->instr, stderr);
2935 fprintf(stderr, "\n");
2936 }
2937 break;
2938 }
2939 case nir_op_flt: {
2940 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_f16, aco_opcode::v_cmp_lt_f32, aco_opcode::v_cmp_lt_f64);
2941 break;
2942 }
2943 case nir_op_fge: {
2944 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_f16, aco_opcode::v_cmp_ge_f32, aco_opcode::v_cmp_ge_f64);
2945 break;
2946 }
2947 case nir_op_feq: {
2948 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_eq_f16, aco_opcode::v_cmp_eq_f32, aco_opcode::v_cmp_eq_f64);
2949 break;
2950 }
2951 case nir_op_fne: {
2952 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_neq_f16, aco_opcode::v_cmp_neq_f32, aco_opcode::v_cmp_neq_f64);
2953 break;
2954 }
2955 case nir_op_ilt: {
2956 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_i16, aco_opcode::v_cmp_lt_i32, aco_opcode::v_cmp_lt_i64, aco_opcode::s_cmp_lt_i32);
2957 break;
2958 }
2959 case nir_op_ige: {
2960 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_i16, aco_opcode::v_cmp_ge_i32, aco_opcode::v_cmp_ge_i64, aco_opcode::s_cmp_ge_i32);
2961 break;
2962 }
2963 case nir_op_ieq: {
2964 if (instr->src[0].src.ssa->bit_size == 1)
2965 emit_boolean_logic(ctx, instr, Builder::s_xnor, dst);
2966 else
2967 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_eq_i16, aco_opcode::v_cmp_eq_i32, aco_opcode::v_cmp_eq_i64, aco_opcode::s_cmp_eq_i32,
2968 ctx->program->chip_class >= GFX8 ? aco_opcode::s_cmp_eq_u64 : aco_opcode::num_opcodes);
2969 break;
2970 }
2971 case nir_op_ine: {
2972 if (instr->src[0].src.ssa->bit_size == 1)
2973 emit_boolean_logic(ctx, instr, Builder::s_xor, dst);
2974 else
2975 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lg_i16, aco_opcode::v_cmp_lg_i32, aco_opcode::v_cmp_lg_i64, aco_opcode::s_cmp_lg_i32,
2976 ctx->program->chip_class >= GFX8 ? aco_opcode::s_cmp_lg_u64 : aco_opcode::num_opcodes);
2977 break;
2978 }
2979 case nir_op_ult: {
2980 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_u16, aco_opcode::v_cmp_lt_u32, aco_opcode::v_cmp_lt_u64, aco_opcode::s_cmp_lt_u32);
2981 break;
2982 }
2983 case nir_op_uge: {
2984 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_u16, aco_opcode::v_cmp_ge_u32, aco_opcode::v_cmp_ge_u64, aco_opcode::s_cmp_ge_u32);
2985 break;
2986 }
2987 case nir_op_fddx:
2988 case nir_op_fddy:
2989 case nir_op_fddx_fine:
2990 case nir_op_fddy_fine:
2991 case nir_op_fddx_coarse:
2992 case nir_op_fddy_coarse: {
2993 Temp src = get_alu_src(ctx, instr->src[0]);
2994 uint16_t dpp_ctrl1, dpp_ctrl2;
2995 if (instr->op == nir_op_fddx_fine) {
2996 dpp_ctrl1 = dpp_quad_perm(0, 0, 2, 2);
2997 dpp_ctrl2 = dpp_quad_perm(1, 1, 3, 3);
2998 } else if (instr->op == nir_op_fddy_fine) {
2999 dpp_ctrl1 = dpp_quad_perm(0, 1, 0, 1);
3000 dpp_ctrl2 = dpp_quad_perm(2, 3, 2, 3);
3001 } else {
3002 dpp_ctrl1 = dpp_quad_perm(0, 0, 0, 0);
3003 if (instr->op == nir_op_fddx || instr->op == nir_op_fddx_coarse)
3004 dpp_ctrl2 = dpp_quad_perm(1, 1, 1, 1);
3005 else
3006 dpp_ctrl2 = dpp_quad_perm(2, 2, 2, 2);
3007 }
3008
3009 Temp tmp;
3010 if (ctx->program->chip_class >= GFX8) {
3011 Temp tl = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl1);
3012 tmp = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), src, tl, dpp_ctrl2);
3013 } else {
3014 Temp tl = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl1);
3015 Temp tr = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl2);
3016 tmp = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), tr, tl);
3017 }
3018 emit_wqm(ctx, tmp, dst, true);
3019 break;
3020 }
3021 default:
3022 fprintf(stderr, "Unknown NIR ALU instr: ");
3023 nir_print_instr(&instr->instr, stderr);
3024 fprintf(stderr, "\n");
3025 }
3026 }
3027
3028 void visit_load_const(isel_context *ctx, nir_load_const_instr *instr)
3029 {
3030 Temp dst = get_ssa_temp(ctx, &instr->def);
3031
3032 // TODO: we really want to have the resulting type as this would allow for 64bit literals
3033 // which get truncated the lsb if double and msb if int
3034 // for now, we only use s_mov_b64 with 64bit inline constants
3035 assert(instr->def.num_components == 1 && "Vector load_const should be lowered to scalar.");
3036 assert(dst.type() == RegType::sgpr);
3037
3038 Builder bld(ctx->program, ctx->block);
3039
3040 if (instr->def.bit_size == 1) {
3041 assert(dst.regClass() == bld.lm);
3042 int val = instr->value[0].b ? -1 : 0;
3043 Operand op = bld.lm.size() == 1 ? Operand((uint32_t) val) : Operand((uint64_t) val);
3044 bld.sop1(Builder::s_mov, Definition(dst), op);
3045 } else if (instr->def.bit_size == 8) {
3046 /* ensure that the value is correctly represented in the low byte of the register */
3047 bld.sopk(aco_opcode::s_movk_i32, Definition(dst), instr->value[0].u8);
3048 } else if (instr->def.bit_size == 16) {
3049 /* ensure that the value is correctly represented in the low half of the register */
3050 bld.sopk(aco_opcode::s_movk_i32, Definition(dst), instr->value[0].u16);
3051 } else if (dst.size() == 1) {
3052 bld.copy(Definition(dst), Operand(instr->value[0].u32));
3053 } else {
3054 assert(dst.size() != 1);
3055 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
3056 if (instr->def.bit_size == 64)
3057 for (unsigned i = 0; i < dst.size(); i++)
3058 vec->operands[i] = Operand{(uint32_t)(instr->value[0].u64 >> i * 32)};
3059 else {
3060 for (unsigned i = 0; i < dst.size(); i++)
3061 vec->operands[i] = Operand{instr->value[i].u32};
3062 }
3063 vec->definitions[0] = Definition(dst);
3064 ctx->block->instructions.emplace_back(std::move(vec));
3065 }
3066 }
3067
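/* Widen a per-element write mask to a finer granularity: each set bit i of "mask"
 * becomes "multiplier" consecutive set bits starting at i * multiplier.
 * Illustrative example: widen_mask(0b101, 2) == 0b110011 and widen_mask(0x1, 4) == 0xf. */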
3068 uint32_t widen_mask(uint32_t mask, unsigned multiplier)
3069 {
3070 uint32_t new_mask = 0;
3071 for(unsigned i = 0; i < 32 && (1u << i) <= mask; ++i)
3072 if (mask & (1u << i))
3073 new_mask |= ((1u << multiplier) - 1u) << (i * multiplier);
3074 return new_mask;
3075 }
3076
3077 struct LoadEmitInfo {
3078 Operand offset;
3079 Temp dst;
3080 unsigned num_components;
3081 unsigned component_size;
3082 Temp resource = Temp(0, s1);
3083 unsigned component_stride = 0;
3084 unsigned const_offset = 0;
3085 unsigned align_mul = 0;
3086 unsigned align_offset = 0;
3087
3088 bool glc = false;
3089 unsigned swizzle_component_size = 0;
3090 barrier_interaction barrier = barrier_none;
3091 bool can_reorder = true;
3092 Temp soffset = Temp(0, s1);
3093 };
3094
3095 using LoadCallback = Temp(*)(
3096 Builder& bld, const LoadEmitInfo* info, Temp offset, unsigned bytes_needed,
3097 unsigned align, unsigned const_offset, Temp dst_hint);
3098
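/* Generic load splitting/merging helper (summary of the code below): the load is broken
 * into chunks the callback can emit, the constant offset is reduced so it stays below
 * max_const_offset_plus_one, and, if byte_align_loads is set, unaligned loads are widened
 * and aligned down with the result shifted afterwards via byte_align_scalar/byte_align_vector.
 * The chunks are then recombined into info->dst, inserting p_as_uniform when an SGPR
 * destination is built from VGPR data. */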
3099 template <LoadCallback callback, bool byte_align_loads, bool supports_8bit_16bit_loads, unsigned max_const_offset_plus_one>
3100 void emit_load(isel_context *ctx, Builder& bld, const LoadEmitInfo *info)
3101 {
3102 unsigned load_size = info->num_components * info->component_size;
3103 unsigned component_size = info->component_size;
3104
3105 unsigned num_vals = 0;
3106 Temp vals[info->dst.bytes()];
3107
3108 unsigned const_offset = info->const_offset;
3109
3110 unsigned align_mul = info->align_mul ? info->align_mul : component_size;
3111 unsigned align_offset = (info->align_offset + const_offset) % align_mul;
3112
3113 unsigned bytes_read = 0;
3114 while (bytes_read < load_size) {
3115 unsigned bytes_needed = load_size - bytes_read;
3116
3117 /* add buffer for unaligned loads */
3118 int byte_align = align_mul % 4 == 0 ? align_offset % 4 : -1;
3119
3120 if (byte_align) {
3121 if ((bytes_needed > 2 ||
3122 (bytes_needed == 2 && (align_mul % 2 || align_offset % 2)) ||
3123 !supports_8bit_16bit_loads) && byte_align_loads) {
3124 if (info->component_stride) {
3125 assert(supports_8bit_16bit_loads && "unimplemented");
3126 bytes_needed = 2;
3127 byte_align = 0;
3128 } else {
3129 bytes_needed += byte_align == -1 ? 4 - info->align_mul : byte_align;
3130 bytes_needed = align(bytes_needed, 4);
3131 }
3132 } else {
3133 byte_align = 0;
3134 }
3135 }
3136
3137 if (info->swizzle_component_size)
3138 bytes_needed = MIN2(bytes_needed, info->swizzle_component_size);
3139 if (info->component_stride)
3140 bytes_needed = MIN2(bytes_needed, info->component_size);
3141
3142 bool need_to_align_offset = byte_align && (align_mul % 4 || align_offset % 4);
3143
3144 /* reduce constant offset */
3145 Operand offset = info->offset;
3146 unsigned reduced_const_offset = const_offset;
3147 bool remove_const_offset_completely = need_to_align_offset;
3148 if (const_offset && (remove_const_offset_completely || const_offset >= max_const_offset_plus_one)) {
3149 unsigned to_add = const_offset;
3150 if (remove_const_offset_completely) {
3151 reduced_const_offset = 0;
3152 } else {
3153 to_add = const_offset / max_const_offset_plus_one * max_const_offset_plus_one;
3154 reduced_const_offset %= max_const_offset_plus_one;
3155 }
3156 Temp offset_tmp = offset.isTemp() ? offset.getTemp() : Temp();
3157 if (offset.isConstant()) {
3158 offset = Operand(offset.constantValue() + to_add);
3159 } else if (offset_tmp.regClass() == s1) {
3160 offset = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
3161 offset_tmp, Operand(to_add));
3162 } else if (offset_tmp.regClass() == v1) {
3163 offset = bld.vadd32(bld.def(v1), offset_tmp, Operand(to_add));
3164 } else {
3165 Temp lo = bld.tmp(offset_tmp.type(), 1);
3166 Temp hi = bld.tmp(offset_tmp.type(), 1);
3167 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), offset_tmp);
3168
3169 if (offset_tmp.regClass() == s2) {
3170 Temp carry = bld.tmp(s1);
3171 lo = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), lo, Operand(to_add));
3172 hi = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), hi, carry);
3173 offset = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), lo, hi);
3174 } else {
3175 Temp new_lo = bld.tmp(v1);
3176 Temp carry = bld.vadd32(Definition(new_lo), lo, Operand(to_add), true).def(1).getTemp();
3177 hi = bld.vadd32(bld.def(v1), hi, Operand(0u), false, carry);
3178 offset = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), new_lo, hi);
3179 }
3180 }
3181 }
3182
3183 /* align offset down if needed */
3184 Operand aligned_offset = offset;
3185 if (need_to_align_offset) {
3186 Temp offset_tmp = offset.isTemp() ? offset.getTemp() : Temp();
3187 if (offset.isConstant()) {
3188 aligned_offset = Operand(offset.constantValue() & 0xfffffffcu);
3189 } else if (offset_tmp.regClass() == s1) {
3190 aligned_offset = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), Operand(0xfffffffcu), offset_tmp);
3191 } else if (offset_tmp.regClass() == s2) {
3192 aligned_offset = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), Operand((uint64_t)0xfffffffffffffffcllu), offset_tmp);
3193 } else if (offset_tmp.regClass() == v1) {
3194 aligned_offset = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0xfffffffcu), offset_tmp);
3195 } else if (offset_tmp.regClass() == v2) {
3196 Temp hi = bld.tmp(v1), lo = bld.tmp(v1);
3197 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), offset_tmp);
3198 lo = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0xfffffffcu), lo);
3199 aligned_offset = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), lo, hi);
3200 }
3201 }
3202 Temp aligned_offset_tmp = aligned_offset.isTemp() ? aligned_offset.getTemp() :
3203 bld.copy(bld.def(s1), aligned_offset);
3204
3205 unsigned align = align_offset ? 1 << (ffs(align_offset) - 1) : align_mul;
3206 Temp val = callback(bld, info, aligned_offset_tmp, bytes_needed, align,
3207 reduced_const_offset, byte_align ? Temp() : info->dst);
3208
3209 /* the callback wrote directly to dst */
3210 if (val == info->dst) {
3211 assert(num_vals == 0);
3212 emit_split_vector(ctx, info->dst, info->num_components);
3213 return;
3214 }
3215
3216 /* shift result right if needed */
3217 if (info->component_size < 4 && byte_align_loads) {
3218 Operand align((uint32_t)byte_align);
3219 if (byte_align == -1) {
3220 if (offset.isConstant())
3221 align = Operand(offset.constantValue() % 4u);
3222 else if (offset.size() == 2)
3223 align = Operand(emit_extract_vector(ctx, offset.getTemp(), 0, RegClass(offset.getTemp().type(), 1)));
3224 else
3225 align = offset;
3226 }
3227
3228 assert(val.bytes() >= load_size && "unimplemented");
3229 if (val.type() == RegType::sgpr)
3230 byte_align_scalar(ctx, val, align, info->dst);
3231 else
3232 byte_align_vector(ctx, val, align, info->dst, component_size);
3233 return;
3234 }
3235
3236 /* add result to list and advance */
3237 if (info->component_stride) {
3238 assert(val.bytes() == info->component_size && "unimplemented");
3239 const_offset += info->component_stride;
3240 align_offset = (align_offset + info->component_stride) % align_mul;
3241 } else {
3242 const_offset += val.bytes();
3243 align_offset = (align_offset + val.bytes()) % align_mul;
3244 }
3245 bytes_read += val.bytes();
3246 vals[num_vals++] = val;
3247 }
3248
3249 /* create array of components */
3250 unsigned components_split = 0;
3251 std::array<Temp, NIR_MAX_VEC_COMPONENTS> allocated_vec;
3252 bool has_vgprs = false;
3253 for (unsigned i = 0; i < num_vals;) {
3254 Temp tmp[num_vals];
3255 unsigned num_tmps = 0;
3256 unsigned tmp_size = 0;
3257 RegType reg_type = RegType::sgpr;
3258 while ((!tmp_size || (tmp_size % component_size)) && i < num_vals) {
3259 if (vals[i].type() == RegType::vgpr)
3260 reg_type = RegType::vgpr;
3261 tmp_size += vals[i].bytes();
3262 tmp[num_tmps++] = vals[i++];
3263 }
3264 if (num_tmps > 1) {
3265 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
3266 aco_opcode::p_create_vector, Format::PSEUDO, num_tmps, 1)};
3267 for (unsigned i = 0; i < num_tmps; i++)
3268 vec->operands[i] = Operand(tmp[i]);
3269 tmp[0] = bld.tmp(RegClass::get(reg_type, tmp_size));
3270 vec->definitions[0] = Definition(tmp[0]);
3271 bld.insert(std::move(vec));
3272 }
3273
3274 if (tmp[0].bytes() % component_size) {
3275 /* trim tmp[0] */
3276 assert(i == num_vals);
3277 RegClass new_rc = RegClass::get(reg_type, tmp[0].bytes() / component_size * component_size);
3278 tmp[0] = bld.pseudo(aco_opcode::p_extract_vector, bld.def(new_rc), tmp[0], Operand(0u));
3279 }
3280
3281 RegClass elem_rc = RegClass::get(reg_type, component_size);
3282
3283 unsigned start = components_split;
3284
3285 if (tmp_size == elem_rc.bytes()) {
3286 allocated_vec[components_split++] = tmp[0];
3287 } else {
3288 assert(tmp_size % elem_rc.bytes() == 0);
3289 aco_ptr<Pseudo_instruction> split{create_instruction<Pseudo_instruction>(
3290 aco_opcode::p_split_vector, Format::PSEUDO, 1, tmp_size / elem_rc.bytes())};
3291 for (unsigned i = 0; i < split->definitions.size(); i++) {
3292 Temp component = bld.tmp(elem_rc);
3293 allocated_vec[components_split++] = component;
3294 split->definitions[i] = Definition(component);
3295 }
3296 split->operands[0] = Operand(tmp[0]);
3297 bld.insert(std::move(split));
3298 }
3299
3300 /* try to p_as_uniform early so we can create more optimizable code and
3301 * also update allocated_vec */
3302 for (unsigned j = start; j < components_split; j++) {
3303 if (allocated_vec[j].bytes() % 4 == 0 && info->dst.type() == RegType::sgpr)
3304 allocated_vec[j] = bld.as_uniform(allocated_vec[j]);
3305 has_vgprs |= allocated_vec[j].type() == RegType::vgpr;
3306 }
3307 }
3308
3309 /* concatenate components and p_as_uniform() result if needed */
3310 if (info->dst.type() == RegType::vgpr || !has_vgprs)
3311 ctx->allocated_vec.emplace(info->dst.id(), allocated_vec);
3312
3313 int padding_bytes = MAX2((int)info->dst.bytes() - int(allocated_vec[0].bytes() * info->num_components), 0);
3314
3315 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
3316 aco_opcode::p_create_vector, Format::PSEUDO, info->num_components + !!padding_bytes, 1)};
3317 for (unsigned i = 0; i < info->num_components; i++)
3318 vec->operands[i] = Operand(allocated_vec[i]);
3319 if (padding_bytes)
3320 vec->operands[info->num_components] = Operand(RegClass::get(RegType::vgpr, padding_bytes));
3321 if (info->dst.type() == RegType::sgpr && has_vgprs) {
3322 Temp tmp = bld.tmp(RegType::vgpr, info->dst.size());
3323 vec->definitions[0] = Definition(tmp);
3324 bld.insert(std::move(vec));
3325 bld.pseudo(aco_opcode::p_as_uniform, Definition(info->dst), tmp);
3326 } else {
3327 vec->definitions[0] = Definition(info->dst);
3328 bld.insert(std::move(vec));
3329 }
3330 }
3331
3332 Operand load_lds_size_m0(Builder& bld)
3333 {
3334 /* TODO: m0 does not need to be initialized on GFX9+ */
3335 return bld.m0((Temp)bld.sopk(aco_opcode::s_movk_i32, bld.def(s1, m0), 0xffff));
3336 }
3337
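/* Pick the widest DS read the alignment allows. The read2 immediate offsets are scaled
 * by the element size (see max_offset_plus_one below), so e.g. ds_read2_b64 reaches byte
 * offsets up to 254 * 8 = 2032; larger constant offsets are folded into the address
 * register first. */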
3338 Temp lds_load_callback(Builder& bld, const LoadEmitInfo *info,
3339 Temp offset, unsigned bytes_needed,
3340 unsigned align, unsigned const_offset,
3341 Temp dst_hint)
3342 {
3343 offset = offset.regClass() == s1 ? bld.copy(bld.def(v1), offset) : offset;
3344
3345 Operand m = load_lds_size_m0(bld);
3346
3347 bool large_ds_read = bld.program->chip_class >= GFX7;
3348 bool usable_read2 = bld.program->chip_class >= GFX7;
3349
3350 bool read2 = false;
3351 unsigned size = 0;
3352 aco_opcode op;
3353 //TODO: use ds_read_u8_d16_hi/ds_read_u16_d16_hi if beneficial
3354 if (bytes_needed >= 16 && align % 16 == 0 && large_ds_read) {
3355 size = 16;
3356 op = aco_opcode::ds_read_b128;
3357 } else if (bytes_needed >= 16 && align % 8 == 0 && const_offset % 8 == 0 && usable_read2) {
3358 size = 16;
3359 read2 = true;
3360 op = aco_opcode::ds_read2_b64;
3361 } else if (bytes_needed >= 12 && align % 16 == 0 && large_ds_read) {
3362 size = 12;
3363 op = aco_opcode::ds_read_b96;
3364 } else if (bytes_needed >= 8 && align % 8 == 0) {
3365 size = 8;
3366 op = aco_opcode::ds_read_b64;
3367 } else if (bytes_needed >= 8 && align % 4 == 0 && const_offset % 4 == 0) {
3368 size = 8;
3369 read2 = true;
3370 op = aco_opcode::ds_read2_b32;
3371 } else if (bytes_needed >= 4 && align % 4 == 0) {
3372 size = 4;
3373 op = aco_opcode::ds_read_b32;
3374 } else if (bytes_needed >= 2 && align % 2 == 0) {
3375 size = 2;
3376 op = aco_opcode::ds_read_u16;
3377 } else {
3378 size = 1;
3379 op = aco_opcode::ds_read_u8;
3380 }
3381
3382 unsigned max_offset_plus_one = read2 ? 254 * (size / 2u) + 1 : 65536;
3383 if (const_offset >= max_offset_plus_one) {
3384 offset = bld.vadd32(bld.def(v1), offset, Operand(const_offset / max_offset_plus_one * max_offset_plus_one));
3385 const_offset %= max_offset_plus_one;
3386 }
3387
3388 if (read2)
3389 const_offset /= (size / 2u);
3390
3391 RegClass rc = RegClass(RegType::vgpr, DIV_ROUND_UP(size, 4));
3392 Temp val = rc == info->dst.regClass() && dst_hint.id() ? dst_hint : bld.tmp(rc);
3393 if (read2)
3394 bld.ds(op, Definition(val), offset, m, const_offset, const_offset + 1);
3395 else
3396 bld.ds(op, Definition(val), offset, m, const_offset);
3397
3398 if (size < 4)
3399 val = bld.pseudo(aco_opcode::p_extract_vector, bld.def(RegClass::get(RegType::vgpr, size)), val, Operand(0u));
3400
3401 return val;
3402 }
3403
3404 static auto emit_lds_load = emit_load<lds_load_callback, false, true, UINT32_MAX>;
3405
3406 Temp smem_load_callback(Builder& bld, const LoadEmitInfo *info,
3407 Temp offset, unsigned bytes_needed,
3408 unsigned align, unsigned const_offset,
3409 Temp dst_hint)
3410 {
3411 unsigned size = 0;
3412 aco_opcode op;
3413 if (bytes_needed <= 4) {
3414 size = 1;
3415 op = info->resource.id() ? aco_opcode::s_buffer_load_dword : aco_opcode::s_load_dword;
3416 } else if (bytes_needed <= 8) {
3417 size = 2;
3418 op = info->resource.id() ? aco_opcode::s_buffer_load_dwordx2 : aco_opcode::s_load_dwordx2;
3419 } else if (bytes_needed <= 16) {
3420 size = 4;
3421 op = info->resource.id() ? aco_opcode::s_buffer_load_dwordx4 : aco_opcode::s_load_dwordx4;
3422 } else if (bytes_needed <= 32) {
3423 size = 8;
3424 op = info->resource.id() ? aco_opcode::s_buffer_load_dwordx8 : aco_opcode::s_load_dwordx8;
3425 } else {
3426 size = 16;
3427 op = info->resource.id() ? aco_opcode::s_buffer_load_dwordx16 : aco_opcode::s_load_dwordx16;
3428 }
3429 aco_ptr<SMEM_instruction> load{create_instruction<SMEM_instruction>(op, Format::SMEM, 2, 1)};
3430 if (info->resource.id()) {
3431 load->operands[0] = Operand(info->resource);
3432 load->operands[1] = Operand(offset);
3433 } else {
3434 load->operands[0] = Operand(offset);
3435 load->operands[1] = Operand(0u);
3436 }
3437 RegClass rc(RegType::sgpr, size);
3438 Temp val = dst_hint.id() && dst_hint.regClass() == rc ? dst_hint : bld.tmp(rc);
3439 load->definitions[0] = Definition(val);
3440 load->glc = info->glc;
3441 load->dlc = info->glc && bld.program->chip_class >= GFX10;
3442 load->barrier = info->barrier;
3443 load->can_reorder = false; // FIXME: currently, it doesn't seem beneficial due to how our scheduler works
3444 bld.insert(std::move(load));
3445 return val;
3446 }
3447
3448 static auto emit_smem_load = emit_load<smem_load_callback, true, false, 1024>;
3449
3450 Temp mubuf_load_callback(Builder& bld, const LoadEmitInfo *info,
3451 Temp offset, unsigned bytes_needed,
3452 unsigned align_, unsigned const_offset,
3453 Temp dst_hint)
3454 {
3455 Operand vaddr = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
3456 Operand soffset = offset.type() == RegType::sgpr ? Operand(offset) : Operand((uint32_t) 0);
3457
3458 if (info->soffset.id()) {
3459 if (soffset.isTemp())
3460 vaddr = bld.copy(bld.def(v1), soffset);
3461 soffset = Operand(info->soffset);
3462 }
3463
3464 unsigned bytes_size = 0;
3465 aco_opcode op;
3466 if (bytes_needed == 1) {
3467 bytes_size = 1;
3468 op = aco_opcode::buffer_load_ubyte;
3469 } else if (bytes_needed == 2) {
3470 bytes_size = 2;
3471 op = aco_opcode::buffer_load_ushort;
3472 } else if (bytes_needed <= 4) {
3473 bytes_size = 4;
3474 op = aco_opcode::buffer_load_dword;
3475 } else if (bytes_needed <= 8) {
3476 bytes_size = 8;
3477 op = aco_opcode::buffer_load_dwordx2;
3478 } else if (bytes_needed <= 12 && bld.program->chip_class > GFX6) {
3479 bytes_size = 12;
3480 op = aco_opcode::buffer_load_dwordx3;
3481 } else {
3482 bytes_size = 16;
3483 op = aco_opcode::buffer_load_dwordx4;
3484 }
3485 aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)};
3486 mubuf->operands[0] = Operand(info->resource);
3487 mubuf->operands[1] = vaddr;
3488 mubuf->operands[2] = soffset;
3489 mubuf->offen = (offset.type() == RegType::vgpr);
3490 mubuf->glc = info->glc;
3491 mubuf->dlc = info->glc && bld.program->chip_class >= GFX10;
3492 mubuf->barrier = info->barrier;
3493 mubuf->can_reorder = info->can_reorder;
3494 mubuf->offset = const_offset;
3495 RegClass rc = RegClass::get(RegType::vgpr, align(bytes_size, 4));
3496 Temp val = dst_hint.id() && rc == dst_hint.regClass() ? dst_hint : bld.tmp(rc);
3497 mubuf->definitions[0] = Definition(val);
3498 bld.insert(std::move(mubuf));
3499
3500 return val;
3501 }
3502
3503 static auto emit_mubuf_load = emit_load<mubuf_load_callback, true, true, 4096>;
3504
3505 Temp get_gfx6_global_rsrc(Builder& bld, Temp addr)
3506 {
3507 uint32_t rsrc_conf = S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
3508 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
3509
3510 if (addr.type() == RegType::vgpr)
3511 return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), Operand(0u), Operand(0u), Operand(-1u), Operand(rsrc_conf));
3512 return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), addr, Operand(-1u), Operand(rsrc_conf));
3513 }
3514
3515 Temp global_load_callback(Builder& bld, const LoadEmitInfo *info,
3516 Temp offset, unsigned bytes_needed,
3517 unsigned align_, unsigned const_offset,
3518 Temp dst_hint)
3519 {
3520 unsigned bytes_size = 0;
3521 bool mubuf = bld.program->chip_class == GFX6;
3522 bool global = bld.program->chip_class >= GFX9;
3523 aco_opcode op;
3524 if (bytes_needed == 1) {
3525 bytes_size = 1;
3526 op = mubuf ? aco_opcode::buffer_load_ubyte : global ? aco_opcode::global_load_ubyte : aco_opcode::flat_load_ubyte;
3527 } else if (bytes_needed == 2) {
3528 bytes_size = 2;
3529 op = mubuf ? aco_opcode::buffer_load_ushort : global ? aco_opcode::global_load_ushort : aco_opcode::flat_load_ushort;
3530 } else if (bytes_needed <= 4) {
3531 bytes_size = 4;
3532 op = mubuf ? aco_opcode::buffer_load_dword : global ? aco_opcode::global_load_dword : aco_opcode::flat_load_dword;
3533 } else if (bytes_needed <= 8) {
3534 bytes_size = 8;
3535 op = mubuf ? aco_opcode::buffer_load_dwordx2 : global ? aco_opcode::global_load_dwordx2 : aco_opcode::flat_load_dwordx2;
3536 } else if (bytes_needed <= 12 && !mubuf) {
3537 bytes_size = 12;
3538 op = global ? aco_opcode::global_load_dwordx3 : aco_opcode::flat_load_dwordx3;
3539 } else {
3540 bytes_size = 16;
3541 op = mubuf ? aco_opcode::buffer_load_dwordx4 : global ? aco_opcode::global_load_dwordx4 : aco_opcode::flat_load_dwordx4;
3542 }
3543 RegClass rc = RegClass::get(RegType::vgpr, align(bytes_size, 4));
3544 Temp val = dst_hint.id() && rc == dst_hint.regClass() ? dst_hint : bld.tmp(rc);
3545 if (mubuf) {
3546 aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)};
3547 mubuf->operands[0] = Operand(get_gfx6_global_rsrc(bld, offset));
3548 mubuf->operands[1] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
3549 mubuf->operands[2] = Operand(0u);
3550 mubuf->glc = info->glc;
3551 mubuf->dlc = false;
3552 mubuf->offset = 0;
3553 mubuf->addr64 = offset.type() == RegType::vgpr;
3554 mubuf->disable_wqm = false;
3555 mubuf->barrier = info->barrier;
3556 mubuf->definitions[0] = Definition(val);
3557 bld.insert(std::move(mubuf));
3558 } else {
3559 offset = offset.regClass() == s2 ? bld.copy(bld.def(v2), offset) : offset;
3560
3561 aco_ptr<FLAT_instruction> flat{create_instruction<FLAT_instruction>(op, global ? Format::GLOBAL : Format::FLAT, 2, 1)};
3562 flat->operands[0] = Operand(offset);
3563 flat->operands[1] = Operand(s1);
3564 flat->glc = info->glc;
3565 flat->dlc = info->glc && bld.program->chip_class >= GFX10;
3566 flat->barrier = info->barrier;
3567 flat->offset = 0u;
3568 flat->definitions[0] = Definition(val);
3569 bld.insert(std::move(flat));
3570 }
3571
3572 return val;
3573 }
3574
3575 static auto emit_global_load = emit_load<global_load_callback, true, true, 1>;
3576
3577 Temp load_lds(isel_context *ctx, unsigned elem_size_bytes, Temp dst,
3578 Temp address, unsigned base_offset, unsigned align)
3579 {
3580 assert(util_is_power_of_two_nonzero(align));
3581
3582 Builder bld(ctx->program, ctx->block);
3583
3584 unsigned num_components = dst.bytes() / elem_size_bytes;
3585 LoadEmitInfo info = {Operand(as_vgpr(ctx, address)), dst, num_components, elem_size_bytes};
3586 info.align_mul = align;
3587 info.align_offset = 0;
3588 info.barrier = barrier_shared;
3589 info.can_reorder = false;
3590 info.const_offset = base_offset;
3591 emit_lds_load(ctx, bld, &info);
3592
3593 return dst;
3594 }
3595
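/* Split "src" into "count" pieces at the given byte offsets, converting each piece to
 * dst_type. When the requested pieces line up with a previously recorded allocated_vec
 * entry for "src", its elements are reused instead of emitting a new p_split_vector. */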
3596 void split_store_data(isel_context *ctx, RegType dst_type, unsigned count, Temp *dst, unsigned *offsets, Temp src)
3597 {
3598 if (!count)
3599 return;
3600
3601 Builder bld(ctx->program, ctx->block);
3602
3603 ASSERTED bool is_subdword = false;
3604 for (unsigned i = 0; i < count; i++)
3605 is_subdword |= offsets[i] % 4;
3606 is_subdword |= (src.bytes() - offsets[count - 1]) % 4;
3607 assert(!is_subdword || dst_type == RegType::vgpr);
3608
3609 /* count == 1 fast path */
3610 if (count == 1) {
3611 if (dst_type == RegType::sgpr)
3612 dst[0] = bld.as_uniform(src);
3613 else
3614 dst[0] = as_vgpr(ctx, src);
3615 return;
3616 }
3617
3618 for (unsigned i = 0; i < count - 1; i++)
3619 dst[i] = bld.tmp(RegClass::get(dst_type, offsets[i + 1] - offsets[i]));
3620 dst[count - 1] = bld.tmp(RegClass::get(dst_type, src.bytes() - offsets[count - 1]));
3621
3622 if (is_subdword && src.type() == RegType::sgpr) {
3623 src = as_vgpr(ctx, src);
3624 } else {
3625 /* use allocated_vec if possible */
3626 auto it = ctx->allocated_vec.find(src.id());
3627 if (it != ctx->allocated_vec.end()) {
3628 unsigned total_size = 0;
3629 for (unsigned i = 0; it->second[i].bytes() && (i < NIR_MAX_VEC_COMPONENTS); i++)
3630 total_size += it->second[i].bytes();
3631 if (total_size != src.bytes())
3632 goto split;
3633
3634 unsigned elem_size = it->second[0].bytes();
3635
3636 for (unsigned i = 0; i < count; i++) {
3637 if (offsets[i] % elem_size || dst[i].bytes() % elem_size)
3638 goto split;
3639 }
3640
3641 for (unsigned i = 0; i < count; i++) {
3642 unsigned start_idx = offsets[i] / elem_size;
3643 unsigned op_count = dst[i].bytes() / elem_size;
3644 if (op_count == 1) {
3645 if (dst_type == RegType::sgpr)
3646 dst[i] = bld.as_uniform(it->second[start_idx]);
3647 else
3648 dst[i] = as_vgpr(ctx, it->second[start_idx]);
3649 continue;
3650 }
3651
3652 aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, op_count, 1)};
3653 for (unsigned j = 0; j < op_count; j++) {
3654 Temp tmp = it->second[start_idx + j];
3655 if (dst_type == RegType::sgpr)
3656 tmp = bld.as_uniform(tmp);
3657 vec->operands[j] = Operand(tmp);
3658 }
3659 vec->definitions[0] = Definition(dst[i]);
3660 bld.insert(std::move(vec));
3661 }
3662 return;
3663 }
3664 }
3665
3666 if (dst_type == RegType::sgpr)
3667 src = bld.as_uniform(src);
3668
3669 split:
3670 /* just split it */
3671 aco_ptr<Instruction> split{create_instruction<Pseudo_instruction>(aco_opcode::p_split_vector, Format::PSEUDO, 1, count)};
3672 split->operands[0] = Operand(src);
3673 for (unsigned i = 0; i < count; i++)
3674 split->definitions[i] = Definition(dst[i]);
3675 bld.insert(std::move(split));
3676 }
3677
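/* Helpers for walking a byte-granular write mask. scan_write_mask() reports the lowest
 * unhandled run of bytes that are either all written (returns true) or all skipped
 * (returns false); advance_write_mask() then removes that run from the todo mask.
 * Worked example: with mask = 0b1100 and todo_mask = 0b1111, the first call yields the
 * skipped run start=0/count=2 and the next one the written run start=2/count=2. */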
3678 bool scan_write_mask(uint32_t mask, uint32_t todo_mask,
3679 int *start, int *count)
3680 {
3681 unsigned start_elem = ffs(todo_mask) - 1;
3682 bool skip = !(mask & (1 << start_elem));
3683 if (skip)
3684 mask = ~mask & todo_mask;
3685
3686 mask &= todo_mask;
3687
3688 u_bit_scan_consecutive_range(&mask, start, count);
3689
3690 return !skip;
3691 }
3692
3693 void advance_write_mask(uint32_t *todo_mask, int start, int count)
3694 {
3695 *todo_mask &= ~u_bit_consecutive(0, count) << start;
3696 }
3697
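/* Store "data" to LDS at address + base_offset, honoring a byte-level write mask.
 * The mask is widened to byte granularity and split into naturally aligned chunks;
 * two equally sized b32/b64 writes whose distance is a multiple of the data size are
 * later merged into a single ds_write2_b32/ds_write2_b64. */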
3698 void store_lds(isel_context *ctx, unsigned elem_size_bytes, Temp data, uint32_t wrmask,
3699 Temp address, unsigned base_offset, unsigned align)
3700 {
3701 assert(util_is_power_of_two_nonzero(align));
3702 assert(util_is_power_of_two_nonzero(elem_size_bytes) && elem_size_bytes <= 8);
3703
3704 Builder bld(ctx->program, ctx->block);
3705 bool large_ds_write = ctx->options->chip_class >= GFX7;
3706 bool usable_write2 = ctx->options->chip_class >= GFX7;
3707
3708 unsigned write_count = 0;
3709 Temp write_datas[32];
3710 unsigned offsets[32];
3711 aco_opcode opcodes[32];
3712
3713 wrmask = widen_mask(wrmask, elem_size_bytes);
3714
3715 uint32_t todo = u_bit_consecutive(0, data.bytes());
3716 while (todo) {
3717 int offset, bytes;
3718 if (!scan_write_mask(wrmask, todo, &offset, &bytes)) {
3719 offsets[write_count] = offset;
3720 opcodes[write_count] = aco_opcode::num_opcodes;
3721 write_count++;
3722 advance_write_mask(&todo, offset, bytes);
3723 continue;
3724 }
3725
3726 bool aligned2 = offset % 2 == 0 && align % 2 == 0;
3727 bool aligned4 = offset % 4 == 0 && align % 4 == 0;
3728 bool aligned8 = offset % 8 == 0 && align % 8 == 0;
3729 bool aligned16 = offset % 16 == 0 && align % 16 == 0;
3730
3731 //TODO: use ds_write_b8_d16_hi/ds_write_b16_d16_hi if beneficial
3732 aco_opcode op = aco_opcode::num_opcodes;
3733 if (bytes >= 16 && aligned16 && large_ds_write) {
3734 op = aco_opcode::ds_write_b128;
3735 bytes = 16;
3736 } else if (bytes >= 12 && aligned16 && large_ds_write) {
3737 op = aco_opcode::ds_write_b96;
3738 bytes = 12;
3739 } else if (bytes >= 8 && aligned8) {
3740 op = aco_opcode::ds_write_b64;
3741 bytes = 8;
3742 } else if (bytes >= 4 && aligned4) {
3743 op = aco_opcode::ds_write_b32;
3744 bytes = 4;
3745 } else if (bytes >= 2 && aligned2) {
3746 op = aco_opcode::ds_write_b16;
3747 bytes = 2;
3748 } else if (bytes >= 1) {
3749 op = aco_opcode::ds_write_b8;
3750 bytes = 1;
3751 } else {
3752 assert(false);
3753 }
3754
3755 offsets[write_count] = offset;
3756 opcodes[write_count] = op;
3757 write_count++;
3758 advance_write_mask(&todo, offset, bytes);
3759 }
3760
3761 Operand m = load_lds_size_m0(bld);
3762
3763 split_store_data(ctx, RegType::vgpr, write_count, write_datas, offsets, data);
3764
3765 for (unsigned i = 0; i < write_count; i++) {
3766 aco_opcode op = opcodes[i];
3767 if (op == aco_opcode::num_opcodes)
3768 continue;
3769
3770 Temp data = write_datas[i];
3771
3772 unsigned second = write_count;
3773 if (usable_write2 && (op == aco_opcode::ds_write_b32 || op == aco_opcode::ds_write_b64)) {
3774 for (second = i + 1; second < write_count; second++) {
3775 if (opcodes[second] == op && (offsets[second] - offsets[i]) % data.bytes() == 0) {
3776 op = data.bytes() == 4 ? aco_opcode::ds_write2_b32 : aco_opcode::ds_write2_b64;
3777 opcodes[second] = aco_opcode::num_opcodes;
3778 break;
3779 }
3780 }
3781 }
3782
3783 bool write2 = op == aco_opcode::ds_write2_b32 || op == aco_opcode::ds_write2_b64;
3784 unsigned write2_off = (offsets[second] - offsets[i]) / data.bytes();
3785
3786 unsigned inline_offset = base_offset + offsets[i];
3787 unsigned max_offset = write2 ? (255 - write2_off) * data.bytes() : 65535;
3788 Temp address_offset = address;
3789 if (inline_offset > max_offset) {
3790 address_offset = bld.vadd32(bld.def(v1), Operand(base_offset), address_offset);
3791 inline_offset = offsets[i];
3792 }
3793 assert(inline_offset <= max_offset); /* offsets[i] shouldn't be large enough for this to happen */
3794
3795 if (write2) {
3796 Temp second_data = write_datas[second];
3797 inline_offset /= data.bytes();
3798 bld.ds(op, address_offset, data, second_data, m, inline_offset, inline_offset + write2_off);
3799 } else {
3800 bld.ds(op, address_offset, data, m, inline_offset);
3801 }
3802 }
3803 }
3804
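/* Alignment implied by a constant LDS offset: the largest power of two dividing
 * const_offset, capped at 16. E.g. const_offset = 24 gives 8, const_offset = 0 gives 16. */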
3805 unsigned calculate_lds_alignment(isel_context *ctx, unsigned const_offset)
3806 {
3807 unsigned align = 16;
3808 if (const_offset)
3809 align = std::min(align, 1u << (ffs(const_offset) - 1));
3810
3811 return align;
3812 }
3813
3814
3815 aco_opcode get_buffer_store_op(bool smem, unsigned bytes)
3816 {
3817 switch (bytes) {
3818 case 1:
3819 assert(!smem);
3820 return aco_opcode::buffer_store_byte;
3821 case 2:
3822 assert(!smem);
3823 return aco_opcode::buffer_store_short;
3824 case 4:
3825 return smem ? aco_opcode::s_buffer_store_dword : aco_opcode::buffer_store_dword;
3826 case 8:
3827 return smem ? aco_opcode::s_buffer_store_dwordx2 : aco_opcode::buffer_store_dwordx2;
3828 case 12:
3829 assert(!smem);
3830 return aco_opcode::buffer_store_dwordx3;
3831 case 16:
3832 return smem ? aco_opcode::s_buffer_store_dwordx4 : aco_opcode::buffer_store_dwordx4;
3833 }
3834 unreachable("Unexpected store size");
3835 return aco_opcode::num_opcodes;
3836 }
3837
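/* Split a buffer store into pieces the hardware can emit: each piece is limited to the
 * supported store sizes (1, 2, 4, 8, 12, 16 bytes), to swizzle_element_size and to what
 * the known alignment permits; 12-byte stores are avoided on SMEM and on GFX6 VMEM.
 * The resulting data temps and byte offsets are returned through write_datas/offsets,
 * with skipped (unwritten) ranges dropped. */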
3838 void split_buffer_store(isel_context *ctx, nir_intrinsic_instr *instr, bool smem, RegType dst_type,
3839 Temp data, unsigned writemask, int swizzle_element_size,
3840 unsigned *write_count, Temp *write_datas, unsigned *offsets)
3841 {
3842 unsigned write_count_with_skips = 0;
3843 bool skips[16];
3844
3845 /* determine how to split the data */
3846 unsigned todo = u_bit_consecutive(0, data.bytes());
3847 while (todo) {
3848 int offset, bytes;
3849 skips[write_count_with_skips] = !scan_write_mask(writemask, todo, &offset, &bytes);
3850 offsets[write_count_with_skips] = offset;
3851 if (skips[write_count_with_skips]) {
3852 advance_write_mask(&todo, offset, bytes);
3853 write_count_with_skips++;
3854 continue;
3855 }
3856
3857 /* only supported sizes are 1, 2, 4, 8, 12 and 16 bytes and can't be
3858 * larger than swizzle_element_size */
3859 bytes = MIN2(bytes, swizzle_element_size);
3860 if (bytes % 4)
3861 bytes = bytes > 4 ? bytes & ~0x3 : MIN2(bytes, 2);
3862
3863 /* SMEM and GFX6 VMEM can't emit 12-byte stores */
3864 if ((ctx->program->chip_class == GFX6 || smem) && bytes == 12)
3865 bytes = 8;
3866
3867 /* dword or larger stores have to be dword-aligned */
3868 unsigned align_mul = instr ? nir_intrinsic_align_mul(instr) : 4;
3869 unsigned align_offset = (instr ? nir_intrinsic_align_offset(instr) : 0) + offset;
3870 bool dword_aligned = align_offset % 4 == 0 && align_mul % 4 == 0;
3871 if (!dword_aligned)
3872 bytes = MIN2(bytes, (align_offset % 2 == 0 && align_mul % 2 == 0) ? 2 : 1);
3873
3874 advance_write_mask(&todo, offset, bytes);
3875 write_count_with_skips++;
3876 }
3877
3878 /* actually split data */
3879 split_store_data(ctx, dst_type, write_count_with_skips, write_datas, offsets, data);
3880
3881 /* remove skips */
3882 for (unsigned i = 0; i < write_count_with_skips; i++) {
3883 if (skips[i])
3884 continue;
3885 write_datas[*write_count] = write_datas[i];
3886 offsets[*write_count] = offsets[i];
3887 (*write_count)++;
3888 }
3889 }
3890
3891 Temp create_vec_from_array(isel_context *ctx, Temp arr[], unsigned cnt, RegType reg_type, unsigned elem_size_bytes,
3892 unsigned split_cnt = 0u, Temp dst = Temp())
3893 {
3894 Builder bld(ctx->program, ctx->block);
3895 unsigned dword_size = elem_size_bytes / 4;
3896
3897 if (!dst.id())
3898 dst = bld.tmp(RegClass(reg_type, cnt * dword_size));
3899
3900 std::array<Temp, NIR_MAX_VEC_COMPONENTS> allocated_vec;
3901 aco_ptr<Pseudo_instruction> instr {create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, cnt, 1)};
3902 instr->definitions[0] = Definition(dst);
3903
3904 for (unsigned i = 0; i < cnt; ++i) {
3905 if (arr[i].id()) {
3906 assert(arr[i].size() == dword_size);
3907 allocated_vec[i] = arr[i];
3908 instr->operands[i] = Operand(arr[i]);
3909 } else {
3910 Temp zero = bld.copy(bld.def(RegClass(reg_type, dword_size)), Operand(0u, dword_size == 2));
3911 allocated_vec[i] = zero;
3912 instr->operands[i] = Operand(zero);
3913 }
3914 }
3915
3916 bld.insert(std::move(instr));
3917
3918 if (split_cnt)
3919 emit_split_vector(ctx, dst, split_cnt);
3920 else
3921 ctx->allocated_vec.emplace(dst.id(), allocated_vec); /* emit_split_vector already does this */
3922
3923 return dst;
3924 }
3925
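/* MUBUF immediate offsets must stay below 4096, so fold any excess multiple of 4096
 * into voffset and return the remainder. Example: const_offset = 5000 adds 4096 to
 * voffset and returns 904. */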
3926 inline unsigned resolve_excess_vmem_const_offset(Builder &bld, Temp &voffset, unsigned const_offset)
3927 {
3928 if (const_offset >= 4096) {
3929 unsigned excess_const_offset = const_offset / 4096u * 4096u;
3930 const_offset %= 4096u;
3931
3932 if (!voffset.id())
3933 voffset = bld.copy(bld.def(v1), Operand(excess_const_offset));
3934 else if (unlikely(voffset.regClass() == s1))
3935 voffset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), Operand(excess_const_offset), Operand(voffset));
3936 else if (likely(voffset.regClass() == v1))
3937 voffset = bld.vadd32(bld.def(v1), Operand(voffset), Operand(excess_const_offset));
3938 else
3939 unreachable("Unsupported register class of voffset");
3940 }
3941
3942 return const_offset;
3943 }
3944
3945 void emit_single_mubuf_store(isel_context *ctx, Temp descriptor, Temp voffset, Temp soffset, Temp vdata,
3946 unsigned const_offset = 0u, bool allow_reorder = true, bool slc = false)
3947 {
3948 assert(vdata.id());
3949 assert(vdata.size() != 3 || ctx->program->chip_class != GFX6);
3950 assert(vdata.size() >= 1 && vdata.size() <= 4);
3951
3952 Builder bld(ctx->program, ctx->block);
3953 aco_opcode op = get_buffer_store_op(false, vdata.bytes());
3954 const_offset = resolve_excess_vmem_const_offset(bld, voffset, const_offset);
3955
3956 Operand voffset_op = voffset.id() ? Operand(as_vgpr(ctx, voffset)) : Operand(v1);
3957 Operand soffset_op = soffset.id() ? Operand(soffset) : Operand(0u);
3958 Builder::Result r = bld.mubuf(op, Operand(descriptor), voffset_op, soffset_op, Operand(vdata), const_offset,
3959 /* offen */ !voffset_op.isUndefined(), /* idxen*/ false, /* addr64 */ false,
3960 /* disable_wqm */ false, /* glc */ true, /* dlc*/ false, /* slc */ slc);
3961
3962 static_cast<MUBUF_instruction *>(r.instr)->can_reorder = allow_reorder;
3963 }
3964
3965 void store_vmem_mubuf(isel_context *ctx, Temp src, Temp descriptor, Temp voffset, Temp soffset,
3966 unsigned base_const_offset, unsigned elem_size_bytes, unsigned write_mask,
3967 bool allow_combining = true, bool reorder = true, bool slc = false)
3968 {
3969 Builder bld(ctx->program, ctx->block);
3970 assert(elem_size_bytes == 2 || elem_size_bytes == 4 || elem_size_bytes == 8);
3971 assert(write_mask);
3972 write_mask = widen_mask(write_mask, elem_size_bytes);
3973
3974 unsigned write_count = 0;
3975 Temp write_datas[32];
3976 unsigned offsets[32];
3977 split_buffer_store(ctx, NULL, false, RegType::vgpr, src, write_mask,
3978 allow_combining ? 16 : 4, &write_count, write_datas, offsets);
3979
3980 for (unsigned i = 0; i < write_count; i++) {
3981 unsigned const_offset = offsets[i] + base_const_offset;
3982 emit_single_mubuf_store(ctx, descriptor, voffset, soffset, write_datas[i], const_offset, reorder, slc);
3983 }
3984 }
3985
3986 void load_vmem_mubuf(isel_context *ctx, Temp dst, Temp descriptor, Temp voffset, Temp soffset,
3987 unsigned base_const_offset, unsigned elem_size_bytes, unsigned num_components,
3988 unsigned stride = 0u, bool allow_combining = true, bool allow_reorder = true)
3989 {
3990 assert(elem_size_bytes == 2 || elem_size_bytes == 4 || elem_size_bytes == 8);
3991 assert((num_components * elem_size_bytes) == dst.bytes());
3992 assert(!!stride != allow_combining);
3993
3994 Builder bld(ctx->program, ctx->block);
3995
3996 LoadEmitInfo info = {Operand(voffset), dst, num_components, elem_size_bytes, descriptor};
3997 info.component_stride = allow_combining ? 0 : stride;
3998 info.glc = true;
3999 info.swizzle_component_size = allow_combining ? 0 : 4;
4000 info.align_mul = MIN2(elem_size_bytes, 4);
4001 info.align_offset = 0;
4002 info.soffset = soffset;
4003 info.const_offset = base_const_offset;
4004 emit_mubuf_load(ctx, bld, &info);
4005 }
4006
4007 std::pair<Temp, unsigned> offset_add_from_nir(isel_context *ctx, const std::pair<Temp, unsigned> &base_offset, nir_src *off_src, unsigned stride = 1u)
4008 {
4009 Builder bld(ctx->program, ctx->block);
4010 Temp offset = base_offset.first;
4011 unsigned const_offset = base_offset.second;
4012
4013 if (!nir_src_is_const(*off_src)) {
4014 Temp indirect_offset_arg = get_ssa_temp(ctx, off_src->ssa);
4015 Temp with_stride;
4016
4017 /* Calculate indirect offset with stride */
4018 if (likely(indirect_offset_arg.regClass() == v1))
4019 with_stride = bld.v_mul24_imm(bld.def(v1), indirect_offset_arg, stride);
4020 else if (indirect_offset_arg.regClass() == s1)
4021 with_stride = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(stride), indirect_offset_arg);
4022 else
4023 unreachable("Unsupported register class of indirect offset");
4024
4025 /* Add to the supplied base offset */
4026 if (offset.id() == 0)
4027 offset = with_stride;
4028 else if (unlikely(offset.regClass() == s1 && with_stride.regClass() == s1))
4029 offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), with_stride, offset);
4030 else if (offset.size() == 1 && with_stride.size() == 1)
4031 offset = bld.vadd32(bld.def(v1), with_stride, offset);
4032 else
4033 unreachable("Unsupported register class of indirect offset");
4034 } else {
4035 unsigned const_offset_arg = nir_src_as_uint(*off_src);
4036 const_offset += const_offset_arg * stride;
4037 }
4038
4039 return std::make_pair(offset, const_offset);
4040 }
4041
4042 std::pair<Temp, unsigned> offset_add(isel_context *ctx, const std::pair<Temp, unsigned> &off1, const std::pair<Temp, unsigned> &off2)
4043 {
4044 Builder bld(ctx->program, ctx->block);
4045 Temp offset;
4046
4047 if (off1.first.id() && off2.first.id()) {
4048 if (unlikely(off1.first.regClass() == s1 && off2.first.regClass() == s1))
4049 offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), off1.first, off2.first);
4050 else if (off1.first.size() == 1 && off2.first.size() == 1)
4051 offset = bld.vadd32(bld.def(v1), off1.first, off2.first);
4052 else
4053 unreachable("Unsupported register class of indirect offset");
4054 } else {
4055 offset = off1.first.id() ? off1.first : off2.first;
4056 }
4057
4058 return std::make_pair(offset, off1.second + off2.second);
4059 }
4060
4061 std::pair<Temp, unsigned> offset_mul(isel_context *ctx, const std::pair<Temp, unsigned> &offs, unsigned multiplier)
4062 {
4063 Builder bld(ctx->program, ctx->block);
4064 unsigned const_offset = offs.second * multiplier;
4065
4066 if (!offs.first.id())
4067 return std::make_pair(offs.first, const_offset);
4068
4069 Temp offset = unlikely(offs.first.regClass() == s1)
4070 ? bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(multiplier), offs.first)
4071 : bld.v_mul24_imm(bld.def(v1), offs.first, multiplier);
4072
4073 return std::make_pair(offset, const_offset);
4074 }
4075
4076 std::pair<Temp, unsigned> get_intrinsic_io_basic_offset(isel_context *ctx, nir_intrinsic_instr *instr, unsigned base_stride, unsigned component_stride)
4077 {
4078 Builder bld(ctx->program, ctx->block);
4079
4080 /* base is the driver_location, which is already multiplied by 4, so is in dwords */
4081 unsigned const_offset = nir_intrinsic_base(instr) * base_stride;
4082 /* component is in bytes */
4083 const_offset += nir_intrinsic_component(instr) * component_stride;
4084
4085 /* offset should be interpreted in relation to the base, so the instruction effectively reads/writes another input/output when it has an offset */
4086 nir_src *off_src = nir_get_io_offset_src(instr);
4087 return offset_add_from_nir(ctx, std::make_pair(Temp(), const_offset), off_src, 4u * base_stride);
4088 }
4089
4090 std::pair<Temp, unsigned> get_intrinsic_io_basic_offset(isel_context *ctx, nir_intrinsic_instr *instr, unsigned stride = 1u)
4091 {
4092 return get_intrinsic_io_basic_offset(ctx, instr, stride, stride);
4093 }
4094
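/* Return the patch ID relative to the current thread group:
 * the TCS reads it from the low 8 bits of tcs_rel_ids,
 * the TES gets it as a dedicated argument.
 */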
4095 Temp get_tess_rel_patch_id(isel_context *ctx)
4096 {
4097 Builder bld(ctx->program, ctx->block);
4098
4099 switch (ctx->shader->info.stage) {
4100 case MESA_SHADER_TESS_CTRL:
4101 return bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0xffu),
4102 get_arg(ctx, ctx->args->ac.tcs_rel_ids));
4103 case MESA_SHADER_TESS_EVAL:
4104 return get_arg(ctx, ctx->args->tes_rel_patch_id);
4105 default:
4106 unreachable("Unsupported stage in get_tess_rel_patch_id");
4107 }
4108 }
4109
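/* Compute the LDS offset of a TCS per-vertex input. The inputs of all
 * patches are stored consecutively: rel_patch_id selects the patch, the
 * vertex index selects the vertex within it, and the intrinsic base/offset
 * select the attribute. The strides are counted in dwords and the result
 * is converted to bytes at the end.
 */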
4110 std::pair<Temp, unsigned> get_tcs_per_vertex_input_lds_offset(isel_context *ctx, nir_intrinsic_instr *instr)
4111 {
4112 assert(ctx->shader->info.stage == MESA_SHADER_TESS_CTRL);
4113 Builder bld(ctx->program, ctx->block);
4114
4115 uint32_t tcs_in_patch_stride = ctx->args->options->key.tcs.input_vertices * ctx->tcs_num_inputs * 4;
4116 uint32_t tcs_in_vertex_stride = ctx->tcs_num_inputs * 4;
4117
4118 std::pair<Temp, unsigned> offs = get_intrinsic_io_basic_offset(ctx, instr);
4119
4120 nir_src *vertex_index_src = nir_get_io_vertex_index_src(instr);
4121 offs = offset_add_from_nir(ctx, offs, vertex_index_src, tcs_in_vertex_stride);
4122
4123 Temp rel_patch_id = get_tess_rel_patch_id(ctx);
4124 Temp tcs_in_current_patch_offset = bld.v_mul24_imm(bld.def(v1), rel_patch_id, tcs_in_patch_stride);
4125 offs = offset_add(ctx, offs, std::make_pair(tcs_in_current_patch_offset, 0));
4126
4127 return offset_mul(ctx, offs, 4u);
4128 }
4129
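/* Compute the LDS byte offset of a TCS output (per-vertex or per-patch).
 * The VS outputs (TCS inputs) of all patches come first in LDS; after them,
 * each patch stores its per-vertex outputs followed by its per-patch outputs.
 */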
4130 std::pair<Temp, unsigned> get_tcs_output_lds_offset(isel_context *ctx, nir_intrinsic_instr *instr = nullptr, bool per_vertex = false)
4131 {
4132 assert(ctx->shader->info.stage == MESA_SHADER_TESS_CTRL);
4133 Builder bld(ctx->program, ctx->block);
4134
4135 uint32_t input_patch_size = ctx->args->options->key.tcs.input_vertices * ctx->tcs_num_inputs * 16;
4136 uint32_t output_vertex_size = ctx->tcs_num_outputs * 16;
4137 uint32_t pervertex_output_patch_size = ctx->shader->info.tess.tcs_vertices_out * output_vertex_size;
4138 uint32_t output_patch_stride = pervertex_output_patch_size + ctx->tcs_num_patch_outputs * 16;
4139
4140 std::pair<Temp, unsigned> offs = instr
4141 ? get_intrinsic_io_basic_offset(ctx, instr, 4u)
4142 : std::make_pair(Temp(), 0u);
4143
4144 Temp rel_patch_id = get_tess_rel_patch_id(ctx);
4145 Temp patch_off = bld.v_mul24_imm(bld.def(v1), rel_patch_id, output_patch_stride);
4146
4147 if (per_vertex) {
4148 assert(instr);
4149
4150 nir_src *vertex_index_src = nir_get_io_vertex_index_src(instr);
4151 offs = offset_add_from_nir(ctx, offs, vertex_index_src, output_vertex_size);
4152
4153 uint32_t output_patch0_offset = (input_patch_size * ctx->tcs_num_patches);
4154 offs = offset_add(ctx, offs, std::make_pair(patch_off, output_patch0_offset));
4155 } else {
4156 uint32_t output_patch0_patch_data_offset = (input_patch_size * ctx->tcs_num_patches + pervertex_output_patch_size);
4157 offs = offset_add(ctx, offs, std::make_pair(patch_off, output_patch0_patch_data_offset));
4158 }
4159
4160 return offs;
4161 }
4162
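/* Compute the VMEM offset of a TCS per-vertex output in the off-chip buffer.
 * The layout implied by the strides below is attribute-major: each attribute
 * stores 16 bytes per vertex for all vertices of all patches, indexed by
 * rel_patch_id and vertex index.
 */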
4163 std::pair<Temp, unsigned> get_tcs_per_vertex_output_vmem_offset(isel_context *ctx, nir_intrinsic_instr *instr)
4164 {
4165 Builder bld(ctx->program, ctx->block);
4166
4167 unsigned vertices_per_patch = ctx->shader->info.tess.tcs_vertices_out;
4168 unsigned attr_stride = vertices_per_patch * ctx->tcs_num_patches;
4169
4170 std::pair<Temp, unsigned> offs = get_intrinsic_io_basic_offset(ctx, instr, attr_stride * 4u, 4u);
4171
4172 Temp rel_patch_id = get_tess_rel_patch_id(ctx);
4173 Temp patch_off = bld.v_mul24_imm(bld.def(v1), rel_patch_id, vertices_per_patch * 16u);
4174 offs = offset_add(ctx, offs, std::make_pair(patch_off, 0u));
4175
4176 nir_src *vertex_index_src = nir_get_io_vertex_index_src(instr);
4177 offs = offset_add_from_nir(ctx, offs, vertex_index_src, 16u);
4178
4179 return offs;
4180 }
4181
4182 std::pair<Temp, unsigned> get_tcs_per_patch_output_vmem_offset(isel_context *ctx, nir_intrinsic_instr *instr = nullptr, unsigned const_base_offset = 0u)
4183 {
4184 Builder bld(ctx->program, ctx->block);
4185
4186 unsigned output_vertex_size = ctx->tcs_num_outputs * 16;
4187 unsigned per_vertex_output_patch_size = ctx->shader->info.tess.tcs_vertices_out * output_vertex_size;
4188 unsigned per_patch_data_offset = per_vertex_output_patch_size * ctx->tcs_num_patches;
4189 unsigned attr_stride = ctx->tcs_num_patches;
4190
4191 std::pair<Temp, unsigned> offs = instr
4192 ? get_intrinsic_io_basic_offset(ctx, instr, attr_stride * 4u, 4u)
4193 : std::make_pair(Temp(), 0u);
4194
4195 if (const_base_offset)
4196 offs.second += const_base_offset * attr_stride;
4197
4198 Temp rel_patch_id = get_tess_rel_patch_id(ctx);
4199 Temp patch_off = bld.v_mul24_imm(bld.def(v1), rel_patch_id, 16u);
4200 offs = offset_add(ctx, offs, std::make_pair(patch_off, per_patch_data_offset));
4201
4202 return offs;
4203 }
4204
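/* Check whether the directly addressed driver location of this intrinsic
 * maps to a variable slot that is set in the given API slot mask.
 * Indirectly addressed locations set *indirect and return false, so callers
 * have to handle them conservatively.
 */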
4205 bool tcs_driver_location_matches_api_mask(isel_context *ctx, nir_intrinsic_instr *instr, bool per_vertex, uint64_t mask, bool *indirect)
4206 {
4207 assert(per_vertex || ctx->shader->info.stage == MESA_SHADER_TESS_CTRL);
4208
4209 if (mask == 0)
4210 return false;
4211
4212 unsigned drv_loc = nir_intrinsic_base(instr);
4213 nir_src *off_src = nir_get_io_offset_src(instr);
4214
4215 if (!nir_src_is_const(*off_src)) {
4216 *indirect = true;
4217 return false;
4218 }
4219
4220 *indirect = false;
4221 uint64_t slot = per_vertex
4222 ? ctx->output_drv_loc_to_var_slot[ctx->shader->info.stage][drv_loc / 4]
4223 : (ctx->output_tcs_patch_drv_loc_to_var_slot[drv_loc / 4] - VARYING_SLOT_PATCH0);
4224 return (((uint64_t) 1) << slot) & mask;
4225 }
4226
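/* Instead of emitting stores, record the written components in ctx->outputs
 * so they can be emitted later (e.g. as exports or LDS/VMEM stores).
 * Returns false if the output is indirectly addressed.
 */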
4227 bool store_output_to_temps(isel_context *ctx, nir_intrinsic_instr *instr)
4228 {
4229 unsigned write_mask = nir_intrinsic_write_mask(instr);
4230 unsigned component = nir_intrinsic_component(instr);
4231 unsigned idx = nir_intrinsic_base(instr) + component;
4232
4233 nir_instr *off_instr = instr->src[1].ssa->parent_instr;
4234 if (off_instr->type != nir_instr_type_load_const)
4235 return false;
4236
4237 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
4238 idx += nir_src_as_uint(instr->src[1]) * 4u;
4239
4240 if (instr->src[0].ssa->bit_size == 64)
4241 write_mask = widen_mask(write_mask, 2);
4242
4243 RegClass rc = instr->src[0].ssa->bit_size == 16 ? v2b : v1;
4244
4245 for (unsigned i = 0; i < 8; ++i) {
4246 if (write_mask & (1 << i)) {
4247 ctx->outputs.mask[idx / 4u] |= 1 << (idx % 4u);
4248 ctx->outputs.temps[idx] = emit_extract_vector(ctx, src, i, rc);
4249 }
4250 idx++;
4251 }
4252
4253 return true;
4254 }
4255
4256 bool load_input_from_temps(isel_context *ctx, nir_intrinsic_instr *instr, Temp dst)
4257 {
4258 /* Only TCS per-vertex inputs are supported by this function.
4259      * Per-vertex inputs only correspond to the VS output of the same invocation ID when the number of VS and TCS invocations is equal.
4260 */
4261 if (ctx->shader->info.stage != MESA_SHADER_TESS_CTRL || !ctx->tcs_in_out_eq)
4262 return false;
4263
4264 nir_src *off_src = nir_get_io_offset_src(instr);
4265 nir_src *vertex_index_src = nir_get_io_vertex_index_src(instr);
4266 nir_instr *vertex_index_instr = vertex_index_src->ssa->parent_instr;
4267 bool can_use_temps = nir_src_is_const(*off_src) &&
4268 vertex_index_instr->type == nir_instr_type_intrinsic &&
4269 nir_instr_as_intrinsic(vertex_index_instr)->intrinsic == nir_intrinsic_load_invocation_id;
4270
4271 if (!can_use_temps)
4272 return false;
4273
4274 unsigned idx = nir_intrinsic_base(instr) + nir_intrinsic_component(instr) + 4 * nir_src_as_uint(*off_src);
4275 Temp *src = &ctx->inputs.temps[idx];
4276 create_vec_from_array(ctx, src, dst.size(), dst.regClass().type(), 4u, 0, dst);
4277
4278 return true;
4279 }
4280
4281 void visit_store_ls_or_es_output(isel_context *ctx, nir_intrinsic_instr *instr)
4282 {
4283 Builder bld(ctx->program, ctx->block);
4284
4285 if (ctx->tcs_in_out_eq && store_output_to_temps(ctx, instr)) {
4286        /* When the TCS reads this output only with direct (constant) offsets and only for the vertex that matches its invocation ID, it is unnecessary to store the VS output to LDS. */
4287 bool indirect_write;
4288 bool temp_only_input = tcs_driver_location_matches_api_mask(ctx, instr, true, ctx->tcs_temp_only_inputs, &indirect_write);
4289 if (temp_only_input && !indirect_write)
4290 return;
4291 }
4292
4293 std::pair<Temp, unsigned> offs = get_intrinsic_io_basic_offset(ctx, instr, 4u);
4294 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
4295 unsigned write_mask = nir_intrinsic_write_mask(instr);
4296 unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8u;
4297
4298 if (ctx->stage == vertex_es || ctx->stage == tess_eval_es) {
4299 /* GFX6-8: ES stage is not merged into GS, data is passed from ES to GS in VMEM. */
4300 Temp esgs_ring = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ctx->program->private_segment_buffer, Operand(RING_ESGS_VS * 16u));
4301 Temp es2gs_offset = get_arg(ctx, ctx->args->es2gs_offset);
4302 store_vmem_mubuf(ctx, src, esgs_ring, offs.first, es2gs_offset, offs.second, elem_size_bytes, write_mask, false, true, true);
4303 } else {
4304 Temp lds_base;
4305
4306 if (ctx->stage == vertex_geometry_gs || ctx->stage == tess_eval_geometry_gs) {
4307 /* GFX9+: ES stage is merged into GS, data is passed between them using LDS. */
4308 unsigned itemsize = ctx->stage == vertex_geometry_gs
4309 ? ctx->program->info->vs.es_info.esgs_itemsize
4310 : ctx->program->info->tes.es_info.esgs_itemsize;
4311 Temp thread_id = emit_mbcnt(ctx, bld.def(v1));
4312 Temp wave_idx = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), get_arg(ctx, ctx->args->merged_wave_info), Operand(4u << 16 | 24));
4313 Temp vertex_idx = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), thread_id,
4314 bld.v_mul24_imm(bld.def(v1), as_vgpr(ctx, wave_idx), ctx->program->wave_size));
4315 lds_base = bld.v_mul24_imm(bld.def(v1), vertex_idx, itemsize);
4316 } else if (ctx->stage == vertex_ls || ctx->stage == vertex_tess_control_hs) {
4317 /* GFX6-8: VS runs on LS stage when tessellation is used, but LS shares LDS space with HS.
4318 * GFX9+: LS is merged into HS, but still uses the same LDS layout.
4319 */
4320 Temp vertex_idx = get_arg(ctx, ctx->args->rel_auto_id);
4321 lds_base = bld.v_mul24_imm(bld.def(v1), vertex_idx, ctx->tcs_num_inputs * 16u);
4322 } else {
4323 unreachable("Invalid LS or ES stage");
4324 }
4325
4326 offs = offset_add(ctx, offs, std::make_pair(lds_base, 0u));
4327 unsigned lds_align = calculate_lds_alignment(ctx, offs.second);
4328 store_lds(ctx, elem_size_bytes, src, write_mask, offs.first, offs.second, lds_align);
4329 }
4330 }
4331
4332 bool tcs_output_is_tess_factor(isel_context *ctx, nir_intrinsic_instr *instr, bool per_vertex)
4333 {
4334 if (per_vertex)
4335 return false;
4336
4337 unsigned off = nir_intrinsic_base(instr) * 4u;
4338 return off == ctx->tcs_tess_lvl_out_loc ||
4339 off == ctx->tcs_tess_lvl_in_loc;
4341 }
4342
4343 bool tcs_output_is_read_by_tes(isel_context *ctx, nir_intrinsic_instr *instr, bool per_vertex)
4344 {
4345 uint64_t mask = per_vertex
4346 ? ctx->program->info->tcs.tes_inputs_read
4347 : ctx->program->info->tcs.tes_patch_inputs_read;
4348
4349 bool indirect_write = false;
4350 bool output_read_by_tes = tcs_driver_location_matches_api_mask(ctx, instr, per_vertex, mask, &indirect_write);
4351 return indirect_write || output_read_by_tes;
4352 }
4353
4354 bool tcs_output_is_read_by_tcs(isel_context *ctx, nir_intrinsic_instr *instr, bool per_vertex)
4355 {
4356 uint64_t mask = per_vertex
4357 ? ctx->shader->info.outputs_read
4358 : ctx->shader->info.patch_outputs_read;
4359
4360 bool indirect_write = false;
4361 bool output_read = tcs_driver_location_matches_api_mask(ctx, instr, per_vertex, mask, &indirect_write);
4362 return indirect_write || output_read;
4363 }
4364
4365 void visit_store_tcs_output(isel_context *ctx, nir_intrinsic_instr *instr, bool per_vertex)
4366 {
4367 assert(ctx->stage == tess_control_hs || ctx->stage == vertex_tess_control_hs);
4368 assert(ctx->shader->info.stage == MESA_SHADER_TESS_CTRL);
4369
4370 Builder bld(ctx->program, ctx->block);
4371
4372 Temp store_val = get_ssa_temp(ctx, instr->src[0].ssa);
4373 unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
4374 unsigned write_mask = nir_intrinsic_write_mask(instr);
4375
4376 bool is_tess_factor = tcs_output_is_tess_factor(ctx, instr, per_vertex);
4377 bool write_to_vmem = !is_tess_factor && tcs_output_is_read_by_tes(ctx, instr, per_vertex);
4378 bool write_to_lds = is_tess_factor || tcs_output_is_read_by_tcs(ctx, instr, per_vertex);
4379
4380 if (write_to_vmem) {
4381 std::pair<Temp, unsigned> vmem_offs = per_vertex
4382 ? get_tcs_per_vertex_output_vmem_offset(ctx, instr)
4383 : get_tcs_per_patch_output_vmem_offset(ctx, instr);
4384
4385 Temp hs_ring_tess_offchip = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ctx->program->private_segment_buffer, Operand(RING_HS_TESS_OFFCHIP * 16u));
4386 Temp oc_lds = get_arg(ctx, ctx->args->oc_lds);
4387 store_vmem_mubuf(ctx, store_val, hs_ring_tess_offchip, vmem_offs.first, oc_lds, vmem_offs.second, elem_size_bytes, write_mask, true, false);
4388 }
4389
4390 if (write_to_lds) {
4391 std::pair<Temp, unsigned> lds_offs = get_tcs_output_lds_offset(ctx, instr, per_vertex);
4392 unsigned lds_align = calculate_lds_alignment(ctx, lds_offs.second);
4393 store_lds(ctx, elem_size_bytes, store_val, write_mask, lds_offs.first, lds_offs.second, lds_align);
4394 }
4395 }
4396
4397 void visit_load_tcs_output(isel_context *ctx, nir_intrinsic_instr *instr, bool per_vertex)
4398 {
4399 assert(ctx->stage == tess_control_hs || ctx->stage == vertex_tess_control_hs);
4400 assert(ctx->shader->info.stage == MESA_SHADER_TESS_CTRL);
4401
4402 Builder bld(ctx->program, ctx->block);
4403
4404 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
4405 std::pair<Temp, unsigned> lds_offs = get_tcs_output_lds_offset(ctx, instr, per_vertex);
4406 unsigned lds_align = calculate_lds_alignment(ctx, lds_offs.second);
4407     unsigned elem_size_bytes = instr->dest.ssa.bit_size / 8; /* this is a load, so take the element size from the destination */
4408
4409 load_lds(ctx, elem_size_bytes, dst, lds_offs.first, lds_offs.second, lds_align);
4410 }
4411
4412 void visit_store_output(isel_context *ctx, nir_intrinsic_instr *instr)
4413 {
4414 if (ctx->stage == vertex_vs ||
4415 ctx->stage == tess_eval_vs ||
4416 ctx->stage == fragment_fs ||
4417 ctx->stage == ngg_vertex_gs ||
4418 ctx->stage == ngg_tess_eval_gs ||
4419 ctx->shader->info.stage == MESA_SHADER_GEOMETRY) {
4420 bool stored_to_temps = store_output_to_temps(ctx, instr);
4421 if (!stored_to_temps) {
4422 fprintf(stderr, "Unimplemented output offset instruction:\n");
4423 nir_print_instr(instr->src[1].ssa->parent_instr, stderr);
4424 fprintf(stderr, "\n");
4425 abort();
4426 }
4427 } else if (ctx->stage == vertex_es ||
4428 ctx->stage == vertex_ls ||
4429 ctx->stage == tess_eval_es ||
4430 (ctx->stage == vertex_tess_control_hs && ctx->shader->info.stage == MESA_SHADER_VERTEX) ||
4431 (ctx->stage == vertex_geometry_gs && ctx->shader->info.stage == MESA_SHADER_VERTEX) ||
4432 (ctx->stage == tess_eval_geometry_gs && ctx->shader->info.stage == MESA_SHADER_TESS_EVAL)) {
4433 visit_store_ls_or_es_output(ctx, instr);
4434 } else if (ctx->shader->info.stage == MESA_SHADER_TESS_CTRL) {
4435 visit_store_tcs_output(ctx, instr, false);
4436 } else {
4437 unreachable("Shader stage not implemented");
4438 }
4439 }
4440
4441 void visit_load_output(isel_context *ctx, nir_intrinsic_instr *instr)
4442 {
4443 visit_load_tcs_output(ctx, instr, false);
4444 }
4445
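/* Emit two-pass attribute interpolation (v_interp_p1 + v_interp_p2) with the
 * given barycentric coordinates. 16-bit destinations use the f16 variants;
 * chips with 16-bank LDS need the legacy p2 opcode and, for f16, an extra
 * v_interp_mov of P0.
 */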
4446 void emit_interp_instr(isel_context *ctx, unsigned idx, unsigned component, Temp src, Temp dst, Temp prim_mask)
4447 {
4448 Temp coord1 = emit_extract_vector(ctx, src, 0, v1);
4449 Temp coord2 = emit_extract_vector(ctx, src, 1, v1);
4450
4451 Builder bld(ctx->program, ctx->block);
4452
4453 if (dst.regClass() == v2b) {
4454 if (ctx->program->has_16bank_lds) {
4455 assert(ctx->options->chip_class <= GFX8);
4456 Builder::Result interp_p1 =
4457 bld.vintrp(aco_opcode::v_interp_mov_f32, bld.def(v1),
4458 Operand(2u) /* P0 */, bld.m0(prim_mask), idx, component);
4459 interp_p1 = bld.vintrp(aco_opcode::v_interp_p1lv_f16, bld.def(v2b),
4460 coord1, bld.m0(prim_mask), interp_p1, idx, component);
4461 bld.vintrp(aco_opcode::v_interp_p2_legacy_f16, Definition(dst), coord2,
4462 bld.m0(prim_mask), interp_p1, idx, component);
4463 } else {
4464 aco_opcode interp_p2_op = aco_opcode::v_interp_p2_f16;
4465
4466 if (ctx->options->chip_class == GFX8)
4467 interp_p2_op = aco_opcode::v_interp_p2_legacy_f16;
4468
4469 Builder::Result interp_p1 =
4470 bld.vintrp(aco_opcode::v_interp_p1ll_f16, bld.def(v1),
4471 coord1, bld.m0(prim_mask), idx, component);
4472 bld.vintrp(interp_p2_op, Definition(dst), coord2, bld.m0(prim_mask),
4473 interp_p1, idx, component);
4474 }
4475 } else {
4476 Builder::Result interp_p1 =
4477 bld.vintrp(aco_opcode::v_interp_p1_f32, bld.def(v1), coord1,
4478 bld.m0(prim_mask), idx, component);
4479
4480 if (ctx->program->has_16bank_lds)
4481 interp_p1.instr->operands[0].setLateKill(true);
4482
4483 bld.vintrp(aco_opcode::v_interp_p2_f32, Definition(dst), coord2,
4484 bld.m0(prim_mask), interp_p1, idx, component);
4485 }
4486 }
4487
4488 void emit_load_frag_coord(isel_context *ctx, Temp dst, unsigned num_components)
4489 {
4490 aco_ptr<Pseudo_instruction> vec(create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1));
4491 for (unsigned i = 0; i < num_components; i++)
4492 vec->operands[i] = Operand(get_arg(ctx, ctx->args->ac.frag_pos[i]));
4493 if (G_0286CC_POS_W_FLOAT_ENA(ctx->program->config->spi_ps_input_ena)) {
4494 assert(num_components == 4);
4495 Builder bld(ctx->program, ctx->block);
4496 vec->operands[3] = bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), get_arg(ctx, ctx->args->ac.frag_pos[3]));
4497 }
4498
4499 for (Operand& op : vec->operands)
4500 op = op.isUndefined() ? Operand(0u) : op;
4501
4502 vec->definitions[0] = Definition(dst);
4503 ctx->block->instructions.emplace_back(std::move(vec));
4504 emit_split_vector(ctx, dst, num_components);
4505 return;
4506 }
4507
4508 void visit_load_interpolated_input(isel_context *ctx, nir_intrinsic_instr *instr)
4509 {
4510 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
4511 Temp coords = get_ssa_temp(ctx, instr->src[0].ssa);
4512 unsigned idx = nir_intrinsic_base(instr);
4513 unsigned component = nir_intrinsic_component(instr);
4514 Temp prim_mask = get_arg(ctx, ctx->args->ac.prim_mask);
4515
4516 nir_const_value* offset = nir_src_as_const_value(instr->src[1]);
4517 if (offset) {
4518 assert(offset->u32 == 0);
4519 } else {
4520        /* the lower 15 bits of the prim_mask contain the offset into LDS
4521 * while the upper bits contain the number of prims */
4522 Temp offset_src = get_ssa_temp(ctx, instr->src[1].ssa);
4523 assert(offset_src.regClass() == s1 && "TODO: divergent offsets...");
4524 Builder bld(ctx->program, ctx->block);
4525 Temp stride = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), prim_mask, Operand(16u));
4526 stride = bld.sop1(aco_opcode::s_bcnt1_i32_b32, bld.def(s1), bld.def(s1, scc), stride);
4527 stride = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), stride, Operand(48u));
4528 offset_src = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), stride, offset_src);
4529 prim_mask = bld.sop2(aco_opcode::s_add_i32, bld.def(s1, m0), bld.def(s1, scc), offset_src, prim_mask);
4530 }
4531
4532 if (instr->dest.ssa.num_components == 1) {
4533 emit_interp_instr(ctx, idx, component, coords, dst, prim_mask);
4534 } else {
4535 aco_ptr<Pseudo_instruction> vec(create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, instr->dest.ssa.num_components, 1));
4536 for (unsigned i = 0; i < instr->dest.ssa.num_components; i++)
4537 {
4538 Temp tmp = {ctx->program->allocateId(), v1};
4539 emit_interp_instr(ctx, idx, component+i, coords, tmp, prim_mask);
4540 vec->operands[i] = Operand(tmp);
4541 }
4542 vec->definitions[0] = Definition(dst);
4543 ctx->block->instructions.emplace_back(std::move(vec));
4544 }
4545 }
4546
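/* Check whether a typed vertex fetch of the given channel count can be used:
 * 3-channel formats only exist for 32-bit channels, and GFX6/GFX10 need the
 * attribute offset and stride to be aligned to the full fetch size.
 */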
4547 bool check_vertex_fetch_size(isel_context *ctx, const ac_data_format_info *vtx_info,
4548 unsigned offset, unsigned stride, unsigned channels)
4549 {
4550 unsigned vertex_byte_size = vtx_info->chan_byte_size * channels;
4551 if (vtx_info->chan_byte_size != 4 && channels == 3)
4552 return false;
4553 return (ctx->options->chip_class != GFX6 && ctx->options->chip_class != GFX10) ||
4554 (offset % vertex_byte_size == 0 && stride % vertex_byte_size == 0);
4555 }
4556
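/* Pick a buffer data format for a vertex fetch. If the requested channel
 * count cannot be fetched at this offset/stride, first try widening the
 * fetch (fewer loads), then fall back to narrowing it (more loads), and
 * update *channels accordingly.
 */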
4557 uint8_t get_fetch_data_format(isel_context *ctx, const ac_data_format_info *vtx_info,
4558 unsigned offset, unsigned stride, unsigned *channels)
4559 {
4560 if (!vtx_info->chan_byte_size) {
4561 *channels = vtx_info->num_channels;
4562 return vtx_info->chan_format;
4563 }
4564
4565 unsigned num_channels = *channels;
4566 if (!check_vertex_fetch_size(ctx, vtx_info, offset, stride, *channels)) {
4567 unsigned new_channels = num_channels + 1;
4568 /* first, assume more loads is worse and try using a larger data format */
4569 while (new_channels <= 4 && !check_vertex_fetch_size(ctx, vtx_info, offset, stride, new_channels)) {
4570 new_channels++;
4571 /* don't make the attribute potentially out-of-bounds */
4572 if (offset + new_channels * vtx_info->chan_byte_size > stride)
4573 new_channels = 5;
4574 }
4575
4576 if (new_channels == 5) {
4577 /* then try decreasing load size (at the cost of more loads) */
4578 new_channels = *channels;
4579 while (new_channels > 1 && !check_vertex_fetch_size(ctx, vtx_info, offset, stride, new_channels))
4580 new_channels--;
4581 }
4582
4583 if (new_channels < *channels)
4584 *channels = new_channels;
4585 num_channels = new_channels;
4586 }
4587
4588 switch (vtx_info->chan_format) {
4589 case V_008F0C_BUF_DATA_FORMAT_8:
4590 return (uint8_t[]){V_008F0C_BUF_DATA_FORMAT_8, V_008F0C_BUF_DATA_FORMAT_8_8,
4591 V_008F0C_BUF_DATA_FORMAT_INVALID, V_008F0C_BUF_DATA_FORMAT_8_8_8_8}[num_channels - 1];
4592 case V_008F0C_BUF_DATA_FORMAT_16:
4593 return (uint8_t[]){V_008F0C_BUF_DATA_FORMAT_16, V_008F0C_BUF_DATA_FORMAT_16_16,
4594 V_008F0C_BUF_DATA_FORMAT_INVALID, V_008F0C_BUF_DATA_FORMAT_16_16_16_16}[num_channels - 1];
4595 case V_008F0C_BUF_DATA_FORMAT_32:
4596 return (uint8_t[]){V_008F0C_BUF_DATA_FORMAT_32, V_008F0C_BUF_DATA_FORMAT_32_32,
4597 V_008F0C_BUF_DATA_FORMAT_32_32_32, V_008F0C_BUF_DATA_FORMAT_32_32_32_32}[num_channels - 1];
4598 }
4599 unreachable("shouldn't reach here");
4600 return V_008F0C_BUF_DATA_FORMAT_INVALID;
4601 }
4602
4603 /* For 2_10_10_10 formats the alpha is handled as unsigned by pre-Vega HW,
4604  * so we may need to fix it up. */
4605 Temp adjust_vertex_fetch_alpha(isel_context *ctx, unsigned adjustment, Temp alpha)
4606 {
4607 Builder bld(ctx->program, ctx->block);
4608
4609 if (adjustment == RADV_ALPHA_ADJUST_SSCALED)
4610 alpha = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), alpha);
4611
4612 /* For the integer-like cases, do a natural sign extension.
4613 *
4614 * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0
4615 * and happen to contain 0, 1, 2, 3 as the two LSBs of the
4616 * exponent.
4617 */
4618 alpha = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(adjustment == RADV_ALPHA_ADJUST_SNORM ? 7u : 30u), alpha);
4619 alpha = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(30u), alpha);
4620
4621 /* Convert back to the right type. */
4622 if (adjustment == RADV_ALPHA_ADJUST_SNORM) {
4623 alpha = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), alpha);
4624 Temp clamp = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(bld.lm)), Operand(0xbf800000u), alpha);
4625 alpha = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0xbf800000u), alpha, clamp);
4626 } else if (adjustment == RADV_ALPHA_ADJUST_SSCALED) {
4627 alpha = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), alpha);
4628 }
4629
4630 return alpha;
4631 }
4632
4633 void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr)
4634 {
4635 Builder bld(ctx->program, ctx->block);
4636 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
4637 if (ctx->shader->info.stage == MESA_SHADER_VERTEX) {
4638
4639 nir_instr *off_instr = instr->src[0].ssa->parent_instr;
4640 if (off_instr->type != nir_instr_type_load_const) {
4641 fprintf(stderr, "Unimplemented nir_intrinsic_load_input offset\n");
4642 nir_print_instr(off_instr, stderr);
4643 fprintf(stderr, "\n");
4644 }
4645 uint32_t offset = nir_instr_as_load_const(off_instr)->value[0].u32;
4646
4647 Temp vertex_buffers = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->vertex_buffers));
4648
4649 unsigned location = nir_intrinsic_base(instr) / 4 - VERT_ATTRIB_GENERIC0 + offset;
4650 unsigned component = nir_intrinsic_component(instr);
4651 unsigned bitsize = instr->dest.ssa.bit_size;
4652 unsigned attrib_binding = ctx->options->key.vs.vertex_attribute_bindings[location];
4653 uint32_t attrib_offset = ctx->options->key.vs.vertex_attribute_offsets[location];
4654 uint32_t attrib_stride = ctx->options->key.vs.vertex_attribute_strides[location];
4655 unsigned attrib_format = ctx->options->key.vs.vertex_attribute_formats[location];
4656
4657 unsigned dfmt = attrib_format & 0xf;
4658 unsigned nfmt = (attrib_format >> 4) & 0x7;
4659 const struct ac_data_format_info *vtx_info = ac_get_data_format_info(dfmt);
4660
4661 unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa) << component;
4662 unsigned num_channels = MIN2(util_last_bit(mask), vtx_info->num_channels);
4663 unsigned alpha_adjust = (ctx->options->key.vs.alpha_adjust >> (location * 2)) & 3;
4664 bool post_shuffle = ctx->options->key.vs.post_shuffle & (1 << location);
4665 if (post_shuffle)
4666 num_channels = MAX2(num_channels, 3);
4667
4668 Operand off = bld.copy(bld.def(s1), Operand(attrib_binding * 16u));
4669 Temp list = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), vertex_buffers, off);
4670
4671 Temp index;
4672 if (ctx->options->key.vs.instance_rate_inputs & (1u << location)) {
4673 uint32_t divisor = ctx->options->key.vs.instance_rate_divisors[location];
4674 Temp start_instance = get_arg(ctx, ctx->args->ac.start_instance);
4675 if (divisor) {
4676 Temp instance_id = get_arg(ctx, ctx->args->ac.instance_id);
4677 if (divisor != 1) {
4678 Temp divided = bld.tmp(v1);
4679 emit_v_div_u32(ctx, divided, as_vgpr(ctx, instance_id), divisor);
4680 index = bld.vadd32(bld.def(v1), start_instance, divided);
4681 } else {
4682 index = bld.vadd32(bld.def(v1), start_instance, instance_id);
4683 }
4684 } else {
4685 index = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), start_instance);
4686 }
4687 } else {
4688 index = bld.vadd32(bld.def(v1),
4689 get_arg(ctx, ctx->args->ac.base_vertex),
4690 get_arg(ctx, ctx->args->ac.vertex_id));
4691 }
4692
4693 Temp channels[num_channels];
4694 unsigned channel_start = 0;
4695 bool direct_fetch = false;
4696
4697 /* skip unused channels at the start */
4698 if (vtx_info->chan_byte_size && !post_shuffle) {
4699 channel_start = ffs(mask) - 1;
4700 for (unsigned i = 0; i < channel_start; i++)
4701 channels[i] = Temp(0, s1);
4702 } else if (vtx_info->chan_byte_size && post_shuffle && !(mask & 0x8)) {
4703 num_channels = 3 - (ffs(mask) - 1);
4704 }
4705
4706 /* load channels */
4707 while (channel_start < num_channels) {
4708 unsigned fetch_component = num_channels - channel_start;
4709 unsigned fetch_offset = attrib_offset + channel_start * vtx_info->chan_byte_size;
4710 bool expanded = false;
4711
4712           /* use MUBUF when possible to avoid potential alignment issues */
4713 /* TODO: we could use SDWA to unpack 8/16-bit attributes without extra instructions */
4714 bool use_mubuf = (nfmt == V_008F0C_BUF_NUM_FORMAT_FLOAT ||
4715 nfmt == V_008F0C_BUF_NUM_FORMAT_UINT ||
4716 nfmt == V_008F0C_BUF_NUM_FORMAT_SINT) &&
4717 vtx_info->chan_byte_size == 4;
4718 unsigned fetch_dfmt = V_008F0C_BUF_DATA_FORMAT_INVALID;
4719 if (!use_mubuf) {
4720 fetch_dfmt = get_fetch_data_format(ctx, vtx_info, fetch_offset, attrib_stride, &fetch_component);
4721 } else {
4722 if (fetch_component == 3 && ctx->options->chip_class == GFX6) {
4723 /* GFX6 only supports loading vec3 with MTBUF, expand to vec4. */
4724 fetch_component = 4;
4725 expanded = true;
4726 }
4727 }
4728
4729 unsigned fetch_bytes = fetch_component * bitsize / 8;
4730
4731 Temp fetch_index = index;
4732 if (attrib_stride != 0 && fetch_offset > attrib_stride) {
4733 fetch_index = bld.vadd32(bld.def(v1), Operand(fetch_offset / attrib_stride), fetch_index);
4734 fetch_offset = fetch_offset % attrib_stride;
4735 }
4736
4737 Operand soffset(0u);
4738 if (fetch_offset >= 4096) {
4739 soffset = bld.copy(bld.def(s1), Operand(fetch_offset / 4096 * 4096));
4740 fetch_offset %= 4096;
4741 }
4742
4743 aco_opcode opcode;
4744 switch (fetch_bytes) {
4745 case 2:
4746 assert(!use_mubuf && bitsize == 16);
4747 opcode = aco_opcode::tbuffer_load_format_d16_x;
4748 break;
4749 case 4:
4750 if (bitsize == 16) {
4751 assert(!use_mubuf);
4752 opcode = aco_opcode::tbuffer_load_format_d16_xy;
4753 } else {
4754 opcode = use_mubuf ? aco_opcode::buffer_load_dword : aco_opcode::tbuffer_load_format_x;
4755 }
4756 break;
4757 case 6:
4758 assert(!use_mubuf && bitsize == 16);
4759 opcode = aco_opcode::tbuffer_load_format_d16_xyz;
4760 break;
4761 case 8:
4762 if (bitsize == 16) {
4763 assert(!use_mubuf);
4764 opcode = aco_opcode::tbuffer_load_format_d16_xyzw;
4765 } else {
4766 opcode = use_mubuf ? aco_opcode::buffer_load_dwordx2 : aco_opcode::tbuffer_load_format_xy;
4767 }
4768 break;
4769 case 12:
4770 assert(ctx->options->chip_class >= GFX7 ||
4771 (!use_mubuf && ctx->options->chip_class == GFX6));
4772 opcode = use_mubuf ? aco_opcode::buffer_load_dwordx3 : aco_opcode::tbuffer_load_format_xyz;
4773 break;
4774 case 16:
4775 opcode = use_mubuf ? aco_opcode::buffer_load_dwordx4 : aco_opcode::tbuffer_load_format_xyzw;
4776 break;
4777 default:
4778 unreachable("Unimplemented load_input vector size");
4779 }
4780
4781 Temp fetch_dst;
4782 if (channel_start == 0 && fetch_bytes == dst.bytes() && !post_shuffle &&
4783 !expanded && (alpha_adjust == RADV_ALPHA_ADJUST_NONE ||
4784 num_channels <= 3)) {
4785 direct_fetch = true;
4786 fetch_dst = dst;
4787 } else {
4788 fetch_dst = bld.tmp(RegClass::get(RegType::vgpr, fetch_bytes));
4789 }
4790
4791 if (use_mubuf) {
4792 Instruction *mubuf = bld.mubuf(opcode,
4793 Definition(fetch_dst), list, fetch_index, soffset,
4794 fetch_offset, false, true).instr;
4795 static_cast<MUBUF_instruction*>(mubuf)->can_reorder = true;
4796 } else {
4797 Instruction *mtbuf = bld.mtbuf(opcode,
4798 Definition(fetch_dst), list, fetch_index, soffset,
4799 fetch_dfmt, nfmt, fetch_offset, false, true).instr;
4800 static_cast<MTBUF_instruction*>(mtbuf)->can_reorder = true;
4801 }
4802
4803 emit_split_vector(ctx, fetch_dst, fetch_dst.size());
4804
4805 if (fetch_component == 1) {
4806 channels[channel_start] = fetch_dst;
4807 } else {
4808 for (unsigned i = 0; i < MIN2(fetch_component, num_channels - channel_start); i++)
4809 channels[channel_start + i] = emit_extract_vector(ctx, fetch_dst, i,
4810 bitsize == 16 ? v2b : v1);
4811 }
4812
4813 channel_start += fetch_component;
4814 }
4815
4816 if (!direct_fetch) {
4817 bool is_float = nfmt != V_008F0C_BUF_NUM_FORMAT_UINT &&
4818 nfmt != V_008F0C_BUF_NUM_FORMAT_SINT;
4819
4820 static const unsigned swizzle_normal[4] = {0, 1, 2, 3};
4821 static const unsigned swizzle_post_shuffle[4] = {2, 1, 0, 3};
4822 const unsigned *swizzle = post_shuffle ? swizzle_post_shuffle : swizzle_normal;
4823
4824 aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
4825 std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
4826 unsigned num_temp = 0;
4827 for (unsigned i = 0; i < dst.size(); i++) {
4828 unsigned idx = i + component;
4829 if (swizzle[idx] < num_channels && channels[swizzle[idx]].id()) {
4830 Temp channel = channels[swizzle[idx]];
4831 if (idx == 3 && alpha_adjust != RADV_ALPHA_ADJUST_NONE)
4832 channel = adjust_vertex_fetch_alpha(ctx, alpha_adjust, channel);
4833 vec->operands[i] = Operand(channel);
4834
4835 num_temp++;
4836 elems[i] = channel;
4837 } else if (is_float && idx == 3) {
4838 vec->operands[i] = Operand(0x3f800000u);
4839 } else if (!is_float && idx == 3) {
4840 vec->operands[i] = Operand(1u);
4841 } else {
4842 vec->operands[i] = Operand(0u);
4843 }
4844 }
4845 vec->definitions[0] = Definition(dst);
4846 ctx->block->instructions.emplace_back(std::move(vec));
4847 emit_split_vector(ctx, dst, dst.size());
4848
4849 if (num_temp == dst.size())
4850 ctx->allocated_vec.emplace(dst.id(), elems);
4851 }
4852 } else if (ctx->shader->info.stage == MESA_SHADER_FRAGMENT) {
4853 unsigned offset_idx = instr->intrinsic == nir_intrinsic_load_input ? 0 : 1;
4854 nir_instr *off_instr = instr->src[offset_idx].ssa->parent_instr;
4855 if (off_instr->type != nir_instr_type_load_const ||
4856 nir_instr_as_load_const(off_instr)->value[0].u32 != 0) {
4857 fprintf(stderr, "Unimplemented nir_intrinsic_load_input offset\n");
4858 nir_print_instr(off_instr, stderr);
4859 fprintf(stderr, "\n");
4860 }
4861
4862 Temp prim_mask = get_arg(ctx, ctx->args->ac.prim_mask);
4863 nir_const_value* offset = nir_src_as_const_value(instr->src[offset_idx]);
4864 if (offset) {
4865 assert(offset->u32 == 0);
4866 } else {
4867        /* the lower 15 bits of the prim_mask contain the offset into LDS
4868 * while the upper bits contain the number of prims */
4869 Temp offset_src = get_ssa_temp(ctx, instr->src[offset_idx].ssa);
4870 assert(offset_src.regClass() == s1 && "TODO: divergent offsets...");
4871 Builder bld(ctx->program, ctx->block);
4872 Temp stride = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), prim_mask, Operand(16u));
4873 stride = bld.sop1(aco_opcode::s_bcnt1_i32_b32, bld.def(s1), bld.def(s1, scc), stride);
4874 stride = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), stride, Operand(48u));
4875 offset_src = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), stride, offset_src);
4876 prim_mask = bld.sop2(aco_opcode::s_add_i32, bld.def(s1, m0), bld.def(s1, scc), offset_src, prim_mask);
4877 }
4878
4879 unsigned idx = nir_intrinsic_base(instr);
4880 unsigned component = nir_intrinsic_component(instr);
4881 unsigned vertex_id = 2; /* P0 */
4882
4883 if (instr->intrinsic == nir_intrinsic_load_input_vertex) {
4884 nir_const_value* src0 = nir_src_as_const_value(instr->src[0]);
4885 switch (src0->u32) {
4886 case 0:
4887 vertex_id = 2; /* P0 */
4888 break;
4889 case 1:
4890 vertex_id = 0; /* P10 */
4891 break;
4892 case 2:
4893 vertex_id = 1; /* P20 */
4894 break;
4895 default:
4896 unreachable("invalid vertex index");
4897 }
4898 }
4899
4900 if (dst.size() == 1) {
4901 bld.vintrp(aco_opcode::v_interp_mov_f32, Definition(dst), Operand(vertex_id), bld.m0(prim_mask), idx, component);
4902 } else {
4903 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
4904 for (unsigned i = 0; i < dst.size(); i++)
4905 vec->operands[i] = bld.vintrp(aco_opcode::v_interp_mov_f32, bld.def(v1), Operand(vertex_id), bld.m0(prim_mask), idx, component + i);
4906 vec->definitions[0] = Definition(dst);
4907 bld.insert(std::move(vec));
4908 }
4909
4910 } else if (ctx->shader->info.stage == MESA_SHADER_TESS_EVAL) {
4911 Temp ring = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ctx->program->private_segment_buffer, Operand(RING_HS_TESS_OFFCHIP * 16u));
4912 Temp soffset = get_arg(ctx, ctx->args->oc_lds);
4913 std::pair<Temp, unsigned> offs = get_tcs_per_patch_output_vmem_offset(ctx, instr);
4914 unsigned elem_size_bytes = instr->dest.ssa.bit_size / 8u;
4915
4916 load_vmem_mubuf(ctx, dst, ring, offs.first, soffset, offs.second, elem_size_bytes, instr->dest.ssa.num_components);
4917 } else {
4918 unreachable("Shader stage not implemented");
4919 }
4920 }
4921
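/* Compute the ES->GS offset of a GS per-vertex input. With merged ES/GS
 * stages (vertex_geometry_gs / tess_eval_geometry_gs) two 16-bit vertex
 * offsets are packed per argument dword; non-constant vertex indices are
 * resolved with a chain of v_cndmask selections.
 */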
4922 std::pair<Temp, unsigned> get_gs_per_vertex_input_offset(isel_context *ctx, nir_intrinsic_instr *instr, unsigned base_stride = 1u)
4923 {
4924 assert(ctx->shader->info.stage == MESA_SHADER_GEOMETRY);
4925
4926 Builder bld(ctx->program, ctx->block);
4927 nir_src *vertex_src = nir_get_io_vertex_index_src(instr);
4928 Temp vertex_offset;
4929
4930 if (!nir_src_is_const(*vertex_src)) {
4931 /* better code could be created, but this case probably doesn't happen
4932 * much in practice */
4933 Temp indirect_vertex = as_vgpr(ctx, get_ssa_temp(ctx, vertex_src->ssa));
4934 for (unsigned i = 0; i < ctx->shader->info.gs.vertices_in; i++) {
4935 Temp elem;
4936
4937 if (ctx->stage == vertex_geometry_gs || ctx->stage == tess_eval_geometry_gs) {
4938 elem = get_arg(ctx, ctx->args->gs_vtx_offset[i / 2u * 2u]);
4939 if (i % 2u)
4940 elem = bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), Operand(16u), elem);
4941 } else {
4942 elem = get_arg(ctx, ctx->args->gs_vtx_offset[i]);
4943 }
4944
4945 if (vertex_offset.id()) {
4946 Temp cond = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.hint_vcc(bld.def(bld.lm)),
4947 Operand(i), indirect_vertex);
4948 vertex_offset = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), vertex_offset, elem, cond);
4949 } else {
4950 vertex_offset = elem;
4951 }
4952 }
4953
4954 if (ctx->stage == vertex_geometry_gs || ctx->stage == tess_eval_geometry_gs)
4955 vertex_offset = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0xffffu), vertex_offset);
4956 } else {
4957 unsigned vertex = nir_src_as_uint(*vertex_src);
4958 if (ctx->stage == vertex_geometry_gs || ctx->stage == tess_eval_geometry_gs)
4959 vertex_offset = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1),
4960 get_arg(ctx, ctx->args->gs_vtx_offset[vertex / 2u * 2u]),
4961 Operand((vertex % 2u) * 16u), Operand(16u));
4962 else
4963 vertex_offset = get_arg(ctx, ctx->args->gs_vtx_offset[vertex]);
4964 }
4965
4966 std::pair<Temp, unsigned> offs = get_intrinsic_io_basic_offset(ctx, instr, base_stride);
4967 offs = offset_add(ctx, offs, std::make_pair(vertex_offset, 0u));
4968 return offset_mul(ctx, offs, 4u);
4969 }
4970
4971 void visit_load_gs_per_vertex_input(isel_context *ctx, nir_intrinsic_instr *instr)
4972 {
4973 assert(ctx->shader->info.stage == MESA_SHADER_GEOMETRY);
4974
4975 Builder bld(ctx->program, ctx->block);
4976 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
4977 unsigned elem_size_bytes = instr->dest.ssa.bit_size / 8;
4978
4979 if (ctx->stage == geometry_gs) {
4980 std::pair<Temp, unsigned> offs = get_gs_per_vertex_input_offset(ctx, instr, ctx->program->wave_size);
4981 Temp ring = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ctx->program->private_segment_buffer, Operand(RING_ESGS_GS * 16u));
4982 load_vmem_mubuf(ctx, dst, ring, offs.first, Temp(), offs.second, elem_size_bytes, instr->dest.ssa.num_components, 4u * ctx->program->wave_size, false, true);
4983 } else if (ctx->stage == vertex_geometry_gs || ctx->stage == tess_eval_geometry_gs) {
4984 std::pair<Temp, unsigned> offs = get_gs_per_vertex_input_offset(ctx, instr);
4985 unsigned lds_align = calculate_lds_alignment(ctx, offs.second);
4986 load_lds(ctx, elem_size_bytes, dst, offs.first, offs.second, lds_align);
4987 } else {
4988 unreachable("Unsupported GS stage.");
4989 }
4990 }
4991
4992 void visit_load_tcs_per_vertex_input(isel_context *ctx, nir_intrinsic_instr *instr)
4993 {
4994 assert(ctx->shader->info.stage == MESA_SHADER_TESS_CTRL);
4995
4996 Builder bld(ctx->program, ctx->block);
4997 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
4998
4999 if (load_input_from_temps(ctx, instr, dst))
5000 return;
5001
5002 std::pair<Temp, unsigned> offs = get_tcs_per_vertex_input_lds_offset(ctx, instr);
5003 unsigned elem_size_bytes = instr->dest.ssa.bit_size / 8;
5004 unsigned lds_align = calculate_lds_alignment(ctx, offs.second);
5005
5006 load_lds(ctx, elem_size_bytes, dst, offs.first, offs.second, lds_align);
5007 }
5008
5009 void visit_load_tes_per_vertex_input(isel_context *ctx, nir_intrinsic_instr *instr)
5010 {
5011 assert(ctx->shader->info.stage == MESA_SHADER_TESS_EVAL);
5012
5013 Builder bld(ctx->program, ctx->block);
5014
5015 Temp ring = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ctx->program->private_segment_buffer, Operand(RING_HS_TESS_OFFCHIP * 16u));
5016 Temp oc_lds = get_arg(ctx, ctx->args->oc_lds);
5017 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5018
5019 unsigned elem_size_bytes = instr->dest.ssa.bit_size / 8;
5020 std::pair<Temp, unsigned> offs = get_tcs_per_vertex_output_vmem_offset(ctx, instr);
5021
5022 load_vmem_mubuf(ctx, dst, ring, offs.first, oc_lds, offs.second, elem_size_bytes, instr->dest.ssa.num_components, 0u, true, true);
5023 }
5024
5025 void visit_load_per_vertex_input(isel_context *ctx, nir_intrinsic_instr *instr)
5026 {
5027 switch (ctx->shader->info.stage) {
5028 case MESA_SHADER_GEOMETRY:
5029 visit_load_gs_per_vertex_input(ctx, instr);
5030 break;
5031 case MESA_SHADER_TESS_CTRL:
5032 visit_load_tcs_per_vertex_input(ctx, instr);
5033 break;
5034 case MESA_SHADER_TESS_EVAL:
5035 visit_load_tes_per_vertex_input(ctx, instr);
5036 break;
5037 default:
5038 unreachable("Unimplemented shader stage");
5039 }
5040 }
5041
5042 void visit_load_per_vertex_output(isel_context *ctx, nir_intrinsic_instr *instr)
5043 {
5044 visit_load_tcs_output(ctx, instr, true);
5045 }
5046
5047 void visit_store_per_vertex_output(isel_context *ctx, nir_intrinsic_instr *instr)
5048 {
5049 assert(ctx->stage == tess_control_hs || ctx->stage == vertex_tess_control_hs);
5050 assert(ctx->shader->info.stage == MESA_SHADER_TESS_CTRL);
5051
5052 visit_store_tcs_output(ctx, instr, true);
5053 }
5054
5055 void visit_load_tess_coord(isel_context *ctx, nir_intrinsic_instr *instr)
5056 {
5057 assert(ctx->shader->info.stage == MESA_SHADER_TESS_EVAL);
5058
5059 Builder bld(ctx->program, ctx->block);
5060 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5061
5062 Operand tes_u(get_arg(ctx, ctx->args->tes_u));
5063 Operand tes_v(get_arg(ctx, ctx->args->tes_v));
5064 Operand tes_w(0u);
5065
5066 if (ctx->shader->info.tess.primitive_mode == GL_TRIANGLES) {
5067 Temp tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), tes_u, tes_v);
5068 tmp = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), Operand(0x3f800000u /* 1.0f */), tmp);
5069 tes_w = Operand(tmp);
5070 }
5071
5072 Temp tess_coord = bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tes_u, tes_v, tes_w);
5073 emit_split_vector(ctx, tess_coord, 3);
5074 }
5075
5076 Temp load_desc_ptr(isel_context *ctx, unsigned desc_set)
5077 {
5078 if (ctx->program->info->need_indirect_descriptor_sets) {
5079 Builder bld(ctx->program, ctx->block);
5080 Temp ptr64 = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->descriptor_sets[0]));
5081 Operand off = bld.copy(bld.def(s1), Operand(desc_set << 2));
5082       return bld.smem(aco_opcode::s_load_dword, bld.def(s1), ptr64, off);
5083 }
5084
5085 return get_arg(ctx, ctx->args->descriptor_sets[desc_set]);
5086 }
5087
5088
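/* Compute a buffer descriptor address as:
 * descriptor set pointer + binding offset + index * binding stride.
 * Dynamic uniform/storage buffers are instead addressed relative to the
 * push constant pointer, right after the API push constants.
 */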
5089 void visit_load_resource(isel_context *ctx, nir_intrinsic_instr *instr)
5090 {
5091 Builder bld(ctx->program, ctx->block);
5092 Temp index = get_ssa_temp(ctx, instr->src[0].ssa);
5093 if (!nir_dest_is_divergent(instr->dest))
5094 index = bld.as_uniform(index);
5095 unsigned desc_set = nir_intrinsic_desc_set(instr);
5096 unsigned binding = nir_intrinsic_binding(instr);
5097
5098 Temp desc_ptr;
5099 radv_pipeline_layout *pipeline_layout = ctx->options->layout;
5100 radv_descriptor_set_layout *layout = pipeline_layout->set[desc_set].layout;
5101 unsigned offset = layout->binding[binding].offset;
5102 unsigned stride;
5103 if (layout->binding[binding].type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC ||
5104 layout->binding[binding].type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC) {
5105 unsigned idx = pipeline_layout->set[desc_set].dynamic_offset_start + layout->binding[binding].dynamic_offset_offset;
5106 desc_ptr = get_arg(ctx, ctx->args->ac.push_constants);
5107 offset = pipeline_layout->push_constant_size + 16 * idx;
5108 stride = 16;
5109 } else {
5110 desc_ptr = load_desc_ptr(ctx, desc_set);
5111 stride = layout->binding[binding].size;
5112 }
5113
5114 nir_const_value* nir_const_index = nir_src_as_const_value(instr->src[0]);
5115 unsigned const_index = nir_const_index ? nir_const_index->u32 : 0;
5116 if (stride != 1) {
5117 if (nir_const_index) {
5118 const_index = const_index * stride;
5119 } else if (index.type() == RegType::vgpr) {
5120 bool index24bit = layout->binding[binding].array_size <= 0x1000000;
5121 index = bld.v_mul_imm(bld.def(v1), index, stride, index24bit);
5122 } else {
5123 index = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(stride), Operand(index));
5124 }
5125 }
5126 if (offset) {
5127 if (nir_const_index) {
5128 const_index = const_index + offset;
5129 } else if (index.type() == RegType::vgpr) {
5130 index = bld.vadd32(bld.def(v1), Operand(offset), index);
5131 } else {
5132 index = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), Operand(offset), Operand(index));
5133 }
5134 }
5135
5136 if (nir_const_index && const_index == 0) {
5137 index = desc_ptr;
5138 } else if (index.type() == RegType::vgpr) {
5139 index = bld.vadd32(bld.def(v1),
5140 nir_const_index ? Operand(const_index) : Operand(index),
5141 Operand(desc_ptr));
5142 } else {
5143 index = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
5144 nir_const_index ? Operand(const_index) : Operand(index),
5145 Operand(desc_ptr));
5146 }
5147
5148 bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), index);
5149 }
5150
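/* Load from a buffer resource: uniform (non-VGPR) destinations use SMEM when
 * allowed and when the access is either non-GLC or the chip is GFX8+,
 * everything else goes through MUBUF.
 */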
5151 void load_buffer(isel_context *ctx, unsigned num_components, unsigned component_size,
5152 Temp dst, Temp rsrc, Temp offset, unsigned align_mul, unsigned align_offset,
5153 bool glc=false, bool readonly=true, bool allow_smem=true)
5154 {
5155 Builder bld(ctx->program, ctx->block);
5156
5157 bool use_smem = dst.type() != RegType::vgpr && (!glc || ctx->options->chip_class >= GFX8) && allow_smem;
5158 if (use_smem)
5159 offset = bld.as_uniform(offset);
5160
5161 LoadEmitInfo info = {Operand(offset), dst, num_components, component_size, rsrc};
5162 info.glc = glc;
5163 info.barrier = readonly ? barrier_none : barrier_buffer;
5164 info.can_reorder = readonly;
5165 info.align_mul = align_mul;
5166 info.align_offset = align_offset;
5167 if (use_smem)
5168 emit_smem_load(ctx, bld, &info);
5169 else
5170 emit_mubuf_load(ctx, bld, &info);
5171 }
5172
5173 void visit_load_ubo(isel_context *ctx, nir_intrinsic_instr *instr)
5174 {
5175 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5176 Temp rsrc = get_ssa_temp(ctx, instr->src[0].ssa);
5177
5178 Builder bld(ctx->program, ctx->block);
5179
5180 nir_intrinsic_instr* idx_instr = nir_instr_as_intrinsic(instr->src[0].ssa->parent_instr);
5181 unsigned desc_set = nir_intrinsic_desc_set(idx_instr);
5182 unsigned binding = nir_intrinsic_binding(idx_instr);
5183 radv_descriptor_set_layout *layout = ctx->options->layout->set[desc_set].layout;
5184
5185 if (layout->binding[binding].type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) {
5186 uint32_t desc_type = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
5187 S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
5188 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
5189 S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
5190 if (ctx->options->chip_class >= GFX10) {
5191 desc_type |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
5192 S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) |
5193 S_008F0C_RESOURCE_LEVEL(1);
5194 } else {
5195 desc_type |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
5196 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
5197 }
5198 Temp upper_dwords = bld.pseudo(aco_opcode::p_create_vector, bld.def(s3),
5199 Operand(S_008F04_BASE_ADDRESS_HI(ctx->options->address32_hi)),
5200 Operand(0xFFFFFFFFu),
5201 Operand(desc_type));
5202 rsrc = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4),
5203 rsrc, upper_dwords);
5204 } else {
5205 rsrc = convert_pointer_to_64_bit(ctx, rsrc);
5206 rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u));
5207 }
5208 unsigned size = instr->dest.ssa.bit_size / 8;
5209 load_buffer(ctx, instr->num_components, size, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa),
5210 nir_intrinsic_align_mul(instr), nir_intrinsic_align_offset(instr));
5211 }
5212
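/* Load push constants. 32-bit loads that fall entirely into the inline push
 * constant SGPR arguments are assembled directly from them; everything else
 * is loaded with SMEM from the push constant pointer, with extra handling to
 * byte-align 8/16-bit results.
 */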
5213 void visit_load_push_constant(isel_context *ctx, nir_intrinsic_instr *instr)
5214 {
5215 Builder bld(ctx->program, ctx->block);
5216 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5217 unsigned offset = nir_intrinsic_base(instr);
5218 unsigned count = instr->dest.ssa.num_components;
5219 nir_const_value *index_cv = nir_src_as_const_value(instr->src[0]);
5220
5221 if (index_cv && instr->dest.ssa.bit_size == 32) {
5222 unsigned start = (offset + index_cv->u32) / 4u;
5223 start -= ctx->args->ac.base_inline_push_consts;
5224 if (start + count <= ctx->args->ac.num_inline_push_consts) {
5225 std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
5226 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
5227 for (unsigned i = 0; i < count; ++i) {
5228 elems[i] = get_arg(ctx, ctx->args->ac.inline_push_consts[start + i]);
5229 vec->operands[i] = Operand{elems[i]};
5230 }
5231 vec->definitions[0] = Definition(dst);
5232 ctx->block->instructions.emplace_back(std::move(vec));
5233 ctx->allocated_vec.emplace(dst.id(), elems);
5234 return;
5235 }
5236 }
5237
5238 Temp index = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
5239 if (offset != 0) // TODO check if index != 0 as well
5240 index = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), Operand(offset), index);
5241 Temp ptr = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->ac.push_constants));
5242 Temp vec = dst;
5243 bool trim = false;
5244 bool aligned = true;
5245
5246 if (instr->dest.ssa.bit_size == 8) {
5247 aligned = index_cv && (offset + index_cv->u32) % 4 == 0;
5248 bool fits_in_dword = count == 1 || (index_cv && ((offset + index_cv->u32) % 4 + count) <= 4);
5249 if (!aligned)
5250 vec = fits_in_dword ? bld.tmp(s1) : bld.tmp(s2);
5251 } else if (instr->dest.ssa.bit_size == 16) {
5252 aligned = index_cv && (offset + index_cv->u32) % 4 == 0;
5253 if (!aligned)
5254 vec = count == 4 ? bld.tmp(s4) : count > 1 ? bld.tmp(s2) : bld.tmp(s1);
5255 }
5256
5257 aco_opcode op;
5258
5259 switch (vec.size()) {
5260 case 1:
5261 op = aco_opcode::s_load_dword;
5262 break;
5263 case 2:
5264 op = aco_opcode::s_load_dwordx2;
5265 break;
5266 case 3:
5267 vec = bld.tmp(s4);
5268 trim = true;
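         /* fallthrough: a 3-dword result is loaded as dwordx4 and trimmed below */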
5269 case 4:
5270 op = aco_opcode::s_load_dwordx4;
5271 break;
5272 case 6:
5273 vec = bld.tmp(s8);
5274 trim = true;
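         /* fallthrough: a 6-dword result is loaded as dwordx8 and trimmed below */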
5275 case 8:
5276 op = aco_opcode::s_load_dwordx8;
5277 break;
5278 default:
5279 unreachable("unimplemented or forbidden load_push_constant.");
5280 }
5281
5282 bld.smem(op, Definition(vec), ptr, index);
5283
5284 if (!aligned) {
5285 Operand byte_offset = index_cv ? Operand((offset + index_cv->u32) % 4) : Operand(index);
5286 byte_align_scalar(ctx, vec, byte_offset, dst);
5287 return;
5288 }
5289
5290 if (trim) {
5291 emit_split_vector(ctx, vec, 4);
5292 RegClass rc = dst.size() == 3 ? s1 : s2;
5293 bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
5294 emit_extract_vector(ctx, vec, 0, rc),
5295 emit_extract_vector(ctx, vec, 1, rc),
5296 emit_extract_vector(ctx, vec, 2, rc));
5297
5298 }
5299 emit_split_vector(ctx, dst, instr->dest.ssa.num_components);
5300 }
5301
5302 void visit_load_constant(isel_context *ctx, nir_intrinsic_instr *instr)
5303 {
5304 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5305
5306 Builder bld(ctx->program, ctx->block);
5307
5308 uint32_t desc_type = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
5309 S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
5310 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
5311 S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
5312 if (ctx->options->chip_class >= GFX10) {
5313 desc_type |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
5314 S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) |
5315 S_008F0C_RESOURCE_LEVEL(1);
5316 } else {
5317 desc_type |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
5318 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
5319 }
5320
5321 unsigned base = nir_intrinsic_base(instr);
5322 unsigned range = nir_intrinsic_range(instr);
5323
5324 Temp offset = get_ssa_temp(ctx, instr->src[0].ssa);
5325 if (base && offset.type() == RegType::sgpr)
5326 offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset, Operand(base));
5327 else if (base && offset.type() == RegType::vgpr)
5328 offset = bld.vadd32(bld.def(v1), Operand(base), offset);
5329
5330 Temp rsrc = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4),
5331 bld.sop1(aco_opcode::p_constaddr, bld.def(s2), bld.def(s1, scc), Operand(ctx->constant_data_offset)),
5332 Operand(MIN2(base + range, ctx->shader->constant_data_size)),
5333 Operand(desc_type));
5334 unsigned size = instr->dest.ssa.bit_size / 8;
5335 // TODO: get alignment information for subdword constants
5336 load_buffer(ctx, instr->num_components, size, dst, rsrc, offset, size, 0);
5337 }
5338
5339 void visit_discard_if(isel_context *ctx, nir_intrinsic_instr *instr)
5340 {
5341 if (ctx->cf_info.loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
5342 ctx->cf_info.exec_potentially_empty_discard = true;
5343
5344 ctx->program->needs_exact = true;
5345
5346 // TODO: optimize uniform conditions
5347 Builder bld(ctx->program, ctx->block);
5348 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
5349 assert(src.regClass() == bld.lm);
5350 src = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
5351 bld.pseudo(aco_opcode::p_discard_if, src);
5352 ctx->block->kind |= block_kind_uses_discard_if;
5353 return;
5354 }
5355
5356 void visit_discard(isel_context* ctx, nir_intrinsic_instr *instr)
5357 {
5358 Builder bld(ctx->program, ctx->block);
5359
5360 if (ctx->cf_info.loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
5361 ctx->cf_info.exec_potentially_empty_discard = true;
5362
5363 bool divergent = ctx->cf_info.parent_if.is_divergent ||
5364 ctx->cf_info.parent_loop.has_divergent_continue;
5365
5366 if (ctx->block->loop_nest_depth &&
5367 ((nir_instr_is_last(&instr->instr) && !divergent) || divergent)) {
5368 /* we handle discards the same way as jump instructions */
5369 append_logical_end(ctx->block);
5370
5371 /* in loops, discard behaves like break */
5372 Block *linear_target = ctx->cf_info.parent_loop.exit;
5373 ctx->block->kind |= block_kind_discard;
5374
5375 if (!divergent) {
5376 /* uniform discard - loop ends here */
5377 assert(nir_instr_is_last(&instr->instr));
5378 ctx->block->kind |= block_kind_uniform;
5379 ctx->cf_info.has_branch = true;
5380 bld.branch(aco_opcode::p_branch);
5381 add_linear_edge(ctx->block->index, linear_target);
5382 return;
5383 }
5384
5385    /* we add a break right after the discard() instruction */
5386 ctx->block->kind |= block_kind_break;
5387 unsigned idx = ctx->block->index;
5388
5389 ctx->cf_info.parent_loop.has_divergent_branch = true;
5390 ctx->cf_info.nir_to_aco[instr->instr.block->index] = idx;
5391
5392 /* remove critical edges from linear CFG */
5393 bld.branch(aco_opcode::p_branch);
5394 Block* break_block = ctx->program->create_and_insert_block();
5395 break_block->loop_nest_depth = ctx->cf_info.loop_nest_depth;
5396 break_block->kind |= block_kind_uniform;
5397 add_linear_edge(idx, break_block);
5398 add_linear_edge(break_block->index, linear_target);
5399 bld.reset(break_block);
5400 bld.branch(aco_opcode::p_branch);
5401
5402 Block* continue_block = ctx->program->create_and_insert_block();
5403 continue_block->loop_nest_depth = ctx->cf_info.loop_nest_depth;
5404 add_linear_edge(idx, continue_block);
5405 append_logical_start(continue_block);
5406 ctx->block = continue_block;
5407
5408 return;
5409 }
5410
5411 /* it can currently happen that NIR doesn't remove the unreachable code */
5412 if (!nir_instr_is_last(&instr->instr)) {
5413 ctx->program->needs_exact = true;
5414       /* save the current exec mask to a temporary so that the lanes to
5415        * discard aren't lost when outer exec masks overwrite exec */
5416 Temp cond = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), Operand(0xFFFFFFFF), Operand(exec, bld.lm));
5417 bld.pseudo(aco_opcode::p_discard_if, cond);
5418 ctx->block->kind |= block_kind_uses_discard_if;
5419 return;
5420 }
5421
5422 /* This condition is incorrect for uniformly branched discards in a loop
5423 * predicated by a divergent condition, but the above code catches that case
5424 * and the discard would end up turning into a discard_if.
5425 * For example:
5426 * if (divergent) {
5427 * while (...) {
5428 * if (uniform) {
5429 * discard;
5430 * }
5431 * }
5432 * }
5433 */
5434 if (!ctx->cf_info.parent_if.is_divergent) {
5435 /* program just ends here */
5436 ctx->block->kind |= block_kind_uniform;
5437 bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1),
5438 0 /* enabled mask */, 9 /* dest */,
5439 false /* compressed */, true/* done */, true /* valid mask */);
5440 bld.sopp(aco_opcode::s_endpgm);
5441 // TODO: s_endpgm may still be followed by a dead-code branch that only exists to sanitize NIR phis
5442 } else {
5443 ctx->block->kind |= block_kind_discard;
5444 /* branch and linear edge is added by visit_if() */
5445 }
5446 }
5447
5448 enum aco_descriptor_type {
5449 ACO_DESC_IMAGE,
5450 ACO_DESC_FMASK,
5451 ACO_DESC_SAMPLER,
5452 ACO_DESC_BUFFER,
5453 ACO_DESC_PLANE_0,
5454 ACO_DESC_PLANE_1,
5455 ACO_DESC_PLANE_2,
5456 };
5457
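/* Returns whether the image access needs array ("DA") addressing for the
 * resolved hardware dimension, i.e. cube maps and the 1D/2D array dims. */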
5458 static bool
5459 should_declare_array(isel_context *ctx, enum glsl_sampler_dim sampler_dim, bool is_array) {
5460 if (sampler_dim == GLSL_SAMPLER_DIM_BUF)
5461 return false;
5462 ac_image_dim dim = ac_get_sampler_dim(ctx->options->chip_class, sampler_dim, is_array);
5463 return dim == ac_image_cube ||
5464 dim == ac_image_1darray ||
5465 dim == ac_image_2darray ||
5466 dim == ac_image_2darraymsaa;
5467 }
5468
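/* Walks the NIR deref chain to find the descriptor set, binding and (possibly
 * indirect) array index, then loads the image/sampler/buffer descriptor from
 * the descriptor set pointer with an SMEM load. Immutable samplers are
 * materialized as constants, and PLANE_2 descriptors are stitched together
 * from two loads for multi-plane formats. */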
5469 Temp get_sampler_desc(isel_context *ctx, nir_deref_instr *deref_instr,
5470 enum aco_descriptor_type desc_type,
5471 const nir_tex_instr *tex_instr, bool image, bool write)
5472 {
5473 /* FIXME: we should lower the deref with some new nir_intrinsic_load_desc
5474 std::unordered_map<uint64_t, Temp>::iterator it = ctx->tex_desc.find((uint64_t) desc_type << 32 | deref_instr->dest.ssa.index);
5475 if (it != ctx->tex_desc.end())
5476 return it->second;
5477 */
5478 Temp index = Temp();
5479 bool index_set = false;
5480 unsigned constant_index = 0;
5481 unsigned descriptor_set;
5482 unsigned base_index;
5483 Builder bld(ctx->program, ctx->block);
5484
5485 if (!deref_instr) {
5486 assert(tex_instr && !image);
5487 descriptor_set = 0;
5488 base_index = tex_instr->sampler_index;
5489 } else {
5490 while(deref_instr->deref_type != nir_deref_type_var) {
5491 unsigned array_size = glsl_get_aoa_size(deref_instr->type);
5492 if (!array_size)
5493 array_size = 1;
5494
5495 assert(deref_instr->deref_type == nir_deref_type_array);
5496 nir_const_value *const_value = nir_src_as_const_value(deref_instr->arr.index);
5497 if (const_value) {
5498 constant_index += array_size * const_value->u32;
5499 } else {
5500 Temp indirect = get_ssa_temp(ctx, deref_instr->arr.index.ssa);
5501 if (indirect.type() == RegType::vgpr)
5502 indirect = bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), indirect);
5503
5504 if (array_size != 1)
5505 indirect = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(array_size), indirect);
5506
5507 if (!index_set) {
5508 index = indirect;
5509 index_set = true;
5510 } else {
5511 index = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), index, indirect);
5512 }
5513 }
5514
5515 deref_instr = nir_src_as_deref(deref_instr->parent);
5516 }
5517 descriptor_set = deref_instr->var->data.descriptor_set;
5518 base_index = deref_instr->var->data.binding;
5519 }
5520
5521 Temp list = load_desc_ptr(ctx, descriptor_set);
5522 list = convert_pointer_to_64_bit(ctx, list);
5523
5524 struct radv_descriptor_set_layout *layout = ctx->options->layout->set[descriptor_set].layout;
5525 struct radv_descriptor_set_binding_layout *binding = layout->binding + base_index;
5526 unsigned offset = binding->offset;
5527 unsigned stride = binding->size;
5528 aco_opcode opcode;
5529 RegClass type;
5530
5531 assert(base_index < layout->binding_count);
5532
5533 switch (desc_type) {
5534 case ACO_DESC_IMAGE:
5535 type = s8;
5536 opcode = aco_opcode::s_load_dwordx8;
5537 break;
5538 case ACO_DESC_FMASK:
5539 type = s8;
5540 opcode = aco_opcode::s_load_dwordx8;
5541 offset += 32;
5542 break;
5543 case ACO_DESC_SAMPLER:
5544 type = s4;
5545 opcode = aco_opcode::s_load_dwordx4;
5546 if (binding->type == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER)
5547 offset += radv_combined_image_descriptor_sampler_offset(binding);
5548 break;
5549 case ACO_DESC_BUFFER:
5550 type = s4;
5551 opcode = aco_opcode::s_load_dwordx4;
5552 break;
5553 case ACO_DESC_PLANE_0:
5554 case ACO_DESC_PLANE_1:
5555 type = s8;
5556 opcode = aco_opcode::s_load_dwordx8;
5557 offset += 32 * (desc_type - ACO_DESC_PLANE_0);
5558 break;
5559 case ACO_DESC_PLANE_2:
5560 type = s4;
5561 opcode = aco_opcode::s_load_dwordx4;
5562 offset += 64;
5563 break;
5564 default:
5565 unreachable("invalid desc_type\n");
5566 }
5567
5568 offset += constant_index * stride;
5569
5570 if (desc_type == ACO_DESC_SAMPLER && binding->immutable_samplers_offset &&
5571 (!index_set || binding->immutable_samplers_equal)) {
5572 if (binding->immutable_samplers_equal)
5573 constant_index = 0;
5574
5575 const uint32_t *samplers = radv_immutable_samplers(layout, binding);
5576 return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4),
5577 Operand(samplers[constant_index * 4 + 0]),
5578 Operand(samplers[constant_index * 4 + 1]),
5579 Operand(samplers[constant_index * 4 + 2]),
5580 Operand(samplers[constant_index * 4 + 3]));
5581 }
5582
5583 Operand off;
5584 if (!index_set) {
5585 off = bld.copy(bld.def(s1), Operand(offset));
5586 } else {
5587 off = Operand((Temp)bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), Operand(offset),
5588 bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(stride), index)));
5589 }
5590
5591 Temp res = bld.smem(opcode, bld.def(type), list, off);
5592
5593 if (desc_type == ACO_DESC_PLANE_2) {
5594 Temp components[8];
5595 for (unsigned i = 0; i < 8; i++)
5596 components[i] = bld.tmp(s1);
5597 bld.pseudo(aco_opcode::p_split_vector,
5598 Definition(components[0]),
5599 Definition(components[1]),
5600 Definition(components[2]),
5601 Definition(components[3]),
5602 res);
5603
5604 Temp desc2 = get_sampler_desc(ctx, deref_instr, ACO_DESC_PLANE_1, tex_instr, image, write);
5605 bld.pseudo(aco_opcode::p_split_vector,
5606 bld.def(s1), bld.def(s1), bld.def(s1), bld.def(s1),
5607 Definition(components[4]),
5608 Definition(components[5]),
5609 Definition(components[6]),
5610 Definition(components[7]),
5611 desc2);
5612
5613 res = bld.pseudo(aco_opcode::p_create_vector, bld.def(s8),
5614 components[0], components[1], components[2], components[3],
5615 components[4], components[5], components[6], components[7]);
5616 }
5617
5618 return res;
5619 }
5620
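/* Number of coordinate components an image access needs for the given
 * dimensionality, including the layer for arrays and the sample index for
 * multisampled images. */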
5621 static int image_type_to_components_count(enum glsl_sampler_dim dim, bool array)
5622 {
5623 switch (dim) {
5624 case GLSL_SAMPLER_DIM_BUF:
5625 return 1;
5626 case GLSL_SAMPLER_DIM_1D:
5627 return array ? 2 : 1;
5628 case GLSL_SAMPLER_DIM_2D:
5629 return array ? 3 : 2;
5630 case GLSL_SAMPLER_DIM_MS:
5631 return array ? 4 : 3;
5632 case GLSL_SAMPLER_DIM_3D:
5633 case GLSL_SAMPLER_DIM_CUBE:
5634 return 3;
5635 case GLSL_SAMPLER_DIM_RECT:
5636 case GLSL_SAMPLER_DIM_SUBPASS:
5637 return 2;
5638 case GLSL_SAMPLER_DIM_SUBPASS_MS:
5639 return 3;
5640 default:
5641 break;
5642 }
5643 return 0;
5644 }
5645
5646
5647 /* Adjust the sample index according to FMASK.
5648 *
5649 * For uncompressed MSAA surfaces, FMASK should return 0x76543210,
5650 * which is the identity mapping. Each nibble says which physical sample
5651 * should be fetched to get that sample.
5652 *
5653 * For example, 0x11111100 means there are only 2 samples stored and
5654 * the second sample covers 3/4 of the pixel. When reading samples 0
5655 * and 1, return physical sample 0 (determined by the first two 0s
5656 * in FMASK), otherwise return physical sample 1.
5657 *
5658 * The sample index should be adjusted as follows:
5659 * sample_index = (fmask >> (sample_index * 4)) & 0xF;
5660 */
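/* For example, with the fmask value 0x11111100 from above and
 * sample_index = 3: (0x11111100 >> 12) & 0xF = 1, so physical sample 1 is
 * fetched, while sample_index = 0 or 1 yields physical sample 0. */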
5661 static Temp adjust_sample_index_using_fmask(isel_context *ctx, bool da, std::vector<Temp>& coords, Operand sample_index, Temp fmask_desc_ptr)
5662 {
5663 Builder bld(ctx->program, ctx->block);
5664 Temp fmask = bld.tmp(v1);
5665 unsigned dim = ctx->options->chip_class >= GFX10
5666 ? ac_get_sampler_dim(ctx->options->chip_class, GLSL_SAMPLER_DIM_2D, da)
5667 : 0;
5668
5669 Temp coord = da ? bld.pseudo(aco_opcode::p_create_vector, bld.def(v3), coords[0], coords[1], coords[2]) :
5670 bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), coords[0], coords[1]);
5671 aco_ptr<MIMG_instruction> load{create_instruction<MIMG_instruction>(aco_opcode::image_load, Format::MIMG, 3, 1)};
5672 load->operands[0] = Operand(fmask_desc_ptr);
5673 load->operands[1] = Operand(s4); /* no sampler */
5674 load->operands[2] = Operand(coord);
5675 load->definitions[0] = Definition(fmask);
5676 load->glc = false;
5677 load->dlc = false;
5678 load->dmask = 0x1;
5679 load->unrm = true;
5680 load->da = da;
5681 load->dim = dim;
5682 load->can_reorder = true; /* fmask images shouldn't be modified */
5683 ctx->block->instructions.emplace_back(std::move(load));
5684
5685 Operand sample_index4;
5686 if (sample_index.isConstant()) {
5687 if (sample_index.constantValue() < 16) {
5688 sample_index4 = Operand(sample_index.constantValue() << 2);
5689 } else {
5690 sample_index4 = Operand(0u);
5691 }
5692 } else if (sample_index.regClass() == s1) {
5693 sample_index4 = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), sample_index, Operand(2u));
5694 } else {
5695 assert(sample_index.regClass() == v1);
5696 sample_index4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), sample_index);
5697 }
5698
5699 Temp final_sample;
5700 if (sample_index4.isConstant() && sample_index4.constantValue() == 0)
5701 final_sample = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(15u), fmask);
5702 else if (sample_index4.isConstant() && sample_index4.constantValue() == 28)
5703 final_sample = bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), Operand(28u), fmask);
5704 else
5705 final_sample = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), fmask, sample_index4, Operand(4u));
5706
5707 /* Don't rewrite the sample index if WORD1.DATA_FORMAT of the FMASK
5708 * resource descriptor is 0 (invalid).
5709 */
5710 Temp compare = bld.tmp(bld.lm);
5711 bld.vopc_e64(aco_opcode::v_cmp_lg_u32, Definition(compare),
5712 Operand(0u), emit_extract_vector(ctx, fmask_desc_ptr, 1, s1)).def(0).setHint(vcc);
5713
5714 Temp sample_index_v = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), sample_index);
5715
5716 /* Replace the MSAA sample index. */
5717 return bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), sample_index_v, final_sample, compare);
5718 }
5719
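/* Builds the coordinate vector for an image intrinsic: x/y/z/layer as needed,
 * the sample index (adjusted via FMASK for image loads) for multisampled
 * images, and the LOD for non-level-zero image_load/image_store. */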
5720 static Temp get_image_coords(isel_context *ctx, const nir_intrinsic_instr *instr, const struct glsl_type *type)
5721 {
5722
5723 Temp src0 = get_ssa_temp(ctx, instr->src[1].ssa);
5724 enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);
5725 bool is_array = glsl_sampler_type_is_array(type);
5726 ASSERTED bool add_frag_pos = (dim == GLSL_SAMPLER_DIM_SUBPASS || dim == GLSL_SAMPLER_DIM_SUBPASS_MS);
5727 assert(!add_frag_pos && "Input attachments should be lowered.");
5728 bool is_ms = (dim == GLSL_SAMPLER_DIM_MS || dim == GLSL_SAMPLER_DIM_SUBPASS_MS);
5729 bool gfx9_1d = ctx->options->chip_class == GFX9 && dim == GLSL_SAMPLER_DIM_1D;
5730 int count = image_type_to_components_count(dim, is_array);
5731 std::vector<Temp> coords(count);
5732 Builder bld(ctx->program, ctx->block);
5733
5734 if (is_ms) {
5735 count--;
5736 Temp src2 = get_ssa_temp(ctx, instr->src[2].ssa);
5737 /* get sample index */
5738 if (instr->intrinsic == nir_intrinsic_image_deref_load) {
5739 nir_const_value *sample_cv = nir_src_as_const_value(instr->src[2]);
5740 Operand sample_index = sample_cv ? Operand(sample_cv->u32) : Operand(emit_extract_vector(ctx, src2, 0, v1));
5741 std::vector<Temp> fmask_load_address;
5742 for (unsigned i = 0; i < (is_array ? 3 : 2); i++)
5743 fmask_load_address.emplace_back(emit_extract_vector(ctx, src0, i, v1));
5744
5745 Temp fmask_desc_ptr = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_FMASK, nullptr, false, false);
5746 coords[count] = adjust_sample_index_using_fmask(ctx, is_array, fmask_load_address, sample_index, fmask_desc_ptr);
5747 } else {
5748 coords[count] = emit_extract_vector(ctx, src2, 0, v1);
5749 }
5750 }
5751
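/* GFX9 addresses 1D images as 2D, so insert a zero Y coordinate and move
 * the layer index to the third component for arrays. */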
5752 if (gfx9_1d) {
5753 coords[0] = emit_extract_vector(ctx, src0, 0, v1);
5754 coords.resize(coords.size() + 1);
5755 coords[1] = bld.copy(bld.def(v1), Operand(0u));
5756 if (is_array)
5757 coords[2] = emit_extract_vector(ctx, src0, 1, v1);
5758 } else {
5759 for (int i = 0; i < count; i++)
5760 coords[i] = emit_extract_vector(ctx, src0, i, v1);
5761 }
5762
5763 if (instr->intrinsic == nir_intrinsic_image_deref_load ||
5764 instr->intrinsic == nir_intrinsic_image_deref_store) {
5765 int lod_index = instr->intrinsic == nir_intrinsic_image_deref_load ? 3 : 4;
5766 bool level_zero = nir_src_is_const(instr->src[lod_index]) && nir_src_as_uint(instr->src[lod_index]) == 0;
5767
5768 if (!level_zero)
5769 coords.emplace_back(get_ssa_temp(ctx, instr->src[lod_index].ssa));
5770 }
5771
5772 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, coords.size(), 1)};
5773 for (unsigned i = 0; i < coords.size(); i++)
5774 vec->operands[i] = Operand(coords[i]);
5775 Temp res = {ctx->program->allocateId(), RegClass(RegType::vgpr, coords.size())};
5776 vec->definitions[0] = Definition(res);
5777 ctx->block->instructions.emplace_back(std::move(vec));
5778 return res;
5779 }
5780
5781
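/* Image loads: buffer images go through MUBUF buffer_load_format_*, all other
 * dimensions through MIMG image_load(_mip). Only the components that are
 * actually read (dmask) are loaded and then expanded into the destination. */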
5782 void visit_image_load(isel_context *ctx, nir_intrinsic_instr *instr)
5783 {
5784 Builder bld(ctx->program, ctx->block);
5785 const nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
5786 const struct glsl_type *type = glsl_without_array(var->type);
5787 const enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);
5788 bool is_array = glsl_sampler_type_is_array(type);
5789 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5790
5791 if (dim == GLSL_SAMPLER_DIM_BUF) {
5792 unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa);
5793 unsigned num_channels = util_last_bit(mask);
5794 Temp rsrc = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_BUFFER, nullptr, true, true);
5795 Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
5796
5797 aco_opcode opcode;
5798 switch (num_channels) {
5799 case 1:
5800 opcode = aco_opcode::buffer_load_format_x;
5801 break;
5802 case 2:
5803 opcode = aco_opcode::buffer_load_format_xy;
5804 break;
5805 case 3:
5806 opcode = aco_opcode::buffer_load_format_xyz;
5807 break;
5808 case 4:
5809 opcode = aco_opcode::buffer_load_format_xyzw;
5810 break;
5811 default:
5812 unreachable(">4 channel buffer image load");
5813 }
5814 aco_ptr<MUBUF_instruction> load{create_instruction<MUBUF_instruction>(opcode, Format::MUBUF, 3, 1)};
5815 load->operands[0] = Operand(rsrc);
5816 load->operands[1] = Operand(vindex);
5817 load->operands[2] = Operand((uint32_t) 0);
5818 Temp tmp;
5819 if (num_channels == instr->dest.ssa.num_components && dst.type() == RegType::vgpr)
5820 tmp = dst;
5821 else
5822 tmp = {ctx->program->allocateId(), RegClass(RegType::vgpr, num_channels)};
5823 load->definitions[0] = Definition(tmp);
5824 load->idxen = true;
5825 load->glc = var->data.access & (ACCESS_VOLATILE | ACCESS_COHERENT);
5826 load->dlc = load->glc && ctx->options->chip_class >= GFX10;
5827 load->barrier = barrier_image;
5828 ctx->block->instructions.emplace_back(std::move(load));
5829
5830 expand_vector(ctx, tmp, dst, instr->dest.ssa.num_components, (1 << num_channels) - 1);
5831 return;
5832 }
5833
5834 Temp coords = get_image_coords(ctx, instr, type);
5835 Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_IMAGE, nullptr, true, true);
5836
5837 unsigned dmask = nir_ssa_def_components_read(&instr->dest.ssa);
5838 unsigned num_components = util_bitcount(dmask);
5839 Temp tmp;
5840 if (num_components == instr->dest.ssa.num_components && dst.type() == RegType::vgpr)
5841 tmp = dst;
5842 else
5843 tmp = {ctx->program->allocateId(), RegClass(RegType::vgpr, num_components)};
5844
5845 bool level_zero = nir_src_is_const(instr->src[3]) && nir_src_as_uint(instr->src[3]) == 0;
5846 aco_opcode opcode = level_zero ? aco_opcode::image_load : aco_opcode::image_load_mip;
5847
5848 aco_ptr<MIMG_instruction> load{create_instruction<MIMG_instruction>(opcode, Format::MIMG, 3, 1)};
5849 load->operands[0] = Operand(resource);
5850 load->operands[1] = Operand(s4); /* no sampler */
5851 load->operands[2] = Operand(coords);
5852 load->definitions[0] = Definition(tmp);
5853 load->glc = var->data.access & (ACCESS_VOLATILE | ACCESS_COHERENT) ? 1 : 0;
5854 load->dlc = load->glc && ctx->options->chip_class >= GFX10;
5855 load->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);
5856 load->dmask = dmask;
5857 load->unrm = true;
5858 load->da = should_declare_array(ctx, dim, glsl_sampler_type_is_array(type));
5859 load->barrier = barrier_image;
5860 ctx->block->instructions.emplace_back(std::move(load));
5861
5862 expand_vector(ctx, tmp, dst, instr->dest.ssa.num_components, dmask);
5863 return;
5864 }
5865
5866 void visit_image_store(isel_context *ctx, nir_intrinsic_instr *instr)
5867 {
5868 const nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
5869 const struct glsl_type *type = glsl_without_array(var->type);
5870 const enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);
5871 bool is_array = glsl_sampler_type_is_array(type);
5872 Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[3].ssa));
5873
5874 bool glc = ctx->options->chip_class == GFX6 || var->data.access & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE) ? 1 : 0;
5875
5876 if (dim == GLSL_SAMPLER_DIM_BUF) {
5877 Temp rsrc = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_BUFFER, nullptr, true, true);
5878 Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
5879 aco_opcode opcode;
5880 switch (data.size()) {
5881 case 1:
5882 opcode = aco_opcode::buffer_store_format_x;
5883 break;
5884 case 2:
5885 opcode = aco_opcode::buffer_store_format_xy;
5886 break;
5887 case 3:
5888 opcode = aco_opcode::buffer_store_format_xyz;
5889 break;
5890 case 4:
5891 opcode = aco_opcode::buffer_store_format_xyzw;
5892 break;
5893 default:
5894 unreachable(">4 channel buffer image store");
5895 }
5896 aco_ptr<MUBUF_instruction> store{create_instruction<MUBUF_instruction>(opcode, Format::MUBUF, 4, 0)};
5897 store->operands[0] = Operand(rsrc);
5898 store->operands[1] = Operand(vindex);
5899 store->operands[2] = Operand((uint32_t) 0);
5900 store->operands[3] = Operand(data);
5901 store->idxen = true;
5902 store->glc = glc;
5903 store->dlc = false;
5904 store->disable_wqm = true;
5905 store->barrier = barrier_image;
5906 ctx->program->needs_exact = true;
5907 ctx->block->instructions.emplace_back(std::move(store));
5908 return;
5909 }
5910
5911 assert(data.type() == RegType::vgpr);
5912 Temp coords = get_image_coords(ctx, instr, type);
5913 Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_IMAGE, nullptr, true, true);
5914
5915 bool level_zero = nir_src_is_const(instr->src[4]) && nir_src_as_uint(instr->src[4]) == 0;
5916 aco_opcode opcode = level_zero ? aco_opcode::image_store : aco_opcode::image_store_mip;
5917
5918 aco_ptr<MIMG_instruction> store{create_instruction<MIMG_instruction>(opcode, Format::MIMG, 3, 0)};
5919 store->operands[0] = Operand(resource);
5920 store->operands[1] = Operand(data);
5921 store->operands[2] = Operand(coords);
5922 store->glc = glc;
5923 store->dlc = false;
5924 store->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);
5925 store->dmask = (1 << data.size()) - 1;
5926 store->unrm = true;
5927 store->da = should_declare_array(ctx, dim, glsl_sampler_type_is_array(type));
5928 store->disable_wqm = true;
5929 store->barrier = barrier_image;
5930 ctx->program->needs_exact = true;
5931 ctx->block->instructions.emplace_back(std::move(store));
5932 return;
5933 }
5934
5935 void visit_image_atomic(isel_context *ctx, nir_intrinsic_instr *instr)
5936 {
5937 /* return the previous value if dest is ever used */
5938 bool return_previous = false;
5939 nir_foreach_use_safe(use_src, &instr->dest.ssa) {
5940 return_previous = true;
5941 break;
5942 }
5943 nir_foreach_if_use_safe(use_src, &instr->dest.ssa) {
5944 return_previous = true;
5945 break;
5946 }
5947
5948 const nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
5949 const struct glsl_type *type = glsl_without_array(var->type);
5950 const enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);
5951 bool is_array = glsl_sampler_type_is_array(type);
5952 Builder bld(ctx->program, ctx->block);
5953
5954 Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[3].ssa));
5955 assert(data.size() == 1 && "64bit image atomics not yet implemented.");
5956
5957 if (instr->intrinsic == nir_intrinsic_image_deref_atomic_comp_swap)
5958 data = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), get_ssa_temp(ctx, instr->src[4].ssa), data);
5959
5960 aco_opcode buf_op, image_op;
5961 switch (instr->intrinsic) {
5962 case nir_intrinsic_image_deref_atomic_add:
5963 buf_op = aco_opcode::buffer_atomic_add;
5964 image_op = aco_opcode::image_atomic_add;
5965 break;
5966 case nir_intrinsic_image_deref_atomic_umin:
5967 buf_op = aco_opcode::buffer_atomic_umin;
5968 image_op = aco_opcode::image_atomic_umin;
5969 break;
5970 case nir_intrinsic_image_deref_atomic_imin:
5971 buf_op = aco_opcode::buffer_atomic_smin;
5972 image_op = aco_opcode::image_atomic_smin;
5973 break;
5974 case nir_intrinsic_image_deref_atomic_umax:
5975 buf_op = aco_opcode::buffer_atomic_umax;
5976 image_op = aco_opcode::image_atomic_umax;
5977 break;
5978 case nir_intrinsic_image_deref_atomic_imax:
5979 buf_op = aco_opcode::buffer_atomic_smax;
5980 image_op = aco_opcode::image_atomic_smax;
5981 break;
5982 case nir_intrinsic_image_deref_atomic_and:
5983 buf_op = aco_opcode::buffer_atomic_and;
5984 image_op = aco_opcode::image_atomic_and;
5985 break;
5986 case nir_intrinsic_image_deref_atomic_or:
5987 buf_op = aco_opcode::buffer_atomic_or;
5988 image_op = aco_opcode::image_atomic_or;
5989 break;
5990 case nir_intrinsic_image_deref_atomic_xor:
5991 buf_op = aco_opcode::buffer_atomic_xor;
5992 image_op = aco_opcode::image_atomic_xor;
5993 break;
5994 case nir_intrinsic_image_deref_atomic_exchange:
5995 buf_op = aco_opcode::buffer_atomic_swap;
5996 image_op = aco_opcode::image_atomic_swap;
5997 break;
5998 case nir_intrinsic_image_deref_atomic_comp_swap:
5999 buf_op = aco_opcode::buffer_atomic_cmpswap;
6000 image_op = aco_opcode::image_atomic_cmpswap;
6001 break;
6002 default:
6003 unreachable("visit_image_atomic should only be called with nir_intrinsic_image_deref_atomic_* instructions.");
6004 }
6005
6006 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6007
6008 if (dim == GLSL_SAMPLER_DIM_BUF) {
6009 Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
6010 Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_BUFFER, nullptr, true, true);
6011 //assert(ctx->options->chip_class < GFX9 && "GFX9 stride size workaround not yet implemented.");
6012 aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(buf_op, Format::MUBUF, 4, return_previous ? 1 : 0)};
6013 mubuf->operands[0] = Operand(resource);
6014 mubuf->operands[1] = Operand(vindex);
6015 mubuf->operands[2] = Operand((uint32_t)0);
6016 mubuf->operands[3] = Operand(data);
6017 if (return_previous)
6018 mubuf->definitions[0] = Definition(dst);
6019 mubuf->offset = 0;
6020 mubuf->idxen = true;
6021 mubuf->glc = return_previous;
6022 mubuf->dlc = false; /* Not needed for atomics */
6023 mubuf->disable_wqm = true;
6024 mubuf->barrier = barrier_image;
6025 ctx->program->needs_exact = true;
6026 ctx->block->instructions.emplace_back(std::move(mubuf));
6027 return;
6028 }
6029
6030 Temp coords = get_image_coords(ctx, instr, type);
6031 Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_IMAGE, nullptr, true, true);
6032 aco_ptr<MIMG_instruction> mimg{create_instruction<MIMG_instruction>(image_op, Format::MIMG, 3, return_previous ? 1 : 0)};
6033 mimg->operands[0] = Operand(resource);
6034 mimg->operands[1] = Operand(data);
6035 mimg->operands[2] = Operand(coords);
6036 if (return_previous)
6037 mimg->definitions[0] = Definition(dst);
6038 mimg->glc = return_previous;
6039 mimg->dlc = false; /* Not needed for atomics */
6040 mimg->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);
6041 mimg->dmask = (1 << data.size()) - 1;
6042 mimg->unrm = true;
6043 mimg->da = should_declare_array(ctx, dim, glsl_sampler_type_is_array(type));
6044 mimg->disable_wqm = true;
6045 mimg->barrier = barrier_image;
6046 ctx->program->needs_exact = true;
6047 ctx->block->instructions.emplace_back(std::move(mimg));
6048 return;
6049 }
6050
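/* Extracts the buffer size from a resource descriptor (word 2). With
 * in_elements on GFX8, the size is additionally divided by the element
 * stride, which can only be 1, 2, 4, 8, 12 or 16 here. */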
6051 void get_buffer_size(isel_context *ctx, Temp desc, Temp dst, bool in_elements)
6052 {
6053 if (in_elements && ctx->options->chip_class == GFX8) {
6054 /* we only have to divide by 1, 2, 4, 8, 12 or 16 */
6055 Builder bld(ctx->program, ctx->block);
6056
6057 Temp size = emit_extract_vector(ctx, desc, 2, s1);
6058
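/* 0xAAAAAAAB == ceil(2^33 / 3), so mul_hi(size, 0xAAAAAAAB) >> 1 computes
 * size / 3 for any 32-bit size. */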
6059 Temp size_div3 = bld.vop3(aco_opcode::v_mul_hi_u32, bld.def(v1), bld.copy(bld.def(v1), Operand(0xaaaaaaabu)), size);
6060 size_div3 = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.as_uniform(size_div3), Operand(1u));
6061
6062 Temp stride = emit_extract_vector(ctx, desc, 1, s1);
6063 stride = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), stride, Operand((5u << 16) | 16u));
6064
6065 Temp is12 = bld.sopc(aco_opcode::s_cmp_eq_i32, bld.def(s1, scc), stride, Operand(12u));
6066 size = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), size_div3, size, bld.scc(is12));
6067
6068 Temp shr_dst = dst.type() == RegType::vgpr ? bld.tmp(s1) : dst;
6069 bld.sop2(aco_opcode::s_lshr_b32, Definition(shr_dst), bld.def(s1, scc),
6070 size, bld.sop1(aco_opcode::s_ff1_i32_b32, bld.def(s1), stride));
6071 if (dst.type() == RegType::vgpr)
6072 bld.copy(Definition(dst), shr_dst);
6073
6074 /* TODO: we can probably calculate this faster with v_skip when stride != 12 */
6075 } else {
6076 emit_extract_vector(ctx, desc, 2, dst);
6077 }
6078 }
6079
6080 void visit_image_size(isel_context *ctx, nir_intrinsic_instr *instr)
6081 {
6082 const nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
6083 const struct glsl_type *type = glsl_without_array(var->type);
6084 const enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);
6085 bool is_array = glsl_sampler_type_is_array(type);
6086 Builder bld(ctx->program, ctx->block);
6087
6088 if (glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_BUF) {
6089 Temp desc = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_BUFFER, NULL, true, false);
6090 return get_buffer_size(ctx, desc, get_ssa_temp(ctx, &instr->dest.ssa), true);
6091 }
6092
6093 /* LOD */
6094 Temp lod = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0u));
6095
6096 /* Resource */
6097 Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_IMAGE, NULL, true, false);
6098
6099 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6100
6101 aco_ptr<MIMG_instruction> mimg{create_instruction<MIMG_instruction>(aco_opcode::image_get_resinfo, Format::MIMG, 3, 1)};
6102 mimg->operands[0] = Operand(resource);
6103 mimg->operands[1] = Operand(s4); /* no sampler */
6104 mimg->operands[2] = Operand(lod);
6105 uint8_t& dmask = mimg->dmask;
6106 mimg->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);
6107 mimg->dmask = (1 << instr->dest.ssa.num_components) - 1;
6108 mimg->da = glsl_sampler_type_is_array(type);
6109 mimg->can_reorder = true;
6110 Definition& def = mimg->definitions[0];
6111 ctx->block->instructions.emplace_back(std::move(mimg));
6112
6113 if (glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_CUBE &&
6114 glsl_sampler_type_is_array(type)) {
6115
6116 assert(instr->dest.ssa.num_components == 3);
6117 Temp tmp = {ctx->program->allocateId(), v3};
6118 def = Definition(tmp);
6119 emit_split_vector(ctx, tmp, 3);
6120
6121 /* divide 3rd value by 6 by multiplying with magic number */
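/* 0x2AAAAAAB == ceil(2^32 / 6), so v_mul_hi_i32(x, 0x2AAAAAAB) yields x / 6
 * for the non-negative values returned by resinfo (6 * layers for cube arrays). */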
6122 Temp c = bld.copy(bld.def(s1), Operand((uint32_t) 0x2AAAAAAB));
6123 Temp by_6 = bld.vop3(aco_opcode::v_mul_hi_i32, bld.def(v1), emit_extract_vector(ctx, tmp, 2, v1), c);
6124
6125 bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
6126 emit_extract_vector(ctx, tmp, 0, v1),
6127 emit_extract_vector(ctx, tmp, 1, v1),
6128 by_6);
6129
6130 } else if (ctx->options->chip_class == GFX9 &&
6131 glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_1D &&
6132 glsl_sampler_type_is_array(type)) {
6133 assert(instr->dest.ssa.num_components == 2);
6134 def = Definition(dst);
6135 dmask = 0x5;
6136 } else {
6137 def = Definition(dst);
6138 }
6139
6140 emit_split_vector(ctx, dst, instr->dest.ssa.num_components);
6141 }
6142
6143 void visit_load_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
6144 {
6145 Builder bld(ctx->program, ctx->block);
6146 unsigned num_components = instr->num_components;
6147
6148 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6149 Temp rsrc = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
6150 rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u));
6151
6152 unsigned access = nir_intrinsic_access(instr);
6153 bool glc = access & (ACCESS_VOLATILE | ACCESS_COHERENT);
6154 unsigned size = instr->dest.ssa.bit_size / 8;
6155
6156 uint32_t flags = get_all_buffer_resource_flags(ctx, instr->src[0].ssa, access);
6157 /* GLC bypasses VMEM/SMEM caches, so GLC SMEM loads/stores are coherent with GLC VMEM loads/stores
6158 * TODO: this optimization is disabled for now because we still need to ensure correct ordering
6159 */
6160 bool allow_smem = !(flags & (0 && glc ? has_nonglc_vmem_store : has_vmem_store));
6161 allow_smem |= ((access & ACCESS_RESTRICT) && (access & ACCESS_NON_WRITEABLE)) || (access & ACCESS_CAN_REORDER);
6162
6163 load_buffer(ctx, num_components, size, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa),
6164 nir_intrinsic_align_mul(instr), nir_intrinsic_align_offset(instr), glc, false, allow_smem);
6165 }
6166
6167 void visit_store_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
6168 {
6169 Builder bld(ctx->program, ctx->block);
6170 Temp data = get_ssa_temp(ctx, instr->src[0].ssa);
6171 unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
6172 unsigned writemask = widen_mask(nir_intrinsic_write_mask(instr), elem_size_bytes);
6173 Temp offset = get_ssa_temp(ctx, instr->src[2].ssa);
6174
6175 Temp rsrc = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
6176 rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u));
6177
6178 bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
6179 uint32_t flags = get_all_buffer_resource_flags(ctx, instr->src[1].ssa, nir_intrinsic_access(instr));
6180 /* GLC bypasses VMEM/SMEM caches, so GLC SMEM loads/stores are coherent with GLC VMEM loads/stores
6181 * TODO: this optimization is disabled for now because we still need to ensure correct ordering
6182 */
6183 bool allow_smem = !(flags & (0 && glc ? has_nonglc_vmem_loadstore : has_vmem_loadstore));
6184
6185 bool smem = !nir_src_is_divergent(instr->src[2]) &&
6186 ctx->options->chip_class >= GFX8 &&
6187 (elem_size_bytes >= 4 || can_subdword_ssbo_store_use_smem(instr)) &&
6188 allow_smem;
6189 if (smem)
6190 offset = bld.as_uniform(offset);
6191 bool smem_nonfs = smem && ctx->stage != fragment_fs;
6192
6193 unsigned write_count = 0;
6194 Temp write_datas[32];
6195 unsigned offsets[32];
6196 split_buffer_store(ctx, instr, smem, smem_nonfs ? RegType::sgpr : (smem ? data.type() : RegType::vgpr),
6197 data, writemask, 16, &write_count, write_datas, offsets);
6198
6199 for (unsigned i = 0; i < write_count; i++) {
6200 aco_opcode op = get_buffer_store_op(smem, write_datas[i].bytes());
6201 if (smem && ctx->stage == fragment_fs)
6202 op = aco_opcode::p_fs_buffer_store_smem;
6203
6204 if (smem) {
6205 aco_ptr<SMEM_instruction> store{create_instruction<SMEM_instruction>(op, Format::SMEM, 3, 0)};
6206 store->operands[0] = Operand(rsrc);
6207 if (offsets[i]) {
6208 Temp off = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
6209 offset, Operand(offsets[i]));
6210 store->operands[1] = Operand(off);
6211 } else {
6212 store->operands[1] = Operand(offset);
6213 }
6214 if (op != aco_opcode::p_fs_buffer_store_smem)
6215 store->operands[1].setFixed(m0);
6216 store->operands[2] = Operand(write_datas[i]);
6217 store->glc = glc;
6218 store->dlc = false;
6219 store->disable_wqm = true;
6220 store->barrier = barrier_buffer;
6221 ctx->block->instructions.emplace_back(std::move(store));
6222 ctx->program->wb_smem_l1_on_end = true;
6223 if (op == aco_opcode::p_fs_buffer_store_smem) {
6224 ctx->block->kind |= block_kind_needs_lowering;
6225 ctx->program->needs_exact = true;
6226 }
6227 } else {
6228 aco_ptr<MUBUF_instruction> store{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, 0)};
6229 store->operands[0] = Operand(rsrc);
6230 store->operands[1] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
6231 store->operands[2] = offset.type() == RegType::sgpr ? Operand(offset) : Operand((uint32_t) 0);
6232 store->operands[3] = Operand(write_datas[i]);
6233 store->offset = offsets[i];
6234 store->offen = (offset.type() == RegType::vgpr);
6235 store->glc = glc;
6236 store->dlc = false;
6237 store->disable_wqm = true;
6238 store->barrier = barrier_buffer;
6239 ctx->program->needs_exact = true;
6240 ctx->block->instructions.emplace_back(std::move(store));
6241 }
6242 }
6243 }
6244
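/* SSBO atomics are lowered to MUBUF buffer atomics; GLC (return the previous
 * value) is only set when the NIR destination is actually used. */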
6245 void visit_atomic_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
6246 {
6247 /* return the previous value if dest is ever used */
6248 bool return_previous = false;
6249 nir_foreach_use_safe(use_src, &instr->dest.ssa) {
6250 return_previous = true;
6251 break;
6252 }
6253 nir_foreach_if_use_safe(use_src, &instr->dest.ssa) {
6254 return_previous = true;
6255 break;
6256 }
6257
6258 Builder bld(ctx->program, ctx->block);
6259 Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[2].ssa));
6260
6261 if (instr->intrinsic == nir_intrinsic_ssbo_atomic_comp_swap)
6262 data = bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, data.size() * 2),
6263 get_ssa_temp(ctx, instr->src[3].ssa), data);
6264
6265 Temp offset = get_ssa_temp(ctx, instr->src[1].ssa);
6266 Temp rsrc = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
6267 rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u));
6268
6269 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6270
6271 aco_opcode op32, op64;
6272 switch (instr->intrinsic) {
6273 case nir_intrinsic_ssbo_atomic_add:
6274 op32 = aco_opcode::buffer_atomic_add;
6275 op64 = aco_opcode::buffer_atomic_add_x2;
6276 break;
6277 case nir_intrinsic_ssbo_atomic_imin:
6278 op32 = aco_opcode::buffer_atomic_smin;
6279 op64 = aco_opcode::buffer_atomic_smin_x2;
6280 break;
6281 case nir_intrinsic_ssbo_atomic_umin:
6282 op32 = aco_opcode::buffer_atomic_umin;
6283 op64 = aco_opcode::buffer_atomic_umin_x2;
6284 break;
6285 case nir_intrinsic_ssbo_atomic_imax:
6286 op32 = aco_opcode::buffer_atomic_smax;
6287 op64 = aco_opcode::buffer_atomic_smax_x2;
6288 break;
6289 case nir_intrinsic_ssbo_atomic_umax:
6290 op32 = aco_opcode::buffer_atomic_umax;
6291 op64 = aco_opcode::buffer_atomic_umax_x2;
6292 break;
6293 case nir_intrinsic_ssbo_atomic_and:
6294 op32 = aco_opcode::buffer_atomic_and;
6295 op64 = aco_opcode::buffer_atomic_and_x2;
6296 break;
6297 case nir_intrinsic_ssbo_atomic_or:
6298 op32 = aco_opcode::buffer_atomic_or;
6299 op64 = aco_opcode::buffer_atomic_or_x2;
6300 break;
6301 case nir_intrinsic_ssbo_atomic_xor:
6302 op32 = aco_opcode::buffer_atomic_xor;
6303 op64 = aco_opcode::buffer_atomic_xor_x2;
6304 break;
6305 case nir_intrinsic_ssbo_atomic_exchange:
6306 op32 = aco_opcode::buffer_atomic_swap;
6307 op64 = aco_opcode::buffer_atomic_swap_x2;
6308 break;
6309 case nir_intrinsic_ssbo_atomic_comp_swap:
6310 op32 = aco_opcode::buffer_atomic_cmpswap;
6311 op64 = aco_opcode::buffer_atomic_cmpswap_x2;
6312 break;
6313 default:
6314 unreachable("visit_atomic_ssbo should only be called with nir_intrinsic_ssbo_atomic_* instructions.");
6315 }
6316 aco_opcode op = instr->dest.ssa.bit_size == 32 ? op32 : op64;
6317 aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, return_previous ? 1 : 0)};
6318 mubuf->operands[0] = Operand(rsrc);
6319 mubuf->operands[1] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
6320 mubuf->operands[2] = offset.type() == RegType::sgpr ? Operand(offset) : Operand((uint32_t) 0);
6321 mubuf->operands[3] = Operand(data);
6322 if (return_previous)
6323 mubuf->definitions[0] = Definition(dst);
6324 mubuf->offset = 0;
6325 mubuf->offen = (offset.type() == RegType::vgpr);
6326 mubuf->glc = return_previous;
6327 mubuf->dlc = false; /* Not needed for atomics */
6328 mubuf->disable_wqm = true;
6329 mubuf->barrier = barrier_buffer;
6330 ctx->program->needs_exact = true;
6331 ctx->block->instructions.emplace_back(std::move(mubuf));
6332 }
6333
6334 void visit_get_buffer_size(isel_context *ctx, nir_intrinsic_instr *instr) {
6335
6336 Temp index = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
6337 Builder bld(ctx->program, ctx->block);
6338 Temp desc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), index, Operand(0u));
6339 get_buffer_size(ctx, desc, get_ssa_temp(ctx, &instr->dest.ssa), false);
6340 }
6341
6342 void visit_load_global(isel_context *ctx, nir_intrinsic_instr *instr)
6343 {
6344 Builder bld(ctx->program, ctx->block);
6345 unsigned num_components = instr->num_components;
6346 unsigned component_size = instr->dest.ssa.bit_size / 8;
6347
6348 LoadEmitInfo info = {Operand(get_ssa_temp(ctx, instr->src[0].ssa)),
6349 get_ssa_temp(ctx, &instr->dest.ssa),
6350 num_components, component_size};
6351 info.glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT);
6352 info.align_mul = nir_intrinsic_align_mul(instr);
6353 info.align_offset = nir_intrinsic_align_offset(instr);
6354 info.barrier = barrier_buffer;
6355 info.can_reorder = false;
6356 /* VMEM stores don't update the SMEM cache and it's difficult to prove that
6357 * it's safe to use SMEM */
6358 bool can_use_smem = nir_intrinsic_access(instr) & ACCESS_NON_WRITEABLE;
6359 if (info.dst.type() == RegType::vgpr || (info.glc && ctx->options->chip_class < GFX8) || !can_use_smem) {
6360 emit_global_load(ctx, bld, &info);
6361 } else {
6362 info.offset = Operand(bld.as_uniform(info.offset));
6363 emit_smem_load(ctx, bld, &info);
6364 }
6365 }
6366
6367 void visit_store_global(isel_context *ctx, nir_intrinsic_instr *instr)
6368 {
6369 Builder bld(ctx->program, ctx->block);
6370 unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
6371 unsigned writemask = widen_mask(nir_intrinsic_write_mask(instr), elem_size_bytes);
6372
6373 Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
6374 Temp addr = get_ssa_temp(ctx, instr->src[1].ssa);
6375 bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
6376
6377 if (ctx->options->chip_class >= GFX7)
6378 addr = as_vgpr(ctx, addr);
6379
6380 unsigned write_count = 0;
6381 Temp write_datas[32];
6382 unsigned offsets[32];
6383 split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask,
6384 16, &write_count, write_datas, offsets);
6385
6386 for (unsigned i = 0; i < write_count; i++) {
6387 if (ctx->options->chip_class >= GFX7) {
6388 unsigned offset = offsets[i];
6389 Temp store_addr = addr;
6390 if (offset > 0 && ctx->options->chip_class < GFX9) {
6391 Temp addr0 = bld.tmp(v1), addr1 = bld.tmp(v1);
6392 Temp new_addr0 = bld.tmp(v1), new_addr1 = bld.tmp(v1);
6393 Temp carry = bld.tmp(bld.lm);
6394 bld.pseudo(aco_opcode::p_split_vector, Definition(addr0), Definition(addr1), addr);
6395
6396 bld.vop2(aco_opcode::v_add_co_u32, Definition(new_addr0), bld.hint_vcc(Definition(carry)),
6397 Operand(offset), addr0);
6398 bld.vop2(aco_opcode::v_addc_co_u32, Definition(new_addr1), bld.def(bld.lm),
6399 Operand(0u), addr1,
6400 carry).def(1).setHint(vcc);
6401
6402 store_addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), new_addr0, new_addr1);
6403
6404 offset = 0;
6405 }
6406
6407 bool global = ctx->options->chip_class >= GFX9;
6408 aco_opcode op;
6409 switch (write_datas[i].bytes()) {
6410 case 1:
6411 op = global ? aco_opcode::global_store_byte : aco_opcode::flat_store_byte;
6412 break;
6413 case 2:
6414 op = global ? aco_opcode::global_store_short : aco_opcode::flat_store_short;
6415 break;
6416 case 4:
6417 op = global ? aco_opcode::global_store_dword : aco_opcode::flat_store_dword;
6418 break;
6419 case 8:
6420 op = global ? aco_opcode::global_store_dwordx2 : aco_opcode::flat_store_dwordx2;
6421 break;
6422 case 12:
6423 op = global ? aco_opcode::global_store_dwordx3 : aco_opcode::flat_store_dwordx3;
6424 break;
6425 case 16:
6426 op = global ? aco_opcode::global_store_dwordx4 : aco_opcode::flat_store_dwordx4;
6427 break;
6428 default:
6429 unreachable("store_global not implemented for this size.");
6430 }
6431
6432 aco_ptr<FLAT_instruction> flat{create_instruction<FLAT_instruction>(op, global ? Format::GLOBAL : Format::FLAT, 3, 0)};
6433 flat->operands[0] = Operand(store_addr);
6434 flat->operands[1] = Operand(s1);
6435 flat->operands[2] = Operand(write_datas[i]);
6436 flat->glc = glc;
6437 flat->dlc = false;
6438 flat->offset = offset;
6439 flat->disable_wqm = true;
6440 flat->barrier = barrier_buffer;
6441 ctx->program->needs_exact = true;
6442 ctx->block->instructions.emplace_back(std::move(flat));
6443 } else {
6444 assert(ctx->options->chip_class == GFX6);
6445
6446 aco_opcode op = get_buffer_store_op(false, write_datas[i].bytes());
6447
6448 Temp rsrc = get_gfx6_global_rsrc(bld, addr);
6449
6450 aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, 0)};
6451 mubuf->operands[0] = Operand(rsrc);
6452 mubuf->operands[1] = addr.type() == RegType::vgpr ? Operand(addr) : Operand(v1);
6453 mubuf->operands[2] = Operand(0u);
6454 mubuf->operands[3] = Operand(write_datas[i]);
6455 mubuf->glc = glc;
6456 mubuf->dlc = false;
6457 mubuf->offset = offsets[i];
6458 mubuf->addr64 = addr.type() == RegType::vgpr;
6459 mubuf->disable_wqm = true;
6460 mubuf->barrier = barrier_buffer;
6461 ctx->program->needs_exact = true;
6462 ctx->block->instructions.emplace_back(std::move(mubuf));
6463 }
6464 }
6465 }
6466
6467 void visit_global_atomic(isel_context *ctx, nir_intrinsic_instr *instr)
6468 {
6469 /* return the previous value if dest is ever used */
6470 bool return_previous = false;
6471 nir_foreach_use_safe(use_src, &instr->dest.ssa) {
6472 return_previous = true;
6473 break;
6474 }
6475 nir_foreach_if_use_safe(use_src, &instr->dest.ssa) {
6476 return_previous = true;
6477 break;
6478 }
6479
6480 Builder bld(ctx->program, ctx->block);
6481 Temp addr = get_ssa_temp(ctx, instr->src[0].ssa);
6482 Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
6483
6484 if (ctx->options->chip_class >= GFX7)
6485 addr = as_vgpr(ctx, addr);
6486
6487 if (instr->intrinsic == nir_intrinsic_global_atomic_comp_swap)
6488 data = bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, data.size() * 2),
6489 get_ssa_temp(ctx, instr->src[2].ssa), data);
6490
6491 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6492
6493 aco_opcode op32, op64;
6494
6495 if (ctx->options->chip_class >= GFX7) {
6496 bool global = ctx->options->chip_class >= GFX9;
6497 switch (instr->intrinsic) {
6498 case nir_intrinsic_global_atomic_add:
6499 op32 = global ? aco_opcode::global_atomic_add : aco_opcode::flat_atomic_add;
6500 op64 = global ? aco_opcode::global_atomic_add_x2 : aco_opcode::flat_atomic_add_x2;
6501 break;
6502 case nir_intrinsic_global_atomic_imin:
6503 op32 = global ? aco_opcode::global_atomic_smin : aco_opcode::flat_atomic_smin;
6504 op64 = global ? aco_opcode::global_atomic_smin_x2 : aco_opcode::flat_atomic_smin_x2;
6505 break;
6506 case nir_intrinsic_global_atomic_umin:
6507 op32 = global ? aco_opcode::global_atomic_umin : aco_opcode::flat_atomic_umin;
6508 op64 = global ? aco_opcode::global_atomic_umin_x2 : aco_opcode::flat_atomic_umin_x2;
6509 break;
6510 case nir_intrinsic_global_atomic_imax:
6511 op32 = global ? aco_opcode::global_atomic_smax : aco_opcode::flat_atomic_smax;
6512 op64 = global ? aco_opcode::global_atomic_smax_x2 : aco_opcode::flat_atomic_smax_x2;
6513 break;
6514 case nir_intrinsic_global_atomic_umax:
6515 op32 = global ? aco_opcode::global_atomic_umax : aco_opcode::flat_atomic_umax;
6516 op64 = global ? aco_opcode::global_atomic_umax_x2 : aco_opcode::flat_atomic_umax_x2;
6517 break;
6518 case nir_intrinsic_global_atomic_and:
6519 op32 = global ? aco_opcode::global_atomic_and : aco_opcode::flat_atomic_and;
6520 op64 = global ? aco_opcode::global_atomic_and_x2 : aco_opcode::flat_atomic_and_x2;
6521 break;
6522 case nir_intrinsic_global_atomic_or:
6523 op32 = global ? aco_opcode::global_atomic_or : aco_opcode::flat_atomic_or;
6524 op64 = global ? aco_opcode::global_atomic_or_x2 : aco_opcode::flat_atomic_or_x2;
6525 break;
6526 case nir_intrinsic_global_atomic_xor:
6527 op32 = global ? aco_opcode::global_atomic_xor : aco_opcode::flat_atomic_xor;
6528 op64 = global ? aco_opcode::global_atomic_xor_x2 : aco_opcode::flat_atomic_xor_x2;
6529 break;
6530 case nir_intrinsic_global_atomic_exchange:
6531 op32 = global ? aco_opcode::global_atomic_swap : aco_opcode::flat_atomic_swap;
6532 op64 = global ? aco_opcode::global_atomic_swap_x2 : aco_opcode::flat_atomic_swap_x2;
6533 break;
6534 case nir_intrinsic_global_atomic_comp_swap:
6535 op32 = global ? aco_opcode::global_atomic_cmpswap : aco_opcode::flat_atomic_cmpswap;
6536 op64 = global ? aco_opcode::global_atomic_cmpswap_x2 : aco_opcode::flat_atomic_cmpswap_x2;
6537 break;
6538 default:
6539 unreachable("visit_atomic_global should only be called with nir_intrinsic_global_atomic_* instructions.");
6540 }
6541
6542 aco_opcode op = instr->dest.ssa.bit_size == 32 ? op32 : op64;
6543 aco_ptr<FLAT_instruction> flat{create_instruction<FLAT_instruction>(op, global ? Format::GLOBAL : Format::FLAT, 3, return_previous ? 1 : 0)};
6544 flat->operands[0] = Operand(addr);
6545 flat->operands[1] = Operand(s1);
6546 flat->operands[2] = Operand(data);
6547 if (return_previous)
6548 flat->definitions[0] = Definition(dst);
6549 flat->glc = return_previous;
6550 flat->dlc = false; /* Not needed for atomics */
6551 flat->offset = 0;
6552 flat->disable_wqm = true;
6553 flat->barrier = barrier_buffer;
6554 ctx->program->needs_exact = true;
6555 ctx->block->instructions.emplace_back(std::move(flat));
6556 } else {
6557 assert(ctx->options->chip_class == GFX6);
6558
6559 switch (instr->intrinsic) {
6560 case nir_intrinsic_global_atomic_add:
6561 op32 = aco_opcode::buffer_atomic_add;
6562 op64 = aco_opcode::buffer_atomic_add_x2;
6563 break;
6564 case nir_intrinsic_global_atomic_imin:
6565 op32 = aco_opcode::buffer_atomic_smin;
6566 op64 = aco_opcode::buffer_atomic_smin_x2;
6567 break;
6568 case nir_intrinsic_global_atomic_umin:
6569 op32 = aco_opcode::buffer_atomic_umin;
6570 op64 = aco_opcode::buffer_atomic_umin_x2;
6571 break;
6572 case nir_intrinsic_global_atomic_imax:
6573 op32 = aco_opcode::buffer_atomic_smax;
6574 op64 = aco_opcode::buffer_atomic_smax_x2;
6575 break;
6576 case nir_intrinsic_global_atomic_umax:
6577 op32 = aco_opcode::buffer_atomic_umax;
6578 op64 = aco_opcode::buffer_atomic_umax_x2;
6579 break;
6580 case nir_intrinsic_global_atomic_and:
6581 op32 = aco_opcode::buffer_atomic_and;
6582 op64 = aco_opcode::buffer_atomic_and_x2;
6583 break;
6584 case nir_intrinsic_global_atomic_or:
6585 op32 = aco_opcode::buffer_atomic_or;
6586 op64 = aco_opcode::buffer_atomic_or_x2;
6587 break;
6588 case nir_intrinsic_global_atomic_xor:
6589 op32 = aco_opcode::buffer_atomic_xor;
6590 op64 = aco_opcode::buffer_atomic_xor_x2;
6591 break;
6592 case nir_intrinsic_global_atomic_exchange:
6593 op32 = aco_opcode::buffer_atomic_swap;
6594 op64 = aco_opcode::buffer_atomic_swap_x2;
6595 break;
6596 case nir_intrinsic_global_atomic_comp_swap:
6597 op32 = aco_opcode::buffer_atomic_cmpswap;
6598 op64 = aco_opcode::buffer_atomic_cmpswap_x2;
6599 break;
6600 default:
6601 unreachable("visit_atomic_global should only be called with nir_intrinsic_global_atomic_* instructions.");
6602 }
6603
6604 Temp rsrc = get_gfx6_global_rsrc(bld, addr);
6605
6606 aco_opcode op = instr->dest.ssa.bit_size == 32 ? op32 : op64;
6607
6608 aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, return_previous ? 1 : 0)};
6609 mubuf->operands[0] = Operand(rsrc);
6610 mubuf->operands[1] = addr.type() == RegType::vgpr ? Operand(addr) : Operand(v1);
6611 mubuf->operands[2] = Operand(0u);
6612 mubuf->operands[3] = Operand(data);
6613 if (return_previous)
6614 mubuf->definitions[0] = Definition(dst);
6615 mubuf->glc = return_previous;
6616 mubuf->dlc = false;
6617 mubuf->offset = 0;
6618 mubuf->addr64 = addr.type() == RegType::vgpr;
6619 mubuf->disable_wqm = true;
6620 mubuf->barrier = barrier_buffer;
6621 ctx->program->needs_exact = true;
6622 ctx->block->instructions.emplace_back(std::move(mubuf));
6623 }
6624 }
6625
6626 void emit_memory_barrier(isel_context *ctx, nir_intrinsic_instr *instr) {
6627 Builder bld(ctx->program, ctx->block);
6628 switch(instr->intrinsic) {
6629 case nir_intrinsic_group_memory_barrier:
6630 case nir_intrinsic_memory_barrier:
6631 bld.barrier(aco_opcode::p_memory_barrier_common);
6632 break;
6633 case nir_intrinsic_memory_barrier_buffer:
6634 bld.barrier(aco_opcode::p_memory_barrier_buffer);
6635 break;
6636 case nir_intrinsic_memory_barrier_image:
6637 bld.barrier(aco_opcode::p_memory_barrier_image);
6638 break;
6639 case nir_intrinsic_memory_barrier_tcs_patch:
6640 case nir_intrinsic_memory_barrier_shared:
6641 bld.barrier(aco_opcode::p_memory_barrier_shared);
6642 break;
6643 default:
6644 unreachable("Unimplemented memory barrier intrinsic");
6645 break;
6646 }
6647 }
6648
6649 void visit_load_shared(isel_context *ctx, nir_intrinsic_instr *instr)
6650 {
6651 // TODO: implement sparse reads using ds_read2_b32 and nir_ssa_def_components_read()
6652 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6653 Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
6654 Builder bld(ctx->program, ctx->block);
6655
6656 unsigned elem_size_bytes = instr->dest.ssa.bit_size / 8;
6657 unsigned align = nir_intrinsic_align_mul(instr) ? nir_intrinsic_align(instr) : elem_size_bytes;
6658 load_lds(ctx, elem_size_bytes, dst, address, nir_intrinsic_base(instr), align);
6659 }
6660
6661 void visit_store_shared(isel_context *ctx, nir_intrinsic_instr *instr)
6662 {
6663 unsigned writemask = nir_intrinsic_write_mask(instr);
6664 Temp data = get_ssa_temp(ctx, instr->src[0].ssa);
6665 Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
6666 unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
6667
6668 unsigned align = nir_intrinsic_align_mul(instr) ? nir_intrinsic_align(instr) : elem_size_bytes;
6669 store_lds(ctx, elem_size_bytes, data, writemask, address, nir_intrinsic_base(instr), align);
6670 }
6671
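/* LDS atomics: picks the DS opcode for the intrinsic, using the _rtn variant
 * only if the result is used and the 64-bit variant for 64-bit data. */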
6672 void visit_shared_atomic(isel_context *ctx, nir_intrinsic_instr *instr)
6673 {
6674 unsigned offset = nir_intrinsic_base(instr);
6675 Builder bld(ctx->program, ctx->block);
6676 Operand m = load_lds_size_m0(bld);
6677 Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
6678 Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
6679
6680 unsigned num_operands = 3;
6681 aco_opcode op32, op64, op32_rtn, op64_rtn;
6682 switch(instr->intrinsic) {
6683 case nir_intrinsic_shared_atomic_add:
6684 op32 = aco_opcode::ds_add_u32;
6685 op64 = aco_opcode::ds_add_u64;
6686 op32_rtn = aco_opcode::ds_add_rtn_u32;
6687 op64_rtn = aco_opcode::ds_add_rtn_u64;
6688 break;
6689 case nir_intrinsic_shared_atomic_imin:
6690 op32 = aco_opcode::ds_min_i32;
6691 op64 = aco_opcode::ds_min_i64;
6692 op32_rtn = aco_opcode::ds_min_rtn_i32;
6693 op64_rtn = aco_opcode::ds_min_rtn_i64;
6694 break;
6695 case nir_intrinsic_shared_atomic_umin:
6696 op32 = aco_opcode::ds_min_u32;
6697 op64 = aco_opcode::ds_min_u64;
6698 op32_rtn = aco_opcode::ds_min_rtn_u32;
6699 op64_rtn = aco_opcode::ds_min_rtn_u64;
6700 break;
6701 case nir_intrinsic_shared_atomic_imax:
6702 op32 = aco_opcode::ds_max_i32;
6703 op64 = aco_opcode::ds_max_i64;
6704 op32_rtn = aco_opcode::ds_max_rtn_i32;
6705 op64_rtn = aco_opcode::ds_max_rtn_i64;
6706 break;
6707 case nir_intrinsic_shared_atomic_umax:
6708 op32 = aco_opcode::ds_max_u32;
6709 op64 = aco_opcode::ds_max_u64;
6710 op32_rtn = aco_opcode::ds_max_rtn_u32;
6711 op64_rtn = aco_opcode::ds_max_rtn_u64;
6712 break;
6713 case nir_intrinsic_shared_atomic_and:
6714 op32 = aco_opcode::ds_and_b32;
6715 op64 = aco_opcode::ds_and_b64;
6716 op32_rtn = aco_opcode::ds_and_rtn_b32;
6717 op64_rtn = aco_opcode::ds_and_rtn_b64;
6718 break;
6719 case nir_intrinsic_shared_atomic_or:
6720 op32 = aco_opcode::ds_or_b32;
6721 op64 = aco_opcode::ds_or_b64;
6722 op32_rtn = aco_opcode::ds_or_rtn_b32;
6723 op64_rtn = aco_opcode::ds_or_rtn_b64;
6724 break;
6725 case nir_intrinsic_shared_atomic_xor:
6726 op32 = aco_opcode::ds_xor_b32;
6727 op64 = aco_opcode::ds_xor_b64;
6728 op32_rtn = aco_opcode::ds_xor_rtn_b32;
6729 op64_rtn = aco_opcode::ds_xor_rtn_b64;
6730 break;
6731 case nir_intrinsic_shared_atomic_exchange:
6732 op32 = aco_opcode::ds_write_b32;
6733 op64 = aco_opcode::ds_write_b64;
6734 op32_rtn = aco_opcode::ds_wrxchg_rtn_b32;
6735 op64_rtn = aco_opcode::ds_wrxchg_rtn_b64;
6736 break;
6737 case nir_intrinsic_shared_atomic_comp_swap:
6738 op32 = aco_opcode::ds_cmpst_b32;
6739 op64 = aco_opcode::ds_cmpst_b64;
6740 op32_rtn = aco_opcode::ds_cmpst_rtn_b32;
6741 op64_rtn = aco_opcode::ds_cmpst_rtn_b64;
6742 num_operands = 4;
6743 break;
6744 default:
6745 unreachable("Unhandled shared atomic intrinsic");
6746 }
6747
6748 /* return the previous value if dest is ever used */
6749 bool return_previous = false;
6750 nir_foreach_use_safe(use_src, &instr->dest.ssa) {
6751 return_previous = true;
6752 break;
6753 }
6754 nir_foreach_if_use_safe(use_src, &instr->dest.ssa) {
6755 return_previous = true;
6756 break;
6757 }
6758
6759 aco_opcode op;
6760 if (data.size() == 1) {
6761 assert(instr->dest.ssa.bit_size == 32);
6762 op = return_previous ? op32_rtn : op32;
6763 } else {
6764 assert(instr->dest.ssa.bit_size == 64);
6765 op = return_previous ? op64_rtn : op64;
6766 }
6767
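/* The DS instruction offset is a 16-bit field, so larger base offsets are
 * folded into the address instead. */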
6768 if (offset > 65535) {
6769 address = bld.vadd32(bld.def(v1), Operand(offset), address);
6770 offset = 0;
6771 }
6772
6773 aco_ptr<DS_instruction> ds;
6774 ds.reset(create_instruction<DS_instruction>(op, Format::DS, num_operands, return_previous ? 1 : 0));
6775 ds->operands[0] = Operand(address);
6776 ds->operands[1] = Operand(data);
6777 if (num_operands == 4)
6778 ds->operands[2] = Operand(get_ssa_temp(ctx, instr->src[2].ssa));
6779 ds->operands[num_operands - 1] = m;
6780 ds->offset0 = offset;
6781 if (return_previous)
6782 ds->definitions[0] = Definition(get_ssa_temp(ctx, &instr->dest.ssa));
6783 ctx->block->instructions.emplace_back(std::move(ds));
6784 }
6785
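/* Builds the buffer resource descriptor used for scratch accesses: the base
 * address comes from the private segment buffer (loaded via SMEM for
 * non-compute stages), with ADD_TID_ENABLE set and INDEX_STRIDE chosen from
 * the wave size (3 for wave64, 2 for wave32). */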
6786 Temp get_scratch_resource(isel_context *ctx)
6787 {
6788 Builder bld(ctx->program, ctx->block);
6789 Temp scratch_addr = ctx->program->private_segment_buffer;
6790 if (ctx->stage != compute_cs)
6791 scratch_addr = bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), scratch_addr, Operand(0u));
6792
6793 uint32_t rsrc_conf = S_008F0C_ADD_TID_ENABLE(1) |
6794 S_008F0C_INDEX_STRIDE(ctx->program->wave_size == 64 ? 3 : 2);
6795
6796 if (ctx->program->chip_class >= GFX10) {
6797 rsrc_conf |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
6798 S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) |
6799 S_008F0C_RESOURCE_LEVEL(1);
6800 } else if (ctx->program->chip_class <= GFX7) { /* dfmt modifies stride on GFX8/GFX9 when ADD_TID_EN=1 */
6801 rsrc_conf |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
6802 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
6803 }
6804
6805 /* older generations need element size = 16 bytes; the element size field was removed in GFX9 */
6806 if (ctx->program->chip_class <= GFX8)
6807 rsrc_conf |= S_008F0C_ELEMENT_SIZE(3);
6808
6809 return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), scratch_addr, Operand(-1u), Operand(rsrc_conf));
6810 }
6811
6812 void visit_load_scratch(isel_context *ctx, nir_intrinsic_instr *instr) {
6813 Builder bld(ctx->program, ctx->block);
6814 Temp rsrc = get_scratch_resource(ctx);
6815 Temp offset = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
6816 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6817
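   /* Scratch loads are emitted as swizzled MUBUF accesses: the per-wave scratch
    * base is passed in soffset, and because scratch is swizzled in 16-byte
    * elements a single load must not straddle such an element (hence the
    * swizzle_component_size below). */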
6818 LoadEmitInfo info = {Operand(offset), dst, instr->dest.ssa.num_components,
6819 instr->dest.ssa.bit_size / 8u, rsrc};
6820 info.align_mul = nir_intrinsic_align_mul(instr);
6821 info.align_offset = nir_intrinsic_align_offset(instr);
6822 info.swizzle_component_size = 16;
6823 info.can_reorder = false;
6824 info.soffset = ctx->program->scratch_offset;
6825 emit_mubuf_load(ctx, bld, &info);
6826 }
6827
6828 void visit_store_scratch(isel_context *ctx, nir_intrinsic_instr *instr) {
6829 Builder bld(ctx->program, ctx->block);
6830 Temp rsrc = get_scratch_resource(ctx);
6831 Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
6832 Temp offset = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
6833
6834 unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
6835 unsigned writemask = widen_mask(nir_intrinsic_write_mask(instr), elem_size_bytes);
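   /* widen_mask() turns the per-component NIR writemask into a per-byte mask,
    * e.g. a writemask of 0b11 with 4-byte components becomes 0xff. */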
6836
6837 unsigned write_count = 0;
6838 Temp write_datas[32];
6839 unsigned offsets[32];
6840 split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask,
6841 16, &write_count, write_datas, offsets);
6842
6843 for (unsigned i = 0; i < write_count; i++) {
6844 aco_opcode op = get_buffer_store_op(false, write_datas[i].bytes());
6845 bld.mubuf(op, rsrc, offset, ctx->program->scratch_offset, write_datas[i], offsets[i], true);
6846 }
6847 }
6848
6849 void visit_load_sample_mask_in(isel_context *ctx, nir_intrinsic_instr *instr) {
6850 uint8_t log2_ps_iter_samples;
6851 if (ctx->program->info->ps.force_persample) {
6852 log2_ps_iter_samples =
6853 util_logbase2(ctx->options->key.fs.num_samples);
6854 } else {
6855 log2_ps_iter_samples = ctx->options->key.fs.log2_ps_iter_samples;
6856 }
6857
6858 /* The bit pattern matches that used by fixed function fragment
6859 * processing. */
6860 static const unsigned ps_iter_masks[] = {
6861 0xffff, /* not used */
6862 0x5555,
6863 0x1111,
6864 0x0101,
6865 0x0001,
6866 };
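   /* Example: with log2_ps_iter_samples == 2 the base pattern 0x1111 keeps
    * every fourth coverage bit; shifting it left by the sample id (below)
    * selects the bits owned by the sample being shaded in this iteration. */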
6867 assert(log2_ps_iter_samples < ARRAY_SIZE(ps_iter_masks));
6868
6869 Builder bld(ctx->program, ctx->block);
6870
6871 Temp sample_id = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1),
6872 get_arg(ctx, ctx->args->ac.ancillary), Operand(8u), Operand(4u));
6873 Temp ps_iter_mask = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(ps_iter_masks[log2_ps_iter_samples]));
6874 Temp mask = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), sample_id, ps_iter_mask);
6875 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6876 bld.vop2(aco_opcode::v_and_b32, Definition(dst), mask, get_arg(ctx, ctx->args->ac.sample_coverage));
6877 }
6878
6879 void visit_emit_vertex_with_counter(isel_context *ctx, nir_intrinsic_instr *instr) {
6880 Builder bld(ctx->program, ctx->block);
6881
6882 unsigned stream = nir_intrinsic_stream_id(instr);
6883 Temp next_vertex = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
6884 next_vertex = bld.v_mul_imm(bld.def(v1), next_vertex, 4u);
6885 nir_const_value *next_vertex_cv = nir_src_as_const_value(instr->src[0]);
6886
6887 /* get GSVS ring */
6888 Temp gsvs_ring = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ctx->program->private_segment_buffer, Operand(RING_GSVS_GS * 16u));
6889
6890 unsigned num_components =
6891 ctx->program->info->gs.num_stream_output_components[stream];
6892 assert(num_components);
6893
6894 unsigned stride = 4u * num_components * ctx->shader->info.gs.vertices_out;
6895 unsigned stream_offset = 0;
6896 for (unsigned i = 0; i < stream; i++) {
6897 unsigned prev_stride = 4u * ctx->program->info->gs.num_stream_output_components[i] * ctx->shader->info.gs.vertices_out;
6898 stream_offset += prev_stride * ctx->program->wave_size;
6899 }
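   /* stride is the number of GSVS ring bytes one lane writes for this stream
    * (4 bytes per component for each emitted vertex); stream_offset skips the
    * regions of the lower-numbered streams, scaled by the wave size since the
    * ring is laid out per wave. */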
6900
6901 /* Limit on the stride field for <= GFX7. */
6902 assert(stride < (1 << 14));
6903
6904 Temp gsvs_dwords[4];
6905 for (unsigned i = 0; i < 4; i++)
6906 gsvs_dwords[i] = bld.tmp(s1);
6907 bld.pseudo(aco_opcode::p_split_vector,
6908 Definition(gsvs_dwords[0]),
6909 Definition(gsvs_dwords[1]),
6910 Definition(gsvs_dwords[2]),
6911 Definition(gsvs_dwords[3]),
6912 gsvs_ring);
6913
6914 if (stream_offset) {
6915 Temp stream_offset_tmp = bld.copy(bld.def(s1), Operand(stream_offset));
6916
6917 Temp carry = bld.tmp(s1);
6918 gsvs_dwords[0] = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), gsvs_dwords[0], stream_offset_tmp);
6919 gsvs_dwords[1] = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), gsvs_dwords[1], Operand(0u), bld.scc(carry));
6920 }
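   /* The stream offset was added to the 64-bit ring base with an
    * s_add_u32/s_addc_u32 carry chain; the stride and record count are patched
    * into dwords 1 and 2 of the descriptor below. */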
6921
6922 gsvs_dwords[1] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), gsvs_dwords[1], Operand(S_008F04_STRIDE(stride)));
6923 gsvs_dwords[2] = bld.copy(bld.def(s1), Operand((uint32_t)ctx->program->wave_size));
6924
6925 gsvs_ring = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4),
6926 gsvs_dwords[0], gsvs_dwords[1], gsvs_dwords[2], gsvs_dwords[3]);
6927
6928 unsigned offset = 0;
6929 for (unsigned i = 0; i <= VARYING_SLOT_VAR31; i++) {
6930 if (ctx->program->info->gs.output_streams[i] != stream)
6931 continue;
6932
6933 for (unsigned j = 0; j < 4; j++) {
6934 if (!(ctx->program->info->gs.output_usage_mask[i] & (1 << j)))
6935 continue;
6936
6937 if (ctx->outputs.mask[i] & (1 << j)) {
6938 Operand vaddr_offset = next_vertex_cv ? Operand(v1) : Operand(next_vertex);
6939 unsigned const_offset = (offset + (next_vertex_cv ? next_vertex_cv->u32 : 0u)) * 4u;
6940 if (const_offset >= 4096u) {
6941 if (vaddr_offset.isUndefined())
6942 vaddr_offset = bld.copy(bld.def(v1), Operand(const_offset / 4096u * 4096u));
6943 else
6944 vaddr_offset = bld.vadd32(bld.def(v1), Operand(const_offset / 4096u * 4096u), vaddr_offset);
6945 const_offset %= 4096u;
6946 }
6947
6948 aco_ptr<MTBUF_instruction> mtbuf{create_instruction<MTBUF_instruction>(aco_opcode::tbuffer_store_format_x, Format::MTBUF, 4, 0)};
6949 mtbuf->operands[0] = Operand(gsvs_ring);
6950 mtbuf->operands[1] = vaddr_offset;
6951 mtbuf->operands[2] = Operand(get_arg(ctx, ctx->args->gs2vs_offset));
6952 mtbuf->operands[3] = Operand(ctx->outputs.temps[i * 4u + j]);
6953 mtbuf->offen = !vaddr_offset.isUndefined();
6954 mtbuf->dfmt = V_008F0C_BUF_DATA_FORMAT_32;
6955 mtbuf->nfmt = V_008F0C_BUF_NUM_FORMAT_UINT;
6956 mtbuf->offset = const_offset;
6957 mtbuf->glc = true;
6958 mtbuf->slc = true;
6959 mtbuf->barrier = barrier_gs_data;
6960 mtbuf->can_reorder = true;
6961 bld.insert(std::move(mtbuf));
6962 }
6963
6964 offset += ctx->shader->info.gs.vertices_out;
6965 }
6966
6967 /* outputs for the next vertex are undefined and keeping them around can
6968 * create invalid IR with control flow */
6969 ctx->outputs.mask[i] = 0;
6970 }
6971
6972 bld.sopp(aco_opcode::s_sendmsg, bld.m0(ctx->gs_wave_id), -1, sendmsg_gs(false, true, stream));
6973 }
6974
6975 Temp emit_boolean_reduce(isel_context *ctx, nir_op op, unsigned cluster_size, Temp src)
6976 {
6977 Builder bld(ctx->program, ctx->block);
6978
6979 if (cluster_size == 1) {
6980 return src;
6981    } else if (op == nir_op_iand && cluster_size == 4) {
6982 //subgroupClusteredAnd(val, 4) -> ~wqm(exec & ~val)
6983 Temp tmp = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src);
6984 return bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc),
6985 bld.sop1(Builder::s_wqm, bld.def(bld.lm), bld.def(s1, scc), tmp));
6986 } else if (op == nir_op_ior && cluster_size == 4) {
6987 //subgroupClusteredOr(val, 4) -> wqm(val & exec)
6988 return bld.sop1(Builder::s_wqm, bld.def(bld.lm), bld.def(s1, scc),
6989 bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm)));
6990 } else if (op == nir_op_iand && cluster_size == ctx->program->wave_size) {
6991 //subgroupAnd(val) -> (exec & ~val) == 0
6992 Temp tmp = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src).def(1).getTemp();
6993 Temp cond = bool_to_vector_condition(ctx, emit_wqm(ctx, tmp));
6994 return bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc), cond);
6995 } else if (op == nir_op_ior && cluster_size == ctx->program->wave_size) {
6996 //subgroupOr(val) -> (val & exec) != 0
6997 Temp tmp = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm)).def(1).getTemp();
6998 return bool_to_vector_condition(ctx, tmp);
6999 } else if (op == nir_op_ixor && cluster_size == ctx->program->wave_size) {
7000 //subgroupXor(val) -> s_bcnt1_i32_b64(val & exec) & 1
7001 Temp tmp = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
7002 tmp = bld.sop1(Builder::s_bcnt1_i32, bld.def(s1), bld.def(s1, scc), tmp);
7003 tmp = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), tmp, Operand(1u)).def(1).getTemp();
7004 return bool_to_vector_condition(ctx, tmp);
7005 } else {
7006 //subgroupClustered{And,Or,Xor}(val, n) ->
7007 //lane_id = v_mbcnt_hi_u32_b32(-1, v_mbcnt_lo_u32_b32(-1, 0)) ; just v_mbcnt_lo_u32_b32 on wave32
7008 //cluster_offset = ~(n - 1) & lane_id
7009 //cluster_mask = ((1 << n) - 1)
7010 //subgroupClusteredAnd():
7011 // return ((val | ~exec) >> cluster_offset) & cluster_mask == cluster_mask
7012 //subgroupClusteredOr():
7013 // return ((val & exec) >> cluster_offset) & cluster_mask != 0
7014 //subgroupClusteredXor():
7015       //   return v_bcnt_u32_b32(((val & exec) >> cluster_offset) & cluster_mask, 0) & 1 != 0
7016 Temp lane_id = emit_mbcnt(ctx, bld.def(v1));
7017 Temp cluster_offset = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(~uint32_t(cluster_size - 1)), lane_id);
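      /* Worked example (wave64, cluster_size == 8, lane_id == 21):
       * cluster_offset = ~7 & 21 = 16, the first lane of the cluster, and
       * cluster_mask = 0xff below selects that cluster's eight ballot bits. */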
7018
7019 Temp tmp;
7020 if (op == nir_op_iand)
7021 tmp = bld.sop2(Builder::s_orn2, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
7022 else
7023 tmp = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
7024
7025 uint32_t cluster_mask = cluster_size == 32 ? -1 : (1u << cluster_size) - 1u;
7026
7027 if (ctx->program->chip_class <= GFX7)
7028 tmp = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), tmp, cluster_offset);
7029 else if (ctx->program->wave_size == 64)
7030 tmp = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), cluster_offset, tmp);
7031 else
7032 tmp = bld.vop2_e64(aco_opcode::v_lshrrev_b32, bld.def(v1), cluster_offset, tmp);
7033 tmp = emit_extract_vector(ctx, tmp, 0, v1);
7034 if (cluster_mask != 0xffffffff)
7035 tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(cluster_mask), tmp);
7036
7037 Definition cmp_def = Definition();
7038 if (op == nir_op_iand) {
7039 cmp_def = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm), Operand(cluster_mask), tmp).def(0);
7040 } else if (op == nir_op_ior) {
7041 cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u), tmp).def(0);
7042 } else if (op == nir_op_ixor) {
7043 tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(1u),
7044 bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), tmp, Operand(0u)));
7045 cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u), tmp).def(0);
7046 }
7047 cmp_def.setHint(vcc);
7048 return cmp_def.getTemp();
7049 }
7050 }
7051
7052 Temp emit_boolean_exclusive_scan(isel_context *ctx, nir_op op, Temp src)
7053 {
7054 Builder bld(ctx->program, ctx->block);
7055
7056 //subgroupExclusiveAnd(val) -> mbcnt(exec & ~val) == 0
7057 //subgroupExclusiveOr(val) -> mbcnt(val & exec) != 0
7058 //subgroupExclusiveXor(val) -> mbcnt(val & exec) & 1 != 0
7059 Temp tmp;
7060 if (op == nir_op_iand)
7061 tmp = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src);
7062 else
7063       tmp = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
7064
7065 Builder::Result lohi = bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), bld.def(s1), tmp);
7066 Temp lo = lohi.def(0).getTemp();
7067 Temp hi = lohi.def(1).getTemp();
7068 Temp mbcnt = emit_mbcnt(ctx, bld.def(v1), Operand(lo), Operand(hi));
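   /* mbcnt counts the set bits in lanes strictly below the current one, so
    * comparing it against zero (or testing its low bit) implements the
    * exclusive boolean scans described above. */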
7069
7070 Definition cmp_def = Definition();
7071 if (op == nir_op_iand)
7072 cmp_def = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm), Operand(0u), mbcnt).def(0);
7073 else if (op == nir_op_ior)
7074 cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u), mbcnt).def(0);
7075 else if (op == nir_op_ixor)
7076 cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u),
7077 bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(1u), mbcnt)).def(0);
7078 cmp_def.setHint(vcc);
7079 return cmp_def.getTemp();
7080 }
7081
7082 Temp emit_boolean_inclusive_scan(isel_context *ctx, nir_op op, Temp src)
7083 {
7084 Builder bld(ctx->program, ctx->block);
7085
7086 //subgroupInclusiveAnd(val) -> subgroupExclusiveAnd(val) && val
7087 //subgroupInclusiveOr(val) -> subgroupExclusiveOr(val) || val
7088 //subgroupInclusiveXor(val) -> subgroupExclusiveXor(val) ^^ val
7089 Temp tmp = emit_boolean_exclusive_scan(ctx, op, src);
7090 if (op == nir_op_iand)
7091 return bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), tmp, src);
7092 else if (op == nir_op_ior)
7093 return bld.sop2(Builder::s_or, bld.def(bld.lm), bld.def(s1, scc), tmp, src);
7094 else if (op == nir_op_ixor)
7095 return bld.sop2(Builder::s_xor, bld.def(bld.lm), bld.def(s1, scc), tmp, src);
7096
7097 assert(false);
7098 return Temp();
7099 }
7100
7101 void emit_uniform_subgroup(isel_context *ctx, nir_intrinsic_instr *instr, Temp src)
7102 {
7103 Builder bld(ctx->program, ctx->block);
7104 Definition dst(get_ssa_temp(ctx, &instr->dest.ssa));
7105 if (src.regClass().type() == RegType::vgpr) {
7106 bld.pseudo(aco_opcode::p_as_uniform, dst, src);
7107 } else if (src.regClass() == s1) {
7108 bld.sop1(aco_opcode::s_mov_b32, dst, src);
7109 } else if (src.regClass() == s2) {
7110 bld.sop1(aco_opcode::s_mov_b64, dst, src);
7111 } else {
7112 fprintf(stderr, "Unimplemented NIR instr bit size: ");
7113 nir_print_instr(&instr->instr, stderr);
7114 fprintf(stderr, "\n");
7115 }
7116 }
7117
7118 void emit_interp_center(isel_context *ctx, Temp dst, Temp pos1, Temp pos2)
7119 {
7120 Builder bld(ctx->program, ctx->block);
7121 Temp persp_center = get_arg(ctx, ctx->args->ac.persp_center);
7122 Temp p1 = emit_extract_vector(ctx, persp_center, 0, v1);
7123 Temp p2 = emit_extract_vector(ctx, persp_center, 1, v1);
7124
7125 Temp ddx_1, ddx_2, ddy_1, ddy_2;
7126 uint32_t dpp_ctrl0 = dpp_quad_perm(0, 0, 0, 0);
7127 uint32_t dpp_ctrl1 = dpp_quad_perm(1, 1, 1, 1);
7128 uint32_t dpp_ctrl2 = dpp_quad_perm(2, 2, 2, 2);
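   /* Each dpp_quad_perm(n, n, n, n) broadcasts lane n of every quad, so
    * ddx_k = p_k(lane 1) - p_k(lane 0) and ddy_k = p_k(lane 2) - p_k(lane 0)
    * within the quad. */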
7129
7130 /* Build DD X/Y */
7131 if (ctx->program->chip_class >= GFX8) {
7132 Temp tl_1 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), p1, dpp_ctrl0);
7133 ddx_1 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p1, tl_1, dpp_ctrl1);
7134 ddy_1 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p1, tl_1, dpp_ctrl2);
7135 Temp tl_2 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), p2, dpp_ctrl0);
7136 ddx_2 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p2, tl_2, dpp_ctrl1);
7137 ddy_2 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p2, tl_2, dpp_ctrl2);
7138 } else {
7139 Temp tl_1 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p1, (1 << 15) | dpp_ctrl0);
7140 ddx_1 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p1, (1 << 15) | dpp_ctrl1);
7141 ddx_1 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), ddx_1, tl_1);
7142 ddx_2 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p1, (1 << 15) | dpp_ctrl2);
7143 ddx_2 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), ddx_2, tl_1);
7144 Temp tl_2 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p2, (1 << 15) | dpp_ctrl0);
7145 ddy_1 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p2, (1 << 15) | dpp_ctrl1);
7146 ddy_1 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), ddy_1, tl_2);
7147 ddy_2 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p2, (1 << 15) | dpp_ctrl2);
7148 ddy_2 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), ddy_2, tl_2);
7149 }
7150
7151 /* res_k = p_k + ddx_k * pos1 + ddy_k * pos2 */
7152 Temp tmp1 = bld.vop3(aco_opcode::v_mad_f32, bld.def(v1), ddx_1, pos1, p1);
7153 Temp tmp2 = bld.vop3(aco_opcode::v_mad_f32, bld.def(v1), ddx_2, pos1, p2);
7154 tmp1 = bld.vop3(aco_opcode::v_mad_f32, bld.def(v1), ddy_1, pos2, tmp1);
7155 tmp2 = bld.vop3(aco_opcode::v_mad_f32, bld.def(v1), ddy_2, pos2, tmp2);
7156 Temp wqm1 = bld.tmp(v1);
7157 emit_wqm(ctx, tmp1, wqm1, true);
7158 Temp wqm2 = bld.tmp(v1);
7159 emit_wqm(ctx, tmp2, wqm2, true);
7160 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), wqm1, wqm2);
7161 return;
7162 }
7163
7164 void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
7165 {
7166 Builder bld(ctx->program, ctx->block);
7167 switch(instr->intrinsic) {
7168 case nir_intrinsic_load_barycentric_sample:
7169 case nir_intrinsic_load_barycentric_pixel:
7170 case nir_intrinsic_load_barycentric_centroid: {
7171 glsl_interp_mode mode = (glsl_interp_mode)nir_intrinsic_interp_mode(instr);
7172 Temp bary = Temp(0, s2);
7173 switch (mode) {
7174 case INTERP_MODE_SMOOTH:
7175 case INTERP_MODE_NONE:
7176 if (instr->intrinsic == nir_intrinsic_load_barycentric_pixel)
7177 bary = get_arg(ctx, ctx->args->ac.persp_center);
7178 else if (instr->intrinsic == nir_intrinsic_load_barycentric_centroid)
7179 bary = ctx->persp_centroid;
7180 else if (instr->intrinsic == nir_intrinsic_load_barycentric_sample)
7181 bary = get_arg(ctx, ctx->args->ac.persp_sample);
7182 break;
7183 case INTERP_MODE_NOPERSPECTIVE:
7184 if (instr->intrinsic == nir_intrinsic_load_barycentric_pixel)
7185 bary = get_arg(ctx, ctx->args->ac.linear_center);
7186 else if (instr->intrinsic == nir_intrinsic_load_barycentric_centroid)
7187 bary = ctx->linear_centroid;
7188 else if (instr->intrinsic == nir_intrinsic_load_barycentric_sample)
7189 bary = get_arg(ctx, ctx->args->ac.linear_sample);
7190 break;
7191 default:
7192 break;
7193 }
7194 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7195 Temp p1 = emit_extract_vector(ctx, bary, 0, v1);
7196 Temp p2 = emit_extract_vector(ctx, bary, 1, v1);
7197 bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
7198 Operand(p1), Operand(p2));
7199 emit_split_vector(ctx, dst, 2);
7200 break;
7201 }
7202 case nir_intrinsic_load_barycentric_model: {
7203 Temp model = get_arg(ctx, ctx->args->ac.pull_model);
7204
7205 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7206 Temp p1 = emit_extract_vector(ctx, model, 0, v1);
7207 Temp p2 = emit_extract_vector(ctx, model, 1, v1);
7208 Temp p3 = emit_extract_vector(ctx, model, 2, v1);
7209 bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
7210 Operand(p1), Operand(p2), Operand(p3));
7211 emit_split_vector(ctx, dst, 3);
7212 break;
7213 }
7214 case nir_intrinsic_load_barycentric_at_sample: {
7215 uint32_t sample_pos_offset = RING_PS_SAMPLE_POSITIONS * 16;
7216 switch (ctx->options->key.fs.num_samples) {
7217 case 2: sample_pos_offset += 1 << 3; break;
7218 case 4: sample_pos_offset += 3 << 3; break;
7219 case 8: sample_pos_offset += 7 << 3; break;
7220 default: break;
7221 }
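      /* The sample position table stores the 1x, 2x, 4x and 8x layouts back to
       * back, 8 bytes (two floats) per position, hence skipping 1/3/7 positions
       * above and scaling the sample index by << 3 below. */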
7222 Temp sample_pos;
7223 Temp addr = get_ssa_temp(ctx, instr->src[0].ssa);
7224 nir_const_value* const_addr = nir_src_as_const_value(instr->src[0]);
7225 Temp private_segment_buffer = ctx->program->private_segment_buffer;
7226 if (addr.type() == RegType::sgpr) {
7227 Operand offset;
7228 if (const_addr) {
7229 sample_pos_offset += const_addr->u32 << 3;
7230 offset = Operand(sample_pos_offset);
7231 } else if (ctx->options->chip_class >= GFX9) {
7232 offset = bld.sop2(aco_opcode::s_lshl3_add_u32, bld.def(s1), bld.def(s1, scc), addr, Operand(sample_pos_offset));
7233 } else {
7234 offset = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), addr, Operand(3u));
7235             offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset, Operand(sample_pos_offset));
7236 }
7237
7238 Operand off = bld.copy(bld.def(s1), Operand(offset));
7239 sample_pos = bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), private_segment_buffer, off);
7240
7241 } else if (ctx->options->chip_class >= GFX9) {
7242 addr = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(3u), addr);
7243 sample_pos = bld.global(aco_opcode::global_load_dwordx2, bld.def(v2), addr, private_segment_buffer, sample_pos_offset);
7244 } else if (ctx->options->chip_class >= GFX7) {
7245 /* addr += private_segment_buffer + sample_pos_offset */
7246 Temp tmp0 = bld.tmp(s1);
7247 Temp tmp1 = bld.tmp(s1);
7248 bld.pseudo(aco_opcode::p_split_vector, Definition(tmp0), Definition(tmp1), private_segment_buffer);
7249 Definition scc_tmp = bld.def(s1, scc);
7250 tmp0 = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), scc_tmp, tmp0, Operand(sample_pos_offset));
7251 tmp1 = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), tmp1, Operand(0u), bld.scc(scc_tmp.getTemp()));
7252 addr = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(3u), addr);
7253 Temp pck0 = bld.tmp(v1);
7254 Temp carry = bld.vadd32(Definition(pck0), tmp0, addr, true).def(1).getTemp();
7255 tmp1 = as_vgpr(ctx, tmp1);
7256 Temp pck1 = bld.vop2_e64(aco_opcode::v_addc_co_u32, bld.def(v1), bld.hint_vcc(bld.def(bld.lm)), tmp1, Operand(0u), carry);
7257 addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), pck0, pck1);
7258
7259 /* sample_pos = flat_load_dwordx2 addr */
7260 sample_pos = bld.flat(aco_opcode::flat_load_dwordx2, bld.def(v2), addr, Operand(s1));
7261 } else {
7262 assert(ctx->options->chip_class == GFX6);
7263
7264 uint32_t rsrc_conf = S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
7265 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
7266 Temp rsrc = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), private_segment_buffer, Operand(0u), Operand(rsrc_conf));
7267
7268 addr = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(3u), addr);
7269 addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), addr, Operand(0u));
7270
7271 sample_pos = bld.tmp(v2);
7272
7273 aco_ptr<MUBUF_instruction> load{create_instruction<MUBUF_instruction>(aco_opcode::buffer_load_dwordx2, Format::MUBUF, 3, 1)};
7274 load->definitions[0] = Definition(sample_pos);
7275 load->operands[0] = Operand(rsrc);
7276 load->operands[1] = Operand(addr);
7277 load->operands[2] = Operand(0u);
7278 load->offset = sample_pos_offset;
7279 load->offen = 0;
7280 load->addr64 = true;
7281 load->glc = false;
7282 load->dlc = false;
7283 load->disable_wqm = false;
7284 load->barrier = barrier_none;
7285 load->can_reorder = true;
7286 ctx->block->instructions.emplace_back(std::move(load));
7287 }
7288
7289 /* sample_pos -= 0.5 */
7290 Temp pos1 = bld.tmp(RegClass(sample_pos.type(), 1));
7291 Temp pos2 = bld.tmp(RegClass(sample_pos.type(), 1));
7292 bld.pseudo(aco_opcode::p_split_vector, Definition(pos1), Definition(pos2), sample_pos);
7293 pos1 = bld.vop2_e64(aco_opcode::v_sub_f32, bld.def(v1), pos1, Operand(0x3f000000u));
7294 pos2 = bld.vop2_e64(aco_opcode::v_sub_f32, bld.def(v1), pos2, Operand(0x3f000000u));
7295
7296 emit_interp_center(ctx, get_ssa_temp(ctx, &instr->dest.ssa), pos1, pos2);
7297 break;
7298 }
7299 case nir_intrinsic_load_barycentric_at_offset: {
7300 Temp offset = get_ssa_temp(ctx, instr->src[0].ssa);
7301 RegClass rc = RegClass(offset.type(), 1);
7302 Temp pos1 = bld.tmp(rc), pos2 = bld.tmp(rc);
7303 bld.pseudo(aco_opcode::p_split_vector, Definition(pos1), Definition(pos2), offset);
7304 emit_interp_center(ctx, get_ssa_temp(ctx, &instr->dest.ssa), pos1, pos2);
7305 break;
7306 }
7307 case nir_intrinsic_load_front_face: {
7308 bld.vopc(aco_opcode::v_cmp_lg_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
7309 Operand(0u), get_arg(ctx, ctx->args->ac.front_face)).def(0).setHint(vcc);
7310 break;
7311 }
7312 case nir_intrinsic_load_view_index: {
7313 if (ctx->stage & (sw_vs | sw_gs | sw_tcs | sw_tes)) {
7314 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7315 bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->ac.view_index)));
7316 break;
7317 }
7318
7319 /* fallthrough */
7320 }
7321 case nir_intrinsic_load_layer_id: {
7322 unsigned idx = nir_intrinsic_base(instr);
7323 bld.vintrp(aco_opcode::v_interp_mov_f32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
7324 Operand(2u), bld.m0(get_arg(ctx, ctx->args->ac.prim_mask)), idx, 0);
7325 break;
7326 }
7327 case nir_intrinsic_load_frag_coord: {
7328 emit_load_frag_coord(ctx, get_ssa_temp(ctx, &instr->dest.ssa), 4);
7329 break;
7330 }
7331 case nir_intrinsic_load_sample_pos: {
7332 Temp posx = get_arg(ctx, ctx->args->ac.frag_pos[0]);
7333 Temp posy = get_arg(ctx, ctx->args->ac.frag_pos[1]);
7334 bld.pseudo(aco_opcode::p_create_vector, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
7335 posx.id() ? bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), posx) : Operand(0u),
7336 posy.id() ? bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), posy) : Operand(0u));
7337 break;
7338 }
7339 case nir_intrinsic_load_tess_coord:
7340 visit_load_tess_coord(ctx, instr);
7341 break;
7342 case nir_intrinsic_load_interpolated_input:
7343 visit_load_interpolated_input(ctx, instr);
7344 break;
7345 case nir_intrinsic_store_output:
7346 visit_store_output(ctx, instr);
7347 break;
7348 case nir_intrinsic_load_input:
7349 case nir_intrinsic_load_input_vertex:
7350 visit_load_input(ctx, instr);
7351 break;
7352 case nir_intrinsic_load_output:
7353 visit_load_output(ctx, instr);
7354 break;
7355 case nir_intrinsic_load_per_vertex_input:
7356 visit_load_per_vertex_input(ctx, instr);
7357 break;
7358 case nir_intrinsic_load_per_vertex_output:
7359 visit_load_per_vertex_output(ctx, instr);
7360 break;
7361 case nir_intrinsic_store_per_vertex_output:
7362 visit_store_per_vertex_output(ctx, instr);
7363 break;
7364 case nir_intrinsic_load_ubo:
7365 visit_load_ubo(ctx, instr);
7366 break;
7367 case nir_intrinsic_load_push_constant:
7368 visit_load_push_constant(ctx, instr);
7369 break;
7370 case nir_intrinsic_load_constant:
7371 visit_load_constant(ctx, instr);
7372 break;
7373 case nir_intrinsic_vulkan_resource_index:
7374 visit_load_resource(ctx, instr);
7375 break;
7376 case nir_intrinsic_discard:
7377 visit_discard(ctx, instr);
7378 break;
7379 case nir_intrinsic_discard_if:
7380 visit_discard_if(ctx, instr);
7381 break;
7382 case nir_intrinsic_load_shared:
7383 visit_load_shared(ctx, instr);
7384 break;
7385 case nir_intrinsic_store_shared:
7386 visit_store_shared(ctx, instr);
7387 break;
7388 case nir_intrinsic_shared_atomic_add:
7389 case nir_intrinsic_shared_atomic_imin:
7390 case nir_intrinsic_shared_atomic_umin:
7391 case nir_intrinsic_shared_atomic_imax:
7392 case nir_intrinsic_shared_atomic_umax:
7393 case nir_intrinsic_shared_atomic_and:
7394 case nir_intrinsic_shared_atomic_or:
7395 case nir_intrinsic_shared_atomic_xor:
7396 case nir_intrinsic_shared_atomic_exchange:
7397 case nir_intrinsic_shared_atomic_comp_swap:
7398 visit_shared_atomic(ctx, instr);
7399 break;
7400 case nir_intrinsic_image_deref_load:
7401 visit_image_load(ctx, instr);
7402 break;
7403 case nir_intrinsic_image_deref_store:
7404 visit_image_store(ctx, instr);
7405 break;
7406 case nir_intrinsic_image_deref_atomic_add:
7407 case nir_intrinsic_image_deref_atomic_umin:
7408 case nir_intrinsic_image_deref_atomic_imin:
7409 case nir_intrinsic_image_deref_atomic_umax:
7410 case nir_intrinsic_image_deref_atomic_imax:
7411 case nir_intrinsic_image_deref_atomic_and:
7412 case nir_intrinsic_image_deref_atomic_or:
7413 case nir_intrinsic_image_deref_atomic_xor:
7414 case nir_intrinsic_image_deref_atomic_exchange:
7415 case nir_intrinsic_image_deref_atomic_comp_swap:
7416 visit_image_atomic(ctx, instr);
7417 break;
7418 case nir_intrinsic_image_deref_size:
7419 visit_image_size(ctx, instr);
7420 break;
7421 case nir_intrinsic_load_ssbo:
7422 visit_load_ssbo(ctx, instr);
7423 break;
7424 case nir_intrinsic_store_ssbo:
7425 visit_store_ssbo(ctx, instr);
7426 break;
7427 case nir_intrinsic_load_global:
7428 visit_load_global(ctx, instr);
7429 break;
7430 case nir_intrinsic_store_global:
7431 visit_store_global(ctx, instr);
7432 break;
7433 case nir_intrinsic_global_atomic_add:
7434 case nir_intrinsic_global_atomic_imin:
7435 case nir_intrinsic_global_atomic_umin:
7436 case nir_intrinsic_global_atomic_imax:
7437 case nir_intrinsic_global_atomic_umax:
7438 case nir_intrinsic_global_atomic_and:
7439 case nir_intrinsic_global_atomic_or:
7440 case nir_intrinsic_global_atomic_xor:
7441 case nir_intrinsic_global_atomic_exchange:
7442 case nir_intrinsic_global_atomic_comp_swap:
7443 visit_global_atomic(ctx, instr);
7444 break;
7445 case nir_intrinsic_ssbo_atomic_add:
7446 case nir_intrinsic_ssbo_atomic_imin:
7447 case nir_intrinsic_ssbo_atomic_umin:
7448 case nir_intrinsic_ssbo_atomic_imax:
7449 case nir_intrinsic_ssbo_atomic_umax:
7450 case nir_intrinsic_ssbo_atomic_and:
7451 case nir_intrinsic_ssbo_atomic_or:
7452 case nir_intrinsic_ssbo_atomic_xor:
7453 case nir_intrinsic_ssbo_atomic_exchange:
7454 case nir_intrinsic_ssbo_atomic_comp_swap:
7455 visit_atomic_ssbo(ctx, instr);
7456 break;
7457 case nir_intrinsic_load_scratch:
7458 visit_load_scratch(ctx, instr);
7459 break;
7460 case nir_intrinsic_store_scratch:
7461 visit_store_scratch(ctx, instr);
7462 break;
7463 case nir_intrinsic_get_buffer_size:
7464 visit_get_buffer_size(ctx, instr);
7465 break;
7466 case nir_intrinsic_control_barrier: {
7467 if (ctx->program->chip_class == GFX6 && ctx->shader->info.stage == MESA_SHADER_TESS_CTRL) {
7468 /* GFX6 only (thanks to a hw bug workaround):
7469 * The real barrier instruction isn’t needed, because an entire patch
7470 * always fits into a single wave.
7471 */
7472 break;
7473 }
7474
7475 if (ctx->program->workgroup_size > ctx->program->wave_size)
7476 bld.sopp(aco_opcode::s_barrier);
7477
7478 break;
7479 }
7480 case nir_intrinsic_memory_barrier_tcs_patch:
7481 case nir_intrinsic_group_memory_barrier:
7482 case nir_intrinsic_memory_barrier:
7483 case nir_intrinsic_memory_barrier_buffer:
7484 case nir_intrinsic_memory_barrier_image:
7485 case nir_intrinsic_memory_barrier_shared:
7486 emit_memory_barrier(ctx, instr);
7487 break;
7488 case nir_intrinsic_load_num_work_groups: {
7489 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7490 bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->ac.num_work_groups)));
7491 emit_split_vector(ctx, dst, 3);
7492 break;
7493 }
7494 case nir_intrinsic_load_local_invocation_id: {
7495 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7496 bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->ac.local_invocation_ids)));
7497 emit_split_vector(ctx, dst, 3);
7498 break;
7499 }
7500 case nir_intrinsic_load_work_group_id: {
7501 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7502 struct ac_arg *args = ctx->args->ac.workgroup_ids;
7503 bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
7504 args[0].used ? Operand(get_arg(ctx, args[0])) : Operand(0u),
7505 args[1].used ? Operand(get_arg(ctx, args[1])) : Operand(0u),
7506 args[2].used ? Operand(get_arg(ctx, args[2])) : Operand(0u));
7507 emit_split_vector(ctx, dst, 3);
7508 break;
7509 }
7510 case nir_intrinsic_load_local_invocation_index: {
7511 Temp id = emit_mbcnt(ctx, bld.def(v1));
7512
7513       /* The tg_size bits [6:11] contain the subgroup id;
7514        * we multiply it by the wave size and then OR the thread id into it.
7515        */
7516 if (ctx->program->wave_size == 64) {
7517 /* After the s_and the bits are already multiplied by 64 (left shifted by 6) so we can just feed that to v_or */
7518 Temp tg_num = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), Operand(0xfc0u),
7519 get_arg(ctx, ctx->args->ac.tg_size));
7520 bld.vop2(aco_opcode::v_or_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), tg_num, id);
7521 } else {
7522 /* Extract the bit field and multiply the result by 32 (left shift by 5), then do the OR */
7523 Temp tg_num = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
7524 get_arg(ctx, ctx->args->ac.tg_size), Operand(0x6u | (0x6u << 16)));
7525 bld.vop3(aco_opcode::v_lshl_or_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), tg_num, Operand(0x5u), id);
7526 }
7527 break;
7528 }
7529 case nir_intrinsic_load_subgroup_id: {
7530 if (ctx->stage == compute_cs) {
7531 bld.sop2(aco_opcode::s_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), bld.def(s1, scc),
7532 get_arg(ctx, ctx->args->ac.tg_size), Operand(0x6u | (0x6u << 16)));
7533 } else {
7534 bld.sop1(aco_opcode::s_mov_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), Operand(0x0u));
7535 }
7536 break;
7537 }
7538 case nir_intrinsic_load_subgroup_invocation: {
7539 emit_mbcnt(ctx, Definition(get_ssa_temp(ctx, &instr->dest.ssa)));
7540 break;
7541 }
7542 case nir_intrinsic_load_num_subgroups: {
7543 if (ctx->stage == compute_cs)
7544 bld.sop2(aco_opcode::s_and_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), bld.def(s1, scc), Operand(0x3fu),
7545 get_arg(ctx, ctx->args->ac.tg_size));
7546 else
7547 bld.sop1(aco_opcode::s_mov_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), Operand(0x1u));
7548 break;
7549 }
7550 case nir_intrinsic_ballot: {
7551 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
7552 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7553 Definition tmp = bld.def(dst.regClass());
7554 Definition lanemask_tmp = dst.size() == bld.lm.size() ? tmp : bld.def(src.regClass());
7555 if (instr->src[0].ssa->bit_size == 1) {
7556 assert(src.regClass() == bld.lm);
7557 bld.sop2(Builder::s_and, lanemask_tmp, bld.def(s1, scc), Operand(exec, bld.lm), src);
7558 } else if (instr->src[0].ssa->bit_size == 32 && src.regClass() == v1) {
7559 bld.vopc(aco_opcode::v_cmp_lg_u32, lanemask_tmp, Operand(0u), src);
7560 } else if (instr->src[0].ssa->bit_size == 64 && src.regClass() == v2) {
7561 bld.vopc(aco_opcode::v_cmp_lg_u64, lanemask_tmp, Operand(0u), src);
7562 } else {
7563 fprintf(stderr, "Unimplemented NIR instr bit size: ");
7564 nir_print_instr(&instr->instr, stderr);
7565 fprintf(stderr, "\n");
7566 }
7567 if (dst.size() != bld.lm.size()) {
7568 /* Wave32 with ballot size set to 64 */
7569 bld.pseudo(aco_opcode::p_create_vector, Definition(tmp), lanemask_tmp.getTemp(), Operand(0u));
7570 }
7571 emit_wqm(ctx, tmp.getTemp(), dst);
7572 break;
7573 }
7574 case nir_intrinsic_shuffle:
7575 case nir_intrinsic_read_invocation: {
7576 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
7577 if (!nir_src_is_divergent(instr->src[0])) {
7578 emit_uniform_subgroup(ctx, instr, src);
7579 } else {
7580 Temp tid = get_ssa_temp(ctx, instr->src[1].ssa);
7581 if (instr->intrinsic == nir_intrinsic_read_invocation || !nir_src_is_divergent(instr->src[1]))
7582 tid = bld.as_uniform(tid);
7583 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7584 if (src.regClass() == v1b || src.regClass() == v2b) {
7585 Temp tmp = bld.tmp(v1);
7586 tmp = emit_wqm(ctx, emit_bpermute(ctx, bld, tid, src), tmp);
7587 if (dst.type() == RegType::vgpr)
7588 bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(src.regClass() == v1b ? v3b : v2b), tmp);
7589 else
7590 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
7591 } else if (src.regClass() == v1) {
7592 emit_wqm(ctx, emit_bpermute(ctx, bld, tid, src), dst);
7593 } else if (src.regClass() == v2) {
7594 Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
7595 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
7596 lo = emit_wqm(ctx, emit_bpermute(ctx, bld, tid, lo));
7597 hi = emit_wqm(ctx, emit_bpermute(ctx, bld, tid, hi));
7598 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
7599 emit_split_vector(ctx, dst, 2);
7600 } else if (instr->dest.ssa.bit_size == 1 && tid.regClass() == s1) {
7601 assert(src.regClass() == bld.lm);
7602 Temp tmp = bld.sopc(Builder::s_bitcmp1, bld.def(s1, scc), src, tid);
7603 bool_to_vector_condition(ctx, emit_wqm(ctx, tmp), dst);
7604 } else if (instr->dest.ssa.bit_size == 1 && tid.regClass() == v1) {
7605 assert(src.regClass() == bld.lm);
7606 Temp tmp;
7607 if (ctx->program->chip_class <= GFX7)
7608 tmp = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), src, tid);
7609 else if (ctx->program->wave_size == 64)
7610 tmp = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), tid, src);
7611 else
7612 tmp = bld.vop2_e64(aco_opcode::v_lshrrev_b32, bld.def(v1), tid, src);
7613 tmp = emit_extract_vector(ctx, tmp, 0, v1);
7614 tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(1u), tmp);
7615 emit_wqm(ctx, bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u), tmp), dst);
7616 } else {
7617 fprintf(stderr, "Unimplemented NIR instr bit size: ");
7618 nir_print_instr(&instr->instr, stderr);
7619 fprintf(stderr, "\n");
7620 }
7621 }
7622 break;
7623 }
7624 case nir_intrinsic_load_sample_id: {
7625 bld.vop3(aco_opcode::v_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
7626 get_arg(ctx, ctx->args->ac.ancillary), Operand(8u), Operand(4u));
7627 break;
7628 }
7629 case nir_intrinsic_load_sample_mask_in: {
7630 visit_load_sample_mask_in(ctx, instr);
7631 break;
7632 }
7633 case nir_intrinsic_read_first_invocation: {
7634 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
7635 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7636 if (src.regClass() == v1b || src.regClass() == v2b || src.regClass() == v1) {
7637 emit_wqm(ctx,
7638 bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), src),
7639 dst);
7640 } else if (src.regClass() == v2) {
7641 Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
7642 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
7643 lo = emit_wqm(ctx, bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), lo));
7644 hi = emit_wqm(ctx, bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), hi));
7645 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
7646 emit_split_vector(ctx, dst, 2);
7647 } else if (instr->dest.ssa.bit_size == 1) {
7648 assert(src.regClass() == bld.lm);
7649 Temp tmp = bld.sopc(Builder::s_bitcmp1, bld.def(s1, scc), src,
7650 bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm)));
7651 bool_to_vector_condition(ctx, emit_wqm(ctx, tmp), dst);
7652 } else if (src.regClass() == s1) {
7653 bld.sop1(aco_opcode::s_mov_b32, Definition(dst), src);
7654 } else if (src.regClass() == s2) {
7655 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src);
7656 } else {
7657 fprintf(stderr, "Unimplemented NIR instr bit size: ");
7658 nir_print_instr(&instr->instr, stderr);
7659 fprintf(stderr, "\n");
7660 }
7661 break;
7662 }
7663 case nir_intrinsic_vote_all: {
7664 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
7665 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7666 assert(src.regClass() == bld.lm);
7667 assert(dst.regClass() == bld.lm);
7668
7669 Temp tmp = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src).def(1).getTemp();
7670 Temp cond = bool_to_vector_condition(ctx, emit_wqm(ctx, tmp));
7671 bld.sop1(Builder::s_not, Definition(dst), bld.def(s1, scc), cond);
7672 break;
7673 }
7674 case nir_intrinsic_vote_any: {
7675 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
7676 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7677 assert(src.regClass() == bld.lm);
7678 assert(dst.regClass() == bld.lm);
7679
7680 Temp tmp = bool_to_scalar_condition(ctx, src);
7681 bool_to_vector_condition(ctx, emit_wqm(ctx, tmp), dst);
7682 break;
7683 }
7684 case nir_intrinsic_reduce:
7685 case nir_intrinsic_inclusive_scan:
7686 case nir_intrinsic_exclusive_scan: {
7687 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
7688 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7689 nir_op op = (nir_op) nir_intrinsic_reduction_op(instr);
7690 unsigned cluster_size = instr->intrinsic == nir_intrinsic_reduce ?
7691 nir_intrinsic_cluster_size(instr) : 0;
7692 cluster_size = util_next_power_of_two(MIN2(cluster_size ? cluster_size : ctx->program->wave_size, ctx->program->wave_size));
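      /* A cluster size of 0 means the whole subgroup; the value is clamped to
       * the wave size and rounded up to a power of two (e.g. a requested 64 on
       * wave32 becomes 32). */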
7693
7694 if (!nir_src_is_divergent(instr->src[0]) && (op == nir_op_ior || op == nir_op_iand)) {
7695 emit_uniform_subgroup(ctx, instr, src);
7696 } else if (instr->dest.ssa.bit_size == 1) {
7697 if (op == nir_op_imul || op == nir_op_umin || op == nir_op_imin)
7698 op = nir_op_iand;
7699 else if (op == nir_op_iadd)
7700 op = nir_op_ixor;
7701 else if (op == nir_op_umax || op == nir_op_imax)
7702 op = nir_op_ior;
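         /* For 1-bit values these reductions collapse: multiply and min are AND,
          * add is XOR (addition mod 2) and max is OR, so only the three boolean
          * helpers are needed. */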
7703 assert(op == nir_op_iand || op == nir_op_ior || op == nir_op_ixor);
7704
7705 switch (instr->intrinsic) {
7706 case nir_intrinsic_reduce:
7707 emit_wqm(ctx, emit_boolean_reduce(ctx, op, cluster_size, src), dst);
7708 break;
7709 case nir_intrinsic_exclusive_scan:
7710 emit_wqm(ctx, emit_boolean_exclusive_scan(ctx, op, src), dst);
7711 break;
7712 case nir_intrinsic_inclusive_scan:
7713 emit_wqm(ctx, emit_boolean_inclusive_scan(ctx, op, src), dst);
7714 break;
7715 default:
7716 assert(false);
7717 }
7718 } else if (cluster_size == 1) {
7719 bld.copy(Definition(dst), src);
7720 } else {
7721 unsigned bit_size = instr->src[0].ssa->bit_size;
7722
7723 src = emit_extract_vector(ctx, src, 0, RegClass::get(RegType::vgpr, bit_size / 8));
7724
7725 ReduceOp reduce_op;
7726 switch (op) {
7727 #define CASEI(name) case nir_op_##name: reduce_op = (bit_size == 32) ? name##32 : (bit_size == 16) ? name##16 : (bit_size == 8) ? name##8 : name##64; break;
7728 #define CASEF(name) case nir_op_##name: reduce_op = (bit_size == 32) ? name##32 : (bit_size == 16) ? name##16 : name##64; break;
7729 CASEI(iadd)
7730 CASEI(imul)
7731 CASEI(imin)
7732 CASEI(umin)
7733 CASEI(imax)
7734 CASEI(umax)
7735 CASEI(iand)
7736 CASEI(ior)
7737 CASEI(ixor)
7738 CASEF(fadd)
7739 CASEF(fmul)
7740 CASEF(fmin)
7741 CASEF(fmax)
7742 default:
7743 unreachable("unknown reduction op");
7744 #undef CASEI
7745 #undef CASEF
7746 }
7747
7748 aco_opcode aco_op;
7749 switch (instr->intrinsic) {
7750 case nir_intrinsic_reduce: aco_op = aco_opcode::p_reduce; break;
7751 case nir_intrinsic_inclusive_scan: aco_op = aco_opcode::p_inclusive_scan; break;
7752 case nir_intrinsic_exclusive_scan: aco_op = aco_opcode::p_exclusive_scan; break;
7753 default:
7754 unreachable("unknown reduce intrinsic");
7755 }
7756
7757 aco_ptr<Pseudo_reduction_instruction> reduce{create_instruction<Pseudo_reduction_instruction>(aco_op, Format::PSEUDO_REDUCTION, 3, 5)};
7758 reduce->operands[0] = Operand(src);
7759 // filled in by aco_reduce_assign.cpp, used internally as part of the
7760 // reduce sequence
7761 assert(dst.size() == 1 || dst.size() == 2);
7762 reduce->operands[1] = Operand(RegClass(RegType::vgpr, dst.size()).as_linear());
7763 reduce->operands[2] = Operand(v1.as_linear());
7764
7765 Temp tmp_dst = bld.tmp(dst.regClass());
7766 reduce->definitions[0] = Definition(tmp_dst);
7767 reduce->definitions[1] = bld.def(ctx->program->lane_mask); // used internally
7768 reduce->definitions[2] = Definition();
7769 reduce->definitions[3] = Definition(scc, s1);
7770 reduce->definitions[4] = Definition();
7771 reduce->reduce_op = reduce_op;
7772 reduce->cluster_size = cluster_size;
7773 ctx->block->instructions.emplace_back(std::move(reduce));
7774
7775 emit_wqm(ctx, tmp_dst, dst);
7776 }
7777 break;
7778 }
7779 case nir_intrinsic_quad_broadcast: {
7780 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
7781 if (!nir_dest_is_divergent(instr->dest)) {
7782 emit_uniform_subgroup(ctx, instr, src);
7783 } else {
7784 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7785 unsigned lane = nir_src_as_const_value(instr->src[1])->u32;
7786 uint32_t dpp_ctrl = dpp_quad_perm(lane, lane, lane, lane);
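      /* dpp_quad_perm(lane, lane, lane, lane) replicates the chosen lane within
       * each quad of four invocations; for 1-bit sources the same effect is
       * achieved directly on the ballot mask below. */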
7787
7788 if (instr->dest.ssa.bit_size == 1) {
7789 assert(src.regClass() == bld.lm);
7790 assert(dst.regClass() == bld.lm);
7791 uint32_t half_mask = 0x11111111u << lane;
7792 Temp mask_tmp = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(half_mask), Operand(half_mask));
7793 Temp tmp = bld.tmp(bld.lm);
7794 bld.sop1(Builder::s_wqm, Definition(tmp),
7795 bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), mask_tmp,
7796 bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm))));
7797 emit_wqm(ctx, tmp, dst);
7798 } else if (instr->dest.ssa.bit_size == 8) {
7799 Temp tmp = bld.tmp(v1);
7800 if (ctx->program->chip_class >= GFX8)
7801 emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl), tmp);
7802 else
7803 emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl), tmp);
7804 bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v3b), tmp);
7805 } else if (instr->dest.ssa.bit_size == 16) {
7806 Temp tmp = bld.tmp(v1);
7807 if (ctx->program->chip_class >= GFX8)
7808 emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl), tmp);
7809 else
7810 emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl), tmp);
7811 bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
7812 } else if (instr->dest.ssa.bit_size == 32) {
7813 if (ctx->program->chip_class >= GFX8)
7814 emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl), dst);
7815 else
7816 emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl), dst);
7817 } else if (instr->dest.ssa.bit_size == 64) {
7818 Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
7819 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
7820 if (ctx->program->chip_class >= GFX8) {
7821 lo = emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), lo, dpp_ctrl));
7822 hi = emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), hi, dpp_ctrl));
7823 } else {
7824 lo = emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), lo, (1 << 15) | dpp_ctrl));
7825 hi = emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), hi, (1 << 15) | dpp_ctrl));
7826 }
7827 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
7828 emit_split_vector(ctx, dst, 2);
7829 } else {
7830 fprintf(stderr, "Unimplemented NIR instr bit size: ");
7831 nir_print_instr(&instr->instr, stderr);
7832 fprintf(stderr, "\n");
7833 }
7834 }
7835 break;
7836 }
7837 case nir_intrinsic_quad_swap_horizontal:
7838 case nir_intrinsic_quad_swap_vertical:
7839 case nir_intrinsic_quad_swap_diagonal:
7840 case nir_intrinsic_quad_swizzle_amd: {
7841 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
7842 if (!nir_dest_is_divergent(instr->dest)) {
7843 emit_uniform_subgroup(ctx, instr, src);
7844 break;
7845 }
7846 uint16_t dpp_ctrl = 0;
7847 switch (instr->intrinsic) {
7848 case nir_intrinsic_quad_swap_horizontal:
7849 dpp_ctrl = dpp_quad_perm(1, 0, 3, 2);
7850 break;
7851 case nir_intrinsic_quad_swap_vertical:
7852 dpp_ctrl = dpp_quad_perm(2, 3, 0, 1);
7853 break;
7854 case nir_intrinsic_quad_swap_diagonal:
7855 dpp_ctrl = dpp_quad_perm(3, 2, 1, 0);
7856 break;
7857 case nir_intrinsic_quad_swizzle_amd:
7858 dpp_ctrl = nir_intrinsic_swizzle_mask(instr);
7859 break;
7860 default:
7861 break;
7862 }
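      /* quad_perm(1,0,3,2) swaps horizontally within each quad (0<->1, 2<->3),
       * (2,3,0,1) swaps vertically and (3,2,1,0) swaps diagonally. On GFX7 and
       * older the same control word is fed to ds_swizzle with bit 15 set to
       * select its quad-permute mode. */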
7863 if (ctx->program->chip_class < GFX8)
7864 dpp_ctrl |= (1 << 15);
7865
7866 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7867 if (instr->dest.ssa.bit_size == 1) {
7868 assert(src.regClass() == bld.lm);
7869 src = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), Operand((uint32_t)-1), src);
7870 if (ctx->program->chip_class >= GFX8)
7871 src = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl);
7872 else
7873 src = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, dpp_ctrl);
7874 Temp tmp = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u), src);
7875 emit_wqm(ctx, tmp, dst);
7876 } else if (instr->dest.ssa.bit_size == 8) {
7877 Temp tmp = bld.tmp(v1);
7878 if (ctx->program->chip_class >= GFX8)
7879 emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl), tmp);
7880 else
7881 emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, dpp_ctrl), tmp);
7882 bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v3b), tmp);
7883 } else if (instr->dest.ssa.bit_size == 16) {
7884 Temp tmp = bld.tmp(v1);
7885 if (ctx->program->chip_class >= GFX8)
7886 emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl), tmp);
7887 else
7888 emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, dpp_ctrl), tmp);
7889 bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
7890 } else if (instr->dest.ssa.bit_size == 32) {
7891 Temp tmp;
7892 if (ctx->program->chip_class >= GFX8)
7893 tmp = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl);
7894 else
7895 tmp = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, dpp_ctrl);
7896 emit_wqm(ctx, tmp, dst);
7897 } else if (instr->dest.ssa.bit_size == 64) {
7898 Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
7899 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
7900 if (ctx->program->chip_class >= GFX8) {
7901 lo = emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), lo, dpp_ctrl));
7902 hi = emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), hi, dpp_ctrl));
7903 } else {
7904 lo = emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), lo, dpp_ctrl));
7905 hi = emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), hi, dpp_ctrl));
7906 }
7907 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
7908 emit_split_vector(ctx, dst, 2);
7909 } else {
7910 fprintf(stderr, "Unimplemented NIR instr bit size: ");
7911 nir_print_instr(&instr->instr, stderr);
7912 fprintf(stderr, "\n");
7913 }
7914 break;
7915 }
7916 case nir_intrinsic_masked_swizzle_amd: {
7917 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
7918 if (!nir_dest_is_divergent(instr->dest)) {
7919 emit_uniform_subgroup(ctx, instr, src);
7920 break;
7921 }
7922 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7923 uint32_t mask = nir_intrinsic_swizzle_mask(instr);
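      /* With bit 15 clear, ds_swizzle uses its masked mode:
       * new_lane = ((lane & and_mask) | or_mask) ^ xor_mask, with the three
       * 5-bit fields packed into the 15-bit offset. */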
7924 if (dst.regClass() == v1) {
7925 emit_wqm(ctx,
7926 bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, mask, 0, false),
7927 dst);
7928 } else if (dst.regClass() == v2) {
7929 Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
7930 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
7931 lo = emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), lo, mask, 0, false));
7932 hi = emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), hi, mask, 0, false));
7933 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
7934 emit_split_vector(ctx, dst, 2);
7935 } else {
7936 fprintf(stderr, "Unimplemented NIR instr bit size: ");
7937 nir_print_instr(&instr->instr, stderr);
7938 fprintf(stderr, "\n");
7939 }
7940 break;
7941 }
7942 case nir_intrinsic_write_invocation_amd: {
7943 Temp src = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
7944 Temp val = bld.as_uniform(get_ssa_temp(ctx, instr->src[1].ssa));
7945 Temp lane = bld.as_uniform(get_ssa_temp(ctx, instr->src[2].ssa));
7946 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7947 if (dst.regClass() == v1) {
7948 /* src2 is ignored for writelane. RA assigns the same reg for dst */
7949 emit_wqm(ctx, bld.writelane(bld.def(v1), val, lane, src), dst);
7950 } else if (dst.regClass() == v2) {
7951 Temp src_lo = bld.tmp(v1), src_hi = bld.tmp(v1);
7952 Temp val_lo = bld.tmp(s1), val_hi = bld.tmp(s1);
7953 bld.pseudo(aco_opcode::p_split_vector, Definition(src_lo), Definition(src_hi), src);
7954 bld.pseudo(aco_opcode::p_split_vector, Definition(val_lo), Definition(val_hi), val);
7955          Temp lo = emit_wqm(ctx, bld.writelane(bld.def(v1), val_lo, lane, src_lo));
7956 Temp hi = emit_wqm(ctx, bld.writelane(bld.def(v1), val_hi, lane, src_hi));
7957 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
7958 emit_split_vector(ctx, dst, 2);
7959 } else {
7960 fprintf(stderr, "Unimplemented NIR instr bit size: ");
7961 nir_print_instr(&instr->instr, stderr);
7962 fprintf(stderr, "\n");
7963 }
7964 break;
7965 }
7966 case nir_intrinsic_mbcnt_amd: {
7967 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
7968 RegClass rc = RegClass(src.type(), 1);
7969 Temp mask_lo = bld.tmp(rc), mask_hi = bld.tmp(rc);
7970 bld.pseudo(aco_opcode::p_split_vector, Definition(mask_lo), Definition(mask_hi), src);
7971 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7972 Temp wqm_tmp = emit_mbcnt(ctx, bld.def(v1), Operand(mask_lo), Operand(mask_hi));
7973 emit_wqm(ctx, wqm_tmp, dst);
7974 break;
7975 }
7976 case nir_intrinsic_load_helper_invocation: {
7977 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7978 bld.pseudo(aco_opcode::p_load_helper, Definition(dst));
7979 ctx->block->kind |= block_kind_needs_lowering;
7980 ctx->program->needs_exact = true;
7981 break;
7982 }
7983 case nir_intrinsic_is_helper_invocation: {
7984 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7985 bld.pseudo(aco_opcode::p_is_helper, Definition(dst));
7986 ctx->block->kind |= block_kind_needs_lowering;
7987 ctx->program->needs_exact = true;
7988 break;
7989 }
7990 case nir_intrinsic_demote:
7991 bld.pseudo(aco_opcode::p_demote_to_helper, Operand(-1u));
7992
7993 if (ctx->cf_info.loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
7994 ctx->cf_info.exec_potentially_empty_discard = true;
7995 ctx->block->kind |= block_kind_uses_demote;
7996 ctx->program->needs_exact = true;
7997 break;
7998 case nir_intrinsic_demote_if: {
7999 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8000 assert(src.regClass() == bld.lm);
8001 Temp cond = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
8002 bld.pseudo(aco_opcode::p_demote_to_helper, cond);
8003
8004 if (ctx->cf_info.loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
8005 ctx->cf_info.exec_potentially_empty_discard = true;
8006 ctx->block->kind |= block_kind_uses_demote;
8007 ctx->program->needs_exact = true;
8008 break;
8009 }
8010 case nir_intrinsic_first_invocation: {
8011 emit_wqm(ctx, bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm)),
8012 get_ssa_temp(ctx, &instr->dest.ssa));
8013 break;
8014 }
8015 case nir_intrinsic_shader_clock: {
8016 aco_opcode opcode =
8017 nir_intrinsic_memory_scope(instr) == NIR_SCOPE_DEVICE ?
8018 aco_opcode::s_memrealtime : aco_opcode::s_memtime;
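      /* s_memrealtime reads the constant-rate real-time counter (consistent
       * across the device), while s_memtime reads the shader clock; both return
       * a 64-bit value, hence the split into two dwords below. */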
8019 bld.smem(opcode, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), false);
8020 emit_split_vector(ctx, get_ssa_temp(ctx, &instr->dest.ssa), 2);
8021 break;
8022 }
8023 case nir_intrinsic_load_vertex_id_zero_base: {
8024 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8025 bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.vertex_id));
8026 break;
8027 }
8028 case nir_intrinsic_load_first_vertex: {
8029 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8030 bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.base_vertex));
8031 break;
8032 }
8033 case nir_intrinsic_load_base_instance: {
8034 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8035 bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.start_instance));
8036 break;
8037 }
8038 case nir_intrinsic_load_instance_id: {
8039 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8040 bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.instance_id));
8041 break;
8042 }
8043 case nir_intrinsic_load_draw_id: {
8044 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8045 bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.draw_id));
8046 break;
8047 }
8048 case nir_intrinsic_load_invocation_id: {
8049 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8050
8051 if (ctx->shader->info.stage == MESA_SHADER_GEOMETRY) {
8052 if (ctx->options->chip_class >= GFX10)
8053 bld.vop2_e64(aco_opcode::v_and_b32, Definition(dst), Operand(127u), get_arg(ctx, ctx->args->ac.gs_invocation_id));
8054 else
8055 bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.gs_invocation_id));
8056 } else if (ctx->shader->info.stage == MESA_SHADER_TESS_CTRL) {
8057 bld.vop3(aco_opcode::v_bfe_u32, Definition(dst),
8058 get_arg(ctx, ctx->args->ac.tcs_rel_ids), Operand(8u), Operand(5u));
8059 } else {
8060 unreachable("Unsupported stage for load_invocation_id");
8061 }
8062
8063 break;
8064 }
8065 case nir_intrinsic_load_primitive_id: {
8066 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8067
8068 switch (ctx->shader->info.stage) {
8069 case MESA_SHADER_GEOMETRY:
8070 bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.gs_prim_id));
8071 break;
8072 case MESA_SHADER_TESS_CTRL:
8073 bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.tcs_patch_id));
8074 break;
8075 case MESA_SHADER_TESS_EVAL:
8076 bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.tes_patch_id));
8077 break;
8078 default:
8079 unreachable("Unimplemented shader stage for nir_intrinsic_load_primitive_id");
8080 }
8081
8082 break;
8083 }
8084 case nir_intrinsic_load_patch_vertices_in: {
8085 assert(ctx->shader->info.stage == MESA_SHADER_TESS_CTRL ||
8086 ctx->shader->info.stage == MESA_SHADER_TESS_EVAL);
8087
8088 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8089 bld.copy(Definition(dst), Operand(ctx->args->options->key.tcs.input_vertices));
8090 break;
8091 }
8092 case nir_intrinsic_emit_vertex_with_counter: {
8093 visit_emit_vertex_with_counter(ctx, instr);
8094 break;
8095 }
8096 case nir_intrinsic_end_primitive_with_counter: {
8097 unsigned stream = nir_intrinsic_stream_id(instr);
8098 bld.sopp(aco_opcode::s_sendmsg, bld.m0(ctx->gs_wave_id), -1, sendmsg_gs(true, false, stream));
8099 break;
8100 }
8101 case nir_intrinsic_set_vertex_count: {
8102 /* unused, the HW keeps track of this for us */
8103 break;
8104 }
8105 default:
8106 fprintf(stderr, "Unimplemented intrinsic instr: ");
8107 nir_print_instr(&instr->instr, stderr);
8108 fprintf(stderr, "\n");
8109 abort();
8110
8111 break;
8112 }
8113 }
8114
8115
8116 void tex_fetch_ptrs(isel_context *ctx, nir_tex_instr *instr,
8117 Temp *res_ptr, Temp *samp_ptr, Temp *fmask_ptr,
8118 enum glsl_base_type *stype)
8119 {
8120 nir_deref_instr *texture_deref_instr = NULL;
8121 nir_deref_instr *sampler_deref_instr = NULL;
8122 int plane = -1;
8123
8124 for (unsigned i = 0; i < instr->num_srcs; i++) {
8125 switch (instr->src[i].src_type) {
8126 case nir_tex_src_texture_deref:
8127 texture_deref_instr = nir_src_as_deref(instr->src[i].src);
8128 break;
8129 case nir_tex_src_sampler_deref:
8130 sampler_deref_instr = nir_src_as_deref(instr->src[i].src);
8131 break;
8132 case nir_tex_src_plane:
8133 plane = nir_src_as_int(instr->src[i].src);
8134 break;
8135 default:
8136 break;
8137 }
8138 }
8139
8140 *stype = glsl_get_sampler_result_type(texture_deref_instr->type);
8141
8142 if (!sampler_deref_instr)
8143 sampler_deref_instr = texture_deref_instr;
8144
8145 if (plane >= 0) {
8146 assert(instr->op != nir_texop_txf_ms &&
8147 instr->op != nir_texop_samples_identical);
8148 assert(instr->sampler_dim != GLSL_SAMPLER_DIM_BUF);
8149 *res_ptr = get_sampler_desc(ctx, texture_deref_instr, (aco_descriptor_type)(ACO_DESC_PLANE_0 + plane), instr, false, false);
8150 } else if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) {
8151 *res_ptr = get_sampler_desc(ctx, texture_deref_instr, ACO_DESC_BUFFER, instr, false, false);
8152 } else if (instr->op == nir_texop_fragment_mask_fetch) {
8153 *res_ptr = get_sampler_desc(ctx, texture_deref_instr, ACO_DESC_FMASK, instr, false, false);
8154 } else {
8155 *res_ptr = get_sampler_desc(ctx, texture_deref_instr, ACO_DESC_IMAGE, instr, false, false);
8156 }
8157 if (samp_ptr) {
8158 *samp_ptr = get_sampler_desc(ctx, sampler_deref_instr, ACO_DESC_SAMPLER, instr, false, false);
8159
8160 if (instr->sampler_dim < GLSL_SAMPLER_DIM_RECT && ctx->options->chip_class < GFX8) {
8161 /* fix sampler aniso on SI/CI: samp[0] = samp[0] & img[7] */
8162 Builder bld(ctx->program, ctx->block);
8163
8164 /* to avoid unnecessary moves, we split and recombine sampler and image */
8165 Temp img[8] = {bld.tmp(s1), bld.tmp(s1), bld.tmp(s1), bld.tmp(s1),
8166 bld.tmp(s1), bld.tmp(s1), bld.tmp(s1), bld.tmp(s1)};
8167 Temp samp[4] = {bld.tmp(s1), bld.tmp(s1), bld.tmp(s1), bld.tmp(s1)};
8168 bld.pseudo(aco_opcode::p_split_vector, Definition(img[0]), Definition(img[1]),
8169 Definition(img[2]), Definition(img[3]), Definition(img[4]),
8170 Definition(img[5]), Definition(img[6]), Definition(img[7]), *res_ptr);
8171 bld.pseudo(aco_opcode::p_split_vector, Definition(samp[0]), Definition(samp[1]),
8172 Definition(samp[2]), Definition(samp[3]), *samp_ptr);
8173
8174 samp[0] = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), samp[0], img[7]);
8175 *res_ptr = bld.pseudo(aco_opcode::p_create_vector, bld.def(s8),
8176 img[0], img[1], img[2], img[3],
8177 img[4], img[5], img[6], img[7]);
8178 *samp_ptr = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4),
8179 samp[0], samp[1], samp[2], samp[3]);
8180 }
8181 }
8182 if (fmask_ptr && (instr->op == nir_texop_txf_ms ||
8183 instr->op == nir_texop_samples_identical))
8184 *fmask_ptr = get_sampler_desc(ctx, texture_deref_instr, ACO_DESC_FMASK, instr, false, false);
8185 }
8186
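/* Select the sc/tc/ma components of a derivative vector for the cube face
 * identified by id (the result of v_cubeid), mirroring ac_build_cube_select()
 * in ac_llvm_build.c. */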
8187 void build_cube_select(isel_context *ctx, Temp ma, Temp id, Temp deriv,
8188 Temp *out_ma, Temp *out_sc, Temp *out_tc)
8189 {
8190 Builder bld(ctx->program, ctx->block);
8191
8192 Temp deriv_x = emit_extract_vector(ctx, deriv, 0, v1);
8193 Temp deriv_y = emit_extract_vector(ctx, deriv, 1, v1);
8194 Temp deriv_z = emit_extract_vector(ctx, deriv, 2, v1);
8195
8196 Operand neg_one(0xbf800000u);
8197 Operand one(0x3f800000u);
8198 Operand two(0x40000000u);
8199 Operand four(0x40800000u);
8200
8201 Temp is_ma_positive = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), ma);
8202 Temp sgn_ma = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), neg_one, one, is_ma_positive);
8203 Temp neg_sgn_ma = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), Operand(0u), sgn_ma);
8204
8205 Temp is_ma_z = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(bld.lm)), four, id);
8206 Temp is_ma_y = bld.vopc(aco_opcode::v_cmp_le_f32, bld.def(bld.lm), two, id);
8207 is_ma_y = bld.sop2(Builder::s_andn2, bld.hint_vcc(bld.def(bld.lm)), is_ma_y, is_ma_z);
8208        Temp is_not_ma_x = bld.sop2(Builder::s_or, bld.hint_vcc(bld.def(bld.lm)), is_ma_z, is_ma_y);
8209
8210 // select sc
8211 Temp tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), deriv_z, deriv_x, is_not_ma_x);
8212 Temp sgn = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1),
8213 bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), neg_sgn_ma, sgn_ma, is_ma_z),
8214 one, is_ma_y);
8215 *out_sc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), tmp, sgn);
8216
8217 // select tc
8218 tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), deriv_y, deriv_z, is_ma_y);
8219 sgn = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), neg_one, sgn_ma, is_ma_y);
8220 *out_tc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), tmp, sgn);
8221
8222 // select ma
8223 tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
8224 bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), deriv_x, deriv_y, is_ma_y),
8225 deriv_z, is_ma_z);
8226 tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7fffffffu), tmp);
8227 *out_ma = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), two, tmp);
8228 }
8229
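/* Turn cube (array) coordinates into face coordinates: sc and tc are scaled by
 * 1/|ma| and biased by 1.5, and the array layer is folded into the face id as
 * layer * 8 + face. For txd, the derivatives are rewritten as well; see
 * ac_prepare_cube_coords(). */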
8230 void prepare_cube_coords(isel_context *ctx, std::vector<Temp>& coords, Temp* ddx, Temp* ddy, bool is_deriv, bool is_array)
8231 {
8232 Builder bld(ctx->program, ctx->block);
8233 Temp ma, tc, sc, id;
8234
8235 if (is_array) {
8236 coords[3] = bld.vop1(aco_opcode::v_rndne_f32, bld.def(v1), coords[3]);
8237
8238 // see comment in ac_prepare_cube_coords()
8239 if (ctx->options->chip_class <= GFX8)
8240 coords[3] = bld.vop2(aco_opcode::v_max_f32, bld.def(v1), Operand(0u), coords[3]);
8241 }
8242
8243 ma = bld.vop3(aco_opcode::v_cubema_f32, bld.def(v1), coords[0], coords[1], coords[2]);
8244
8245 aco_ptr<VOP3A_instruction> vop3a{create_instruction<VOP3A_instruction>(aco_opcode::v_rcp_f32, asVOP3(Format::VOP1), 1, 1)};
8246 vop3a->operands[0] = Operand(ma);
8247 vop3a->abs[0] = true;
8248 Temp invma = bld.tmp(v1);
8249 vop3a->definitions[0] = Definition(invma);
8250 ctx->block->instructions.emplace_back(std::move(vop3a));
8251
8252 sc = bld.vop3(aco_opcode::v_cubesc_f32, bld.def(v1), coords[0], coords[1], coords[2]);
8253 if (!is_deriv)
8254 sc = bld.vop2(aco_opcode::v_madak_f32, bld.def(v1), sc, invma, Operand(0x3fc00000u/*1.5*/));
8255
8256 tc = bld.vop3(aco_opcode::v_cubetc_f32, bld.def(v1), coords[0], coords[1], coords[2]);
8257 if (!is_deriv)
8258 tc = bld.vop2(aco_opcode::v_madak_f32, bld.def(v1), tc, invma, Operand(0x3fc00000u/*1.5*/));
8259
8260 id = bld.vop3(aco_opcode::v_cubeid_f32, bld.def(v1), coords[0], coords[1], coords[2]);
8261
8262 if (is_deriv) {
8263 sc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), sc, invma);
8264 tc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), tc, invma);
8265
8266 for (unsigned i = 0; i < 2; i++) {
8267 // see comment in ac_prepare_cube_coords()
8268 Temp deriv_ma;
8269 Temp deriv_sc, deriv_tc;
8270 build_cube_select(ctx, ma, id, i ? *ddy : *ddx,
8271 &deriv_ma, &deriv_sc, &deriv_tc);
8272
8273 deriv_ma = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_ma, invma);
8274
8275 Temp x = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1),
8276 bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_sc, invma),
8277 bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_ma, sc));
8278 Temp y = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1),
8279 bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_tc, invma),
8280 bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_ma, tc));
8281 *(i ? ddy : ddx) = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), x, y);
8282 }
8283
8284 sc = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), Operand(0x3fc00000u/*1.5*/), sc);
8285 tc = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), Operand(0x3fc00000u/*1.5*/), tc);
8286 }
8287
8288 if (is_array)
8289 id = bld.vop2(aco_opcode::v_madmk_f32, bld.def(v1), coords[3], id, Operand(0x41000000u/*8.0*/));
8290 coords.resize(3);
8291 coords[0] = sc;
8292 coords[1] = tc;
8293 coords[2] = id;
8294 }
8295
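/* If the SSA def is produced by a nir vecN, collect the constant value of each
 * component (NULL for non-constant components) so they can be folded later. */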
8296 void get_const_vec(nir_ssa_def *vec, nir_const_value *cv[4])
8297 {
8298 if (vec->parent_instr->type != nir_instr_type_alu)
8299 return;
8300 nir_alu_instr *vec_instr = nir_instr_as_alu(vec->parent_instr);
8301 if (vec_instr->op != nir_op_vec(vec->num_components))
8302 return;
8303
8304 for (unsigned i = 0; i < vec->num_components; i++) {
8305 cv[i] = vec_instr->src[i].swizzle[0] == 0 ?
8306 nir_src_as_const_value(vec_instr->src[i].src) : NULL;
8307 }
8308 }
8309
8310 void visit_tex(isel_context *ctx, nir_tex_instr *instr)
8311 {
8312 Builder bld(ctx->program, ctx->block);
8313 bool has_bias = false, has_lod = false, level_zero = false, has_compare = false,
8314 has_offset = false, has_ddx = false, has_ddy = false, has_derivs = false, has_sample_index = false,
8315 has_clamped_lod = false;
8316 Temp resource, sampler, fmask_ptr, bias = Temp(), compare = Temp(), sample_index = Temp(),
8317 lod = Temp(), offset = Temp(), ddx = Temp(), ddy = Temp(),
8318 clamped_lod = Temp();
8319 std::vector<Temp> coords;
8320 std::vector<Temp> derivs;
8321 nir_const_value *sample_index_cv = NULL;
8322 nir_const_value *const_offset[4] = {NULL, NULL, NULL, NULL};
8323 enum glsl_base_type stype;
8324 tex_fetch_ptrs(ctx, instr, &resource, &sampler, &fmask_ptr, &stype);
8325
8326 bool tg4_integer_workarounds = ctx->options->chip_class <= GFX8 && instr->op == nir_texop_tg4 &&
8327 (stype == GLSL_TYPE_UINT || stype == GLSL_TYPE_INT);
8328 bool tg4_integer_cube_workaround = tg4_integer_workarounds &&
8329 instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE;
8330
8331 for (unsigned i = 0; i < instr->num_srcs; i++) {
8332 switch (instr->src[i].src_type) {
8333 case nir_tex_src_coord: {
8334 Temp coord = get_ssa_temp(ctx, instr->src[i].src.ssa);
8335 for (unsigned i = 0; i < coord.size(); i++)
8336 coords.emplace_back(emit_extract_vector(ctx, coord, i, v1));
8337 break;
8338 }
8339 case nir_tex_src_bias:
8340 bias = get_ssa_temp(ctx, instr->src[i].src.ssa);
8341 has_bias = true;
8342 break;
8343 case nir_tex_src_lod: {
8344 nir_const_value *val = nir_src_as_const_value(instr->src[i].src);
8345
8346 if (val && val->f32 <= 0.0) {
8347 level_zero = true;
8348 } else {
8349 lod = get_ssa_temp(ctx, instr->src[i].src.ssa);
8350 has_lod = true;
8351 }
8352 break;
8353 }
8354 case nir_tex_src_min_lod:
8355 clamped_lod = get_ssa_temp(ctx, instr->src[i].src.ssa);
8356 has_clamped_lod = true;
8357 break;
8358 case nir_tex_src_comparator:
8359 if (instr->is_shadow) {
8360 compare = get_ssa_temp(ctx, instr->src[i].src.ssa);
8361 has_compare = true;
8362 }
8363 break;
8364 case nir_tex_src_offset:
8365 offset = get_ssa_temp(ctx, instr->src[i].src.ssa);
8366 get_const_vec(instr->src[i].src.ssa, const_offset);
8367 has_offset = true;
8368 break;
8369 case nir_tex_src_ddx:
8370 ddx = get_ssa_temp(ctx, instr->src[i].src.ssa);
8371 has_ddx = true;
8372 break;
8373 case nir_tex_src_ddy:
8374 ddy = get_ssa_temp(ctx, instr->src[i].src.ssa);
8375 has_ddy = true;
8376 break;
8377 case nir_tex_src_ms_index:
8378 sample_index = get_ssa_temp(ctx, instr->src[i].src.ssa);
8379 sample_index_cv = nir_src_as_const_value(instr->src[i].src);
8380 has_sample_index = true;
8381 break;
8382 case nir_tex_src_texture_offset:
8383 case nir_tex_src_sampler_offset:
8384 default:
8385 break;
8386 }
8387 }
8388
8389 if (instr->op == nir_texop_txs && instr->sampler_dim == GLSL_SAMPLER_DIM_BUF)
8390 return get_buffer_size(ctx, resource, get_ssa_temp(ctx, &instr->dest.ssa), true);
8391
8392 if (instr->op == nir_texop_texture_samples) {
8393 Temp dword3 = emit_extract_vector(ctx, resource, 3, s1);
8394
8395 Temp samples_log2 = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), dword3, Operand(16u | 4u<<16));
8396 Temp samples = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), Operand(1u), samples_log2);
8397 Temp type = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), dword3, Operand(28u | 4u<<16 /* offset=28, width=4 */));
8398
8399 Operand default_sample = Operand(1u);
8400 if (ctx->options->robust_buffer_access) {
8401          /* Extract the second dword of the descriptor; if it's
8402           * all zero, then it's a null descriptor.
8403           */
8404 Temp dword1 = emit_extract_vector(ctx, resource, 1, s1);
8405 Temp is_non_null_descriptor = bld.sopc(aco_opcode::s_cmp_gt_u32, bld.def(s1, scc), dword1, Operand(0u));
8406 default_sample = Operand(is_non_null_descriptor);
8407 }
8408
8409 Temp is_msaa = bld.sopc(aco_opcode::s_cmp_ge_u32, bld.def(s1, scc), type, Operand(14u));
8410 bld.sop2(aco_opcode::s_cselect_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8411 samples, default_sample, bld.scc(is_msaa));
8412 return;
8413 }
8414
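   /* For sampled operations, pack the texel offsets into a single dword:
    * 6 bits per component, one component per byte, with constant components
    * folded into an immediate. */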
8415 if (has_offset && instr->op != nir_texop_txf && instr->op != nir_texop_txf_ms) {
8416 aco_ptr<Instruction> tmp_instr;
8417 Temp acc, pack = Temp();
8418
8419 uint32_t pack_const = 0;
8420 for (unsigned i = 0; i < offset.size(); i++) {
8421 if (!const_offset[i])
8422 continue;
8423 pack_const |= (const_offset[i]->u32 & 0x3Fu) << (8u * i);
8424 }
8425
8426 if (offset.type() == RegType::sgpr) {
8427 for (unsigned i = 0; i < offset.size(); i++) {
8428 if (const_offset[i])
8429 continue;
8430
8431 acc = emit_extract_vector(ctx, offset, i, s1);
8432 acc = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), acc, Operand(0x3Fu));
8433
8434 if (i) {
8435 acc = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), acc, Operand(8u * i));
8436 }
8437
8438 if (pack == Temp()) {
8439 pack = acc;
8440 } else {
8441 pack = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), pack, acc);
8442 }
8443 }
8444
8445 if (pack_const && pack != Temp())
8446 pack = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), Operand(pack_const), pack);
8447 } else {
8448 for (unsigned i = 0; i < offset.size(); i++) {
8449 if (const_offset[i])
8450 continue;
8451
8452 acc = emit_extract_vector(ctx, offset, i, v1);
8453 acc = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x3Fu), acc);
8454
8455 if (i) {
8456 acc = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(8u * i), acc);
8457 }
8458
8459 if (pack == Temp()) {
8460 pack = acc;
8461 } else {
8462 pack = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), pack, acc);
8463 }
8464 }
8465
8466 if (pack_const && pack != Temp())
8467           pack = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand(pack_const), pack);
8468 }
8469 if (pack_const && pack == Temp())
8470 offset = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(pack_const));
8471 else if (pack == Temp())
8472 has_offset = false;
8473 else
8474 offset = pack;
8475 }
8476
8477 if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE && instr->coord_components)
8478 prepare_cube_coords(ctx, coords, &ddx, &ddy, instr->op == nir_texop_txd, instr->is_array && instr->op != nir_texop_lod);
8479
8480 /* pack derivatives */
8481 if (has_ddx || has_ddy) {
8482 if (instr->sampler_dim == GLSL_SAMPLER_DIM_1D && ctx->options->chip_class == GFX9) {
8483 assert(has_ddx && has_ddy && ddx.size() == 1 && ddy.size() == 1);
8484 Temp zero = bld.copy(bld.def(v1), Operand(0u));
8485 derivs = {ddx, zero, ddy, zero};
8486 } else {
8487 for (unsigned i = 0; has_ddx && i < ddx.size(); i++)
8488 derivs.emplace_back(emit_extract_vector(ctx, ddx, i, v1));
8489 for (unsigned i = 0; has_ddy && i < ddy.size(); i++)
8490 derivs.emplace_back(emit_extract_vector(ctx, ddy, i, v1));
8491 }
8492 has_derivs = true;
8493 }
8494
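   /* Round array layers to the nearest integer for sampled (non-fetch)
    * operations; fetch operations address the layer directly. */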
8495 if (instr->coord_components > 1 &&
8496 instr->sampler_dim == GLSL_SAMPLER_DIM_1D &&
8497 instr->is_array &&
8498 instr->op != nir_texop_txf)
8499 coords[1] = bld.vop1(aco_opcode::v_rndne_f32, bld.def(v1), coords[1]);
8500
8501 if (instr->coord_components > 2 &&
8502 (instr->sampler_dim == GLSL_SAMPLER_DIM_2D ||
8503 instr->sampler_dim == GLSL_SAMPLER_DIM_MS ||
8504 instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS ||
8505 instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS) &&
8506 instr->is_array &&
8507 instr->op != nir_texop_txf &&
8508 instr->op != nir_texop_txf_ms &&
8509 instr->op != nir_texop_fragment_fetch &&
8510 instr->op != nir_texop_fragment_mask_fetch)
8511 coords[2] = bld.vop1(aco_opcode::v_rndne_f32, bld.def(v1), coords[2]);
8512
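   /* GFX9 addresses 1D textures as 2D: insert a y coordinate of 0 for txf
    * and 0.5 for sampled accesses. */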
8513 if (ctx->options->chip_class == GFX9 &&
8514 instr->sampler_dim == GLSL_SAMPLER_DIM_1D &&
8515 instr->op != nir_texop_lod && instr->coord_components) {
8516 assert(coords.size() > 0 && coords.size() < 3);
8517
8518 coords.insert(std::next(coords.begin()), bld.copy(bld.def(v1), instr->op == nir_texop_txf ?
8519 Operand((uint32_t) 0) :
8520 Operand((uint32_t) 0x3f000000)));
8521 }
8522
8523 bool da = should_declare_array(ctx, instr->sampler_dim, instr->is_array);
8524
8525 if (instr->op == nir_texop_samples_identical)
8526 resource = fmask_ptr;
8527
8528 else if ((instr->sampler_dim == GLSL_SAMPLER_DIM_MS ||
8529 instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS) &&
8530 instr->op != nir_texop_txs &&
8531 instr->op != nir_texop_fragment_fetch &&
8532 instr->op != nir_texop_fragment_mask_fetch) {
8533 assert(has_sample_index);
8534 Operand op(sample_index);
8535 if (sample_index_cv)
8536 op = Operand(sample_index_cv->u32);
8537 sample_index = adjust_sample_index_using_fmask(ctx, da, coords, op, fmask_ptr);
8538 }
8539
8540 if (has_offset && (instr->op == nir_texop_txf || instr->op == nir_texop_txf_ms)) {
8541 for (unsigned i = 0; i < std::min(offset.size(), instr->coord_components); i++) {
8542 Temp off = emit_extract_vector(ctx, offset, i, v1);
8543 coords[i] = bld.vadd32(bld.def(v1), coords[i], off);
8544 }
8545 has_offset = false;
8546 }
8547
8548 /* Build tex instruction */
8549 unsigned dmask = nir_ssa_def_components_read(&instr->dest.ssa);
8550 unsigned dim = ctx->options->chip_class >= GFX10 && instr->sampler_dim != GLSL_SAMPLER_DIM_BUF
8551 ? ac_get_sampler_dim(ctx->options->chip_class, instr->sampler_dim, instr->is_array)
8552 : 0;
8553 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8554 Temp tmp_dst = dst;
8555
8556 /* gather4 selects the component by dmask and always returns vec4 */
8557 if (instr->op == nir_texop_tg4) {
8558 assert(instr->dest.ssa.num_components == 4);
8559 if (instr->is_shadow)
8560 dmask = 1;
8561 else
8562 dmask = 1 << instr->component;
8563 if (tg4_integer_cube_workaround || dst.type() == RegType::sgpr)
8564 tmp_dst = bld.tmp(v4);
8565 } else if (instr->op == nir_texop_samples_identical) {
8566 tmp_dst = bld.tmp(v1);
8567 } else if (util_bitcount(dmask) != instr->dest.ssa.num_components || dst.type() == RegType::sgpr) {
8568 tmp_dst = bld.tmp(RegClass(RegType::vgpr, util_bitcount(dmask)));
8569 }
8570
8571 aco_ptr<MIMG_instruction> tex;
8572 if (instr->op == nir_texop_txs || instr->op == nir_texop_query_levels) {
8573 if (!has_lod)
8574 lod = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0u));
8575
8576 bool div_by_6 = instr->op == nir_texop_txs &&
8577 instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE &&
8578 instr->is_array &&
8579 (dmask & (1 << 2));
8580 if (tmp_dst.id() == dst.id() && div_by_6)
8581 tmp_dst = bld.tmp(tmp_dst.regClass());
8582
8583 tex.reset(create_instruction<MIMG_instruction>(aco_opcode::image_get_resinfo, Format::MIMG, 3, 1));
8584 tex->operands[0] = Operand(resource);
8585 tex->operands[1] = Operand(s4); /* no sampler */
8586        tex->operands[2] = Operand(as_vgpr(ctx, lod));
8587 if (ctx->options->chip_class == GFX9 &&
8588 instr->op == nir_texop_txs &&
8589 instr->sampler_dim == GLSL_SAMPLER_DIM_1D &&
8590 instr->is_array) {
8591 tex->dmask = (dmask & 0x1) | ((dmask & 0x2) << 1);
8592 } else if (instr->op == nir_texop_query_levels) {
8593 tex->dmask = 1 << 3;
8594 } else {
8595 tex->dmask = dmask;
8596 }
8597 tex->da = da;
8598 tex->definitions[0] = Definition(tmp_dst);
8599 tex->dim = dim;
8600 tex->can_reorder = true;
8601 ctx->block->instructions.emplace_back(std::move(tex));
8602
8603 if (div_by_6) {
8604           /* divide the 3rd value by 6 by multiplying with the magic number 0x2AAAAAAB (2^32 / 6, rounded up) */
8605 emit_split_vector(ctx, tmp_dst, tmp_dst.size());
8606 Temp c = bld.copy(bld.def(s1), Operand((uint32_t) 0x2AAAAAAB));
8607 Temp by_6 = bld.vop3(aco_opcode::v_mul_hi_i32, bld.def(v1), emit_extract_vector(ctx, tmp_dst, 2, v1), c);
8608 assert(instr->dest.ssa.num_components == 3);
8609 Temp tmp = dst.type() == RegType::vgpr ? dst : bld.tmp(v3);
8610 tmp_dst = bld.pseudo(aco_opcode::p_create_vector, Definition(tmp),
8611 emit_extract_vector(ctx, tmp_dst, 0, v1),
8612 emit_extract_vector(ctx, tmp_dst, 1, v1),
8613 by_6);
8614
8615 }
8616
8617 expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, dmask);
8618 return;
8619 }
8620
8621 Temp tg4_compare_cube_wa64 = Temp();
8622
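   /* GFX8 and earlier return incorrectly filtered results for gather4 on
    * integer formats. Query the texture size, nudge the coordinates by -0.5
    * texel and, for cubes, patch the descriptor's number format; see
    * lower_gather4_integer() in ac_nir_to_llvm.c. */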
8623 if (tg4_integer_workarounds) {
8624 tex.reset(create_instruction<MIMG_instruction>(aco_opcode::image_get_resinfo, Format::MIMG, 3, 1));
8625 tex->operands[0] = Operand(resource);
8626 tex->operands[1] = Operand(s4); /* no sampler */
8627 tex->operands[2] = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0u));
8628 tex->dim = dim;
8629 tex->dmask = 0x3;
8630 tex->da = da;
8631 Temp size = bld.tmp(v2);
8632 tex->definitions[0] = Definition(size);
8633 tex->can_reorder = true;
8634 ctx->block->instructions.emplace_back(std::move(tex));
8635 emit_split_vector(ctx, size, size.size());
8636
8637 Temp half_texel[2];
8638 for (unsigned i = 0; i < 2; i++) {
8639 half_texel[i] = emit_extract_vector(ctx, size, i, v1);
8640 half_texel[i] = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), half_texel[i]);
8641 half_texel[i] = bld.vop1(aco_opcode::v_rcp_iflag_f32, bld.def(v1), half_texel[i]);
8642 half_texel[i] = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0xbf000000/*-0.5*/), half_texel[i]);
8643 }
8644
8645 Temp new_coords[2] = {
8646 bld.vop2(aco_opcode::v_add_f32, bld.def(v1), coords[0], half_texel[0]),
8647 bld.vop2(aco_opcode::v_add_f32, bld.def(v1), coords[1], half_texel[1])
8648 };
8649
8650 if (tg4_integer_cube_workaround) {
8651 // see comment in ac_nir_to_llvm.c's lower_gather4_integer()
8652 Temp desc[resource.size()];
8653 aco_ptr<Instruction> split{create_instruction<Pseudo_instruction>(aco_opcode::p_split_vector,
8654 Format::PSEUDO, 1, resource.size())};
8655 split->operands[0] = Operand(resource);
8656 for (unsigned i = 0; i < resource.size(); i++) {
8657 desc[i] = bld.tmp(s1);
8658 split->definitions[i] = Definition(desc[i]);
8659 }
8660 ctx->block->instructions.emplace_back(std::move(split));
8661
8662 Temp dfmt = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), desc[1], Operand(20u | (6u << 16)));
8663 Temp compare_cube_wa = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), dfmt,
8664 Operand((uint32_t)V_008F14_IMG_DATA_FORMAT_8_8_8_8));
8665
8666 Temp nfmt;
8667 if (stype == GLSL_TYPE_UINT) {
8668 nfmt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1),
8669 Operand((uint32_t)V_008F14_IMG_NUM_FORMAT_USCALED),
8670 Operand((uint32_t)V_008F14_IMG_NUM_FORMAT_UINT),
8671 bld.scc(compare_cube_wa));
8672 } else {
8673 nfmt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1),
8674 Operand((uint32_t)V_008F14_IMG_NUM_FORMAT_SSCALED),
8675 Operand((uint32_t)V_008F14_IMG_NUM_FORMAT_SINT),
8676 bld.scc(compare_cube_wa));
8677 }
8678 tg4_compare_cube_wa64 = bld.tmp(bld.lm);
8679 bool_to_vector_condition(ctx, compare_cube_wa, tg4_compare_cube_wa64);
8680
8681 nfmt = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), nfmt, Operand(26u));
8682
8683 desc[1] = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), desc[1],
8684 Operand((uint32_t)C_008F14_NUM_FORMAT));
8685 desc[1] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), desc[1], nfmt);
8686
8687 aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector,
8688 Format::PSEUDO, resource.size(), 1)};
8689 for (unsigned i = 0; i < resource.size(); i++)
8690 vec->operands[i] = Operand(desc[i]);
8691 resource = bld.tmp(resource.regClass());
8692 vec->definitions[0] = Definition(resource);
8693 ctx->block->instructions.emplace_back(std::move(vec));
8694
8695 new_coords[0] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
8696 new_coords[0], coords[0], tg4_compare_cube_wa64);
8697 new_coords[1] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
8698 new_coords[1], coords[1], tg4_compare_cube_wa64);
8699 }
8700 coords[0] = new_coords[0];
8701 coords[1] = new_coords[1];
8702 }
8703
8704 if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) {
8705 //FIXME: if (ctx->abi->gfx9_stride_size_workaround) return ac_build_buffer_load_format_gfx9_safe()
8706
8707 assert(coords.size() == 1);
8708 unsigned last_bit = util_last_bit(nir_ssa_def_components_read(&instr->dest.ssa));
8709 aco_opcode op;
8710 switch (last_bit) {
8711 case 1:
8712 op = aco_opcode::buffer_load_format_x; break;
8713 case 2:
8714 op = aco_opcode::buffer_load_format_xy; break;
8715 case 3:
8716 op = aco_opcode::buffer_load_format_xyz; break;
8717 case 4:
8718 op = aco_opcode::buffer_load_format_xyzw; break;
8719 default:
8720 unreachable("Tex instruction loads more than 4 components.");
8721 }
8722
8723        /* if the instruction return value exactly matches the nir dest ssa, we can use it directly */
8724 if (last_bit == instr->dest.ssa.num_components && dst.type() == RegType::vgpr)
8725 tmp_dst = dst;
8726 else
8727 tmp_dst = bld.tmp(RegType::vgpr, last_bit);
8728
8729 aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)};
8730 mubuf->operands[0] = Operand(resource);
8731 mubuf->operands[1] = Operand(coords[0]);
8732 mubuf->operands[2] = Operand((uint32_t) 0);
8733 mubuf->definitions[0] = Definition(tmp_dst);
8734 mubuf->idxen = true;
8735 mubuf->can_reorder = true;
8736 ctx->block->instructions.emplace_back(std::move(mubuf));
8737
8738 expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, (1 << last_bit) - 1);
8739 return;
8740 }
8741
8742 /* gather MIMG address components */
8743 std::vector<Temp> args;
8744 if (has_offset)
8745 args.emplace_back(offset);
8746 if (has_bias)
8747 args.emplace_back(bias);
8748 if (has_compare)
8749 args.emplace_back(compare);
8750 if (has_derivs)
8751 args.insert(args.end(), derivs.begin(), derivs.end());
8752
8753 args.insert(args.end(), coords.begin(), coords.end());
8754 if (has_sample_index)
8755 args.emplace_back(sample_index);
8756 if (has_lod)
8757 args.emplace_back(lod);
8758 if (has_clamped_lod)
8759 args.emplace_back(clamped_lod);
8760
8761 Temp arg = bld.tmp(RegClass(RegType::vgpr, args.size()));
8762 aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, args.size(), 1)};
8763 vec->definitions[0] = Definition(arg);
8764 for (unsigned i = 0; i < args.size(); i++)
8765 vec->operands[i] = Operand(args[i]);
8766 ctx->block->instructions.emplace_back(std::move(vec));
8767
8768
8769 if (instr->op == nir_texop_txf ||
8770 instr->op == nir_texop_txf_ms ||
8771 instr->op == nir_texop_samples_identical ||
8772 instr->op == nir_texop_fragment_fetch ||
8773 instr->op == nir_texop_fragment_mask_fetch) {
8774 aco_opcode op = level_zero || instr->sampler_dim == GLSL_SAMPLER_DIM_MS || instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS ? aco_opcode::image_load : aco_opcode::image_load_mip;
8775 tex.reset(create_instruction<MIMG_instruction>(op, Format::MIMG, 3, 1));
8776 tex->operands[0] = Operand(resource);
8777 tex->operands[1] = Operand(s4); /* no sampler */
8778 tex->operands[2] = Operand(arg);
8779 tex->dim = dim;
8780 tex->dmask = dmask;
8781 tex->unrm = true;
8782 tex->da = da;
8783 tex->definitions[0] = Definition(tmp_dst);
8784 tex->can_reorder = true;
8785 ctx->block->instructions.emplace_back(std::move(tex));
8786
8787 if (instr->op == nir_texop_samples_identical) {
8788 assert(dmask == 1 && dst.regClass() == v1);
8789 assert(dst.id() != tmp_dst.id());
8790
8791 Temp tmp = bld.tmp(bld.lm);
8792 bld.vopc(aco_opcode::v_cmp_eq_u32, Definition(tmp), Operand(0u), tmp_dst).def(0).setHint(vcc);
8793 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand((uint32_t)-1), tmp);
8794
8795 } else {
8796 expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, dmask);
8797 }
8798 return;
8799 }
8800
8801 // TODO: would be better to do this by adding offsets, but needs the opcodes ordered.
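    // Pick the image_sample variant that matches the gathered modifiers
    // (compare, derivatives, bias, lod/level-zero, lod clamp, offset).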
8802 aco_opcode opcode = aco_opcode::image_sample;
8803 if (has_offset) { /* image_sample_*_o */
8804 if (has_clamped_lod) {
8805 if (has_compare) {
8806 opcode = aco_opcode::image_sample_c_cl_o;
8807 if (has_derivs)
8808 opcode = aco_opcode::image_sample_c_d_cl_o;
8809 if (has_bias)
8810 opcode = aco_opcode::image_sample_c_b_cl_o;
8811 } else {
8812 opcode = aco_opcode::image_sample_cl_o;
8813 if (has_derivs)
8814 opcode = aco_opcode::image_sample_d_cl_o;
8815 if (has_bias)
8816 opcode = aco_opcode::image_sample_b_cl_o;
8817 }
8818 } else if (has_compare) {
8819 opcode = aco_opcode::image_sample_c_o;
8820 if (has_derivs)
8821 opcode = aco_opcode::image_sample_c_d_o;
8822 if (has_bias)
8823 opcode = aco_opcode::image_sample_c_b_o;
8824 if (level_zero)
8825 opcode = aco_opcode::image_sample_c_lz_o;
8826 if (has_lod)
8827 opcode = aco_opcode::image_sample_c_l_o;
8828 } else {
8829 opcode = aco_opcode::image_sample_o;
8830 if (has_derivs)
8831 opcode = aco_opcode::image_sample_d_o;
8832 if (has_bias)
8833 opcode = aco_opcode::image_sample_b_o;
8834 if (level_zero)
8835 opcode = aco_opcode::image_sample_lz_o;
8836 if (has_lod)
8837 opcode = aco_opcode::image_sample_l_o;
8838 }
8839 } else if (has_clamped_lod) { /* image_sample_*_cl */
8840 if (has_compare) {
8841 opcode = aco_opcode::image_sample_c_cl;
8842 if (has_derivs)
8843 opcode = aco_opcode::image_sample_c_d_cl;
8844 if (has_bias)
8845 opcode = aco_opcode::image_sample_c_b_cl;
8846 } else {
8847 opcode = aco_opcode::image_sample_cl;
8848 if (has_derivs)
8849 opcode = aco_opcode::image_sample_d_cl;
8850 if (has_bias)
8851 opcode = aco_opcode::image_sample_b_cl;
8852 }
8853 } else { /* no offset */
8854 if (has_compare) {
8855 opcode = aco_opcode::image_sample_c;
8856 if (has_derivs)
8857 opcode = aco_opcode::image_sample_c_d;
8858 if (has_bias)
8859 opcode = aco_opcode::image_sample_c_b;
8860 if (level_zero)
8861 opcode = aco_opcode::image_sample_c_lz;
8862 if (has_lod)
8863 opcode = aco_opcode::image_sample_c_l;
8864 } else {
8865 opcode = aco_opcode::image_sample;
8866 if (has_derivs)
8867 opcode = aco_opcode::image_sample_d;
8868 if (has_bias)
8869 opcode = aco_opcode::image_sample_b;
8870 if (level_zero)
8871 opcode = aco_opcode::image_sample_lz;
8872 if (has_lod)
8873 opcode = aco_opcode::image_sample_l;
8874 }
8875 }
8876
8877 if (instr->op == nir_texop_tg4) {
8878 if (has_offset) { /* image_gather4_*_o */
8879 if (has_compare) {
8880 opcode = aco_opcode::image_gather4_c_lz_o;
8881 if (has_lod)
8882 opcode = aco_opcode::image_gather4_c_l_o;
8883 if (has_bias)
8884 opcode = aco_opcode::image_gather4_c_b_o;
8885 } else {
8886 opcode = aco_opcode::image_gather4_lz_o;
8887 if (has_lod)
8888 opcode = aco_opcode::image_gather4_l_o;
8889 if (has_bias)
8890 opcode = aco_opcode::image_gather4_b_o;
8891 }
8892 } else {
8893 if (has_compare) {
8894 opcode = aco_opcode::image_gather4_c_lz;
8895 if (has_lod)
8896 opcode = aco_opcode::image_gather4_c_l;
8897 if (has_bias)
8898 opcode = aco_opcode::image_gather4_c_b;
8899 } else {
8900 opcode = aco_opcode::image_gather4_lz;
8901 if (has_lod)
8902 opcode = aco_opcode::image_gather4_l;
8903 if (has_bias)
8904 opcode = aco_opcode::image_gather4_b;
8905 }
8906 }
8907 } else if (instr->op == nir_texop_lod) {
8908 opcode = aco_opcode::image_get_lod;
8909 }
8910
8911 /* we don't need the bias, sample index, compare value or offset to be
8912 * computed in WQM but if the p_create_vector copies the coordinates, then it
8913 * needs to be in WQM */
8914 if (ctx->stage == fragment_fs &&
8915 !has_derivs && !has_lod && !level_zero &&
8916 instr->sampler_dim != GLSL_SAMPLER_DIM_MS &&
8917 instr->sampler_dim != GLSL_SAMPLER_DIM_SUBPASS_MS)
8918 arg = emit_wqm(ctx, arg, bld.tmp(arg.regClass()), true);
8919
8920 tex.reset(create_instruction<MIMG_instruction>(opcode, Format::MIMG, 3, 1));
8921 tex->operands[0] = Operand(resource);
8922 tex->operands[1] = Operand(sampler);
8923 tex->operands[2] = Operand(arg);
8924 tex->dim = dim;
8925 tex->dmask = dmask;
8926 tex->da = da;
8927 tex->definitions[0] = Definition(tmp_dst);
8928 tex->can_reorder = true;
8929 ctx->block->instructions.emplace_back(std::move(tex));
8930
8931 if (tg4_integer_cube_workaround) {
8932 assert(tmp_dst.id() != dst.id());
8933 assert(tmp_dst.size() == dst.size() && dst.size() == 4);
8934
8935 emit_split_vector(ctx, tmp_dst, tmp_dst.size());
8936 Temp val[4];
8937 for (unsigned i = 0; i < dst.size(); i++) {
8938 val[i] = emit_extract_vector(ctx, tmp_dst, i, v1);
8939 Temp cvt_val;
8940 if (stype == GLSL_TYPE_UINT)
8941 cvt_val = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), val[i]);
8942 else
8943 cvt_val = bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), val[i]);
8944 val[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), val[i], cvt_val, tg4_compare_cube_wa64);
8945 }
8946 Temp tmp = dst.regClass() == v4 ? dst : bld.tmp(v4);
8947 tmp_dst = bld.pseudo(aco_opcode::p_create_vector, Definition(tmp),
8948 val[0], val[1], val[2], val[3]);
8949 }
8950 unsigned mask = instr->op == nir_texop_tg4 ? 0xF : dmask;
8951 expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, mask);
8952
8953 }
8954
8955
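/* Build a phi operand for an SSA def: undefs become undefined operands and,
 * for logical boolean phis, constants are inlined as full lane masks. */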
8956 Operand get_phi_operand(isel_context *ctx, nir_ssa_def *ssa, RegClass rc, bool logical)
8957 {
8958 Temp tmp = get_ssa_temp(ctx, ssa);
8959 if (ssa->parent_instr->type == nir_instr_type_ssa_undef) {
8960 return Operand(rc);
8961 } else if (logical && ssa->bit_size == 1 && ssa->parent_instr->type == nir_instr_type_load_const) {
8962 if (ctx->program->wave_size == 64)
8963 return Operand(nir_instr_as_load_const(ssa->parent_instr)->value[0].b ? UINT64_MAX : 0u);
8964 else
8965 return Operand(nir_instr_as_load_const(ssa->parent_instr)->value[0].b ? UINT32_MAX : 0u);
8966 } else {
8967 return Operand(tmp);
8968 }
8969 }
8970
8971 void visit_phi(isel_context *ctx, nir_phi_instr *instr)
8972 {
8973 aco_ptr<Pseudo_instruction> phi;
8974 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8975 assert(instr->dest.ssa.bit_size != 1 || dst.regClass() == ctx->program->lane_mask);
8976
8977 bool logical = !dst.is_linear() || nir_dest_is_divergent(instr->dest);
8978 logical |= ctx->block->kind & block_kind_merge;
8979 aco_opcode opcode = logical ? aco_opcode::p_phi : aco_opcode::p_linear_phi;
8980
8981 /* we want a sorted list of sources, since the predecessor list is also sorted */
8982 std::map<unsigned, nir_ssa_def*> phi_src;
8983 nir_foreach_phi_src(src, instr)
8984 phi_src[src->pred->index] = src->src.ssa;
8985
8986 std::vector<unsigned>& preds = logical ? ctx->block->logical_preds : ctx->block->linear_preds;
8987 unsigned num_operands = 0;
8988 Operand operands[std::max(exec_list_length(&instr->srcs), (unsigned)preds.size()) + 1];
8989 unsigned num_defined = 0;
8990 unsigned cur_pred_idx = 0;
8991 for (std::pair<unsigned, nir_ssa_def *> src : phi_src) {
8992 if (cur_pred_idx < preds.size()) {
8993 /* handle missing preds (IF merges with discard/break) and extra preds (loop exit with discard) */
8994 unsigned block = ctx->cf_info.nir_to_aco[src.first];
8995 unsigned skipped = 0;
8996 while (cur_pred_idx + skipped < preds.size() && preds[cur_pred_idx + skipped] != block)
8997 skipped++;
8998 if (cur_pred_idx + skipped < preds.size()) {
8999 for (unsigned i = 0; i < skipped; i++)
9000 operands[num_operands++] = Operand(dst.regClass());
9001 cur_pred_idx += skipped;
9002 } else {
9003 continue;
9004 }
9005 }
9006 /* Handle missing predecessors at the end. This shouldn't happen with loop
9007 * headers and we can't ignore these sources for loop header phis. */
9008 if (!(ctx->block->kind & block_kind_loop_header) && cur_pred_idx >= preds.size())
9009 continue;
9010 cur_pred_idx++;
9011 Operand op = get_phi_operand(ctx, src.second, dst.regClass(), logical);
9012 operands[num_operands++] = op;
9013 num_defined += !op.isUndefined();
9014 }
9015 /* handle block_kind_continue_or_break at loop exit blocks */
9016 while (cur_pred_idx++ < preds.size())
9017 operands[num_operands++] = Operand(dst.regClass());
9018
9019 /* If the loop ends with a break, still add a linear continue edge in case
9020 * that break is divergent or continue_or_break is used. We'll either remove
9021 * this operand later in visit_loop() if it's not necessary or replace the
9022 * undef with something correct. */
9023 if (!logical && ctx->block->kind & block_kind_loop_header) {
9024 nir_loop *loop = nir_cf_node_as_loop(instr->instr.block->cf_node.parent);
9025 nir_block *last = nir_loop_last_block(loop);
9026 if (last->successors[0] != instr->instr.block)
9027 operands[num_operands++] = Operand(RegClass());
9028 }
9029
9030 if (num_defined == 0) {
9031 Builder bld(ctx->program, ctx->block);
9032 if (dst.regClass() == s1) {
9033 bld.sop1(aco_opcode::s_mov_b32, Definition(dst), Operand(0u));
9034 } else if (dst.regClass() == v1) {
9035 bld.vop1(aco_opcode::v_mov_b32, Definition(dst), Operand(0u));
9036 } else {
9037 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
9038 for (unsigned i = 0; i < dst.size(); i++)
9039 vec->operands[i] = Operand(0u);
9040 vec->definitions[0] = Definition(dst);
9041 ctx->block->instructions.emplace_back(std::move(vec));
9042 }
9043 return;
9044 }
9045
9046 /* we can use a linear phi in some cases if one src is undef */
9047 if (dst.is_linear() && ctx->block->kind & block_kind_merge && num_defined == 1) {
9048 phi.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, num_operands, 1));
9049
9050 Block *linear_else = &ctx->program->blocks[ctx->block->linear_preds[1]];
9051 Block *invert = &ctx->program->blocks[linear_else->linear_preds[0]];
9052 assert(invert->kind & block_kind_invert);
9053
9054 unsigned then_block = invert->linear_preds[0];
9055
9056 Block* insert_block = NULL;
9057 for (unsigned i = 0; i < num_operands; i++) {
9058 Operand op = operands[i];
9059 if (op.isUndefined())
9060 continue;
9061 insert_block = ctx->block->logical_preds[i] == then_block ? invert : ctx->block;
9062 phi->operands[0] = op;
9063 break;
9064 }
9065 assert(insert_block); /* should be handled by the "num_defined == 0" case above */
9066 phi->operands[1] = Operand(dst.regClass());
9067 phi->definitions[0] = Definition(dst);
9068 insert_block->instructions.emplace(insert_block->instructions.begin(), std::move(phi));
9069 return;
9070 }
9071
9072 /* try to scalarize vector phis */
9073 if (instr->dest.ssa.bit_size != 1 && dst.size() > 1) {
9074 // TODO: scalarize linear phis on divergent ifs
9075 bool can_scalarize = (opcode == aco_opcode::p_phi || !(ctx->block->kind & block_kind_merge));
9076 std::array<Temp, NIR_MAX_VEC_COMPONENTS> new_vec;
9077 for (unsigned i = 0; can_scalarize && (i < num_operands); i++) {
9078 Operand src = operands[i];
9079 if (src.isTemp() && ctx->allocated_vec.find(src.tempId()) == ctx->allocated_vec.end())
9080 can_scalarize = false;
9081 }
9082 if (can_scalarize) {
9083 unsigned num_components = instr->dest.ssa.num_components;
9084 assert(dst.size() % num_components == 0);
9085 RegClass rc = RegClass(dst.type(), dst.size() / num_components);
9086
9087 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
9088 for (unsigned k = 0; k < num_components; k++) {
9089 phi.reset(create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, num_operands, 1));
9090 for (unsigned i = 0; i < num_operands; i++) {
9091 Operand src = operands[i];
9092 phi->operands[i] = src.isTemp() ? Operand(ctx->allocated_vec[src.tempId()][k]) : Operand(rc);
9093 }
9094 Temp phi_dst = {ctx->program->allocateId(), rc};
9095 phi->definitions[0] = Definition(phi_dst);
9096 ctx->block->instructions.emplace(ctx->block->instructions.begin(), std::move(phi));
9097 new_vec[k] = phi_dst;
9098 vec->operands[k] = Operand(phi_dst);
9099 }
9100 vec->definitions[0] = Definition(dst);
9101 ctx->block->instructions.emplace_back(std::move(vec));
9102 ctx->allocated_vec.emplace(dst.id(), new_vec);
9103 return;
9104 }
9105 }
9106
9107 phi.reset(create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, num_operands, 1));
9108 for (unsigned i = 0; i < num_operands; i++)
9109 phi->operands[i] = operands[i];
9110 phi->definitions[0] = Definition(dst);
9111 ctx->block->instructions.emplace(ctx->block->instructions.begin(), std::move(phi));
9112 }
9113
9114
9115 void visit_undef(isel_context *ctx, nir_ssa_undef_instr *instr)
9116 {
9117 Temp dst = get_ssa_temp(ctx, &instr->def);
9118
9119 assert(dst.type() == RegType::sgpr);
9120
9121 if (dst.size() == 1) {
9122 Builder(ctx->program, ctx->block).copy(Definition(dst), Operand(0u));
9123 } else {
9124 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
9125 for (unsigned i = 0; i < dst.size(); i++)
9126 vec->operands[i] = Operand(0u);
9127 vec->definitions[0] = Definition(dst);
9128 ctx->block->instructions.emplace_back(std::move(vec));
9129 }
9130 }
9131
9132 void visit_jump(isel_context *ctx, nir_jump_instr *instr)
9133 {
9134 Builder bld(ctx->program, ctx->block);
9135 Block *logical_target;
9136 append_logical_end(ctx->block);
9137 unsigned idx = ctx->block->index;
9138
9139 switch (instr->type) {
9140 case nir_jump_break:
9141 logical_target = ctx->cf_info.parent_loop.exit;
9142 add_logical_edge(idx, logical_target);
9143 ctx->block->kind |= block_kind_break;
9144
9145 if (!ctx->cf_info.parent_if.is_divergent &&
9146 !ctx->cf_info.parent_loop.has_divergent_continue) {
9147 /* uniform break - directly jump out of the loop */
9148 ctx->block->kind |= block_kind_uniform;
9149 ctx->cf_info.has_branch = true;
9150 bld.branch(aco_opcode::p_branch);
9151 add_linear_edge(idx, logical_target);
9152 return;
9153 }
9154 ctx->cf_info.parent_loop.has_divergent_branch = true;
9155 ctx->cf_info.nir_to_aco[instr->instr.block->index] = ctx->block->index;
9156 break;
9157 case nir_jump_continue:
9158 logical_target = &ctx->program->blocks[ctx->cf_info.parent_loop.header_idx];
9159 add_logical_edge(idx, logical_target);
9160 ctx->block->kind |= block_kind_continue;
9161
9162 if (ctx->cf_info.parent_if.is_divergent) {
9163 /* for potential uniform breaks after this continue,
9164 we must ensure that they are handled correctly */
9165 ctx->cf_info.parent_loop.has_divergent_continue = true;
9166 ctx->cf_info.parent_loop.has_divergent_branch = true;
9167 ctx->cf_info.nir_to_aco[instr->instr.block->index] = ctx->block->index;
9168 } else {
9169 /* uniform continue - directly jump to the loop header */
9170 ctx->block->kind |= block_kind_uniform;
9171 ctx->cf_info.has_branch = true;
9172 bld.branch(aco_opcode::p_branch);
9173 add_linear_edge(idx, logical_target);
9174 return;
9175 }
9176 break;
9177 default:
9178 fprintf(stderr, "Unknown NIR jump instr: ");
9179 nir_print_instr(&instr->instr, stderr);
9180 fprintf(stderr, "\n");
9181 abort();
9182 }
9183
9184 if (ctx->cf_info.parent_if.is_divergent && !ctx->cf_info.exec_potentially_empty_break) {
9185 ctx->cf_info.exec_potentially_empty_break = true;
9186 ctx->cf_info.exec_potentially_empty_break_depth = ctx->cf_info.loop_nest_depth;
9187 }
9188
9189 /* remove critical edges from linear CFG */
9190 bld.branch(aco_opcode::p_branch);
9191 Block* break_block = ctx->program->create_and_insert_block();
9192 break_block->loop_nest_depth = ctx->cf_info.loop_nest_depth;
9193 break_block->kind |= block_kind_uniform;
9194 add_linear_edge(idx, break_block);
9195 /* the loop_header pointer might be invalidated by this point */
9196 if (instr->type == nir_jump_continue)
9197 logical_target = &ctx->program->blocks[ctx->cf_info.parent_loop.header_idx];
9198 add_linear_edge(break_block->index, logical_target);
9199 bld.reset(break_block);
9200 bld.branch(aco_opcode::p_branch);
9201
9202 Block* continue_block = ctx->program->create_and_insert_block();
9203 continue_block->loop_nest_depth = ctx->cf_info.loop_nest_depth;
9204 add_linear_edge(idx, continue_block);
9205 append_logical_start(continue_block);
9206 ctx->block = continue_block;
9207 return;
9208 }
9209
9210 void visit_block(isel_context *ctx, nir_block *block)
9211 {
9212 nir_foreach_instr(instr, block) {
9213 switch (instr->type) {
9214 case nir_instr_type_alu:
9215 visit_alu_instr(ctx, nir_instr_as_alu(instr));
9216 break;
9217 case nir_instr_type_load_const:
9218 visit_load_const(ctx, nir_instr_as_load_const(instr));
9219 break;
9220 case nir_instr_type_intrinsic:
9221 visit_intrinsic(ctx, nir_instr_as_intrinsic(instr));
9222 break;
9223 case nir_instr_type_tex:
9224 visit_tex(ctx, nir_instr_as_tex(instr));
9225 break;
9226 case nir_instr_type_phi:
9227 visit_phi(ctx, nir_instr_as_phi(instr));
9228 break;
9229 case nir_instr_type_ssa_undef:
9230 visit_undef(ctx, nir_instr_as_ssa_undef(instr));
9231 break;
9232 case nir_instr_type_deref:
9233 break;
9234 case nir_instr_type_jump:
9235 visit_jump(ctx, nir_instr_as_jump(instr));
9236 break;
9237 default:
9238 fprintf(stderr, "Unknown NIR instr type: ");
9239 nir_print_instr(instr, stderr);
9240 fprintf(stderr, "\n");
9241 //abort();
9242 }
9243 }
9244
9245 if (!ctx->cf_info.parent_loop.has_divergent_branch)
9246 ctx->cf_info.nir_to_aco[block->index] = ctx->block->index;
9247 }
9248
9249
9250
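/* Propagate the value of a loop header phi through the linear CFG from the
 * header to the last block, inserting linear phis where predecessors disagree,
 * and return the operand to use for the header phi's continue edge. */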
9251 static Operand create_continue_phis(isel_context *ctx, unsigned first, unsigned last,
9252 aco_ptr<Instruction>& header_phi, Operand *vals)
9253 {
9254 vals[0] = Operand(header_phi->definitions[0].getTemp());
9255 RegClass rc = vals[0].regClass();
9256
9257 unsigned loop_nest_depth = ctx->program->blocks[first].loop_nest_depth;
9258
9259 unsigned next_pred = 1;
9260
9261 for (unsigned idx = first + 1; idx <= last; idx++) {
9262 Block& block = ctx->program->blocks[idx];
9263 if (block.loop_nest_depth != loop_nest_depth) {
9264 vals[idx - first] = vals[idx - 1 - first];
9265 continue;
9266 }
9267
9268 if (block.kind & block_kind_continue) {
9269 vals[idx - first] = header_phi->operands[next_pred];
9270 next_pred++;
9271 continue;
9272 }
9273
9274 bool all_same = true;
9275 for (unsigned i = 1; all_same && (i < block.linear_preds.size()); i++)
9276 all_same = vals[block.linear_preds[i] - first] == vals[block.linear_preds[0] - first];
9277
9278 Operand val;
9279 if (all_same) {
9280 val = vals[block.linear_preds[0] - first];
9281 } else {
9282 aco_ptr<Instruction> phi(create_instruction<Pseudo_instruction>(
9283 aco_opcode::p_linear_phi, Format::PSEUDO, block.linear_preds.size(), 1));
9284 for (unsigned i = 0; i < block.linear_preds.size(); i++)
9285 phi->operands[i] = vals[block.linear_preds[i] - first];
9286 val = Operand(Temp(ctx->program->allocateId(), rc));
9287 phi->definitions[0] = Definition(val.getTemp());
9288 block.instructions.emplace(block.instructions.begin(), std::move(phi));
9289 }
9290 vals[idx - first] = val;
9291 }
9292
9293 return vals[last - first];
9294 }
9295
9296 static void visit_loop(isel_context *ctx, nir_loop *loop)
9297 {
9298 //TODO: we might want to wrap the loop around a branch if exec_potentially_empty=true
9299 append_logical_end(ctx->block);
9300 ctx->block->kind |= block_kind_loop_preheader | block_kind_uniform;
9301 Builder bld(ctx->program, ctx->block);
9302 bld.branch(aco_opcode::p_branch);
9303 unsigned loop_preheader_idx = ctx->block->index;
9304
9305 Block loop_exit = Block();
9306 loop_exit.loop_nest_depth = ctx->cf_info.loop_nest_depth;
9307 loop_exit.kind |= (block_kind_loop_exit | (ctx->block->kind & block_kind_top_level));
9308
9309 Block* loop_header = ctx->program->create_and_insert_block();
9310 loop_header->loop_nest_depth = ctx->cf_info.loop_nest_depth + 1;
9311 loop_header->kind |= block_kind_loop_header;
9312 add_edge(loop_preheader_idx, loop_header);
9313 ctx->block = loop_header;
9314
9315 /* emit loop body */
9316 unsigned loop_header_idx = loop_header->index;
9317 loop_info_RAII loop_raii(ctx, loop_header_idx, &loop_exit);
9318 append_logical_start(ctx->block);
9319 bool unreachable = visit_cf_list(ctx, &loop->body);
9320
9321     //TODO: what if a loop ends with an unconditional or uniformly branched continue and this branch is never taken?
9322 if (!ctx->cf_info.has_branch) {
9323 append_logical_end(ctx->block);
9324 if (ctx->cf_info.exec_potentially_empty_discard || ctx->cf_info.exec_potentially_empty_break) {
9325 /* Discards can result in code running with an empty exec mask.
9326 * This would result in divergent breaks not ever being taken. As a
9327 * workaround, break the loop when the loop mask is empty instead of
9328 * always continuing. */
9329 ctx->block->kind |= (block_kind_continue_or_break | block_kind_uniform);
9330 unsigned block_idx = ctx->block->index;
9331
9332 /* create helper blocks to avoid critical edges */
9333 Block *break_block = ctx->program->create_and_insert_block();
9334 break_block->loop_nest_depth = ctx->cf_info.loop_nest_depth;
9335 break_block->kind = block_kind_uniform;
9336 bld.reset(break_block);
9337 bld.branch(aco_opcode::p_branch);
9338 add_linear_edge(block_idx, break_block);
9339 add_linear_edge(break_block->index, &loop_exit);
9340
9341 Block *continue_block = ctx->program->create_and_insert_block();
9342 continue_block->loop_nest_depth = ctx->cf_info.loop_nest_depth;
9343 continue_block->kind = block_kind_uniform;
9344 bld.reset(continue_block);
9345 bld.branch(aco_opcode::p_branch);
9346 add_linear_edge(block_idx, continue_block);
9347 add_linear_edge(continue_block->index, &ctx->program->blocks[loop_header_idx]);
9348
9349 if (!ctx->cf_info.parent_loop.has_divergent_branch)
9350 add_logical_edge(block_idx, &ctx->program->blocks[loop_header_idx]);
9351 ctx->block = &ctx->program->blocks[block_idx];
9352 } else {
9353 ctx->block->kind |= (block_kind_continue | block_kind_uniform);
9354 if (!ctx->cf_info.parent_loop.has_divergent_branch)
9355 add_edge(ctx->block->index, &ctx->program->blocks[loop_header_idx]);
9356 else
9357 add_linear_edge(ctx->block->index, &ctx->program->blocks[loop_header_idx]);
9358 }
9359
9360 bld.reset(ctx->block);
9361 bld.branch(aco_opcode::p_branch);
9362 }
9363
9364 /* Fixup phis in loop header from unreachable blocks.
9365 * has_branch/has_divergent_branch also indicates if the loop ends with a
9366 * break/continue instruction, but we don't emit those if unreachable=true */
9367 if (unreachable) {
9368 assert(ctx->cf_info.has_branch || ctx->cf_info.parent_loop.has_divergent_branch);
9369 bool linear = ctx->cf_info.has_branch;
9370 bool logical = ctx->cf_info.has_branch || ctx->cf_info.parent_loop.has_divergent_branch;
9371 for (aco_ptr<Instruction>& instr : ctx->program->blocks[loop_header_idx].instructions) {
9372 if ((logical && instr->opcode == aco_opcode::p_phi) ||
9373 (linear && instr->opcode == aco_opcode::p_linear_phi)) {
9374 /* the last operand should be the one that needs to be removed */
9375 instr->operands.pop_back();
9376 } else if (!is_phi(instr)) {
9377 break;
9378 }
9379 }
9380 }
9381
9382     /* Fixup linear phis in the loop header that expect a continue. This fixup and
9383      * the previous one shouldn't both happen at once because a break in the
9384      * merge block would get CSE'd */
9385 if (nir_loop_last_block(loop)->successors[0] != nir_loop_first_block(loop)) {
9386 unsigned num_vals = ctx->cf_info.has_branch ? 1 : (ctx->block->index - loop_header_idx + 1);
9387 Operand vals[num_vals];
9388 for (aco_ptr<Instruction>& instr : ctx->program->blocks[loop_header_idx].instructions) {
9389 if (instr->opcode == aco_opcode::p_linear_phi) {
9390 if (ctx->cf_info.has_branch)
9391 instr->operands.pop_back();
9392 else
9393 instr->operands.back() = create_continue_phis(ctx, loop_header_idx, ctx->block->index, instr, vals);
9394 } else if (!is_phi(instr)) {
9395 break;
9396 }
9397 }
9398 }
9399
9400 ctx->cf_info.has_branch = false;
9401
9402     // TODO: if the loop does not have a single exit, we must add one
9403 /* emit loop successor block */
9404 ctx->block = ctx->program->insert_block(std::move(loop_exit));
9405 append_logical_start(ctx->block);
9406
9407 #if 0
9408 // TODO: check if it is beneficial to not branch on continues
9409 /* trim linear phis in loop header */
9410 for (auto&& instr : loop_entry->instructions) {
9411 if (instr->opcode == aco_opcode::p_linear_phi) {
9412 aco_ptr<Pseudo_instruction> new_phi{create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, loop_entry->linear_predecessors.size(), 1)};
9413 new_phi->definitions[0] = instr->definitions[0];
9414 for (unsigned i = 0; i < new_phi->operands.size(); i++)
9415 new_phi->operands[i] = instr->operands[i];
9416 /* check that the remaining operands are all the same */
9417 for (unsigned i = new_phi->operands.size(); i < instr->operands.size(); i++)
9418 assert(instr->operands[i].tempId() == instr->operands.back().tempId());
9419 instr.swap(new_phi);
9420 } else if (instr->opcode == aco_opcode::p_phi) {
9421 continue;
9422 } else {
9423 break;
9424 }
9425 }
9426 #endif
9427 }
9428
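/* Divergent if: emit the branch block (p_cbranch_z on the condition), set up
 * the invert and endif merge blocks in the if_context and start the logical
 * then block. */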
9429 static void begin_divergent_if_then(isel_context *ctx, if_context *ic, Temp cond)
9430 {
9431 ic->cond = cond;
9432
9433 append_logical_end(ctx->block);
9434 ctx->block->kind |= block_kind_branch;
9435
9436 /* branch to linear then block */
9437 assert(cond.regClass() == ctx->program->lane_mask);
9438 aco_ptr<Pseudo_branch_instruction> branch;
9439 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_cbranch_z, Format::PSEUDO_BRANCH, 1, 0));
9440 branch->operands[0] = Operand(cond);
9441 ctx->block->instructions.push_back(std::move(branch));
9442
9443 ic->BB_if_idx = ctx->block->index;
9444 ic->BB_invert = Block();
9445 ic->BB_invert.loop_nest_depth = ctx->cf_info.loop_nest_depth;
9446 /* Invert blocks are intentionally not marked as top level because they
9447 * are not part of the logical cfg. */
9448 ic->BB_invert.kind |= block_kind_invert;
9449 ic->BB_endif = Block();
9450 ic->BB_endif.loop_nest_depth = ctx->cf_info.loop_nest_depth;
9451 ic->BB_endif.kind |= (block_kind_merge | (ctx->block->kind & block_kind_top_level));
9452
9453 ic->exec_potentially_empty_discard_old = ctx->cf_info.exec_potentially_empty_discard;
9454 ic->exec_potentially_empty_break_old = ctx->cf_info.exec_potentially_empty_break;
9455 ic->exec_potentially_empty_break_depth_old = ctx->cf_info.exec_potentially_empty_break_depth;
9456 ic->divergent_old = ctx->cf_info.parent_if.is_divergent;
9457 ctx->cf_info.parent_if.is_divergent = true;
9458
9459 /* divergent branches use cbranch_execz */
9460 ctx->cf_info.exec_potentially_empty_discard = false;
9461 ctx->cf_info.exec_potentially_empty_break = false;
9462 ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX;
9463
9464 /** emit logical then block */
9465 Block* BB_then_logical = ctx->program->create_and_insert_block();
9466 BB_then_logical->loop_nest_depth = ctx->cf_info.loop_nest_depth;
9467 add_edge(ic->BB_if_idx, BB_then_logical);
9468 ctx->block = BB_then_logical;
9469 append_logical_start(BB_then_logical);
9470 }
9471
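/* Close the logical then side, emit the linear then block and the invert block
 * (whose p_cbranch_nz skips the logical else side), then start the logical
 * else block. */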
9472 static void begin_divergent_if_else(isel_context *ctx, if_context *ic)
9473 {
9474 Block *BB_then_logical = ctx->block;
9475 append_logical_end(BB_then_logical);
9476 /* branch from logical then block to invert block */
9477 aco_ptr<Pseudo_branch_instruction> branch;
9478 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
9479 BB_then_logical->instructions.emplace_back(std::move(branch));
9480 add_linear_edge(BB_then_logical->index, &ic->BB_invert);
9481 if (!ctx->cf_info.parent_loop.has_divergent_branch)
9482 add_logical_edge(BB_then_logical->index, &ic->BB_endif);
9483 BB_then_logical->kind |= block_kind_uniform;
9484 assert(!ctx->cf_info.has_branch);
9485 ic->then_branch_divergent = ctx->cf_info.parent_loop.has_divergent_branch;
9486 ctx->cf_info.parent_loop.has_divergent_branch = false;
9487
9488 /** emit linear then block */
9489 Block* BB_then_linear = ctx->program->create_and_insert_block();
9490 BB_then_linear->loop_nest_depth = ctx->cf_info.loop_nest_depth;
9491 BB_then_linear->kind |= block_kind_uniform;
9492 add_linear_edge(ic->BB_if_idx, BB_then_linear);
9493 /* branch from linear then block to invert block */
9494 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
9495 BB_then_linear->instructions.emplace_back(std::move(branch));
9496 add_linear_edge(BB_then_linear->index, &ic->BB_invert);
9497
9498 /** emit invert merge block */
9499 ctx->block = ctx->program->insert_block(std::move(ic->BB_invert));
9500 ic->invert_idx = ctx->block->index;
9501
9502 /* branch to linear else block (skip else) */
9503 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_cbranch_nz, Format::PSEUDO_BRANCH, 1, 0));
9504 branch->operands[0] = Operand(ic->cond);
9505 ctx->block->instructions.push_back(std::move(branch));
9506
9507 ic->exec_potentially_empty_discard_old |= ctx->cf_info.exec_potentially_empty_discard;
9508 ic->exec_potentially_empty_break_old |= ctx->cf_info.exec_potentially_empty_break;
9509 ic->exec_potentially_empty_break_depth_old =
9510 std::min(ic->exec_potentially_empty_break_depth_old, ctx->cf_info.exec_potentially_empty_break_depth);
9511 /* divergent branches use cbranch_execz */
9512 ctx->cf_info.exec_potentially_empty_discard = false;
9513 ctx->cf_info.exec_potentially_empty_break = false;
9514 ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX;
9515
9516 /** emit logical else block */
9517 Block* BB_else_logical = ctx->program->create_and_insert_block();
9518 BB_else_logical->loop_nest_depth = ctx->cf_info.loop_nest_depth;
9519 add_logical_edge(ic->BB_if_idx, BB_else_logical);
9520 add_linear_edge(ic->invert_idx, BB_else_logical);
9521 ctx->block = BB_else_logical;
9522 append_logical_start(BB_else_logical);
9523 }
9524
9525 static void end_divergent_if(isel_context *ctx, if_context *ic)
9526 {
9527 Block *BB_else_logical = ctx->block;
9528 append_logical_end(BB_else_logical);
9529
9530 /* branch from logical else block to endif block */
9531 aco_ptr<Pseudo_branch_instruction> branch;
9532 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
9533 BB_else_logical->instructions.emplace_back(std::move(branch));
9534 add_linear_edge(BB_else_logical->index, &ic->BB_endif);
9535 if (!ctx->cf_info.parent_loop.has_divergent_branch)
9536 add_logical_edge(BB_else_logical->index, &ic->BB_endif);
9537 BB_else_logical->kind |= block_kind_uniform;
9538
9539 assert(!ctx->cf_info.has_branch);
9540 ctx->cf_info.parent_loop.has_divergent_branch &= ic->then_branch_divergent;
9541
9542
9543 /** emit linear else block */
9544 Block* BB_else_linear = ctx->program->create_and_insert_block();
9545 BB_else_linear->loop_nest_depth = ctx->cf_info.loop_nest_depth;
9546 BB_else_linear->kind |= block_kind_uniform;
9547 add_linear_edge(ic->invert_idx, BB_else_linear);
9548
9549 /* branch from linear else block to endif block */
9550 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
9551 BB_else_linear->instructions.emplace_back(std::move(branch));
9552 add_linear_edge(BB_else_linear->index, &ic->BB_endif);
9553
9554
9555 /** emit endif merge block */
9556 ctx->block = ctx->program->insert_block(std::move(ic->BB_endif));
9557 append_logical_start(ctx->block);
9558
9559
9560 ctx->cf_info.parent_if.is_divergent = ic->divergent_old;
9561 ctx->cf_info.exec_potentially_empty_discard |= ic->exec_potentially_empty_discard_old;
9562 ctx->cf_info.exec_potentially_empty_break |= ic->exec_potentially_empty_break_old;
9563 ctx->cf_info.exec_potentially_empty_break_depth =
9564 std::min(ic->exec_potentially_empty_break_depth_old, ctx->cf_info.exec_potentially_empty_break_depth);
9565 if (ctx->cf_info.loop_nest_depth == ctx->cf_info.exec_potentially_empty_break_depth &&
9566 !ctx->cf_info.parent_if.is_divergent) {
9567 ctx->cf_info.exec_potentially_empty_break = false;
9568 ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX;
9569 }
9570 /* uniform control flow never has an empty exec-mask */
9571 if (!ctx->cf_info.loop_nest_depth && !ctx->cf_info.parent_if.is_divergent) {
9572 ctx->cf_info.exec_potentially_empty_discard = false;
9573 ctx->cf_info.exec_potentially_empty_break = false;
9574 ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX;
9575 }
9576 }
9577
9578 static void begin_uniform_if_then(isel_context *ctx, if_context *ic, Temp cond)
9579 {
9580 assert(cond.regClass() == s1);
9581
9582 append_logical_end(ctx->block);
9583 ctx->block->kind |= block_kind_uniform;
9584
9585 aco_ptr<Pseudo_branch_instruction> branch;
9586 aco_opcode branch_opcode = aco_opcode::p_cbranch_z;
9587 branch.reset(create_instruction<Pseudo_branch_instruction>(branch_opcode, Format::PSEUDO_BRANCH, 1, 0));
9588 branch->operands[0] = Operand(cond);
9589 branch->operands[0].setFixed(scc);
9590 ctx->block->instructions.emplace_back(std::move(branch));
9591
9592 ic->BB_if_idx = ctx->block->index;
9593 ic->BB_endif = Block();
9594 ic->BB_endif.loop_nest_depth = ctx->cf_info.loop_nest_depth;
9595 ic->BB_endif.kind |= ctx->block->kind & block_kind_top_level;
9596
9597 ctx->cf_info.has_branch = false;
9598 ctx->cf_info.parent_loop.has_divergent_branch = false;
9599
9600 /** emit then block */
9601 Block* BB_then = ctx->program->create_and_insert_block();
9602 BB_then->loop_nest_depth = ctx->cf_info.loop_nest_depth;
9603 add_edge(ic->BB_if_idx, BB_then);
9604 append_logical_start(BB_then);
9605 ctx->block = BB_then;
9606 }
9607
9608 static void begin_uniform_if_else(isel_context *ctx, if_context *ic)
9609 {
9610 Block *BB_then = ctx->block;
9611
9612 ic->uniform_has_then_branch = ctx->cf_info.has_branch;
9613 ic->then_branch_divergent = ctx->cf_info.parent_loop.has_divergent_branch;
9614
9615 if (!ic->uniform_has_then_branch) {
9616 append_logical_end(BB_then);
9617 /* branch from then block to endif block */
9618 aco_ptr<Pseudo_branch_instruction> branch;
9619 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
9620 BB_then->instructions.emplace_back(std::move(branch));
9621 add_linear_edge(BB_then->index, &ic->BB_endif);
9622 if (!ic->then_branch_divergent)
9623 add_logical_edge(BB_then->index, &ic->BB_endif);
9624 BB_then->kind |= block_kind_uniform;
9625 }
9626
9627 ctx->cf_info.has_branch = false;
9628 ctx->cf_info.parent_loop.has_divergent_branch = false;
9629
9630 /** emit else block */
9631 Block* BB_else = ctx->program->create_and_insert_block();
9632 BB_else->loop_nest_depth = ctx->cf_info.loop_nest_depth;
9633 add_edge(ic->BB_if_idx, BB_else);
9634 append_logical_start(BB_else);
9635 ctx->block = BB_else;
9636 }
9637
9638 static void end_uniform_if(isel_context *ctx, if_context *ic)
9639 {
9640 Block *BB_else = ctx->block;
9641
9642 if (!ctx->cf_info.has_branch) {
9643 append_logical_end(BB_else);
9644 /* branch from else block to endif block */
9645 aco_ptr<Pseudo_branch_instruction> branch;
9646 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
9647 BB_else->instructions.emplace_back(std::move(branch));
9648 add_linear_edge(BB_else->index, &ic->BB_endif);
9649 if (!ctx->cf_info.parent_loop.has_divergent_branch)
9650 add_logical_edge(BB_else->index, &ic->BB_endif);
9651 BB_else->kind |= block_kind_uniform;
9652 }
9653
9654 ctx->cf_info.has_branch &= ic->uniform_has_then_branch;
9655 ctx->cf_info.parent_loop.has_divergent_branch &= ic->then_branch_divergent;
9656
9657 /** emit endif merge block */
9658 if (!ctx->cf_info.has_branch) {
9659 ctx->block = ctx->program->insert_block(std::move(ic->BB_endif));
9660 append_logical_start(ctx->block);
9661 }
9662 }
9663
9664 static bool visit_if(isel_context *ctx, nir_if *if_stmt)
9665 {
9666 Temp cond = get_ssa_temp(ctx, if_stmt->condition.ssa);
9667 Builder bld(ctx->program, ctx->block);
9668 aco_ptr<Pseudo_branch_instruction> branch;
9669 if_context ic;
9670
9671 if (!nir_src_is_divergent(if_stmt->condition)) { /* uniform condition */
9672 /**
9673 * Uniform conditionals are represented in the following way*) :
9674 *
9675 * The linear and logical CFG:
9676 * BB_IF
9677 * / \
9678 * BB_THEN (logical) BB_ELSE (logical)
9679 * \ /
9680 * BB_ENDIF
9681 *
9682 * *) Exceptions may be due to break and continue statements within loops.
9683 * If a break/continue happens within uniform control flow, it branches
9684 * to the loop exit/entry block. Otherwise, it branches to the next
9685 * merge block.
9686 **/
9687
9688 // TODO: in a post-RA optimizer, we could check if the condition is in VCC and omit this instruction
9689 assert(cond.regClass() == ctx->program->lane_mask);
9690 cond = bool_to_scalar_condition(ctx, cond);
9691
9692 begin_uniform_if_then(ctx, &ic, cond);
9693 visit_cf_list(ctx, &if_stmt->then_list);
9694
9695 begin_uniform_if_else(ctx, &ic);
9696 visit_cf_list(ctx, &if_stmt->else_list);
9697
9698 end_uniform_if(ctx, &ic);
9699 } else { /* non-uniform condition */
9700 /**
9701 * To maintain a logical and linear CFG without critical edges,
9702 * non-uniform conditionals are represented in the following way*) :
9703 *
9704 * The linear CFG:
9705 * BB_IF
9706 * / \
9707 * BB_THEN (logical) BB_THEN (linear)
9708 * \ /
9709 * BB_INVERT (linear)
9710 * / \
9711 * BB_ELSE (logical) BB_ELSE (linear)
9712 * \ /
9713 * BB_ENDIF
9714 *
9715 * The logical CFG:
9716 * BB_IF
9717 * / \
9718 * BB_THEN (logical) BB_ELSE (logical)
9719 * \ /
9720 * BB_ENDIF
9721 *
9722 * *) Exceptions may be due to break and continue statements within loops
9723 **/
9724
9725 begin_divergent_if_then(ctx, &ic, cond);
9726 visit_cf_list(ctx, &if_stmt->then_list);
9727
9728 begin_divergent_if_else(ctx, &ic);
9729 visit_cf_list(ctx, &if_stmt->else_list);
9730
9731 end_divergent_if(ctx, &ic);
9732 }
9733
9734 return !ctx->cf_info.has_branch && !ctx->block->logical_preds.empty();
9735 }
9736
9737 static bool visit_cf_list(isel_context *ctx,
9738 struct exec_list *list)
9739 {
9740 foreach_list_typed(nir_cf_node, node, node, list) {
9741 switch (node->type) {
9742 case nir_cf_node_block:
9743 visit_block(ctx, nir_cf_node_as_block(node));
9744 break;
9745 case nir_cf_node_if:
9746 if (!visit_if(ctx, nir_cf_node_as_if(node)))
9747 return true;
9748 break;
9749 case nir_cf_node_loop:
9750 visit_loop(ctx, nir_cf_node_as_loop(node));
9751 break;
9752 default:
9753 unreachable("unimplemented cf list type");
9754 }
9755 }
9756 return false;
9757 }
9758
9759 static void create_null_export(isel_context *ctx)
9760 {
9761 /* Some shader stages always need to have exports.
9762 * So when there is none, we need to add a null export.
9763 */
9764
9765 unsigned dest = (ctx->program->stage & hw_fs) ? 9 /* NULL */ : V_008DFC_SQ_EXP_POS;
9766 bool vm = (ctx->program->stage & hw_fs) || ctx->program->chip_class >= GFX10;
9767 Builder bld(ctx->program, ctx->block);
9768 bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1),
9769 /* enabled_mask */ 0, dest, /* compr */ false, /* done */ true, vm);
9770 }
9771
9772 static bool export_vs_varying(isel_context *ctx, int slot, bool is_pos, int *next_pos)
9773 {
9774 assert(ctx->stage == vertex_vs ||
9775 ctx->stage == tess_eval_vs ||
9776 ctx->stage == gs_copy_vs ||
9777 ctx->stage == ngg_vertex_gs ||
9778 ctx->stage == ngg_tess_eval_gs);
9779
9780 int offset = (ctx->stage & sw_tes)
9781 ? ctx->program->info->tes.outinfo.vs_output_param_offset[slot]
9782 : ctx->program->info->vs.outinfo.vs_output_param_offset[slot];
9783 uint64_t mask = ctx->outputs.mask[slot];
9784 if (!is_pos && !mask)
9785 return false;
9786 if (!is_pos && offset == AC_EXP_PARAM_UNDEFINED)
9787 return false;
9788 aco_ptr<Export_instruction> exp{create_instruction<Export_instruction>(aco_opcode::exp, Format::EXP, 4, 0)};
9789 exp->enabled_mask = mask;
9790 for (unsigned i = 0; i < 4; ++i) {
9791 if (mask & (1 << i))
9792 exp->operands[i] = Operand(ctx->outputs.temps[slot * 4u + i]);
9793 else
9794 exp->operands[i] = Operand(v1);
9795 }
9796 /* Navi10-14 skip POS0 exports if EXEC=0 and DONE=0, causing a hang.
9797 * Setting valid_mask=1 prevents it and has no other effect.
9798 */
9799 exp->valid_mask = ctx->options->chip_class >= GFX10 && is_pos && *next_pos == 0;
9800 exp->done = false;
9801 exp->compressed = false;
9802 if (is_pos)
9803 exp->dest = V_008DFC_SQ_EXP_POS + (*next_pos)++;
9804 else
9805 exp->dest = V_008DFC_SQ_EXP_PARAM + offset;
9806 ctx->block->instructions.emplace_back(std::move(exp));
9807
9808 return true;
9809 }
9810
9811 static void export_vs_psiz_layer_viewport(isel_context *ctx, int *next_pos)
9812 {
9813 aco_ptr<Export_instruction> exp{create_instruction<Export_instruction>(aco_opcode::exp, Format::EXP, 4, 0)};
9814 exp->enabled_mask = 0;
9815 for (unsigned i = 0; i < 4; ++i)
9816 exp->operands[i] = Operand(v1);
9817 if (ctx->outputs.mask[VARYING_SLOT_PSIZ]) {
9818 exp->operands[0] = Operand(ctx->outputs.temps[VARYING_SLOT_PSIZ * 4u]);
9819 exp->enabled_mask |= 0x1;
9820 }
9821 if (ctx->outputs.mask[VARYING_SLOT_LAYER]) {
9822 exp->operands[2] = Operand(ctx->outputs.temps[VARYING_SLOT_LAYER * 4u]);
9823 exp->enabled_mask |= 0x4;
9824 }
9825 if (ctx->outputs.mask[VARYING_SLOT_VIEWPORT]) {
9826 if (ctx->options->chip_class < GFX9) {
9827 exp->operands[3] = Operand(ctx->outputs.temps[VARYING_SLOT_VIEWPORT * 4u]);
9828 exp->enabled_mask |= 0x8;
9829 } else {
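/* GFX9+ expects the viewport index packed into the high 16 bits of the layer component. */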
9830 Builder bld(ctx->program, ctx->block);
9831
9832 Temp out = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(16u),
9833 Operand(ctx->outputs.temps[VARYING_SLOT_VIEWPORT * 4u]));
9834 if (exp->operands[2].isTemp())
9835 out = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand(out), exp->operands[2]);
9836
9837 exp->operands[2] = Operand(out);
9838 exp->enabled_mask |= 0x4;
9839 }
9840 }
9841 exp->valid_mask = ctx->options->chip_class >= GFX10 && *next_pos == 0;
9842 exp->done = false;
9843 exp->compressed = false;
9844 exp->dest = V_008DFC_SQ_EXP_POS + (*next_pos)++;
9845 ctx->block->instructions.emplace_back(std::move(exp));
9846 }
9847
9848 static void create_export_phis(isel_context *ctx)
9849 {
9850 /* Used when exports are needed, but the output temps are defined in a preceding block.
9851 * This function will set up phis in order to access the outputs in the next block.
9852 */
9853
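/* Temporarily remove the p_logical_start so the phis created below end up before it; it is re-appended at the end. */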
9854 assert(ctx->block->instructions.back()->opcode == aco_opcode::p_logical_start);
9855 aco_ptr<Instruction> logical_start = aco_ptr<Instruction>(ctx->block->instructions.back().release());
9856 ctx->block->instructions.pop_back();
9857
9858 Builder bld(ctx->program, ctx->block);
9859
9860 for (unsigned slot = 0; slot <= VARYING_SLOT_VAR31; ++slot) {
9861 uint64_t mask = ctx->outputs.mask[slot];
9862 for (unsigned i = 0; i < 4; ++i) {
9863 if (!(mask & (1 << i)))
9864 continue;
9865
9866 Temp old = ctx->outputs.temps[slot * 4 + i];
9867 Temp phi = bld.pseudo(aco_opcode::p_phi, bld.def(v1), old, Operand(v1));
9868 ctx->outputs.temps[slot * 4 + i] = phi;
9869 }
9870 }
9871
9872 bld.insert(std::move(logical_start));
9873 }
9874
9875 static void create_vs_exports(isel_context *ctx)
9876 {
9877 assert(ctx->stage == vertex_vs ||
9878 ctx->stage == tess_eval_vs ||
9879 ctx->stage == gs_copy_vs ||
9880 ctx->stage == ngg_vertex_gs ||
9881 ctx->stage == ngg_tess_eval_gs);
9882
9883 radv_vs_output_info *outinfo = (ctx->stage & sw_tes)
9884 ? &ctx->program->info->tes.outinfo
9885 : &ctx->program->info->vs.outinfo;
9886
9887 if (outinfo->export_prim_id && !(ctx->stage & hw_ngg_gs)) {
9888 ctx->outputs.mask[VARYING_SLOT_PRIMITIVE_ID] |= 0x1;
9889 ctx->outputs.temps[VARYING_SLOT_PRIMITIVE_ID * 4u] = get_arg(ctx, ctx->args->vs_prim_id);
9890 }
9891
9892 if (ctx->options->key.has_multiview_view_index) {
9893 ctx->outputs.mask[VARYING_SLOT_LAYER] |= 0x1;
9894 ctx->outputs.temps[VARYING_SLOT_LAYER * 4u] = as_vgpr(ctx, get_arg(ctx, ctx->args->ac.view_index));
9895 }
9896
9897 /* the order these position exports are created is important */
9898 int next_pos = 0;
9899 bool exported_pos = export_vs_varying(ctx, VARYING_SLOT_POS, true, &next_pos);
9900 if (outinfo->writes_pointsize || outinfo->writes_layer || outinfo->writes_viewport_index) {
9901 export_vs_psiz_layer_viewport(ctx, &next_pos);
9902 exported_pos = true;
9903 }
9904 if (ctx->num_clip_distances + ctx->num_cull_distances > 0)
9905 exported_pos |= export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST0, true, &next_pos);
9906 if (ctx->num_clip_distances + ctx->num_cull_distances > 4)
9907 exported_pos |= export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST1, true, &next_pos);
9908
9909 if (ctx->export_clip_dists) {
9910 if (ctx->num_clip_distances + ctx->num_cull_distances > 0)
9911 export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST0, false, &next_pos);
9912 if (ctx->num_clip_distances + ctx->num_cull_distances > 4)
9913 export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST1, false, &next_pos);
9914 }
9915
9916 for (unsigned i = 0; i <= VARYING_SLOT_VAR31; ++i) {
9917 if (i < VARYING_SLOT_VAR0 &&
9918 i != VARYING_SLOT_LAYER &&
9919 i != VARYING_SLOT_PRIMITIVE_ID &&
9920 i != VARYING_SLOT_VIEWPORT)
9921 continue;
9922
9923 export_vs_varying(ctx, i, false, NULL);
9924 }
9925
9926 if (!exported_pos)
9927 create_null_export(ctx);
9928 }
9929
9930 static bool export_fs_mrt_z(isel_context *ctx)
9931 {
9932 Builder bld(ctx->program, ctx->block);
9933 unsigned enabled_channels = 0;
9934 bool compr = false;
9935 Operand values[4];
9936
9937 for (unsigned i = 0; i < 4; ++i) {
9938 values[i] = Operand(v1);
9939 }
9940
9941 /* Both stencil and sample mask only need 16 bits. */
9942 if (!ctx->program->info->ps.writes_z &&
9943 (ctx->program->info->ps.writes_stencil ||
9944 ctx->program->info->ps.writes_sample_mask)) {
9945 compr = true; /* COMPR flag */
9946
9947 if (ctx->program->info->ps.writes_stencil) {
9948 /* Stencil should be in X[23:16]. */
9949 values[0] = Operand(ctx->outputs.temps[FRAG_RESULT_STENCIL * 4u]);
9950 values[0] = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(16u), values[0]);
9951 enabled_channels |= 0x3;
9952 }
9953
9954 if (ctx->program->info->ps.writes_sample_mask) {
9955 /* SampleMask should be in Y[15:0]. */
9956 values[1] = Operand(ctx->outputs.temps[FRAG_RESULT_SAMPLE_MASK * 4u]);
9957 enabled_channels |= 0xc;
9958 }
9959 } else {
9960 if (ctx->program->info->ps.writes_z) {
9961 values[0] = Operand(ctx->outputs.temps[FRAG_RESULT_DEPTH * 4u]);
9962 enabled_channels |= 0x1;
9963 }
9964
9965 if (ctx->program->info->ps.writes_stencil) {
9966 values[1] = Operand(ctx->outputs.temps[FRAG_RESULT_STENCIL * 4u]);
9967 enabled_channels |= 0x2;
9968 }
9969
9970 if (ctx->program->info->ps.writes_sample_mask) {
9971 values[2] = Operand(ctx->outputs.temps[FRAG_RESULT_SAMPLE_MASK * 4u]);
9972 enabled_channels |= 0x4;
9973 }
9974 }
9975
9976 /* GFX6 (except OLAND and HAINAN) has a bug that it only looks at the X
9977 * writemask component.
9978 */
9979 if (ctx->options->chip_class == GFX6 &&
9980 ctx->options->family != CHIP_OLAND &&
9981 ctx->options->family != CHIP_HAINAN) {
9982 enabled_channels |= 0x1;
9983 }
9984
9985 bld.exp(aco_opcode::exp, values[0], values[1], values[2], values[3],
9986 enabled_channels, V_008DFC_SQ_EXP_MRTZ, compr);
9987
9988 return true;
9989 }
9990
9991 static bool export_fs_mrt_color(isel_context *ctx, int slot)
9992 {
9993 Builder bld(ctx->program, ctx->block);
9994 unsigned write_mask = ctx->outputs.mask[slot];
9995 Operand values[4];
9996
9997 for (unsigned i = 0; i < 4; ++i) {
9998 if (write_mask & (1 << i)) {
9999 values[i] = Operand(ctx->outputs.temps[slot * 4u + i]);
10000 } else {
10001 values[i] = Operand(v1);
10002 }
10003 }
10004
10005 unsigned target, col_format;
10006 unsigned enabled_channels = 0;
10007 aco_opcode compr_op = (aco_opcode)0;
10008
10009 slot -= FRAG_RESULT_DATA0;
10010 target = V_008DFC_SQ_EXP_MRT + slot;
10011 col_format = (ctx->options->key.fs.col_format >> (4 * slot)) & 0xf;
10012
10013 bool is_int8 = (ctx->options->key.fs.is_int8 >> slot) & 1;
10014 bool is_int10 = (ctx->options->key.fs.is_int10 >> slot) & 1;
10015 bool is_16bit = values[0].regClass() == v2b;
10016
10017 switch (col_format)
10018 {
10019 case V_028714_SPI_SHADER_ZERO:
10020 enabled_channels = 0; /* writemask */
10021 target = V_008DFC_SQ_EXP_NULL;
10022 break;
10023
10024 case V_028714_SPI_SHADER_32_R:
10025 enabled_channels = 1;
10026 break;
10027
10028 case V_028714_SPI_SHADER_32_GR:
10029 enabled_channels = 0x3;
10030 break;
10031
10032 case V_028714_SPI_SHADER_32_AR:
10033 if (ctx->options->chip_class >= GFX10) {
10034 /* Special case: on GFX10, the outputs are different for 32_AR */
10035 enabled_channels = 0x3;
10036 values[1] = values[3];
10037 values[3] = Operand(v1);
10038 } else {
10039 enabled_channels = 0x9;
10040 }
10041 break;
10042
10043 case V_028714_SPI_SHADER_FP16_ABGR:
10044 enabled_channels = 0x5;
10045 compr_op = aco_opcode::v_cvt_pkrtz_f16_f32;
10046 if (is_16bit) {
10047 if (ctx->options->chip_class >= GFX9) {
10048 /* Pack the FP16 values together instead of converting them to
10049 * FP32 and back to FP16.
10050 * TODO: use p_create_vector and let the compiler optimize it.
10051 */
10052 compr_op = aco_opcode::v_pack_b32_f16;
10053 } else {
10054 for (unsigned i = 0; i < 4; i++) {
10055 if ((write_mask >> i) & 1)
10056 values[i] = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), values[i]);
10057 }
10058 }
10059 }
10060 break;
10061
10062 case V_028714_SPI_SHADER_UNORM16_ABGR:
10063 enabled_channels = 0x5;
10064 if (is_16bit && ctx->options->chip_class >= GFX9) {
10065 compr_op = aco_opcode::v_cvt_pknorm_u16_f16;
10066 } else {
10067 compr_op = aco_opcode::v_cvt_pknorm_u16_f32;
10068 }
10069 break;
10070
10071 case V_028714_SPI_SHADER_SNORM16_ABGR:
10072 enabled_channels = 0x5;
10073 if (is_16bit && ctx->options->chip_class >= GFX9) {
10074 compr_op = aco_opcode::v_cvt_pknorm_i16_f16;
10075 } else {
10076 compr_op = aco_opcode::v_cvt_pknorm_i16_f32;
10077 }
10078 break;
10079
10080 case V_028714_SPI_SHADER_UINT16_ABGR: {
10081 enabled_channels = 0x5;
10082 compr_op = aco_opcode::v_cvt_pk_u16_u32;
10083 if (is_int8 || is_int10) {
10084 /* clamp */
10085 uint32_t max_rgb = is_int8 ? 255 : is_int10 ? 1023 : 0;
10086 Temp max_rgb_val = bld.copy(bld.def(s1), Operand(max_rgb));
10087
10088 for (unsigned i = 0; i < 4; i++) {
10089 if ((write_mask >> i) & 1) {
10090 values[i] = bld.vop2(aco_opcode::v_min_u32, bld.def(v1),
10091 i == 3 && is_int10 ? Operand(3u) : Operand(max_rgb_val),
10092 values[i]);
10093 }
10094 }
10095 } else if (is_16bit) {
10096 for (unsigned i = 0; i < 4; i++) {
10097 if ((write_mask >> i) & 1) {
10098 Temp tmp = convert_int(ctx, bld, values[i].getTemp(), 16, 32, false);
10099 values[i] = Operand(tmp);
10100 }
10101 }
10102 }
10103 break;
10104 }
10105
10106 case V_028714_SPI_SHADER_SINT16_ABGR:
10107 enabled_channels = 0x5;
10108 compr_op = aco_opcode::v_cvt_pk_i16_i32;
10109 if (is_int8 || is_int10) {
10110 /* clamp */
10111 uint32_t max_rgb = is_int8 ? 127 : is_int10 ? 511 : 0;
10112 uint32_t min_rgb = is_int8 ? -128 : is_int10 ? -512 : 0;
10113 Temp max_rgb_val = bld.copy(bld.def(s1), Operand(max_rgb));
10114 Temp min_rgb_val = bld.copy(bld.def(s1), Operand(min_rgb));
10115
10116 for (unsigned i = 0; i < 4; i++) {
10117 if ((write_mask >> i) & 1) {
10118 values[i] = bld.vop2(aco_opcode::v_min_i32, bld.def(v1),
10119 i == 3 && is_int10 ? Operand(1u) : Operand(max_rgb_val),
10120 values[i]);
10121 values[i] = bld.vop2(aco_opcode::v_max_i32, bld.def(v1),
10122 i == 3 && is_int10 ? Operand(-2u) : Operand(min_rgb_val),
10123 values[i]);
10124 }
10125 }
10126 } else if (is_16bit) {
10127 for (unsigned i = 0; i < 4; i++) {
10128 if ((write_mask >> i) & 1) {
10129 Temp tmp = convert_int(ctx, bld, values[i].getTemp(), 16, 32, true);
10130 values[i] = Operand(tmp);
10131 }
10132 }
10133 }
10134 break;
10135
10136 case V_028714_SPI_SHADER_32_ABGR:
10137 enabled_channels = 0xF;
10138 break;
10139
10140 default:
10141 break;
10142 }
10143
10144 if (target == V_008DFC_SQ_EXP_NULL)
10145 return false;
10146
10147 /* Replace NaN by zero (only 32-bit) to fix game bugs if requested. */
10148 if (ctx->options->enable_mrt_output_nan_fixup &&
10149 !is_16bit &&
10150 (col_format == V_028714_SPI_SHADER_32_R ||
10151 col_format == V_028714_SPI_SHADER_32_GR ||
10152 col_format == V_028714_SPI_SHADER_32_AR ||
10153 col_format == V_028714_SPI_SHADER_32_ABGR ||
10154 col_format == V_028714_SPI_SHADER_FP16_ABGR)) {
10155 for (int i = 0; i < 4; i++) {
10156 if (!(write_mask & (1 << i)))
10157 continue;
10158
10159 Temp isnan = bld.vopc(aco_opcode::v_cmp_class_f32,
10160 bld.hint_vcc(bld.def(bld.lm)), values[i],
10161 bld.copy(bld.def(v1), Operand(3u)));
10162 values[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), values[i],
10163 bld.copy(bld.def(v1), Operand(0u)), isnan);
10164 }
10165 }
10166
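/* Compressed (16-bit) formats pack each pair of components into a single VGPR, so only the first two export channels are used. */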
10167 if ((bool) compr_op) {
10168 for (int i = 0; i < 2; i++) {
10169 /* check if at least one of the values to be compressed is enabled */
10170 unsigned enabled = (write_mask >> (i*2) | write_mask >> (i*2+1)) & 0x1;
10171 if (enabled) {
10172 enabled_channels |= enabled << (i*2);
10173 values[i] = bld.vop3(compr_op, bld.def(v1),
10174 values[i*2].isUndefined() ? Operand(0u) : values[i*2],
10175 values[i*2+1].isUndefined() ? Operand(0u): values[i*2+1]);
10176 } else {
10177 values[i] = Operand(v1);
10178 }
10179 }
10180 values[2] = Operand(v1);
10181 values[3] = Operand(v1);
10182 } else {
10183 for (int i = 0; i < 4; i++)
10184 values[i] = enabled_channels & (1 << i) ? values[i] : Operand(v1);
10185 }
10186
10187 bld.exp(aco_opcode::exp, values[0], values[1], values[2], values[3],
10188 enabled_channels, target, (bool) compr_op);
10189 return true;
10190 }
10191
10192 static void create_fs_exports(isel_context *ctx)
10193 {
10194 bool exported = false;
10195
10196 /* Export depth, stencil and sample mask. */
10197 if (ctx->outputs.mask[FRAG_RESULT_DEPTH] ||
10198 ctx->outputs.mask[FRAG_RESULT_STENCIL] ||
10199 ctx->outputs.mask[FRAG_RESULT_SAMPLE_MASK])
10200 exported |= export_fs_mrt_z(ctx);
10201
10202 /* Export all color render targets. */
10203 for (unsigned i = FRAG_RESULT_DATA0; i < FRAG_RESULT_DATA7 + 1; ++i)
10204 if (ctx->outputs.mask[i])
10205 exported |= export_fs_mrt_color(ctx, i);
10206
10207 if (!exported)
10208 create_null_export(ctx);
10209 }
10210
10211 static void write_tcs_tess_factors(isel_context *ctx)
10212 {
10213 unsigned outer_comps;
10214 unsigned inner_comps;
10215
10216 switch (ctx->args->options->key.tcs.primitive_mode) {
10217 case GL_ISOLINES:
10218 outer_comps = 2;
10219 inner_comps = 0;
10220 break;
10221 case GL_TRIANGLES:
10222 outer_comps = 3;
10223 inner_comps = 1;
10224 break;
10225 case GL_QUADS:
10226 outer_comps = 4;
10227 inner_comps = 2;
10228 break;
10229 default:
10230 return;
10231 }
10232
10233 Builder bld(ctx->program, ctx->block);
10234
10235 bld.barrier(aco_opcode::p_memory_barrier_shared);
10236 if (unlikely(ctx->program->chip_class != GFX6 && ctx->program->workgroup_size > ctx->program->wave_size))
10237 bld.sopp(aco_opcode::s_barrier);
10238
10239 Temp tcs_rel_ids = get_arg(ctx, ctx->args->ac.tcs_rel_ids);
10240 Temp invocation_id = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), tcs_rel_ids, Operand(8u), Operand(5u));
10241
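/* Only the first TCS invocation of each patch writes the tessellation factors. */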
10242 Temp invocation_id_is_zero = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), invocation_id);
10243 if_context ic_invocation_id_is_zero;
10244 begin_divergent_if_then(ctx, &ic_invocation_id_is_zero, invocation_id_is_zero);
10245 bld.reset(ctx->block);
10246
10247 Temp hs_ring_tess_factor = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ctx->program->private_segment_buffer, Operand(RING_HS_TESS_FACTOR * 16u));
10248
10249 std::pair<Temp, unsigned> lds_base = get_tcs_output_lds_offset(ctx);
10250 unsigned stride = inner_comps + outer_comps;
10251 unsigned lds_align = calculate_lds_alignment(ctx, lds_base.second);
10252 Temp tf_inner_vec;
10253 Temp tf_outer_vec;
10254 Temp out[6];
10255 assert(stride <= (sizeof(out) / sizeof(Temp)));
10256
10257 if (ctx->args->options->key.tcs.primitive_mode == GL_ISOLINES) {
10258 // isolines: the two outer tess factors are written in swapped order
10259 tf_outer_vec = load_lds(ctx, 4, bld.tmp(v2), lds_base.first, lds_base.second + ctx->tcs_tess_lvl_out_loc, lds_align);
10260 out[1] = emit_extract_vector(ctx, tf_outer_vec, 0, v1);
10261 out[0] = emit_extract_vector(ctx, tf_outer_vec, 1, v1);
10262 } else {
10263 tf_outer_vec = load_lds(ctx, 4, bld.tmp(RegClass(RegType::vgpr, outer_comps)), lds_base.first, lds_base.second + ctx->tcs_tess_lvl_out_loc, lds_align);
10264 tf_inner_vec = load_lds(ctx, 4, bld.tmp(RegClass(RegType::vgpr, inner_comps)), lds_base.first, lds_base.second + ctx->tcs_tess_lvl_in_loc, lds_align);
10265
10266 for (unsigned i = 0; i < outer_comps; ++i)
10267 out[i] = emit_extract_vector(ctx, tf_outer_vec, i, v1);
10268 for (unsigned i = 0; i < inner_comps; ++i)
10269 out[outer_comps + i] = emit_extract_vector(ctx, tf_inner_vec, i, v1);
10270 }
10271
10272 Temp rel_patch_id = get_tess_rel_patch_id(ctx);
10273 Temp tf_base = get_arg(ctx, ctx->args->tess_factor_offset);
10274 Temp byte_offset = bld.v_mul24_imm(bld.def(v1), rel_patch_id, stride * 4u);
10275 unsigned tf_const_offset = 0;
10276
10277 if (ctx->program->chip_class <= GFX8) {
10278 Temp rel_patch_id_is_zero = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), rel_patch_id);
10279 if_context ic_rel_patch_id_is_zero;
10280 begin_divergent_if_then(ctx, &ic_rel_patch_id_is_zero, rel_patch_id_is_zero);
10281 bld.reset(ctx->block);
10282
10283 /* Store the dynamic HS control word. */
10284 Temp control_word = bld.copy(bld.def(v1), Operand(0x80000000u));
10285 bld.mubuf(aco_opcode::buffer_store_dword,
10286 /* SRSRC */ hs_ring_tess_factor, /* VADDR */ Operand(v1), /* SOFFSET */ tf_base, /* VDATA */ control_word,
10287 /* immediate OFFSET */ 0, /* OFFEN */ false, /* idxen*/ false, /* addr64 */ false,
10288 /* disable_wqm */ false, /* glc */ true);
10289 tf_const_offset += 4;
10290
10291 begin_divergent_if_else(ctx, &ic_rel_patch_id_is_zero);
10292 end_divergent_if(ctx, &ic_rel_patch_id_is_zero);
10293 bld.reset(ctx->block);
10294 }
10295
10296 assert(stride == 2 || stride == 4 || stride == 6);
10297 Temp tf_vec = create_vec_from_array(ctx, out, stride, RegType::vgpr, 4u);
10298 store_vmem_mubuf(ctx, tf_vec, hs_ring_tess_factor, byte_offset, tf_base, tf_const_offset, 4, (1 << stride) - 1, true, false);
10299
10300 /* Store to offchip for TES to read - only if TES reads them */
10301 if (ctx->args->options->key.tcs.tes_reads_tess_factors) {
10302 Temp hs_ring_tess_offchip = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ctx->program->private_segment_buffer, Operand(RING_HS_TESS_OFFCHIP * 16u));
10303 Temp oc_lds = get_arg(ctx, ctx->args->oc_lds);
10304
10305 std::pair<Temp, unsigned> vmem_offs_outer = get_tcs_per_patch_output_vmem_offset(ctx, nullptr, ctx->tcs_tess_lvl_out_loc);
10306 store_vmem_mubuf(ctx, tf_outer_vec, hs_ring_tess_offchip, vmem_offs_outer.first, oc_lds, vmem_offs_outer.second, 4, (1 << outer_comps) - 1, true, false);
10307
10308 if (likely(inner_comps)) {
10309 std::pair<Temp, unsigned> vmem_offs_inner = get_tcs_per_patch_output_vmem_offset(ctx, nullptr, ctx->tcs_tess_lvl_in_loc);
10310 store_vmem_mubuf(ctx, tf_inner_vec, hs_ring_tess_offchip, vmem_offs_inner.first, oc_lds, vmem_offs_inner.second, 4, (1 << inner_comps) - 1, true, false);
10311 }
10312 }
10313
10314 begin_divergent_if_else(ctx, &ic_invocation_id_is_zero);
10315 end_divergent_if(ctx, &ic_invocation_id_is_zero);
10316 }
10317
10318 static void emit_stream_output(isel_context *ctx,
10319 Temp const *so_buffers,
10320 Temp const *so_write_offset,
10321 const struct radv_stream_output *output)
10322 {
10323 unsigned num_comps = util_bitcount(output->component_mask);
10324 unsigned writemask = (1 << num_comps) - 1;
10325 unsigned loc = output->location;
10326 unsigned buf = output->buffer;
10327
10328 assert(num_comps && num_comps <= 4);
10329 if (!num_comps || num_comps > 4)
10330 return;
10331
10332 unsigned start = ffs(output->component_mask) - 1;
10333
10334 Temp out[4];
10335 bool all_undef = true;
10336 assert(ctx->stage & hw_vs);
10337 for (unsigned i = 0; i < num_comps; i++) {
10338 out[i] = ctx->outputs.temps[loc * 4 + start + i];
10339 all_undef = all_undef && !out[i].id();
10340 }
10341 if (all_undef)
10342 return;
10343
10344 while (writemask) {
10345 int start, count;
10346 u_bit_scan_consecutive_range(&writemask, &start, &count);
10347 if (count == 3 && ctx->options->chip_class == GFX6) {
10348 /* GFX6 doesn't support storing vec3, split it. */
10349 writemask |= 1u << (start + 2);
10350 count = 2;
10351 }
10352
10353 unsigned offset = output->offset + start * 4;
10354
10355 Temp write_data = {ctx->program->allocateId(), RegClass(RegType::vgpr, count)};
10356 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
10357 for (int i = 0; i < count; ++i)
10358 vec->operands[i] = (ctx->outputs.mask[loc] & 1 << (start + i)) ? Operand(out[start + i]) : Operand(0u);
10359 vec->definitions[0] = Definition(write_data);
10360 ctx->block->instructions.emplace_back(std::move(vec));
10361
10362 aco_opcode opcode;
10363 switch (count) {
10364 case 1:
10365 opcode = aco_opcode::buffer_store_dword;
10366 break;
10367 case 2:
10368 opcode = aco_opcode::buffer_store_dwordx2;
10369 break;
10370 case 3:
10371 opcode = aco_opcode::buffer_store_dwordx3;
10372 break;
10373 case 4:
10374 opcode = aco_opcode::buffer_store_dwordx4;
10375 break;
10376 default:
10377 unreachable("Unsupported dword count.");
10378 }
10379
10380 aco_ptr<MUBUF_instruction> store{create_instruction<MUBUF_instruction>(opcode, Format::MUBUF, 4, 0)};
10381 store->operands[0] = Operand(so_buffers[buf]);
10382 store->operands[1] = Operand(so_write_offset[buf]);
10383 store->operands[2] = Operand((uint32_t) 0);
10384 store->operands[3] = Operand(write_data);
10385 if (offset > 4095) {
10386 /* This shouldn't happen in RADV, but it might with GL; it's easy to handle anyway. */
10387 Builder bld(ctx->program, ctx->block);
10388 store->operands[1] = bld.vadd32(bld.def(v1), Operand(offset), Operand(so_write_offset[buf]));
10389 } else {
10390 store->offset = offset;
10391 }
10392 store->offen = true;
10393 store->glc = true;
10394 store->dlc = false;
10395 store->slc = true;
10396 store->can_reorder = true;
10397 ctx->block->instructions.emplace_back(std::move(store));
10398 }
10399 }
10400
10401 static void emit_streamout(isel_context *ctx, unsigned stream)
10402 {
10403 Builder bld(ctx->program, ctx->block);
10404
10405 Temp so_buffers[4];
10406 Temp buf_ptr = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->streamout_buffers));
10407 for (unsigned i = 0; i < 4; i++) {
10408 unsigned stride = ctx->program->info->so.strides[i];
10409 if (!stride)
10410 continue;
10411
10412 Operand off = bld.copy(bld.def(s1), Operand(i * 16u));
10413 so_buffers[i] = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), buf_ptr, off);
10414 }
10415
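/* Bits [22:16] of the streamout config contain the number of vertices this wave is allowed to emit. */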
10416 Temp so_vtx_count = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
10417 get_arg(ctx, ctx->args->streamout_config), Operand(0x70010u));
10418
10419 Temp tid = emit_mbcnt(ctx, bld.def(v1));
10420
10421 Temp can_emit = bld.vopc(aco_opcode::v_cmp_gt_i32, bld.def(bld.lm), so_vtx_count, tid);
10422
10423 if_context ic;
10424 begin_divergent_if_then(ctx, &ic, can_emit);
10425
10426 bld.reset(ctx->block);
10427
10428 Temp so_write_index = bld.vadd32(bld.def(v1), get_arg(ctx, ctx->args->streamout_write_idx), tid);
10429
10430 Temp so_write_offset[4];
10431
10432 for (unsigned i = 0; i < 4; i++) {
10433 unsigned stride = ctx->program->info->so.strides[i];
10434 if (!stride)
10435 continue;
10436
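/* For a stride of one dword, skip the multiply: the byte offset is simply (write index + buffer offset) * 4. */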
10437 if (stride == 1) {
10438 Temp offset = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
10439 get_arg(ctx, ctx->args->streamout_write_idx),
10440 get_arg(ctx, ctx->args->streamout_offset[i]));
10441 Temp new_offset = bld.vadd32(bld.def(v1), offset, tid);
10442
10443 so_write_offset[i] = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), new_offset);
10444 } else {
10445 Temp offset = bld.v_mul_imm(bld.def(v1), so_write_index, stride * 4u);
10446 Temp offset2 = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(4u),
10447 get_arg(ctx, ctx->args->streamout_offset[i]));
10448 so_write_offset[i] = bld.vadd32(bld.def(v1), offset, offset2);
10449 }
10450 }
10451
10452 for (unsigned i = 0; i < ctx->program->info->so.num_outputs; i++) {
10453 struct radv_stream_output *output =
10454 &ctx->program->info->so.outputs[i];
10455 if (stream != output->stream)
10456 continue;
10457
10458 emit_stream_output(ctx, so_buffers, so_write_offset, output);
10459 }
10460
10461 begin_divergent_if_else(ctx, &ic);
10462 end_divergent_if(ctx, &ic);
10463 }
10464
10465 } /* end namespace */
10466
10467 void fix_ls_vgpr_init_bug(isel_context *ctx, Pseudo_instruction *startpgm)
10468 {
10469 assert(ctx->shader->info.stage == MESA_SHADER_VERTEX);
10470 Builder bld(ctx->program, ctx->block);
10471 constexpr unsigned hs_idx = 1u;
10472 Builder::Result hs_thread_count = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
10473 get_arg(ctx, ctx->args->merged_wave_info),
10474 Operand((8u << 16) | (hs_idx * 8u)));
10475 Temp ls_has_nonzero_hs_threads = bool_to_vector_condition(ctx, hs_thread_count.def(1).getTemp());
10476
10477 /* If there are no HS threads, SPI mistakenly loads the LS VGPRs starting at VGPR 0. */
10478
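/* If the HS thread count is zero, read each LS input from the VGPR it was erroneously loaded into; otherwise keep the regular argument. */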
10479 Temp instance_id = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
10480 get_arg(ctx, ctx->args->rel_auto_id),
10481 get_arg(ctx, ctx->args->ac.instance_id),
10482 ls_has_nonzero_hs_threads);
10483 Temp rel_auto_id = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
10484 get_arg(ctx, ctx->args->ac.tcs_rel_ids),
10485 get_arg(ctx, ctx->args->rel_auto_id),
10486 ls_has_nonzero_hs_threads);
10487 Temp vertex_id = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
10488 get_arg(ctx, ctx->args->ac.tcs_patch_id),
10489 get_arg(ctx, ctx->args->ac.vertex_id),
10490 ls_has_nonzero_hs_threads);
10491
10492 ctx->arg_temps[ctx->args->ac.instance_id.arg_index] = instance_id;
10493 ctx->arg_temps[ctx->args->rel_auto_id.arg_index] = rel_auto_id;
10494 ctx->arg_temps[ctx->args->ac.vertex_id.arg_index] = vertex_id;
10495 }
10496
10497 void split_arguments(isel_context *ctx, Pseudo_instruction *startpgm)
10498 {
10499 /* Split all arguments except for the first (ring_offsets) and the last
10500 * (exec) so that the dead channels don't stay live throughout the program.
10501 */
10502 for (int i = 1; i < startpgm->definitions.size() - 1; i++) {
10503 if (startpgm->definitions[i].regClass().size() > 1) {
10504 emit_split_vector(ctx, startpgm->definitions[i].getTemp(),
10505 startpgm->definitions[i].regClass().size());
10506 }
10507 }
10508 }
10509
10510 void handle_bc_optimize(isel_context *ctx)
10511 {
10512 /* needed when SPI_PS_IN_CONTROL.BC_OPTIMIZE_DISABLE is set to 0 */
10513 Builder bld(ctx->program, ctx->block);
10514 uint32_t spi_ps_input_ena = ctx->program->config->spi_ps_input_ena;
10515 bool uses_center = G_0286CC_PERSP_CENTER_ENA(spi_ps_input_ena) || G_0286CC_LINEAR_CENTER_ENA(spi_ps_input_ena);
10516 bool uses_centroid = G_0286CC_PERSP_CENTROID_ENA(spi_ps_input_ena) || G_0286CC_LINEAR_CENTROID_ENA(spi_ps_input_ena);
10517 ctx->persp_centroid = get_arg(ctx, ctx->args->ac.persp_centroid);
10518 ctx->linear_centroid = get_arg(ctx, ctx->args->ac.linear_centroid);
10519 if (uses_center && uses_centroid) {
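/* With BC_OPTIMIZE enabled, the hardware sets the sign bit of prim_mask for fully covered pixels, in which case the center barycentrics must be used instead of the centroid ones. */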
10520 Temp sel = bld.vopc_e64(aco_opcode::v_cmp_lt_i32, bld.hint_vcc(bld.def(bld.lm)),
10521 get_arg(ctx, ctx->args->ac.prim_mask), Operand(0u));
10522
10523 if (G_0286CC_PERSP_CENTROID_ENA(spi_ps_input_ena)) {
10524 Temp new_coord[2];
10525 for (unsigned i = 0; i < 2; i++) {
10526 Temp persp_centroid = emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.persp_centroid), i, v1);
10527 Temp persp_center = emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.persp_center), i, v1);
10528 new_coord[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
10529 persp_centroid, persp_center, sel);
10530 }
10531 ctx->persp_centroid = bld.tmp(v2);
10532 bld.pseudo(aco_opcode::p_create_vector, Definition(ctx->persp_centroid),
10533 Operand(new_coord[0]), Operand(new_coord[1]));
10534 emit_split_vector(ctx, ctx->persp_centroid, 2);
10535 }
10536
10537 if (G_0286CC_LINEAR_CENTROID_ENA(spi_ps_input_ena)) {
10538 Temp new_coord[2];
10539 for (unsigned i = 0; i < 2; i++) {
10540 Temp linear_centroid = emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.linear_centroid), i, v1);
10541 Temp linear_center = emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.linear_center), i, v1);
10542 new_coord[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
10543 linear_centroid, linear_center, sel);
10544 }
10545 ctx->linear_centroid = bld.tmp(v2);
10546 bld.pseudo(aco_opcode::p_create_vector, Definition(ctx->linear_centroid),
10547 Operand(new_coord[0]), Operand(new_coord[1]));
10548 emit_split_vector(ctx, ctx->linear_centroid, 2);
10549 }
10550 }
10551 }
10552
10553 void setup_fp_mode(isel_context *ctx, nir_shader *shader)
10554 {
10555 Program *program = ctx->program;
10556
10557 unsigned float_controls = shader->info.float_controls_execution_mode;
10558
10559 program->next_fp_mode.preserve_signed_zero_inf_nan32 =
10560 float_controls & FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP32;
10561 program->next_fp_mode.preserve_signed_zero_inf_nan16_64 =
10562 float_controls & (FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP16 |
10563 FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP64);
10564
10565 program->next_fp_mode.must_flush_denorms32 =
10566 float_controls & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP32;
10567 program->next_fp_mode.must_flush_denorms16_64 =
10568 float_controls & (FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16 |
10569 FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP64);
10570
10571 program->next_fp_mode.care_about_round32 =
10572 float_controls & (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32 | FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32);
10573
10574 program->next_fp_mode.care_about_round16_64 =
10575 float_controls & (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64 |
10576 FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64);
10577
10578 /* default to preserving fp16 and fp64 denorms, since it's free for fp64 and
10579 * the precision seems needed for Wolfenstein: Youngblood to render correctly */
10580 if (program->next_fp_mode.must_flush_denorms16_64)
10581 program->next_fp_mode.denorm16_64 = 0;
10582 else
10583 program->next_fp_mode.denorm16_64 = fp_denorm_keep;
10584
10585 /* preserving fp32 denorms is expensive, so only do it if asked */
10586 if (float_controls & FLOAT_CONTROLS_DENORM_PRESERVE_FP32)
10587 program->next_fp_mode.denorm32 = fp_denorm_keep;
10588 else
10589 program->next_fp_mode.denorm32 = 0;
10590
10591 if (float_controls & FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32)
10592 program->next_fp_mode.round32 = fp_round_tz;
10593 else
10594 program->next_fp_mode.round32 = fp_round_ne;
10595
10596 if (float_controls & (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64))
10597 program->next_fp_mode.round16_64 = fp_round_tz;
10598 else
10599 program->next_fp_mode.round16_64 = fp_round_ne;
10600
10601 ctx->block->fp_mode = program->next_fp_mode;
10602 }
10603
10604 void cleanup_cfg(Program *program)
10605 {
10606 /* create linear_succs/logical_succs */
10607 for (Block& BB : program->blocks) {
10608 for (unsigned idx : BB.linear_preds)
10609 program->blocks[idx].linear_succs.emplace_back(BB.index);
10610 for (unsigned idx : BB.logical_preds)
10611 program->blocks[idx].logical_succs.emplace_back(BB.index);
10612 }
10613 }
10614
10615 Temp merged_wave_info_to_mask(isel_context *ctx, unsigned i)
10616 {
10617 Builder bld(ctx->program, ctx->block);
10618
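/* merged_wave_info packs an 8-bit thread count for every merged shader part; part i's count lives in bits [8*i+7 : 8*i]. */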
10619 /* The s_bfm only cares about s0.u[5:0], so we need neither s_bfe nor s_and here */
10620 Temp count = i == 0
10621 ? get_arg(ctx, ctx->args->merged_wave_info)
10622 : bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc),
10623 get_arg(ctx, ctx->args->merged_wave_info), Operand(i * 8u));
10624
10625 Temp mask = bld.sop2(aco_opcode::s_bfm_b64, bld.def(s2), count, Operand(0u));
10626 Temp cond;
10627
10628 if (ctx->program->wave_size == 64) {
10629 /* Special case for 64 active invocations, because 64 doesn't work with s_bfm */
10630 Temp active_64 = bld.sopc(aco_opcode::s_bitcmp1_b32, bld.def(s1, scc), count, Operand(6u /* log2(64) */));
10631 cond = bld.sop2(Builder::s_cselect, bld.def(bld.lm), Operand(-1u), mask, bld.scc(active_64));
10632 } else {
10633 /* We use s_bfm_b64 (not _b32) because it handles a count of 32 correctly, but then we need to extract the lower half of the register */
10634 cond = emit_extract_vector(ctx, mask, 0, bld.lm);
10635 }
10636
10637 return cond;
10638 }
10639
10640 bool ngg_early_prim_export(isel_context *ctx)
10641 {
10642 /* TODO: Check edge flags, and if they are written, return false. (Needed for OpenGL, not for Vulkan.) */
10643 return true;
10644 }
10645
10646 void ngg_emit_sendmsg_gs_alloc_req(isel_context *ctx)
10647 {
10648 Builder bld(ctx->program, ctx->block);
10649
10650 /* It is recommended to do the GS_ALLOC_REQ as soon and as quickly as possible, so we set the maximum priority (3). */
10651 bld.sopp(aco_opcode::s_setprio, -1u, 0x3u);
10652
10653 /* Get the id of the current wave within the threadgroup (workgroup) */
10654 Builder::Result wave_id_in_tg = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
10655 get_arg(ctx, ctx->args->merged_wave_info), Operand(24u | (4u << 16)));
10656
10657 /* Execute the following code only on the first wave (wave id 0);
10658 * use the SCC def to tell whether the wave id is zero or not.
10659 */
10660 Temp cond = wave_id_in_tg.def(1).getTemp();
10661 if_context ic;
10662 begin_uniform_if_then(ctx, &ic, cond);
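/* SCC is set when the wave id is non-zero, so the then-branch stays empty and the alloc request is emitted in the else-branch below. */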
10663 begin_uniform_if_else(ctx, &ic);
10664 bld.reset(ctx->block);
10665
10666 /* Number of vertices output by VS/TES */
10667 Temp vtx_cnt = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
10668 get_arg(ctx, ctx->args->gs_tg_info), Operand(12u | (9u << 16u)));
10669 /* Number of primitives output by VS/TES */
10670 Temp prm_cnt = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
10671 get_arg(ctx, ctx->args->gs_tg_info), Operand(22u | (9u << 16u)));
10672
10673 /* Put the number of vertices and primitives into m0 for the GS_ALLOC_REQ */
10674 Temp tmp = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), prm_cnt, Operand(12u));
10675 tmp = bld.sop2(aco_opcode::s_or_b32, bld.m0(bld.def(s1)), bld.def(s1, scc), tmp, vtx_cnt);
10676
10677 /* Request the SPI to allocate space for the primitives and vertices that will be exported by the threadgroup. */
10678 bld.sopp(aco_opcode::s_sendmsg, bld.m0(tmp), -1, sendmsg_gs_alloc_req);
10679
10680 end_uniform_if(ctx, &ic);
10681
10682 /* After the GS_ALLOC_REQ is done, reset priority to default (0). */
10683 bld.reset(ctx->block);
10684 bld.sopp(aco_opcode::s_setprio, -1u, 0x0u);
10685 }
10686
10687 Temp ngg_get_prim_exp_arg(isel_context *ctx, unsigned num_vertices, const Temp vtxindex[])
10688 {
10689 Builder bld(ctx->program, ctx->block);
10690
10691 if (ctx->args->options->key.vs_common_out.as_ngg_passthrough) {
10692 return get_arg(ctx, ctx->args->gs_vtx_offset[0]);
10693 }
10694
10695 Temp gs_invocation_id = get_arg(ctx, ctx->args->ac.gs_invocation_id);
10696 Temp tmp;
10697
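/* The primitive export argument packs 10 bits per vertex index, with the edge flag in the bit above each index (bits 9, 19 and 29). */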
10698 for (unsigned i = 0; i < num_vertices; ++i) {
10699 assert(vtxindex[i].id());
10700
10701 if (i)
10702 tmp = bld.vop3(aco_opcode::v_lshl_add_u32, bld.def(v1), vtxindex[i], Operand(10u * i), tmp);
10703 else
10704 tmp = vtxindex[i];
10705
10706 /* The initial edge flag is always false in tess eval shaders. */
10707 if (ctx->stage == ngg_vertex_gs) {
10708 Temp edgeflag = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), gs_invocation_id, Operand(8 + i), Operand(1u));
10709 tmp = bld.vop3(aco_opcode::v_lshl_add_u32, bld.def(v1), edgeflag, Operand(10u * i + 9u), tmp);
10710 }
10711 }
10712
10713 /* TODO: Set isnull field in case of merged NGG VS+GS. */
10714
10715 return tmp;
10716 }
10717
10718 void ngg_emit_prim_export(isel_context *ctx, unsigned num_vertices_per_primitive, const Temp vtxindex[])
10719 {
10720 Builder bld(ctx->program, ctx->block);
10721 Temp prim_exp_arg = ngg_get_prim_exp_arg(ctx, num_vertices_per_primitive, vtxindex);
10722
10723 bld.exp(aco_opcode::exp, prim_exp_arg, Operand(v1), Operand(v1), Operand(v1),
10724 1 /* enabled mask */, V_008DFC_SQ_EXP_PRIM /* dest */,
10725 false /* compressed */, true/* done */, false /* valid mask */);
10726 }
10727
10728 void ngg_emit_nogs_gsthreads(isel_context *ctx)
10729 {
10730 /* Emit the things that NGG GS threads need to do, for shaders that don't have SW GS.
10731 * These must always come before VS exports.
10732 *
10733 * It is recommended to do these as early as possible. They can be at the beginning when
10734 * there is no SW GS and the shader doesn't write edge flags.
10735 */
10736
10737 if_context ic;
10738 Temp is_gs_thread = merged_wave_info_to_mask(ctx, 1);
10739 begin_divergent_if_then(ctx, &ic, is_gs_thread);
10740
10741 Builder bld(ctx->program, ctx->block);
10742 constexpr unsigned max_vertices_per_primitive = 3;
10743 unsigned num_vertices_per_primitive = max_vertices_per_primitive;
10744
10745 if (ctx->stage == ngg_vertex_gs) {
10746 /* TODO: optimize for points & lines */
10747 } else if (ctx->stage == ngg_tess_eval_gs) {
10748 if (ctx->shader->info.tess.point_mode)
10749 num_vertices_per_primitive = 1;
10750 else if (ctx->shader->info.tess.primitive_mode == GL_ISOLINES)
10751 num_vertices_per_primitive = 2;
10752 } else {
10753 unreachable("Unsupported NGG shader stage");
10754 }
10755
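/* The vertex indices arrive packed as 16-bit values: vtx0 and vtx1 in gs_vtx_offset[0], vtx2 in gs_vtx_offset[2]. */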
10756 Temp vtxindex[max_vertices_per_primitive];
10757 vtxindex[0] = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0xffffu),
10758 get_arg(ctx, ctx->args->gs_vtx_offset[0]));
10759 vtxindex[1] = num_vertices_per_primitive < 2 ? Temp(0, v1) :
10760 bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1),
10761 get_arg(ctx, ctx->args->gs_vtx_offset[0]), Operand(16u), Operand(16u));
10762 vtxindex[2] = num_vertices_per_primitive < 3 ? Temp(0, v1) :
10763 bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0xffffu),
10764 get_arg(ctx, ctx->args->gs_vtx_offset[2]));
10765
10766 /* Export primitive data to the index buffer. */
10767 ngg_emit_prim_export(ctx, num_vertices_per_primitive, vtxindex);
10768
10769 /* Export primitive ID. */
10770 if (ctx->stage == ngg_vertex_gs && ctx->args->options->key.vs_common_out.export_prim_id) {
10771 /* Copy Primitive IDs from GS threads to the LDS address corresponding to the ES thread of the provoking vertex. */
10772 Temp prim_id = get_arg(ctx, ctx->args->ac.gs_prim_id);
10773 Temp provoking_vtx_index = vtxindex[0];
10774 Temp addr = bld.v_mul_imm(bld.def(v1), provoking_vtx_index, 4u);
10775
10776 store_lds(ctx, 4, prim_id, 0x1u, addr, 0u, 4u);
10777 }
10778
10779 begin_divergent_if_else(ctx, &ic);
10780 end_divergent_if(ctx, &ic);
10781 }
10782
10783 void ngg_emit_nogs_output(isel_context *ctx)
10784 {
10785 /* Emits NGG GS output, for stages that don't have SW GS. */
10786
10787 if_context ic;
10788 Builder bld(ctx->program, ctx->block);
10789 bool late_prim_export = !ngg_early_prim_export(ctx);
10790
10791 /* NGG streamout is currently disabled by default. */
10792 assert(!ctx->args->shader_info->so.num_outputs);
10793
10794 if (late_prim_export) {
10795 /* VS exports are output to registers in a predecessor block. Emit phis to get them into this block. */
10796 create_export_phis(ctx);
10797 /* Do what we need to do in the GS threads. */
10798 ngg_emit_nogs_gsthreads(ctx);
10799
10800 /* What comes next should be executed on ES threads. */
10801 Temp is_es_thread = merged_wave_info_to_mask(ctx, 0);
10802 begin_divergent_if_then(ctx, &ic, is_es_thread);
10803 bld.reset(ctx->block);
10804 }
10805
10806 /* Export VS outputs */
10807 ctx->block->kind |= block_kind_export_end;
10808 create_vs_exports(ctx);
10809
10810 /* Export primitive ID */
10811 if (ctx->args->options->key.vs_common_out.export_prim_id) {
10812 Temp prim_id;
10813
10814 if (ctx->stage == ngg_vertex_gs) {
10815 /* Wait for GS threads to store primitive ID in LDS. */
10816 bld.barrier(aco_opcode::p_memory_barrier_shared);
10817 bld.sopp(aco_opcode::s_barrier);
10818
10819 /* Calculate LDS address where the GS threads stored the primitive ID. */
10820 Temp wave_id_in_tg = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
10821 get_arg(ctx, ctx->args->merged_wave_info), Operand(24u | (4u << 16)));
10822 Temp thread_id_in_wave = emit_mbcnt(ctx, bld.def(v1));
10823 Temp wave_id_mul = bld.v_mul24_imm(bld.def(v1), as_vgpr(ctx, wave_id_in_tg), ctx->program->wave_size);
10824 Temp thread_id_in_tg = bld.vadd32(bld.def(v1), Operand(wave_id_mul), Operand(thread_id_in_wave));
10825 Temp addr = bld.v_mul24_imm(bld.def(v1), thread_id_in_tg, 4u);
10826
10827 /* Load primitive ID from LDS. */
10828 prim_id = load_lds(ctx, 4, bld.tmp(v1), addr, 0u, 4u);
10829 } else if (ctx->stage == ngg_tess_eval_gs) {
10830 /* TES: Just use the patch ID as the primitive ID. */
10831 prim_id = get_arg(ctx, ctx->args->ac.tes_patch_id);
10832 } else {
10833 unreachable("unsupported NGG shader stage.");
10834 }
10835
10836 ctx->outputs.mask[VARYING_SLOT_PRIMITIVE_ID] |= 0x1;
10837 ctx->outputs.temps[VARYING_SLOT_PRIMITIVE_ID * 4u] = prim_id;
10838
10839 export_vs_varying(ctx, VARYING_SLOT_PRIMITIVE_ID, false, nullptr);
10840 }
10841
10842 if (late_prim_export) {
10843 begin_divergent_if_else(ctx, &ic);
10844 end_divergent_if(ctx, &ic);
10845 bld.reset(ctx->block);
10846 }
10847 }
10848
10849 void select_program(Program *program,
10850 unsigned shader_count,
10851 struct nir_shader *const *shaders,
10852 ac_shader_config* config,
10853 struct radv_shader_args *args)
10854 {
10855 isel_context ctx = setup_isel_context(program, shader_count, shaders, config, args, false);
10856 if_context ic_merged_wave_info;
10857 bool ngg_no_gs = ctx.stage == ngg_vertex_gs || ctx.stage == ngg_tess_eval_gs;
10858
10859 for (unsigned i = 0; i < shader_count; i++) {
10860 nir_shader *nir = shaders[i];
10861 init_context(&ctx, nir);
10862
10863 setup_fp_mode(&ctx, nir);
10864
10865 if (!i) {
10866 /* needs to be after init_context() for FS */
10867 Pseudo_instruction *startpgm = add_startpgm(&ctx);
10868 append_logical_start(ctx.block);
10869
10870 if (unlikely(args->options->has_ls_vgpr_init_bug && ctx.stage == vertex_tess_control_hs))
10871 fix_ls_vgpr_init_bug(&ctx, startpgm);
10872
10873 split_arguments(&ctx, startpgm);
10874 }
10875
10876 if (ngg_no_gs) {
10877 ngg_emit_sendmsg_gs_alloc_req(&ctx);
10878
10879 if (ngg_early_prim_export(&ctx))
10880 ngg_emit_nogs_gsthreads(&ctx);
10881 }
10882
10883 /* In a merged VS+TCS HS, the VS implementation can be completely empty. */
10884 nir_function_impl *func = nir_shader_get_entrypoint(nir);
10885 bool empty_shader = nir_cf_list_is_empty_block(&func->body) &&
10886 ((nir->info.stage == MESA_SHADER_VERTEX &&
10887 (ctx.stage == vertex_tess_control_hs || ctx.stage == vertex_geometry_gs)) ||
10888 (nir->info.stage == MESA_SHADER_TESS_EVAL &&
10889 ctx.stage == tess_eval_geometry_gs));
10890
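/* For merged shaders, wrap each part in a divergent if so that only the invocations belonging to that part execute it. */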
10891 bool check_merged_wave_info = ctx.tcs_in_out_eq ? i == 0 : ((shader_count >= 2 && !empty_shader) || ngg_no_gs);
10892 bool endif_merged_wave_info = ctx.tcs_in_out_eq ? i == 1 : check_merged_wave_info;
10893 if (check_merged_wave_info) {
10894 Temp cond = merged_wave_info_to_mask(&ctx, i);
10895 begin_divergent_if_then(&ctx, &ic_merged_wave_info, cond);
10896 }
10897
10898 if (i) {
10899 Builder bld(ctx.program, ctx.block);
10900
10901 bld.barrier(aco_opcode::p_memory_barrier_shared);
10902 bld.sopp(aco_opcode::s_barrier);
10903
10904 if (ctx.stage == vertex_geometry_gs || ctx.stage == tess_eval_geometry_gs) {
10905 ctx.gs_wave_id = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1, m0), bld.def(s1, scc), get_arg(&ctx, args->merged_wave_info), Operand((8u << 16) | 16u));
10906 }
10907 } else if (ctx.stage == geometry_gs)
10908 ctx.gs_wave_id = get_arg(&ctx, args->gs_wave_id);
10909
10910 if (ctx.stage == fragment_fs)
10911 handle_bc_optimize(&ctx);
10912
10913 visit_cf_list(&ctx, &func->body);
10914
10915 if (ctx.program->info->so.num_outputs && (ctx.stage & hw_vs))
10916 emit_streamout(&ctx, 0);
10917
10918 if (ctx.stage & hw_vs) {
10919 create_vs_exports(&ctx);
10920 ctx.block->kind |= block_kind_export_end;
10921 } else if (ngg_no_gs && ngg_early_prim_export(&ctx)) {
10922 ngg_emit_nogs_output(&ctx);
10923 } else if (nir->info.stage == MESA_SHADER_GEOMETRY) {
10924 Builder bld(ctx.program, ctx.block);
10925 bld.barrier(aco_opcode::p_memory_barrier_gs_data);
10926 bld.sopp(aco_opcode::s_sendmsg, bld.m0(ctx.gs_wave_id), -1, sendmsg_gs_done(false, false, 0));
10927 } else if (nir->info.stage == MESA_SHADER_TESS_CTRL) {
10928 write_tcs_tess_factors(&ctx);
10929 }
10930
10931 if (ctx.stage == fragment_fs) {
10932 create_fs_exports(&ctx);
10933 ctx.block->kind |= block_kind_export_end;
10934 }
10935
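      /* Close the merged_wave_info divergent if opened before this stage's
       * body. */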
10936 if (endif_merged_wave_info) {
10937 begin_divergent_if_else(&ctx, &ic_merged_wave_info);
10938 end_divergent_if(&ctx, &ic_merged_wave_info);
10939 }
10940
10941 if (ngg_no_gs && !ngg_early_prim_export(&ctx))
10942 ngg_emit_nogs_output(&ctx);
10943
10944 if (i == 0 && ctx.stage == vertex_tess_control_hs && ctx.tcs_in_out_eq) {
10945 /* Outputs of the previous stage are inputs to the next stage */
10946 ctx.inputs = ctx.outputs;
10947 ctx.outputs = shader_io_state();
10948 }
10949 }
10950
10951 program->config->float_mode = program->blocks[0].fp_mode.val;
10952
10953 append_logical_end(ctx.block);
10954 ctx.block->kind |= block_kind_uniform;
10955 Builder bld(ctx.program, ctx.block);
10956 if (ctx.program->wb_smem_l1_on_end)
10957 bld.smem(aco_opcode::s_dcache_wb, false);
10958 bld.sopp(aco_opcode::s_endpgm);
10959
10960 cleanup_cfg(program);
10961 }
10962
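/* The GS copy shader runs on the HW VS stage of a legacy (non-NGG) GS
 * pipeline: it reads the selected stream's outputs back from the GSVS ring
 * and performs the vertex exports and streamout. */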
10963 void select_gs_copy_shader(Program *program, struct nir_shader *gs_shader,
10964 ac_shader_config* config,
10965 struct radv_shader_args *args)
10966 {
10967 isel_context ctx = setup_isel_context(program, 1, &gs_shader, config, args, true);
10968
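   /* Assumption on intent: the copy shader only moves data, so no particular
    * FP behaviour has to be preserved; use relaxed modes. */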
10969 program->next_fp_mode.preserve_signed_zero_inf_nan32 = false;
10970 program->next_fp_mode.preserve_signed_zero_inf_nan16_64 = false;
10971 program->next_fp_mode.must_flush_denorms32 = false;
10972 program->next_fp_mode.must_flush_denorms16_64 = false;
10973 program->next_fp_mode.care_about_round32 = false;
10974 program->next_fp_mode.care_about_round16_64 = false;
10975 program->next_fp_mode.denorm16_64 = fp_denorm_keep;
10976 program->next_fp_mode.denorm32 = 0;
10977 program->next_fp_mode.round32 = fp_round_ne;
10978 program->next_fp_mode.round16_64 = fp_round_ne;
10979 ctx.block->fp_mode = program->next_fp_mode;
10980
10981 add_startpgm(&ctx);
10982 append_logical_start(ctx.block);
10983
10984 Builder bld(ctx.program, ctx.block);
10985
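   /* Load the GSVS ring buffer descriptor (a 16-byte resource descriptor)
    * from the private segment buffer. */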
10986 Temp gsvs_ring = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), program->private_segment_buffer, Operand(RING_GSVS_VS * 16u));
10987
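   /* With streamout, the stream to copy is selected at run time:
    * 0x20018 = (2 << 16) | 24, i.e. s_bfe_u32 extracts a 2-bit stream index
    * from bits 24..25 of the streamout config. */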
10988 Operand stream_id(0u);
10989 if (args->shader_info->so.num_outputs)
10990 stream_id = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
10991 get_arg(&ctx, ctx.args->streamout_config), Operand(0x20018u));
10992
10993 Temp vtx_offset = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), get_arg(&ctx, ctx.args->ac.vertex_id));
10994
10995 std::stack<Block> endif_blocks;
10996
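   /* Copy every stream that has outputs. When the stream index is dynamic,
    * each stream's copy is wrapped in a uniform branch on stream_id. */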
10997 for (unsigned stream = 0; stream < 4; stream++) {
10998 if (stream_id.isConstant() && stream != stream_id.constantValue())
10999 continue;
11000
11001 unsigned num_components = args->shader_info->gs.num_stream_output_components[stream];
11002 if (stream > 0 && (!num_components || !args->shader_info->so.num_outputs))
11003 continue;
11004
11005 memset(ctx.outputs.mask, 0, sizeof(ctx.outputs.mask));
11006
11007 unsigned BB_if_idx = ctx.block->index;
11008 Block BB_endif = Block();
11009 if (!stream_id.isConstant()) {
11010 /* begin IF */
11011 Temp cond = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), stream_id, Operand(stream));
11012 append_logical_end(ctx.block);
11013 ctx.block->kind |= block_kind_uniform;
11014 bld.branch(aco_opcode::p_cbranch_z, cond);
11015
11016 BB_endif.kind |= ctx.block->kind & block_kind_top_level;
11017
11018 ctx.block = ctx.program->create_and_insert_block();
11019 add_edge(BB_if_idx, ctx.block);
11020 bld.reset(ctx.block);
11021 append_logical_start(ctx.block);
11022 }
11023
11024 unsigned offset = 0;
11025 for (unsigned i = 0; i <= VARYING_SLOT_VAR31; ++i) {
11026 if (args->shader_info->gs.output_streams[i] != stream)
11027 continue;
11028
11029 unsigned output_usage_mask = args->shader_info->gs.output_usage_mask[i];
11030 unsigned length = util_last_bit(output_usage_mask);
11031 for (unsigned j = 0; j < length; ++j) {
11032 if (!(output_usage_mask & (1 << j)))
11033 continue;
11034
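            /* "offset" counts the output components copied so far; each
             * component occupies vertices_out * 16 * 4 bytes in the GSVS ring.
             * The MUBUF immediate offset field is only 12 bits, so anything
             * >= 4096 is added to the vertex offset VGPR instead. */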
11035 unsigned const_offset = offset * args->shader_info->gs.vertices_out * 16 * 4;
11036 Temp voffset = vtx_offset;
11037 if (const_offset >= 4096u) {
11038 voffset = bld.vadd32(bld.def(v1), Operand(const_offset / 4096u * 4096u), voffset);
11039 const_offset %= 4096u;
11040 }
11041
11042 aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(aco_opcode::buffer_load_dword, Format::MUBUF, 3, 1)};
11043 mubuf->definitions[0] = bld.def(v1);
11044 mubuf->operands[0] = Operand(gsvs_ring);
11045 mubuf->operands[1] = Operand(voffset);
11046 mubuf->operands[2] = Operand(0u);
11047 mubuf->offen = true;
11048 mubuf->offset = const_offset;
11049 mubuf->glc = true;
11050 mubuf->slc = true;
11051 mubuf->dlc = args->options->chip_class >= GFX10;
11052 mubuf->barrier = barrier_none;
11053 mubuf->can_reorder = true;
11054
11055 ctx.outputs.mask[i] |= 1 << j;
11056 ctx.outputs.temps[i * 4u + j] = mubuf->definitions[0].getTemp();
11057
11058 bld.insert(std::move(mubuf));
11059
11060 offset++;
11061 }
11062 }
11063
11064 if (args->shader_info->so.num_outputs) {
11065 emit_streamout(&ctx, stream);
11066 bld.reset(ctx.block);
11067 }
11068
11069 if (stream == 0) {
11070 create_vs_exports(&ctx);
11071 ctx.block->kind |= block_kind_export_end;
11072 }
11073
11074 if (!stream_id.isConstant()) {
11075 append_logical_end(ctx.block);
11076
11077 /* branch from then block to endif block */
11078 bld.branch(aco_opcode::p_branch);
11079 add_edge(ctx.block->index, &BB_endif);
11080 ctx.block->kind |= block_kind_uniform;
11081
11082 /* emit else block */
11083 ctx.block = ctx.program->create_and_insert_block();
11084 add_edge(BB_if_idx, ctx.block);
11085 bld.reset(ctx.block);
11086 append_logical_start(ctx.block);
11087
11088 endif_blocks.push(std::move(BB_endif));
11089 }
11090 }
11091
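   /* Close the nested per-stream ifs from innermost to outermost: branch from
    * the pending else chain into each endif merge block and continue there. */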
11092 while (!endif_blocks.empty()) {
11093 Block BB_endif = std::move(endif_blocks.top());
11094 endif_blocks.pop();
11095
11096 Block *BB_else = ctx.block;
11097
11098 append_logical_end(BB_else);
11099 /* branch from else block to endif block */
11100 bld.branch(aco_opcode::p_branch);
11101 add_edge(BB_else->index, &BB_endif);
11102 BB_else->kind |= block_kind_uniform;
11103
11104       /* emit endif merge block */
11105 ctx.block = program->insert_block(std::move(BB_endif));
11106 bld.reset(ctx.block);
11107 append_logical_start(ctx.block);
11108 }
11109
11110 program->config->float_mode = program->blocks[0].fp_mode.val;
11111
11112 append_logical_end(ctx.block);
11113 ctx.block->kind |= block_kind_uniform;
11114 bld.sopp(aco_opcode::s_endpgm);
11115
11116 cleanup_cfg(program);
11117 }
11118 }