ir3: Plumb through bindless support
[mesa.git] / src / freedreno / ir3 / ir3_a6xx.c
/*
 * Copyright (C) 2017-2018 Rob Clark <robclark@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */

#define GPU 600

#include "ir3_context.h"
#include "ir3_image.h"

/*
 * Handlers for instructions changed/added in a6xx:
 *
 * Starting with a6xx, isam and stbi are used for SSBOs as well; stbi and the
 * atomic instructions (used for both SSBO and image) use a new instruction
 * encoding compared to a4xx/a5xx.
 */
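
/* For instance, the SSBO load below is emitted as ldib (the a4xx/a5xx
 * backends use ldgb for this), and the *_G atomics encode their destination
 * as part of src1, which ir3_a6xx_fixup_atomic_dests() at the bottom of
 * this file papers over.
 */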
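/* If the resource source comes from a bindless_resource_ir3 intrinsic, mark
 * the cat6 instruction as bindless (IR3_INSTR_B) and record the descriptor
 * set it indexes into as the cat6 base.
 */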
static void
handle_bindless_cat6(struct ir3_instruction *instr, nir_src rsrc)
{
   nir_intrinsic_instr *intrin = ir3_bindless_resource(rsrc);
   if (!intrin)
      return;

   instr->flags |= IR3_INSTR_B;
   instr->cat6.base = nir_intrinsic_desc_set(intrin);
}

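/* ssbo_idx()/image_idx() resolve the buffer/image index source to the src
 * to use for the IBO slot: for bindless resources that is the bindless
 * handle itself (and the shader is flagged as using bindless IBOs),
 * otherwise an immediate holding the IBO slot the bound SSBO/image maps to.
 */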
static struct ir3_instruction *
ssbo_idx(struct ir3_context *ctx, nir_src src)
{
   if (ir3_bindless_resource(src)) {
      ctx->so->bindless_ibo = true;
      return ir3_get_src(ctx, &src)[0];
   } else {
      /* can this be non-const buffer_index? how do we handle that? */
      int ibo_idx = ir3_ssbo_to_ibo(ctx->so->shader, nir_src_as_uint(src));
      return create_immed(ctx->block, ibo_idx);
   }
}

static struct ir3_instruction *
image_idx(struct ir3_context *ctx, nir_src src)
{
   if (ir3_bindless_resource(src)) {
      ctx->so->bindless_ibo = true;
      return ir3_get_src(ctx, &src)[0];
   } else {
      /* can this be non-const buffer_index? how do we handle that? */
      int ibo_idx = ir3_image_to_ibo(ctx->so->shader, nir_src_as_uint(src));
      return create_immed(ctx->block, ibo_idx);
   }
}

/* src[] = { buffer_index, offset }. No const_index */
static void
emit_intrinsic_load_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr,
      struct ir3_instruction **dst)
{
   struct ir3_block *b = ctx->block;
   struct ir3_instruction *offset;
   struct ir3_instruction *ldib;

   offset = ir3_get_src(ctx, &intr->src[2])[0];

   ldib = ir3_LDIB(b, ssbo_idx(ctx, intr->src[0]), 0, offset, 0);
   ldib->regs[0]->wrmask = MASK(intr->num_components);
   ldib->cat6.iim_val = intr->num_components;
   ldib->cat6.d = 1;
   ldib->cat6.type = TYPE_U32;
   ldib->barrier_class = IR3_BARRIER_BUFFER_R;
   ldib->barrier_conflict = IR3_BARRIER_BUFFER_W;
   handle_bindless_cat6(ldib, intr->src[0]);

   ir3_split_dest(b, dst, ldib, 0, intr->num_components);
}

/* src[] = { value, block_index, offset }. const_index[] = { write_mask } */
static void
emit_intrinsic_store_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr)
{
   struct ir3_block *b = ctx->block;
   struct ir3_instruction *stib, *val, *offset;
   /* TODO handle wrmask properly, see _store_shared().. but I think it
    * is more of a PITA than that, since the blob ends up loading the
    * masked components and writing them back out.
    */
   unsigned wrmask = intr->const_index[0];
   unsigned ncomp = ffs(~wrmask) - 1;

   /* src0 is offset, src1 is value:
    */
   val = ir3_create_collect(ctx, ir3_get_src(ctx, &intr->src[0]), ncomp);
   offset = ir3_get_src(ctx, &intr->src[3])[0];

   stib = ir3_STIB(b, ssbo_idx(ctx, intr->src[1]), 0, offset, 0, val, 0);
   stib->cat6.iim_val = ncomp;
   stib->cat6.d = 1;
   stib->cat6.type = TYPE_U32;
   stib->barrier_class = IR3_BARRIER_BUFFER_W;
   stib->barrier_conflict = IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;
   handle_bindless_cat6(stib, intr->src[1]);

   array_insert(b, b->keeps, stib);
}

/*
 * SSBO atomic intrinsics
 *
 * All of the SSBO atomic memory operations read a value from memory,
 * compute a new value using one of the operations below, write the new
 * value to memory, and return the original value read.
 *
 * All operations take 3 sources except CompSwap that takes 4. These
 * sources represent:
 *
 * 0: The SSBO buffer index.
 * 1: The offset into the SSBO buffer of the variable that the atomic
 *    operation will operate on.
 * 2: The data parameter to the atomic function (i.e. the value to add
 *    in ssbo_atomic_add, etc).
 * 3: For CompSwap only: the second data parameter.
 */
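
/* Note that the *_ir3 variants handled below carry an extra offset source
 * beyond the ones listed above; that is what feeds the instruction's offset
 * here (src[3], or src[4] for comp_swap, whose compare value is src[3]).
 */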
static struct ir3_instruction *
emit_intrinsic_atomic_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr)
{
   struct ir3_block *b = ctx->block;
   struct ir3_instruction *atomic, *ibo, *src0, *src1, *data, *dummy;
   type_t type = TYPE_U32;

   ibo = ssbo_idx(ctx, intr->src[0]);

   data = ir3_get_src(ctx, &intr->src[2])[0];

   /* So this gets a bit creative:
    *
    *    src0    - vecN offset/coords
    *    src1.x  - is actually destination register
    *    src1.y  - is 'data' except for cmpxchg where it is 'compare'
    *    src1.z  - is 'data' for cmpxchg
    *
    * Combining src and dest kinda doesn't work out so well with how
    * scheduling and RA work. So for now we create a dummy src1.x, and
    * then in a later fixup path, insert an extra MOV out of src1.x.
    * See ir3_a6xx_fixup_atomic_dests().
    *
    * Note that nir already multiplies the offset by four
    */
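   /* e.g. src1 below ends up as collect(dummy, data) for the simple atomics
    * and collect(dummy, compare, data) for comp_swap, matching the layout
    * described above.
    */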
   dummy = create_immed(b, 0);

   if (intr->intrinsic == nir_intrinsic_ssbo_atomic_comp_swap_ir3) {
      src0 = ir3_get_src(ctx, &intr->src[4])[0];
      struct ir3_instruction *compare = ir3_get_src(ctx, &intr->src[3])[0];
      src1 = ir3_create_collect(ctx, (struct ir3_instruction*[]){
         dummy, compare, data
      }, 3);
   } else {
      src0 = ir3_get_src(ctx, &intr->src[3])[0];
      src1 = ir3_create_collect(ctx, (struct ir3_instruction*[]){
         dummy, data
      }, 2);
   }

   switch (intr->intrinsic) {
   case nir_intrinsic_ssbo_atomic_add_ir3:
      atomic = ir3_ATOMIC_ADD_G(b, ibo, 0, src0, 0, src1, 0);
      break;
   case nir_intrinsic_ssbo_atomic_imin_ir3:
      atomic = ir3_ATOMIC_MIN_G(b, ibo, 0, src0, 0, src1, 0);
      type = TYPE_S32;
      break;
   case nir_intrinsic_ssbo_atomic_umin_ir3:
      atomic = ir3_ATOMIC_MIN_G(b, ibo, 0, src0, 0, src1, 0);
      break;
   case nir_intrinsic_ssbo_atomic_imax_ir3:
      atomic = ir3_ATOMIC_MAX_G(b, ibo, 0, src0, 0, src1, 0);
      type = TYPE_S32;
      break;
   case nir_intrinsic_ssbo_atomic_umax_ir3:
      atomic = ir3_ATOMIC_MAX_G(b, ibo, 0, src0, 0, src1, 0);
      break;
   case nir_intrinsic_ssbo_atomic_and_ir3:
      atomic = ir3_ATOMIC_AND_G(b, ibo, 0, src0, 0, src1, 0);
      break;
   case nir_intrinsic_ssbo_atomic_or_ir3:
      atomic = ir3_ATOMIC_OR_G(b, ibo, 0, src0, 0, src1, 0);
      break;
   case nir_intrinsic_ssbo_atomic_xor_ir3:
      atomic = ir3_ATOMIC_XOR_G(b, ibo, 0, src0, 0, src1, 0);
      break;
   case nir_intrinsic_ssbo_atomic_exchange_ir3:
      atomic = ir3_ATOMIC_XCHG_G(b, ibo, 0, src0, 0, src1, 0);
      break;
   case nir_intrinsic_ssbo_atomic_comp_swap_ir3:
      atomic = ir3_ATOMIC_CMPXCHG_G(b, ibo, 0, src0, 0, src1, 0);
      break;
   default:
      unreachable("boo");
   }

   atomic->cat6.iim_val = 1;
   atomic->cat6.d = 1;
   atomic->cat6.type = type;
   atomic->barrier_class = IR3_BARRIER_BUFFER_W;
   atomic->barrier_conflict = IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;
   handle_bindless_cat6(atomic, intr->src[0]);

   /* even if nothing consumes the result, we can't DCE the instruction: */
   array_insert(b, b->keeps, atomic);

   return atomic;
}

/* src[] = { deref, coord, sample_index }. const_index[] = {} */
static void
emit_intrinsic_load_image(struct ir3_context *ctx, nir_intrinsic_instr *intr,
      struct ir3_instruction **dst)
{
   struct ir3_block *b = ctx->block;
   struct ir3_instruction *ldib;
   struct ir3_instruction * const *coords = ir3_get_src(ctx, &intr->src[1]);
   unsigned ncoords = ir3_get_image_coords(intr, NULL);

   ldib = ir3_LDIB(b, image_idx(ctx, intr->src[0]), 0,
         ir3_create_collect(ctx, coords, ncoords), 0);
   ldib->regs[0]->wrmask = MASK(intr->num_components);
   ldib->cat6.iim_val = intr->num_components;
   ldib->cat6.d = ncoords;
   ldib->cat6.type = ir3_get_type_for_image_intrinsic(intr);
   ldib->cat6.typed = true;
   ldib->barrier_class = IR3_BARRIER_IMAGE_R;
   ldib->barrier_conflict = IR3_BARRIER_IMAGE_W;
   handle_bindless_cat6(ldib, intr->src[0]);

   ir3_split_dest(b, dst, ldib, 0, intr->num_components);
}

/* src[] = { deref, coord, sample_index, value }. const_index[] = {} */
static void
emit_intrinsic_store_image(struct ir3_context *ctx, nir_intrinsic_instr *intr)
{
   struct ir3_block *b = ctx->block;
   struct ir3_instruction *stib;
   struct ir3_instruction * const *value = ir3_get_src(ctx, &intr->src[3]);
   struct ir3_instruction * const *coords = ir3_get_src(ctx, &intr->src[1]);
   unsigned ncoords = ir3_get_image_coords(intr, NULL);
   enum pipe_format format = nir_intrinsic_format(intr);
   unsigned ncomp = ir3_get_num_components_for_image_format(format);

   /* src0 is coords, src1 is value:
    */
   stib = ir3_STIB(b, image_idx(ctx, intr->src[0]), 0,
         ir3_create_collect(ctx, coords, ncoords), 0,
         ir3_create_collect(ctx, value, ncomp), 0);
   stib->cat6.iim_val = ncomp;
   stib->cat6.d = ncoords;
   stib->cat6.type = ir3_get_type_for_image_intrinsic(intr);
   stib->cat6.typed = true;
   stib->barrier_class = IR3_BARRIER_IMAGE_W;
   stib->barrier_conflict = IR3_BARRIER_IMAGE_R | IR3_BARRIER_IMAGE_W;
   handle_bindless_cat6(stib, intr->src[0]);

   array_insert(b, b->keeps, stib);
}

/* src[] = { deref, coord, sample_index, value, compare }. const_index[] = {} */
static struct ir3_instruction *
emit_intrinsic_atomic_image(struct ir3_context *ctx, nir_intrinsic_instr *intr)
{
   struct ir3_block *b = ctx->block;
   struct ir3_instruction *atomic, *ibo, *src0, *src1, *dummy;
   struct ir3_instruction * const *coords = ir3_get_src(ctx, &intr->src[1]);
   struct ir3_instruction *value = ir3_get_src(ctx, &intr->src[3])[0];
   unsigned ncoords = ir3_get_image_coords(intr, NULL);

   ibo = image_idx(ctx, intr->src[0]);

   /* So this gets a bit creative:
    *
    *    src0    - vecN offset/coords
    *    src1.x  - is actually destination register
    *    src1.y  - is 'value' except for cmpxchg where it is 'compare'
    *    src1.z  - is 'value' for cmpxchg
    *
    * Combining src and dest kinda doesn't work out so well with how
    * scheduling and RA work. So for now we create a dummy src1.x, and
    * then in a later fixup path, insert an extra MOV out of src1.x.
    * See ir3_a6xx_fixup_atomic_dests().
    */
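   /* e.g. for a 2D image, src0 below is collect(x, y) built from the coord
    * source, while src1 uses the same dummy/value layout as the SSBO case.
    */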
   dummy = create_immed(b, 0);
   src0 = ir3_create_collect(ctx, coords, ncoords);

   if (intr->intrinsic == nir_intrinsic_image_atomic_comp_swap ||
         intr->intrinsic == nir_intrinsic_bindless_image_atomic_comp_swap) {
      struct ir3_instruction *compare = ir3_get_src(ctx, &intr->src[4])[0];
      src1 = ir3_create_collect(ctx, (struct ir3_instruction*[]){
         dummy, compare, value
      }, 3);
   } else {
      src1 = ir3_create_collect(ctx, (struct ir3_instruction*[]){
         dummy, value
      }, 2);
   }

   switch (intr->intrinsic) {
   case nir_intrinsic_image_atomic_add:
   case nir_intrinsic_bindless_image_atomic_add:
      atomic = ir3_ATOMIC_ADD_G(b, ibo, 0, src0, 0, src1, 0);
      break;
   case nir_intrinsic_image_atomic_imin:
   case nir_intrinsic_image_atomic_umin:
   case nir_intrinsic_bindless_image_atomic_imin:
   case nir_intrinsic_bindless_image_atomic_umin:
      atomic = ir3_ATOMIC_MIN_G(b, ibo, 0, src0, 0, src1, 0);
      break;
   case nir_intrinsic_image_atomic_imax:
   case nir_intrinsic_image_atomic_umax:
   case nir_intrinsic_bindless_image_atomic_imax:
   case nir_intrinsic_bindless_image_atomic_umax:
      atomic = ir3_ATOMIC_MAX_G(b, ibo, 0, src0, 0, src1, 0);
      break;
   case nir_intrinsic_image_atomic_and:
   case nir_intrinsic_bindless_image_atomic_and:
      atomic = ir3_ATOMIC_AND_G(b, ibo, 0, src0, 0, src1, 0);
      break;
   case nir_intrinsic_image_atomic_or:
   case nir_intrinsic_bindless_image_atomic_or:
      atomic = ir3_ATOMIC_OR_G(b, ibo, 0, src0, 0, src1, 0);
      break;
   case nir_intrinsic_image_atomic_xor:
   case nir_intrinsic_bindless_image_atomic_xor:
      atomic = ir3_ATOMIC_XOR_G(b, ibo, 0, src0, 0, src1, 0);
      break;
   case nir_intrinsic_image_atomic_exchange:
   case nir_intrinsic_bindless_image_atomic_exchange:
      atomic = ir3_ATOMIC_XCHG_G(b, ibo, 0, src0, 0, src1, 0);
      break;
   case nir_intrinsic_image_atomic_comp_swap:
   case nir_intrinsic_bindless_image_atomic_comp_swap:
      atomic = ir3_ATOMIC_CMPXCHG_G(b, ibo, 0, src0, 0, src1, 0);
      break;
   default:
      unreachable("boo");
   }

   atomic->cat6.iim_val = 1;
   atomic->cat6.d = ncoords;
   atomic->cat6.type = ir3_get_type_for_image_intrinsic(intr);
   atomic->cat6.typed = true;
   atomic->barrier_class = IR3_BARRIER_IMAGE_W;
   atomic->barrier_conflict = IR3_BARRIER_IMAGE_R | IR3_BARRIER_IMAGE_W;
   handle_bindless_cat6(atomic, intr->src[0]);

   /* even if nothing consumes the result, we can't DCE the instruction: */
   array_insert(b, b->keeps, atomic);

   return atomic;
}

const struct ir3_context_funcs ir3_a6xx_funcs = {
   .emit_intrinsic_load_ssbo = emit_intrinsic_load_ssbo,
   .emit_intrinsic_store_ssbo = emit_intrinsic_store_ssbo,
   .emit_intrinsic_atomic_ssbo = emit_intrinsic_atomic_ssbo,
   .emit_intrinsic_load_image = emit_intrinsic_load_image,
   .emit_intrinsic_store_image = emit_intrinsic_store_image,
   .emit_intrinsic_atomic_image = emit_intrinsic_atomic_image,
};

/*
 * Special pass to run after instruction scheduling to insert an
 * extra mov from src1.x to dst. This way the other compiler passes
 * can ignore this quirk of the new instruction encoding.
 *
 * This should run after RA.
 */
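
/* Illustration (hypothetical register numbers): if the atomic's nominal dst
 * was assigned r0.x and the dummy standing in for src1.x landed in r0.y, the
 * pass below inserts a (sy)-flagged mov.u32u32 from r0.y to r0.x right after
 * the atomic, and repoints consumers at that mov.
 */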

static struct ir3_instruction *
get_atomic_dest_mov(struct ir3_instruction *atomic)
{
   struct ir3_instruction *mov;

   /* if we've already created the mov-out, then re-use it: */
   if (atomic->data)
      return atomic->data;

   /* We are already out of SSA here, so we can't use the nice builders: */
   mov = ir3_instr_create(atomic->block, OPC_MOV);
   ir3_reg_create(mov, 0, 0);    /* dst */
   ir3_reg_create(mov, 0, 0);    /* src */

   mov->cat1.src_type = TYPE_U32;
   mov->cat1.dst_type = TYPE_U32;

   /* extract back out the 'dummy' which serves as stand-in for dest: */
   struct ir3_instruction *src = atomic->regs[3]->instr;
   debug_assert(src->opc == OPC_META_COLLECT);

   *mov->regs[0] = *atomic->regs[0];
   *mov->regs[1] = *src->regs[1]->instr->regs[0];

   mov->flags |= IR3_INSTR_SY;

   /* it will have already been appended to the end of the block, which
    * isn't where we want it, so fix-up the location:
    */
   list_delinit(&mov->node);
   list_add(&mov->node, &atomic->node);

   return atomic->data = mov;
}

bool
ir3_a6xx_fixup_atomic_dests(struct ir3 *ir, struct ir3_shader_variant *so)
{
   bool progress = false;

   if (ir3_shader_nibo(so) == 0)
      return false;

   foreach_block (block, &ir->block_list) {
      foreach_instr (instr, &block->instr_list) {
         instr->data = NULL;
      }
   }

   foreach_block (block, &ir->block_list) {
      foreach_instr_safe (instr, &block->instr_list) {
         struct ir3_register *reg;

         foreach_src (reg, instr) {
            struct ir3_instruction *src = reg->instr;

            if (!src)
               continue;

            if (is_atomic(src->opc) && (src->flags & IR3_INSTR_G)) {
               reg->instr = get_atomic_dest_mov(src);
               progress = true;
            }
         }
      }
   }

   /* we also need to fixup shader outputs: */
   struct ir3_instruction *out;
   foreach_output_n (out, n, ir) {
      if (is_atomic(out->opc) && (out->flags & IR3_INSTR_G)) {
         ir->outputs[n] = get_atomic_dest_mov(out);
         progress = true;
      }
   }

   return progress;
}