freedreno/ir3: make input/output iterators declare cursor ptr
[mesa.git] / src / freedreno / ir3 / ir3_a6xx.c
/*
 * Copyright (C) 2017-2018 Rob Clark <robclark@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */

#define GPU 600

#include "ir3_context.h"
#include "ir3_image.h"

/*
 * Handlers for instructions changed/added in a6xx:
 *
 * Starting with a6xx, isam and stib are used for SSBOs as well; stib and the
 * atomic instructions (used for both SSBO and image) use a new instruction
 * encoding compared to a4xx/a5xx.
 */

static void
handle_bindless_cat6(struct ir3_instruction *instr, nir_src rsrc)
{
	nir_intrinsic_instr *intrin = ir3_bindless_resource(rsrc);
	if (!intrin)
		return;

	instr->flags |= IR3_INSTR_B;
	instr->cat6.base = nir_intrinsic_desc_set(intrin);
}

static struct ir3_instruction *
ssbo_idx(struct ir3_context *ctx, nir_src src)
{
	if (ir3_bindless_resource(src)) {
		ctx->so->bindless_ibo = true;
		return ir3_get_src(ctx, &src)[0];
	} else {
		/* can this be non-const buffer_index? how do we handle that? */
		int ibo_idx = ir3_ssbo_to_ibo(ctx->so->shader, nir_src_as_uint(src));
		return create_immed(ctx->block, ibo_idx);
	}
}

static struct ir3_instruction *
image_idx(struct ir3_context *ctx, nir_src src)
{
	if (ir3_bindless_resource(src)) {
		ctx->so->bindless_ibo = true;
		return ir3_get_src(ctx, &src)[0];
	} else {
		/* can this be non-const buffer_index? how do we handle that? */
		int ibo_idx = ir3_image_to_ibo(ctx->so->shader, nir_src_as_uint(src));
		return create_immed(ctx->block, ibo_idx);
	}
}

/* src[] = { buffer_index, offset }. No const_index */
static void
emit_intrinsic_load_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr,
		struct ir3_instruction **dst)
{
	struct ir3_block *b = ctx->block;
	struct ir3_instruction *offset;
	struct ir3_instruction *ldib;

	offset = ir3_get_src(ctx, &intr->src[2])[0];

	ldib = ir3_LDIB(b, ssbo_idx(ctx, intr->src[0]), 0, offset, 0);
	ldib->regs[0]->wrmask = MASK(intr->num_components);
	ldib->cat6.iim_val = intr->num_components;
	ldib->cat6.d = 1;
	ldib->cat6.type = intr->dest.ssa.bit_size == 16 ? TYPE_U16 : TYPE_U32;
	ldib->barrier_class = IR3_BARRIER_BUFFER_R;
	ldib->barrier_conflict = IR3_BARRIER_BUFFER_W;
	handle_bindless_cat6(ldib, intr->src[0]);

	ir3_split_dest(b, dst, ldib, 0, intr->num_components);
}

/* src[] = { value, block_index, offset }. const_index[] = { write_mask } */
static void
emit_intrinsic_store_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr)
{
	struct ir3_block *b = ctx->block;
	struct ir3_instruction *stib, *val, *offset;
	unsigned wrmask = nir_intrinsic_write_mask(intr);
	unsigned ncomp = ffs(~wrmask) - 1;

	assert(wrmask == BITFIELD_MASK(intr->num_components));

	/* src0 is offset, src1 is value:
	 */
	val = ir3_create_collect(ctx, ir3_get_src(ctx, &intr->src[0]), ncomp);
	offset = ir3_get_src(ctx, &intr->src[3])[0];

	stib = ir3_STIB(b, ssbo_idx(ctx, intr->src[1]), 0, offset, 0, val, 0);
	stib->cat6.iim_val = ncomp;
	stib->cat6.d = 1;
	stib->cat6.type = intr->src[0].ssa->bit_size == 16 ? TYPE_U16 : TYPE_U32;
	stib->barrier_class = IR3_BARRIER_BUFFER_W;
	stib->barrier_conflict = IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;
	handle_bindless_cat6(stib, intr->src[1]);

	array_insert(b, b->keeps, stib);
}

/*
 * SSBO atomic intrinsics
 *
 * All of the SSBO atomic memory operations read a value from memory,
 * compute a new value using one of the operations below, write the new
 * value to memory, and return the original value read.
 *
 * All operations take 3 sources except CompSwap that takes 4. These
 * sources represent:
 *
 * 0: The SSBO buffer index.
 * 1: The offset into the SSBO buffer of the variable that the atomic
 *    operation will operate on.
 * 2: The data parameter to the atomic function (i.e. the value to add
 *    in ssbo_atomic_add, etc).
 * 3: For CompSwap only: the second data parameter.
 */
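/* Note: the numbering above describes the generic SSBO-atomic source
 * layout.  As consumed by the function below, the *_ir3 intrinsic
 * variants have the offset split out: src[0] is the buffer index,
 * src[2] is the data value, and src[3] is the offset -- except for
 * comp_swap, where src[3] is the compare value and src[4] is the
 * offset.
 */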
static struct ir3_instruction *
emit_intrinsic_atomic_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr)
{
	struct ir3_block *b = ctx->block;
	struct ir3_instruction *atomic, *ibo, *src0, *src1, *data, *dummy;
	type_t type = TYPE_U32;

	ibo = ssbo_idx(ctx, intr->src[0]);

	data = ir3_get_src(ctx, &intr->src[2])[0];

	/* So this gets a bit creative:
	 *
	 *    src0    - vecN offset/coords
	 *    src1.x  - is actually destination register
	 *    src1.y  - is 'data' except for cmpxchg where src1.y is 'compare'
	 *    src1.z  - is 'data' for cmpxchg
	 *
	 * Combining src and dest doesn't work out so well with how scheduling
	 * and RA work.  So for now we create a dummy src1.x, and then in a
	 * later fixup path, insert an extra MOV out of src1.x.
	 * See ir3_a6xx_fixup_atomic_dests().
	 *
	 * Note that nir already multiplies the offset by four
	 */
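	/* Illustrative sketch of the scheme above (hypothetical operands, not
	 * actual compiler output): for a plain atomic add we emit roughly
	 *
	 *     src1 = collect(dummy, data)
	 *     atomic.add.g dst, ibo, src0, src1
	 *
	 * and ir3_a6xx_fixup_atomic_dests() later redirects consumers of dst
	 * to a "mov (sy)" reading back the register that held src1.x, since
	 * that is where the hardware actually writes the result.
	 */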
	dummy = create_immed(b, 0);

	if (intr->intrinsic == nir_intrinsic_ssbo_atomic_comp_swap_ir3) {
		src0 = ir3_get_src(ctx, &intr->src[4])[0];
		struct ir3_instruction *compare = ir3_get_src(ctx, &intr->src[3])[0];
		src1 = ir3_create_collect(ctx, (struct ir3_instruction*[]){
			dummy, compare, data
		}, 3);
	} else {
		src0 = ir3_get_src(ctx, &intr->src[3])[0];
		src1 = ir3_create_collect(ctx, (struct ir3_instruction*[]){
			dummy, data
		}, 2);
	}

	switch (intr->intrinsic) {
	case nir_intrinsic_ssbo_atomic_add_ir3:
		atomic = ir3_ATOMIC_ADD_G(b, ibo, 0, src0, 0, src1, 0);
		break;
	case nir_intrinsic_ssbo_atomic_imin_ir3:
		atomic = ir3_ATOMIC_MIN_G(b, ibo, 0, src0, 0, src1, 0);
		type = TYPE_S32;
		break;
	case nir_intrinsic_ssbo_atomic_umin_ir3:
		atomic = ir3_ATOMIC_MIN_G(b, ibo, 0, src0, 0, src1, 0);
		break;
	case nir_intrinsic_ssbo_atomic_imax_ir3:
		atomic = ir3_ATOMIC_MAX_G(b, ibo, 0, src0, 0, src1, 0);
		type = TYPE_S32;
		break;
	case nir_intrinsic_ssbo_atomic_umax_ir3:
		atomic = ir3_ATOMIC_MAX_G(b, ibo, 0, src0, 0, src1, 0);
		break;
	case nir_intrinsic_ssbo_atomic_and_ir3:
		atomic = ir3_ATOMIC_AND_G(b, ibo, 0, src0, 0, src1, 0);
		break;
	case nir_intrinsic_ssbo_atomic_or_ir3:
		atomic = ir3_ATOMIC_OR_G(b, ibo, 0, src0, 0, src1, 0);
		break;
	case nir_intrinsic_ssbo_atomic_xor_ir3:
		atomic = ir3_ATOMIC_XOR_G(b, ibo, 0, src0, 0, src1, 0);
		break;
	case nir_intrinsic_ssbo_atomic_exchange_ir3:
		atomic = ir3_ATOMIC_XCHG_G(b, ibo, 0, src0, 0, src1, 0);
		break;
	case nir_intrinsic_ssbo_atomic_comp_swap_ir3:
		atomic = ir3_ATOMIC_CMPXCHG_G(b, ibo, 0, src0, 0, src1, 0);
		break;
	default:
		unreachable("boo");
	}

	atomic->cat6.iim_val = 1;
	atomic->cat6.d = 1;
	atomic->cat6.type = type;
	atomic->barrier_class = IR3_BARRIER_BUFFER_W;
	atomic->barrier_conflict = IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;
	handle_bindless_cat6(atomic, intr->src[0]);

	/* even if nothing consumes the result, we can't DCE the instruction: */
	array_insert(b, b->keeps, atomic);

	return atomic;
}

/* src[] = { deref, coord, sample_index }. const_index[] = {} */
static void
emit_intrinsic_load_image(struct ir3_context *ctx, nir_intrinsic_instr *intr,
		struct ir3_instruction **dst)
{
	struct ir3_block *b = ctx->block;
	struct ir3_instruction *ldib;
	struct ir3_instruction * const *coords = ir3_get_src(ctx, &intr->src[1]);
	unsigned ncoords = ir3_get_image_coords(intr, NULL);

	ldib = ir3_LDIB(b, image_idx(ctx, intr->src[0]), 0,
			ir3_create_collect(ctx, coords, ncoords), 0);
	ldib->regs[0]->wrmask = MASK(intr->num_components);
	ldib->cat6.iim_val = intr->num_components;
	ldib->cat6.d = ncoords;
	ldib->cat6.type = ir3_get_type_for_image_intrinsic(intr);
	ldib->cat6.typed = true;
	ldib->barrier_class = IR3_BARRIER_IMAGE_R;
	ldib->barrier_conflict = IR3_BARRIER_IMAGE_W;
	handle_bindless_cat6(ldib, intr->src[0]);

	ir3_split_dest(b, dst, ldib, 0, intr->num_components);
}

/* src[] = { deref, coord, sample_index, value }. const_index[] = {} */
static void
emit_intrinsic_store_image(struct ir3_context *ctx, nir_intrinsic_instr *intr)
{
	struct ir3_block *b = ctx->block;
	struct ir3_instruction *stib;
	struct ir3_instruction * const *value = ir3_get_src(ctx, &intr->src[3]);
	struct ir3_instruction * const *coords = ir3_get_src(ctx, &intr->src[1]);
	unsigned ncoords = ir3_get_image_coords(intr, NULL);
	enum pipe_format format = nir_intrinsic_format(intr);
	unsigned ncomp = ir3_get_num_components_for_image_format(format);

	/* src0 is offset, src1 is value:
	 */
	stib = ir3_STIB(b, image_idx(ctx, intr->src[0]), 0,
			ir3_create_collect(ctx, coords, ncoords), 0,
			ir3_create_collect(ctx, value, ncomp), 0);
	stib->cat6.iim_val = ncomp;
	stib->cat6.d = ncoords;
	stib->cat6.type = ir3_get_type_for_image_intrinsic(intr);
	stib->cat6.typed = true;
	stib->barrier_class = IR3_BARRIER_IMAGE_W;
	stib->barrier_conflict = IR3_BARRIER_IMAGE_R | IR3_BARRIER_IMAGE_W;
	handle_bindless_cat6(stib, intr->src[0]);

	array_insert(b, b->keeps, stib);
}

/* src[] = { deref, coord, sample_index, value, compare }. const_index[] = {} */
static struct ir3_instruction *
emit_intrinsic_atomic_image(struct ir3_context *ctx, nir_intrinsic_instr *intr)
{
	struct ir3_block *b = ctx->block;
	struct ir3_instruction *atomic, *ibo, *src0, *src1, *dummy;
	struct ir3_instruction * const *coords = ir3_get_src(ctx, &intr->src[1]);
	struct ir3_instruction *value = ir3_get_src(ctx, &intr->src[3])[0];
	unsigned ncoords = ir3_get_image_coords(intr, NULL);

	ibo = image_idx(ctx, intr->src[0]);

	/* So this gets a bit creative:
	 *
	 *    src0    - vecN offset/coords
	 *    src1.x  - is actually destination register
	 *    src1.y  - is 'value' except for cmpxchg where src1.y is 'compare'
	 *    src1.z  - is 'value' for cmpxchg
	 *
	 * Combining src and dest doesn't work out so well with how scheduling
	 * and RA work.  So for now we create a dummy src1.x, and then in a
	 * later fixup path, insert an extra MOV out of src1.x.
	 * See ir3_a6xx_fixup_atomic_dests().
	 */
	dummy = create_immed(b, 0);
	src0 = ir3_create_collect(ctx, coords, ncoords);

	if (intr->intrinsic == nir_intrinsic_image_atomic_comp_swap ||
			intr->intrinsic == nir_intrinsic_bindless_image_atomic_comp_swap) {
		struct ir3_instruction *compare = ir3_get_src(ctx, &intr->src[4])[0];
		src1 = ir3_create_collect(ctx, (struct ir3_instruction*[]){
			dummy, compare, value
		}, 3);
	} else {
		src1 = ir3_create_collect(ctx, (struct ir3_instruction*[]){
			dummy, value
		}, 2);
	}

	switch (intr->intrinsic) {
	case nir_intrinsic_image_atomic_add:
	case nir_intrinsic_bindless_image_atomic_add:
		atomic = ir3_ATOMIC_ADD_G(b, ibo, 0, src0, 0, src1, 0);
		break;
	case nir_intrinsic_image_atomic_imin:
	case nir_intrinsic_image_atomic_umin:
	case nir_intrinsic_bindless_image_atomic_imin:
	case nir_intrinsic_bindless_image_atomic_umin:
		atomic = ir3_ATOMIC_MIN_G(b, ibo, 0, src0, 0, src1, 0);
		break;
	case nir_intrinsic_image_atomic_imax:
	case nir_intrinsic_image_atomic_umax:
	case nir_intrinsic_bindless_image_atomic_imax:
	case nir_intrinsic_bindless_image_atomic_umax:
		atomic = ir3_ATOMIC_MAX_G(b, ibo, 0, src0, 0, src1, 0);
		break;
	case nir_intrinsic_image_atomic_and:
	case nir_intrinsic_bindless_image_atomic_and:
		atomic = ir3_ATOMIC_AND_G(b, ibo, 0, src0, 0, src1, 0);
		break;
	case nir_intrinsic_image_atomic_or:
	case nir_intrinsic_bindless_image_atomic_or:
		atomic = ir3_ATOMIC_OR_G(b, ibo, 0, src0, 0, src1, 0);
		break;
	case nir_intrinsic_image_atomic_xor:
	case nir_intrinsic_bindless_image_atomic_xor:
		atomic = ir3_ATOMIC_XOR_G(b, ibo, 0, src0, 0, src1, 0);
		break;
	case nir_intrinsic_image_atomic_exchange:
	case nir_intrinsic_bindless_image_atomic_exchange:
		atomic = ir3_ATOMIC_XCHG_G(b, ibo, 0, src0, 0, src1, 0);
		break;
	case nir_intrinsic_image_atomic_comp_swap:
	case nir_intrinsic_bindless_image_atomic_comp_swap:
		atomic = ir3_ATOMIC_CMPXCHG_G(b, ibo, 0, src0, 0, src1, 0);
		break;
	default:
		unreachable("boo");
	}

	atomic->cat6.iim_val = 1;
	atomic->cat6.d = ncoords;
	atomic->cat6.type = ir3_get_type_for_image_intrinsic(intr);
	atomic->cat6.typed = true;
	atomic->barrier_class = IR3_BARRIER_IMAGE_W;
	atomic->barrier_conflict = IR3_BARRIER_IMAGE_R | IR3_BARRIER_IMAGE_W;
	handle_bindless_cat6(atomic, intr->src[0]);

	/* even if nothing consumes the result, we can't DCE the instruction: */
	array_insert(b, b->keeps, atomic);

	return atomic;
}

const struct ir3_context_funcs ir3_a6xx_funcs = {
		.emit_intrinsic_load_ssbo = emit_intrinsic_load_ssbo,
		.emit_intrinsic_store_ssbo = emit_intrinsic_store_ssbo,
		.emit_intrinsic_atomic_ssbo = emit_intrinsic_atomic_ssbo,
		.emit_intrinsic_load_image = emit_intrinsic_load_image,
		.emit_intrinsic_store_image = emit_intrinsic_store_image,
		.emit_intrinsic_atomic_image = emit_intrinsic_atomic_image,
};

/*
 * Special pass to run after instruction scheduling to insert an
 * extra mov from src1.x to dst.  This way the other compiler passes
 * can ignore this quirk of the new instruction encoding.
 *
 * This should run after RA.
 */

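/* Rough sketch of the transform (hypothetical register names, not real
 * compiler output): given
 *
 *     atomic.add.g  r0.x, ibo, offset, {r1.x (dummy), r1.y (data)}
 *     add.u         r2.x, r0.x, ...
 *
 * the pass rewrites the consumer to instead read from a new "mov (sy)"
 * whose source is the register that held the dummy src1.x, since that
 * is the register the hardware actually writes the result to.
 */
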
static struct ir3_instruction *
get_atomic_dest_mov(struct ir3_instruction *atomic)
{
	struct ir3_instruction *mov;

	/* if we've already created the mov-out, then re-use it: */
	if (atomic->data)
		return atomic->data;

	/* We are already out of SSA here, so we can't use the nice builders: */
	mov = ir3_instr_create(atomic->block, OPC_MOV);
	ir3_reg_create(mov, 0, 0);    /* dst */
	ir3_reg_create(mov, 0, 0);    /* src */

	mov->cat1.src_type = TYPE_U32;
	mov->cat1.dst_type = TYPE_U32;

	/* extract back out the 'dummy' which serves as stand-in for dest: */
	struct ir3_instruction *src = atomic->regs[3]->instr;
	debug_assert(src->opc == OPC_META_COLLECT);

	*mov->regs[0] = *atomic->regs[0];
	*mov->regs[1] = *src->regs[1]->instr->regs[0];

	mov->flags |= IR3_INSTR_SY;

	/* it will have already been appended to the end of the block, which
	 * isn't where we want it, so fix-up the location:
	 */
	list_delinit(&mov->node);
	list_add(&mov->node, &atomic->node);

	return atomic->data = mov;
}

bool
ir3_a6xx_fixup_atomic_dests(struct ir3 *ir, struct ir3_shader_variant *so)
{
	bool progress = false;

	if (ir3_shader_nibo(so) == 0)
		return false;

	foreach_block (block, &ir->block_list) {
		foreach_instr (instr, &block->instr_list) {
			instr->data = NULL;
		}
	}

	foreach_block (block, &ir->block_list) {
		foreach_instr_safe (instr, &block->instr_list) {
			foreach_src (reg, instr) {
				struct ir3_instruction *src = reg->instr;

				if (!src)
					continue;

				if (is_atomic(src->opc) && (src->flags & IR3_INSTR_G)) {
					reg->instr = get_atomic_dest_mov(src);
					progress = true;
				}
			}
		}
	}

	/* we also need to fixup shader outputs: */
	foreach_output_n (out, n, ir) {
		if (is_atomic(out->opc) && (out->flags & IR3_INSTR_G)) {
			ir->outputs[n] = get_atomic_dest_mov(out);
			progress = true;
		}
	}

	return progress;
}