/*
 * Copyright (C) 2017-2018 Rob Clark <robclark@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */
29 #include "ir3_context.h"
30 #include "ir3_image.h"
/*
 * Handlers for instructions changed/added in a4xx:
 */
37 /* src[] = { buffer_index, offset }. No const_index */
39 emit_intrinsic_load_ssbo(struct ir3_context
*ctx
, nir_intrinsic_instr
*intr
,
40 struct ir3_instruction
**dst
)
42 struct ir3_block
*b
= ctx
->block
;
43 struct ir3_instruction
*ldgb
, *src0
, *src1
, *byte_offset
, *offset
;
45 /* can this be non-const buffer_index? how do we handle that? */
46 int ibo_idx
= ir3_ssbo_to_ibo(ctx
->so
->shader
, nir_src_as_uint(intr
->src
[0]));
48 byte_offset
= ir3_get_src(ctx
, &intr
->src
[1])[0];
49 offset
= ir3_get_src(ctx
, &intr
->src
[2])[0];
51 /* src0 is uvec2(offset*4, 0), src1 is offset.. nir already *= 4: */
52 src0
= ir3_create_collect(ctx
, (struct ir3_instruction
*[]){
58 ldgb
= ir3_LDGB(b
, create_immed(b
, ibo_idx
), 0,
60 ldgb
->regs
[0]->wrmask
= MASK(intr
->num_components
);
61 ldgb
->cat6
.iim_val
= intr
->num_components
;
63 ldgb
->cat6
.type
= TYPE_U32
;
64 ldgb
->barrier_class
= IR3_BARRIER_BUFFER_R
;
65 ldgb
->barrier_conflict
= IR3_BARRIER_BUFFER_W
;
67 ir3_split_dest(b
, dst
, ldgb
, 0, intr
->num_components
);
70 /* src[] = { value, block_index, offset }. const_index[] = { write_mask } */
72 emit_intrinsic_store_ssbo(struct ir3_context
*ctx
, nir_intrinsic_instr
*intr
)
74 struct ir3_block
*b
= ctx
->block
;
75 struct ir3_instruction
*stgb
, *src0
, *src1
, *src2
, *byte_offset
, *offset
;
76 /* TODO handle wrmask properly, see _store_shared().. but I think
77 * it is more a PITA than that, since blob ends up loading the
78 * masked components and writing them back out.
80 unsigned wrmask
= intr
->const_index
[0];
81 unsigned ncomp
= ffs(~wrmask
) - 1;
83 /* can this be non-const buffer_index? how do we handle that? */
84 int ibo_idx
= ir3_ssbo_to_ibo(ctx
->so
->shader
, nir_src_as_uint(intr
->src
[1]));
86 byte_offset
= ir3_get_src(ctx
, &intr
->src
[2])[0];
87 offset
= ir3_get_src(ctx
, &intr
->src
[3])[0];
89 /* src0 is value, src1 is offset, src2 is uvec2(offset*4, 0)..
92 src0
= ir3_create_collect(ctx
, ir3_get_src(ctx
, &intr
->src
[0]), ncomp
);
94 src2
= ir3_create_collect(ctx
, (struct ir3_instruction
*[]){
99 stgb
= ir3_STGB(b
, create_immed(b
, ibo_idx
), 0, src0
, 0, src1
, 0, src2
, 0);
100 stgb
->cat6
.iim_val
= ncomp
;
102 stgb
->cat6
.type
= TYPE_U32
;
103 stgb
->barrier_class
= IR3_BARRIER_BUFFER_W
;
104 stgb
->barrier_conflict
= IR3_BARRIER_BUFFER_R
| IR3_BARRIER_BUFFER_W
;
106 array_insert(b
, b
->keeps
, stgb
);
110 * SSBO atomic intrinsics
112 * All of the SSBO atomic memory operations read a value from memory,
113 * compute a new value using one of the operations below, write the new
114 * value to memory, and return the original value read.
116 * All operations take 3 sources except CompSwap that takes 4. These
119 * 0: The SSBO buffer index.
120 * 1: The offset into the SSBO buffer of the variable that the atomic
121 * operation will operate on.
122 * 2: The data parameter to the atomic function (i.e. the value to add
123 * in ssbo_atomic_add, etc).
124 * 3: For CompSwap only: the second data parameter.
126 static struct ir3_instruction
*
127 emit_intrinsic_atomic_ssbo(struct ir3_context
*ctx
, nir_intrinsic_instr
*intr
)
129 struct ir3_block
*b
= ctx
->block
;
130 struct ir3_instruction
*atomic
, *ssbo
, *src0
, *src1
, *src2
, *byte_offset
,
132 type_t type
= TYPE_U32
;
134 /* can this be non-const buffer_index? how do we handle that? */
135 int ibo_idx
= ir3_ssbo_to_ibo(ctx
->so
->shader
, nir_src_as_uint(intr
->src
[0]));
136 ssbo
= create_immed(b
, ibo_idx
);
138 byte_offset
= ir3_get_src(ctx
, &intr
->src
[1])[0];
139 offset
= ir3_get_src(ctx
, &intr
->src
[3])[0];
141 /* src0 is data (or uvec2(data, compare))
143 * src2 is uvec2(offset*4, 0) (appears to be 64b byte offset)
145 * Note that nir already multiplies the offset by four
147 src0
= ir3_get_src(ctx
, &intr
->src
[2])[0];
149 src2
= ir3_create_collect(ctx
, (struct ir3_instruction
*[]){
154 switch (intr
->intrinsic
) {
155 case nir_intrinsic_ssbo_atomic_add_ir3
:
156 atomic
= ir3_ATOMIC_ADD_G(b
, ssbo
, 0, src0
, 0, src1
, 0, src2
, 0);
158 case nir_intrinsic_ssbo_atomic_imin_ir3
:
159 atomic
= ir3_ATOMIC_MIN_G(b
, ssbo
, 0, src0
, 0, src1
, 0, src2
, 0);
162 case nir_intrinsic_ssbo_atomic_umin_ir3
:
163 atomic
= ir3_ATOMIC_MIN_G(b
, ssbo
, 0, src0
, 0, src1
, 0, src2
, 0);
165 case nir_intrinsic_ssbo_atomic_imax_ir3
:
166 atomic
= ir3_ATOMIC_MAX_G(b
, ssbo
, 0, src0
, 0, src1
, 0, src2
, 0);
169 case nir_intrinsic_ssbo_atomic_umax_ir3
:
170 atomic
= ir3_ATOMIC_MAX_G(b
, ssbo
, 0, src0
, 0, src1
, 0, src2
, 0);
172 case nir_intrinsic_ssbo_atomic_and_ir3
:
173 atomic
= ir3_ATOMIC_AND_G(b
, ssbo
, 0, src0
, 0, src1
, 0, src2
, 0);
175 case nir_intrinsic_ssbo_atomic_or_ir3
:
176 atomic
= ir3_ATOMIC_OR_G(b
, ssbo
, 0, src0
, 0, src1
, 0, src2
, 0);
178 case nir_intrinsic_ssbo_atomic_xor_ir3
:
179 atomic
= ir3_ATOMIC_XOR_G(b
, ssbo
, 0, src0
, 0, src1
, 0, src2
, 0);
181 case nir_intrinsic_ssbo_atomic_exchange_ir3
:
182 atomic
= ir3_ATOMIC_XCHG_G(b
, ssbo
, 0, src0
, 0, src1
, 0, src2
, 0);
184 case nir_intrinsic_ssbo_atomic_comp_swap_ir3
:
185 /* for cmpxchg, src0 is [ui]vec2(data, compare): */
186 src0
= ir3_create_collect(ctx
, (struct ir3_instruction
*[]){
187 ir3_get_src(ctx
, &intr
->src
[3])[0],
190 src1
= ir3_get_src(ctx
, &intr
->src
[4])[0];
191 atomic
= ir3_ATOMIC_CMPXCHG_G(b
, ssbo
, 0, src0
, 0, src1
, 0, src2
, 0);
197 atomic
->cat6
.iim_val
= 1;
199 atomic
->cat6
.type
= type
;
200 atomic
->barrier_class
= IR3_BARRIER_BUFFER_W
;
201 atomic
->barrier_conflict
= IR3_BARRIER_BUFFER_R
| IR3_BARRIER_BUFFER_W
;
203 /* even if nothing consume the result, we can't DCE the instruction: */
204 array_insert(b
, b
->keeps
, atomic
);
209 static struct ir3_instruction
*
210 get_image_offset(struct ir3_context
*ctx
, const nir_intrinsic_instr
*instr
,
211 struct ir3_instruction
* const *coords
, bool byteoff
)
213 struct ir3_block
*b
= ctx
->block
;
214 struct ir3_instruction
*offset
;
215 unsigned index
= nir_src_as_uint(instr
->src
[0]);
216 unsigned ncoords
= ir3_get_image_coords(instr
, NULL
);
218 /* to calculate the byte offset (yes, uggg) we need (up to) three
219 * const values to know the bytes per pixel, and y and z stride:
221 struct ir3_const_state
*const_state
= &ctx
->so
->shader
->const_state
;
222 unsigned cb
= regid(const_state
->offsets
.image_dims
, 0) +
223 const_state
->image_dims
.off
[index
];
225 debug_assert(const_state
->image_dims
.mask
& (1 << index
));
227 /* offset = coords.x * bytes_per_pixel: */
228 offset
= ir3_MUL_S24(b
, coords
[0], 0, create_uniform(b
, cb
+ 0), 0);
230 /* offset += coords.y * y_pitch: */
231 offset
= ir3_MAD_S24(b
, create_uniform(b
, cb
+ 1), 0,
232 coords
[1], 0, offset
, 0);
235 /* offset += coords.z * z_pitch: */
236 offset
= ir3_MAD_S24(b
, create_uniform(b
, cb
+ 2), 0,
237 coords
[2], 0, offset
, 0);
241 /* Some cases, like atomics, seem to use dword offset instead
242 * of byte offsets.. blob just puts an extra shr.b in there
245 offset
= ir3_SHR_B(b
, offset
, 0, create_immed(b
, 2), 0);
248 return ir3_create_collect(ctx
, (struct ir3_instruction
*[]){
254 /* src[] = { index, coord, sample_index, value }. const_index[] = {} */
256 emit_intrinsic_store_image(struct ir3_context
*ctx
, nir_intrinsic_instr
*intr
)
258 struct ir3_block
*b
= ctx
->block
;
259 struct ir3_instruction
*stib
, *offset
;
260 struct ir3_instruction
* const *value
= ir3_get_src(ctx
, &intr
->src
[3]);
261 struct ir3_instruction
* const *coords
= ir3_get_src(ctx
, &intr
->src
[1]);
262 unsigned ncoords
= ir3_get_image_coords(intr
, NULL
);
263 unsigned slot
= nir_src_as_uint(intr
->src
[0]);
264 unsigned ibo_idx
= ir3_image_to_ibo(ctx
->so
->shader
, slot
);
265 unsigned ncomp
= ir3_get_num_components_for_image_format(nir_intrinsic_format(intr
));
269 * src2 is 64b byte offset
272 offset
= get_image_offset(ctx
, intr
, coords
, true);
274 /* NOTE: stib seems to take byte offset, but stgb.typed can be used
275 * too and takes a dword offset.. not quite sure yet why blob uses
276 * one over the other in various cases.
279 stib
= ir3_STIB(b
, create_immed(b
, ibo_idx
), 0,
280 ir3_create_collect(ctx
, value
, ncomp
), 0,
281 ir3_create_collect(ctx
, coords
, ncoords
), 0,
283 stib
->cat6
.iim_val
= ncomp
;
284 stib
->cat6
.d
= ncoords
;
285 stib
->cat6
.type
= ir3_get_type_for_image_intrinsic(intr
);
286 stib
->cat6
.typed
= true;
287 stib
->barrier_class
= IR3_BARRIER_IMAGE_W
;
288 stib
->barrier_conflict
= IR3_BARRIER_IMAGE_R
| IR3_BARRIER_IMAGE_W
;
290 array_insert(b
, b
->keeps
, stib
);
293 /* src[] = { deref, coord, sample_index, value, compare }. const_index[] = {} */
294 static struct ir3_instruction
*
295 emit_intrinsic_atomic_image(struct ir3_context
*ctx
, nir_intrinsic_instr
*intr
)
297 struct ir3_block
*b
= ctx
->block
;
298 struct ir3_instruction
*atomic
, *image
, *src0
, *src1
, *src2
;
299 struct ir3_instruction
* const *coords
= ir3_get_src(ctx
, &intr
->src
[1]);
300 unsigned ncoords
= ir3_get_image_coords(intr
, NULL
);
301 unsigned slot
= nir_src_as_uint(intr
->src
[0]);
302 unsigned ibo_idx
= ir3_image_to_ibo(ctx
->so
->shader
, slot
);
304 image
= create_immed(b
, ibo_idx
);
306 /* src0 is value (or uvec2(value, compare))
308 * src2 is 64b byte offset
310 src0
= ir3_get_src(ctx
, &intr
->src
[3])[0];
311 src1
= ir3_create_collect(ctx
, coords
, ncoords
);
312 src2
= get_image_offset(ctx
, intr
, coords
, false);
314 switch (intr
->intrinsic
) {
315 case nir_intrinsic_image_atomic_add
:
316 atomic
= ir3_ATOMIC_ADD_G(b
, image
, 0, src0
, 0, src1
, 0, src2
, 0);
318 case nir_intrinsic_image_atomic_imin
:
319 case nir_intrinsic_image_atomic_umin
:
320 atomic
= ir3_ATOMIC_MIN_G(b
, image
, 0, src0
, 0, src1
, 0, src2
, 0);
322 case nir_intrinsic_image_atomic_imax
:
323 case nir_intrinsic_image_atomic_umax
:
324 atomic
= ir3_ATOMIC_MAX_G(b
, image
, 0, src0
, 0, src1
, 0, src2
, 0);
326 case nir_intrinsic_image_atomic_and
:
327 atomic
= ir3_ATOMIC_AND_G(b
, image
, 0, src0
, 0, src1
, 0, src2
, 0);
329 case nir_intrinsic_image_atomic_or
:
330 atomic
= ir3_ATOMIC_OR_G(b
, image
, 0, src0
, 0, src1
, 0, src2
, 0);
332 case nir_intrinsic_image_atomic_xor
:
333 atomic
= ir3_ATOMIC_XOR_G(b
, image
, 0, src0
, 0, src1
, 0, src2
, 0);
335 case nir_intrinsic_image_atomic_exchange
:
336 atomic
= ir3_ATOMIC_XCHG_G(b
, image
, 0, src0
, 0, src1
, 0, src2
, 0);
338 case nir_intrinsic_image_atomic_comp_swap
:
339 /* for cmpxchg, src0 is [ui]vec2(data, compare): */
340 src0
= ir3_create_collect(ctx
, (struct ir3_instruction
*[]){
341 ir3_get_src(ctx
, &intr
->src
[4])[0],
344 atomic
= ir3_ATOMIC_CMPXCHG_G(b
, image
, 0, src0
, 0, src1
, 0, src2
, 0);
350 atomic
->cat6
.iim_val
= 1;
351 atomic
->cat6
.d
= ncoords
;
352 atomic
->cat6
.type
= ir3_get_type_for_image_intrinsic(intr
);
353 atomic
->cat6
.typed
= true;
354 atomic
->barrier_class
= IR3_BARRIER_IMAGE_W
;
355 atomic
->barrier_conflict
= IR3_BARRIER_IMAGE_R
| IR3_BARRIER_IMAGE_W
;
357 /* even if nothing consume the result, we can't DCE the instruction: */
358 array_insert(b
, b
->keeps
, atomic
);
363 const struct ir3_context_funcs ir3_a4xx_funcs
= {
364 .emit_intrinsic_load_ssbo
= emit_intrinsic_load_ssbo
,
365 .emit_intrinsic_store_ssbo
= emit_intrinsic_store_ssbo
,
366 .emit_intrinsic_atomic_ssbo
= emit_intrinsic_atomic_ssbo
,
367 .emit_intrinsic_store_image
= emit_intrinsic_store_image
,
368 .emit_intrinsic_atomic_image
= emit_intrinsic_atomic_image
,