/*
 * Copyright (C) 2017-2018 Rob Clark <robclark@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */
29 #include "ir3_context.h"
30 #include "ir3_image.h"
/*
 * Handlers for instructions changed/added in a4xx:
 */
37 /* src[] = { buffer_index, offset }. No const_index */
39 emit_intrinsic_load_ssbo(struct ir3_context
*ctx
, nir_intrinsic_instr
*intr
,
40 struct ir3_instruction
**dst
)
42 struct ir3_block
*b
= ctx
->block
;
43 struct ir3_instruction
*ldgb
, *src0
, *src1
, *byte_offset
, *offset
;
45 /* can this be non-const buffer_index? how do we handle that? */
46 int ibo_idx
= ir3_ssbo_to_ibo(&ctx
->so
->image_mapping
, nir_src_as_uint(intr
->src
[0]));
48 byte_offset
= ir3_get_src(ctx
, &intr
->src
[1])[0];
49 offset
= ir3_get_src(ctx
, &intr
->src
[2])[0];
51 /* src0 is uvec2(offset*4, 0), src1 is offset.. nir already *= 4: */
52 src0
= ir3_create_collect(ctx
, (struct ir3_instruction
*[]){
58 ldgb
= ir3_LDGB(b
, create_immed(b
, ibo_idx
), 0,
60 ldgb
->regs
[0]->wrmask
= MASK(intr
->num_components
);
61 ldgb
->cat6
.iim_val
= intr
->num_components
;
63 ldgb
->cat6
.type
= TYPE_U32
;
64 ldgb
->barrier_class
= IR3_BARRIER_BUFFER_R
;
65 ldgb
->barrier_conflict
= IR3_BARRIER_BUFFER_W
;
67 ir3_split_dest(b
, dst
, ldgb
, 0, intr
->num_components
);
70 /* src[] = { value, block_index, offset }. const_index[] = { write_mask } */
72 emit_intrinsic_store_ssbo(struct ir3_context
*ctx
, nir_intrinsic_instr
*intr
)
74 struct ir3_block
*b
= ctx
->block
;
75 struct ir3_instruction
*stgb
, *src0
, *src1
, *src2
, *byte_offset
, *offset
;
76 /* TODO handle wrmask properly, see _store_shared().. but I think
77 * it is more a PITA than that, since blob ends up loading the
78 * masked components and writing them back out.
80 unsigned wrmask
= intr
->const_index
[0];
81 unsigned ncomp
= ffs(~wrmask
) - 1;
83 /* can this be non-const buffer_index? how do we handle that? */
84 int ibo_idx
= ir3_ssbo_to_ibo(&ctx
->so
->image_mapping
, nir_src_as_uint(intr
->src
[1]));
86 byte_offset
= ir3_get_src(ctx
, &intr
->src
[2])[0];
87 offset
= ir3_get_src(ctx
, &intr
->src
[3])[0];
89 /* src0 is value, src1 is offset, src2 is uvec2(offset*4, 0)..
92 src0
= ir3_create_collect(ctx
, ir3_get_src(ctx
, &intr
->src
[0]), ncomp
);
94 src2
= ir3_create_collect(ctx
, (struct ir3_instruction
*[]){
99 stgb
= ir3_STGB(b
, create_immed(b
, ibo_idx
), 0, src0
, 0, src1
, 0, src2
, 0);
100 stgb
->cat6
.iim_val
= ncomp
;
102 stgb
->cat6
.type
= TYPE_U32
;
103 stgb
->barrier_class
= IR3_BARRIER_BUFFER_W
;
104 stgb
->barrier_conflict
= IR3_BARRIER_BUFFER_R
| IR3_BARRIER_BUFFER_W
;
106 array_insert(b
, b
->keeps
, stgb
);
110 * SSBO atomic intrinsics
112 * All of the SSBO atomic memory operations read a value from memory,
113 * compute a new value using one of the operations below, write the new
114 * value to memory, and return the original value read.
116 * All operations take 3 sources except CompSwap that takes 4. These
119 * 0: The SSBO buffer index.
120 * 1: The offset into the SSBO buffer of the variable that the atomic
121 * operation will operate on.
122 * 2: The data parameter to the atomic function (i.e. the value to add
123 * in ssbo_atomic_add, etc).
124 * 3: For CompSwap only: the second data parameter.
126 static struct ir3_instruction
*
127 emit_intrinsic_atomic_ssbo(struct ir3_context
*ctx
, nir_intrinsic_instr
*intr
)
129 struct ir3_block
*b
= ctx
->block
;
130 struct ir3_instruction
*atomic
, *ssbo
, *src0
, *src1
, *src2
, *byte_offset
,
132 type_t type
= TYPE_U32
;
134 /* can this be non-const buffer_index? how do we handle that? */
135 int ibo_idx
= ir3_ssbo_to_ibo(&ctx
->so
->image_mapping
, nir_src_as_uint(intr
->src
[0]));
136 ssbo
= create_immed(b
, ibo_idx
);
138 byte_offset
= ir3_get_src(ctx
, &intr
->src
[1])[0];
139 offset
= ir3_get_src(ctx
, &intr
->src
[3])[0];
141 /* src0 is data (or uvec2(data, compare))
143 * src2 is uvec2(offset*4, 0) (appears to be 64b byte offset)
145 * Note that nir already multiplies the offset by four
147 src0
= ir3_get_src(ctx
, &intr
->src
[2])[0];
149 src2
= ir3_create_collect(ctx
, (struct ir3_instruction
*[]){
154 switch (intr
->intrinsic
) {
155 case nir_intrinsic_ssbo_atomic_add_ir3
:
156 atomic
= ir3_ATOMIC_ADD_G(b
, ssbo
, 0, src0
, 0, src1
, 0, src2
, 0);
158 case nir_intrinsic_ssbo_atomic_imin_ir3
:
159 atomic
= ir3_ATOMIC_MIN_G(b
, ssbo
, 0, src0
, 0, src1
, 0, src2
, 0);
162 case nir_intrinsic_ssbo_atomic_umin_ir3
:
163 atomic
= ir3_ATOMIC_MIN_G(b
, ssbo
, 0, src0
, 0, src1
, 0, src2
, 0);
165 case nir_intrinsic_ssbo_atomic_imax_ir3
:
166 atomic
= ir3_ATOMIC_MAX_G(b
, ssbo
, 0, src0
, 0, src1
, 0, src2
, 0);
169 case nir_intrinsic_ssbo_atomic_umax_ir3
:
170 atomic
= ir3_ATOMIC_MAX_G(b
, ssbo
, 0, src0
, 0, src1
, 0, src2
, 0);
172 case nir_intrinsic_ssbo_atomic_and_ir3
:
173 atomic
= ir3_ATOMIC_AND_G(b
, ssbo
, 0, src0
, 0, src1
, 0, src2
, 0);
175 case nir_intrinsic_ssbo_atomic_or_ir3
:
176 atomic
= ir3_ATOMIC_OR_G(b
, ssbo
, 0, src0
, 0, src1
, 0, src2
, 0);
178 case nir_intrinsic_ssbo_atomic_xor_ir3
:
179 atomic
= ir3_ATOMIC_XOR_G(b
, ssbo
, 0, src0
, 0, src1
, 0, src2
, 0);
181 case nir_intrinsic_ssbo_atomic_exchange_ir3
:
182 atomic
= ir3_ATOMIC_XCHG_G(b
, ssbo
, 0, src0
, 0, src1
, 0, src2
, 0);
184 case nir_intrinsic_ssbo_atomic_comp_swap_ir3
:
185 /* for cmpxchg, src0 is [ui]vec2(data, compare): */
186 src0
= ir3_create_collect(ctx
, (struct ir3_instruction
*[]){
187 ir3_get_src(ctx
, &intr
->src
[3])[0],
190 src1
= ir3_get_src(ctx
, &intr
->src
[4])[0];
191 atomic
= ir3_ATOMIC_CMPXCHG_G(b
, ssbo
, 0, src0
, 0, src1
, 0, src2
, 0);
197 atomic
->cat6
.iim_val
= 1;
199 atomic
->cat6
.type
= type
;
200 atomic
->barrier_class
= IR3_BARRIER_BUFFER_W
;
201 atomic
->barrier_conflict
= IR3_BARRIER_BUFFER_R
| IR3_BARRIER_BUFFER_W
;
203 /* even if nothing consume the result, we can't DCE the instruction: */
204 array_insert(b
, b
->keeps
, atomic
);
209 static struct ir3_instruction
*
210 get_image_offset(struct ir3_context
*ctx
, const nir_variable
*var
,
211 struct ir3_instruction
* const *coords
, bool byteoff
)
213 struct ir3_block
*b
= ctx
->block
;
214 struct ir3_instruction
*offset
;
215 unsigned ncoords
= ir3_get_image_coords(var
, NULL
);
217 /* to calculate the byte offset (yes, uggg) we need (up to) three
218 * const values to know the bytes per pixel, and y and z stride:
220 struct ir3_const_state
*const_state
= &ctx
->so
->shader
->const_state
;
221 unsigned cb
= regid(const_state
->offsets
.image_dims
, 0) +
222 const_state
->image_dims
.off
[var
->data
.driver_location
];
224 debug_assert(const_state
->image_dims
.mask
&
225 (1 << var
->data
.driver_location
));
227 /* offset = coords.x * bytes_per_pixel: */
228 offset
= ir3_MUL_S(b
, coords
[0], 0, create_uniform(b
, cb
+ 0), 0);
230 /* offset += coords.y * y_pitch: */
231 offset
= ir3_MAD_S24(b
, create_uniform(b
, cb
+ 1), 0,
232 coords
[1], 0, offset
, 0);
235 /* offset += coords.z * z_pitch: */
236 offset
= ir3_MAD_S24(b
, create_uniform(b
, cb
+ 2), 0,
237 coords
[2], 0, offset
, 0);
241 /* Some cases, like atomics, seem to use dword offset instead
242 * of byte offsets.. blob just puts an extra shr.b in there
245 offset
= ir3_SHR_B(b
, offset
, 0, create_immed(b
, 2), 0);
248 return ir3_create_collect(ctx
, (struct ir3_instruction
*[]){
254 /* src[] = { deref, coord, sample_index, value }. const_index[] = {} */
256 emit_intrinsic_store_image(struct ir3_context
*ctx
, nir_intrinsic_instr
*intr
)
258 struct ir3_block
*b
= ctx
->block
;
259 const nir_variable
*var
= nir_intrinsic_get_var(intr
, 0);
260 struct ir3_instruction
*stib
, *offset
;
261 struct ir3_instruction
* const *value
= ir3_get_src(ctx
, &intr
->src
[3]);
262 struct ir3_instruction
* const *coords
= ir3_get_src(ctx
, &intr
->src
[1]);
263 unsigned ncoords
= ir3_get_image_coords(var
, NULL
);
264 unsigned slot
= ir3_get_image_slot(nir_src_as_deref(intr
->src
[0]));
265 unsigned ibo_idx
= ir3_image_to_ibo(&ctx
->so
->image_mapping
, slot
);
266 unsigned ncomp
= ir3_get_num_components_for_glformat(var
->data
.image
.format
);
270 * src2 is 64b byte offset
273 offset
= get_image_offset(ctx
, var
, coords
, true);
275 /* NOTE: stib seems to take byte offset, but stgb.typed can be used
276 * too and takes a dword offset.. not quite sure yet why blob uses
277 * one over the other in various cases.
280 stib
= ir3_STIB(b
, create_immed(b
, ibo_idx
), 0,
281 ir3_create_collect(ctx
, value
, ncomp
), 0,
282 ir3_create_collect(ctx
, coords
, ncoords
), 0,
284 stib
->cat6
.iim_val
= ncomp
;
285 stib
->cat6
.d
= ncoords
;
286 stib
->cat6
.type
= ir3_get_image_type(var
);
287 stib
->cat6
.typed
= true;
288 stib
->barrier_class
= IR3_BARRIER_IMAGE_W
;
289 stib
->barrier_conflict
= IR3_BARRIER_IMAGE_R
| IR3_BARRIER_IMAGE_W
;
291 array_insert(b
, b
->keeps
, stib
);
294 /* src[] = { deref, coord, sample_index, value, compare }. const_index[] = {} */
295 static struct ir3_instruction
*
296 emit_intrinsic_atomic_image(struct ir3_context
*ctx
, nir_intrinsic_instr
*intr
)
298 struct ir3_block
*b
= ctx
->block
;
299 const nir_variable
*var
= nir_intrinsic_get_var(intr
, 0);
300 struct ir3_instruction
*atomic
, *image
, *src0
, *src1
, *src2
;
301 struct ir3_instruction
* const *coords
= ir3_get_src(ctx
, &intr
->src
[1]);
302 unsigned ncoords
= ir3_get_image_coords(var
, NULL
);
303 unsigned slot
= ir3_get_image_slot(nir_src_as_deref(intr
->src
[0]));
304 unsigned ibo_idx
= ir3_image_to_ibo(&ctx
->so
->image_mapping
, slot
);
306 image
= create_immed(b
, ibo_idx
);
308 /* src0 is value (or uvec2(value, compare))
310 * src2 is 64b byte offset
312 src0
= ir3_get_src(ctx
, &intr
->src
[3])[0];
313 src1
= ir3_create_collect(ctx
, coords
, ncoords
);
314 src2
= get_image_offset(ctx
, var
, coords
, false);
316 switch (intr
->intrinsic
) {
317 case nir_intrinsic_image_deref_atomic_add
:
318 atomic
= ir3_ATOMIC_ADD_G(b
, image
, 0, src0
, 0, src1
, 0, src2
, 0);
320 case nir_intrinsic_image_deref_atomic_imin
:
321 case nir_intrinsic_image_deref_atomic_umin
:
322 atomic
= ir3_ATOMIC_MIN_G(b
, image
, 0, src0
, 0, src1
, 0, src2
, 0);
324 case nir_intrinsic_image_deref_atomic_imax
:
325 case nir_intrinsic_image_deref_atomic_umax
:
326 atomic
= ir3_ATOMIC_MAX_G(b
, image
, 0, src0
, 0, src1
, 0, src2
, 0);
328 case nir_intrinsic_image_deref_atomic_and
:
329 atomic
= ir3_ATOMIC_AND_G(b
, image
, 0, src0
, 0, src1
, 0, src2
, 0);
331 case nir_intrinsic_image_deref_atomic_or
:
332 atomic
= ir3_ATOMIC_OR_G(b
, image
, 0, src0
, 0, src1
, 0, src2
, 0);
334 case nir_intrinsic_image_deref_atomic_xor
:
335 atomic
= ir3_ATOMIC_XOR_G(b
, image
, 0, src0
, 0, src1
, 0, src2
, 0);
337 case nir_intrinsic_image_deref_atomic_exchange
:
338 atomic
= ir3_ATOMIC_XCHG_G(b
, image
, 0, src0
, 0, src1
, 0, src2
, 0);
340 case nir_intrinsic_image_deref_atomic_comp_swap
:
341 /* for cmpxchg, src0 is [ui]vec2(data, compare): */
342 src0
= ir3_create_collect(ctx
, (struct ir3_instruction
*[]){
343 ir3_get_src(ctx
, &intr
->src
[4])[0],
346 atomic
= ir3_ATOMIC_CMPXCHG_G(b
, image
, 0, src0
, 0, src1
, 0, src2
, 0);
352 atomic
->cat6
.iim_val
= 1;
353 atomic
->cat6
.d
= ncoords
;
354 atomic
->cat6
.type
= ir3_get_image_type(var
);
355 atomic
->cat6
.typed
= true;
356 atomic
->barrier_class
= IR3_BARRIER_IMAGE_W
;
357 atomic
->barrier_conflict
= IR3_BARRIER_IMAGE_R
| IR3_BARRIER_IMAGE_W
;
359 /* even if nothing consume the result, we can't DCE the instruction: */
360 array_insert(b
, b
->keeps
, atomic
);
365 const struct ir3_context_funcs ir3_a4xx_funcs
= {
366 .emit_intrinsic_load_ssbo
= emit_intrinsic_load_ssbo
,
367 .emit_intrinsic_store_ssbo
= emit_intrinsic_store_ssbo
,
368 .emit_intrinsic_atomic_ssbo
= emit_intrinsic_atomic_ssbo
,
369 .emit_intrinsic_store_image
= emit_intrinsic_store_image
,
370 .emit_intrinsic_atomic_image
= emit_intrinsic_atomic_image
,