/*
 * Copyright (C) 2017-2018 Rob Clark <robclark@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */
#include "ir3_context.h"
#include "ir3_image.h"

/*
 * Handlers for instructions changed/added in a4xx:
 */
37 /* src[] = { buffer_index, offset }. No const_index */
39 emit_intrinsic_load_ssbo(struct ir3_context
*ctx
, nir_intrinsic_instr
*intr
,
40 struct ir3_instruction
**dst
)
42 struct ir3_block
*b
= ctx
->block
;
43 struct ir3_instruction
*ldgb
, *src0
, *src1
, *byte_offset
, *offset
;
44 nir_const_value
*const_offset
;
46 /* can this be non-const buffer_index? how do we handle that? */
47 const_offset
= nir_src_as_const_value(intr
->src
[0]);
48 compile_assert(ctx
, const_offset
);
50 int ibo_idx
= ir3_ssbo_to_ibo(&ctx
->so
->image_mapping
, const_offset
->u32
[0]);
52 byte_offset
= ir3_get_src(ctx
, &intr
->src
[1])[0];
53 offset
= ir3_get_src(ctx
, &intr
->src
[2])[0];
55 /* src0 is uvec2(offset*4, 0), src1 is offset.. nir already *= 4: */
56 src0
= ir3_create_collect(ctx
, (struct ir3_instruction
*[]){
62 ldgb
= ir3_LDGB(b
, create_immed(b
, ibo_idx
), 0,
64 ldgb
->regs
[0]->wrmask
= MASK(intr
->num_components
);
65 ldgb
->cat6
.iim_val
= intr
->num_components
;
67 ldgb
->cat6
.type
= TYPE_U32
;
68 ldgb
->barrier_class
= IR3_BARRIER_BUFFER_R
;
69 ldgb
->barrier_conflict
= IR3_BARRIER_BUFFER_W
;
71 ir3_split_dest(b
, dst
, ldgb
, 0, intr
->num_components
);
74 /* src[] = { value, block_index, offset }. const_index[] = { write_mask } */
76 emit_intrinsic_store_ssbo(struct ir3_context
*ctx
, nir_intrinsic_instr
*intr
)
78 struct ir3_block
*b
= ctx
->block
;
79 struct ir3_instruction
*stgb
, *src0
, *src1
, *src2
, *byte_offset
, *offset
;
80 nir_const_value
*const_offset
;
81 /* TODO handle wrmask properly, see _store_shared().. but I think
82 * it is more a PITA than that, since blob ends up loading the
83 * masked components and writing them back out.
85 unsigned wrmask
= intr
->const_index
[0];
86 unsigned ncomp
= ffs(~wrmask
) - 1;
88 /* can this be non-const buffer_index? how do we handle that? */
89 const_offset
= nir_src_as_const_value(intr
->src
[1]);
90 compile_assert(ctx
, const_offset
);
92 int ibo_idx
= ir3_ssbo_to_ibo(&ctx
->so
->image_mapping
, const_offset
->u32
[0]);
94 byte_offset
= ir3_get_src(ctx
, &intr
->src
[2])[0];
95 offset
= ir3_get_src(ctx
, &intr
->src
[3])[0];
97 /* src0 is value, src1 is offset, src2 is uvec2(offset*4, 0)..
100 src0
= ir3_create_collect(ctx
, ir3_get_src(ctx
, &intr
->src
[0]), ncomp
);
102 src2
= ir3_create_collect(ctx
, (struct ir3_instruction
*[]){
107 stgb
= ir3_STGB(b
, create_immed(b
, ibo_idx
), 0, src0
, 0, src1
, 0, src2
, 0);
108 stgb
->cat6
.iim_val
= ncomp
;
110 stgb
->cat6
.type
= TYPE_U32
;
111 stgb
->barrier_class
= IR3_BARRIER_BUFFER_W
;
112 stgb
->barrier_conflict
= IR3_BARRIER_BUFFER_R
| IR3_BARRIER_BUFFER_W
;
114 array_insert(b
, b
->keeps
, stgb
);
118 * SSBO atomic intrinsics
120 * All of the SSBO atomic memory operations read a value from memory,
121 * compute a new value using one of the operations below, write the new
122 * value to memory, and return the original value read.
124 * All operations take 3 sources except CompSwap that takes 4. These
127 * 0: The SSBO buffer index.
128 * 1: The offset into the SSBO buffer of the variable that the atomic
129 * operation will operate on.
130 * 2: The data parameter to the atomic function (i.e. the value to add
131 * in ssbo_atomic_add, etc).
132 * 3: For CompSwap only: the second data parameter.
134 static struct ir3_instruction
*
135 emit_intrinsic_atomic_ssbo(struct ir3_context
*ctx
, nir_intrinsic_instr
*intr
)
137 struct ir3_block
*b
= ctx
->block
;
138 struct ir3_instruction
*atomic
, *ssbo
, *src0
, *src1
, *src2
, *byte_offset
,
140 nir_const_value
*const_offset
;
141 type_t type
= TYPE_U32
;
143 /* can this be non-const buffer_index? how do we handle that? */
144 const_offset
= nir_src_as_const_value(intr
->src
[0]);
145 compile_assert(ctx
, const_offset
);
147 int ibo_idx
= ir3_ssbo_to_ibo(&ctx
->so
->image_mapping
, const_offset
->u32
[0]);
148 ssbo
= create_immed(b
, ibo_idx
);
150 byte_offset
= ir3_get_src(ctx
, &intr
->src
[1])[0];
151 offset
= ir3_get_src(ctx
, &intr
->src
[3])[0];
153 /* src0 is data (or uvec2(data, compare))
155 * src2 is uvec2(offset*4, 0) (appears to be 64b byte offset)
157 * Note that nir already multiplies the offset by four
159 src0
= ir3_get_src(ctx
, &intr
->src
[2])[0];
161 src2
= ir3_create_collect(ctx
, (struct ir3_instruction
*[]){
166 switch (intr
->intrinsic
) {
167 case nir_intrinsic_ssbo_atomic_add_ir3
:
168 atomic
= ir3_ATOMIC_ADD_G(b
, ssbo
, 0, src0
, 0, src1
, 0, src2
, 0);
170 case nir_intrinsic_ssbo_atomic_imin_ir3
:
171 atomic
= ir3_ATOMIC_MIN_G(b
, ssbo
, 0, src0
, 0, src1
, 0, src2
, 0);
174 case nir_intrinsic_ssbo_atomic_umin_ir3
:
175 atomic
= ir3_ATOMIC_MIN_G(b
, ssbo
, 0, src0
, 0, src1
, 0, src2
, 0);
177 case nir_intrinsic_ssbo_atomic_imax_ir3
:
178 atomic
= ir3_ATOMIC_MAX_G(b
, ssbo
, 0, src0
, 0, src1
, 0, src2
, 0);
181 case nir_intrinsic_ssbo_atomic_umax_ir3
:
182 atomic
= ir3_ATOMIC_MAX_G(b
, ssbo
, 0, src0
, 0, src1
, 0, src2
, 0);
184 case nir_intrinsic_ssbo_atomic_and_ir3
:
185 atomic
= ir3_ATOMIC_AND_G(b
, ssbo
, 0, src0
, 0, src1
, 0, src2
, 0);
187 case nir_intrinsic_ssbo_atomic_or_ir3
:
188 atomic
= ir3_ATOMIC_OR_G(b
, ssbo
, 0, src0
, 0, src1
, 0, src2
, 0);
190 case nir_intrinsic_ssbo_atomic_xor_ir3
:
191 atomic
= ir3_ATOMIC_XOR_G(b
, ssbo
, 0, src0
, 0, src1
, 0, src2
, 0);
193 case nir_intrinsic_ssbo_atomic_exchange_ir3
:
194 atomic
= ir3_ATOMIC_XCHG_G(b
, ssbo
, 0, src0
, 0, src1
, 0, src2
, 0);
196 case nir_intrinsic_ssbo_atomic_comp_swap_ir3
:
197 /* for cmpxchg, src0 is [ui]vec2(data, compare): */
198 src0
= ir3_create_collect(ctx
, (struct ir3_instruction
*[]){
199 ir3_get_src(ctx
, &intr
->src
[3])[0],
202 src1
= ir3_get_src(ctx
, &intr
->src
[4])[0];
203 atomic
= ir3_ATOMIC_CMPXCHG_G(b
, ssbo
, 0, src0
, 0, src1
, 0, src2
, 0);
209 atomic
->cat6
.iim_val
= 1;
211 atomic
->cat6
.type
= type
;
212 atomic
->barrier_class
= IR3_BARRIER_BUFFER_W
;
213 atomic
->barrier_conflict
= IR3_BARRIER_BUFFER_R
| IR3_BARRIER_BUFFER_W
;
215 /* even if nothing consume the result, we can't DCE the instruction: */
216 array_insert(b
, b
->keeps
, atomic
);
221 static struct ir3_instruction
*
222 get_image_offset(struct ir3_context
*ctx
, const nir_variable
*var
,
223 struct ir3_instruction
* const *coords
, bool byteoff
)
225 struct ir3_block
*b
= ctx
->block
;
226 struct ir3_instruction
*offset
;
227 unsigned ncoords
= ir3_get_image_coords(var
, NULL
);
229 /* to calculate the byte offset (yes, uggg) we need (up to) three
230 * const values to know the bytes per pixel, and y and z stride:
232 unsigned cb
= regid(ctx
->so
->constbase
.image_dims
, 0) +
233 ctx
->so
->const_layout
.image_dims
.off
[var
->data
.driver_location
];
235 debug_assert(ctx
->so
->const_layout
.image_dims
.mask
&
236 (1 << var
->data
.driver_location
));
238 /* offset = coords.x * bytes_per_pixel: */
239 offset
= ir3_MUL_S(b
, coords
[0], 0, create_uniform(b
, cb
+ 0), 0);
241 /* offset += coords.y * y_pitch: */
242 offset
= ir3_MAD_S24(b
, create_uniform(b
, cb
+ 1), 0,
243 coords
[1], 0, offset
, 0);
246 /* offset += coords.z * z_pitch: */
247 offset
= ir3_MAD_S24(b
, create_uniform(b
, cb
+ 2), 0,
248 coords
[2], 0, offset
, 0);
252 /* Some cases, like atomics, seem to use dword offset instead
253 * of byte offsets.. blob just puts an extra shr.b in there
256 offset
= ir3_SHR_B(b
, offset
, 0, create_immed(b
, 2), 0);
259 return ir3_create_collect(ctx
, (struct ir3_instruction
*[]){
265 /* src[] = { deref, coord, sample_index, value }. const_index[] = {} */
267 emit_intrinsic_store_image(struct ir3_context
*ctx
, nir_intrinsic_instr
*intr
)
269 struct ir3_block
*b
= ctx
->block
;
270 const nir_variable
*var
= nir_intrinsic_get_var(intr
, 0);
271 struct ir3_instruction
*stib
, *offset
;
272 struct ir3_instruction
* const *value
= ir3_get_src(ctx
, &intr
->src
[3]);
273 struct ir3_instruction
* const *coords
= ir3_get_src(ctx
, &intr
->src
[1]);
274 unsigned ncoords
= ir3_get_image_coords(var
, NULL
);
275 unsigned slot
= ir3_get_image_slot(nir_src_as_deref(intr
->src
[0]));
276 unsigned ibo_idx
= ir3_image_to_ibo(&ctx
->so
->image_mapping
, slot
);
277 unsigned ncomp
= ir3_get_num_components_for_glformat(var
->data
.image
.format
);
281 * src2 is 64b byte offset
284 offset
= get_image_offset(ctx
, var
, coords
, true);
286 /* NOTE: stib seems to take byte offset, but stgb.typed can be used
287 * too and takes a dword offset.. not quite sure yet why blob uses
288 * one over the other in various cases.
291 stib
= ir3_STIB(b
, create_immed(b
, ibo_idx
), 0,
292 ir3_create_collect(ctx
, value
, ncomp
), 0,
293 ir3_create_collect(ctx
, coords
, ncoords
), 0,
295 stib
->cat6
.iim_val
= ncomp
;
296 stib
->cat6
.d
= ncoords
;
297 stib
->cat6
.type
= ir3_get_image_type(var
);
298 stib
->cat6
.typed
= true;
299 stib
->barrier_class
= IR3_BARRIER_IMAGE_W
;
300 stib
->barrier_conflict
= IR3_BARRIER_IMAGE_R
| IR3_BARRIER_IMAGE_W
;
302 array_insert(b
, b
->keeps
, stib
);
305 /* src[] = { deref, coord, sample_index, value, compare }. const_index[] = {} */
306 static struct ir3_instruction
*
307 emit_intrinsic_atomic_image(struct ir3_context
*ctx
, nir_intrinsic_instr
*intr
)
309 struct ir3_block
*b
= ctx
->block
;
310 const nir_variable
*var
= nir_intrinsic_get_var(intr
, 0);
311 struct ir3_instruction
*atomic
, *image
, *src0
, *src1
, *src2
;
312 struct ir3_instruction
* const *coords
= ir3_get_src(ctx
, &intr
->src
[1]);
313 unsigned ncoords
= ir3_get_image_coords(var
, NULL
);
314 unsigned slot
= ir3_get_image_slot(nir_src_as_deref(intr
->src
[0]));
315 unsigned ibo_idx
= ir3_image_to_ibo(&ctx
->so
->image_mapping
, slot
);
317 image
= create_immed(b
, ibo_idx
);
319 /* src0 is value (or uvec2(value, compare))
321 * src2 is 64b byte offset
323 src0
= ir3_get_src(ctx
, &intr
->src
[3])[0];
324 src1
= ir3_create_collect(ctx
, coords
, ncoords
);
325 src2
= get_image_offset(ctx
, var
, coords
, false);
327 switch (intr
->intrinsic
) {
328 case nir_intrinsic_image_deref_atomic_add
:
329 atomic
= ir3_ATOMIC_ADD_G(b
, image
, 0, src0
, 0, src1
, 0, src2
, 0);
331 case nir_intrinsic_image_deref_atomic_min
:
332 atomic
= ir3_ATOMIC_MIN_G(b
, image
, 0, src0
, 0, src1
, 0, src2
, 0);
334 case nir_intrinsic_image_deref_atomic_max
:
335 atomic
= ir3_ATOMIC_MAX_G(b
, image
, 0, src0
, 0, src1
, 0, src2
, 0);
337 case nir_intrinsic_image_deref_atomic_and
:
338 atomic
= ir3_ATOMIC_AND_G(b
, image
, 0, src0
, 0, src1
, 0, src2
, 0);
340 case nir_intrinsic_image_deref_atomic_or
:
341 atomic
= ir3_ATOMIC_OR_G(b
, image
, 0, src0
, 0, src1
, 0, src2
, 0);
343 case nir_intrinsic_image_deref_atomic_xor
:
344 atomic
= ir3_ATOMIC_XOR_G(b
, image
, 0, src0
, 0, src1
, 0, src2
, 0);
346 case nir_intrinsic_image_deref_atomic_exchange
:
347 atomic
= ir3_ATOMIC_XCHG_G(b
, image
, 0, src0
, 0, src1
, 0, src2
, 0);
349 case nir_intrinsic_image_deref_atomic_comp_swap
:
350 /* for cmpxchg, src0 is [ui]vec2(data, compare): */
351 src0
= ir3_create_collect(ctx
, (struct ir3_instruction
*[]){
352 ir3_get_src(ctx
, &intr
->src
[4])[0],
355 atomic
= ir3_ATOMIC_CMPXCHG_G(b
, image
, 0, src0
, 0, src1
, 0, src2
, 0);
361 atomic
->cat6
.iim_val
= 1;
362 atomic
->cat6
.d
= ncoords
;
363 atomic
->cat6
.type
= ir3_get_image_type(var
);
364 atomic
->cat6
.typed
= true;
365 atomic
->barrier_class
= IR3_BARRIER_IMAGE_W
;
366 atomic
->barrier_conflict
= IR3_BARRIER_IMAGE_R
| IR3_BARRIER_IMAGE_W
;
368 /* even if nothing consume the result, we can't DCE the instruction: */
369 array_insert(b
, b
->keeps
, atomic
);
374 const struct ir3_context_funcs ir3_a4xx_funcs
= {
375 .emit_intrinsic_load_ssbo
= emit_intrinsic_load_ssbo
,
376 .emit_intrinsic_store_ssbo
= emit_intrinsic_store_ssbo
,
377 .emit_intrinsic_atomic_ssbo
= emit_intrinsic_atomic_ssbo
,
378 .emit_intrinsic_store_image
= emit_intrinsic_store_image
,
379 .emit_intrinsic_atomic_image
= emit_intrinsic_atomic_image
,