/*
 * Copyright (C) 2017-2018 Rob Clark <robclark@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */
29 #include "ir3_context.h"
30 #include "ir3_image.h"
/*
 * Handlers for instructions changed/added in a4xx:
 */
37 /* src[] = { buffer_index, offset }. No const_index */
39 emit_intrinsic_load_ssbo(struct ir3_context
*ctx
, nir_intrinsic_instr
*intr
,
40 struct ir3_instruction
**dst
)
42 struct ir3_block
*b
= ctx
->block
;
43 struct ir3_instruction
*ldgb
, *src0
, *src1
, *byte_offset
, *offset
;
45 struct ir3_instruction
*ssbo
= ir3_ssbo_to_ibo(ctx
, intr
->src
[0]);
47 byte_offset
= ir3_get_src(ctx
, &intr
->src
[1])[0];
48 offset
= ir3_get_src(ctx
, &intr
->src
[2])[0];
50 /* src0 is uvec2(offset*4, 0), src1 is offset.. nir already *= 4: */
51 src0
= ir3_create_collect(ctx
, (struct ir3_instruction
*[]){
57 ldgb
= ir3_LDGB(b
, ssbo
, 0,
59 ldgb
->regs
[0]->wrmask
= MASK(intr
->num_components
);
60 ldgb
->cat6
.iim_val
= intr
->num_components
;
62 ldgb
->cat6
.type
= TYPE_U32
;
63 ldgb
->barrier_class
= IR3_BARRIER_BUFFER_R
;
64 ldgb
->barrier_conflict
= IR3_BARRIER_BUFFER_W
;
66 ir3_split_dest(b
, dst
, ldgb
, 0, intr
->num_components
);
69 /* src[] = { value, block_index, offset }. const_index[] = { write_mask } */
71 emit_intrinsic_store_ssbo(struct ir3_context
*ctx
, nir_intrinsic_instr
*intr
)
73 struct ir3_block
*b
= ctx
->block
;
74 struct ir3_instruction
*stgb
, *src0
, *src1
, *src2
, *byte_offset
, *offset
;
75 unsigned wrmask
= nir_intrinsic_write_mask(intr
);
76 unsigned ncomp
= ffs(~wrmask
) - 1;
78 assert(wrmask
== BITFIELD_MASK(intr
->num_components
));
80 struct ir3_instruction
*ssbo
= ir3_ssbo_to_ibo(ctx
, intr
->src
[1]);
82 byte_offset
= ir3_get_src(ctx
, &intr
->src
[2])[0];
83 offset
= ir3_get_src(ctx
, &intr
->src
[3])[0];
85 /* src0 is value, src1 is offset, src2 is uvec2(offset*4, 0)..
88 src0
= ir3_create_collect(ctx
, ir3_get_src(ctx
, &intr
->src
[0]), ncomp
);
90 src2
= ir3_create_collect(ctx
, (struct ir3_instruction
*[]){
95 stgb
= ir3_STGB(b
, ssbo
, 0, src0
, 0, src1
, 0, src2
, 0);
96 stgb
->cat6
.iim_val
= ncomp
;
98 stgb
->cat6
.type
= TYPE_U32
;
99 stgb
->barrier_class
= IR3_BARRIER_BUFFER_W
;
100 stgb
->barrier_conflict
= IR3_BARRIER_BUFFER_R
| IR3_BARRIER_BUFFER_W
;
102 array_insert(b
, b
->keeps
, stgb
);
106 * SSBO atomic intrinsics
108 * All of the SSBO atomic memory operations read a value from memory,
109 * compute a new value using one of the operations below, write the new
110 * value to memory, and return the original value read.
112 * All operations take 3 sources except CompSwap that takes 4. These
115 * 0: The SSBO buffer index.
116 * 1: The offset into the SSBO buffer of the variable that the atomic
117 * operation will operate on.
118 * 2: The data parameter to the atomic function (i.e. the value to add
119 * in ssbo_atomic_add, etc).
120 * 3: For CompSwap only: the second data parameter.
122 static struct ir3_instruction
*
123 emit_intrinsic_atomic_ssbo(struct ir3_context
*ctx
, nir_intrinsic_instr
*intr
)
125 struct ir3_block
*b
= ctx
->block
;
126 struct ir3_instruction
*atomic
, *ssbo
, *src0
, *src1
, *src2
, *byte_offset
,
128 type_t type
= TYPE_U32
;
130 ssbo
= ir3_ssbo_to_ibo(ctx
, intr
->src
[0]);
132 byte_offset
= ir3_get_src(ctx
, &intr
->src
[1])[0];
133 offset
= ir3_get_src(ctx
, &intr
->src
[3])[0];
135 /* src0 is data (or uvec2(data, compare))
137 * src2 is uvec2(offset*4, 0) (appears to be 64b byte offset)
139 * Note that nir already multiplies the offset by four
141 src0
= ir3_get_src(ctx
, &intr
->src
[2])[0];
143 src2
= ir3_create_collect(ctx
, (struct ir3_instruction
*[]){
148 switch (intr
->intrinsic
) {
149 case nir_intrinsic_ssbo_atomic_add_ir3
:
150 atomic
= ir3_ATOMIC_ADD_G(b
, ssbo
, 0, src0
, 0, src1
, 0, src2
, 0);
152 case nir_intrinsic_ssbo_atomic_imin_ir3
:
153 atomic
= ir3_ATOMIC_MIN_G(b
, ssbo
, 0, src0
, 0, src1
, 0, src2
, 0);
156 case nir_intrinsic_ssbo_atomic_umin_ir3
:
157 atomic
= ir3_ATOMIC_MIN_G(b
, ssbo
, 0, src0
, 0, src1
, 0, src2
, 0);
159 case nir_intrinsic_ssbo_atomic_imax_ir3
:
160 atomic
= ir3_ATOMIC_MAX_G(b
, ssbo
, 0, src0
, 0, src1
, 0, src2
, 0);
163 case nir_intrinsic_ssbo_atomic_umax_ir3
:
164 atomic
= ir3_ATOMIC_MAX_G(b
, ssbo
, 0, src0
, 0, src1
, 0, src2
, 0);
166 case nir_intrinsic_ssbo_atomic_and_ir3
:
167 atomic
= ir3_ATOMIC_AND_G(b
, ssbo
, 0, src0
, 0, src1
, 0, src2
, 0);
169 case nir_intrinsic_ssbo_atomic_or_ir3
:
170 atomic
= ir3_ATOMIC_OR_G(b
, ssbo
, 0, src0
, 0, src1
, 0, src2
, 0);
172 case nir_intrinsic_ssbo_atomic_xor_ir3
:
173 atomic
= ir3_ATOMIC_XOR_G(b
, ssbo
, 0, src0
, 0, src1
, 0, src2
, 0);
175 case nir_intrinsic_ssbo_atomic_exchange_ir3
:
176 atomic
= ir3_ATOMIC_XCHG_G(b
, ssbo
, 0, src0
, 0, src1
, 0, src2
, 0);
178 case nir_intrinsic_ssbo_atomic_comp_swap_ir3
:
179 /* for cmpxchg, src0 is [ui]vec2(data, compare): */
180 src0
= ir3_create_collect(ctx
, (struct ir3_instruction
*[]){
181 ir3_get_src(ctx
, &intr
->src
[3])[0],
184 src1
= ir3_get_src(ctx
, &intr
->src
[4])[0];
185 atomic
= ir3_ATOMIC_CMPXCHG_G(b
, ssbo
, 0, src0
, 0, src1
, 0, src2
, 0);
191 atomic
->cat6
.iim_val
= 1;
193 atomic
->cat6
.type
= type
;
194 atomic
->barrier_class
= IR3_BARRIER_BUFFER_W
;
195 atomic
->barrier_conflict
= IR3_BARRIER_BUFFER_R
| IR3_BARRIER_BUFFER_W
;
197 /* even if nothing consume the result, we can't DCE the instruction: */
198 array_insert(b
, b
->keeps
, atomic
);
203 static struct ir3_instruction
*
204 get_image_offset(struct ir3_context
*ctx
, const nir_intrinsic_instr
*instr
,
205 struct ir3_instruction
* const *coords
, bool byteoff
)
207 struct ir3_block
*b
= ctx
->block
;
208 struct ir3_instruction
*offset
;
209 unsigned index
= nir_src_as_uint(instr
->src
[0]);
210 unsigned ncoords
= ir3_get_image_coords(instr
, NULL
);
212 /* to calculate the byte offset (yes, uggg) we need (up to) three
213 * const values to know the bytes per pixel, and y and z stride:
215 const struct ir3_const_state
*const_state
= ir3_const_state(ctx
->so
);
216 unsigned cb
= regid(const_state
->offsets
.image_dims
, 0) +
217 const_state
->image_dims
.off
[index
];
219 debug_assert(const_state
->image_dims
.mask
& (1 << index
));
221 /* offset = coords.x * bytes_per_pixel: */
222 offset
= ir3_MUL_S24(b
, coords
[0], 0, create_uniform(b
, cb
+ 0), 0);
224 /* offset += coords.y * y_pitch: */
225 offset
= ir3_MAD_S24(b
, create_uniform(b
, cb
+ 1), 0,
226 coords
[1], 0, offset
, 0);
229 /* offset += coords.z * z_pitch: */
230 offset
= ir3_MAD_S24(b
, create_uniform(b
, cb
+ 2), 0,
231 coords
[2], 0, offset
, 0);
235 /* Some cases, like atomics, seem to use dword offset instead
236 * of byte offsets.. blob just puts an extra shr.b in there
239 offset
= ir3_SHR_B(b
, offset
, 0, create_immed(b
, 2), 0);
242 return ir3_create_collect(ctx
, (struct ir3_instruction
*[]){
248 /* src[] = { index, coord, sample_index, value }. const_index[] = {} */
250 emit_intrinsic_store_image(struct ir3_context
*ctx
, nir_intrinsic_instr
*intr
)
252 struct ir3_block
*b
= ctx
->block
;
253 struct ir3_instruction
*stib
, *offset
;
254 struct ir3_instruction
* const *value
= ir3_get_src(ctx
, &intr
->src
[3]);
255 struct ir3_instruction
* const *coords
= ir3_get_src(ctx
, &intr
->src
[1]);
256 struct ir3_instruction
* ibo
= ir3_image_to_ibo(ctx
, intr
->src
[0]);
257 unsigned ncoords
= ir3_get_image_coords(intr
, NULL
);
258 unsigned ncomp
= ir3_get_num_components_for_image_format(nir_intrinsic_format(intr
));
262 * src2 is 64b byte offset
265 offset
= get_image_offset(ctx
, intr
, coords
, true);
267 /* NOTE: stib seems to take byte offset, but stgb.typed can be used
268 * too and takes a dword offset.. not quite sure yet why blob uses
269 * one over the other in various cases.
272 stib
= ir3_STIB(b
, ibo
, 0,
273 ir3_create_collect(ctx
, value
, ncomp
), 0,
274 ir3_create_collect(ctx
, coords
, ncoords
), 0,
276 stib
->cat6
.iim_val
= ncomp
;
277 stib
->cat6
.d
= ncoords
;
278 stib
->cat6
.type
= ir3_get_type_for_image_intrinsic(intr
);
279 stib
->cat6
.typed
= true;
280 stib
->barrier_class
= IR3_BARRIER_IMAGE_W
;
281 stib
->barrier_conflict
= IR3_BARRIER_IMAGE_R
| IR3_BARRIER_IMAGE_W
;
283 array_insert(b
, b
->keeps
, stib
);
286 /* src[] = { deref, coord, sample_index, value, compare }. const_index[] = {} */
287 static struct ir3_instruction
*
288 emit_intrinsic_atomic_image(struct ir3_context
*ctx
, nir_intrinsic_instr
*intr
)
290 struct ir3_block
*b
= ctx
->block
;
291 struct ir3_instruction
*atomic
, *src0
, *src1
, *src2
;
292 struct ir3_instruction
* const *coords
= ir3_get_src(ctx
, &intr
->src
[1]);
293 struct ir3_instruction
* image
= ir3_image_to_ibo(ctx
, intr
->src
[0]);
294 unsigned ncoords
= ir3_get_image_coords(intr
, NULL
);
296 /* src0 is value (or uvec2(value, compare))
298 * src2 is 64b byte offset
300 src0
= ir3_get_src(ctx
, &intr
->src
[3])[0];
301 src1
= ir3_create_collect(ctx
, coords
, ncoords
);
302 src2
= get_image_offset(ctx
, intr
, coords
, false);
304 switch (intr
->intrinsic
) {
305 case nir_intrinsic_image_atomic_add
:
306 atomic
= ir3_ATOMIC_ADD_G(b
, image
, 0, src0
, 0, src1
, 0, src2
, 0);
308 case nir_intrinsic_image_atomic_imin
:
309 case nir_intrinsic_image_atomic_umin
:
310 atomic
= ir3_ATOMIC_MIN_G(b
, image
, 0, src0
, 0, src1
, 0, src2
, 0);
312 case nir_intrinsic_image_atomic_imax
:
313 case nir_intrinsic_image_atomic_umax
:
314 atomic
= ir3_ATOMIC_MAX_G(b
, image
, 0, src0
, 0, src1
, 0, src2
, 0);
316 case nir_intrinsic_image_atomic_and
:
317 atomic
= ir3_ATOMIC_AND_G(b
, image
, 0, src0
, 0, src1
, 0, src2
, 0);
319 case nir_intrinsic_image_atomic_or
:
320 atomic
= ir3_ATOMIC_OR_G(b
, image
, 0, src0
, 0, src1
, 0, src2
, 0);
322 case nir_intrinsic_image_atomic_xor
:
323 atomic
= ir3_ATOMIC_XOR_G(b
, image
, 0, src0
, 0, src1
, 0, src2
, 0);
325 case nir_intrinsic_image_atomic_exchange
:
326 atomic
= ir3_ATOMIC_XCHG_G(b
, image
, 0, src0
, 0, src1
, 0, src2
, 0);
328 case nir_intrinsic_image_atomic_comp_swap
:
329 /* for cmpxchg, src0 is [ui]vec2(data, compare): */
330 src0
= ir3_create_collect(ctx
, (struct ir3_instruction
*[]){
331 ir3_get_src(ctx
, &intr
->src
[4])[0],
334 atomic
= ir3_ATOMIC_CMPXCHG_G(b
, image
, 0, src0
, 0, src1
, 0, src2
, 0);
340 atomic
->cat6
.iim_val
= 1;
341 atomic
->cat6
.d
= ncoords
;
342 atomic
->cat6
.type
= ir3_get_type_for_image_intrinsic(intr
);
343 atomic
->cat6
.typed
= true;
344 atomic
->barrier_class
= IR3_BARRIER_IMAGE_W
;
345 atomic
->barrier_conflict
= IR3_BARRIER_IMAGE_R
| IR3_BARRIER_IMAGE_W
;
347 /* even if nothing consume the result, we can't DCE the instruction: */
348 array_insert(b
, b
->keeps
, atomic
);
353 const struct ir3_context_funcs ir3_a4xx_funcs
= {
354 .emit_intrinsic_load_ssbo
= emit_intrinsic_load_ssbo
,
355 .emit_intrinsic_store_ssbo
= emit_intrinsic_store_ssbo
,
356 .emit_intrinsic_atomic_ssbo
= emit_intrinsic_atomic_ssbo
,
357 .emit_intrinsic_store_image
= emit_intrinsic_store_image
,
358 .emit_intrinsic_atomic_image
= emit_intrinsic_atomic_image
,
359 .emit_intrinsic_image_size
= emit_intrinsic_image_size_tex
,