/*
 * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */
27 #include "pipe/p_state.h"
28 #include "util/u_string.h"
29 #include "util/u_memory.h"
30 #include "util/u_inlines.h"
31 #include "util/u_format.h"
32 #include "tgsi/tgsi_dump.h"
33 #include "tgsi/tgsi_parse.h"
35 #include "nir/tgsi_to_nir.h"
37 #include "freedreno_context.h"
38 #include "freedreno_util.h"
40 #include "ir3/ir3_shader.h"
41 #include "ir3/ir3_gallium.h"
42 #include "ir3/ir3_compiler.h"
43 #include "ir3/ir3_nir.h"
46 dump_shader_info(struct ir3_shader_variant
*v
, struct pipe_debug_callback
*debug
)
48 if (!unlikely(fd_mesa_debug
& FD_DBG_SHADERDB
))
51 pipe_debug_message(debug
, SHADER_INFO
, "\n"
52 "SHADER-DB: %s prog %d/%d: %u instructions, %u dwords\n"
53 "SHADER-DB: %s prog %d/%d: %u half, %u full\n"
54 "SHADER-DB: %s prog %d/%d: %u const, %u constlen\n"
55 "SHADER-DB: %s prog %d/%d: %u (ss), %u (sy)\n",
56 ir3_shader_stage(v
->shader
),
60 ir3_shader_stage(v
->shader
),
62 v
->info
.max_half_reg
+ 1,
64 ir3_shader_stage(v
->shader
),
66 v
->info
.max_const
+ 1,
68 ir3_shader_stage(v
->shader
),
70 v
->info
.ss
, v
->info
.sy
);
73 struct ir3_shader_variant
*
74 ir3_shader_variant(struct ir3_shader
*shader
, struct ir3_shader_key key
,
75 bool binning_pass
, struct pipe_debug_callback
*debug
)
77 struct ir3_shader_variant
*v
;
80 /* some shader key values only apply to vertex or frag shader,
81 * so normalize the key to avoid constructing multiple identical
84 ir3_normalize_key(&key
, shader
->type
);
86 v
= ir3_shader_get_variant(shader
, &key
, binning_pass
, &created
);
89 dump_shader_info(v
, debug
);
96 copy_stream_out(struct ir3_stream_output_info
*i
,
97 const struct pipe_stream_output_info
*p
)
99 STATIC_ASSERT(ARRAY_SIZE(i
->stride
) == ARRAY_SIZE(p
->stride
));
100 STATIC_ASSERT(ARRAY_SIZE(i
->output
) == ARRAY_SIZE(p
->output
));
102 i
->num_outputs
= p
->num_outputs
;
103 for (int n
= 0; n
< ARRAY_SIZE(i
->stride
); n
++)
104 i
->stride
[n
] = p
->stride
[n
];
106 for (int n
= 0; n
< ARRAY_SIZE(i
->output
); n
++) {
107 i
->output
[n
].register_index
= p
->output
[n
].register_index
;
108 i
->output
[n
].start_component
= p
->output
[n
].start_component
;
109 i
->output
[n
].num_components
= p
->output
[n
].num_components
;
110 i
->output
[n
].output_buffer
= p
->output
[n
].output_buffer
;
111 i
->output
[n
].dst_offset
= p
->output
[n
].dst_offset
;
112 i
->output
[n
].stream
= p
->output
[n
].stream
;
117 ir3_shader_create(struct ir3_compiler
*compiler
,
118 const struct pipe_shader_state
*cso
, gl_shader_stage type
,
119 struct pipe_debug_callback
*debug
)
122 if (cso
->type
== PIPE_SHADER_IR_NIR
) {
123 /* we take ownership of the reference: */
126 debug_assert(cso
->type
== PIPE_SHADER_IR_TGSI
);
127 if (ir3_shader_debug
& IR3_DBG_DISASM
) {
128 tgsi_dump(cso
->tokens
, 0);
130 nir
= ir3_tgsi_to_nir(compiler
, cso
->tokens
);
133 struct ir3_shader
*shader
= ir3_shader_from_nir(compiler
, nir
);
135 copy_stream_out(&shader
->stream_output
, &cso
->stream_output
);
137 if (fd_mesa_debug
& FD_DBG_SHADERDB
) {
138 /* if shader-db run, create a standard variant immediately
139 * (as otherwise nothing will trigger the shader to be
142 static struct ir3_shader_key key
;
143 memset(&key
, 0, sizeof(key
));
144 ir3_shader_variant(shader
, key
, false, debug
);
149 /* a bit annoying that compute-shader and normal shader state objects
150 * aren't a bit more aligned.
153 ir3_shader_create_compute(struct ir3_compiler
*compiler
,
154 const struct pipe_compute_state
*cso
,
155 struct pipe_debug_callback
*debug
)
158 if (cso
->ir_type
== PIPE_SHADER_IR_NIR
) {
159 /* we take ownership of the reference: */
160 nir
= (nir_shader
*)cso
->prog
;
162 debug_assert(cso
->ir_type
== PIPE_SHADER_IR_TGSI
);
163 if (ir3_shader_debug
& IR3_DBG_DISASM
) {
164 tgsi_dump(cso
->prog
, 0);
166 nir
= ir3_tgsi_to_nir(compiler
, cso
->prog
);
169 struct ir3_shader
*shader
= ir3_shader_from_nir(compiler
, nir
);
/* Translate a TGSI token stream to NIR using the ir3 compiler's NIR options.
 * Caller owns the returned nir_shader.
 */
struct nir_shader *
ir3_tgsi_to_nir(struct ir3_compiler *compiler, const struct tgsi_token *tokens)
{
	return tgsi_to_nir(tokens, ir3_get_compiler_options(compiler));
}
/* This has to reach into the fd_context a bit more than the rest of
 * ir3, but it needs to be aligned with the compiler, so both agree
 * on which const regs hold what.  And the logic is identical between
 * a3xx/a4xx, the only difference is small details in the actual
 * CP_LOAD_STATE packets (which is handled inside the generation
 * specific ctx->emit_const(_bo)() fxns)
 */
188 #include "freedreno_resource.h"
191 is_stateobj(struct fd_ringbuffer
*ring
)
193 /* XXX this is an ugly way to differentiate.. */
194 return !!(ring
->flags
& FD_RINGBUFFER_STREAMING
);
/* Emit a wait-for-idle before const updates when needed.
 *
 * NOTE(review): the body after the early-out was elided in the extraction;
 * restored as a call to fd_wfi(batch, ring) — confirm against upstream.
 */
static void
ring_wfi(struct fd_batch *batch, struct fd_ringbuffer *ring)
{
	/* when we emit const state via ring (IB2) we need a WFI, but when
	 * it is emit'd via stateobj, we don't
	 */
	if (is_stateobj(ring))
		return;

	fd_wfi(batch, ring);
}
210 emit_user_consts(struct fd_context
*ctx
, const struct ir3_shader_variant
*v
,
211 struct fd_ringbuffer
*ring
, struct fd_constbuf_stateobj
*constbuf
)
213 const unsigned index
= 0; /* user consts are index 0 */
215 if (constbuf
->enabled_mask
& (1 << index
)) {
216 struct pipe_constant_buffer
*cb
= &constbuf
->cb
[index
];
217 unsigned size
= align(cb
->buffer_size
, 4) / 4; /* size in dwords */
219 /* in particular, with binning shader we may end up with
220 * unused consts, ie. we could end up w/ constlen that is
221 * smaller than first_driver_param. In that case truncate
222 * the user consts early to avoid HLSQ lockup caused by
223 * writing too many consts
225 uint32_t max_const
= MIN2(v
->num_uniforms
, v
->constlen
);
227 // I expect that size should be a multiple of vec4's:
228 assert(size
== align(size
, 4));
230 /* and even if the start of the const buffer is before
231 * first_immediate, the end may not be:
233 size
= MIN2(size
, 4 * max_const
);
236 ring_wfi(ctx
->batch
, ring
);
237 ctx
->emit_const(ring
, v
->type
, 0,
238 cb
->buffer_offset
, size
,
239 cb
->user_buffer
, cb
->buffer
);
245 emit_ubos(struct fd_context
*ctx
, const struct ir3_shader_variant
*v
,
246 struct fd_ringbuffer
*ring
, struct fd_constbuf_stateobj
*constbuf
)
248 uint32_t offset
= v
->constbase
.ubo
;
249 if (v
->constlen
> offset
) {
250 uint32_t params
= v
->num_ubos
;
251 uint32_t offsets
[params
];
252 struct pipe_resource
*prscs
[params
];
254 for (uint32_t i
= 0; i
< params
; i
++) {
255 const uint32_t index
= i
+ 1; /* UBOs start at index 1 */
256 struct pipe_constant_buffer
*cb
= &constbuf
->cb
[index
];
257 assert(!cb
->user_buffer
);
259 if ((constbuf
->enabled_mask
& (1 << index
)) && cb
->buffer
) {
260 offsets
[i
] = cb
->buffer_offset
;
261 prscs
[i
] = cb
->buffer
;
268 ring_wfi(ctx
->batch
, ring
);
269 ctx
->emit_const_bo(ring
, v
->type
, false, offset
* 4, params
, prscs
, offsets
);
274 emit_ssbo_sizes(struct fd_context
*ctx
, const struct ir3_shader_variant
*v
,
275 struct fd_ringbuffer
*ring
, struct fd_shaderbuf_stateobj
*sb
)
277 uint32_t offset
= v
->constbase
.ssbo_sizes
;
278 if (v
->constlen
> offset
) {
279 uint32_t sizes
[align(v
->const_layout
.ssbo_size
.count
, 4)];
280 unsigned mask
= v
->const_layout
.ssbo_size
.mask
;
283 unsigned index
= u_bit_scan(&mask
);
284 unsigned off
= v
->const_layout
.ssbo_size
.off
[index
];
285 sizes
[off
] = sb
->sb
[index
].buffer_size
;
288 ring_wfi(ctx
->batch
, ring
);
289 ctx
->emit_const(ring
, v
->type
, offset
* 4,
290 0, ARRAY_SIZE(sizes
), sizes
, NULL
);
295 emit_image_dims(struct fd_context
*ctx
, const struct ir3_shader_variant
*v
,
296 struct fd_ringbuffer
*ring
, struct fd_shaderimg_stateobj
*si
)
298 uint32_t offset
= v
->constbase
.image_dims
;
299 if (v
->constlen
> offset
) {
300 uint32_t dims
[align(v
->const_layout
.image_dims
.count
, 4)];
301 unsigned mask
= v
->const_layout
.image_dims
.mask
;
304 struct pipe_image_view
*img
;
305 struct fd_resource
*rsc
;
306 unsigned index
= u_bit_scan(&mask
);
307 unsigned off
= v
->const_layout
.image_dims
.off
[index
];
309 img
= &si
->si
[index
];
310 rsc
= fd_resource(img
->resource
);
312 dims
[off
+ 0] = util_format_get_blocksize(img
->format
);
313 if (img
->resource
->target
!= PIPE_BUFFER
) {
314 unsigned lvl
= img
->u
.tex
.level
;
315 /* note for 2d/cube/etc images, even if re-interpreted
316 * as a different color format, the pixel size should
317 * be the same, so use original dimensions for y and z
320 dims
[off
+ 1] = rsc
->slices
[lvl
].pitch
* rsc
->cpp
;
321 /* see corresponding logic in fd_resource_offset(): */
322 if (rsc
->layer_first
) {
323 dims
[off
+ 2] = rsc
->layer_size
;
325 dims
[off
+ 2] = rsc
->slices
[lvl
].size0
;
328 /* For buffer-backed images, the log2 of the format's
329 * bytes-per-pixel is placed on the 2nd slot. This is useful
330 * when emitting image_size instructions, for which we need
331 * to divide by bpp for image buffers. Since the bpp
332 * can only be power-of-two, the division is implemented
333 * as a SHR, and for that it is handy to have the log2 of
334 * bpp as a constant. (log2 = first-set-bit - 1)
336 dims
[off
+ 1] = ffs(dims
[off
+ 0]) - 1;
340 ring_wfi(ctx
->batch
, ring
);
341 ctx
->emit_const(ring
, v
->type
, offset
* 4,
342 0, ARRAY_SIZE(dims
), dims
, NULL
);
347 emit_immediates(struct fd_context
*ctx
, const struct ir3_shader_variant
*v
,
348 struct fd_ringbuffer
*ring
)
350 int size
= v
->immediates_count
;
351 uint32_t base
= v
->constbase
.immediate
;
353 /* truncate size to avoid writing constants that shader
356 size
= MIN2(size
+ base
, v
->constlen
) - base
;
358 /* convert out of vec4: */
363 ring_wfi(ctx
->batch
, ring
);
364 ctx
->emit_const(ring
, v
->type
, base
,
365 0, size
, v
->immediates
[0].val
, NULL
);
369 /* emit stream-out buffers: */
371 emit_tfbos(struct fd_context
*ctx
, const struct ir3_shader_variant
*v
,
372 struct fd_ringbuffer
*ring
)
374 /* streamout addresses after driver-params: */
375 uint32_t offset
= v
->constbase
.tfbo
;
376 if (v
->constlen
> offset
) {
377 struct fd_streamout_stateobj
*so
= &ctx
->streamout
;
378 struct ir3_stream_output_info
*info
= &v
->shader
->stream_output
;
380 uint32_t offsets
[params
];
381 struct pipe_resource
*prscs
[params
];
383 for (uint32_t i
= 0; i
< params
; i
++) {
384 struct pipe_stream_output_target
*target
= so
->targets
[i
];
387 offsets
[i
] = (so
->offsets
[i
] * info
->stride
[i
] * 4) +
388 target
->buffer_offset
;
389 prscs
[i
] = target
->buffer
;
396 ring_wfi(ctx
->batch
, ring
);
397 ctx
->emit_const_bo(ring
, v
->type
, true, offset
* 4, params
, prscs
, offsets
);
402 max_tf_vtx(struct fd_context
*ctx
, const struct ir3_shader_variant
*v
)
404 struct fd_streamout_stateobj
*so
= &ctx
->streamout
;
405 struct ir3_stream_output_info
*info
= &v
->shader
->stream_output
;
406 uint32_t maxvtxcnt
= 0x7fffffff;
408 if (ctx
->screen
->gpu_id
>= 500)
412 if (v
->shader
->stream_output
.num_outputs
== 0)
414 if (so
->num_targets
== 0)
417 /* offset to write to is:
419 * total_vtxcnt = vtxcnt + offsets[i]
420 * offset = total_vtxcnt * stride[i]
422 * offset = vtxcnt * stride[i] ; calculated in shader
423 * + offsets[i] * stride[i] ; calculated at emit_tfbos()
425 * assuming for each vtx, each target buffer will have data written
426 * up to 'offset + stride[i]', that leaves maxvtxcnt as:
428 * buffer_size = (maxvtxcnt * stride[i]) + stride[i]
429 * maxvtxcnt = (buffer_size - stride[i]) / stride[i]
431 * but shader is actually doing a less-than (rather than less-than-
432 * equal) check, so we can drop the -stride[i].
434 * TODO is assumption about `offset + stride[i]` legit?
436 for (unsigned i
= 0; i
< so
->num_targets
; i
++) {
437 struct pipe_stream_output_target
*target
= so
->targets
[i
];
438 unsigned stride
= info
->stride
[i
] * 4; /* convert dwords->bytes */
440 uint32_t max
= target
->buffer_size
/ stride
;
441 maxvtxcnt
= MIN2(maxvtxcnt
, max
);
449 emit_common_consts(const struct ir3_shader_variant
*v
, struct fd_ringbuffer
*ring
,
450 struct fd_context
*ctx
, enum pipe_shader_type t
)
452 enum fd_dirty_shader_state dirty
= ctx
->dirty_shader
[t
];
454 /* When we use CP_SET_DRAW_STATE objects to emit constant state,
455 * if we emit any of it we need to emit all. This is because
456 * we are using the same state-group-id each time for uniform
457 * state, and if previous update is never evaluated (due to no
458 * visible primitives in the current tile) then the new stateobj
459 * completely replaces the old one.
461 * Possibly if we split up different parts of the const state to
462 * different state-objects we could avoid this.
464 if (dirty
&& is_stateobj(ring
))
467 if (dirty
& (FD_DIRTY_SHADER_PROG
| FD_DIRTY_SHADER_CONST
)) {
468 struct fd_constbuf_stateobj
*constbuf
;
471 constbuf
= &ctx
->constbuf
[t
];
472 shader_dirty
= !!(dirty
& FD_DIRTY_SHADER_PROG
);
474 emit_user_consts(ctx
, v
, ring
, constbuf
);
475 emit_ubos(ctx
, v
, ring
, constbuf
);
477 emit_immediates(ctx
, v
, ring
);
480 if (dirty
& (FD_DIRTY_SHADER_PROG
| FD_DIRTY_SHADER_SSBO
)) {
481 struct fd_shaderbuf_stateobj
*sb
= &ctx
->shaderbuf
[t
];
482 emit_ssbo_sizes(ctx
, v
, ring
, sb
);
485 if (dirty
& (FD_DIRTY_SHADER_PROG
| FD_DIRTY_SHADER_IMAGE
)) {
486 struct fd_shaderimg_stateobj
*si
= &ctx
->shaderimg
[t
];
487 emit_image_dims(ctx
, v
, ring
, si
);
492 ir3_emit_vs_consts(const struct ir3_shader_variant
*v
, struct fd_ringbuffer
*ring
,
493 struct fd_context
*ctx
, const struct pipe_draw_info
*info
)
495 debug_assert(v
->type
== MESA_SHADER_VERTEX
);
497 emit_common_consts(v
, ring
, ctx
, PIPE_SHADER_VERTEX
);
499 /* emit driver params every time: */
500 /* TODO skip emit if shader doesn't use driver params to avoid WFI.. */
502 uint32_t offset
= v
->constbase
.driver_param
;
503 if (v
->constlen
> offset
) {
504 uint32_t vertex_params
[IR3_DP_VS_COUNT
] = {
505 [IR3_DP_VTXID_BASE
] = info
->index_size
?
506 info
->index_bias
: info
->start
,
507 [IR3_DP_VTXCNT_MAX
] = max_tf_vtx(ctx
, v
),
509 /* if no user-clip-planes, we don't need to emit the
512 uint32_t vertex_params_size
= 4;
514 if (v
->key
.ucp_enables
) {
515 struct pipe_clip_state
*ucp
= &ctx
->ucp
;
516 unsigned pos
= IR3_DP_UCP0_X
;
517 for (unsigned i
= 0; pos
<= IR3_DP_UCP7_W
; i
++) {
518 for (unsigned j
= 0; j
< 4; j
++) {
519 vertex_params
[pos
] = fui(ucp
->ucp
[i
][j
]);
523 vertex_params_size
= ARRAY_SIZE(vertex_params
);
526 ring_wfi(ctx
->batch
, ring
);
528 bool needs_vtxid_base
=
529 ir3_find_sysval_regid(v
, SYSTEM_VALUE_VERTEX_ID_ZERO_BASE
) != regid(63, 0);
531 /* for indirect draw, we need to copy VTXID_BASE from
532 * indirect-draw parameters buffer.. which is annoying
533 * and means we can't easily emit these consts in cmd
534 * stream so need to copy them to bo.
536 if (info
->indirect
&& needs_vtxid_base
) {
537 struct pipe_draw_indirect_info
*indirect
= info
->indirect
;
538 struct pipe_resource
*vertex_params_rsc
=
539 pipe_buffer_create(&ctx
->screen
->base
,
540 PIPE_BIND_CONSTANT_BUFFER
, PIPE_USAGE_STREAM
,
541 vertex_params_size
* 4);
542 unsigned src_off
= info
->indirect
->offset
;;
545 ptr
= fd_bo_map(fd_resource(vertex_params_rsc
)->bo
);
546 memcpy(ptr
, vertex_params
, vertex_params_size
* 4);
548 if (info
->index_size
) {
549 /* indexed draw, index_bias is 4th field: */
552 /* non-indexed draw, start is 3rd field: */
556 /* copy index_bias or start from draw params: */
557 ctx
->mem_to_mem(ring
, vertex_params_rsc
, 0,
558 indirect
->buffer
, src_off
, 1);
560 ctx
->emit_const(ring
, MESA_SHADER_VERTEX
, offset
* 4, 0,
561 vertex_params_size
, NULL
, vertex_params_rsc
);
563 pipe_resource_reference(&vertex_params_rsc
, NULL
);
565 ctx
->emit_const(ring
, MESA_SHADER_VERTEX
, offset
* 4, 0,
566 vertex_params_size
, vertex_params
, NULL
);
569 /* if needed, emit stream-out buffer addresses: */
570 if (vertex_params
[IR3_DP_VTXCNT_MAX
] > 0) {
571 emit_tfbos(ctx
, v
, ring
);
578 ir3_emit_fs_consts(const struct ir3_shader_variant
*v
, struct fd_ringbuffer
*ring
,
579 struct fd_context
*ctx
)
581 debug_assert(v
->type
== MESA_SHADER_FRAGMENT
);
583 emit_common_consts(v
, ring
, ctx
, PIPE_SHADER_FRAGMENT
);
586 /* emit compute-shader consts: */
588 ir3_emit_cs_consts(const struct ir3_shader_variant
*v
, struct fd_ringbuffer
*ring
,
589 struct fd_context
*ctx
, const struct pipe_grid_info
*info
)
591 debug_assert(gl_shader_stage_is_compute(v
->type
));
593 emit_common_consts(v
, ring
, ctx
, PIPE_SHADER_COMPUTE
);
595 /* emit compute-shader driver-params: */
596 uint32_t offset
= v
->constbase
.driver_param
;
597 if (v
->constlen
> offset
) {
598 ring_wfi(ctx
->batch
, ring
);
600 if (info
->indirect
) {
601 struct pipe_resource
*indirect
= NULL
;
602 unsigned indirect_offset
;
604 /* This is a bit awkward, but CP_LOAD_STATE.EXT_SRC_ADDR needs
605 * to be aligned more strongly than 4 bytes. So in this case
606 * we need a temporary buffer to copy NumWorkGroups.xyz to.
608 * TODO if previous compute job is writing to info->indirect,
609 * we might need a WFI.. but since we currently flush for each
610 * compute job, we are probably ok for now.
612 if (info
->indirect_offset
& 0xf) {
613 indirect
= pipe_buffer_create(&ctx
->screen
->base
,
614 PIPE_BIND_COMMAND_ARGS_BUFFER
, PIPE_USAGE_STREAM
,
618 ctx
->mem_to_mem(ring
, indirect
, 0, info
->indirect
,
619 info
->indirect_offset
, 3);
621 pipe_resource_reference(&indirect
, info
->indirect
);
622 indirect_offset
= info
->indirect_offset
;
625 ctx
->emit_const(ring
, MESA_SHADER_COMPUTE
, offset
* 4,
626 indirect_offset
, 4, NULL
, indirect
);
628 pipe_resource_reference(&indirect
, NULL
);
630 uint32_t compute_params
[IR3_DP_CS_COUNT
] = {
631 [IR3_DP_NUM_WORK_GROUPS_X
] = info
->grid
[0],
632 [IR3_DP_NUM_WORK_GROUPS_Y
] = info
->grid
[1],
633 [IR3_DP_NUM_WORK_GROUPS_Z
] = info
->grid
[2],
634 [IR3_DP_LOCAL_GROUP_SIZE_X
] = info
->block
[0],
635 [IR3_DP_LOCAL_GROUP_SIZE_Y
] = info
->block
[1],
636 [IR3_DP_LOCAL_GROUP_SIZE_Z
] = info
->block
[2],
639 ctx
->emit_const(ring
, MESA_SHADER_COMPUTE
, offset
* 4, 0,
640 ARRAY_SIZE(compute_params
), compute_params
, NULL
);