const bool ubo_progress = !key && OPT(s, ir3_nir_analyze_ubo_ranges, shader);
const bool idiv_progress = OPT(s, nir_lower_idiv, nir_lower_idiv_fast);
/* UBO offset lowering has to come after we've decided what will be left as load_ubo */
- OPT_V(s, ir3_nir_lower_io_offsets);
+ OPT_V(s, ir3_nir_lower_io_offsets, shader->compiler->gpu_id);
if (ubo_progress || idiv_progress)
ir3_optimize_loop(s);
const_state->num_driver_params = MAX2(const_state->num_driver_params, IR3_DP_VTXCNT_MAX + 1);
}
- const_state->num_ubos = nir->info.num_ubos;
+ /* On a6xx, we use UBO descriptors and LDC instead of UBO pointers in the
+ * constbuf.
+ */
+ if (compiler->gpu_id >= 600)
+ shader->num_ubos = nir->info.num_ubos;
+ else
+ const_state->num_ubos = nir->info.num_ubos;
/* num_driver_params is scalar, align to vec4: */
const_state->num_driver_params = align(const_state->num_driver_params, 4);
}
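
As a hedged sketch of what this split means for consumers: the hypothetical helper below (name and placement are illustrative, not from the patch) shows which field the rest of the driver would read for a given generation, assuming the ir3_shader / ir3_const_state fields touched above.

/* Hypothetical helper (not part of the patch): on a6xx+ the UBO count
 * drives ST6_UBO descriptor uploads and is tracked on the shader, while
 * earlier GPUs keep UBO pointers in the const file and so track the
 * count in the const state.
 */
static inline unsigned
ir3_ubo_count_sketch(const struct ir3_shader *shader,
                     const struct ir3_const_state *const_state,
                     int gpu_id)
{
   return (gpu_id >= 600) ? shader->num_ubos : const_state->num_ubos;
}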
static bool
-lower_offset_for_ubo(nir_intrinsic_instr *intrinsic, nir_builder *b)
+lower_offset_for_ubo(nir_intrinsic_instr *intrinsic, nir_builder *b, int gpu_id)
{
- /* We only need to lower offset if using LDC. Currently, we only use LDC
- * in the bindless mode. Also, LDC is introduced on A6xx, but currently we
- * only use bindless in turnip which is A6xx only.
- *
- * TODO: We should be using LDC always on A6xx+.
+ /* We only need to lower the offset if we're using LDC, which takes its
+ * offset in vec4 units and has the start component baked into the
+ * instruction.
*/
- if (!ir3_bindless_resource(intrinsic->src[0]))
+ if (gpu_id < 600)
return false;
/* TODO handle other bitsizes, including non-dword-aligned loads */
}
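
The vec4-unit addressing described in the comment above can be illustrated with a small standalone helper. This is a hedged sketch of the arithmetic only (the helper name is hypothetical and it assumes a dword-aligned 32-bit load), not the pass's actual NIR rewrite:

#include <assert.h>
#include <stdint.h>

/* Hypothetical illustration: LDC addresses the UBO in vec4 (16-byte) units
 * and encodes which dword of that vec4 to start at as a component on the
 * instruction, so a dword-aligned byte offset splits into a vec4 index and
 * a start component.
 */
static inline void
ldc_split_byte_offset(uint32_t byte_offset,
                      uint32_t *vec4_index, uint32_t *start_component)
{
   assert((byte_offset & 0x3) == 0);            /* dword-aligned loads only */
   *vec4_index      = byte_offset >> 4;         /* 16 bytes per vec4 */
   *start_component = (byte_offset >> 2) & 0x3; /* dword within that vec4 */
}

Keeping loads dword-aligned is what lets the start component select a dword within the fetched vec4, which is why the TODO above calls out non-dword-aligned loads and other bit sizes as future work.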
static bool
-lower_io_offsets_block(nir_block *block, nir_builder *b, void *mem_ctx)
+lower_io_offsets_block(nir_block *block, nir_builder *b, void *mem_ctx, int gpu_id)
{
bool progress = false;
/* UBO */
if (intr->intrinsic == nir_intrinsic_load_ubo) {
- progress |= lower_offset_for_ubo(intr, b);
+ progress |= lower_offset_for_ubo(intr, b, gpu_id);
continue;
}
}
static bool
-lower_io_offsets_func(nir_function_impl *impl)
+lower_io_offsets_func(nir_function_impl *impl, int gpu_id)
{
void *mem_ctx = ralloc_parent(impl);
nir_builder b;
bool progress = false;
nir_foreach_block_safe (block, impl) {
- progress |= lower_io_offsets_block(block, &b, mem_ctx);
+ progress |= lower_io_offsets_block(block, &b, mem_ctx, gpu_id);
}
if (progress) {
}
bool
-ir3_nir_lower_io_offsets(nir_shader *shader)
+ir3_nir_lower_io_offsets(nir_shader *shader, int gpu_id)
{
bool progress = false;
nir_foreach_function (function, shader) {
if (function->impl)
- progress |= lower_io_offsets_func(function->impl);
+ progress |= lower_io_offsets_func(function->impl, gpu_id);
}
return progress;
}
}
-static void
-fd6_emit_const_bo(struct fd_ringbuffer *ring, gl_shader_stage type,
- uint32_t regid, uint32_t num, struct pipe_resource **prscs, uint32_t *offsets)
-{
- uint32_t anum = align(num, 2);
- uint32_t i;
-
- debug_assert((regid % 4) == 0);
-
- OUT_PKT7(ring, fd6_stage2opcode(type), 3 + (2 * anum));
- OUT_RING(ring, CP_LOAD_STATE6_0_DST_OFF(regid/4) |
- CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS)|
- CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
- CP_LOAD_STATE6_0_STATE_BLOCK(fd6_stage2shadersb(type)) |
- CP_LOAD_STATE6_0_NUM_UNIT(anum/2));
- OUT_RING(ring, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
- OUT_RING(ring, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
-
- for (i = 0; i < num; i++) {
- if (prscs[i]) {
- OUT_RELOC(ring, fd_resource(prscs[i])->bo, offsets[i], 0, 0);
- } else {
- OUT_RING(ring, 0xbad00000 | (i << 16));
- OUT_RING(ring, 0xbad00000 | (i << 16));
- }
- }
-
- for (; i < anum; i++) {
- OUT_RING(ring, 0xffffffff);
- OUT_RING(ring, 0xffffffff);
- }
-}
-
static bool
is_stateobj(struct fd_ringbuffer *ring)
{
const struct ir3_shader_variant *v, uint32_t dst_offset,
uint32_t num, struct pipe_resource **prscs, uint32_t *offsets)
{
- /* TODO inline this */
- assert(dst_offset + num < v->constlen * 4);
- fd6_emit_const_bo(ring, v->type, dst_offset, num, prscs, offsets);
+ unreachable("shouldn't be called on a6xx");
}
static void
fd6_emit_take_group(emit, constobj, FD6_GROUP_PRIMITIVE_PARAMS, ENABLE_ALL);
}
+static void
+fd6_emit_ubos(const struct ir3_shader_variant *v,
+ struct fd_ringbuffer *ring, struct fd_constbuf_stateobj *constbuf)
+{
+ if (!v->shader->num_ubos)
+ return;
+
+ int num_ubos = v->shader->num_ubos;
+
+ OUT_PKT7(ring, fd6_stage2opcode(v->type), 3 + (2 * num_ubos));
+ OUT_RING(ring, CP_LOAD_STATE6_0_DST_OFF(0) |
+ CP_LOAD_STATE6_0_STATE_TYPE(ST6_UBO)|
+ CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
+ CP_LOAD_STATE6_0_STATE_BLOCK(fd6_stage2shadersb(v->type)) |
+ CP_LOAD_STATE6_0_NUM_UNIT(num_ubos));
+ OUT_RING(ring, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
+ OUT_RING(ring, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
+
+ for (int i = 0; i < num_ubos; i++) {
+ /* Note: gallium constbuf 0 is always lowered to the hardware constbuf,
+ * so the UBO load indices seen by the shader have already been
+ * decremented by one.
+ */
+ struct pipe_constant_buffer *cb = &constbuf->cb[i + 1];
+ if (cb->buffer) {
+ int size_vec4s = DIV_ROUND_UP(cb->buffer_size, 16);
+ OUT_RELOC(ring, fd_resource(cb->buffer)->bo,
+ cb->buffer_offset,
+ (uint64_t)A6XX_UBO_1_SIZE(size_vec4s) << 32,
+ 0);
+ } else {
+ OUT_RING(ring, 0xbad00000 | (i << 16));
+ OUT_RING(ring, 0xbad00000 | (i << 16));
+ }
+ }
+}
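
As a hedged illustration of the index mapping noted in the loop above (the helper name is hypothetical and not part of the patch): shader-side UBO slot i, and descriptor slot i in the ST6_UBO upload, corresponds to gallium constant buffer slot i + 1.

/* Hypothetical helper spelling out the mapping used by fd6_emit_ubos():
 * constbuf 0 is pushed to the constant file and the shader's UBO indices
 * are pre-decremented, so descriptor slot i reads gallium slot i + 1.
 */
static inline struct pipe_constant_buffer *
ubo_slot_to_constbuf(struct fd_constbuf_stateobj *constbuf, unsigned ubo_idx)
{
   return &constbuf->cb[ubo_idx + 1];
}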
+
static void
emit_user_consts(struct fd6_emit *emit)
{
if (!variants[i])
continue;
ir3_emit_user_consts(ctx->screen, variants[i], constobj, &ctx->constbuf[types[i]]);
- ir3_emit_ubos(ctx->screen, variants[i], constobj, &ctx->constbuf[types[i]]);
+ fd6_emit_ubos(variants[i], constobj, &ctx->constbuf[types[i]]);
}
fd6_emit_take_group(emit, constobj, FD6_GROUP_CONST, ENABLE_ALL);
struct fd_context *ctx, const struct pipe_grid_info *info)
{
ir3_emit_cs_consts(v, ring, ctx, info);
+ fd6_emit_ubos(v, ring, &ctx->constbuf[PIPE_SHADER_COMPUTE]);
}
void