freedreno/a3xx: add UBO support
author Ilia Mirkin <imirkin@alum.mit.edu>
Tue, 31 Mar 2015 15:51:00 +0000 (11:51 -0400)
committer Rob Clark <robclark@freedesktop.org>
Sun, 5 Apr 2015 20:36:35 +0000 (16:36 -0400)
Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
docs/relnotes/10.6.0.html
src/gallium/drivers/freedreno/a3xx/fd3_emit.c
src/gallium/drivers/freedreno/freedreno_screen.c
src/gallium/drivers/freedreno/ir3/ir3.c
src/gallium/drivers/freedreno/ir3/ir3_compiler.c
src/gallium/drivers/freedreno/ir3/ir3_legalize.c
src/gallium/drivers/freedreno/ir3/ir3_shader.c

index 2378e3d52402813f62fcb64ae59ea03af9d38428..f69266a29988a5fe004fe40ac510c42ef26af792 100644 (file)
@@ -50,6 +50,7 @@ Note: some of the new features are only available with certain drivers.
 <li>GL_ARB_gpu_shader_fp64 on nvc0, softpipe</li>
 <li>GL_ARB_instanced_arrays on freedreno</li>
 <li>GL_ARB_pipeline_statistics_query on i965, nv50, nvc0, r600, radeonsi, softpipe</li>
+<li>GL_ARB_uniform_buffer_object on freedreno</li>
 <li>GL_EXT_draw_buffers2 on freedreno</li>
 <li>GL_ARB_clip_control on i965</li>
 </ul>
index 5fd31f50daf4e0d8854f478c2b50de14462a2ee8..f961fc07585efe4e450297a2dcefa1884d0be2d4 100644 (file)
@@ -87,11 +87,12 @@ static void
 emit_constants(struct fd_ringbuffer *ring,
                enum adreno_state_block sb,
                struct fd_constbuf_stateobj *constbuf,
-               struct ir3_shader_variant *shader)
+               struct ir3_shader_variant *shader,
+               bool emit_immediates)
 {
        uint32_t enabled_mask = constbuf->enabled_mask;
-       uint32_t first_immediate;
-       uint32_t base = 0;
+       uint32_t max_const;
+       int i;
 
        // XXX TODO only emit dirty consts.. but we need to keep track if
        // they are clobbered by a clear, gmem2mem, or mem2gmem..
@@ -102,42 +103,57 @@ emit_constants(struct fd_ringbuffer *ring,
         * than first_immediate.  In that case truncate the user consts
         * early to avoid HLSQ lockup caused by writing too many consts
         */
-       first_immediate = MIN2(shader->first_immediate, shader->constlen);
+       max_const = MIN2(shader->first_driver_param, shader->constlen);
 
        /* emit user constants: */
-       while (enabled_mask) {
-               unsigned index = ffs(enabled_mask) - 1;
+       if (enabled_mask & 1) {
+               const unsigned index = 0;
                struct pipe_constant_buffer *cb = &constbuf->cb[index];
                unsigned size = align(cb->buffer_size, 4) / 4; /* size in dwords */
 
                // I expect that size should be a multiple of vec4's:
                assert(size == align(size, 4));
 
-               /* gallium could leave const buffers bound above what the
-                * current shader uses.. don't let that confuse us.
+               /* the user const buffer may extend past max_const (in
+                * vec4 units), so clamp its size (in dwords) to that:
                 */
-               if (base >= (4 * first_immediate))
-                       break;
+               size = MIN2(size, 4 * max_const);
 
-               if (constbuf->dirty_mask & (1 << index)) {
-                       /* and even if the start of the const buffer is before
-                        * first_immediate, the end may not be:
-                        */
-                       size = MIN2(size, (4 * first_immediate) - base);
-                       fd3_emit_constant(ring, sb, base,
-                                       cb->buffer_offset, size,
-                                       cb->user_buffer, cb->buffer);
+               if (size && constbuf->dirty_mask & (1 << index)) {
+                       fd3_emit_constant(ring, sb, 0,
+                                                         cb->buffer_offset, size,
+                                                         cb->user_buffer, cb->buffer);
                        constbuf->dirty_mask &= ~(1 << index);
                }
 
-               base += size;
                enabled_mask &= ~(1 << index);
        }
 
+       if (shader->constlen > shader->first_driver_param) {
+               uint32_t params = MIN2(4, shader->constlen - shader->first_driver_param);
+               /* emit ubos: */
+               OUT_PKT3(ring, CP_LOAD_STATE, 2 + params * 4);
+               OUT_RING(ring, CP_LOAD_STATE_0_DST_OFF(shader->first_driver_param * 2) |
+                                CP_LOAD_STATE_0_STATE_SRC(SS_DIRECT) |
+                                CP_LOAD_STATE_0_STATE_BLOCK(sb) |
+                                CP_LOAD_STATE_0_NUM_UNIT(params * 2));
+               OUT_RING(ring, CP_LOAD_STATE_1_EXT_SRC_ADDR(0) |
+                                CP_LOAD_STATE_1_STATE_TYPE(ST_CONSTANTS));
+
+               for (i = 1; i <= params * 4; i++) {
+                       struct pipe_constant_buffer *cb = &constbuf->cb[i];
+                       assert(!cb->user_buffer);
+                       if ((enabled_mask & (1 << i)) && cb->buffer)
+                               OUT_RELOC(ring, fd_resource(cb->buffer)->bo, cb->buffer_offset, 0, 0);
+                       else
+                               OUT_RING(ring, 0xbad00000 | ((i - 1) << 16));
+               }
+       }
+
        /* emit shader immediates: */
-       if (shader) {
+       if (shader && emit_immediates) {
                int size = shader->immediates_count;
-               base = shader->first_immediate;
+               uint32_t base = shader->first_immediate;
 
                /* truncate size to avoid writing constants that shader
                 * does not use:
@@ -619,11 +635,11 @@ fd3_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring,
                fd_wfi(ctx, ring);
                emit_constants(ring,  SB_VERT_SHADER,
                                &ctx->constbuf[PIPE_SHADER_VERTEX],
-                               (emit->prog->dirty & FD_SHADER_DIRTY_VP) ? vp : NULL);
+                               vp, emit->prog->dirty & FD_SHADER_DIRTY_VP);
                if (!emit->key.binning_pass) {
                        emit_constants(ring, SB_FRAG_SHADER,
                                        &ctx->constbuf[PIPE_SHADER_FRAGMENT],
-                                       (emit->prog->dirty & FD_SHADER_DIRTY_FP) ? fp : NULL);
+                                       fp, emit->prog->dirty & FD_SHADER_DIRTY_FP);
                }
        }
 
@@ -635,8 +651,9 @@ fd3_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring,
                        0,
                        0
                };
-               if (vp->constlen > vp->first_driver_param) {
-                       fd3_emit_constant(ring, SB_VERT_SHADER, vp->first_driver_param * 4,
+               if (vp->constlen >= vp->first_driver_param + 4) {
+                       fd3_emit_constant(ring, SB_VERT_SHADER,
+                                                         (vp->first_driver_param + 4) * 4,
                                                          0, 4, vertex_params, NULL);
                }
        }
index fe724442c0720f3e9449e632a90bc92e2b1721d6..66fe0e571cf8013328ee7a4608c3ab13871369c1 100644 (file)
@@ -356,9 +356,9 @@ fd_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
                 * split between VS and FS.  Use lower limit of 256 to
                 * avoid getting into impossible situations:
                 */
-               return ((is_a3xx(screen) || is_a4xx(screen)) ? 256 : 64) * sizeof(float[4]);
+               return ((is_a3xx(screen) || is_a4xx(screen)) ? 4096 : 64) * sizeof(float[4]);
        case PIPE_SHADER_CAP_MAX_CONST_BUFFERS:
-               return 1;
+               return is_a3xx(screen) ? 16 : 1;
        case PIPE_SHADER_CAP_MAX_PREDS:
                return 0; /* nothing uses this */
        case PIPE_SHADER_CAP_TGSI_CONT_SUPPORTED:
index 42a45776211e7ae19695f4f7f36b5cce96e68351..284c6559eb1a745ba5d6e69597725bed9180e673 100644 (file)
@@ -487,7 +487,7 @@ static int emit_cat6(struct ir3_instruction *instr, void *ptr,
 
        iassert(instr->regs_count >= 2);
 
-       if (instr->cat6.offset) {
+       if (instr->cat6.offset || instr->opc == OPC_LDG) {
                instr_cat6a_t *cat6a = ptr;
 
                cat6->has_off = true;
index 38df8b5fdf4f20d228eafe19bad7d1d25491c0fd..43f4c955ac065ad5a673a658a7a223dcb72dff8b 100644 (file)
@@ -151,6 +151,7 @@ static void vectorize(struct ir3_compile_context *ctx,
 static void create_mov(struct ir3_compile_context *ctx,
                struct tgsi_dst_register *dst, struct tgsi_src_register *src);
 static type_t get_ftype(struct ir3_compile_context *ctx);
+static type_t get_utype(struct ir3_compile_context *ctx);
 
 static unsigned setup_arrays(struct ir3_compile_context *ctx, unsigned file, unsigned i)
 {
@@ -252,7 +253,7 @@ compile_init(struct ir3_compile_context *ctx, struct ir3_shader_variant *so,
         * the assembler what the max addr reg value can be:
         */
        if (info->indirect_files & FM(CONSTANT))
-               so->constlen = ctx->info.file_max[TGSI_FILE_CONSTANT] + 1;
+               so->constlen = MIN2(255, ctx->info.const_file_max[0] + 1);
 
        i = 0;
        i += setup_arrays(ctx, TGSI_FILE_INPUT, i);
@@ -261,12 +262,13 @@ compile_init(struct ir3_compile_context *ctx, struct ir3_shader_variant *so,
        /* any others? we don't track arrays for const..*/
 
        /* Immediates go after constants: */
-       if (so->type == SHADER_VERTEX) {
-               so->first_driver_param = info->file_max[TGSI_FILE_CONSTANT] + 1;
-               so->first_immediate = so->first_driver_param + 1;
-       } else {
-               so->first_immediate = info->file_max[TGSI_FILE_CONSTANT] + 1;
-       }
+       so->first_immediate = so->first_driver_param =
+               info->const_file_max[0] + 1;
+       /* 1 unit for the vertex id base */
+       if (so->type == SHADER_VERTEX)
+               so->first_immediate++;
+       /* 4 (vec4) units for ubo base addresses */
+       so->first_immediate += 4;
        ctx->immediate_idx = 4 * (ctx->info.file_max[TGSI_FILE_IMMEDIATE] + 1);
 
        ret = tgsi_parse_init(&ctx->parser, ctx->tokens);
@@ -717,6 +719,80 @@ ssa_src(struct ir3_compile_context *ctx, struct ir3_register *reg,
                reg->offset = regid(off, chan);
 
                instr = array_fanin(ctx, aid, src->File);
+       } else if (src->File == TGSI_FILE_CONSTANT && src->Dimension) {
+               const struct tgsi_full_src_register *fsrc = (const void *)src;
+               struct ir3_instruction *temp = NULL;
+               int ubo_regid = regid(ctx->so->first_driver_param, 0) +
+                       fsrc->Dimension.Index - 1;
+               int offset = 0;
+
+               /* We don't handle indirect UBO array accesses... yet. */
+               compile_assert(ctx, !fsrc->Dimension.Indirect);
+               /* UBOs start at index 1. */
+               compile_assert(ctx, fsrc->Dimension.Index > 0);
+
+               if (src->Indirect) {
+                       /* In case of an indirect index, it will have been loaded into an
+                        * address register. There will be a sequence of
+                        *
+                        *   shl.b x, val, 2
+                        *   mova a0, x
+                        *
+                        * We rely on this sequence to get the original val out and shift
+                        * it by 4, since we're dealing in vec4 units.
+                        */
+                       compile_assert(ctx, ctx->block->address);
+                       compile_assert(ctx, ctx->block->address->regs[1]->instr->opc ==
+                                                  OPC_SHL_B);
+
+                       temp = instr = instr_create(ctx, 2, OPC_SHL_B);
+                       ir3_reg_create(instr, 0, 0);
+                       ir3_reg_create(instr, 0, IR3_REG_HALF | IR3_REG_SSA)->instr =
+                               ctx->block->address->regs[1]->instr->regs[1]->instr;
+                       ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = 4;
+               } else if (src->Index >= 64) {
+                       /* Otherwise it's a plain index (in vec4 units). Move it into a
+                        * register.
+                        */
+                       temp = instr = instr_create(ctx, 1, 0);
+                       instr->cat1.src_type = get_utype(ctx);
+                       instr->cat1.dst_type = get_utype(ctx);
+                       ir3_reg_create(instr, 0, 0);
+                       ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = src->Index * 16;
+               } else {
+                       /* The offset is small enough to fit into the ldg instruction
+                        * directly.
+                        */
+                       offset = src->Index * 16;
+               }
+
+               if (temp) {
+                       /* If there was an offset (most common), add it to the buffer
+                        * address.
+                        */
+                       instr = instr_create(ctx, 2, OPC_ADD_S);
+                       ir3_reg_create(instr, 0, 0);
+                       ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = temp;
+                       ir3_reg_create(instr, ubo_regid, IR3_REG_CONST);
+               } else {
+                       /* Otherwise just load the buffer address directly */
+                       instr = instr_create(ctx, 1, 0);
+                       instr->cat1.src_type = get_utype(ctx);
+                       instr->cat1.dst_type = get_utype(ctx);
+                       ir3_reg_create(instr, 0, 0);
+                       ir3_reg_create(instr, ubo_regid, IR3_REG_CONST);
+               }
+
+               temp = instr;
+
+               instr = instr_create(ctx, 6, OPC_LDG);
+               instr->cat6.type = TYPE_U32;
+               instr->cat6.offset = offset + chan * 4;
+               ir3_reg_create(instr, 0, 0);
+               ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = temp;
+               ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = 1;
+
+               reg->flags &= ~(IR3_REG_RELATIV | IR3_REG_CONST);
        } else {
                /* normal case (not relative addressed GPR) */
                instr = ssa_instr_get(ctx, src->File, regid(src->Index, chan));
@@ -3183,7 +3259,8 @@ decl_sv(struct ir3_compile_context *ctx, struct tgsi_full_declaration *decl)
                instr->cat1.src_type = get_stype(ctx);
                instr->cat1.dst_type = get_stype(ctx);
                ir3_reg_create(instr, 0, 0);
-               ir3_reg_create(instr, regid(so->first_driver_param, 0), IR3_REG_CONST);
+               ir3_reg_create(instr, regid(so->first_driver_param + 4, 0),
+                                          IR3_REG_CONST);
                break;
        case TGSI_SEMANTIC_INSTANCEID:
                ctx->instance_id = instr = create_input(ctx->block, NULL, r);
index db501e7a51c9ead349bc1e376b50997615f0473f..2455f7e4efc513b0ba337f0f8c7baf83ea7ed5e2 100644 (file)
@@ -175,7 +175,7 @@ static void legalize(struct ir3_legalize_ctx *ctx)
                /* both tex/sfu appear to not always immediately consume
                 * their src register(s):
                 */
-               if (is_tex(n) || is_sfu(n)) {
+               if (is_tex(n) || is_sfu(n) || is_mem(n)) {
                        foreach_src(reg, n) {
                                if (reg_gpr(reg))
                                        regmask_set(&needs_ss_war, reg);
index b1dff3818130afac39a78a35c1bc48a7f271d2aa..0cf357e17d8200ee49f1177a778ade79021971d9 100644 (file)
@@ -116,7 +116,7 @@ void * ir3_shader_assemble(struct ir3_shader_variant *v, uint32_t gpu_id)
         * the compiler (to worst-case value) since we don't know in
         * the assembler what the max addr reg value can be:
         */
-       v->constlen = MAX2(v->constlen, v->info.max_const + 1);
+       v->constlen = MIN2(255, MAX2(v->constlen, v->info.max_const + 1));
 
        fixup_regfootprint(v);