freedreno: small bit of cleanup about max rendertargets
[mesa.git] / src / gallium / drivers / freedreno / a4xx / fd4_emit.c
index 0e00e387f14ea4eb3fb6e07003b00ace818982d1..df96601c747c0c19de3d886d8cf34d794863d62f 100644 (file)
 #include "fd4_format.h"
 #include "fd4_zsa.h"
 
+static const enum adreno_state_block sb[] = {   /* lookup table: enum shader_t -> state block passed to CP_LOAD_STATE_0_STATE_BLOCK() */
+       [SHADER_VERTEX]   = SB_VERT_SHADER,
+       [SHADER_FRAGMENT] = SB_FRAG_SHADER,
+};   /* only the vertex and fragment stages have entries */
+
 /* regid:          base const register
  * prsc or dwords: buffer containing constant values
  * sizedwords:     size of const value buffer
  */
 void
-fd4_emit_constant(struct fd_ringbuffer *ring,
-               enum adreno_state_block sb,
+fd4_emit_const(struct fd_ringbuffer *ring, enum shader_t type,
                uint32_t regid, uint32_t offset, uint32_t sizedwords,
                const uint32_t *dwords, struct pipe_resource *prsc)
 {
        uint32_t i, sz;
        enum adreno_state_src src;
 
+       debug_assert((regid % 4) == 0);
+       debug_assert((sizedwords % 4) == 0);
+
        if (prsc) {
                sz = 0;
                src = 0x2;  // TODO ??
@@ -67,7 +74,7 @@ fd4_emit_constant(struct fd_ringbuffer *ring,
        OUT_PKT3(ring, CP_LOAD_STATE, 2 + sz);
        OUT_RING(ring, CP_LOAD_STATE_0_DST_OFF(regid/4) |
                        CP_LOAD_STATE_0_STATE_SRC(src) |
-                       CP_LOAD_STATE_0_STATE_BLOCK(sb) |
+                       CP_LOAD_STATE_0_STATE_BLOCK(sb[type]) |
                        CP_LOAD_STATE_0_NUM_UNIT(sizedwords/4));
        if (prsc) {
                struct fd_bo *bo = fd_resource(prsc)->bo;
@@ -84,73 +91,31 @@ fd4_emit_constant(struct fd_ringbuffer *ring,
 }
 
 static void
-emit_constants(struct fd_ringbuffer *ring,
-               enum adreno_state_block sb,
-               struct fd_constbuf_stateobj *constbuf,
-               struct ir3_shader_variant *shader)
+fd4_emit_const_bo(struct fd_ringbuffer *ring, enum shader_t type, boolean write,
+               uint32_t regid, uint32_t num, struct fd_bo **bos, uint32_t *offsets)   /* emits one CP_LOAD_STATE packet with num buffer slots */
 {
-       uint32_t enabled_mask = constbuf->enabled_mask;
-       uint32_t first_immediate;
-       uint32_t base = 0;
-
-       // XXX TODO only emit dirty consts.. but we need to keep track if
-       // they are clobbered by a clear, gmem2mem, or mem2gmem..
-       constbuf->dirty_mask = enabled_mask;
-
-       /* in particular, with binning shader we may end up with unused
-        * consts, ie. we could end up w/ constlen that is smaller
-        * than first_immediate.  In that case truncate the user consts
-        * early to avoid HLSQ lockup caused by writing too many consts
-        */
-       first_immediate = MIN2(shader->first_immediate, shader->constlen);
-
-       /* emit user constants: */
-       while (enabled_mask) {
-               unsigned index = ffs(enabled_mask) - 1;
-               struct pipe_constant_buffer *cb = &constbuf->cb[index];
-               unsigned size = align(cb->buffer_size, 4) / 4; /* size in dwords */
-
-               // I expect that size should be a multiple of vec4's:
-               assert(size == align(size, 4));
-
-               /* gallium could leave const buffers bound above what the
-                * current shader uses.. don't let that confuse us.
-                */
-               if (base >= (4 * first_immediate))
-                       break;
-
-               if (constbuf->dirty_mask & (1 << index)) {
-                       /* and even if the start of the const buffer is before
-                        * first_immediate, the end may not be:
-                        */
-                       size = MIN2(size, (4 * first_immediate) - base);
-                       fd4_emit_constant(ring, sb, base,
-                                       cb->buffer_offset, size,
-                                       cb->user_buffer, cb->buffer);
-                       constbuf->dirty_mask &= ~(1 << index);
-               }
-
-               base += size;
-               enabled_mask &= ~(1 << index);
-       }
+       uint32_t i;   /* slot index into bos[] and offsets[] */
 
-       /* emit shader immediates: */
-       if (shader) {
-               int size = shader->immediates_count;
-               base = shader->first_immediate;
+       debug_assert((regid % 4) == 0);   /* regid is divided by 4 for DST_OFF below */
+       debug_assert((num % 4) == 0);   /* num is divided by 4 for NUM_UNIT below */
 
-               /* truncate size to avoid writing constants that shader
-                * does not use:
-                */
-               size = MIN2(size + base, shader->constlen) - base;
-
-               /* convert out of vec4: */
-               base *= 4;
-               size *= 4;
-
-               if (size > 0) {
-                       fd4_emit_constant(ring, sb, base,
-                               0, size, shader->immediates[0].val, NULL);
+       OUT_PKT3(ring, CP_LOAD_STATE, 2 + num);   /* 2 header entries below plus one entry per slot (presumably one dword each -- confirm against OUT_RELOC) */
+       OUT_RING(ring, CP_LOAD_STATE_0_DST_OFF(regid/4) |
+                       CP_LOAD_STATE_0_STATE_SRC(SS_DIRECT) |
+                       CP_LOAD_STATE_0_STATE_BLOCK(sb[type]) |
+                       CP_LOAD_STATE_0_NUM_UNIT(num/4));
+       OUT_RING(ring, CP_LOAD_STATE_1_EXT_SRC_ADDR(0) |
+                       CP_LOAD_STATE_1_STATE_TYPE(ST_CONSTANTS));
+
+       for (i = 0; i < num; i++) {
+               if (bos[i]) {
+                       if (write) {   /* the caller's write flag selects OUT_RELOCW over OUT_RELOC */
+                               OUT_RELOCW(ring, bos[i], offsets[i], 0, 0);
+                       } else {
+                               OUT_RELOC(ring, bos[i], offsets[i], 0, 0);
+                       }
+               } else {
+                       OUT_RING(ring, 0xbad00000 | (i << 16));   /* NULL slot: placeholder value carrying the slot index shifted left by 16 */
                 }
         }
 }
@@ -207,15 +172,19 @@ emit_textures(struct fd_context *ctx, struct fd_ringbuffer *ring,
                        const struct fd4_pipe_sampler_view *view = tex->textures[i] ?
                                        fd4_pipe_sampler_view(tex->textures[i]) :
                                        &dummy_view;
-                       struct fd_resource *rsc = view->tex_resource;
                        unsigned start = view->base.u.tex.first_level;
-                       uint32_t offset = fd_resource_offset(rsc, start, 0);
 
                        OUT_RING(ring, view->texconst0);
                        OUT_RING(ring, view->texconst1);
                        OUT_RING(ring, view->texconst2);
                        OUT_RING(ring, view->texconst3);
-                       OUT_RELOC(ring, rsc->bo, offset, view->textconst4, 0);
+                       if (view->base.texture) {
+                               struct fd_resource *rsc = fd_resource(view->base.texture);
+                               uint32_t offset = fd_resource_offset(rsc, start, 0);
+                               OUT_RELOC(ring, rsc->bo, offset, view->textconst4, 0);
+                       } else {
+                               OUT_RING(ring, 0x00000000);
+                       }
                        OUT_RING(ring, 0x00000000);
                        OUT_RING(ring, 0x00000000);
                        OUT_RING(ring, 0x00000000);
@@ -278,21 +247,35 @@ fd4_emit_gmem_restore_tex(struct fd_ringbuffer *ring, struct pipe_surface *psurf
 void
 fd4_emit_vertex_bufs(struct fd_ringbuffer *ring, struct fd4_emit *emit)
 {
-       uint32_t i, j, last = 0;
+       int32_t i, j, last = -1;
        uint32_t total_in = 0;
        const struct fd_vertex_state *vtx = emit->vtx;
        struct ir3_shader_variant *vp = fd4_emit_get_vp(emit);
-       unsigned n = MIN2(vtx->vtx->num_elements, vp->inputs_count);
+       unsigned vertex_regid = regid(63, 0);
+       unsigned instance_regid = regid(63, 0);
+       unsigned vtxcnt_regid = regid(63, 0);
+
+       for (i = 0; i < vp->inputs_count; i++) {
+               uint8_t semantic = sem2name(vp->inputs[i].semantic);
+               if (semantic == TGSI_SEMANTIC_VERTEXID_NOBASE)
+                       vertex_regid = vp->inputs[i].regid;
+               else if (semantic == TGSI_SEMANTIC_INSTANCEID)
+                       instance_regid = vp->inputs[i].regid;
+               else if (semantic == IR3_SEMANTIC_VTXCNT)
+                       vtxcnt_regid = vp->inputs[i].regid;
+               else if ((i < vtx->vtx->num_elements) && vp->inputs[i].compmask)
+                       last = i;
+       }
 
        /* hw doesn't like to be configured for zero vbo's, it seems: */
-       if (vtx->vtx->num_elements == 0)
+       if ((vtx->vtx->num_elements == 0) &&
+                       (vertex_regid == regid(63, 0)) &&
+                       (instance_regid == regid(63, 0)) &&
+                       (vtxcnt_regid == regid(63, 0)))
                return;
 
-       for (i = 0; i < n; i++)
-               if (vp->inputs[i].compmask)
-                       last = i;
-
        for (i = 0, j = 0; i <= last; i++) {
+               assert(sem2name(vp->inputs[i].semantic) == 0);
                if (vp->inputs[i].compmask) {
                        struct pipe_vertex_element *elem = &vtx->vtx->pipe[i];
                        const struct pipe_vertex_buffer *vb =
@@ -300,7 +283,11 @@ fd4_emit_vertex_bufs(struct fd_ringbuffer *ring, struct fd4_emit *emit)
                        struct fd_resource *rsc = fd_resource(vb->buffer);
                        enum pipe_format pfmt = elem->src_format;
                        enum a4xx_vtx_fmt fmt = fd4_pipe2vtx(pfmt);
-                       bool switchnext = (i != last);
+                       bool switchnext = (i != last) ||
+                                       (vertex_regid != regid(63, 0)) ||
+                                       (instance_regid != regid(63, 0)) ||
+                                       (vtxcnt_regid != regid(63, 0));
+                       bool isint = util_format_is_pure_integer(pfmt);
                        uint32_t fs = util_format_get_blocksize(pfmt);
                        uint32_t off = vb->buffer_offset + elem->src_offset;
                        uint32_t size = fd_bo_size(rsc->bo) - off;
@@ -309,10 +296,11 @@ fd4_emit_vertex_bufs(struct fd_ringbuffer *ring, struct fd4_emit *emit)
                        OUT_PKT0(ring, REG_A4XX_VFD_FETCH(j), 4);
                        OUT_RING(ring, A4XX_VFD_FETCH_INSTR_0_FETCHSIZE(fs - 1) |
                                        A4XX_VFD_FETCH_INSTR_0_BUFSTRIDE(vb->stride) |
+                                       COND(elem->instance_divisor, A4XX_VFD_FETCH_INSTR_0_INSTANCED) |
                                        COND(switchnext, A4XX_VFD_FETCH_INSTR_0_SWITCHNEXT));
                        OUT_RELOC(ring, rsc->bo, off, 0, 0);
                        OUT_RING(ring, A4XX_VFD_FETCH_INSTR_2_SIZE(size));
-                       OUT_RING(ring, 0x00000001);
+                       OUT_RING(ring, A4XX_VFD_FETCH_INSTR_3_STEPRATE(MAX2(1, elem->instance_divisor)));
 
                        OUT_PKT0(ring, REG_A4XX_VFD_DECODE_INSTR(j), 1);
                        OUT_RING(ring, A4XX_VFD_DECODE_INSTR_CONSTFILL |
@@ -322,6 +310,7 @@ fd4_emit_vertex_bufs(struct fd_ringbuffer *ring, struct fd4_emit *emit)
                                        A4XX_VFD_DECODE_INSTR_REGID(vp->inputs[i].regid) |
                                        A4XX_VFD_DECODE_INSTR_SHIFTCNT(fs) |
                                        A4XX_VFD_DECODE_INSTR_LASTCOMPVALID |
+                                       COND(isint, A4XX_VFD_DECODE_INSTR_INT) |
                                        COND(switchnext, A4XX_VFD_DECODE_INSTR_SWITCHNEXT));
 
                        total_in += vp->inputs[i].ncomp;
@@ -335,10 +324,10 @@ fd4_emit_vertex_bufs(struct fd_ringbuffer *ring, struct fd4_emit *emit)
                        A4XX_VFD_CONTROL_0_STRMDECINSTRCNT(j) |
                        A4XX_VFD_CONTROL_0_STRMFETCHINSTRCNT(j));
        OUT_RING(ring, A4XX_VFD_CONTROL_1_MAXSTORAGE(129) | // XXX
-                       A4XX_VFD_CONTROL_1_REGID4VTX(regid(63,0)) |
-                       A4XX_VFD_CONTROL_1_REGID4INST(regid(63,0)));
+                       A4XX_VFD_CONTROL_1_REGID4VTX(vertex_regid) |
+                       A4XX_VFD_CONTROL_1_REGID4INST(instance_regid));
        OUT_RING(ring, 0x00000000);   /* XXX VFD_CONTROL_2 */
-       OUT_RING(ring, 0x0000fc00);   /* XXX VFD_CONTROL_3 */
+       OUT_RING(ring, A4XX_VFD_CONTROL_3_REGID_VTXCNT(vtxcnt_regid));
        OUT_RING(ring, 0x00000000);   /* XXX VFD_CONTROL_4 */
 
        /* cache invalidate, otherwise vertex fetch could see
@@ -436,10 +425,15 @@ fd4_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring,
         * when it changes.
         */
        if (emit->info) {
+               const struct pipe_draw_info *info = emit->info;
                uint32_t val = fd4_rasterizer_stateobj(ctx->rasterizer)
                                ->pc_prim_vtx_cntl;
 
+               if (info->indexed && info->primitive_restart)
+                       val |= A4XX_PC_PRIM_VTX_CNTL_PRIMITIVE_RESTART;
+
                val |= COND(vp->writes_psize, A4XX_PC_PRIM_VTX_CNTL_PSIZE);
+
                if (fp->total_in > 0) {
                        uint32_t varout = align(fp->total_in, 16) / 16;
                        if (varout > 1)
@@ -481,25 +475,19 @@ fd4_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring,
        if (dirty & FD_DIRTY_PROG)
                fd4_program_emit(ring, emit);
 
-       if ((dirty & (FD_DIRTY_PROG | FD_DIRTY_CONSTBUF)) &&
-                       /* evil hack to deal sanely with clear path: */
-                       (emit->prog == &ctx->prog)) {
-               fd_wfi(ctx, ring);
-               emit_constants(ring,  SB_VERT_SHADER,
-                               &ctx->constbuf[PIPE_SHADER_VERTEX],
-                               (emit->prog->dirty & FD_SHADER_DIRTY_VP) ? vp : NULL);
-               if (!emit->key.binning_pass) {
-                       emit_constants(ring, SB_FRAG_SHADER,
-                                       &ctx->constbuf[PIPE_SHADER_FRAGMENT],
-                                       (emit->prog->dirty & FD_SHADER_DIRTY_FP) ? fp : NULL);
-               }
+       if (emit->prog == &ctx->prog) { /* evil hack to deal sanely with clear path */
+               ir3_emit_consts(vp, ring, emit->info, dirty);
+               if (!emit->key.binning_pass)
+                       ir3_emit_consts(fp, ring, emit->info, dirty);
+               /* mark clean after emitting consts: */
+               ctx->prog.dirty = 0;
        }
 
        if ((dirty & FD_DIRTY_BLEND) && ctx->blend) {
                struct fd4_blend_stateobj *blend = fd4_blend_stateobj(ctx->blend);
                uint32_t i;
 
-               for (i = 0; i < 8; i++) {
+               for (i = 0; i < A4XX_MAX_RENDER_TARGETS; i++) {
                        OUT_PKT0(ring, REG_A4XX_RB_MRT_CONTROL(i), 1);
                        OUT_RING(ring, blend->rb_mrt[i].control);
 
@@ -557,10 +545,10 @@ fd4_emit_restore(struct fd_context *ctx)
        OUT_PKT0(ring, REG_A4XX_GRAS_DEBUG_ECO_CONTROL, 1);
        OUT_RING(ring, 0x00000000);
 
-       OUT_PKT0(ring, REG_A4XX_UNKNOWN_0EC3, 1);
+       OUT_PKT0(ring, REG_A4XX_SP_MODE_CONTROL, 1);
        OUT_RING(ring, 0x00000006);
 
-       OUT_PKT0(ring, REG_A4XX_UNKNOWN_0F03, 1);
+       OUT_PKT0(ring, REG_A4XX_TPL1_TP_MODE_CONTROL, 1);
        OUT_RING(ring, 0x0000003a);
 
        OUT_PKT0(ring, REG_A4XX_UNKNOWN_0D01, 1);
@@ -579,7 +567,7 @@ fd4_emit_restore(struct fd_context *ctx)
        OUT_RING(ring, 0x00000000);
        OUT_RING(ring, 0x00000012);
 
-       OUT_PKT0(ring, REG_A4XX_UNKNOWN_0E05, 1);
+       OUT_PKT0(ring, REG_A4XX_HLSQ_MODE_CONTROL, 1);
        OUT_RING(ring, 0x00000000);
 
        OUT_PKT0(ring, REG_A4XX_UNKNOWN_0CC5, 1);
@@ -658,11 +646,14 @@ fd4_emit_restore(struct fd_context *ctx)
        OUT_PKT0(ring, REG_A4XX_TPL1_TP_TEX_OFFSET, 1);
        OUT_RING(ring, 0x00000000);
 
-       OUT_PKT0(ring, REG_A4XX_UNKNOWN_2381, 1);
-       OUT_RING(ring, 0x00000010);
+       OUT_PKT0(ring, REG_A4XX_TPL1_TP_TEX_COUNT, 1);
+       OUT_RING(ring, A4XX_TPL1_TP_TEX_COUNT_VS(16) |
+                       A4XX_TPL1_TP_TEX_COUNT_HS(0) |
+                       A4XX_TPL1_TP_TEX_COUNT_DS(0) |
+                       A4XX_TPL1_TP_TEX_COUNT_GS(0));
 
-       OUT_PKT0(ring, REG_A4XX_UNKNOWN_23A0, 1);
-       OUT_RING(ring, 0x00000010);
+       OUT_PKT0(ring, REG_A4XX_TPL1_TP_FS_TEX_COUNT, 1);
+       OUT_RING(ring, 16);
 
        /* we don't use this yet.. probably best to disable.. */
        OUT_PKT3(ring, CP_SET_DRAW_STATE, 2);
@@ -699,8 +690,8 @@ fd4_emit_restore(struct fd_context *ctx)
        OUT_PKT0(ring, REG_A4XX_RB_FS_OUTPUT, 1);
        OUT_RING(ring, A4XX_RB_FS_OUTPUT_SAMPLE_MASK(0xffff));
 
-       OUT_PKT0(ring, REG_A4XX_RB_RENDER_CONTROL3, 1);
-       OUT_RING(ring, A4XX_RB_RENDER_CONTROL3_COMPONENT_ENABLE(0xf));
+       OUT_PKT0(ring, REG_A4XX_RB_RENDER_COMPONENTS, 1);
+       OUT_RING(ring, A4XX_RB_RENDER_COMPONENTS_RT0(0xf));
 
        OUT_PKT0(ring, REG_A4XX_GRAS_CLEAR_CNTL, 1);
        OUT_RING(ring, A4XX_GRAS_CLEAR_CNTL_NOT_FASTCLEAR);
@@ -710,3 +701,11 @@ fd4_emit_restore(struct fd_context *ctx)
 
        ctx->needs_rb_fbd = true;
 }
+/* Installs the a4xx constant emitters on the context: ctx->emit_const and
+ * ctx->emit_const_bo are pointed at fd4_emit_const and fd4_emit_const_bo.
+ *
+ * pctx: pipe context, converted to the driver context via fd_context()
+ */
+void
+fd4_emit_init(struct pipe_context *pctx)
+{
+       struct fd_context *ctx = fd_context(pctx);
+       ctx->emit_const = fd4_emit_const;
+       ctx->emit_const_bo = fd4_emit_const_bo;
+}