freedreno/ir3: split out shader compiler from a3xx
[mesa.git] / src / gallium / drivers / freedreno / a3xx / fd3_emit.c
index 825656ae62b14ba498b0237780e94d694386f764..44932dc241dc9f04775acc9d88f3a5e34601ac93 100644 (file)
@@ -64,15 +64,6 @@ fd3_emit_constant(struct fd_ringbuffer *ring,
                src = SS_DIRECT;
        }
 
-       /* we have this sometimes, not others.. perhaps we could be clever
-        * and figure out actually when we need to invalidate cache:
-        */
-       OUT_PKT0(ring, REG_A3XX_UCHE_CACHE_INVALIDATE0_REG, 2);
-       OUT_RING(ring, A3XX_UCHE_CACHE_INVALIDATE0_REG_ADDR(0));
-       OUT_RING(ring, A3XX_UCHE_CACHE_INVALIDATE1_REG_ADDR(0) |
-                       A3XX_UCHE_CACHE_INVALIDATE1_REG_OPCODE(INVALIDATE) |
-                       A3XX_UCHE_CACHE_INVALIDATE1_REG_ENTIRE_CACHE);
-
        OUT_PKT3(ring, CP_LOAD_STATE, 2 + sz);
        OUT_RING(ring, CP_LOAD_STATE_0_DST_OFF(regid/2) |
                        CP_LOAD_STATE_0_STATE_SRC(src) |
@@ -96,9 +87,10 @@ static void
 emit_constants(struct fd_ringbuffer *ring,
                enum adreno_state_block sb,
                struct fd_constbuf_stateobj *constbuf,
-               struct fd3_shader_stateobj *shader)
+               struct ir3_shader_variant *shader)
 {
        uint32_t enabled_mask = constbuf->enabled_mask;
+       uint32_t first_immediate;
        uint32_t base = 0;
        unsigned i;
 
@@ -106,6 +98,13 @@ emit_constants(struct fd_ringbuffer *ring,
        // they are clobbered by a clear, gmem2mem, or mem2gmem..
        constbuf->dirty_mask = enabled_mask;
 
+       /* in particular, with a binning shader and unneeded consts no
+        * longer referenced, we could end up w/ constlen that is smaller
+        * than first_immediate.  In that case truncate the user consts
+        * early to avoid HLSQ lockup caused by writing too many consts
+        */
+       first_immediate = MIN2(shader->first_immediate, shader->constlen);
+
        /* emit user constants: */
        while (enabled_mask) {
                unsigned index = ffs(enabled_mask) - 1;
@@ -115,15 +114,17 @@ emit_constants(struct fd_ringbuffer *ring,
                // I expect that size should be a multiple of vec4's:
                assert(size == align(size, 4));
 
-               /* gallium could have const-buffer still bound, even though the
-                * shader is not using it.  Writing consts above constlen (or
-                * rather, HLSQ_{VS,FS}_CONTROL_REG.CONSTLENGTH) will cause a
-                * hang.
+               /* gallium could leave const buffers bound above what the
+                * current shader uses.. don't let that confuse us.
                 */
-               if ((base / 4) >= shader->constlen)
+               if (base >= (4 * first_immediate))
                        break;
 
                if (constbuf->dirty_mask & (1 << index)) {
+                       /* and even if the start of the const buffer is before
+                        * first_immediate, the end may not be:
+                        */
+                       size = MIN2(size, (4 * first_immediate) - base);
                        fd3_emit_constant(ring, sb, base,
                                        cb->buffer_offset, size,
                                        cb->user_buffer, cb->buffer);
@@ -137,9 +138,11 @@ emit_constants(struct fd_ringbuffer *ring,
        /* emit shader immediates: */
        if (shader) {
                for (i = 0; i < shader->immediates_count; i++) {
-                       fd3_emit_constant(ring, sb,
-                                       4 * (shader->first_immediate + i),
-                                       0, 4, shader->immediates[i].val, NULL);
+                       base = 4 * (shader->first_immediate + i);
+                       if (base >= (4 * shader->constlen))
+                               break;
+                       fd3_emit_constant(ring, sb, base,
+                               0, 4, shader->immediates[i].val, NULL);
                }
        }
 }
@@ -173,8 +176,10 @@ emit_textures(struct fd_ringbuffer *ring,
                OUT_RING(ring, CP_LOAD_STATE_1_STATE_TYPE(ST_SHADER) |
                                CP_LOAD_STATE_1_EXT_SRC_ADDR(0));
                for (i = 0; i < tex->num_samplers; i++) {
-                       struct fd3_sampler_stateobj *sampler =
-                                       fd3_sampler_stateobj(tex->samplers[i]);
+                       static const struct fd3_sampler_stateobj dummy_sampler = {};
+                       const struct fd3_sampler_stateobj *sampler = tex->samplers[i] ?
+                                       fd3_sampler_stateobj(tex->samplers[i]) :
+                                       &dummy_sampler;
                        OUT_RING(ring, sampler->texsamp0);
                        OUT_RING(ring, sampler->texsamp1);
                }
@@ -190,8 +195,10 @@ emit_textures(struct fd_ringbuffer *ring,
                OUT_RING(ring, CP_LOAD_STATE_1_STATE_TYPE(ST_CONSTANTS) |
                                CP_LOAD_STATE_1_EXT_SRC_ADDR(0));
                for (i = 0; i < tex->num_textures; i++) {
-                       struct fd3_pipe_sampler_view *view =
-                                       fd3_pipe_sampler_view(tex->textures[i]);
+                       static const struct fd3_pipe_sampler_view dummy_view = {};
+                       const struct fd3_pipe_sampler_view *view = tex->textures[i] ?
+                                       fd3_pipe_sampler_view(tex->textures[i]) :
+                                       &dummy_view;
                        OUT_RING(ring, view->texconst0);
                        OUT_RING(ring, view->texconst1);
                        OUT_RING(ring, view->texconst2 |
@@ -208,8 +215,10 @@ emit_textures(struct fd_ringbuffer *ring,
                OUT_RING(ring, CP_LOAD_STATE_1_STATE_TYPE(ST_CONSTANTS) |
                                CP_LOAD_STATE_1_EXT_SRC_ADDR(0));
                for (i = 0; i < tex->num_textures; i++) {
-                       struct fd3_pipe_sampler_view *view =
-                                       fd3_pipe_sampler_view(tex->textures[i]);
+                       static const struct fd3_pipe_sampler_view dummy_view = {};
+                       const struct fd3_pipe_sampler_view *view = tex->textures[i] ?
+                                       fd3_pipe_sampler_view(tex->textures[i]) :
+                                       &dummy_view;
                        struct fd_resource *rsc = view->tex_resource;
 
                        for (j = 0; j < view->mipaddrs; j++) {
@@ -225,28 +234,6 @@ emit_textures(struct fd_ringbuffer *ring,
        }
 }
 
-static void
-emit_cache_flush(struct fd_ringbuffer *ring)
-{
-       OUT_PKT3(ring, CP_EVENT_WRITE, 1);
-       OUT_RING(ring, CACHE_FLUSH);
-
-       /* probably only really needed on a320: */
-       OUT_PKT3(ring, CP_DRAW_INDX, 3);
-       OUT_RING(ring, 0x00000000);
-       OUT_RING(ring, DRAW(1, DI_SRC_SEL_AUTO_INDEX,
-                       INDEX_SIZE_IGN, IGNORE_VISIBILITY));
-       OUT_RING(ring, 0);                                      /* NumIndices */
-
-       OUT_PKT3(ring, CP_NOP, 4);
-       OUT_RING(ring, 0x00000000);
-       OUT_RING(ring, 0x00000000);
-       OUT_RING(ring, 0x00000000);
-       OUT_RING(ring, 0x00000000);
-
-       OUT_WFI (ring);
-}
-
 /* emit texture state for mem->gmem restore operation.. eventually it would
  * be good to get rid of this and use normal CSO/etc state for more of these
  * special cases, but for now the compiler is not sufficient..
@@ -304,44 +291,72 @@ fd3_emit_gmem_restore_tex(struct fd_ringbuffer *ring, struct pipe_surface *psurf
 
 void
 fd3_emit_vertex_bufs(struct fd_ringbuffer *ring,
-               struct fd_program_stateobj *prog,
+               struct ir3_shader_variant *vp,
                struct fd3_vertex_buf *vbufs, uint32_t n)
 {
-       struct fd3_shader_stateobj *vp = prog->vp;
-       uint32_t i;
+       uint32_t i, j, last = 0;
+       uint32_t total_in = 0;
 
        n = MIN2(n, vp->inputs_count);
 
-       for (i = 0; i < n; i++) {
-               struct pipe_resource *prsc = vbufs[i].prsc;
-               struct fd_resource *rsc = fd_resource(prsc);
-               enum a3xx_vtx_fmt fmt = fd3_pipe2vtx(vbufs[i].format);
-               bool switchnext = (i != (n - 1));
-               uint32_t fs = util_format_get_blocksize(vbufs[i].format);
-
-               OUT_PKT0(ring, REG_A3XX_VFD_FETCH(i), 2);
-               OUT_RING(ring, A3XX_VFD_FETCH_INSTR_0_FETCHSIZE(fs - 1) |
-                               A3XX_VFD_FETCH_INSTR_0_BUFSTRIDE(vbufs[i].stride) |
-                               COND(switchnext, A3XX_VFD_FETCH_INSTR_0_SWITCHNEXT) |
-                               A3XX_VFD_FETCH_INSTR_0_INDEXCODE(i) |
-                               A3XX_VFD_FETCH_INSTR_0_STEPRATE(1));
-               OUT_RELOC(ring, rsc->bo, vbufs[i].offset, 0, 0);
-
-               OUT_PKT0(ring, REG_A3XX_VFD_DECODE_INSTR(i), 1);
-               OUT_RING(ring, A3XX_VFD_DECODE_INSTR_CONSTFILL |
-                               A3XX_VFD_DECODE_INSTR_WRITEMASK(vp->inputs[i].compmask) |
-                               A3XX_VFD_DECODE_INSTR_FORMAT(fmt) |
-                               A3XX_VFD_DECODE_INSTR_REGID(vp->inputs[i].regid) |
-                               A3XX_VFD_DECODE_INSTR_SHIFTCNT(fs) |
-                               A3XX_VFD_DECODE_INSTR_LASTCOMPVALID |
-                               COND(switchnext, A3XX_VFD_DECODE_INSTR_SWITCHNEXT));
+       for (i = 0; i < n; i++)
+               if (vp->inputs[i].compmask)
+                       last = i;
+
+       for (i = 0, j = 0; i <= last; i++) {
+               if (vp->inputs[i].compmask) {
+                       struct pipe_resource *prsc = vbufs[i].prsc;
+                       struct fd_resource *rsc = fd_resource(prsc);
+                       enum pipe_format pfmt = vbufs[i].format;
+                       enum a3xx_vtx_fmt fmt = fd3_pipe2vtx(pfmt);
+                       bool switchnext = (i != last);
+                       uint32_t fs = util_format_get_blocksize(pfmt);
+
+                       debug_assert(fmt != ~0);
+
+                       OUT_PKT0(ring, REG_A3XX_VFD_FETCH(j), 2);
+                       OUT_RING(ring, A3XX_VFD_FETCH_INSTR_0_FETCHSIZE(fs - 1) |
+                                       A3XX_VFD_FETCH_INSTR_0_BUFSTRIDE(vbufs[i].stride) |
+                                       COND(switchnext, A3XX_VFD_FETCH_INSTR_0_SWITCHNEXT) |
+                                       A3XX_VFD_FETCH_INSTR_0_INDEXCODE(j) |
+                                       A3XX_VFD_FETCH_INSTR_0_STEPRATE(1));
+                       OUT_RELOC(ring, rsc->bo, vbufs[i].offset, 0, 0);
+
+                       OUT_PKT0(ring, REG_A3XX_VFD_DECODE_INSTR(j), 1);
+                       OUT_RING(ring, A3XX_VFD_DECODE_INSTR_CONSTFILL |
+                                       A3XX_VFD_DECODE_INSTR_WRITEMASK(vp->inputs[i].compmask) |
+                                       A3XX_VFD_DECODE_INSTR_FORMAT(fmt) |
+                                       A3XX_VFD_DECODE_INSTR_SWAP(fd3_pipe2swap(pfmt)) |
+                                       A3XX_VFD_DECODE_INSTR_REGID(vp->inputs[i].regid) |
+                                       A3XX_VFD_DECODE_INSTR_SHIFTCNT(fs) |
+                                       A3XX_VFD_DECODE_INSTR_LASTCOMPVALID |
+                                       COND(switchnext, A3XX_VFD_DECODE_INSTR_SWITCHNEXT));
+
+                       total_in += vp->inputs[i].ncomp;
+                       j++;
+               }
        }
+
+       OUT_PKT0(ring, REG_A3XX_VFD_CONTROL_0, 2);
+       OUT_RING(ring, A3XX_VFD_CONTROL_0_TOTALATTRTOVS(total_in) |
+                       A3XX_VFD_CONTROL_0_PACKETSIZE(2) |
+                       A3XX_VFD_CONTROL_0_STRMDECINSTRCNT(j) |
+                       A3XX_VFD_CONTROL_0_STRMFETCHINSTRCNT(j));
+       OUT_RING(ring, A3XX_VFD_CONTROL_1_MAXSTORAGE(1) | // XXX
+                       A3XX_VFD_CONTROL_1_REGID4VTX(regid(63,0)) |
+                       A3XX_VFD_CONTROL_1_REGID4INST(regid(63,0)));
 }
 
 void
-fd3_emit_state(struct fd_context *ctx, uint32_t dirty)
+fd3_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring,
+               struct fd_program_stateobj *prog, uint32_t dirty,
+               struct ir3_shader_key key)
 {
-       struct fd_ringbuffer *ring = ctx->ring;
+       struct ir3_shader_variant *vp;
+       struct ir3_shader_variant *fp;
+
+       fp = fd3_shader_variant(prog->fp, key);
+       vp = fd3_shader_variant(prog->vp, key);
 
        emit_marker(ring, 5);
 
@@ -352,18 +367,32 @@ fd3_emit_state(struct fd_context *ctx, uint32_t dirty)
                                A3XX_RB_MSAA_CONTROL_SAMPLE_MASK(ctx->sample_mask));
        }
 
+       if ((dirty & (FD_DIRTY_ZSA | FD_DIRTY_PROG)) && !key.binning_pass) {
+               uint32_t val = fd3_zsa_stateobj(ctx->zsa)->rb_render_control;
+
+               val |= COND(fp->frag_face, A3XX_RB_RENDER_CONTROL_FACENESS);
+               val |= COND(fp->frag_coord, A3XX_RB_RENDER_CONTROL_XCOORD |
+                               A3XX_RB_RENDER_CONTROL_YCOORD |
+                               A3XX_RB_RENDER_CONTROL_ZCOORD |
+                               A3XX_RB_RENDER_CONTROL_WCOORD);
+
+               /* I suppose if we needed to (which I don't *think* we need
+                * to), we could emit this for binning pass too.  But we
+                * would need to keep a different patch-list for binning
+                * vs render pass.
+                */
+
+               OUT_PKT0(ring, REG_A3XX_RB_RENDER_CONTROL, 1);
+               OUT_RINGP(ring, val, &fd3_context(ctx)->rbrc_patches);
+       }
+
        if (dirty & (FD_DIRTY_ZSA | FD_DIRTY_STENCIL_REF)) {
                struct fd3_zsa_stateobj *zsa = fd3_zsa_stateobj(ctx->zsa);
                struct pipe_stencil_ref *sr = &ctx->stencil_ref;
 
-               fd3_emit_rbrc_draw_state(ring, zsa->rb_render_control);
-
                OUT_PKT0(ring, REG_A3XX_RB_ALPHA_REF, 1);
                OUT_RING(ring, zsa->rb_alpha_ref);
 
-               OUT_PKT0(ring, REG_A3XX_RB_DEPTH_CONTROL, 1);
-               OUT_RING(ring, zsa->rb_depth_control);
-
                OUT_PKT0(ring, REG_A3XX_RB_STENCIL_CONTROL, 1);
                OUT_RING(ring, zsa->rb_stencil_control);
 
@@ -374,6 +403,16 @@ fd3_emit_state(struct fd_context *ctx, uint32_t dirty)
                                A3XX_RB_STENCILREFMASK_BF_STENCILREF(sr->ref_value[1]));
        }
 
+       if (dirty & (FD_DIRTY_ZSA | FD_DIRTY_PROG)) {
+               uint32_t val = fd3_zsa_stateobj(ctx->zsa)->rb_depth_control;
+               if (fp->writes_pos) {
+                       val |= A3XX_RB_DEPTH_CONTROL_FRAG_WRITES_Z;
+                       val |= A3XX_RB_DEPTH_CONTROL_EARLY_Z_DISABLE;
+               }
+               OUT_PKT0(ring, REG_A3XX_RB_DEPTH_CONTROL, 1);
+               OUT_RING(ring, val);
+       }
+
        if (dirty & FD_DIRTY_RASTERIZER) {
                struct fd3_rasterizer_stateobj *rasterizer =
                                fd3_rasterizer_stateobj(ctx->rasterizer);
@@ -388,24 +427,33 @@ fd3_emit_state(struct fd_context *ctx, uint32_t dirty)
                OUT_PKT0(ring, REG_A3XX_GRAS_SU_POLY_OFFSET_SCALE, 2);
                OUT_RING(ring, rasterizer->gras_su_poly_offset_scale);
                OUT_RING(ring, rasterizer->gras_su_poly_offset_offset);
+       }
 
+       if (dirty & (FD_DIRTY_RASTERIZER | FD_DIRTY_PROG)) {
+               uint32_t val = fd3_rasterizer_stateobj(ctx->rasterizer)
+                               ->gras_cl_clip_cntl;
+               val |= COND(fp->writes_pos, A3XX_GRAS_CL_CLIP_CNTL_ZCLIP_DISABLE);
+               val |= COND(fp->frag_coord, A3XX_GRAS_CL_CLIP_CNTL_ZCOORD |
+                               A3XX_GRAS_CL_CLIP_CNTL_WCOORD);
                OUT_PKT0(ring, REG_A3XX_GRAS_CL_CLIP_CNTL, 1);
-               OUT_RING(ring, rasterizer->gras_cl_clip_cntl);
+               OUT_RING(ring, val);
        }
 
        if (dirty & (FD_DIRTY_RASTERIZER | FD_DIRTY_PROG)) {
-               struct fd3_rasterizer_stateobj *rasterizer =
-                               fd3_rasterizer_stateobj(ctx->rasterizer);
-               struct fd3_shader_stateobj *fp = ctx->prog.fp;
-               uint32_t stride_in_vpc;
+               uint32_t val = fd3_rasterizer_stateobj(ctx->rasterizer)
+                               ->pc_prim_vtx_cntl;
+
+               if (!key.binning_pass) {
+                       uint32_t stride_in_vpc = align(fp->total_in, 4) / 4;
+                       if (stride_in_vpc > 0)
+                               stride_in_vpc = MAX2(stride_in_vpc, 2);
+                       val |= A3XX_PC_PRIM_VTX_CNTL_STRIDE_IN_VPC(stride_in_vpc);
+               }
 
-               stride_in_vpc = align(fp->total_in, 4) / 4;
-               if (stride_in_vpc > 0)
-                       stride_in_vpc = MAX2(stride_in_vpc, 2);
+               val |= COND(vp->writes_psize, A3XX_PC_PRIM_VTX_CNTL_PSIZE);
 
                OUT_PKT0(ring, REG_A3XX_PC_PRIM_VTX_CNTL, 1);
-               OUT_RING(ring, rasterizer->pc_prim_vtx_cntl |
-                               A3XX_PC_PRIM_VTX_CNTL_STRIDE_IN_VPC(stride_in_vpc));
+               OUT_RING(ring, val);
        }
 
        if (dirty & FD_DIRTY_SCISSOR) {
@@ -424,6 +472,7 @@ fd3_emit_state(struct fd_context *ctx, uint32_t dirty)
        }
 
        if (dirty & FD_DIRTY_VIEWPORT) {
+               fd_wfi(ctx, ring);
                OUT_PKT0(ring, REG_A3XX_GRAS_CL_VPORT_XOFFSET, 6);
                OUT_RING(ring, A3XX_GRAS_CL_VPORT_XOFFSET(ctx->viewport.translate[0] - 0.5));
                OUT_RING(ring, A3XX_GRAS_CL_VPORT_XSCALE(ctx->viewport.scale[0]));
@@ -433,21 +482,30 @@ fd3_emit_state(struct fd_context *ctx, uint32_t dirty)
                OUT_RING(ring, A3XX_GRAS_CL_VPORT_ZSCALE(ctx->viewport.scale[2]));
        }
 
-       if (dirty & FD_DIRTY_PROG)
-               fd3_program_emit(ring, &ctx->prog);
+       if (dirty & FD_DIRTY_PROG) {
+               fd3_program_emit(ring, prog, key);
+       }
 
-       if (dirty & (FD_DIRTY_PROG | FD_DIRTY_CONSTBUF)) {
-               struct fd_program_stateobj *prog = &ctx->prog;
+       /* TODO we should not need this or fd_wfi() before emit_constants():
+        */
+       OUT_PKT3(ring, CP_EVENT_WRITE, 1);
+       OUT_RING(ring, HLSQ_FLUSH);
 
+       if ((dirty & (FD_DIRTY_PROG | FD_DIRTY_CONSTBUF)) &&
+                       /* evil hack to deal sanely with clear path: */
+                       (prog == &ctx->prog)) {
+               fd_wfi(ctx, ring);
                emit_constants(ring,  SB_VERT_SHADER,
                                &ctx->constbuf[PIPE_SHADER_VERTEX],
-                               (prog->dirty & FD_SHADER_DIRTY_VP) ? prog->vp : NULL);
-               emit_constants(ring, SB_FRAG_SHADER,
-                               &ctx->constbuf[PIPE_SHADER_FRAGMENT],
-                               (prog->dirty & FD_SHADER_DIRTY_FP) ? prog->fp : NULL);
+                               (prog->dirty & FD_SHADER_DIRTY_VP) ? vp : NULL);
+               if (!key.binning_pass) {
+                       emit_constants(ring, SB_FRAG_SHADER,
+                                       &ctx->constbuf[PIPE_SHADER_FRAGMENT],
+                                       (prog->dirty & FD_SHADER_DIRTY_FP) ? fp : NULL);
+               }
        }
 
-       if (dirty & FD_DIRTY_BLEND) {
+       if ((dirty & FD_DIRTY_BLEND) && ctx->blend) {
                struct fd3_blend_stateobj *blend = fd3_blend_stateobj(ctx->blend);
                uint32_t i;
 
@@ -473,11 +531,22 @@ fd3_emit_state(struct fd_context *ctx, uint32_t dirty)
                                A3XX_RB_BLEND_ALPHA_FLOAT(bcolor->color[3]));
        }
 
-       if (dirty & FD_DIRTY_VERTTEX)
-               emit_textures(ring, SB_VERT_TEX, &ctx->verttex);
+       if (dirty & (FD_DIRTY_VERTTEX | FD_DIRTY_FRAGTEX))
+               fd_wfi(ctx, ring);
 
-       if (dirty & FD_DIRTY_FRAGTEX)
-               emit_textures(ring, SB_FRAG_TEX, &ctx->fragtex);
+       if (dirty & FD_DIRTY_VERTTEX) {
+               if (vp->has_samp)
+                       emit_textures(ring, SB_VERT_TEX, &ctx->verttex);
+               else
+                       dirty &= ~FD_DIRTY_VERTTEX;
+       }
+
+       if (dirty & FD_DIRTY_FRAGTEX) {
+               if (fp->has_samp)
+                       emit_textures(ring, SB_FRAG_TEX, &ctx->fragtex);
+               else
+                       dirty &= ~FD_DIRTY_FRAGTEX;
+       }
 
        ctx->dirty &= ~dirty;
 }
@@ -499,6 +568,7 @@ fd3_emit_restore(struct fd_context *ctx)
                OUT_RING(ring, 0x00000000);
        }
 
+       fd_wfi(ctx, ring);
        OUT_PKT3(ring, CP_INVALIDATE_STATE, 1);
        OUT_RING(ring, 0x00007fff);
 
@@ -568,11 +638,11 @@ fd3_emit_restore(struct fd_context *ctx)
        OUT_RING(ring, A3XX_HLSQ_CONST_FSPRESV_RANGE_REG_STARTENTRY(0) |
                        A3XX_HLSQ_CONST_FSPRESV_RANGE_REG_ENDENTRY(0));
 
-       OUT_PKT0(ring, REG_A3XX_UCHE_CACHE_MODE_CONTROL_REG, 1);
-       OUT_RING(ring, 0x00000001);        /* UCHE_CACHE_MODE_CONTROL_REG */
-
-       OUT_PKT0(ring, REG_A3XX_VSC_SIZE_ADDRESS, 1);
-       OUT_RELOC(ring, fd3_ctx->vsc_size_mem, 0, 0, 0); /* VSC_SIZE_ADDRESS */
+       OUT_PKT0(ring, REG_A3XX_UCHE_CACHE_INVALIDATE0_REG, 2);
+       OUT_RING(ring, A3XX_UCHE_CACHE_INVALIDATE0_REG_ADDR(0));
+       OUT_RING(ring, A3XX_UCHE_CACHE_INVALIDATE1_REG_ADDR(0) |
+                       A3XX_UCHE_CACHE_INVALIDATE1_REG_OPCODE(INVALIDATE) |
+                       A3XX_UCHE_CACHE_INVALIDATE1_REG_ENTIRE_CACHE);
 
        OUT_PKT0(ring, REG_A3XX_GRAS_CL_CLIP_CNTL, 1);
        OUT_RING(ring, 0x00000000);                  /* GRAS_CL_CLIP_CNTL */
@@ -606,5 +676,26 @@ fd3_emit_restore(struct fd_context *ctx)
                OUT_RING(ring, 0x00000000);    /* GRAS_CL_USER_PLANE[i].W */
        }
 
-       emit_cache_flush(ring);
+       OUT_PKT0(ring, REG_A3XX_PC_VSTREAM_CONTROL, 1);
+       OUT_RING(ring, 0x00000000);
+
+       fd_event_write(ctx, ring, CACHE_FLUSH);
+
+       if (is_a3xx_p0(ctx->screen)) {
+               OUT_PKT3(ring, CP_DRAW_INDX, 3);
+               OUT_RING(ring, 0x00000000);
+               OUT_RING(ring, DRAW(1, DI_SRC_SEL_AUTO_INDEX,
+                               INDEX_SIZE_IGN, IGNORE_VISIBILITY));
+               OUT_RING(ring, 0);                                      /* NumIndices */
+       }
+
+       OUT_PKT3(ring, CP_NOP, 4);
+       OUT_RING(ring, 0x00000000);
+       OUT_RING(ring, 0x00000000);
+       OUT_RING(ring, 0x00000000);
+       OUT_RING(ring, 0x00000000);
+
+       fd_wfi(ctx, ring);
+
+       ctx->needs_rb_fbd = true;
 }