freedreno: Rename vp and fp to vs and fs in fd_program_stateobj
[mesa.git] / src / gallium / drivers / freedreno / a2xx / fd2_draw.c
index f7915d66c1aa14f7321d8b5f91a43daa067e151e..938c0ba9d34c500586358e192be64ed76bf59532 100644 (file)
@@ -1,5 +1,3 @@
-/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
-
 /*
  * Copyright (C) 2012-2013 Rob Clark <robclark@freedesktop.org>
  *
@@ -77,156 +75,177 @@ emit_vertexbufs(struct fd_context *ctx)
        // CONST(20,0) (or CONST(26,0) in soliv_vp)
 
        fd2_emit_vertex_bufs(ctx->batch->draw, 0x78, bufs, vtx->num_elements);
+       fd2_emit_vertex_bufs(ctx->batch->binning, 0x78, bufs, vtx->num_elements);
 }
 
-static bool
-fd2_draw_vbo(struct fd_context *ctx, const struct pipe_draw_info *info,
-             unsigned index_offset)
+static void
+draw_impl(struct fd_context *ctx, const struct pipe_draw_info *info,
+                  struct fd_ringbuffer *ring, unsigned index_offset, bool binning)
 {
-       struct fd_ringbuffer *ring = ctx->batch->draw;
-
-       if (ctx->dirty & FD_DIRTY_VTXBUF)
-               emit_vertexbufs(ctx);
-
-       fd2_emit_state(ctx, ctx->dirty);
-
        OUT_PKT3(ring, CP_SET_CONSTANT, 2);
        OUT_RING(ring, CP_REG(REG_A2XX_VGT_INDX_OFFSET));
-       OUT_RING(ring, info->start);
-
-       OUT_PKT3(ring, CP_SET_CONSTANT, 2);
-       OUT_RING(ring, CP_REG(REG_A2XX_VGT_VERTEX_REUSE_BLOCK_CNTL));
-       OUT_RING(ring, 0x0000003b);
+       OUT_RING(ring, info->index_size ? 0 : info->start);
 
        OUT_PKT0(ring, REG_A2XX_TC_CNTL_STATUS, 1);
        OUT_RING(ring, A2XX_TC_CNTL_STATUS_L2_INVALIDATE);
 
-       OUT_WFI (ring);
+       if (is_a20x(ctx->screen)) {
+               /* wait for DMA to finish and
+                * dummy draw one triangle with indexes 0,0,0.
+                * with PRE_FETCH_CULL_ENABLE | GRP_CULL_ENABLE.
+                *
+                * this workaround is for a HW bug related to DMA alignment:
+                * it is necessary for indexed draws and possibly also
+                * draws that read binning data
+                */
+               OUT_PKT3(ring, CP_WAIT_REG_EQ, 4);
+               OUT_RING(ring, 0x000005d0); /* RBBM_STATUS */
+               OUT_RING(ring, 0x00000000);
+               OUT_RING(ring, 0x00001000); /* bit: 12: VGT_BUSY_NO_DMA */
+               OUT_RING(ring, 0x00000001);
+
+               OUT_PKT3(ring, CP_DRAW_INDX_BIN, 6);
+               OUT_RING(ring, 0x00000000);
+               OUT_RING(ring, 0x0003c004);
+               OUT_RING(ring, 0x00000000);
+               OUT_RING(ring, 0x00000003);
+               OUT_RELOC(ring, fd_resource(fd2_context(ctx)->solid_vertexbuf)->bo, 64, 0, 0);
+               OUT_RING(ring, 0x00000006);
+       } else {
+               OUT_WFI (ring);
 
-       OUT_PKT3(ring, CP_SET_CONSTANT, 3);
-       OUT_RING(ring, CP_REG(REG_A2XX_VGT_MAX_VTX_INDX));
-       OUT_RING(ring, info->max_index);        /* VGT_MAX_VTX_INDX */
-       OUT_RING(ring, info->min_index);        /* VGT_MIN_VTX_INDX */
+               OUT_PKT3(ring, CP_SET_CONSTANT, 3);
+               OUT_RING(ring, CP_REG(REG_A2XX_VGT_MAX_VTX_INDX));
+               OUT_RING(ring, info->max_index);        /* VGT_MAX_VTX_INDX */
+               OUT_RING(ring, info->min_index);        /* VGT_MIN_VTX_INDX */
+       }
+
+       /* binning shader will take offset from C64 */
+       if (binning && is_a20x(ctx->screen)) {
+               OUT_PKT3(ring, CP_SET_CONSTANT, 5);
+               OUT_RING(ring, 0x00000180);
+               OUT_RING(ring, fui(ctx->batch->num_vertices));
+               OUT_RING(ring, fui(0.0f));
+               OUT_RING(ring, fui(0.0f));
+               OUT_RING(ring, fui(0.0f));
+       }
+
+       enum pc_di_vis_cull_mode vismode = USE_VISIBILITY;
+       if (binning || info->mode == PIPE_PRIM_POINTS)
+               vismode = IGNORE_VISIBILITY;
 
        fd_draw_emit(ctx->batch, ring, ctx->primtypes[info->mode],
-                                IGNORE_VISIBILITY, info, index_offset);
+                                vismode, info, index_offset);
 
-       OUT_PKT3(ring, CP_SET_CONSTANT, 2);
-       OUT_RING(ring, CP_REG(REG_A2XX_UNKNOWN_2010));
-       OUT_RING(ring, 0x00000000);
+       if (is_a20x(ctx->screen)) {
+               /* not sure why this is required, but it fixes some hangs */
+               OUT_WFI(ring);
+       } else {
+               OUT_PKT3(ring, CP_SET_CONSTANT, 2);
+               OUT_RING(ring, CP_REG(REG_A2XX_UNKNOWN_2010));
+               OUT_RING(ring, 0x00000000);
+       }
 
        emit_cacheflush(ring);
+}
+
+
+static bool
+fd2_draw_vbo(struct fd_context *ctx, const struct pipe_draw_info *pinfo,
+                        unsigned index_offset)
+{
+       if (!ctx->prog.fs || !ctx->prog.vs)
+               return false;
+
+       if (ctx->dirty & FD_DIRTY_VTXBUF)
+               emit_vertexbufs(ctx);
+
+       if (fd_binning_enabled)
+               fd2_emit_state_binning(ctx, ctx->dirty);
+
+       fd2_emit_state(ctx, ctx->dirty);
+
+       /* a2xx can draw only 65535 vertices at once
+        * on a22x the field in the draw command is 32bits but seems limited too
+        * using a limit of 32k because it fixes an unexplained hang
+        * 32766 works for all primitives (multiple of 2 and 3)
+        */
+       if (pinfo->count > 32766) {
+               static const uint16_t step_tbl[PIPE_PRIM_MAX] = {
+                       [0 ... PIPE_PRIM_MAX - 1]  = 32766,
+                       [PIPE_PRIM_LINE_STRIP]     = 32765,
+                       [PIPE_PRIM_TRIANGLE_STRIP] = 32764,
+
+                       /* needs more work */
+                       [PIPE_PRIM_TRIANGLE_FAN]   = 0,
+                       [PIPE_PRIM_LINE_LOOP]      = 0,
+               };
+
+               struct pipe_draw_info info = *pinfo;
+               unsigned count = info.count;
+               unsigned step = step_tbl[info.mode];
+               unsigned num_vertices = ctx->batch->num_vertices;
+
+               if (!step)
+                       return false;
+
+               for (; count + step > 32766; count -= step) {
+                       info.count = MIN2(count, 32766);
+                       draw_impl(ctx, &info, ctx->batch->draw, index_offset, false);
+                       draw_impl(ctx, &info, ctx->batch->binning, index_offset, true);
+                       info.start += step;
+                       ctx->batch->num_vertices += step;
+               }
+               /* changing this value is a hack, restore it */
+               ctx->batch->num_vertices = num_vertices;
+       } else {
+               draw_impl(ctx, pinfo, ctx->batch->draw, index_offset, false);
+               draw_impl(ctx, pinfo, ctx->batch->binning, index_offset, true);
+       }
 
        fd_context_all_clean(ctx);
 
        return true;
 }
 
-
 static void
-fd2_clear(struct fd_context *ctx, unsigned buffers,
-               const union pipe_color_union *color, double depth, unsigned stencil)
+clear_state(struct fd_batch *batch, struct fd_ringbuffer *ring,
+       unsigned buffers, bool fast_clear)
 {
+       struct fd_context *ctx = batch->ctx;
        struct fd2_context *fd2_ctx = fd2_context(ctx);
-       struct fd_ringbuffer *ring = ctx->batch->draw;
-       struct pipe_framebuffer_state *fb = &ctx->batch->framebuffer;
-       uint32_t reg, colr = 0;
-
-       if ((buffers & PIPE_CLEAR_COLOR) && fb->nr_cbufs)
-               colr  = pack_rgba(fb->cbufs[0]->format, color->f);
-
-       /* emit generic state now: */
-       fd2_emit_state(ctx, ctx->dirty &
-                       (FD_DIRTY_BLEND | FD_DIRTY_VIEWPORT |
-                                       FD_DIRTY_FRAMEBUFFER | FD_DIRTY_SCISSOR));
+       uint32_t reg;
 
        fd2_emit_vertex_bufs(ring, 0x9c, (struct fd2_vertex_buf[]) {
-                       { .prsc = fd2_ctx->solid_vertexbuf, .size = 48 },
+                       { .prsc = fd2_ctx->solid_vertexbuf, .size = 36 },
                }, 1);
 
        OUT_PKT3(ring, CP_SET_CONSTANT, 2);
        OUT_RING(ring, CP_REG(REG_A2XX_VGT_INDX_OFFSET));
        OUT_RING(ring, 0);
 
-       OUT_PKT3(ring, CP_SET_CONSTANT, 2);
-       OUT_RING(ring, CP_REG(REG_A2XX_VGT_VERTEX_REUSE_BLOCK_CNTL));
-       OUT_RING(ring, 0x0000028f);
-
-       fd2_program_emit(ring, &ctx->solid_prog);
+       fd2_program_emit(ctx, ring, &ctx->solid_prog);
 
        OUT_PKT0(ring, REG_A2XX_TC_CNTL_STATUS, 1);
        OUT_RING(ring, A2XX_TC_CNTL_STATUS_L2_INVALIDATE);
 
-       OUT_PKT3(ring, CP_SET_CONSTANT, 2);
-       OUT_RING(ring, CP_REG(REG_A2XX_CLEAR_COLOR));
-       OUT_RING(ring, colr);
-
-       OUT_PKT3(ring, CP_SET_CONSTANT, 2);
-       OUT_RING(ring, CP_REG(REG_A2XX_A220_RB_LRZ_VSC_CONTROL));
-       OUT_RING(ring, 0x00000084);
-
-       OUT_PKT3(ring, CP_SET_CONSTANT, 2);
-       OUT_RING(ring, CP_REG(REG_A2XX_RB_COPY_CONTROL));
-       reg = 0;
-       if (buffers & (PIPE_CLEAR_DEPTH | PIPE_CLEAR_STENCIL)) {
-               reg |= A2XX_RB_COPY_CONTROL_DEPTH_CLEAR_ENABLE;
-               switch (fd_pipe2depth(fb->zsbuf->format)) {
-               case DEPTHX_24_8:
-                       if (buffers & PIPE_CLEAR_DEPTH)
-                               reg |= A2XX_RB_COPY_CONTROL_CLEAR_MASK(0xe);
-                       if (buffers & PIPE_CLEAR_STENCIL)
-                               reg |= A2XX_RB_COPY_CONTROL_CLEAR_MASK(0x1);
-                       break;
-               case DEPTHX_16:
-                       if (buffers & PIPE_CLEAR_DEPTH)
-                               reg |= A2XX_RB_COPY_CONTROL_CLEAR_MASK(0xf);
-                       break;
-               default:
-                       debug_assert(0);
-                       break;
-               }
-       }
-       OUT_RING(ring, reg);
-
-       OUT_PKT3(ring, CP_SET_CONSTANT, 2);
-       OUT_RING(ring, CP_REG(REG_A2XX_RB_DEPTH_CLEAR));
-       reg = 0;
        if (buffers & (PIPE_CLEAR_DEPTH | PIPE_CLEAR_STENCIL)) {
-               switch (fd_pipe2depth(fb->zsbuf->format)) {
-               case DEPTHX_24_8:
-                       reg = (((uint32_t)(0xffffff * depth)) << 8) |
-                               (stencil & 0xff);
-                       break;
-               case DEPTHX_16:
-                       reg = (uint32_t)(0xffffffff * depth);
-                       break;
-               default:
-                       debug_assert(0);
-                       break;
-               }
-       }
-       OUT_RING(ring, reg);
-
-       OUT_PKT3(ring, CP_SET_CONSTANT, 2);
-       OUT_RING(ring, CP_REG(REG_A2XX_RB_DEPTHCONTROL));
-       reg = 0;
-       if (buffers & PIPE_CLEAR_DEPTH) {
-               reg |= A2XX_RB_DEPTHCONTROL_ZFUNC(FUNC_ALWAYS) |
+               OUT_PKT3(ring, CP_SET_CONSTANT, 2);
+               OUT_RING(ring, CP_REG(REG_A2XX_RB_DEPTHCONTROL));
+               reg = 0;
+               if (buffers & PIPE_CLEAR_DEPTH) {
+                       reg |= A2XX_RB_DEPTHCONTROL_ZFUNC(FUNC_ALWAYS) |
                                A2XX_RB_DEPTHCONTROL_Z_ENABLE |
                                A2XX_RB_DEPTHCONTROL_Z_WRITE_ENABLE |
                                A2XX_RB_DEPTHCONTROL_EARLY_Z_ENABLE;
+               }
+               if (buffers & PIPE_CLEAR_STENCIL) {
+                       reg |= A2XX_RB_DEPTHCONTROL_STENCILFUNC(FUNC_ALWAYS) |
+                                       A2XX_RB_DEPTHCONTROL_STENCIL_ENABLE |
+                                       A2XX_RB_DEPTHCONTROL_STENCILZPASS(STENCIL_REPLACE);
+               }
+               OUT_RING(ring, reg);
        }
-       if (buffers & PIPE_CLEAR_STENCIL) {
-               reg |= A2XX_RB_DEPTHCONTROL_STENCILFUNC(FUNC_ALWAYS) |
-                               A2XX_RB_DEPTHCONTROL_STENCIL_ENABLE |
-                               A2XX_RB_DEPTHCONTROL_STENCILZPASS(STENCIL_REPLACE);
-       }
-       OUT_RING(ring, reg);
-
-       OUT_PKT3(ring, CP_SET_CONSTANT, 3);
-       OUT_RING(ring, CP_REG(REG_A2XX_RB_STENCILREFMASK_BF));
-       OUT_RING(ring, 0xff000000 | A2XX_RB_STENCILREFMASK_BF_STENCILWRITEMASK(0xff));
-       OUT_RING(ring, 0xff000000 | A2XX_RB_STENCILREFMASK_STENCILWRITEMASK(0xff));
 
        OUT_PKT3(ring, CP_SET_CONSTANT, 2);
        OUT_RING(ring, CP_REG(REG_A2XX_RB_COLORCONTROL));
@@ -241,18 +260,19 @@ fd2_clear(struct fd_context *ctx, unsigned buffers,
        OUT_RING(ring, 0x00000000);        /* PA_CL_CLIP_CNTL */
        OUT_RING(ring, A2XX_PA_SU_SC_MODE_CNTL_PROVOKING_VTX_LAST |  /* PA_SU_SC_MODE_CNTL */
                        A2XX_PA_SU_SC_MODE_CNTL_FRONT_PTYPE(PC_DRAW_TRIANGLES) |
-                       A2XX_PA_SU_SC_MODE_CNTL_BACK_PTYPE(PC_DRAW_TRIANGLES));
+                       A2XX_PA_SU_SC_MODE_CNTL_BACK_PTYPE(PC_DRAW_TRIANGLES) |
+                       (fast_clear ? A2XX_PA_SU_SC_MODE_CNTL_MSAA_ENABLE : 0));
+
+       if (fast_clear) {
+               OUT_PKT3(ring, CP_SET_CONSTANT, 2);
+               OUT_RING(ring, CP_REG(REG_A2XX_PA_SC_AA_CONFIG));
+               OUT_RING(ring, A2XX_PA_SC_AA_CONFIG_MSAA_NUM_SAMPLES(3));
+       }
 
        OUT_PKT3(ring, CP_SET_CONSTANT, 2);
        OUT_RING(ring, CP_REG(REG_A2XX_PA_SC_AA_MASK));
        OUT_RING(ring, 0x0000ffff);
 
-       OUT_PKT3(ring, CP_SET_CONSTANT, 3);
-       OUT_RING(ring, CP_REG(REG_A2XX_PA_SC_WINDOW_SCISSOR_TL));
-       OUT_RING(ring, xy2d(0,0));              /* PA_SC_WINDOW_SCISSOR_TL */
-       OUT_RING(ring, xy2d(fb->width,      /* PA_SC_WINDOW_SCISSOR_BR */
-                       fb->height));
-
        OUT_PKT3(ring, CP_SET_CONSTANT, 2);
        OUT_RING(ring, CP_REG(REG_A2XX_RB_COLOR_MASK));
        if (buffers & PIPE_CLEAR_COLOR) {
@@ -264,22 +284,327 @@ fd2_clear(struct fd_context *ctx, unsigned buffers,
                OUT_RING(ring, 0x0);
        }
 
+       OUT_PKT3(ring, CP_SET_CONSTANT, 2);
+       OUT_RING(ring, CP_REG(REG_A2XX_RB_BLEND_CONTROL));
+       OUT_RING(ring, 0);
+
+       if (is_a20x(batch->ctx->screen))
+               return;
+
        OUT_PKT3(ring, CP_SET_CONSTANT, 3);
        OUT_RING(ring, CP_REG(REG_A2XX_VGT_MAX_VTX_INDX));
        OUT_RING(ring, 3);                 /* VGT_MAX_VTX_INDX */
        OUT_RING(ring, 0);                 /* VGT_MIN_VTX_INDX */
 
-       fd_draw(ctx->batch, ring, DI_PT_RECTLIST, IGNORE_VISIBILITY,
-                       DI_SRC_SEL_AUTO_INDEX, 3, 0, INDEX_SIZE_IGN, 0, 0, NULL);
+       OUT_PKT3(ring, CP_SET_CONSTANT, 3);
+       OUT_RING(ring, CP_REG(REG_A2XX_RB_STENCILREFMASK_BF));
+       OUT_RING(ring, 0xff000000 | A2XX_RB_STENCILREFMASK_BF_STENCILWRITEMASK(0xff));
+       OUT_RING(ring, 0xff000000 | A2XX_RB_STENCILREFMASK_STENCILWRITEMASK(0xff));
 
        OUT_PKT3(ring, CP_SET_CONSTANT, 2);
        OUT_RING(ring, CP_REG(REG_A2XX_A220_RB_LRZ_VSC_CONTROL));
-       OUT_RING(ring, 0x00000000);
+       OUT_RING(ring, 0x00000084);
+
+       OUT_PKT3(ring, CP_SET_CONSTANT, 2);
+       OUT_RING(ring, CP_REG(REG_A2XX_VGT_VERTEX_REUSE_BLOCK_CNTL));
+       OUT_RING(ring, 0x0000028f);
+}
+
+static void
+clear_state_restore(struct fd_context *ctx, struct fd_ringbuffer *ring)
+{
+       if (is_a20x(ctx->screen))
+               return;
 
        OUT_PKT3(ring, CP_SET_CONSTANT, 2);
        OUT_RING(ring, CP_REG(REG_A2XX_RB_COPY_CONTROL));
        OUT_RING(ring, 0x00000000);
 
+       OUT_PKT3(ring, CP_SET_CONSTANT, 2);
+       OUT_RING(ring, CP_REG(REG_A2XX_A220_RB_LRZ_VSC_CONTROL));
+       OUT_RING(ring, 0x00000000);
+
+       OUT_PKT3(ring, CP_SET_CONSTANT, 2);
+       OUT_RING(ring, CP_REG(REG_A2XX_VGT_VERTEX_REUSE_BLOCK_CNTL));
+       OUT_RING(ring, 0x0000003b);
+}
+
+static void
+clear_fast(struct fd_batch *batch, struct fd_ringbuffer *ring,
+       uint32_t color_clear, uint32_t depth_clear, unsigned patch_type)
+{
+       BEGIN_RING(ring, 8); /* preallocate next 2 packets (for patching) */
+
+       /* zero values are patched in */
+       OUT_PKT3(ring, CP_SET_CONSTANT, 2);
+       OUT_RING(ring, CP_REG(REG_A2XX_PA_SC_SCREEN_SCISSOR_BR));
+       OUT_RINGP(ring, patch_type, &batch->gmem_patches);
+
+       OUT_PKT3(ring, CP_SET_CONSTANT, 4);
+       OUT_RING(ring, CP_REG(REG_A2XX_RB_SURFACE_INFO));
+       OUT_RING(ring, 0x8000 | 32);
+       OUT_RING(ring, 0);
+       OUT_RING(ring, 0);
+
+       /* set fill values */
+       if (!is_a20x(batch->ctx->screen)) {
+               OUT_PKT3(ring, CP_SET_CONSTANT, 2);
+               OUT_RING(ring, CP_REG(REG_A2XX_CLEAR_COLOR));
+               OUT_RING(ring, color_clear);
+
+               OUT_PKT3(ring, CP_SET_CONSTANT, 2);
+               OUT_RING(ring, CP_REG(REG_A2XX_RB_COPY_CONTROL));
+               OUT_RING(ring, A2XX_RB_COPY_CONTROL_DEPTH_CLEAR_ENABLE |
+                       A2XX_RB_COPY_CONTROL_CLEAR_MASK(0xf));
+
+               OUT_PKT3(ring, CP_SET_CONSTANT, 2);
+               OUT_RING(ring, CP_REG(REG_A2XX_RB_DEPTH_CLEAR));
+               OUT_RING(ring, depth_clear);
+       } else {
+               const float sc = 1.0f / 255.0f;
+
+               OUT_PKT3(ring, CP_SET_CONSTANT, 5);
+               OUT_RING(ring, 0x00000480);
+               OUT_RING(ring, fui((float) (color_clear >>  0 & 0xff) * sc));
+               OUT_RING(ring, fui((float) (color_clear >>  8 & 0xff) * sc));
+               OUT_RING(ring, fui((float) (color_clear >> 16 & 0xff) * sc));
+               OUT_RING(ring, fui((float) (color_clear >> 24 & 0xff) * sc));
+
+               // XXX if using float the rounding error breaks it..
+               float depth = ((double) (depth_clear >> 8)) * (1.0/(double) 0xffffff);
+               assert((unsigned) (((double) depth * (double) 0xffffff)) ==
+                       (depth_clear >> 8));
+
+               OUT_PKT3(ring, CP_SET_CONSTANT, 3);
+               OUT_RING(ring, CP_REG(REG_A2XX_PA_CL_VPORT_ZSCALE));
+               OUT_RING(ring, fui(0.0f));
+               OUT_RING(ring, fui(depth));
+
+               OUT_PKT3(ring, CP_SET_CONSTANT, 3);
+               OUT_RING(ring, CP_REG(REG_A2XX_RB_STENCILREFMASK_BF));
+               OUT_RING(ring, 0xff000000 |
+                       A2XX_RB_STENCILREFMASK_BF_STENCILREF(depth_clear & 0xff) |
+                       A2XX_RB_STENCILREFMASK_BF_STENCILWRITEMASK(0xff));
+               OUT_RING(ring, 0xff000000 |
+                       A2XX_RB_STENCILREFMASK_STENCILREF(depth_clear & 0xff) |
+                       A2XX_RB_STENCILREFMASK_STENCILWRITEMASK(0xff));
+       }
+
+       fd_draw(batch, ring, DI_PT_RECTLIST, IGNORE_VISIBILITY,
+                       DI_SRC_SEL_AUTO_INDEX, 3, 0, INDEX_SIZE_IGN, 0, 0, NULL);
+}
+
+static bool
+fd2_clear_fast(struct fd_context *ctx, unsigned buffers,
+               const union pipe_color_union *color, double depth, unsigned stencil)
+{
+       /* using 4x MSAA allows clearing ~2x faster
+        * then we can use higher bpp clearing to clear lower bpp
+        * 1 "pixel" can clear 64 bits (rgba8+depth24+stencil8)
+        * note: its possible to clear with 32_32_32_32 format but its not faster
+        * note: fast clear doesn't work with sysmem rendering
+        * (sysmem rendering is disabled when clear is used)
+        *
+        * we only have 16-bit / 32-bit color formats
+        * and 16-bit / 32-bit depth formats
+        * so there are only a few possible combinations
+        *
+        * if the bpp of the color/depth doesn't match
+        * we clear with depth/color individually
+        */
+       struct fd2_context *fd2_ctx = fd2_context(ctx);
+       struct fd_batch *batch = ctx->batch;
+       struct fd_ringbuffer *ring = batch->draw;
+       struct pipe_framebuffer_state *pfb = &batch->framebuffer;
+       uint32_t color_clear = 0, depth_clear = 0;
+       enum pipe_format format = pipe_surface_format(pfb->cbufs[0]);
+       int depth_size = -1; /* -1: no clear, 0: clear 16-bit, 1: clear 32-bit */
+       int color_size = -1;
+
+       /* TODO: need to test performance on a22x */
+       if (!is_a20x(ctx->screen))
+               return false;
+
+       if (buffers & PIPE_CLEAR_COLOR)
+               color_size = util_format_get_blocksizebits(format) == 32;
+
+       if (buffers & (PIPE_CLEAR_DEPTH | PIPE_CLEAR_STENCIL)) {
+               /* no fast clear when clearing only one component of depth+stencil buffer */
+               if (!(buffers & PIPE_CLEAR_DEPTH))
+                       return false;
+
+               if ((pfb->zsbuf->format == PIPE_FORMAT_Z24_UNORM_S8_UINT ||
+                        pfb->zsbuf->format == PIPE_FORMAT_S8_UINT_Z24_UNORM) &&
+                        !(buffers & PIPE_CLEAR_STENCIL))
+                       return false;
+
+               depth_size = fd_pipe2depth(pfb->zsbuf->format) == DEPTHX_24_8;
+       }
+
+       assert(color_size >= 0 || depth_size >= 0);
+
+       if (color_size == 0) {
+               color_clear = pack_rgba(format, color->f);
+               color_clear = (color_clear << 16) | (color_clear & 0xffff);
+       } else if (color_size == 1) {
+               color_clear = pack_rgba(format, color->f);
+       }
+
+       if (depth_size == 0) {
+               depth_clear = (uint32_t)(0xffff * depth);
+               depth_clear |= depth_clear << 16;
+       } else if (depth_size == 1) {
+               depth_clear = (((uint32_t)(0xffffff * depth)) << 8);
+               depth_clear |= (stencil & 0xff);
+       }
+
+       /* disable "window" scissor.. */
+       OUT_PKT3(ring, CP_SET_CONSTANT, 3);
+       OUT_RING(ring, CP_REG(REG_A2XX_PA_SC_WINDOW_SCISSOR_TL));
+       OUT_RING(ring, xy2d(0, 0));
+       OUT_RING(ring, xy2d(0x7fff, 0x7fff));
+
+       /* make sure we fill all "pixels" (in SCREEN_SCISSOR) */
+       OUT_PKT3(ring, CP_SET_CONSTANT, 5);
+       OUT_RING(ring, CP_REG(REG_A2XX_PA_CL_VPORT_XSCALE));
+       OUT_RING(ring, fui(4096.0));
+       OUT_RING(ring, fui(4096.0));
+       OUT_RING(ring, fui(4096.0));
+       OUT_RING(ring, fui(4096.0));
+
+       clear_state(batch, ring, ~0u, true);
+
+       if (color_size >= 0 && depth_size != color_size)
+               clear_fast(batch, ring, color_clear, color_clear, GMEM_PATCH_FASTCLEAR_COLOR);
+
+       if (depth_size >= 0 && depth_size != color_size)
+               clear_fast(batch, ring, depth_clear, depth_clear, GMEM_PATCH_FASTCLEAR_DEPTH);
+
+       if (depth_size == color_size)
+               clear_fast(batch, ring, color_clear, depth_clear, GMEM_PATCH_FASTCLEAR_COLOR_DEPTH);
+
+       clear_state_restore(ctx, ring);
+
+       OUT_PKT3(ring, CP_SET_CONSTANT, 2);
+       OUT_RING(ring, CP_REG(REG_A2XX_PA_SC_AA_CONFIG));
+       OUT_RING(ring, 0);
+
+       /* can't patch in SCREEN_SCISSOR_BR as it can be different for each tile.
+        * MEM_WRITE the value in tile_renderprep, and use CP_LOAD_CONSTANT_CONTEXT
+        * the value is read from byte offset 60 in the given bo
+        */
+       OUT_PKT3(ring, CP_LOAD_CONSTANT_CONTEXT, 3);
+       OUT_RELOC(ring, fd_resource(fd2_ctx->solid_vertexbuf)->bo, 0, 0, 0);
+       OUT_RING(ring, CP_REG(REG_A2XX_PA_SC_SCREEN_SCISSOR_BR));
+       OUT_RING(ring, 1);
+
+       OUT_PKT3(ring, CP_SET_CONSTANT, 4);
+       OUT_RING(ring, CP_REG(REG_A2XX_RB_SURFACE_INFO));
+       OUT_RINGP(ring, GMEM_PATCH_RESTORE_INFO, &batch->gmem_patches);
+       OUT_RING(ring, 0);
+       OUT_RING(ring, 0);
+       return true;
+}
+
+static bool
+fd2_clear(struct fd_context *ctx, unsigned buffers,
+               const union pipe_color_union *color, double depth, unsigned stencil)
+{
+       struct fd_ringbuffer *ring = ctx->batch->draw;
+       struct pipe_framebuffer_state *fb = &ctx->batch->framebuffer;
+
+       if (fd2_clear_fast(ctx, buffers, color, depth, stencil))
+               goto dirty;
+
+       /* set clear value */
+       if (is_a20x(ctx->screen)) {
+               if (buffers & PIPE_CLEAR_COLOR) {
+                       /* C0 used by fragment shader */
+                       OUT_PKT3(ring, CP_SET_CONSTANT, 5);
+                       OUT_RING(ring, 0x00000480);
+                       OUT_RING(ring, color->ui[0]);
+                       OUT_RING(ring, color->ui[1]);
+                       OUT_RING(ring, color->ui[2]);
+                       OUT_RING(ring, color->ui[3]);
+               }
+
+               if (buffers & PIPE_CLEAR_DEPTH) {
+                       /* use viewport to set depth value */
+                       OUT_PKT3(ring, CP_SET_CONSTANT, 3);
+                       OUT_RING(ring, CP_REG(REG_A2XX_PA_CL_VPORT_ZSCALE));
+                       OUT_RING(ring, fui(0.0f));
+                       OUT_RING(ring, fui(depth));
+               }
+
+               if (buffers & PIPE_CLEAR_STENCIL) {
+                       OUT_PKT3(ring, CP_SET_CONSTANT, 3);
+                       OUT_RING(ring, CP_REG(REG_A2XX_RB_STENCILREFMASK_BF));
+                       OUT_RING(ring, 0xff000000 |
+                               A2XX_RB_STENCILREFMASK_BF_STENCILREF(stencil) |
+                               A2XX_RB_STENCILREFMASK_BF_STENCILWRITEMASK(0xff));
+                       OUT_RING(ring, 0xff000000 |
+                               A2XX_RB_STENCILREFMASK_STENCILREF(stencil) |
+                               A2XX_RB_STENCILREFMASK_STENCILWRITEMASK(0xff));
+               }
+       } else {
+               if (buffers & PIPE_CLEAR_COLOR) {
+                       OUT_PKT3(ring, CP_SET_CONSTANT, 2);
+                       OUT_RING(ring, CP_REG(REG_A2XX_CLEAR_COLOR));
+                       OUT_RING(ring, pack_rgba(PIPE_FORMAT_R8G8B8A8_UNORM, color->f));
+               }
+
+               if (buffers & (PIPE_CLEAR_DEPTH | PIPE_CLEAR_STENCIL)) {
+                       uint32_t clear_mask, depth_clear;
+                       switch (fd_pipe2depth(fb->zsbuf->format)) {
+                       case DEPTHX_24_8:
+                               clear_mask = ((buffers & PIPE_CLEAR_DEPTH) ? 0xe : 0) |
+                                       ((buffers & PIPE_CLEAR_STENCIL) ? 0x1 : 0);
+                               depth_clear = (((uint32_t)(0xffffff * depth)) << 8) |
+                                       (stencil & 0xff);
+                               break;
+                       case DEPTHX_16:
+                               clear_mask = 0xf;
+                               depth_clear = (uint32_t)(0xffffffff * depth);
+                               break;
+                       default:
+                               unreachable("invalid depth");
+                               break;
+                       }
+
+                       OUT_PKT3(ring, CP_SET_CONSTANT, 2);
+                       OUT_RING(ring, CP_REG(REG_A2XX_RB_COPY_CONTROL));
+                       OUT_RING(ring, A2XX_RB_COPY_CONTROL_DEPTH_CLEAR_ENABLE |
+                               A2XX_RB_COPY_CONTROL_CLEAR_MASK(clear_mask));
+
+                       OUT_PKT3(ring, CP_SET_CONSTANT, 2);
+                       OUT_RING(ring, CP_REG(REG_A2XX_RB_DEPTH_CLEAR));
+                       OUT_RING(ring, depth_clear);
+               }
+       }
+
+       /* scissor state */
+       OUT_PKT3(ring, CP_SET_CONSTANT, 3);
+       OUT_RING(ring, CP_REG(REG_A2XX_PA_SC_WINDOW_SCISSOR_TL));
+       OUT_RING(ring, xy2d(0, 0));
+       OUT_RING(ring, xy2d(fb->width, fb->height));
+
+       /* viewport state */
+       OUT_PKT3(ring, CP_SET_CONSTANT, 5);
+       OUT_RING(ring, CP_REG(REG_A2XX_PA_CL_VPORT_XSCALE));
+       OUT_RING(ring, fui((float) fb->width / 2.0));
+       OUT_RING(ring, fui((float) fb->width / 2.0));
+       OUT_RING(ring, fui((float) fb->height / 2.0));
+       OUT_RING(ring, fui((float) fb->height / 2.0));
+
+       /* common state */
+       clear_state(ctx->batch, ring, buffers, false);
+
+       fd_draw(ctx->batch, ring, DI_PT_RECTLIST, IGNORE_VISIBILITY,
+                       DI_SRC_SEL_AUTO_INDEX, 3, 0, INDEX_SIZE_IGN, 0, 0, NULL);
+
+       clear_state_restore(ctx, ring);
+
+dirty:
        ctx->dirty |= FD_DIRTY_ZSA |
                        FD_DIRTY_VIEWPORT |
                        FD_DIRTY_RASTERIZER |
@@ -287,10 +612,13 @@ fd2_clear(struct fd_context *ctx, unsigned buffers,
                        FD_DIRTY_PROG |
                        FD_DIRTY_CONST |
                        FD_DIRTY_BLEND |
-                       FD_DIRTY_FRAMEBUFFER;
+                       FD_DIRTY_FRAMEBUFFER |
+                       FD_DIRTY_SCISSOR;
 
        ctx->dirty_shader[PIPE_SHADER_VERTEX]   |= FD_DIRTY_SHADER_PROG;
        ctx->dirty_shader[PIPE_SHADER_FRAGMENT] |= FD_DIRTY_SHADER_PROG | FD_DIRTY_SHADER_CONST;
+
+       return true;
 }
 
 void