From: Jonathan Marek Date: Mon, 28 Jan 2019 17:49:54 +0000 (-0500) Subject: freedreno: a2xx: clear fixes and fast clear path X-Git-Url: https://git.libre-soc.org/?p=mesa.git;a=commitdiff_plain;h=912a9c8d8cf5e7e4e05a5cb06f4284eeff7b379a freedreno: a2xx: clear fixes and fast clear path This fixes the depth/stencil clear on a20x, and adds a fast clear path. The fast clear path is only used for a20x, needs performance tests on a22x. Signed-off-by: Jonathan Marek --- diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_context.c b/src/gallium/drivers/freedreno/a2xx/fd2_context.c index 760ad17732a..28073b07011 100644 --- a/src/gallium/drivers/freedreno/a2xx/fd2_context.c +++ b/src/gallium/drivers/freedreno/a2xx/fd2_context.c @@ -54,6 +54,8 @@ create_solid_vertexbuf(struct pipe_context *pctx) +0.000000, +0.000000, +1.000000, +0.000000, +0.000000, +1.000000, + /* SCREEN_SCISSOR_BR value (must be at 60 byte offset in page) */ + 0.0, }; struct pipe_resource *prsc = pipe_buffer_create(pctx->screen, PIPE_BIND_CUSTOM, PIPE_USAGE_IMMUTABLE, sizeof(init_shader_const)); diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_draw.c b/src/gallium/drivers/freedreno/a2xx/fd2_draw.c index c857c118d91..05c4cd5391b 100644 --- a/src/gallium/drivers/freedreno/a2xx/fd2_draw.c +++ b/src/gallium/drivers/freedreno/a2xx/fd2_draw.c @@ -208,23 +208,13 @@ fd2_draw_vbo(struct fd_context *ctx, const struct pipe_draw_info *pinfo, return true; } - -static bool -fd2_clear(struct fd_context *ctx, unsigned buffers, - const union pipe_color_union *color, double depth, unsigned stencil) +static void +clear_state(struct fd_batch *batch, struct fd_ringbuffer *ring, + unsigned buffers, bool fast_clear) { + struct fd_context *ctx = batch->ctx; struct fd2_context *fd2_ctx = fd2_context(ctx); - struct fd_ringbuffer *ring = ctx->batch->draw; - struct pipe_framebuffer_state *fb = &ctx->batch->framebuffer; - uint32_t reg, colr = 0; - - if ((buffers & PIPE_CLEAR_COLOR) && fb->nr_cbufs) - colr = pack_rgba(PIPE_FORMAT_R8G8B8A8_UNORM, color->f); - - /* emit generic state now: */ - fd2_emit_state(ctx, ctx->dirty & - (FD_DIRTY_BLEND | FD_DIRTY_VIEWPORT | - FD_DIRTY_FRAMEBUFFER | FD_DIRTY_SCISSOR)); + uint32_t reg; fd2_emit_vertex_bufs(ring, 0x9c, (struct fd2_vertex_buf[]) { { .prsc = fd2_ctx->solid_vertexbuf, .size = 36 }, @@ -234,96 +224,28 @@ fd2_clear(struct fd_context *ctx, unsigned buffers, OUT_RING(ring, CP_REG(REG_A2XX_VGT_INDX_OFFSET)); OUT_RING(ring, 0); - if (!is_a20x(ctx->screen)) { - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_A2XX_VGT_VERTEX_REUSE_BLOCK_CNTL)); - OUT_RING(ring, 0x0000028f); - } - fd2_program_emit(ctx, ring, &ctx->solid_prog); OUT_PKT0(ring, REG_A2XX_TC_CNTL_STATUS, 1); OUT_RING(ring, A2XX_TC_CNTL_STATUS_L2_INVALIDATE); - if (is_a20x(ctx->screen)) { - OUT_PKT3(ring, CP_SET_CONSTANT, 5); - OUT_RING(ring, 0x00000480); - OUT_RING(ring, color->ui[0]); - OUT_RING(ring, color->ui[1]); - OUT_RING(ring, color->ui[2]); - OUT_RING(ring, color->ui[3]); - } else { - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_A2XX_CLEAR_COLOR)); - OUT_RING(ring, colr); - } - - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_A2XX_A220_RB_LRZ_VSC_CONTROL)); - OUT_RING(ring, 0x00000084); - - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_A2XX_RB_COPY_CONTROL)); - reg = 0; if (buffers & (PIPE_CLEAR_DEPTH | PIPE_CLEAR_STENCIL)) { - reg |= A2XX_RB_COPY_CONTROL_DEPTH_CLEAR_ENABLE; - switch (fd_pipe2depth(fb->zsbuf->format)) { - case DEPTHX_24_8: - if (buffers & PIPE_CLEAR_DEPTH) - reg |= A2XX_RB_COPY_CONTROL_CLEAR_MASK(0xe); - if (buffers & PIPE_CLEAR_STENCIL) - reg |= A2XX_RB_COPY_CONTROL_CLEAR_MASK(0x1); - break; - case DEPTHX_16: - if (buffers & PIPE_CLEAR_DEPTH) - reg |= A2XX_RB_COPY_CONTROL_CLEAR_MASK(0xf); - break; - default: - debug_assert(0); - break; - } - } - OUT_RING(ring, reg); - - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_A2XX_RB_DEPTH_CLEAR)); - reg = 0; - if (buffers & (PIPE_CLEAR_DEPTH | PIPE_CLEAR_STENCIL)) { - switch (fd_pipe2depth(fb->zsbuf->format)) { - case DEPTHX_24_8: - reg = (((uint32_t)(0xffffff * depth)) << 8) | - (stencil & 0xff); - break; - case DEPTHX_16: - reg = (uint32_t)(0xffffffff * depth); - break; - default: - debug_assert(0); - break; - } - } - OUT_RING(ring, reg); - - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_A2XX_RB_DEPTHCONTROL)); - reg = 0; - if (buffers & PIPE_CLEAR_DEPTH) { - reg |= A2XX_RB_DEPTHCONTROL_ZFUNC(FUNC_ALWAYS) | + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_RB_DEPTHCONTROL)); + reg = 0; + if (buffers & PIPE_CLEAR_DEPTH) { + reg |= A2XX_RB_DEPTHCONTROL_ZFUNC(FUNC_ALWAYS) | A2XX_RB_DEPTHCONTROL_Z_ENABLE | A2XX_RB_DEPTHCONTROL_Z_WRITE_ENABLE | A2XX_RB_DEPTHCONTROL_EARLY_Z_ENABLE; + } + if (buffers & PIPE_CLEAR_STENCIL) { + reg |= A2XX_RB_DEPTHCONTROL_STENCILFUNC(FUNC_ALWAYS) | + A2XX_RB_DEPTHCONTROL_STENCIL_ENABLE | + A2XX_RB_DEPTHCONTROL_STENCILZPASS(STENCIL_REPLACE); + } + OUT_RING(ring, reg); } - if (buffers & PIPE_CLEAR_STENCIL) { - reg |= A2XX_RB_DEPTHCONTROL_STENCILFUNC(FUNC_ALWAYS) | - A2XX_RB_DEPTHCONTROL_STENCIL_ENABLE | - A2XX_RB_DEPTHCONTROL_STENCILZPASS(STENCIL_REPLACE); - } - OUT_RING(ring, reg); - - OUT_PKT3(ring, CP_SET_CONSTANT, 3); - OUT_RING(ring, CP_REG(REG_A2XX_RB_STENCILREFMASK_BF)); - OUT_RING(ring, 0xff000000 | A2XX_RB_STENCILREFMASK_BF_STENCILWRITEMASK(0xff)); - OUT_RING(ring, 0xff000000 | A2XX_RB_STENCILREFMASK_STENCILWRITEMASK(0xff)); OUT_PKT3(ring, CP_SET_CONSTANT, 2); OUT_RING(ring, CP_REG(REG_A2XX_RB_COLORCONTROL)); @@ -338,18 +260,19 @@ fd2_clear(struct fd_context *ctx, unsigned buffers, OUT_RING(ring, 0x00000000); /* PA_CL_CLIP_CNTL */ OUT_RING(ring, A2XX_PA_SU_SC_MODE_CNTL_PROVOKING_VTX_LAST | /* PA_SU_SC_MODE_CNTL */ A2XX_PA_SU_SC_MODE_CNTL_FRONT_PTYPE(PC_DRAW_TRIANGLES) | - A2XX_PA_SU_SC_MODE_CNTL_BACK_PTYPE(PC_DRAW_TRIANGLES)); + A2XX_PA_SU_SC_MODE_CNTL_BACK_PTYPE(PC_DRAW_TRIANGLES) | + (fast_clear ? A2XX_PA_SU_SC_MODE_CNTL_MSAA_ENABLE : 0)); + + if (fast_clear) { + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_PA_SC_AA_CONFIG)); + OUT_RING(ring, A2XX_PA_SC_AA_CONFIG_MSAA_NUM_SAMPLES(3)); + } OUT_PKT3(ring, CP_SET_CONSTANT, 2); OUT_RING(ring, CP_REG(REG_A2XX_PA_SC_AA_MASK)); OUT_RING(ring, 0x0000ffff); - OUT_PKT3(ring, CP_SET_CONSTANT, 3); - OUT_RING(ring, CP_REG(REG_A2XX_PA_SC_WINDOW_SCISSOR_TL)); - OUT_RING(ring, xy2d(0,0)); /* PA_SC_WINDOW_SCISSOR_TL */ - OUT_RING(ring, xy2d(fb->width, /* PA_SC_WINDOW_SCISSOR_BR */ - fb->height)); - OUT_PKT3(ring, CP_SET_CONSTANT, 2); OUT_RING(ring, CP_REG(REG_A2XX_RB_COLOR_MASK)); if (buffers & PIPE_CLEAR_COLOR) { @@ -361,30 +284,326 @@ fd2_clear(struct fd_context *ctx, unsigned buffers, OUT_RING(ring, 0x0); } - if (!is_a20x(ctx->screen)) { - OUT_PKT3(ring, CP_SET_CONSTANT, 3); - OUT_RING(ring, CP_REG(REG_A2XX_VGT_MAX_VTX_INDX)); - OUT_RING(ring, 3); /* VGT_MAX_VTX_INDX */ - OUT_RING(ring, 0); /* VGT_MIN_VTX_INDX */ - } + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_RB_BLEND_CONTROL)); + OUT_RING(ring, 0); - fd_draw(ctx->batch, ring, DI_PT_RECTLIST, IGNORE_VISIBILITY, - DI_SRC_SEL_AUTO_INDEX, 3, 0, INDEX_SIZE_IGN, 0, 0, NULL); + if (is_a20x(batch->ctx->screen)) + return; + + OUT_PKT3(ring, CP_SET_CONSTANT, 3); + OUT_RING(ring, CP_REG(REG_A2XX_VGT_MAX_VTX_INDX)); + OUT_RING(ring, 3); /* VGT_MAX_VTX_INDX */ + OUT_RING(ring, 0); /* VGT_MIN_VTX_INDX */ + + OUT_PKT3(ring, CP_SET_CONSTANT, 3); + OUT_RING(ring, CP_REG(REG_A2XX_RB_STENCILREFMASK_BF)); + OUT_RING(ring, 0xff000000 | A2XX_RB_STENCILREFMASK_BF_STENCILWRITEMASK(0xff)); + OUT_RING(ring, 0xff000000 | A2XX_RB_STENCILREFMASK_STENCILWRITEMASK(0xff)); OUT_PKT3(ring, CP_SET_CONSTANT, 2); OUT_RING(ring, CP_REG(REG_A2XX_A220_RB_LRZ_VSC_CONTROL)); - OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000084); + + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_VGT_VERTEX_REUSE_BLOCK_CNTL)); + OUT_RING(ring, 0x0000028f); +} + +static void +clear_state_restore(struct fd_context *ctx, struct fd_ringbuffer *ring) +{ + if (is_a20x(ctx->screen)) + return; OUT_PKT3(ring, CP_SET_CONSTANT, 2); OUT_RING(ring, CP_REG(REG_A2XX_RB_COPY_CONTROL)); OUT_RING(ring, 0x00000000); - if (!is_a20x(ctx->screen)) { + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_A220_RB_LRZ_VSC_CONTROL)); + OUT_RING(ring, 0x00000000); + + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_VGT_VERTEX_REUSE_BLOCK_CNTL)); + OUT_RING(ring, 0x0000003b); +} + +static void +clear_fast(struct fd_batch *batch, struct fd_ringbuffer *ring, + uint32_t color_clear, uint32_t depth_clear, unsigned patch_type) +{ + BEGIN_RING(ring, 8); /* preallocate next 2 packets (for patching) */ + + /* zero values are patched in */ + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_PA_SC_SCREEN_SCISSOR_BR)); + OUT_RINGP(ring, patch_type, &batch->gmem_patches); + OUT_RING(ring, 0); + + OUT_PKT3(ring, CP_SET_CONSTANT, 4); + OUT_RING(ring, CP_REG(REG_A2XX_RB_SURFACE_INFO)); + OUT_RING(ring, 0x8000 | 32); + OUT_RING(ring, 0); + OUT_RING(ring, 0); + + /* set fill values */ + if (!is_a20x(batch->ctx->screen)) { OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_A2XX_VGT_VERTEX_REUSE_BLOCK_CNTL)); - OUT_RING(ring, 0x0000003b); + OUT_RING(ring, CP_REG(REG_A2XX_CLEAR_COLOR)); + OUT_RING(ring, color_clear); + + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_RB_COPY_CONTROL)); + OUT_RING(ring, A2XX_RB_COPY_CONTROL_DEPTH_CLEAR_ENABLE | + A2XX_RB_COPY_CONTROL_CLEAR_MASK(0xf)); + + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_RB_DEPTH_CLEAR)); + OUT_RING(ring, depth_clear); + } else { + const float sc = 1.0f / 255.0f; + + OUT_PKT3(ring, CP_SET_CONSTANT, 5); + OUT_RING(ring, 0x00000480); + OUT_RING(ring, fui((float) (color_clear >> 0 & 0xff) * sc)); + OUT_RING(ring, fui((float) (color_clear >> 8 & 0xff) * sc)); + OUT_RING(ring, fui((float) (color_clear >> 16 & 0xff) * sc)); + OUT_RING(ring, fui((float) (color_clear >> 24 & 0xff) * sc)); + + // XXX if using float the rounding error breaks it.. + float depth = ((double) (depth_clear >> 8)) * (1.0/(double) 0xffffff); + assert((unsigned) (((double) depth * (double) 0xffffff)) == + (depth_clear >> 8)); + + OUT_PKT3(ring, CP_SET_CONSTANT, 3); + OUT_RING(ring, CP_REG(REG_A2XX_PA_CL_VPORT_ZSCALE)); + OUT_RING(ring, fui(0.0f)); + OUT_RING(ring, fui(depth)); + + OUT_PKT3(ring, CP_SET_CONSTANT, 3); + OUT_RING(ring, CP_REG(REG_A2XX_RB_STENCILREFMASK_BF)); + OUT_RING(ring, 0xff000000 | + A2XX_RB_STENCILREFMASK_BF_STENCILREF(depth_clear & 0xff) | + A2XX_RB_STENCILREFMASK_BF_STENCILWRITEMASK(0xff)); + OUT_RING(ring, 0xff000000 | + A2XX_RB_STENCILREFMASK_STENCILREF(depth_clear & 0xff) | + A2XX_RB_STENCILREFMASK_STENCILWRITEMASK(0xff)); + } + + fd_draw(batch, ring, DI_PT_RECTLIST, IGNORE_VISIBILITY, + DI_SRC_SEL_AUTO_INDEX, 3, 0, INDEX_SIZE_IGN, 0, 0, NULL); +} + +static bool +fd2_clear_fast(struct fd_context *ctx, unsigned buffers, + const union pipe_color_union *color, double depth, unsigned stencil) +{ + /* using 4x MSAA allows clearing ~2x faster + * then we can use higher bpp clearing to clear lower bpp + * 1 "pixel" can clear 64 bits (rgba8+depth24+stencil8) + * note: its possible to clear with 32_32_32_32 format but its not faster + * note: fast clear doesn't work with sysmem rendering + * (sysmem rendering is disabled when clear is used) + * + * we only have 16-bit / 32-bit color formats + * and 16-bit / 32-bit depth formats + * so there are only a few possible combinations + * + * if the bpp of the color/depth doesn't match + * we clear with depth/color individually + */ + struct fd2_context *fd2_ctx = fd2_context(ctx); + struct fd_batch *batch = ctx->batch; + struct fd_ringbuffer *ring = batch->draw; + struct pipe_framebuffer_state *pfb = &batch->framebuffer; + uint32_t color_clear = 0, depth_clear = 0; + enum pipe_format format = pipe_surface_format(pfb->cbufs[0]); + int depth_size = -1; /* -1: no clear, 0: clear 16-bit, 1: clear 32-bit */ + int color_size = -1; + + /* TODO: need to test performance on a22x */ + if (!is_a20x(ctx->screen)) + return false; + + if (buffers & PIPE_CLEAR_COLOR) + color_size = util_format_get_blocksizebits(format) == 32; + + if (buffers & (PIPE_CLEAR_DEPTH | PIPE_CLEAR_STENCIL)) + depth_size = fd_pipe2depth(pfb->zsbuf->format) == DEPTHX_24_8; + + assert(color_size >= 0 || depth_size >= 0); + + /* when clearing 24_8, depth/stencil must be both cleared + * TODO: if buffer isn't attached we can clear it anyway + */ + if (depth_size == 1 && !(buffers & PIPE_CLEAR_STENCIL) != !(buffers & PIPE_CLEAR_DEPTH)) + return false; + + if (color_size == 0) { + color_clear = pack_rgba(format, color->f); + color_clear = (color_clear << 16) | (color_clear & 0xffff); + } else if (color_size == 1) { + color_clear = pack_rgba(format, color->f); + } + + if (depth_size == 0) { + depth_clear = (uint32_t)(0xffff * depth); + depth_clear |= depth_clear << 16; + } else if (depth_size == 1) { + depth_clear = (((uint32_t)(0xffffff * depth)) << 8); + depth_clear |= (stencil & 0xff); + } + + /* disable "window" scissor.. */ + OUT_PKT3(ring, CP_SET_CONSTANT, 3); + OUT_RING(ring, CP_REG(REG_A2XX_PA_SC_WINDOW_SCISSOR_TL)); + OUT_RING(ring, xy2d(0, 0)); + OUT_RING(ring, xy2d(0x7fff, 0x7fff)); + + /* make sure we fill all "pixels" (in SCREEN_SCISSOR) */ + OUT_PKT3(ring, CP_SET_CONSTANT, 5); + OUT_RING(ring, CP_REG(REG_A2XX_PA_CL_VPORT_XSCALE)); + OUT_RING(ring, fui(4096.0)); + OUT_RING(ring, fui(4096.0)); + OUT_RING(ring, fui(4096.0)); + OUT_RING(ring, fui(4096.0)); + + clear_state(batch, ring, ~0u, true); + + if (color_size >= 0 && depth_size != color_size) + clear_fast(batch, ring, color_clear, color_clear, GMEM_PATCH_FASTCLEAR_COLOR); + + if (depth_size >= 0 && depth_size != color_size) + clear_fast(batch, ring, depth_clear, depth_clear, GMEM_PATCH_FASTCLEAR_DEPTH); + + if (depth_size == color_size) + clear_fast(batch, ring, color_clear, depth_clear, GMEM_PATCH_FASTCLEAR_COLOR_DEPTH); + + clear_state_restore(ctx, ring); + + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_PA_SC_AA_CONFIG)); + OUT_RING(ring, 0); + + /* can't patch in SCREEN_SCISSOR_BR as it can be different for each tile. + * MEM_WRITE the value in tile_renderprep, and use CP_LOAD_CONSTANT_CONTEXT + * the value is read from byte offset 60 in the given bo + */ + OUT_PKT3(ring, CP_LOAD_CONSTANT_CONTEXT, 3); + OUT_RELOC(ring, fd_resource(fd2_ctx->solid_vertexbuf)->bo, 0, 0, 0); + OUT_RING(ring, CP_REG(REG_A2XX_PA_SC_SCREEN_SCISSOR_BR)); + OUT_RING(ring, 1); + + OUT_PKT3(ring, CP_SET_CONSTANT, 4); + OUT_RING(ring, CP_REG(REG_A2XX_RB_SURFACE_INFO)); + OUT_RINGP(ring, GMEM_PATCH_RESTORE_INFO, &batch->gmem_patches); + OUT_RING(ring, 0); + OUT_RING(ring, 0); + return true; +} + +static bool +fd2_clear(struct fd_context *ctx, unsigned buffers, + const union pipe_color_union *color, double depth, unsigned stencil) +{ + struct fd_ringbuffer *ring = ctx->batch->draw; + struct pipe_framebuffer_state *fb = &ctx->batch->framebuffer; + + if (fd2_clear_fast(ctx, buffers, color, depth, stencil)) + goto dirty; + + /* set clear value */ + if (is_a20x(ctx->screen)) { + if (buffers & PIPE_CLEAR_COLOR) { + /* C0 used by fragment shader */ + OUT_PKT3(ring, CP_SET_CONSTANT, 5); + OUT_RING(ring, 0x00000480); + OUT_RING(ring, color->ui[0]); + OUT_RING(ring, color->ui[1]); + OUT_RING(ring, color->ui[2]); + OUT_RING(ring, color->ui[3]); + } + + if (buffers & PIPE_CLEAR_DEPTH) { + /* use viewport to set depth value */ + OUT_PKT3(ring, CP_SET_CONSTANT, 3); + OUT_RING(ring, CP_REG(REG_A2XX_PA_CL_VPORT_ZSCALE)); + OUT_RING(ring, fui(0.0f)); + OUT_RING(ring, fui(depth)); + } + + if (buffers & PIPE_CLEAR_STENCIL) { + OUT_PKT3(ring, CP_SET_CONSTANT, 3); + OUT_RING(ring, CP_REG(REG_A2XX_RB_STENCILREFMASK_BF)); + OUT_RING(ring, 0xff000000 | + A2XX_RB_STENCILREFMASK_BF_STENCILREF(stencil) | + A2XX_RB_STENCILREFMASK_BF_STENCILWRITEMASK(0xff)); + OUT_RING(ring, 0xff000000 | + A2XX_RB_STENCILREFMASK_STENCILREF(stencil) | + A2XX_RB_STENCILREFMASK_STENCILWRITEMASK(0xff)); + } + } else { + if (buffers & PIPE_CLEAR_COLOR) { + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_CLEAR_COLOR)); + OUT_RING(ring, pack_rgba(PIPE_FORMAT_R8G8B8A8_UNORM, color->f)); + } + + if (buffers & (PIPE_CLEAR_DEPTH | PIPE_CLEAR_STENCIL)) { + uint32_t clear_mask, depth_clear; + if (buffers & (PIPE_CLEAR_DEPTH | PIPE_CLEAR_STENCIL)) { + switch (fd_pipe2depth(fb->zsbuf->format)) { + case DEPTHX_24_8: + clear_mask = ((buffers & PIPE_CLEAR_DEPTH) ? 0xe : 0) | + ((buffers & PIPE_CLEAR_STENCIL) ? 0x1 : 0); + depth_clear = (((uint32_t)(0xffffff * depth)) << 8) | + (stencil & 0xff); + break; + case DEPTHX_16: + clear_mask = 0xf; + depth_clear = (uint32_t)(0xffffffff * depth); + break; + default: + debug_assert(0); + break; + } + } + + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_RB_COPY_CONTROL)); + OUT_RING(ring, A2XX_RB_COPY_CONTROL_DEPTH_CLEAR_ENABLE | + A2XX_RB_COPY_CONTROL_CLEAR_MASK(clear_mask)); + + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_RB_DEPTH_CLEAR)); + OUT_RING(ring, depth_clear); + } } + /* scissor state */ + OUT_PKT3(ring, CP_SET_CONSTANT, 3); + OUT_RING(ring, CP_REG(REG_A2XX_PA_SC_WINDOW_SCISSOR_TL)); + OUT_RING(ring, xy2d(0, 0)); + OUT_RING(ring, xy2d(fb->width, fb->height)); + + /* viewport state */ + OUT_PKT3(ring, CP_SET_CONSTANT, 5); + OUT_RING(ring, CP_REG(REG_A2XX_PA_CL_VPORT_XSCALE)); + OUT_RING(ring, fui((float) fb->width / 2.0)); + OUT_RING(ring, fui((float) fb->width / 2.0)); + OUT_RING(ring, fui((float) fb->height / 2.0)); + OUT_RING(ring, fui((float) fb->height / 2.0)); + + /* common state */ + clear_state(ctx->batch, ring, buffers, false); + + fd_draw(ctx->batch, ring, DI_PT_RECTLIST, IGNORE_VISIBILITY, + DI_SRC_SEL_AUTO_INDEX, 3, 0, INDEX_SIZE_IGN, 0, 0, NULL); + + clear_state_restore(ctx, ring); + +dirty: ctx->dirty |= FD_DIRTY_ZSA | FD_DIRTY_VIEWPORT | FD_DIRTY_RASTERIZER | @@ -392,7 +611,8 @@ fd2_clear(struct fd_context *ctx, unsigned buffers, FD_DIRTY_PROG | FD_DIRTY_CONST | FD_DIRTY_BLEND | - FD_DIRTY_FRAMEBUFFER; + FD_DIRTY_FRAMEBUFFER | + FD_DIRTY_SCISSOR; ctx->dirty_shader[PIPE_SHADER_VERTEX] |= FD_DIRTY_SHADER_PROG; ctx->dirty_shader[PIPE_SHADER_FRAGMENT] |= FD_DIRTY_SHADER_PROG | FD_DIRTY_SHADER_CONST; diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_draw.h b/src/gallium/drivers/freedreno/a2xx/fd2_draw.h index 1dd67e0401c..c7964756a53 100644 --- a/src/gallium/drivers/freedreno/a2xx/fd2_draw.h +++ b/src/gallium/drivers/freedreno/a2xx/fd2_draw.h @@ -33,4 +33,11 @@ void fd2_draw_init(struct pipe_context *pctx); +enum { + GMEM_PATCH_FASTCLEAR_COLOR, + GMEM_PATCH_FASTCLEAR_DEPTH, + GMEM_PATCH_FASTCLEAR_COLOR_DEPTH, + GMEM_PATCH_RESTORE_INFO, +}; + #endif /* FD2_DRAW_H_ */ diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_emit.c b/src/gallium/drivers/freedreno/a2xx/fd2_emit.c index 18d69444d12..805a4cf032a 100644 --- a/src/gallium/drivers/freedreno/a2xx/fd2_emit.c +++ b/src/gallium/drivers/freedreno/a2xx/fd2_emit.c @@ -360,7 +360,7 @@ fd2_emit_state(struct fd_context *ctx, const enum fd_dirty_3d_state dirty) if (dirty & (FD_DIRTY_BLEND | FD_DIRTY_ZSA)) { OUT_PKT3(ring, CP_SET_CONSTANT, 2); OUT_RING(ring, CP_REG(REG_A2XX_RB_COLORCONTROL)); - OUT_RING(ring, blend ? zsa->rb_colorcontrol | blend->rb_colorcontrol : 0); + OUT_RING(ring, zsa->rb_colorcontrol | blend->rb_colorcontrol); } if (dirty & (FD_DIRTY_BLEND | FD_DIRTY_FRAMEBUFFER)) { @@ -370,13 +370,13 @@ fd2_emit_state(struct fd_context *ctx, const enum fd_dirty_3d_state dirty) OUT_PKT3(ring, CP_SET_CONSTANT, 2); OUT_RING(ring, CP_REG(REG_A2XX_RB_BLEND_CONTROL)); - OUT_RING(ring, blend ? blend->rb_blendcontrol_alpha | + OUT_RING(ring, blend->rb_blendcontrol_alpha | COND(has_alpha, blend->rb_blendcontrol_rgb) | - COND(!has_alpha, blend->rb_blendcontrol_no_alpha_rgb) : 0); + COND(!has_alpha, blend->rb_blendcontrol_no_alpha_rgb)); OUT_PKT3(ring, CP_SET_CONSTANT, 2); OUT_RING(ring, CP_REG(REG_A2XX_RB_COLOR_MASK)); - OUT_RING(ring, blend ? blend->rb_colormask : 0xf); + OUT_RING(ring, blend->rb_colormask); } if (dirty & FD_DIRTY_BLEND_COLOR) { diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_gmem.c b/src/gallium/drivers/freedreno/a2xx/fd2_gmem.c index 6a066a63730..17d6d6ef25a 100644 --- a/src/gallium/drivers/freedreno/a2xx/fd2_gmem.c +++ b/src/gallium/drivers/freedreno/a2xx/fd2_gmem.c @@ -39,6 +39,7 @@ #include "fd2_program.h" #include "fd2_util.h" #include "fd2_zsa.h" +#include "fd2_draw.h" #include "instr-a2xx.h" static uint32_t fmt2swap(enum pipe_format format) @@ -473,6 +474,58 @@ fd2_emit_tile_init(struct fd_batch *batch) reg |= A2XX_RB_DEPTH_INFO_DEPTH_FORMAT(fd_pipe2depth(pfb->zsbuf->format)); OUT_RING(ring, reg); /* RB_DEPTH_INFO */ + /* fast clear patches */ + int depth_size = -1; + int color_size = -1; + + if (pfb->cbufs[0]) + color_size = util_format_get_blocksizebits(format) == 32 ? 4 : 2; + + if (pfb->zsbuf) + depth_size = fd_pipe2depth(pfb->zsbuf->format) == 1 ? 4 : 2; + + for (int i = 0; i < fd_patch_num_elements(&batch->gmem_patches); i++) { + struct fd_cs_patch *patch = fd_patch_element(&batch->gmem_patches, i); + uint32_t color_base = 0, depth_base = gmem->zsbuf_base[0]; + uint32_t size, lines; + + /* note: 1 "line" is 512 bytes in both color/depth areas (1K total) */ + switch (patch->val) { + case GMEM_PATCH_FASTCLEAR_COLOR: + size = align(gmem->bin_w * gmem->bin_h * color_size, 0x4000); + lines = size / 1024; + depth_base = size / 2; + break; + case GMEM_PATCH_FASTCLEAR_DEPTH: + size = align(gmem->bin_w * gmem->bin_h * depth_size, 0x4000); + lines = size / 1024; + color_base = depth_base; + depth_base = depth_base + size / 2; + break; + case GMEM_PATCH_FASTCLEAR_COLOR_DEPTH: + lines = align(gmem->bin_w * gmem->bin_h * color_size * 2, 0x4000) / 1024; + break; + case GMEM_PATCH_RESTORE_INFO: + patch->cs[0] = gmem->bin_w; + patch->cs[1] = A2XX_RB_COLOR_INFO_SWAP(fmt2swap(format)) | + A2XX_RB_COLOR_INFO_FORMAT(fd2_pipe2color(format)); + patch->cs[2] = A2XX_RB_DEPTH_INFO_DEPTH_BASE(gmem->zsbuf_base[0]); + if (pfb->zsbuf) + patch->cs[2] |= A2XX_RB_DEPTH_INFO_DEPTH_FORMAT(fd_pipe2depth(pfb->zsbuf->format)); + continue; + default: + continue; + } + + patch->cs[0] = A2XX_PA_SC_SCREEN_SCISSOR_BR_X(32) | + A2XX_PA_SC_SCREEN_SCISSOR_BR_Y(lines); + patch->cs[4] = A2XX_RB_COLOR_INFO_BASE(color_base) | + A2XX_RB_COLOR_INFO_FORMAT(COLORX_8_8_8_8); + patch->cs[5] = A2XX_RB_DEPTH_INFO_DEPTH_BASE(depth_base) | + A2XX_RB_DEPTH_INFO_DEPTH_FORMAT(1); + } + util_dynarray_resize(&batch->gmem_patches, 0); + /* set to zero, for some reason hardware doesn't like certain values */ OUT_PKT3(ring, CP_SET_CONSTANT, 2); OUT_RING(ring, CP_REG(REG_A2XX_VGT_CURRENT_BIN_ID_MIN)); @@ -607,6 +660,7 @@ static void fd2_emit_tile_renderprep(struct fd_batch *batch, struct fd_tile *tile) { struct fd_context *ctx = batch->ctx; + struct fd2_context *fd2_ctx = fd2_context(ctx); struct fd_ringbuffer *ring = batch->gmem; struct pipe_framebuffer_state *pfb = &batch->framebuffer; enum pipe_format format = pipe_surface_format(pfb->cbufs[0]); @@ -624,6 +678,12 @@ fd2_emit_tile_renderprep(struct fd_batch *batch, struct fd_tile *tile) OUT_RING(ring, A2XX_PA_SC_WINDOW_OFFSET_X(-tile->xoff) | A2XX_PA_SC_WINDOW_OFFSET_Y(-tile->yoff)); + /* write SCISSOR_BR to memory so fast clear path can restore from it */ + OUT_PKT3(ring, CP_MEM_WRITE, 2); + OUT_RELOC(ring, fd_resource(fd2_ctx->solid_vertexbuf)->bo, 60, 0, 0); + OUT_RING(ring, A2XX_PA_SC_SCREEN_SCISSOR_BR_X(tile->bin_w) | + A2XX_PA_SC_SCREEN_SCISSOR_BR_Y(tile->bin_h)); + /* tile offset for gl_FragCoord on a20x (C64 in fragment shader) */ if (is_a20x(batch->ctx->screen)) { OUT_PKT3(ring, CP_SET_CONSTANT, 5); diff --git a/src/gallium/drivers/freedreno/freedreno_batch.c b/src/gallium/drivers/freedreno/freedreno_batch.c index a852494a8fc..a1578506c2a 100644 --- a/src/gallium/drivers/freedreno/freedreno_batch.c +++ b/src/gallium/drivers/freedreno/freedreno_batch.c @@ -90,8 +90,10 @@ batch_init(struct fd_batch *batch) util_dynarray_init(&batch->draw_patches, NULL); - if (is_a2xx(ctx->screen)) + if (is_a2xx(ctx->screen)) { util_dynarray_init(&batch->shader_patches, NULL); + util_dynarray_init(&batch->gmem_patches, NULL); + } if (is_a3xx(ctx->screen)) util_dynarray_init(&batch->rbrc_patches, NULL); @@ -167,8 +169,10 @@ batch_fini(struct fd_batch *batch) util_dynarray_fini(&batch->draw_patches); - if (is_a2xx(batch->ctx->screen)) + if (is_a2xx(batch->ctx->screen)) { util_dynarray_fini(&batch->shader_patches); + util_dynarray_fini(&batch->gmem_patches); + } if (is_a3xx(batch->ctx->screen)) util_dynarray_fini(&batch->rbrc_patches); diff --git a/src/gallium/drivers/freedreno/freedreno_batch.h b/src/gallium/drivers/freedreno/freedreno_batch.h index 428a0279072..7b723db64af 100644 --- a/src/gallium/drivers/freedreno/freedreno_batch.h +++ b/src/gallium/drivers/freedreno/freedreno_batch.h @@ -145,6 +145,11 @@ struct fd_batch { */ struct util_dynarray rbrc_patches; + /* Keep track of GMEM related values that need to be patched up once we + * know the gmem layout: + */ + struct util_dynarray gmem_patches; + /* Keep track of pointer to start of MEM exports for a20x binning shaders * * this is so the end of the shader can be cut off at the right point diff --git a/src/gallium/drivers/freedreno/freedreno_gmem.c b/src/gallium/drivers/freedreno/freedreno_gmem.c index d0420b27d31..dd35dfa29fa 100644 --- a/src/gallium/drivers/freedreno/freedreno_gmem.c +++ b/src/gallium/drivers/freedreno/freedreno_gmem.c @@ -77,24 +77,25 @@ static uint32_t bin_width(struct fd_screen *screen) static uint32_t total_size(uint8_t cbuf_cpp[], uint8_t zsbuf_cpp[2], - uint32_t bin_w, uint32_t bin_h, struct fd_gmem_stateobj *gmem) + uint32_t bin_w, uint32_t bin_h, uint32_t gmem_align, + struct fd_gmem_stateobj *gmem) { uint32_t total = 0, i; for (i = 0; i < MAX_RENDER_TARGETS; i++) { if (cbuf_cpp[i]) { - gmem->cbuf_base[i] = align(total, 0x4000); + gmem->cbuf_base[i] = align(total, gmem_align); total = gmem->cbuf_base[i] + cbuf_cpp[i] * bin_w * bin_h; } } if (zsbuf_cpp[0]) { - gmem->zsbuf_base[0] = align(total, 0x4000); + gmem->zsbuf_base[0] = align(total, gmem_align); total = gmem->zsbuf_base[0] + zsbuf_cpp[0] * bin_w * bin_h; } if (zsbuf_cpp[1]) { - gmem->zsbuf_base[1] = align(total, 0x4000); + gmem->zsbuf_base[1] = align(total, gmem_align); total = gmem->zsbuf_base[1] + zsbuf_cpp[1] * bin_w * bin_h; } @@ -116,6 +117,7 @@ calculate_tiles(struct fd_batch *batch) uint32_t minx, miny, width, height; uint32_t nbins_x = 1, nbins_y = 1; uint32_t bin_w, bin_h; + uint32_t gmem_align = 0x4000; uint32_t max_width = bin_width(screen); uint8_t cbuf_cpp[MAX_RENDER_TARGETS] = {0}, zsbuf_cpp[2] = {0}; uint32_t i, j, t, xoff, yoff; @@ -178,10 +180,18 @@ calculate_tiles(struct fd_batch *batch) zsbuf_cpp[0], width, height); } + if (is_a20x(screen) && batch->cleared) { + /* under normal circumstances the requirement would be 4K + * but the fast clear path requires an alignment of 32K + */ + gmem_align = 0x8000; + } + /* then find a bin width/height that satisfies the memory * constraints: */ - while (total_size(cbuf_cpp, zsbuf_cpp, bin_w, bin_h, gmem) > gmem_size) { + while (total_size(cbuf_cpp, zsbuf_cpp, bin_w, bin_h, gmem_align, gmem) > + gmem_size) { if (bin_w > bin_h) { nbins_x++; bin_w = align(width / nbins_x, gmem_alignw);