From 848c0e72f36d0e1e460193a2d30b2f631529156f Mon Sep 17 00:00:00 2001 From: Chia-I Wu Date: Thu, 12 Sep 2013 13:00:52 +0800 Subject: [PATCH] i965: compute DDX in a subspan based only on top row Consider only the top-left and top-right pixels to approximate DDX in a 2x2 subspan, unless the application requests a more accurate approximation via GL_FRAGMENT_SHADER_DERIVATIVE_HINT or this optimization is disabled from the new driconf option disable_derivative_optimization. This results in a less accurate approximation. However, it improves the performance of Xonotic with Ultra settings by 24.3879% +/- 0.832202% (at 95.0% confidence) on Haswell. No noticeable image quality difference observed. The improvement comes from faster sample_d. It seems, on Haswell, some optimizations are introduced to allow faster sample_d when all pixels in a subspan have the same derivative. I considered SAMPLE_STATE too, which allows one to control the quality of sample_d on Haswell. But it gave much worse image quality without giving better performance comparing to this change. No piglit quick.tests regression on Haswell (tested with v1). v2: better guess for precompile program key Signed-off-by: Chia-I Wu Reviewed-by: Kenneth Graunke Reviewed-by: Ian Romanick Reviewed-by: Chris Forbes --- src/mesa/drivers/dri/i965/brw_context.c | 2 ++ src/mesa/drivers/dri/i965/brw_context.h | 1 + src/mesa/drivers/dri/i965/brw_fs.cpp | 6 ++++ .../drivers/dri/i965/brw_fs_generator.cpp | 33 ++++++++++++++----- src/mesa/drivers/dri/i965/brw_wm.c | 10 ++++++ src/mesa/drivers/dri/i965/brw_wm.h | 1 + src/mesa/drivers/dri/i965/intel_screen.c | 4 +++ 7 files changed, 49 insertions(+), 8 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c index 5f58a291d8d..18b8e573b59 100644 --- a/src/mesa/drivers/dri/i965/brw_context.c +++ b/src/mesa/drivers/dri/i965/brw_context.c @@ -478,6 +478,8 @@ brwCreateContext(int api, brw_draw_init( brw ); brw->precompile = driQueryOptionb(&brw->optionCache, "shader_precompile"); + brw->disable_derivative_optimization = + driQueryOptionb(&brw->optionCache, "disable_derivative_optimization"); ctx->Const.ContextFlags = 0; if ((flags & __DRI_CTX_FLAG_FORWARD_COMPATIBLE) != 0) diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h index 0f88bad2475..0ec12185ee4 100644 --- a/src/mesa/drivers/dri/i965/brw_context.h +++ b/src/mesa/drivers/dri/i965/brw_context.h @@ -1005,6 +1005,7 @@ struct brw_context bool always_flush_cache; bool disable_throttling; bool precompile; + bool disable_derivative_optimization; driOptionCache optionCache; /** @} */ diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp index bcb15ee0f92..0b441d451f1 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp @@ -3213,6 +3213,12 @@ brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog) key.nr_color_regions = 1; + /* GL_FRAGMENT_SHADER_DERIVATIVE_HINT is almost always GL_DONT_CARE. The + * quality of the derivatives is likely to be determined by the driconf + * option. + */ + key.high_quality_derivatives = brw->disable_derivative_optimization; + key.program_string_id = bfp->id; uint32_t old_prog_offset = brw->wm.base.prog_offset; diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp index 7ce42c4b9dc..9eb5e177928 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp @@ -540,7 +540,7 @@ fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src * * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br * - * and we're trying to produce: + * Ideally, we want to produce: * * DDX DDY * dst: (ss0.tr - ss0.tl) (ss0.tl - ss0.bl) @@ -556,24 +556,41 @@ fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src * * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result * for each pair, and vertstride = 2 jumps us 2 elements after processing a - * pair. But for DDY, it's harder, as we want to produce the pairs swizzled - * between each other. We could probably do it like ddx and swizzle the right - * order later, but bail for now and just produce + * pair. But the ideal approximation may impose a huge performance cost on + * sample_d. On at least Haswell, sample_d instruction does some + * optimizations if the same LOD is used for all pixels in the subspan. + * + * For DDY, it's harder, as we want to produce the pairs swizzled between each + * other. We could probably do it like ddx and swizzle the right order later, + * but bail for now and just produce * ((ss0.tl - ss0.bl)x4 (ss1.tl - ss1.bl)x4) */ void fs_generator::generate_ddx(fs_inst *inst, struct brw_reg dst, struct brw_reg src) { + unsigned vstride, width; + + if (c->key.high_quality_derivatives) { + /* produce accurate derivatives */ + vstride = BRW_VERTICAL_STRIDE_2; + width = BRW_WIDTH_2; + } + else { + /* replicate the derivative at the top-left pixel to other pixels */ + vstride = BRW_VERTICAL_STRIDE_4; + width = BRW_WIDTH_4; + } + struct brw_reg src0 = brw_reg(src.file, src.nr, 1, BRW_REGISTER_TYPE_F, - BRW_VERTICAL_STRIDE_2, - BRW_WIDTH_2, + vstride, + width, BRW_HORIZONTAL_STRIDE_0, BRW_SWIZZLE_XYZW, WRITEMASK_XYZW); struct brw_reg src1 = brw_reg(src.file, src.nr, 0, BRW_REGISTER_TYPE_F, - BRW_VERTICAL_STRIDE_2, - BRW_WIDTH_2, + vstride, + width, BRW_HORIZONTAL_STRIDE_0, BRW_SWIZZLE_XYZW, WRITEMASK_XYZW); brw_ADD(p, dst, src0, negate(src1)); diff --git a/src/mesa/drivers/dri/i965/brw_wm.c b/src/mesa/drivers/dri/i965/brw_wm.c index 3d7ca2a4d03..9745edac6ca 100644 --- a/src/mesa/drivers/dri/i965/brw_wm.c +++ b/src/mesa/drivers/dri/i965/brw_wm.c @@ -416,6 +416,15 @@ static void brw_wm_populate_key( struct brw_context *brw, key->line_aa = line_aa; + /* _NEW_HINT */ + if (brw->disable_derivative_optimization) { + key->high_quality_derivatives = + ctx->Hint.FragmentShaderDerivative != GL_FASTEST; + } else { + key->high_quality_derivatives = + ctx->Hint.FragmentShaderDerivative == GL_NICEST; + } + if (brw->gen < 6) key->stats_wm = brw->stats_wm; @@ -503,6 +512,7 @@ const struct brw_tracked_state brw_wm_prog = { _NEW_STENCIL | _NEW_POLYGON | _NEW_LINE | + _NEW_HINT | _NEW_LIGHT | _NEW_FRAG_CLAMP | _NEW_BUFFERS | diff --git a/src/mesa/drivers/dri/i965/brw_wm.h b/src/mesa/drivers/dri/i965/brw_wm.h index f7a2c5f234d..aa786def463 100644 --- a/src/mesa/drivers/dri/i965/brw_wm.h +++ b/src/mesa/drivers/dri/i965/brw_wm.h @@ -66,6 +66,7 @@ struct brw_wm_prog_key { GLuint render_to_fbo:1; GLuint clamp_fragment_color:1; GLuint line_aa:2; + GLuint high_quality_derivatives:1; GLushort drawable_height; GLbitfield64 input_slots_valid; diff --git a/src/mesa/drivers/dri/i965/intel_screen.c b/src/mesa/drivers/dri/i965/intel_screen.c index de80a00ebb3..cddc8e81330 100644 --- a/src/mesa/drivers/dri/i965/intel_screen.c +++ b/src/mesa/drivers/dri/i965/intel_screen.c @@ -58,6 +58,10 @@ PUBLIC const char __driConfigOptions[] = DRI_CONF_DESC(en, "Enable Hierarchical Z on gen6+") DRI_CONF_OPT_END + DRI_CONF_OPT_BEGIN_B(disable_derivative_optimization, "false") + DRI_CONF_DESC(en, "Derivatives with finer granularity by default") + DRI_CONF_OPT_END + DRI_CONF_SECTION_END DRI_CONF_SECTION_QUALITY DRI_CONF_FORCE_S3TC_ENABLE("false") -- 2.30.2