freedreno/a3xx/compiler: half-precision output
author Rob Clark <robclark@freedesktop.org>
Sat, 22 Feb 2014 14:46:39 +0000 (09:46 -0500)
committer Rob Clark <robclark@freedesktop.org>
Sun, 23 Feb 2014 19:58:24 +0000 (14:58 -0500)
Using generic shaders caused a measurable fps drop, which was isolated to
use of full precision (vs half precision) output.  This is an attempt to
regain that lost performance by using half precision solid/blit shaders
(when the output format is not float32).

Note: for the built-in shaders, I would not expect them to be register
starved.  And in fact it is the solid frag shader that seems to have the
biggest impact.  So I suspect you get double the pixel pipe units (or
half the cycles) when the output is half precision.  So there may be
some gain to using half precision output for application shaders as
well, even though the rest of register usage is still full precision.
But for half precision to work for more complex shaders, we need to deal
with some constraints, like cat2 needing the same precision for its two src
registers.  So for now it is not enabled by default except for the
built-in shaders.

Signed-off-by: Rob Clark <robclark@freedesktop.org>
src/gallium/drivers/freedreno/a3xx/fd3_compiler.c
src/gallium/drivers/freedreno/a3xx/fd3_draw.c
src/gallium/drivers/freedreno/a3xx/fd3_gmem.c
src/gallium/drivers/freedreno/a3xx/fd3_program.c
src/gallium/drivers/freedreno/a3xx/ir3.h
src/gallium/drivers/freedreno/a3xx/ir3_ra.c

index f52003a47ee791a5ded6418af147a3ea7e6ee932..818d5611dd9ebb802e484e3219e54d8ac6ac515a 100644 (file)
 #include "instr-a3xx.h"
 #include "ir3.h"
 
+/* NOTE on half/full precision:
+ * Currently, the front end (ie. basically this file) does everything in
+ * full precision (with the exception of trans_arl(), which doesn't work
+ * currently; we reject anything with relative addressing and fall back
+ * to the old compiler).
+ *
+ * In the RA step, if half_precision, it will assign the output to hr0.x
+ * but use full precision everywhere else.
+ *
+ * Eventually we'll need a better way to communicate type information
+ * to RA so that it can more properly assign both half and full precision
+ * registers.  (And presumably double precision pairs for a4xx?)  This
+ * would let us make more use of half precision registers, while still
+ * keeping things like tex coords in full precision registers.
+ *
+ * Since the RA is dealing with patching instruction types for half
+ * precision output, we can ignore that in the front end and just always
+ * create full precision instructions.
+ */
 
 struct fd3_compile_context {
        const struct tgsi_token *tokens;
@@ -2030,7 +2049,7 @@ fd3_compile_shader(struct fd3_shader_variant *so,
                ir3_dump_instr_list(ctx.block->head);
        }
 
-       ret = ir3_block_ra(ctx.block, so->type);
+       ret = ir3_block_ra(ctx.block, so->type, key.half_precision);
        if (ret)
                goto out;
 
index 7b071b2cd5dd26d2f60df961673814c1bf026ddc..f822aa728fe1e5deb628e729769c1cce52f072f9 100644 (file)
@@ -103,6 +103,9 @@ fd3_draw(struct fd_context *ctx, const struct pipe_draw_info *info)
                        /* do binning pass first: */
                        .binning_pass = true,
                        .color_two_side = ctx->rasterizer ? ctx->rasterizer->light_twoside : false,
+                       // TODO set .half_precision based on render target format,
+                       // ie. float16 and smaller use half, float32 use full..
+                       .half_precision = !!(fd_mesa_debug & FD_DBG_FRAGHALF),
        };
        draw_impl(ctx, info, ctx->binning_ring,
                        dirty & ~(FD_DIRTY_BLEND), key);
@@ -126,6 +129,7 @@ fd3_clear_binning(struct fd_context *ctx, unsigned dirty)
        struct fd_ringbuffer *ring = ctx->binning_ring;
        struct fd3_shader_key key = {
                        .binning_pass = true,
+                       .half_precision = true,
        };
 
        fd3_emit_state(ctx, ring, &ctx->solid_prog, dirty, key);
@@ -166,6 +170,7 @@ fd3_clear(struct fd_context *ctx, unsigned buffers,
        unsigned dirty = ctx->dirty;
        unsigned ce, i;
        struct fd3_shader_key key = {
+                       .half_precision = true,
        };
 
        dirty &= FD_DIRTY_VIEWPORT | FD_DIRTY_FRAMEBUFFER | FD_DIRTY_SCISSOR;
index d1aa8cf120899cc0041bb1a3d01637d59e652dca..dde71ba97b9eb9bc69f3b8909d4390d716398cf1 100644 (file)
@@ -44,6 +44,9 @@
 #include "fd3_zsa.h"
 
 static const struct fd3_shader_key key = {
+               // XXX should set this based on render target format!  We don't
+               // want half_precision if float32 render target!!!
+               .half_precision = true,
 };
 
 static void
index 0a7500f1611dd3098d545c3e5bf1c3afde8cbfa5..34d4dd3330b0bb551bbfabb488da88629429ea9b 100644 (file)
@@ -101,7 +101,8 @@ create_variant(struct fd3_shader_stateobj *so, struct fd3_shader_key key)
        v->type = so->type;
 
        if (fd_mesa_debug & FD_DBG_DISASM) {
-               DBG("dump tgsi: type=%d", so->type);
+               DBG("dump tgsi: type=%d, k={bp=%u,cts=%u,hp=%u}", so->type,
+                       key.binning_pass, key.color_two_side, key.half_precision);
                tgsi_dump(tokens, 0);
        }
 
@@ -138,7 +139,8 @@ create_variant(struct fd3_shader_stateobj *so, struct fd3_shader_key key)
                fixup_vp_regfootprint(v);
 
        if (fd_mesa_debug & FD_DBG_DISASM) {
-               DBG("disassemble: type=%d", v->type);
+               DBG("disassemble: type=%d, k={bp=%u,cts=%u,hp=%u}", v->type,
+                       key.binning_pass, key.color_two_side, key.half_precision);
                disasm_a3xx(fd_bo_map(v->bo), v->info.sizedwords, 0, v->type);
        }
 
index 894db175076a5275f2fea4abca64b9157a158667..9327fbdca7251e66cd24c869d8f7308f81339ee4 100644 (file)
@@ -379,7 +379,8 @@ void ir3_block_cp(struct ir3_block *block);
 void ir3_block_sched(struct ir3_block *block);
 
 /* register assignment: */
-int ir3_block_ra(struct ir3_block *block, enum shader_t type);
+int ir3_block_ra(struct ir3_block *block, enum shader_t type,
+               bool half_precision);
 
 
 #ifndef ARRAY_SIZE
index 06a86ff3b2d9dfc4ecc055d1ec1511d5df2daa7c..1b3d0e3e1e50383bc0eec7c92bf8fcd081cdaf4b 100644 (file)
 struct ir3_ra_ctx {
        struct ir3_block *block;
        enum shader_t type;
+       bool half_precision;
        int cnt;
        bool error;
 };
 
+/* sorta ugly way to retrofit half-precision support.. rather than
+ * passing an extra param around, just OR a high bit into the register
+ * number.  All the low value arithmetic (ie. +/- offset within a
+ * contiguous vec4, etc) will continue to work as long as you don't
+ * underflow (and that would go badly anyway).
+ */
+#define REG_HALF  0x8000
+
 struct ir3_ra_assignment {
        int8_t  off;        /* offset of instruction dst within range */
        uint8_t num;        /* number of components for the range */
@@ -91,7 +100,7 @@ static int output_base(struct ir3_ra_ctx *ctx)
         * see how because the blob driver always uses r0.x (ie.
         * all zeros)
         */
-       if (ctx->type == SHADER_FRAGMENT)
+       if ((ctx->type == SHADER_FRAGMENT) && !ctx->half_precision)
                return 2;
        return 0;
 }
@@ -348,12 +357,88 @@ static inline struct ra_assign_visitor *ra_assign_visitor(struct ir3_visitor *v)
        return (struct ra_assign_visitor *)v;
 }
 
+static type_t half_type(type_t type)
+{
+       switch (type) {
+       case TYPE_F32: return TYPE_F16;
+       case TYPE_U32: return TYPE_U16;
+       case TYPE_S32: return TYPE_S16;
+       /* instructions may already be fixed up: */
+       case TYPE_F16:
+       case TYPE_U16:
+       case TYPE_S16:
+               return type;
+       default:
+               assert(0);
+               return ~0;
+       }
+}
+
+/* some instructions need fix-up if dst register is half precision: */
+static void fixup_half_instr_dst(struct ir3_instruction *instr)
+{
+       switch (instr->category) {
+       case 1: /* move instructions */
+               instr->cat1.dst_type = half_type(instr->cat1.dst_type);
+               break;
+       case 3:
+               switch (instr->opc) {
+               case OPC_MAD_F32:
+                       instr->opc = OPC_MAD_F16;
+                       break;
+               case OPC_SEL_B32:
+                       instr->opc = OPC_SEL_B16;
+                       break;
+               case OPC_SEL_S32:
+                       instr->opc = OPC_SEL_S16;
+                       break;
+               case OPC_SEL_F32:
+                       instr->opc = OPC_SEL_F16;
+                       break;
+               case OPC_SAD_S32:
+                       instr->opc = OPC_SAD_S16;
+                       break;
+               /* instructions may already be fixed up: */
+               case OPC_MAD_F16:
+               case OPC_SEL_B16:
+               case OPC_SEL_S16:
+               case OPC_SEL_F16:
+               case OPC_SAD_S16:
+                       break;
+               default:
+                       assert(0);
+                       break;
+               }
+               break;
+       case 5:
+               instr->cat5.type = half_type(instr->cat5.type);
+               break;
+       }
+}
+/* some instructions need fix-up if src register is half precision: */
+static void fixup_half_instr_src(struct ir3_instruction *instr)
+{
+       switch (instr->category) {
+       case 1: /* move instructions */
+               instr->cat1.src_type = half_type(instr->cat1.src_type);
+               break;
+       }
+}
+
 static void ra_assign_reg(struct ir3_visitor *v,
                struct ir3_instruction *instr, struct ir3_register *reg)
 {
        struct ra_assign_visitor *a = ra_assign_visitor(v);
        reg->flags &= ~IR3_REG_SSA;
-       reg->num = a->num;
+       reg->num = a->num & ~REG_HALF;
+       if (a->num & REG_HALF) {
+               reg->flags |= IR3_REG_HALF;
+               /* if dst reg being assigned, patch up the instr: */
+               if (reg == instr->regs[0])
+                       fixup_half_instr_dst(instr);
+               else
+                       fixup_half_instr_src(instr);
+       }
 }
 
 static void ra_assign_dst_shader_input(struct ir3_visitor *v,
@@ -429,8 +514,8 @@ static void ra_assign(struct ir3_ra_ctx *ctx,
 
        /* if we've already visited this instruction, bail now: */
        if (ir3_instr_check_mark(assigner)) {
-               debug_assert(assigner->regs[0]->num == num);
-               if (assigner->regs[0]->num != num) {
+               debug_assert(assigner->regs[0]->num == (num & ~REG_HALF));
+               if (assigner->regs[0]->num != (num & ~REG_HALF)) {
                        /* impossible situation, should have been resolved
                         * at an earlier stage by inserting extra mov's:
                         */
@@ -593,6 +678,9 @@ static int block_ra(struct ir3_ra_ctx *ctx, struct ir3_block *block)
 
                base = alloc_block(ctx, NULL, block->noutputs + off);
 
+               if (ctx->half_precision)
+                       base |= REG_HALF;
+
                for (i = 0; i < block->noutputs; i++)
                        if (block->outputs[i])
                                ra_assign(ctx, block->outputs[i], base + i + off);
@@ -600,7 +688,7 @@ static int block_ra(struct ir3_ra_ctx *ctx, struct ir3_block *block)
                if (ctx->type == SHADER_FRAGMENT) {
                        for (i = 0; i < block->ninputs; i++)
                                if (block->inputs[i])
-                                       ra_assign(ctx, block->inputs[i], base + i);
+                                       ra_assign(ctx, block->inputs[i], (base & ~REG_HALF) + i);
                } else {
                        for (i = 0; i < block->ninputs; i++)
                                if (block->inputs[i])
@@ -623,11 +711,13 @@ static int block_ra(struct ir3_ra_ctx *ctx, struct ir3_block *block)
        return 0;
 }
 
-int ir3_block_ra(struct ir3_block *block, enum shader_t type)
+int ir3_block_ra(struct ir3_block *block, enum shader_t type,
+               bool half_precision)
 {
        struct ir3_ra_ctx ctx = {
                        .block = block,
                        .type = type,
+                       .half_precision = half_precision,
        };
        ir3_shader_clear_mark(block->shader);
        return block_ra(&ctx, block);