freedreno/a3xx/compiler: half-precision output

author Rob Clark <robclark@freedesktop.org>

Sat, 22 Feb 2014 14:46:39 +0000 (09:46 -0500)

committer Rob Clark <robclark@freedesktop.org>

Sun, 23 Feb 2014 19:58:24 +0000 (14:58 -0500)
author Rob Clark <robclark@freedesktop.org>
Sat, 22 Feb 2014 14:46:39 +0000 (09:46 -0500)
committer Rob Clark <robclark@freedesktop.org>
Sun, 23 Feb 2014 19:58:24 +0000 (14:58 -0500)
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_compiler.c b/src/gallium/drivers/freedreno/a3xx/fd3_compiler.c

index f52003a47ee791a5ded6418af147a3ea7e6ee932..818d5611dd9ebb802e484e3219e54d8ac6ac515a 100644 (file)
--- a/src/gallium/drivers/freedreno/a3xx/fd3_compiler.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_compiler.c
@@ -48,6 +48,25 @@
  #include "instr-a3xx.h"
  #include "ir3.h"
  
+/* NOTE on half/full precision:
+ * Currently, the front end (ie. basically this file) does everything in
+ * full precision (with the exception of trans_arl(), which doesn't work
+ * currently.. we reject anything with relative addressing and fall back
+ * to the old compiler).
+ *
+ * In the RA step, if half_precision, it will assign the output to hr0.x
+ * but use full precision everywhere else.
+ *
+ * Eventually we'll need a better way to communicate type information
+ * to RA so that it can more properly assign both half and full precision
+ * registers.  (And presumably double precision pairs for a4xx?)  This
+ * would let us make more use of half precision registers, while still
+ * keeping things like tex coords in full precision registers.
+ *
+ * Since the RA is dealing with patching instruction types for half
+ * precision output, we can ignore that in the front end and just always
+ * create full precision instructions.
+ */
  
  struct fd3_compile_context {
         const struct tgsi_token *tokens;
@@ -2030,7 +2049,7 @@ fd3_compile_shader(struct fd3_shader_variant *so,
                 ir3_dump_instr_list(ctx.block->head);
         }
  
-       ret = ir3_block_ra(ctx.block, so->type);
+       ret = ir3_block_ra(ctx.block, so->type, key.half_precision);
         if (ret)
                 goto out;
  
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_draw.c b/src/gallium/drivers/freedreno/a3xx/fd3_draw.c

index 7b071b2cd5dd26d2f60df961673814c1bf026ddc..f822aa728fe1e5deb628e729769c1cce52f072f9 100644 (file)
--- a/src/gallium/drivers/freedreno/a3xx/fd3_draw.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_draw.c
@@ -103,6 +103,9 @@ fd3_draw(struct fd_context *ctx, const struct pipe_draw_info *info)
                         /* do binning pass first: */
                         .binning_pass = true,
                         .color_two_side = ctx->rasterizer ? ctx->rasterizer->light_twoside : false,
+                       // TODO set .half_precision based on render target format,
+                       // ie. float16 and smaller use half, float32 use full..
+                       .half_precision = !!(fd_mesa_debug & FD_DBG_FRAGHALF),
         };
         draw_impl(ctx, info, ctx->binning_ring,
                         dirty & ~(FD_DIRTY_BLEND), key);
@@ -126,6 +129,7 @@ fd3_clear_binning(struct fd_context *ctx, unsigned dirty)
         struct fd_ringbuffer *ring = ctx->binning_ring;
         struct fd3_shader_key key = {
                         .binning_pass = true,
+                       .half_precision = true,
         };
  
         fd3_emit_state(ctx, ring, &ctx->solid_prog, dirty, key);
@@ -166,6 +170,7 @@ fd3_clear(struct fd_context *ctx, unsigned buffers,
         unsigned dirty = ctx->dirty;
         unsigned ce, i;
         struct fd3_shader_key key = {
+                       .half_precision = true,
         };
  
         dirty &= FD_DIRTY_VIEWPORT | FD_DIRTY_FRAMEBUFFER | FD_DIRTY_SCISSOR;
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c b/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c

index d1aa8cf120899cc0041bb1a3d01637d59e652dca..dde71ba97b9eb9bc69f3b8909d4390d716398cf1 100644 (file)
--- a/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c
@@ -44,6 +44,9 @@
  #include "fd3_zsa.h"
  
  static const struct fd3_shader_key key = {
+               // XXX should set this based on render target format!  We don't
+               // want half_precision if float32 render target!!!
+               .half_precision = true,
  };
  
  static void
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_program.c b/src/gallium/drivers/freedreno/a3xx/fd3_program.c

index 0a7500f1611dd3098d545c3e5bf1c3afde8cbfa5..34d4dd3330b0bb551bbfabb488da88629429ea9b 100644 (file)
--- a/src/gallium/drivers/freedreno/a3xx/fd3_program.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_program.c
@@ -101,7 +101,8 @@ create_variant(struct fd3_shader_stateobj *so, struct fd3_shader_key key)
         v->type = so->type;
  
         if (fd_mesa_debug & FD_DBG_DISASM) {
-               DBG("dump tgsi: type=%d", so->type);
+               DBG("dump tgsi: type=%d, k={bp=%u,cts=%u,hp=%u}", so->type,
+                       key.binning_pass, key.color_two_side, key.half_precision);
                 tgsi_dump(tokens, 0);
         }
  
@@ -138,7 +139,8 @@ create_variant(struct fd3_shader_stateobj *so, struct fd3_shader_key key)
                 fixup_vp_regfootprint(v);
  
         if (fd_mesa_debug & FD_DBG_DISASM) {
-               DBG("disassemble: type=%d", v->type);
+               DBG("disassemble: type=%d, k={bp=%u,cts=%u,hp=%u}", v->type,
+                       key.binning_pass, key.color_two_side, key.half_precision);
                 disasm_a3xx(fd_bo_map(v->bo), v->info.sizedwords, 0, v->type);
         }
  
diff --git a/src/gallium/drivers/freedreno/a3xx/ir3.h b/src/gallium/drivers/freedreno/a3xx/ir3.h

index 894db175076a5275f2fea4abca64b9157a158667..9327fbdca7251e66cd24c869d8f7308f81339ee4 100644 (file)
--- a/src/gallium/drivers/freedreno/a3xx/ir3.h
+++ b/src/gallium/drivers/freedreno/a3xx/ir3.h
@@ -379,7 +379,8 @@ void ir3_block_cp(struct ir3_block *block);
  void ir3_block_sched(struct ir3_block *block);
  
  /* register assignment: */
-int ir3_block_ra(struct ir3_block *block, enum shader_t type);
+int ir3_block_ra(struct ir3_block *block, enum shader_t type,
+               bool half_precision);
  
  
  #ifndef ARRAY_SIZE
diff --git a/src/gallium/drivers/freedreno/a3xx/ir3_ra.c b/src/gallium/drivers/freedreno/a3xx/ir3_ra.c

index 06a86ff3b2d9dfc4ecc055d1ec1511d5df2daa7c..1b3d0e3e1e50383bc0eec7c92bf8fcd081cdaf4b 100644 (file)
--- a/src/gallium/drivers/freedreno/a3xx/ir3_ra.c
+++ b/src/gallium/drivers/freedreno/a3xx/ir3_ra.c
@@ -53,10 +53,19 @@
  struct ir3_ra_ctx {
         struct ir3_block *block;
         enum shader_t type;
+       bool half_precision; /* if set, block outputs are assigned half-precision registers */
         int cnt;
         bool error;
  };
  
+/* sorta ugly way to retrofit half-precision support.. rather than
+ * passing extra param around, just OR a high bit into the register
+ * number that is handed to ra_assign().  The bit is masked off again
+ * before the number is stored in the register, which gets its
+ * IR3_REG_HALF flag set instead.  All the low value arithmetic (ie.
+ * +/- offset within a contiguous vec4, etc) will continue to work as
+ * long as you don't underflow (and that would go badly anyways).
+ */
+#define REG_HALF  0x8000
+
  struct ir3_ra_assignment {
         int8_t  off;        /* offset of instruction dst within range */
         uint8_t num;        /* number of components for the range */
@@ -91,7 +100,7 @@ static int output_base(struct ir3_ra_ctx *ctx)
          * see how because the blob driver always uses r0.x (ie.
          * all zeros)
          */
-       if (ctx->type == SHADER_FRAGMENT)
+       if ((ctx->type == SHADER_FRAGMENT) && !ctx->half_precision)
                 return 2;
         return 0;
  }
@@ -348,12 +357,88 @@ static inline struct ra_assign_visitor *ra_assign_visitor(struct ir3_visitor *v)
         return (struct ra_assign_visitor *)v;
  }
  
+static type_t half_type(type_t type)
+{
+       switch (type) { /* map a 32-bit type to the 16-bit type of the same kind */
+       case TYPE_F32: return TYPE_F16;
+       case TYPE_U32: return TYPE_U16;
+       case TYPE_S32: return TYPE_S16;
+       /* instructions may already be fixed up, in which case the type
+        * is already a 16-bit one and is returned unchanged:
+        */
+       case TYPE_F16:
+       case TYPE_U16:
+       case TYPE_S16:
+               return type;
+       default:
+               assert(0); /* any other type is not expected here */
+               return ~0; /* only reached when assert() is compiled out */
+       }
+}
+
+/* some instructions need fix-up if dst register is half precision.
+ * Called from ra_assign_reg() when the register being assigned is the
+ * instruction's dst (regs[0]) and carries the REG_HALF marker:
+ *  - category 1 (mov): cat1.dst_type is switched to its 16-bit type
+ *  - category 3: the opcode is swapped for its 16-bit variant; cat3
+ *    opcodes not listed below hit the assert
+ *  - category 5: cat5.type is switched to its 16-bit type
+ * Instructions of any other category are left untouched.
+ */
+static void fixup_half_instr_dst(struct ir3_instruction *instr)
+{
+       switch (instr->category) {
+       case 1: /* move instructions */
+               instr->cat1.dst_type = half_type(instr->cat1.dst_type);
+               break;
+       case 3:
+               switch (instr->opc) {
+               case OPC_MAD_F32:
+                       instr->opc = OPC_MAD_F16;
+                       break;
+               case OPC_SEL_B32:
+                       instr->opc = OPC_SEL_B16;
+                       break;
+               case OPC_SEL_S32:
+                       instr->opc = OPC_SEL_S16;
+                       break;
+               case OPC_SEL_F32:
+                       instr->opc = OPC_SEL_F16;
+                       break;
+               case OPC_SAD_S32:
+                       instr->opc = OPC_SAD_S16;
+                       break;
+               /* instructions may already be fixed up: */
+               case OPC_MAD_F16:
+               case OPC_SEL_B16:
+               case OPC_SEL_S16:
+               case OPC_SEL_F16:
+               case OPC_SAD_S16:
+                       break;
+               default:
+                       assert(0);
+                       break;
+               }
+               break;
+       case 5:
+               instr->cat5.type = half_type(instr->cat5.type);
+               break;
+       }
+}
+/* some instructions need fix-up if src register is half precision.
+ * Called from ra_assign_reg() for a half register that is not the
+ * instruction's dst.  Only category 1 (mov) is patched, by switching
+ * cat1.src_type to its 16-bit type; other categories are left as-is:
+ */
+static void fixup_half_instr_src(struct ir3_instruction *instr)
+{
+       switch (instr->category) {
+       case 1: /* move instructions */
+               instr->cat1.src_type = half_type(instr->cat1.src_type);
+               break;
+       }
+}
+
  static void ra_assign_reg(struct ir3_visitor *v,
                 struct ir3_instruction *instr, struct ir3_register *reg)
  {
         struct ra_assign_visitor *a = ra_assign_visitor(v);
         reg->flags &= ~IR3_REG_SSA;
-       reg->num = a->num;
+       reg->num = a->num & ~REG_HALF; /* REG_HALF is only a marker bit, strip it from the stored number */
+       if (a->num & REG_HALF) {
+               reg->flags |= IR3_REG_HALF;
+               /* if dst reg (regs[0]) being assigned, patch up the instr
+                * for a half dst; any other register is handled as a src:
+                */
+               if (reg == instr->regs[0])
+                       fixup_half_instr_dst(instr);
+               else
+                       fixup_half_instr_src(instr);
+       }
  }
  
  static void ra_assign_dst_shader_input(struct ir3_visitor *v,
@@ -429,8 +514,8 @@ static void ra_assign(struct ir3_ra_ctx *ctx,
  
         /* if we've already visited this instruction, bail now: */
         if (ir3_instr_check_mark(assigner)) {
-               debug_assert(assigner->regs[0]->num == num);
-               if (assigner->regs[0]->num != num) {
+               debug_assert(assigner->regs[0]->num == (num & ~REG_HALF));
+               if (assigner->regs[0]->num != (num & ~REG_HALF)) {
                         /* impossible situation, should have been resolved
                          * at an earlier stage by inserting extra mov's:
                          */
@@ -593,6 +678,9 @@ static int block_ra(struct ir3_ra_ctx *ctx, struct ir3_block *block)
  
                 base = alloc_block(ctx, NULL, block->noutputs + off);
  
+               if (ctx->half_precision)
+                       base |= REG_HALF;
+
                 for (i = 0; i < block->noutputs; i++)
                         if (block->outputs[i])
                                 ra_assign(ctx, block->outputs[i], base + i + off);
@@ -600,7 +688,7 @@ static int block_ra(struct ir3_ra_ctx *ctx, struct ir3_block *block)
                 if (ctx->type == SHADER_FRAGMENT) {
                         for (i = 0; i < block->ninputs; i++)
                                 if (block->inputs[i])
-                                       ra_assign(ctx, block->inputs[i], base + i);
+                                       ra_assign(ctx, block->inputs[i], (base & ~REG_HALF) + i);
                 } else {
                         for (i = 0; i < block->ninputs; i++)
                                 if (block->inputs[i])
@@ -623,11 +711,13 @@ static int block_ra(struct ir3_ra_ctx *ctx, struct ir3_block *block)
         return 0;
  }
  
-int ir3_block_ra(struct ir3_block *block, enum shader_t type)
+int ir3_block_ra(struct ir3_block *block, enum shader_t type,
+               bool half_precision)
  {
         struct ir3_ra_ctx ctx = {
                         .block = block,
                         .type = type,
+                       .half_precision = half_precision,
         };
         ir3_shader_clear_mark(block->shader);
         return block_ra(&ctx, block);
author	Rob Clark <robclark@freedesktop.org>
	Sat, 22 Feb 2014 14:46:39 +0000 (09:46 -0500)
committer	Rob Clark <robclark@freedesktop.org>
	Sun, 23 Feb 2014 19:58:24 +0000 (14:58 -0500)
src/gallium/drivers/freedreno/a3xx/fd3_compiler.c		patch \| blob \| history
src/gallium/drivers/freedreno/a3xx/fd3_draw.c		patch \| blob \| history
src/gallium/drivers/freedreno/a3xx/fd3_gmem.c		patch \| blob \| history
src/gallium/drivers/freedreno/a3xx/fd3_program.c		patch \| blob \| history
src/gallium/drivers/freedreno/a3xx/ir3.h		patch \| blob \| history
src/gallium/drivers/freedreno/a3xx/ir3_ra.c		patch \| blob \| history