freedreno/ir3: lower immeds to const
author Rob Clark <robclark@freedesktop.org>
Mon, 11 Apr 2016 18:47:19 +0000 (14:47 -0400)
committer Rob Clark <robclark@freedesktop.org>
Wed, 4 May 2016 15:25:55 +0000 (11:25 -0400)
Helps reduce register pressure and instruction counts for immediates
that would otherwise require a mov into gpr.

total instructions in shared programs:          4455332 -> 4369297 (-1.93%)
total dwords in shared programs:                8807872 -> 8614432 (-2.20%)
total full registers used in shared programs:   263062 -> 250846 (-4.64%)
total half registers used in shared programs:   9845 -> 9845 (0.00%)
total const registers used in shared programs:  1029735 -> 1466993 (42.46%)

                 half       full      const      instr     dwords
    helped           0       10415           0       17861        5912
      hurt           0        1157       21458         947          33

Signed-off-by: Rob Clark <robclark@freedesktop.org>
src/gallium/drivers/freedreno/a3xx/fd3_emit.c
src/gallium/drivers/freedreno/a4xx/fd4_emit.c
src/gallium/drivers/freedreno/ir3/ir3_cp.c

index 4470c2ac34e229cd515860b803e642ac57fd864e..e1d0a4fee84fe496aa9a2c68b9f664536e667644 100644 (file)
@@ -659,8 +659,11 @@ fd3_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring,
                ir3_emit_consts(vp, ring, ctx, emit->info, dirty);
                if (!emit->key.binning_pass)
                        ir3_emit_consts(fp, ring, ctx, emit->info, dirty);
-               /* mark clean after emitting consts: */
-               ctx->prog.dirty = 0;
+               /* mark clean after emitting consts.. a bit ugly, but since binning
+                * pass is emitted first, we want to do this only for main draw:
+                */
+               if (!emit->key.binning_pass)
+                       ctx->prog.dirty = 0;
        }
 
        if (dirty & (FD_DIRTY_BLEND | FD_DIRTY_FRAMEBUFFER)) {
index 27614f07de5fa8bb7abdd0abcc610e11d1360da8..0144ba492ea20063e9e7d975e38978cd21010c51 100644 (file)
@@ -648,8 +648,11 @@ fd4_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring,
                ir3_emit_consts(vp, ring, ctx, emit->info, dirty);
                if (!emit->key.binning_pass)
                        ir3_emit_consts(fp, ring, ctx, emit->info, dirty);
-               /* mark clean after emitting consts: */
-               ctx->prog.dirty = 0;
+               /* mark clean after emitting consts.. a bit ugly, but since binning
+                * pass is emitted first, we want to do this only for main draw:
+                */
+               if (!emit->key.binning_pass)
+                       ctx->prog.dirty = 0;
        }
 
        if ((dirty & FD_DIRTY_BLEND)) {
index 267664c961a02ad774bed696622af0b121700881..60c2830df93d9ee6633e5da2677e820d13003f47 100644 (file)
 #include "freedreno_util.h"
 
 #include "ir3.h"
+#include "ir3_shader.h"
 
 /*
  * Copy Propagate:
  */
 
 struct ir3_cp_ctx {
+       struct ir3 *shader;             /* handed to ir3_reg_clone() when lower_immed() copies a register */
        struct ir3_shader_variant *so;  /* variant whose immediates[] table and first_immediate lower_immed() uses */
+       unsigned immediate_idx;         /* number of immediate slots lower_immed() has handed out so far */
 };
 
 /* is it a type preserving mov, with ok flags? */
@@ -233,6 +236,62 @@ static void combine_flags(unsigned *dstflags, struct ir3_instruction *src)
                *dstflags &= ~IR3_REG_SABS;
 }
 
+static struct ir3_register *
+lower_immed(struct ir3_cp_ctx *ctx, struct ir3_register *reg, unsigned new_flags)
+{
+       unsigned swiz, idx, i;
+       /* Turn an immediate src into a reference to a const slot: fold any
+        * abs/neg modifiers in new_flags into the value, find or add that
+        * value in ctx->so->immediates[], and return a register flagged
+        * IR3_REG_CONST whose num selects the slot.
+        *
+        * All changes below are made to the register that ir3_reg_clone()
+        * returns, and that is also what this function returns:
+        */
+       reg = ir3_reg_clone(ctx->shader, reg);
+
+       /* in some cases, there are restrictions on (abs)/(neg) plus const..
+        * so just evaluate those and clear the flags:
+        *
+        * abs is applied before neg, so a src carrying both ends up as -|x|.
+        * The S* flags act on iim_val and the F* flags on fim_val; these and
+        * uim_val (compared below) are presumably views of the same 32-bit
+        * immediate -- confirm against the register definition.
+        * NOTE(review): abs() overflows if iim_val is the most negative
+        * integer -- confirm whether that value can reach here.
+        */
+       if (new_flags & IR3_REG_SABS) {
+               reg->iim_val = abs(reg->iim_val);
+               new_flags &= ~IR3_REG_SABS;
+       }
+
+       if (new_flags & IR3_REG_FABS) {
+               reg->fim_val = fabs(reg->fim_val);
+               new_flags &= ~IR3_REG_FABS;
+       }
+
+       if (new_flags & IR3_REG_SNEG) {
+               reg->iim_val = -reg->iim_val;
+               new_flags &= ~IR3_REG_SNEG;
+       }
+
+       if (new_flags & IR3_REG_FNEG) {
+               reg->fim_val = -reg->fim_val;
+               new_flags &= ~IR3_REG_FNEG;
+       }
+       /* Search the slots handed out so far for the same raw value, so equal
+        * immediates share one slot; slot i is immediates[i / 4].val[i % 4]:
+        */
+       for (i = 0; i < ctx->immediate_idx; i++) {
+               swiz = i % 4;
+               idx  = i / 4;
+
+               if (ctx->so->immediates[idx].val[swiz] == reg->uim_val) {
+                       break;
+               }
+       }
+
+       if (i == ctx->immediate_idx) {
+               /* need to generate a new immediate: the loop above ran to
+                * completion without a match, so i is the next unused slot.
+                * NOTE(review): no capacity check against the size of
+                * so->immediates[] is visible here -- confirm it cannot overflow.
+                */
+               swiz = i % 4;
+               idx  = i / 4;
+               ctx->so->immediates[idx].val[swiz] = reg->uim_val;
+               ctx->so->immediates_count = idx + 1;   /* entries (groups of four values) now in use */
+               ctx->immediate_idx++;
+       }
+       /* Swap the IMMED flag for CONST and point the register at slot i.
+        * The slot index is offset by 4 * first_immediate, so first_immediate
+        * appears to be counted in groups of four components -- confirm:
+        */
+       new_flags &= ~IR3_REG_IMMED;
+       new_flags |= IR3_REG_CONST;
+       reg->flags = new_flags;
+       reg->num = i + (4 * ctx->so->first_immediate);
+
+       return reg;
+}
+
 /**
  * Handle cp for a given src register.  This additionally handles
  * the cases of collapsing immediate/const (which replace the src
@@ -281,6 +340,13 @@ reg_cp(struct ir3_cp_ctx *ctx, struct ir3_instruction *instr,
                combine_flags(&new_flags, src);
 
                if (!valid_flags(instr, n, new_flags)) {
+                       /* See if lowering an immediate to const would help. */
+                       if (valid_flags(instr, n, (new_flags & ~IR3_REG_IMMED) | IR3_REG_CONST)) {
+                               debug_assert(new_flags & IR3_REG_IMMED);
+                               instr->regs[n + 1] = lower_immed(ctx, src_reg, new_flags);
+                               return;
+                       }
+
                        /* special case for "normal" mad instructions, we can
                         * try swapping the first two args if that fits better.
                         *
@@ -378,6 +444,9 @@ reg_cp(struct ir3_cp_ctx *ctx, struct ir3_instruction *instr,
                                src_reg->flags = new_flags;
                                src_reg->iim_val = iim_val;
                                instr->regs[n+1] = src_reg;
+                       } else if (valid_flags(instr, n, (new_flags & ~IR3_REG_IMMED) | IR3_REG_CONST)) {
+                               /* See if lowering an immediate to const would help. */
+                               instr->regs[n+1] = lower_immed(ctx, src_reg, new_flags);
                        }
 
                        return;
@@ -484,6 +553,7 @@ void
 ir3_cp(struct ir3 *ir, struct ir3_shader_variant *so)
 {
        struct ir3_cp_ctx ctx = {
+                       .shader = ir,
                        .so = so,
        };