freedreno/ir3: relative dst

author Rob Clark <robclark@freedesktop.org>

Fri, 23 Jan 2015 20:04:46 +0000 (15:04 -0500)

committer Rob Clark <robclark@freedesktop.org>

Sun, 8 Mar 2015 21:42:43 +0000 (17:42 -0400)
author Rob Clark <robclark@freedesktop.org>
Fri, 23 Jan 2015 20:04:46 +0000 (15:04 -0500)
committer Rob Clark <robclark@freedesktop.org>
Sun, 8 Mar 2015 21:42:43 +0000 (17:42 -0400)
diff --git a/src/gallium/drivers/freedreno/ir3/ir3.h b/src/gallium/drivers/freedreno/ir3/ir3.h

index 30932854884427ccf09ec5a514619cdf89342dc9..430bcf22d6fd4d120751c3ef1b489e59d1327142 100644 (file)
--- a/src/gallium/drivers/freedreno/ir3/ir3.h
+++ b/src/gallium/drivers/freedreno/ir3/ir3.h
@@ -204,6 +204,9 @@ struct ir3_instruction {
                 struct {
                         int off;              /* component/offset */
                 } fo;
+               struct {
+                       int aid;
+               } fi;
                 struct {
                         struct ir3_block *if_block, *else_block;
                 } flow;
@@ -264,6 +267,19 @@ struct ir3_instruction {
          */
         struct ir3_instruction *address;
  
+       /* in case of a instruction with relative dst instruction, we need to
+        * capture the dependency on the fanin for the previous values of
+        * the array elements.  Since we don't know at compile time actually
+        * which array elements are written, this serves to preserve the
+        * unconditional write to array elements prior to the conditional
+        * write.
+        *
+        * TODO only cat1 can do indirect write.. we could maybe move this
+        * into instr->cat1.fanin (but would require the frontend to insert
+        * the extra mov)
+        */
+       struct ir3_instruction *fanin;
+
         struct ir3_instruction *next;
  #ifdef DEBUG
         uint32_t serialno;
@@ -373,6 +389,8 @@ static inline int ir3_instr_regno(struct ir3_instruction *instr,
  }
  
  
+#define MAX_ARRAYS 16
+
  /* comp:
   *   0 - x
   *   1 - y
@@ -498,6 +516,8 @@ static inline bool reg_gpr(struct ir3_register *r)
  
  static inline unsigned __ssa_src_cnt(struct ir3_instruction *instr)
  {
+       if (instr->fanin)
+               return instr->regs_count + 2;
         if (instr->address)
                 return instr->regs_count + 1;
         return instr->regs_count;
@@ -505,6 +525,8 @@ static inline unsigned __ssa_src_cnt(struct ir3_instruction *instr)
  
  static inline struct ir3_instruction * __ssa_src_n(struct ir3_instruction *instr, unsigned n)
  {
+       if (n == (instr->regs_count + 1))
+               return instr->fanin;
         if (n == (instr->regs_count + 0))
                 return instr->address;
         return ssa(instr->regs[n]);
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler.c

index d755babf31b1444df9112e23f8a6fd97985ac4ca..df428ebed7efeeac53bf70bcb02adce27ba5ffc7 100644 (file)
--- a/src/gallium/drivers/freedreno/ir3/ir3_compiler.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler.c
@@ -64,7 +64,7 @@ struct ir3_compile_context {
          */
         struct {
                 struct ir3_instruction *instr, **instrp;
-       } output_updates[16];
+       } output_updates[64];
         unsigned num_output_updates;
  
         /* are we in a sequence of "atomic" instructions?
@@ -97,7 +97,7 @@ struct ir3_compile_context {
         struct {
                 unsigned first, last;
                 struct ir3_instruction *fanin;
-       } array[16];
+       } array[MAX_ARRAYS];
         uint32_t array_dirty;
         /* offset into array[], per file, of first array info */
         uint8_t array_offsets[TGSI_FILE_COUNT];
@@ -247,10 +247,6 @@ compile_init(struct ir3_compile_context *ctx, struct ir3_shader_variant *so,
         memset(ctx->array_offsets, 0, sizeof(ctx->array_offsets));
  
  #define FM(x) (1 << TGSI_FILE_##x)
-       /* optimize can't deal with relative addressing: */
-       if (info->indirect_files_written & (FM(TEMPORARY) | FM(INPUT) | FM(OUTPUT)))
-               return TGSI_PARSE_ERROR;
-
         /* NOTE: if relative addressing is used, we set constlen in
          * the compiler (to worst-case value) since we don't know in
          * the assembler what the max addr reg value can be:
@@ -595,6 +591,16 @@ ssa_instr_get(struct ir3_compile_context *ctx, unsigned file, unsigned n)
         return instr;
  }
  
+static int dst_array_id(struct ir3_compile_context *ctx,
+               const struct tgsi_dst_register *dst)
+{
+       // XXX complete hack to recover tgsi_full_dst_register...
+       // nothing that isn't wrapped in a tgsi_full_dst_register
+       // should be indirect
+       const struct tgsi_full_dst_register *fdst = (const void *)dst;
+       return fdst->Indirect.ArrayID + ctx->array_offsets[dst->File];
+}
+
  static int src_array_id(struct ir3_compile_context *ctx,
                 const struct tgsi_src_register *src)
  {
@@ -639,7 +645,56 @@ static void
  ssa_dst(struct ir3_compile_context *ctx, struct ir3_instruction *instr,
                 const struct tgsi_dst_register *dst, unsigned chan)
  {
-       ssa_instr_set(ctx, dst->File, regid(dst->Index, chan), instr);
+       if (dst->Indirect) {
+               struct ir3_register *reg = instr->regs[0];
+               unsigned i, aid = dst_array_id(ctx, dst);
+               unsigned first = ctx->array[aid].first;
+               unsigned last  = ctx->array[aid].last;
+               unsigned off   = dst->Index - first; /* vec4 offset */
+
+               reg->size = 4 * (1 + last - first);
+               reg->offset = regid(off, chan);
+
+               instr->fanin = array_fanin(ctx, aid, dst->File);
+
+               /* annotate with the array-id, to help out the register-
+                * assignment stage.  At least for the case of indirect
+                * writes, we should capture enough dependencies to
+                * preserve the order of reads/writes of the array, so
+                * the multiple "names" for the array should end up all
+                * assigned to the same registers.
+                */
+               instr->fanin->fi.aid = aid;
+
+               /* Since we are scalarizing vec4 tgsi instructions/regs, we
+                * run into a slight complication here.  To do the naive thing
+                * and setup a fanout for each scalar array element would end
+                * up with the result that the instructions generated for each
+                * component of the vec4 would end up clobbering each other.
+                * So we take advantage here of knowing that the array index
+                * (after the shl.b) will be a multiple of four, and only set
+                * every fourth scalar component in the array.  See also
+                * fixup_ssa_dst_array()
+                */
+               for (i = first; i <= last; i++) {
+                       struct ir3_instruction *split;
+                       unsigned n = regid(i, chan);
+                       int off = (4 * (i - first)) + chan;
+
+                       if (is_meta(instr) && (instr->opc == OPC_META_FO))
+                               off -= instr->fo.off;
+
+                       split = ir3_instr_create(ctx->block, -1, OPC_META_FO);
+                       split->fo.off = off;
+                       ir3_reg_create(split, 0, 0);
+                       ir3_reg_create(split, 0, IR3_REG_SSA)->instr = instr;
+
+                       ssa_instr_set(ctx, dst->File, n, split);
+               }
+       } else {
+               /* normal case (not relative addressed GPR) */
+               ssa_instr_set(ctx, dst->File, regid(dst->Index, chan), instr);
+       }
  }
  
  static void
@@ -705,12 +760,22 @@ add_dst_reg_wrmask(struct ir3_compile_context *ctx,
                 break;
         }
  
-       if (dst->Indirect)
+       if (dst->Indirect) {
                 flags |= IR3_REG_RELATIV;
  
-       reg = ir3_reg_create(instr, regid(num, chan), flags);
+               /* shouldn't happen, and we can't cope with it below: */
+               compile_assert(ctx, wrmask == 0x1);
+
+               compile_assert(ctx, ctx->block->address);
+               if (instr->address)
+                       compile_assert(ctx, ctx->block->address == instr->address);
+
+               instr->address = ctx->block->address;
+       }
  
+       reg = ir3_reg_create(instr, regid(num, chan), flags);
         reg->wrmask = wrmask;
+
         if (wrmask == 0x1) {
                 /* normal case */
                 ssa_dst(ctx, instr, dst, chan);
@@ -720,6 +785,8 @@ add_dst_reg_wrmask(struct ir3_compile_context *ctx,
                 struct ir3_instruction *prev = NULL;
                 unsigned i;
  
+               compile_assert(ctx, !dst->Indirect);
+
                 /* if instruction writes multiple, we need to create
                  * some place-holder collect the registers:
                  */
@@ -2539,10 +2606,16 @@ instr_cat1(const struct instr_translater *t,
                 struct ir3_compile_context *ctx,
                 struct tgsi_full_instruction *inst)
  {
-       struct tgsi_dst_register *dst = get_dst(ctx, inst);
+       struct tgsi_dst_register *dst = &inst->Dst[0].Register;
         struct tgsi_src_register *src = &inst->Src[0].Register;
+
+       /* NOTE: atomic start/end, rather than in create_mov() since
+        * create_mov() is used already w/in atomic sequences (and
+        * we aren't clever enough to deal with the nesting)
+        */
+       instr_atomic_start(ctx);
         create_mov(ctx, dst, src);
-       put_dst(ctx, inst, dst);
+       instr_atomic_end(ctx);
  }
  
  static void
@@ -3322,6 +3395,10 @@ ir3_compile_shader(struct ir3_shader_variant *so,
                 goto out;
         }
  
+       /* for now, until the edge cases are worked out: */
+       if (ctx.info.indirect_files_written & (FM(TEMPORARY) | FM(INPUT) | FM(OUTPUT)))
+               cp = false;
+
         compile_instructions(&ctx);
  
         block = ctx.block;
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_cp.c b/src/gallium/drivers/freedreno/ir3/ir3_cp.c

index 898ed70abeb0b3fe7449a76db6b2160389ccd762..8ad1894a324d95deb1a920afd08df40c111cbb6a 100644 (file)
--- a/src/gallium/drivers/freedreno/ir3/ir3_cp.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_cp.c
@@ -49,7 +49,7 @@ static bool is_eligible_mov(struct ir3_instruction *instr)
                 struct ir3_register *dst = instr->regs[0];
                 struct ir3_register *src = instr->regs[1];
                 struct ir3_instruction *src_instr = ssa(src);
-               if (dst->flags & IR3_REG_ADDR)
+               if (dst->flags & (IR3_REG_ADDR | IR3_REG_RELATIV))
                         return false;
                 /* TODO: propagate abs/neg modifiers if possible */
                 if (src->flags & (IR3_REG_ABS | IR3_REG_NEGATE | IR3_REG_RELATIV))
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_dump.c b/src/gallium/drivers/freedreno/ir3/ir3_dump.c

index a846777b879044a76100d2928ac02ebb7df3d4d3..dc251653fa5251cabb39e4f6b608e0582a8ef14c 100644 (file)
--- a/src/gallium/drivers/freedreno/ir3/ir3_dump.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_dump.c
@@ -414,8 +414,20 @@ ir3_dump_instr_single(struct ir3_instruction *instr)
                 fprintf(ctx.f, "]");
         }
  
-       if (is_meta(instr) && (instr->opc == OPC_META_FO))
-               printf(", off=%d", instr->fo.off);
+       if (instr->fanin) {
+               fprintf(ctx.f, ", fanin=_");
+               fprintf(ctx.f, "[");
+               dump_instr_name(&ctx, instr->fanin);
+               fprintf(ctx.f, "]");
+       }
+
+       if (is_meta(instr)) {
+               if (instr->opc == OPC_META_FO) {
+                       printf(", off=%d", instr->fo.off);
+               } else if ((instr->opc == OPC_META_FI) && instr->fi.aid) {
+                       printf(", aid=%d", instr->fi.aid);
+               }
+       }
  
         printf("\n");
  }
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_ra.c b/src/gallium/drivers/freedreno/ir3/ir3_ra.c

index 0f6d40f5a7cf9e5b6f12a14d8dd5ce78ebcacb8f..a4235a77a15227a86f5b5c35cfc4b785c2ad2317 100644 (file)
--- a/src/gallium/drivers/freedreno/ir3/ir3_ra.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_ra.c
@@ -47,6 +47,12 @@
   * I'm not really sure a sane way for the CP stage to realize when it
   * cannot remove a mov due to multi-register constraints..
   *
+ * NOTE: http://scopesconf.org/scopes-01/paper/session1_2.ps.gz has
+ * some ideas to handle array allocation with a more conventional
+ * graph coloring algorithm for register assignment, which might be
+ * a good alternative to the current algo.  However afaict it cannot
+ * handle overlapping arrays, which is a scenario that we have to
+ * deal with
   */
  
  struct ir3_ra_ctx {
@@ -56,6 +62,10 @@ struct ir3_ra_ctx {
         bool frag_face;
         int cnt;
         bool error;
+       struct {
+               unsigned base;
+               unsigned size;
+       } arrays[MAX_ARRAYS];
  };
  
  #ifdef DEBUG
@@ -141,6 +151,26 @@ static bool instr_is_output(struct ir3_instruction *instr)
         return false;
  }
  
+static void mark_sources(struct ir3_instruction *instr,
+               struct ir3_instruction *n, regmask_t *liveregs, regmask_t *written)
+{
+       unsigned i;
+
+       for (i = 1; i < n->regs_count; i++) {
+               struct ir3_register *r = reg_check(n, i);
+               if (r)
+                       regmask_set_if_not(liveregs, r, written);
+
+               /* if any src points back to the instruction(s) in
+                * the block of neighbors that we are assigning then
+                * mark any written (clobbered) registers as live:
+                */
+               if (instr_used_by(instr, n->regs[i]))
+                       regmask_or(liveregs, liveregs, written);
+       }
+
+}
+
  /* live means read before written */
  static void compute_liveregs(struct ir3_ra_ctx *ctx,
                 struct ir3_instruction *instr, regmask_t *liveregs)
@@ -159,18 +189,13 @@ static void compute_liveregs(struct ir3_ra_ctx *ctx,
                         continue;
  
                 /* check first src's read: */
-               for (i = 1; i < n->regs_count; i++) {
-                       r = reg_check(n, i);
-                       if (r)
-                               regmask_set_if_not(liveregs, r, &written);
-
-                       /* if any src points back to the instruction(s) in
-                        * the block of neighbors that we are assigning then
-                        * mark any written (clobbered) registers as live:
-                        */
-                       if (instr_used_by(instr, n->regs[i]))
-                               regmask_or(liveregs, liveregs, &written);
-               }
+               mark_sources(instr, n, liveregs, &written);
+
+               /* for instructions that write to an array, we need to
+                * capture the dependency on the array elements:
+                */
+               if (n->fanin)
+                       mark_sources(instr, n->fanin, liveregs, &written);
  
                 /* meta-instructions don't actually get scheduled,
                  * so don't let it's write confuse us.. what we
@@ -383,14 +408,32 @@ static void instr_assign_src(struct ir3_ra_ctx *ctx,
         }
  }
  
-static void instr_assign(struct ir3_ra_ctx *ctx,
+static void instr_assign_srcs(struct ir3_ra_ctx *ctx,
                 struct ir3_instruction *instr, unsigned name)
  {
         struct ir3_instruction *n, *src;
+
+       for (n = instr->next; n && !ctx->error; n = n->next) {
+               foreach_ssa_src_n(src, i, n) {
+                       unsigned r = i + 1;
+
+                       /* skip address / etc (non real sources): */
+                       if (r >= n->regs_count)
+                               continue;
+
+                       if (src == instr)
+                               instr_assign_src(ctx, n, r, name);
+               }
+       }
+}
+
+static void instr_assign(struct ir3_ra_ctx *ctx,
+               struct ir3_instruction *instr, unsigned name)
+{
         struct ir3_register *reg = instr->regs[0];
  
-       if ((reg->flags & IR3_REG_RELATIV))
-               name += reg->offset;
+       if (reg->flags & IR3_REG_RELATIV)
+               return;
  
         /* check if already assigned: */
         if (!(reg->flags & IR3_REG_SSA)) {
@@ -403,18 +446,7 @@ static void instr_assign(struct ir3_ra_ctx *ctx,
         reg_assign(instr, 0, name);
  
         /* and rename any subsequent use of result of this instr: */
-       for (n = instr->next; n && !ctx->error; n = n->next) {
-               foreach_ssa_src_n(src, i, n) {
-                       unsigned r = i + 1;
-
-                       /* skip address / etc (non real sources): */
-                       if (r >= n->regs_count)
-                               continue;
-
-                       if (src == instr)
-                               instr_assign_src(ctx, n, r, name);
-               }
-       }
+       instr_assign_srcs(ctx, instr, name);
  
         /* To simplify the neighbor logic, and to "avoid" dealing with
          * instructions which write more than one output, we actually
@@ -423,6 +455,8 @@ static void instr_assign(struct ir3_ra_ctx *ctx,
          * to the actual instruction:
          */
         if (is_meta(instr) && (instr->opc == OPC_META_FO)) {
+               struct ir3_instruction *src;
+
                 debug_assert(name >= instr->fo.off);
  
                 foreach_ssa_src(src, instr)
@@ -444,7 +478,8 @@ static int check_partial_assignment(struct ir3_ra_ctx *ctx,
  
         for (n = instr; n; n = n->cp.right) {
                 struct ir3_register *dst = n->regs[0];
-               if (!(dst->flags & IR3_REG_SSA)) {
+               if ((n->depth != DEPTH_UNUSED) &&
+                               !(dst->flags & IR3_REG_SSA)) {
                         int name = dst->num - off;
                         debug_assert(name >= 0);
                         return name;
@@ -472,6 +507,23 @@ static void instr_alloc_and_assign(struct ir3_ra_ctx *ctx,
  
         dst = instr->regs[0];
  
+       /* For indirect dst, take the register assignment from the
+        * fanin and propagate it forward.
+        */
+       if (dst->flags & IR3_REG_RELATIV) {
+               /* NOTE can be grouped, if for example outputs:
+                * for now disable cp if indirect writes
+                */
+               instr_alloc_and_assign(ctx, instr->fanin);
+
+               dst->num += instr->fanin->regs[0]->num;
+               dst->flags &= ~IR3_REG_SSA;
+
+               instr_assign_srcs(ctx, instr, instr->fanin->regs[0]->num);
+
+               return;
+       }
+
         /* for instructions w/ fanouts, do the actual register assignment
          * on the group of fanout neighbor nodes and propagate the reg
          * name back up to the texture instruction.
@@ -510,6 +562,33 @@ static void instr_alloc_and_assign(struct ir3_ra_ctx *ctx,
         }
  }
  
+static void instr_assign_array(struct ir3_ra_ctx *ctx,
+               struct ir3_instruction *instr)
+{
+       struct ir3_instruction *src;
+       int name, aid = instr->fi.aid;
+
+       if (ctx->arrays[aid].base == ~0) {
+               int size = instr->regs_count - 1;
+               ctx->arrays[aid].base = alloc_block(ctx, instr, size);
+               ctx->arrays[aid].size = size;
+       }
+
+       name = ctx->arrays[aid].base;
+
+       foreach_ssa_src_n(src, i, instr) {
+               unsigned r = i + 1;
+
+               /* skip address / etc (non real sources): */
+               if (r >= instr->regs_count)
+                       break;
+
+               instr_assign(ctx, src, name);
+               name++;
+       }
+
+}
+
  static int block_ra(struct ir3_ra_ctx *ctx, struct ir3_block *block)
  {
         struct ir3_instruction *n;
@@ -531,6 +610,16 @@ static int block_ra(struct ir3_ra_ctx *ctx, struct ir3_block *block)
  
         ra_dump_list("-------\n", block->head);
  
+       /* first pass, assign arrays: */
+       for (n = block->head; n && !ctx->error; n = n->next) {
+               if (is_meta(n) && (n->opc == OPC_META_FI) && n->fi.aid) {
+                       debug_assert(!n->cp.left);  /* don't think this should happen */
+                       ra_dump_instr("ASSIGN ARRAY: ", n);
+                       instr_assign_array(ctx, n);
+                       ra_dump_list("-------\n", block->head);
+               }
+       }
+
         for (n = block->head; n && !ctx->error; n = n->next) {
                 ra_dump_instr("ASSIGN: ", n);
                 instr_alloc_and_assign(ctx, ir3_neighbor_first(n));
@@ -552,6 +641,8 @@ int ir3_block_ra(struct ir3_block *block, enum shader_t type,
         };
         int ret;
  
+       memset(&ctx.arrays, ~0, sizeof(ctx.arrays));
+
         /* mark dst registers w/ SSA flag so we can see which
          * have been assigned so far:
          * NOTE: we really should set SSA flag consistently on
author	Rob Clark <robclark@freedesktop.org>
	Fri, 23 Jan 2015 20:04:46 +0000 (15:04 -0500)
committer	Rob Clark <robclark@freedesktop.org>
	Sun, 8 Mar 2015 21:42:43 +0000 (17:42 -0400)
src/gallium/drivers/freedreno/ir3/ir3.h		patch \| blob \| history
src/gallium/drivers/freedreno/ir3/ir3_compiler.c		patch \| blob \| history
src/gallium/drivers/freedreno/ir3/ir3_cp.c		patch \| blob \| history
src/gallium/drivers/freedreno/ir3/ir3_dump.c		patch \| blob \| history
src/gallium/drivers/freedreno/ir3/ir3_ra.c		patch \| blob \| history