shader->noutputs = nout;
shader->outputs = ir3_alloc(shader, sizeof(shader->outputs[0]) * nout);
+ list_inithead(&shader->block_list);
+
return shader;
}
void * ir3_assemble(struct ir3 *shader, struct ir3_info *info,
uint32_t gpu_id)
{
- struct ir3_block *block = shader->block;
uint32_t *ptr, *dwords;
info->gpu_id = gpu_id;
info->instrs_count = 0;
info->sizedwords = 0;
- list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
- info->sizedwords += 2;
+ list_for_each_entry (struct ir3_block, block, &shader->block_list, node) {
+ list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
+ info->sizedwords += 2;
+ }
}
/* need an integer number of instruction "groups" (sets of 16
ptr = dwords = calloc(4, info->sizedwords);
- list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
- int ret = emit[instr->category](instr, dwords, info);
- if (ret)
- goto fail;
- info->instrs_count += 1 + instr->repeat;
- dwords += 2;
+ list_for_each_entry (struct ir3_block, block, &shader->block_list, node) {
+ list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
+ int ret = emit[instr->category](instr, dwords, info);
+ if (ret)
+ goto fail;
+ info->instrs_count += 1 + instr->repeat;
+ dwords += 2;
+ }
}
return ptr;
struct ir3_block * ir3_block_create(struct ir3 *shader)
{
struct ir3_block *block = ir3_alloc(shader, sizeof(*block));
+#ifdef DEBUG
+ static uint32_t serialno = 0;
+ block->serialno = ++serialno;
+#endif
block->shader = shader;
+ list_inithead(&block->node);
list_inithead(&block->instr_list);
return block;
}
struct ir3_register * ir3_reg_create(struct ir3_instruction *instr,
int num, int flags)
{
- struct ir3_register *reg = reg_create(instr->block->shader, num, flags);
+ struct ir3 *shader = instr->block->shader;
+ struct ir3_register *reg = reg_create(shader, num, flags);
#ifdef DEBUG
debug_assert(instr->regs_count < instr->regs_max);
#endif
instr->regs[instr->regs_count++] = reg;
return reg;
}
+
+void
+ir3_block_clear_mark(struct ir3_block *block)
+{
+ list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node)
+ instr->flags &= ~IR3_INSTR_MARK;
+}
+
+void
+ir3_clear_mark(struct ir3 *ir)
+{
+ list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+ ir3_block_clear_mark(block);
+ }
+}
+
+/* note: this will destroy instr->depth, don't do it until after sched! */
+void
+ir3_count_instructions(struct ir3 *ir)
+{
+ unsigned ip = 0;
+ list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+ list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
+ instr->ip = ip++;
+ }
+ block->start_ip = list_first_entry(&block->instr_list, struct ir3_instruction, node)->ip;
+ block->end_ip = list_last_entry(&block->instr_list, struct ir3_instruction, node)->ip;
+ }
+}
* before register assignment is done:
*/
IR3_REG_SSA = 0x2000, /* 'instr' is ptr to assigning instr */
- IR3_REG_IA = 0x4000, /* meta-input dst is "assigned" */
+ IR3_REG_PHI_SRC= 0x4000, /* phi src, regs[0]->instr points to phi */
+
} flags;
union {
/* normal registers:
char inv;
char comp;
int immed;
+ struct ir3_block *target;
} cat0;
struct {
type_t src_type, dst_type;
int aid;
} fi;
struct {
- struct ir3_block *if_block, *else_block;
- } flow;
+ /* used to temporarily hold reference to nir_phi_instr
+ * until we resolve the phi srcs
+ */
+ void *nphi;
+ } phi;
struct {
struct ir3_block *block;
} inout;
-
- /* XXX keep this as big as all other union members! */
- uint32_t info[3];
};
/* transient values used during various algorithms: */
unsigned predicates_count, predicates_sz;
struct ir3_instruction **predicates;
- struct ir3_block *block;
+ /* List of blocks: */
+ struct list_head block_list;
+
unsigned heap_idx;
struct ir3_heap_chunk *chunk;
};
+typedef struct nir_block nir_block;
+
struct ir3_block {
+ struct list_head node;
struct ir3 *shader;
- /* only a single address register: */
- struct ir3_instruction *address;
- struct list_head instr_list;
+
+ nir_block *nblock;
+
+ struct list_head instr_list; /* list of ir3_instruction */
+
+ /* each block has either one or two successors.. in case of
+ * two successors, 'condition' decides which one to follow.
+ * A block preceding an if/else has two successors.
+ */
+ struct ir3_instruction *condition;
+ struct ir3_block *successors[2];
+
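+	/* first/last instruction "ip" covered by this block, filled
+	 * in by ir3_count_instructions():
+	 */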
+ uint16_t start_ip, end_ip;
+
+ /* used for per-pass extra block data. Mainly used right
+ * now in RA step to track livein/liveout.
+ */
+ void *bd;
+
+#ifdef DEBUG
+ uint32_t serialno;
+#endif
};
struct ir3 * ir3_create(struct ir3_compiler *compiler,
struct ir3_register * ir3_reg_create(struct ir3_instruction *instr,
int num, int flags);
-
static inline bool ir3_instr_check_mark(struct ir3_instruction *instr)
{
if (instr->flags & IR3_INSTR_MARK)
return false;
}
-static inline void ir3_clear_mark(struct ir3 *shader)
-{
- /* TODO would be nice to drop the instruction array.. for
- * new compiler, _clear_mark() is all we use it for, and
- * we could probably manage a linked list instead..
- *
- * Also, we'll probably want to mark instructions within
- * a block, so tracking the list of instrs globally is
- * unlikely to be what we want.
- */
- list_for_each_entry (struct ir3_instruction, instr, &shader->block->instr_list, node)
- instr->flags &= ~IR3_INSTR_MARK;
-}
+void ir3_block_clear_mark(struct ir3_block *block);
+void ir3_clear_mark(struct ir3 *shader);
+
+void ir3_count_instructions(struct ir3 *ir);
static inline int ir3_instr_regno(struct ir3_instruction *instr,
struct ir3_register *reg)
return true;
}
+static inline type_t half_type(type_t type)
+{
+ switch (type) {
+ case TYPE_F32: return TYPE_F16;
+ case TYPE_U32: return TYPE_U16;
+ case TYPE_S32: return TYPE_S16;
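+	/* instructions may already be fixed up: */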
+ case TYPE_F16:
+ case TYPE_U16:
+ case TYPE_S16:
+ return type;
+ default:
+ assert(0);
+ return ~0;
+ }
+}
+
/* some cat2 instructions (ie. those which are not float) can embed an
* immediate:
*/
return ir3_instr_create(block, 0, OPC_NOP);
}
+#define INSTR0(CAT, name) \
+static inline struct ir3_instruction * \
+ir3_##name(struct ir3_block *block) \
+{ \
+ struct ir3_instruction *instr = \
+ ir3_instr_create(block, CAT, OPC_##name); \
+ return instr; \
+}
+
#define INSTR1(CAT, name) \
static inline struct ir3_instruction * \
ir3_##name(struct ir3_block *block, \
}
/* cat0 instructions: */
+INSTR0(0, BR);
+INSTR0(0, JUMP);
INSTR1(0, KILL);
+INSTR0(0, END);
/* cat2 instructions, most 2 src but some 1 src: */
INSTR2(2, ADD_F)
#include "ir3.h"
-static struct ir3_instruction * create_immed(struct ir3_block *block, uint32_t val);
-
struct ir3_compile {
struct ir3_compiler *compiler;
/* bitmask of which samplers are integer: */
uint16_t integer_s;
- struct ir3_block *block;
+ struct ir3_block *block; /* the current block */
+ struct ir3_block *in_block; /* block created for shader inputs */
+
+ nir_function_impl *impl;
/* For fragment shaders, from the hw perspective the only
* actual input is r0.xy position register passed to bary.f.
*/
struct hash_table *addr_ht;
+ /* maps nir_block to ir3_block, mostly for the purposes of
+ * figuring out the blocks successors
+ */
+ struct hash_table *block_ht;
+
/* for calculating input/output positions/linkages: */
unsigned next_inloc;
};
+static struct ir3_instruction * create_immed(struct ir3_block *block, uint32_t val);
+static struct ir3_block * get_block(struct ir3_compile *ctx, nir_block *nblock);
+
static struct nir_shader *to_nir(const struct tgsi_token *tokens)
{
struct nir_shader_compiler_options options = {
nir_lower_vars_to_ssa(s);
nir_lower_alu_to_scalar(s);
+ nir_lower_phis_to_scalar(s);
progress |= nir_copy_prop(s);
progress |= nir_opt_dce(s);
_mesa_hash_pointer, _mesa_key_pointer_equal);
ctx->addr_ht = _mesa_hash_table_create(ctx,
_mesa_hash_pointer, _mesa_key_pointer_equal);
+ ctx->block_ht = _mesa_hash_table_create(ctx,
+ _mesa_hash_pointer, _mesa_key_pointer_equal);
lowered_tokens = lower_tgsi(ctx, tokens, so);
if (!lowered_tokens)
ralloc_free(ctx);
}
-
+/* global per-array information: */
struct ir3_array {
unsigned length, aid;
+};
+
+/* per-block array state: */
+struct ir3_array_value {
+ /* TODO drop length/aid, and just have ptr back to ir3_array */
+ unsigned length, aid;
+ /* initial array element values are phi's, other than for the
+ * entry block. The phi src's get added later in a resolve step
+ * after we have visited all the blocks, to account for back
+ * edges in the cfg.
+ */
+ struct ir3_instruction **phis;
+ /* current array element values (as block is processed). When
+ * the array phi's are resolved, it will contain the array state
+ * at exit of block, so successor blocks can use it to add their
+ * phi srcs.
+ */
struct ir3_instruction *arr[];
};
+/* track array assignments per basic block.  When an array is read
+ * in a different basic block than the one where it was last written,
+ * we need phi nodes to merge the possible values; NIR's dominance-
+ * frontier information could be used to figure out where those are
+ * needed.
+ */
+struct ir3_nir_block_data {
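+	/* (a struct cannot consist solely of a flexible array member,
+	 * which is presumably why this dummy field exists)
+	 */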
+ unsigned foo;
+ /* indexed by array-id (aid): */
+ struct ir3_array_value *arrs[];
+};
+
+static struct ir3_nir_block_data *
+get_block_data(struct ir3_compile *ctx, struct ir3_block *block)
+{
+ if (!block->bd) {
+ struct ir3_nir_block_data *bd = ralloc_size(ctx, sizeof(*bd) +
+ ((ctx->num_arrays + 1) * sizeof(bd->arrs[0])));
+ block->bd = bd;
+ }
+ return block->bd;
+}
+
static void
declare_var(struct ir3_compile *ctx, nir_variable *var)
{
unsigned length = glsl_get_length(var->type) * 4; /* always vec4, at least with ttn */
- struct ir3_array *arr = ralloc_size(ctx, sizeof(*arr) +
- (length * sizeof(arr->arr[0])));
+ struct ir3_array *arr = ralloc(ctx, struct ir3_array);
arr->length = length;
arr->aid = ++ctx->num_arrays;
- /* Some shaders end up reading array elements without first writing..
- * so initialize things to prevent null instr ptrs later:
- */
- for (unsigned i = 0; i < length; i++)
- arr->arr[i] = create_immed(ctx->block, 0);
_mesa_hash_table_insert(ctx->var_ht, var, arr);
}
-static struct ir3_array *
+static nir_block *
+nir_block_pred(nir_block *block)
+{
+ assert(block->predecessors->entries < 2);
+ if (block->predecessors->entries == 0)
+ return NULL;
+ return (nir_block *)_mesa_set_next_entry(block->predecessors, NULL)->key;
+}
+
+static struct ir3_array_value *
get_var(struct ir3_compile *ctx, nir_variable *var)
{
struct hash_entry *entry = _mesa_hash_table_search(ctx->var_ht, var);
- return entry->data;
+ struct ir3_block *block = ctx->block;
+ struct ir3_nir_block_data *bd = get_block_data(ctx, block);
+ struct ir3_array *arr = entry->data;
+
+ if (!bd->arrs[arr->aid]) {
+ struct ir3_array_value *av = ralloc_size(bd, sizeof(*av) +
+ (arr->length * sizeof(av->arr[0])));
+ struct ir3_array_value *defn = NULL;
+ nir_block *pred_block;
+
+ av->length = arr->length;
+ av->aid = arr->aid;
+
+ /* For loops, we have to consider that we have not visited some
+	 * of the blocks that should feed into the phi (ie. back-edges in
+ * the cfg).. for example:
+ *
+ * loop {
+ * block { load_var; ... }
+ * if then block {} else block {}
+ * block { store_var; ... }
+ * if then block {} else block {}
+ * block {...}
+ * }
+ *
+ * We can skip the phi if we can chase the block predecessors
+ * until finding the block previously defining the array without
+ * crossing a block that has more than one predecessor.
+ *
+ * Otherwise create phi's and resolve them as a post-pass after
+ * all the blocks have been visited (to handle back-edges).
+ */
+
+ for (pred_block = block->nblock;
+ pred_block && (pred_block->predecessors->entries < 2) && !defn;
+ pred_block = nir_block_pred(pred_block)) {
+ struct ir3_block *pblock = get_block(ctx, pred_block);
+ struct ir3_nir_block_data *pbd = pblock->bd;
+ if (!pbd)
+ continue;
+ defn = pbd->arrs[arr->aid];
+ }
+
+ if (defn) {
+ /* only one possible definer: */
+ for (unsigned i = 0; i < arr->length; i++)
+ av->arr[i] = defn->arr[i];
+ } else if (pred_block) {
+ /* not the first block, and multiple potential definers: */
+ av->phis = ralloc_size(av, arr->length * sizeof(av->phis[0]));
+
+ for (unsigned i = 0; i < arr->length; i++) {
+ struct ir3_instruction *phi;
+
+ phi = ir3_instr_create2(block, -1, OPC_META_PHI,
+ 1 + ctx->impl->num_blocks);
+ ir3_reg_create(phi, 0, 0); /* dst */
+
+ /* phi's should go at head of block: */
+ list_delinit(&phi->node);
+ list_add(&phi->node, &block->instr_list);
+
+ av->phis[i] = av->arr[i] = phi;
+ }
+ } else {
+ /* Some shaders end up reading array elements without
+ * first writing.. so initialize things to prevent null
+ * instr ptrs later:
+ */
+ for (unsigned i = 0; i < arr->length; i++)
+ av->arr[i] = create_immed(block, 0);
+ }
+
+ bd->arrs[arr->aid] = av;
+ }
+
+ return bd->arrs[arr->aid];
+}
+
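+/* Add phi srcs for an array's phis: walk backwards through the CFG
+ * from a predecessor of the phi's block, and at the first block found
+ * that has per-block state for this array, add that block's array
+ * values as phi srcs; otherwise keep recursing into predecessors.
+ */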
+static void
+add_array_phi_srcs(struct ir3_compile *ctx, nir_block *nblock,
+ struct ir3_array_value *av, BITSET_WORD *visited)
+{
+ struct ir3_block *block;
+ struct ir3_nir_block_data *bd;
+
+ if (BITSET_TEST(visited, nblock->index))
+ return;
+
+ BITSET_SET(visited, nblock->index);
+
+ block = get_block(ctx, nblock);
+ bd = block->bd;
+
+ if (bd && bd->arrs[av->aid]) {
+ struct ir3_array_value *dav = bd->arrs[av->aid];
+ for (unsigned i = 0; i < av->length; i++) {
+ ir3_reg_create(av->phis[i], 0, IR3_REG_SSA)->instr =
+ dav->arr[i];
+ }
+ } else {
+ /* didn't find defn, recurse predecessors: */
+ struct set_entry *entry;
+ set_foreach(nblock->predecessors, entry) {
+ add_array_phi_srcs(ctx, (nir_block *)entry->key, av, visited);
+ }
+ }
+}
+
+static void
+resolve_array_phis(struct ir3_compile *ctx, struct ir3_block *block)
+{
+ struct ir3_nir_block_data *bd = block->bd;
+ unsigned bitset_words = BITSET_WORDS(ctx->impl->num_blocks);
+
+ if (!bd)
+ return;
+
+ /* TODO use nir dom_frontier to help us with this? */
+
+ for (unsigned i = 1; i <= ctx->num_arrays; i++) {
+ struct ir3_array_value *av = bd->arrs[i];
+ BITSET_WORD visited[bitset_words];
+ struct set_entry *entry;
+
+ if (!(av && av->phis))
+ continue;
+
+ memset(visited, 0, sizeof(visited));
+ set_foreach(block->nblock->predecessors, entry) {
+ add_array_phi_srcs(ctx, (nir_block *)entry->key, av, visited);
+ }
+ }
}
/* allocate a n element value array (to be populated by caller) and
return addr;
}
+static struct ir3_instruction *
+get_predicate(struct ir3_compile *ctx, struct ir3_instruction *src)
+{
+ struct ir3_block *b = ctx->block;
+ struct ir3_instruction *cond;
+
+ /* NOTE: only cmps.*.* can write p0.x: */
+ cond = ir3_CMPS_S(b, src, 0, create_immed(b, 0), 0);
+ cond->cat2.condition = IR3_COND_NE;
+
+ /* condition always goes in predicate register: */
+ cond->regs[0]->num = regid(REG_P0, 0);
+
+ return cond;
+}
+
static struct ir3_instruction *
create_uniform(struct ir3_compile *ctx, unsigned n)
{
{
nir_deref_var *dvar = intr->variables[0];
nir_deref_array *darr = nir_deref_as_array(dvar->deref.child);
- struct ir3_array *arr = get_var(ctx, dvar->var);
+ struct ir3_array_value *arr = get_var(ctx, dvar->var);
compile_assert(ctx, dvar->deref.child &&
(dvar->deref.child->deref_type == nir_deref_type_array));
{
nir_deref_var *dvar = intr->variables[0];
nir_deref_array *darr = nir_deref_as_array(dvar->deref.child);
- struct ir3_array *arr = get_var(ctx, dvar->var);
+ struct ir3_array_value *arr = get_var(ctx, dvar->var);
struct ir3_instruction **src;
compile_assert(ctx, dvar->deref.child &&
cond = create_immed(b, 1);
}
+ /* NOTE: only cmps.*.* can write p0.x: */
cond = ir3_CMPS_S(b, cond, 0, create_immed(b, 0), 0);
cond->cat2.condition = IR3_COND_NE;
}
}
+static void
+emit_phi(struct ir3_compile *ctx, nir_phi_instr *nphi)
+{
+ struct ir3_instruction *phi, **dst;
+
+ /* NOTE: phi's should be lowered to scalar at this point */
+ compile_assert(ctx, nphi->dest.ssa.num_components == 1);
+
+ dst = get_dst(ctx, &nphi->dest, 1);
+
+ phi = ir3_instr_create2(ctx->block, -1, OPC_META_PHI,
+ 1 + exec_list_length(&nphi->srcs));
+ ir3_reg_create(phi, 0, 0); /* dst */
+ phi->phi.nphi = nphi;
+
+ dst[0] = phi;
+}
+
+/* phi instructions are left partially constructed. We don't resolve
+ * their srcs until the end of the shader, since (eg. in loops) one of
+ * the phi's srcs might be defined after the phi due to back edges in
+ * the CFG.
+ */
+static void
+resolve_phis(struct ir3_compile *ctx, struct ir3_block *block)
+{
+ list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
+ nir_phi_instr *nphi;
+
+ /* phi's only come at start of block: */
+ if (!(is_meta(instr) && (instr->opc == OPC_META_PHI)))
+ break;
+
+ if (!instr->phi.nphi)
+ break;
+
+ nphi = instr->phi.nphi;
+ instr->phi.nphi = NULL;
+
+ foreach_list_typed(nir_phi_src, nsrc, node, &nphi->srcs) {
+ struct ir3_instruction *src = get_src(ctx, &nsrc->src)[0];
+ ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src;
+ }
+ }
+
+ resolve_array_phis(ctx, block);
+}
+
+static void
+emit_jump(struct ir3_compile *ctx, nir_jump_instr *jump)
+{
+ switch (jump->type) {
+ case nir_jump_break:
+ case nir_jump_continue:
+	/* I *think* we can simply ignore this, and use the
+ * successor block link to figure out where we need to
+ * jump to for break/continue
+ */
+ break;
+ default:
+ compile_error(ctx, "Unhandled NIR jump type: %d\n", jump->type);
+ break;
+ }
+}
+
static void
emit_instr(struct ir3_compile *ctx, nir_instr *instr)
{
}
break;
}
- case nir_instr_type_call:
- case nir_instr_type_jump:
case nir_instr_type_phi:
+ emit_phi(ctx, nir_instr_as_phi(instr));
+ break;
+ case nir_instr_type_jump:
+ emit_jump(ctx, nir_instr_as_jump(instr));
+ break;
+ case nir_instr_type_call:
case nir_instr_type_parallel_copy:
compile_error(ctx, "Unhandled NIR instruction type: %d\n", instr->type);
break;
}
}
+static struct ir3_block *
+get_block(struct ir3_compile *ctx, nir_block *nblock)
+{
+ struct ir3_block *block;
+ struct hash_entry *entry;
+ entry = _mesa_hash_table_search(ctx->block_ht, nblock);
+ if (entry)
+ return entry->data;
+
+ block = ir3_block_create(ctx->ir);
+ block->nblock = nblock;
+ _mesa_hash_table_insert(ctx->block_ht, nblock, block);
+
+ return block;
+}
+
static void
-emit_block(struct ir3_compile *ctx, nir_block *block)
+emit_block(struct ir3_compile *ctx, nir_block *nblock)
{
- nir_foreach_instr(block, instr) {
+ struct ir3_block *block = get_block(ctx, nblock);
+
+ for (int i = 0; i < ARRAY_SIZE(block->successors); i++) {
+ if (nblock->successors[i]) {
+ block->successors[i] =
+ get_block(ctx, nblock->successors[i]);
+ }
+ }
+
+ ctx->block = block;
+ list_addtail(&block->node, &ctx->ir->block_list);
+
+ nir_foreach_instr(nblock, instr) {
emit_instr(ctx, instr);
if (ctx->error)
return;
}
}
+static void emit_cf_list(struct ir3_compile *ctx, struct exec_list *list);
+
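+/* note: the block *preceding* the if/else gets the branch condition;
+ * the then/else successor links themselves are filled in from nir's
+ * block successors when each block is emitted (see emit_block()).
+ */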
static void
-emit_function(struct ir3_compile *ctx, nir_function_impl *impl)
+emit_if(struct ir3_compile *ctx, nir_if *nif)
+{
+ struct ir3_instruction *condition = get_src(ctx, &nif->condition)[0];
+
+ ctx->block->condition =
+ get_predicate(ctx, ir3_b2n(condition->block, condition));
+
+ emit_cf_list(ctx, &nif->then_list);
+ emit_cf_list(ctx, &nif->else_list);
+}
+
+static void
+emit_loop(struct ir3_compile *ctx, nir_loop *nloop)
+{
+ emit_cf_list(ctx, &nloop->body);
+}
+
+static void
+emit_cf_list(struct ir3_compile *ctx, struct exec_list *list)
{
- foreach_list_typed(nir_cf_node, node, node, &impl->body) {
+ foreach_list_typed(nir_cf_node, node, node, list) {
switch (node->type) {
case nir_cf_node_block:
emit_block(ctx, nir_cf_node_as_block(node));
break;
case nir_cf_node_if:
+ emit_if(ctx, nir_cf_node_as_if(node));
+ break;
case nir_cf_node_loop:
+ emit_loop(ctx, nir_cf_node_as_loop(node));
+ break;
case nir_cf_node_function:
compile_error(ctx, "TODO\n");
break;
}
- if (ctx->error)
- return;
}
}
+static void
+emit_function(struct ir3_compile *ctx, nir_function_impl *impl)
+{
+ emit_cf_list(ctx, &impl->body);
+ emit_block(ctx, impl->end_block);
+
+	/* at this point, ctx->block should be the (empty) end block,
+	 * into which we emit the 'end' instruction.
+	 */
+ compile_assert(ctx, list_empty(&ctx->block->instr_list));
+ ir3_END(ctx->block);
+}
+
static void
setup_input(struct ir3_compile *ctx, nir_variable *in)
{
static void
emit_instructions(struct ir3_compile *ctx)
{
- unsigned ninputs = exec_list_length(&ctx->s->inputs) * 4;
- unsigned noutputs = exec_list_length(&ctx->s->outputs) * 4;
+ unsigned ninputs, noutputs;
+ nir_function_impl *fxn = NULL;
+
+ /* Find the main function: */
+ nir_foreach_overload(ctx->s, overload) {
+ compile_assert(ctx, strcmp(overload->function->name, "main") == 0);
+ compile_assert(ctx, overload->impl);
+ fxn = overload->impl;
+ break;
+ }
+
+ ninputs = exec_list_length(&ctx->s->inputs) * 4;
+ noutputs = exec_list_length(&ctx->s->outputs) * 4;
/* we need to allocate big enough outputs array so that
* we can stuff the kill's at the end. Likewise for vtx
}
ctx->ir = ir3_create(ctx->compiler, ninputs, noutputs);
- ctx->block = ir3_block_create(ctx->ir);
- ctx->ir->block = ctx->block;
+
+ /* Create inputs in first block: */
+ ctx->block = get_block(ctx, fxn->start_block);
+ ctx->in_block = ctx->block;
+ list_addtail(&ctx->block->node, &ctx->ir->block_list);
if (ctx->so->type == SHADER_FRAGMENT) {
ctx->ir->noutputs -= ARRAY_SIZE(ctx->kill);
declare_var(ctx, var);
}
- /* Find the main function and emit the body: */
- nir_foreach_overload(ctx->s, overload) {
- compile_assert(ctx, strcmp(overload->function->name, "main") == 0);
- compile_assert(ctx, overload->impl);
- emit_function(ctx, overload->impl);
- if (ctx->error)
- return;
+ /* And emit the body: */
+ ctx->impl = fxn;
+ emit_function(ctx, fxn);
+
+ list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) {
+ resolve_phis(ctx, block);
}
}
so->pos_regid = regid;
/* r0.x */
- instr = create_input(ctx->block, NULL, ir->ninputs);
+ instr = create_input(ctx->in_block, NULL, ir->ninputs);
instr->regs[0]->num = regid++;
inputs[ir->ninputs++] = instr;
ctx->frag_pos->regs[1]->instr = instr;
/* r0.y */
- instr = create_input(ctx->block, NULL, ir->ninputs);
+ instr = create_input(ctx->in_block, NULL, ir->ninputs);
instr->regs[0]->num = regid++;
inputs[ir->ninputs++] = instr;
ctx->frag_pos->regs[2]->instr = instr;
out = out->regs[1]->instr;
out->regs[0]->flags |= IR3_REG_HALF;
}
+
+ if (out->category == 1) {
+ out->cat1.dst_type = half_type(out->cat1.dst_type);
+ }
}
}
ir3_legalize(ir, &so->has_samp, &max_bary);
+ if (fd_mesa_debug & FD_DBG_OPTMSGS) {
+ printf("AFTER LEGALIZE:\n");
+ ir3_print(ir);
+ }
+
/* fixup input/outputs: */
for (i = 0; i < so->outputs_count; i++) {
so->outputs[i].regid = ir->outputs[i*4]->regs[0]->num;
/* TODO: remove this hack: */
if (is_meta(src_instr) && (src_instr->opc == OPC_META_FO))
return false;
+ /* TODO: we currently don't handle left/right neighbors
+ * very well when inserting parallel-copies into phi..
+ * to avoid problems don't eliminate a mov coming out
+ * of phi..
+ */
+ if (is_meta(src_instr) && (src_instr->opc == OPC_META_PHI))
+ return false;
return true;
}
return false;
void
ir3_cp(struct ir3 *ir)
{
- ir3_clear_mark(ir->block->shader);
+ ir3_clear_mark(ir);
for (unsigned i = 0; i < ir->noutputs; i++) {
if (ir->outputs[i]) {
ir->outputs[i] = out;
}
}
+
+ list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+ if (block->condition)
+ block->condition = instr_cp(block->condition, NULL);
+ }
}
{
list_for_each_entry_safe (struct ir3_instruction, instr, &block->instr_list, node) {
if (!ir3_instr_check_mark(instr)) {
+ if (is_flow(instr) && (instr->opc == OPC_END))
+ continue;
/* mark it, in case it is input, so we can
* remove unused inputs:
*/
{
unsigned i;
- ir3_clear_mark(ir->block->shader);
+ ir3_clear_mark(ir);
for (i = 0; i < ir->noutputs; i++)
if (ir->outputs[i])
ir3_instr_depth(ir->outputs[i]);
+ /* We also need to account for if-condition: */
+ list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+ if (block->condition)
+ ir3_instr_depth(block->condition);
+ }
+
/* mark un-used instructions: */
- remove_unused_by_block(ir->block);
+ list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+ remove_unused_by_block(block);
+ }
/* cleanup unused inputs: */
for (i = 0; i < ir->ninputs; i++) {
conflict = conflicts(instr->cp.left, left) ||
conflicts(instr->cp.right, right);
+ /* RA can't yet deal very well w/ group'd phi's: */
+ if (is_meta(instr) && (instr->opc == OPC_META_PHI))
+ conflict = true;
+
/* we also can't have an instr twice in the group: */
for (j = i + 1; (j < n) && !conflict; j++)
if (ops->get(arr, j) == instr)
for (i = 0; i < ir->noutputs; i += 4)
group_n(&arr_ops_out, &ir->outputs[i], 4);
-
for (i = 0; i < ir->noutputs; i++) {
if (ir->outputs[i]) {
struct ir3_instruction *instr = ir->outputs[i];
void
ir3_group(struct ir3 *ir)
{
- ir3_clear_mark(ir->block->shader);
+ ir3_clear_mark(ir);
find_neighbors(ir);
}
*/
struct ir3_legalize_ctx {
- struct ir3_block *block;
bool has_samp;
int max_bary;
};
+/* We want to evaluate each block from the position of each of its
+ * predecessor blocks, so that the flags we set are the union over
+ * all possible program paths. As a stopping condition, we stop
+ * when a given <pred-block, current-block> pair has already been
+ * visited.
+ *
+ * XXX is that completely true? We could have different needs_xyz
+ * flags set depending on path leading to pred-block.. we could
+ * do *most* of this based on chasing src instructions ptrs (and
+ * following all phi srcs).. except the write-after-read hazard.
+ *
+ * For now we just set ss/sy flag on first instruction on block,
+ * and handle everything within the block as before.
+ */
+
static void
-legalize(struct ir3_legalize_ctx *ctx)
+legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
{
- struct ir3_block *block = ctx->block;
struct ir3_instruction *last_input = NULL;
struct ir3_instruction *last_rel = NULL;
struct list_head instr_list;
ir3_reg_create(baryf, regid(0, 0), 0);
/* insert the dummy bary.f after last_input: */
+ list_delinit(&baryf->node);
list_add(&baryf->node, &last_input->node);
last_input = baryf;
if (last_rel)
last_rel->flags |= IR3_INSTR_UL;
- /* create/add 'end' instruction: */
- ir3_instr_create(block, 0, OPC_END);
-
list_first_entry(&block->instr_list, struct ir3_instruction, node)
->flags |= IR3_INSTR_SS | IR3_INSTR_SY;
}
+/* NOTE: branch instructions are always the last instruction(s)
+ * in the block. We take advantage of this as we resolve the
+ * branches, since "if (foo) break;" constructs turn into
+ * something like:
+ *
+ * block3 {
+ * ...
+ * 0029:021: mov.s32s32 r62.x, r1.y
+ * 0082:022: br !p0.x, target=block5
+ * 0083:023: br p0.x, target=block4
+ * // succs: if _[0029:021: mov.s32s32] block4; else block5;
+ * }
+ * block4 {
+ * 0084:024: jump, target=block6
+ * // succs: block6;
+ * }
+ * block5 {
+ * 0085:025: jump, target=block7
+ * // succs: block7;
+ * }
+ *
+ * ie. the only instruction in block4/block5 is a jump, so when
+ * resolving branches we can easily detect this by checking
+ * that the first instruction in the target block is itself
+ * a jump, and set up the br directly to the jump's target
+ * (and strip back out the now unreached jump)
+ *
+ * TODO sometimes we end up with things like:
+ *
+ * br !p0.x, #2
+ * br p0.x, #12
+ * add.u r0.y, r0.y, 1
+ *
+ * If we swapped the order of the branches, we could drop one.
+ */
+static struct ir3_block *
+resolve_dest_block(struct ir3_block *block)
+{
+ /* special case for last block: */
+ if (!block->successors[0])
+ return block;
+
+ /* NOTE that we may or may not have inserted the jump
+ * in the target block yet, so conditions to resolve
+ * the dest to the dest block's successor are:
+ *
+ * (1) successor[1] == NULL &&
+ * (2) (block-is-empty || only-instr-is-jump)
+ */
+ if (block->successors[1] == NULL) {
+ if (list_empty(&block->instr_list)) {
+ return block->successors[0];
+ } else if (list_length(&block->instr_list) == 1) {
+ struct ir3_instruction *instr = list_first_entry(
+ &block->instr_list, struct ir3_instruction, node);
+ if (is_flow(instr) && (instr->opc == OPC_JUMP))
+ return block->successors[0];
+ }
+ }
+ return block;
+}
+
+static bool
+resolve_jump(struct ir3_instruction *instr)
+{
+ struct ir3_block *tblock =
+ resolve_dest_block(instr->cat0.target);
+ struct ir3_instruction *target;
+
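+	/* if the target block is empty (or contains only a jump), the
+	 * branch can skip it; re-target to the block's successor and
+	 * drop the skipped block from the block list:
+	 */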
+ if (tblock != instr->cat0.target) {
+ list_delinit(&instr->cat0.target->node);
+ instr->cat0.target = tblock;
+ return true;
+ }
+
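+	/* a branch to the immediately following instruction can simply
+	 * be removed; otherwise encode the offset (in instructions) to
+	 * the target:
+	 */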
+ target = list_first_entry(&tblock->instr_list,
+ struct ir3_instruction, node);
+
+ if ((!target) || (target->ip == (instr->ip + 1))) {
+ list_delinit(&instr->node);
+ return true;
+ } else {
+ instr->cat0.immed =
+ (int)target->ip - (int)instr->ip;
+ }
+ return false;
+}
+
+/* resolve jumps, removing jumps/branches to the immediately following
+ * instruction, which we end up with from earlier stages. Since
+ * removing an instruction can invalidate earlier instructions'
+ * branch offsets, we need to do this iteratively until no more
+ * branches are removed.
+ */
+static bool
+resolve_jumps(struct ir3 *ir)
+{
+ list_for_each_entry (struct ir3_block, block, &ir->block_list, node)
+ list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node)
+ if (is_flow(instr) && instr->cat0.target)
+ if (resolve_jump(instr))
+ return true;
+
+ return false;
+}
+
+/* we want to mark points where divergent flow control re-converges
+ * with (jp) flags. For now, since we don't do any optimization for
+ * things that start out as a 'do {} while()', re-convergence points
+ * will always be a branch or jump target. Note that this is overly
+ * conservative, since unconditional jump targets are not convergence
+ * points, we are just assuming that the other path to reach the jump
+ * target was divergent. If we were clever enough to optimize the
+ * jump at end of a loop back to a conditional branch into a single
+ * conditional branch, ie. like:
+ *
+ * add.f r1.w, r0.x, (neg)(r)c2.x <= loop start
+ * mul.f r1.z, r1.z, r0.x
+ * mul.f r1.y, r1.y, r0.x
+ * mul.f r0.z, r1.x, r0.x
+ * mul.f r0.w, r0.y, r0.x
+ * cmps.f.ge r0.x, (r)c2.y, (r)r1.w
+ * add.s r0.x, (r)r0.x, (r)-1
+ * sel.f32 r0.x, (r)c3.y, (r)r0.x, c3.x
+ * cmps.f.eq p0.x, r0.x, c3.y
+ * mov.f32f32 r0.x, r1.w
+ * mov.f32f32 r0.y, r0.w
+ * mov.f32f32 r1.x, r0.z
+ * (rpt2)nop
+ * br !p0.x, #-13
+ * (jp)mul.f r0.x, c263.y, r1.y
+ *
+ * Then we'd have to be more clever, as the convergence point is no
+ * longer a branch or jump target.
+ */
+static void
+mark_convergence_points(struct ir3 *ir)
+{
+ list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+ list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
+ if (is_flow(instr) && instr->cat0.target) {
+ struct ir3_instruction *target =
+ list_first_entry(&instr->cat0.target->instr_list,
+ struct ir3_instruction, node);
+ target->flags |= IR3_INSTR_JP;
+ }
+ }
+ }
+}
+
void
ir3_legalize(struct ir3 *ir, bool *has_samp, int *max_bary)
{
struct ir3_legalize_ctx ctx = {
- .block = ir->block,
.max_bary = -1,
};
- legalize(&ctx);
+ list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+ legalize_block(&ctx, block);
+ }
*has_samp = ctx.has_samp;
*max_bary = ctx.max_bary;
+
+ do {
+ ir3_count_instructions(ir);
+ } while(resolve_jumps(ir));
+
+ mark_convergence_points(ir);
}
printf("\t");
}
+static uint32_t
+block_id(struct ir3_block *block)
+{
+#ifdef DEBUG
+ return block->serialno;
+#else
+ return (uint32_t)(uint64_t)block;
+#endif
+}
+
static void
print_instr(struct ir3_instruction *instr, int lvl)
{
}
}
+ if (is_flow(instr) && instr->cat0.target) {
+ /* the predicate register src is implied: */
+ if (instr->opc == OPC_BR) {
+ printf(" %sp0.x", instr->cat0.inv ? "!" : "");
+ }
+ printf(", target=block%u", block_id(instr->cat0.target));
+ }
+
printf("\n");
}
static void
print_block(struct ir3_block *block, int lvl)
{
- tab(lvl); printf("block {\n");
+ tab(lvl); printf("block%u {\n", block_id(block));
list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
print_instr(instr, lvl+1);
}
+ if (block->successors[1]) {
+ /* leading into if/else: */
+ tab(lvl+1);
+ printf("/* succs: if _[");
+ print_instr_name(block->condition);
+ printf("] block%u; else block%u; */\n",
+ block_id(block->successors[0]),
+ block_id(block->successors[1]));
+ } else if (block->successors[0]) {
+ tab(lvl+1);
+ printf("/* succs: block%u; */\n",
+ block_id(block->successors[0]));
+ }
tab(lvl); printf("}\n");
}
void
ir3_print(struct ir3 *ir)
{
- struct ir3_block *block = ir->block;
-
- print_block(block, 0);
+ list_for_each_entry (struct ir3_block, block, &ir->block_list, node)
+ print_block(block, 0);
for (unsigned i = 0; i < ir->noutputs; i++) {
if (!ir->outputs[i])
#include "util/u_math.h"
#include "util/register_allocate.h"
#include "util/ralloc.h"
+#include "util/bitset.h"
#include "ir3.h"
#include "ir3_compiler.h"
unsigned *def, *use; /* def/use table */
};
+/* additional per-block data used during register allocation: */
+struct ir3_ra_block_data {
+ BITSET_WORD *def; /* variables defined before used in block */
+ BITSET_WORD *use; /* variables used before defined in block */
+ BITSET_WORD *livein; /* which defs reach entry point of block */
+ BITSET_WORD *liveout; /* which defs reach exit point of block */
+};
+
static bool
is_half(struct ir3_instruction *instr)
{
*sz = util_last_bit(instr->regs[0]->wrmask);
}
*off = 0;
- return instr;
+ d = instr;
+ }
+
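+	/* if this instruction's dst feeds a phi (parallel-copy mov with
+	 * IR3_REG_PHI_SRC set), the phi and all of its srcs need to share
+	 * a definer; use whichever comes earliest in the program:
+	 */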
+ if (d->regs[0]->flags & IR3_REG_PHI_SRC) {
+ struct ir3_instruction *phi = d->regs[0]->instr;
+ struct ir3_instruction *dd;
+ int dsz, doff;
+
+ dd = get_definer(phi, &dsz, &doff);
+
+ *sz = MAX2(*sz, dsz);
+ *off = doff;
+
+ if (dd->ip < d->ip) {
+ d = dd;
+ }
+ }
+
+ if (is_meta(d) && (d->opc == OPC_META_PHI)) {
+ /* we have already inserted parallel-copies into
+ * the phi, so we don't need to chase definers
+ */
+ struct ir3_register *src;
+
+ /* note: don't use foreach_ssa_src as this gets called once
+ * while assigning regs (which clears SSA flag)
+ */
+ foreach_src(src, d) {
+ if (!src->instr)
+ continue;
+ if (src->instr->ip < d->ip)
+ d = src->instr;
+ }
}
if (is_meta(d) && (d->opc == OPC_META_FO)) {
static void
ra_block_name_instructions(struct ir3_ra_ctx *ctx, struct ir3_block *block)
{
- list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
- instr->ip = ctx->instr_cnt++;
- }
-
list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
struct ir3_instruction *defn;
int cls, sz, off;
+ ctx->instr_cnt++;
+
if (instr->regs_count == 0)
continue;
ra_init(struct ir3_ra_ctx *ctx)
{
ir3_clear_mark(ctx->ir);
+ ir3_count_instructions(ctx->ir);
- ra_block_name_instructions(ctx, ctx->ir->block);
+ list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) {
+ ra_block_name_instructions(ctx, block);
+ }
/* figure out the base register name for each class. The
* actual ra name is class_base[cls] + instr->name;
ctx->use = rzalloc_array(ctx->g, unsigned, ctx->alloc_count);
}
+static unsigned
+ra_name(struct ir3_ra_ctx *ctx, int cls, struct ir3_instruction *defn)
+{
+ unsigned name;
+ debug_assert(cls >= 0);
+ name = ctx->class_base[cls] + defn->name;
+ debug_assert(name < ctx->alloc_count);
+ return name;
+}
+
static void
ra_destroy(struct ir3_ra_ctx *ctx)
{
static void
ra_block_compute_live_ranges(struct ir3_ra_ctx *ctx, struct ir3_block *block)
{
+ struct ir3_ra_block_data *bd;
+ unsigned bitset_words = BITSET_WORDS(ctx->alloc_count);
+
+ bd = rzalloc(ctx->g, struct ir3_ra_block_data);
+
+ bd->def = rzalloc_array(bd, BITSET_WORD, bitset_words);
+ bd->use = rzalloc_array(bd, BITSET_WORD, bitset_words);
+ bd->livein = rzalloc_array(bd, BITSET_WORD, bitset_words);
+ bd->liveout = rzalloc_array(bd, BITSET_WORD, bitset_words);
+
+ block->bd = bd;
+
list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
struct ir3_instruction *src;
* fanin: used to collect values from lower class and assemble
* them together into a higher class, for example arguments
* to texture sample instructions; We consider these to be
- * defined at the fanin node.
+ * defined at the earliest fanin source.
+ *
+ * phi: used to merge values from different flow control paths
+ * to the same reg. Consider defined at earliest phi src,
+ * and update all the other phi src's (which may come later
+ * in the program) as users to extend the var's live range.
+ *
+ * Most of this, other than phi, is completely handled in the
+ * get_definer() helper.
*
* In either case, we trace the instruction back to the original
* definer and consider that as the def/use ip.
*/
cls = size_to_class(sz, is_half(defn));
if (cls >= 0) {
- unsigned name = ctx->class_base[cls] + defn->name;
+ unsigned name = ra_name(ctx, cls, defn);
+
ctx->def[name] = defn->ip;
ctx->use[name] = defn->ip;
- debug_assert(name < ctx->alloc_count);
+ /* since we are in SSA at this point: */
+ debug_assert(!BITSET_TEST(bd->use, name));
+
+ BITSET_SET(bd->def, name);
if (is_half(defn)) {
ra_set_node_class(ctx->g, name,
ra_set_node_class(ctx->g, name,
ctx->set->classes[cls]);
}
+
+ /* extend the live range for phi srcs, which may come
+ * from the bottom of the loop
+ */
+ if (defn->regs[0]->flags & IR3_REG_PHI_SRC) {
+ struct ir3_instruction *phi = defn->regs[0]->instr;
+ foreach_ssa_src(src, phi) {
+ /* if src is after phi, then we need to extend
+ * the liverange to the end of src's block:
+ */
+ if (src->ip > phi->ip) {
+ struct ir3_instruction *last =
+ list_last_entry(&src->block->instr_list,
+ struct ir3_instruction, node);
+ ctx->use[name] = MAX2(ctx->use[name], last->ip);
+ }
+ }
+ }
}
}
}
srcdefn = get_definer(src, &sz, &off);
cls = size_to_class(sz, is_half(srcdefn));
if (cls >= 0) {
- unsigned name = ctx->class_base[cls] + srcdefn->name;
- ctx->use[name] = instr->ip;
+ unsigned name = ra_name(ctx, cls, srcdefn);
+ ctx->use[name] = MAX2(ctx->use[name], instr->ip);
+ if (!BITSET_TEST(bd->def, name))
+ BITSET_SET(bd->use, name);
+ }
+ }
+ }
+ }
+}
+
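+/* One iteration of the standard liveness dataflow equations:
+ *
+ *   livein(b)  = use(b) | (liveout(b) & ~def(b))
+ *   liveout(b) = union of livein(s) for each successor s of b
+ *
+ * returns true if any block's livein/liveout changed, so the caller
+ * iterates until this reaches a fixed point.
+ */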
+static bool
+ra_compute_livein_liveout(struct ir3_ra_ctx *ctx)
+{
+ unsigned bitset_words = BITSET_WORDS(ctx->alloc_count);
+ bool progress = false;
+
+ list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) {
+ struct ir3_ra_block_data *bd = block->bd;
+
+ /* update livein: */
+ for (unsigned i = 0; i < bitset_words; i++) {
+ BITSET_WORD new_livein =
+ (bd->use[i] | (bd->liveout[i] & ~bd->def[i]));
+
+ if (new_livein & ~bd->livein[i]) {
+ bd->livein[i] |= new_livein;
+ progress = true;
+ }
+ }
+
+ /* update liveout: */
+ for (unsigned j = 0; j < ARRAY_SIZE(block->successors); j++) {
+ struct ir3_block *succ = block->successors[j];
+ struct ir3_ra_block_data *succ_bd;
+
+ if (!succ)
+ continue;
+
+ succ_bd = succ->bd;
+
+ for (unsigned i = 0; i < bitset_words; i++) {
+ BITSET_WORD new_liveout =
+ (succ_bd->livein[i] & ~bd->liveout[i]);
+
+ if (new_liveout) {
+ bd->liveout[i] |= new_liveout;
+ progress = true;
}
}
}
}
+
+ return progress;
}
static void
{
struct ir3 *ir = ctx->ir;
- ra_block_compute_live_ranges(ctx, ctx->ir->block);
+ /* compute live ranges (use/def) on a block level, also updating
+ * block's def/use bitmasks (used below to calculate per-block
+ * livein/liveout):
+ */
+ list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+ ra_block_compute_live_ranges(ctx, block);
+ }
+
+ /* update per-block livein/liveout: */
+ while (ra_compute_livein_liveout(ctx)) {}
+
+ /* extend start/end ranges based on livein/liveout info from cfg: */
+ unsigned bitset_words = BITSET_WORDS(ctx->alloc_count);
+ list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+ struct ir3_ra_block_data *bd = block->bd;
+
+ for (unsigned i = 0; i < bitset_words; i++) {
+ if (BITSET_TEST(bd->livein, i)) {
+ ctx->def[i] = MIN2(ctx->def[i], block->start_ip);
+ ctx->use[i] = MAX2(ctx->use[i], block->start_ip);
+ }
+
+ if (BITSET_TEST(bd->liveout, i)) {
+ ctx->def[i] = MIN2(ctx->def[i], block->end_ip);
+ ctx->use[i] = MAX2(ctx->use[i], block->end_ip);
+ }
+ }
+ }
/* need to fix things up to keep outputs live: */
for (unsigned i = 0; i < ir->noutputs; i++) {
defn = get_definer(instr, &sz, &off);
cls = size_to_class(sz, is_half(defn));
if (cls >= 0) {
- unsigned name = ctx->class_base[cls] + defn->name;
+ unsigned name = ra_name(ctx, cls, defn);
ctx->use[name] = ctx->instr_cnt;
}
}
}
}
-static type_t half_type(type_t type)
-{
- switch (type) {
- case TYPE_F32: return TYPE_F16;
- case TYPE_U32: return TYPE_U16;
- case TYPE_S32: return TYPE_S16;
- /* instructions may already be fixed up: */
- case TYPE_F16:
- case TYPE_U16:
- case TYPE_S16:
- return type;
- default:
- assert(0);
- return ~0;
- }
-}
-
/* some instructions need fix-up if dst register is half precision: */
static void fixup_half_instr_dst(struct ir3_instruction *instr)
{
defn = get_definer(instr, &sz, &off);
cls = size_to_class(sz, is_half(defn));
if (cls >= 0) {
- unsigned name = ctx->class_base[cls] + defn->name;
+ unsigned name = ra_name(ctx, cls, defn);
unsigned r = ra_get_node_reg(ctx->g, name);
unsigned num = ctx->set->ra_reg_to_gpr[r] + off;
num += reg->offset;
reg->num = num;
- reg->flags &= ~IR3_REG_SSA;
+ reg->flags &= ~(IR3_REG_SSA | IR3_REG_PHI_SRC);
if (is_half(defn))
reg->flags |= IR3_REG_HALF;
unsigned i = 0, j;
if (ctx->frag_face && (i < ir->ninputs) && ir->inputs[i]) {
struct ir3_instruction *instr = ir->inputs[i];
- unsigned cls = size_to_class(1, true);
- unsigned name = ctx->class_base[cls] + instr->name;
+ int cls = size_to_class(1, true);
+ unsigned name = ra_name(ctx, cls, instr);
unsigned reg = ctx->set->gpr_to_ra_reg[cls][0];
/* if we have frag_face, it gets hr0.x */
unsigned name, reg;
cls = size_to_class(sz, is_half(defn));
- debug_assert(cls >= 0);
- name = ctx->class_base[cls] + defn->name;
+ name = ra_name(ctx, cls, defn);
reg = ctx->set->gpr_to_ra_reg[cls][j];
ra_set_node_reg(ctx->g, name, reg);
if (!ra_allocate(ctx->g))
return -1;
- ra_block_alloc(ctx, ctx->ir->block);
+ list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) {
+ ra_block_alloc(ctx, block);
+ }
return 0;
}
struct ir3_instruction *src;
unsigned delay = 0;
+ /* Phi instructions can have a dependency on something not
+ * scheduled yet (for ex, loops). But OTOH we don't really
+ * care. By definition phi's should appear at the top of
+ * the block, and it's sources should be values from the
+ * previously executing block, so they are always ready to
+ * be scheduled:
+ */
+ if (is_meta(instr) && (instr->opc == OPC_META_PHI))
+ return 0;
+
foreach_ssa_src(src, instr) {
/* if dependency not scheduled, we aren't ready yet: */
if (!is_scheduled(src))
}
}
}
+
+ /* And lastly, insert branch/jump instructions to take us to
+ * the next block. Later we'll strip back out the branches
+	 * that simply jump to the next instruction.
+ */
+ if (block->successors[1]) {
+ /* if/else, conditional branches to "then" or "else": */
+ struct ir3_instruction *br;
+ unsigned delay = 6;
+
+ debug_assert(ctx->pred);
+ debug_assert(block->condition);
+
+ delay -= distance(ctx, ctx->pred, delay);
+
+ while (delay > 0) {
+ ir3_NOP(block);
+ delay--;
+ }
+
+ /* create "else" branch first (since "then" block should
+ * frequently/always end up being a fall-thru):
+ */
+ br = ir3_BR(block);
+ br->cat0.inv = true;
+ br->cat0.target = block->successors[1];
+
+ /* NOTE: we have to hard code delay of 6 above, since
+ * we want to insert the nop's before constructing the
+ * branch. Throw in an assert so we notice if this
+ * ever breaks on future generation:
+ */
+ debug_assert(ir3_delayslots(ctx->pred, br, 0) == 6);
+
+ br = ir3_BR(block);
+ br->cat0.target = block->successors[0];
+
+ } else if (block->successors[0]) {
+ /* otherwise unconditional jump to next block: */
+ struct ir3_instruction *jmp;
+
+ jmp = ir3_JUMP(block);
+ jmp->cat0.target = block->successors[0];
+ }
+
+ /* NOTE: if we kept track of the predecessors, we could do a better
+	 * job w/ (jp) flags.. every node w/ > 1 predecessor is a join point.
+ * Note that as we eliminate blocks which contain only an unconditional
+ * jump we probably need to propagate (jp) flag..
+ */
+}
+
+/* this is needed to ensure later RA stage succeeds: */
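+/* For each phi src, insert a parallel-copy mov in the src's block and
+ * make it the phi's src instead. The mov's dst is flagged with
+ * IR3_REG_PHI_SRC pointing back at the phi, which RA uses to tie the
+ * copies and the phi dst together.
+ */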
+static void
+sched_insert_parallel_copies(struct ir3_block *block)
+{
+ list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
+ if (is_meta(instr) && (instr->opc == OPC_META_PHI)) {
+ struct ir3_register *reg;
+ foreach_src(reg, instr) {
+ struct ir3_instruction *src = reg->instr;
+ struct ir3_instruction *mov =
+ ir3_MOV(src->block, src, TYPE_U32);
+ mov->regs[0]->flags |= IR3_REG_PHI_SRC;
+ mov->regs[0]->instr = instr;
+ reg->instr = mov;
+ }
+ }
+ }
}
int ir3_sched(struct ir3 *ir)
{
struct ir3_sched_ctx ctx = {0};
- ir3_clear_mark(ir->block->shader);
- sched_block(&ctx, ir->block);
+ list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+ sched_insert_parallel_copies(block);
+ }
+ ir3_clear_mark(ir);
+ list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+ sched_block(&ctx, block);
+ }
if (ctx.error)
return -1;
return 0;