freedreno/ir3: re-work shader inputs/outputs

author Rob Clark <robdclark@chromium.org>

Fri, 25 Oct 2019 22:37:56 +0000 (15:37 -0700)

committer Rob Clark <robdclark@chromium.org>

Tue, 12 Nov 2019 21:57:52 +0000 (13:57 -0800)
author Rob Clark <robdclark@chromium.org>
Fri, 25 Oct 2019 22:37:56 +0000 (15:37 -0700)
committer Rob Clark <robdclark@chromium.org>
Tue, 12 Nov 2019 21:57:52 +0000 (13:57 -0800)
diff --git a/src/freedreno/ir3/ir3.c b/src/freedreno/ir3/ir3.c

index 76ee44d80f5c58bd394898c4375d12e8f1ed2f2a..b89e9b316a59cef1e1f600f0977bb5a90551c38c 100644 (file)
--- a/src/freedreno/ir3/ir3.c
+++ b/src/freedreno/ir3/ir3.c
@@ -45,18 +45,12 @@ void * ir3_alloc(struct ir3 *shader, int sz)
         return rzalloc_size(shader, sz); /* TODO: don't use rzalloc */
  }
  
-struct ir3 * ir3_create(struct ir3_compiler *compiler,
-               gl_shader_stage type, unsigned nin, unsigned nout)
+struct ir3 * ir3_create(struct ir3_compiler *compiler, gl_shader_stage type)
  {
         struct ir3 *shader = rzalloc(NULL, struct ir3);
  
         shader->compiler = compiler;
         shader->type = type;
-       shader->ninputs = nin;
-       shader->inputs = ir3_alloc(shader, sizeof(shader->inputs[0]) * nin);
-
-       shader->noutputs = nout;
-       shader->outputs = ir3_alloc(shader, sizeof(shader->outputs[0]) * nout);
  
         list_inithead(&shader->block_list);
         list_inithead(&shader->array_list);
diff --git a/src/freedreno/ir3/ir3.h b/src/freedreno/ir3/ir3.h

index 3e4fa34aa0f6c14e19bce139f11d70cbc13100a8..afff38b9b60de675ea61f755cef5a1e3011e10ea 100644 (file)
--- a/src/freedreno/ir3/ir3.h
+++ b/src/freedreno/ir3/ir3.h
@@ -268,11 +268,19 @@ struct ir3_instruction {
                 struct {
                         int off;              /* component/offset */
                 } split;
+               struct {
+                       /* for output collects, this maps back to the entry in the
+                        * ir3_shader_variant::outputs table.
+                        */
+                       int outidx;
+               } collect;
                 struct {
                         unsigned samp, tex;
                         unsigned input_offset;
                 } prefetch;
                 struct {
+                       /* maps back to entry in ir3_shader_variant::inputs table: */
+                       int inidx;
                         /* for sysvals, identifies the sysval type.  Mostly so we can
                          * identify the special cases where a sysval should not be DCE'd
                          * (currently, just pre-fs texture fetch)
@@ -425,9 +433,8 @@ struct ir3 {
         struct ir3_compiler *compiler;
         gl_shader_stage type;
  
-       unsigned ninputs, noutputs;
-       struct ir3_instruction **inputs;
-       struct ir3_instruction **outputs;
+       DECLARE_ARRAY(struct ir3_instruction *, inputs);
+       DECLARE_ARRAY(struct ir3_instruction *, outputs);
  
         /* Track bary.f (and ldlv) instructions.. this is needed in
          * scheduling to ensure that all varying fetches happen before
@@ -537,8 +544,7 @@ block_id(struct ir3_block *block)
  #endif
  }
  
-struct ir3 * ir3_create(struct ir3_compiler *compiler,
-               gl_shader_stage type, unsigned nin, unsigned nout);
+struct ir3 * ir3_create(struct ir3_compiler *compiler, gl_shader_stage type);
  void ir3_destroy(struct ir3 *shader);
  void * ir3_assemble(struct ir3 *shader,
                 struct ir3_info *info, uint32_t gpu_id);
@@ -1065,14 +1071,14 @@ static inline bool __is_false_dep(struct ir3_instruction *instr, unsigned n)
  
  /* iterators for shader inputs: */
  #define foreach_input_n(__ininstr, __cnt, __ir) \
-       for (unsigned __cnt = 0; __cnt < (__ir)->ninputs; __cnt++) \
+       for (unsigned __cnt = 0; __cnt < (__ir)->inputs_count; __cnt++) \
                 if ((__ininstr = (__ir)->inputs[__cnt]))
  #define foreach_input(__ininstr, __ir) \
         foreach_input_n(__ininstr, __i, __ir)
  
  /* iterators for shader outputs: */
  #define foreach_output_n(__outinstr, __cnt, __ir) \
-       for (unsigned __cnt = 0; __cnt < (__ir)->noutputs; __cnt++) \
+       for (unsigned __cnt = 0; __cnt < (__ir)->outputs_count; __cnt++) \
                 if ((__outinstr = (__ir)->outputs[__cnt]))
  #define foreach_output(__outinstr, __ir) \
         foreach_output_n(__outinstr, __i, __ir)
diff --git a/src/freedreno/ir3/ir3_compiler_nir.c b/src/freedreno/ir3/ir3_compiler_nir.c

index e0a9d05fda31e15a62acec3b8866f927ea1dbfcf..58d515b1e227f0a679d334a7672501031c3cdba2 100644 (file)
--- a/src/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/freedreno/ir3/ir3_compiler_nir.c
@@ -70,13 +70,9 @@ create_input_compmask(struct ir3_context *ctx, unsigned n, unsigned compmask)
         in->input.sysval = ~0;
         __ssa_dst(in)->wrmask = compmask;
  
-       return in;
-}
+       array_insert(ctx->ir, ctx->ir->inputs, in);
  
-static struct ir3_instruction *
-create_input(struct ir3_context *ctx, unsigned n)
-{
-       return create_input_compmask(ctx, n, 0x1);
+       return in;
  }
  
  static struct ir3_instruction *
@@ -1198,21 +1194,17 @@ static void add_sysval_input_compmask(struct ir3_context *ctx,
                 struct ir3_instruction *instr)
  {
         struct ir3_shader_variant *so = ctx->so;
-       unsigned r = regid(so->inputs_count, 0);
         unsigned n = so->inputs_count++;
  
         assert(instr->opc == OPC_META_INPUT);
+       instr->input.inidx = n;
         instr->input.sysval = slot;
  
         so->inputs[n].sysval = true;
         so->inputs[n].slot = slot;
         so->inputs[n].compmask = compmask;
-       so->inputs[n].regid = r;
         so->inputs[n].interpolate = INTERP_MODE_FLAT;
         so->total_in++;
-
-       ctx->ir->ninputs = MAX2(ctx->ir->ninputs, r + 1);
-       ctx->ir->inputs[r] = instr;
  }
  
  static struct ir3_instruction *
@@ -1521,17 +1513,17 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
                         idx += nir_src_as_uint(intr->src[0]);
                         for (int i = 0; i < intr->num_components; i++) {
                                 unsigned n = idx * 4 + i + comp;
-                               dst[i] = ctx->ir->inputs[n];
-                               compile_assert(ctx, ctx->ir->inputs[n]);
+                               dst[i] = ctx->inputs[n];
+                               compile_assert(ctx, ctx->inputs[n]);
                         }
                 } else {
                         src = ir3_get_src(ctx, &intr->src[0]);
                         struct ir3_instruction *collect =
-                                       ir3_create_collect(ctx, ctx->ir->inputs, ctx->ir->ninputs);
+                                       ir3_create_collect(ctx, ctx->ir->inputs, ctx->ninputs);
                         struct ir3_instruction *addr = ir3_get_addr(ctx, src[0], 4);
                         for (int i = 0; i < intr->num_components; i++) {
                                 unsigned n = idx * 4 + i + comp;
-                               dst[i] = create_indirect_load(ctx, ctx->ir->ninputs,
+                               dst[i] = create_indirect_load(ctx, ctx->ninputs,
                                                 n, addr, collect);
                         }
                 }
@@ -1632,7 +1624,7 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
                 src = ir3_get_src(ctx, &intr->src[0]);
                 for (int i = 0; i < intr->num_components; i++) {
                         unsigned n = idx * 4 + i + comp;
-                       ctx->ir->outputs[n] = src[i];
+                       ctx->outputs[n] = src[i];
                 }
                 break;
         case nir_intrinsic_load_base_vertex:
@@ -2715,15 +2707,27 @@ setup_input(struct ir3_context *ctx, nir_variable *in)
                                 instr = create_frag_input(ctx, so->inputs[n].use_ldlv, idx);
                         }
  
-                       compile_assert(ctx, idx < ctx->ir->ninputs);
+                       compile_assert(ctx, idx < ctx->ninputs);
  
-                       ctx->ir->inputs[idx] = instr;
+                       ctx->inputs[idx] = instr;
                 }
         } else if (ctx->so->type == MESA_SHADER_VERTEX) {
+               /* We shouldn't have fractional input for VS input.. that only shows
+                * up with varying packing
+                */
+               assert(frac == 0);
+
+               struct ir3_instruction *input = create_input_compmask(ctx, 0, (1 << ncomp) - 1);
+               struct ir3_instruction *components[ncomp];
+
+               input->input.inidx = n;
+
+               ir3_split_dest(ctx->block, components, input, 0, ncomp);
+
                 for (int i = 0; i < ncomp; i++) {
                         unsigned idx = (n * 4) + i + frac;
-                       compile_assert(ctx, idx < ctx->ir->ninputs);
-                       ctx->ir->inputs[idx] = create_input(ctx, idx);
+                       compile_assert(ctx, idx < ctx->ninputs);
+                       ctx->inputs[idx] = components[i];
                 }
         } else {
                 ir3_context_error(ctx, "unknown shader type: %d\n", ctx->so->type);
@@ -2904,8 +2908,8 @@ setup_output(struct ir3_context *ctx, nir_variable *out)
  
         for (int i = 0; i < ncomp; i++) {
                 unsigned idx = (n * 4) + i + frac;
-               compile_assert(ctx, idx < ctx->ir->noutputs);
-               ctx->ir->outputs[idx] = create_immed(ctx->block, fui(0.0));
+               compile_assert(ctx, idx < ctx->noutputs);
+               ctx->outputs[idx] = create_immed(ctx->block, fui(0.0));
         }
  
         /* if varying packing doesn't happen, we could end up in a situation
@@ -2918,8 +2922,8 @@ setup_output(struct ir3_context *ctx, nir_variable *out)
          */
         for (int i = 0; i < frac; i++) {
                 unsigned idx = (n * 4) + i;
-               if (!ctx->ir->outputs[idx]) {
-                       ctx->ir->outputs[idx] = create_immed(ctx->block, fui(0.0));
+               if (!ctx->outputs[idx]) {
+                       ctx->outputs[idx] = create_immed(ctx->block, fui(0.0));
                 }
         }
  }
@@ -2934,33 +2938,18 @@ max_drvloc(struct exec_list *vars)
         return drvloc;
  }
  
-static const unsigned max_sysvals[] = {
-       [MESA_SHADER_VERTEX]  = 16,
-       [MESA_SHADER_TESS_CTRL] = 16,
-       [MESA_SHADER_TESS_EVAL] = 16,
-       [MESA_SHADER_GEOMETRY] = 16,
-       [MESA_SHADER_FRAGMENT] = 24,  // TODO
-       [MESA_SHADER_COMPUTE] = 16, // TODO how many do we actually need?
-       [MESA_SHADER_KERNEL]  = 16, // TODO how many do we actually need?
-};
-
  static void
  emit_instructions(struct ir3_context *ctx)
  {
-       unsigned ninputs, noutputs;
         nir_function_impl *fxn = nir_shader_get_entrypoint(ctx->s);
  
-       ninputs  = (max_drvloc(&ctx->s->inputs) + 1) * 4;
-       noutputs = (max_drvloc(&ctx->s->outputs) + 1) * 4;
+       ctx->ninputs  = (max_drvloc(&ctx->s->inputs) + 1) * 4;
+       ctx->noutputs = (max_drvloc(&ctx->s->outputs) + 1) * 4;
  
-       /* we need to leave room for sysvals:
-        */
-       ninputs += max_sysvals[ctx->so->type];
-       if (ctx->so->type == MESA_SHADER_VERTEX ||
-                       ctx->so->type == MESA_SHADER_TESS_EVAL)
-               noutputs += 8; /* gs or tess header + primitive_id */
+       ctx->inputs  = rzalloc_array(ctx, struct ir3_instruction *, ctx->ninputs);
+       ctx->outputs = rzalloc_array(ctx, struct ir3_instruction *, ctx->noutputs);
  
-       ctx->ir = ir3_create(ctx->compiler, ctx->so->type, ninputs, noutputs);
+       ctx->ir = ir3_create(ctx->compiler, ctx->so->type);
  
         /* Create inputs in first block: */
         ctx->block = get_block(ctx, nir_start_block(fxn));
@@ -3039,40 +3028,6 @@ emit_instructions(struct ir3_context *ctx)
                 setup_output(ctx, var);
         }
  
-       /* Set up the shared system values as outputs for the vertex and tess eval
-        * shaders so they don't clobber them for the next shader in the pipeline.
-        */
-       if (ctx->so->type == MESA_SHADER_VERTEX ||
-                       (has_gs && ctx->so->type == MESA_SHADER_TESS_EVAL)) {
-               struct ir3_shader_variant *so = ctx->so;
-               if (ctx->primitive_id) {
-                       unsigned n = so->outputs_count++;
-                       so->outputs[n].slot = VARYING_SLOT_PRIMITIVE_ID;
-                       so->outputs[n].regid = regid(n, 0);
-                       ctx->ir->outputs[n * 4] = ctx->primitive_id;
-
-                       compile_assert(ctx, n * 4 < ctx->ir->noutputs);
-               }
-
-               if (ctx->gs_header) {
-                       unsigned n = so->outputs_count++;
-                       so->outputs[n].slot = VARYING_SLOT_GS_HEADER_IR3;
-                       so->outputs[n].regid = regid(n, 0);
-                       ctx->ir->outputs[n * 4] = ctx->gs_header;
-
-                       compile_assert(ctx, n * 4 < ctx->ir->noutputs);
-               }
-
-               if (ctx->tcs_header) {
-                       unsigned n = so->outputs_count++;
-                       so->outputs[n].slot = VARYING_SLOT_TCS_HEADER_IR3;
-                       so->outputs[n].regid = regid(n, 0);
-                       ctx->ir->outputs[n * 4] = ctx->tcs_header;
-
-                       compile_assert(ctx, n * 4 < ctx->ir->noutputs);
-               }
-       }
-
         /* Find # of samplers: */
         nir_foreach_variable(var, &ctx->s->uniforms) {
                 ctx->so->num_samp += glsl_type_get_sampler_count(var->type);
@@ -3092,28 +3047,6 @@ emit_instructions(struct ir3_context *ctx)
         emit_function(ctx, fxn);
  }
  
-/* from NIR perspective, we actually have varying inputs.  But the varying
- * inputs, from an IR standpoint, are just bary.f/ldlv instructions.  The
- * only actual inputs are the sysvals.
- */
-static void
-fixup_frag_inputs(struct ir3_context *ctx)
-{
-       struct ir3_shader_variant *so = ctx->so;
-       struct ir3 *ir = ctx->ir;
-       unsigned i = 0;
-
-       /* sysvals should appear at the end of the inputs, drop everything else: */
-       while ((i < so->inputs_count) && !so->inputs[i].sysval)
-               i++;
-
-       /* at IR level, inputs are always blocks of 4 scalars: */
-       i *= 4;
-
-       ir->inputs = &ir->inputs[i];
-       ir->ninputs -= i;
-}
-
  /* Fixup tex sampler state for astc/srgb workaround instructions.  We
   * need to assign the tex state indexes for these after we know the
   * max tex index.
@@ -3155,23 +3088,44 @@ fixup_binning_pass(struct ir3_context *ctx)
         struct ir3 *ir = ctx->ir;
         unsigned i, j;
  
+       /* first pass, remove unused outputs from the IR level outputs: */
+       for (i = 0, j = 0; i < ir->outputs_count; i++) {
+               struct ir3_instruction *out = ir->outputs[i];
+               assert(out->opc == OPC_META_COLLECT);
+               unsigned outidx = out->collect.outidx;
+               unsigned slot = so->outputs[outidx].slot;
+
+               /* throw away everything but first position/psize */
+               if ((slot == VARYING_SLOT_POS) || (slot == VARYING_SLOT_PSIZ)) {
+                       ir->outputs[j] = ir->outputs[i];
+                       j++;
+               }
+       }
+       ir->outputs_count = j;
+
+       /* second pass, cleanup the unused slots in ir3_shader_variant::outputs
+        * table:
+        */
         for (i = 0, j = 0; i < so->outputs_count; i++) {
                 unsigned slot = so->outputs[i].slot;
  
                 /* throw away everything but first position/psize */
                 if ((slot == VARYING_SLOT_POS) || (slot == VARYING_SLOT_PSIZ)) {
-                       if (i != j) {
-                               so->outputs[j] = so->outputs[i];
-                               ir->outputs[(j*4)+0] = ir->outputs[(i*4)+0];
-                               ir->outputs[(j*4)+1] = ir->outputs[(i*4)+1];
-                               ir->outputs[(j*4)+2] = ir->outputs[(i*4)+2];
-                               ir->outputs[(j*4)+3] = ir->outputs[(i*4)+3];
+                       so->outputs[j] = so->outputs[i];
+
+                       /* fixup outidx to point to new output table entry: */
+                       struct ir3_instruction *out;
+                       foreach_output(out, ir) {
+                               if (out->collect.outidx == i) {
+                                       out->collect.outidx = j;
+                                       break;
+                               }
                         }
+
                         j++;
                 }
         }
         so->outputs_count = j;
-       ir->noutputs = j * 4;
  }
  
  static void
@@ -3215,8 +3169,6 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler,
  {
         struct ir3_context *ctx;
         struct ir3 *ir;
-       struct ir3_instruction **inputs;
-       unsigned i;
         int ret = 0, max_bary;
  
         assert(!so->ir);
@@ -3238,12 +3190,81 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler,
  
         ir = so->ir = ctx->ir;
  
-       /* keep track of the inputs from TGSI perspective.. */
-       inputs = ir->inputs;
+       assert((ctx->noutputs % 4) == 0);
  
-       /* but fixup actual inputs for frag shader: */
-       if (so->type == MESA_SHADER_FRAGMENT)
-               fixup_frag_inputs(ctx);
+       /* Setup IR level outputs, which are "collects" that gather
+        * the scalar components of outputs.
+        */
+       for (unsigned i = 0; i < ctx->noutputs; i += 4) {
+               unsigned ncomp = 0;
+               /* figure out the # of components written:
+                *
+                * TODO do we need to handle holes, ie. if .x and .z
+                * components written, but .y component not written?
+                */
+               for (unsigned j = 0; j < 4; j++) {
+                       if (!ctx->outputs[i + j])
+                               break;
+                       ncomp++;
+               }
+
+               /* Note that in some stages, like TCS, store_output is
+                * lowered to memory writes, so no components of the
+                * are "written" from the PoV of traditional store-
+                * output instructions:
+                */
+               if (!ncomp)
+                       continue;
+
+               struct ir3_instruction *out =
+                       ir3_create_collect(ctx, &ctx->outputs[i], ncomp);
+
+               int outidx = i / 4;
+               assert(outidx < so->outputs_count);
+
+               /* stash index into so->outputs[] so we can map the
+                * output back to slot/etc later:
+                */
+               out->collect.outidx = outidx;
+
+               array_insert(ir, ir->outputs, out);
+       }
+
+       /* Set up the gs header as an output for the vertex shader so it won't
+        * clobber it for the tess ctrl shader.
+        *
+        * TODO this could probably be done more cleanly in a nir pass.
+        */
+       if (ctx->so->type == MESA_SHADER_VERTEX ||
+                       (ctx->so->key.has_gs && ctx->so->type == MESA_SHADER_TESS_EVAL)) {
+               if (ctx->primitive_id) {
+                       unsigned n = so->outputs_count++;
+                       so->outputs[n].slot = VARYING_SLOT_PRIMITIVE_ID;
+
+                       struct ir3_instruction *out =
+                               ir3_create_collect(ctx, &ctx->primitive_id, 1);
+                       out->collect.outidx = n;
+                       array_insert(ir, ir->outputs, out);
+               }
+
+               if (ctx->gs_header) {
+                       unsigned n = so->outputs_count++;
+                       so->outputs[n].slot = VARYING_SLOT_GS_HEADER_IR3;
+                       struct ir3_instruction *out =
+                               ir3_create_collect(ctx, &ctx->gs_header, 1);
+                       out->collect.outidx = n;
+                       array_insert(ir, ir->outputs, out);
+               }
+
+               if (ctx->tcs_header) {
+                       unsigned n = so->outputs_count++;
+                       so->outputs[n].slot = VARYING_SLOT_TCS_HEADER_IR3;
+                       struct ir3_instruction *out =
+                               ir3_create_collect(ctx, &ctx->tcs_header, 1);
+                       out->collect.outidx = n;
+                       array_insert(ir, ir->outputs, out);
+               }
+       }
  
         /* at this point, for binning pass, throw away unneeded outputs: */
         if (so->binning_pass && (ctx->compiler->gpu_id < 600))
@@ -3267,8 +3288,8 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler,
          */
         if (ctx->compiler->gpu_id >= 600 && so->binning_pass &&
                         so->type == MESA_SHADER_VERTEX) {
-               for (int i = 0; i < ir->ninputs; i++) {
-                       struct ir3_instruction *in = ir->inputs[i];
+               for (int i = 0; i < ctx->ninputs; i++) {
+                       struct ir3_instruction *in = ctx->inputs[i];
  
                         if (!in)
                                 continue;
@@ -3287,20 +3308,6 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler,
                 }
         }
  
-       /* Insert mov if there's same instruction for each output.
-        * eg. dEQP-GLES31.functional.shaders.opaque_type_indexing.sampler.const_expression.vertex.sampler2dshadow
-        */
-       for (int i = ir->noutputs - 1; i >= 0; i--) {
-               if (!ir->outputs[i])
-                       continue;
-               for (unsigned j = 0; j < i; j++) {
-                       if (ir->outputs[i] == ir->outputs[j]) {
-                               ir->outputs[i] =
-                                       ir3_MOV(ir->outputs[i]->block, ir->outputs[i], TYPE_F32);
-                       }
-               }
-       }
-
         ir3_debug_print(ir, "BEFORE GROUPING");
  
         ir3_sched_add_deps(ir);
@@ -3342,8 +3349,8 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler,
                         so->binning_pass;
  
         if (pre_assign_inputs) {
-               for (unsigned i = 0; i < ir->ninputs; i++) {
-                       struct ir3_instruction *instr = ir->inputs[i];
+               for (unsigned i = 0; i < ctx->ninputs; i++) {
+                       struct ir3_instruction *instr = ctx->inputs[i];
  
                         if (!instr)
                                 continue;
@@ -3355,7 +3362,7 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler,
                         instr->regs[0]->num = regid;
                 }
  
-               ret = ir3_ra(so, ir->inputs, ir->ninputs);
+               ret = ir3_ra(so, ctx->inputs, ctx->ninputs);
         } else if (ctx->tcs_header) {
                 /* We need to have these values in the same registers between VS and TCS
                  * since the VS chains to TCS and doesn't get the sysvals redelivered.
@@ -3406,48 +3413,36 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler,
         if (so->type == MESA_SHADER_FRAGMENT)
                 pack_inlocs(ctx);
  
-       /* fixup input/outputs: */
-       for (i = 0; i < so->outputs_count; i++) {
-               /* sometimes we get outputs that don't write the .x coord, like:
-                *
-                *   decl_var shader_out INTERP_MODE_NONE float Color (VARYING_SLOT_VAR9.z, 1, 0)
-                *
-                * Presumably the result of varying packing and then eliminating
-                * some unneeded varyings?  Just skip head to the first valid
-                * component of the output.
-                */
-               for (unsigned j = 0; j < 4; j++) {
-                       struct ir3_instruction *instr = ir->outputs[(i*4) + j];
-                       if (instr) {
-                               so->outputs[i].regid = instr->regs[0]->num;
-                               so->outputs[i].half  = !!(instr->regs[0]->flags & IR3_REG_HALF);
-                               break;
-                       }
-               }
-       }
+       /*
+        * Fixup inputs/outputs to point to the actual registers assigned:
+        *
+        * 1) initialize to r63.x (invalid/unused)
+        * 2) iterate IR level inputs/outputs and update the variants
+        *    inputs/outputs table based on the assigned registers for
+        *    the remaining inputs/outputs.
+        */
  
-       /* Note that some or all channels of an input may be unused: */
-       for (i = 0; i < so->inputs_count; i++) {
-               unsigned j, reg = regid(63,0);
-               bool half = false;
-               for (j = 0; j < 4; j++) {
-                       struct ir3_instruction *in = inputs[(i*4) + j];
+       for (unsigned i = 0; i < so->inputs_count; i++)
+               so->inputs[i].regid = regid(63, 0);
+       for (unsigned i = 0; i < so->outputs_count; i++)
+               so->outputs[i].regid = regid(63, 0);
  
-                       if (!in)
-                               continue;
+       struct ir3_instruction *out;
+       foreach_output(out, ir) {
+               assert(out->opc == OPC_META_COLLECT);
+               unsigned outidx = out->collect.outidx;
  
-                       if (in->flags & IR3_INSTR_UNUSED)
-                               continue;
+               so->outputs[outidx].regid = out->regs[0]->num;
+               so->outputs[outidx].half  = !!(out->regs[0]->flags & IR3_REG_HALF);
+       }
  
-                       reg = in->regs[0]->num - j;
-                       if (half) {
-                               compile_assert(ctx, in->regs[0]->flags & IR3_REG_HALF);
-                       } else {
-                               half = !!(in->regs[0]->flags & IR3_REG_HALF);
-                       }
-               }
-               so->inputs[i].regid = reg;
-               so->inputs[i].half  = half;
+       struct ir3_instruction *in;
+       foreach_input(in, ir) {
+               assert(in->opc == OPC_META_INPUT);
+               unsigned inidx = in->input.inidx;
+
+               so->inputs[inidx].regid = in->regs[0]->num;
+               so->inputs[inidx].half  = !!(in->regs[0]->flags & IR3_REG_HALF);
         }
  
         if (ctx->astc_srgb)
diff --git a/src/freedreno/ir3/ir3_context.h b/src/freedreno/ir3/ir3_context.h

index bb283a76326a7f6ff42e6485f4ef626beec63e36..1ce5c6776cad5cbea123e215ba5ab9597c844dee 100644 (file)
--- a/src/freedreno/ir3/ir3_context.h
+++ b/src/freedreno/ir3/ir3_context.h
@@ -52,6 +52,18 @@ struct ir3_context {
         struct ir3 *ir;
         struct ir3_shader_variant *so;
  
+       /* Tables of scalar inputs/outputs.  Because of the way varying packing
+        * works, we could have inputs w/ fractional location, which is a bit
+        * awkward to deal with unless we keep track of the split scalar in/
+        * out components.
+        *
+        * These *only* have inputs/outputs that are touched by load_*input and
+        * store_output.
+        */
+       unsigned ninputs, noutputs;
+       struct ir3_instruction **inputs;
+       struct ir3_instruction **outputs;
+
         struct ir3_block *block;      /* the current block */
         struct ir3_block *in_block;   /* block created for shader inputs */
  
diff --git a/src/freedreno/ir3/ir3_group.c b/src/freedreno/ir3/ir3_group.c

index a0b853ca1590e1738cee24665c1e1b4969270806..f86397565f0e646395f068cf64c42adb1cd58d6b 100644 (file)
--- a/src/freedreno/ir3/ir3_group.c
+++ b/src/freedreno/ir3/ir3_group.c
@@ -39,41 +39,6 @@ struct group_ops {
         void (*insert_mov)(void *arr, int idx, struct ir3_instruction *instr);
  };
  
-static struct ir3_instruction *arr_get(void *arr, int idx)
-{
-       return ((struct ir3_instruction **)arr)[idx];
-}
-static void arr_insert_mov_out(void *arr, int idx, struct ir3_instruction *instr)
-{
-       ((struct ir3_instruction **)arr)[idx] =
-                       ir3_MOV(instr->block, instr, TYPE_F32);
-}
-static void arr_insert_mov_in(void *arr, int idx, struct ir3_instruction *instr)
-{
-       /* so, we can't insert a mov in front of a meta:in.. and the downstream
-        * instruction already has a pointer to 'instr'.  So we cheat a bit and
-        * morph the meta:in instruction into a mov and insert a new meta:in
-        * in front.
-        */
-       struct ir3_instruction *in;
-
-       debug_assert(instr->regs_count == 1);
-
-       in = ir3_instr_create(instr->block, OPC_META_INPUT);
-       in->input.sysval = instr->input.sysval;
-       __ssa_dst(in);
-
-       /* create src reg for meta:in and fixup to now be a mov: */
-       __ssa_src(instr, in, 0);
-       instr->opc = OPC_MOV;
-       instr->cat1.src_type = TYPE_F32;
-       instr->cat1.dst_type = TYPE_F32;
-
-       ((struct ir3_instruction **)arr)[idx] = in;
-}
-static struct group_ops arr_ops_out = { arr_get, arr_insert_mov_out };
-static struct group_ops arr_ops_in = { arr_get, arr_insert_mov_in };
-
  static struct ir3_instruction *instr_get(void *arr, int idx)
  {
         return ssa(((struct ir3_instruction *)arr)->regs[idx+1]);
@@ -192,61 +157,11 @@ instr_find_neighbors(struct ir3_instruction *instr)
                 instr_find_neighbors(src);
  }
  
-/* a bit of sadness.. we can't have "holes" in inputs from PoV of
- * register assignment, they still need to be grouped together.  So
- * we need to insert dummy/padding instruction for grouping, and
- * then take it back out again before anyone notices.
- */
-static void
-pad_and_group_input(struct ir3_instruction **input, unsigned n)
-{
-       int i, mask = 0;
-       struct ir3_block *block = NULL;
-
-       for (i = n - 1; i >= 0; i--) {
-               struct ir3_instruction *instr = input[i];
-               if (instr) {
-                       block = instr->block;
-               } else if (block) {
-                       instr = ir3_NOP(block);
-                       __ssa_dst(instr);          /* dummy dst */
-                       input[i] = instr;
-                       mask |= (1 << i);
-               }
-       }
-
-       group_n(&arr_ops_in, input, n);
-
-       for (i = 0; i < n; i++) {
-               if (mask & (1 << i))
-                       input[i] = NULL;
-       }
-}
-
  static void
  find_neighbors(struct ir3 *ir)
  {
         unsigned i;
  
-       /* shader inputs/outputs themselves must be contiguous as well:
-        *
-        * NOTE: group inputs first, since we only insert mov's
-        * *before* the conflicted instr (and that would go badly
-        * for inputs).  By doing inputs first, we should never
-        * have a conflict on inputs.. pushing any conflict to
-        * resolve to the outputs, for stuff like:
-        *
-        *     MOV OUT[n], IN[m].wzyx
-        *
-        * NOTE: we assume here inputs/outputs are grouped in vec4.
-        * This logic won't quite cut it if we don't align smaller
-        * on vec4 boundaries
-        */
-       for (i = 0; i < ir->ninputs; i += 4)
-               pad_and_group_input(&ir->inputs[i], 4);
-       for (i = 0; i < ir->noutputs; i += 4)
-               group_n(&arr_ops_out, &ir->outputs[i], 4);
-
         struct ir3_instruction *out;
         foreach_output(out, ir)
                 instr_find_neighbors(out);
author	Rob Clark <robdclark@chromium.org>
	Fri, 25 Oct 2019 22:37:56 +0000 (15:37 -0700)
committer	Rob Clark <robdclark@chromium.org>
	Tue, 12 Nov 2019 21:57:52 +0000 (13:57 -0800)
src/freedreno/ir3/ir3.c		patch \| blob \| history
src/freedreno/ir3/ir3.h		patch \| blob \| history
src/freedreno/ir3/ir3_compiler_nir.c		patch \| blob \| history
src/freedreno/ir3/ir3_context.h		patch \| blob \| history
src/freedreno/ir3/ir3_group.c		patch \| blob \| history