freedreno/ir3: add transform-feedback support
author Rob Clark <robclark@freedesktop.org>
Sat, 25 Jul 2015 16:53:23 +0000 (12:53 -0400)
committer Rob Clark <robclark@freedesktop.org>
Mon, 27 Jul 2015 17:51:06 +0000 (13:51 -0400)
Signed-off-by: Rob Clark <robclark@freedesktop.org>
src/gallium/drivers/freedreno/freedreno_screen.c
src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
src/gallium/drivers/freedreno/ir3/ir3_shader.c
src/gallium/drivers/freedreno/ir3/ir3_shader.h

index b28d315cc1272e2b5c65b449809becec232e56c6..97e4161ede216867c62ab4594c2c1b848e5b5756 100644 (file)
@@ -227,9 +227,20 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
 
        /* Stream output. */
        case PIPE_CAP_MAX_STREAM_OUTPUT_BUFFERS:
+               if (is_a3xx(screen) || is_a4xx(screen))
+                       return PIPE_MAX_SO_BUFFERS;
+               return 0;
        case PIPE_CAP_STREAM_OUTPUT_PAUSE_RESUME:
+               if (is_a3xx(screen) || is_a4xx(screen))
+                       return 1;
+               return 0;
        case PIPE_CAP_MAX_STREAM_OUTPUT_SEPARATE_COMPONENTS:
+               if (is_a3xx(screen) || is_a4xx(screen))
+                       return 16;    /* should only be shader out limit? */
+               return 0;
        case PIPE_CAP_MAX_STREAM_OUTPUT_INTERLEAVED_COMPONENTS:
+               if (is_a3xx(screen) || is_a4xx(screen))
+                       return 16;    /* should only be shader out limit? */
                return 0;
 
        /* Geometry shader output, unsupported. */
index a4b27854433636bc2d48a3c47fe397e66dd71dbf..53faf16ae307d0801255978a1c0e8f5377825aee 100644 (file)
@@ -263,6 +263,7 @@ compile_init(struct ir3_compiler *compiler,
         *    4 * vec4            -  UBO addresses
         *    if (vertex shader) {
         *        1 * vec4        -  driver params (IR3_DP_*)
+        *        1 * vec4        -  stream-out addresses
         *    }
         *
         * TODO this could be made more dynamic, to at least skip sections
@@ -275,6 +276,8 @@ compile_init(struct ir3_compiler *compiler,
        if (so->type == SHADER_VERTEX) {
                /* one (vec4) slot for driver params (see ir3_driver_param): */
                so->first_immediate++;
+               /* one (vec4) slot for stream-output base addresses: */
+               so->first_immediate++;
        }
 
        return ctx;
@@ -1971,6 +1974,115 @@ emit_cf_list(struct ir3_compile *ctx, struct exec_list *list)
        }
 }
 
+/* emit stream-out code.  At this point, the current block is the original
+ * (nir) end block, and nir ensures that all flow control paths terminate
+ * into the end block.  We re-purpose the original end block to generate
+ * the 'if (vtxcnt < maxvtxcnt)' condition, then append the conditional
+ * block holding stream-out write instructions, followed by the new end
+ * block:
+ *
+ *   blockOrigEnd {
+ *      p0.x = (vtxcnt < maxvtxcnt)
+ *      // succs: blockStreamOut, blockNewEnd
+ *   }
+ *   blockStreamOut {
+ *      ... stream-out instructions ...
+ *      // succs: blockNewEnd
+ *   }
+ *   blockNewEnd {
+ *   }
+ *
+ * On entry ctx->block is the original end block (the caller asserts that
+ * it is still empty); on return ctx->block is the new end block, into
+ * which the caller emits the 'end' instruction.
+ *
+ * Inputs consumed by the generated code:
+ *   - vtxcnt:     sysval input (IR3_SEMANTIC_VTXCNT)
+ *   - maxvtxcnt:  driver param IR3_DP_VTXCNT_MAX
+ *   - bases:      per-buffer addresses read from the vec4 const at
+ *                 'first_driver_param + 5' (the slot following the four
+ *                 UBO address slots and the driver-params slot)
+ */
+static void
+emit_stream_out(struct ir3_compile *ctx)
+{
+       struct ir3_shader_variant *v = ctx->so;
+       struct ir3 *ir = ctx->ir;
+       struct pipe_stream_output_info *strmout =
+                       &ctx->so->shader->stream_output;
+       struct ir3_block *orig_end_block, *stream_out_block, *new_end_block;
+       struct ir3_instruction *vtxcnt, *maxvtxcnt, *cond;
+       struct ir3_instruction *bases[PIPE_MAX_SO_BUFFERS];
+
+       /* create vtxcnt input in input block at top of shader,
+        * so that it is seen as live over the entire duration
+        * of the shader:
+        */
+       vtxcnt = create_input(ctx->in_block, 0);
+       add_sysval_input(ctx, IR3_SEMANTIC_VTXCNT, vtxcnt);
+
+       maxvtxcnt = create_driver_param(ctx, IR3_DP_VTXCNT_MAX);
+
+       /* at this point, we are at the original 'end' block,
+        * re-purpose this block to stream-out condition, then
+        * append stream-out block and new-end block
+        */
+       orig_end_block = ctx->block;
+
+       stream_out_block = ir3_block_create(ir);
+       list_addtail(&stream_out_block->node, &ir->block_list);
+
+       new_end_block = ir3_block_create(ir);
+       list_addtail(&new_end_block->node, &ir->block_list);
+
+       orig_end_block->successors[0] = stream_out_block;
+       orig_end_block->successors[1] = new_end_block;
+       stream_out_block->successors[0] = new_end_block;
+
+       /* setup 'if (vtxcnt < maxvtxcnt)' condition.  ctx->block is
+        * still orig_end_block here, so the compare lands in that
+        * block; its result is written to p0.x:
+        */
+       cond = ir3_CMPS_S(ctx->block, vtxcnt, 0, maxvtxcnt, 0);
+       cond->regs[0]->num = regid(REG_P0, 0);
+       cond->cat2.condition = IR3_COND_LT;
+
+       /* condition goes on previous block to the conditional,
+        * since it is used to pick which of the two successor
+        * paths to take:
+        */
+       orig_end_block->condition = cond;
+
+       /* switch to stream_out_block to generate the stream-out
+        * instructions:
+        */
+       ctx->block = stream_out_block;
+
+       /* Calculate base addresses based on vtxcnt.  Instructions
+        * generated for bases not used in following loop will be
+        * stripped out in the backend.
+        *
+        * The address for buffer i is read from component i of the
+        * stream-out address const, and the per-vertex offset
+        * (vtxcnt * stride) is added to it.
+        */
+       for (unsigned i = 0; i < PIPE_MAX_SO_BUFFERS; i++) {
+               unsigned stride = strmout->stride[i];
+               struct ir3_instruction *base, *off;
+
+               base = create_uniform(ctx, regid(v->first_driver_param + 5, i));
+
+               /* byte offset of this vertex within the buffer; the
+                * stride is multiplied by 4 to convert it to bytes
+                * (NOTE(review): assumes stride[] is in dwords -- confirm).
+                * 24-bit should be enough:
+                */
+               off = ir3_MUL_U(ctx->block, vtxcnt, 0,
+                               create_immed(ctx->block, stride * 4), 0);
+
+               bases[i] = ir3_ADD_S(ctx->block, off, 0, base, 0);
+       }
+
+       /* Generate the per-output store instructions: one TYPE_U32
+        * store per written component.  The destination is the
+        * per-vertex base of the output's buffer, plus the component's
+        * position within the vertex, (dst_offset + j), multiplied by
+        * 4.  Each store is also added to ir->keeps (presumably so it
+        * is not discarded as unused, since nothing consumes its
+        * result -- TODO confirm).
+        */
+       for (unsigned i = 0; i < strmout->num_outputs; i++) {
+               for (unsigned j = 0; j < strmout->output[i].num_components; j++) {
+                       unsigned c = j + strmout->output[i].start_component;
+                       struct ir3_instruction *base, *out, *stg;
+
+                       base = bases[strmout->output[i].output_buffer];
+                       out = ctx->ir->outputs[regid(strmout->output[i].register_index, c)];
+
+                       stg = ir3_STG(ctx->block, base, 0, out, 0,
+                                       create_immed(ctx->block, 1), 0);
+                       stg->cat6.type = TYPE_U32;
+                       stg->cat6.dst_offset = (strmout->output[i].dst_offset + j) * 4;
+
+                       array_insert(ctx->ir->keeps, stg);
+               }
+       }
+
+       /* and finally switch to the new_end_block: */
+       ctx->block = new_end_block;
+}
+
 static void
 emit_function(struct ir3_compile *ctx, nir_function_impl *impl)
 {
@@ -1981,6 +2093,24 @@ emit_function(struct ir3_compile *ctx, nir_function_impl *impl)
         * into which we emit the 'end' instruction.
         */
        compile_assert(ctx, list_empty(&ctx->block->instr_list));
+
+       /* If stream-out (aka transform-feedback) enabled, emit the
+        * stream-out instructions, followed by a new empty block (into
+        * which the 'end' instruction lands).
+        *
+        * NOTE: it is done in this order, rather than inserting before
+        * we emit end_block, because NIR guarantees that all blocks
+        * flow into end_block, and that end_block has no successors.
+        * So by re-purposing end_block as the first block of stream-
+        * out, we guarantee that all exit paths flow into the stream-
+        * out instructions.
+        */
+       if ((ctx->so->shader->stream_output.num_outputs > 0) &&
+                       !ctx->so->key.binning_pass) {
+               debug_assert(ctx->so->type == SHADER_VERTEX);
+               emit_stream_out(ctx);
+       }
+
        ir3_END(ctx->block);
 }
 
index 166eb007dbba3b6cf6b2639bd1198cbb29af2111..312174c0c6dbf70afe9131768a963629102eeb55 100644 (file)
@@ -466,10 +466,10 @@ static void
 emit_ubos(struct ir3_shader_variant *v, struct fd_ringbuffer *ring,
                struct fd_constbuf_stateobj *constbuf)
 {
-       if (v->constlen > v->first_driver_param) {
+       uint32_t offset = v->first_driver_param;  /* UBOs after user consts */
+       if (v->constlen > offset) {
                struct fd_context *ctx = fd_context(v->shader->pctx);
-               uint32_t offset = v->first_driver_param;  /* UBOs after user consts */
-               uint32_t params = MIN2(4, v->constlen - v->first_driver_param) * 4;
+               uint32_t params = MIN2(4, v->constlen - offset) * 4;
                uint32_t offsets[params];
                struct fd_bo *bos[params];
 
@@ -515,6 +515,83 @@ emit_immediates(struct ir3_shader_variant *v, struct fd_ringbuffer *ring)
        }
 }
 
+/* Emit the stream-out (transform feedback) buffer addresses.  They go in
+ * the const slot that follows the driver-params slot, one entry per
+ * buffer; nothing is emitted when the variant's constlen does not reach
+ * that slot.
+ */
+static void
+emit_tfbos(struct ir3_shader_variant *v, struct fd_ringbuffer *ring)
+{
+       /* streamout addresses after driver-params: */
+       uint32_t slot = v->first_driver_param + 5;
+
+       if (v->constlen <= slot)
+               return;
+
+       struct fd_context *ctx = fd_context(v->shader->pctx);
+       struct fd_streamout_stateobj *streamout = &ctx->streamout;
+       struct pipe_stream_output_info *so_info = &v->shader->stream_output;
+       uint32_t count = 4;
+       uint32_t bo_offsets[count];
+       struct fd_bo *bo_list[count];
+
+       for (uint32_t idx = 0; idx < count; idx++) {
+               struct pipe_stream_output_target *tgt = streamout->targets[idx];
+
+               if (!tgt) {
+                       /* nothing bound at this index: */
+                       bo_offsets[idx] = 0;
+                       bo_list[idx] = NULL;
+                       continue;
+               }
+
+               /* start at the target's own offset into the buffer, plus
+                * the streamout offset scaled by the per-vertex stride
+                * ('* 4' presumably converts dwords to bytes -- confirm):
+                */
+               bo_offsets[idx] = target_offset_placeholder(tgt) +
+                               (streamout->offsets[idx] * so_info->stride[idx] * 4);
+               bo_list[idx] = fd_resource(tgt->buffer)->bo;
+       }
+
+       fd_wfi(ctx, ring);
+       ctx->emit_const_bo(ring, v->type, true, slot * 4, count, bo_list, bo_offsets);
+}
+
+/* Compute the value for the IR3_DP_VTXCNT_MAX driver param: the vertex
+ * count limit the shader compares vtxcnt against before executing its
+ * stream-out stores.
+ *
+ * Returns 0 when stream-out is not active for this variant (binning
+ * pass, shader has no stream-out outputs, or no targets are bound).  The
+ * caller also uses a zero result to skip emitting the buffer addresses.
+ */
+static uint32_t
+max_tf_vtx(struct ir3_shader_variant *v)
+{
+       struct fd_context *ctx = fd_context(v->shader->pctx);
+       struct fd_streamout_stateobj *so = &ctx->streamout;
+       struct pipe_stream_output_info *info = &v->shader->stream_output;
+       /* largest positive 32-bit value (presumably because the shader
+        * side uses a signed compare -- TODO confirm):
+        */
+       uint32_t maxvtxcnt = 0x7fffffff;
+
+       if (v->key.binning_pass)
+               return 0;
+       if (info->num_outputs == 0)
+               return 0;
+       if (so->num_targets == 0)
+               return 0;
+
+       /* offset to write to is:
+        *
+        *   total_vtxcnt = vtxcnt + offsets[i]
+        *   offset = total_vtxcnt * stride[i]
+        *
+        *   offset =   vtxcnt * stride[i]       ; calculated in shader
+        *            + offsets[i] * stride[i]   ; calculated at emit_tfbos()
+        *
+        * assuming for each vtx, each target buffer will have data written
+        * up to 'offset + stride[i]', that leaves maxvtxcnt as:
+        *
+        *   buffer_size = (maxvtxcnt * stride[i]) + stride[i]
+        *   maxvtxcnt   = (buffer_size - stride[i]) / stride[i]
+        *
+        * but shader is actually doing a less-than (rather than less-than-
+        * equal) check, so we can drop the -stride[i].
+        *
+        * TODO is assumption about `offset + stride[i]` legit?
+        *
+        * NOTE(review): the limit below does not subtract offsets[i], so
+        * it looks like it only holds while offsets[i] is zero -- confirm.
+        */
+       for (unsigned i = 0; i < so->num_targets; i++) {
+               struct pipe_stream_output_target *target = so->targets[i];
+               unsigned stride = info->stride[i] * 4;   /* convert dwords->bytes */
+
+               /* A zero stride (eg. a target bound at an index this shader
+                * does not write to) would be a division by zero; such a
+                * buffer places no limit on the vertex count, so skip it:
+                */
+               if (target && (stride > 0)) {
+                       uint32_t max = target->buffer_size / stride;
+                       maxvtxcnt = MIN2(maxvtxcnt, max);
+               }
+       }
+
+       return maxvtxcnt;
+}
+
 void
 ir3_emit_consts(struct ir3_shader_variant *v, struct fd_ringbuffer *ring,
                const struct pipe_draw_info *info, uint32_t dirty)
@@ -548,12 +625,19 @@ ir3_emit_consts(struct ir3_shader_variant *v, struct fd_ringbuffer *ring,
                uint32_t offset = v->first_driver_param + 4;  /* driver params after UBOs */
                if (v->constlen >= offset) {
                        uint32_t vertex_params[4] = {
-                               [IR3_DP_VTXID_BASE] = info->indexed ? info->index_bias : info->start,
+                               [IR3_DP_VTXID_BASE] = info->indexed ?
+                                               info->index_bias : info->start,
+                               [IR3_DP_VTXCNT_MAX] = max_tf_vtx(v),
                        };
 
                        fd_wfi(ctx, ring);
                        ctx->emit_const(ring, SHADER_VERTEX, offset * 4, 0,
                                        ARRAY_SIZE(vertex_params), vertex_params, NULL);
+
+                       /* if needed, emit stream-out buffer addresses: */
+                       if (vertex_params[IR3_DP_VTXCNT_MAX] > 0) {
+                               emit_tfbos(v, ring);
+                       }
                }
        }
 }
index 4cb252053243c2114b0aa561973ccbf7d6a97369..c0fd44d4ed10b27d709d576678d16368e04fe7da 100644 (file)
@@ -37,6 +37,7 @@
 /* driver param indices: */
 enum ir3_driver_param {
        IR3_DP_VTXID_BASE = 0,
+       IR3_DP_VTXCNT_MAX = 1,   /* stream-out vertex limit, see max_tf_vtx() */
 };
 
 /* internal semantic used for passing vtxcnt to vertex shader to