From: Rob Clark
Date: Sat, 25 Jul 2015 16:53:23 +0000 (-0400)
Subject: freedreno/ir3: add transform-feedback support
X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=98a4b111fbb9e3ae45e907ddd4d2407e5ab669ec;p=mesa.git

freedreno/ir3: add transform-feedback support

Signed-off-by: Rob Clark
---

diff --git a/src/gallium/drivers/freedreno/freedreno_screen.c b/src/gallium/drivers/freedreno/freedreno_screen.c
index b28d315cc12..97e4161ede2 100644
--- a/src/gallium/drivers/freedreno/freedreno_screen.c
+++ b/src/gallium/drivers/freedreno/freedreno_screen.c
@@ -227,9 +227,20 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
 
 	/* Stream output. */
 	case PIPE_CAP_MAX_STREAM_OUTPUT_BUFFERS:
+		if (is_a3xx(screen) || is_a4xx(screen))
+			return PIPE_MAX_SO_BUFFERS;
+		return 0;
 	case PIPE_CAP_STREAM_OUTPUT_PAUSE_RESUME:
+		if (is_a3xx(screen) || is_a4xx(screen))
+			return 1;
+		return 0;
 	case PIPE_CAP_MAX_STREAM_OUTPUT_SEPARATE_COMPONENTS:
+		if (is_a3xx(screen) || is_a4xx(screen))
+			return 16; /* should only be shader out limit? */
+		return 0;
 	case PIPE_CAP_MAX_STREAM_OUTPUT_INTERLEAVED_COMPONENTS:
+		if (is_a3xx(screen) || is_a4xx(screen))
+			return 16; /* should only be shader out limit? */
 		return 0;
 
 	/* Geometry shader output, unsupported.
*/
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
index a4b27854433..53faf16ae30 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
@@ -263,6 +263,7 @@ compile_init(struct ir3_compiler *compiler,
 	 *    4 * vec4 - UBO addresses
 	 *    if (vertex shader) {
 	 *        1 * vec4 - driver params (IR3_DP_*)
+	 *        1 * vec4 - stream-out addresses
 	 *    }
 	 *
 	 * TODO this could be made more dynamic, to at least skip sections
@@ -275,6 +276,8 @@ compile_init(struct ir3_compiler *compiler,
 	if (so->type == SHADER_VERTEX) {
 		/* one (vec4) slot for driver params (see ir3_driver_param): */
 		so->first_immediate++;
+		/* one (vec4) slot for stream-output base addresses: */
+		so->first_immediate++;
 	}
 
 	return ctx;
@@ -1971,6 +1974,115 @@ emit_cf_list(struct ir3_compile *ctx, struct exec_list *list)
 	}
 }
 
+/* emit stream-out code. At this point, the current block is the original
+ * (nir) end block, and nir ensures that all flow control paths terminate
+ * into the end block. We re-purpose the original end block to generate
+ * the 'if (vtxcnt < maxvtxcnt)' condition, then append the conditional
+ * block holding stream-out write instructions, followed by the new end
+ * block:
+ *
+ *   blockOrigEnd {
+ *      p0.x = (vtxcnt < maxvtxcnt)
+ *      // succs: blockStreamOut, blockNewEnd
+ *   }
+ *   blockStreamOut {
+ *      ... stream-out instructions ...
+ *      // succs: blockNewEnd
+ *   }
+ *   blockNewEnd {
+ *   }
+ */
+static void
+emit_stream_out(struct ir3_compile *ctx)
+{
+	struct ir3_shader_variant *v = ctx->so;
+	struct ir3 *ir = ctx->ir;
+	struct pipe_stream_output_info *strmout =
+			&ctx->so->shader->stream_output;
+	struct ir3_block *orig_end_block, *stream_out_block, *new_end_block;
+	struct ir3_instruction *vtxcnt, *maxvtxcnt, *cond;
+	struct ir3_instruction *bases[PIPE_MAX_SO_BUFFERS];
+
+	/* create vtxcnt input in input block at top of shader,
+	 * so that it is seen as live over the entire duration
+	 * of the shader:
+	 */
+	vtxcnt = create_input(ctx->in_block, 0);
+	add_sysval_input(ctx, IR3_SEMANTIC_VTXCNT, vtxcnt);
+
+	maxvtxcnt = create_driver_param(ctx, IR3_DP_VTXCNT_MAX);
+
+	/* at this point, we are at the original 'end' block,
+	 * re-purpose this block to stream-out condition, then
+	 * append stream-out block and new-end block
+	 */
+	orig_end_block = ctx->block;
+
+	stream_out_block = ir3_block_create(ir);
+	list_addtail(&stream_out_block->node, &ir->block_list);
+
+	new_end_block = ir3_block_create(ir);
+	list_addtail(&new_end_block->node, &ir->block_list);
+
+	orig_end_block->successors[0] = stream_out_block;
+	orig_end_block->successors[1] = new_end_block;
+	stream_out_block->successors[0] = new_end_block;
+
+	/* setup 'if (vtxcnt < maxvtxcnt)' condition: */
+	cond = ir3_CMPS_S(ctx->block, vtxcnt, 0, maxvtxcnt, 0);
+	cond->regs[0]->num = regid(REG_P0, 0);
+	cond->cat2.condition = IR3_COND_LT;
+
+	/* condition goes on previous block to the conditional,
+	 * since it is used to pick which of the two successor
+	 * paths to take:
+	 */
+	orig_end_block->condition = cond;
+
+	/* switch to stream_out_block to generate the stream-out
+	 * instructions:
+	 */
+	ctx->block = stream_out_block;
+
+	/* Calculate base addresses based on vtxcnt. Instructions
+	 * generated for bases not used in following loop will be
+	 * stripped out in the backend.
+	 */
+	for (unsigned i = 0; i < PIPE_MAX_SO_BUFFERS; i++) {
+		unsigned stride = strmout->stride[i];
+		struct ir3_instruction *base, *off;
+
+		base = create_uniform(ctx, regid(v->first_driver_param + 5, i));
+
+		/* 24-bit should be enough: */
+		off = ir3_MUL_U(ctx->block, vtxcnt, 0,
+				create_immed(ctx->block, stride * 4), 0);
+
+		bases[i] = ir3_ADD_S(ctx->block, off, 0, base, 0);
+	}
+
+	/* Generate the per-output store instructions: */
+	for (unsigned i = 0; i < strmout->num_outputs; i++) {
+		for (unsigned j = 0; j < strmout->output[i].num_components; j++) {
+			unsigned c = j + strmout->output[i].start_component;
+			struct ir3_instruction *base, *out, *stg;
+
+			base = bases[strmout->output[i].output_buffer];
+			out = ctx->ir->outputs[regid(strmout->output[i].register_index, c)];
+
+			stg = ir3_STG(ctx->block, base, 0, out, 0,
+					create_immed(ctx->block, 1), 0);
+			stg->cat6.type = TYPE_U32;
+			stg->cat6.dst_offset = (strmout->output[i].dst_offset + j) * 4;
+
+			array_insert(ctx->ir->keeps, stg);
+		}
+	}
+
+	/* and finally switch to the new_end_block: */
+	ctx->block = new_end_block;
+}
+
 static void
 emit_function(struct ir3_compile *ctx, nir_function_impl *impl)
 {
@@ -1981,6 +2093,24 @@ emit_function(struct ir3_compile *ctx, nir_function_impl *impl)
 	 * into which we emit the 'end' instruction.
 	 */
 	compile_assert(ctx, list_empty(&ctx->block->instr_list));
+
+	/* If stream-out (aka transform-feedback) enabled, emit the
+	 * stream-out instructions, followed by a new empty block (into
+	 * which the 'end' instruction lands).
+	 *
+	 * NOTE: it is done in this order, rather than inserting before
+	 * we emit end_block, because NIR guarantees that all blocks
+	 * flow into end_block, and that end_block has no successors.
+	 * So by re-purposing end_block as the first block of stream-
+	 * out, we guarantee that all exit paths flow into the stream-
+	 * out instructions.
+	 */
+	if ((ctx->so->shader->stream_output.num_outputs > 0) &&
+			!ctx->so->key.binning_pass) {
+		debug_assert(ctx->so->type == SHADER_VERTEX);
+		emit_stream_out(ctx);
+	}
+
 	ir3_END(ctx->block);
 }
 
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_shader.c b/src/gallium/drivers/freedreno/ir3/ir3_shader.c
index 166eb007dbb..312174c0c6d 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_shader.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_shader.c
@@ -466,10 +466,10 @@ static void
 emit_ubos(struct ir3_shader_variant *v, struct fd_ringbuffer *ring,
 		struct fd_constbuf_stateobj *constbuf)
 {
-	if (v->constlen > v->first_driver_param) {
+	uint32_t offset = v->first_driver_param; /* UBOs after user consts */
+	if (v->constlen > offset) {
 		struct fd_context *ctx = fd_context(v->shader->pctx);
-		uint32_t offset = v->first_driver_param; /* UBOs after user consts */
-		uint32_t params = MIN2(4, v->constlen - v->first_driver_param) * 4;
+		uint32_t params = MIN2(4, v->constlen - offset) * 4;
 		uint32_t offsets[params];
 		struct fd_bo *bos[params];
 
@@ -515,6 +515,83 @@ emit_immediates(struct ir3_shader_variant *v, struct fd_ringbuffer *ring)
 	}
 }
 
+/* emit stream-out buffers: */
+static void
+emit_tfbos(struct ir3_shader_variant *v, struct fd_ringbuffer *ring)
+{
+	uint32_t offset = v->first_driver_param + 5; /* streamout addresses after driver-params*/
+	if (v->constlen > offset) {
+		struct fd_context *ctx = fd_context(v->shader->pctx);
+		struct fd_streamout_stateobj *so = &ctx->streamout;
+		struct pipe_stream_output_info *info = &v->shader->stream_output;
+		uint32_t params = 4;
+		uint32_t offsets[params];
+		struct fd_bo *bos[params];
+
+		for (uint32_t i = 0; i < params; i++) {
+			struct pipe_stream_output_target *target = so->targets[i];
+
+			if (target) {
+				offsets[i] = (so->offsets[i] * info->stride[i] * 4) +
+						target->buffer_offset;
+				bos[i] = fd_resource(target->buffer)->bo;
+			} else {
+				offsets[i] = 0;
+				bos[i] = NULL;
+			}
+		}
+
+		fd_wfi(ctx, ring);
+		ctx->emit_const_bo(ring,
v->type, true, offset * 4, params, bos, offsets);
+	}
+}
+
+static uint32_t
+max_tf_vtx(struct ir3_shader_variant *v)
+{
+	struct fd_context *ctx = fd_context(v->shader->pctx);
+	struct fd_streamout_stateobj *so = &ctx->streamout;
+	struct pipe_stream_output_info *info = &v->shader->stream_output;
+	uint32_t maxvtxcnt = 0x7fffffff;
+
+	if (v->key.binning_pass)
+		return 0;
+	if (v->shader->stream_output.num_outputs == 0)
+		return 0;
+	if (so->num_targets == 0)
+		return 0;
+
+	/* offset to write to is:
+	 *
+	 *   total_vtxcnt = vtxcnt + offsets[i]
+	 *   offset = total_vtxcnt * stride[i]
+	 *
+	 *   offset = vtxcnt * stride[i] ; calculated in shader
+	 *          + offsets[i] * stride[i] ; calculated at emit_tfbos()
+	 *
+	 * assuming for each vtx, each target buffer will have data written
+	 * up to 'offset + stride[i]', that leaves maxvtxcnt as:
+	 *
+	 *   buffer_size = (maxvtxcnt * stride[i]) + stride[i]
+	 *   maxvtxcnt = (buffer_size - stride[i]) / stride[i]
+	 *
+	 * but shader is actually doing a less-than (rather than less-than-
+	 * equal) check, so we can drop the -stride[i].
+	 *
+	 * TODO is assumption about `offset + stride[i]` legit?
+	 */
+	for (unsigned i = 0; i < so->num_targets; i++) {
+		struct pipe_stream_output_target *target = so->targets[i];
+		unsigned stride = info->stride[i] * 4; /* convert dwords->bytes */
+		if (target) {
+			uint32_t max = target->buffer_size / stride;
+			maxvtxcnt = MIN2(maxvtxcnt, max);
+		}
+	}
+
+	return maxvtxcnt;
+}
+
 void
 ir3_emit_consts(struct ir3_shader_variant *v, struct fd_ringbuffer *ring,
 		const struct pipe_draw_info *info, uint32_t dirty)
@@ -548,12 +625,19 @@ ir3_emit_consts(struct ir3_shader_variant *v, struct fd_ringbuffer *ring,
 		uint32_t offset = v->first_driver_param + 4; /* driver params after UBOs */
 		if (v->constlen >= offset) {
 			uint32_t vertex_params[4] = {
-				[IR3_DP_VTXID_BASE] = info->indexed ? info->index_bias : info->start,
+				[IR3_DP_VTXID_BASE] = info->indexed ?
+						info->index_bias : info->start,
+				[IR3_DP_VTXCNT_MAX] = max_tf_vtx(v),
 			};
 
 			fd_wfi(ctx, ring);
 			ctx->emit_const(ring, SHADER_VERTEX, offset * 4, 0,
 					ARRAY_SIZE(vertex_params), vertex_params, NULL);
+
+			/* if needed, emit stream-out buffer addresses: */
+			if (vertex_params[IR3_DP_VTXCNT_MAX] > 0) {
+				emit_tfbos(v, ring);
+			}
 		}
 	}
 }
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_shader.h b/src/gallium/drivers/freedreno/ir3/ir3_shader.h
index 4cb25205324..c0fd44d4ed1 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_shader.h
+++ b/src/gallium/drivers/freedreno/ir3/ir3_shader.h
@@ -37,6 +37,7 @@
 /* driver param indices: */
 enum ir3_driver_param {
 	IR3_DP_VTXID_BASE = 0,
+	IR3_DP_VTXCNT_MAX = 1,
 };
 
 /* internal semantic used for passing vtxcnt to vertex shader to