From: Christoph Bumiller Date: Thu, 3 May 2012 10:50:08 +0000 (+0200) Subject: nv50: implement stream output X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=02fac2930581b9bea9f6d221eb6d6b471fc3b9c6;p=mesa.git nv50: implement stream output --- diff --git a/src/gallium/drivers/nv50/nv50_context.h b/src/gallium/drivers/nv50/nv50_context.h index 1cee0e06c02..44a0ba0f561 100644 --- a/src/gallium/drivers/nv50/nv50_context.h +++ b/src/gallium/drivers/nv50/nv50_context.h @@ -48,6 +48,7 @@ #define NV50_NEW_CONSTBUF (1 << 18) #define NV50_NEW_TEXTURES (1 << 19) #define NV50_NEW_SAMPLERS (1 << 20) +#define NV50_NEW_STRMOUT (1 << 21) #define NV50_NEW_CONTEXT (1 << 31) #define NV50_BIND_FB 0 @@ -56,9 +57,10 @@ #define NV50_BIND_INDEX 3 #define NV50_BIND_TEXTURES 4 #define NV50_BIND_CB(s, i) (5 + 16 * (s) + (i)) -#define NV50_BIND_SCREEN 53 -#define NV50_BIND_TLS 54 -#define NV50_BIND_COUNT 55 +#define NV50_BIND_SO 53 +#define NV50_BIND_SCREEN 54 +#define NV50_BIND_TLS 55 +#define NV50_BIND_COUNT 56 #define NV50_BIND_2D 0 #define NV50_BIND_M2MF 0 #define NV50_BIND_FENCE 1 @@ -92,11 +94,13 @@ struct nv50_context { boolean point_sprite; boolean rt_serialize; boolean flushed; + boolean rasterizer_discard; uint8_t tls_required; uint8_t num_vtxbufs; uint8_t num_vtxelts; uint8_t num_textures[3]; uint8_t num_samplers[3]; + uint8_t prim_size; uint16_t scissor; } state; @@ -126,6 +130,10 @@ struct nv50_context { struct nv50_tsc_entry *samplers[3][PIPE_MAX_SAMPLERS]; unsigned num_samplers[3]; + uint8_t num_so_targets; + uint8_t so_targets_dirty; + struct pipe_stream_output_target *so_target[4]; + struct pipe_framebuffer_state framebuffer; struct pipe_blend_color blend_colour; struct pipe_stencil_ref stencil_ref; @@ -168,6 +176,14 @@ extern struct draw_stage *nv50_draw_render_stage(struct nv50_context *); /* nv50_query.c */ void nv50_init_query_functions(struct nv50_context *); +void nv50_query_pushbuf_submit(struct nouveau_pushbuf *, + struct pipe_query *, unsigned result_offset); +void nv84_query_fifo_wait(struct nouveau_pushbuf *, struct pipe_query *); +void nva0_so_target_save_offset(struct pipe_context *, + struct pipe_stream_output_target *, + unsigned index, boolean seralize); + +#define NVA0_QUERY_STREAM_OUTPUT_BUFFER_OFFSET (PIPE_QUERY_TYPES + 0) /* nv50_shader_state.c */ void nv50_vertprog_validate(struct nv50_context *); @@ -177,6 +193,7 @@ void nv50_fp_linkage_validate(struct nv50_context *); void nv50_gp_linkage_validate(struct nv50_context *); void nv50_constbufs_validate(struct nv50_context *); void nv50_validate_derived_rs(struct nv50_context *); +void nv50_stream_output_validate(struct nv50_context *); /* nv50_state.c */ extern void nv50_init_state_functions(struct nv50_context *); diff --git a/src/gallium/drivers/nv50/nv50_program.c b/src/gallium/drivers/nv50/nv50_program.c index 1b2e2934b79..48e8db333f9 100644 --- a/src/gallium/drivers/nv50/nv50_program.c +++ b/src/gallium/drivers/nv50/nv50_program.c @@ -235,6 +235,59 @@ nv50_program_assign_varying_slots(struct nv50_ir_prog_info *info) } } +static struct nv50_stream_output_state * +nv50_program_create_strmout_state(const struct nv50_ir_prog_info *info, + const struct pipe_stream_output_info *pso) +{ + struct nv50_stream_output_state *so; + unsigned b, i, c; + unsigned base[4]; + + so = MALLOC_STRUCT(nv50_stream_output_state); + if (!so) + return NULL; + memset(so->map, 0xff, sizeof(so->map)); + + for (b = 0; b < 4; ++b) + so->num_attribs[b] = 0; + for (i = 0; i < pso->num_outputs; ++i) { + unsigned end = pso->output[i].dst_offset + pso->output[i].num_components; + b = pso->output[i].output_buffer; + assert(b < 4); + so->num_attribs[b] = MAX2(so->num_attribs[b], end); + } + + so->ctrl = NV50_3D_STRMOUT_BUFFERS_CTRL_INTERLEAVED; + + so->stride[0] = pso->stride[0] * 4; + base[0] = 0; + for (b = 1; b < 4; ++b) { + assert(!so->num_attribs[b] || so->num_attribs[b] == pso->stride[b]); + so->stride[b] = so->num_attribs[b] * 4; + if (so->num_attribs[b]) + so->ctrl = (b + 1) << NV50_3D_STRMOUT_BUFFERS_CTRL_SEPARATE__SHIFT; + base[b] = align(base[b - 1] + so->num_attribs[b - 1], 4); + } + if (so->ctrl & NV50_3D_STRMOUT_BUFFERS_CTRL_INTERLEAVED) { + assert(so->stride[0] < NV50_3D_STRMOUT_BUFFERS_CTRL_STRIDE__MAX); + so->ctrl |= so->stride[0] << NV50_3D_STRMOUT_BUFFERS_CTRL_STRIDE__SHIFT; + } + + so->map_size = base[3] + so->num_attribs[3]; + + for (i = 0; i < pso->num_outputs; ++i) { + const unsigned s = pso->output[i].start_component; + const unsigned p = pso->output[i].dst_offset; + const unsigned r = pso->output[i].register_index; + b = pso->output[i].output_buffer; + + for (c = 0; c < pso->output[i].num_components; ++c) + so->map[base[b] + p + c] = info->out[r].slot[s + c]; + } + + return so; +} + boolean nv50_program_translate(struct nv50_program *prog, uint16_t chipset) { @@ -293,6 +346,10 @@ nv50_program_translate(struct nv50_program *prog, uint16_t chipset) prog->fp.flags[0] |= NV50_3D_FP_CONTROL_USES_KIL; } + if (prog->pipe.stream_output.num_outputs) + prog->so = nv50_program_create_strmout_state(info, + &prog->pipe.stream_output); + out: FREE(info); return !ret; diff --git a/src/gallium/drivers/nv50/nv50_program.h b/src/gallium/drivers/nv50/nv50_program.h index 92361ad9946..f56268b5439 100644 --- a/src/gallium/drivers/nv50/nv50_program.h +++ b/src/gallium/drivers/nv50/nv50_program.h @@ -42,6 +42,15 @@ struct nv50_varying { ubyte si; /* semantic index */ }; +struct nv50_stream_output_state +{ + uint32_t ctrl; + uint16_t stride[4]; + uint8_t num_attribs[4]; + uint8_t map_size; + uint8_t map[128]; +}; + struct nv50_program { struct pipe_shader_state pipe; @@ -88,6 +97,8 @@ struct nv50_program { void *fixups; /* relocation records */ struct nouveau_heap *mem; + + struct nv50_stream_output_state *so; }; boolean nv50_program_translate(struct nv50_program *, uint16_t chipset); diff --git a/src/gallium/drivers/nv50/nv50_push.c b/src/gallium/drivers/nv50/nv50_push.c index 04e32b7e8b9..3abe189e7b5 100644 --- a/src/gallium/drivers/nv50/nv50_push.c +++ b/src/gallium/drivers/nv50/nv50_push.c @@ -210,7 +210,8 @@ nv50_push_vbo(struct nv50_context *nv50, const struct pipe_draw_info *info) { struct push_context ctx; unsigned i, index_size; - unsigned inst = info->instance_count; + unsigned inst_count = info->instance_count; + unsigned vert_count = info->count; boolean apply_bias = info->indexed && info->index_bias; ctx.push = nv50->base.pushbuf; @@ -242,6 +243,17 @@ nv50_push_vbo(struct nv50_context *nv50, const struct pipe_draw_info *info) ctx.primitive_restart = info->primitive_restart; ctx.restart_index = info->restart_index; } else { + if (unlikely(info->count_from_stream_output)) { + struct pipe_context *pipe = &nv50->base.pipe; + struct nv50_so_target *targ; + targ = nv50_so_target(info->count_from_stream_output); + if (!targ->pq) { + NOUVEAU_ERR("draw_stream_output not supported on pre-NVA0 cards\n"); + return; + } + pipe->get_query_result(pipe, targ->pq, TRUE, (void *)&vert_count); + vert_count /= targ->stride; + } ctx.idxbuf = NULL; index_size = 0; ctx.primitive_restart = FALSE; @@ -262,21 +274,21 @@ nv50_push_vbo(struct nv50_context *nv50, const struct pipe_draw_info *info) } nv50->state.prim_restart = info->primitive_restart; - while (inst--) { + while (inst_count--) { BEGIN_NV04(ctx.push, NV50_3D(VERTEX_BEGIN_GL), 1); PUSH_DATA (ctx.push, ctx.prim); switch (index_size) { case 0: - emit_vertices_seq(&ctx, info->start, info->count); + emit_vertices_seq(&ctx, info->start, vert_count); break; case 1: - emit_vertices_i08(&ctx, info->start, info->count); + emit_vertices_i08(&ctx, info->start, vert_count); break; case 2: - emit_vertices_i16(&ctx, info->start, info->count); + emit_vertices_i16(&ctx, info->start, vert_count); break; case 4: - emit_vertices_i32(&ctx, info->start, info->count); + emit_vertices_i32(&ctx, info->start, vert_count); break; default: assert(0); diff --git a/src/gallium/drivers/nv50/nv50_query.c b/src/gallium/drivers/nv50/nv50_query.c index 5275e74964a..8e62c5f11bc 100644 --- a/src/gallium/drivers/nv50/nv50_query.c +++ b/src/gallium/drivers/nv50/nv50_query.c @@ -36,7 +36,8 @@ struct nv50_query { uint32_t *data; - uint32_t type; + uint16_t type; + uint16_t index; uint32_t sequence; struct nouveau_bo *bo; uint32_t base; @@ -170,21 +171,15 @@ nv50_query_begin(struct pipe_context *pipe, struct pipe_query *pq) BEGIN_NV04(push, NV50_3D(SAMPLECNT_ENABLE), 1); PUSH_DATA (push, 1); break; - case PIPE_QUERY_PRIMITIVES_GENERATED: /* store before & after instead ? */ - PUSH_SPACE(push, 2); - BEGIN_NV04(push, NV50_3D(COUNTER_RESET), 1); - PUSH_DATA (push, NV50_3D_COUNTER_RESET_GENERATED_PRIMITIVES); + case PIPE_QUERY_PRIMITIVES_GENERATED: + nv50_query_get(push, q, 0x10, 0x06805002); break; case PIPE_QUERY_PRIMITIVES_EMITTED: - PUSH_SPACE(push, 2); - BEGIN_NV04(push, NV50_3D(COUNTER_RESET), 1); - PUSH_DATA (push, NV50_3D_COUNTER_RESET_TRANSFORM_FEEDBACK); + nv50_query_get(push, q, 0x10, 0x05805002); break; case PIPE_QUERY_SO_STATISTICS: - PUSH_SPACE(push, 3); - BEGIN_NI04(push, NV50_3D(COUNTER_RESET), 2); - PUSH_DATA (push, NV50_3D_COUNTER_RESET_TRANSFORM_FEEDBACK); - PUSH_DATA (push, NV50_3D_COUNTER_RESET_GENERATED_PRIMITIVES); + nv50_query_get(push, q, 0x20, 0x05805002); + nv50_query_get(push, q, 0x30, 0x06805002); break; case PIPE_QUERY_TIMESTAMP_DISJOINT: case PIPE_QUERY_TIME_ELAPSED: @@ -227,6 +222,9 @@ nv50_query_end(struct pipe_context *pipe, struct pipe_query *pq) case PIPE_QUERY_GPU_FINISHED: nv50_query_get(push, q, 0, 0x1000f010); break; + case NVA0_QUERY_STREAM_OUTPUT_BUFFER_OFFSET: + nv50_query_get(push, q, 0, 0x0d005002 | (q->index << 5)); + break; default: assert(0); break; @@ -247,6 +245,7 @@ nv50_query_result(struct pipe_context *pipe, struct pipe_query *pq, struct nv50_context *nv50 = nv50_context(pipe); struct nv50_query *q = nv50_query(pq); uint64_t *res64 = (uint64_t *)result; + uint32_t *res32 = (uint32_t *)result; boolean *res8 = (boolean *)result; uint64_t *data64 = (uint64_t *)q->data; @@ -275,11 +274,11 @@ nv50_query_result(struct pipe_context *pipe, struct pipe_query *pq, break; case PIPE_QUERY_PRIMITIVES_GENERATED: /* u64 count, u64 time */ case PIPE_QUERY_PRIMITIVES_EMITTED: /* u64 count, u64 time */ - res64[0] = data64[0]; + res64[0] = data64[0] - data64[2]; break; case PIPE_QUERY_SO_STATISTICS: - res64[0] = data64[0]; - res64[1] = data64[1]; + res64[0] = data64[0] - data64[4]; + res64[1] = data64[2] - data64[6]; break; case PIPE_QUERY_TIMESTAMP_DISJOINT: /* u32 sequence, u32 0, u64 time */ res64[0] = 1000000000; @@ -288,6 +287,9 @@ nv50_query_result(struct pipe_context *pipe, struct pipe_query *pq, case PIPE_QUERY_TIME_ELAPSED: res64[0] = data64[1] - data64[3]; break; + case NVA0_QUERY_STREAM_OUTPUT_BUFFER_OFFSET: + res32[0] = q->data[1]; + break; default: return FALSE; } @@ -295,6 +297,21 @@ nv50_query_result(struct pipe_context *pipe, struct pipe_query *pq, return TRUE; } +void +nv84_query_fifo_wait(struct nouveau_pushbuf *push, struct pipe_query *pq) +{ + struct nv50_query *q = nv50_query(pq); + unsigned offset = q->offset; + + PUSH_SPACE(push, 5); + PUSH_REFN (push, q->bo, NOUVEAU_BO_GART | NOUVEAU_BO_RD); + BEGIN_NV04(push, SUBC_3D(NV84_SUBCHAN_SEMAPHORE_ADDRESS_HIGH), 4); + PUSH_DATAh(push, q->bo->offset + offset); + PUSH_DATA (push, q->bo->offset + offset); + PUSH_DATA (push, q->sequence); + PUSH_DATA (push, NV84_SUBCHAN_SEMAPHORE_TRIGGER_ACQUIRE_EQUAL); +} + static void nv50_render_condition(struct pipe_context *pipe, struct pipe_query *pq, uint mode) @@ -324,6 +341,38 @@ nv50_render_condition(struct pipe_context *pipe, PUSH_DATA (push, NV50_3D_COND_MODE_RES_NON_ZERO); } +void +nv50_query_pushbuf_submit(struct nouveau_pushbuf *push, + struct pipe_query *pq, unsigned result_offset) +{ + struct nv50_query *q = nv50_query(pq); + + /* XXX: does this exist ? */ +#define NV50_IB_ENTRY_1_NO_PREFETCH (0 << (31 - 8)) + + nouveau_pushbuf_space(push, 0, 0, 1); + nouveau_pushbuf_data(push, q->bo, q->offset + result_offset, 4 | + NV50_IB_ENTRY_1_NO_PREFETCH); +} + +void +nva0_so_target_save_offset(struct pipe_context *pipe, + struct pipe_stream_output_target *ptarg, + unsigned index, boolean serialize) +{ + struct nv50_so_target *targ = nv50_so_target(ptarg); + + if (serialize) { + struct nouveau_pushbuf *push = nv50_context(pipe)->base.pushbuf; + PUSH_SPACE(push, 2); + BEGIN_NV04(push, SUBC_3D(NV50_GRAPH_SERIALIZE), 1); + PUSH_DATA (push, 0); + } + + nv50_query(targ->pq)->index = index; + nv50_query_end(pipe, targ->pq); +} + void nv50_init_query_functions(struct nv50_context *nv50) { diff --git a/src/gallium/drivers/nv50/nv50_screen.c b/src/gallium/drivers/nv50/nv50_screen.c index d72b6e91d2a..b341ade695e 100644 --- a/src/gallium/drivers/nv50/nv50_screen.c +++ b/src/gallium/drivers/nv50/nv50_screen.c @@ -73,6 +73,8 @@ nv50_screen_is_format_supported(struct pipe_screen *pscreen, static int nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) { + const uint16_t class_3d = nouveau_screen(pscreen)->class_3d; + switch (param) { case PIPE_CAP_MAX_COMBINED_SAMPLERS: return 64; @@ -95,7 +97,6 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_ANISOTROPIC_FILTER: case PIPE_CAP_SCALED_RESOLVE: return 1; - case PIPE_CAP_STREAM_OUTPUT_PAUSE_RESUME: case PIPE_CAP_SEAMLESS_CUBE_MAP: return nv50_screen(pscreen)->tesla->oclass >= NVA0_3D_CLASS; case PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE: @@ -121,11 +122,12 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_OCCLUSION_QUERY: return 1; case PIPE_CAP_MAX_STREAM_OUTPUT_BUFFERS: - return 0; + return 4; case PIPE_CAP_MAX_STREAM_OUTPUT_INTERLEAVED_COMPONENTS: - return 128; case PIPE_CAP_MAX_STREAM_OUTPUT_SEPARATE_COMPONENTS: - return 32; + return 64; + case PIPE_CAP_STREAM_OUTPUT_PAUSE_RESUME: + return (class_3d >= NVA0_3D_CLASS) ? 1 : 0; case PIPE_CAP_BLEND_EQUATION_SEPARATE: case PIPE_CAP_INDEP_BLEND_ENABLE: return 1; diff --git a/src/gallium/drivers/nv50/nv50_shader_state.c b/src/gallium/drivers/nv50/nv50_shader_state.c index aef3f129c81..d070f07bbbc 100644 --- a/src/gallium/drivers/nv50/nv50_shader_state.c +++ b/src/gallium/drivers/nv50/nv50_shader_state.c @@ -207,6 +207,8 @@ nv50_gmtyprog_validate(struct nv50_context *nv50) PUSH_DATA (push, gp->gp.vert_count); BEGIN_NV04(push, NV50_3D(GP_START_ID), 1); PUSH_DATA (push, gp->code_base); + + nv50->state.prim_size = gp->gp.prim_type; /* enum matches vertex count */ } nv50_program_update_context_state(nv50, gp, 2); @@ -278,6 +280,12 @@ nv50_validate_derived_rs(struct nv50_context *nv50) nv50_sprite_coords_validate(nv50); + if (nv50->state.rasterizer_discard != nv50->rast->pipe.rasterizer_discard) { + nv50->state.rasterizer_discard = nv50->rast->pipe.rasterizer_discard; + BEGIN_NV04(push, NV50_3D(RASTERIZE_ENABLE), 1); + PUSH_DATA (push, !nv50->rast->pipe.rasterizer_discard); + } + if (nv50->dirty & NV50_NEW_FRAGPROG) return; psize = nv50->state.semantic_psize & ~NV50_3D_SEMANTIC_PTSZ_PTSZ_EN__MASK; @@ -343,6 +351,7 @@ nv50_fp_linkage_validate(struct nv50_context *nv50) uint32_t colors = fp->fp.colors; uint32_t lin[4]; uint8_t map[64]; + uint8_t so_map[64]; if (!(nv50->dirty & (NV50_NEW_VERTPROG | NV50_NEW_FRAGPROG | @@ -411,6 +420,30 @@ nv50_fp_linkage_validate(struct nv50_context *nv50) if (nv50->rast->pipe.clamp_vertex_color) colors |= NV50_3D_SEMANTIC_COLOR_CLMP_EN; + if (unlikely(vp->so)) { + /* Slot i in STRMOUT_MAP specifies the offset where slot i in RESULT_MAP + * gets written. + * + * TODO: + * Inverting vp->so->map (output -> offset) would probably speed this up. + */ + memset(so_map, 0, sizeof(so_map)); + for (i = 0; i < vp->so->map_size; ++i) { + if (vp->so->map[i] == 0xff) + continue; + for (c = 0; c < m; ++c) + if (map[c] == vp->so->map[i] && !so_map[c]) + break; + if (c == m) { + c = m; + map[m++] = vp->so->map[i]; + } + so_map[c] = 0x80 | i; + } + for (c = m; c & 3; ++c) + so_map[c] = 0; + } + n = (m + 3) / 4; assert(m <= 64); @@ -451,6 +484,11 @@ nv50_fp_linkage_validate(struct nv50_context *nv50) BEGIN_NV04(push, NV50_3D(GP_ENABLE), 1); PUSH_DATA (push, nv50->gmtyprog ? 1 : 0); + + if (vp->so) { + BEGIN_NV04(push, NV50_3D(STRMOUT_MAP(0)), n); + PUSH_DATAp(push, so_map, n); + } } static int @@ -509,3 +547,75 @@ nv50_gp_linkage_validate(struct nv50_context *nv50) BEGIN_NV04(push, NV50_3D(VP_RESULT_MAP(0)), n); PUSH_DATAp(push, map, n); } + +void +nv50_stream_output_validate(struct nv50_context *nv50) +{ + struct nouveau_pushbuf *push = nv50->base.pushbuf; + struct nv50_stream_output_state *so; + uint32_t ctrl; + unsigned i; + unsigned prims = ~0; + + so = nv50->gmtyprog ? nv50->gmtyprog->so : nv50->vertprog->so; + + if (!so || !nv50->num_so_targets) { + BEGIN_NV04(push, NV50_3D(STRMOUT_ENABLE), 1); + PUSH_DATA (push, 0); + if (nv50->screen->base.class_3d < NVA0_3D_CLASS) { + BEGIN_NV04(push, NV50_3D(STRMOUT_PRIMITIVE_LIMIT), 1); + PUSH_DATA (push, 0); + } + BEGIN_NV04(push, NV50_3D(STRMOUT_PARAMS_LATCH), 1); + PUSH_DATA (push, 1); + return; + } + + ctrl = so->ctrl; + if (nv50->screen->base.class_3d >= NVA0_3D_CLASS) + ctrl |= NVA0_3D_STRMOUT_BUFFERS_CTRL_LIMIT_MODE_OFFSET; + + BEGIN_NV04(push, NV50_3D(STRMOUT_BUFFERS_CTRL), 1); + PUSH_DATA (push, ctrl); + + nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_SO); + + for (i = 0; i < nv50->num_so_targets; ++i) { + struct nv50_so_target *targ = nv50_so_target(nv50->so_target[i]); + struct nv04_resource *buf = nv04_resource(targ->pipe.buffer); + + const unsigned n = nv50->screen->base.class_3d >= NVA0_3D_CLASS ? 4 : 3; + + if (n == 4 && !targ->clean) + nv84_query_fifo_wait(push, targ->pq); + BEGIN_NV04(push, NV50_3D(STRMOUT_ADDRESS_HIGH(i)), n); + PUSH_DATAh(push, buf->address + targ->pipe.buffer_offset); + PUSH_DATA (push, buf->address + targ->pipe.buffer_offset); + PUSH_DATA (push, so->num_attribs[i]); + if (n == 4) { + PUSH_DATA(push, targ->pipe.buffer_size); + + BEGIN_NV04(push, NVA0_3D(STRMOUT_OFFSET(i)), 1); + if (!targ->clean) { + assert(targ->pq); + nv50_query_pushbuf_submit(push, targ->pq, 0x4); + } else { + PUSH_DATA(push, 0); + targ->clean = FALSE; + } + } else { + const unsigned limit = targ->pipe.buffer_size / + (so->stride[i] * nv50->state.prim_size); + prims = MIN2(prims, limit); + } + BCTX_REFN(nv50->bufctx_3d, SO, buf, WR); + } + if (prims != ~0) { + BEGIN_NV04(push, NV50_3D(STRMOUT_PRIMITIVE_LIMIT), 1); + PUSH_DATA (push, prims); + } + BEGIN_NV04(push, NV50_3D(STRMOUT_PARAMS_LATCH), 1); + PUSH_DATA (push, 1); + BEGIN_NV04(push, NV50_3D(STRMOUT_ENABLE), 1); + PUSH_DATA (push, 1); +} diff --git a/src/gallium/drivers/nv50/nv50_state.c b/src/gallium/drivers/nv50/nv50_state.c index 5b783da7ad7..5e32b2717fd 100644 --- a/src/gallium/drivers/nv50/nv50_state.c +++ b/src/gallium/drivers/nv50/nv50_state.c @@ -680,6 +680,9 @@ nv50_sp_state_create(struct pipe_context *pipe, prog->type = type; prog->pipe.tokens = tgsi_dup_tokens(cso->tokens); + if (cso->stream_output.num_outputs) + prog->pipe.stream_output = cso->stream_output; + return (void *)prog; } @@ -909,6 +912,90 @@ nv50_vertex_state_bind(struct pipe_context *pipe, void *hwcso) nv50->dirty |= NV50_NEW_VERTEX; } +static struct pipe_stream_output_target * +nv50_so_target_create(struct pipe_context *pipe, + struct pipe_resource *res, + unsigned offset, unsigned size) +{ + struct nv50_so_target *targ = MALLOC_STRUCT(nv50_so_target); + if (!targ) + return NULL; + + if (nouveau_context(pipe)->screen->class_3d >= NVA0_3D_CLASS) { + targ->pq = pipe->create_query(pipe, + NVA0_QUERY_STREAM_OUTPUT_BUFFER_OFFSET); + if (!targ->pq) { + FREE(targ); + return NULL; + } + } else { + targ->pq = NULL; + } + targ->clean = TRUE; + + targ->pipe.buffer_size = size; + targ->pipe.buffer_offset = offset; + targ->pipe.context = pipe; + targ->pipe.buffer = NULL; + pipe_resource_reference(&targ->pipe.buffer, res); + pipe_reference_init(&targ->pipe.reference, 1); + + return &targ->pipe; +} + +static void +nv50_so_target_destroy(struct pipe_context *pipe, + struct pipe_stream_output_target *ptarg) +{ + struct nv50_so_target *targ = nv50_so_target(ptarg); + if (targ->pq) + pipe->destroy_query(pipe, targ->pq); + FREE(targ); +} + +static void +nv50_set_stream_output_targets(struct pipe_context *pipe, + unsigned num_targets, + struct pipe_stream_output_target **targets, + unsigned append_mask) +{ + struct nv50_context *nv50 = nv50_context(pipe); + unsigned i; + boolean serialize = TRUE; + const boolean can_resume = nv50->screen->base.class_3d >= NVA0_3D_CLASS; + + assert(num_targets <= 4); + + for (i = 0; i < num_targets; ++i) { + const boolean changed = nv50->so_target[i] != targets[i]; + if (!changed && (append_mask & (1 << i))) + continue; + nv50->so_targets_dirty |= 1 << i; + + if (can_resume && changed && nv50->so_target[i]) { + nva0_so_target_save_offset(pipe, nv50->so_target[i], i, serialize); + serialize = FALSE; + } + + if (targets[i] && !(append_mask & (1 << i))) + nv50_so_target(targets[i])->clean = TRUE; + + pipe_so_target_reference(&nv50->so_target[i], targets[i]); + } + for (; i < nv50->num_so_targets; ++i) { + if (can_resume && nv50->so_target[i]) { + nva0_so_target_save_offset(pipe, nv50->so_target[i], i, serialize); + serialize = FALSE; + } + pipe_so_target_reference(&nv50->so_target[i], NULL); + nv50->so_targets_dirty |= 1 << i; + } + nv50->num_so_targets = num_targets; + + if (nv50->so_targets_dirty) + nv50->dirty |= NV50_NEW_STRMOUT; +} + void nv50_init_state_functions(struct nv50_context *nv50) { @@ -965,6 +1052,10 @@ nv50_init_state_functions(struct nv50_context *nv50) pipe->set_vertex_buffers = nv50_set_vertex_buffers; pipe->set_index_buffer = nv50_set_index_buffer; + pipe->create_stream_output_target = nv50_so_target_create; + pipe->stream_output_target_destroy = nv50_so_target_destroy; + pipe->set_stream_output_targets = nv50_set_stream_output_targets; + pipe->redefine_user_buffer = u_default_redefine_user_buffer; } diff --git a/src/gallium/drivers/nv50/nv50_state_validate.c b/src/gallium/drivers/nv50/nv50_state_validate.c index c19acf6c426..a95e96d3c51 100644 --- a/src/gallium/drivers/nv50/nv50_state_validate.c +++ b/src/gallium/drivers/nv50/nv50_state_validate.c @@ -360,6 +360,8 @@ static struct state_validate { { nv50_constbufs_validate, NV50_NEW_CONSTBUF }, { nv50_validate_textures, NV50_NEW_TEXTURES }, { nv50_validate_samplers, NV50_NEW_SAMPLERS }, + { nv50_stream_output_validate, NV50_NEW_STRMOUT | + NV50_NEW_VERTPROG | NV50_NEW_GMTYPROG }, { nv50_vertex_arrays_validate, NV50_NEW_VERTEX | NV50_NEW_ARRAYS } }; #define validate_list_len (sizeof(validate_list) / sizeof(validate_list[0])) diff --git a/src/gallium/drivers/nv50/nv50_stateobj.h b/src/gallium/drivers/nv50/nv50_stateobj.h index 188406da600..8a9260c937e 100644 --- a/src/gallium/drivers/nv50/nv50_stateobj.h +++ b/src/gallium/drivers/nv50/nv50_stateobj.h @@ -51,4 +51,17 @@ struct nv50_vertex_stateobj { struct nv50_vertex_element element[0]; }; +struct nv50_so_target { + struct pipe_stream_output_target pipe; + struct pipe_query *pq; + unsigned stride; + boolean clean; +}; + +static INLINE struct nv50_so_target * +nv50_so_target(struct pipe_stream_output_target *ptarg) +{ + return (struct nv50_so_target *)ptarg; +} + #endif diff --git a/src/gallium/drivers/nv50/nv50_vbo.c b/src/gallium/drivers/nv50/nv50_vbo.c index bc01e69decf..323677eaf80 100644 --- a/src/gallium/drivers/nv50/nv50_vbo.c +++ b/src/gallium/drivers/nv50/nv50_vbo.c @@ -405,6 +405,25 @@ nv50_prim_gl(unsigned prim) } } +/* For pre-nva0 transform feedback. */ +static const uint8_t nv50_pipe_prim_to_prim_size[PIPE_PRIM_MAX + 1] = +{ + [PIPE_PRIM_POINTS] = 1, + [PIPE_PRIM_LINES] = 2, + [PIPE_PRIM_LINE_LOOP] = 2, + [PIPE_PRIM_LINE_STRIP] = 2, + [PIPE_PRIM_TRIANGLES] = 3, + [PIPE_PRIM_TRIANGLE_STRIP] = 3, + [PIPE_PRIM_TRIANGLE_FAN] = 3, + [PIPE_PRIM_QUADS] = 3, + [PIPE_PRIM_QUAD_STRIP] = 3, + [PIPE_PRIM_POLYGON] = 3, + [PIPE_PRIM_LINES_ADJACENCY] = 2, + [PIPE_PRIM_LINE_STRIP_ADJACENCY] = 2, + [PIPE_PRIM_TRIANGLES_ADJACENCY] = 3, + [PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY] = 3 +}; + static void nv50_draw_arrays(struct nv50_context *nv50, unsigned mode, unsigned start, unsigned count, @@ -623,6 +642,51 @@ nv50_draw_elements(struct nv50_context *nv50, boolean shorten, } } +static void +nva0_draw_stream_output(struct nv50_context *nv50, + const struct pipe_draw_info *info) +{ + struct nouveau_pushbuf *push = nv50->base.pushbuf; + struct nv50_so_target *so = nv50_so_target(info->count_from_stream_output); + struct nv04_resource *res = nv04_resource(so->pipe.buffer); + unsigned num_instances = info->instance_count; + unsigned mode = nv50_prim_gl(info->mode); + + if (unlikely(nv50->screen->base.class_3d < NVA0_3D_CLASS)) { + /* A proper implementation without waiting doesn't seem possible, + * so don't bother. + */ + NOUVEAU_ERR("draw_stream_output not supported on pre-NVA0 cards\n"); + return; + } + + if (res->status & NOUVEAU_BUFFER_STATUS_GPU_WRITING) { + res->status &= ~NOUVEAU_BUFFER_STATUS_GPU_WRITING; + PUSH_SPACE(push, 4); + BEGIN_NV04(push, SUBC_3D(NV50_GRAPH_SERIALIZE), 1); + PUSH_DATA (push, 0); + BEGIN_NV04(push, NV50_3D(VERTEX_ARRAY_FLUSH), 1); + PUSH_DATA (push, 0); + } + + assert(num_instances); + do { + PUSH_SPACE(push, 8); + BEGIN_NV04(push, NV50_3D(VERTEX_BEGIN_GL), 1); + PUSH_DATA (push, mode); + BEGIN_NV04(push, NVA0_3D(DRAW_TFB_BASE), 1); + PUSH_DATA (push, 0); + BEGIN_NV04(push, NVA0_3D(DRAW_TFB_STRIDE), 1); + PUSH_DATA (push, 0); + BEGIN_NV04(push, NVA0_3D(DRAW_TFB_BYTES), 1); + nv50_query_pushbuf_submit(push, so->pq, 0x4); + BEGIN_NV04(push, NV50_3D(VERTEX_END_GL), 1); + PUSH_DATA (push, 0); + + mode |= NV50_3D_VERTEX_BEGIN_GL_INSTANCE_NEXT; + } while (--num_instances); +} + static void nv50_draw_vbo_kick_notify(struct nouveau_pushbuf *chan) { @@ -655,6 +719,9 @@ nv50_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info) if (nv50->vbo_user && !(nv50->dirty & (NV50_NEW_VERTEX | NV50_NEW_ARRAYS))) nv50_update_user_vbufs(nv50); + if (unlikely(nv50->num_so_targets && !nv50->gmtyprog)) + nv50->state.prim_size = nv50_pipe_prim_to_prim_size[info->mode]; + nv50_state_validate(nv50, ~0, 8); /* 8 as minimum, we use flush_notify */ push->kick_notify = nv50_draw_vbo_kick_notify; @@ -679,11 +746,7 @@ nv50_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info) nv50->base.vbo_dirty = FALSE; } - if (!info->indexed) { - nv50_draw_arrays(nv50, - info->mode, info->start, info->count, - info->instance_count); - } else { + if (info->indexed) { boolean shorten = info->max_index <= 65535; assert(nv50->idxbuf.buffer); @@ -713,6 +776,13 @@ nv50_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info) nv50_draw_elements(nv50, shorten, info->mode, info->start, info->count, info->instance_count, info->index_bias); + } else + if (unlikely(info->count_from_stream_output)) { + nva0_draw_stream_output(nv50, info); + } else { + nv50_draw_arrays(nv50, + info->mode, info->start, info->count, + info->instance_count); } push->kick_notify = nv50_default_kick_notify; diff --git a/src/gallium/drivers/nv50/nv50_winsys.h b/src/gallium/drivers/nv50/nv50_winsys.h index b36898dabe6..145ee70cb9f 100644 --- a/src/gallium/drivers/nv50/nv50_winsys.h +++ b/src/gallium/drivers/nv50/nv50_winsys.h @@ -49,6 +49,7 @@ PUSH_REFN(struct nouveau_pushbuf *push, struct nouveau_bo *bo, uint32_t flags) #define SUBC_3D(m) 3, (m) #define NV50_3D(n) SUBC_3D(NV50_3D_##n) +#define NVA0_3D(n) SUBC_3D(NVA0_3D_##n) #define SUBC_2D(m) 4, (m) #define NV50_2D(n) SUBC_2D(NV50_2D_##n)