diff --git a/src/gallium/drivers/radeonsi/si_state_streamout.c b/src/gallium/drivers/radeonsi/si_state_streamout.c
index e7058f19a8a..85ac4a119c5 100644
--- a/src/gallium/drivers/radeonsi/si_state_streamout.c
+++ b/src/gallium/drivers/radeonsi/si_state_streamout.c
@@ -50,7 +50,8 @@ si_create_so_target(struct pipe_context *ctx,
 		return NULL;
 	}
 
-	u_suballocator_alloc(sctx->allocator_zeroed_memory, 4, 4,
+	unsigned buf_filled_size_size = sctx->screen->use_ngg_streamout ? 8 : 4;
+	u_suballocator_alloc(sctx->allocator_zeroed_memory, buf_filled_size_size, 4,
 			     &t->buf_filled_size_offset,
 			     (struct pipe_resource**)&t->buf_filled_size);
 	if (!t->buf_filled_size) {
@@ -64,7 +65,7 @@ si_create_so_target(struct pipe_context *ctx,
 	t->b.buffer_offset = buffer_offset;
 	t->b.buffer_size = buffer_size;
 
-	util_range_add(&buf->valid_buffer_range, buffer_offset,
+	util_range_add(&buf->b.b, &buf->valid_buffer_range, buffer_offset,
 		       buffer_offset + buffer_size);
 	return &t->b;
 }
@@ -95,6 +96,7 @@ static void si_set_streamout_targets(struct pipe_context *ctx,
 	struct si_context *sctx = (struct si_context *)ctx;
 	unsigned old_num_targets = sctx->streamout.num_targets;
 	unsigned i;
+	bool wait_now = false;
 
 	/* We are going to unbind the buffers. Mark which caches need to be flushed. */
 	if (sctx->streamout.num_targets && sctx->streamout.begin_emitted) {
@@ -121,17 +123,35 @@ static void si_set_streamout_targets(struct pipe_context *ctx,
 		 * VS_PARTIAL_FLUSH is required if the buffers are going to be
 		 * used as an input immediately.
 		 */
-		sctx->flags |= SI_CONTEXT_INV_SMEM_L1 |
-			       SI_CONTEXT_INV_VMEM_L1 |
-			       SI_CONTEXT_VS_PARTIAL_FLUSH;
+		sctx->flags |= SI_CONTEXT_INV_SCACHE |
+			       SI_CONTEXT_INV_VCACHE;
+
+		/* The BUFFER_FILLED_SIZE is written using a PS_DONE event. */
+		if (sctx->screen->use_ngg_streamout) {
+			sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH;
+
+			/* Wait now. This is needed to make sure that GDS is not
+			 * busy at the end of IBs.
+			 *
+			 * Also, the next streamout operation will overwrite GDS,
+			 * so we need to make sure that it's idle.
+			 */
+			wait_now = true;
+		} else {
+			sctx->flags |= SI_CONTEXT_VS_PARTIAL_FLUSH;
+		}
 	}
 
 	/* All readers of the streamout targets need to be finished before we can
 	 * start writing to the targets.
 	 */
-	if (num_targets)
+	if (num_targets) {
+		if (sctx->screen->use_ngg_streamout)
+			si_allocate_gds(sctx);
+
 		sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
-			       SI_CONTEXT_CS_PARTIAL_FLUSH;
+			       SI_CONTEXT_CS_PARTIAL_FLUSH;
+	}
 
 	/* Streamout buffers must be bound in 2 places:
 	 * 1) in VGT by setting the VGT_STRMOUT registers
@@ -176,9 +196,16 @@ static void si_set_streamout_targets(struct pipe_context *ctx,
 		if (targets[i]) {
 			struct pipe_shader_buffer sbuf;
 			sbuf.buffer = targets[i]->buffer;
-			sbuf.buffer_offset = 0;
-			sbuf.buffer_size = targets[i]->buffer_offset +
-					   targets[i]->buffer_size;
+
+			if (sctx->screen->use_ngg_streamout) {
+				sbuf.buffer_offset = targets[i]->buffer_offset;
+				sbuf.buffer_size = targets[i]->buffer_size;
+			} else {
+				sbuf.buffer_offset = 0;
+				sbuf.buffer_size = targets[i]->buffer_offset +
+						   targets[i]->buffer_size;
+			}
+
 			si_set_rw_shader_buffer(sctx, SI_VS_STREAMOUT_BUF0 + i, &sbuf);
 			si_resource(targets[i]->buffer)->bind_history |= PIPE_BIND_STREAM_OUTPUT;
 		} else {
@@ -187,6 +214,77 @@ static void si_set_streamout_targets(struct pipe_context *ctx,
 	}
 	for (; i < old_num_targets; i++)
 		si_set_rw_shader_buffer(sctx, SI_VS_STREAMOUT_BUF0 + i, NULL);
+
+	if (wait_now)
+		sctx->emit_cache_flush(sctx);
+}
+
+static void gfx10_emit_streamout_begin(struct si_context *sctx)
+{
+	struct si_streamout_target **t = sctx->streamout.targets;
+	struct radeon_cmdbuf *cs = sctx->gfx_cs;
+	unsigned last_target = 0;
+
+	for (unsigned i = 0; i < sctx->streamout.num_targets; i++) {
+		if (t[i])
+			last_target = i;
+	}
+
+	for (unsigned i = 0; i < sctx->streamout.num_targets; i++) {
+		if (!t[i])
+			continue;
+
+		t[i]->stride_in_dw = sctx->streamout.stride_in_dw[i];
+
+		bool append = sctx->streamout.append_bitmask & (1 << i);
+		uint64_t va = 0;
+
+		if (append) {
+			radeon_add_to_buffer_list(sctx, sctx->gfx_cs,
+						  t[i]->buf_filled_size,
+						  RADEON_USAGE_READ,
+						  RADEON_PRIO_SO_FILLED_SIZE);
+
+			va = t[i]->buf_filled_size->gpu_address +
+			     t[i]->buf_filled_size_offset;
+		}
+
+		radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0));
+		radeon_emit(cs, S_411_SRC_SEL(append ? V_411_SRC_ADDR_TC_L2 : V_411_DATA) |
+				S_411_DST_SEL(V_411_GDS) |
+				S_411_CP_SYNC(i == last_target));
+		radeon_emit(cs, va);
+		radeon_emit(cs, va >> 32);
+		radeon_emit(cs, 4 * i); /* destination in GDS */
+		radeon_emit(cs, 0);
+		radeon_emit(cs, S_414_BYTE_COUNT_GFX9(4) |
+				S_414_DISABLE_WR_CONFIRM_GFX9(i != last_target));
+	}
+
+	sctx->streamout.begin_emitted = true;
+}
+
+static void gfx10_emit_streamout_end(struct si_context *sctx)
+{
+	struct si_streamout_target **t = sctx->streamout.targets;
+
+	for (unsigned i = 0; i < sctx->streamout.num_targets; i++) {
+		if (!t[i])
+			continue;
+
+		uint64_t va = t[i]->buf_filled_size->gpu_address + t[i]->buf_filled_size_offset;
+
+		si_cp_release_mem(sctx, sctx->gfx_cs, V_028A90_PS_DONE, 0,
+				  EOP_DST_SEL_TC_L2,
+				  EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM,
+				  EOP_DATA_SEL_GDS,
+				  t[i]->buf_filled_size, va,
+				  EOP_DATA_GDS(i, 1), 0);
+
+		t[i]->buf_filled_size_valid = true;
+	}
+
+	sctx->streamout.begin_emitted = false;
 }
 
 static void si_flush_vgt_streamout(struct si_context *sctx)
@@ -272,6 +370,11 @@ static void si_emit_streamout_begin(struct si_context *sctx)
 
 void si_emit_streamout_end(struct si_context *sctx)
 {
+	if (sctx->screen->use_ngg_streamout) {
+		gfx10_emit_streamout_end(sctx);
+		return;
+	}
+
 	struct radeon_cmdbuf *cs = sctx->gfx_cs;
 	struct si_streamout_target **t = sctx->streamout.targets;
 	unsigned i;
@@ -320,6 +423,8 @@ void si_emit_streamout_end(struct si_context *sctx)
 
 static void si_emit_streamout_enable(struct si_context *sctx)
 {
+	assert(!sctx->screen->use_ngg_streamout);
+
 	radeon_set_context_reg_seq(sctx->gfx_cs, R_028B94_VGT_STRMOUT_CONFIG, 2);
 	radeon_emit(sctx->gfx_cs,
 		    S_028B94_STREAMOUT_0_EN(si_get_strmout_en(sctx)) |
@@ -344,15 +449,17 @@ static void si_set_streamout_enable(struct si_context *sctx, bool enable)
 			(sctx->streamout.enabled_mask << 8) |
 			(sctx->streamout.enabled_mask << 12);
 
-	if ((old_strmout_en != si_get_strmout_en(sctx)) ||
-	    (old_hw_enabled_mask != sctx->streamout.hw_enabled_mask))
+	if (!sctx->screen->use_ngg_streamout &&
+	    ((old_strmout_en != si_get_strmout_en(sctx)) ||
+	     (old_hw_enabled_mask != sctx->streamout.hw_enabled_mask)))
 		si_mark_atom_dirty(sctx, &sctx->atoms.s.streamout_enable);
 }
 
 void si_update_prims_generated_query_state(struct si_context *sctx,
 					   unsigned type, int diff)
 {
-	if (type == PIPE_QUERY_PRIMITIVES_GENERATED) {
+	if (!sctx->screen->use_ngg_streamout &&
+	    type == PIPE_QUERY_PRIMITIVES_GENERATED) {
 		bool old_strmout_en = si_get_strmout_en(sctx);
 
 		sctx->streamout.num_prims_gen_queries += diff;
@@ -363,6 +470,11 @@ void si_update_prims_generated_query_state(struct si_context *sctx,
 
 		if (old_strmout_en != si_get_strmout_en(sctx))
 			si_mark_atom_dirty(sctx, &sctx->atoms.s.streamout_enable);
+
+		if (si_update_ngg(sctx)) {
+			si_shader_change_notify(sctx);
+			sctx->do_update_shaders = true;
+		}
 	}
 }
 
@@ -371,6 +483,11 @@ void si_init_streamout_functions(struct si_context *sctx)
 	sctx->b.create_stream_output_target = si_create_so_target;
 	sctx->b.stream_output_target_destroy = si_so_target_destroy;
 	sctx->b.set_stream_output_targets = si_set_streamout_targets;
-	sctx->atoms.s.streamout_begin.emit = si_emit_streamout_begin;
-	sctx->atoms.s.streamout_enable.emit = si_emit_streamout_enable;
+
+	if (sctx->screen->use_ngg_streamout) {
+		sctx->atoms.s.streamout_begin.emit = gfx10_emit_streamout_begin;
+	} else {
+		sctx->atoms.s.streamout_begin.emit = si_emit_streamout_begin;
+		sctx->atoms.s.streamout_enable.emit = si_emit_streamout_enable;
+	}
 }
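
For orientation, below is a minimal usage sketch, not part of the patch, showing how a Gallium frontend reaches the pipe_context hooks that si_init_streamout_functions installs above. The function name example_bind_streamout and the locals "ctx" and "buf" are illustrative assumptions; the create_stream_output_target, set_stream_output_targets and stream_output_target_destroy hooks are the ones wired up in the last hunk.

#include "pipe/p_context.h"
#include "pipe/p_state.h"

/* Usage sketch only (not part of the patch): bind one stream-output target,
 * draw, then unbind, using the hooks installed by si_init_streamout_functions. */
static void example_bind_streamout(struct pipe_context *ctx,
				   struct pipe_resource *buf)
{
	struct pipe_stream_output_target *so;
	unsigned offset = 0;	/* ~0u would mean "append after the saved filled size" */

	/* Reaches si_create_so_target, which suballocates the BUFFER_FILLED_SIZE
	 * slot (8 bytes with NGG streamout, 4 bytes otherwise). */
	so = ctx->create_stream_output_target(ctx, buf, 0, buf->width0);

	/* Reaches si_set_streamout_targets, which flushes the relevant caches and
	 * binds the range as the SI_VS_STREAMOUT_BUF0 RW shader buffer. */
	ctx->set_stream_output_targets(ctx, 1, &so, &offset);

	/* ... draw calls that write transform feedback ... */

	ctx->set_stream_output_targets(ctx, 0, NULL, NULL);	/* unbind */
	ctx->stream_output_target_destroy(ctx, so);
}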