From c8e70e64accc914c58533b8336873e0995e901e7 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Tue, 6 Aug 2013 06:42:22 +0200 Subject: [PATCH] radeonsi: add flexible shader descriptor management and use it for sampler views It moves all sampler view descriptors to a buffer. It supports partial resource updates and it can also unbind resources (required for FMASK texturing). The buffer contains all sampler view descriptors for one shader stage, represented as an array. On top of that, there are N arrays in the buffer, which are used to emulate context registers as implemented by the previous ASICs (each array is a context). This uses the RCU synchronization approach to avoid read-after-write hazards as discussed in the thread: "radeonsi: add FMASK texture binding slots and resource setup" CP DMA is used to clear the descriptors at context initialization and to copy the descriptors from one context to the next. v2: - use PKT3_DMA_DATA on CIK (I'll test CIK later) - turn the bool CP DMA parameters into self-explanatory flags - add a nice simple API for packet emission to radeon_winsys.h - use 256 contexts, 128 causes texture corruption in openarena --- src/gallium/drivers/radeonsi/Makefile.sources | 1 + src/gallium/drivers/radeonsi/r600_blit.c | 12 +- .../drivers/radeonsi/r600_hw_context.c | 22 +- src/gallium/drivers/radeonsi/radeonsi_pipe.c | 7 +- src/gallium/drivers/radeonsi/radeonsi_pipe.h | 19 +- src/gallium/drivers/radeonsi/si_descriptors.c | 355 ++++++++++++++++++ src/gallium/drivers/radeonsi/si_state.c | 47 +-- src/gallium/drivers/radeonsi/si_state.h | 56 +++ src/gallium/drivers/radeonsi/si_state_draw.c | 18 +- src/gallium/drivers/radeonsi/sid.h | 54 +++ src/gallium/winsys/radeon/drm/radeon_winsys.h | 12 + 11 files changed, 547 insertions(+), 56 deletions(-) create mode 100644 src/gallium/drivers/radeonsi/si_descriptors.c diff --git a/src/gallium/drivers/radeonsi/Makefile.sources b/src/gallium/drivers/radeonsi/Makefile.sources index 
b3ffa727eb5..68c82820015 100644 --- a/src/gallium/drivers/radeonsi/Makefile.sources +++ b/src/gallium/drivers/radeonsi/Makefile.sources @@ -10,6 +10,7 @@ C_SOURCES := \ r600_translate.c \ radeonsi_pm4.c \ radeonsi_compute.c \ + si_descriptors.c \ si_state.c \ si_state_streamout.c \ si_state_draw.c \ diff --git a/src/gallium/drivers/radeonsi/r600_blit.c b/src/gallium/drivers/radeonsi/r600_blit.c index bab108e7c91..bdd9bb43c10 100644 --- a/src/gallium/drivers/radeonsi/r600_blit.c +++ b/src/gallium/drivers/radeonsi/r600_blit.c @@ -70,12 +70,12 @@ static void r600_blitter_begin(struct pipe_context *ctx, enum r600_blitter_op op if (op & R600_SAVE_TEXTURES) { util_blitter_save_fragment_sampler_states( - rctx->blitter, rctx->ps_samplers.n_samplers, - (void**)rctx->ps_samplers.samplers); + rctx->blitter, rctx->samplers[PIPE_SHADER_FRAGMENT].n_samplers, + (void**)rctx->samplers[PIPE_SHADER_FRAGMENT].samplers); - util_blitter_save_fragment_sampler_views( - rctx->blitter, rctx->ps_samplers.n_views, - (struct pipe_sampler_view**)rctx->ps_samplers.views); + util_blitter_save_fragment_sampler_views(rctx->blitter, + util_last_bit(rctx->samplers[PIPE_SHADER_FRAGMENT].views.desc.enabled_mask), + rctx->samplers[PIPE_SHADER_FRAGMENT].views.views); } if ((op & R600_DISABLE_RENDER_COND) && rctx->current_render_cond) { @@ -224,7 +224,7 @@ void si_flush_depth_textures(struct r600_context *rctx, struct pipe_sampler_view *view; struct r600_texture *tex; - view = &textures->views[i]->base; + view = textures->views.views[i]; if (!view) continue; tex = (struct r600_texture *)view->texture; diff --git a/src/gallium/drivers/radeonsi/r600_hw_context.c b/src/gallium/drivers/radeonsi/r600_hw_context.c index 25c972bbf62..bc6ba0bd1f0 100644 --- a/src/gallium/drivers/radeonsi/r600_hw_context.c +++ b/src/gallium/drivers/radeonsi/r600_hw_context.c @@ -114,9 +114,17 @@ err: void si_need_cs_space(struct r600_context *ctx, unsigned num_dw, boolean count_draw_in) { + int i; + /* The number of dwords we 
already used in the CS so far. */ num_dw += ctx->cs->cdw; + for (i = 0; i < SI_NUM_ATOMS(ctx); i++) { + if (ctx->atoms.array[i]->dirty) { + num_dw += ctx->atoms.array[i]->num_dw; + } + } + if (count_draw_in) { /* The number of dwords all the dirty states would take. */ num_dw += ctx->pm4_dirty_cdwords; @@ -254,6 +262,15 @@ void si_context_flush(struct r600_context *ctx, unsigned flags) ctx->pm4_dirty_cdwords = 0; ctx->flags = 0; + /* set all valid group as dirty so they get reemited on + * next draw command + */ + si_pm4_reset_emitted(ctx); + + /* The CS initialization should be emitted before everything else. */ + si_pm4_emit(ctx, ctx->queued.named.init); + ctx->emitted.named.init = ctx->queued.named.init; + #if 0 if (streamout_suspended) { ctx->streamout_start = TRUE; @@ -266,10 +283,7 @@ void si_context_flush(struct r600_context *ctx, unsigned flags) r600_context_queries_resume(ctx); } - /* set all valid group as dirty so they get reemited on - * next draw command - */ - si_pm4_reset_emitted(ctx); + si_all_descriptors_begin_new_cs(ctx); } void si_context_emit_fence(struct r600_context *ctx, struct si_resource *fence_bo, unsigned offset, unsigned value) diff --git a/src/gallium/drivers/radeonsi/radeonsi_pipe.c b/src/gallium/drivers/radeonsi/radeonsi_pipe.c index b4a1ca93fe6..9afc7f2714b 100644 --- a/src/gallium/drivers/radeonsi/radeonsi_pipe.c +++ b/src/gallium/drivers/radeonsi/radeonsi_pipe.c @@ -178,6 +178,8 @@ static void r600_destroy_context(struct pipe_context *context) { struct r600_context *rctx = (struct r600_context *)context; + si_release_all_descriptors(rctx); + si_resource_reference(&rctx->border_color_table, NULL); if (rctx->dummy_pixel_shader) { @@ -231,12 +233,15 @@ static struct pipe_context *r600_create_context(struct pipe_screen *screen, void rctx->context.create_video_buffer = vl_video_buffer_create; } + rctx->cs = rctx->ws->cs_create(rctx->ws, RING_GFX, NULL); + + si_init_all_descriptors(rctx); + switch (rctx->chip_class) { case SI: case CIK: 
si_init_state_functions(rctx); LIST_INITHEAD(&rctx->active_query_list); - rctx->cs = rctx->ws->cs_create(rctx->ws, RING_GFX, NULL); rctx->max_db = 8; si_init_config(rctx); break; diff --git a/src/gallium/drivers/radeonsi/radeonsi_pipe.h b/src/gallium/drivers/radeonsi/radeonsi_pipe.h index 6fbe6539d87..674c6303b7a 100644 --- a/src/gallium/drivers/radeonsi/radeonsi_pipe.h +++ b/src/gallium/drivers/radeonsi/radeonsi_pipe.h @@ -94,11 +94,8 @@ struct si_cs_shader_state { struct si_pipe_compute *program; }; -/* needed for blitter save */ -#define NUM_TEX_UNITS 16 - struct r600_textures_info { - struct si_pipe_sampler_view *views[NUM_TEX_UNITS]; + struct si_sampler_views views; struct si_pipe_sampler_state *samplers[NUM_TEX_UNITS]; unsigned n_views; uint32_t depth_texture_mask; /* which textures are depth */ @@ -131,6 +128,9 @@ struct r600_constbuf_state uint32_t dirty_mask; }; +#define SI_NUM_ATOMS(rctx) (sizeof((rctx)->atoms)/sizeof((rctx)->atoms.array[0])) +#define SI_NUM_SHADERS (PIPE_SHADER_FRAGMENT+1) + struct r600_context { struct pipe_context context; struct blitter_context *blitter; @@ -142,6 +142,14 @@ struct r600_context { void *custom_dsa_flush_inplace; struct r600_screen *screen; struct radeon_winsys *ws; + + union { + struct { + struct si_atom *sampler_views[SI_NUM_SHADERS]; + }; + struct si_atom *array[0]; + } atoms; + struct si_vertex_element *vertex_elements; struct pipe_framebuffer_state framebuffer; unsigned pa_sc_line_stipple; @@ -161,8 +169,7 @@ struct r600_context { unsigned sprite_coord_enable; unsigned export_16bpc; struct r600_constbuf_state constbuf_state[PIPE_SHADER_TYPES]; - struct r600_textures_info vs_samplers; - struct r600_textures_info ps_samplers; + struct r600_textures_info samplers[SI_NUM_SHADERS]; struct si_resource *border_color_table; unsigned border_color_offset; diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c new file mode 100644 index 00000000000..f05c8f490bb --- /dev/null 
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c @@ -0,0 +1,355 @@ +/* + * Copyright 2013 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * Authors: + * Marek Olšák + */ + +#include "radeonsi_pipe.h" +#include "radeonsi_resource.h" +#include "radeonsi_shader.h" +#include "r600_hw_context_priv.h" + +#include "util/u_memory.h" + +#define SI_NUM_CONTEXTS 256 + +static const uint32_t null_desc[8]; /* zeros */ + +/* Set this if you want the 3D engine to wait until CP DMA is done. + * It should be set on the last CP DMA packet. */ +#define R600_CP_DMA_SYNC (1 << 0) /* R600+ */ + +/* Set this if the source data was used as a destination in a previous CP DMA + * packet. It's for preventing a read-after-write (RAW) hazard between two + * CP DMA packets. */ +#define SI_CP_DMA_RAW_WAIT (1 << 1) /* SI+ */ + +/* Emit a CP DMA packet to do a copy from one buffer to another. 
+ * The size must fit in bits [20:0]. + */ +static void si_emit_cp_dma_copy_buffer(struct r600_context *rctx, + uint64_t dst_va, uint64_t src_va, + unsigned size, unsigned flags) +{ + struct radeon_winsys_cs *cs = rctx->cs; + uint32_t sync_flag = flags & R600_CP_DMA_SYNC ? PKT3_CP_DMA_CP_SYNC : 0; + uint32_t raw_wait = flags & SI_CP_DMA_RAW_WAIT ? PKT3_CP_DMA_CMD_RAW_WAIT : 0; + + assert(size); + assert((size & ((1<<21)-1)) == size); + + if (rctx->chip_class >= CIK) { + radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0)); + radeon_emit(cs, sync_flag); /* CP_SYNC [31] */ + radeon_emit(cs, src_va); /* SRC_ADDR_LO [31:0] */ + radeon_emit(cs, src_va >> 32); /* SRC_ADDR_HI [31:0] */ + radeon_emit(cs, dst_va); /* DST_ADDR_LO [31:0] */ + radeon_emit(cs, dst_va >> 32); /* DST_ADDR_HI [31:0] */ + radeon_emit(cs, size | raw_wait); /* COMMAND [29:22] | BYTE_COUNT [20:0] */ + } else { + radeon_emit(cs, PKT3(PKT3_CP_DMA, 4, 0)); + radeon_emit(cs, src_va); /* SRC_ADDR_LO [31:0] */ + radeon_emit(cs, sync_flag | ((src_va >> 32) & 0xffff)); /* CP_SYNC [31] | SRC_ADDR_HI [15:0] */ + radeon_emit(cs, dst_va); /* DST_ADDR_LO [31:0] */ + radeon_emit(cs, (dst_va >> 32) & 0xffff); /* DST_ADDR_HI [15:0] */ + radeon_emit(cs, size | raw_wait); /* COMMAND [29:22] | BYTE_COUNT [20:0] */ + } +} + +/* Emit a CP DMA packet to clear a buffer. The size must fit in bits [20:0]. */ +static void si_emit_cp_dma_clear_buffer(struct r600_context *rctx, + uint64_t dst_va, unsigned size, + uint32_t clear_value, unsigned flags) +{ + struct radeon_winsys_cs *cs = rctx->cs; + uint32_t sync_flag = flags & R600_CP_DMA_SYNC ? PKT3_CP_DMA_CP_SYNC : 0; + uint32_t raw_wait = flags & SI_CP_DMA_RAW_WAIT ? 
PKT3_CP_DMA_CMD_RAW_WAIT : 0; + + assert(size); + assert((size & ((1<<21)-1)) == size); + + if (rctx->chip_class >= CIK) { + radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0)); + radeon_emit(cs, sync_flag | PKT3_CP_DMA_SRC_SEL(2)); /* CP_SYNC [31] | SRC_SEL[30:29] */ + radeon_emit(cs, clear_value); /* DATA [31:0] */ + radeon_emit(cs, 0); + radeon_emit(cs, dst_va); /* DST_ADDR_LO [31:0] */ + radeon_emit(cs, dst_va >> 32); /* DST_ADDR_HI [31:0] */ + radeon_emit(cs, size | raw_wait); /* COMMAND [29:22] | BYTE_COUNT [20:0] */ + } else { + radeon_emit(cs, PKT3(PKT3_CP_DMA, 4, 0)); + radeon_emit(cs, clear_value); /* DATA [31:0] */ + radeon_emit(cs, sync_flag | PKT3_CP_DMA_SRC_SEL(2)); /* CP_SYNC [31] | SRC_SEL[30:29] */ + radeon_emit(cs, dst_va); /* DST_ADDR_LO [31:0] */ + radeon_emit(cs, (dst_va >> 32) & 0xffff); /* DST_ADDR_HI [15:0] */ + radeon_emit(cs, size | raw_wait); /* COMMAND [29:22] | BYTE_COUNT [20:0] */ + } +} + +static void si_init_descriptors(struct r600_context *rctx, + struct si_descriptors *desc, + unsigned shader_userdata_reg, + unsigned element_dw_size, + unsigned num_elements, + void (*emit_func)(struct r600_context *ctx, struct si_atom *state)) +{ + uint64_t va; + + desc->atom.emit = emit_func; + desc->shader_userdata_reg = shader_userdata_reg; + desc->element_dw_size = element_dw_size; + desc->num_elements = num_elements; + desc->context_size = num_elements * element_dw_size * 4; + + desc->buffer = (struct si_resource*) + pipe_buffer_create(rctx->context.screen, PIPE_BIND_CUSTOM, + PIPE_USAGE_STATIC, + SI_NUM_CONTEXTS * desc->context_size); + + r600_context_bo_reloc(rctx, desc->buffer, RADEON_USAGE_READWRITE); + va = r600_resource_va(rctx->context.screen, &desc->buffer->b.b); + + /* We don't check for CS space here, because this should be called + * only once at context initialization. 
*/ + si_emit_cp_dma_clear_buffer(rctx, va, desc->buffer->b.b.width0, 0, + R600_CP_DMA_SYNC); +} + +static void si_release_descriptors(struct si_descriptors *desc) +{ + pipe_resource_reference((struct pipe_resource**)&desc->buffer, NULL); +} + +static void si_update_descriptors(struct si_descriptors *desc) +{ + if (desc->dirty_mask) { + desc->atom.num_dw = + 7 + /* copy */ + (4 + desc->element_dw_size) * util_bitcount(desc->dirty_mask) + /* update */ + 4; /* pointer update */ + desc->atom.dirty = true; + } else { + desc->atom.dirty = false; + } +} + +static void si_emit_shader_pointer(struct r600_context *rctx, + struct si_descriptors *desc) +{ + struct radeon_winsys_cs *cs = rctx->cs; + uint64_t va = r600_resource_va(rctx->context.screen, &desc->buffer->b.b) + + desc->current_context_id * desc->context_size; + + radeon_emit(cs, PKT3(PKT3_SET_SH_REG, 2, 0)); + radeon_emit(cs, (desc->shader_userdata_reg - SI_SH_REG_OFFSET) >> 2); + radeon_emit(cs, va); + radeon_emit(cs, va >> 32); +} + +static void si_emit_descriptors(struct r600_context *rctx, + struct si_descriptors *desc, + const uint32_t **descriptors) +{ + struct radeon_winsys_cs *cs = rctx->cs; + uint64_t va_base; + int packet_start; + int packet_size = 0; + int last_index = desc->num_elements; /* point to a non-existing element */ + unsigned dirty_mask = desc->dirty_mask; + unsigned new_context_id = (desc->current_context_id + 1) % SI_NUM_CONTEXTS; + + assert(dirty_mask); + + va_base = r600_resource_va(rctx->context.screen, &desc->buffer->b.b); + + /* Copy the descriptors to a new context slot. */ + si_emit_cp_dma_copy_buffer(rctx, + va_base + new_context_id * desc->context_size, + va_base + desc->current_context_id * desc->context_size, + desc->context_size, R600_CP_DMA_SYNC); + + va_base += new_context_id * desc->context_size; + + /* Update the descriptors. + * Updates of consecutive descriptors are merged to one WRITE_DATA packet. 
+ * + * XXX When unbinding lots of resources, consider clearing the memory + * with CP DMA instead of emitting zeros. + */ + while (dirty_mask) { + int i = u_bit_scan(&dirty_mask); + + assert(i < desc->num_elements); + + if (last_index+1 == i && packet_size) { + /* Append new data at the end of the last packet. */ + packet_size += desc->element_dw_size; + cs->buf[packet_start] = PKT3(PKT3_WRITE_DATA, packet_size, 0); + } else { + /* Start a new packet. */ + uint64_t va = va_base + i * desc->element_dw_size * 4; + + packet_start = cs->cdw; + packet_size = 2 + desc->element_dw_size; + + radeon_emit(cs, PKT3(PKT3_WRITE_DATA, packet_size, 0)); + radeon_emit(cs, PKT3_WRITE_DATA_DST_SEL(PKT3_WRITE_DATA_DST_SEL_MEM_SYNC) | + PKT3_WRITE_DATA_WR_CONFIRM | + PKT3_WRITE_DATA_ENGINE_SEL(PKT3_WRITE_DATA_ENGINE_SEL_ME)); + radeon_emit(cs, va & 0xFFFFFFFFUL); + radeon_emit(cs, (va >> 32UL) & 0xFFFFFFFFUL); + } + + radeon_emit_array(cs, descriptors[i], desc->element_dw_size); + + last_index = i; + } + + desc->dirty_mask = 0; + desc->current_context_id = new_context_id; + + /* Now update the shader userdata pointer. 
*/ + si_emit_shader_pointer(rctx, desc); +} + +static unsigned si_get_shader_user_data_base(unsigned shader) +{ + switch (shader) { + case PIPE_SHADER_VERTEX: + return R_00B130_SPI_SHADER_USER_DATA_VS_0; + case PIPE_SHADER_GEOMETRY: + return R_00B230_SPI_SHADER_USER_DATA_GS_0; + case PIPE_SHADER_FRAGMENT: + return R_00B030_SPI_SHADER_USER_DATA_PS_0; + default: + assert(0); + return 0; + } +} + +/* SAMPLER VIEWS */ + +static void si_emit_sampler_views(struct r600_context *rctx, struct si_atom *atom) +{ + struct si_sampler_views *views = (struct si_sampler_views*)atom; + + si_emit_descriptors(rctx, &views->desc, views->desc_data); +} + +static void si_init_sampler_views(struct r600_context *rctx, + struct si_sampler_views *views, + unsigned shader) +{ + si_init_descriptors(rctx, &views->desc, + si_get_shader_user_data_base(shader) + + SI_SGPR_RESOURCE * 4, + 8, 16, si_emit_sampler_views); +} + +static void si_release_sampler_views(struct si_sampler_views *views) +{ + int i; + + for (i = 0; i < Elements(views->views); i++) { + pipe_sampler_view_reference(&views->views[i], NULL); + } + si_release_descriptors(&views->desc); +} + +static void si_sampler_views_begin_new_cs(struct r600_context *rctx, + struct si_sampler_views *views) +{ + unsigned mask = views->desc.enabled_mask; + + /* Add relocations to the CS. 
*/ + while (mask) { + int i = u_bit_scan(&mask); + struct si_pipe_sampler_view *rview = + (struct si_pipe_sampler_view*)views->views[i]; + + r600_context_bo_reloc(rctx, rview->resource, RADEON_USAGE_READ); + } + + r600_context_bo_reloc(rctx, views->desc.buffer, RADEON_USAGE_READWRITE); + + si_emit_shader_pointer(rctx, &views->desc); +} + +void si_set_sampler_view(struct r600_context *rctx, unsigned shader, + unsigned slot, struct pipe_sampler_view *view, + unsigned *view_desc) +{ + struct si_sampler_views *views = &rctx->samplers[shader].views; + + if (views->views[slot] == view) + return; + + if (view) { + struct si_pipe_sampler_view *rview = + (struct si_pipe_sampler_view*)view; + + r600_context_bo_reloc(rctx, rview->resource, RADEON_USAGE_READ); + + pipe_sampler_view_reference(&views->views[slot], view); + views->desc_data[slot] = view_desc; + views->desc.enabled_mask |= 1 << slot; + } else { + pipe_sampler_view_reference(&views->views[slot], NULL); + views->desc_data[slot] = null_desc; + views->desc.enabled_mask &= ~(1 << slot); + } + + views->desc.dirty_mask |= 1 << slot; + si_update_descriptors(&views->desc); +} + +/* INIT/DEINIT */ + +void si_init_all_descriptors(struct r600_context *rctx) +{ + int i; + + for (i = 0; i < SI_NUM_SHADERS; i++) { + si_init_sampler_views(rctx, &rctx->samplers[i].views, i); + + rctx->atoms.sampler_views[i] = &rctx->samplers[i].views.desc.atom; + } +} + +void si_release_all_descriptors(struct r600_context *rctx) +{ + int i; + + for (i = 0; i < SI_NUM_SHADERS; i++) { + si_release_sampler_views(&rctx->samplers[i].views); + } +} + +void si_all_descriptors_begin_new_cs(struct r600_context *rctx) +{ + int i; + + for (i = 0; i < SI_NUM_SHADERS; i++) { + si_sampler_views_begin_new_cs(rctx, &rctx->samplers[i].views); + } +} diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c index d1e3c9d5279..7d637e75189 100644 --- a/src/gallium/drivers/radeonsi/si_state.c +++ 
b/src/gallium/drivers/radeonsi/si_state.c @@ -2493,26 +2493,17 @@ static void *si_create_sampler_state(struct pipe_context *ctx, } static struct si_pm4_state *si_set_sampler_views(struct r600_context *rctx, - unsigned count, - struct pipe_sampler_view **views, - struct r600_textures_info *samplers, - unsigned user_data_reg) + unsigned shader, unsigned count, + struct pipe_sampler_view **views) { - struct si_pipe_sampler_view **resource = (struct si_pipe_sampler_view **)views; + struct r600_textures_info *samplers = &rctx->samplers[shader]; + struct si_pipe_sampler_view **rviews = (struct si_pipe_sampler_view **)views; struct si_pm4_state *pm4 = si_pm4_alloc_state(rctx); - int i, j; - - if (!count) - goto out; + int i; si_pm4_inval_texture_cache(pm4); - si_pm4_sh_data_begin(pm4); for (i = 0; i < count; i++) { - pipe_sampler_view_reference( - (struct pipe_sampler_view **)&samplers->views[i], - views[i]); - if (views[i]) { struct r600_texture *rtex = (struct r600_texture*)views[i]->texture; @@ -2523,25 +2514,17 @@ static struct si_pm4_state *si_set_sampler_views(struct r600_context *rctx, samplers->depth_texture_mask &= ~(1 << i); } - si_pm4_add_bo(pm4, resource[i]->resource, RADEON_USAGE_READ); + si_set_sampler_view(rctx, shader, i, views[i], rviews[i]->state); } else { samplers->depth_texture_mask &= ~(1 << i); - } - - for (j = 0; j < Elements(resource[i]->state); ++j) { - si_pm4_sh_data_add(pm4, resource[i] ? 
resource[i]->state[j] : 0); + si_set_sampler_view(rctx, shader, i, NULL, NULL); } } - - for (i = count; i < NUM_TEX_UNITS; i++) { - if (samplers->views[i]) - pipe_sampler_view_reference((struct pipe_sampler_view **)&samplers->views[i], NULL); + for (; i < samplers->n_views; i++) { + si_set_sampler_view(rctx, shader, i, NULL, NULL); } - si_pm4_sh_data_end(pm4, user_data_reg, SI_SGPR_RESOURCE); - -out: - rctx->ps_samplers.n_views = count; + samplers->n_views = count; return pm4; } @@ -2551,8 +2534,7 @@ static void si_set_vs_sampler_views(struct pipe_context *ctx, unsigned count, struct r600_context *rctx = (struct r600_context *)ctx; struct si_pm4_state *pm4; - pm4 = si_set_sampler_views(rctx, count, views, &rctx->vs_samplers, - R_00B130_SPI_SHADER_USER_DATA_VS_0); + pm4 = si_set_sampler_views(rctx, PIPE_SHADER_VERTEX, count, views); si_pm4_set_state(rctx, vs_sampler_views, pm4); } @@ -2562,8 +2544,7 @@ static void si_set_ps_sampler_views(struct pipe_context *ctx, unsigned count, struct r600_context *rctx = (struct r600_context *)ctx; struct si_pm4_state *pm4; - pm4 = si_set_sampler_views(rctx, count, views, &rctx->ps_samplers, - R_00B030_SPI_SHADER_USER_DATA_PS_0); + pm4 = si_set_sampler_views(rctx, PIPE_SHADER_FRAGMENT, count, views); si_pm4_set_state(rctx, ps_sampler_views, pm4); } @@ -2646,7 +2627,7 @@ static void si_bind_vs_sampler_states(struct pipe_context *ctx, unsigned count, struct r600_context *rctx = (struct r600_context *)ctx; struct si_pm4_state *pm4; - pm4 = si_bind_sampler_states(rctx, count, states, &rctx->vs_samplers, + pm4 = si_bind_sampler_states(rctx, count, states, &rctx->samplers[PIPE_SHADER_VERTEX], R_00B130_SPI_SHADER_USER_DATA_VS_0); si_pm4_set_state(rctx, vs_sampler, pm4); } @@ -2656,7 +2637,7 @@ static void si_bind_ps_sampler_states(struct pipe_context *ctx, unsigned count, struct r600_context *rctx = (struct r600_context *)ctx; struct si_pm4_state *pm4; - pm4 = si_bind_sampler_states(rctx, count, states, &rctx->ps_samplers, + pm4 = 
si_bind_sampler_states(rctx, count, states, &rctx->samplers[PIPE_SHADER_FRAGMENT], R_00B030_SPI_SHADER_USER_DATA_PS_0); si_pm4_set_state(rctx, ps_sampler, pm4); } diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h index 7ce084e5794..610303bb9a5 100644 --- a/src/gallium/drivers/radeonsi/si_state.h +++ b/src/gallium/drivers/radeonsi/si_state.h @@ -29,6 +29,14 @@ #include "radeonsi_pm4.h" +/* This encapsulates a state or an operation which can be emitted into the GPU + * command stream. */ +struct si_atom { + void (*emit)(struct r600_context *ctx, struct si_atom *state); + unsigned num_dw; + bool dirty; +}; + struct si_state_blend { struct si_pm4_state pm4; uint32_t cb_target_mask; @@ -103,6 +111,46 @@ union si_state { struct si_pm4_state *array[0]; }; +#define NUM_TEX_UNITS 16 + +/* This represents resource descriptors in memory, such as buffer resources, + * image resources, and sampler states. + */ +struct si_descriptors { + struct si_atom atom; + + /* The size of one resource descriptor, in dwords. */ + unsigned element_dw_size; + /* The maximum number of resource descriptors. */ + unsigned num_elements; + + /* The buffer where resource descriptors are stored. */ + struct si_resource *buffer; + + /* The i-th bit is set if that element is dirty (changed but not emitted). */ + unsigned dirty_mask; + /* The i-th bit is set if that element is enabled (non-NULL resource). */ + unsigned enabled_mask; + + /* We can't update descriptors directly because the GPU might be + * reading them at the same time, so we have to update them + * in a copy-on-write manner. Each such copy is called a context, + * which is just another array of descriptors in the same buffer. */ + unsigned current_context_id; + /* The size of a context, should be equal to 4*element_dw_size*num_elements. */ + unsigned context_size; + + /* The shader userdata register where the 64-bit pointer to the descriptor + * array will be stored. 
*/ + unsigned shader_userdata_reg; +}; + +struct si_sampler_views { + struct si_descriptors desc; + struct pipe_sampler_view *views[NUM_TEX_UNITS]; + const uint32_t *desc_data[NUM_TEX_UNITS]; +}; + #define si_pm4_block_idx(member) \ (offsetof(union si_state, named.member) / sizeof(struct si_pm4_state *)) @@ -133,6 +181,14 @@ union si_state { } \ } while(0) +/* si_descriptors.c */ +void si_set_sampler_view(struct r600_context *rctx, unsigned shader, + unsigned slot, struct pipe_sampler_view *view, + unsigned *view_desc); +void si_init_all_descriptors(struct r600_context *rctx); +void si_release_all_descriptors(struct r600_context *rctx); +void si_all_descriptors_begin_new_cs(struct r600_context *rctx); + /* si_state.c */ struct si_pipe_shader_selector; diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c index 47e64d8634e..f03b34f4039 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.c +++ b/src/gallium/drivers/radeonsi/si_state_draw.c @@ -412,11 +412,10 @@ static void si_update_derived_state(struct r600_context *rctx) if (!rctx->blitter->running) { /* Flush depth textures which need to be flushed. 
*/ - if (rctx->vs_samplers.depth_texture_mask) { - si_flush_depth_textures(rctx, &rctx->vs_samplers); - } - if (rctx->ps_samplers.depth_texture_mask) { - si_flush_depth_textures(rctx, &rctx->ps_samplers); + for (int i = 0; i < SI_NUM_SHADERS; i++) { + if (rctx->samplers[i].depth_texture_mask) { + si_flush_depth_textures(rctx, &rctx->samplers[i]); + } } } @@ -651,7 +650,7 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info) { struct r600_context *rctx = (struct r600_context *)ctx; struct pipe_index_buffer ib = {}; - uint32_t cp_coher_cntl; + uint32_t cp_coher_cntl, i; if (!info->count && (info->indexed || !info->count_from_stream_output)) return; @@ -704,6 +703,13 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info) si_need_cs_space(rctx, 0, TRUE); + for (i = 0; i < SI_NUM_ATOMS(rctx); i++) { + if (rctx->atoms.array[i]->dirty) { + rctx->atoms.array[i]->emit(rctx, rctx->atoms.array[i]); + rctx->atoms.array[i]->dirty = false; + } + } + si_pm4_emit_dirty(rctx); rctx->pm4_dirty_cdwords = 0; diff --git a/src/gallium/drivers/radeonsi/sid.h b/src/gallium/drivers/radeonsi/sid.h index 208d3a88da0..57ce72e0628 100644 --- a/src/gallium/drivers/radeonsi/sid.h +++ b/src/gallium/drivers/radeonsi/sid.h @@ -134,6 +134,60 @@ #define PKT0(index, count) (PKT_TYPE_S(0) | PKT0_BASE_INDEX_S(index) | PKT_COUNT_S(count)) #define PKT3(op, count, predicate) (PKT_TYPE_S(3) | PKT3_IT_OPCODE_S(op) | PKT_COUNT_S(count) | PKT3_PREDICATE(predicate)) +#define PKT3_CP_DMA 0x41 +/* 1. header + * 2. SRC_ADDR_LO [31:0] or DATA [31:0] + * 3. CP_SYNC [31] | SRC_SEL [30:29] | ENGINE [27] | DST_SEL [21:20] | SRC_ADDR_HI [15:0] + * 4. DST_ADDR_LO [31:0] + * 5. DST_ADDR_HI [15:0] + * 6. 
COMMAND [29:22] | BYTE_COUNT [20:0] + */ +#define PKT3_CP_DMA_CP_SYNC (1 << 31) +#define PKT3_CP_DMA_SRC_SEL(x) ((x) << 29) +/* 0 - SRC_ADDR + * 1 - GDS (program SAS to 1 as well) + * 2 - DATA + */ +#define PKT3_CP_DMA_DST_SEL(x) ((x) << 20) +/* 0 - DST_ADDR + * 1 - GDS (program DAS to 1 as well) + */ +/* COMMAND */ +#define PKT3_CP_DMA_CMD_SRC_SWAP(x) ((x) << 23) +/* 0 - none + * 1 - 8 in 16 + * 2 - 8 in 32 + * 3 - 8 in 64 + */ +#define PKT3_CP_DMA_CMD_DST_SWAP(x) ((x) << 24) +/* 0 - none + * 1 - 8 in 16 + * 2 - 8 in 32 + * 3 - 8 in 64 + */ +#define PKT3_CP_DMA_CMD_SAS (1 << 26) +/* 0 - memory + * 1 - register + */ +#define PKT3_CP_DMA_CMD_DAS (1 << 27) +/* 0 - memory + * 1 - register + */ +#define PKT3_CP_DMA_CMD_SAIC (1 << 28) +#define PKT3_CP_DMA_CMD_DAIC (1 << 29) +#define PKT3_CP_DMA_CMD_RAW_WAIT (1 << 30) + +#define PKT3_DMA_DATA 0x50 /* new for CIK */ +/* 1. header + * 2. CP_SYNC [31] | SRC_SEL [30:29] | DST_SEL [21:20] | ENGINE [0] + * 3. SRC_ADDR_LO [31:0] or DATA [31:0] + * 4. SRC_ADDR_HI [31:0] + * 5. DST_ADDR_LO [31:0] + * 6. DST_ADDR_HI [31:0] + * 7. COMMAND [29:22] | BYTE_COUNT [20:0] + */ + + #define R_0084FC_CP_STRMOUT_CNTL 0x0084FC #define S_0084FC_OFFSET_UPDATE_DONE(x) (((x) & 0x1) << 0) #define R_0085F0_CP_COHER_CNTL 0x0085F0 diff --git a/src/gallium/winsys/radeon/drm/radeon_winsys.h b/src/gallium/winsys/radeon/drm/radeon_winsys.h index a619d709754..9c6589a7a96 100644 --- a/src/gallium/winsys/radeon/drm/radeon_winsys.h +++ b/src/gallium/winsys/radeon/drm/radeon_winsys.h @@ -501,4 +501,16 @@ struct radeon_winsys { enum radeon_value_id value); }; +static INLINE void radeon_emit(struct radeon_winsys_cs *cs, uint32_t value) +{ + cs->buf[cs->cdw++] = value; +} + +static INLINE void radeon_emit_array(struct radeon_winsys_cs *cs, + const uint32_t *values, unsigned count) +{ + memcpy(cs->buf+cs->cdw, values, count * 4); + cs->cdw += count; +} + #endif -- 2.30.2