radeonsi: add flexible shader descriptor management and use it for sampler views
author Marek Olšák <marek.olsak@amd.com>
Tue, 6 Aug 2013 04:42:22 +0000 (06:42 +0200)
committer Marek Olšák <marek.olsak@amd.com>
Fri, 16 Aug 2013 23:48:25 +0000 (01:48 +0200)
It moves all sampler view descriptors to a buffer.
It supports partial resource updates and it can also unbind resources
(required for FMASK texturing).

The buffer contains all sampler view descriptors for one shader stage,
represented as an array. On top of that, there are N arrays in the buffer,
which are used to emulate context registers as implemented by the previous
ASICs (each array is a context).

This uses the RCU synchronization approach to avoid read-after-write hazards
as discussed in the thread:
"radeonsi: add FMASK texture binding slots and resource setup"

CP DMA is used to clear the descriptors at context initialization and to copy
the descriptors from one context to the next.

v2: - use PKT3_DMA_DATA on CIK (I'll test CIK later)
    - turn the bool CP DMA parameters into self-explanatory flags
    - add a nice simple API for packet emission to radeon_winsys.h
    - use 256 contexts, 128 causes texture corruption in openarena

src/gallium/drivers/radeonsi/Makefile.sources
src/gallium/drivers/radeonsi/r600_blit.c
src/gallium/drivers/radeonsi/r600_hw_context.c
src/gallium/drivers/radeonsi/radeonsi_pipe.c
src/gallium/drivers/radeonsi/radeonsi_pipe.h
src/gallium/drivers/radeonsi/si_descriptors.c [new file with mode: 0644]
src/gallium/drivers/radeonsi/si_state.c
src/gallium/drivers/radeonsi/si_state.h
src/gallium/drivers/radeonsi/si_state_draw.c
src/gallium/drivers/radeonsi/sid.h
src/gallium/winsys/radeon/drm/radeon_winsys.h

index b3ffa727eb5d5f0aaab8359555a38ad3d9ac2239..68c828200151e667e69e7995cfadc60dcba9fed8 100644 (file)
@@ -10,6 +10,7 @@ C_SOURCES := \
        r600_translate.c \
        radeonsi_pm4.c \
        radeonsi_compute.c \
+       si_descriptors.c \
        si_state.c \
        si_state_streamout.c \
        si_state_draw.c \
index bab108e7c91dd8bda9f56645309fcd952c0588ca..bdd9bb43c108b6aaae82422fc7c6a26f62cbf887 100644 (file)
@@ -70,12 +70,12 @@ static void r600_blitter_begin(struct pipe_context *ctx, enum r600_blitter_op op
 
        if (op & R600_SAVE_TEXTURES) {
                util_blitter_save_fragment_sampler_states(
-                       rctx->blitter, rctx->ps_samplers.n_samplers,
-                       (void**)rctx->ps_samplers.samplers);
+                       rctx->blitter, rctx->samplers[PIPE_SHADER_FRAGMENT].n_samplers,
+                       (void**)rctx->samplers[PIPE_SHADER_FRAGMENT].samplers);
 
-               util_blitter_save_fragment_sampler_views(
-                       rctx->blitter, rctx->ps_samplers.n_views,
-                       (struct pipe_sampler_view**)rctx->ps_samplers.views);
+               util_blitter_save_fragment_sampler_views(rctx->blitter,
+                       util_last_bit(rctx->samplers[PIPE_SHADER_FRAGMENT].views.desc.enabled_mask),
+                       rctx->samplers[PIPE_SHADER_FRAGMENT].views.views);
        }
 
        if ((op & R600_DISABLE_RENDER_COND) && rctx->current_render_cond) {
@@ -224,7 +224,7 @@ void si_flush_depth_textures(struct r600_context *rctx,
                struct pipe_sampler_view *view;
                struct r600_texture *tex;
 
-               view = &textures->views[i]->base;
+               view = textures->views.views[i];
                if (!view) continue;
 
                tex = (struct r600_texture *)view->texture;
index 25c972bbf62575d2f95eeadce9d7169ab9623dd3..bc6ba0bd1f01b6056dfe4cff3f87d5a81fa1a718 100644 (file)
@@ -114,9 +114,17 @@ err:
 void si_need_cs_space(struct r600_context *ctx, unsigned num_dw,
                        boolean count_draw_in)
 {
+       int i;
+
        /* The number of dwords we already used in the CS so far. */
        num_dw += ctx->cs->cdw;
 
+       for (i = 0; i < SI_NUM_ATOMS(ctx); i++) {
+               if (ctx->atoms.array[i]->dirty) {
+                       num_dw += ctx->atoms.array[i]->num_dw;
+               }
+       }
+
        if (count_draw_in) {
                /* The number of dwords all the dirty states would take. */
                num_dw += ctx->pm4_dirty_cdwords;
@@ -254,6 +262,15 @@ void si_context_flush(struct r600_context *ctx, unsigned flags)
        ctx->pm4_dirty_cdwords = 0;
        ctx->flags = 0;
 
+       /* set all valid groups as dirty so they get re-emitted on
+        * next draw command
+        */
+       si_pm4_reset_emitted(ctx);
+
+       /* The CS initialization should be emitted before everything else. */
+       si_pm4_emit(ctx, ctx->queued.named.init);
+       ctx->emitted.named.init = ctx->queued.named.init;
+
 #if 0
        if (streamout_suspended) {
                ctx->streamout_start = TRUE;
@@ -266,10 +283,7 @@ void si_context_flush(struct r600_context *ctx, unsigned flags)
                r600_context_queries_resume(ctx);
        }
 
-       /* set all valid group as dirty so they get reemited on
-        * next draw command
-        */
-       si_pm4_reset_emitted(ctx);
+       si_all_descriptors_begin_new_cs(ctx);
 }
 
 void si_context_emit_fence(struct r600_context *ctx, struct si_resource *fence_bo, unsigned offset, unsigned value)
index b4a1ca93fe6f5121b478c3de6aaadd0aaa791747..9afc7f2714b68bb740e57be190bb323f9828a7ca 100644 (file)
@@ -178,6 +178,8 @@ static void r600_destroy_context(struct pipe_context *context)
 {
        struct r600_context *rctx = (struct r600_context *)context;
 
+       si_release_all_descriptors(rctx);
+
        si_resource_reference(&rctx->border_color_table, NULL);
 
        if (rctx->dummy_pixel_shader) {
@@ -231,12 +233,15 @@ static struct pipe_context *r600_create_context(struct pipe_screen *screen, void
                rctx->context.create_video_buffer = vl_video_buffer_create;
        }
 
+       rctx->cs = rctx->ws->cs_create(rctx->ws, RING_GFX, NULL);
+
+       si_init_all_descriptors(rctx);
+
        switch (rctx->chip_class) {
        case SI:
        case CIK:
                si_init_state_functions(rctx);
                LIST_INITHEAD(&rctx->active_query_list);
-               rctx->cs = rctx->ws->cs_create(rctx->ws, RING_GFX, NULL);
                rctx->max_db = 8;
                si_init_config(rctx);
                break;
index 6fbe6539d8765b856a641359bb66c57f5f4c6e38..674c6303b7acf419a7387952eb7f14ab47816d55 100644 (file)
@@ -94,11 +94,8 @@ struct si_cs_shader_state {
        struct si_pipe_compute          *program;
 };
 
-/* needed for blitter save */
-#define NUM_TEX_UNITS 16
-
 struct r600_textures_info {
-       struct si_pipe_sampler_view     *views[NUM_TEX_UNITS];
+       struct si_sampler_views         views;
        struct si_pipe_sampler_state    *samplers[NUM_TEX_UNITS];
        unsigned                        n_views;
        uint32_t                        depth_texture_mask; /* which textures are depth */
@@ -131,6 +128,9 @@ struct r600_constbuf_state
        uint32_t                        dirty_mask;
 };
 
+#define SI_NUM_ATOMS(rctx) (sizeof((rctx)->atoms)/sizeof((rctx)->atoms.array[0]))
+#define SI_NUM_SHADERS (PIPE_SHADER_FRAGMENT+1)
+
 struct r600_context {
        struct pipe_context             context;
        struct blitter_context          *blitter;
@@ -142,6 +142,14 @@ struct r600_context {
        void                            *custom_dsa_flush_inplace;
        struct r600_screen              *screen;
        struct radeon_winsys            *ws;
+
+       union {
+               struct {
+                       struct si_atom *sampler_views[SI_NUM_SHADERS];
+               };
+               struct si_atom *array[0];
+       } atoms;
+
        struct si_vertex_element        *vertex_elements;
        struct pipe_framebuffer_state   framebuffer;
        unsigned                        pa_sc_line_stipple;
@@ -161,8 +169,7 @@ struct r600_context {
        unsigned                        sprite_coord_enable;
        unsigned                        export_16bpc;
        struct r600_constbuf_state      constbuf_state[PIPE_SHADER_TYPES];
-       struct r600_textures_info       vs_samplers;
-       struct r600_textures_info       ps_samplers;
+       struct r600_textures_info       samplers[SI_NUM_SHADERS];
        struct si_resource              *border_color_table;
        unsigned                        border_color_offset;
 
diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
new file mode 100644 (file)
index 0000000..f05c8f4
--- /dev/null
@@ -0,0 +1,355 @@
+/*
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *      Marek Olšák <marek.olsak@amd.com>
+ */
+
+#include "radeonsi_pipe.h"
+#include "radeonsi_resource.h"
+#include "radeonsi_shader.h"
+#include "r600_hw_context_priv.h"
+
+#include "util/u_memory.h"
+
+#define SI_NUM_CONTEXTS 256
+
+static const uint32_t null_desc[8]; /* zeros */
+
+/* Set this if you want the 3D engine to wait until CP DMA is done.
+ * It should be set on the last CP DMA packet. */
+#define R600_CP_DMA_SYNC       (1 << 0) /* R600+ */
+
+/* Set this if the source data was used as a destination in a previous CP DMA
+ * packet. It's for preventing a read-after-write (RAW) hazard between two
+ * CP DMA packets. */
+#define SI_CP_DMA_RAW_WAIT     (1 << 1) /* SI+ */
+
+/* Emit a CP DMA packet to do a copy from one buffer to another.
+ * The size must fit in bits [20:0].
+ */
+static void si_emit_cp_dma_copy_buffer(struct r600_context *rctx,
+                                      uint64_t dst_va, uint64_t src_va,
+                                      unsigned size, unsigned flags)
+{
+       struct radeon_winsys_cs *cs = rctx->cs;
+       uint32_t sync_flag = flags & R600_CP_DMA_SYNC ? PKT3_CP_DMA_CP_SYNC : 0;
+       uint32_t raw_wait = flags & SI_CP_DMA_RAW_WAIT ? PKT3_CP_DMA_CMD_RAW_WAIT : 0;
+
+       assert(size);
+       assert((size & ((1<<21)-1)) == size);
+
+       if (rctx->chip_class >= CIK) {
+               radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0));
+               radeon_emit(cs, sync_flag);             /* CP_SYNC [31] */
+               radeon_emit(cs, src_va);                /* SRC_ADDR_LO [31:0] */
+               radeon_emit(cs, src_va >> 32);          /* SRC_ADDR_HI [31:0] */
+               radeon_emit(cs, dst_va);                /* DST_ADDR_LO [31:0] */
+               radeon_emit(cs, dst_va >> 32);          /* DST_ADDR_HI [31:0] */
+               radeon_emit(cs, size | raw_wait);       /* COMMAND [29:22] | BYTE_COUNT [20:0] */
+       } else {
+               radeon_emit(cs, PKT3(PKT3_CP_DMA, 4, 0));
+               radeon_emit(cs, src_va);                        /* SRC_ADDR_LO [31:0] */
+               radeon_emit(cs, sync_flag | ((src_va >> 32) & 0xffff)); /* CP_SYNC [31] | SRC_ADDR_HI [15:0] */
+               radeon_emit(cs, dst_va);                        /* DST_ADDR_LO [31:0] */
+               radeon_emit(cs, (dst_va >> 32) & 0xffff);       /* DST_ADDR_HI [15:0] */
+               radeon_emit(cs, size | raw_wait);               /* COMMAND [29:22] | BYTE_COUNT [20:0] */
+       }
+}
+
+/* Emit a CP DMA packet to clear a buffer. The size must fit in bits [20:0]. */
+static void si_emit_cp_dma_clear_buffer(struct r600_context *rctx,
+                                       uint64_t dst_va, unsigned size,
+                                       uint32_t clear_value, unsigned flags)
+{
+       struct radeon_winsys_cs *cs = rctx->cs;
+       uint32_t sync_flag = flags & R600_CP_DMA_SYNC ? PKT3_CP_DMA_CP_SYNC : 0;
+       uint32_t raw_wait = flags & SI_CP_DMA_RAW_WAIT ? PKT3_CP_DMA_CMD_RAW_WAIT : 0;
+
+       assert(size);
+       assert((size & ((1<<21)-1)) == size);
+
+       if (rctx->chip_class >= CIK) {
+               radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0));
+               radeon_emit(cs, sync_flag | PKT3_CP_DMA_SRC_SEL(2)); /* CP_SYNC [31] | SRC_SEL[30:29] */
+               radeon_emit(cs, clear_value);           /* DATA [31:0] */
+               radeon_emit(cs, 0);
+               radeon_emit(cs, dst_va);                /* DST_ADDR_LO [31:0] */
+               radeon_emit(cs, dst_va >> 32);          /* DST_ADDR_HI [31:0] */
+               radeon_emit(cs, size | raw_wait);       /* COMMAND [29:22] | BYTE_COUNT [20:0] */
+       } else {
+               radeon_emit(cs, PKT3(PKT3_CP_DMA, 4, 0));
+               radeon_emit(cs, clear_value);           /* DATA [31:0] */
+               radeon_emit(cs, sync_flag | PKT3_CP_DMA_SRC_SEL(2)); /* CP_SYNC [31] | SRC_SEL[30:29] */
+               radeon_emit(cs, dst_va);                        /* DST_ADDR_LO [31:0] */
+               radeon_emit(cs, (dst_va >> 32) & 0xffff);       /* DST_ADDR_HI [15:0] */
+               radeon_emit(cs, size | raw_wait);               /* COMMAND [29:22] | BYTE_COUNT [20:0] */
+       }
+}
+
+static void si_init_descriptors(struct r600_context *rctx,
+                               struct si_descriptors *desc,
+                               unsigned shader_userdata_reg,
+                               unsigned element_dw_size,
+                               unsigned num_elements,
+                               void (*emit_func)(struct r600_context *ctx, struct si_atom *state))
+{
+       uint64_t va;
+
+       desc->atom.emit = emit_func;
+       desc->shader_userdata_reg = shader_userdata_reg;
+       desc->element_dw_size = element_dw_size;
+       desc->num_elements = num_elements;
+       desc->context_size = num_elements * element_dw_size * 4;
+
+       desc->buffer = (struct si_resource*)
+               pipe_buffer_create(rctx->context.screen, PIPE_BIND_CUSTOM,
+                                  PIPE_USAGE_STATIC,
+                                  SI_NUM_CONTEXTS * desc->context_size);
+
+       r600_context_bo_reloc(rctx, desc->buffer, RADEON_USAGE_READWRITE);
+       va = r600_resource_va(rctx->context.screen, &desc->buffer->b.b);
+
+       /* We don't check for CS space here, because this should be called
+        * only once at context initialization. */
+       si_emit_cp_dma_clear_buffer(rctx, va, desc->buffer->b.b.width0, 0,
+                                   R600_CP_DMA_SYNC);
+}
+
+static void si_release_descriptors(struct si_descriptors *desc)
+{
+       pipe_resource_reference((struct pipe_resource**)&desc->buffer, NULL);
+}
+
+static void si_update_descriptors(struct si_descriptors *desc)
+{
+       if (desc->dirty_mask) {
+               desc->atom.num_dw =
+                       7 + /* copy */
+                       (4 + desc->element_dw_size) * util_bitcount(desc->dirty_mask) + /* update */
+                       4; /* pointer update */
+               desc->atom.dirty = true;
+       } else {
+               desc->atom.dirty = false;
+       }
+}
+
+static void si_emit_shader_pointer(struct r600_context *rctx,
+                                  struct si_descriptors *desc)
+{
+       struct radeon_winsys_cs *cs = rctx->cs;
+       uint64_t va = r600_resource_va(rctx->context.screen, &desc->buffer->b.b) +
+                     desc->current_context_id * desc->context_size;
+
+       radeon_emit(cs, PKT3(PKT3_SET_SH_REG, 2, 0));
+       radeon_emit(cs, (desc->shader_userdata_reg - SI_SH_REG_OFFSET) >> 2);
+       radeon_emit(cs, va);
+       radeon_emit(cs, va >> 32);
+}
+
+static void si_emit_descriptors(struct r600_context *rctx,
+                               struct si_descriptors *desc,
+                               const uint32_t **descriptors)
+{
+       struct radeon_winsys_cs *cs = rctx->cs;
+       uint64_t va_base;
+       int packet_start;
+       int packet_size = 0;
+       int last_index = desc->num_elements; /* point to a non-existing element */
+       unsigned dirty_mask = desc->dirty_mask;
+       unsigned new_context_id = (desc->current_context_id + 1) % SI_NUM_CONTEXTS;
+
+       assert(dirty_mask);
+
+       va_base = r600_resource_va(rctx->context.screen, &desc->buffer->b.b);
+
+       /* Copy the descriptors to a new context slot. */
+       si_emit_cp_dma_copy_buffer(rctx,
+                                  va_base + new_context_id * desc->context_size,
+                                  va_base + desc->current_context_id * desc->context_size,
+                                  desc->context_size, R600_CP_DMA_SYNC);
+
+       va_base += new_context_id * desc->context_size;
+
+       /* Update the descriptors.
+        * Updates of consecutive descriptors are merged to one WRITE_DATA packet.
+        *
+        * XXX When unbinding lots of resources, consider clearing the memory
+        *     with CP DMA instead of emitting zeros.
+        */
+       while (dirty_mask) {
+               int i = u_bit_scan(&dirty_mask);
+
+               assert(i < desc->num_elements);
+
+               if (last_index+1 == i && packet_size) {
+                       /* Append new data at the end of the last packet. */
+                       packet_size += desc->element_dw_size;
+                       cs->buf[packet_start] = PKT3(PKT3_WRITE_DATA, packet_size, 0);
+               } else {
+                       /* Start a new packet. */
+                       uint64_t va = va_base + i * desc->element_dw_size * 4;
+
+                       packet_start = cs->cdw;
+                       packet_size = 2 + desc->element_dw_size;
+
+                       radeon_emit(cs, PKT3(PKT3_WRITE_DATA, packet_size, 0));
+                       radeon_emit(cs, PKT3_WRITE_DATA_DST_SEL(PKT3_WRITE_DATA_DST_SEL_MEM_SYNC) |
+                                            PKT3_WRITE_DATA_WR_CONFIRM |
+                                            PKT3_WRITE_DATA_ENGINE_SEL(PKT3_WRITE_DATA_ENGINE_SEL_ME));
+                       radeon_emit(cs, va & 0xFFFFFFFFUL);
+                       radeon_emit(cs, (va >> 32UL) & 0xFFFFFFFFUL);
+               }
+
+               radeon_emit_array(cs, descriptors[i], desc->element_dw_size);
+
+               last_index = i;
+       }
+
+       desc->dirty_mask = 0;
+       desc->current_context_id = new_context_id;
+
+       /* Now update the shader userdata pointer. */
+       si_emit_shader_pointer(rctx, desc);
+}
+
+static unsigned si_get_shader_user_data_base(unsigned shader)
+{
+       switch (shader) {
+       case PIPE_SHADER_VERTEX:
+               return R_00B130_SPI_SHADER_USER_DATA_VS_0;
+       case PIPE_SHADER_GEOMETRY:
+               return R_00B230_SPI_SHADER_USER_DATA_GS_0;
+       case PIPE_SHADER_FRAGMENT:
+               return R_00B030_SPI_SHADER_USER_DATA_PS_0;
+       default:
+               assert(0);
+               return 0;
+       }
+}
+
+/* SAMPLER VIEWS */
+
+static void si_emit_sampler_views(struct r600_context *rctx, struct si_atom *atom)
+{
+       struct si_sampler_views *views = (struct si_sampler_views*)atom;
+
+       si_emit_descriptors(rctx, &views->desc, views->desc_data);
+}
+
+static void si_init_sampler_views(struct r600_context *rctx,
+                                 struct si_sampler_views *views,
+                                 unsigned shader)
+{
+       si_init_descriptors(rctx, &views->desc,
+                           si_get_shader_user_data_base(shader) +
+                           SI_SGPR_RESOURCE * 4,
+                           8, 16, si_emit_sampler_views);
+}
+
+static void si_release_sampler_views(struct si_sampler_views *views)
+{
+       int i;
+
+       for (i = 0; i < Elements(views->views); i++) {
+               pipe_sampler_view_reference(&views->views[i], NULL);
+       }
+       si_release_descriptors(&views->desc);
+}
+
+static void si_sampler_views_begin_new_cs(struct r600_context *rctx,
+                                         struct si_sampler_views *views)
+{
+       unsigned mask = views->desc.enabled_mask;
+
+       /* Add relocations to the CS. */
+       while (mask) {
+               int i = u_bit_scan(&mask);
+               struct si_pipe_sampler_view *rview =
+                       (struct si_pipe_sampler_view*)views->views[i];
+
+               r600_context_bo_reloc(rctx, rview->resource, RADEON_USAGE_READ);
+       }
+
+       r600_context_bo_reloc(rctx, views->desc.buffer, RADEON_USAGE_READWRITE);
+
+       si_emit_shader_pointer(rctx, &views->desc);
+}
+
+void si_set_sampler_view(struct r600_context *rctx, unsigned shader,
+                        unsigned slot, struct pipe_sampler_view *view,
+                        unsigned *view_desc)
+{
+       struct si_sampler_views *views = &rctx->samplers[shader].views;
+
+       if (views->views[slot] == view)
+               return;
+
+       if (view) {
+               struct si_pipe_sampler_view *rview =
+                       (struct si_pipe_sampler_view*)view;
+
+               r600_context_bo_reloc(rctx, rview->resource, RADEON_USAGE_READ);
+
+               pipe_sampler_view_reference(&views->views[slot], view);
+               views->desc_data[slot] = view_desc;
+               views->desc.enabled_mask |= 1 << slot;
+       } else {
+               pipe_sampler_view_reference(&views->views[slot], NULL);
+               views->desc_data[slot] = null_desc;
+               views->desc.enabled_mask &= ~(1 << slot);
+       }
+
+       views->desc.dirty_mask |= 1 << slot;
+       si_update_descriptors(&views->desc);
+}
+
+/* INIT/DEINIT */
+
+void si_init_all_descriptors(struct r600_context *rctx)
+{
+       int i;
+
+       for (i = 0; i < SI_NUM_SHADERS; i++) {
+               si_init_sampler_views(rctx, &rctx->samplers[i].views, i);
+
+               rctx->atoms.sampler_views[i] = &rctx->samplers[i].views.desc.atom;
+       }
+}
+
+void si_release_all_descriptors(struct r600_context *rctx)
+{
+       int i;
+
+       for (i = 0; i < SI_NUM_SHADERS; i++) {
+               si_release_sampler_views(&rctx->samplers[i].views);
+       }
+}
+
+void si_all_descriptors_begin_new_cs(struct r600_context *rctx)
+{
+       int i;
+
+       for (i = 0; i < SI_NUM_SHADERS; i++) {
+               si_sampler_views_begin_new_cs(rctx, &rctx->samplers[i].views);
+       }
+}
index d1e3c9d5279962e8153c3740f9936c5fba122d5e..7d637e7518984e9aad1d5a0375c7d3b7561adc5d 100644 (file)
@@ -2493,26 +2493,17 @@ static void *si_create_sampler_state(struct pipe_context *ctx,
 }
 
 static struct si_pm4_state *si_set_sampler_views(struct r600_context *rctx,
-                                                unsigned count,
-                                                struct pipe_sampler_view **views,
-                                                struct r600_textures_info *samplers,
-                                                unsigned user_data_reg)
+                                                unsigned shader, unsigned count,
+                                                struct pipe_sampler_view **views)
 {
-       struct si_pipe_sampler_view **resource = (struct si_pipe_sampler_view **)views;
+       struct r600_textures_info *samplers = &rctx->samplers[shader];
+       struct si_pipe_sampler_view **rviews = (struct si_pipe_sampler_view **)views;
        struct si_pm4_state *pm4 = si_pm4_alloc_state(rctx);
-       int i, j;
-
-       if (!count)
-               goto out;
+       int i;
 
        si_pm4_inval_texture_cache(pm4);
 
-       si_pm4_sh_data_begin(pm4);
        for (i = 0; i < count; i++) {
-               pipe_sampler_view_reference(
-                       (struct pipe_sampler_view **)&samplers->views[i],
-                       views[i]);
-
                if (views[i]) {
                        struct r600_texture *rtex =
                                (struct r600_texture*)views[i]->texture;
@@ -2523,25 +2514,17 @@ static struct si_pm4_state *si_set_sampler_views(struct r600_context *rctx,
                                samplers->depth_texture_mask &= ~(1 << i);
                        }
 
-                       si_pm4_add_bo(pm4, resource[i]->resource, RADEON_USAGE_READ);
+                       si_set_sampler_view(rctx, shader, i, views[i], rviews[i]->state);
                } else {
                        samplers->depth_texture_mask &= ~(1 << i);
-               }
-
-               for (j = 0; j < Elements(resource[i]->state); ++j) {
-                       si_pm4_sh_data_add(pm4, resource[i] ? resource[i]->state[j] : 0);
+                       si_set_sampler_view(rctx, shader, i, NULL, NULL);
                }
        }
-
-       for (i = count; i < NUM_TEX_UNITS; i++) {
-               if (samplers->views[i])
-                       pipe_sampler_view_reference((struct pipe_sampler_view **)&samplers->views[i], NULL);
+       for (; i < samplers->n_views; i++) {
+               si_set_sampler_view(rctx, shader, i, NULL, NULL);
        }
 
-       si_pm4_sh_data_end(pm4, user_data_reg, SI_SGPR_RESOURCE);
-
-out:
-       rctx->ps_samplers.n_views = count;
+       samplers->n_views = count;
        return pm4;
 }
 
@@ -2551,8 +2534,7 @@ static void si_set_vs_sampler_views(struct pipe_context *ctx, unsigned count,
        struct r600_context *rctx = (struct r600_context *)ctx;
        struct si_pm4_state *pm4;
 
-       pm4 = si_set_sampler_views(rctx, count, views, &rctx->vs_samplers,
-                           R_00B130_SPI_SHADER_USER_DATA_VS_0);
+       pm4 = si_set_sampler_views(rctx, PIPE_SHADER_VERTEX, count, views);
        si_pm4_set_state(rctx, vs_sampler_views, pm4);
 }
 
@@ -2562,8 +2544,7 @@ static void si_set_ps_sampler_views(struct pipe_context *ctx, unsigned count,
        struct r600_context *rctx = (struct r600_context *)ctx;
        struct si_pm4_state *pm4;
 
-       pm4 = si_set_sampler_views(rctx, count, views, &rctx->ps_samplers,
-                                 R_00B030_SPI_SHADER_USER_DATA_PS_0);
+       pm4 = si_set_sampler_views(rctx, PIPE_SHADER_FRAGMENT, count, views);
        si_pm4_set_state(rctx, ps_sampler_views, pm4);
 }
 
@@ -2646,7 +2627,7 @@ static void si_bind_vs_sampler_states(struct pipe_context *ctx, unsigned count,
        struct r600_context *rctx = (struct r600_context *)ctx;
        struct si_pm4_state *pm4;
 
-       pm4 = si_bind_sampler_states(rctx, count, states, &rctx->vs_samplers,
+       pm4 = si_bind_sampler_states(rctx, count, states, &rctx->samplers[PIPE_SHADER_VERTEX],
                              R_00B130_SPI_SHADER_USER_DATA_VS_0);
        si_pm4_set_state(rctx, vs_sampler, pm4);
 }
@@ -2656,7 +2637,7 @@ static void si_bind_ps_sampler_states(struct pipe_context *ctx, unsigned count,
        struct r600_context *rctx = (struct r600_context *)ctx;
        struct si_pm4_state *pm4;
 
-       pm4 = si_bind_sampler_states(rctx, count, states, &rctx->ps_samplers,
+       pm4 = si_bind_sampler_states(rctx, count, states, &rctx->samplers[PIPE_SHADER_FRAGMENT],
                              R_00B030_SPI_SHADER_USER_DATA_PS_0);
        si_pm4_set_state(rctx, ps_sampler, pm4);
 }
index 7ce084e579411ec94f3b31262fef40171bb5a353..610303bb9a5a1877a8f19e9bc517c0a5ea2ac0b9 100644 (file)
 
 #include "radeonsi_pm4.h"
 
+/* This encapsulates a state or an operation which can be emitted into the GPU
+ * command stream. */
+struct si_atom {
+       void (*emit)(struct r600_context *ctx, struct si_atom *state);
+       unsigned                num_dw;
+       bool                    dirty;
+};
+
 struct si_state_blend {
        struct si_pm4_state     pm4;
        uint32_t                cb_target_mask;
@@ -103,6 +111,46 @@ union si_state {
        struct si_pm4_state     *array[0];
 };
 
+#define NUM_TEX_UNITS 16
+
+/* This represents resource descriptors in memory, such as buffer resources,
+ * image resources, and sampler states.
+ */
+struct si_descriptors {
+       struct si_atom atom;
+
+       /* The size of one resource descriptor. */
+       unsigned element_dw_size;
+       /* The maximum number of resource descriptors. */
+       unsigned num_elements;
+
+       /* The buffer where resource descriptors are stored. */
+       struct si_resource *buffer;
+
+       /* The i-th bit is set if that element is dirty (changed but not emitted). */
+       unsigned dirty_mask;
+       /* The i-th bit is set if that element is enabled (non-NULL resource). */
+       unsigned enabled_mask;
+
+       /* We can't update descriptors directly because the GPU might be
+        * reading them at the same time, so we have to update them
+        * in a copy-on-write manner. Each such copy is called a context,
+        * which is just another array of descriptors in the same buffer. */
+       unsigned current_context_id;
+       /* The size of a context, should be equal to 4*element_dw_size*num_elements. */
+       unsigned context_size;
+
+       /* The shader userdata register where the 64-bit pointer to the descriptor
+        * array will be stored. */
+       unsigned shader_userdata_reg;
+};
+
+struct si_sampler_views {
+       struct si_descriptors           desc;
+       struct pipe_sampler_view        *views[NUM_TEX_UNITS];
+       const uint32_t                  *desc_data[NUM_TEX_UNITS];
+};
+
 #define si_pm4_block_idx(member) \
        (offsetof(union si_state, named.member) / sizeof(struct si_pm4_state *))
 
@@ -133,6 +181,14 @@ union si_state {
                } \
        } while(0)
 
+/* si_descriptors.c */
+void si_set_sampler_view(struct r600_context *rctx, unsigned shader,
+                        unsigned slot, struct pipe_sampler_view *view,
+                        unsigned *view_desc);
+void si_init_all_descriptors(struct r600_context *rctx);
+void si_release_all_descriptors(struct r600_context *rctx);
+void si_all_descriptors_begin_new_cs(struct r600_context *rctx);
+
 /* si_state.c */
 struct si_pipe_shader_selector;
 
index 47e64d8634e6caa002c786b6c526e1349749a8c0..f03b34f403961fae0e958e6034f27d22eafa0e37 100644 (file)
@@ -412,11 +412,10 @@ static void si_update_derived_state(struct r600_context *rctx)
 
        if (!rctx->blitter->running) {
                /* Flush depth textures which need to be flushed. */
-               if (rctx->vs_samplers.depth_texture_mask) {
-                       si_flush_depth_textures(rctx, &rctx->vs_samplers);
-               }
-               if (rctx->ps_samplers.depth_texture_mask) {
-                       si_flush_depth_textures(rctx, &rctx->ps_samplers);
+               for (int i = 0; i < SI_NUM_SHADERS; i++) {
+                       if (rctx->samplers[i].depth_texture_mask) {
+                               si_flush_depth_textures(rctx, &rctx->samplers[i]);
+                       }
                }
        }
 
@@ -651,7 +650,7 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
 {
        struct r600_context *rctx = (struct r600_context *)ctx;
        struct pipe_index_buffer ib = {};
-       uint32_t cp_coher_cntl;
+       uint32_t cp_coher_cntl, i;
 
        if (!info->count && (info->indexed || !info->count_from_stream_output))
                return;
@@ -704,6 +703,13 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
 
        si_need_cs_space(rctx, 0, TRUE);
 
+       for (i = 0; i < SI_NUM_ATOMS(rctx); i++) {
+               if (rctx->atoms.array[i]->dirty) {
+                       rctx->atoms.array[i]->emit(rctx, rctx->atoms.array[i]);
+                       rctx->atoms.array[i]->dirty = false;
+               }
+       }
+
        si_pm4_emit_dirty(rctx);
        rctx->pm4_dirty_cdwords = 0;
 
index 208d3a88da0e0f2fc8f93c2992759e7cba938881..57ce72e06289af6fc04f635ea179eac0cf7ce103 100644 (file)
 #define PKT0(index, count) (PKT_TYPE_S(0) | PKT0_BASE_INDEX_S(index) | PKT_COUNT_S(count))
 #define PKT3(op, count, predicate) (PKT_TYPE_S(3) | PKT3_IT_OPCODE_S(op) | PKT_COUNT_S(count) | PKT3_PREDICATE(predicate))
 
+#define PKT3_CP_DMA                                    0x41
+/* 1. header
+ * 2. SRC_ADDR_LO [31:0] or DATA [31:0]
+ * 3. CP_SYNC [31] | SRC_SEL [30:29] | ENGINE [27] | DST_SEL [21:20] | SRC_ADDR_HI [15:0]
+ * 4. DST_ADDR_LO [31:0]
+ * 5. DST_ADDR_HI [15:0]
+ * 6. COMMAND [29:22] | BYTE_COUNT [20:0]
+ */
+#define PKT3_CP_DMA_CP_SYNC       (1 << 31)
+#define PKT3_CP_DMA_SRC_SEL(x)       ((x) << 29)
+/* 0 - SRC_ADDR
+ * 1 - GDS (program SAS to 1 as well)
+ * 2 - DATA
+ */
+#define PKT3_CP_DMA_DST_SEL(x)       ((x) << 20)
+/* 0 - DST_ADDR
+ * 1 - GDS (program DAS to 1 as well)
+ */
+/* COMMAND */
+#define PKT3_CP_DMA_CMD_SRC_SWAP(x) ((x) << 23)
+/* 0 - none
+ * 1 - 8 in 16
+ * 2 - 8 in 32
+ * 3 - 8 in 64
+ */
+#define PKT3_CP_DMA_CMD_DST_SWAP(x) ((x) << 24)
+/* 0 - none
+ * 1 - 8 in 16
+ * 2 - 8 in 32
+ * 3 - 8 in 64
+ */
+#define PKT3_CP_DMA_CMD_SAS       (1 << 26)
+/* 0 - memory
+ * 1 - register
+ */
+#define PKT3_CP_DMA_CMD_DAS       (1 << 27)
+/* 0 - memory
+ * 1 - register
+ */
+#define PKT3_CP_DMA_CMD_SAIC      (1 << 28)
+#define PKT3_CP_DMA_CMD_DAIC      (1 << 29)
+#define PKT3_CP_DMA_CMD_RAW_WAIT  (1 << 30)
+
+#define PKT3_DMA_DATA                                  0x50 /* new for CIK */
+/* 1. header
+ * 2. CP_SYNC [31] | SRC_SEL [30:29] | DST_SEL [21:20] | ENGINE [0]
+ * 3. SRC_ADDR_LO [31:0] or DATA [31:0]
+ * 4. SRC_ADDR_HI [31:0]
+ * 5. DST_ADDR_LO [31:0]
+ * 6. DST_ADDR_HI [31:0]
+ * 7. COMMAND [29:22] | BYTE_COUNT [20:0]
+ */
+
+
 #define R_0084FC_CP_STRMOUT_CNTL                                       0x0084FC
 #define   S_0084FC_OFFSET_UPDATE_DONE(x)                             (((x) & 0x1) << 0)
 #define R_0085F0_CP_COHER_CNTL                                          0x0085F0
index a619d709754ae9eb9d709388e64c3232e8963620..9c6589a7a96c4f0bcca68ce116998e3a5fa80d22 100644 (file)
@@ -501,4 +501,16 @@ struct radeon_winsys {
                             enum radeon_value_id value);
 };
 
+static INLINE void radeon_emit(struct radeon_winsys_cs *cs, uint32_t value)
+{
+    cs->buf[cs->cdw++] = value;
+}
+
+static INLINE void radeon_emit_array(struct radeon_winsys_cs *cs,
+                                    const uint32_t *values, unsigned count)
+{
+    memcpy(cs->buf+cs->cdw, values, count * 4);
+    cs->cdw += count;
+}
+
 #endif