radeonsi: implement ARB_draw_indirect
author: Marek Olšák <marek.olsak@amd.com>
Thu, 24 Apr 2014 01:03:43 +0000 (03:03 +0200)
committer: Marek Olšák <marek.olsak@amd.com>
Thu, 17 Jul 2014 23:58:58 +0000 (01:58 +0200)
Reviewed-by: Michel Dänzer <michel.daenzer@amd.com>
docs/GL3.txt
docs/relnotes/10.3.html
src/gallium/drivers/radeonsi/si_commands.c
src/gallium/drivers/radeonsi/si_pipe.c
src/gallium/drivers/radeonsi/si_state.h
src/gallium/drivers/radeonsi/si_state_draw.c
src/gallium/drivers/radeonsi/sid.h

index 924f47990a6ccf39b138ff59c87fe45ca4bca2c8..0f37da4102d677af64d3a0bd83940d427e17afc8 100644 (file)
@@ -98,7 +98,7 @@ GL 4.0:
 
   GLSL 4.0                                             not started
   GL_ARB_draw_buffers_blend                            DONE (i965, nv50, nvc0, r600, radeonsi, softpipe)
-  GL_ARB_draw_indirect                                 DONE (i965, nvc0)
+  GL_ARB_draw_indirect                                 DONE (i965, nvc0, radeonsi, softpipe, llvmpipe)
   GL_ARB_gpu_shader5                                   started
   - 'precise' qualifier                                DONE
   - Dynamically uniform sampler array indices          started (Chris)
@@ -165,7 +165,7 @@ GL 4.3:
   GL_ARB_framebuffer_no_attachments                    not started
   GL_ARB_internalformat_query2                         not started
   GL_ARB_invalidate_subdata                            DONE (all drivers)
-  GL_ARB_multi_draw_indirect                           DONE (i965, nvc0)
+  GL_ARB_multi_draw_indirect                           DONE (i965, nvc0, radeonsi, softpipe, llvmpipe)
   GL_ARB_program_interface_query                       not started
   GL_ARB_robust_buffer_access_behavior                 not started
   GL_ARB_shader_image_size                             not started
index b757e5f017514d341179d9db810ae1e1fd97d201..90247c09c05ea2136fd095b14ca79edac1fb2b9f 100644 (file)
@@ -45,7 +45,9 @@ Note: some of the new features are only available with certain drivers.
 
 <ul>
 <li>GL_ARB_compressed_texture_pixel_storage on all drivers</li>
+<li>GL_ARB_draw_indirect on nvc0, radeonsi</li>
 <li>GL_ARB_explicit_uniform_location (all drivers that support GLSL)</li>
+<li>GL_ARB_multi_draw_indirect on nvc0, radeonsi</li>
 <li>GL_ARB_sample_shading on radeonsi</li>
 <li>GL_ARB_stencil_texturing on nv50, nvc0, r600, and radeonsi</li>
 <li>GL_ARB_texture_cube_map_array on radeonsi</li>
@@ -56,7 +58,6 @@ Note: some of the new features are only available with certain drivers.
 <li>GL_ARB_seamless_cubemap_per_texture on i965, llvmpipe, nvc0, r600, radeonsi, softpipe</li>
 <li>GL_ARB_fragment_layer_viewport on nv50, nvc0, llvmpipe, r600</li>
 <li>GL_AMD_vertex_shader_viewport_index on i965/gen7+, r600</li>
-<li>GL_ARB_(multi_)draw_indirect on nvc0</li>
 </ul>
 
 
index 5ddc40e1ec049f6f269a100a527bac5487e72916..2efdedaf4993ce644ab1c03ea7c6a54cc8b94e11 100644 (file)
@@ -57,6 +57,59 @@ void si_cmd_draw_index_auto(struct si_pm4_state *pm4, uint32_t count,
        si_pm4_cmd_end(pm4, predicate);
 }
 
+/**
+ * Append the PM4 packets for a non-indexed indirect draw to \p pm4.
+ *
+ * Two packets are emitted, in this order: PKT3_SET_BASE carrying the
+ * 64-bit address of the indirect buffer, then PKT3_DRAW_INDIRECT.
+ *
+ * \param pm4             command state the packets are appended to
+ * \param indirect_va     address of the indirect buffer; asserted to be a
+ *                        multiple of 8
+ * \param indirect_offset offset of the draw parameters, emitted as the first
+ *                        dword of the draw packet; asserted to be a multiple
+ *                        of 4
+ * \param base_vtx_loc    register offset for the base vertex; emitted as a
+ *                        dword index relative to SI_SH_REG_OFFSET
+ * \param start_inst_loc  register offset for the start instance; emitted as
+ *                        a dword index relative to SI_SH_REG_OFFSET
+ * \param predicate       forwarded unchanged to si_pm4_cmd_end() for both
+ *                        packets
+ */
+void si_cmd_draw_indirect(struct si_pm4_state *pm4, uint64_t indirect_va,
+                         uint32_t indirect_offset, uint32_t base_vtx_loc,
+                         uint32_t start_inst_loc, bool predicate)
+{
+       assert(indirect_va % 8 == 0);
+       assert(indirect_offset % 4 == 0);
+
+       /* SET_BASE payload: the constant 1, then the address as low/high dwords.
+        * NOTE(review): 1 presumably selects the indirect-draw base slot -
+        * confirm against the PM4 packet documentation. */
+       si_pm4_cmd_begin(pm4, PKT3_SET_BASE);
+       si_pm4_cmd_add(pm4, 1);
+       si_pm4_cmd_add(pm4, indirect_va);
+       si_pm4_cmd_add(pm4, indirect_va >> 32);
+       si_pm4_cmd_end(pm4, predicate);
+
+       /* DRAW_INDIRECT payload: data offset, the two register indices
+        * (byte offset from SI_SH_REG_OFFSET divided by 4), and the
+        * auto-index source select. */
+       si_pm4_cmd_begin(pm4, PKT3_DRAW_INDIRECT);
+       si_pm4_cmd_add(pm4, indirect_offset);
+       si_pm4_cmd_add(pm4, (base_vtx_loc - SI_SH_REG_OFFSET) >> 2);
+       si_pm4_cmd_add(pm4, (start_inst_loc - SI_SH_REG_OFFSET) >> 2);
+       si_pm4_cmd_add(pm4, V_0287F0_DI_SRC_SEL_AUTO_INDEX);
+       si_pm4_cmd_end(pm4, predicate);
+}
+
+/**
+ * Append the PM4 packets for an indexed indirect draw to \p pm4.
+ *
+ * Four packets are emitted, in this order: PKT3_SET_BASE (indirect buffer
+ * address), PKT3_INDEX_BASE (index buffer address), PKT3_INDEX_BUFFER_SIZE
+ * and PKT3_DRAW_INDEX_INDIRECT.
+ *
+ * \param pm4             command state the packets are appended to
+ * \param indirect_va     address of the indirect buffer; asserted to be a
+ *                        multiple of 8
+ * \param index_va        address of the index buffer; asserted to be a
+ *                        multiple of 2
+ * \param index_max_size  value emitted as the INDEX_BUFFER_SIZE payload
+ * \param indirect_offset offset of the draw parameters, emitted as the first
+ *                        dword of the draw packet; asserted to be a multiple
+ *                        of 4
+ * \param base_vtx_loc    register offset for the base vertex; emitted as a
+ *                        dword index relative to SI_SH_REG_OFFSET
+ * \param start_inst_loc  register offset for the start instance; emitted as
+ *                        a dword index relative to SI_SH_REG_OFFSET
+ * \param predicate       forwarded unchanged to si_pm4_cmd_end() for every
+ *                        packet
+ */
+void si_cmd_draw_index_indirect(struct si_pm4_state *pm4, uint64_t indirect_va,
+                               uint64_t index_va, uint32_t index_max_size,
+                               uint32_t indirect_offset, uint32_t base_vtx_loc,
+                               uint32_t start_inst_loc, bool predicate)
+{
+       assert(indirect_va % 8 == 0);
+       assert(index_va % 2 == 0);
+       assert(indirect_offset % 4 == 0);
+
+       /* SET_BASE payload: the constant 1, then the address as low/high dwords.
+        * NOTE(review): 1 presumably selects the indirect-draw base slot -
+        * confirm against the PM4 packet documentation. */
+       si_pm4_cmd_begin(pm4, PKT3_SET_BASE);
+       si_pm4_cmd_add(pm4, 1);
+       si_pm4_cmd_add(pm4, indirect_va);
+       si_pm4_cmd_add(pm4, indirect_va >> 32);
+       si_pm4_cmd_end(pm4, predicate);
+
+       /* Index buffer address, split into low and high dwords. */
+       si_pm4_cmd_begin(pm4, PKT3_INDEX_BASE);
+       si_pm4_cmd_add(pm4, index_va);
+       si_pm4_cmd_add(pm4, index_va >> 32);
+       si_pm4_cmd_end(pm4, predicate);
+
+       si_pm4_cmd_begin(pm4, PKT3_INDEX_BUFFER_SIZE);
+       si_pm4_cmd_add(pm4, index_max_size);
+       si_pm4_cmd_end(pm4, predicate);
+
+       /* Same payload layout as the non-indexed draw packet, except that the
+        * source select is DMA instead of auto-index. */
+       si_pm4_cmd_begin(pm4, PKT3_DRAW_INDEX_INDIRECT);
+       si_pm4_cmd_add(pm4, indirect_offset);
+       si_pm4_cmd_add(pm4, (base_vtx_loc - SI_SH_REG_OFFSET) >> 2);
+       si_pm4_cmd_add(pm4, (start_inst_loc - SI_SH_REG_OFFSET) >> 2);
+       si_pm4_cmd_add(pm4, V_0287F0_DI_SRC_SEL_DMA);
+       si_pm4_cmd_end(pm4, predicate);
+}
+
 void si_cmd_surface_sync(struct si_pm4_state *pm4, uint32_t cp_coher_cntl)
 {
        if (pm4->chip_class >= CIK) {
index cef6e5073e26d1f4faf8bd2a6df927f96ce957aa..4f1926872897f05edb9d6003cbb2b3c1e030e8bc 100644 (file)
@@ -214,6 +214,7 @@ static int si_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
        case PIPE_CAP_BUFFER_MAP_PERSISTENT_COHERENT:
        case PIPE_CAP_CUBE_MAP_ARRAY:
        case PIPE_CAP_SAMPLE_SHADING:
+       case PIPE_CAP_DRAW_INDIRECT:
                return 1;
 
        case PIPE_CAP_TEXTURE_MULTISAMPLE:
index fc3b1b97ea1fc9f74962b753e647a62372405311..ae42e6673b58b7b7b71cea25d00f4c88742a809c 100644 (file)
@@ -265,6 +265,13 @@ void si_cmd_draw_index_2(struct si_pm4_state *pm4, uint32_t max_size,
                         uint32_t initiator, bool predicate);
 void si_cmd_draw_index_auto(struct si_pm4_state *pm4, uint32_t count,
                            uint32_t initiator, bool predicate);
+void si_cmd_draw_indirect(struct si_pm4_state *pm4, uint64_t indirect_va,
+                         uint32_t indirect_offset, uint32_t base_vtx_loc,
+                         uint32_t start_inst_loc, bool predicate);
+void si_cmd_draw_index_indirect(struct si_pm4_state *pm4, uint64_t indirect_va,
+                               uint64_t index_va, uint32_t index_max_size,
+                               uint32_t indirect_offset, uint32_t base_vtx_loc,
+                               uint32_t start_inst_loc, bool predicate);
 void si_cmd_surface_sync(struct si_pm4_state *pm4, uint32_t cp_coher_cntl);
 
 #endif
index e2b29c349bedd454d819391522b6fdc0493d461a..bac18464a1d5b2ca7acb4a562040be99009cbf0e 100644 (file)
@@ -783,15 +783,18 @@ static void si_state_draw(struct si_context *sctx,
        }
        si_pm4_cmd_end(pm4, sctx->b.predicate_drawing);
 
-       si_pm4_cmd_begin(pm4, PKT3_NUM_INSTANCES);
-       si_pm4_cmd_add(pm4, info->instance_count);
-       si_pm4_cmd_end(pm4, sctx->b.predicate_drawing);
-
        if (!info->indirect) {
+               si_pm4_cmd_begin(pm4, PKT3_NUM_INSTANCES);
+               si_pm4_cmd_add(pm4, info->instance_count);
+               si_pm4_cmd_end(pm4, sctx->b.predicate_drawing);
+
                si_pm4_set_reg(pm4, sh_base_reg + SI_SGPR_BASE_VERTEX * 4,
                               info->indexed ? info->index_bias : info->start);
                si_pm4_set_reg(pm4, sh_base_reg + SI_SGPR_START_INSTANCE * 4,
                               info->start_instance);
+       } else {
+               si_pm4_add_bo(pm4, (struct r600_resource *)info->indirect,
+                             RADEON_USAGE_READ, RADEON_PRIO_MIN);
        }
 
        if (info->indexed) {
@@ -803,14 +806,35 @@ static void si_state_draw(struct si_context *sctx,
 
                si_pm4_add_bo(pm4, (struct r600_resource *)ib->buffer, RADEON_USAGE_READ,
                              RADEON_PRIO_MIN);
-               va += info->start * ib->index_size;
-               si_cmd_draw_index_2(pm4, max_size, va, info->count,
-                                   V_0287F0_DI_SRC_SEL_DMA,
-                                   sctx->b.predicate_drawing);
+
+               if (info->indirect) {
+                       uint64_t indirect_va = r600_resource_va(&sctx->screen->b.b,
+                                                               info->indirect);
+                       si_cmd_draw_index_indirect(pm4, indirect_va, va, max_size,
+                                                  info->indirect_offset,
+                                                  sh_base_reg + SI_SGPR_BASE_VERTEX * 4,
+                                                  sh_base_reg + SI_SGPR_START_INSTANCE * 4,
+                                                  sctx->b.predicate_drawing);
+               } else {
+                       va += info->start * ib->index_size;
+                       si_cmd_draw_index_2(pm4, max_size, va, info->count,
+                                           V_0287F0_DI_SRC_SEL_DMA,
+                                           sctx->b.predicate_drawing);
+               }
        } else {
-               uint32_t initiator = V_0287F0_DI_SRC_SEL_AUTO_INDEX;
-               initiator |= S_0287F0_USE_OPAQUE(!!info->count_from_stream_output);
-               si_cmd_draw_index_auto(pm4, info->count, initiator, sctx->b.predicate_drawing);
+               if (info->indirect) {
+                       uint64_t indirect_va = r600_resource_va(&sctx->screen->b.b,
+                                                               info->indirect);
+                       si_cmd_draw_indirect(pm4, indirect_va, info->indirect_offset,
+                                            sh_base_reg + SI_SGPR_BASE_VERTEX * 4,
+                                            sh_base_reg + SI_SGPR_START_INSTANCE * 4,
+                                            sctx->b.predicate_drawing);
+               } else {
+                       si_cmd_draw_index_auto(pm4, info->count,
+                                              V_0287F0_DI_SRC_SEL_AUTO_INDEX |
+                                              S_0287F0_USE_OPAQUE(!!info->count_from_stream_output),
+                                              sctx->b.predicate_drawing);
+               }
        }
 
        si_pm4_set_state(sctx, draw, pm4);
@@ -898,13 +922,32 @@ void si_emit_cache_flush(struct r600_common_context *sctx, struct r600_atom *ato
 
 const struct r600_atom si_atom_cache_flush = { si_emit_cache_flush, 13 }; /* number of CS dwords */
 
+/**
+ * Fetch the start and count of a draw, whether it is direct or indirect.
+ *
+ * For a direct draw the values are copied from \p info. For an indirect
+ * draw the indirect buffer is mapped for reading and the values are read
+ * from the draw parameters stored at info->indirect_offset.
+ *
+ * \param sctx   context whose common part (sctx->b) is passed to the map call
+ * \param info   draw description; info->indirect selects the indirect path
+ * \param start  receives info->start, or dword 2 of the indirect parameters
+ * \param count  receives info->count, or dword 0 of the indirect parameters
+ */
+static void si_get_draw_start_count(struct si_context *sctx,
+                                   const struct pipe_draw_info *info,
+                                   unsigned *start, unsigned *count)
+{
+       if (info->indirect) {
+               struct r600_resource *indirect =
+                       (struct r600_resource*)info->indirect;
+               /* NOTE(review): the mapped pointer is used without a NULL
+                * check - confirm whether this map call can fail.
+                * The helper name suggests it waits for pending GPU work
+                * before returning - TODO confirm. */
+               int *data = r600_buffer_map_sync_with_rings(&sctx->b,
+                                       indirect, PIPE_TRANSFER_READ);
+               /* indirect_offset is divided by sizeof(int) to step the
+                * int pointer to the first parameter dword. */
+                data += info->indirect_offset/sizeof(int);
+               /* Presumably the GL indirect command layout, where dword 0
+                * is the count and dword 2 is first/firstIndex - verify
+                * against the ARB_draw_indirect spec. */
+               *start = data[2];
+               *count = data[0];
+       } else {
+               *start = info->start;
+               *count = info->count;
+       }
+}
+
 void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
 {
        struct si_context *sctx = (struct si_context *)ctx;
        struct pipe_index_buffer ib = {};
        uint32_t i;
 
-       if (!info->count && (info->indexed || !info->count_from_stream_output))
+       if (!info->count && !info->indirect &&
+           (info->indexed || !info->count_from_stream_output))
                return;
 
        if (!sctx->ps_shader || !sctx->vs_shader)
@@ -926,8 +969,7 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
                        unsigned out_offset, start, count, start_offset;
                        void *ptr;
 
-                       start = info->start;
-                       count = info->count;
+                       si_get_draw_start_count(sctx, info, &start, &count);
                        start_offset = start * ib.index_size;
 
                        u_upload_alloc(sctx->b.uploader, start_offset, count * 2,
@@ -946,8 +988,7 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
                } else if (ib.user_buffer && !ib.buffer) {
                        unsigned start, count, start_offset;
 
-                       start = info->start;
-                       count = info->count;
+                       si_get_draw_start_count(sctx, info, &start, &count);
                        start_offset = start * ib.index_size;
 
                        u_upload_data(sctx->b.uploader, start_offset, count * ib.index_size,
index e3f788ef6a7085928198e94ecd230419e5e176ea..3241725ca994059126ac9bffbe0061eb4ccf8949 100644 (file)
 #define R600_TEXEL_PITCH_ALIGNMENT_MASK        0x7
 
 #define PKT3_NOP                               0x10
+#define PKT3_SET_BASE                          0x11
+#define PKT3_CLEAR_STATE                       0x12
+#define PKT3_INDEX_BUFFER_SIZE                 0x13
 #define PKT3_DISPATCH_DIRECT                   0x15
 #define PKT3_DISPATCH_INDIRECT                 0x16
 #define PKT3_OCCLUSION_QUERY                   0x1F /* new for CIK */
 #define PKT3_SET_PREDICATION                   0x20
 #define PKT3_COND_EXEC                         0x22
 #define PKT3_PRED_EXEC                         0x23
+#define PKT3_DRAW_INDIRECT                     0x24
+#define PKT3_DRAW_INDEX_INDIRECT               0x25
+#define PKT3_INDEX_BASE                        0x26
 #define PKT3_DRAW_INDEX_2                      0x27
 #define PKT3_CONTEXT_CONTROL                   0x28
 #define PKT3_INDEX_TYPE                        0x2A
+#define PKT3_DRAW_INDIRECT_MULTI               0x2C
 #define PKT3_DRAW_INDEX_AUTO                   0x2D
 #define PKT3_DRAW_INDEX_IMMD                   0x2E /* not on CIK */
 #define PKT3_NUM_INSTANCES                     0x2F
+#define PKT3_DRAW_INDEX_MULTI_AUTO             0x30
+#define PKT3_INDIRECT_BUFFER                   0x32
 #define PKT3_STRMOUT_BUFFER_UPDATE             0x34
 #define PKT3_DRAW_INDEX_OFFSET_2               0x35
 #define PKT3_DRAW_PREAMBLE                     0x36 /* new on CIK, required on GFX7.2 and later */
 #define PKT3_WRITE_DATA_ENGINE_SEL_ME              0
 #define PKT3_WRITE_DATA_ENGINE_SEL_PFP             1
 #define PKT3_WRITE_DATA_ENGINE_SEL_CE              2
+#define PKT3_DRAW_INDEX_INDIRECT_MULTI         0x38
 #define PKT3_MEM_SEMAPHORE                     0x39
 #define PKT3_MPEG_INDEX                        0x3A /* not on CIK */
 #define PKT3_WAIT_REG_MEM                      0x3C
 #define                WAIT_REG_MEM_EQUAL              3
 #define PKT3_MEM_WRITE                         0x3D /* not on CIK */
-#define PKT3_INDIRECT_BUFFER                   0x32
 #define PKT3_COPY_DATA                        0x40
 #define                COPY_DATA_SRC_SEL(x)            ((x) & 0xf)
 #define                        COPY_DATA_REG           0