radeonsi: if there's just const buffer 0, set it in place of CONST/SSBO pointer

author Marek Olšák <marek.olsak@amd.com>

Sun, 8 Oct 2017 01:44:07 +0000 (03:44 +0200)

committer Marek Olšák <marek.olsak@amd.com>

Tue, 17 Oct 2017 20:03:03 +0000 (22:03 +0200)
author Marek Olšák <marek.olsak@amd.com>
Sun, 8 Oct 2017 01:44:07 +0000 (03:44 +0200)
committer Marek Olšák <marek.olsak@amd.com>
Tue, 17 Oct 2017 20:03:03 +0000 (22:03 +0200)
diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c

index 0c1fca871818f48ec832eb06c70a4f9feb315d30..da6efa8394716fb54cc5d583664f0422d1128547 100644 (file)
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -126,6 +126,7 @@ static void si_init_descriptors(struct si_descriptors *desc,
         desc->element_dw_size = element_dw_size;
         desc->num_elements = num_elements;
         desc->shader_userdata_offset = shader_userdata_index * 4;
+       desc->slot_index_to_bind_directly = -1;
  }
  
  static void si_release_descriptors(struct si_descriptors *desc)
@@ -148,6 +149,20 @@ static bool si_upload_descriptors(struct si_context *sctx,
         if (!upload_size)
                 return true;
  
+       /* If there is just one active descriptor, bind it directly. */
+       if ((int)desc->first_active_slot == desc->slot_index_to_bind_directly &&
+           desc->num_active_slots == 1) {
+               uint32_t *descriptor = &desc->list[desc->slot_index_to_bind_directly *
+                                                  desc->element_dw_size];
+
+               /* The buffer is already in the buffer list. */
+               r600_resource_reference(&desc->buffer, NULL);
+               desc->gpu_list = NULL;
+               desc->gpu_address = si_desc_extract_buffer_address(descriptor);
+               si_mark_atom_dirty(sctx, &sctx->shader_pointers.atom);
+               return true;
+       }
+
         uint32_t *ptr;
         int buffer_offset;
         u_upload_alloc(sctx->b.b.const_uploader, 0, upload_size,
@@ -2531,14 +2546,15 @@ void si_init_all_descriptors(struct si_context *sctx)
                 bool gfx9_gs = false;
                 unsigned num_sampler_slots = SI_NUM_IMAGES / 2 + SI_NUM_SAMPLERS;
                 unsigned num_buffer_slots = SI_NUM_SHADER_BUFFERS + SI_NUM_CONST_BUFFERS;
+               struct si_descriptors *desc;
  
                 if (sctx->b.chip_class >= GFX9) {
                         gfx9_tcs = i == PIPE_SHADER_TESS_CTRL;
                         gfx9_gs = i == PIPE_SHADER_GEOMETRY;
                 }
  
-               si_init_buffer_resources(&sctx->const_and_shader_buffers[i],
-                                        si_const_and_shader_buffer_descriptors(sctx, i),
+               desc = si_const_and_shader_buffer_descriptors(sctx, i);
+               si_init_buffer_resources(&sctx->const_and_shader_buffers[i], desc,
                                          num_buffer_slots,
                                          gfx9_tcs ? GFX9_SGPR_TCS_CONST_AND_SHADER_BUFFERS :
                                          gfx9_gs ? GFX9_SGPR_GS_CONST_AND_SHADER_BUFFERS :
@@ -2547,8 +2563,9 @@ void si_init_all_descriptors(struct si_context *sctx)
                                          RADEON_USAGE_READ,
                                          RADEON_PRIO_SHADER_RW_BUFFER,
                                          RADEON_PRIO_CONST_BUFFER);
+               desc->slot_index_to_bind_directly = si_get_constbuf_slot(0);
  
-               struct si_descriptors *desc = si_sampler_and_image_descriptors(sctx, i);
+               desc = si_sampler_and_image_descriptors(sctx, i);
                 si_init_descriptors(desc,
                                     gfx9_tcs ? GFX9_SGPR_TCS_SAMPLERS_AND_IMAGES :
                                     gfx9_gs ? GFX9_SGPR_GS_SAMPLERS_AND_IMAGES :
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c

index f72e5af31fd0a28f2be8769e882100487aedad7c..c3fe13deeaac94cacf3920db0146575cefbdd5c3 100644 (file)
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -1973,6 +1973,7 @@ static LLVMValueRef fetch_constant(
         unsigned swizzle)
  {
         struct si_shader_context *ctx = si_shader_context(bld_base);
+       struct si_shader_selector *sel = ctx->shader->selector;
         const struct tgsi_ind_register *ireg = &reg->Indirect;
         unsigned buf, idx;
  
@@ -1996,9 +1997,60 @@ static LLVMValueRef fetch_constant(
                 return si_llvm_emit_fetch_64bit(bld_base, type, lo, hi);
         }
  
+       idx = reg->Register.Index * 4 + swizzle;
+       if (reg->Register.Indirect) {
+               addr = si_get_indirect_index(ctx, ireg, 16, idx * 4);
+       } else {
+               addr = LLVMConstInt(ctx->i32, idx * 4, 0);
+       }
+
+       /* Fast path when user data SGPRs point to constant buffer 0 directly. */
+       if (sel->info.const_buffers_declared == 1 &&
+           sel->info.shader_buffers_declared == 0) {
+               LLVMValueRef ptr =
+                       LLVMGetParam(ctx->main_fn, ctx->param_const_and_shader_buffers);
+
+               /* This enables use of s_load_dword and flat_load_dword for const buffer 0
+                * loads, and up to x4 load opcode merging. However, it leads to horrible
+                * code reducing SIMD wave occupancy from 8 to 2 in many cases.
+                *
+                * Using s_buffer_load_dword (x1) seems to be the best option right now.
+                */
+#if 0 /* keep this codepath disabled */
+               if (!reg->Register.Indirect) {
+                       addr = LLVMBuildLShr(ctx->ac.builder, addr, LLVMConstInt(ctx->i32, 2, 0), "");
+                       LLVMValueRef result = ac_build_load_invariant(&ctx->ac, ptr, addr);
+                       return bitcast(bld_base, type, result);
+               }
+#endif
+
+               /* Do the bounds checking with a descriptor, because
+                * doing computation and manual bounds checking of 64-bit
+                * addresses generates horrible VALU code with very high
+                * VGPR usage and very low SIMD occupancy.
+                */
+               ptr = LLVMBuildPtrToInt(ctx->ac.builder, ptr, ctx->i64, "");
+               ptr = LLVMBuildBitCast(ctx->ac.builder, ptr, ctx->v2i32, "");
+
+               LLVMValueRef desc_elems[] = {
+                       LLVMBuildExtractElement(ctx->ac.builder, ptr, ctx->i32_0, ""),
+                       LLVMBuildExtractElement(ctx->ac.builder, ptr, ctx->i32_1, ""),
+                       LLVMConstInt(ctx->i32, (sel->info.const_file_max[0] + 1) * 16, 0),
+                       LLVMConstInt(ctx->i32,
+                               S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
+                               S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
+                               S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
+                               S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
+                               S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
+                               S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32), 0)
+               };
+               LLVMValueRef desc = ac_build_gather_values(&ctx->ac, desc_elems, 4);
+               LLVMValueRef result = buffer_load_const(ctx, desc, addr);
+               return bitcast(bld_base, type, result);
+       }
+
         assert(reg->Register.Dimension);
         buf = reg->Dimension.Index;
-       idx = reg->Register.Index * 4 + swizzle;
  
         if (reg->Dimension.Indirect) {
                 LLVMValueRef ptr = LLVMGetParam(ctx->main_fn, ctx->param_const_and_shader_buffers);
@@ -2012,12 +2064,6 @@ static LLVMValueRef fetch_constant(
         } else
                 bufp = load_const_buffer_desc(ctx, buf);
  
-       if (reg->Register.Indirect) {
-               addr = si_get_indirect_index(ctx, ireg, 16, idx * 4);
-       } else {
-               addr = LLVMConstInt(ctx->i32, idx * 4, 0);
-       }
-
         return bitcast(bld_base, type, buffer_load_const(ctx, bufp, addr));
  }
  
@@ -4255,10 +4301,18 @@ static void declare_per_stage_desc_pointers(struct si_shader_context *ctx,
                                             struct si_function_info *fninfo,
                                             bool assign_params)
  {
+       LLVMTypeRef const_shader_buf_type;
+
+       if (ctx->shader->selector->info.const_buffers_declared == 1 &&
+           ctx->shader->selector->info.shader_buffers_declared == 0)
+               const_shader_buf_type = ctx->f32;
+       else
+               const_shader_buf_type = ctx->v4i32;
+
         unsigned const_and_shader_buffers =
                 add_arg(fninfo, ARG_SGPR,
-                       si_const_array(ctx->v4i32,
-                                      SI_NUM_SHADER_BUFFERS + SI_NUM_CONST_BUFFERS));
+                       si_const_array(const_shader_buf_type, 0));
+
         unsigned samplers_and_images =
                 add_arg(fninfo, ARG_SGPR,
                         si_const_array(ctx->v8i32,
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h

index ba80f550e49a218b5c1f6c7d326be498e127c2dd..ebe956e709e7eb6f95e08d601ba665ffa4b90180 100644 (file)
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -161,7 +161,7 @@ enum {
         SI_SGPR_RW_BUFFERS_HI,
         SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES,
         SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES_HI,
-       SI_SGPR_CONST_AND_SHADER_BUFFERS,
+       SI_SGPR_CONST_AND_SHADER_BUFFERS, /* or just a constant buffer 0 pointer */
         SI_SGPR_CONST_AND_SHADER_BUFFERS_HI,
         SI_SGPR_SAMPLERS_AND_IMAGES,
         SI_SGPR_SAMPLERS_AND_IMAGES_HI,
diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h

index eb1901b5a9e5fccc82ca9f0e6fc425cb05f9b42a..7eb0aa3c925aecf07836a7c7f650300c5c83b429 100644 (file)
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -279,6 +279,9 @@ struct si_descriptors {
         ubyte shader_userdata_offset;
         /* The size of one descriptor. */
         ubyte element_dw_size;
+       /* If there is only one slot enabled, bind it directly instead of
+        * uploading descriptors. -1 if disabled. */
+       signed char slot_index_to_bind_directly;
  };
  
  struct si_buffer_resources {
author	Marek Olšák <marek.olsak@amd.com>
	Sun, 8 Oct 2017 01:44:07 +0000 (03:44 +0200)
committer	Marek Olšák <marek.olsak@amd.com>
	Tue, 17 Oct 2017 20:03:03 +0000 (22:03 +0200)
src/gallium/drivers/radeonsi/si_descriptors.c		patch | blob | history
src/gallium/drivers/radeonsi/si_shader.c		patch | blob | history
src/gallium/drivers/radeonsi/si_shader.h		patch | blob | history
src/gallium/drivers/radeonsi/si_state.h		patch | blob | history