radeonsi: use faster integer division for instance divisors
author Marek Olšák <marek.olsak@amd.com>
Sun, 23 Sep 2018 02:02:32 +0000 (22:02 -0400)
committer Marek Olšák <marek.olsak@amd.com>
Tue, 16 Oct 2018 21:23:25 +0000 (17:23 -0400)
We know the divisors when we upload them, so instead we can precompute
and upload division factors derived from each divisor.

This fast division consists of add, mul_hi, and two shifts,
and we have to load 4 dwords instead of 1.

This probably won't affect any apps.

src/gallium/drivers/radeonsi/si_shader.c
src/gallium/drivers/radeonsi/si_state.c
src/gallium/drivers/radeonsi/si_state.h

index acd4d34f89946e42e07cd0e75d500c8fa87ef047..19522cc97b1721a11af6970248c843e7afc10544 100644 (file)
@@ -428,20 +428,6 @@ static LLVMValueRef get_tcs_in_vertex_dw_stride(struct si_shader_context *ctx)
        }
 }
 
-static LLVMValueRef get_instance_index_for_fetch(
-       struct si_shader_context *ctx,
-       unsigned param_start_instance, LLVMValueRef divisor)
-{
-       LLVMValueRef result = ctx->abi.instance_id;
-
-       /* The division must be done before START_INSTANCE is added. */
-       if (divisor != ctx->i32_1)
-               result = LLVMBuildUDiv(ctx->ac.builder, result, divisor, "");
-
-       return LLVMBuildAdd(ctx->ac.builder, result,
-                           LLVMGetParam(ctx->main_fn, param_start_instance), "");
-}
-
 /* Bitcast <4 x float> to <2 x double>, extract the component, and convert
  * to float. */
 static LLVMValueRef extract_double_to_float(struct si_shader_context *ctx,
@@ -7302,22 +7288,32 @@ static void si_build_vs_prolog_function(struct si_shader_context *ctx,
                        key->vs_prolog.states.instance_divisor_is_one & (1u << i);
                bool divisor_is_fetched =
                        key->vs_prolog.states.instance_divisor_is_fetched & (1u << i);
-               LLVMValueRef index;
-
-               if (divisor_is_one || divisor_is_fetched) {
-                       LLVMValueRef divisor = ctx->i32_1;
-
-                       if (divisor_is_fetched) {
-                               divisor = buffer_load_const(ctx, instance_divisor_constbuf,
-                                                           LLVMConstInt(ctx->i32, i * 4, 0));
-                               divisor = ac_to_integer(&ctx->ac, divisor);
+               LLVMValueRef index = NULL;
+
+               if (divisor_is_one) {
+                       index = ctx->abi.instance_id;
+               } else if (divisor_is_fetched) {
+                       LLVMValueRef udiv_factors[4];
+
+                       for (unsigned j = 0; j < 4; j++) {
+                               udiv_factors[j] =
+                                       buffer_load_const(ctx, instance_divisor_constbuf,
+                                                         LLVMConstInt(ctx->i32, i*16 + j*4, 0));
+                               udiv_factors[j] = ac_to_integer(&ctx->ac, udiv_factors[j]);
                        }
+                       /* The faster NUW version doesn't work when InstanceID == UINT_MAX.
+                        * Such InstanceID might not be achievable in a reasonable time though.
+                        */
+                       index = ac_build_fast_udiv_nuw(&ctx->ac, ctx->abi.instance_id,
+                                                      udiv_factors[0], udiv_factors[1],
+                                                      udiv_factors[2], udiv_factors[3]);
+               }
 
-                       /* InstanceID / Divisor + StartInstance */
-                       index = get_instance_index_for_fetch(ctx,
-                                                            user_sgpr_base +
-                                                            SI_SGPR_START_INSTANCE,
-                                                            divisor);
+               if (divisor_is_one || divisor_is_fetched) {
+                       /* Add StartInstance. */
+                       index = LLVMBuildAdd(ctx->ac.builder, index,
+                                            LLVMGetParam(ctx->main_fn, user_sgpr_base +
+                                                         SI_SGPR_START_INSTANCE), "");
                } else {
                        /* VertexID + BaseVertex */
                        index = LLVMBuildAdd(ctx->ac.builder,
index 827d8495006031e3030ca736922c2bf4c0e8cdff..8e4cdddf0b9a57e6fd95ea9730893b018144c21c 100644 (file)
@@ -32,6 +32,7 @@
 #include "util/u_memory.h"
 #include "util/u_resource.h"
 #include "util/u_upload_mgr.h"
+#include "util/fast_idiv_by_const.h"
 
 static unsigned si_map_swizzle(unsigned swizzle)
 {
@@ -4372,6 +4373,29 @@ static void si_delete_sampler_state(struct pipe_context *ctx, void *state)
  * Vertex elements & buffers
  */
 
+struct util_fast_udiv_info32 {
+   unsigned multiplier; /* the "magic number" multiplier */
+   unsigned pre_shift; /* shift for the dividend before multiplying */
+   unsigned post_shift; /* shift for the dividend after multiplying */
+   int increment; /* 0 or 1; if set then increment the numerator, using one of
+                     the two strategies */
+};
+
+static struct util_fast_udiv_info32
+util_compute_fast_udiv_info32(uint32_t D, unsigned num_bits)
+{
+       struct util_fast_udiv_info info =
+               util_compute_fast_udiv_info(D, num_bits, 32);
+
+       struct util_fast_udiv_info32 result = {
+               info.multiplier,
+               info.pre_shift,
+               info.post_shift,
+               info.increment,
+       };
+       return result;
+}
+
 static void *si_create_vertex_elements(struct pipe_context *ctx,
                                       unsigned count,
                                       const struct pipe_vertex_element *elements)
@@ -4379,6 +4403,12 @@ static void *si_create_vertex_elements(struct pipe_context *ctx,
        struct si_screen *sscreen = (struct si_screen*)ctx->screen;
        struct si_vertex_elements *v = CALLOC_STRUCT(si_vertex_elements);
        bool used[SI_NUM_VERTEX_BUFFERS] = {};
+       struct util_fast_udiv_info32 divisor_factors[SI_MAX_ATTRIBS] = {};
+       STATIC_ASSERT(sizeof(struct util_fast_udiv_info32) == 16);
+       STATIC_ASSERT(sizeof(divisor_factors[0].multiplier) == 4);
+       STATIC_ASSERT(sizeof(divisor_factors[0].pre_shift) == 4);
+       STATIC_ASSERT(sizeof(divisor_factors[0].post_shift) == 4);
+       STATIC_ASSERT(sizeof(divisor_factors[0].increment) == 4);
        int i;
 
        assert(count <= SI_MAX_ATTRIBS);
@@ -4401,14 +4431,17 @@ static void *si_create_vertex_elements(struct pipe_context *ctx,
                        return NULL;
                }
 
-               if (elements[i].instance_divisor) {
+               unsigned instance_divisor = elements[i].instance_divisor;
+               if (instance_divisor) {
                        v->uses_instance_divisors = true;
-                       v->instance_divisors[i] = elements[i].instance_divisor;
 
-                       if (v->instance_divisors[i] == 1)
+                       if (instance_divisor == 1) {
                                v->instance_divisor_is_one |= 1u << i;
-                       else
+                       } else {
                                v->instance_divisor_is_fetched |= 1u << i;
+                               divisor_factors[i] =
+                                       util_compute_fast_udiv_info32(instance_divisor, 32);
+                       }
                }
 
                if (!used[vbo_index]) {
@@ -4518,6 +4551,22 @@ static void *si_create_vertex_elements(struct pipe_context *ctx,
                                   S_008F0C_NUM_FORMAT(num_format) |
                                   S_008F0C_DATA_FORMAT(data_format);
        }
+
+       if (v->instance_divisor_is_fetched) {
+               unsigned num_divisors = util_last_bit(v->instance_divisor_is_fetched);
+
+               v->instance_divisor_factor_buffer =
+                       (struct r600_resource*)
+                       pipe_buffer_create(&sscreen->b, 0, PIPE_USAGE_DEFAULT,
+                                          num_divisors * sizeof(divisor_factors[0]));
+               if (!v->instance_divisor_factor_buffer) {
+                       FREE(v);
+                       return NULL;
+               }
+               void *map = sscreen->ws->buffer_map(v->instance_divisor_factor_buffer->buf,
+                                                   NULL, PIPE_TRANSFER_WRITE);
+               memcpy(map , divisor_factors, num_divisors * sizeof(divisor_factors[0]));
+       }
        return v;
 }
 
@@ -4541,10 +4590,10 @@ static void si_bind_vertex_elements(struct pipe_context *ctx, void *state)
        if (v && v->instance_divisor_is_fetched) {
                struct pipe_constant_buffer cb;
 
-               cb.buffer = NULL;
-               cb.user_buffer = v->instance_divisors;
+               cb.buffer = &v->instance_divisor_factor_buffer->b.b;
+               cb.user_buffer = NULL;
                cb.buffer_offset = 0;
-               cb.buffer_size = sizeof(uint32_t) * v->count;
+               cb.buffer_size = 0xffffffff;
                si_set_rw_buffer(sctx, SI_VS_CONST_INSTANCE_DIVISORS, &cb);
        }
 }
@@ -4552,9 +4601,11 @@ static void si_bind_vertex_elements(struct pipe_context *ctx, void *state)
 static void si_delete_vertex_element(struct pipe_context *ctx, void *state)
 {
        struct si_context *sctx = (struct si_context *)ctx;
+       struct si_vertex_elements *v = (struct si_vertex_elements*)state;
 
        if (sctx->vertex_elements == state)
                sctx->vertex_elements = NULL;
+       r600_resource_reference(&v->instance_divisor_factor_buffer, NULL);
        FREE(state);
 }
 
index 16fd223d00af056b52d426b1c94e7f04a321eddb..f52296d11193f2bf47cb4eede1895862aff37c1c 100644 (file)
@@ -132,7 +132,7 @@ struct si_stencil_ref {
 
 struct si_vertex_elements
 {
-       uint32_t                        instance_divisors[SI_MAX_ATTRIBS];
+       struct r600_resource            *instance_divisor_factor_buffer;
        uint32_t                        rsrc_word3[SI_MAX_ATTRIBS];
        uint16_t                        src_offset[SI_MAX_ATTRIBS];
        uint8_t                         fix_fetch[SI_MAX_ATTRIBS];