radeonsi: move instance divisors into a constant buffer
author Marek Olšák <marek.olsak@amd.com>
Fri, 9 Jun 2017 16:46:07 +0000 (18:46 +0200)
committer Marek Olšák <marek.olsak@amd.com>
Tue, 27 Jun 2017 17:55:09 +0000 (19:55 +0200)
Shader key size: 107 -> 47

Divisors of 0 and 1 are encoded in the shader key. Greater instance divisors
are loaded from a constant buffer.

The shader code doing the division is huge. Is it something we need to
worry about? Does any app use instance divisors >= 2?

VS prolog disassembly:
    s_load_dwordx4 s[12:15], s[0:1], 0x80  ; C00A0300 00000080
    s_nop 0                                ; BF800000
    s_waitcnt lgkmcnt(0)                   ; BF8C007F
    s_buffer_load_dword s14, s[12:15], 0x4 ; C0220386 00000004
    s_waitcnt lgkmcnt(0)                   ; BF8C007F
    v_cvt_f32_u32_e32 v4, s14              ; 7E080C0E
    v_rcp_iflag_f32_e32 v4, v4             ; 7E084704
    v_mul_f32_e32 v4, 0x4f800000, v4       ; 0A0808FF 4F800000
    v_cvt_u32_f32_e32 v4, v4               ; 7E080F04
    v_mul_hi_u32 v5, v4, s14               ; D2860005 00001D04
    v_mul_lo_i32 v6, v4, s14               ; D2850006 00001D04
    v_cmp_eq_u32_e64 s[12:13], 0, v5       ; D0CA000C 00020A80
    v_sub_i32_e32 v5, vcc, 0, v6           ; 340A0C80
    v_cndmask_b32_e64 v5, v6, v5, s[12:13] ; D1000005 00320B06
    v_mul_hi_u32 v5, v5, v4                ; D2860005 00020905
    v_add_i32_e32 v6, vcc, v5, v4          ; 320C0905
    v_subrev_i32_e32 v4, vcc, v5, v4       ; 36080905
    v_cndmask_b32_e64 v4, v4, v6, s[12:13] ; D1000004 00320D04
    v_mul_hi_u32 v5, v4, v1                ; D2860005 00020304
    v_add_i32_e32 v4, vcc, s8, v0          ; 32080008
    v_mul_lo_i32 v6, v5, s14               ; D2850006 00001D05
    v_add_i32_e32 v7, vcc, 1, v5           ; 320E0A81
    v_cmp_ge_u32_e64 s[12:13], v1, v6      ; D0CE000C 00020D01
    v_sub_i32_e32 v6, vcc, v1, v6          ; 340C0D01
    v_cmp_le_u32_e32 vcc, s14, v6          ; 7D960C0E
    v_cndmask_b32_e64 v8, 0, -1, s[12:13]  ; D1000008 00318280
    v_cndmask_b32_e64 v6, 0, -1, vcc       ; D1000006 01A98280
    v_and_b32_e32 v6, v8, v6               ; 260C0D08
    v_cmp_eq_u32_e32 vcc, 0, v6            ; 7D940C80
    v_cndmask_b32_e32 v6, v7, v5, vcc      ; 000C0B07
    v_add_i32_e32 v5, vcc, -1, v5          ; 320A0AC1
    v_cmp_eq_u32_e32 vcc, 0, v8            ; 7D941080
    v_cndmask_b32_e32 v5, v6, v5, vcc      ; 000A0B06
    v_add_i32_e32 v5, vcc, s9, v5          ; 320A0A09

v2: set prefer_mono for fetched instance divisors

Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
src/gallium/drivers/radeonsi/si_descriptors.c
src/gallium/drivers/radeonsi/si_pipe.c
src/gallium/drivers/radeonsi/si_shader.c
src/gallium/drivers/radeonsi/si_shader.h
src/gallium/drivers/radeonsi/si_state.c
src/gallium/drivers/radeonsi/si_state.h
src/gallium/drivers/radeonsi/si_state_shaders.c

index 75d2a1d86cf576178e36be3c0e1f55ae2fd9f521..88f7dcee959e2878478f5cf5a28e3596ae4c4a5b 100644 (file)
@@ -2192,6 +2192,8 @@ void si_emit_graphics_shader_userdata(struct si_context *sctx,
                                               R_00B330_SPI_SHADER_USER_DATA_ES_0);
                        si_emit_shader_pointer(sctx, descs,
                                               R_00B430_SPI_SHADER_USER_DATA_HS_0);
+                       si_emit_shader_pointer(sctx, descs,
+                                              R_00B530_SPI_SHADER_USER_DATA_LS_0);
                }
        }
 
index ff787adcf340e7860046f14cd99549e63ebc8904..5f3b7e112ce1d026d5484838a3df3b9ccae7708b 100644 (file)
@@ -308,6 +308,8 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen,
 
                si_set_rw_buffer(sctx, SI_HS_CONST_DEFAULT_TESS_LEVELS,
                                 &sctx->null_const_buf);
+               si_set_rw_buffer(sctx, SI_VS_CONST_INSTANCE_DIVISORS,
+                                &sctx->null_const_buf);
                si_set_rw_buffer(sctx, SI_VS_CONST_CLIP_PLANES,
                                 &sctx->null_const_buf);
                si_set_rw_buffer(sctx, SI_PS_CONST_POLY_STIPPLE,
index 42b08bfb84556ca7bfdf8374367168ec1025732b..55d1232512bd1757c2e51987b621b93f34f347cb 100644 (file)
@@ -312,7 +312,7 @@ get_tcs_out_current_patch_data_offset(struct si_shader_context *ctx)
 
 static LLVMValueRef get_instance_index_for_fetch(
        struct si_shader_context *ctx,
-       unsigned param_start_instance, unsigned divisor)
+       unsigned param_start_instance, LLVMValueRef divisor)
 {
        struct gallivm_state *gallivm = &ctx->gallivm;
 
@@ -320,9 +320,8 @@ static LLVMValueRef get_instance_index_for_fetch(
                                           ctx->param_instance_id);
 
        /* The division must be done before START_INSTANCE is added. */
-       if (divisor > 1)
-               result = LLVMBuildUDiv(gallivm->builder, result,
-                               LLVMConstInt(ctx->i32, divisor, 0), "");
+       if (divisor != ctx->i32_1)
+               result = LLVMBuildUDiv(gallivm->builder, result, divisor, "");
 
        return LLVMBuildAdd(gallivm->builder, result,
                            LLVMGetParam(ctx->main_fn, param_start_instance), "");
@@ -5282,12 +5281,10 @@ static void si_dump_shader_key_vs(const struct si_shader_key *key,
                                  const struct si_vs_prolog_bits *prolog,
                                  const char *prefix, FILE *f)
 {
-       fprintf(f, "  %s.instance_divisors = {", prefix);
-       for (int i = 0; i < ARRAY_SIZE(prolog->instance_divisors); i++) {
-               fprintf(f, !i ? "%u" : ", %u",
-                       prolog->instance_divisors[i]);
-       }
-       fprintf(f, "}\n");
+       fprintf(f, "  %s.instance_divisor_is_one = %u\n",
+               prefix, prolog->instance_divisor_is_one);
+       fprintf(f, "  %s.instance_divisor_is_fetched = %u\n",
+               prefix, prolog->instance_divisor_is_fetched);
 
        fprintf(f, "  mono.vs.fix_fetch = {");
        for (int i = 0; i < SI_MAX_ATTRIBS; i++)
@@ -5603,10 +5600,12 @@ static void si_get_vs_prolog_key(const struct tgsi_shader_info *info,
                key->vs_prolog.num_merged_next_stage_vgprs = 5;
        }
 
-       /* Set the instanceID flag. */
-       for (unsigned i = 0; i < info->num_inputs; i++)
-               if (key->vs_prolog.states.instance_divisors[i])
-                       shader_out->info.uses_instanceid = true;
+       /* Enable loading the InstanceID VGPR. */
+       uint16_t input_mask = u_bit_consecutive(0, info->num_inputs);
+
+       if ((key->vs_prolog.states.instance_divisor_is_one |
+            key->vs_prolog.states.instance_divisor_is_fetched) & input_mask)
+               shader_out->info.uses_instanceid = true;
 }
 
 /**
@@ -6527,6 +6526,21 @@ out:
        return result;
 }
 
+static LLVMValueRef si_prolog_get_rw_buffers(struct si_shader_context *ctx)
+{
+       struct gallivm_state *gallivm = &ctx->gallivm;
+       LLVMValueRef ptr[2], list;
+
+       /* Pack the RW_BUFFERS and RW_BUFFERS_HI params into an i64 and turn it into the rw buffer list pointer. */
+       ptr[0] = LLVMGetParam(ctx->main_fn, SI_SGPR_RW_BUFFERS);
+       ptr[1] = LLVMGetParam(ctx->main_fn, SI_SGPR_RW_BUFFERS_HI);
+       list = lp_build_gather_values(gallivm, ptr, 2);
+       list = LLVMBuildBitCast(gallivm->builder, list, ctx->i64, "");
+       list = LLVMBuildIntToPtr(gallivm->builder, list,
+                                si_const_array(ctx->v4i32, SI_NUM_RW_BUFFERS), "");
+       return list;
+}
+
 /**
  * Build the vertex shader prolog function.
  *
@@ -6609,11 +6623,33 @@ static void si_build_vs_prolog_function(struct si_shader_context *ctx,
        }
 
        /* Compute vertex load indices from instance divisors. */
+       LLVMValueRef instance_divisor_constbuf = NULL;
+
+       if (key->vs_prolog.states.instance_divisor_is_fetched) {
+               LLVMValueRef list = si_prolog_get_rw_buffers(ctx);
+               LLVMValueRef buf_index =
+                       LLVMConstInt(ctx->i32, SI_VS_CONST_INSTANCE_DIVISORS, 0);
+               instance_divisor_constbuf =
+                       ac_build_indexed_load_const(&ctx->ac, list, buf_index);
+       }
+
        for (i = 0; i <= key->vs_prolog.last_input; i++) {
-               unsigned divisor = key->vs_prolog.states.instance_divisors[i];
+               bool divisor_is_one =
+                       key->vs_prolog.states.instance_divisor_is_one & (1u << i);
+               bool divisor_is_fetched =
+                       key->vs_prolog.states.instance_divisor_is_fetched & (1u << i);
                LLVMValueRef index;
 
-               if (divisor) {
+               if (divisor_is_one || divisor_is_fetched) {
+                       LLVMValueRef divisor = ctx->i32_1;
+
+                       if (divisor_is_fetched) {
+                               divisor = buffer_load_const(ctx, instance_divisor_constbuf,
+                                                           LLVMConstInt(ctx->i32, i * 4, 0));
+                               divisor = LLVMBuildBitCast(gallivm->builder, divisor,
+                                                          ctx->i32, "");
+                       }
+
                        /* InstanceID / Divisor + StartInstance */
                        index = get_instance_index_for_fetch(ctx,
                                                             user_sgpr_base +
@@ -6866,15 +6902,7 @@ static void si_build_ps_prolog_function(struct si_shader_context *ctx,
                /* POS_FIXED_PT is always last. */
                unsigned pos = key->ps_prolog.num_input_sgprs +
                               key->ps_prolog.num_input_vgprs - 1;
-               LLVMValueRef ptr[2], list;
-
-               /* Get the pointer to rw buffers. */
-               ptr[0] = LLVMGetParam(func, SI_SGPR_RW_BUFFERS);
-               ptr[1] = LLVMGetParam(func, SI_SGPR_RW_BUFFERS_HI);
-               list = lp_build_gather_values(gallivm, ptr, 2);
-               list = LLVMBuildBitCast(gallivm->builder, list, ctx->i64, "");
-               list = LLVMBuildIntToPtr(gallivm->builder, list,
-                                         si_const_array(ctx->v4i32, SI_NUM_RW_BUFFERS), "");
+               LLVMValueRef list = si_prolog_get_rw_buffers(ctx);
 
                si_llvm_emit_polygon_stipple(ctx, list, pos);
        }
index 64321265cf9181c58d41fd95aa829b7ff1804156..a10067d025d017d252355add11930410323c9033 100644 (file)
@@ -385,7 +385,14 @@ struct si_shader_selector {
 
 /* Common VS bits between the shader key and the prolog key. */
 struct si_vs_prolog_bits {
-       unsigned        instance_divisors[SI_MAX_ATTRIBS];
+       /* - If neither "is_one" nor "is_fetched" has a bit set, the instance
+        *   divisor is 0.
+        * - If "is_one" has a bit set, the instance divisor is 1.
+        * - If "is_fetched" has a bit set, the instance divisor will be loaded
+        *   from the constant buffer.
+        */
+       uint16_t        instance_divisor_is_one;     /* bitmask of inputs */
+       uint16_t        instance_divisor_is_fetched; /* bitmask of inputs */
 };
 
 /* Common TCS bits between the shader key and the epilog key. */
index a674a602e3a6773fcdfb226df7fd32121e692d99..7e3d1a02e0734e412eac37bafa2ec5c4727fc702 100644 (file)
@@ -3773,6 +3773,11 @@ static void *si_create_vertex_elements(struct pipe_context *ctx,
                if (elements[i].instance_divisor) {
                        v->uses_instance_divisors = true;
                        v->instance_divisors[i] = elements[i].instance_divisor;
+
+                       if (v->instance_divisors[i] == 1)
+                               v->instance_divisor_is_one |= 1u << i;
+                       else
+                               v->instance_divisor_is_fetched |= 1u << i;
                }
 
                if (!used[vbo_index]) {
@@ -3901,6 +3906,16 @@ static void si_bind_vertex_elements(struct pipe_context *ctx, void *state)
             v->uses_instance_divisors || /* we don't check which divisors changed */
             memcmp(old->fix_fetch, v->fix_fetch, sizeof(v->fix_fetch[0]) * v->count)))
                sctx->do_update_shaders = true;
+
+       if (v && v->instance_divisor_is_fetched) {
+               struct pipe_constant_buffer cb;
+
+               cb.buffer = NULL;
+               cb.user_buffer = v->instance_divisors;
+               cb.buffer_offset = 0;
+               cb.buffer_size = sizeof(uint32_t) * v->count;
+               si_set_rw_buffer(sctx, SI_VS_CONST_INSTANCE_DIVISORS, &cb);
+       }
 }
 
 static void si_delete_vertex_element(struct pipe_context *ctx, void *state)
index c9e0770c7ba18ae2028f30e6fd94ce8f53a09f82..ec28abaf9a499d4e027eec5b51ae4b09ab025736 100644 (file)
@@ -115,6 +115,8 @@ struct si_vertex_elements
        uint16_t                        first_vb_use_mask;
        /* Vertex buffer descriptor list size aligned for optimal prefetch. */
        uint16_t                        desc_list_byte_size;
+       uint16_t                        instance_divisor_is_one; /* bitmask of inputs */
+       uint16_t                        instance_divisor_is_fetched;  /* bitmask of inputs */
 };
 
 union si_state {
@@ -182,6 +184,7 @@ enum {
        SI_VS_STREAMOUT_BUF3,
 
        SI_HS_CONST_DEFAULT_TESS_LEVELS,
+       SI_VS_CONST_INSTANCE_DIVISORS,
        SI_VS_CONST_CLIP_PLANES,
        SI_PS_CONST_POLY_STIPPLE,
        SI_PS_CONST_SAMPLE_POSITIONS,
index 4eb3b758b4e3d6be14c97f530f46bb5e83176a0f..af3f2a90e2aff04616005cf527322424e307085d 100644 (file)
@@ -1187,10 +1187,18 @@ static void si_shader_selector_key_vs(struct si_context *sctx,
        if (!sctx->vertex_elements)
                return;
 
+       prolog_key->instance_divisor_is_one =
+               sctx->vertex_elements->instance_divisor_is_one;
+       prolog_key->instance_divisor_is_fetched =
+               sctx->vertex_elements->instance_divisor_is_fetched;
+
+       /* Prefer a monolithic shader to allow scheduling divisions around
+        * VBO loads. */
+       if (prolog_key->instance_divisor_is_fetched)
+               key->opt.prefer_mono = 1;
+
        unsigned count = MIN2(vs->info.num_inputs,
                              sctx->vertex_elements->count);
-       memcpy(prolog_key->instance_divisors,
-              sctx->vertex_elements->instance_divisors, count * 4);
        memcpy(key->mono.vs_fix_fetch, sctx->vertex_elements->fix_fetch, count);
 }