radeonsi: before storing tess levels, load them from LDS instead of temporary
author Marek Olšák <marek.olsak@amd.com>
Sun, 2 Aug 2015 23:34:32 +0000 (01:34 +0200)
committer Marek Olšák <marek.olsak@amd.com>
Thu, 6 Aug 2015 18:44:36 +0000 (20:44 +0200)
Also use only one store if stride <= 4.
All the fetches from and stores to temporaries can be removed now.

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=91461

Reviewed-by: Michel Dänzer <michel.daenzer@amd.com>
src/gallium/drivers/radeonsi/si_shader.c

index 92382e85985b5f8ceeed58165852231d7589d3cf..61d36430bcf2b5448038a94d34eac06420a278e5 100644 (file)
@@ -681,18 +681,8 @@ static LLVMValueRef fetch_output_tcs(
                enum tgsi_opcode_type type, unsigned swizzle)
 {
        struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
-       struct si_shader *shader = si_shader_ctx->shader;
-       struct tgsi_shader_info *info = &shader->selector->info;
-       unsigned name = info->output_semantic_name[reg->Register.Index];
        LLVMValueRef dw_addr, stride;
 
-       /* Just read the local temp "output" register to get TESSOUTER/INNER. */
-       if (!reg->Register.Indirect &&
-           (name == TGSI_SEMANTIC_TESSOUTER ||
-            name == TGSI_SEMANTIC_TESSINNER)) {
-               return radeon_llvm_emit_fetch(bld_base, reg, type, swizzle);
-       }
-
        if (reg->Register.Dimension) {
                stride = unpack_param(si_shader_ctx, SI_PARAM_TCS_OUT_LAYOUT, 13, 8);
                dw_addr = get_tcs_out_current_patch_offset(si_shader_ctx);
@@ -731,8 +721,6 @@ static void store_output_tcs(struct lp_build_tgsi_context * bld_base,
                             LLVMValueRef dst[4])
 {
        struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
-       struct si_shader *shader = si_shader_ctx->shader;
-       struct tgsi_shader_info *sinfo = &shader->selector->info;
        const struct tgsi_full_dst_register *reg = &inst->Dst[0];
        unsigned chan_index;
        LLVMValueRef dw_addr, stride;
@@ -746,14 +734,6 @@ static void store_output_tcs(struct lp_build_tgsi_context * bld_base,
                return;
        }
 
-       /* Write tessellation levels to "output" temp registers.
-        * Also write them to LDS as per-patch outputs (below).
-        */
-       if (!reg->Register.Indirect &&
-           (sinfo->output_semantic_name[reg->Register.Index] == TGSI_SEMANTIC_TESSINNER ||
-             sinfo->output_semantic_name[reg->Register.Index] == TGSI_SEMANTIC_TESSOUTER))
-               radeon_llvm_emit_store(bld_base, inst, info, dst);
-
        if (reg->Register.Dimension) {
                stride = unpack_param(si_shader_ctx, SI_PARAM_TCS_OUT_LAYOUT, 13, 8);
                dw_addr = get_tcs_out_current_patch_offset(si_shader_ctx);
@@ -1854,58 +1834,78 @@ handle_semantic:
        }
 }
 
-static void si_write_tess_factors(struct si_shader_context *si_shader_ctx,
-                                 unsigned name, LLVMValueRef *out_ptr)
+/* This only writes the tessellation factor levels. */
+static void si_llvm_emit_tcs_epilogue(struct lp_build_tgsi_context *bld_base)
 {
-       struct si_shader *shader = si_shader_ctx->shader;
-       struct lp_build_tgsi_context *bld_base = &si_shader_ctx->radeon_bld.soa.bld_base;
+       struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
        struct gallivm_state *gallivm = bld_base->base.gallivm;
+       struct si_shader *shader = si_shader_ctx->shader;
+       unsigned tess_inner_index, tess_outer_index;
+       LLVMValueRef lds_base, lds_inner, lds_outer;
        LLVMValueRef tf_base, rel_patch_id, byteoffset, buffer, rw_buffers;
-       LLVMValueRef output, out[4];
+       LLVMValueRef out[6], vec0, vec1, invocation_id;
        unsigned stride, outer_comps, inner_comps, i;
+       struct lp_build_if_state if_ctx;
 
-       if (name != TGSI_SEMANTIC_TESSOUTER &&
-           name != TGSI_SEMANTIC_TESSINNER) {
-               assert(0);
-               return;
-       }
+       invocation_id = unpack_param(si_shader_ctx, SI_PARAM_REL_IDS, 8, 5);
 
+       /* Do this only for invocation 0, because the tess levels are per-patch,
+        * not per-vertex.
+        *
+        * This can't jump, because invocation 0 executes this. It should
+        * at least mask out the loads and stores for other invocations.
+        */
+       lp_build_if(&if_ctx, gallivm,
+                   LLVMBuildICmp(gallivm->builder, LLVMIntEQ,
+                                 invocation_id, bld_base->uint_bld.zero, ""));
+
+       /* Determine the layout of one tess factor element in the buffer. */
        switch (shader->key.tcs.prim_mode) {
        case PIPE_PRIM_LINES:
-               stride = 2;
+               stride = 2; /* 2 dwords, 1 vec2 store */
                outer_comps = 2;
                inner_comps = 0;
                break;
        case PIPE_PRIM_TRIANGLES:
-               stride = 4;
+               stride = 4; /* 4 dwords, 1 vec4 store */
                outer_comps = 3;
                inner_comps = 1;
                break;
        case PIPE_PRIM_QUADS:
-               stride = 6;
+               stride = 6; /* 6 dwords, 2 stores (vec4 + vec2) */
                outer_comps = 4;
                inner_comps = 2;
                break;
        default:
                assert(0);
+               return;
        }
 
-       /* Load the outputs as i32. */
-       for (i = 0; i < 4; i++)
-               out[i] = LLVMBuildBitCast(gallivm->builder,
-                               LLVMBuildLoad(gallivm->builder, out_ptr[i], ""),
-                               bld_base->uint_bld.elem_type, "");
-
-       /* Convert the outputs to vectors. */
-       if (name == TGSI_SEMANTIC_TESSOUTER)
-               output = lp_build_gather_values(gallivm, out,
-                                               util_next_power_of_two(outer_comps));
-       else if (inner_comps > 1)
-               output = lp_build_gather_values(gallivm, out, inner_comps);
-       else if (inner_comps == 1)
-               output = out[0];
-       else
-               return;
+       /* Load tess_inner and tess_outer from LDS.
+        * Any invocation can write them, so we can't get them from a temporary.
+        */
+       tess_inner_index = si_shader_io_get_unique_index(TGSI_SEMANTIC_TESSINNER, 0);
+       tess_outer_index = si_shader_io_get_unique_index(TGSI_SEMANTIC_TESSOUTER, 0);
+
+       lds_base = get_tcs_out_current_patch_data_offset(si_shader_ctx);
+       lds_inner = LLVMBuildAdd(gallivm->builder, lds_base,
+                                lp_build_const_int32(gallivm,
+                                                     tess_inner_index * 4), "");
+       lds_outer = LLVMBuildAdd(gallivm->builder, lds_base,
+                                lp_build_const_int32(gallivm,
+                                                     tess_outer_index * 4), "");
+
+       for (i = 0; i < outer_comps; i++)
+               out[i] = lds_load(bld_base, TGSI_TYPE_SIGNED, i, lds_outer);
+       for (i = 0; i < inner_comps; i++)
+               out[outer_comps+i] = lds_load(bld_base, TGSI_TYPE_SIGNED, i, lds_inner);
+
+       /* Convert the outputs to vectors for stores. */
+       vec0 = lp_build_gather_values(gallivm, out, MIN2(stride, 4));
+       vec1 = NULL;
+
+       if (stride > 4)
+               vec1 = lp_build_gather_values(gallivm, out+4, stride - 4);
 
        /* Get the buffer. */
        rw_buffers = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
@@ -1913,22 +1913,20 @@ static void si_write_tess_factors(struct si_shader_context *si_shader_ctx,
        buffer = build_indexed_load_const(si_shader_ctx, rw_buffers,
                        lp_build_const_int32(gallivm, SI_RING_TESS_FACTOR));
 
-       /* Get offsets. */
+       /* Get the offset. */
        tf_base = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
                               SI_PARAM_TESS_FACTOR_OFFSET);
        rel_patch_id = get_rel_patch_id(si_shader_ctx);
        byteoffset = LLVMBuildMul(gallivm->builder, rel_patch_id,
                                  lp_build_const_int32(gallivm, 4 * stride), "");
 
-       /* Store the output. */
-       if (name == TGSI_SEMANTIC_TESSOUTER) {
-               build_tbuffer_store_dwords(si_shader_ctx, buffer, output,
-                                          outer_comps, byteoffset, tf_base, 0);
-       } else if (inner_comps) {
-               build_tbuffer_store_dwords(si_shader_ctx, buffer, output,
-                                          inner_comps, byteoffset, tf_base,
-                                          outer_comps * 4);
-       }
+       /* Store the outputs. */
+       build_tbuffer_store_dwords(si_shader_ctx, buffer, vec0,
+                                  MIN2(stride, 4), byteoffset, tf_base, 0);
+       if (vec1)
+               build_tbuffer_store_dwords(si_shader_ctx, buffer, vec1,
+                                          stride - 4, byteoffset, tf_base, 16);
+       lp_build_endif(&if_ctx);
 }
 
 static void si_llvm_emit_ls_epilogue(struct lp_build_tgsi_context * bld_base)
@@ -1962,26 +1960,6 @@ static void si_llvm_emit_ls_epilogue(struct lp_build_tgsi_context * bld_base)
        }
 }
 
-static void si_llvm_emit_tcs_epilogue(struct lp_build_tgsi_context * bld_base)
-{
-       struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
-       struct si_shader *shader = si_shader_ctx->shader;
-       struct tgsi_shader_info *info = &shader->selector->info;
-       unsigned i;
-
-       /* Only write tessellation factors. Other outputs have already been
-        * written to LDS by instructions. */
-       for (i = 0; i < info->num_outputs; i++) {
-               LLVMValueRef *out_ptr = si_shader_ctx->radeon_bld.soa.outputs[i];
-               unsigned name = info->output_semantic_name[i];
-
-               if (name == TGSI_SEMANTIC_TESSINNER ||
-                   name == TGSI_SEMANTIC_TESSOUTER) {
-                       si_write_tess_factors(si_shader_ctx, name, out_ptr);
-               }
-       }
-}
-
 static void si_llvm_emit_es_epilogue(struct lp_build_tgsi_context * bld_base)
 {
        struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);