radeonsi: separate 2 pieces of code from create_function
[mesa.git] / src / gallium / drivers / radeonsi / si_shader.c
index 01f1e4847f16dbe0c5b4104dc9ba201fe67625ad..1a852c5cba25ffc7c6f7a8a3f87de789660ec8e1 100644 (file)
@@ -96,6 +96,7 @@ struct si_shader_context
        LLVMValueRef esgs_ring;
        LLVMValueRef gsvs_ring[4];
        LLVMValueRef gs_next_vertex[4];
+       LLVMValueRef return_value;
 
        LLVMTypeRef voidt;
        LLVMTypeRef i1;
@@ -375,7 +376,7 @@ static LLVMValueRef build_indexed_load_const(
 
 static LLVMValueRef get_instance_index_for_fetch(
        struct radeon_llvm_context *radeon_bld,
-       unsigned divisor)
+       unsigned param_start_instance, unsigned divisor)
 {
        struct si_shader_context *ctx =
                si_shader_context(&radeon_bld->soa.bld_base);
@@ -389,8 +390,8 @@ static LLVMValueRef get_instance_index_for_fetch(
                result = LLVMBuildUDiv(gallivm->builder, result,
                                lp_build_const_int32(gallivm, divisor), "");
 
-       return LLVMBuildAdd(gallivm->builder, result, LLVMGetParam(
-                       radeon_bld->main_fn, SI_PARAM_START_INSTANCE), "");
+       return LLVMBuildAdd(gallivm->builder, result,
+                           LLVMGetParam(radeon_bld->main_fn, param_start_instance), "");
 }
 
 static void declare_input_vs(
@@ -402,7 +403,8 @@ static void declare_input_vs(
        struct gallivm_state *gallivm = base->gallivm;
        struct si_shader_context *ctx =
                si_shader_context(&radeon_bld->soa.bld_base);
-       unsigned divisor = ctx->shader->key.vs.instance_divisors[input_index];
+       unsigned divisor =
+               ctx->shader->key.vs.prolog.instance_divisors[input_index];
 
        unsigned chan;
 
@@ -427,7 +429,9 @@ static void declare_input_vs(
        if (divisor) {
                /* Build index from instance ID, start instance and divisor */
                ctx->shader->uses_instanceid = true;
-               buffer_index = get_instance_index_for_fetch(&ctx->radeon_bld, divisor);
+               buffer_index = get_instance_index_for_fetch(&ctx->radeon_bld,
+                                                           SI_PARAM_START_INSTANCE,
+                                                           divisor);
        } else {
                /* Load the buffer index for vertices. */
                LLVMValueRef vertex_id = LLVMGetParam(ctx->radeon_bld.main_fn,
@@ -853,7 +857,7 @@ static int lookup_interp_param_index(unsigned interpolate, unsigned location)
 static unsigned select_interp_param(struct si_shader_context *ctx,
                                    unsigned param)
 {
-       if (!ctx->shader->key.ps.force_persample_interp)
+       if (!ctx->shader->key.ps.prolog.force_persample_interp)
                return param;
 
        /* If the shader doesn't use center/centroid, just return the parameter.
@@ -923,7 +927,7 @@ static void interp_fs_input(struct si_shader_context *ctx,
        intr_name = interp_param ? "llvm.SI.fs.interp" : "llvm.SI.fs.constant";
 
        if (semantic_name == TGSI_SEMANTIC_COLOR &&
-           ctx->shader->key.ps.color_two_side) {
+           ctx->shader->key.ps.prolog.color_two_side) {
                LLVMValueRef args[4];
                LLVMValueRef is_face_positive;
                LLVMValueRef back_attr_number;
@@ -1330,12 +1334,12 @@ static void si_llvm_init_export_args(struct lp_build_tgsi_context *bld_base,
 
        if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
                const union si_shader_key *key = &ctx->shader->key;
-               unsigned col_formats = key->ps.spi_shader_col_format;
+               unsigned col_formats = key->ps.epilog.spi_shader_col_format;
                int cbuf = target - V_008DFC_SQ_EXP_MRT;
 
                assert(cbuf >= 0 && cbuf < 8);
                spi_shader_col_format = (col_formats >> (cbuf * 4)) & 0xf;
-               is_int8 = (key->ps.color_is_int8 >> cbuf) & 0x1;
+               is_int8 = (key->ps.epilog.color_is_int8 >> cbuf) & 0x1;
        }
 
        args[4] = uint->zero; /* COMPR flag */
@@ -1488,13 +1492,13 @@ static void si_alpha_test(struct lp_build_tgsi_context *bld_base,
        struct si_shader_context *ctx = si_shader_context(bld_base);
        struct gallivm_state *gallivm = bld_base->base.gallivm;
 
-       if (ctx->shader->key.ps.alpha_func != PIPE_FUNC_NEVER) {
+       if (ctx->shader->key.ps.epilog.alpha_func != PIPE_FUNC_NEVER) {
                LLVMValueRef alpha_ref = LLVMGetParam(ctx->radeon_bld.main_fn,
                                SI_PARAM_ALPHA_REF);
 
                LLVMValueRef alpha_pass =
                        lp_build_cmp(&bld_base->base,
-                                    ctx->shader->key.ps.alpha_func,
+                                    ctx->shader->key.ps.epilog.alpha_func,
                                     alpha, alpha_ref);
                LLVMValueRef arg =
                        lp_build_select(&bld_base->base,
@@ -1511,7 +1515,8 @@ static void si_alpha_test(struct lp_build_tgsi_context *bld_base,
 }
 
 static LLVMValueRef si_scale_alpha_by_sample_mask(struct lp_build_tgsi_context *bld_base,
-                                                 LLVMValueRef alpha)
+                                                 LLVMValueRef alpha,
+                                                 unsigned samplemask_param)
 {
        struct si_shader_context *ctx = si_shader_context(bld_base);
        struct gallivm_state *gallivm = bld_base->base.gallivm;
@@ -1519,7 +1524,7 @@ static LLVMValueRef si_scale_alpha_by_sample_mask(struct lp_build_tgsi_context *
 
        /* alpha = alpha * popcount(coverage) / SI_NUM_SMOOTH_AA_SAMPLES */
        coverage = LLVMGetParam(ctx->radeon_bld.main_fn,
-                               SI_PARAM_SAMPLE_COVERAGE);
+                               samplemask_param);
        coverage = bitcast(bld_base, TGSI_TYPE_SIGNED, coverage);
 
        coverage = lp_build_intrinsic(gallivm->builder, "llvm.ctpop.i32",
@@ -1989,7 +1994,7 @@ static void si_write_tess_factors(struct lp_build_tgsi_context *bld_base,
                                  invocation_id, bld_base->uint_bld.zero, ""));
 
        /* Determine the layout of one tess factor element in the buffer. */
-       switch (shader->key.tcs.prim_mode) {
+       switch (shader->key.tcs.epilog.prim_mode) {
        case PIPE_PRIM_LINES:
                stride = 2; /* 2 dwords, 1 vec2 store */
                outer_comps = 2;
@@ -2284,6 +2289,7 @@ static void si_export_mrt_z(struct lp_build_tgsi_context *bld_base,
 
 static void si_export_mrt_color(struct lp_build_tgsi_context *bld_base,
                                LLVMValueRef *color, unsigned index,
+                               unsigned samplemask_param,
                                bool is_last)
 {
        struct si_shader_context *ctx = si_shader_context(bld_base);
@@ -2291,30 +2297,31 @@ static void si_export_mrt_color(struct lp_build_tgsi_context *bld_base,
        int i;
 
        /* Clamp color */
-       if (ctx->shader->key.ps.clamp_color)
+       if (ctx->shader->key.ps.epilog.clamp_color)
                for (i = 0; i < 4; i++)
                        color[i] = radeon_llvm_saturate(bld_base, color[i]);
 
        /* Alpha to one */
-       if (ctx->shader->key.ps.alpha_to_one)
+       if (ctx->shader->key.ps.epilog.alpha_to_one)
                color[3] = base->one;
 
        /* Alpha test */
        if (index == 0 &&
-           ctx->shader->key.ps.alpha_func != PIPE_FUNC_ALWAYS)
+           ctx->shader->key.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS)
                si_alpha_test(bld_base, color[3]);
 
        /* Line & polygon smoothing */
-       if (ctx->shader->key.ps.poly_line_smoothing)
-               color[3] = si_scale_alpha_by_sample_mask(bld_base, color[3]);
+       if (ctx->shader->key.ps.epilog.poly_line_smoothing)
+               color[3] = si_scale_alpha_by_sample_mask(bld_base, color[3],
+                                                        samplemask_param);
 
        /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
-       if (ctx->shader->key.ps.last_cbuf > 0) {
+       if (ctx->shader->key.ps.epilog.last_cbuf > 0) {
                LLVMValueRef args[8][9];
                int c, last = -1;
 
                /* Get the export arguments, also find out what the last one is. */
-               for (c = 0; c <= ctx->shader->key.ps.last_cbuf; c++) {
+               for (c = 0; c <= ctx->shader->key.ps.epilog.last_cbuf; c++) {
                        si_llvm_init_export_args(bld_base, color,
                                                 V_008DFC_SQ_EXP_MRT + c, args[c]);
                        if (args[c][0] != bld_base->uint_bld.zero)
@@ -2322,7 +2329,7 @@ static void si_export_mrt_color(struct lp_build_tgsi_context *bld_base,
                }
 
                /* Emit all exports. */
-               for (c = 0; c <= ctx->shader->key.ps.last_cbuf; c++) {
+               for (c = 0; c <= ctx->shader->key.ps.epilog.last_cbuf; c++) {
                        if (is_last && last == c) {
                                args[c][1] = bld_base->uint_bld.one; /* whether the EXEC mask is valid */
                                args[c][2] = bld_base->uint_bld.one; /* DONE bit */
@@ -2385,11 +2392,11 @@ static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context *bld_base)
         * Otherwise, find the last color export.
         */
        if (!info->writes_z && !info->writes_stencil && !info->writes_samplemask) {
-               unsigned spi_format = shader->key.ps.spi_shader_col_format;
+               unsigned spi_format = shader->key.ps.epilog.spi_shader_col_format;
 
                /* Don't export NULL and return if alpha-test is enabled. */
-               if (shader->key.ps.alpha_func != PIPE_FUNC_ALWAYS &&
-                   shader->key.ps.alpha_func != PIPE_FUNC_NEVER &&
+               if (shader->key.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS &&
+                   shader->key.ps.epilog.alpha_func != PIPE_FUNC_NEVER &&
                    (spi_format & 0xf) == 0)
                        spi_format |= V_028714_SPI_SHADER_32_AR;
 
@@ -2400,10 +2407,10 @@ static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context *bld_base)
                                continue;
 
                        /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
-                       if (shader->key.ps.last_cbuf > 0) {
+                       if (shader->key.ps.epilog.last_cbuf > 0) {
                                /* Just set this if any of the colorbuffers are enabled. */
                                if (spi_format &
-                                   ((1llu << (4 * (shader->key.ps.last_cbuf + 1))) - 1))
+                                   ((1llu << (4 * (shader->key.ps.epilog.last_cbuf + 1))) - 1))
                                        last_color_export = i;
                                continue;
                        }
@@ -2445,6 +2452,7 @@ static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context *bld_base)
                                                         ctx->radeon_bld.soa.outputs[i][j], "");
 
                        si_export_mrt_color(bld_base, color, semantic_index,
+                                           SI_PARAM_SAMPLE_COVERAGE,
                                            last_color_export == i);
                        break;
                default:
@@ -3546,6 +3554,30 @@ static const struct lp_build_tgsi_action interp_action = {
        .emit = build_interp_intrinsic,
 };
 
+/**
+ * Create the shader's main LLVM function and tag its leading parameters.
+ *
+ * Forwards the return and parameter type lists to radeon_llvm_create_func(),
+ * sets the function's shader type from ctx->type, and resets
+ * ctx->return_value to an undef value of ctx->radeon_bld.return_type.
+ *
+ * \param ctx                 shader context that receives the new function
+ * \param returns             return types, passed through unchanged
+ * \param num_returns         number of entries in \p returns
+ * \param params              parameter types, passed through unchanged
+ * \param num_params          number of entries in \p params
+ * \param last_array_pointer  parameters 0..last_array_pointer get the ByVal
+ *                            attribute
+ * \param last_sgpr           parameters after last_array_pointer, up to and
+ *                            including last_sgpr, get the InReg attribute;
+ *                            parameters past last_sgpr get no attribute here
+ */
+static void si_create_function(struct si_shader_context *ctx,
+                              LLVMTypeRef *returns, unsigned num_returns,
+                              LLVMTypeRef *params, unsigned num_params,
+                              int last_array_pointer, int last_sgpr)
+{
+       int i;
+
+       radeon_llvm_create_func(&ctx->radeon_bld, returns, num_returns,
+                               params, num_params);
+       radeon_llvm_shader_type(ctx->radeon_bld.main_fn, ctx->type);
+       /* Placeholder return value; the shader builders in this file hand
+        * ctx->return_value to LLVMBuildRet() when they finish the function. */
+       ctx->return_value = LLVMGetUndef(ctx->radeon_bld.return_type);
+
+       /* Both bounds are inclusive. */
+       for (i = 0; i <= last_sgpr; ++i) {
+               LLVMValueRef P = LLVMGetParam(ctx->radeon_bld.main_fn, i);
+
+               /* We tell llvm that array inputs are passed by value to allow Sinking pass
+                * to move load. Inputs are constant so this is fine. */
+               if (i <= last_array_pointer)
+                       LLVMAddAttribute(P, LLVMByValAttribute);
+               else
+                       LLVMAddAttribute(P, LLVMInRegAttribute);
+       }
+}
+
 static void create_meta_data(struct si_shader_context *ctx)
 {
        struct gallivm_state *gallivm = ctx->radeon_bld.soa.bld_base.base.gallivm;
@@ -3579,6 +3611,47 @@ static void declare_streamout_params(struct si_shader_context *ctx,
        }
 }
 
+/**
+ * Return the size of an LLVM type in bytes.
+ *
+ * Handles integer, float, pointer and vector types (vectors recursively via
+ * their element type). Any other type kind trips assert(0) and, when
+ * assertions are compiled out, yields 0.
+ *
+ * create_function() divides the result by 4 to count input registers in
+ * dwords.
+ */
+static unsigned llvm_get_type_size(LLVMTypeRef type)
+{
+       LLVMTypeKind kind = LLVMGetTypeKind(type);
+
+       switch (kind) {
+       case LLVMIntegerTypeKind:
+               /* Bit width to bytes; integer division, so widths below
+                * 8 bits come out as 0. */
+               return LLVMGetIntTypeWidth(type) / 8;
+       case LLVMFloatTypeKind:
+               return 4;
+       case LLVMPointerTypeKind:
+               /* Every pointer is counted as 8 bytes; the address space is
+                * not inspected. */
+               return 8;
+       case LLVMVectorTypeKind:
+               return LLVMGetVectorSize(type) *
+                      llvm_get_type_size(LLVMGetElementType(type));
+       default:
+               assert(0);
+               return 0;
+       }
+}
+
+/**
+ * Declare the "tess_lds" global and store it in ctx->lds.
+ *
+ * The global is an array of i32 placed in LOCAL_ADDR_SPACE. Its declared
+ * length is a fixed upper bound (2 * 32*32*4 + 32*4 dwords), not the size a
+ * particular shader needs. create_function() calls this for vertex shaders
+ * compiled as LS and for tessellation control/evaluation shaders.
+ */
+static void declare_tess_lds(struct si_shader_context *ctx)
+{
+       struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
+       LLVMTypeRef i32 = ctx->radeon_bld.soa.bld_base.uint_bld.elem_type;
+
+       /* This is the upper bound, maximum is 32 inputs times 32 vertices */
+       unsigned vertex_data_dw_size = 32*32*4;
+       unsigned patch_data_dw_size = 32*4;
+       /* The formula is: TCS inputs + TCS outputs + TCS patch outputs. */
+       unsigned patch_dw_size = vertex_data_dw_size*2 + patch_data_dw_size;
+       unsigned lds_dwords = patch_dw_size;
+
+       /* The actual size is computed outside of the shader to reduce
+        * the number of shader variants. */
+       ctx->lds =
+               LLVMAddGlobalInAddressSpace(gallivm->module,
+                                           LLVMArrayType(i32, lds_dwords),
+                                           "tess_lds",
+                                           LOCAL_ADDR_SPACE);
+}
+
 static void create_function(struct si_shader_context *ctx)
 {
        struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
@@ -3711,19 +3784,22 @@ static void create_function(struct si_shader_context *ctx)
        }
 
        assert(num_params <= Elements(params));
-       radeon_llvm_create_func(&ctx->radeon_bld, params, num_params);
-       radeon_llvm_shader_type(ctx->radeon_bld.main_fn, ctx->type);
 
-       for (i = 0; i <= last_sgpr; ++i) {
-               LLVMValueRef P = LLVMGetParam(ctx->radeon_bld.main_fn, i);
+       si_create_function(ctx, NULL, 0, params,
+                          num_params, last_array_pointer, last_sgpr);
 
-               /* We tell llvm that array inputs are passed by value to allow Sinking pass
-                * to move load. Inputs are constant so this is fine. */
-               if (i <= last_array_pointer)
-                       LLVMAddAttribute(P, LLVMByValAttribute);
-               else
-                       LLVMAddAttribute(P, LLVMInRegAttribute);
-       }
+       shader->num_input_sgprs = 0;
+       shader->num_input_vgprs = 0;
+
+       for (i = 0; i <= last_sgpr; ++i)
+               shader->num_input_sgprs += llvm_get_type_size(params[i]) / 4;
+
+       /* Unused fragment shader inputs are eliminated by the compiler,
+        * so we don't know yet how many there will be.
+        */
+       if (ctx->type != TGSI_PROCESSOR_FRAGMENT)
+               for (; i < num_params; ++i)
+                       shader->num_input_vgprs += llvm_get_type_size(params[i]) / 4;
 
        if (bld_base->info &&
            (bld_base->info->opcode_count[TGSI_OPCODE_DDX] > 0 ||
@@ -3740,22 +3816,8 @@ static void create_function(struct si_shader_context *ctx)
 
        if ((ctx->type == TGSI_PROCESSOR_VERTEX && shader->key.vs.as_ls) ||
            ctx->type == TGSI_PROCESSOR_TESS_CTRL ||
-           ctx->type == TGSI_PROCESSOR_TESS_EVAL) {
-               /* This is the upper bound, maximum is 32 inputs times 32 vertices */
-               unsigned vertex_data_dw_size = 32*32*4;
-               unsigned patch_data_dw_size = 32*4;
-               /* The formula is: TCS inputs + TCS outputs + TCS patch outputs. */
-               unsigned patch_dw_size = vertex_data_dw_size*2 + patch_data_dw_size;
-               unsigned lds_dwords = patch_dw_size;
-
-               /* The actual size is computed outside of the shader to reduce
-                * the number of shader variants. */
-               ctx->lds =
-                       LLVMAddGlobalInAddressSpace(gallivm->module,
-                                                   LLVMArrayType(ctx->i32, lds_dwords),
-                                                   "tess_lds",
-                                                   LOCAL_ADDR_SPACE);
-       }
+           ctx->type == TGSI_PROCESSOR_TESS_EVAL)
+               declare_tess_lds(ctx);
 }
 
 static void preload_constants(struct si_shader_context *ctx)
@@ -4241,7 +4303,7 @@ static int si_generate_gs_copy_shader(struct si_screen *sscreen,
 
        si_llvm_export_vs(bld_base, outputs, gsinfo->num_outputs);
 
-       LLVMBuildRetVoid(bld_base->base.gallivm->builder);
+       LLVMBuildRet(gallivm->builder, ctx->return_value);
 
        /* Dump LLVM IR before any optimization passes */
        if (sscreen->b.debug_flags & DBG_PREOPT_IR &&
@@ -4278,35 +4340,38 @@ void si_dump_shader_key(unsigned shader, union si_shader_key *key, FILE *f)
        switch (shader) {
        case PIPE_SHADER_VERTEX:
                fprintf(f, "  instance_divisors = {");
-               for (i = 0; i < Elements(key->vs.instance_divisors); i++)
+               for (i = 0; i < Elements(key->vs.prolog.instance_divisors); i++)
                        fprintf(f, !i ? "%u" : ", %u",
-                               key->vs.instance_divisors[i]);
+                               key->vs.prolog.instance_divisors[i]);
                fprintf(f, "}\n");
                fprintf(f, "  as_es = %u\n", key->vs.as_es);
                fprintf(f, "  as_ls = %u\n", key->vs.as_ls);
-               fprintf(f, "  export_prim_id = %u\n", key->vs.export_prim_id);
+               fprintf(f, "  export_prim_id = %u\n", key->vs.epilog.export_prim_id);
                break;
 
        case PIPE_SHADER_TESS_CTRL:
-               fprintf(f, "  prim_mode = %u\n", key->tcs.prim_mode);
+               fprintf(f, "  prim_mode = %u\n", key->tcs.epilog.prim_mode);
                break;
 
        case PIPE_SHADER_TESS_EVAL:
                fprintf(f, "  as_es = %u\n", key->tes.as_es);
-               fprintf(f, "  export_prim_id = %u\n", key->tes.export_prim_id);
+               fprintf(f, "  export_prim_id = %u\n", key->tes.epilog.export_prim_id);
                break;
 
        case PIPE_SHADER_GEOMETRY:
                break;
 
        case PIPE_SHADER_FRAGMENT:
-               fprintf(f, "  spi_shader_col_format = 0x%x\n", key->ps.spi_shader_col_format);
-               fprintf(f, "  last_cbuf = %u\n", key->ps.last_cbuf);
-               fprintf(f, "  color_two_side = %u\n", key->ps.color_two_side);
-               fprintf(f, "  alpha_func = %u\n", key->ps.alpha_func);
-               fprintf(f, "  alpha_to_one = %u\n", key->ps.alpha_to_one);
-               fprintf(f, "  poly_stipple = %u\n", key->ps.poly_stipple);
-               fprintf(f, "  clamp_color = %u\n", key->ps.clamp_color);
+               fprintf(f, "  prolog.color_two_side = %u\n", key->ps.prolog.color_two_side);
+               fprintf(f, "  prolog.poly_stipple = %u\n", key->ps.prolog.poly_stipple);
+               fprintf(f, "  prolog.force_persample_interp = %u\n", key->ps.prolog.force_persample_interp);
+               fprintf(f, "  epilog.spi_shader_col_format = 0x%x\n", key->ps.epilog.spi_shader_col_format);
+               fprintf(f, "  epilog.color_is_int8 = 0x%X\n", key->ps.epilog.color_is_int8);
+               fprintf(f, "  epilog.last_cbuf = %u\n", key->ps.epilog.last_cbuf);
+               fprintf(f, "  epilog.alpha_func = %u\n", key->ps.epilog.alpha_func);
+               fprintf(f, "  epilog.alpha_to_one = %u\n", key->ps.epilog.alpha_to_one);
+               fprintf(f, "  epilog.poly_line_smoothing = %u\n", key->ps.epilog.poly_line_smoothing);
+               fprintf(f, "  epilog.clamp_color = %u\n", key->ps.epilog.clamp_color);
                break;
 
        default:
@@ -4323,7 +4388,7 @@ static void si_init_shader_ctx(struct si_shader_context *ctx,
        struct lp_build_tgsi_context *bld_base;
 
        memset(ctx, 0, sizeof(*ctx));
-       radeon_llvm_context_init(&ctx->radeon_bld);
+       radeon_llvm_context_init(&ctx->radeon_bld, "amdgcn--");
        ctx->tm = tm;
        ctx->screen = sscreen;
        if (shader && shader->selector)
@@ -4336,7 +4401,7 @@ static void si_init_shader_ctx(struct si_shader_context *ctx,
        ctx->i1 = LLVMInt1TypeInContext(ctx->radeon_bld.gallivm.context);
        ctx->i8 = LLVMInt8TypeInContext(ctx->radeon_bld.gallivm.context);
        ctx->i32 = LLVMInt32TypeInContext(ctx->radeon_bld.gallivm.context);
-       ctx->i128 = LLVMInt128TypeInContext(ctx->radeon_bld.gallivm.context);
+       ctx->i128 = LLVMIntTypeInContext(ctx->radeon_bld.gallivm.context, 128);
        ctx->f32 = LLVMFloatTypeInContext(ctx->radeon_bld.gallivm.context);
        ctx->v16i8 = LLVMVectorType(ctx->i8, 16);
        ctx->v4i32 = LLVMVectorType(ctx->i32, 4);
@@ -4374,12 +4439,10 @@ static void si_init_shader_ctx(struct si_shader_context *ctx,
        bld_base->op_actions[TGSI_OPCODE_ENDPRIM].emit = si_llvm_emit_primitive;
        bld_base->op_actions[TGSI_OPCODE_BARRIER].emit = si_llvm_emit_barrier;
 
-       if (HAVE_LLVM >= 0x0306) {
-               bld_base->op_actions[TGSI_OPCODE_MAX].emit = build_tgsi_intrinsic_nomem;
-               bld_base->op_actions[TGSI_OPCODE_MAX].intr_name = "llvm.maxnum.f32";
-               bld_base->op_actions[TGSI_OPCODE_MIN].emit = build_tgsi_intrinsic_nomem;
-               bld_base->op_actions[TGSI_OPCODE_MIN].intr_name = "llvm.minnum.f32";
-       }
+       bld_base->op_actions[TGSI_OPCODE_MAX].emit = build_tgsi_intrinsic_nomem;
+       bld_base->op_actions[TGSI_OPCODE_MAX].intr_name = "llvm.maxnum.f32";
+       bld_base->op_actions[TGSI_OPCODE_MIN].emit = build_tgsi_intrinsic_nomem;
+       bld_base->op_actions[TGSI_OPCODE_MIN].intr_name = "llvm.minnum.f32";
 }
 
 int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
@@ -4394,7 +4457,7 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
        LLVMModuleRef mod;
        int r = 0;
        bool poly_stipple = sel->type == PIPE_SHADER_FRAGMENT &&
-                           shader->key.ps.poly_stipple;
+                           shader->key.ps.prolog.poly_stipple;
 
        if (poly_stipple) {
                tokens = util_pstipple_create_fragment_shader(tokens, NULL,
@@ -4477,7 +4540,7 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
                goto out;
        }
 
-       LLVMBuildRetVoid(bld_base->base.gallivm->builder);
+       LLVMBuildRet(bld_base->base.gallivm->builder, ctx.return_value);
        mod = bld_base->base.gallivm->module;
 
        /* Dump LLVM IR before any optimization passes */
@@ -4504,6 +4567,47 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
 
        radeon_llvm_dispose(&ctx.radeon_bld);
 
+       /* Calculate the number of fragment input VGPRs. */
+       if (ctx.type == TGSI_PROCESSOR_FRAGMENT) {
+               shader->num_input_vgprs = 0;
+               shader->face_vgpr_index = -1;
+
+               if (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_addr))
+                       shader->num_input_vgprs += 2;
+               if (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr))
+                       shader->num_input_vgprs += 2;
+               if (G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_addr))
+                       shader->num_input_vgprs += 2;
+               if (G_0286CC_PERSP_PULL_MODEL_ENA(shader->config.spi_ps_input_addr))
+                       shader->num_input_vgprs += 3;
+               if (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_addr))
+                       shader->num_input_vgprs += 2;
+               if (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr))
+                       shader->num_input_vgprs += 2;
+               if (G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_addr))
+                       shader->num_input_vgprs += 2;
+               if (G_0286CC_LINE_STIPPLE_TEX_ENA(shader->config.spi_ps_input_addr))
+                       shader->num_input_vgprs += 1;
+               if (G_0286CC_POS_X_FLOAT_ENA(shader->config.spi_ps_input_addr))
+                       shader->num_input_vgprs += 1;
+               if (G_0286CC_POS_Y_FLOAT_ENA(shader->config.spi_ps_input_addr))
+                       shader->num_input_vgprs += 1;
+               if (G_0286CC_POS_Z_FLOAT_ENA(shader->config.spi_ps_input_addr))
+                       shader->num_input_vgprs += 1;
+               if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_addr))
+                       shader->num_input_vgprs += 1;
+               if (G_0286CC_FRONT_FACE_ENA(shader->config.spi_ps_input_addr)) {
+                       shader->face_vgpr_index = shader->num_input_vgprs;
+                       shader->num_input_vgprs += 1;
+               }
+               if (G_0286CC_ANCILLARY_ENA(shader->config.spi_ps_input_addr))
+                       shader->num_input_vgprs += 1;
+               if (G_0286CC_SAMPLE_COVERAGE_ENA(shader->config.spi_ps_input_addr))
+                       shader->num_input_vgprs += 1;
+               if (G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr))
+                       shader->num_input_vgprs += 1;
+       }
+
        if (ctx.type == TGSI_PROCESSOR_GEOMETRY) {
                shader->gs_copy_shader = CALLOC_STRUCT(si_shader);
                shader->gs_copy_shader->selector = shader->selector;