gallium/radeon: eliminate fast color clear before sharing
[mesa.git] / src / gallium / drivers / radeonsi / si_shader.c
index bb30c0f5019685313bbf81e54277f0755136ef39..8c1151aa493cb6f430fa3b94486ba9985b7e27dc 100644 (file)
@@ -128,8 +128,7 @@ static struct si_shader_context *si_shader_context(
 static void si_init_shader_ctx(struct si_shader_context *ctx,
                               struct si_screen *sscreen,
                               struct si_shader *shader,
-                              LLVMTargetMachineRef tm,
-                              struct tgsi_shader_info *info);
+                              LLVMTargetMachineRef tm);
 
 /* Ideally pass the sample mask input to the PS epilog as v13, which
  * is its usual location, so that the shader doesn't have to add v_mov.
@@ -215,6 +214,10 @@ static LLVMValueRef unpack_param(struct si_shader_context *ctx,
        LLVMValueRef value = LLVMGetParam(ctx->radeon_bld.main_fn,
                                          param);
 
+       if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMFloatTypeKind)
+               value = bitcast(&ctx->radeon_bld.soa.bld_base,
+                               TGSI_TYPE_UNSIGNED, value);
+
        if (rshift)
                value = LLVMBuildLShr(gallivm->builder, value,
                                      lp_build_const_int32(gallivm, rshift), "");
@@ -450,7 +453,7 @@ static void declare_input_vs(
                                            input_index);
        } else if (divisor) {
                /* Build index from instance ID, start instance and divisor */
-               ctx->shader->uses_instanceid = true;
+               ctx->shader->info.uses_instanceid = true;
                buffer_index = get_instance_index_for_fetch(&ctx->radeon_bld,
                                                            SI_PARAM_START_INSTANCE,
                                                            divisor);
@@ -1890,7 +1893,8 @@ handle_semantic:
                case TGSI_SEMANTIC_COLOR:
                case TGSI_SEMANTIC_BCOLOR:
                        target = V_008DFC_SQ_EXP_PARAM + param_count;
-                       shader->vs_output_param_offset[i] = param_count;
+                       assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset));
+                       shader->info.vs_output_param_offset[i] = param_count;
                        param_count++;
                        break;
                case TGSI_SEMANTIC_CLIPDIST:
@@ -1904,7 +1908,8 @@ handle_semantic:
                case TGSI_SEMANTIC_TEXCOORD:
                case TGSI_SEMANTIC_GENERIC:
                        target = V_008DFC_SQ_EXP_PARAM + param_count;
-                       shader->vs_output_param_offset[i] = param_count;
+                       assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset));
+                       shader->info.vs_output_param_offset[i] = param_count;
                        param_count++;
                        break;
                default:
@@ -1932,7 +1937,7 @@ handle_semantic:
                }
        }
 
-       shader->nr_param_exports = param_count;
+       shader->info.nr_param_exports = param_count;
 
        /* We need to add the position output manually if it's missing. */
        if (!pos_args[0][0]) {
@@ -1994,7 +1999,7 @@ handle_semantic:
 
        for (i = 0; i < 4; i++)
                if (pos_args[i][0])
-                       shader->nr_pos_exports++;
+                       shader->info.nr_pos_exports++;
 
        pos_idx = 0;
        for (i = 0; i < 4; i++) {
@@ -2004,7 +2009,7 @@ handle_semantic:
                /* Specify the target we are exporting */
                pos_args[i][3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_POS + pos_idx++);
 
-               if (pos_idx == shader->nr_pos_exports)
+               if (pos_idx == shader->info.nr_pos_exports)
                        /* Specify that this is the last export */
                        pos_args[i][2] = uint->one;
 
@@ -2729,13 +2734,12 @@ static LLVMTypeRef const_array(LLVMTypeRef elem_type, int num_elements)
 /**
  * Load an image view, fmask view. or sampler state descriptor.
  */
-static LLVMValueRef get_sampler_desc(struct si_shader_context *ctx,
-                                    LLVMValueRef index, enum desc_type type)
+static LLVMValueRef get_sampler_desc_custom(struct si_shader_context *ctx,
+                                           LLVMValueRef list, LLVMValueRef index,
+                                           enum desc_type type)
 {
        struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
        LLVMBuilderRef builder = gallivm->builder;
-       LLVMValueRef ptr = LLVMGetParam(ctx->radeon_bld.main_fn,
-                                       SI_PARAM_SAMPLERS);
 
        switch (type) {
        case DESC_IMAGE:
@@ -2751,12 +2755,21 @@ static LLVMValueRef get_sampler_desc(struct si_shader_context *ctx,
                /* The sampler state is at [12:15]. */
                index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 4, 0), "");
                index = LLVMBuildAdd(builder, index, LLVMConstInt(ctx->i32, 3, 0), "");
-               ptr = LLVMBuildPointerCast(builder, ptr,
-                                          const_array(ctx->v4i32, 0), "");
+               list = LLVMBuildPointerCast(builder, list,
+                                           const_array(ctx->v4i32, 0), "");
                break;
        }
 
-       return build_indexed_load_const(ctx, ptr, index);
+       return build_indexed_load_const(ctx, list, index);
+}
+
+static LLVMValueRef get_sampler_desc(struct si_shader_context *ctx,
+                                    LLVMValueRef index, enum desc_type type)
+{ /* Convenience wrapper around get_sampler_desc_custom() for the list passed as SI_PARAM_SAMPLERS. */
+       LLVMValueRef list = LLVMGetParam(ctx->radeon_bld.main_fn,
+                                        SI_PARAM_SAMPLERS); /* descriptor list taken from main_fn's parameters */
+
+       return get_sampler_desc_custom(ctx, list, index, type);
 }
 
 static void tex_fetch_ptrs(
@@ -3988,7 +4001,7 @@ static void create_function(struct si_shader_context *ctx)
                params[SI_PARAM_FRONT_FACE] = ctx->i32;
                params[SI_PARAM_ANCILLARY] = ctx->i32;
                params[SI_PARAM_SAMPLE_COVERAGE] = ctx->f32;
-               params[SI_PARAM_POS_FIXED_PT] = ctx->f32;
+               params[SI_PARAM_POS_FIXED_PT] = ctx->i32;
                num_params = SI_PARAM_POS_FIXED_PT+1;
 
                if (!ctx->is_monolithic) {
@@ -4044,21 +4057,22 @@ static void create_function(struct si_shader_context *ctx)
                                          S_0286D0_LINEAR_SAMPLE_ENA(1) |
                                          S_0286D0_LINEAR_CENTER_ENA(1) |
                                          S_0286D0_LINEAR_CENTROID_ENA(1) |
-                                         S_0286D0_FRONT_FACE_ENA(1));
+                                         S_0286D0_FRONT_FACE_ENA(1) |
+                                         S_0286D0_POS_FIXED_PT_ENA(1));
        }
 
-       shader->num_input_sgprs = 0;
-       shader->num_input_vgprs = 0;
+       shader->info.num_input_sgprs = 0;
+       shader->info.num_input_vgprs = 0;
 
        for (i = 0; i <= last_sgpr; ++i)
-               shader->num_input_sgprs += llvm_get_type_size(params[i]) / 4;
+               shader->info.num_input_sgprs += llvm_get_type_size(params[i]) / 4;
 
        /* Unused fragment shader inputs are eliminated by the compiler,
         * so we don't know yet how many there will be.
         */
        if (ctx->type != TGSI_PROCESSOR_FRAGMENT)
                for (; i < num_params; ++i)
-                       shader->num_input_vgprs += llvm_get_type_size(params[i]) / 4;
+                       shader->info.num_input_vgprs += llvm_get_type_size(params[i]) / 4;
 
        if (bld_base->info &&
            (bld_base->info->opcode_count[TGSI_OPCODE_DDX] > 0 ||
@@ -4208,6 +4222,49 @@ static void preload_ring_buffers(struct si_shader_context *ctx)
        }
 }
 
+static void si_llvm_emit_polygon_stipple(struct si_shader_context *ctx,
+                                        LLVMValueRef param_sampler_views,
+                                        unsigned param_pos_fixed_pt)
+{ /* Emit IR that fetches one stipple texel addressed by the fixed-point position and passes it to the kill intrinsic. */
+       struct lp_build_tgsi_context *bld_base =
+               &ctx->radeon_bld.soa.bld_base;
+       struct gallivm_state *gallivm = bld_base->base.gallivm;
+       struct lp_build_emit_data result = {}; /* zero-initialized; filled in by set_tex_fetch_args() below */
+       struct tgsi_full_instruction inst = {}; /* synthetic instruction; only opcode and texture target are set */
+       LLVMValueRef desc, sampler_index, address[2], pix;
+
+       /* Use the fixed-point gl_FragCoord input.
+        * Since the stipple pattern is 32x32 and it repeats, just get 5 bits
+        * per coordinate to get the repeating effect.
+        */
+       address[0] = unpack_param(ctx, param_pos_fixed_pt, 0, 5); /* shift 0; presumably the last argument is the bit count - confirm */
+       address[1] = unpack_param(ctx, param_pos_fixed_pt, 16, 5); /* shift 16, i.e. the upper half of the packed value */
+
+       /* Load the sampler view descriptor. */
+       sampler_index = lp_build_const_int32(gallivm, SI_POLY_STIPPLE_SAMPLER);
+       desc = get_sampler_desc_custom(ctx, param_sampler_views,
+                                      sampler_index, DESC_IMAGE); /* list is supplied by the caller, not read from main_fn here */
+
+       /* Load the texel. */
+       inst.Instruction.Opcode = TGSI_OPCODE_TXF;
+       inst.Texture.Texture = TGSI_TEXTURE_2D_MSAA; /* = use load, not load_mip */
+       result.inst = &inst;
+       set_tex_fetch_args(ctx, &result, TGSI_OPCODE_TXF,
+                          inst.Texture.Texture,
+                          desc, NULL, address, ARRAY_SIZE(address), 0xf); /* NOTE(review): 0xf presumably enables all four channels - confirm */
+       build_tex_intrinsic(&tex_action, bld_base, &result);
+
+       /* Kill the thread accordingly. */
+       pix = LLVMBuildExtractElement(gallivm->builder, result.output[0],
+                                     lp_build_const_int32(gallivm, 3), ""); /* element 3 of the fetched vector */
+       pix = bitcast(bld_base, TGSI_TYPE_FLOAT, pix);
+       pix = LLVMBuildFNeg(gallivm->builder, pix, ""); /* the negated texel is the kill operand */
+
+       lp_build_intrinsic(gallivm->builder, "llvm.AMDGPU.kill",
+                          LLVMVoidTypeInContext(gallivm->context),
+                          &pix, 1, 0); /* NOTE(review): presumably kills the thread when the operand is negative - confirm */
+}
+
 void si_shader_binary_read_config(struct radeon_shader_binary *binary,
                                  struct si_shader_config *conf,
                                  unsigned symbol_offset)
@@ -4349,14 +4406,14 @@ int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader)
 
 static void si_shader_dump_disassembly(const struct radeon_shader_binary *binary,
                                       struct pipe_debug_callback *debug,
-                                      const char *name)
+                                      const char *name, FILE *file)
 {
        char *line, *p;
        unsigned i, count;
 
        if (binary->disasm_string) {
-               fprintf(stderr, "Shader %s disassembly:\n", name);
-               fprintf(stderr, "%s", binary->disasm_string);
+               fprintf(file, "Shader %s disassembly:\n", name);
+               fprintf(file, "%s", binary->disasm_string);
 
                if (debug && debug->debug_message) {
                        /* Very long debug messages are cut off, so send the
@@ -4386,9 +4443,9 @@ static void si_shader_dump_disassembly(const struct radeon_shader_binary *binary
                                           "Shader Disassembly End");
                }
        } else {
-               fprintf(stderr, "Shader %s binary:\n", name);
+               fprintf(file, "Shader %s binary:\n", name);
                for (i = 0; i < binary->code_size; i += 4) {
-                       fprintf(stderr, "@0x%x: %02x%02x%02x%02x\n", i,
+                       fprintf(file, "@0x%x: %02x%02x%02x%02x\n", i,
                                binary->code[i + 3], binary->code[i + 2],
                                binary->code[i + 1], binary->code[i]);
                }
@@ -4400,7 +4457,8 @@ static void si_shader_dump_stats(struct si_screen *sscreen,
                                 unsigned num_inputs,
                                 unsigned code_size,
                                 struct pipe_debug_callback *debug,
-                                unsigned processor)
+                                unsigned processor,
+                                FILE *file)
 {
        unsigned lds_increment = sscreen->b.chip_class >= CIK ? 512 : 256;
        unsigned lds_per_wave = 0;
@@ -4436,15 +4494,16 @@ static void si_shader_dump_stats(struct si_screen *sscreen,
        if (lds_per_wave)
                max_simd_waves = MIN2(max_simd_waves, 16384 / lds_per_wave);
 
-       if (r600_can_dump_shader(&sscreen->b, processor)) {
+       if (file != stderr ||
+           r600_can_dump_shader(&sscreen->b, processor)) {
                if (processor == TGSI_PROCESSOR_FRAGMENT) {
-                       fprintf(stderr, "*** SHADER CONFIG ***\n"
+                       fprintf(file, "*** SHADER CONFIG ***\n"
                                "SPI_PS_INPUT_ADDR = 0x%04x\n"
                                "SPI_PS_INPUT_ENA  = 0x%04x\n",
                                conf->spi_ps_input_addr, conf->spi_ps_input_ena);
                }
 
-               fprintf(stderr, "*** SHADER STATS ***\n"
+               fprintf(file, "*** SHADER STATS ***\n"
                        "SGPRS: %d\n"
                        "VGPRS: %d\n"
                        "Code Size: %d bytes\n"
@@ -4465,28 +4524,63 @@ static void si_shader_dump_stats(struct si_screen *sscreen,
                           max_simd_waves);
 }
 
+static const char *si_get_shader_name(struct si_shader *shader,
+                                     unsigned processor)
+{ /* Return a static, human-readable label for the shader; si_shader_dump() prints it as a heading. */
+       switch (processor) {
+       case TGSI_PROCESSOR_VERTEX: /* label depends on the as_es / as_ls key flags */
+               if (shader->key.vs.as_es)
+                       return "Vertex Shader as ES";
+               else if (shader->key.vs.as_ls)
+                       return "Vertex Shader as LS";
+               else
+                       return "Vertex Shader as VS";
+       case TGSI_PROCESSOR_TESS_CTRL:
+               return "Tessellation Control Shader";
+       case TGSI_PROCESSOR_TESS_EVAL: /* label depends on the as_es key flag */
+               if (shader->key.tes.as_es)
+                       return "Tessellation Evaluation Shader as ES";
+               else
+                       return "Tessellation Evaluation Shader as VS";
+       case TGSI_PROCESSOR_GEOMETRY: /* si_generate_gs_copy_shader() also dumps the copy shader with this processor */
+               if (shader->gs_copy_shader == NULL) /* no copy shader attached: labelled as the copy shader itself */
+                       return "GS Copy Shader as VS";
+               else
+                       return "Geometry Shader";
+       case TGSI_PROCESSOR_FRAGMENT:
+               return "Pixel Shader";
+       case TGSI_PROCESSOR_COMPUTE:
+               return "Compute Shader";
+       default: /* any processor value not listed above */
+               return "Unknown Shader";
+       }
+}
+
 void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader,
-                   struct pipe_debug_callback *debug, unsigned processor)
+                   struct pipe_debug_callback *debug, unsigned processor,
+                   FILE *file) /* file: stream that receives the disassembly and the stats */
 {
-       if (r600_can_dump_shader(&sscreen->b, processor) &&
-           !(sscreen->b.debug_flags & DBG_NO_ASM)) {
-               fprintf(stderr, "\n");
+       if (file != stderr || /* a non-stderr destination bypasses the debug-flag checks below */
+           (r600_can_dump_shader(&sscreen->b, processor) &&
+            !(sscreen->b.debug_flags & DBG_NO_ASM))) {
+               fprintf(file, "\n%s:\n", si_get_shader_name(shader, processor)); /* heading naming the shader */
 
                if (shader->prolog)
                        si_shader_dump_disassembly(&shader->prolog->binary,
-                                                  debug, "prolog");
+                                                  debug, "prolog", file);
 
-               si_shader_dump_disassembly(&shader->binary, debug, "main");
+               si_shader_dump_disassembly(&shader->binary, debug, "main", file); /* the main part is always dumped; prolog and epilog only if present */
 
                if (shader->epilog)
                        si_shader_dump_disassembly(&shader->epilog->binary,
-                                                  debug, "epilog");
-               fprintf(stderr, "\n");
+                                                  debug, "epilog", file);
+               fprintf(file, "\n");
        }
 
        si_shader_dump_stats(sscreen, &shader->config,
                             shader->selector ? shader->selector->info.num_inputs : 0,
-                            si_get_shader_binary_size(shader), debug, processor);
+                            si_get_shader_binary_size(shader), debug, processor,
+                            file); /* stats are written to the same stream as the disassembly */
 }
 
 int si_compile_llvm(struct si_screen *sscreen,
@@ -4571,7 +4665,7 @@ static int si_generate_gs_copy_shader(struct si_screen *sscreen,
 
        outputs = MALLOC(gsinfo->num_outputs * sizeof(outputs[0]));
 
-       si_init_shader_ctx(ctx, sscreen, ctx->shader, ctx->tm, gsinfo);
+       si_init_shader_ctx(ctx, sscreen, ctx->shader, ctx->tm);
        ctx->type = TGSI_PROCESSOR_VERTEX;
        ctx->is_gs_copy_shader = true;
 
@@ -4634,7 +4728,7 @@ static int si_generate_gs_copy_shader(struct si_screen *sscreen,
                if (r600_can_dump_shader(&sscreen->b, TGSI_PROCESSOR_GEOMETRY))
                        fprintf(stderr, "GS Copy Shader:\n");
                si_shader_dump(sscreen, ctx->shader, debug,
-                              TGSI_PROCESSOR_GEOMETRY);
+                              TGSI_PROCESSOR_GEOMETRY, stderr);
                r = si_shader_binary_upload(sscreen, ctx->shader);
        }
 
@@ -4695,8 +4789,7 @@ void si_dump_shader_key(unsigned shader, union si_shader_key *key, FILE *f)
 static void si_init_shader_ctx(struct si_shader_context *ctx,
                               struct si_screen *sscreen,
                               struct si_shader *shader,
-                              LLVMTargetMachineRef tm,
-                              struct tgsi_shader_info *info)
+                              LLVMTargetMachineRef tm)
 {
        struct lp_build_tgsi_context *bld_base;
 
@@ -4724,7 +4817,8 @@ static void si_init_shader_ctx(struct si_shader_context *ctx,
        ctx->v8i32 = LLVMVectorType(ctx->i32, 8);
 
        bld_base = &ctx->radeon_bld.soa.bld_base;
-       bld_base->info = info;
+       if (shader && shader->selector)
+               bld_base->info = &shader->selector->info;
        bld_base->emit_fetch_funcs[TGSI_FILE_CONSTANT] = fetch_constant;
 
        bld_base->op_actions[TGSI_OPCODE_INTERP_CENTROID] = interp_action;
@@ -4760,43 +4854,31 @@ static void si_init_shader_ctx(struct si_shader_context *ctx,
        bld_base->op_actions[TGSI_OPCODE_MIN].intr_name = "llvm.minnum.f32";
 }
 
-static int si_compile_tgsi_shader(struct si_screen *sscreen,
-                                 LLVMTargetMachineRef tm,
-                                 struct si_shader *shader,
-                                 bool is_monolithic,
-                                 struct pipe_debug_callback *debug)
+int si_compile_tgsi_shader(struct si_screen *sscreen,
+                          LLVMTargetMachineRef tm,
+                          struct si_shader *shader,
+                          bool is_monolithic,
+                          struct pipe_debug_callback *debug)
 {
        struct si_shader_selector *sel = shader->selector;
-       struct tgsi_token *tokens = sel->tokens;
        struct si_shader_context ctx;
        struct lp_build_tgsi_context *bld_base;
-       struct tgsi_shader_info stipple_shader_info;
        LLVMModuleRef mod;
        int r = 0;
-       bool poly_stipple = sel->type == PIPE_SHADER_FRAGMENT &&
-                           shader->key.ps.prolog.poly_stipple;
-
-       if (poly_stipple) {
-               tokens = util_pstipple_create_fragment_shader(tokens, NULL,
-                                               SI_POLY_STIPPLE_SAMPLER,
-                                               TGSI_FILE_SYSTEM_VALUE);
-               tgsi_scan_shader(tokens, &stipple_shader_info);
-       }
 
        /* Dump TGSI code before doing TGSI->LLVM conversion in case the
         * conversion fails. */
        if (r600_can_dump_shader(&sscreen->b, sel->info.processor) &&
            !(sscreen->b.debug_flags & DBG_NO_TGSI)) {
                si_dump_shader_key(sel->type, &shader->key, stderr);
-               tgsi_dump(tokens, 0);
+               tgsi_dump(sel->tokens, 0);
                si_dump_streamout(&sel->so);
        }
 
-       si_init_shader_ctx(&ctx, sscreen, shader, tm,
-                          poly_stipple ? &stipple_shader_info : &sel->info);
+       si_init_shader_ctx(&ctx, sscreen, shader, tm);
        ctx.is_monolithic = is_monolithic;
 
-       shader->uses_instanceid = sel->info.uses_instanceid;
+       shader->info.uses_instanceid = sel->info.uses_instanceid;
 
        bld_base = &ctx.radeon_bld.soa.bld_base;
        ctx.radeon_bld.load_system_value = declare_system_value;
@@ -4847,6 +4929,14 @@ static int si_compile_tgsi_shader(struct si_screen *sscreen,
        preload_streamout_buffers(&ctx);
        preload_ring_buffers(&ctx);
 
+       if (ctx.is_monolithic && sel->type == PIPE_SHADER_FRAGMENT &&
+           shader->key.ps.prolog.poly_stipple) {
+               LLVMValueRef views = LLVMGetParam(ctx.radeon_bld.main_fn,
+                                                 SI_PARAM_SAMPLERS);
+               si_llvm_emit_polygon_stipple(&ctx, views,
+                                            SI_PARAM_POS_FIXED_PT);
+       }
+
        if (ctx.type == TGSI_PROCESSOR_GEOMETRY) {
                int i;
                for (i = 0; i < 4; i++) {
@@ -4856,7 +4946,7 @@ static int si_compile_tgsi_shader(struct si_screen *sscreen,
                }
        }
 
-       if (!lp_build_tgsi_llvm(bld_base, tokens)) {
+       if (!lp_build_tgsi_llvm(bld_base, sel->tokens)) {
                fprintf(stderr, "Failed to translate shader from TGSI to LLVM\n");
                goto out;
        }
@@ -4882,43 +4972,43 @@ static int si_compile_tgsi_shader(struct si_screen *sscreen,
 
        /* Calculate the number of fragment input VGPRs. */
        if (ctx.type == TGSI_PROCESSOR_FRAGMENT) {
-               shader->num_input_vgprs = 0;
-               shader->face_vgpr_index = -1;
+               shader->info.num_input_vgprs = 0;
+               shader->info.face_vgpr_index = -1;
 
                if (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_addr))
-                       shader->num_input_vgprs += 2;
+                       shader->info.num_input_vgprs += 2;
                if (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr))
-                       shader->num_input_vgprs += 2;
+                       shader->info.num_input_vgprs += 2;
                if (G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_addr))
-                       shader->num_input_vgprs += 2;
+                       shader->info.num_input_vgprs += 2;
                if (G_0286CC_PERSP_PULL_MODEL_ENA(shader->config.spi_ps_input_addr))
-                       shader->num_input_vgprs += 3;
+                       shader->info.num_input_vgprs += 3;
                if (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_addr))
-                       shader->num_input_vgprs += 2;
+                       shader->info.num_input_vgprs += 2;
                if (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr))
-                       shader->num_input_vgprs += 2;
+                       shader->info.num_input_vgprs += 2;
                if (G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_addr))
-                       shader->num_input_vgprs += 2;
+                       shader->info.num_input_vgprs += 2;
                if (G_0286CC_LINE_STIPPLE_TEX_ENA(shader->config.spi_ps_input_addr))
-                       shader->num_input_vgprs += 1;
+                       shader->info.num_input_vgprs += 1;
                if (G_0286CC_POS_X_FLOAT_ENA(shader->config.spi_ps_input_addr))
-                       shader->num_input_vgprs += 1;
+                       shader->info.num_input_vgprs += 1;
                if (G_0286CC_POS_Y_FLOAT_ENA(shader->config.spi_ps_input_addr))
-                       shader->num_input_vgprs += 1;
+                       shader->info.num_input_vgprs += 1;
                if (G_0286CC_POS_Z_FLOAT_ENA(shader->config.spi_ps_input_addr))
-                       shader->num_input_vgprs += 1;
+                       shader->info.num_input_vgprs += 1;
                if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_addr))
-                       shader->num_input_vgprs += 1;
+                       shader->info.num_input_vgprs += 1;
                if (G_0286CC_FRONT_FACE_ENA(shader->config.spi_ps_input_addr)) {
-                       shader->face_vgpr_index = shader->num_input_vgprs;
-                       shader->num_input_vgprs += 1;
+                       shader->info.face_vgpr_index = shader->info.num_input_vgprs;
+                       shader->info.num_input_vgprs += 1;
                }
                if (G_0286CC_ANCILLARY_ENA(shader->config.spi_ps_input_addr))
-                       shader->num_input_vgprs += 1;
+                       shader->info.num_input_vgprs += 1;
                if (G_0286CC_SAMPLE_COVERAGE_ENA(shader->config.spi_ps_input_addr))
-                       shader->num_input_vgprs += 1;
+                       shader->info.num_input_vgprs += 1;
                if (G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr))
-                       shader->num_input_vgprs += 1;
+                       shader->info.num_input_vgprs += 1;
        }
 
        if (ctx.type == TGSI_PROCESSOR_GEOMETRY) {
@@ -4936,8 +5026,6 @@ static int si_compile_tgsi_shader(struct si_screen *sscreen,
 out:
        for (int i = 0; i < SI_NUM_CONST_BUFFERS; i++)
                FREE(ctx.constants[i]);
-       if (poly_stipple)
-               tgsi_free_tokens(tokens);
        return r;
 }
 
@@ -5020,7 +5108,7 @@ static bool si_compile_vs_prolog(struct si_screen *sscreen,
        int last_sgpr, num_params, num_returns, i;
        bool status = true;
 
-       si_init_shader_ctx(&ctx, sscreen, &shader, tm, NULL);
+       si_init_shader_ctx(&ctx, sscreen, &shader, tm);
        ctx.type = TGSI_PROCESSOR_VERTEX;
        ctx.param_vertex_id = key->vs_prolog.num_input_sgprs;
        ctx.param_instance_id = key->vs_prolog.num_input_sgprs + 3;
@@ -5128,7 +5216,7 @@ static bool si_compile_vs_epilog(struct si_screen *sscreen,
        int num_params, i;
        bool status = true;
 
-       si_init_shader_ctx(&ctx, sscreen, NULL, tm, NULL);
+       si_init_shader_ctx(&ctx, sscreen, NULL, tm);
        ctx.type = TGSI_PROCESSOR_VERTEX;
 
        /* Declare input VGPRs. */
@@ -5196,10 +5284,11 @@ static bool si_get_vs_epilog(struct si_screen *sscreen,
        /* Set up the PrimitiveID output. */
        if (shader->key.vs.epilog.export_prim_id) {
                unsigned index = shader->selector->info.num_outputs;
-               unsigned offset = shader->nr_param_exports++;
+               unsigned offset = shader->info.nr_param_exports++;
 
                epilog_key.vs_epilog.prim_id_param_offset = offset;
-               shader->vs_output_param_offset[index] = offset;
+               assert(index < ARRAY_SIZE(shader->info.vs_output_param_offset));
+               shader->info.vs_output_param_offset[index] = offset;
        }
 
        shader->epilog = si_get_shader_part(sscreen, &sscreen->vs_epilogs,
@@ -5223,7 +5312,7 @@ static bool si_shader_select_vs_parts(struct si_screen *sscreen,
        /* Get the prolog. */
        memset(&prolog_key, 0, sizeof(prolog_key));
        prolog_key.vs_prolog.states = shader->key.vs.prolog;
-       prolog_key.vs_prolog.num_input_sgprs = shader->num_input_sgprs;
+       prolog_key.vs_prolog.num_input_sgprs = shader->info.num_input_sgprs;
        prolog_key.vs_prolog.last_input = MAX2(1, info->num_inputs) - 1;
 
        /* The prolog is a no-op if there are no inputs. */
@@ -5245,7 +5334,7 @@ static bool si_shader_select_vs_parts(struct si_screen *sscreen,
        /* Set the instanceID flag. */
        for (i = 0; i < info->num_inputs; i++)
                if (prolog_key.vs_prolog.states.instance_divisors[i])
-                       shader->uses_instanceid = true;
+                       shader->info.uses_instanceid = true;
 
        return true;
 }
@@ -5285,7 +5374,7 @@ static bool si_compile_tcs_epilog(struct si_screen *sscreen,
        int last_array_pointer, last_sgpr, num_params;
        bool status = true;
 
-       si_init_shader_ctx(&ctx, sscreen, &shader, tm, NULL);
+       si_init_shader_ctx(&ctx, sscreen, &shader, tm);
        ctx.type = TGSI_PROCESSOR_TESS_CTRL;
        shader.key.tcs.epilog = key->tcs_epilog.states;
 
@@ -5374,7 +5463,7 @@ static bool si_compile_ps_prolog(struct si_screen *sscreen,
        int last_sgpr, num_params, num_returns, i, num_color_channels;
        bool status = true;
 
-       si_init_shader_ctx(&ctx, sscreen, &shader, tm, NULL);
+       si_init_shader_ctx(&ctx, sscreen, &shader, tm);
        ctx.type = TGSI_PROCESSOR_FRAGMENT;
        shader.key.ps.prolog = key->ps_prolog.states;
 
@@ -5412,6 +5501,24 @@ static bool si_compile_ps_prolog(struct si_screen *sscreen,
                ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
        }
 
+       /* Polygon stippling. */
+       if (key->ps_prolog.states.poly_stipple) {
+               /* POS_FIXED_PT is always last. */
+               unsigned pos = key->ps_prolog.num_input_sgprs +
+                              key->ps_prolog.num_input_vgprs - 1;
+               LLVMValueRef ptr[2], views;
+
+               /* Get the pointer to sampler views. */
+               ptr[0] = LLVMGetParam(func, SI_SGPR_SAMPLERS);
+               ptr[1] = LLVMGetParam(func, SI_SGPR_SAMPLERS+1);
+               views = lp_build_gather_values(gallivm, ptr, 2);
+               views = LLVMBuildBitCast(gallivm->builder, views, ctx.i64, "");
+               views = LLVMBuildIntToPtr(gallivm->builder, views,
+                                         const_array(ctx.v8i32, SI_NUM_SAMPLERS), "");
+
+               si_llvm_emit_polygon_stipple(&ctx, views, pos);
+       }
+
        /* Interpolate colors. */
        for (i = 0; i < 2; i++) {
                unsigned writemask = (key->ps_prolog.colors_read >> (i * 4)) & 0xf;
@@ -5486,8 +5593,6 @@ static bool si_compile_ps_prolog(struct si_screen *sscreen,
                                                   linear_sample[i], base + 10 + i, "");
        }
 
-       /* TODO: polygon stippling */
-
        /* Compile. */
        LLVMBuildRet(gallivm->builder, ret);
        radeon_llvm_finalize_module(&ctx.radeon_bld);
@@ -5520,7 +5625,7 @@ static bool si_compile_ps_epilog(struct si_screen *sscreen,
        int last_array_pointer, last_sgpr, num_params, i;
        bool status = true;
 
-       si_init_shader_ctx(&ctx, sscreen, &shader, tm, NULL);
+       si_init_shader_ctx(&ctx, sscreen, &shader, tm);
        ctx.type = TGSI_PROCESSOR_FRAGMENT;
        shader.key.ps.epilog = key->ps_epilog.states;
 
@@ -5635,8 +5740,8 @@ static bool si_shader_select_ps_parts(struct si_screen *sscreen,
        memset(&prolog_key, 0, sizeof(prolog_key));
        prolog_key.ps_prolog.states = shader->key.ps.prolog;
        prolog_key.ps_prolog.colors_read = info->colors_read;
-       prolog_key.ps_prolog.num_input_sgprs = shader->num_input_sgprs;
-       prolog_key.ps_prolog.num_input_vgprs = shader->num_input_vgprs;
+       prolog_key.ps_prolog.num_input_sgprs = shader->info.num_input_sgprs;
+       prolog_key.ps_prolog.num_input_vgprs = shader->info.num_input_vgprs;
 
        if (info->colors_read) {
                unsigned *color = shader->selector->color_attr_index;
@@ -5644,7 +5749,7 @@ static bool si_shader_select_ps_parts(struct si_screen *sscreen,
                if (shader->key.ps.prolog.color_two_side) {
                        /* BCOLORs are stored after the last input. */
                        prolog_key.ps_prolog.num_interp_inputs = info->num_inputs;
-                       prolog_key.ps_prolog.face_vgpr_index = shader->face_vgpr_index;
+                       prolog_key.ps_prolog.face_vgpr_index = shader->info.face_vgpr_index;
                        shader->config.spi_ps_input_ena |= S_0286CC_FRONT_FACE_ENA(1);
                }
 
@@ -5740,6 +5845,12 @@ static bool si_shader_select_ps_parts(struct si_screen *sscreen,
        if (!shader->epilog)
                return false;
 
+       /* Enable POS_FIXED_PT if polygon stippling is enabled. */
+       if (shader->key.ps.prolog.poly_stipple) {
+               shader->config.spi_ps_input_ena |= S_0286CC_POS_FIXED_PT_ENA(1);
+               assert(G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr));
+       }
+
        /* Set up the enable bits for per-sample shading if needed. */
        if (shader->key.ps.prolog.force_persample_interp) {
                if (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_ena) ||
@@ -5783,15 +5894,48 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
                     struct si_shader *shader,
                     struct pipe_debug_callback *debug)
 {
+       struct si_shader *mainp = shader->selector->main_shader_part;
        int r;
 
-       /* Compile TGSI. */
-       r = si_compile_tgsi_shader(sscreen, tm, shader,
-                                  sscreen->use_monolithic_shaders, debug);
-       if (r)
-               return r;
+       /* LS and ES are always compiled on demand. */
+       if (!mainp ||
+           (shader->selector->type == PIPE_SHADER_VERTEX &&
+            (shader->key.vs.as_es || shader->key.vs.as_ls)) ||
+           (shader->selector->type == PIPE_SHADER_TESS_EVAL &&
+            shader->key.tes.as_es)) {
+               /* Monolithic shader (compiled as a whole, has many variants,
+                * may take a long time to compile).
+                */
+               r = si_compile_tgsi_shader(sscreen, tm, shader, true, debug);
+               if (r)
+                       return r;
+       } else {
+               /* The shader consists of 2-3 parts:
+                *
+                * - the middle part is the user shader, it has 1 variant only
+                *   and it was compiled during the creation of the shader
+                *   selector
+                * - the prolog part is inserted at the beginning
+                * - the epilog part is inserted at the end
+                *
+                * The prolog and epilog have many (but simple) variants.
+                */
 
-       if (!sscreen->use_monolithic_shaders) {
+               /* Copy the compiled TGSI shader data over. */
+               shader->is_binary_shared = true;
+               shader->binary = mainp->binary;
+               shader->config = mainp->config;
+               shader->info.num_input_sgprs = mainp->info.num_input_sgprs;
+               shader->info.num_input_vgprs = mainp->info.num_input_vgprs;
+               shader->info.face_vgpr_index = mainp->info.face_vgpr_index;
+               memcpy(shader->info.vs_output_param_offset,
+                      mainp->info.vs_output_param_offset,
+                      sizeof(mainp->info.vs_output_param_offset));
+               shader->info.uses_instanceid = mainp->info.uses_instanceid;
+               shader->info.nr_pos_exports = mainp->info.nr_pos_exports;
+               shader->info.nr_param_exports = mainp->info.nr_param_exports;
+
+               /* Select prologs and/or epilogs. */
                switch (shader->selector->type) {
                case PIPE_SHADER_VERTEX:
                        if (!si_shader_select_vs_parts(sscreen, tm, shader, debug))
@@ -5813,7 +5957,7 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
                         * are allocated inputs.
                         */
                        shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
-                                                       shader->num_input_vgprs);
+                                                       shader->info.num_input_vgprs);
                        break;
                }
 
@@ -5832,7 +5976,8 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
                }
        }
 
-       si_shader_dump(sscreen, shader, debug, shader->selector->info.processor);
+       si_shader_dump(sscreen, shader, debug, shader->selector->info.processor,
+                      stderr);
 
        /* Upload. */
        r = si_shader_binary_upload(sscreen, shader);
@@ -5856,5 +6001,6 @@ void si_shader_destroy(struct si_shader *shader)
 
        r600_resource_reference(&shader->bo, NULL);
 
-       radeon_shader_binary_clean(&shader->binary);
+       if (!shader->is_binary_shared)
+               radeon_shader_binary_clean(&shader->binary);
 }