radeonsi: have separate LS and ES main shader parts in the shader selector
[mesa.git] / src / gallium / drivers / radeonsi / si_shader.c
index e483fe4923ac03f3232de3fa71d1da714caee3d7..de427789aae8aa5dc3df534d112e5cd75040714a 100644 (file)
@@ -98,14 +98,6 @@ enum {
        LOCAL_ADDR_SPACE = 3,
 };
 
-#define SENDMSG_GS 2
-#define SENDMSG_GS_DONE 3
-
-#define SENDMSG_GS_OP_NOP      (0 << 4)
-#define SENDMSG_GS_OP_CUT      (1 << 4)
-#define SENDMSG_GS_OP_EMIT     (2 << 4)
-#define SENDMSG_GS_OP_EMIT_CUT (3 << 4)
-
 /**
  * Returns a unique index for a semantic name and index. The index must be
  * less than 64, so that a 64-bit bitmask of used inputs or outputs can be
@@ -327,6 +319,21 @@ static LLVMValueRef get_instance_index_for_fetch(
                            LLVMGetParam(radeon_bld->main_fn, param_start_instance), "");
 }
 
+/* Bitcast vec4 (<4 x float>) to <2 x double>, extract the element selected
+ * by double_index (0 or 1), and truncate it with FPTrunc to ctx->f32. */
+static LLVMValueRef extract_double_to_float(struct si_shader_context *ctx,
+                                           LLVMValueRef vec4,
+                                           unsigned double_index)
+{
+       LLVMBuilderRef builder = ctx->gallivm.builder;
+       LLVMTypeRef f64 = LLVMDoubleTypeInContext(ctx->gallivm.context);
+       LLVMValueRef dvec2 = LLVMBuildBitCast(builder, vec4,
+                                             LLVMVectorType(f64, 2), "");
+       LLVMValueRef index = LLVMConstInt(ctx->i32, double_index, 0);
+       LLVMValueRef value = LLVMBuildExtractElement(builder, dvec2, index, "");
+       return LLVMBuildFPTrunc(builder, value, ctx->f32, "");
+}
+
 static void declare_input_vs(
        struct si_shader_context *ctx,
        unsigned input_index,
@@ -338,14 +345,15 @@ static void declare_input_vs(
 
        unsigned chan;
        unsigned fix_fetch;
+       unsigned num_fetches;
+       unsigned fetch_stride;
 
        LLVMValueRef t_list_ptr;
        LLVMValueRef t_offset;
        LLVMValueRef t_list;
-       LLVMValueRef attribute_offset;
-       LLVMValueRef buffer_index;
+       LLVMValueRef vertex_index;
        LLVMValueRef args[3];
-       LLVMValueRef input;
+       LLVMValueRef input[3];
 
        /* Load the T list */
        t_list_ptr = LLVMGetParam(ctx->main_fn, SI_PARAM_VERTEX_BUFFERS);
@@ -354,29 +362,55 @@ static void declare_input_vs(
 
        t_list = ac_build_indexed_load_const(&ctx->ac, t_list_ptr, t_offset);
 
-       /* Build the attribute offset */
-       attribute_offset = lp_build_const_int32(gallivm, 0);
-
-       buffer_index = LLVMGetParam(ctx->main_fn,
+       vertex_index = LLVMGetParam(ctx->main_fn,
                                    ctx->param_vertex_index0 +
                                    input_index);
 
+       fix_fetch = ctx->shader->key.mono.vs.fix_fetch[input_index];
+
+       /* Do multiple loads for special formats. */
+       switch (fix_fetch) {
+       case SI_FIX_FETCH_RGB_64_FLOAT:
+               num_fetches = 3; /* 3 2-dword loads */
+               fetch_stride = 8;
+               break;
+       case SI_FIX_FETCH_RGBA_64_FLOAT:
+               num_fetches = 2; /* 2 4-dword loads */
+               fetch_stride = 16;
+               break;
+       case SI_FIX_FETCH_RGB_8:
+       case SI_FIX_FETCH_RGB_8_INT:
+               num_fetches = 3;
+               fetch_stride = 1;
+               break;
+       case SI_FIX_FETCH_RGB_16:
+       case SI_FIX_FETCH_RGB_16_INT:
+               num_fetches = 3;
+               fetch_stride = 2;
+               break;
+       default:
+               num_fetches = 1;
+               fetch_stride = 0;
+       }
+
        args[0] = t_list;
-       args[1] = attribute_offset;
-       args[2] = buffer_index;
-       input = lp_build_intrinsic(gallivm->builder,
-               "llvm.SI.vs.load.input", ctx->v4f32, args, 3,
-               LP_FUNC_ATTR_READNONE);
+       args[2] = vertex_index;
+
+       for (unsigned i = 0; i < num_fetches; i++) {
+               args[1] = LLVMConstInt(ctx->i32, fetch_stride * i, 0);
+
+               input[i] = lp_build_intrinsic(gallivm->builder,
+                       "llvm.SI.vs.load.input", ctx->v4f32, args, 3,
+                       LP_FUNC_ATTR_READNONE);
+       }
 
        /* Break up the vec4 into individual components */
        for (chan = 0; chan < 4; chan++) {
                LLVMValueRef llvm_chan = lp_build_const_int32(gallivm, chan);
                out[chan] = LLVMBuildExtractElement(gallivm->builder,
-                                                   input, llvm_chan, "");
+                                                   input[0], llvm_chan, "");
        }
 
-       fix_fetch = (ctx->shader->key.mono.vs.fix_fetch >> (4 * input_index)) & 0xf;
-
        switch (fix_fetch) {
        case SI_FIX_FETCH_A2_SNORM:
        case SI_FIX_FETCH_A2_SSCALED:
@@ -472,6 +506,42 @@ static void declare_input_vs(
                                                    out[chan], ctx->f32, "");
                }
                break;
+       case SI_FIX_FETCH_RG_64_FLOAT:
+               for (chan = 0; chan < 2; chan++)
+                       out[chan] = extract_double_to_float(ctx, input[0], chan);
+
+               out[2] = LLVMConstReal(ctx->f32, 0);
+               out[3] = LLVMConstReal(ctx->f32, 1);
+               break;
+       case SI_FIX_FETCH_RGB_64_FLOAT:
+               for (chan = 0; chan < 3; chan++)
+                       out[chan] = extract_double_to_float(ctx, input[chan], 0);
+
+               out[3] = LLVMConstReal(ctx->f32, 1);
+               break;
+       case SI_FIX_FETCH_RGBA_64_FLOAT:
+               for (chan = 0; chan < 4; chan++) {
+                       out[chan] = extract_double_to_float(ctx, input[chan / 2],
+                                                           chan % 2);
+               }
+               break;
+       case SI_FIX_FETCH_RGB_8:
+       case SI_FIX_FETCH_RGB_8_INT:
+       case SI_FIX_FETCH_RGB_16:
+       case SI_FIX_FETCH_RGB_16_INT:
+               for (chan = 0; chan < 3; chan++) {
+                       out[chan] = LLVMBuildExtractElement(gallivm->builder,
+                                                           input[chan],
+                                                           ctx->i32_0, "");
+               }
+               if (fix_fetch == SI_FIX_FETCH_RGB_8 ||
+                   fix_fetch == SI_FIX_FETCH_RGB_16) {
+                       out[3] = LLVMConstReal(ctx->f32, 1);
+               } else {
+                       out[3] = LLVMBuildBitCast(gallivm->builder, ctx->i32_1,
+                                                 ctx->f32, "");
+               }
+               break;
        }
 }
 
@@ -2572,13 +2642,9 @@ static void si_llvm_emit_es_epilogue(struct lp_build_tgsi_context *bld_base)
 static void si_llvm_emit_gs_epilogue(struct lp_build_tgsi_context *bld_base)
 {
        struct si_shader_context *ctx = si_shader_context(bld_base);
-       struct gallivm_state *gallivm = bld_base->base.gallivm;
-       LLVMValueRef args[2];
 
-       args[0] = lp_build_const_int32(gallivm, SENDMSG_GS_OP_NOP | SENDMSG_GS_DONE);
-       args[1] = LLVMGetParam(ctx->main_fn, SI_PARAM_GS_WAVE_ID);
-       lp_build_intrinsic(gallivm->builder, "llvm.SI.sendmsg",
-                          ctx->voidt, args, 2, 0);
+       ac_emit_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_NOP | AC_SENDMSG_GS_DONE,
+                       LLVMGetParam(ctx->main_fn, SI_PARAM_GS_WAVE_ID));
 }
 
 static void si_llvm_emit_vs_epilogue(struct lp_build_tgsi_context *bld_base)
@@ -4395,16 +4461,12 @@ static void tex_fetch_args(
                struct lp_build_context *uint_bld = &bld_base->uint_bld;
                struct lp_build_emit_data txf_emit_data = *emit_data;
                LLVMValueRef txf_address[4];
-               unsigned txf_count = count;
+               /* We only need .xy for non-arrays, and .xyz for arrays. */
+               unsigned txf_count = target == TGSI_TEXTURE_2D_MSAA ? 2 : 3;
                struct tgsi_full_instruction inst = {};
 
                memcpy(txf_address, address, sizeof(txf_address));
 
-               if (target == TGSI_TEXTURE_2D_MSAA) {
-                       txf_address[2] = bld_base->uint_bld.zero;
-               }
-               txf_address[3] = bld_base->uint_bld.zero;
-
                /* Read FMASK using TXF. */
                inst.Instruction.Opcode = TGSI_OPCODE_TXF;
                inst.Texture.Texture = target;
@@ -4425,7 +4487,7 @@ static void tex_fetch_args(
                                                txf_emit_data.output[0],
                                                uint_bld->zero, "");
 
-               unsigned sample_chan = target == TGSI_TEXTURE_2D_MSAA ? 2 : 3;
+               unsigned sample_chan = txf_count; /* the sample index is last */
 
                LLVMValueRef sample_index4 =
                        LLVMBuildMul(gallivm->builder, address[sample_chan], four, "");
@@ -4722,35 +4784,6 @@ static void si_llvm_emit_txqs(
        emit_data->output[emit_data->chan] = samples;
 }
 
-/*
- * SI implements derivatives using the local data store (LDS)
- * All writes to the LDS happen in all executing threads at
- * the same time. TID is the Thread ID for the current
- * thread and is a value between 0 and 63, representing
- * the thread's position in the wavefront.
- *
- * For the pixel shader threads are grouped into quads of four pixels.
- * The TIDs of the pixels of a quad are:
- *
- *  +------+------+
- *  |4n + 0|4n + 1|
- *  +------+------+
- *  |4n + 2|4n + 3|
- *  +------+------+
- *
- * So, masking the TID with 0xfffffffc yields the TID of the top left pixel
- * of the quad, masking with 0xfffffffd yields the TID of the top pixel of
- * the current pixel's column, and masking with 0xfffffffe yields the TID
- * of the left pixel of the current pixel's row.
- *
- * Adding 1 yields the TID of the pixel to the right of the left pixel, and
- * adding 2 yields the TID of the pixel below the top pixel.
- */
-/* masks for thread ID. */
-#define TID_MASK_TOP_LEFT 0xfffffffc
-#define TID_MASK_TOP      0xfffffffd
-#define TID_MASK_LEFT     0xfffffffe
-
 static void si_llvm_emit_ddxy(
        const struct lp_build_tgsi_action *action,
        struct lp_build_tgsi_context *bld_base,
@@ -4759,59 +4792,24 @@ static void si_llvm_emit_ddxy(
        struct si_shader_context *ctx = si_shader_context(bld_base);
        struct gallivm_state *gallivm = bld_base->base.gallivm;
        unsigned opcode = emit_data->info->opcode;
-       LLVMValueRef thread_id, tl, trbl, tl_tid, trbl_tid, val, args[2];
+       LLVMValueRef val;
        int idx;
        unsigned mask;
 
-       thread_id = ac_get_thread_id(&ctx->ac);
-
        if (opcode == TGSI_OPCODE_DDX_FINE)
-               mask = TID_MASK_LEFT;
+               mask = AC_TID_MASK_LEFT;
        else if (opcode == TGSI_OPCODE_DDY_FINE)
-               mask = TID_MASK_TOP;
+               mask = AC_TID_MASK_TOP;
        else
-               mask = TID_MASK_TOP_LEFT;
-
-       tl_tid = LLVMBuildAnd(gallivm->builder, thread_id,
-                               lp_build_const_int32(gallivm, mask), "");
+               mask = AC_TID_MASK_TOP_LEFT;
 
        /* for DDX we want to next X pixel, DDY next Y pixel. */
        idx = (opcode == TGSI_OPCODE_DDX || opcode == TGSI_OPCODE_DDX_FINE) ? 1 : 2;
-       trbl_tid = LLVMBuildAdd(gallivm->builder, tl_tid,
-                                 lp_build_const_int32(gallivm, idx), "");
 
        val = LLVMBuildBitCast(gallivm->builder, emit_data->args[0], ctx->i32, "");
-
-       if (ctx->screen->has_ds_bpermute) {
-               args[0] = LLVMBuildMul(gallivm->builder, tl_tid,
-                                      lp_build_const_int32(gallivm, 4), "");
-               args[1] = val;
-               tl = lp_build_intrinsic(gallivm->builder,
-                                       "llvm.amdgcn.ds.bpermute", ctx->i32,
-                                       args, 2, LP_FUNC_ATTR_READNONE);
-
-               args[0] = LLVMBuildMul(gallivm->builder, trbl_tid,
-                                      lp_build_const_int32(gallivm, 4), "");
-               trbl = lp_build_intrinsic(gallivm->builder,
-                                         "llvm.amdgcn.ds.bpermute", ctx->i32,
-                                         args, 2, LP_FUNC_ATTR_READNONE);
-       } else {
-               LLVMValueRef store_ptr, load_ptr0, load_ptr1;
-
-               store_ptr = ac_build_gep0(&ctx->ac, ctx->lds, thread_id);
-               load_ptr0 = ac_build_gep0(&ctx->ac, ctx->lds, tl_tid);
-               load_ptr1 = ac_build_gep0(&ctx->ac, ctx->lds, trbl_tid);
-
-               LLVMBuildStore(gallivm->builder, val, store_ptr);
-               tl = LLVMBuildLoad(gallivm->builder, load_ptr0, "");
-               trbl = LLVMBuildLoad(gallivm->builder, load_ptr1, "");
-       }
-
-       tl = LLVMBuildBitCast(gallivm->builder, tl, ctx->f32, "");
-       trbl = LLVMBuildBitCast(gallivm->builder, trbl, ctx->f32, "");
-
-       emit_data->output[emit_data->chan] =
-               LLVMBuildFSub(gallivm->builder, trbl, tl, "");
+       val = ac_emit_ddxy(&ctx->ac, ctx->screen->has_ds_bpermute,
+                          mask, idx, ctx->lds, val);
+       emit_data->output[emit_data->chan] = val;
 }
 
 /*
@@ -5013,7 +5011,6 @@ static void si_llvm_emit_vertex(
                                            SI_PARAM_GS2VS_OFFSET);
        LLVMValueRef gs_next_vertex;
        LLVMValueRef can_emit, kill;
-       LLVMValueRef args[2];
        unsigned chan, offset;
        int i;
        unsigned stream;
@@ -5085,11 +5082,8 @@ static void si_llvm_emit_vertex(
        LLVMBuildStore(gallivm->builder, gs_next_vertex, ctx->gs_next_vertex[stream]);
 
        /* Signal vertex emission */
-       args[0] = lp_build_const_int32(gallivm, SENDMSG_GS_OP_EMIT | SENDMSG_GS | (stream << 8));
-       args[1] = LLVMGetParam(ctx->main_fn, SI_PARAM_GS_WAVE_ID);
-       lp_build_intrinsic(gallivm->builder, "llvm.SI.sendmsg",
-                          ctx->voidt, args, 2, 0);
-
+       ac_emit_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (stream << 8),
+                       LLVMGetParam(ctx->main_fn, SI_PARAM_GS_WAVE_ID));       
        if (!use_kill)
                lp_build_endif(&if_state);
 }
@@ -5101,16 +5095,12 @@ static void si_llvm_emit_primitive(
        struct lp_build_emit_data *emit_data)
 {
        struct si_shader_context *ctx = si_shader_context(bld_base);
-       struct gallivm_state *gallivm = bld_base->base.gallivm;
-       LLVMValueRef args[2];
        unsigned stream;
 
        /* Signal primitive cut */
        stream = si_llvm_get_stream(bld_base, emit_data);
-       args[0] = lp_build_const_int32(gallivm, SENDMSG_GS_OP_CUT | SENDMSG_GS | (stream << 8));
-       args[1] = LLVMGetParam(ctx->main_fn, SI_PARAM_GS_WAVE_ID);
-       lp_build_intrinsic(gallivm->builder, "llvm.SI.sendmsg",
-                          ctx->voidt, args, 2, 0);
+       ac_emit_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_CUT | AC_SENDMSG_GS | (stream << 8),
+                       LLVMGetParam(ctx->main_fn, SI_PARAM_GS_WAVE_ID));
 }
 
 static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action,
@@ -5816,7 +5806,8 @@ int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader)
        r600_resource_reference(&shader->bo, NULL);
        shader->bo = (struct r600_resource*)
                     pipe_buffer_create(&sscreen->b.b, 0,
-                                       PIPE_USAGE_IMMUTABLE, bo_size);
+                                       PIPE_USAGE_IMMUTABLE,
+                                       align(bo_size, SI_CPDMA_ALIGNMENT));
        if (!shader->bo)
                return -ENOMEM;
 
@@ -5982,8 +5973,7 @@ static void si_shader_dump_stats(struct si_screen *sscreen,
                           conf->spilled_vgprs, conf->private_mem_vgprs);
 }
 
-static const char *si_get_shader_name(struct si_shader *shader,
-                                     unsigned processor)
+const char *si_get_shader_name(struct si_shader *shader, unsigned processor)
 {
        switch (processor) {
        case PIPE_SHADER_VERTEX:
@@ -6310,7 +6300,11 @@ static void si_dump_shader_key(unsigned shader, struct si_shader_key *key,
                fprintf(f, "  part.vs.epilog.export_prim_id = %u\n", key->part.vs.epilog.export_prim_id);
                fprintf(f, "  as_es = %u\n", key->as_es);
                fprintf(f, "  as_ls = %u\n", key->as_ls);
-               fprintf(f, "  mono.vs.fix_fetch = 0x%"PRIx64"\n", key->mono.vs.fix_fetch);
+
+               fprintf(f, "  mono.vs.fix_fetch = {");
+               for (i = 0; i < SI_MAX_ATTRIBS; i++)
+                       fprintf(f, !i ? "%u" : ", %u", key->mono.vs.fix_fetch[i]);
+               fprintf(f, "}\n");
                break;
 
        case PIPE_SHADER_TESS_CTRL:
@@ -8246,7 +8240,7 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
                     struct pipe_debug_callback *debug)
 {
        struct si_shader_selector *sel = shader->selector;
-       struct si_shader *mainp = sel->main_shader_part;
+       struct si_shader *mainp = *si_get_main_shader_part(sel, &shader->key);
        int r;
 
        /* LS, ES, VS are compiled on demand if the main part hasn't been