winsys/radeon: fold cs_set_flush_callback into cs_create
[mesa.git] / src / gallium / drivers / radeonsi / si_shader.c
index 42a3fc6a73b778509ae105c7215e564c2f3dcff0..e4390eeac1f353c150ce452eb51bce237220f6b7 100644 (file)
@@ -50,6 +50,7 @@ struct si_shader_output_values
        LLVMValueRef values[4];
        unsigned name;
        unsigned index;
+       unsigned sid;
        unsigned usage;
 };
 
@@ -169,6 +170,7 @@ static int si_store_shader_io_attribs(struct si_shader *shader,
                assert(i < Elements(shader->input));
                shader->input[i].name = d->Semantic.Name;
                shader->input[i].sid = d->Semantic.Index;
+               shader->input[i].index = d->Range.First;
                shader->input[i].interpolate = d->Interp.Interpolate;
                shader->input[i].centroid = d->Interp.Centroid;
                return -1;
@@ -258,7 +260,9 @@ static void declare_input_gs(
        struct si_shader *shader = &si_shader_ctx->shader->shader;
 
        si_store_shader_io_attribs(shader, decl);
-       shader->input[input_index].param_offset = shader->nparam++;
+
+       if (decl->Semantic.Name != TGSI_SEMANTIC_PRIMID)
+               shader->input[input_index].param_offset = shader->nparam++;
 }
 
 static LLVMValueRef fetch_input_gs(
@@ -279,6 +283,15 @@ static LLVMValueRef fetch_input_gs(
        LLVMValueRef args[9];
        unsigned vtx_offset_param;
 
+       if (swizzle != ~0 &&
+           shader->input[reg->Register.Index].name == TGSI_SEMANTIC_PRIMID) {
+               if (swizzle == 0)
+                       return LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
+                                           SI_PARAM_PRIMITIVE_ID);
+               else
+                       return uint->zero;
+       }
+
        if (!reg->Register.Dimension)
                return NULL;
 
@@ -306,10 +319,10 @@ static LLVMValueRef fetch_input_gs(
                                      4);
 
        /* Load the ESGS ring resource descriptor */
-       t_list_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_CONST);
+       t_list_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
+                                 SI_PARAM_RW_BUFFERS);
        t_list = build_indexed_load(si_shader_ctx, t_list_ptr,
-                                   lp_build_const_int32(gallivm,
-                                                        NUM_PIPE_CONST_BUFFERS + 1));
+                                   lp_build_const_int32(gallivm, SI_RING_ESGS));
 
        args[0] = t_list;
        args[1] = vtx_offset;
@@ -830,6 +843,9 @@ static void build_tbuffer_store(struct si_shader_context *shader,
                LLVMConstInt(i32, tfe, 0)
        };
 
+       /* The instruction offset field has 12 bits */
+       assert(offen || inst_offset < (1 << 12));
+
        /* The intrinsic is overloaded, we need to add a type suffix for overloading to work. */
        unsigned func = CLAMP(num_channels, 1, 3) - 1;
        const char *types[] = {"i32", "v2i32", "v4i32"};
@@ -864,7 +880,9 @@ static void build_streamout_store(struct si_shader_context *shader,
 
 /* On SI, the vertex shader is responsible for writing streamout data
  * to buffers. */
-static void si_llvm_emit_streamout(struct si_shader_context *shader)
+static void si_llvm_emit_streamout(struct si_shader_context *shader,
+                                  struct si_shader_output_values *outputs,
+                                  unsigned noutput)
 {
        struct pipe_stream_output_info *so = &shader->shader->selector->so;
        struct gallivm_state *gallivm = &shader->radeon_bld.gallivm;
@@ -925,8 +943,6 @@ static void si_llvm_emit_streamout(struct si_shader_context *shader)
                        so_write_offset[i] = LLVMBuildAdd(builder, so_write_offset[i], so_offset, "");
                }
 
-               LLVMValueRef (*outputs)[TGSI_NUM_CHANNELS] = shader->radeon_bld.soa.outputs;
-
                /* Write streamout data. */
                for (i = 0; i < so->num_outputs; i++) {
                        unsigned buf_idx = so->output[i].output_buffer;
@@ -941,10 +957,22 @@ static void si_llvm_emit_streamout(struct si_shader_context *shader)
 
                        /* Load the output as int. */
                        for (j = 0; j < num_comps; j++) {
-                               out[j] = LLVMBuildLoad(builder, outputs[reg][start+j], "");
-                               out[j] = LLVMBuildBitCast(builder, out[j], i32, "");
+                               unsigned outidx = 0;
+
+                               while (outidx < noutput && outputs[outidx].index != reg)
+                                       outidx++;
+
+                               if (outidx < noutput)
+                                       out[j] = LLVMBuildBitCast(builder,
+                                                                 outputs[outidx].values[start+j],
+                                                                 i32, "");
+                               else
+                                       out[j] = NULL;
                        }
 
+                       if (!out[0])
+                               continue;
+
                        /* Pack the output. */
                        LLVMValueRef vdata = NULL;
 
@@ -993,13 +1021,13 @@ static void si_llvm_export_vs(struct lp_build_tgsi_context *bld_base,
        unsigned pos_idx;
        int i;
 
-       if (si_shader_ctx->shader->selector->so.num_outputs) {
-               si_llvm_emit_streamout(si_shader_ctx);
+       if (outputs && si_shader_ctx->shader->selector->so.num_outputs) {
+               si_llvm_emit_streamout(si_shader_ctx, outputs, noutput);
        }
 
        for (i = 0; i < noutput; i++) {
                semantic_name = outputs[i].name;
-               semantic_index = outputs[i].index;
+               semantic_index = outputs[i].sid;
                semantic_usage = outputs[i].usage;
 
 handle_semantic:
@@ -1040,6 +1068,7 @@ handle_semantic:
                case TGSI_SEMANTIC_CLIPVERTEX:
                        si_llvm_emit_clipvertex(bld_base, pos_args, outputs[i].values);
                        continue;
+               case TGSI_SEMANTIC_PRIMID:
                case TGSI_SEMANTIC_FOG:
                case TGSI_SEMANTIC_GENERIC:
                        target = V_008DFC_SQ_EXP_PARAM + param_count;
@@ -1150,9 +1179,12 @@ static void si_llvm_emit_es_epilogue(struct lp_build_tgsi_context * bld_base)
 {
        struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
        struct gallivm_state *gallivm = bld_base->base.gallivm;
-       struct si_pipe_shader *shader = si_shader_ctx->shader;
+       struct si_shader *es = &si_shader_ctx->shader->shader;
+       struct si_shader *gs = si_shader_ctx->gs_for_vs;
        struct tgsi_parse_context *parse = &si_shader_ctx->parse;
        LLVMTypeRef i32 = LLVMInt32TypeInContext(gallivm->context);
+       LLVMValueRef soffset = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
+                                           SI_PARAM_ES2GS_OFFSET);
        LLVMValueRef t_list_ptr;
        LLVMValueRef t_list;
        unsigned chan;
@@ -1167,34 +1199,38 @@ static void si_llvm_emit_es_epilogue(struct lp_build_tgsi_context * bld_base)
                if (parse->FullToken.Token.Type != TGSI_TOKEN_TYPE_DECLARATION)
                        continue;
 
-               si_store_shader_io_attribs(&shader->shader, d);
+               si_store_shader_io_attribs(es, d);
        }
 
        /* Load the ESGS ring resource descriptor */
-       t_list_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_CONST);
+       t_list_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
+                                 SI_PARAM_RW_BUFFERS);
        t_list = build_indexed_load(si_shader_ctx, t_list_ptr,
-                                   lp_build_const_int32(gallivm,
-                                                        NUM_PIPE_CONST_BUFFERS + 1));
+                                   lp_build_const_int32(gallivm, SI_RING_ESGS));
 
-       for (i = 0; i < shader->shader.noutput; i++) {
+       for (i = 0; i < es->noutput; i++) {
                LLVMValueRef *out_ptr =
-                       si_shader_ctx->radeon_bld.soa.outputs[shader->shader.output[i].index];
+                       si_shader_ctx->radeon_bld.soa.outputs[es->output[i].index];
+               int j;
+
+               for (j = 0; j < gs->ninput; j++) {
+                       if (gs->input[j].name == es->output[i].name &&
+                           gs->input[j].sid == es->output[i].sid)
+                               break;
+               }
+               if (j == gs->ninput)
+                       continue;
 
                for (chan = 0; chan < 4; chan++) {
                        LLVMValueRef out_val = LLVMBuildLoad(gallivm->builder, out_ptr[chan], "");
-                       LLVMValueRef voffset =
-                               lp_build_const_int32(gallivm, (4 * i + chan) * 4);
-                       LLVMValueRef soffset =
-                               LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
-                                            SI_PARAM_ES2GS_OFFSET);
-
                        out_val = LLVMBuildBitCast(gallivm->builder, out_val, i32, "");
 
                        build_tbuffer_store(si_shader_ctx, t_list, out_val, 1,
-                                           voffset, soffset, 0,
+                                           LLVMGetUndef(i32), soffset,
+                                           (4 * gs->input[j].param_offset + chan) * 4,
                                            V_008F0C_BUF_DATA_FORMAT_32,
                                            V_008F0C_BUF_NUM_FORMAT_UINT,
-                                           1, 0, 1, 1, 0);
+                                           0, 0, 1, 1, 0);
                }
        }
 }
@@ -1239,8 +1275,9 @@ static void si_llvm_emit_vs_epilogue(struct lp_build_tgsi_context * bld_base)
                outputs = REALLOC(outputs, noutput * sizeof(outputs[0]),
                                  (noutput + 1) * sizeof(outputs[0]));
                for (index = d->Range.First; index <= d->Range.Last; index++) {
+                       outputs[noutput].index = index;
                        outputs[noutput].name = d->Semantic.Name;
-                       outputs[noutput].index = d->Semantic.Index;
+                       outputs[noutput].sid = d->Semantic.Index;
                        outputs[noutput].usage = d->Declaration.UsageMask;
 
                        for (i = 0; i < 4; i++)
@@ -1865,7 +1902,10 @@ static void si_llvm_emit_vertex(
        struct si_shader *shader = &si_shader_ctx->shader->shader;
        struct gallivm_state *gallivm = bld_base->base.gallivm;
        LLVMTypeRef i32 = LLVMInt32TypeInContext(gallivm->context);
+       LLVMValueRef soffset = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
+                                           SI_PARAM_GS2VS_OFFSET);
        LLVMValueRef gs_next_vertex;
+       LLVMValueRef can_emit, kill;
        LLVMValueRef t_list_ptr;
        LLVMValueRef t_list;
        LLVMValueRef args[2];
@@ -1873,10 +1913,10 @@ static void si_llvm_emit_vertex(
        int i;
 
        /* Load the GSVS ring resource descriptor */
-       t_list_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_CONST);
+       t_list_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
+                                 SI_PARAM_RW_BUFFERS);
        t_list = build_indexed_load(si_shader_ctx, t_list_ptr,
-                                   lp_build_const_int32(gallivm,
-                                                        NUM_PIPE_CONST_BUFFERS + 2));
+                                   lp_build_const_int32(gallivm, SI_RING_GSVS));
 
        if (shader->noutput == 0) {
                struct tgsi_parse_context *parse = &si_shader_ctx->parse;
@@ -1895,15 +1935,27 @@ static void si_llvm_emit_vertex(
 
        /* Write vertex attribute values to GSVS ring */
        gs_next_vertex = LLVMBuildLoad(gallivm->builder, si_shader_ctx->gs_next_vertex, "");
+
+       /* If this thread has already emitted the declared maximum number of
+        * vertices, kill it: excessive vertex emissions are not supposed to
+        * have any effect, and GS threads have no externally observable
+        * effects other than emitting vertices.
+        */
+       can_emit = LLVMBuildICmp(gallivm->builder, LLVMIntULE, gs_next_vertex,
+                                lp_build_const_int32(gallivm,
+                                                     shader->gs_max_out_vertices), "");
+       kill = lp_build_select(&bld_base->base, can_emit,
+                              lp_build_const_float(gallivm, 1.0f),
+                              lp_build_const_float(gallivm, -1.0f));
+       build_intrinsic(gallivm->builder, "llvm.AMDGPU.kill",
+                       LLVMVoidTypeInContext(gallivm->context), &kill, 1, 0);
+
        for (i = 0; i < shader->noutput; i++) {
                LLVMValueRef *out_ptr =
                        si_shader_ctx->radeon_bld.soa.outputs[shader->output[i].index];
 
                for (chan = 0; chan < 4; chan++) {
                        LLVMValueRef out_val = LLVMBuildLoad(gallivm->builder, out_ptr[chan], "");
-                       LLVMValueRef soffset =
-                               LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
-                                            SI_PARAM_GS2VS_OFFSET);
                        LLVMValueRef voffset =
                                lp_build_const_int32(gallivm, (i * 4 + chan) *
                                                     shader->gs_max_out_vertices);
@@ -2005,7 +2057,7 @@ static void create_function(struct si_shader_context *si_shader_ctx)
        struct lp_build_tgsi_context *bld_base = &si_shader_ctx->radeon_bld.soa.bld_base;
        struct gallivm_state *gallivm = bld_base->base.gallivm;
        struct si_pipe_shader *shader = si_shader_ctx->shader;
-       LLVMTypeRef params[21], f32, i8, i32, v2i32, v3i32;
+       LLVMTypeRef params[SI_NUM_PARAMS], f32, i8, i32, v2i32, v3i32;
        unsigned i, last_sgpr, num_params;
 
        i8 = LLVMInt8TypeInContext(gallivm->context);
@@ -2016,6 +2068,8 @@ static void create_function(struct si_shader_context *si_shader_ctx)
 
        params[SI_PARAM_CONST] = LLVMPointerType(
                LLVMArrayType(LLVMVectorType(i8, 16), NUM_CONST_BUFFERS), CONST_ADDR_SPACE);
+       params[SI_PARAM_RW_BUFFERS] = params[SI_PARAM_CONST];
+
        /* We assume at most 16 textures per program at the moment.
         * This need probably need to be changed to support bindless textures */
        params[SI_PARAM_SAMPLER] = LLVMPointerType(
@@ -2026,7 +2080,6 @@ static void create_function(struct si_shader_context *si_shader_ctx)
        switch (si_shader_ctx->type) {
        case TGSI_PROCESSOR_VERTEX:
                params[SI_PARAM_VERTEX_BUFFER] = params[SI_PARAM_CONST];
-               params[SI_PARAM_SO_BUFFER] = params[SI_PARAM_CONST];
                params[SI_PARAM_START_INSTANCE] = i32;
                num_params = SI_PARAM_START_INSTANCE+1;
                if (shader->key.vs.as_es) {
@@ -2218,16 +2271,19 @@ static void preload_streamout_buffers(struct si_shader_context *si_shader_ctx)
        struct gallivm_state * gallivm = bld_base->base.gallivm;
        unsigned i;
 
-       if (!si_shader_ctx->shader->selector->so.num_outputs)
+       if (si_shader_ctx->type != TGSI_PROCESSOR_VERTEX ||
+           si_shader_ctx->shader->key.vs.as_es ||
+           !si_shader_ctx->shader->selector->so.num_outputs)
                return;
 
        LLVMValueRef buf_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
-                                           SI_PARAM_SO_BUFFER);
+                                           SI_PARAM_RW_BUFFERS);
 
        /* Load the resources, we rely on the code sinking to do the rest */
        for (i = 0; i < 4; ++i) {
                if (si_shader_ctx->shader->selector->so.stride[i]) {
-                       LLVMValueRef offset = lp_build_const_int32(gallivm, i);
+                       LLVMValueRef offset = lp_build_const_int32(gallivm,
+                                                                  SI_RW_SO + i);
 
                        si_shader_ctx->so_buffers[i] = build_indexed_load(si_shader_ctx, buf_ptr, offset);
                }
@@ -2239,7 +2295,7 @@ int si_compile_llvm(struct si_context *sctx, struct si_pipe_shader *shader,
 {
        unsigned i;
        uint32_t *ptr;
-       struct radeon_llvm_binary binary;
+       struct radeon_shader_binary binary;
        bool dump = r600_can_dump_shader(&sctx->screen->b,
                        shader->selector ? shader->selector->tokens : NULL);
        memset(&binary, 0, sizeof(binary));
@@ -2293,9 +2349,9 @@ int si_compile_llvm(struct si_context *sctx, struct si_pipe_shader *shader,
        }
 
        ptr = (uint32_t*)sctx->b.ws->buffer_map(shader->bo->cs_buf, sctx->b.rings.gfx.cs, PIPE_TRANSFER_WRITE);
-       if (0 /*SI_BIG_ENDIAN*/) {
+       if (SI_BIG_ENDIAN) {
                for (i = 0; i < binary.code_size / 4; ++i) {
-                       ptr[i] = util_bswap32(*(uint32_t*)(binary.code + i*4));
+                       ptr[i] = util_cpu_to_le32((*(uint32_t*)(binary.code + i*4)));
                }
        } else {
                memcpy(ptr, binary.code, binary.code_size);
@@ -2317,6 +2373,7 @@ static int si_generate_gs_copy_shader(struct si_context *sctx,
        struct lp_build_tgsi_context *bld_base = &si_shader_ctx->radeon_bld.soa.bld_base;
        struct lp_build_context *base = &bld_base->base;
        struct lp_build_context *uint = &bld_base->uint_bld;
+       struct si_shader *shader = &si_shader_ctx->shader->shader;
        struct si_shader *gs = &si_shader_ctx->shader->selector->current->shader;
        struct si_shader_output_values *outputs;
        LLVMValueRef t_list_ptr, t_list;
@@ -2335,10 +2392,10 @@ static int si_generate_gs_copy_shader(struct si_context *sctx,
        preload_streamout_buffers(si_shader_ctx);
 
        /* Load the GSVS ring resource descriptor */
-       t_list_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_CONST);
+       t_list_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
+                                 SI_PARAM_RW_BUFFERS);
        t_list = build_indexed_load(si_shader_ctx, t_list_ptr,
-                                   lp_build_const_int32(gallivm,
-                                                        NUM_PIPE_CONST_BUFFERS + 1));
+                                   lp_build_const_int32(gallivm, SI_RING_GSVS));
 
        args[0] = t_list;
        args[1] = lp_build_mul_imm(uint,
@@ -2357,8 +2414,11 @@ static int si_generate_gs_copy_shader(struct si_context *sctx,
                struct si_shader_output *out = gs->output + i;
                unsigned chan;
 
+               shader->output[i] = *out;
+
                outputs[i].name = out->name;
                outputs[i].index = out->index;
+               outputs[i].sid = out->sid;
                outputs[i].usage = out->usage;
 
                for (chan = 0; chan < 4; chan++) {
@@ -2376,6 +2436,7 @@ static int si_generate_gs_copy_shader(struct si_context *sctx,
                                                 base->elem_type, "");
                }
        }
+       shader->noutput = gs->noutput;
 
        si_llvm_export_vs(bld_base, outputs, gs->noutput);
 
@@ -2404,7 +2465,14 @@ int si_pipe_shader_create(
        struct lp_build_tgsi_context * bld_base;
        LLVMModuleRef mod;
        int r = 0;
-       bool dump = r600_can_dump_shader(&sctx->screen->b, shader->selector->tokens);
+       bool dump = r600_can_dump_shader(&sctx->screen->b, sel->tokens);
+
+       /* Dump TGSI code before doing TGSI->LLVM conversion in case the
+        * conversion fails. */
+       if (dump) {
+               tgsi_dump(sel->tokens, 0);
+               si_dump_streamout(&sel->so);
+       }
 
        assert(shader->shader.noutput == 0);
        assert(shader->shader.nparam == 0);
@@ -2492,13 +2560,6 @@ int si_pipe_shader_create(
        preload_samplers(&si_shader_ctx);
        preload_streamout_buffers(&si_shader_ctx);
 
-       /* Dump TGSI code before doing TGSI->LLVM conversion in case the
-        * conversion fails. */
-       if (dump) {
-               tgsi_dump(sel->tokens, 0);
-               si_dump_streamout(&sel->so);
-       }
-
        if (si_shader_ctx.type == TGSI_PROCESSOR_GEOMETRY) {
                si_shader_ctx.gs_next_vertex =
                        lp_build_alloca(bld_base->base.gallivm,
@@ -2524,6 +2585,7 @@ int si_pipe_shader_create(
        if (si_shader_ctx.type == TGSI_PROCESSOR_GEOMETRY) {
                shader->gs_copy_shader = CALLOC_STRUCT(si_pipe_shader);
                shader->gs_copy_shader->selector = shader->selector;
+               shader->gs_copy_shader->key = shader->key;
                si_shader_ctx.shader = shader->gs_copy_shader;
                if ((r = si_generate_gs_copy_shader(sctx, &si_shader_ctx, dump))) {
                        free(shader->gs_copy_shader);