+/* Send the GS Alloc Req message to SPI from the first wave of the group.
+ *
+ * Message payload layout:
+ * - bits 0..10: vertices in group
+ * - bits 12..22: primitives in group
+ *
+ * The message is emitted inside a branch taken only when wave_id equals 0.
+ * On GFX10, when the caller passes ctx->i32_0 as prim_cnt, one vertex and one
+ * primitive are requested instead, and thread 0 additionally exports a dummy
+ * primitive together with an all-zero position.
+ */
+void ac_build_sendmsg_gs_alloc_req(struct ac_llvm_context *ctx, LLVMValueRef wave_id,
+                                   LLVMValueRef vtx_cnt, LLVMValueRef prim_cnt)
+{
+   LLVMBuilderRef builder = ctx->builder;
+
+   /* HW workaround for a GPU hang with 100% culling: at least 1 primitive
+    * always has to be exported, so a degenerate triangle that uses vertex 0
+    * for all 3 vertices is exported in that case.
+    */
+   const bool need_dummy_prim = prim_cnt == ctx->i32_0 && ctx->chip_class == GFX10;
+
+   if (need_dummy_prim) {
+      assert(vtx_cnt == ctx->i32_0);
+      prim_cnt = ctx->i32_1;
+      vtx_cnt = ctx->i32_1;
+   }
+
+   LLVMValueRef is_first_wave = LLVMBuildICmp(builder, LLVMIntEQ, wave_id, ctx->i32_0, "");
+   ac_build_ifcc(ctx, is_first_wave, 5020);
+
+   /* payload = (prim_cnt << 12) | vtx_cnt */
+   LLVMValueRef prim_bits =
+      LLVMBuildShl(builder, prim_cnt, LLVMConstInt(ctx->i32, 12, false), "");
+   LLVMValueRef payload = LLVMBuildOr(builder, prim_bits, vtx_cnt, "");
+   ac_build_sendmsg(ctx, AC_SENDMSG_GS_ALLOC_REQ, payload);
+
+   if (need_dummy_prim) {
+      struct ac_ngg_prim prim = {
+         /* The vertex indices are 0,0,0. */
+         .passthrough = ctx->i32_0,
+      };
+      struct ac_export_args pos = {
+         .out = {ctx->f32_0, ctx->f32_0, ctx->f32_0, ctx->f32_0},
+         .target = V_008DFC_SQ_EXP_POS,
+         .enabled_channels = 0xf,
+         .done = true,
+      };
+
+      /* Only the thread whose id is 0 performs the two exports. */
+      LLVMValueRef is_first_thread =
+         LLVMBuildICmp(builder, LLVMIntEQ, ac_get_thread_id(ctx), ctx->i32_0, "");
+      ac_build_ifcc(ctx, is_first_thread, 5021);
+      ac_build_export_prim(ctx, &prim);
+      ac_build_export(ctx, &pos);
+      ac_build_endif(ctx, 5021);
+   }
+
+   ac_build_endif(ctx, 5020);
+}
+
+/* Pack the vertex indices, edge flags and null flag of "prim" into the single
+ * i32 value used for the primitive export and return it.
+ */
+LLVMValueRef ac_pack_prim_export(struct ac_llvm_context *ctx,
+                                 const struct ac_ngg_prim *prim)
+{
+   /* The prim export format is:
+    *  - bits 0..8: index 0
+    *  - bit 9: edge flag 0
+    *  - bits 10..18: index 1
+    *  - bit 19: edge flag 1
+    *  - bits 20..28: index 2
+    *  - bit 29: edge flag 2
+    *  - bit 31: null primitive (skip)
+    */
+   LLVMBuilderRef builder = ctx->builder;
+
+   /* Start with the null-primitive flag in bit 31. */
+   LLVMValueRef null_flag = LLVMBuildZExt(builder, prim->isnull, ctx->i32, "");
+   LLVMValueRef packed =
+      LLVMBuildShl(builder, null_flag, LLVMConstInt(ctx->i32, 31, false), "");
+
+   /* Each vertex occupies a 10-bit slot: the index, then the edge flag in
+    * the slot's bit 9.
+    */
+   for (unsigned vtx = 0; vtx < prim->num_vertices; vtx++) {
+      const unsigned index_shift = 10 * vtx;
+      const unsigned edgeflag_shift = index_shift + 9;
+
+      LLVMValueRef index_bits =
+         LLVMBuildShl(builder, prim->index[vtx],
+                      LLVMConstInt(ctx->i32, index_shift, false), "");
+      packed = LLVMBuildOr(builder, packed, index_bits, "");
+
+      LLVMValueRef edgeflag = LLVMBuildZExt(builder, prim->edgeflag[vtx], ctx->i32, "");
+      LLVMValueRef edgeflag_bits =
+         LLVMBuildShl(builder, edgeflag,
+                      LLVMConstInt(ctx->i32, edgeflag_shift, false), "");
+      packed = LLVMBuildOr(builder, packed, edgeflag_bits, "");
+   }
+
+   return packed;
+}
+
+/* Emit the primitive export for "prim".
+ *
+ * The exported value is prim->passthrough when that is set; otherwise it is
+ * packed from the individual fields by ac_pack_prim_export().
+ */
+void ac_build_export_prim(struct ac_llvm_context *ctx,
+                          const struct ac_ngg_prim *prim)
+{
+   LLVMValueRef prim_data =
+      prim->passthrough ? prim->passthrough : ac_pack_prim_export(ctx, prim);
+
+   /* The export channels are filled with f32 values, so the integer payload
+    * is bit-cast; the three remaining channels are left undefined.
+    */
+   LLVMValueRef prim_data_f32 = LLVMBuildBitCast(ctx->builder, prim_data, ctx->f32, "");
+   LLVMValueRef undef_f32 = LLVMGetUndef(ctx->f32);
+
+   struct ac_export_args args = {
+      .out = {prim_data_f32, undef_f32, undef_f32, undef_f32},
+      .target = V_008DFC_SQ_EXP_PRIM,
+      .enabled_channels = 1, /* only channel 0 carries data */
+      .done = true,
+      .valid_mask = false,
+      .compr = false,
+   };
+
+   ac_build_export(ctx, &args);
+}
+
+/* Return the LLVM type for a shader argument of the given kind and size.
+ *
+ * Float and int arguments map to f32 / i32 when size is 1, otherwise to a
+ * vector of "size" such elements. Every other kind is a pointer argument:
+ * size 1 yields the type built by ac_array_in_const32_addr_space(), size 2
+ * the type built by ac_array_in_const_addr_space().
+ */
+static LLVMTypeRef
+arg_llvm_type(enum ac_arg_type type, unsigned size, struct ac_llvm_context *ctx)
+{
+   LLVMTypeRef pointee;
+
+   switch (type) {
+   case AC_ARG_FLOAT:
+      return size == 1 ? ctx->f32 : LLVMVectorType(ctx->f32, size);
+   case AC_ARG_INT:
+      return size == 1 ? ctx->i32 : LLVMVectorType(ctx->i32, size);
+   case AC_ARG_CONST_PTR:
+      pointee = ctx->i8;
+      break;
+   case AC_ARG_CONST_FLOAT_PTR:
+      pointee = ctx->f32;
+      break;
+   case AC_ARG_CONST_PTR_PTR:
+      pointee = ac_array_in_const32_addr_space(ctx->i8);
+      break;
+   case AC_ARG_CONST_DESC_PTR:
+      pointee = ctx->v4i32;
+      break;
+   case AC_ARG_CONST_IMAGE_PTR:
+      pointee = ctx->v8i32;
+      break;
+   default:
+      unreachable("unknown arg type");
+   }
+
+   /* Pointer arguments are only expected with size 1 or 2. */
+   if (size == 1)
+      return ac_array_in_const32_addr_space(pointee);
+
+   assert(size == 2);
+   return ac_array_in_const_addr_space(pointee);
+}
+
+/* Create the shader main function named "name" in "module".
+ *
+ * One LLVM parameter is declared for each entry of args->args, typed by
+ * arg_llvm_type(). The function receives the requested calling convention and
+ * a "main_body" entry block, ctx->builder is positioned at the end of that
+ * block, and the function is stored in ctx->main_function.
+ *
+ * Returns the newly created function.
+ */
+LLVMValueRef
+ac_build_main(const struct ac_shader_args *args,
+              struct ac_llvm_context *ctx,
+              enum ac_llvm_calling_convention convention,
+              const char *name, LLVMTypeRef ret_type,
+              LLVMModuleRef module)
+{
+   LLVMTypeRef param_types[AC_MAX_ARGS];
+
+   for (unsigned i = 0; i < args->arg_count; i++)
+      param_types[i] = arg_llvm_type(args->args[i].type, args->args[i].size, ctx);
+
+   LLVMTypeRef fn_type = LLVMFunctionType(ret_type, param_types, args->arg_count, 0);
+   LLVMValueRef fn = LLVMAddFunction(module, name, fn_type);
+
+   LLVMBasicBlockRef entry_block =
+      LLVMAppendBasicBlockInContext(ctx->context, fn, "main_body");
+   LLVMPositionBuilderAtEnd(ctx->builder, entry_block);
+
+   LLVMSetFunctionCallConv(fn, convention);
+
+   /* Only arguments in the SGPR file receive parameter attributes. */
+   for (unsigned i = 0; i < args->arg_count; i++) {
+      if (args->args[i].file != AC_ARG_SGPR)
+         continue;
+
+      LLVMValueRef param = LLVMGetParam(fn, i);
+      /* The attribute helper is given the parameter position plus one. */
+      const unsigned attr_index = i + 1;
+
+      ac_add_function_attr(ctx->context, fn, attr_index, AC_FUNC_ATTR_INREG);
+
+      if (LLVMGetTypeKind(LLVMTypeOf(param)) != LLVMPointerTypeKind)
+         continue;
+
+      /* Pointer arguments are additionally marked noalias, dereferenceable
+       * over the maximum range, and 32-aligned.
+       */
+      ac_add_function_attr(ctx->context, fn, attr_index, AC_FUNC_ATTR_NOALIAS);
+      ac_add_attr_dereferenceable(param, UINT64_MAX);
+      ac_add_attr_alignment(param, 32);
+   }
+
+   ctx->main_function = fn;
+
+   if (LLVM_VERSION_MAJOR >= 11) {
+      /* Enable denormals for FP16 and FP64: */
+      LLVMAddTargetDependentFunctionAttr(fn, "denormal-fp-math", "ieee,ieee");
+      /* Disable denormals for FP32: */
+      LLVMAddTargetDependentFunctionAttr(fn, "denormal-fp-math-f32",
+                                         "preserve-sign,preserve-sign");
+   }
+
+   return fn;
+}
+
+/* Emit "s_endpgm" at the current builder position, as a call to an inline-asm
+ * value flagged as having side effects, with no arguments and no result.
+ */
+void ac_build_s_endpgm(struct ac_llvm_context *ctx)
+{
+   LLVMTypeRef void_fn_type = LLVMFunctionType(ctx->voidt, NULL, 0, false);
+   LLVMValueRef endpgm_asm =
+      LLVMConstInlineAsm(void_fn_type, "s_endpgm", "", true, false);
+
+   LLVMBuildCall(ctx->builder, endpgm_asm, NULL, 0, "");
+}
+
+/* Return ac_build_bit_count() of the bits of "mask" that lie below bit
+ * position "index", i.e. of mask & ((1 << index) - 1).
+ *
+ * The arithmetic is done in the integer type of "mask"; "index" is
+ * zero-extended to that type first.
+ */
+LLVMValueRef ac_prefix_bitcount(struct ac_llvm_context *ctx,
+                                LLVMValueRef mask, LLVMValueRef index)
+{
+   LLVMBuilderRef builder = ctx->builder;
+   LLVMTypeRef mask_type = LLVMTypeOf(mask);
+   LLVMValueRef one = LLVMConstInt(mask_type, 1, 0);
+
+   LLVMValueRef wide_index = LLVMBuildZExt(builder, index, mask_type, "");
+   LLVMValueRef index_bit = LLVMBuildShl(builder, one, wide_index, "");
+   /* (1 << index) - 1 has exactly the bits below "index" set. */
+   LLVMValueRef below_index = LLVMBuildSub(builder, index_bit, one, "");
+   LLVMValueRef masked = LLVMBuildAnd(builder, mask, below_index, "");
+
+   return ac_build_bit_count(ctx, masked);
+}
+
+/* Compute the prefix sum of the "mask" bit array with 128 elements (bits).
+ *
+ * mask[0] is the low 64-bit half and mask[1] the high half. The result is the
+ * sum of the ac_build_bit_count() values of the bits of each half that lie
+ * below bit position "index".
+ */
+LLVMValueRef ac_prefix_bitcount_2x64(struct ac_llvm_context *ctx,
+                                     LLVMValueRef mask[2], LLVMValueRef index)
+{
+   LLVMBuilderRef builder = ctx->builder;
+#if 0
+   /* Reference version using i128. */
+   LLVMValueRef input_mask =
+      LLVMBuildBitCast(builder, ac_build_gather_values(ctx, mask, 2), ctx->i128, "");
+
+   return ac_prefix_bitcount(ctx, input_mask, index);
+#else
+   /* Optimized version using 2 64-bit masks. */
+   LLVMValueRef all_ones = LLVMConstInt(ctx->i64, UINT64_MAX, 0);
+   /* Bit position one past the end of each 64-bit half. */
+   LLVMValueRef half_end[2] = {
+      LLVMConstInt(ctx->i32, 64, 0),
+      LLVMConstInt(ctx->i32, 128, 0),
+   };
+   LLVMValueRef partial[2];
+
+   /* The first index that can have non-zero high bits in the prefix mask is 65. */
+   LLVMValueRef index_in_hi = LLVMBuildICmp(builder, LLVMIntUGT, index, half_end[0], "");
+   LLVMValueRef index_is_0 = LLVMBuildICmp(builder, LLVMIntEQ, index, ctx->i32_0, "");
+   LLVMValueRef lo_total = ac_build_bit_count(ctx, mask[0]);
+
+   for (unsigned half = 0; half < 2; half++) {
+      /* Build the 64-bit prefix mask of this half as ~0 >> (half_end - index).
+       *
+       * For the low half with index==0, the right shift by 64 doesn't give
+       * the desired result; the index_is_0 select below handles it.
+       * For the high half with index==64, same story; the last index_in_hi
+       * select handles it.
+       * For the low half with index==64, the shift is 0, which is what we want.
+       */
+      LLVMValueRef shift = LLVMBuildSub(builder, half_end[half], index, "");
+      LLVMValueRef shift64 = LLVMBuildZExt(builder, shift, ctx->i64, "");
+      LLVMValueRef keep_bits = LLVMBuildLShr(builder, all_ones, shift64, "");
+      LLVMValueRef kept = LLVMBuildAnd(builder, mask[half], keep_bits, "");
+
+      partial[half] = ac_build_bit_count(ctx, kept);
+   }
+
+   /* Low half: 0 for index==0, the whole half once index is past it. */
+   partial[0] = LLVMBuildSelect(builder, index_is_0, ctx->i32_0, partial[0], "");
+   partial[0] = LLVMBuildSelect(builder, index_in_hi, lo_total, partial[0], "");
+   /* High half: contributes only once index is past the low half. */
+   partial[1] = LLVMBuildSelect(builder, index_in_hi, partial[1], ctx->i32_0, "");
+
+   return LLVMBuildAdd(builder, partial[0], partial[1], "");
+#endif
+}
+
+/**
+ * Convert triangle strip indices to triangle indices. This is used to decompose
+ * triangle strips into triangles.
+ *
+ * The three entries of "index" are rewritten in place. "is_odd" and
+ * "flatshade_first" are used as select conditions.
+ */
+void ac_build_triangle_strip_indices_to_triangle(struct ac_llvm_context *ctx,
+                                                 LLVMValueRef is_odd,
+                                                 LLVMValueRef flatshade_first,
+                                                 LLVMValueRef index[3])
+{
+   LLVMBuilderRef builder = ctx->builder;
+
+   /* Capture the inputs first so the results can be stored straight back. */
+   LLVMValueRef v0 = index[0];
+   LLVMValueRef v1 = index[1];
+   LLVMValueRef v2 = index[2];
+
+   /* We need to change the vertex order for odd triangles to get correct
+    * front/back facing by swapping 2 vertex indices, but we also have to
+    * keep the provoking vertex in the same place.
+    *
+    * If the first vertex is provoking, swap index 1 and 2.
+    * If the last vertex is provoking, swap index 0 and 1.
+    */
+
+   /* Vertex 0: fixed when first-provoking, otherwise swapped with 1 if odd. */
+   LLVMValueRef last_prov_0 = LLVMBuildSelect(builder, is_odd, v1, v0, "");
+   index[0] = LLVMBuildSelect(builder, flatshade_first, v0, last_prov_0, "");
+
+   /* Vertex 1: takes part in the swap under both conventions. */
+   LLVMValueRef first_prov_1 = LLVMBuildSelect(builder, is_odd, v2, v1, "");
+   LLVMValueRef last_prov_1 = LLVMBuildSelect(builder, is_odd, v0, v1, "");
+   index[1] = LLVMBuildSelect(builder, flatshade_first, first_prov_1, last_prov_1, "");
+
+   /* Vertex 2: swapped with 1 if odd when first-provoking, otherwise fixed. */
+   LLVMValueRef first_prov_2 = LLVMBuildSelect(builder, is_odd, v1, v2, "");
+   index[2] = LLVMBuildSelect(builder, flatshade_first, first_prov_2, v2, "");
+}