X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Famd%2Fcommon%2Fac_nir_to_llvm.c;h=550c12d249c7a885b3fe1bdc9b143d58e8acf2a1;hb=bb750d265c5c27136cf10460fc423503a4d7d5aa;hp=6ab93b367831e3d1891c8e269f792b9a4b414c7d;hpb=30c1a93f6de66f0b9f86e11b517b62f57f330d95;p=mesa.git diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c index 6ab93b36783..550c12d249c 100644 --- a/src/amd/common/ac_nir_to_llvm.c +++ b/src/amd/common/ac_nir_to_llvm.c @@ -43,9 +43,6 @@ enum radeon_llvm_calling_convention { RADEON_LLVM_AMDGPU_HS = 93, }; -#define CONST_ADDR_SPACE 2 -#define LOCAL_ADDR_SPACE 3 - #define RADEON_LLVM_MAX_INPUTS (VARYING_SLOT_VAR31 + 1) #define RADEON_LLVM_MAX_OUTPUTS (VARYING_SLOT_VAR31 + 1) @@ -93,9 +90,6 @@ struct nir_to_llvm_context { LLVMValueRef ring_offsets; LLVMValueRef push_constants; LLVMValueRef view_index; - LLVMValueRef num_work_groups; - LLVMValueRef workgroup_ids[3]; - LLVMValueRef local_invocation_ids; LLVMValueRef tg_size; LLVMValueRef vertex_buffers; @@ -126,7 +120,6 @@ struct nir_to_llvm_context { LLVMValueRef hs_ring_tess_offchip; LLVMValueRef hs_ring_tess_factor; - LLVMValueRef prim_mask; LLVMValueRef sample_pos_offset; LLVMValueRef persp_sample, persp_center, persp_centroid; LLVMValueRef linear_sample, linear_center, linear_centroid; @@ -159,28 +152,6 @@ nir_to_llvm_context_from_abi(struct ac_shader_abi *abi) return container_of(abi, ctx, abi); } -static LLVMTypeRef -nir2llvmtype(struct ac_nir_context *ctx, - const struct glsl_type *type) -{ - switch (glsl_get_base_type(glsl_without_array(type))) { - case GLSL_TYPE_UINT: - case GLSL_TYPE_INT: - return ctx->ac.i32; - case GLSL_TYPE_UINT64: - case GLSL_TYPE_INT64: - return ctx->ac.i64; - case GLSL_TYPE_DOUBLE: - return ctx->ac.f64; - case GLSL_TYPE_FLOAT: - return ctx->ac.f32; - default: - assert(!"Unsupported type in nir2llvmtype()"); - break; - } - return 0; -} - static LLVMValueRef get_sampler_desc(struct ac_nir_context *ctx, const nir_deref_var *deref, enum ac_descriptor_type desc_type, @@ -323,14 +294,13 @@ create_llvm_function(LLVMContextRef ctx, LLVMModuleRef module, LLVMSetFunctionCallConv(main_function, RADEON_LLVM_AMDGPU_CS); for (unsigned i = 0; i < args->sgpr_count; ++i) { + ac_add_function_attr(ctx, main_function, i + 1, AC_FUNC_ATTR_INREG); + if (args->array_params_mask & (1 << i)) { LLVMValueRef P = LLVMGetParam(main_function, i); - ac_add_function_attr(ctx, main_function, i + 1, AC_FUNC_ATTR_BYVAL); + ac_add_function_attr(ctx, main_function, i + 1, AC_FUNC_ATTR_NOALIAS); ac_add_attr_dereferenceable(P, UINT64_MAX); } - else { - ac_add_function_attr(ctx, main_function, i + 1, AC_FUNC_ATTR_INREG); - } } if (max_workgroup_size) { @@ -352,16 +322,13 @@ create_llvm_function(LLVMContextRef ctx, LLVMModuleRef module, LLVMAddTargetDependentFunctionAttr(main_function, "unsafe-fp-math", "true"); + LLVMAddTargetDependentFunctionAttr(main_function, + "no-signed-zeros-fp-math", + "true"); } return main_function; } -static LLVMTypeRef const_array(LLVMTypeRef elem_type, int num_elements) -{ - return LLVMPointerType(LLVMArrayType(elem_type, num_elements), - CONST_ADDR_SPACE); -} - static int get_elem_bits(struct ac_llvm_context *ctx, LLVMTypeRef type) { if (LLVMGetTypeKind(type) == LLVMVectorTypeKind) @@ -540,8 +507,33 @@ struct user_sgpr_info { bool indirect_all_descriptor_sets; }; +static bool needs_view_index_sgpr(struct nir_to_llvm_context *ctx, + gl_shader_stage stage) +{ + switch (stage) { + case MESA_SHADER_VERTEX: + if (ctx->shader_info->info.needs_multiview_view_index || + (!ctx->options->key.vs.as_es && !ctx->options->key.vs.as_ls && ctx->options->key.has_multiview_view_index)) + return true; + break; + case MESA_SHADER_TESS_EVAL: + if (ctx->shader_info->info.needs_multiview_view_index || (!ctx->options->key.tes.as_es && ctx->options->key.has_multiview_view_index)) + return true; + break; + case MESA_SHADER_GEOMETRY: + case MESA_SHADER_TESS_CTRL: + if (ctx->shader_info->info.needs_multiview_view_index) + return true; + break; + default: + break; + } + return false; +} + static void allocate_user_sgprs(struct nir_to_llvm_context *ctx, gl_shader_stage stage, + bool needs_view_index, struct user_sgpr_info *user_sgpr_info) { memset(user_sgpr_info, 0, sizeof(struct user_sgpr_info)); @@ -597,6 +589,9 @@ static void allocate_user_sgprs(struct nir_to_llvm_context *ctx, break; } + if (needs_view_index) + user_sgpr_info->sgpr_count++; + if (ctx->shader_info->info.loads_push_constants) user_sgpr_info->sgpr_count += 2; @@ -620,7 +615,7 @@ declare_global_input_sgprs(struct nir_to_llvm_context *ctx, struct arg_info *args, LLVMValueRef *desc_sets) { - LLVMTypeRef type = const_array(ctx->ac.i8, 1024 * 1024); + LLVMTypeRef type = ac_array_in_const_addr_space(ctx->ac.i8); unsigned num_sets = ctx->options->layout ? ctx->options->layout->num_sets : 0; unsigned stage_mask = 1 << stage; @@ -637,7 +632,7 @@ declare_global_input_sgprs(struct nir_to_llvm_context *ctx, } } } else { - add_array_arg(args, const_array(type, 32), desc_sets); + add_array_arg(args, ac_array_in_const_addr_space(type), desc_sets); } if (ctx->shader_info->info.loads_push_constants) { @@ -657,7 +652,7 @@ declare_vs_specific_input_sgprs(struct nir_to_llvm_context *ctx, (stage == MESA_SHADER_VERTEX || (has_previous_stage && previous_stage == MESA_SHADER_VERTEX))) { if (ctx->shader_info->info.vs.has_vertex_buffers) { - add_arg(args, ARG_SGPR, const_array(ctx->ac.v4i32, 16), + add_arg(args, ARG_SGPR, ac_array_in_const_addr_space(ctx->ac.v4i32), &ctx->vertex_buffers); } add_arg(args, ARG_SGPR, ctx->ac.i32, &ctx->abi.base_vertex); @@ -768,11 +763,11 @@ static void create_function(struct nir_to_llvm_context *ctx, struct user_sgpr_info user_sgpr_info; struct arg_info args = {}; LLVMValueRef desc_sets; - - allocate_user_sgprs(ctx, stage, &user_sgpr_info); + bool needs_view_index = needs_view_index_sgpr(ctx, stage); + allocate_user_sgprs(ctx, stage, needs_view_index, &user_sgpr_info); if (user_sgpr_info.need_ring_offsets && !ctx->options->supports_spill) { - add_arg(&args, ARG_SGPR, const_array(ctx->ac.v4i32, 16), + add_arg(&args, ARG_SGPR, ac_array_in_const_addr_space(ctx->ac.v4i32), &ctx->ring_offsets); } @@ -784,21 +779,21 @@ static void create_function(struct nir_to_llvm_context *ctx, if (ctx->shader_info->info.cs.uses_grid_size) { add_arg(&args, ARG_SGPR, ctx->ac.v3i32, - &ctx->num_work_groups); + &ctx->abi.num_work_groups); } for (int i = 0; i < 3; i++) { - ctx->workgroup_ids[i] = NULL; + ctx->abi.workgroup_ids[i] = NULL; if (ctx->shader_info->info.cs.uses_block_id[i]) { add_arg(&args, ARG_SGPR, ctx->ac.i32, - &ctx->workgroup_ids[i]); + &ctx->abi.workgroup_ids[i]); } } if (ctx->shader_info->info.cs.uses_local_invocation_idx) add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->tg_size); add_arg(&args, ARG_VGPR, ctx->ac.v3i32, - &ctx->local_invocation_ids); + &ctx->abi.local_invocation_ids); break; case MESA_SHADER_VERTEX: declare_global_input_sgprs(ctx, stage, has_previous_stage, @@ -807,7 +802,7 @@ static void create_function(struct nir_to_llvm_context *ctx, declare_vs_specific_input_sgprs(ctx, stage, has_previous_stage, previous_stage, &args); - if (ctx->shader_info->info.needs_multiview_view_index || (!ctx->options->key.vs.as_es && !ctx->options->key.vs.as_ls && ctx->options->key.has_multiview_view_index)) + if (needs_view_index) add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->view_index); if (ctx->options->key.vs.as_es) add_arg(&args, ARG_SGPR, ctx->ac.i32, @@ -851,7 +846,7 @@ static void create_function(struct nir_to_llvm_context *ctx, &ctx->tcs_out_layout); add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->tcs_in_layout); - if (ctx->shader_info->info.needs_multiview_view_index) + if (needs_view_index) add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->view_index); @@ -876,7 +871,7 @@ static void create_function(struct nir_to_llvm_context *ctx, &ctx->tcs_out_layout); add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->tcs_in_layout); - if (ctx->shader_info->info.needs_multiview_view_index) + if (needs_view_index) add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->view_index); @@ -895,7 +890,7 @@ static void create_function(struct nir_to_llvm_context *ctx, &args, &desc_sets); add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->tcs_offchip_layout); - if (ctx->shader_info->info.needs_multiview_view_index || (!ctx->options->key.tes.as_es && ctx->options->key.has_multiview_view_index)) + if (needs_view_index) add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->view_index); if (ctx->options->key.tes.as_es) { @@ -942,7 +937,7 @@ static void create_function(struct nir_to_llvm_context *ctx, &ctx->gsvs_ring_stride); add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->gsvs_num_entries); - if (ctx->shader_info->info.needs_multiview_view_index) + if (needs_view_index) add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->view_index); @@ -973,7 +968,7 @@ static void create_function(struct nir_to_llvm_context *ctx, &ctx->gsvs_ring_stride); add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->gsvs_num_entries); - if (ctx->shader_info->info.needs_multiview_view_index) + if (needs_view_index) add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->view_index); @@ -1006,7 +1001,7 @@ static void create_function(struct nir_to_llvm_context *ctx, add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->sample_pos_offset); - add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->prim_mask); + add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->abi.prim_mask); add_arg(&args, ARG_VGPR, ctx->ac.v2i32, &ctx->persp_sample); add_arg(&args, ARG_VGPR, ctx->ac.v2i32, &ctx->persp_center); add_arg(&args, ARG_VGPR, ctx->ac.v2i32, &ctx->persp_centroid); @@ -1052,10 +1047,10 @@ static void create_function(struct nir_to_llvm_context *ctx, &user_sgpr_idx, 2); if (ctx->options->supports_spill) { ctx->ring_offsets = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.implicit.buffer.ptr", - LLVMPointerType(ctx->ac.i8, CONST_ADDR_SPACE), + LLVMPointerType(ctx->ac.i8, AC_CONST_ADDR_SPACE), NULL, 0, AC_FUNC_ATTR_READNONE); ctx->ring_offsets = LLVMBuildBitCast(ctx->builder, ctx->ring_offsets, - const_array(ctx->ac.v4i32, 16), ""); + ac_array_in_const_addr_space(ctx->ac.v4i32), ""); } } @@ -1362,27 +1357,47 @@ static LLVMValueRef emit_fsign(struct ac_llvm_context *ctx, } static LLVMValueRef emit_isign(struct ac_llvm_context *ctx, - LLVMValueRef src0) + LLVMValueRef src0, unsigned bitsize) { - LLVMValueRef cmp, val; + LLVMValueRef cmp, val, zero, one; + LLVMTypeRef type; - cmp = LLVMBuildICmp(ctx->builder, LLVMIntSGT, src0, ctx->i32_0, ""); - val = LLVMBuildSelect(ctx->builder, cmp, ctx->i32_1, src0, ""); - cmp = LLVMBuildICmp(ctx->builder, LLVMIntSGE, val, ctx->i32_0, ""); - val = LLVMBuildSelect(ctx->builder, cmp, val, LLVMConstInt(ctx->i32, -1, true), ""); + if (bitsize == 32) { + type = ctx->i32; + zero = ctx->i32_0; + one = ctx->i32_1; + } else { + type = ctx->i64; + zero = ctx->i64_0; + one = ctx->i64_1; + } + + cmp = LLVMBuildICmp(ctx->builder, LLVMIntSGT, src0, zero, ""); + val = LLVMBuildSelect(ctx->builder, cmp, one, src0, ""); + cmp = LLVMBuildICmp(ctx->builder, LLVMIntSGE, val, zero, ""); + val = LLVMBuildSelect(ctx->builder, cmp, val, LLVMConstInt(type, -1, true), ""); return val; } static LLVMValueRef emit_ffract(struct ac_llvm_context *ctx, - LLVMValueRef src0) + LLVMValueRef src0, unsigned bitsize) { - const char *intr = "llvm.floor.f32"; + LLVMTypeRef type; + char *intr; + + if (bitsize == 32) { + intr = "llvm.floor.f32"; + type = ctx->f32; + } else { + intr = "llvm.floor.f64"; + type = ctx->f64; + } + LLVMValueRef fsrc0 = ac_to_float(ctx, src0); LLVMValueRef params[] = { fsrc0, }; - LLVMValueRef floor = ac_build_intrinsic(ctx, intr, - ctx->f32, params, 1, + LLVMValueRef floor = ac_build_intrinsic(ctx, intr, type, params, 1, AC_FUNC_ATTR_READNONE); return LLVMBuildFSub(ctx->builder, fsrc0, floor, ""); } @@ -1422,9 +1437,15 @@ static LLVMValueRef emit_f2b(struct ac_llvm_context *ctx, } static LLVMValueRef emit_b2i(struct ac_llvm_context *ctx, - LLVMValueRef src0) + LLVMValueRef src0, + unsigned bitsize) { - return LLVMBuildAnd(ctx->builder, src0, ctx->i32_1, ""); + LLVMValueRef result = LLVMBuildAnd(ctx->builder, src0, ctx->i32_1, ""); + + if (bitsize == 32) + return result; + + return LLVMBuildZExt(ctx->builder, result, ctx->i64, ""); } static LLVMValueRef emit_i2b(struct ac_llvm_context *ctx, @@ -1720,11 +1741,6 @@ static void visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr) src[1] = ac_to_float(&ctx->ac, src[1]); result = LLVMBuildFMul(ctx->ac.builder, src[0], src[1], ""); break; - case nir_op_fdiv: - src[0] = ac_to_float(&ctx->ac, src[0]); - src[1] = ac_to_float(&ctx->ac, src[1]); - result = ac_build_fdiv(&ctx->ac, src[0], src[1]); - break; case nir_op_frcp: src[0] = ac_to_float(&ctx->ac, src[0]); result = ac_build_fdiv(&ctx->ac, instr->dest.dest.ssa.bit_size == 32 ? ctx->ac.f32_1 : ctx->ac.f64_1, @@ -1807,7 +1823,7 @@ static void visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr) result = emit_minmax_int(&ctx->ac, LLVMIntULT, src[0], src[1]); break; case nir_op_isign: - result = emit_isign(&ctx->ac, src[0]); + result = emit_isign(&ctx->ac, src[0], instr->dest.dest.ssa.bit_size); break; case nir_op_fsign: src[0] = ac_to_float(&ctx->ac, src[0]); @@ -1830,7 +1846,7 @@ static void visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr) ac_to_float_type(&ctx->ac, def_type),src[0]); break; case nir_op_ffract: - result = emit_ffract(&ctx->ac, src[0]); + result = emit_ffract(&ctx->ac, src[0], instr->dest.dest.ssa.bit_size); break; case nir_op_fsin: result = emit_intrin_1f_param(&ctx->ac, "llvm.sin", @@ -1865,18 +1881,24 @@ static void visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr) case nir_op_fmax: result = emit_intrin_2f_param(&ctx->ac, "llvm.maxnum", ac_to_float_type(&ctx->ac, def_type), src[0], src[1]); - if (instr->dest.dest.ssa.bit_size == 32) + if (ctx->ac.chip_class < GFX9 && + instr->dest.dest.ssa.bit_size == 32) { + /* Only pre-GFX9 chips do not flush denorms. */ result = emit_intrin_1f_param(&ctx->ac, "llvm.canonicalize", ac_to_float_type(&ctx->ac, def_type), result); + } break; case nir_op_fmin: result = emit_intrin_2f_param(&ctx->ac, "llvm.minnum", ac_to_float_type(&ctx->ac, def_type), src[0], src[1]); - if (instr->dest.dest.ssa.bit_size == 32) + if (ctx->ac.chip_class < GFX9 && + instr->dest.dest.ssa.bit_size == 32) { + /* Only pre-GFX9 chips do not flush denorms. */ result = emit_intrin_1f_param(&ctx->ac, "llvm.canonicalize", ac_to_float_type(&ctx->ac, def_type), result); + } break; case nir_op_ffma: result = emit_intrin_3f_param(&ctx->ac, "llvm.fmuladd", @@ -1979,7 +2001,7 @@ static void visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr) result = emit_f2b(&ctx->ac, src[0]); break; case nir_op_b2i: - result = emit_b2i(&ctx->ac, src[0]); + result = emit_b2i(&ctx->ac, src[0], instr->dest.dest.ssa.bit_size); break; case nir_op_i2b: src[0] = ac_to_integer(&ctx->ac, src[0]); @@ -2261,11 +2283,14 @@ static LLVMValueRef build_tex_intrinsic(struct ac_nir_context *ctx, struct ac_image_args *args) { if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) { + unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa); + return ac_build_buffer_load_format(&ctx->ac, args->resource, args->addr, ctx->ac.i32_0, - true); + util_last_bit(mask), + false, true); } args->opcode = ac_image_sample; @@ -2384,10 +2409,50 @@ static LLVMValueRef visit_load_push_constant(struct nir_to_llvm_context *ctx, static LLVMValueRef visit_get_buffer_size(struct ac_nir_context *ctx, const nir_intrinsic_instr *instr) { - LLVMValueRef ptr = get_src(ctx, instr->src[0]); + LLVMValueRef index = get_src(ctx, instr->src[0]); + + return get_buffer_size(ctx, ctx->abi->load_ssbo(ctx->abi, index, false), false); +} + +static uint32_t widen_mask(uint32_t mask, unsigned multiplier) +{ + uint32_t new_mask = 0; + for(unsigned i = 0; i < 32 && (1u << i) <= mask; ++i) + if (mask & (1u << i)) + new_mask |= ((1u << multiplier) - 1u) << (i * multiplier); + return new_mask; +} + +static LLVMValueRef extract_vector_range(struct ac_llvm_context *ctx, LLVMValueRef src, + unsigned start, unsigned count) +{ + LLVMTypeRef type = LLVMTypeOf(src); + + if (LLVMGetTypeKind(type) != LLVMVectorTypeKind) { + assert(start == 0); + assert(count == 1); + return src; + } + + unsigned src_elements = LLVMGetVectorSize(type); + assert(start < src_elements); + assert(start + count <= src_elements); - return get_buffer_size(ctx, LLVMBuildLoad(ctx->ac.builder, ptr, ""), false); + if (start == 0 && count == src_elements) + return src; + + if (count == 1) + return LLVMBuildExtractElement(ctx->builder, src, LLVMConstInt(ctx->i32, start, false), ""); + + assert(count <= 8); + LLVMValueRef indices[8]; + for (unsigned i = 0; i < count; ++i) + indices[i] = LLVMConstInt(ctx->i32, start + i, false); + + LLVMValueRef swizzle = LLVMConstVector(indices, count); + return LLVMBuildShuffleVector(ctx->builder, src, src, swizzle, ""); } + static void visit_store_ssbo(struct ac_nir_context *ctx, nir_intrinsic_instr *instr) { @@ -2409,6 +2474,8 @@ static void visit_store_ssbo(struct ac_nir_context *ctx, if (components_32bit > 1) data_type = LLVMVectorType(ctx->ac.f32, components_32bit); + writemask = widen_mask(writemask, elem_size_mult); + base_data = ac_to_float(&ctx->ac, src_data); base_data = trim_vector(&ctx->ac, base_data, instr->num_components); base_data = LLVMBuildBitCast(ctx->ac.builder, base_data, @@ -2418,7 +2485,7 @@ static void visit_store_ssbo(struct ac_nir_context *ctx, int start, count; LLVMValueRef data; LLVMValueRef offset; - LLVMValueRef tmp; + u_bit_scan_consecutive_range(&writemask, &start, &count); /* Due to an LLVM limitation, split 3-element writes @@ -2428,9 +2495,6 @@ static void visit_store_ssbo(struct ac_nir_context *ctx, count = 2; } - start *= elem_size_mult; - count *= elem_size_mult; - if (count > 4) { writemask |= ((1u << (count - 4)) - 1u) << (start + 4); count = 4; @@ -2438,28 +2502,14 @@ static void visit_store_ssbo(struct ac_nir_context *ctx, if (count == 4) { store_name = "llvm.amdgcn.buffer.store.v4f32"; - data = base_data; } else if (count == 2) { - tmp = LLVMBuildExtractElement(ctx->ac.builder, - base_data, LLVMConstInt(ctx->ac.i32, start, false), ""); - data = LLVMBuildInsertElement(ctx->ac.builder, LLVMGetUndef(ctx->ac.v2f32), tmp, - ctx->ac.i32_0, ""); - - tmp = LLVMBuildExtractElement(ctx->ac.builder, - base_data, LLVMConstInt(ctx->ac.i32, start + 1, false), ""); - data = LLVMBuildInsertElement(ctx->ac.builder, data, tmp, - ctx->ac.i32_1, ""); store_name = "llvm.amdgcn.buffer.store.v2f32"; } else { assert(count == 1); - if (ac_get_llvm_num_components(base_data) > 1) - data = LLVMBuildExtractElement(ctx->ac.builder, base_data, - LLVMConstInt(ctx->ac.i32, start, false), ""); - else - data = base_data; store_name = "llvm.amdgcn.buffer.store.f32"; } + data = extract_vector_range(&ctx->ac, base_data, start, count); offset = base_offset; if (start != 0) { @@ -2568,8 +2618,7 @@ static LLVMValueRef visit_load_buffer(struct ac_nir_context *ctx, ctx->ac.i1false, }; - results[i] = ac_build_intrinsic(&ctx->ac, load_name, data_type, params, 5, 0); - + results[i > 0 ? 1 : 0] = ac_build_intrinsic(&ctx->ac, load_name, data_type, params, 5, 0); } assume(results[0]); @@ -2607,7 +2656,7 @@ static LLVMValueRef visit_load_ubo_buffer(struct ac_nir_context *ctx, ret = ac_build_buffer_load(&ctx->ac, rsrc, num_components, NULL, offset, NULL, 0, false, false, true, true); - + ret = trim_vector(&ctx->ac, ret, num_components); return LLVMBuildBitCast(ctx->ac.builder, ret, get_def_type(ctx, &instr->dest.ssa), ""); } @@ -2821,72 +2870,45 @@ get_dw_address(struct nir_to_llvm_context *ctx, } static LLVMValueRef -load_tcs_input(struct ac_shader_abi *abi, - LLVMValueRef vertex_index, - LLVMValueRef indir_index, - unsigned const_index, - unsigned location, - unsigned driver_location, - unsigned component, - unsigned num_components, - bool is_patch, - bool is_compact) +load_tcs_varyings(struct ac_shader_abi *abi, + LLVMValueRef vertex_index, + LLVMValueRef indir_index, + unsigned const_index, + unsigned location, + unsigned driver_location, + unsigned component, + unsigned num_components, + bool is_patch, + bool is_compact, + bool load_input) { struct nir_to_llvm_context *ctx = nir_to_llvm_context_from_abi(abi); LLVMValueRef dw_addr, stride; LLVMValueRef value[4], result; unsigned param = shader_io_get_unique_index(location); - stride = unpack_param(&ctx->ac, ctx->tcs_in_layout, 13, 8); - dw_addr = get_tcs_in_current_patch_offset(ctx); - dw_addr = get_dw_address(ctx, dw_addr, param, const_index, is_compact, vertex_index, stride, - indir_index); - - for (unsigned i = 0; i < num_components + component; i++) { - value[i] = ac_lds_load(&ctx->ac, dw_addr); - dw_addr = LLVMBuildAdd(ctx->builder, dw_addr, - ctx->ac.i32_1, ""); - } - result = ac_build_varying_gather_values(&ctx->ac, value, num_components, component); - return result; -} - -static LLVMValueRef -load_tcs_output(struct nir_to_llvm_context *ctx, - nir_intrinsic_instr *instr) -{ - LLVMValueRef dw_addr; - LLVMValueRef stride = NULL; - LLVMValueRef value[4], result; - LLVMValueRef vertex_index = NULL; - LLVMValueRef indir_index = NULL; - unsigned const_index = 0; - unsigned param; - const bool per_vertex = nir_is_per_vertex_io(instr->variables[0]->var, ctx->stage); - const bool is_compact = instr->variables[0]->var->data.compact; - param = shader_io_get_unique_index(instr->variables[0]->var->data.location); - get_deref_offset(ctx->nir, instr->variables[0], - false, NULL, per_vertex ? &vertex_index : NULL, - &const_index, &indir_index); - - if (!instr->variables[0]->var->data.patch) { - stride = unpack_param(&ctx->ac, ctx->tcs_out_layout, 13, 8); - dw_addr = get_tcs_out_current_patch_offset(ctx); + if (load_input) { + stride = unpack_param(&ctx->ac, ctx->tcs_in_layout, 13, 8); + dw_addr = get_tcs_in_current_patch_offset(ctx); } else { - dw_addr = get_tcs_out_current_patch_data_offset(ctx); + if (!is_patch) { + stride = unpack_param(&ctx->ac, ctx->tcs_out_layout, 13, 8); + dw_addr = get_tcs_out_current_patch_offset(ctx); + } else { + dw_addr = get_tcs_out_current_patch_data_offset(ctx); + stride = NULL; + } } dw_addr = get_dw_address(ctx, dw_addr, param, const_index, is_compact, vertex_index, stride, indir_index); - unsigned comp = instr->variables[0]->var->data.location_frac; - for (unsigned i = comp; i < instr->num_components + comp; i++) { + for (unsigned i = 0; i < num_components + component; i++) { value[i] = ac_lds_load(&ctx->ac, dw_addr); dw_addr = LLVMBuildAdd(ctx->builder, dw_addr, ctx->ac.i32_1, ""); } - result = ac_build_varying_gather_values(&ctx->ac, value, instr->num_components, comp); - result = LLVMBuildBitCast(ctx->builder, result, get_def_type(ctx->nir, &instr->dest.ssa), ""); + result = ac_build_varying_gather_values(&ctx->ac, value, num_components, component); return result; } @@ -2950,16 +2972,17 @@ store_tcs_output(struct ac_shader_abi *abi, continue; LLVMValueRef value = ac_llvm_extract_elem(&ctx->ac, src, chan - component); - if (store_lds || is_tess_factor) - ac_lds_store(&ctx->ac, dw_addr, value); + if (store_lds || is_tess_factor) { + LLVMValueRef dw_addr_chan = + LLVMBuildAdd(ctx->builder, dw_addr, + LLVMConstInt(ctx->ac.i32, chan, false), ""); + ac_lds_store(&ctx->ac, dw_addr_chan, value); + } if (!is_tess_factor && writemask != 0xF) ac_build_buffer_store_dword(&ctx->ac, ctx->hs_ring_tess_offchip, value, 1, buf_addr, ctx->oc_lds, 4 * (base + chan), 1, 0, true, false); - - dw_addr = LLVMBuildAdd(ctx->builder, dw_addr, - ctx->ac.i32_1, ""); } if (writemask == 0xF) { @@ -2979,7 +3002,8 @@ load_tes_input(struct ac_shader_abi *abi, unsigned component, unsigned num_components, bool is_patch, - bool is_compact) + bool is_compact, + bool load_input) { struct nir_to_llvm_context *ctx = nir_to_llvm_context_from_abi(abi); LLVMValueRef buf_addr; @@ -3015,7 +3039,6 @@ load_gs_input(struct ac_shader_abi *abi, { struct nir_to_llvm_context *ctx = nir_to_llvm_context_from_abi(abi); LLVMValueRef vtx_offset; - LLVMValueRef args[9]; unsigned param, vtx_offset_param; LLVMValueRef value[4], result; @@ -3033,24 +3056,23 @@ load_gs_input(struct ac_shader_abi *abi, LLVMConstInt(ctx->ac.i32, param * 4 + i + const_index, 0), ""); value[i] = ac_lds_load(&ctx->ac, dw_addr); } else { - args[0] = ctx->esgs_ring; - args[1] = vtx_offset; - args[2] = LLVMConstInt(ctx->ac.i32, (param * 4 + i + const_index) * 256, false); - args[3] = ctx->ac.i32_0; - args[4] = ctx->ac.i32_1; /* OFFEN */ - args[5] = ctx->ac.i32_0; /* IDXEN */ - args[6] = ctx->ac.i32_1; /* GLC */ - args[7] = ctx->ac.i32_0; /* SLC */ - args[8] = ctx->ac.i32_0; /* TFE */ - - value[i] = ac_build_intrinsic(&ctx->ac, "llvm.SI.buffer.load.dword.i32.i32", - ctx->ac.i32, args, 9, - AC_FUNC_ATTR_READONLY | - AC_FUNC_ATTR_LEGACY); + LLVMValueRef soffset = + LLVMConstInt(ctx->ac.i32, + (param * 4 + i + const_index) * 256, + false); + + value[i] = ac_build_buffer_load(&ctx->ac, + ctx->esgs_ring, 1, + ctx->ac.i32_0, + vtx_offset, soffset, + 0, 1, 0, true, false); + + value[i] = LLVMBuildBitCast(ctx->builder, value[i], + type, ""); } } result = ac_build_varying_gather_values(&ctx->ac, value, num_components, component); - + result = ac_to_integer(&ctx->ac, result); return result; } @@ -3093,6 +3115,31 @@ build_gep_for_deref(struct ac_nir_context *ctx, return val; } +static LLVMValueRef load_tess_varyings(struct ac_nir_context *ctx, + nir_intrinsic_instr *instr, + bool load_inputs) +{ + LLVMValueRef result; + LLVMValueRef vertex_index = NULL; + LLVMValueRef indir_index = NULL; + unsigned const_index = 0; + unsigned location = instr->variables[0]->var->data.location; + unsigned driver_location = instr->variables[0]->var->data.driver_location; + const bool is_patch = instr->variables[0]->var->data.patch; + const bool is_compact = instr->variables[0]->var->data.compact; + + get_deref_offset(ctx, instr->variables[0], + false, NULL, is_patch ? NULL : &vertex_index, + &const_index, &indir_index); + + result = ctx->abi->load_tess_varyings(ctx->abi, vertex_index, indir_index, + const_index, location, driver_location, + instr->variables[0]->var->data.location_frac, + instr->num_components, + is_patch, is_compact, load_inputs); + return LLVMBuildBitCast(ctx->ac.builder, result, get_def_type(ctx, &instr->dest.ssa), ""); +} + static LLVMValueRef visit_load_var(struct ac_nir_context *ctx, nir_intrinsic_instr *instr) { @@ -3116,38 +3163,21 @@ static LLVMValueRef visit_load_var(struct ac_nir_context *ctx, case nir_var_shader_in: if (ctx->stage == MESA_SHADER_TESS_CTRL || ctx->stage == MESA_SHADER_TESS_EVAL) { - LLVMValueRef result; - LLVMValueRef vertex_index = NULL; - LLVMValueRef indir_index = NULL; - unsigned const_index = 0; - unsigned location = instr->variables[0]->var->data.location; - unsigned driver_location = instr->variables[0]->var->data.driver_location; - const bool is_patch = instr->variables[0]->var->data.patch; - const bool is_compact = instr->variables[0]->var->data.compact; + return load_tess_varyings(ctx, instr, true); + } + if (ctx->stage == MESA_SHADER_GEOMETRY) { + LLVMTypeRef type = LLVMIntTypeInContext(ctx->ac.context, instr->dest.ssa.bit_size); + LLVMValueRef indir_index; + unsigned const_index, vertex_index; get_deref_offset(ctx, instr->variables[0], - false, NULL, is_patch ? NULL : &vertex_index, + false, &vertex_index, NULL, &const_index, &indir_index); - result = ctx->abi->load_tess_inputs(ctx->abi, vertex_index, indir_index, - const_index, location, driver_location, - instr->variables[0]->var->data.location_frac, - instr->num_components, - is_patch, is_compact); - return LLVMBuildBitCast(ctx->ac.builder, result, get_def_type(ctx, &instr->dest.ssa), ""); - } - - if (ctx->stage == MESA_SHADER_GEOMETRY) { - LLVMValueRef indir_index; - unsigned const_index, vertex_index; - get_deref_offset(ctx, instr->variables[0], - false, &vertex_index, NULL, - &const_index, &indir_index); return ctx->abi->load_inputs(ctx->abi, instr->variables[0]->var->data.location, instr->variables[0]->var->data.driver_location, instr->variables[0]->var->data.location_frac, ve, - vertex_index, const_index, - nir2llvmtype(ctx, instr->variables[0]->var->type)); + vertex_index, const_index, type); } for (unsigned chan = comp; chan < ve + comp; chan++) { @@ -3194,8 +3224,9 @@ static LLVMValueRef visit_load_var(struct ac_nir_context *ctx, ""); } case nir_var_shader_out: - if (ctx->stage == MESA_SHADER_TESS_CTRL) - return load_tcs_output(ctx->nctx, instr); + if (ctx->stage == MESA_SHADER_TESS_CTRL) { + return load_tess_varyings(ctx, instr, false); + } for (unsigned chan = comp; chan < ve + comp; chan++) { if (indir_index) { @@ -3238,17 +3269,12 @@ visit_store_var(struct ac_nir_context *ctx, NULL, NULL, &const_index, &indir_index); if (get_elem_bits(&ctx->ac, LLVMTypeOf(src)) == 64) { - int old_writemask = writemask; src = LLVMBuildBitCast(ctx->ac.builder, src, LLVMVectorType(ctx->ac.f32, ac_get_llvm_num_components(src) * 2), ""); - writemask = 0; - for (unsigned chan = 0; chan < 4; chan++) { - if (old_writemask & (1 << chan)) - writemask |= 3u << (2 * chan); - } + writemask = widen_mask(writemask, 2); } switch (instr->variables[0]->var->data.mode) { @@ -3470,9 +3496,7 @@ static LLVMValueRef adjust_sample_index_using_fmask(struct ac_llvm_context *ctx, static LLVMValueRef get_image_coords(struct ac_nir_context *ctx, const nir_intrinsic_instr *instr) { - const struct glsl_type *type = instr->variables[0]->var->type; - if(instr->variables[0]->deref.child) - type = instr->variables[0]->deref.child->type; + const struct glsl_type *type = glsl_without_array(instr->variables[0]->var->type); LLVMValueRef src0 = get_src(ctx, instr->src[0]); LLVMValueRef coords[4]; @@ -3574,23 +3598,31 @@ static LLVMValueRef visit_image_load(struct ac_nir_context *ctx, type = instr->variables[0]->deref.child->type; type = glsl_without_array(type); - if (glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_BUF) { - params[0] = get_sampler_desc(ctx, instr->variables[0], AC_DESC_BUFFER, NULL, true, false); - params[1] = LLVMBuildExtractElement(ctx->ac.builder, get_src(ctx, instr->src[0]), - ctx->ac.i32_0, ""); /* vindex */ - params[2] = ctx->ac.i32_0; /* voffset */ - params[3] = ctx->ac.i1false; /* glc */ - params[4] = ctx->ac.i1false; /* slc */ - res = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.buffer.load.format.v4f32", ctx->ac.v4f32, - params, 5, 0); + + const enum glsl_sampler_dim dim = glsl_get_sampler_dim(type); + if (dim == GLSL_SAMPLER_DIM_BUF) { + unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa); + unsigned num_channels = util_last_bit(mask); + LLVMValueRef rsrc, vindex; + + rsrc = get_sampler_desc(ctx, instr->variables[0], AC_DESC_BUFFER, NULL, true, false); + vindex = LLVMBuildExtractElement(ctx->ac.builder, get_src(ctx, instr->src[0]), + ctx->ac.i32_0, ""); + + /* TODO: set "glc" and "can_speculate" when OpenGL needs it. */ + res = ac_build_buffer_load_format(&ctx->ac, rsrc, vindex, + ctx->ac.i32_0, num_channels, + false, false); + res = ac_build_expand_to_vec4(&ctx->ac, res, num_channels); res = trim_vector(&ctx->ac, res, instr->dest.ssa.num_components); res = ac_to_integer(&ctx->ac, res); } else { bool is_da = glsl_sampler_type_is_array(type) || - glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_CUBE || - glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_SUBPASS || - glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_SUBPASS_MS; + dim == GLSL_SAMPLER_DIM_CUBE || + dim == GLSL_SAMPLER_DIM_3D || + dim == GLSL_SAMPLER_DIM_SUBPASS || + dim == GLSL_SAMPLER_DIM_SUBPASS_MS; LLVMValueRef da = is_da ? ctx->ac.i1true : ctx->ac.i1false; LLVMValueRef glc = ctx->ac.i1false; LLVMValueRef slc = ctx->ac.i1false; @@ -3598,18 +3630,10 @@ static LLVMValueRef visit_image_load(struct ac_nir_context *ctx, params[0] = get_image_coords(ctx, instr); params[1] = get_sampler_desc(ctx, instr->variables[0], AC_DESC_IMAGE, NULL, true, false); params[2] = LLVMConstInt(ctx->ac.i32, 15, false); /* dmask */ - if (HAVE_LLVM <= 0x0309) { - params[3] = ctx->ac.i1false; /* r128 */ - params[4] = da; - params[5] = glc; - params[6] = slc; - } else { - LLVMValueRef lwe = ctx->ac.i1false; - params[3] = glc; - params[4] = slc; - params[5] = lwe; - params[6] = da; - } + params[3] = glc; + params[4] = slc; + params[5] = ctx->ac.i1false; + params[6] = da; ac_get_image_intr_name("llvm.amdgcn.image.load", ctx->ac.v4f32, /* vdata */ @@ -3630,12 +3654,13 @@ static void visit_image_store(struct ac_nir_context *ctx, char intrinsic_name[64]; const nir_variable *var = instr->variables[0]->var; const struct glsl_type *type = glsl_without_array(var->type); + const enum glsl_sampler_dim dim = glsl_get_sampler_dim(type); LLVMValueRef glc = ctx->ac.i1false; bool force_glc = ctx->ac.chip_class == SI; if (force_glc) glc = ctx->ac.i1true; - if (glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_BUF) { + if (dim == GLSL_SAMPLER_DIM_BUF) { params[0] = ac_to_float(&ctx->ac, get_src(ctx, instr->src[2])); /* data */ params[1] = get_sampler_desc(ctx, instr->variables[0], AC_DESC_BUFFER, NULL, true, true); params[2] = LLVMBuildExtractElement(ctx->ac.builder, get_src(ctx, instr->src[0]), @@ -3647,7 +3672,8 @@ static void visit_image_store(struct ac_nir_context *ctx, params, 6, 0); } else { bool is_da = glsl_sampler_type_is_array(type) || - glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_CUBE; + dim == GLSL_SAMPLER_DIM_CUBE || + dim == GLSL_SAMPLER_DIM_3D; LLVMValueRef da = is_da ? ctx->ac.i1true : ctx->ac.i1false; LLVMValueRef slc = ctx->ac.i1false; @@ -3655,18 +3681,10 @@ static void visit_image_store(struct ac_nir_context *ctx, params[1] = get_image_coords(ctx, instr); /* coords */ params[2] = get_sampler_desc(ctx, instr->variables[0], AC_DESC_IMAGE, NULL, true, true); params[3] = LLVMConstInt(ctx->ac.i32, 15, false); /* dmask */ - if (HAVE_LLVM <= 0x0309) { - params[4] = ctx->ac.i1false; /* r128 */ - params[5] = da; - params[6] = glc; - params[7] = slc; - } else { - LLVMValueRef lwe = ctx->ac.i1false; - params[4] = glc; - params[5] = slc; - params[6] = lwe; - params[7] = da; - } + params[4] = glc; + params[5] = slc; + params[6] = ctx->ac.i1false; + params[7] = da; ac_get_image_intr_name("llvm.amdgcn.image.store", LLVMTypeOf(params[0]), /* vdata */ @@ -3768,7 +3786,8 @@ static LLVMValueRef visit_image_size(struct ac_nir_context *ctx, const nir_variable *var = instr->variables[0]->var; const struct glsl_type *type = instr->variables[0]->var->type; bool da = glsl_sampler_type_is_array(var->type) || - glsl_get_sampler_dim(var->type) == GLSL_SAMPLER_DIM_CUBE; + glsl_get_sampler_dim(var->type) == GLSL_SAMPLER_DIM_CUBE || + glsl_get_sampler_dim(var->type) == GLSL_SAMPLER_DIM_3D; if(instr->variables[0]->deref.child) type = instr->variables[0]->deref.child->type; @@ -3850,17 +3869,34 @@ static void emit_barrier(struct ac_llvm_context *ac, gl_shader_stage stage) ac->voidt, NULL, 0, AC_FUNC_ATTR_CONVERGENT); } -static void emit_discard_if(struct ac_nir_context *ctx, - const nir_intrinsic_instr *instr) +static void emit_discard(struct ac_nir_context *ctx, + const nir_intrinsic_instr *instr) { LLVMValueRef cond; - cond = LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, - get_src(ctx, instr->src[0]), - ctx->ac.i32_0, ""); + if (instr->intrinsic == nir_intrinsic_discard_if) { + cond = LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, + get_src(ctx, instr->src[0]), + ctx->ac.i32_0, ""); + } else { + assert(instr->intrinsic == nir_intrinsic_discard); + cond = LLVMConstInt(ctx->ac.i1, false, 0); + } + ac_build_kill_if_false(&ctx->ac, cond); } +static LLVMValueRef +visit_load_helper_invocation(struct ac_nir_context *ctx) +{ + LLVMValueRef result = ac_build_intrinsic(&ctx->ac, + "llvm.amdgcn.ps.live", + ctx->ac.i1, NULL, 0, + AC_FUNC_ATTR_READNONE); + result = LLVMBuildNot(ctx->ac.builder, result, ""); + return LLVMBuildSExt(ctx->ac.builder, result, ctx->ac.i32, ""); +} + static LLVMValueRef visit_load_local_invocation_index(struct nir_to_llvm_context *ctx) { @@ -3927,13 +3963,11 @@ static LLVMValueRef visit_var_atomic(struct nir_to_llvm_context *ctx, return result; } -#define INTERP_CENTER 0 -#define INTERP_CENTROID 1 -#define INTERP_SAMPLE 2 - -static LLVMValueRef lookup_interp_param(struct nir_to_llvm_context *ctx, +static LLVMValueRef lookup_interp_param(struct ac_shader_abi *abi, enum glsl_interp_mode interp, unsigned location) { + struct nir_to_llvm_context *ctx = nir_to_llvm_context_from_abi(abi); + switch (interp) { case INTERP_MODE_FLAT: default: @@ -3959,14 +3993,16 @@ static LLVMValueRef lookup_interp_param(struct nir_to_llvm_context *ctx, return NULL; } -static LLVMValueRef load_sample_position(struct nir_to_llvm_context *ctx, +static LLVMValueRef load_sample_position(struct ac_shader_abi *abi, LLVMValueRef sample_id) { + struct nir_to_llvm_context *ctx = nir_to_llvm_context_from_abi(abi); + LLVMValueRef result; LLVMValueRef ptr = ac_build_gep0(&ctx->ac, ctx->ring_offsets, LLVMConstInt(ctx->ac.i32, RING_PS_SAMPLE_POSITIONS, false)); ptr = LLVMBuildBitCast(ctx->builder, ptr, - const_array(ctx->ac.v2f32, 64), ""); + ac_array_in_const_addr_space(ctx->ac.v2f32), ""); sample_id = LLVMBuildAdd(ctx->builder, sample_id, ctx->sample_pos_offset, ""); result = ac_build_load_invariant(&ctx->ac, ptr, sample_id); @@ -3978,12 +4014,36 @@ static LLVMValueRef load_sample_pos(struct ac_nir_context *ctx) { LLVMValueRef values[2]; - values[0] = emit_ffract(&ctx->ac, ctx->abi->frag_pos[0]); - values[1] = emit_ffract(&ctx->ac, ctx->abi->frag_pos[1]); + values[0] = emit_ffract(&ctx->ac, ctx->abi->frag_pos[0], 32); + values[1] = emit_ffract(&ctx->ac, ctx->abi->frag_pos[1], 32); return ac_build_gather_values(&ctx->ac, values, 2); } -static LLVMValueRef visit_interp(struct nir_to_llvm_context *ctx, +static LLVMValueRef load_sample_mask_in(struct ac_nir_context *ctx) +{ + uint8_t log2_ps_iter_samples = ctx->nctx->shader_info->info.ps.force_persample ? ctx->nctx->options->key.fs.log2_num_samples : ctx->nctx->options->key.fs.log2_ps_iter_samples; + + /* The bit pattern matches that used by fixed function fragment + * processing. */ + static const uint16_t ps_iter_masks[] = { + 0xffff, /* not used */ + 0x5555, + 0x1111, + 0x0101, + 0x0001, + }; + assert(log2_ps_iter_samples < ARRAY_SIZE(ps_iter_masks)); + + uint32_t ps_iter_mask = ps_iter_masks[log2_ps_iter_samples]; + + LLVMValueRef result, sample_id; + sample_id = unpack_param(&ctx->ac, ctx->abi->ancillary, 8, 4); + sample_id = LLVMBuildShl(ctx->ac.builder, LLVMConstInt(ctx->ac.i32, ps_iter_mask, false), sample_id, ""); + result = LLVMBuildAnd(ctx->ac.builder, sample_id, ctx->abi->sample_coverage, ""); + return result; +} + +static LLVMValueRef visit_interp(struct ac_nir_context *ctx, const nir_intrinsic_instr *instr) { LLVMValueRef result[4]; @@ -4001,33 +4061,33 @@ static LLVMValueRef visit_interp(struct nir_to_llvm_context *ctx, case nir_intrinsic_interp_var_at_sample: case nir_intrinsic_interp_var_at_offset: location = INTERP_CENTER; - src0 = get_src(ctx->nir, instr->src[0]); + src0 = get_src(ctx, instr->src[0]); break; default: break; } if (instr->intrinsic == nir_intrinsic_interp_var_at_offset) { - src_c0 = ac_to_float(&ctx->ac, LLVMBuildExtractElement(ctx->builder, src0, ctx->ac.i32_0, "")); - src_c1 = ac_to_float(&ctx->ac, LLVMBuildExtractElement(ctx->builder, src0, ctx->ac.i32_1, "")); + src_c0 = ac_to_float(&ctx->ac, LLVMBuildExtractElement(ctx->ac.builder, src0, ctx->ac.i32_0, "")); + src_c1 = ac_to_float(&ctx->ac, LLVMBuildExtractElement(ctx->ac.builder, src0, ctx->ac.i32_1, "")); } else if (instr->intrinsic == nir_intrinsic_interp_var_at_sample) { LLVMValueRef sample_position; LLVMValueRef halfval = LLVMConstReal(ctx->ac.f32, 0.5f); /* fetch sample ID */ - sample_position = load_sample_position(ctx, src0); + sample_position = ctx->abi->load_sample_position(ctx->abi, src0); - src_c0 = LLVMBuildExtractElement(ctx->builder, sample_position, ctx->ac.i32_0, ""); - src_c0 = LLVMBuildFSub(ctx->builder, src_c0, halfval, ""); - src_c1 = LLVMBuildExtractElement(ctx->builder, sample_position, ctx->ac.i32_1, ""); - src_c1 = LLVMBuildFSub(ctx->builder, src_c1, halfval, ""); + src_c0 = LLVMBuildExtractElement(ctx->ac.builder, sample_position, ctx->ac.i32_0, ""); + src_c0 = LLVMBuildFSub(ctx->ac.builder, src_c0, halfval, ""); + src_c1 = LLVMBuildExtractElement(ctx->ac.builder, sample_position, ctx->ac.i32_1, ""); + src_c1 = LLVMBuildFSub(ctx->ac.builder, src_c1, halfval, ""); } - interp_param = lookup_interp_param(ctx, instr->variables[0]->var->data.interpolation, location); + interp_param = ctx->abi->lookup_interp_param(ctx->abi, instr->variables[0]->var->data.interpolation, location); attr_number = LLVMConstInt(ctx->ac.i32, input_index, false); if (location == INTERP_CENTER) { LLVMValueRef ij_out[2]; - LLVMValueRef ddxy_out = emit_ddxy_interp(ctx->nir, interp_param); + LLVMValueRef ddxy_out = emit_ddxy_interp(ctx, interp_param); /* * take the I then J parameters, and the DDX/Y for it, and @@ -4040,24 +4100,24 @@ static LLVMValueRef visit_interp(struct nir_to_llvm_context *ctx, for (unsigned i = 0; i < 2; i++) { LLVMValueRef ix_ll = LLVMConstInt(ctx->ac.i32, i, false); LLVMValueRef iy_ll = LLVMConstInt(ctx->ac.i32, i + 2, false); - LLVMValueRef ddx_el = LLVMBuildExtractElement(ctx->builder, + LLVMValueRef ddx_el = LLVMBuildExtractElement(ctx->ac.builder, ddxy_out, ix_ll, ""); - LLVMValueRef ddy_el = LLVMBuildExtractElement(ctx->builder, + LLVMValueRef ddy_el = LLVMBuildExtractElement(ctx->ac.builder, ddxy_out, iy_ll, ""); - LLVMValueRef interp_el = LLVMBuildExtractElement(ctx->builder, + LLVMValueRef interp_el = LLVMBuildExtractElement(ctx->ac.builder, interp_param, ix_ll, ""); LLVMValueRef temp1, temp2; - interp_el = LLVMBuildBitCast(ctx->builder, interp_el, + interp_el = LLVMBuildBitCast(ctx->ac.builder, interp_el, ctx->ac.f32, ""); - temp1 = LLVMBuildFMul(ctx->builder, ddx_el, src_c0, ""); - temp1 = LLVMBuildFAdd(ctx->builder, temp1, interp_el, ""); + temp1 = LLVMBuildFMul(ctx->ac.builder, ddx_el, src_c0, ""); + temp1 = LLVMBuildFAdd(ctx->ac.builder, temp1, interp_el, ""); - temp2 = LLVMBuildFMul(ctx->builder, ddy_el, src_c1, ""); - temp2 = LLVMBuildFAdd(ctx->builder, temp2, temp1, ""); + temp2 = LLVMBuildFMul(ctx->ac.builder, ddy_el, src_c1, ""); + temp2 = LLVMBuildFAdd(ctx->ac.builder, temp2, temp1, ""); - ij_out[i] = LLVMBuildBitCast(ctx->builder, + ij_out[i] = LLVMBuildBitCast(ctx->ac.builder, temp2, ctx->ac.i32, ""); } interp_param = ac_build_gather_values(&ctx->ac, ij_out, 2); @@ -4068,21 +4128,21 @@ static LLVMValueRef visit_interp(struct nir_to_llvm_context *ctx, LLVMValueRef llvm_chan = LLVMConstInt(ctx->ac.i32, chan, false); if (interp_param) { - interp_param = LLVMBuildBitCast(ctx->builder, + interp_param = LLVMBuildBitCast(ctx->ac.builder, interp_param, ctx->ac.v2f32, ""); LLVMValueRef i = LLVMBuildExtractElement( - ctx->builder, interp_param, ctx->ac.i32_0, ""); + ctx->ac.builder, interp_param, ctx->ac.i32_0, ""); LLVMValueRef j = LLVMBuildExtractElement( - ctx->builder, interp_param, ctx->ac.i32_1, ""); + ctx->ac.builder, interp_param, ctx->ac.i32_1, ""); result[chan] = ac_build_fs_interp(&ctx->ac, llvm_chan, attr_number, - ctx->prim_mask, i, j); + ctx->abi->prim_mask, i, j); } else { result[chan] = ac_build_fs_interp_mov(&ctx->ac, LLVMConstInt(ctx->ac.i32, 2, false), llvm_chan, attr_number, - ctx->prim_mask); + ctx->abi->prim_mask); } } return ac_build_varying_gather_values(&ctx->ac, result, instr->num_components, @@ -4097,6 +4157,8 @@ visit_emit_vertex(struct ac_shader_abi *abi, unsigned stream, LLVMValueRef *addr int idx; struct nir_to_llvm_context *ctx = nir_to_llvm_context_from_abi(abi); + assert(stream == 0); + /* Write vertex attribute values to GSVS ring */ gs_next_vertex = LLVMBuildLoad(ctx->builder, ctx->gs_next_vertex, @@ -4153,10 +4215,10 @@ visit_emit_vertex(struct ac_shader_abi *abi, unsigned stream, LLVMValueRef *addr } static void -visit_end_primitive(struct nir_to_llvm_context *ctx, - const nir_intrinsic_instr *instr) +visit_end_primitive(struct ac_shader_abi *abi, unsigned stream) { - ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_CUT | AC_SENDMSG_GS | (0 << 8), ctx->gs_wave_id); + struct nir_to_llvm_context *ctx = nir_to_llvm_context_from_abi(abi); + ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_CUT | AC_SENDMSG_GS | (stream << 8), ctx->gs_wave_id); } static LLVMValueRef @@ -4193,12 +4255,49 @@ static void visit_intrinsic(struct ac_nir_context *ctx, LLVMValueRef result = NULL; switch (instr->intrinsic) { + case nir_intrinsic_ballot: + result = ac_build_ballot(&ctx->ac, get_src(ctx, instr->src[0])); + break; + case nir_intrinsic_read_invocation: + case nir_intrinsic_read_first_invocation: { + LLVMValueRef args[2]; + + /* Value */ + args[0] = get_src(ctx, instr->src[0]); + + unsigned num_args; + const char *intr_name; + if (instr->intrinsic == nir_intrinsic_read_invocation) { + num_args = 2; + intr_name = "llvm.amdgcn.readlane"; + + /* Invocation */ + args[1] = get_src(ctx, instr->src[1]); + } else { + num_args = 1; + intr_name = "llvm.amdgcn.readfirstlane"; + } + + /* We currently have no other way to prevent LLVM from lifting the icmp + * calls to a dominating basic block. + */ + ac_build_optimization_barrier(&ctx->ac, &args[0]); + + result = ac_build_intrinsic(&ctx->ac, intr_name, + ctx->ac.i32, args, num_args, + AC_FUNC_ATTR_READNONE | + AC_FUNC_ATTR_CONVERGENT); + break; + } + case nir_intrinsic_load_subgroup_invocation: + result = ac_get_thread_id(&ctx->ac); + break; case nir_intrinsic_load_work_group_id: { LLVMValueRef values[3]; for (int i = 0; i < 3; i++) { - values[i] = ctx->nctx->workgroup_ids[i] ? - ctx->nctx->workgroup_ids[i] : ctx->ac.i32_0; + values[i] = ctx->abi->workgroup_ids[i] ? + ctx->abi->workgroup_ids[i] : ctx->ac.i32_0; } result = ac_build_gather_values(&ctx->ac, values, 3); @@ -4208,12 +4307,15 @@ static void visit_intrinsic(struct ac_nir_context *ctx, result = ctx->abi->base_vertex; break; } + case nir_intrinsic_load_local_group_size: + result = ctx->abi->load_local_group_size(ctx->abi); + break; case nir_intrinsic_load_vertex_id_zero_base: { result = ctx->abi->vertex_id; break; } case nir_intrinsic_load_local_invocation_id: { - result = ctx->nctx->local_invocation_ids; + result = ctx->abi->local_invocation_ids; break; } case nir_intrinsic_load_base_instance: @@ -4248,7 +4350,10 @@ static void visit_intrinsic(struct ac_nir_context *ctx, result = load_sample_pos(ctx); break; case nir_intrinsic_load_sample_mask_in: - result = ctx->abi->sample_coverage; + if (ctx->nctx) + result = load_sample_mask_in(ctx); + else + result = ctx->abi->sample_coverage; break; case nir_intrinsic_load_frag_coord: { LLVMValueRef values[4] = { @@ -4263,11 +4368,14 @@ static void visit_intrinsic(struct ac_nir_context *ctx, case nir_intrinsic_load_front_face: result = ctx->abi->front_face; break; + case nir_intrinsic_load_helper_invocation: + result = visit_load_helper_invocation(ctx); + break; case nir_intrinsic_load_instance_id: result = ctx->abi->instance_id; break; case nir_intrinsic_load_num_work_groups: - result = ctx->nctx->num_work_groups; + result = ctx->abi->num_work_groups; break; case nir_intrinsic_load_local_invocation_index: result = visit_load_local_invocation_index(ctx->nctx); @@ -4330,13 +4438,12 @@ static void visit_intrinsic(struct ac_nir_context *ctx, case nir_intrinsic_image_size: result = visit_image_size(ctx, instr); break; - case nir_intrinsic_discard: - ac_build_intrinsic(&ctx->ac, "llvm.AMDGPU.kilp", - LLVMVoidTypeInContext(ctx->ac.context), - NULL, 0, AC_FUNC_ATTR_LEGACY); + case nir_intrinsic_shader_clock: + result = ac_build_shader_clock(&ctx->ac); break; + case nir_intrinsic_discard: case nir_intrinsic_discard_if: - emit_discard_if(ctx, instr); + emit_discard(ctx, instr); break; case nir_intrinsic_memory_barrier: case nir_intrinsic_group_memory_barrier: @@ -4364,14 +4471,13 @@ static void visit_intrinsic(struct ac_nir_context *ctx, case nir_intrinsic_interp_var_at_centroid: case nir_intrinsic_interp_var_at_sample: case nir_intrinsic_interp_var_at_offset: - result = visit_interp(ctx->nctx, instr); + result = visit_interp(ctx, instr); break; case nir_intrinsic_emit_vertex: - assert(instr->const_index[0] == 0); - ctx->abi->emit_vertex(ctx->abi, 0, ctx->outputs); + ctx->abi->emit_vertex(ctx->abi, nir_intrinsic_stream_id(instr), ctx->outputs); break; case nir_intrinsic_end_primitive: - visit_end_primitive(ctx->nctx, instr); + ctx->abi->emit_primitive(ctx->abi, nir_intrinsic_stream_id(instr)); break; case nir_intrinsic_load_tess_coord: { LLVMTypeRef type = ctx->nctx ? @@ -4389,6 +4495,21 @@ static void visit_intrinsic(struct ac_nir_context *ctx, case nir_intrinsic_load_patch_vertices_in: result = ctx->abi->load_patch_vertices_in(ctx->abi); break; + case nir_intrinsic_vote_all: { + LLVMValueRef tmp = ac_build_vote_all(&ctx->ac, get_src(ctx, instr->src[0])); + result = LLVMBuildSExt(ctx->ac.builder, tmp, ctx->ac.i32, ""); + break; + } + case nir_intrinsic_vote_any: { + LLVMValueRef tmp = ac_build_vote_any(&ctx->ac, get_src(ctx, instr->src[0])); + result = LLVMBuildSExt(ctx->ac.builder, tmp, ctx->ac.i32, ""); + break; + } + case nir_intrinsic_vote_eq: { + LLVMValueRef tmp = ac_build_vote_eq(&ctx->ac, get_src(ctx, instr->src[0])); + result = LLVMBuildSExt(ctx->ac.builder, tmp, ctx->ac.i32, ""); + break; + } default: fprintf(stderr, "Unknown intrinsic: "); nir_print_instr(&instr->instr, stderr); @@ -4404,18 +4525,27 @@ static LLVMValueRef radv_load_ssbo(struct ac_shader_abi *abi, LLVMValueRef buffer_ptr, bool write) { struct nir_to_llvm_context *ctx = nir_to_llvm_context_from_abi(abi); + LLVMValueRef result; + + LLVMSetMetadata(buffer_ptr, ctx->ac.uniform_md_kind, ctx->ac.empty_md); - if (write && ctx->stage == MESA_SHADER_FRAGMENT) - ctx->shader_info->fs.writes_memory = true; + result = LLVMBuildLoad(ctx->builder, buffer_ptr, ""); + LLVMSetMetadata(result, ctx->ac.invariant_load_md_kind, ctx->ac.empty_md); - return LLVMBuildLoad(ctx->builder, buffer_ptr, ""); + return result; } static LLVMValueRef radv_load_ubo(struct ac_shader_abi *abi, LLVMValueRef buffer_ptr) { struct nir_to_llvm_context *ctx = nir_to_llvm_context_from_abi(abi); + LLVMValueRef result; - return LLVMBuildLoad(ctx->builder, buffer_ptr, ""); + LLVMSetMetadata(buffer_ptr, ctx->ac.uniform_md_kind, ctx->ac.empty_md); + + result = LLVMBuildLoad(ctx->builder, buffer_ptr, ""); + LLVMSetMetadata(result, ctx->ac.invariant_load_md_kind, ctx->ac.empty_md); + + return result; } static LLVMValueRef radv_get_sampler_desc(struct ac_shader_abi *abi, @@ -4438,9 +4568,6 @@ static LLVMValueRef radv_get_sampler_desc(struct ac_shader_abi *abi, assert(base_index < layout->binding_count); - if (write && ctx->stage == MESA_SHADER_FRAGMENT) - ctx->shader_info->fs.writes_memory = true; - switch (desc_type) { case AC_DESC_IMAGE: type = ctx->ac.v8i32; @@ -4492,7 +4619,7 @@ static LLVMValueRef radv_get_sampler_desc(struct ac_shader_abi *abi, index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->ac.i32, stride / type_size, 0), ""); list = ac_build_gep0(&ctx->ac, list, LLVMConstInt(ctx->ac.i32, offset, 0)); - list = LLVMBuildPointerCast(builder, list, const_array(type, 0), ""); + list = LLVMBuildPointerCast(builder, list, ac_array_in_const_addr_space(type), ""); return ac_build_load_to_sgpr(&ctx->ac, list, index); } @@ -5026,12 +5153,13 @@ static void visit_ssa_undef(struct ac_nir_context *ctx, const nir_ssa_undef_instr *instr) { unsigned num_components = instr->def.num_components; + LLVMTypeRef type = LLVMIntTypeInContext(ctx->ac.context, instr->def.bit_size); LLVMValueRef undef; if (num_components == 1) - undef = LLVMGetUndef(ctx->ac.i32); + undef = LLVMGetUndef(type); else { - undef = LLVMGetUndef(LLVMVectorType(ctx->ac.i32, num_components)); + undef = LLVMGetUndef(LLVMVectorType(type, num_components)); } _mesa_hash_table_insert(ctx->defs, &instr->def, undef); } @@ -5189,24 +5317,26 @@ handle_vs_input_decl(struct nir_to_llvm_context *ctx, int index = variable->data.location - VERT_ATTRIB_GENERIC0; int idx = variable->data.location; unsigned attrib_count = glsl_count_attribute_slots(variable->type, true); + uint8_t input_usage_mask = + ctx->shader_info->info.vs.input_usage_mask[variable->data.location]; + unsigned num_channels = util_last_bit(input_usage_mask); variable->data.driver_location = idx * 4; - if (ctx->options->key.vs.instance_rate_inputs & (1u << index)) { - buffer_index = LLVMBuildAdd(ctx->builder, ctx->abi.instance_id, - ctx->abi.start_instance, ""); - if (ctx->options->key.vs.as_ls) { - ctx->shader_info->vs.vgpr_comp_cnt = - MAX2(2, ctx->shader_info->vs.vgpr_comp_cnt); - } else { - ctx->shader_info->vs.vgpr_comp_cnt = - MAX2(1, ctx->shader_info->vs.vgpr_comp_cnt); - } - } else - buffer_index = LLVMBuildAdd(ctx->builder, ctx->abi.vertex_id, - ctx->abi.base_vertex, ""); - for (unsigned i = 0; i < attrib_count; ++i, ++idx) { + if (ctx->options->key.vs.instance_rate_inputs & (1u << (index + i))) { + buffer_index = LLVMBuildAdd(ctx->builder, ctx->abi.instance_id, + ctx->abi.start_instance, ""); + if (ctx->options->key.vs.as_ls) { + ctx->shader_info->vs.vgpr_comp_cnt = + MAX2(2, ctx->shader_info->vs.vgpr_comp_cnt); + } else { + ctx->shader_info->vs.vgpr_comp_cnt = + MAX2(1, ctx->shader_info->vs.vgpr_comp_cnt); + } + } else + buffer_index = LLVMBuildAdd(ctx->builder, ctx->abi.vertex_id, + ctx->abi.base_vertex, ""); t_offset = LLVMConstInt(ctx->ac.i32, index + i, false); t_list = ac_build_load_to_sgpr(&ctx->ac, t_list_ptr, t_offset); @@ -5214,7 +5344,9 @@ handle_vs_input_decl(struct nir_to_llvm_context *ctx, input = ac_build_buffer_load_format(&ctx->ac, t_list, buffer_index, ctx->ac.i32_0, - true); + num_channels, false, true); + + input = ac_build_expand_to_vec4(&ctx->ac, input, num_channels); for (unsigned chan = 0; chan < 4; chan++) { LLVMValueRef llvm_chan = LLVMConstInt(ctx->ac.i32, chan, false); @@ -5297,7 +5429,7 @@ handle_fs_input_decl(struct nir_to_llvm_context *ctx, else interp_type = INTERP_CENTER; - interp = lookup_interp_param(ctx, variable->data.interpolation, interp_type); + interp = lookup_interp_param(&ctx->abi, variable->data.interpolation, interp_type); } else interp = NULL; @@ -5334,7 +5466,7 @@ prepare_interp_optimize(struct nir_to_llvm_context *ctx, } if (uses_center && uses_centroid) { - LLVMValueRef sel = LLVMBuildICmp(ctx->builder, LLVMIntSLT, ctx->prim_mask, ctx->ac.i32_0, ""); + LLVMValueRef sel = LLVMBuildICmp(ctx->builder, LLVMIntSLT, ctx->abi.prim_mask, ctx->ac.i32_0, ""); ctx->persp_centroid = LLVMBuildSelect(ctx->builder, sel, ctx->persp_center, ctx->persp_centroid, ""); ctx->linear_centroid = LLVMBuildSelect(ctx->builder, sel, ctx->linear_center, ctx->linear_centroid, ""); } @@ -5365,7 +5497,7 @@ handle_fs_inputs(struct nir_to_llvm_context *ctx, if (i >= VARYING_SLOT_VAR0 || i == VARYING_SLOT_PNTC || i == VARYING_SLOT_PRIMITIVE_ID || i == VARYING_SLOT_LAYER) { interp_param = *inputs; - interp_fs_input(ctx, index, interp_param, ctx->prim_mask, + interp_fs_input(ctx, index, interp_param, ctx->abi.prim_mask, inputs); if (!interp_param) @@ -5601,32 +5733,11 @@ setup_shared(struct ac_nir_context *ctx, LLVMAddGlobalInAddressSpace( ctx->ac.module, glsl_to_llvm_type(ctx->nctx, variable->type), variable->name ? variable->name : "", - LOCAL_ADDR_SPACE); + AC_LOCAL_ADDR_SPACE); _mesa_hash_table_insert(ctx->vars, variable, shared); } } -static LLVMValueRef -emit_float_saturate(struct ac_llvm_context *ctx, LLVMValueRef v, float lo, float hi) -{ - v = ac_to_float(ctx, v); - v = emit_intrin_2f_param(ctx, "llvm.maxnum", ctx->f32, v, LLVMConstReal(ctx->f32, lo)); - return emit_intrin_2f_param(ctx, "llvm.minnum", ctx->f32, v, LLVMConstReal(ctx->f32, hi)); -} - - -static LLVMValueRef emit_pack_int16(struct nir_to_llvm_context *ctx, - LLVMValueRef src0, LLVMValueRef src1) -{ - LLVMValueRef const16 = LLVMConstInt(ctx->ac.i32, 16, false); - LLVMValueRef comp[2]; - - comp[0] = LLVMBuildAnd(ctx->builder, src0, LLVMConstInt(ctx->ac.i32, 65535, 0), ""); - comp[1] = LLVMBuildAnd(ctx->builder, src1, LLVMConstInt(ctx->ac.i32, 65535, 0), ""); - comp[1] = LLVMBuildShl(ctx->builder, comp[1], const16, ""); - return LLVMBuildOr(ctx->builder, comp[0], comp[1], ""); -} - /* Initialize arguments for the shader export intrinsic */ static void si_llvm_init_export_args(struct nir_to_llvm_context *ctx, @@ -5652,15 +5763,16 @@ si_llvm_init_export_args(struct nir_to_llvm_context *ctx, args->out[2] = LLVMGetUndef(ctx->ac.f32); args->out[3] = LLVMGetUndef(ctx->ac.f32); - if (!values) - return; - if (ctx->stage == MESA_SHADER_FRAGMENT && target >= V_008DFC_SQ_EXP_MRT) { - LLVMValueRef val[4]; unsigned index = target - V_008DFC_SQ_EXP_MRT; unsigned col_format = (ctx->options->key.fs.col_format >> (4 * index)) & 0xf; bool is_int8 = (ctx->options->key.fs.is_int8 >> index) & 1; bool is_int10 = (ctx->options->key.fs.is_int10 >> index) & 1; + unsigned chan; + + LLVMValueRef (*packf)(struct ac_llvm_context *ctx, LLVMValueRef args[2]) = NULL; + LLVMValueRef (*packi)(struct ac_llvm_context *ctx, LLVMValueRef args[2], + unsigned bits, bool hi) = NULL; switch(col_format) { case V_028714_SPI_SHADER_ZERO: @@ -5686,106 +5798,91 @@ si_llvm_init_export_args(struct nir_to_llvm_context *ctx, break; case V_028714_SPI_SHADER_FP16_ABGR: - args->compr = 1; - - for (unsigned chan = 0; chan < 2; chan++) { - LLVMValueRef pack_args[2] = { - values[2 * chan], - values[2 * chan + 1] - }; - LLVMValueRef packed; - - packed = ac_build_cvt_pkrtz_f16(&ctx->ac, pack_args); - args->out[chan] = packed; - } + packf = ac_build_cvt_pkrtz_f16; break; case V_028714_SPI_SHADER_UNORM16_ABGR: - for (unsigned chan = 0; chan < 4; chan++) { - val[chan] = ac_build_clamp(&ctx->ac, values[chan]); - val[chan] = LLVMBuildFMul(ctx->builder, val[chan], - LLVMConstReal(ctx->ac.f32, 65535), ""); - val[chan] = LLVMBuildFAdd(ctx->builder, val[chan], - LLVMConstReal(ctx->ac.f32, 0.5), ""); - val[chan] = LLVMBuildFPToUI(ctx->builder, val[chan], - ctx->ac.i32, ""); - } - - args->compr = 1; - args->out[0] = emit_pack_int16(ctx, val[0], val[1]); - args->out[1] = emit_pack_int16(ctx, val[2], val[3]); + packf = ac_build_cvt_pknorm_u16; break; case V_028714_SPI_SHADER_SNORM16_ABGR: - for (unsigned chan = 0; chan < 4; chan++) { - val[chan] = emit_float_saturate(&ctx->ac, values[chan], -1, 1); - val[chan] = LLVMBuildFMul(ctx->builder, val[chan], - LLVMConstReal(ctx->ac.f32, 32767), ""); - - /* If positive, add 0.5, else add -0.5. */ - val[chan] = LLVMBuildFAdd(ctx->builder, val[chan], - LLVMBuildSelect(ctx->builder, - LLVMBuildFCmp(ctx->builder, LLVMRealOGE, - val[chan], ctx->ac.f32_0, ""), - LLVMConstReal(ctx->ac.f32, 0.5), - LLVMConstReal(ctx->ac.f32, -0.5), ""), ""); - val[chan] = LLVMBuildFPToSI(ctx->builder, val[chan], ctx->ac.i32, ""); - } - - args->compr = 1; - args->out[0] = emit_pack_int16(ctx, val[0], val[1]); - args->out[1] = emit_pack_int16(ctx, val[2], val[3]); + packf = ac_build_cvt_pknorm_i16; break; - case V_028714_SPI_SHADER_UINT16_ABGR: { - LLVMValueRef max_rgb = LLVMConstInt(ctx->ac.i32, - is_int8 ? 255 : is_int10 ? 1023 : 65535, 0); - LLVMValueRef max_alpha = !is_int10 ? max_rgb : LLVMConstInt(ctx->ac.i32, 3, 0); + case V_028714_SPI_SHADER_UINT16_ABGR: + packi = ac_build_cvt_pk_u16; + break; - for (unsigned chan = 0; chan < 4; chan++) { - val[chan] = ac_to_integer(&ctx->ac, values[chan]); - val[chan] = emit_minmax_int(&ctx->ac, LLVMIntULT, val[chan], chan == 3 ? max_alpha : max_rgb); - } + case V_028714_SPI_SHADER_SINT16_ABGR: + packi = ac_build_cvt_pk_i16; + break; - args->compr = 1; - args->out[0] = emit_pack_int16(ctx, val[0], val[1]); - args->out[1] = emit_pack_int16(ctx, val[2], val[3]); + default: + case V_028714_SPI_SHADER_32_ABGR: + memcpy(&args->out[0], values, sizeof(values[0]) * 4); break; } - case V_028714_SPI_SHADER_SINT16_ABGR: { - LLVMValueRef max_rgb = LLVMConstInt(ctx->ac.i32, - is_int8 ? 127 : is_int10 ? 511 : 32767, 0); - LLVMValueRef min_rgb = LLVMConstInt(ctx->ac.i32, - is_int8 ? -128 : is_int10 ? -512 : -32768, 0); - LLVMValueRef max_alpha = !is_int10 ? max_rgb : ctx->ac.i32_1; - LLVMValueRef min_alpha = !is_int10 ? min_rgb : LLVMConstInt(ctx->ac.i32, -2, 0); + /* Pack f16 or norm_i16/u16. */ + if (packf) { + for (chan = 0; chan < 2; chan++) { + LLVMValueRef pack_args[2] = { + values[2 * chan], + values[2 * chan + 1] + }; + LLVMValueRef packed; - /* Clamp. */ - for (unsigned chan = 0; chan < 4; chan++) { - val[chan] = ac_to_integer(&ctx->ac, values[chan]); - val[chan] = emit_minmax_int(&ctx->ac, LLVMIntSLT, val[chan], chan == 3 ? max_alpha : max_rgb); - val[chan] = emit_minmax_int(&ctx->ac, LLVMIntSGT, val[chan], chan == 3 ? min_alpha : min_rgb); + packed = packf(&ctx->ac, pack_args); + args->out[chan] = ac_to_float(&ctx->ac, packed); } - - args->compr = 1; - args->out[0] = emit_pack_int16(ctx, val[0], val[1]); - args->out[1] = emit_pack_int16(ctx, val[2], val[3]); - break; + args->compr = 1; /* COMPR flag */ } - default: - case V_028714_SPI_SHADER_32_ABGR: - memcpy(&args->out[0], values, sizeof(values[0]) * 4); - break; + /* Pack i16/u16. */ + if (packi) { + for (chan = 0; chan < 2; chan++) { + LLVMValueRef pack_args[2] = { + ac_to_integer(&ctx->ac, values[2 * chan]), + ac_to_integer(&ctx->ac, values[2 * chan + 1]) + }; + LLVMValueRef packed; + + packed = packi(&ctx->ac, pack_args, + is_int8 ? 8 : is_int10 ? 10 : 16, + chan == 1); + args->out[chan] = ac_to_float(&ctx->ac, packed); + } + args->compr = 1; /* COMPR flag */ } - } else - memcpy(&args->out[0], values, sizeof(values[0]) * 4); + return; + } + + memcpy(&args->out[0], values, sizeof(values[0]) * 4); for (unsigned i = 0; i < 4; ++i) args->out[i] = ac_to_float(&ctx->ac, args->out[i]); } +static void +radv_export_param(struct nir_to_llvm_context *ctx, unsigned index, + LLVMValueRef *values) +{ + struct ac_export_args args; + + si_llvm_init_export_args(ctx, values, + V_008DFC_SQ_EXP_PARAM + index, &args); + ac_build_export(&ctx->ac, &args); +} + +static LLVMValueRef +radv_load_output(struct nir_to_llvm_context *ctx, unsigned index, unsigned chan) +{ + LLVMValueRef output = + ctx->nir->outputs[radeon_llvm_reg_index_soa(index, chan)]; + + return LLVMBuildLoad(ctx->builder, output, ""); +} + static void handle_vs_outputs_post(struct nir_to_llvm_context *ctx, bool export_prim_id, @@ -5822,8 +5919,7 @@ handle_vs_outputs_post(struct nir_to_llvm_context *ctx, i = VARYING_SLOT_CLIP_DIST0; for (j = 0; j < ctx->num_output_clips + ctx->num_output_culls; j++) - slots[j] = ac_to_float(&ctx->ac, LLVMBuildLoad(ctx->builder, - ctx->nir->outputs[radeon_llvm_reg_index_soa(i, j)], "")); + slots[j] = ac_to_float(&ctx->ac, radv_load_output(ctx, i, j)); for (i = ctx->num_output_clips + ctx->num_output_culls; i < 8; i++) slots[i] = LLVMGetUndef(ctx->ac.f32); @@ -5845,27 +5941,23 @@ handle_vs_outputs_post(struct nir_to_llvm_context *ctx, LLVMValueRef pos_values[4] = {ctx->ac.f32_0, ctx->ac.f32_0, ctx->ac.f32_0, ctx->ac.f32_1}; if (ctx->output_mask & (1ull << VARYING_SLOT_POS)) { for (unsigned j = 0; j < 4; j++) - pos_values[j] = LLVMBuildLoad(ctx->builder, - ctx->nir->outputs[radeon_llvm_reg_index_soa(VARYING_SLOT_POS, j)], ""); + pos_values[j] = radv_load_output(ctx, VARYING_SLOT_POS, j); } si_llvm_init_export_args(ctx, pos_values, V_008DFC_SQ_EXP_POS, &pos_args[0]); if (ctx->output_mask & (1ull << VARYING_SLOT_PSIZ)) { outinfo->writes_pointsize = true; - psize_value = LLVMBuildLoad(ctx->builder, - ctx->nir->outputs[radeon_llvm_reg_index_soa(VARYING_SLOT_PSIZ, 0)], ""); + psize_value = radv_load_output(ctx, VARYING_SLOT_PSIZ, 0); } if (ctx->output_mask & (1ull << VARYING_SLOT_LAYER)) { outinfo->writes_layer = true; - layer_value = LLVMBuildLoad(ctx->builder, - ctx->nir->outputs[radeon_llvm_reg_index_soa(VARYING_SLOT_LAYER, 0)], ""); + layer_value = radv_load_output(ctx, VARYING_SLOT_LAYER, 0); } if (ctx->output_mask & (1ull << VARYING_SLOT_VIEWPORT)) { outinfo->writes_viewport_index = true; - viewport_index_value = LLVMBuildLoad(ctx->builder, - ctx->nir->outputs[radeon_llvm_reg_index_soa(VARYING_SLOT_VIEWPORT, 0)], ""); + viewport_index_value = radv_load_output(ctx, VARYING_SLOT_VIEWPORT, 0); } if (outinfo->writes_pointsize || @@ -5929,50 +6021,31 @@ handle_vs_outputs_post(struct nir_to_llvm_context *ctx, if (!(ctx->output_mask & (1ull << i))) continue; - for (unsigned j = 0; j < 4; j++) - values[j] = ac_to_float(&ctx->ac, LLVMBuildLoad(ctx->builder, - ctx->nir->outputs[radeon_llvm_reg_index_soa(i, j)], "")); - - if (i == VARYING_SLOT_LAYER) { - target = V_008DFC_SQ_EXP_PARAM + param_count; - outinfo->vs_output_param_offset[VARYING_SLOT_LAYER] = param_count; - param_count++; - } else if (i == VARYING_SLOT_PRIMITIVE_ID) { - target = V_008DFC_SQ_EXP_PARAM + param_count; - outinfo->vs_output_param_offset[VARYING_SLOT_PRIMITIVE_ID] = param_count; - param_count++; - } else if (i >= VARYING_SLOT_VAR0) { - outinfo->export_mask |= 1u << (i - VARYING_SLOT_VAR0); - target = V_008DFC_SQ_EXP_PARAM + param_count; - outinfo->vs_output_param_offset[i] = param_count; - param_count++; - } else + if (i != VARYING_SLOT_LAYER && + i != VARYING_SLOT_PRIMITIVE_ID && + i < VARYING_SLOT_VAR0) continue; - si_llvm_init_export_args(ctx, values, target, &args); + for (unsigned j = 0; j < 4; j++) + values[j] = ac_to_float(&ctx->ac, radv_load_output(ctx, i, j)); - if (target >= V_008DFC_SQ_EXP_POS && - target <= (V_008DFC_SQ_EXP_POS + 3)) { - memcpy(&pos_args[target - V_008DFC_SQ_EXP_POS], - &args, sizeof(args)); - } else { - ac_build_export(&ctx->ac, &args); - } + radv_export_param(ctx, param_count, values); + + outinfo->vs_output_param_offset[i] = param_count++; } if (export_prim_id) { LLVMValueRef values[4]; - target = V_008DFC_SQ_EXP_PARAM + param_count; - outinfo->vs_output_param_offset[VARYING_SLOT_PRIMITIVE_ID] = param_count; - param_count++; values[0] = ctx->vs_prim_id; ctx->shader_info->vs.vgpr_comp_cnt = MAX2(2, ctx->shader_info->vs.vgpr_comp_cnt); for (unsigned j = 1; j < 4; j++) values[j] = ctx->ac.f32_0; - si_llvm_init_export_args(ctx, values, target, &args); - ac_build_export(&ctx->ac, &args); + + radv_export_param(ctx, param_count, values); + + outinfo->vs_output_param_offset[VARYING_SLOT_PRIMITIVE_ID] = param_count++; outinfo->export_prim_id = true; } @@ -6331,12 +6404,12 @@ handle_tcs_outputs_post(struct nir_to_llvm_context *ctx) static bool si_export_mrt_color(struct nir_to_llvm_context *ctx, - LLVMValueRef *color, unsigned param, bool is_last, + LLVMValueRef *color, unsigned index, bool is_last, struct ac_export_args *args) { /* Export */ - si_llvm_init_export_args(ctx, color, param, - args); + si_llvm_init_export_args(ctx, color, + V_008DFC_SQ_EXP_MRT + index, args); if (is_last) { args->valid_mask = 1; /* whether the EXEC mask is valid */ @@ -6368,47 +6441,52 @@ handle_fs_outputs_post(struct nir_to_llvm_context *ctx) for (unsigned i = 0; i < RADEON_LLVM_MAX_OUTPUTS; ++i) { LLVMValueRef values[4]; + bool last = false; if (!(ctx->output_mask & (1ull << i))) continue; - if (i == FRAG_RESULT_DEPTH) { - ctx->shader_info->fs.writes_z = true; - depth = ac_to_float(&ctx->ac, LLVMBuildLoad(ctx->builder, - ctx->nir->outputs[radeon_llvm_reg_index_soa(i, 0)], "")); - } else if (i == FRAG_RESULT_STENCIL) { - ctx->shader_info->fs.writes_stencil = true; - stencil = ac_to_float(&ctx->ac, LLVMBuildLoad(ctx->builder, - ctx->nir->outputs[radeon_llvm_reg_index_soa(i, 0)], "")); - } else if (i == FRAG_RESULT_SAMPLE_MASK) { - ctx->shader_info->fs.writes_sample_mask = true; - samplemask = ac_to_float(&ctx->ac, LLVMBuildLoad(ctx->builder, - ctx->nir->outputs[radeon_llvm_reg_index_soa(i, 0)], "")); - } else { - bool last = false; - for (unsigned j = 0; j < 4; j++) - values[j] = ac_to_float(&ctx->ac, LLVMBuildLoad(ctx->builder, - ctx->nir->outputs[radeon_llvm_reg_index_soa(i, j)], "")); + if (i < FRAG_RESULT_DATA0) + continue; - if (!ctx->shader_info->fs.writes_z && !ctx->shader_info->fs.writes_stencil && !ctx->shader_info->fs.writes_sample_mask) - last = ctx->output_mask <= ((1ull << (i + 1)) - 1); + for (unsigned j = 0; j < 4; j++) + values[j] = ac_to_float(&ctx->ac, + radv_load_output(ctx, i, j)); - bool ret = si_export_mrt_color(ctx, values, V_008DFC_SQ_EXP_MRT + (i - FRAG_RESULT_DATA0), last, &color_args[index]); - if (ret) - index++; - } + if (!ctx->shader_info->info.ps.writes_z && + !ctx->shader_info->info.ps.writes_stencil && + !ctx->shader_info->info.ps.writes_sample_mask) + last = ctx->output_mask <= ((1ull << (i + 1)) - 1); + + bool ret = si_export_mrt_color(ctx, values, + i - FRAG_RESULT_DATA0, + last, &color_args[index]); + if (ret) + index++; + } + + /* Process depth, stencil, samplemask. */ + if (ctx->shader_info->info.ps.writes_z) { + depth = ac_to_float(&ctx->ac, + radv_load_output(ctx, FRAG_RESULT_DEPTH, 0)); + } + if (ctx->shader_info->info.ps.writes_stencil) { + stencil = ac_to_float(&ctx->ac, + radv_load_output(ctx, FRAG_RESULT_STENCIL, 0)); + } + if (ctx->shader_info->info.ps.writes_sample_mask) { + samplemask = ac_to_float(&ctx->ac, + radv_load_output(ctx, FRAG_RESULT_SAMPLE_MASK, 0)); } + /* Export PS outputs. */ for (unsigned i = 0; i < index; i++) ac_build_export(&ctx->ac, &color_args[i]); + if (depth || stencil || samplemask) radv_export_mrt_z(ctx, depth, stencil, samplemask); - else if (!index) { - si_export_mrt_color(ctx, NULL, V_008DFC_SQ_EXP_NULL, true, &color_args[0]); - ac_build_export(&ctx->ac, &color_args[0]); - } - - ctx->shader_info->fs.output_mask = index ? ((1ull << index) - 1) : 0; + else if (!index) + ac_build_export_null(&ctx->ac); } static void @@ -6629,8 +6707,9 @@ void ac_nir_translate(struct ac_llvm_context *ac, struct ac_shader_abi *abi, visit_cf_list(&ctx, &func->impl->body); phi_post_pass(&ctx); - ctx.abi->emit_outputs(ctx.abi, RADEON_LLVM_MAX_OUTPUTS, - ctx.outputs); + if (nir->info.stage != MESA_SHADER_COMPUTE) + ctx.abi->emit_outputs(ctx.abi, RADEON_LLVM_MAX_OUTPUTS, + ctx.outputs); free(ctx.locals); ralloc_free(ctx.defs); @@ -6666,7 +6745,11 @@ LLVMModuleRef ac_translate_nir_to_llvm(LLVMTargetMachineRef tm, LLVMDisposeTargetData(data_layout); LLVMDisposeMessage(data_layout_str); - ctx.builder = LLVMCreateBuilderInContext(ctx.context); + enum ac_float_mode float_mode = + options->unsafe_math ? AC_FLOAT_MODE_UNSAFE_FP_MATH : + AC_FLOAT_MODE_DEFAULT; + + ctx.builder = ac_create_builder(ctx.context, float_mode); ctx.ac.builder = ctx.builder; memset(shader_info, 0, sizeof(*shader_info)); @@ -6715,21 +6798,21 @@ LLVMModuleRef ac_translate_nir_to_llvm(LLVMTargetMachineRef tm, ctx.gs_next_vertex = ac_build_alloca(&ctx.ac, ctx.ac.i32, "gs_next_vertex"); ctx.gs_max_out_vertices = shaders[i]->info.gs.vertices_out; ctx.abi.load_inputs = load_gs_input; + ctx.abi.emit_primitive = visit_end_primitive; } else if (shaders[i]->info.stage == MESA_SHADER_TESS_CTRL) { ctx.tcs_outputs_read = shaders[i]->info.outputs_read; ctx.tcs_patch_outputs_read = shaders[i]->info.patch_outputs_read; - ctx.abi.load_tess_inputs = load_tcs_input; + ctx.abi.load_tess_varyings = load_tcs_varyings; ctx.abi.load_patch_vertices_in = load_patch_vertices_in; ctx.abi.store_tcs_outputs = store_tcs_output; } else if (shaders[i]->info.stage == MESA_SHADER_TESS_EVAL) { ctx.tes_primitive_mode = shaders[i]->info.tess.primitive_mode; - ctx.abi.load_tess_inputs = load_tes_input; + ctx.abi.load_tess_varyings = load_tes_input; ctx.abi.load_tess_coord = load_tess_coord; ctx.abi.load_patch_vertices_in = load_patch_vertices_in; } else if (shaders[i]->info.stage == MESA_SHADER_VERTEX) { if (shader_info->info.vs.needs_instance_id) { - if (ctx.ac.chip_class == GFX9 && - shaders[shader_count - 1]->info.stage == MESA_SHADER_TESS_CTRL) { + if (ctx.options->key.vs.as_ls) { ctx.shader_info->vs.vgpr_comp_cnt = MAX2(2, ctx.shader_info->vs.vgpr_comp_cnt); } else { @@ -6739,6 +6822,8 @@ LLVMModuleRef ac_translate_nir_to_llvm(LLVMTargetMachineRef tm, } } else if (shaders[i]->info.stage == MESA_SHADER_FRAGMENT) { shader_info->fs.can_discard = shaders[i]->info.fs.uses_discard; + ctx.abi.lookup_interp_param = lookup_interp_param; + ctx.abi.load_sample_position = load_sample_position; } if (i) @@ -6796,6 +6881,9 @@ LLVMModuleRef ac_translate_nir_to_llvm(LLVMTargetMachineRef tm, LLVMBuildRetVoid(ctx.builder); + if (options->dump_preoptir) + ac_dump_module(ctx.module); + ac_llvm_finalize_module(&ctx); if (shader_count == 1) @@ -7012,16 +7100,9 @@ void ac_compile_nir_shader(LLVMTargetMachineRef tm, static void ac_gs_copy_shader_emit(struct nir_to_llvm_context *ctx) { - LLVMValueRef args[9]; - args[0] = ctx->gsvs_ring; - args[1] = LLVMBuildMul(ctx->builder, ctx->abi.vertex_id, LLVMConstInt(ctx->ac.i32, 4, false), ""); - args[3] = ctx->ac.i32_0; - args[4] = ctx->ac.i32_1; /* OFFEN */ - args[5] = ctx->ac.i32_0; /* IDXEN */ - args[6] = ctx->ac.i32_1; /* GLC */ - args[7] = ctx->ac.i32_1; /* SLC */ - args[8] = ctx->ac.i32_0; /* TFE */ - + LLVMValueRef vtx_offset = + LLVMBuildMul(ctx->builder, ctx->abi.vertex_id, + LLVMConstInt(ctx->ac.i32, 4, false), ""); int idx = 0; for (unsigned i = 0; i < RADEON_LLVM_MAX_OUTPUTS; ++i) { @@ -7039,16 +7120,16 @@ ac_gs_copy_shader_emit(struct nir_to_llvm_context *ctx) } for (unsigned j = 0; j < length; j++) { - LLVMValueRef value; - args[2] = LLVMConstInt(ctx->ac.i32, + LLVMValueRef value, soffset; + + soffset = LLVMConstInt(ctx->ac.i32, (slot * 4 + j) * ctx->gs_max_out_vertices * 16 * 4, false); - value = ac_build_intrinsic(&ctx->ac, - "llvm.SI.buffer.load.dword.i32.i32", - ctx->ac.i32, args, 9, - AC_FUNC_ATTR_READONLY | - AC_FUNC_ATTR_LEGACY); + value = ac_build_buffer_load(&ctx->ac, ctx->gsvs_ring, + 1, ctx->ac.i32_0, + vtx_offset, soffset, + 0, 1, 1, true, false); LLVMBuildStore(ctx->builder, ac_to_float(&ctx->ac, value), ctx->nir->outputs[radeon_llvm_reg_index_soa(i, j)]); @@ -7079,7 +7160,11 @@ void ac_create_gs_copy_shader(LLVMTargetMachineRef tm, ctx.is_gs_copy_shader = true; LLVMSetTarget(ctx.module, "amdgcn--"); - ctx.builder = LLVMCreateBuilderInContext(ctx.context); + enum ac_float_mode float_mode = + options->unsafe_math ? AC_FLOAT_MODE_UNSAFE_FP_MATH : + AC_FLOAT_MODE_DEFAULT; + + ctx.builder = ac_create_builder(ctx.context, float_mode); ctx.ac.builder = ctx.builder; ctx.stage = MESA_SHADER_VERTEX;