X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fintel%2Fcompiler%2Fbrw_fs_nir.cpp;h=709aeacf38fdb1f8c7d9ca27148e0e10a268b9c9;hb=18b28b5654e23c074e5158b2ce35c691733d7262;hp=bbcbc0fa3d2c552a7a9d1b19a25f9f8a6754d301;hpb=14531d676b11999123c04fb7569ab80c9f150180;p=mesa.git diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp index bbcbc0fa3d2..709aeacf38f 100644 --- a/src/intel/compiler/brw_fs_nir.cpp +++ b/src/intel/compiler/brw_fs_nir.cpp @@ -24,6 +24,8 @@ #include "compiler/glsl/ir.h" #include "brw_fs.h" #include "brw_nir.h" +#include "brw_eu.h" +#include "nir_search_helpers.h" #include "util/u_math.h" #include "util/bitscan.h" @@ -32,6 +34,8 @@ using namespace brw; void fs_visitor::emit_nir_code() { + emit_shader_float_controls_execution_mode(); + /* emit the arrays used for inputs and outputs - load/store intrinsics will * be converted to reads/writes of these arrays */ @@ -343,7 +347,7 @@ fs_visitor::nir_emit_impl(nir_function_impl *impl) unsigned array_elems = reg->num_array_elems == 0 ? 1 : reg->num_array_elems; unsigned size = array_elems * reg->num_components; - const brw_reg_type reg_type = + const brw_reg_type reg_type = reg->bit_size == 8 ? BRW_REGISTER_TYPE_B : brw_reg_type_from_bit_size(reg->bit_size, BRW_REGISTER_TYPE_F); nir_locals[reg->index] = bld.vgrf(reg_type, size); } @@ -387,7 +391,7 @@ fs_visitor::nir_emit_if(nir_if *if_stmt) /* If the condition has the form !other_condition, use other_condition as * the source, but invert the predicate on the if instruction. */ - nir_alu_instr *const cond = nir_src_as_alu_instr(&if_stmt->condition); + nir_alu_instr *cond = nir_src_as_alu_instr(if_stmt->condition); if (cond != NULL && cond->op == nir_op_inot) { assert(!cond->src[0].negate); assert(!cond->src[0].abs); @@ -408,10 +412,10 @@ fs_visitor::nir_emit_if(nir_if *if_stmt) nir_emit_cf_list(&if_stmt->then_list); - /* note: if the else is empty, dead CF elimination will remove it */ - bld.emit(BRW_OPCODE_ELSE); - - nir_emit_cf_list(&if_stmt->else_list); + if (!nir_cf_list_is_empty_block(&if_stmt->else_list)) { + bld.emit(BRW_OPCODE_ELSE); + nir_emit_cf_list(&if_stmt->else_list); + } bld.emit(BRW_OPCODE_ENDIF); @@ -449,11 +453,11 @@ fs_visitor::nir_emit_instr(nir_instr *instr) switch (instr->type) { case nir_instr_type_alu: - nir_emit_alu(abld, nir_instr_as_alu(instr)); + nir_emit_alu(abld, nir_instr_as_alu(instr), true); break; case nir_instr_type_deref: - /* Derefs can exist for images but they do nothing */ + unreachable("All derefs should've been lowered"); break; case nir_instr_type_intrinsic: @@ -558,14 +562,8 @@ bool fs_visitor::optimize_frontfacing_ternary(nir_alu_instr *instr, const fs_reg &result) { - if (!instr->src[0].src.is_ssa || - instr->src[0].src.ssa->parent_instr->type != nir_instr_type_intrinsic) - return false; - - nir_intrinsic_instr *src0 = - nir_instr_as_intrinsic(instr->src[0].src.ssa->parent_instr); - - if (src0->intrinsic != nir_intrinsic_load_front_face) + nir_intrinsic_instr *src0 = nir_src_as_intrinsic(instr->src[0].src); + if (src0 == NULL || src0->intrinsic != nir_intrinsic_load_front_face) return false; if (!nir_src_is_const(instr->src[1].src) || @@ -582,7 +580,24 @@ fs_visitor::optimize_frontfacing_ternary(nir_alu_instr *instr, fs_reg tmp = vgrf(glsl_type::int_type); - if (devinfo->gen >= 6) { + if (devinfo->gen >= 12) { + /* Bit 15 of g1.1 is 0 if the polygon is front facing. */ + fs_reg g1 = fs_reg(retype(brw_vec1_grf(1, 1), BRW_REGISTER_TYPE_W)); + + /* For (gl_FrontFacing ? 
1.0 : -1.0), emit: + * + * or(8) tmp.1<2>W g0.0<0,1,0>W 0x00003f80W + * and(8) dst<1>D tmp<8,8,1>D 0xbf800000D + * + * and negate the result for (gl_FrontFacing ? -1.0 : 1.0). + */ + bld.OR(subscript(tmp, BRW_REGISTER_TYPE_W, 1), + g1, brw_imm_uw(0x3f80)); + + if (value1 == -1.0f) + bld.MOV(tmp, negate(tmp)); + + } else if (devinfo->gen >= 6) { /* Bit 15 of g0.0 is 0 if the polygon is front facing. */ fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W)); @@ -688,6 +703,16 @@ brw_rnd_mode_from_nir_op (const nir_op op) { } } +static brw_rnd_mode +brw_rnd_mode_from_execution_mode(unsigned execution_mode) +{ + if (nir_has_any_rounding_mode_rtne(execution_mode)) + return BRW_RND_MODE_RTNE; + if (nir_has_any_rounding_mode_rtz(execution_mode)) + return BRW_RND_MODE_RTZ; + return BRW_RND_MODE_UNSPECIFIED; +} + fs_reg fs_visitor::prepare_alu_destination_and_sources(const fs_builder &bld, nir_alu_instr *instr, @@ -716,8 +741,7 @@ fs_visitor::prepare_alu_destination_and_sources(const fs_builder &bld, * instructions. */ switch (instr->op) { - case nir_op_imov: - case nir_op_fmov: + case nir_op_mov: case nir_op_vec2: case nir_op_vec3: case nir_op_vec4: @@ -754,8 +778,7 @@ fs_visitor::resolve_inot_sources(const fs_builder &bld, nir_alu_instr *instr, fs_reg *op) { for (unsigned i = 0; i < 2; i++) { - nir_alu_instr *const inot_instr = - nir_src_as_alu_instr(&instr->src[i].src); + nir_alu_instr *inot_instr = nir_src_as_alu_instr(instr->src[i].src); if (inot_instr != NULL && inot_instr->op == nir_op_inot && !inot_instr->src[0].abs && !inot_instr->src[0].negate) { @@ -778,7 +801,7 @@ fs_visitor::try_emit_b2fi_of_inot(const fs_builder &bld, if (devinfo->gen < 6 || devinfo->gen >= 12) return false; - nir_alu_instr *const inot_instr = nir_src_as_alu_instr(&instr->src[0].src); + nir_alu_instr *inot_instr = nir_src_as_alu_instr(instr->src[0].src); if (inot_instr == NULL || inot_instr->op != nir_op_inot) return false; @@ -808,18 +831,199 @@ fs_visitor::try_emit_b2fi_of_inot(const fs_builder &bld, return true; } +/** + * Emit code for nir_op_fsign possibly fused with a nir_op_fmul + * + * If \c instr is not the \c nir_op_fsign, then \c fsign_src is the index of + * the source of \c instr that is a \c nir_op_fsign. + */ void -fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr) +fs_visitor::emit_fsign(const fs_builder &bld, const nir_alu_instr *instr, + fs_reg result, fs_reg *op, unsigned fsign_src) +{ + fs_inst *inst; + + assert(instr->op == nir_op_fsign || instr->op == nir_op_fmul); + assert(fsign_src < nir_op_infos[instr->op].num_inputs); + + if (instr->op != nir_op_fsign) { + const nir_alu_instr *const fsign_instr = + nir_src_as_alu_instr(instr->src[fsign_src].src); + + assert(!fsign_instr->dest.saturate); + + /* op[fsign_src] has the nominal result of the fsign, and op[1 - + * fsign_src] has the other multiply source. This must be rearranged so + * that op[0] is the source of the fsign op[1] is the other multiply + * source. + */ + if (fsign_src != 0) + op[1] = op[0]; + + op[0] = get_nir_src(fsign_instr->src[0].src); + + const nir_alu_type t = + (nir_alu_type)(nir_op_infos[instr->op].input_types[0] | + nir_src_bit_size(fsign_instr->src[0].src)); + + op[0].type = brw_type_for_nir_type(devinfo, t); + op[0].abs = fsign_instr->src[0].abs; + op[0].negate = fsign_instr->src[0].negate; + + unsigned channel = 0; + if (nir_op_infos[instr->op].output_size == 0) { + /* Since NIR is doing the scalarizing for us, we should only ever see + * vectorized operations with a single channel. 
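+          * For example, a hypothetical fmul of fsign(v.y) reaches here with
+          * write_mask == 0x2, so channel becomes ffs(0x2) - 1 == 1 and
+          * swizzle[1] picks the right component in the offset() below.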
+ */ + assert(util_bitcount(instr->dest.write_mask) == 1); + channel = ffs(instr->dest.write_mask) - 1; + } + + op[0] = offset(op[0], bld, fsign_instr->src[0].swizzle[channel]); + } else { + assert(!instr->dest.saturate); + } + + if (op[0].abs) { + /* Straightforward since the source can be assumed to be either strictly + * >= 0 or strictly <= 0 depending on the setting of the negate flag. + */ + set_condmod(BRW_CONDITIONAL_NZ, bld.MOV(result, op[0])); + + if (instr->op == nir_op_fsign) { + inst = (op[0].negate) + ? bld.MOV(result, brw_imm_f(-1.0f)) + : bld.MOV(result, brw_imm_f(1.0f)); + } else { + op[1].negate = (op[0].negate != op[1].negate); + inst = bld.MOV(result, op[1]); + } + + set_predicate(BRW_PREDICATE_NORMAL, inst); + } else if (type_sz(op[0].type) == 2) { + /* AND(val, 0x8000) gives the sign bit. + * + * Predicated OR ORs 1.0 (0x3c00) with the sign bit if val is not zero. + */ + fs_reg zero = retype(brw_imm_uw(0), BRW_REGISTER_TYPE_HF); + bld.CMP(bld.null_reg_f(), op[0], zero, BRW_CONDITIONAL_NZ); + + op[0].type = BRW_REGISTER_TYPE_UW; + result.type = BRW_REGISTER_TYPE_UW; + bld.AND(result, op[0], brw_imm_uw(0x8000u)); + + if (instr->op == nir_op_fsign) + inst = bld.OR(result, result, brw_imm_uw(0x3c00u)); + else { + /* Use XOR here to get the result sign correct. */ + inst = bld.XOR(result, result, retype(op[1], BRW_REGISTER_TYPE_UW)); + } + + inst->predicate = BRW_PREDICATE_NORMAL; + } else if (type_sz(op[0].type) == 4) { + /* AND(val, 0x80000000) gives the sign bit. + * + * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not + * zero. + */ + bld.CMP(bld.null_reg_f(), op[0], brw_imm_f(0.0f), BRW_CONDITIONAL_NZ); + + op[0].type = BRW_REGISTER_TYPE_UD; + result.type = BRW_REGISTER_TYPE_UD; + bld.AND(result, op[0], brw_imm_ud(0x80000000u)); + + if (instr->op == nir_op_fsign) + inst = bld.OR(result, result, brw_imm_ud(0x3f800000u)); + else { + /* Use XOR here to get the result sign correct. */ + inst = bld.XOR(result, result, retype(op[1], BRW_REGISTER_TYPE_UD)); + } + + inst->predicate = BRW_PREDICATE_NORMAL; + } else { + /* For doubles we do the same but we need to consider: + * + * - 2-src instructions can't operate with 64-bit immediates + * - The sign is encoded in the high 32-bit of each DF + * - We need to produce a DF result. + */ + + fs_reg zero = vgrf(glsl_type::double_type); + bld.MOV(zero, setup_imm_df(bld, 0.0)); + bld.CMP(bld.null_reg_df(), op[0], zero, BRW_CONDITIONAL_NZ); + + bld.MOV(result, zero); + + fs_reg r = subscript(result, BRW_REGISTER_TYPE_UD, 1); + bld.AND(r, subscript(op[0], BRW_REGISTER_TYPE_UD, 1), + brw_imm_ud(0x80000000u)); + + if (instr->op == nir_op_fsign) { + set_predicate(BRW_PREDICATE_NORMAL, + bld.OR(r, r, brw_imm_ud(0x3ff00000u))); + } else { + /* This could be done better in some cases. If the scale is an + * immediate with the low 32-bits all 0, emitting a separate XOR and + * OR would allow an algebraic optimization to remove the OR. There + * are currently zero instances of fsign(double(x))*IMM in shader-db + * or any test suite, so it is hard to care at this time. + */ + fs_reg result_int64 = retype(result, BRW_REGISTER_TYPE_UQ); + inst = bld.XOR(result_int64, result_int64, + retype(op[1], BRW_REGISTER_TYPE_UQ)); + } + } +} + +/** + * Deteremine whether sources of a nir_op_fmul can be fused with a nir_op_fsign + * + * Checks the operands of a \c nir_op_fmul to determine whether or not + * \c emit_fsign could fuse the multiplication with the \c sign() calculation. 
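+ * For example, fsign(x) * y can then be emitted as the predicated AND/XOR
+ * sequence on the bit pattern of y instead of a separate sign() and MUL.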
+ * + * \param instr The multiplication instruction + * + * \param fsign_src The source of \c instr that may or may not be a + * \c nir_op_fsign + */ +static bool +can_fuse_fmul_fsign(nir_alu_instr *instr, unsigned fsign_src) +{ + assert(instr->op == nir_op_fmul); + + nir_alu_instr *const fsign_instr = + nir_src_as_alu_instr(instr->src[fsign_src].src); + + /* Rules: + * + * 1. instr->src[fsign_src] must be a nir_op_fsign. + * 2. The nir_op_fsign can only be used by this multiplication. + * 3. The source that is the nir_op_fsign does not have source modifiers. + * \c emit_fsign only examines the source modifiers of the source of the + * \c nir_op_fsign. + * + * The nir_op_fsign must also not have the saturate modifier, but steps + * have already been taken (in nir_opt_algebraic) to ensure that. + */ + return fsign_instr != NULL && fsign_instr->op == nir_op_fsign && + is_used_once(fsign_instr) && + !instr->src[fsign_src].abs && !instr->src[fsign_src].negate; +} + +void +fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr, + bool need_dest) { struct brw_wm_prog_key *fs_key = (struct brw_wm_prog_key *) this->key; fs_inst *inst; + unsigned execution_mode = + bld.shader->nir->info.float_controls_execution_mode; fs_reg op[4]; - fs_reg result = prepare_alu_destination_and_sources(bld, instr, op, true); + fs_reg result = prepare_alu_destination_and_sources(bld, instr, op, need_dest); switch (instr->op) { - case nir_op_imov: - case nir_op_fmov: + case nir_op_mov: case nir_op_vec2: case nir_op_vec3: case nir_op_vec4: { @@ -838,7 +1042,7 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr) if (!(instr->dest.write_mask & (1 << i))) continue; - if (instr->op == nir_op_imov || instr->op == nir_op_fmov) { + if (instr->op == nir_op_mov) { inst = bld.MOV(offset(temp, bld, i), offset(op[0], bld, instr->src[0].swizzle[i])); } else { @@ -873,9 +1077,16 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr) case nir_op_f2f16_rtne: case nir_op_f2f16_rtz: - bld.emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(), - brw_imm_d(brw_rnd_mode_from_nir_op(instr->op))); - /* fallthrough */ + case nir_op_f2f16: { + brw_rnd_mode rnd = BRW_RND_MODE_UNSPECIFIED; + + if (nir_op_f2f16 == instr->op) + rnd = brw_rnd_mode_from_execution_mode(execution_mode); + else + rnd = brw_rnd_mode_from_nir_op(instr->op); + + if (BRW_RND_MODE_UNSPECIFIED != rnd) + bld.emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(), brw_imm_d(rnd)); /* In theory, it would be better to use BRW_OPCODE_F32TO16. Depending * on the HW gen, it is a special hw opcode or just a MOV, and @@ -886,11 +1097,11 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr) * only for gen8+, it will be better to use directly the MOV, and use * BRW_OPCODE_F32TO16 when/if we work for HF support on gen7. 
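+       * For example, with an RTZ float-controls execution mode the
+       * SHADER_OPCODE_RND_MODE emitted above makes this MOV truncate
+       * instead of using the default round-to-nearest-even.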
*/ - - case nir_op_f2f16: + assert(type_sz(op[0].type) < 8); /* brw_nir_lower_conversions */ inst = bld.MOV(result, op[0]); inst->saturate = instr->dest.saturate; break; + } case nir_op_b2i8: case nir_op_b2i16: @@ -904,82 +1115,81 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr) op[0].type = BRW_REGISTER_TYPE_D; op[0].negate = !op[0].negate; /* fallthrough */ - case nir_op_f2f64: - case nir_op_f2i64: - case nir_op_f2u64: case nir_op_i2f64: case nir_op_i2i64: case nir_op_u2f64: case nir_op_u2u64: - case nir_op_f2f32: - case nir_op_f2i32: - case nir_op_f2u32: - case nir_op_f2i16: - case nir_op_f2u16: + case nir_op_f2f64: + case nir_op_f2i64: + case nir_op_f2u64: case nir_op_i2i32: case nir_op_u2u32: - case nir_op_i2i16: - case nir_op_u2u16: + case nir_op_f2i32: + case nir_op_f2u32: case nir_op_i2f16: + case nir_op_i2i16: case nir_op_u2f16: + case nir_op_u2u16: + case nir_op_f2i16: + case nir_op_f2u16: case nir_op_i2i8: case nir_op_u2u8: + case nir_op_f2i8: + case nir_op_f2u8: + if (result.type == BRW_REGISTER_TYPE_B || + result.type == BRW_REGISTER_TYPE_UB || + result.type == BRW_REGISTER_TYPE_HF) + assert(type_sz(op[0].type) < 8); /* brw_nir_lower_conversions */ + + if (op[0].type == BRW_REGISTER_TYPE_B || + op[0].type == BRW_REGISTER_TYPE_UB || + op[0].type == BRW_REGISTER_TYPE_HF) + assert(type_sz(result.type) < 8); /* brw_nir_lower_conversions */ + inst = bld.MOV(result, op[0]); inst->saturate = instr->dest.saturate; break; - case nir_op_fsign: { - assert(!instr->dest.saturate); - if (op[0].abs) { - /* Straightforward since the source can be assumed to be either - * strictly >= 0 or strictly <= 0 depending on the setting of the - * negate flag. - */ - set_condmod(BRW_CONDITIONAL_NZ, bld.MOV(result, op[0])); - - inst = (op[0].negate) - ? bld.MOV(result, brw_imm_f(-1.0f)) - : bld.MOV(result, brw_imm_f(1.0f)); - - set_predicate(BRW_PREDICATE_NORMAL, inst); - } else if (type_sz(op[0].type) < 8) { - /* AND(val, 0x80000000) gives the sign bit. - * - * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not - * zero. - */ - bld.CMP(bld.null_reg_f(), op[0], brw_imm_f(0.0f), BRW_CONDITIONAL_NZ); + case nir_op_fsat: + inst = bld.MOV(result, op[0]); + inst->saturate = true; + break; - fs_reg result_int = retype(result, BRW_REGISTER_TYPE_UD); - op[0].type = BRW_REGISTER_TYPE_UD; - result.type = BRW_REGISTER_TYPE_UD; - bld.AND(result_int, op[0], brw_imm_ud(0x80000000u)); + case nir_op_fneg: + case nir_op_ineg: + op[0].negate = true; + inst = bld.MOV(result, op[0]); + if (instr->op == nir_op_fneg) + inst->saturate = instr->dest.saturate; + break; - inst = bld.OR(result_int, result_int, brw_imm_ud(0x3f800000u)); - inst->predicate = BRW_PREDICATE_NORMAL; - } else { - /* For doubles we do the same but we need to consider: - * - * - 2-src instructions can't operate with 64-bit immediates - * - The sign is encoded in the high 32-bit of each DF - * - We need to produce a DF result. 
- */ + case nir_op_fabs: + case nir_op_iabs: + op[0].negate = false; + op[0].abs = true; + inst = bld.MOV(result, op[0]); + if (instr->op == nir_op_fabs) + inst->saturate = instr->dest.saturate; + break; - fs_reg zero = vgrf(glsl_type::double_type); - bld.MOV(zero, setup_imm_df(bld, 0.0)); - bld.CMP(bld.null_reg_df(), op[0], zero, BRW_CONDITIONAL_NZ); + case nir_op_f2f32: + if (nir_has_any_rounding_mode_enabled(execution_mode)) { + brw_rnd_mode rnd = + brw_rnd_mode_from_execution_mode(execution_mode); + bld.emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(), + brw_imm_d(rnd)); + } - bld.MOV(result, zero); + if (op[0].type == BRW_REGISTER_TYPE_HF) + assert(type_sz(result.type) < 8); /* brw_nir_lower_conversions */ - fs_reg r = subscript(result, BRW_REGISTER_TYPE_UD, 1); - bld.AND(r, subscript(op[0], BRW_REGISTER_TYPE_UD, 1), - brw_imm_ud(0x80000000u)); + inst = bld.MOV(result, op[0]); + inst->saturate = instr->dest.saturate; + break; - set_predicate(BRW_PREDICATE_NORMAL, - bld.OR(r, r, brw_imm_ud(0x3ff00000u))); - } + case nir_op_fsign: + emit_fsign(bld, instr, result, op, 0); break; - } case nir_op_frcp: inst = bld.emit(SHADER_OPCODE_RCP, result, op[0]); @@ -1039,8 +1249,15 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr) inst->saturate = instr->dest.saturate; break; - case nir_op_iadd: case nir_op_fadd: + if (nir_has_any_rounding_mode_enabled(execution_mode)) { + brw_rnd_mode rnd = + brw_rnd_mode_from_execution_mode(execution_mode); + bld.emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(), + brw_imm_d(rnd)); + } + /* fallthrough */ + case nir_op_iadd: inst = bld.ADD(result, op[0], op[1]); inst->saturate = instr->dest.saturate; break; @@ -1051,6 +1268,24 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr) break; case nir_op_fmul: + for (unsigned i = 0; i < 2; i++) { + if (can_fuse_fmul_fsign(instr, i)) { + emit_fsign(bld, instr, result, op, i); + return; + } + } + + /* We emit the rounding mode after the previous fsign optimization since + * it won't result in a MUL, but will try to negate the value by other + * means. + */ + if (nir_has_any_rounding_mode_enabled(execution_mode)) { + brw_rnd_mode rnd = + brw_rnd_mode_from_execution_mode(execution_mode); + bld.emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(), + brw_imm_d(rnd)); + } + inst = bld.MUL(result, op[0], op[1]); inst->saturate = instr->dest.saturate; break; @@ -1134,25 +1369,7 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr) if (bit_size != 32) dest = bld.vgrf(op[0].type, 1); - brw_conditional_mod cond; - switch (instr->op) { - case nir_op_flt32: - cond = BRW_CONDITIONAL_L; - break; - case nir_op_fge32: - cond = BRW_CONDITIONAL_GE; - break; - case nir_op_feq32: - cond = BRW_CONDITIONAL_Z; - break; - case nir_op_fne32: - cond = BRW_CONDITIONAL_NZ; - break; - default: - unreachable("bad opcode"); - } - - bld.CMP(dest, op[0], op[1], cond); + bld.CMP(dest, op[0], op[1], brw_cmod_for_nir_comparison(instr->op)); if (bit_size > 32) { bld.MOV(result, subscript(dest, BRW_REGISTER_TYPE_UD, 0)); @@ -1176,30 +1393,19 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr) case nir_op_ine32: { fs_reg dest = result; + /* On Gen11 we have an additional issue being that src1 cannot be a byte + * type. So we convert both operands for the comparison. 
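+       * For example, a hypothetical ieq32 on two 8-bit sources compares the
+       * converted copies that fix_byte_src() produces below.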
+ */ + fs_reg temp_op[2]; + temp_op[0] = bld.fix_byte_src(op[0]); + temp_op[1] = bld.fix_byte_src(op[1]); + const uint32_t bit_size = nir_src_bit_size(instr->src[0].src); if (bit_size != 32) - dest = bld.vgrf(op[0].type, 1); + dest = bld.vgrf(temp_op[0].type, 1); - brw_conditional_mod cond; - switch (instr->op) { - case nir_op_ilt32: - case nir_op_ult32: - cond = BRW_CONDITIONAL_L; - break; - case nir_op_ige32: - case nir_op_uge32: - cond = BRW_CONDITIONAL_GE; - break; - case nir_op_ieq32: - cond = BRW_CONDITIONAL_Z; - break; - case nir_op_ine32: - cond = BRW_CONDITIONAL_NZ; - break; - default: - unreachable("bad opcode"); - } - bld.CMP(dest, op[0], op[1], cond); + bld.CMP(dest, temp_op[0], temp_op[1], + brw_cmod_for_nir_comparison(instr->op)); if (bit_size > 32) { bld.MOV(result, subscript(dest, BRW_REGISTER_TYPE_UD, 0)); @@ -1217,8 +1423,7 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr) case nir_op_inot: if (devinfo->gen >= 8) { - nir_alu_instr *const inot_src_instr = - nir_src_as_alu_instr(&instr->src[0].src); + nir_alu_instr *inot_src_instr = nir_src_as_alu_instr(instr->src[0].src); if (inot_src_instr != NULL && (inot_src_instr->op == nir_op_ior || @@ -1466,11 +1671,18 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr) case nir_op_pack_half_2x16: unreachable("not reached: should be handled by lower_packing_builtins"); + case nir_op_unpack_half_2x16_split_x_flush_to_zero: + assert(FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16 & execution_mode); + /* Fall-through */ case nir_op_unpack_half_2x16_split_x: inst = bld.emit(BRW_OPCODE_F16TO32, result, subscript(op[0], BRW_REGISTER_TYPE_UW, 0)); inst->saturate = instr->dest.saturate; break; + + case nir_op_unpack_half_2x16_split_y_flush_to_zero: + assert(FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16 & execution_mode); + /* Fall-through */ case nir_op_unpack_half_2x16_split_y: inst = bld.emit(BRW_OPCODE_F16TO32, result, subscript(op[0], BRW_REGISTER_TYPE_UW, 1)); @@ -1598,16 +1810,37 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr) bld.SHR(result, op[0], op[1]); break; + case nir_op_urol: + bld.ROL(result, op[0], op[1]); + break; + case nir_op_uror: + bld.ROR(result, op[0], op[1]); + break; + case nir_op_pack_half_2x16_split: bld.emit(FS_OPCODE_PACK_HALF_2x16_SPLIT, result, op[0], op[1]); break; case nir_op_ffma: + if (nir_has_any_rounding_mode_enabled(execution_mode)) { + brw_rnd_mode rnd = + brw_rnd_mode_from_execution_mode(execution_mode); + bld.emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(), + brw_imm_d(rnd)); + } + inst = bld.MAD(result, op[2], op[1], op[0]); inst->saturate = instr->dest.saturate; break; case nir_op_flrp: + if (nir_has_any_rounding_mode_enabled(execution_mode)) { + brw_rnd_mode rnd = + brw_rnd_mode_from_execution_mode(execution_mode); + bld.emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(), + brw_imm_d(rnd)); + } + inst = bld.LRP(result, op[0], op[1], op[2]); inst->saturate = instr->dest.saturate; break; @@ -1675,6 +1908,7 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr) * to sign extend the low bit to 0/~0 */ if (devinfo->gen <= 5 && + !result.is_null() && (instr->instr.pass_flags & BRW_NIR_BOOLEAN_MASK) == BRW_NIR_BOOLEAN_NEEDS_RESOLVE) { fs_reg masked = vgrf(glsl_type::int_type); bld.AND(masked, result, brw_imm_d(1)); @@ -1790,6 +2024,7 @@ fs_visitor::get_nir_dest(const nir_dest &dest) BRW_REGISTER_TYPE_F); nir_ssa_values[dest.ssa.index] = bld.vgrf(reg_type, dest.ssa.num_components); + bld.UNDEF(nir_ssa_values[dest.ssa.index]); return 
nir_ssa_values[dest.ssa.index]; } else { /* We don't handle indirects on locals */ @@ -2173,13 +2408,12 @@ fs_visitor::emit_gs_input_load(const fs_reg &dst, unsigned num_components, unsigned first_component) { + assert(type_sz(dst.type) == 4); struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data); const unsigned push_reg_count = gs_prog_data->base.urb_read_length * 8; /* TODO: figure out push input layout for invocations == 1 */ - /* TODO: make this work with 64-bit inputs */ if (gs_prog_data->invocations == 1 && - type_sz(dst.type) <= 4 && nir_src_is_const(offset_src) && nir_src_is_const(vertex_src) && 4 * (base_offset + nir_src_as_uint(offset_src)) < push_reg_count) { int imm_offset = (base_offset + nir_src_as_uint(offset_src)) * 4 + @@ -2273,87 +2507,50 @@ fs_visitor::emit_gs_input_load(const fs_reg &dst, } fs_inst *inst; - - fs_reg tmp_dst = dst; fs_reg indirect_offset = get_nir_src(offset_src); - unsigned num_iterations = 1; - unsigned orig_num_components = num_components; - if (type_sz(dst.type) == 8) { - if (num_components > 2) { - num_iterations = 2; - num_components = 2; - } - fs_reg tmp = fs_reg(VGRF, alloc.allocate(4), dst.type); - tmp_dst = tmp; - first_component = first_component / 2; - } - - for (unsigned iter = 0; iter < num_iterations; iter++) { - if (nir_src_is_const(offset_src)) { - /* Constant indexing - use global offset. */ - if (first_component != 0) { - unsigned read_components = num_components + first_component; - fs_reg tmp = bld.vgrf(dst.type, read_components); - inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp, icp_handle); - inst->size_written = read_components * - tmp.component_size(inst->exec_size); - for (unsigned i = 0; i < num_components; i++) { - bld.MOV(offset(tmp_dst, bld, i), - offset(tmp, bld, i + first_component)); - } - } else { - inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp_dst, - icp_handle); - inst->size_written = num_components * - tmp_dst.component_size(inst->exec_size); - } - inst->offset = base_offset + nir_src_as_uint(offset_src); - inst->mlen = 1; - } else { - /* Indirect indexing - use per-slot offsets as well. */ - const fs_reg srcs[] = { icp_handle, indirect_offset }; + if (nir_src_is_const(offset_src)) { + /* Constant indexing - use global offset. 
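+       * The slot index is folded into inst->offset below, so the message
+       * payload is only the ICP handle (hence mlen == 1).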
*/ + if (first_component != 0) { unsigned read_components = num_components + first_component; fs_reg tmp = bld.vgrf(dst.type, read_components); - fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2); - bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0); - if (first_component != 0) { - inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp, - payload); - inst->size_written = read_components * - tmp.component_size(inst->exec_size); - for (unsigned i = 0; i < num_components; i++) { - bld.MOV(offset(tmp_dst, bld, i), - offset(tmp, bld, i + first_component)); - } - } else { - inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp_dst, - payload); - inst->size_written = num_components * - tmp_dst.component_size(inst->exec_size); + inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp, icp_handle); + inst->size_written = read_components * + tmp.component_size(inst->exec_size); + for (unsigned i = 0; i < num_components; i++) { + bld.MOV(offset(dst, bld, i), + offset(tmp, bld, i + first_component)); } - inst->offset = base_offset; - inst->mlen = 2; - } - - if (type_sz(dst.type) == 8) { - shuffle_from_32bit_read(bld, - offset(dst, bld, iter * 2), - retype(tmp_dst, BRW_REGISTER_TYPE_D), - 0, - num_components); + } else { + inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dst, icp_handle); + inst->size_written = num_components * + dst.component_size(inst->exec_size); } - - if (num_iterations > 1) { - num_components = orig_num_components - 2; - if(nir_src_is_const(offset_src)) { - base_offset++; - } else { - fs_reg new_indirect = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); - bld.ADD(new_indirect, indirect_offset, brw_imm_ud(1u)); - indirect_offset = new_indirect; + inst->offset = base_offset + nir_src_as_uint(offset_src); + inst->mlen = 1; + } else { + /* Indirect indexing - use per-slot offsets as well. */ + const fs_reg srcs[] = { icp_handle, indirect_offset }; + unsigned read_components = num_components + first_component; + fs_reg tmp = bld.vgrf(dst.type, read_components); + fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2); + bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0); + if (first_component != 0) { + inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp, + payload); + inst->size_written = read_components * + tmp.component_size(inst->exec_size); + for (unsigned i = 0; i < num_components; i++) { + bld.MOV(offset(dst, bld, i), + offset(tmp, bld, i + first_component)); } + } else { + inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dst, payload); + inst->size_written = num_components * + dst.component_size(inst->exec_size); } + inst->offset = base_offset; + inst->mlen = 2; } } @@ -2390,20 +2587,13 @@ fs_visitor::nir_emit_vs_intrinsic(const fs_builder &bld, unreachable("should be lowered by nir_lower_system_values()"); case nir_intrinsic_load_input: { + assert(nir_dest_bit_size(instr->dest) == 32); fs_reg src = fs_reg(ATTR, nir_intrinsic_base(instr) * 4, dest.type); - unsigned first_component = nir_intrinsic_component(instr); - unsigned num_components = instr->num_components; - + src = offset(src, bld, nir_intrinsic_component(instr)); src = offset(src, bld, nir_src_as_uint(instr->src[0])); - if (type_sz(dest.type) == 8) - first_component /= 2; - - /* For 16-bit support maybe a temporary will be needed to copy from - * the ATTR file. 
- */ - shuffle_from_32bit_read(bld, dest, retype(src, BRW_REGISTER_TYPE_D), - first_component, num_components); + for (unsigned i = 0; i < instr->num_components; i++) + bld.MOV(offset(dest, bld, i), offset(src, bld, i)); break; } @@ -2421,6 +2611,116 @@ fs_visitor::nir_emit_vs_intrinsic(const fs_builder &bld, } } +fs_reg +fs_visitor::get_tcs_single_patch_icp_handle(const fs_builder &bld, + nir_intrinsic_instr *instr) +{ + struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(prog_data); + const nir_src &vertex_src = instr->src[0]; + nir_intrinsic_instr *vertex_intrin = nir_src_as_intrinsic(vertex_src); + fs_reg icp_handle; + + if (nir_src_is_const(vertex_src)) { + /* Emit a MOV to resolve <0,1,0> regioning. */ + icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); + unsigned vertex = nir_src_as_uint(vertex_src); + bld.MOV(icp_handle, + retype(brw_vec1_grf(1 + (vertex >> 3), vertex & 7), + BRW_REGISTER_TYPE_UD)); + } else if (tcs_prog_data->instances == 1 && vertex_intrin && + vertex_intrin->intrinsic == nir_intrinsic_load_invocation_id) { + /* For the common case of only 1 instance, an array index of + * gl_InvocationID means reading g1. Skip all the indirect work. + */ + icp_handle = retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD); + } else { + /* The vertex index is non-constant. We need to use indirect + * addressing to fetch the proper URB handle. + */ + icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); + + /* Each ICP handle is a single DWord (4 bytes) */ + fs_reg vertex_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); + bld.SHL(vertex_offset_bytes, + retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD), + brw_imm_ud(2u)); + + /* Start at g1. We might read up to 4 registers. */ + bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle, + retype(brw_vec8_grf(1, 0), icp_handle.type), vertex_offset_bytes, + brw_imm_ud(4 * REG_SIZE)); + } + + return icp_handle; +} + +fs_reg +fs_visitor::get_tcs_eight_patch_icp_handle(const fs_builder &bld, + nir_intrinsic_instr *instr) +{ + struct brw_tcs_prog_key *tcs_key = (struct brw_tcs_prog_key *) key; + struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(prog_data); + const nir_src &vertex_src = instr->src[0]; + + unsigned first_icp_handle = tcs_prog_data->include_primitive_id ? 3 : 2; + + if (nir_src_is_const(vertex_src)) { + return fs_reg(retype(brw_vec8_grf(first_icp_handle + + nir_src_as_uint(vertex_src), 0), + BRW_REGISTER_TYPE_UD)); + } + + /* The vertex index is non-constant. We need to use indirect + * addressing to fetch the proper URB handle. + * + * First, we start with the sequence <7, 6, 5, 4, 3, 2, 1, 0> + * indicating that channel should read the handle from + * DWord . We convert that to bytes by multiplying by 4. + * + * Next, we convert the vertex index to bytes by multiplying + * by 32 (shifting by 5), and add the two together. This is + * the final indirect byte offset. 
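+    * For example, vertex_index 2 in channel 3 gives 2 * 32 + 3 * 4 == 76
+    * bytes, i.e. DWord 3 of the third handle register.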
+ */ + fs_reg icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); + fs_reg sequence = bld.vgrf(BRW_REGISTER_TYPE_UW, 1); + fs_reg channel_offsets = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); + fs_reg vertex_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); + fs_reg icp_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); + + /* sequence = <7, 6, 5, 4, 3, 2, 1, 0> */ + bld.MOV(sequence, fs_reg(brw_imm_v(0x76543210))); + /* channel_offsets = 4 * sequence = <28, 24, 20, 16, 12, 8, 4, 0> */ + bld.SHL(channel_offsets, sequence, brw_imm_ud(2u)); + /* Convert vertex_index to bytes (multiply by 32) */ + bld.SHL(vertex_offset_bytes, + retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD), + brw_imm_ud(5u)); + bld.ADD(icp_offset_bytes, vertex_offset_bytes, channel_offsets); + + /* Use first_icp_handle as the base offset. There is one register + * of URB handles per vertex, so inform the register allocator that + * we might read up to nir->info.gs.vertices_in registers. + */ + bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle, + retype(brw_vec8_grf(first_icp_handle, 0), icp_handle.type), + icp_offset_bytes, brw_imm_ud(tcs_key->input_vertices * REG_SIZE)); + + return icp_handle; +} + +struct brw_reg +fs_visitor::get_tcs_output_urb_handle() +{ + struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(prog_data); + + if (vue_prog_data->dispatch_mode == DISPATCH_MODE_TCS_SINGLE_PATCH) { + return retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD); + } else { + assert(vue_prog_data->dispatch_mode == DISPATCH_MODE_TCS_8_PATCH); + return retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD); + } +} + void fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr) @@ -2428,6 +2728,10 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld, assert(stage == MESA_SHADER_TESS_CTRL); struct brw_tcs_prog_key *tcs_key = (struct brw_tcs_prog_key *) key; struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(prog_data); + struct brw_vue_prog_data *vue_prog_data = &tcs_prog_data->base; + + bool eight_patch = + vue_prog_data->dispatch_mode == DISPATCH_MODE_TCS_8_PATCH; fs_reg dst; if (nir_intrinsic_infos[instr->intrinsic].has_dest) @@ -2435,7 +2739,8 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld, switch (instr->intrinsic) { case nir_intrinsic_load_primitive_id: - bld.MOV(dst, fs_reg(brw_vec1_grf(0, 1))); + bld.MOV(dst, fs_reg(eight_patch ? brw_vec8_grf(2, 0) + : brw_vec1_grf(0, 1))); break; case nir_intrinsic_load_invocation_id: bld.MOV(retype(dst, invocation_id.type), invocation_id); @@ -2457,16 +2762,26 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld, /* Zero the message header */ bld.exec_all().MOV(m0, brw_imm_ud(0u)); - /* Copy "Barrier ID" from r0.2, bits 16:13 */ - chanbld.AND(m0_2, retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD), - brw_imm_ud(INTEL_MASK(16, 13))); + if (devinfo->gen < 11) { + /* Copy "Barrier ID" from r0.2, bits 16:13 */ + chanbld.AND(m0_2, retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD), + brw_imm_ud(INTEL_MASK(16, 13))); - /* Shift it up to bits 27:24. */ - chanbld.SHL(m0_2, m0_2, brw_imm_ud(11)); + /* Shift it up to bits 27:24. 
*/ + chanbld.SHL(m0_2, m0_2, brw_imm_ud(11)); + } else { + chanbld.AND(m0_2, retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD), + brw_imm_ud(INTEL_MASK(30, 24))); + } /* Set the Barrier Count and the enable bit */ - chanbld.OR(m0_2, m0_2, - brw_imm_ud(tcs_prog_data->instances << 9 | (1 << 15))); + if (devinfo->gen < 11) { + chanbld.OR(m0_2, m0_2, + brw_imm_ud(tcs_prog_data->instances << 9 | (1 << 15))); + } else { + chanbld.OR(m0_2, m0_2, + brw_imm_ud(tcs_prog_data->instances << 8 | (1 << 15))); + } bld.emit(SHADER_OPCODE_BARRIER, bld.null_reg_ud(), m0); break; @@ -2477,153 +2792,90 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld, break; case nir_intrinsic_load_per_vertex_input: { + assert(nir_dest_bit_size(instr->dest) == 32); fs_reg indirect_offset = get_indirect_offset(instr); unsigned imm_offset = instr->const_index[0]; - - const nir_src &vertex_src = instr->src[0]; - fs_inst *inst; - fs_reg icp_handle; - - if (nir_src_is_const(vertex_src)) { - /* Emit a MOV to resolve <0,1,0> regioning. */ - icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); - unsigned vertex = nir_src_as_uint(vertex_src); - bld.MOV(icp_handle, - retype(brw_vec1_grf(1 + (vertex >> 3), vertex & 7), - BRW_REGISTER_TYPE_UD)); - } else if (tcs_prog_data->instances == 1 && - vertex_src.is_ssa && - vertex_src.ssa->parent_instr->type == nir_instr_type_intrinsic && - nir_instr_as_intrinsic(vertex_src.ssa->parent_instr)->intrinsic == nir_intrinsic_load_invocation_id) { - /* For the common case of only 1 instance, an array index of - * gl_InvocationID means reading g1. Skip all the indirect work. - */ - icp_handle = retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD); - } else { - /* The vertex index is non-constant. We need to use indirect - * addressing to fetch the proper URB handle. - */ - icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); - - /* Each ICP handle is a single DWord (4 bytes) */ - fs_reg vertex_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); - bld.SHL(vertex_offset_bytes, - retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD), - brw_imm_ud(2u)); - - /* Start at g1. We might read up to 4 registers. */ - bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle, - retype(brw_vec8_grf(1, 0), icp_handle.type), vertex_offset_bytes, - brw_imm_ud(4 * REG_SIZE)); - } + fs_reg icp_handle = + eight_patch ? get_tcs_eight_patch_icp_handle(bld, instr) + : get_tcs_single_patch_icp_handle(bld, instr); /* We can only read two double components with each URB read, so * we send two read messages in that case, each one loading up to * two double components. */ - unsigned num_iterations = 1; unsigned num_components = instr->num_components; unsigned first_component = nir_intrinsic_component(instr); - fs_reg orig_dst = dst; - if (type_sz(dst.type) == 8) { - first_component = first_component / 2; - if (instr->num_components > 2) { - num_iterations = 2; - num_components = 2; - } - fs_reg tmp = fs_reg(VGRF, alloc.allocate(4), dst.type); - dst = tmp; - } - - for (unsigned iter = 0; iter < num_iterations; iter++) { - if (indirect_offset.file == BAD_FILE) { - /* Constant indexing - use global offset. 
*/ - if (first_component != 0) { - unsigned read_components = num_components + first_component; - fs_reg tmp = bld.vgrf(dst.type, read_components); - inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp, icp_handle); - for (unsigned i = 0; i < num_components; i++) { - bld.MOV(offset(dst, bld, i), - offset(tmp, bld, i + first_component)); - } - } else { - inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dst, icp_handle); + if (indirect_offset.file == BAD_FILE) { + /* Constant indexing - use global offset. */ + if (first_component != 0) { + unsigned read_components = num_components + first_component; + fs_reg tmp = bld.vgrf(dst.type, read_components); + inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp, icp_handle); + for (unsigned i = 0; i < num_components; i++) { + bld.MOV(offset(dst, bld, i), + offset(tmp, bld, i + first_component)); } - inst->offset = imm_offset; - inst->mlen = 1; } else { - /* Indirect indexing - use per-slot offsets as well. */ - const fs_reg srcs[] = { icp_handle, indirect_offset }; - fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2); - bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0); - if (first_component != 0) { - unsigned read_components = num_components + first_component; - fs_reg tmp = bld.vgrf(dst.type, read_components); - inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp, - payload); - for (unsigned i = 0; i < num_components; i++) { - bld.MOV(offset(dst, bld, i), - offset(tmp, bld, i + first_component)); - } - } else { - inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dst, - payload); - } - inst->offset = imm_offset; - inst->mlen = 2; - } - inst->size_written = (num_components + first_component) * - inst->dst.component_size(inst->exec_size); - - /* If we are reading 64-bit data using 32-bit read messages we need - * build proper 64-bit data elements by shuffling the low and high - * 32-bit components around like we do for other things like UBOs - * or SSBOs. - */ - if (type_sz(dst.type) == 8) { - shuffle_from_32bit_read(bld, - offset(orig_dst, bld, iter * 2), - retype(dst, BRW_REGISTER_TYPE_D), - 0, num_components); + inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dst, icp_handle); } - - /* Copy the temporary to the destination to deal with writemasking. - * - * Also attempt to deal with gl_PointSize being in the .w component. - */ - if (inst->offset == 0 && indirect_offset.file == BAD_FILE) { - assert(type_sz(dst.type) < 8); - inst->dst = bld.vgrf(dst.type, 4); - inst->size_written = 4 * REG_SIZE; - bld.MOV(dst, offset(inst->dst, bld, 3)); + inst->offset = imm_offset; + inst->mlen = 1; + } else { + /* Indirect indexing - use per-slot offsets as well. 
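+       * The payload below packs the ICP handle and the per-slot offsets
+       * into two registers, hence mlen == 2.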
*/ + const fs_reg srcs[] = { icp_handle, indirect_offset }; + fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2); + bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0); + if (first_component != 0) { + unsigned read_components = num_components + first_component; + fs_reg tmp = bld.vgrf(dst.type, read_components); + inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp, + payload); + for (unsigned i = 0; i < num_components; i++) { + bld.MOV(offset(dst, bld, i), + offset(tmp, bld, i + first_component)); + } + } else { + inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dst, + payload); } + inst->offset = imm_offset; + inst->mlen = 2; + } + inst->size_written = (num_components + first_component) * + inst->dst.component_size(inst->exec_size); - /* If we are loading double data and we need a second read message - * adjust the write offset - */ - if (num_iterations > 1) { - num_components = instr->num_components - 2; - imm_offset++; - } + /* Copy the temporary to the destination to deal with writemasking. + * + * Also attempt to deal with gl_PointSize being in the .w component. + */ + if (inst->offset == 0 && indirect_offset.file == BAD_FILE) { + assert(type_sz(dst.type) == 4); + inst->dst = bld.vgrf(dst.type, 4); + inst->size_written = 4 * REG_SIZE; + bld.MOV(dst, offset(inst->dst, bld, 3)); } break; } case nir_intrinsic_load_output: case nir_intrinsic_load_per_vertex_output: { + assert(nir_dest_bit_size(instr->dest) == 32); fs_reg indirect_offset = get_indirect_offset(instr); unsigned imm_offset = instr->const_index[0]; unsigned first_component = nir_intrinsic_component(instr); + struct brw_reg output_handles = get_tcs_output_urb_handle(); + fs_inst *inst; if (indirect_offset.file == BAD_FILE) { - /* Replicate the patch handle to all enabled channels */ + /* This MOV replicates the output handle to all enabled channels + * is SINGLE_PATCH mode. + */ fs_reg patch_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); - bld.MOV(patch_handle, - retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)); + bld.MOV(patch_handle, output_handles); { if (first_component != 0) { @@ -2647,10 +2899,7 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld, } } else { /* Indirect indexing - use per-slot offsets as well. */ - const fs_reg srcs[] = { - retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD), - indirect_offset - }; + const fs_reg srcs[] = { output_handles, indirect_offset }; fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2); bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0); if (first_component != 0) { @@ -2677,15 +2926,16 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld, case nir_intrinsic_store_output: case nir_intrinsic_store_per_vertex_output: { + assert(nir_src_bit_size(instr->src[0]) == 32); fs_reg value = get_nir_src(instr->src[0]); - bool is_64bit = (instr->src[0].is_ssa ? 
- instr->src[0].ssa->bit_size : instr->src[0].reg.reg->bit_size) == 64; fs_reg indirect_offset = get_indirect_offset(instr); unsigned imm_offset = instr->const_index[0]; unsigned mask = instr->const_index[1]; unsigned header_regs = 0; + struct brw_reg output_handles = get_tcs_output_urb_handle(); + fs_reg srcs[7]; - srcs[header_regs++] = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD); + srcs[header_regs++] = output_handles; if (indirect_offset.file != BAD_FILE) { srcs[header_regs++] = indirect_offset; @@ -2700,94 +2950,35 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld, /* We can only pack two 64-bit components in a single message, so send * 2 messages if we have more components */ - unsigned num_iterations = 1; - unsigned iter_components = num_components; unsigned first_component = nir_intrinsic_component(instr); - if (is_64bit) { - first_component = first_component / 2; - if (instr->num_components > 2) { - num_iterations = 2; - iter_components = 2; - } - } - mask = mask << first_component; - for (unsigned iter = 0; iter < num_iterations; iter++) { - if (!is_64bit && mask != WRITEMASK_XYZW) { - srcs[header_regs++] = brw_imm_ud(mask << 16); - opcode = indirect_offset.file != BAD_FILE ? - SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT : - SHADER_OPCODE_URB_WRITE_SIMD8_MASKED; - } else if (is_64bit && ((mask & WRITEMASK_XY) != WRITEMASK_XY)) { - /* Expand the 64-bit mask to 32-bit channels. We only handle - * two channels in each iteration, so we only care about X/Y. - */ - unsigned mask32 = 0; - if (mask & WRITEMASK_X) - mask32 |= WRITEMASK_XY; - if (mask & WRITEMASK_Y) - mask32 |= WRITEMASK_ZW; - - /* If the mask does not include any of the channels X or Y there - * is nothing to do in this iteration. Move on to the next couple - * of 64-bit channels. - */ - if (!mask32) { - mask >>= 2; - imm_offset++; - continue; - } - - srcs[header_regs++] = brw_imm_ud(mask32 << 16); - opcode = indirect_offset.file != BAD_FILE ? - SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT : - SHADER_OPCODE_URB_WRITE_SIMD8_MASKED; - } else { - opcode = indirect_offset.file != BAD_FILE ? - SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT : - SHADER_OPCODE_URB_WRITE_SIMD8; - } - - for (unsigned i = 0; i < iter_components; i++) { - if (!(mask & (1 << (i + first_component)))) - continue; - - if (!is_64bit) { - srcs[header_regs + i + first_component] = offset(value, bld, i); - } else { - /* We need to shuffle the 64-bit data to match the layout - * expected by our 32-bit URB write messages. We use a temporary - * for that. - */ - unsigned channel = iter * 2 + i; - fs_reg dest = shuffle_for_32bit_write(bld, value, channel, 1); + if (mask != WRITEMASK_XYZW) { + srcs[header_regs++] = brw_imm_ud(mask << 16); + opcode = indirect_offset.file != BAD_FILE ? + SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT : + SHADER_OPCODE_URB_WRITE_SIMD8_MASKED; + } else { + opcode = indirect_offset.file != BAD_FILE ? + SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT : + SHADER_OPCODE_URB_WRITE_SIMD8; + } - srcs[header_regs + (i + first_component) * 2] = dest; - srcs[header_regs + (i + first_component) * 2 + 1] = - offset(dest, bld, 1); - } - } + for (unsigned i = 0; i < num_components; i++) { + if (!(mask & (1 << (i + first_component)))) + continue; - unsigned mlen = - header_regs + (is_64bit ? 2 * iter_components : iter_components) + - (is_64bit ? 
2 * first_component : first_component); - fs_reg payload = - bld.vgrf(BRW_REGISTER_TYPE_UD, mlen); - bld.LOAD_PAYLOAD(payload, srcs, mlen, header_regs); + srcs[header_regs + i + first_component] = offset(value, bld, i); + } - fs_inst *inst = bld.emit(opcode, bld.null_reg_ud(), payload); - inst->offset = imm_offset; - inst->mlen = mlen; + unsigned mlen = header_regs + num_components + first_component; + fs_reg payload = + bld.vgrf(BRW_REGISTER_TYPE_UD, mlen); + bld.LOAD_PAYLOAD(payload, srcs, mlen, header_regs); - /* If this is a 64-bit attribute, select the next two 64-bit channels - * to be handled in the next iteration. - */ - if (is_64bit) { - mask >>= 2; - imm_offset++; - } - } + fs_inst *inst = bld.emit(opcode, bld.null_reg_ud(), payload); + inst->offset = imm_offset; + inst->mlen = mlen; break; } @@ -2821,35 +3012,27 @@ fs_visitor::nir_emit_tes_intrinsic(const fs_builder &bld, case nir_intrinsic_load_input: case nir_intrinsic_load_per_vertex_input: { + assert(nir_dest_bit_size(instr->dest) == 32); fs_reg indirect_offset = get_indirect_offset(instr); unsigned imm_offset = instr->const_index[0]; unsigned first_component = nir_intrinsic_component(instr); - if (type_sz(dest.type) == 8) { - first_component = first_component / 2; - } - fs_inst *inst; if (indirect_offset.file == BAD_FILE) { /* Arbitrarily only push up to 32 vec4 slots worth of data, * which is 16 registers (since each holds 2 vec4 slots). */ - unsigned slot_count = 1; - if (type_sz(dest.type) == 8 && instr->num_components > 2) - slot_count++; - const unsigned max_push_slots = 32; - if (imm_offset + slot_count <= max_push_slots) { + if (imm_offset < max_push_slots) { fs_reg src = fs_reg(ATTR, imm_offset / 2, dest.type); for (int i = 0; i < instr->num_components; i++) { - unsigned comp = 16 / type_sz(dest.type) * (imm_offset % 2) + - i + first_component; + unsigned comp = 4 * (imm_offset % 2) + i + first_component; bld.MOV(offset(dest, bld, i), component(src, comp)); } tes_prog_data->base.urb_read_length = MAX2(tes_prog_data->base.urb_read_length, - DIV_ROUND_UP(imm_offset + slot_count, 2)); + (imm_offset / 2) + 1); } else { /* Replicate the patch handle to all enabled channels */ const fs_reg srcs[] = { @@ -2884,65 +3067,32 @@ fs_visitor::nir_emit_tes_intrinsic(const fs_builder &bld, * we send two read messages in that case, each one loading up to * two double components. 
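+       * (64-bit inputs are now lowered before reaching here, so a single
+       * message per read is sufficient.)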
*/ - unsigned num_iterations = 1; unsigned num_components = instr->num_components; - fs_reg orig_dest = dest; - if (type_sz(dest.type) == 8) { - if (instr->num_components > 2) { - num_iterations = 2; - num_components = 2; - } - fs_reg tmp = fs_reg(VGRF, alloc.allocate(4), dest.type); - dest = tmp; - } - - for (unsigned iter = 0; iter < num_iterations; iter++) { - const fs_reg srcs[] = { - retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD), - indirect_offset - }; - fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2); - bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0); - - if (first_component != 0) { - unsigned read_components = - num_components + first_component; - fs_reg tmp = bld.vgrf(dest.type, read_components); - inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp, - payload); - for (unsigned i = 0; i < num_components; i++) { - bld.MOV(offset(dest, bld, i), - offset(tmp, bld, i + first_component)); - } - } else { - inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dest, - payload); - } - inst->mlen = 2; - inst->offset = imm_offset; - inst->size_written = (num_components + first_component) * - inst->dst.component_size(inst->exec_size); - - /* If we are reading 64-bit data using 32-bit read messages we need - * build proper 64-bit data elements by shuffling the low and high - * 32-bit components around like we do for other things like UBOs - * or SSBOs. - */ - if (type_sz(dest.type) == 8) { - shuffle_from_32bit_read(bld, - offset(orig_dest, bld, iter * 2), - retype(dest, BRW_REGISTER_TYPE_D), - 0, num_components); - } + const fs_reg srcs[] = { + retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD), + indirect_offset + }; + fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2); + bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0); - /* If we are loading double data and we need a second read message - * adjust the offset - */ - if (num_iterations > 1) { - num_components = instr->num_components - 2; - imm_offset++; + if (first_component != 0) { + unsigned read_components = + num_components + first_component; + fs_reg tmp = bld.vgrf(dest.type, read_components); + inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp, + payload); + for (unsigned i = 0; i < num_components; i++) { + bld.MOV(offset(dest, bld, i), + offset(tmp, bld, i + first_component)); } + } else { + inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dest, + payload); } + inst->mlen = 2; + inst->offset = imm_offset; + inst->size_written = (num_components + first_component) * + inst->dst.component_size(inst->exec_size); } break; } @@ -3069,7 +3219,7 @@ fs_visitor::emit_non_coherent_fb_read(const fs_builder &bld, const fs_reg &dst, const fs_reg sample = nir_system_values[SYSTEM_VALUE_SAMPLE_ID]; const fs_reg mcs = wm_key->multisample_fbo ? - emit_mcs_fetch(coords, 3, brw_imm_ud(surface)) : fs_reg(); + emit_mcs_fetch(coords, 3, brw_imm_ud(surface), fs_reg()) : fs_reg(); /* Use either a normal or a CMS texel fetch message depending on whether * the framebuffer is single or multisample. On SKL+ use the wide CMS @@ -3162,6 +3312,44 @@ alloc_frag_output(fs_visitor *v, unsigned location) unreachable("Invalid location"); } +/* Annoyingly, we get the barycentrics into the shader in a layout that's + * optimized for PLN but it doesn't work nearly as well as one would like for + * manual interpolation. 
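+ * For example, in SIMD16 the payload holds u0-7, v0-7, u8-15, v8-15 in
+ * four consecutive GRFs, while NIR expects a plain vec2 (all u components,
+ * then all v components); the helpers below convert between the two.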
+ */ +static void +shuffle_from_pln_layout(const fs_builder &bld, fs_reg dest, fs_reg pln_data) +{ + dest.type = BRW_REGISTER_TYPE_F; + pln_data.type = BRW_REGISTER_TYPE_F; + const fs_reg dest_u = offset(dest, bld, 0); + const fs_reg dest_v = offset(dest, bld, 1); + + for (unsigned g = 0; g < bld.dispatch_width() / 8; g++) { + const fs_builder gbld = bld.group(8, g); + gbld.MOV(horiz_offset(dest_u, g * 8), + byte_offset(pln_data, (g * 2 + 0) * REG_SIZE)); + gbld.MOV(horiz_offset(dest_v, g * 8), + byte_offset(pln_data, (g * 2 + 1) * REG_SIZE)); + } +} + +static void +shuffle_to_pln_layout(const fs_builder &bld, fs_reg pln_data, fs_reg src) +{ + pln_data.type = BRW_REGISTER_TYPE_F; + src.type = BRW_REGISTER_TYPE_F; + const fs_reg src_u = offset(src, bld, 0); + const fs_reg src_v = offset(src, bld, 1); + + for (unsigned g = 0; g < bld.dispatch_width() / 8; g++) { + const fs_builder gbld = bld.group(8, g); + gbld.MOV(byte_offset(pln_data, (g * 2 + 0) * REG_SIZE), + horiz_offset(src_u, g * 8)); + gbld.MOV(byte_offset(pln_data, (g * 2 + 1) * REG_SIZE), + horiz_offset(src_v, g * 8)); + } +} + void fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr) @@ -3192,6 +3380,23 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld, bld.MOV(dest, fetch_render_target_array_index(bld)); break; + case nir_intrinsic_is_helper_invocation: { + /* Unlike the regular gl_HelperInvocation, that is defined at dispatch, + * the helperInvocationEXT() (aka SpvOpIsHelperInvocationEXT) takes into + * consideration demoted invocations. That information is stored in + * f0.1. + */ + dest.type = BRW_REGISTER_TYPE_UD; + + bld.MOV(dest, brw_imm_ud(0)); + + fs_inst *mov = bld.MOV(dest, brw_imm_ud(~0)); + mov->predicate = BRW_PREDICATE_NORMAL; + mov->predicate_inverse = true; + mov->flag_subreg = 1; + break; + } + case nir_intrinsic_load_helper_invocation: case nir_intrinsic_load_sample_mask_in: case nir_intrinsic_load_sample_id: { @@ -3239,40 +3444,85 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld, break; } + case nir_intrinsic_demote: case nir_intrinsic_discard: + case nir_intrinsic_demote_if: case nir_intrinsic_discard_if: { /* We track our discarded pixels in f0.1. By predicating on it, we can * update just the flag bits that aren't yet discarded. If there's no * condition, we emit a CMP of g0 != g0, so all currently executing * channels will get turned off. */ - fs_inst *cmp; - if (instr->intrinsic == nir_intrinsic_discard_if) { - cmp = bld.CMP(bld.null_reg_f(), get_nir_src(instr->src[0]), - brw_imm_d(0), BRW_CONDITIONAL_Z); + fs_inst *cmp = NULL; + if (instr->intrinsic == nir_intrinsic_demote_if || + instr->intrinsic == nir_intrinsic_discard_if) { + nir_alu_instr *alu = nir_src_as_alu_instr(instr->src[0]); + + if (alu != NULL && + alu->op != nir_op_bcsel && + alu->op != nir_op_inot) { + /* Re-emit the instruction that generated the Boolean value, but + * do not store it. Since this instruction will be conditional, + * other instructions that want to use the real Boolean value may + * get garbage. This was a problem for piglit's fs-discard-exit-2 + * test. + * + * Ideally we'd detect that the instruction cannot have a + * conditional modifier before emitting the instructions. Alas, + * that is nigh impossible. Instead, we're going to assume the + * instruction (or last instruction) generated can have a + * conditional modifier. If it cannot, fallback to the old-style + * compare, and hope dead code elimination will clean up the + * extra instructions generated. 
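+          * For example, a hypothetical demote_if(a < b) re-emits the CMP.l
+          * and negates its modifier to CMP.ge, which is equivalent to the
+          * old bool_result == false comparison.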
+ */ + nir_emit_alu(bld, alu, false); + + cmp = (fs_inst *) instructions.get_tail(); + if (cmp->conditional_mod == BRW_CONDITIONAL_NONE) { + if (cmp->can_do_cmod()) + cmp->conditional_mod = BRW_CONDITIONAL_Z; + else + cmp = NULL; + } else { + /* The old sequence that would have been generated is, + * basically, bool_result == false. This is equivalent to + * !bool_result, so negate the old modifier. + */ + cmp->conditional_mod = brw_negate_cmod(cmp->conditional_mod); + } + } + + if (cmp == NULL) { + cmp = bld.CMP(bld.null_reg_f(), get_nir_src(instr->src[0]), + brw_imm_d(0), BRW_CONDITIONAL_Z); + } } else { fs_reg some_reg = fs_reg(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW)); cmp = bld.CMP(bld.null_reg_f(), some_reg, some_reg, BRW_CONDITIONAL_NZ); } + cmp->predicate = BRW_PREDICATE_NORMAL; cmp->flag_subreg = 1; if (devinfo->gen >= 6) { + /* Due to the way we implement discard, the jump will only happen + * when the whole quad is discarded. So we can do this even for + * demote as it won't break its uniformity promises. + */ emit_discard_jump(); } - limit_dispatch_width(16, "Fragment discard not implemented in SIMD32 mode."); + limit_dispatch_width(16, "Fragment discard/demote not implemented in SIMD32 mode."); break; } case nir_intrinsic_load_input: { /* load_input is only used for flat inputs */ + assert(nir_dest_bit_size(instr->dest) == 32); unsigned base = nir_intrinsic_base(instr); unsigned comp = nir_intrinsic_component(instr); unsigned num_components = instr->num_components; - fs_reg orig_dest = dest; - enum brw_reg_type type = dest.type; /* Special case fields in the VUE header */ if (base == VARYING_SLOT_LAYER) @@ -3280,44 +3530,49 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld, else if (base == VARYING_SLOT_VIEWPORT) comp = 2; - if (nir_dest_bit_size(instr->dest) == 64) { - /* const_index is in 32-bit type size units that could not be aligned - * with DF. We need to read the double vector as if it was a float - * vector of twice the number of components to fetch the right data. - */ - type = BRW_REGISTER_TYPE_F; - num_components *= 2; - dest = bld.vgrf(type, num_components); - } - for (unsigned int i = 0; i < num_components; i++) { - bld.MOV(offset(retype(dest, type), bld, i), - retype(component(interp_reg(base, comp + i), 3), type)); + bld.MOV(offset(dest, bld, i), + retype(component(interp_reg(base, comp + i), 3), dest.type)); } + break; + } - if (nir_dest_bit_size(instr->dest) == 64) { - shuffle_from_32bit_read(bld, orig_dest, dest, 0, - instr->num_components); - } + case nir_intrinsic_load_fs_input_interp_deltas: { + assert(stage == MESA_SHADER_FRAGMENT); + assert(nir_src_as_uint(instr->src[0]) == 0); + fs_reg interp = interp_reg(nir_intrinsic_base(instr), + nir_intrinsic_component(instr)); + dest.type = BRW_REGISTER_TYPE_F; + bld.MOV(offset(dest, bld, 0), component(interp, 3)); + bld.MOV(offset(dest, bld, 1), component(interp, 1)); + bld.MOV(offset(dest, bld, 2), component(interp, 0)); break; } case nir_intrinsic_load_barycentric_pixel: case nir_intrinsic_load_barycentric_centroid: - case nir_intrinsic_load_barycentric_sample: - /* Do nothing - load_interpolated_input handling will handle it later. 

   case nir_intrinsic_load_barycentric_pixel:
   case nir_intrinsic_load_barycentric_centroid:
-   case nir_intrinsic_load_barycentric_sample:
-      /* Do nothing - load_interpolated_input handling will handle it later. */
+   case nir_intrinsic_load_barycentric_sample: {
+      /* Use the delta_xy values computed from the payload */
+      const glsl_interp_mode interp_mode =
+         (enum glsl_interp_mode) nir_intrinsic_interp_mode(instr);
+      enum brw_barycentric_mode bary =
+         brw_barycentric_mode(interp_mode, instr->intrinsic);
+
+      shuffle_from_pln_layout(bld, dest, this->delta_xy[bary]);
      break;
+   }

   case nir_intrinsic_load_barycentric_at_sample: {
      const glsl_interp_mode interpolation =
         (enum glsl_interp_mode) nir_intrinsic_interp_mode(instr);

+      fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, 2);
      if (nir_src_is_const(instr->src[0])) {
         unsigned msg_data = nir_src_as_uint(instr->src[0]) << 4;
         emit_pixel_interpolater_send(bld,
                                      FS_OPCODE_INTERPOLATE_AT_SAMPLE,
-                                      dest,
+                                      tmp,
                                      fs_reg(), /* src */
                                      brw_imm_ud(msg_data),
                                      interpolation);
@@ -3332,7 +3587,7 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld,
            .SHL(msg_data, sample_id, brw_imm_ud(4u));
         emit_pixel_interpolater_send(bld,
                                      FS_OPCODE_INTERPOLATE_AT_SAMPLE,
-                                      dest,
+                                      tmp,
                                      fs_reg(), /* src */
                                      msg_data,
                                      interpolation);
@@ -3360,7 +3615,7 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld,
            fs_inst *inst =
               emit_pixel_interpolater_send(bld,
                                            FS_OPCODE_INTERPOLATE_AT_SAMPLE,
-                                            dest,
+                                            tmp,
                                            fs_reg(), /* src */
                                            component(msg_data, 0),
                                            interpolation);
@@ -3372,6 +3627,7 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld,
                  bld.emit(BRW_OPCODE_WHILE));
         }
      }
+      shuffle_from_pln_layout(bld, dest, tmp);
      break;
   }

@@ -3381,6 +3637,7 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld,

      nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
+      fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, 2);
      if (const_offset) {
         assert(nir_src_bit_size(instr->src[0]) == 32);
         unsigned off_x = MIN2((int)(const_offset[0].f32 * 16), 7) & 0xf;
@@ -3388,7 +3645,7 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld,

         emit_pixel_interpolater_send(bld,
                                      FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET,
-                                      dest,
+                                      tmp,
                                      fs_reg(), /* src */
                                      brw_imm_ud(off_x | (off_y << 4)),
                                      interpolation);
@@ -3425,20 +3682,20 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld,
         const enum opcode opcode = FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET;
         emit_pixel_interpolater_send(bld,
                                      opcode,
-                                      dest,
+                                      tmp,
                                      src,
                                      brw_imm_ud(0u),
                                      interpolation);
      }
+      shuffle_from_pln_layout(bld, dest, tmp);
      break;
   }
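The constant-offset path above packs each offset into a signed 4-bit fixed-point field in 1/16-pixel units (the off_y computation falls inside the elided hunk, but presumably mirrors off_x). A standalone check of that encoding:

#include <algorithm>
#include <cstdio>

static unsigned encode_offset(float f)
{
   int v = std::min((int)(f * 16), 7);  /* MIN2((int)(f * 16), 7) */
   return v & 0xf;                      /* keep the two's-complement nibble */
}

int main()
{
   printf("0.4375 -> 0x%x\n", encode_offset(0.4375f)); /* 0x7, i.e. +7/16 */
   printf("0.5    -> 0x%x\n", encode_offset(0.5f));    /* clamped to 0x7  */
   printf("-0.5   -> 0x%x\n", encode_offset(-0.5f));   /* 0x8, i.e. -8/16 */
   return 0;
}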

-   case nir_intrinsic_load_interpolated_input: {
-      if (nir_intrinsic_base(instr) == VARYING_SLOT_POS) {
-         emit_fragcoord_interpolation(dest);
-         break;
-      }
+   case nir_intrinsic_load_frag_coord:
+      emit_fragcoord_interpolation(dest);
+      break;
+
+   case nir_intrinsic_load_interpolated_input: {
      assert(instr->src[0].ssa &&
             instr->src[0].ssa->parent_instr->type == nir_instr_type_intrinsic);
      nir_intrinsic_instr *bary_intrinsic =
@@ -3450,8 +3707,13 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld,

      if (bary_intrin == nir_intrinsic_load_barycentric_at_offset ||
          bary_intrin == nir_intrinsic_load_barycentric_at_sample) {
-         /* Use the result of the PI message */
-         dst_xy = retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_F);
+         /* Use the result of the PI message.  Because the load_barycentric
+          * intrinsics return a regular vec2 and we need it in PLN layout, we
+          * have to do a translation.  Fortunately, copy-prop cleans this up
+          * reliably.
+          */
+         dst_xy = bld.vgrf(BRW_REGISTER_TYPE_F, 2);
+         shuffle_to_pln_layout(bld, dst_xy, get_nir_src(instr->src[0]));
      } else {
         /* Use the delta_xy values computed from the payload */
         enum brw_barycentric_mode bary =
@@ -3484,20 +3746,6 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld,
   }
 }

-static int
-get_op_for_atomic_add(nir_intrinsic_instr *instr, unsigned src)
-{
-   if (nir_src_is_const(instr->src[src])) {
-      int64_t add_val = nir_src_as_int(instr->src[src]);
-      if (add_val == 1)
-         return BRW_AOP_INC;
-      else if (add_val == -1)
-         return BRW_AOP_DEC;
-   }
-
-   return BRW_AOP_ADD;
-}
-
 void
 fs_visitor::nir_emit_cs_intrinsic(const fs_builder &bld,
                                   nir_intrinsic_instr *instr)
@@ -3551,43 +3799,21 @@ fs_visitor::nir_emit_cs_intrinsic(const fs_builder &bld,
   }

   case nir_intrinsic_shared_atomic_add:
-      nir_emit_shared_atomic(bld, get_op_for_atomic_add(instr, 1), instr);
-      break;
   case nir_intrinsic_shared_atomic_imin:
-      nir_emit_shared_atomic(bld, BRW_AOP_IMIN, instr);
-      break;
   case nir_intrinsic_shared_atomic_umin:
-      nir_emit_shared_atomic(bld, BRW_AOP_UMIN, instr);
-      break;
   case nir_intrinsic_shared_atomic_imax:
-      nir_emit_shared_atomic(bld, BRW_AOP_IMAX, instr);
-      break;
   case nir_intrinsic_shared_atomic_umax:
-      nir_emit_shared_atomic(bld, BRW_AOP_UMAX, instr);
-      break;
   case nir_intrinsic_shared_atomic_and:
-      nir_emit_shared_atomic(bld, BRW_AOP_AND, instr);
-      break;
   case nir_intrinsic_shared_atomic_or:
-      nir_emit_shared_atomic(bld, BRW_AOP_OR, instr);
-      break;
   case nir_intrinsic_shared_atomic_xor:
-      nir_emit_shared_atomic(bld, BRW_AOP_XOR, instr);
-      break;
   case nir_intrinsic_shared_atomic_exchange:
-      nir_emit_shared_atomic(bld, BRW_AOP_MOV, instr);
-      break;
   case nir_intrinsic_shared_atomic_comp_swap:
-      nir_emit_shared_atomic(bld, BRW_AOP_CMPWR, instr);
+      nir_emit_shared_atomic(bld, brw_aop_for_nir_intrinsic(instr), instr);
      break;
   case nir_intrinsic_shared_atomic_fmin:
-      nir_emit_shared_atomic_float(bld, BRW_AOP_FMIN, instr);
-      break;
   case nir_intrinsic_shared_atomic_fmax:
-      nir_emit_shared_atomic_float(bld, BRW_AOP_FMAX, instr);
-      break;
   case nir_intrinsic_shared_atomic_fcomp_swap:
-      nir_emit_shared_atomic_float(bld, BRW_AOP_FCMPWR, instr);
+      nir_emit_shared_atomic_float(bld, brw_aop_for_nir_intrinsic(instr), instr);
      break;

   case nir_intrinsic_load_shared: {
@@ -3619,7 +3845,7 @@ fs_visitor::nir_emit_cs_intrinsic(const fs_builder &bld,
         fs_reg read_result = bld.vgrf(BRW_REGISTER_TYPE_UD);
         bld.emit(SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL,
                  read_result, srcs, SURFACE_LOGICAL_NUM_SRCS);
-         bld.MOV(dest, read_result);
+         bld.MOV(dest, subscript(read_result, dest.type, 0));
      }
      break;
   }
@@ -3672,8 +3898,14 @@ brw_nir_reduction_op_identity(const fs_builder &bld,
 {
   nir_const_value value = nir_alu_binop_identity(op, type_sz(type) * 8);
   switch (type_sz(type)) {
+   case 1:
+      if (type == BRW_REGISTER_TYPE_UB) {
+         return brw_imm_uw(value.u8);
+      } else {
+         assert(type == BRW_REGISTER_TYPE_B);
+         return brw_imm_w(value.i8);
+      }
   case 2:
-      assert(type != BRW_REGISTER_TYPE_HF);
      return retype(brw_imm_uw(value.u16), type);
   case 4:
      return retype(brw_imm_ud(value.u32), type);
@@ -3802,78 +4034,84 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
   case nir_intrinsic_image_load:
   case nir_intrinsic_image_store:
   case nir_intrinsic_image_atomic_add:
-   case nir_intrinsic_image_atomic_min:
-   case nir_intrinsic_image_atomic_max:
+   case nir_intrinsic_image_atomic_imin:
+   case nir_intrinsic_image_atomic_umin:
+   case nir_intrinsic_image_atomic_imax:
+   case nir_intrinsic_image_atomic_umax:
   case nir_intrinsic_image_atomic_and:
   case nir_intrinsic_image_atomic_or:
   case nir_intrinsic_image_atomic_xor:
   case nir_intrinsic_image_atomic_exchange:
-   case nir_intrinsic_image_atomic_comp_swap: {
+   case nir_intrinsic_image_atomic_comp_swap:
+   case nir_intrinsic_bindless_image_load:
+   case nir_intrinsic_bindless_image_store:
+   case nir_intrinsic_bindless_image_atomic_add:
+   case nir_intrinsic_bindless_image_atomic_imin:
+   case nir_intrinsic_bindless_image_atomic_umin:
+   case nir_intrinsic_bindless_image_atomic_imax:
+   case nir_intrinsic_bindless_image_atomic_umax:
+   case nir_intrinsic_bindless_image_atomic_and:
+   case nir_intrinsic_bindless_image_atomic_or:
+   case nir_intrinsic_bindless_image_atomic_xor:
+   case nir_intrinsic_bindless_image_atomic_exchange:
+   case nir_intrinsic_bindless_image_atomic_comp_swap: {
      if (stage == MESA_SHADER_FRAGMENT &&
          instr->intrinsic != nir_intrinsic_image_load)
         brw_wm_prog_data(prog_data)->has_side_effects = true;

      /* Get some metadata from the image intrinsic. */
      const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic];
-      const GLenum format = nir_intrinsic_format(instr);

      fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
-      srcs[SURFACE_LOGICAL_SRC_SURFACE] =
-         get_nir_image_intrinsic_image(bld, instr);
+
+      switch (instr->intrinsic) {
+      case nir_intrinsic_image_load:
+      case nir_intrinsic_image_store:
+      case nir_intrinsic_image_atomic_add:
+      case nir_intrinsic_image_atomic_imin:
+      case nir_intrinsic_image_atomic_umin:
+      case nir_intrinsic_image_atomic_imax:
+      case nir_intrinsic_image_atomic_umax:
+      case nir_intrinsic_image_atomic_and:
+      case nir_intrinsic_image_atomic_or:
+      case nir_intrinsic_image_atomic_xor:
+      case nir_intrinsic_image_atomic_exchange:
+      case nir_intrinsic_image_atomic_comp_swap:
+         srcs[SURFACE_LOGICAL_SRC_SURFACE] =
+            get_nir_image_intrinsic_image(bld, instr);
+         break;
+
+      default:
+         /* Bindless */
+         srcs[SURFACE_LOGICAL_SRC_SURFACE_HANDLE] =
+            bld.emit_uniformize(get_nir_src(instr->src[0]));
+         break;
+      }
+
      srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]);
      srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] =
         brw_imm_ud(image_intrinsic_coord_components(instr));

      /* Emit an image load, store or atomic op. */
-      if (instr->intrinsic == nir_intrinsic_image_load) {
+      if (instr->intrinsic == nir_intrinsic_image_load ||
+          instr->intrinsic == nir_intrinsic_bindless_image_load) {
         srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components);
         fs_inst *inst =
            bld.emit(SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL,
                     dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
         inst->size_written = instr->num_components * dispatch_width * 4;
-      } else if (instr->intrinsic == nir_intrinsic_image_store) {
+      } else if (instr->intrinsic == nir_intrinsic_image_store ||
+                 instr->intrinsic == nir_intrinsic_bindless_image_store) {
         srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components);
         srcs[SURFACE_LOGICAL_SRC_DATA] = get_nir_src(instr->src[3]);
         bld.emit(SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL,
                  fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
      } else {
-         int op;
         unsigned num_srcs = info->num_srcs;
-
-         switch (instr->intrinsic) {
-         case nir_intrinsic_image_atomic_add:
+         int op = brw_aop_for_nir_intrinsic(instr);
+         if (op == BRW_AOP_INC || op == BRW_AOP_DEC) {
            assert(num_srcs == 4);
-
-            op = get_op_for_atomic_add(instr, 3);
-
-            if (op != BRW_AOP_ADD)
-               num_srcs = 3;
-            break;
-         case nir_intrinsic_image_atomic_min:
-            assert(format == GL_R32UI || format == GL_R32I);
-            op = (format == GL_R32I) ? BRW_AOP_IMIN : BRW_AOP_UMIN;
-            break;
-         case nir_intrinsic_image_atomic_max:
-            assert(format == GL_R32UI || format == GL_R32I);
-            op = (format == GL_R32I) ? BRW_AOP_IMAX : BRW_AOP_UMAX;
-            break;
-         case nir_intrinsic_image_atomic_and:
-            op = BRW_AOP_AND;
-            break;
-         case nir_intrinsic_image_atomic_or:
-            op = BRW_AOP_OR;
-            break;
-         case nir_intrinsic_image_atomic_xor:
-            op = BRW_AOP_XOR;
-            break;
-         case nir_intrinsic_image_atomic_exchange:
-            op = BRW_AOP_MOV;
-            break;
-         case nir_intrinsic_image_atomic_comp_swap:
-            op = BRW_AOP_CMPWR;
-            break;
-         default:
-            unreachable("Not reachable.");
+            num_srcs = 3;
         }

         srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(op);
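The body of brw_aop_for_nir_intrinsic() is not part of this diff (it arrives via the new brw_eu.h include), so the following is only a guess at its shape: a single mapping from NIR atomic intrinsics to BRW_AOP_* codes that also absorbs the deleted get_op_for_atomic_add() special case, which would be consistent with the INC/DEC check above. All names here are stand-ins:

#include <cstdint>
#include <cstdio>

enum aop { AOP_ADD, AOP_INC, AOP_DEC, AOP_IMIN, AOP_UMIN, AOP_MOV, AOP_CMPWR };
enum atomic_op { OP_ADD, OP_IMIN, OP_UMIN, OP_XCHG, OP_CSWAP };

static aop aop_for(atomic_op op, bool data_is_const, int64_t data)
{
   switch (op) {
   case OP_ADD:
      /* Adding a constant +/-1 becomes INC/DEC, which take no data source;
       * that is why the image path above drops num_srcs from 4 to 3.
       */
      if (data_is_const && data == 1)  return AOP_INC;
      if (data_is_const && data == -1) return AOP_DEC;
      return AOP_ADD;
   case OP_IMIN:  return AOP_IMIN;
   case OP_UMIN:  return AOP_UMIN;
   case OP_XCHG:  return AOP_MOV;    /* exchange is an unconditional move */
   case OP_CSWAP: return AOP_CMPWR;  /* compare-and-write */
   }
   return AOP_ADD;
}

int main()
{
   printf("%d\n", aop_for(OP_ADD, true, -1) == AOP_DEC); /* prints 1 */
   return 0;
}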
@@ -3895,16 +4133,22 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
      break;
   }

-   case nir_intrinsic_image_size: {
+   case nir_intrinsic_image_size:
+   case nir_intrinsic_bindless_image_size: {
      /* Unlike the [un]typed load and store opcodes, the TXS that this turns
       * into will handle the binding table index for us in the generator.
+       * Incidentally, this means that we can handle bindless with exactly the
+       * same code.
       */
      fs_reg image = retype(get_nir_src_imm(instr->src[0]),
                            BRW_REGISTER_TYPE_UD);
      image = bld.emit_uniformize(image);

      fs_reg srcs[TEX_LOGICAL_NUM_SRCS];
-      srcs[TEX_LOGICAL_SRC_SURFACE] = image;
+      if (instr->intrinsic == nir_intrinsic_image_size)
+         srcs[TEX_LOGICAL_SRC_SURFACE] = image;
+      else
+         srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE] = image;
      srcs[TEX_LOGICAL_SRC_SAMPLER] = brw_imm_d(0);
      srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = brw_imm_d(0);
      srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = brw_imm_d(0);
@@ -3970,10 +4214,47 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
   case nir_intrinsic_memory_barrier_buffer:
   case nir_intrinsic_memory_barrier_image:
   case nir_intrinsic_memory_barrier: {
+      bool l3_fence, slm_fence;
+      if (devinfo->gen >= 11) {
+         l3_fence = instr->intrinsic != nir_intrinsic_memory_barrier_shared;
+         slm_fence = instr->intrinsic == nir_intrinsic_group_memory_barrier ||
+                     instr->intrinsic == nir_intrinsic_memory_barrier ||
+                     instr->intrinsic == nir_intrinsic_memory_barrier_shared;
+      } else {
+         /* Prior to gen11, we only have one kind of fence. */
+         l3_fence = true;
+         slm_fence = false;
+      }
+
+      /* Be conservative in Gen11+ and always stall in a fence, since there
+       * are two different fences and the shader might want to synchronize
+       * between them.
+       *
+       * TODO: Improve NIR so that scope and visibility information for the
+       * barriers is available here to make a better decision.
+       *
+       * TODO: When emitting more than one fence, it might help emit all
+       * the fences first and then generate the stall moves.
+       */
+      const bool stall = devinfo->gen >= 11;
+
      const fs_builder ubld = bld.group(8, 0);
      const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
-      ubld.emit(SHADER_OPCODE_MEMORY_FENCE, tmp)
-         ->size_written = 2 * REG_SIZE;
+
+      if (l3_fence) {
+         ubld.emit(SHADER_OPCODE_MEMORY_FENCE, tmp,
+                   brw_vec8_grf(0, 0), brw_imm_ud(stall),
+                   /* bti */ brw_imm_ud(0))
+            ->size_written = 2 * REG_SIZE;
+      }
+
+      if (slm_fence) {
+         ubld.emit(SHADER_OPCODE_MEMORY_FENCE, tmp,
+                   brw_vec8_grf(0, 0), brw_imm_ud(stall),
+                   brw_imm_ud(GEN7_BTI_SLM))
+            ->size_written = 2 * REG_SIZE;
+      }
+
      break;
   }

@@ -4079,6 +4360,8 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
         for (int i = 0; i < instr->num_components; i++)
            VARYING_PULL_CONSTANT_LOAD(bld, offset(dest, bld, i),
                                       surf_index, base_offset, i * type_sz(dest.type));
+
+         prog_data->has_ubo_pull = true;
      } else {
         /* Even if we are loading doubles, a pull constant load will load
          * a 32-bit vec4, so should only reserve vgrf space for that. If we
@@ -4118,6 +4401,8 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
         }
      }

+      prog_data->has_ubo_pull = true;
+
      const unsigned block_sz = 64; /* Fetch one cacheline at a time. */
      const fs_builder ubld = bld.exec_all().group(block_sz / 4, 0);
      const fs_reg packed_consts = ubld.vgrf(BRW_REGISTER_TYPE_UD);
@@ -4161,15 +4446,13 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
         const unsigned bit_size = nir_dest_bit_size(instr->dest);
         assert(bit_size <= 32);
         assert(nir_dest_num_components(instr->dest) == 1);
-         brw_reg_type data_type =
-            brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD);
         fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD);
         bld.emit(SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL,
                  tmp,
                  get_nir_src(instr->src[0]), /* Address */
                  fs_reg(), /* No source data */
                  brw_imm_ud(bit_size));
-         bld.MOV(retype(dest, data_type), tmp);
+         bld.MOV(dest, subscript(tmp, dest.type, 0));
      }
      break;
   }

@@ -4204,43 +4487,21 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
      break;

   case nir_intrinsic_global_atomic_add:
-      nir_emit_global_atomic(bld, get_op_for_atomic_add(instr, 1), instr);
-      break;
   case nir_intrinsic_global_atomic_imin:
-      nir_emit_global_atomic(bld, BRW_AOP_IMIN, instr);
-      break;
   case nir_intrinsic_global_atomic_umin:
-      nir_emit_global_atomic(bld, BRW_AOP_UMIN, instr);
-      break;
   case nir_intrinsic_global_atomic_imax:
-      nir_emit_global_atomic(bld, BRW_AOP_IMAX, instr);
-      break;
   case nir_intrinsic_global_atomic_umax:
-      nir_emit_global_atomic(bld, BRW_AOP_UMAX, instr);
-      break;
   case nir_intrinsic_global_atomic_and:
-      nir_emit_global_atomic(bld, BRW_AOP_AND, instr);
-      break;
   case nir_intrinsic_global_atomic_or:
-      nir_emit_global_atomic(bld, BRW_AOP_OR, instr);
-      break;
   case nir_intrinsic_global_atomic_xor:
-      nir_emit_global_atomic(bld, BRW_AOP_XOR, instr);
-      break;
   case nir_intrinsic_global_atomic_exchange:
-      nir_emit_global_atomic(bld, BRW_AOP_MOV, instr);
-      break;
   case nir_intrinsic_global_atomic_comp_swap:
-      nir_emit_global_atomic(bld, BRW_AOP_CMPWR, instr);
+      nir_emit_global_atomic(bld, brw_aop_for_nir_intrinsic(instr), instr);
      break;
   case nir_intrinsic_global_atomic_fmin:
-      nir_emit_global_atomic_float(bld, BRW_AOP_FMIN, instr);
-      break;
   case nir_intrinsic_global_atomic_fmax:
-      nir_emit_global_atomic_float(bld, BRW_AOP_FMAX, instr);
-      break;
   case nir_intrinsic_global_atomic_fcomp_swap:
-      nir_emit_global_atomic_float(bld, BRW_AOP_FCMPWR, instr);
+      nir_emit_global_atomic_float(bld, brw_aop_for_nir_intrinsic(instr), instr);
      break;
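The memory-barrier handling above boils down to a small decision table: on gen11+ the L3 and SLM fences are distinct, shared-memory barriers need the SLM fence, the catch-all barriers need both, and everything else only needs L3. A standalone model of just that selection:

#include <cstdio>

enum barrier_kind { BAR_SHARED, BAR_BUFFER, BAR_IMAGE, BAR_GROUP, BAR_ALL };

struct fences { bool l3, slm; };

static fences pick_fences(barrier_kind b, int gen)
{
   if (gen < 11)
      return { true, false };  /* prior to gen11 there is one fence kind */
   return {
      /* l3 */  b != BAR_SHARED,
      /* slm */ b == BAR_SHARED || b == BAR_GROUP || b == BAR_ALL,
   };
}

int main()
{
   fences f = pick_fences(BAR_ALL, 11);
   printf("l3=%d slm=%d\n", f.l3, f.slm); /* l3=1 slm=1: emits two fences */
   return 0;
}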

   case nir_intrinsic_load_ssbo: {
@@ -4272,7 +4533,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
         fs_reg read_result = bld.vgrf(BRW_REGISTER_TYPE_UD);
         bld.emit(SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL,
                  read_result, srcs, SURFACE_LOGICAL_NUM_SRCS);
-         bld.MOV(dest, read_result);
+         bld.MOV(dest, subscript(read_result, dest.type, 0));
      }
      break;
   }
@@ -4317,15 +4578,12 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
   }

   case nir_intrinsic_store_output: {
+      assert(nir_src_bit_size(instr->src[0]) == 32);
      fs_reg src = get_nir_src(instr->src[0]);

      unsigned store_offset = nir_src_as_uint(instr->src[1]);
      unsigned num_components = instr->num_components;
      unsigned first_component = nir_intrinsic_component(instr);
-      if (nir_src_bit_size(instr->src[0]) == 64) {
-         src = shuffle_for_32bit_write(bld, src, 0, num_components);
-         num_components *= 2;
-      }

      fs_reg new_dest = retype(offset(outputs[instr->const_index[0]], bld,
                                      4 * store_offset), src.type);
@@ -4337,43 +4595,21 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
   }

   case nir_intrinsic_ssbo_atomic_add:
-      nir_emit_ssbo_atomic(bld, get_op_for_atomic_add(instr, 2), instr);
-      break;
   case nir_intrinsic_ssbo_atomic_imin:
-      nir_emit_ssbo_atomic(bld, BRW_AOP_IMIN, instr);
-      break;
   case nir_intrinsic_ssbo_atomic_umin:
-      nir_emit_ssbo_atomic(bld, BRW_AOP_UMIN, instr);
-      break;
   case nir_intrinsic_ssbo_atomic_imax:
-      nir_emit_ssbo_atomic(bld, BRW_AOP_IMAX, instr);
-      break;
   case nir_intrinsic_ssbo_atomic_umax:
-      nir_emit_ssbo_atomic(bld, BRW_AOP_UMAX, instr);
-      break;
   case nir_intrinsic_ssbo_atomic_and:
-      nir_emit_ssbo_atomic(bld, BRW_AOP_AND, instr);
-      break;
   case nir_intrinsic_ssbo_atomic_or:
-      nir_emit_ssbo_atomic(bld, BRW_AOP_OR, instr);
-      break;
   case nir_intrinsic_ssbo_atomic_xor:
-      nir_emit_ssbo_atomic(bld, BRW_AOP_XOR, instr);
-      break;
   case nir_intrinsic_ssbo_atomic_exchange:
-      nir_emit_ssbo_atomic(bld, BRW_AOP_MOV, instr);
-      break;
   case nir_intrinsic_ssbo_atomic_comp_swap:
-      nir_emit_ssbo_atomic(bld, BRW_AOP_CMPWR, instr);
+      nir_emit_ssbo_atomic(bld, brw_aop_for_nir_intrinsic(instr), instr);
      break;
   case nir_intrinsic_ssbo_atomic_fmin:
-      nir_emit_ssbo_atomic_float(bld, BRW_AOP_FMIN, instr);
-      break;
   case nir_intrinsic_ssbo_atomic_fmax:
-      nir_emit_ssbo_atomic_float(bld, BRW_AOP_FMAX, instr);
-      break;
   case nir_intrinsic_ssbo_atomic_fcomp_swap:
-      nir_emit_ssbo_atomic_float(bld, BRW_AOP_FCMPWR, instr);
+      nir_emit_ssbo_atomic_float(bld, brw_aop_for_nir_intrinsic(instr), instr);
      break;

   case nir_intrinsic_get_buffer_size: {
@@ -4435,6 +4671,14 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
      break;
   }

+   case nir_intrinsic_load_subgroup_size:
+      /* This should only happen for fragment shaders because every other case
+       * is lowered in NIR so we can optimize on it.
+       */
+      assert(stage == MESA_SHADER_FRAGMENT);
+      bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), brw_imm_d(dispatch_width));
+      break;
+
   case nir_intrinsic_load_subgroup_invocation:
      bld.MOV(retype(dest, BRW_REGISTER_TYPE_D),
              nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION]);
@@ -4516,7 +4760,8 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
      fs_reg value = get_nir_src(instr->src[0]);
      if (instr->intrinsic == nir_intrinsic_vote_feq) {
         const unsigned bit_size = nir_src_bit_size(instr->src[0]);
-         value.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_F);
+         value.type = bit_size == 8 ? BRW_REGISTER_TYPE_B :
+                      brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_F);
      }

      fs_reg uniformized = bld.emit_uniformize(value);
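Both byte-scattered read paths (the shared-memory one earlier and the SSBO one above) now move subscript(read_result, dest.type, 0) instead of the whole register: the message returns the loaded value in the low bits of a 32-bit temporary, so only the low bit_size bits are meaningful. A standalone model of what subscript(..., 0) extracts:

#include <cstdint>
#include <cstdio>

int main()
{
   uint32_t read_result = 0xdeadbeef;  /* high bytes are undefined garbage */

   /* subscript(reg, type, 0) views the register as packed elements of the
    * given type and picks element 0, i.e. the low bits.
    */
   uint8_t  dest_b = (uint8_t)(read_result & 0xff);     /* 8-bit load */
   uint16_t dest_w = (uint16_t)(read_result & 0xffff);  /* 16-bit load */

   printf("byte=0x%02x word=0x%04x\n", dest_b, dest_w); /* 0xef 0xbeef */
   return 0;
}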
@@ -4623,16 +4868,29 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
   case nir_intrinsic_quad_swap_horizontal: {
      const fs_reg value = get_nir_src(instr->src[0]);
      const fs_reg tmp = bld.vgrf(value.type);
-      const fs_builder ubld = bld.exec_all().group(dispatch_width / 2, 0);
+      if (devinfo->gen <= 7) {
+         /* The hardware doesn't seem to support these crazy regions with
+          * compressed instructions on gen7 and earlier so we fall back to
+          * using quad swizzles.  Fortunately, we don't support 64-bit
+          * anything in Vulkan on gen7.
+          */
+         assert(nir_src_bit_size(instr->src[0]) == 32);
+         const fs_builder ubld = bld.exec_all();
+         ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp, value,
+                   brw_imm_ud(BRW_SWIZZLE4(1,0,3,2)));
+         bld.MOV(retype(dest, value.type), tmp);
+      } else {
+         const fs_builder ubld = bld.exec_all().group(dispatch_width / 2, 0);

-      const fs_reg src_left = horiz_stride(value, 2);
-      const fs_reg src_right = horiz_stride(horiz_offset(value, 1), 2);
-      const fs_reg tmp_left = horiz_stride(tmp, 2);
-      const fs_reg tmp_right = horiz_stride(horiz_offset(tmp, 1), 2);
+         const fs_reg src_left = horiz_stride(value, 2);
+         const fs_reg src_right = horiz_stride(horiz_offset(value, 1), 2);
+         const fs_reg tmp_left = horiz_stride(tmp, 2);
+         const fs_reg tmp_right = horiz_stride(horiz_offset(tmp, 1), 2);

-      ubld.MOV(tmp_left, src_right);
-      ubld.MOV(tmp_right, src_left);
+         ubld.MOV(tmp_left, src_right);
+         ubld.MOV(tmp_right, src_left);
+      }
      bld.MOV(retype(dest, value.type), tmp);
      break;
   }
@@ -4695,10 +4953,28 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
      opcode brw_op = brw_op_for_nir_reduction_op(redop);
      brw_conditional_mod cond_mod = brw_cond_mod_for_nir_reduction_op(redop);

+      /* There are a couple of register region issues that make things
+       * complicated for 8-bit types:
+       *
+       *    1. Only raw moves are allowed to write to a packed 8-bit
+       *       destination.
+       *    2. If we use a strided destination, the efficient way to do scan
+       *       operations ends up using strides that are too big to encode in
+       *       an instruction.
+       *
+       * To get around these issues, we just do all 8-bit scan operations in
+       * 16 bits.  It's actually fewer instructions than what we'd have to do
+       * if we were trying to do it in native 8-bit types and the results are
+       * the same once we truncate to 8 bits at the end.
+       */
+      brw_reg_type scan_type = src.type;
+      if (type_sz(scan_type) == 1)
+         scan_type = brw_reg_type_from_bit_size(16, src.type);
+
      /* Set up a register for all of our scratching around and initialize it
       * to reduction operation's identity value.
       */
-      fs_reg scan = bld.vgrf(src.type);
+      fs_reg scan = bld.vgrf(scan_type);
      bld.exec_all().emit(SHADER_OPCODE_SEL_EXEC, scan, src, identity);

      bld.emit_scan(brw_op, scan, cluster_size, cond_mod);
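The comment above claims that doing 8-bit scans in 16-bit registers and truncating afterwards gives identical results. For the wraparound arithmetic ops this holds because addition is congruent modulo 256; a quick standalone check with an inclusive add scan:

#include <cstdint>
#include <cstdio>

int main()
{
   const uint8_t src[4] = { 200, 100, 50, 25 };
   uint16_t scan[4];                  /* the widened "scan_type" */

   uint16_t acc = 0;
   for (int i = 0; i < 4; i++) {
      acc = (uint16_t)(acc + src[i]);
      scan[i] = acc;
   }

   /* Truncating to 8 bits at the end gives 200 44 94 119, exactly what a
    * native 8-bit wraparound scan would have produced.
    */
   for (int i = 0; i < 4; i++)
      printf("%u ", (uint8_t)scan[i]);
   printf("\n");
   return 0;
}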
@@ -4741,10 +5017,28 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
      opcode brw_op = brw_op_for_nir_reduction_op(redop);
      brw_conditional_mod cond_mod = brw_cond_mod_for_nir_reduction_op(redop);

+      /* There are a couple of register region issues that make things
+       * complicated for 8-bit types:
+       *
+       *    1. Only raw moves are allowed to write to a packed 8-bit
+       *       destination.
+       *    2. If we use a strided destination, the efficient way to do scan
+       *       operations ends up using strides that are too big to encode in
+       *       an instruction.
+       *
+       * To get around these issues, we just do all 8-bit scan operations in
+       * 16 bits.  It's actually fewer instructions than what we'd have to do
+       * if we were trying to do it in native 8-bit types and the results are
+       * the same once we truncate to 8 bits at the end.
+       */
+      brw_reg_type scan_type = src.type;
+      if (type_sz(scan_type) == 1)
+         scan_type = brw_reg_type_from_bit_size(16, src.type);
+
      /* Set up a register for all of our scratching around and initialize it
       * to reduction operation's identity value.
       */
-      fs_reg scan = bld.vgrf(src.type);
+      fs_reg scan = bld.vgrf(scan_type);
      const fs_builder allbld = bld.exec_all();
      allbld.emit(SHADER_OPCODE_SEL_EXEC, scan, src, identity);

@@ -4753,7 +5047,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
       * shift of the contents before we can begin.  To make things worse,
       * we can't do this with a normal stride; we have to use indirects.
       */
-      fs_reg shifted = bld.vgrf(src.type);
+      fs_reg shifted = bld.vgrf(scan_type);
      fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_W);
      allbld.ADD(idx, nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION],
                 brw_imm_w(-1));
@@ -4772,14 +5066,26 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr

      const fs_builder ubld = bld.group(8, 0);
      const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
-      ubld.emit(SHADER_OPCODE_INTERLOCK, tmp)->size_written = 2 *
-         REG_SIZE;
-
+      ubld.emit(SHADER_OPCODE_INTERLOCK, tmp, brw_vec8_grf(0, 0))
+         ->size_written = 2 * REG_SIZE;
      break;
   }

   case nir_intrinsic_end_invocation_interlock: {
-      /* We don't need to do anything here */
+      /* For endInvocationInterlock(), we need to insert a memory fence which
+       * stalls in the shader until the memory transactions prior to that
+       * fence are complete.  This ensures that the shader does not end before
+       * any writes from its critical section have landed.  Otherwise, you can
+       * end up with a case where the next invocation on that pixel properly
+       * stalls for the previous FS invocation on its pixel to complete but
+       * doesn't actually wait for the dataport memory transactions from that
+       * thread to land before submitting its own.
+       */
+      const fs_builder ubld = bld.group(8, 0);
+      const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
+      ubld.emit(SHADER_OPCODE_MEMORY_FENCE, tmp,
+                brw_vec8_grf(0, 0), brw_imm_ud(1), brw_imm_ud(0))
+         ->size_written = 2 * REG_SIZE;
      break;
   }

@@ -4795,6 +5101,13 @@ fs_visitor::nir_emit_ssbo_atomic(const fs_builder &bld,
   if (stage == MESA_SHADER_FRAGMENT)
      brw_wm_prog_data(prog_data)->has_side_effects = true;

+   /* The BTI untyped atomic messages only support 32-bit atomics.  If you
+    * just look at the big table of messages in Vol 7 of the SKL PRM, Qword
+    * atomics appear to exist.  However, if you look at Vol 2a, there are no
+    * message descriptors provided for Qword atomic ops except for A64
+    * messages.
+    */
+   assert(nir_dest_bit_size(instr->dest) == 32);
+
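The assert above pins BTI (surface-index) atomics to 32 bits, while the A64 path in the next hunk gains a dedicated 64-bit message. The resulting opcode selection is tiny but worth spelling out; the opcode names below are the real ones from this diff, while the helper itself is only illustrative:

#include <cassert>
#include <cstdio>

static const char *a64_atomic_opcode(unsigned bit_size)
{
   if (bit_size == 64)
      return "SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT64_LOGICAL";
   assert(bit_size == 32);  /* anything else must be lowered earlier */
   return "SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL";
}

int main()
{
   printf("%s\n", a64_atomic_opcode(64));
   return 0;
}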
   fs_reg dest;
   if (nir_intrinsic_infos[instr->intrinsic].has_dest)
      dest = get_nir_dest(instr->dest);
@@ -4959,8 +5272,14 @@ fs_visitor::nir_emit_global_atomic(const fs_builder &bld,
      data = tmp;
   }

-   bld.emit(SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL,
-            dest, addr, data, brw_imm_ud(op));
+   if (nir_dest_bit_size(instr->dest) == 64) {
+      bld.emit(SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT64_LOGICAL,
+               dest, addr, data, brw_imm_ud(op));
+   } else {
+      assert(nir_dest_bit_size(instr->dest) == 32);
+      bld.emit(SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL,
+               dest, addr, data, brw_imm_ud(op));
+   }
 }

 void
@@ -5091,6 +5410,18 @@ fs_visitor::nir_emit_texture(const fs_builder &bld, nir_tex_instr *instr)
         break;
      }

+      case nir_tex_src_texture_handle:
+         assert(nir_tex_instr_src_index(instr, nir_tex_src_texture_offset) == -1);
+         srcs[TEX_LOGICAL_SRC_SURFACE] = fs_reg();
+         srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE] = bld.emit_uniformize(src);
+         break;
+
+      case nir_tex_src_sampler_handle:
+         assert(nir_tex_instr_src_index(instr, nir_tex_src_sampler_offset) == -1);
+         srcs[TEX_LOGICAL_SRC_SAMPLER] = fs_reg();
+         srcs[TEX_LOGICAL_SRC_SAMPLER_HANDLE] = bld.emit_uniformize(src);
+         break;
+
      case nir_tex_src_ms_mcs:
         assert(instr->op == nir_texop_txf_ms);
         srcs[TEX_LOGICAL_SRC_MCS] = retype(src, BRW_REGISTER_TYPE_D);
@@ -5120,7 +5451,8 @@ fs_visitor::nir_emit_texture(const fs_builder &bld, nir_tex_instr *instr)
      srcs[TEX_LOGICAL_SRC_MCS] =
         emit_mcs_fetch(srcs[TEX_LOGICAL_SRC_COORDINATE],
                        instr->coord_components,
-                        srcs[TEX_LOGICAL_SRC_SURFACE]);
+                        srcs[TEX_LOGICAL_SRC_SURFACE],
+                        srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE]);
   } else {
      srcs[TEX_LOGICAL_SRC_MCS] = brw_imm_ud(0u);
   }
@@ -5129,15 +5461,10 @@ fs_visitor::nir_emit_texture(const fs_builder &bld, nir_tex_instr *instr)
   srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = brw_imm_d(instr->coord_components);
   srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = brw_imm_d(lod_components);

-   bool shader_supports_implicit_lod = stage == MESA_SHADER_FRAGMENT ||
-                                       (stage == MESA_SHADER_COMPUTE &&
-                                        nir->info.cs.derivative_group != DERIVATIVE_GROUP_NONE);
-
   enum opcode opcode;
   switch (instr->op) {
   case nir_texop_tex:
-      opcode = shader_supports_implicit_lod ?
-               SHADER_OPCODE_TEX_LOGICAL : SHADER_OPCODE_TXL_LOGICAL;
+      opcode = SHADER_OPCODE_TEX_LOGICAL;
      break;
   case nir_texop_txb:
      opcode = FS_OPCODE_TXB_LOGICAL;
@@ -5390,28 +5717,6 @@ shuffle_from_32bit_read(const fs_builder &bld,
   shuffle_src_to_dst(bld, dst, src, first_component, components);
 }

-fs_reg
-shuffle_for_32bit_write(const fs_builder &bld,
-                        const fs_reg &src,
-                        uint32_t first_component,
-                        uint32_t components)
-{
-   fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_D,
-                         DIV_ROUND_UP (components * type_sz(src.type), 4));
-   /* This function takes components in units of the source type while
-    * shuffle_src_to_dst takes components in units of the smallest type
-    */
-   if (type_sz(src.type) > 4) {
-      assert(type_sz(src.type) == 8);
-      first_component *= 2;
-      components *= 2;
-   }
-
-   shuffle_src_to_dst(bld, dst, src, first_component, components);
-
-   return dst;
-}
-
 fs_reg
 setup_imm_df(const fs_builder &bld, double v)
 {
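One last note on the scan machinery earlier in this diff: the exclusive-scan path builds its result by shifting every lane's value down one lane with an indirect MOV (the idx = invocation - 1 computation) and then reusing the same inclusive scan as the reduce path. A standalone model of that construction, using the additive identity for lane 0 just as the SEL_EXEC initialization does:

#include <cstdint>
#include <cstdio>

int main()
{
   const int n = 8;
   const uint32_t identity = 0;          /* identity for iadd */
   uint32_t src[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
   uint32_t shifted[8], scan[8];

   for (int i = 0; i < n; i++) {
      int idx = i - 1;                   /* ADD idx, subgroup_invocation, -1 */
      shifted[i] = idx < 0 ? identity : src[idx];
   }

   uint32_t acc = 0;
   for (int i = 0; i < n; i++) {
      acc += shifted[i];
      scan[i] = acc;
   }

   for (int i = 0; i < n; i++)
      printf("%u ", scan[i]);            /* 0 1 3 6 10 15 21 28 */
   printf("\n");
   return 0;
}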