X-Git-Url: https://git.libre-soc.org/?p=mesa.git;a=blobdiff_plain;f=src%2Fintel%2Fcompiler%2Fbrw_fs_nir.cpp;h=10fffa13054aae9267dd4ae93aaee9fe069c434d;hp=32d1e347b12a7b52300c3cedc8e7ce72313ecb14;hb=5799da47c794aced34187df2eee6fd349c51b931;hpb=e51eda99dfd6a66b066e371005e7a54ecc38fc11 diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp index 32d1e347b12..10fffa13054 100644 --- a/src/intel/compiler/brw_fs_nir.cpp +++ b/src/intel/compiler/brw_fs_nir.cpp @@ -59,7 +59,7 @@ fs_visitor::nir_setup_outputs() * allocating them. With ARB_enhanced_layouts, multiple output variables * may occupy the same slot, but have different type sizes. */ - nir_foreach_variable(var, &nir->outputs) { + nir_foreach_shader_out_variable(var, nir) { const int loc = var->data.driver_location; const unsigned var_vec4s = var->data.compact ? DIV_ROUND_UP(glsl_get_length(var->type), 4) @@ -100,12 +100,25 @@ fs_visitor::nir_setup_uniforms() uniforms = nir->num_uniforms / 4; - if (stage == MESA_SHADER_COMPUTE) { - /* Add a uniform for the thread local id. It must be the last uniform - * on the list. - */ + if (stage == MESA_SHADER_COMPUTE || stage == MESA_SHADER_KERNEL) { + /* Add uniforms for builtins after regular NIR uniforms. */ assert(uniforms == prog_data->nr_params); - uint32_t *param = brw_stage_prog_data_add_params(prog_data, 1); + + uint32_t *param; + if (nir->info.cs.local_size_variable && + compiler->lower_variable_group_size) { + param = brw_stage_prog_data_add_params(prog_data, 3); + for (unsigned i = 0; i < 3; i++) { + param[i] = (BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_X + i); + group_size[i] = fs_reg(UNIFORM, uniforms++, BRW_REGISTER_TYPE_UD); + } + } + + /* Subgroup ID must be the last uniform on the list. This will make + * easier later to split between cross thread and per thread + * uniforms. + */ + param = brw_stage_prog_data_add_params(prog_data, 1); *param = BRW_PARAM_BUILTIN_SUBGROUP_ID; subgroup_id = fs_reg(UNIFORM, uniforms++, BRW_REGISTER_TYPE_UD); } @@ -171,7 +184,8 @@ emit_system_values_block(nir_block *block, fs_visitor *v) break; case nir_intrinsic_load_work_group_id: - assert(v->stage == MESA_SHADER_COMPUTE); + assert(v->stage == MESA_SHADER_COMPUTE || + v->stage == MESA_SHADER_KERNEL); reg = &v->nir_system_values[SYSTEM_VALUE_WORK_GROUP_ID]; if (reg->file == BAD_FILE) *reg = *v->emit_cs_work_group_id_setup(); @@ -394,9 +408,6 @@ fs_visitor::nir_emit_if(nir_if *if_stmt) */ nir_alu_instr *cond = nir_src_as_alu_instr(if_stmt->condition); if (cond != NULL && cond->op == nir_op_inot) { - assert(!cond->src[0].negate); - assert(!cond->src[0].abs); - invert = true; cond_reg = get_nir_src(cond->src[0].src); } else { @@ -479,6 +490,7 @@ fs_visitor::nir_emit_instr(nir_instr *instr) nir_emit_fs_intrinsic(abld, nir_instr_as_intrinsic(instr)); break; case MESA_SHADER_COMPUTE: + case MESA_SHADER_KERNEL: nir_emit_cs_intrinsic(abld, nir_instr_as_intrinsic(instr)); break; default: @@ -532,15 +544,6 @@ fs_visitor::optimize_extract_to_float(nir_alu_instr *instr, src0->op != nir_op_extract_i8 && src0->op != nir_op_extract_i16) return false; - /* If either opcode has source modifiers, bail. - * - * TODO: We can potentially handle source modifiers if both of the opcodes - * we're combining are signed integers. - */ - if (instr->src[0].abs || instr->src[0].negate || - src0->src[0].abs || src0->src[0].negate) - return false; - unsigned element = nir_src_as_uint(src0->src[1].src); /* Element type to extract.*/ @@ -554,8 +557,7 @@ fs_visitor::optimize_extract_to_float(nir_alu_instr *instr, nir_src_bit_size(src0->src[0].src))); op0 = offset(op0, bld, src0->src[0].swizzle[0]); - set_saturate(instr->dest.saturate, - bld.MOV(result, subscript(op0, type, element))); + bld.MOV(result, subscript(op0, type, element)); return true; } @@ -727,13 +729,17 @@ fs_visitor::prepare_alu_destination_and_sources(const fs_builder &bld, (nir_alu_type)(nir_op_infos[instr->op].output_type | nir_dest_bit_size(instr->dest.dest))); + assert(!instr->dest.saturate); + for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) { + /* We don't lower to source modifiers so they should not exist. */ + assert(!instr->src[i].abs); + assert(!instr->src[i].negate); + op[i] = get_nir_src(instr->src[i].src); op[i].type = brw_type_for_nir_type(devinfo, (nir_alu_type)(nir_op_infos[instr->op].input_types[i] | nir_src_bit_size(instr->src[i].src))); - op[i].abs = instr->src[i].abs; - op[i].negate = instr->src[i].negate; } /* Move and vecN instrutions may still be vectored. Return the raw, @@ -746,6 +752,8 @@ fs_visitor::prepare_alu_destination_and_sources(const fs_builder &bld, case nir_op_vec2: case nir_op_vec3: case nir_op_vec4: + case nir_op_vec8: + case nir_op_vec16: return result; default: break; @@ -781,8 +789,7 @@ fs_visitor::resolve_inot_sources(const fs_builder &bld, nir_alu_instr *instr, for (unsigned i = 0; i < 2; i++) { nir_alu_instr *inot_instr = nir_src_as_alu_instr(instr->src[i].src); - if (inot_instr != NULL && inot_instr->op == nir_op_inot && - !inot_instr->src[0].abs && !inot_instr->src[0].negate) { + if (inot_instr != NULL && inot_instr->op == nir_op_inot) { /* The source of the inot is now the source of instr. */ prepare_alu_destination_and_sources(bld, inot_instr, &op[i], false); @@ -851,8 +858,6 @@ fs_visitor::emit_fsign(const fs_builder &bld, const nir_alu_instr *instr, const nir_alu_instr *const fsign_instr = nir_src_as_alu_instr(instr->src[fsign_src].src); - assert(!fsign_instr->dest.saturate); - /* op[fsign_src] has the nominal result of the fsign, and op[1 - * fsign_src] has the other multiply source. This must be rearranged so * that op[0] is the source of the fsign op[1] is the other multiply @@ -868,8 +873,6 @@ fs_visitor::emit_fsign(const fs_builder &bld, const nir_alu_instr *instr, nir_src_bit_size(fsign_instr->src[0].src)); op[0].type = brw_type_for_nir_type(devinfo, t); - op[0].abs = fsign_instr->src[0].abs; - op[0].negate = fsign_instr->src[0].negate; unsigned channel = 0; if (nir_op_infos[instr->op].output_size == 0) { @@ -881,27 +884,9 @@ fs_visitor::emit_fsign(const fs_builder &bld, const nir_alu_instr *instr, } op[0] = offset(op[0], bld, fsign_instr->src[0].swizzle[channel]); - } else { - assert(!instr->dest.saturate); } - if (op[0].abs) { - /* Straightforward since the source can be assumed to be either strictly - * >= 0 or strictly <= 0 depending on the setting of the negate flag. - */ - set_condmod(BRW_CONDITIONAL_NZ, bld.MOV(result, op[0])); - - if (instr->op == nir_op_fsign) { - inst = (op[0].negate) - ? bld.MOV(result, brw_imm_f(-1.0f)) - : bld.MOV(result, brw_imm_f(1.0f)); - } else { - op[1].negate = (op[0].negate != op[1].negate); - inst = bld.MOV(result, op[1]); - } - - set_predicate(BRW_PREDICATE_NORMAL, inst); - } else if (type_sz(op[0].type) == 2) { + if (type_sz(op[0].type) == 2) { /* AND(val, 0x8000) gives the sign bit. * * Predicated OR ORs 1.0 (0x3c00) with the sign bit if val is not zero. @@ -1007,8 +992,7 @@ can_fuse_fmul_fsign(nir_alu_instr *instr, unsigned fsign_src) * have already been taken (in nir_opt_algebraic) to ensure that. */ return fsign_instr != NULL && fsign_instr->op == nir_op_fsign && - is_used_once(fsign_instr) && - !instr->src[fsign_src].abs && !instr->src[fsign_src].negate; + is_used_once(fsign_instr); } void @@ -1020,14 +1004,16 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr, unsigned execution_mode = bld.shader->nir->info.float_controls_execution_mode; - fs_reg op[4]; + fs_reg op[NIR_MAX_VEC_COMPONENTS]; fs_reg result = prepare_alu_destination_and_sources(bld, instr, op, need_dest); switch (instr->op) { case nir_op_mov: case nir_op_vec2: case nir_op_vec3: - case nir_op_vec4: { + case nir_op_vec4: + case nir_op_vec8: + case nir_op_vec16: { fs_reg temp = result; bool need_extra_copy = false; for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) { @@ -1044,13 +1030,12 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr, continue; if (instr->op == nir_op_mov) { - inst = bld.MOV(offset(temp, bld, i), + bld.MOV(offset(temp, bld, i), offset(op[0], bld, instr->src[0].swizzle[i])); } else { - inst = bld.MOV(offset(temp, bld, i), + bld.MOV(offset(temp, bld, i), offset(op[i], bld, instr->src[i].swizzle[0])); } - inst->saturate = instr->dest.saturate; } /* In this case the source and destination registers were the same, @@ -1073,7 +1058,6 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr, if (optimize_extract_to_float(instr, result)) return; inst = bld.MOV(result, op[0]); - inst->saturate = instr->dest.saturate; break; case nir_op_f2f16_rtne: @@ -1100,7 +1084,6 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr, */ assert(type_sz(op[0].type) < 8); /* brw_nir_lower_conversions */ inst = bld.MOV(result, op[0]); - inst->saturate = instr->dest.saturate; break; } @@ -1148,7 +1131,6 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr, assert(type_sz(result.type) < 8); /* brw_nir_lower_conversions */ inst = bld.MOV(result, op[0]); - inst->saturate = instr->dest.saturate; break; case nir_op_fsat: @@ -1160,8 +1142,6 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr, case nir_op_ineg: op[0].negate = true; inst = bld.MOV(result, op[0]); - if (instr->op == nir_op_fneg) - inst->saturate = instr->dest.saturate; break; case nir_op_fabs: @@ -1169,8 +1149,6 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr, op[0].negate = false; op[0].abs = true; inst = bld.MOV(result, op[0]); - if (instr->op == nir_op_fabs) - inst->saturate = instr->dest.saturate; break; case nir_op_f2f32: @@ -1185,7 +1163,6 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr, assert(type_sz(result.type) < 8); /* brw_nir_lower_conversions */ inst = bld.MOV(result, op[0]); - inst->saturate = instr->dest.saturate; break; case nir_op_fsign: @@ -1194,27 +1171,22 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr, case nir_op_frcp: inst = bld.emit(SHADER_OPCODE_RCP, result, op[0]); - inst->saturate = instr->dest.saturate; break; case nir_op_fexp2: inst = bld.emit(SHADER_OPCODE_EXP2, result, op[0]); - inst->saturate = instr->dest.saturate; break; case nir_op_flog2: inst = bld.emit(SHADER_OPCODE_LOG2, result, op[0]); - inst->saturate = instr->dest.saturate; break; case nir_op_fsin: inst = bld.emit(SHADER_OPCODE_SIN, result, op[0]); - inst->saturate = instr->dest.saturate; break; case nir_op_fcos: inst = bld.emit(SHADER_OPCODE_COS, result, op[0]); - inst->saturate = instr->dest.saturate; break; case nir_op_fddx: @@ -1223,15 +1195,12 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr, } else { inst = bld.emit(FS_OPCODE_DDX_COARSE, result, op[0]); } - inst->saturate = instr->dest.saturate; break; case nir_op_fddx_fine: inst = bld.emit(FS_OPCODE_DDX_FINE, result, op[0]); - inst->saturate = instr->dest.saturate; break; case nir_op_fddx_coarse: inst = bld.emit(FS_OPCODE_DDX_COARSE, result, op[0]); - inst->saturate = instr->dest.saturate; break; case nir_op_fddy: if (fs_key->high_quality_derivatives) { @@ -1239,15 +1208,12 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr, } else { inst = bld.emit(FS_OPCODE_DDY_COARSE, result, op[0]); } - inst->saturate = instr->dest.saturate; break; case nir_op_fddy_fine: inst = bld.emit(FS_OPCODE_DDY_FINE, result, op[0]); - inst->saturate = instr->dest.saturate; break; case nir_op_fddy_coarse: inst = bld.emit(FS_OPCODE_DDY_COARSE, result, op[0]); - inst->saturate = instr->dest.saturate; break; case nir_op_fadd: @@ -1260,14 +1226,47 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr, /* fallthrough */ case nir_op_iadd: inst = bld.ADD(result, op[0], op[1]); - inst->saturate = instr->dest.saturate; break; + case nir_op_iadd_sat: case nir_op_uadd_sat: inst = bld.ADD(result, op[0], op[1]); inst->saturate = true; break; + case nir_op_isub_sat: + bld.emit(SHADER_OPCODE_ISUB_SAT, result, op[0], op[1]); + break; + + case nir_op_usub_sat: + bld.emit(SHADER_OPCODE_USUB_SAT, result, op[0], op[1]); + break; + + case nir_op_irhadd: + case nir_op_urhadd: + assert(nir_dest_bit_size(instr->dest.dest) < 64); + inst = bld.AVG(result, op[0], op[1]); + break; + + case nir_op_ihadd: + case nir_op_uhadd: { + assert(nir_dest_bit_size(instr->dest.dest) < 64); + fs_reg tmp = bld.vgrf(result.type); + + if (devinfo->gen >= 8) { + op[0] = resolve_source_modifiers(op[0]); + op[1] = resolve_source_modifiers(op[1]); + } + + /* AVG(x, y) - ((x ^ y) & 1) */ + bld.XOR(tmp, op[0], op[1]); + bld.AND(tmp, tmp, retype(brw_imm_ud(1), result.type)); + bld.AVG(result, op[0], op[1]); + inst = bld.ADD(result, result, tmp); + inst->src[1].negate = true; + break; + } + case nir_op_fmul: for (unsigned i = 0; i < 2; i++) { if (can_fuse_fmul_fsign(instr, i)) { @@ -1288,7 +1287,6 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr, } inst = bld.MUL(result, op[0], op[1]); - inst->saturate = instr->dest.saturate; break; case nir_op_imul_2x32_64: @@ -1296,6 +1294,34 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr, bld.MUL(result, op[0], op[1]); break; + case nir_op_imul_32x16: + case nir_op_umul_32x16: { + const bool ud = instr->op == nir_op_umul_32x16; + + assert(nir_dest_bit_size(instr->dest.dest) == 32); + + /* Before Gen7, the order of the 32-bit source and the 16-bit source was + * swapped. The extension isn't enabled on those platforms, so don't + * pretend to support the differences. + */ + assert(devinfo->gen >= 7); + + if (op[1].file == IMM) + op[1] = ud ? brw_imm_uw(op[1].ud) : brw_imm_w(op[1].d); + else { + const enum brw_reg_type word_type = + ud ? BRW_REGISTER_TYPE_UW : BRW_REGISTER_TYPE_W; + + op[1] = subscript(op[1], word_type, 0); + } + + const enum brw_reg_type dword_type = + ud ? BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_D; + + bld.MUL(result, retype(op[0], dword_type), op[1]); + break; + } + case nir_op_imul: assert(nir_dest_bit_size(instr->dest.dest) < 64); bld.MUL(result, op[0], op[1]); @@ -1363,7 +1389,7 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr, case nir_op_flt32: case nir_op_fge32: case nir_op_feq32: - case nir_op_fne32: { + case nir_op_fneu32: { fs_reg dest = result; const uint32_t bit_size = nir_src_bit_size(instr->src[0].src); @@ -1429,11 +1455,7 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr, if (inot_src_instr != NULL && (inot_src_instr->op == nir_op_ior || inot_src_instr->op == nir_op_ixor || - inot_src_instr->op == nir_op_iand) && - !inot_src_instr->src[0].abs && - !inot_src_instr->src[0].negate && - !inot_src_instr->src[1].abs && - !inot_src_instr->src[1].negate) { + inot_src_instr->op == nir_op_iand)) { /* The sources of the source logical instruction are now the * sources of the instruction that will be generated. */ @@ -1522,35 +1544,15 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr, case nir_op_b32any_inequal4: unreachable("Lowered by nir_lower_alu_reductions"); - case nir_op_fnoise1_1: - case nir_op_fnoise1_2: - case nir_op_fnoise1_3: - case nir_op_fnoise1_4: - case nir_op_fnoise2_1: - case nir_op_fnoise2_2: - case nir_op_fnoise2_3: - case nir_op_fnoise2_4: - case nir_op_fnoise3_1: - case nir_op_fnoise3_2: - case nir_op_fnoise3_3: - case nir_op_fnoise3_4: - case nir_op_fnoise4_1: - case nir_op_fnoise4_2: - case nir_op_fnoise4_3: - case nir_op_fnoise4_4: - unreachable("not reached: should be handled by lower_noise"); - case nir_op_ldexp: unreachable("not reached: should be handled by ldexp_to_arith()"); case nir_op_fsqrt: inst = bld.emit(SHADER_OPCODE_SQRT, result, op[0]); - inst->saturate = instr->dest.saturate; break; case nir_op_frsq: inst = bld.emit(SHADER_OPCODE_RSQ, result, op[0]); - inst->saturate = instr->dest.saturate; break; case nir_op_i2b32: @@ -1593,7 +1595,12 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr, case nir_op_ftrunc: inst = bld.RNDZ(result, op[0]); - inst->saturate = instr->dest.saturate; + if (devinfo->gen < 6) { + set_condmod(BRW_CONDITIONAL_R, inst); + set_predicate(BRW_PREDICATE_NORMAL, + bld.ADD(result, result, brw_imm_f(1.0f))); + inst = bld.MOV(result, result); /* for potential saturation */ + } break; case nir_op_fceil: { @@ -1602,20 +1609,22 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr, bld.RNDD(temp, op[0]); temp.negate = true; inst = bld.MOV(result, temp); - inst->saturate = instr->dest.saturate; break; } case nir_op_ffloor: inst = bld.RNDD(result, op[0]); - inst->saturate = instr->dest.saturate; break; case nir_op_ffract: inst = bld.FRC(result, op[0]); - inst->saturate = instr->dest.saturate; break; case nir_op_fround_even: inst = bld.RNDE(result, op[0]); - inst->saturate = instr->dest.saturate; + if (devinfo->gen < 6) { + set_condmod(BRW_CONDITIONAL_R, inst); + set_predicate(BRW_PREDICATE_NORMAL, + bld.ADD(result, result, brw_imm_f(1.0f))); + inst = bld.MOV(result, result); /* for potential saturation */ + } break; case nir_op_fquantize2f16: { @@ -1642,7 +1651,6 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr, /* Select that or zero based on normal status */ inst = bld.SEL(result, zero, tmp32); inst->predicate = BRW_PREDICATE_NORMAL; - inst->saturate = instr->dest.saturate; break; } @@ -1650,14 +1658,12 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr, case nir_op_umin: case nir_op_fmin: inst = bld.emit_minmax(result, op[0], op[1], BRW_CONDITIONAL_L); - inst->saturate = instr->dest.saturate; break; case nir_op_imax: case nir_op_umax: case nir_op_fmax: inst = bld.emit_minmax(result, op[0], op[1], BRW_CONDITIONAL_GE); - inst->saturate = instr->dest.saturate; break; case nir_op_pack_snorm_2x16: @@ -1678,7 +1684,6 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr, case nir_op_unpack_half_2x16_split_x: inst = bld.emit(BRW_OPCODE_F16TO32, result, subscript(op[0], BRW_REGISTER_TYPE_UW, 0)); - inst->saturate = instr->dest.saturate; break; case nir_op_unpack_half_2x16_split_y_flush_to_zero: @@ -1687,7 +1692,6 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr, case nir_op_unpack_half_2x16_split_y: inst = bld.emit(BRW_OPCODE_F16TO32, result, subscript(op[0], BRW_REGISTER_TYPE_UW, 1)); - inst->saturate = instr->dest.saturate; break; case nir_op_pack_64_2x32_split: @@ -1715,7 +1719,6 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr, case nir_op_fpow: inst = bld.emit(SHADER_OPCODE_POW, result, op[0], op[1]); - inst->saturate = instr->dest.saturate; break; case nir_op_bitfield_reverse: @@ -1734,6 +1737,11 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr, break; } + case nir_op_uclz: + assert(nir_dest_bit_size(instr->dest.dest) == 32); + bld.LZD(retype(result, BRW_REGISTER_TYPE_UD), op[0]); + break; + case nir_op_ifind_msb: { assert(nir_dest_bit_size(instr->dest.dest) < 64); @@ -1831,7 +1839,6 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr, } inst = bld.MAD(result, op[2], op[1], op[0]); - inst->saturate = instr->dest.saturate; break; case nir_op_flrp: @@ -1843,7 +1850,6 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr, } inst = bld.LRP(result, op[0], op[1], op[2]); - inst->saturate = instr->dest.saturate; break; case nir_op_b32csel: @@ -2751,7 +2757,7 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld, brw_imm_d(tcs_key->input_vertices)); break; - case nir_intrinsic_barrier: { + case nir_intrinsic_control_barrier: { if (tcs_prog_data->instances == 1) break; @@ -3163,7 +3169,15 @@ fs_visitor::nir_emit_gs_intrinsic(const fs_builder &bld, static fs_reg fetch_render_target_array_index(const fs_builder &bld) { - if (bld.shader->devinfo->gen >= 6) { + if (bld.shader->devinfo->gen >= 12) { + /* The render target array index is provided in the thread payload as + * bits 26:16 of r1.1. + */ + const fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_UD); + bld.AND(idx, brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, 1, 3), + brw_imm_uw(0x7ff)); + return idx; + } else if (bld.shader->devinfo->gen >= 6) { /* The render target array index is provided in the thread payload as * bits 26:16 of r0.0. */ @@ -3313,44 +3327,6 @@ alloc_frag_output(fs_visitor *v, unsigned location) unreachable("Invalid location"); } -/* Annoyingly, we get the barycentrics into the shader in a layout that's - * optimized for PLN but it doesn't work nearly as well as one would like for - * manual interpolation. - */ -static void -shuffle_from_pln_layout(const fs_builder &bld, fs_reg dest, fs_reg pln_data) -{ - dest.type = BRW_REGISTER_TYPE_F; - pln_data.type = BRW_REGISTER_TYPE_F; - const fs_reg dest_u = offset(dest, bld, 0); - const fs_reg dest_v = offset(dest, bld, 1); - - for (unsigned g = 0; g < bld.dispatch_width() / 8; g++) { - const fs_builder gbld = bld.group(8, g); - gbld.MOV(horiz_offset(dest_u, g * 8), - byte_offset(pln_data, (g * 2 + 0) * REG_SIZE)); - gbld.MOV(horiz_offset(dest_v, g * 8), - byte_offset(pln_data, (g * 2 + 1) * REG_SIZE)); - } -} - -static void -shuffle_to_pln_layout(const fs_builder &bld, fs_reg pln_data, fs_reg src) -{ - pln_data.type = BRW_REGISTER_TYPE_F; - src.type = BRW_REGISTER_TYPE_F; - const fs_reg src_u = offset(src, bld, 0); - const fs_reg src_v = offset(src, bld, 1); - - for (unsigned g = 0; g < bld.dispatch_width() / 8; g++) { - const fs_builder gbld = bld.group(8, g); - gbld.MOV(byte_offset(pln_data, (g * 2 + 0) * REG_SIZE), - horiz_offset(src_u, g * 8)); - gbld.MOV(byte_offset(pln_data, (g * 2 + 1) * REG_SIZE), - horiz_offset(src_v, g * 8)); - } -} - void fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr) @@ -3394,7 +3370,7 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld, fs_inst *mov = bld.MOV(dest, brw_imm_ud(~0)); mov->predicate = BRW_PREDICATE_NORMAL; mov->predicate_inverse = true; - mov->flag_subreg = 1; + mov->flag_subreg = sample_mask_flag_subreg(this); break; } @@ -3449,9 +3425,9 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld, case nir_intrinsic_discard: case nir_intrinsic_demote_if: case nir_intrinsic_discard_if: { - /* We track our discarded pixels in f0.1. By predicating on it, we can - * update just the flag bits that aren't yet discarded. If there's no - * condition, we emit a CMP of g0 != g0, so all currently executing + /* We track our discarded pixels in f0.1/f1.0. By predicating on it, we + * can update just the flag bits that aren't yet discarded. If there's + * no condition, we emit a CMP of g0 != g0, so all currently executing * channels will get turned off. */ fs_inst *cmp = NULL; @@ -3461,10 +3437,9 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld, if (alu != NULL && alu->op != nir_op_bcsel && - alu->op != nir_op_inot && (devinfo->gen > 5 || (alu->instr.pass_flags & BRW_NIR_BOOLEAN_MASK) != BRW_NIR_BOOLEAN_NEEDS_RESOLVE || - alu->op == nir_op_fne32 || alu->op == nir_op_feq32 || + alu->op == nir_op_fneu32 || alu->op == nir_op_feq32 || alu->op == nir_op_flt32 || alu->op == nir_op_fge32 || alu->op == nir_op_ine32 || alu->op == nir_op_ieq32 || alu->op == nir_op_ilt32 || alu->op == nir_op_ige32 || @@ -3511,17 +3486,13 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld, } cmp->predicate = BRW_PREDICATE_NORMAL; - cmp->flag_subreg = 1; + cmp->flag_subreg = sample_mask_flag_subreg(this); - if (devinfo->gen >= 6) { - /* Due to the way we implement discard, the jump will only happen - * when the whole quad is discarded. So we can do this even for - * demote as it won't break its uniformity promises. - */ - emit_discard_jump(); - } + emit_discard_jump(); - limit_dispatch_width(16, "Fragment discard/demote not implemented in SIMD32 mode."); + if (devinfo->gen < 7) + limit_dispatch_width( + 16, "Fragment discard/demote not implemented in SIMD32 mode.\n"); break; } @@ -3565,8 +3536,9 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld, (enum glsl_interp_mode) nir_intrinsic_interp_mode(instr); enum brw_barycentric_mode bary = brw_barycentric_mode(interp_mode, instr->intrinsic); - - shuffle_from_pln_layout(bld, dest, this->delta_xy[bary]); + const fs_reg srcs[] = { offset(this->delta_xy[bary], bld, 0), + offset(this->delta_xy[bary], bld, 1) }; + bld.LOAD_PAYLOAD(dest, srcs, ARRAY_SIZE(srcs), 0); break; } @@ -3574,13 +3546,12 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld, const glsl_interp_mode interpolation = (enum glsl_interp_mode) nir_intrinsic_interp_mode(instr); - fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, 2); if (nir_src_is_const(instr->src[0])) { unsigned msg_data = nir_src_as_uint(instr->src[0]) << 4; emit_pixel_interpolater_send(bld, FS_OPCODE_INTERPOLATE_AT_SAMPLE, - tmp, + dest, fs_reg(), /* src */ brw_imm_ud(msg_data), interpolation); @@ -3595,9 +3566,9 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld, .SHL(msg_data, sample_id, brw_imm_ud(4u)); emit_pixel_interpolater_send(bld, FS_OPCODE_INTERPOLATE_AT_SAMPLE, - tmp, + dest, fs_reg(), /* src */ - msg_data, + component(msg_data, 0), interpolation); } else { /* Make a loop that sends a message to the pixel interpolater @@ -3623,7 +3594,7 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld, fs_inst *inst = emit_pixel_interpolater_send(bld, FS_OPCODE_INTERPOLATE_AT_SAMPLE, - tmp, + dest, fs_reg(), /* src */ component(msg_data, 0), interpolation); @@ -3635,7 +3606,6 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld, bld.emit(BRW_OPCODE_WHILE)); } } - shuffle_from_pln_layout(bld, dest, tmp); break; } @@ -3645,7 +3615,6 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld, nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]); - fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, 2); if (const_offset) { assert(nir_src_bit_size(instr->src[0]) == 32); unsigned off_x = MIN2((int)(const_offset[0].f32 * 16), 7) & 0xf; @@ -3653,7 +3622,7 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld, emit_pixel_interpolater_send(bld, FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET, - tmp, + dest, fs_reg(), /* src */ brw_imm_ud(off_x | (off_y << 4)), interpolation); @@ -3690,12 +3659,11 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld, const enum opcode opcode = FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET; emit_pixel_interpolater_send(bld, opcode, - tmp, + dest, src, brw_imm_ud(0u), interpolation); } - shuffle_from_pln_layout(bld, dest, tmp); break; } @@ -3715,18 +3683,12 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld, if (bary_intrin == nir_intrinsic_load_barycentric_at_offset || bary_intrin == nir_intrinsic_load_barycentric_at_sample) { - /* Use the result of the PI message. Because the load_barycentric - * intrinsics return a regular vec2 and we need it in PLN layout, we - * have to do a translation. Fortunately, copy-prop cleans this up - * reliably. - */ - dst_xy = bld.vgrf(BRW_REGISTER_TYPE_F, 2); - shuffle_to_pln_layout(bld, dst_xy, get_nir_src(instr->src[0])); + /* Use the result of the PI message. */ + dst_xy = retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_F); } else { /* Use the delta_xy values computed from the payload */ enum brw_barycentric_mode bary = brw_barycentric_mode(interp_mode, bary_intrin); - dst_xy = this->delta_xy[bary]; } @@ -3758,7 +3720,7 @@ void fs_visitor::nir_emit_cs_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr) { - assert(stage == MESA_SHADER_COMPUTE); + assert(stage == MESA_SHADER_COMPUTE || stage == MESA_SHADER_KERNEL); struct brw_cs_prog_data *cs_prog_data = brw_cs_prog_data(prog_data); fs_reg dest; @@ -3766,7 +3728,17 @@ fs_visitor::nir_emit_cs_intrinsic(const fs_builder &bld, dest = get_nir_dest(instr->dest); switch (instr->intrinsic) { - case nir_intrinsic_barrier: + case nir_intrinsic_control_barrier: + /* The whole workgroup fits in a single HW thread, so all the + * invocations are already executed lock-step. Instead of an actual + * barrier just emit a scheduling fence, that will generate no code. + */ + if (!nir->info.cs.local_size_variable && + workgroup_size() <= dispatch_width) { + bld.exec_all().group(1, 0).emit(FS_OPCODE_SCHEDULING_FENCE); + break; + } + emit_barrier(); cs_prog_data->uses_barrier = true; break; @@ -3795,14 +3767,12 @@ fs_visitor::nir_emit_cs_intrinsic(const fs_builder &bld, fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(surface); srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); - srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(1); /* num components */ - - /* Read the 3 GLuint components of gl_NumWorkGroups */ - for (unsigned i = 0; i < 3; i++) { - srcs[SURFACE_LOGICAL_SRC_ADDRESS] = brw_imm_ud(i << 2); + srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(3); /* num components */ + srcs[SURFACE_LOGICAL_SRC_ADDRESS] = brw_imm_ud(0); + fs_inst *inst = bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL, - offset(dest, bld, i), srcs, SURFACE_LOGICAL_NUM_SRCS); - } + dest, srcs, SURFACE_LOGICAL_NUM_SRCS); + inst->size_written = 3 * dispatch_width * 4; break; } @@ -3826,7 +3796,7 @@ fs_visitor::nir_emit_cs_intrinsic(const fs_builder &bld, case nir_intrinsic_load_shared: { assert(devinfo->gen >= 7); - assert(stage == MESA_SHADER_COMPUTE); + assert(stage == MESA_SHADER_COMPUTE || stage == MESA_SHADER_KERNEL); const unsigned bit_size = nir_dest_bit_size(instr->dest); fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; @@ -3838,15 +3808,17 @@ fs_visitor::nir_emit_cs_intrinsic(const fs_builder &bld, dest.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD); /* Read the vector */ - if (nir_intrinsic_align(instr) >= 4) { - assert(nir_dest_bit_size(instr->dest) == 32); + assert(nir_dest_bit_size(instr->dest) <= 32); + assert(nir_intrinsic_align(instr) > 0); + if (nir_dest_bit_size(instr->dest) == 32 && + nir_intrinsic_align(instr) >= 4) { + assert(nir_dest_num_components(instr->dest) <= 4); srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components); fs_inst *inst = bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL, dest, srcs, SURFACE_LOGICAL_NUM_SRCS); inst->size_written = instr->num_components * dispatch_width * 4; } else { - assert(nir_dest_bit_size(instr->dest) <= 32); assert(nir_dest_num_components(instr->dest) == 1); srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size); @@ -3860,7 +3832,7 @@ fs_visitor::nir_emit_cs_intrinsic(const fs_builder &bld, case nir_intrinsic_store_shared: { assert(devinfo->gen >= 7); - assert(stage == MESA_SHADER_COMPUTE); + assert(stage == MESA_SHADER_COMPUTE || stage == MESA_SHADER_KERNEL); const unsigned bit_size = nir_src_bit_size(instr->src[0]); fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; @@ -3871,17 +3843,18 @@ fs_visitor::nir_emit_cs_intrinsic(const fs_builder &bld, fs_reg data = get_nir_src(instr->src[0]); data.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD); + assert(nir_src_bit_size(instr->src[0]) <= 32); assert(nir_intrinsic_write_mask(instr) == (1u << instr->num_components) - 1); - if (nir_intrinsic_align(instr) >= 4) { - assert(nir_src_bit_size(instr->src[0]) == 32); + assert(nir_intrinsic_align(instr) > 0); + if (nir_src_bit_size(instr->src[0]) == 32 && + nir_intrinsic_align(instr) >= 4) { assert(nir_src_num_components(instr->src[0]) <= 4); srcs[SURFACE_LOGICAL_SRC_DATA] = data; srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components); bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL, fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS); } else { - assert(nir_src_bit_size(instr->src[0]) <= 32); assert(nir_src_num_components(instr->src[0]) == 1); srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size); @@ -3894,6 +3867,16 @@ fs_visitor::nir_emit_cs_intrinsic(const fs_builder &bld, break; } + case nir_intrinsic_load_local_group_size: { + assert(compiler->lower_variable_group_size); + assert(nir->info.cs.local_size_variable); + for (unsigned i = 0; i < 3; i++) { + bld.MOV(retype(offset(dest, bld, i), BRW_REGISTER_TYPE_UD), + group_size[i]); + } + break; + } + default: nir_emit_intrinsic(bld, instr); break; @@ -3976,17 +3959,20 @@ fs_visitor::get_nir_image_intrinsic_image(const brw::fs_builder &bld, nir_intrinsic_instr *instr) { fs_reg image = retype(get_nir_src_imm(instr->src[0]), BRW_REGISTER_TYPE_UD); + fs_reg surf_index = image; if (stage_prog_data->binding_table.image_start > 0) { if (image.file == BRW_IMMEDIATE_VALUE) { - image.d += stage_prog_data->binding_table.image_start; + surf_index = + brw_imm_ud(image.d + stage_prog_data->binding_table.image_start); } else { - bld.ADD(image, image, + surf_index = vgrf(glsl_type::uint_type); + bld.ADD(surf_index, image, brw_imm_d(stage_prog_data->binding_table.image_start)); } } - return bld.emit_uniformize(image); + return bld.emit_uniformize(surf_index); } fs_reg @@ -4010,27 +3996,6 @@ fs_visitor::get_nir_ssbo_intrinsic_index(const brw::fs_builder &bld, return bld.emit_uniformize(surf_index); } -static unsigned -image_intrinsic_coord_components(nir_intrinsic_instr *instr) -{ - switch (nir_intrinsic_image_dim(instr)) { - case GLSL_SAMPLER_DIM_1D: - return 1 + nir_intrinsic_image_array(instr); - case GLSL_SAMPLER_DIM_2D: - case GLSL_SAMPLER_DIM_RECT: - return 2 + nir_intrinsic_image_array(instr); - case GLSL_SAMPLER_DIM_3D: - case GLSL_SAMPLER_DIM_CUBE: - return 3; - case GLSL_SAMPLER_DIM_BUF: - return 1; - case GLSL_SAMPLER_DIM_MS: - return 2 + nir_intrinsic_image_array(instr); - default: - unreachable("Invalid image dimension"); - } -} - /** * The offsets we get from NIR act as if each SIMD channel has it's own blob * of contiguous space. However, if we actually place each SIMD channel in @@ -4118,10 +4083,6 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr case nir_intrinsic_bindless_image_atomic_xor: case nir_intrinsic_bindless_image_atomic_exchange: case nir_intrinsic_bindless_image_atomic_comp_swap: { - if (stage == MESA_SHADER_FRAGMENT && - instr->intrinsic != nir_intrinsic_image_load) - brw_wm_prog_data(prog_data)->has_side_effects = true; - /* Get some metadata from the image intrinsic. */ const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic]; @@ -4153,7 +4114,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]); srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = - brw_imm_ud(image_intrinsic_coord_components(instr)); + brw_imm_ud(nir_image_intrinsic_coord_components(instr)); /* Emit an image load, store or atomic op. */ if (instr->intrinsic == nir_intrinsic_image_load || @@ -4207,6 +4168,8 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr BRW_REGISTER_TYPE_UD); image = bld.emit_uniformize(image); + assert(nir_src_as_uint(instr->src[1]) == 0); + fs_reg srcs[TEX_LOGICAL_NUM_SRCS]; if (instr->intrinsic == nir_intrinsic_image_size) srcs[TEX_LOGICAL_SRC_SURFACE] = image; @@ -4255,9 +4218,6 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr } case nir_intrinsic_image_store_raw_intel: { - if (stage == MESA_SHADER_FRAGMENT) - brw_wm_prog_data(prog_data)->has_side_effects = true; - fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; srcs[SURFACE_LOGICAL_SRC_SURFACE] = get_nir_image_intrinsic_image(bld, instr); @@ -4271,67 +4231,155 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr break; } - case nir_intrinsic_scoped_memory_barrier: + case nir_intrinsic_scoped_barrier: + assert(nir_intrinsic_execution_scope(instr) == NIR_SCOPE_NONE); + /* Fall through. */ case nir_intrinsic_group_memory_barrier: case nir_intrinsic_memory_barrier_shared: - case nir_intrinsic_memory_barrier_atomic_counter: case nir_intrinsic_memory_barrier_buffer: case nir_intrinsic_memory_barrier_image: - case nir_intrinsic_memory_barrier: { + case nir_intrinsic_memory_barrier: + case nir_intrinsic_begin_invocation_interlock: + case nir_intrinsic_end_invocation_interlock: { bool l3_fence, slm_fence; - if (instr->intrinsic == nir_intrinsic_scoped_memory_barrier) { + const enum opcode opcode = + instr->intrinsic == nir_intrinsic_begin_invocation_interlock ? + SHADER_OPCODE_INTERLOCK : SHADER_OPCODE_MEMORY_FENCE; + + switch (instr->intrinsic) { + case nir_intrinsic_scoped_barrier: { nir_variable_mode modes = nir_intrinsic_memory_modes(instr); l3_fence = modes & (nir_var_shader_out | nir_var_mem_ssbo | nir_var_mem_global); - /* Prior to gen11, we only have one kind of fence. */ - slm_fence = devinfo->gen >= 11 && (modes & nir_var_mem_shared); - l3_fence |= devinfo->gen < 11 && (modes & nir_var_mem_shared); - } else { - if (devinfo->gen >= 11) { - l3_fence = instr->intrinsic != nir_intrinsic_memory_barrier_shared; - slm_fence = instr->intrinsic == nir_intrinsic_group_memory_barrier || - instr->intrinsic == nir_intrinsic_memory_barrier || - instr->intrinsic == nir_intrinsic_memory_barrier_shared; - } else { - /* Prior to gen11, we only have one kind of fence. */ - l3_fence = true; - slm_fence = false; - } + slm_fence = modes & nir_var_mem_shared; + break; } + case nir_intrinsic_begin_invocation_interlock: + case nir_intrinsic_end_invocation_interlock: + /* For beginInvocationInterlockARB(), we will generate a memory fence + * but with a different opcode so that generator can pick SENDC + * instead of SEND. + * + * For endInvocationInterlockARB(), we need to insert a memory fence which + * stalls in the shader until the memory transactions prior to that + * fence are complete. This ensures that the shader does not end before + * any writes from its critical section have landed. Otherwise, you can + * end up with a case where the next invocation on that pixel properly + * stalls for previous FS invocation on its pixel to complete but + * doesn't actually wait for the dataport memory transactions from that + * thread to land before submitting its own. + * + * Handling them here will allow the logic for IVB render cache (see + * below) to be reused. + */ + l3_fence = true; + slm_fence = false; + break; + + default: + l3_fence = instr->intrinsic != nir_intrinsic_memory_barrier_shared; + slm_fence = instr->intrinsic == nir_intrinsic_group_memory_barrier || + instr->intrinsic == nir_intrinsic_memory_barrier || + instr->intrinsic == nir_intrinsic_memory_barrier_shared; + break; + } + + if (stage != MESA_SHADER_COMPUTE && stage != MESA_SHADER_KERNEL) + slm_fence = false; + + /* If the workgroup fits in a single HW thread, the messages for SLM are + * processed in-order and the shader itself is already synchronized so + * the memory fence is not necessary. + * + * TODO: Check if applies for many HW threads sharing same Data Port. + */ + if (!nir->info.cs.local_size_variable && + slm_fence && workgroup_size() <= dispatch_width) + slm_fence = false; + + /* Prior to Gen11, there's only L3 fence, so emit that instead. */ + if (slm_fence && devinfo->gen < 11) { + slm_fence = false; + l3_fence = true; + } + + /* IVB does typed surface access through the render cache, so we need + * to flush it too. + */ + const bool needs_render_fence = + devinfo->gen == 7 && !devinfo->is_haswell; + /* Be conservative in Gen11+ and always stall in a fence. Since there * are two different fences, and shader might want to synchronize * between them. * - * TODO: Improve NIR so that scope and visibility information for the - * barriers is available here to make a better decision. - * - * TODO: When emitting more than one fence, it might help emit all - * the fences first and then generate the stall moves. + * TODO: Use scope and visibility information for the barriers from NIR + * to make a better decision on whether we need to stall. */ - const bool stall = devinfo->gen >= 11; + const bool stall = devinfo->gen >= 11 || needs_render_fence || + instr->intrinsic == nir_intrinsic_end_invocation_interlock; + + const bool commit_enable = stall || + devinfo->gen >= 10; /* HSD ES # 1404612949 */ + + unsigned fence_regs_count = 0; + fs_reg fence_regs[2] = {}; const fs_builder ubld = bld.group(8, 0); - const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2); if (l3_fence) { - ubld.emit(SHADER_OPCODE_MEMORY_FENCE, tmp, - brw_vec8_grf(0, 0), brw_imm_ud(stall), - /* bti */ brw_imm_ud(0)) - ->size_written = 2 * REG_SIZE; + fs_inst *fence = + ubld.emit(opcode, + ubld.vgrf(BRW_REGISTER_TYPE_UD), + brw_vec8_grf(0, 0), + brw_imm_ud(commit_enable), + brw_imm_ud(/* bti */ 0)); + fence->sfid = GEN7_SFID_DATAPORT_DATA_CACHE; + + fence_regs[fence_regs_count++] = fence->dst; + + if (needs_render_fence) { + fs_inst *render_fence = + ubld.emit(opcode, + ubld.vgrf(BRW_REGISTER_TYPE_UD), + brw_vec8_grf(0, 0), + brw_imm_ud(commit_enable), + brw_imm_ud(/* bti */ 0)); + render_fence->sfid = GEN6_SFID_DATAPORT_RENDER_CACHE; + + fence_regs[fence_regs_count++] = render_fence->dst; + } } if (slm_fence) { - ubld.emit(SHADER_OPCODE_MEMORY_FENCE, tmp, - brw_vec8_grf(0, 0), brw_imm_ud(stall), - brw_imm_ud(GEN7_BTI_SLM)) - ->size_written = 2 * REG_SIZE; + assert(opcode == SHADER_OPCODE_MEMORY_FENCE); + fs_inst *fence = + ubld.emit(opcode, + ubld.vgrf(BRW_REGISTER_TYPE_UD), + brw_vec8_grf(0, 0), + brw_imm_ud(commit_enable), + brw_imm_ud(GEN7_BTI_SLM)); + fence->sfid = GEN7_SFID_DATAPORT_DATA_CACHE; + + fence_regs[fence_regs_count++] = fence->dst; + } + + assert(fence_regs_count <= 2); + + if (stall || fence_regs_count == 0) { + ubld.exec_all().group(1, 0).emit( + FS_OPCODE_SCHEDULING_FENCE, ubld.null_reg_ud(), + fence_regs, fence_regs_count); } break; } + case nir_intrinsic_memory_barrier_tcs_patch: + break; + case nir_intrinsic_shader_clock: { /* We cannot do anything if there is an event, so ignore it for now */ const fs_reg shader_clock = get_timestamp(bld); @@ -4346,6 +4394,13 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), brw_imm_d(1)); break; + case nir_intrinsic_load_reloc_const_intel: { + uint32_t id = nir_intrinsic_param_idx(instr); + bld.emit(SHADER_OPCODE_MOV_RELOC_IMM, + dest, brw_imm_ud(id)); + break; + } + case nir_intrinsic_load_uniform: { /* Offsets are in bytes but they should always aligned to * the type size @@ -4504,11 +4559,15 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr break; } - case nir_intrinsic_load_global: { + case nir_intrinsic_load_global: + case nir_intrinsic_load_global_constant: { assert(devinfo->gen >= 8); - if (nir_intrinsic_align(instr) >= 4) { - assert(nir_dest_bit_size(instr->dest) == 32); + assert(nir_dest_bit_size(instr->dest) <= 32); + assert(nir_intrinsic_align(instr) > 0); + if (nir_dest_bit_size(instr->dest) == 32 && + nir_intrinsic_align(instr) >= 4) { + assert(nir_dest_num_components(instr->dest) <= 4); fs_inst *inst = bld.emit(SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL, dest, get_nir_src(instr->src[0]), /* Address */ @@ -4518,7 +4577,6 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr inst->dst.component_size(inst->exec_size); } else { const unsigned bit_size = nir_dest_bit_size(instr->dest); - assert(bit_size <= 32); assert(nir_dest_num_components(instr->dest) == 1); fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD); bld.emit(SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL, @@ -4534,20 +4592,21 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr case nir_intrinsic_store_global: assert(devinfo->gen >= 8); - if (stage == MESA_SHADER_FRAGMENT) - brw_wm_prog_data(prog_data)->has_side_effects = true; - - if (nir_intrinsic_align(instr) >= 4) { - assert(nir_src_bit_size(instr->src[0]) == 32); + assert(nir_src_bit_size(instr->src[0]) <= 32); + assert(nir_intrinsic_write_mask(instr) == + (1u << instr->num_components) - 1); + assert(nir_intrinsic_align(instr) > 0); + if (nir_src_bit_size(instr->src[0]) == 32 && + nir_intrinsic_align(instr) >= 4) { + assert(nir_src_num_components(instr->src[0]) <= 4); bld.emit(SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL, fs_reg(), get_nir_src(instr->src[1]), /* Address */ get_nir_src(instr->src[0]), /* Data */ brw_imm_ud(instr->num_components)); } else { - const unsigned bit_size = nir_src_bit_size(instr->src[0]); - assert(bit_size <= 32); assert(nir_src_num_components(instr->src[0]) == 1); + const unsigned bit_size = nir_src_bit_size(instr->src[0]); brw_reg_type data_type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD); fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD); @@ -4592,15 +4651,17 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr dest.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD); /* Read the vector */ - if (nir_intrinsic_align(instr) >= 4) { - assert(nir_dest_bit_size(instr->dest) == 32); + assert(nir_dest_bit_size(instr->dest) <= 32); + assert(nir_intrinsic_align(instr) > 0); + if (nir_dest_bit_size(instr->dest) == 32 && + nir_intrinsic_align(instr) >= 4) { + assert(nir_dest_num_components(instr->dest) <= 4); srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components); fs_inst *inst = bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL, dest, srcs, SURFACE_LOGICAL_NUM_SRCS); inst->size_written = instr->num_components * dispatch_width * 4; } else { - assert(nir_dest_bit_size(instr->dest) <= 32); assert(nir_dest_num_components(instr->dest) == 1); srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size); @@ -4615,9 +4676,6 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr case nir_intrinsic_store_ssbo: { assert(devinfo->gen >= 7); - if (stage == MESA_SHADER_FRAGMENT) - brw_wm_prog_data(prog_data)->has_side_effects = true; - const unsigned bit_size = nir_src_bit_size(instr->src[0]); fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; srcs[SURFACE_LOGICAL_SRC_SURFACE] = @@ -4628,17 +4686,18 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr fs_reg data = get_nir_src(instr->src[0]); data.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD); + assert(nir_src_bit_size(instr->src[0]) <= 32); assert(nir_intrinsic_write_mask(instr) == (1u << instr->num_components) - 1); - if (nir_intrinsic_align(instr) >= 4) { - assert(nir_src_bit_size(instr->src[0]) == 32); + assert(nir_intrinsic_align(instr) > 0); + if (nir_src_bit_size(instr->src[0]) == 32 && + nir_intrinsic_align(instr) >= 4) { assert(nir_src_num_components(instr->src[0]) <= 4); srcs[SURFACE_LOGICAL_SRC_DATA] = data; srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components); bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL, fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS); } else { - assert(nir_src_bit_size(instr->src[0]) <= 32); assert(nir_src_num_components(instr->src[0]) == 1); srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size); @@ -4767,9 +4826,11 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr dest.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD); /* Read the vector */ - if (nir_intrinsic_align(instr) >= 4) { - assert(nir_dest_bit_size(instr->dest) == 32); - + assert(nir_dest_num_components(instr->dest) == 1); + assert(nir_dest_bit_size(instr->dest) <= 32); + assert(nir_intrinsic_align(instr) > 0); + if (nir_dest_bit_size(instr->dest) >= 4 && + nir_intrinsic_align(instr) >= 4) { /* The offset for a DWORD scattered message is in dwords. */ srcs[SURFACE_LOGICAL_SRC_ADDRESS] = swizzle_nir_scratch_addr(bld, nir_addr, true); @@ -4777,8 +4838,6 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr bld.emit(SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL, dest, srcs, SURFACE_LOGICAL_NUM_SRCS); } else { - assert(nir_dest_bit_size(instr->dest) <= 32); - srcs[SURFACE_LOGICAL_SRC_ADDRESS] = swizzle_nir_scratch_addr(bld, nir_addr, false); @@ -4811,10 +4870,12 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr fs_reg data = get_nir_src(instr->src[0]); data.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD); - assert(nir_intrinsic_write_mask(instr) == - (1u << instr->num_components) - 1); - if (nir_intrinsic_align(instr) >= 4) { - assert(nir_src_bit_size(instr->src[0]) == 32); + assert(nir_src_num_components(instr->src[0]) == 1); + assert(nir_src_bit_size(instr->src[0]) <= 32); + assert(nir_intrinsic_write_mask(instr) == 1); + assert(nir_intrinsic_align(instr) > 0); + if (nir_src_bit_size(instr->src[0]) == 32 && + nir_intrinsic_align(instr) >= 4) { srcs[SURFACE_LOGICAL_SRC_DATA] = data; /* The offset for a DWORD scattered message is in dwords. */ @@ -4824,8 +4885,6 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr bld.emit(SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL, fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS); } else { - assert(nir_src_bit_size(instr->src[0]) <= 32); - srcs[SURFACE_LOGICAL_SRC_DATA] = bld.vgrf(BRW_REGISTER_TYPE_UD); bld.MOV(srcs[SURFACE_LOGICAL_SRC_DATA], data); @@ -5229,33 +5288,6 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr break; } - case nir_intrinsic_begin_invocation_interlock: { - const fs_builder ubld = bld.group(8, 0); - const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2); - - ubld.emit(SHADER_OPCODE_INTERLOCK, tmp, brw_vec8_grf(0, 0)) - ->size_written = 2 * REG_SIZE; - break; - } - - case nir_intrinsic_end_invocation_interlock: { - /* For endInvocationInterlock(), we need to insert a memory fence which - * stalls in the shader until the memory transactions prior to that - * fence are complete. This ensures that the shader does not end before - * any writes from its critical section have landed. Otherwise, you can - * end up with a case where the next invocation on that pixel properly - * stalls for previous FS invocation on its pixel to complete but - * doesn't actually wait for the dataport memory transactions from that - * thread to land before submitting its own. - */ - const fs_builder ubld = bld.group(8, 0); - const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2); - ubld.emit(SHADER_OPCODE_MEMORY_FENCE, tmp, - brw_vec8_grf(0, 0), brw_imm_ud(1), brw_imm_ud(0)) - ->size_written = 2 * REG_SIZE; - break; - } - default: unreachable("unknown intrinsic"); } @@ -5265,9 +5297,6 @@ void fs_visitor::nir_emit_ssbo_atomic(const fs_builder &bld, int op, nir_intrinsic_instr *instr) { - if (stage == MESA_SHADER_FRAGMENT) - brw_wm_prog_data(prog_data)->has_side_effects = true; - /* The BTI untyped atomic messages only support 32-bit atomics. If you * just look at the big table of messages in the Vol 7 of the SKL PRM, they * appear to exist. However, if you look at Vol 2a, there are no message @@ -5307,9 +5336,6 @@ void fs_visitor::nir_emit_ssbo_atomic_float(const fs_builder &bld, int op, nir_intrinsic_instr *instr) { - if (stage == MESA_SHADER_FRAGMENT) - brw_wm_prog_data(prog_data)->has_side_effects = true; - fs_reg dest; if (nir_intrinsic_infos[instr->intrinsic].has_dest) dest = get_nir_dest(instr->dest); @@ -5419,9 +5445,6 @@ void fs_visitor::nir_emit_global_atomic(const fs_builder &bld, int op, nir_intrinsic_instr *instr) { - if (stage == MESA_SHADER_FRAGMENT) - brw_wm_prog_data(prog_data)->has_side_effects = true; - fs_reg dest; if (nir_intrinsic_infos[instr->intrinsic].has_dest) dest = get_nir_dest(instr->dest); @@ -5453,9 +5476,6 @@ void fs_visitor::nir_emit_global_atomic_float(const fs_builder &bld, int op, nir_intrinsic_instr *instr) { - if (stage == MESA_SHADER_FRAGMENT) - brw_wm_prog_data(prog_data)->has_side_effects = true; - assert(nir_intrinsic_infos[instr->intrinsic].has_dest); fs_reg dest = get_nir_dest(instr->dest); @@ -5471,7 +5491,7 @@ fs_visitor::nir_emit_global_atomic_float(const fs_builder &bld, data = tmp; } - bld.emit(SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL, + bld.emit(SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT_LOGICAL, dest, addr, data, brw_imm_ud(op)); }