X-Git-Url: https://git.libre-soc.org/?p=mesa.git;a=blobdiff_plain;f=src%2Fintel%2Fcompiler%2Fbrw_fs_nir.cpp;h=f10525741f28237ab1f7c28d1f76ec3bec4aa7c9;hp=8980163401a0cc53dfe4fc482b569901796c3c41;hb=003b04e266ae0faad563c1228561b53f33a68474;hpb=951cf94521a710fa2fa70329ff77934ada45bb70 diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp index 8980163401a..f10525741f2 100644 --- a/src/intel/compiler/brw_fs_nir.cpp +++ b/src/intel/compiler/brw_fs_nir.cpp @@ -34,12 +34,15 @@ using namespace brw; void fs_visitor::emit_nir_code() { + emit_shader_float_controls_execution_mode(); + /* emit the arrays used for inputs and outputs - load/store intrinsics will * be converted to reads/writes of these arrays */ nir_setup_outputs(); nir_setup_uniforms(); nir_emit_system_values(); + last_scratch = ALIGN(nir->scratch_size, 4) * dispatch_width; nir_emit_impl(nir_shader_get_entrypoint((nir_shader *)nir)); } @@ -56,7 +59,7 @@ fs_visitor::nir_setup_outputs() * allocating them. With ARB_enhanced_layouts, multiple output variables * may occupy the same slot, but have different type sizes. */ - nir_foreach_variable(var, &nir->outputs) { + nir_foreach_shader_out_variable(var, nir) { const int loc = var->data.driver_location; const unsigned var_vec4s = var->data.compact ? DIV_ROUND_UP(glsl_get_length(var->type), 4) @@ -97,12 +100,25 @@ fs_visitor::nir_setup_uniforms() uniforms = nir->num_uniforms / 4; - if (stage == MESA_SHADER_COMPUTE) { - /* Add a uniform for the thread local id. It must be the last uniform - * on the list. - */ + if (stage == MESA_SHADER_COMPUTE || stage == MESA_SHADER_KERNEL) { + /* Add uniforms for builtins after regular NIR uniforms. */ assert(uniforms == prog_data->nr_params); - uint32_t *param = brw_stage_prog_data_add_params(prog_data, 1); + + uint32_t *param; + if (nir->info.cs.local_size_variable && + compiler->lower_variable_group_size) { + param = brw_stage_prog_data_add_params(prog_data, 3); + for (unsigned i = 0; i < 3; i++) { + param[i] = (BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_X + i); + group_size[i] = fs_reg(UNIFORM, uniforms++, BRW_REGISTER_TYPE_UD); + } + } + + /* Subgroup ID must be the last uniform on the list. This will make + * easier later to split between cross thread and per thread + * uniforms. 
+ */ + param = brw_stage_prog_data_add_params(prog_data, 1); *param = BRW_PARAM_BUILTIN_SUBGROUP_ID; subgroup_id = fs_reg(UNIFORM, uniforms++, BRW_REGISTER_TYPE_UD); } @@ -168,7 +184,8 @@ emit_system_values_block(nir_block *block, fs_visitor *v) break; case nir_intrinsic_load_work_group_id: - assert(v->stage == MESA_SHADER_COMPUTE); + assert(v->stage == MESA_SHADER_COMPUTE || + v->stage == MESA_SHADER_KERNEL); reg = &v->nir_system_values[SYSTEM_VALUE_WORK_GROUP_ID]; if (reg->file == BAD_FILE) *reg = *v->emit_cs_work_group_id_setup(); @@ -391,9 +408,6 @@ fs_visitor::nir_emit_if(nir_if *if_stmt) */ nir_alu_instr *cond = nir_src_as_alu_instr(if_stmt->condition); if (cond != NULL && cond->op == nir_op_inot) { - assert(!cond->src[0].negate); - assert(!cond->src[0].abs); - invert = true; cond_reg = get_nir_src(cond->src[0].src); } else { @@ -476,6 +490,7 @@ fs_visitor::nir_emit_instr(nir_instr *instr) nir_emit_fs_intrinsic(abld, nir_instr_as_intrinsic(instr)); break; case MESA_SHADER_COMPUTE: + case MESA_SHADER_KERNEL: nir_emit_cs_intrinsic(abld, nir_instr_as_intrinsic(instr)); break; default: @@ -529,15 +544,6 @@ fs_visitor::optimize_extract_to_float(nir_alu_instr *instr, src0->op != nir_op_extract_i8 && src0->op != nir_op_extract_i16) return false; - /* If either opcode has source modifiers, bail. - * - * TODO: We can potentially handle source modifiers if both of the opcodes - * we're combining are signed integers. - */ - if (instr->src[0].abs || instr->src[0].negate || - src0->src[0].abs || src0->src[0].negate) - return false; - unsigned element = nir_src_as_uint(src0->src[1].src); /* Element type to extract.*/ @@ -551,8 +557,7 @@ fs_visitor::optimize_extract_to_float(nir_alu_instr *instr, nir_src_bit_size(src0->src[0].src))); op0 = offset(op0, bld, src0->src[0].swizzle[0]); - set_saturate(instr->dest.saturate, - bld.MOV(result, subscript(op0, type, element))); + bld.MOV(result, subscript(op0, type, element)); return true; } @@ -578,7 +583,24 @@ fs_visitor::optimize_frontfacing_ternary(nir_alu_instr *instr, fs_reg tmp = vgrf(glsl_type::int_type); - if (devinfo->gen >= 6) { + if (devinfo->gen >= 12) { + /* Bit 15 of g1.1 is 0 if the polygon is front facing. */ + fs_reg g1 = fs_reg(retype(brw_vec1_grf(1, 1), BRW_REGISTER_TYPE_W)); + + /* For (gl_FrontFacing ? 1.0 : -1.0), emit: + * + * or(8) tmp.1<2>W g0.0<0,1,0>W 0x00003f80W + * and(8) dst<1>D tmp<8,8,1>D 0xbf800000D + * + * and negate the result for (gl_FrontFacing ? -1.0 : 1.0). + */ + bld.OR(subscript(tmp, BRW_REGISTER_TYPE_W, 1), + g1, brw_imm_uw(0x3f80)); + + if (value1 == -1.0f) + bld.MOV(tmp, negate(tmp)); + + } else if (devinfo->gen >= 6) { /* Bit 15 of g0.0 is 0 if the polygon is front facing. 
*/ fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W)); @@ -684,6 +706,16 @@ brw_rnd_mode_from_nir_op (const nir_op op) { } } +static brw_rnd_mode +brw_rnd_mode_from_execution_mode(unsigned execution_mode) +{ + if (nir_has_any_rounding_mode_rtne(execution_mode)) + return BRW_RND_MODE_RTNE; + if (nir_has_any_rounding_mode_rtz(execution_mode)) + return BRW_RND_MODE_RTZ; + return BRW_RND_MODE_UNSPECIFIED; +} + fs_reg fs_visitor::prepare_alu_destination_and_sources(const fs_builder &bld, nir_alu_instr *instr, @@ -697,13 +729,17 @@ fs_visitor::prepare_alu_destination_and_sources(const fs_builder &bld, (nir_alu_type)(nir_op_infos[instr->op].output_type | nir_dest_bit_size(instr->dest.dest))); + assert(!instr->dest.saturate); + for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) { + /* We don't lower to source modifiers so they should not exist. */ + assert(!instr->src[i].abs); + assert(!instr->src[i].negate); + op[i] = get_nir_src(instr->src[i].src); op[i].type = brw_type_for_nir_type(devinfo, (nir_alu_type)(nir_op_infos[instr->op].input_types[i] | nir_src_bit_size(instr->src[i].src))); - op[i].abs = instr->src[i].abs; - op[i].negate = instr->src[i].negate; } /* Move and vecN instrutions may still be vectored. Return the raw, @@ -751,8 +787,7 @@ fs_visitor::resolve_inot_sources(const fs_builder &bld, nir_alu_instr *instr, for (unsigned i = 0; i < 2; i++) { nir_alu_instr *inot_instr = nir_src_as_alu_instr(instr->src[i].src); - if (inot_instr != NULL && inot_instr->op == nir_op_inot && - !inot_instr->src[0].abs && !inot_instr->src[0].negate) { + if (inot_instr != NULL && inot_instr->op == nir_op_inot) { /* The source of the inot is now the source of instr. */ prepare_alu_destination_and_sources(bld, inot_instr, &op[i], false); @@ -821,8 +856,6 @@ fs_visitor::emit_fsign(const fs_builder &bld, const nir_alu_instr *instr, const nir_alu_instr *const fsign_instr = nir_src_as_alu_instr(instr->src[fsign_src].src); - assert(!fsign_instr->dest.saturate); - /* op[fsign_src] has the nominal result of the fsign, and op[1 - * fsign_src] has the other multiply source. This must be rearranged so * that op[0] is the source of the fsign op[1] is the other multiply @@ -838,8 +871,6 @@ fs_visitor::emit_fsign(const fs_builder &bld, const nir_alu_instr *instr, nir_src_bit_size(fsign_instr->src[0].src)); op[0].type = brw_type_for_nir_type(devinfo, t); - op[0].abs = fsign_instr->src[0].abs; - op[0].negate = fsign_instr->src[0].negate; unsigned channel = 0; if (nir_op_infos[instr->op].output_size == 0) { @@ -851,27 +882,9 @@ fs_visitor::emit_fsign(const fs_builder &bld, const nir_alu_instr *instr, } op[0] = offset(op[0], bld, fsign_instr->src[0].swizzle[channel]); - } else { - assert(!instr->dest.saturate); } - if (op[0].abs) { - /* Straightforward since the source can be assumed to be either strictly - * >= 0 or strictly <= 0 depending on the setting of the negate flag. - */ - set_condmod(BRW_CONDITIONAL_NZ, bld.MOV(result, op[0])); - - if (instr->op == nir_op_fsign) { - inst = (op[0].negate) - ? bld.MOV(result, brw_imm_f(-1.0f)) - : bld.MOV(result, brw_imm_f(1.0f)); - } else { - op[1].negate = (op[0].negate != op[1].negate); - inst = bld.MOV(result, op[1]); - } - - set_predicate(BRW_PREDICATE_NORMAL, inst); - } else if (type_sz(op[0].type) == 2) { + if (type_sz(op[0].type) == 2) { /* AND(val, 0x8000) gives the sign bit. * * Predicated OR ORs 1.0 (0x3c00) with the sign bit if val is not zero. 
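As a standalone illustration of the half-float path described in the comment above (AND with 0x8000 keeps the sign bit, then a predicated OR merges in half-precision 1.0, i.e. 0x3c00, when the value is non-zero), the same computation on raw IEEE half-float bit patterns looks roughly like the sketch below. The helper name is hypothetical and this is only a scalar model, not the EU instruction sequence the visitor emits.

    #include <stdbool.h>
    #include <stdint.h>

    /* Scalar model of fsign() on a 16-bit float, working on its bit pattern:
     * keep the sign bit, and merge in half-precision 1.0 (0x3c00) only when
     * the magnitude bits are non-zero, so +/-0.0 stays +/-0.0.
     */
    static inline uint16_t fsign_half_bits(uint16_t val)
    {
       uint16_t sign = val & 0x8000;          /* AND(val, 0x8000)        */
       bool nonzero = (val & 0x7fff) != 0;    /* value compares != 0.0   */
       return nonzero ? (uint16_t)(sign | 0x3c00) : sign;
    }

For example, 0xc100 (-2.5) yields 0xbc00 (-1.0), while 0x0000 and 0x8000 pass through unchanged.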
@@ -977,8 +990,7 @@ can_fuse_fmul_fsign(nir_alu_instr *instr, unsigned fsign_src) * have already been taken (in nir_opt_algebraic) to ensure that. */ return fsign_instr != NULL && fsign_instr->op == nir_op_fsign && - is_used_once(fsign_instr) && - !instr->src[fsign_src].abs && !instr->src[fsign_src].negate; + is_used_once(fsign_instr); } void @@ -987,6 +999,8 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr, { struct brw_wm_prog_key *fs_key = (struct brw_wm_prog_key *) this->key; fs_inst *inst; + unsigned execution_mode = + bld.shader->nir->info.float_controls_execution_mode; fs_reg op[4]; fs_reg result = prepare_alu_destination_and_sources(bld, instr, op, need_dest); @@ -1018,7 +1032,6 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr, inst = bld.MOV(offset(temp, bld, i), offset(op[i], bld, instr->src[i].swizzle[0])); } - inst->saturate = instr->dest.saturate; } /* In this case the source and destination registers were the same, @@ -1041,15 +1054,21 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr, if (optimize_extract_to_float(instr, result)) return; inst = bld.MOV(result, op[0]); - inst->saturate = instr->dest.saturate; break; case nir_op_f2f16_rtne: case nir_op_f2f16_rtz: - bld.emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(), - brw_imm_d(brw_rnd_mode_from_nir_op(instr->op))); - /* fallthrough */ - case nir_op_f2f16: + case nir_op_f2f16: { + brw_rnd_mode rnd = BRW_RND_MODE_UNSPECIFIED; + + if (nir_op_f2f16 == instr->op) + rnd = brw_rnd_mode_from_execution_mode(execution_mode); + else + rnd = brw_rnd_mode_from_nir_op(instr->op); + + if (BRW_RND_MODE_UNSPECIFIED != rnd) + bld.emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(), brw_imm_d(rnd)); + /* In theory, it would be better to use BRW_OPCODE_F32TO16. Depending * on the HW gen, it is a special hw opcode or just a MOV, and * brw_F32TO16 (at brw_eu_emit) would do the work to chose. 
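Restated in isolation, the rounding-mode selection that the f2f16 cases above perform is: an explicit f2f16_rtne or f2f16_rtz conversion dictates the mode itself, a plain f2f16 falls back to whatever the shader's float_controls execution mode requests, and SHADER_OPCODE_RND_MODE is only emitted when that leaves something other than "unspecified". A minimal sketch of that decision with simplified stand-in types (the enum and function name below are illustrative only, not part of the compiler):

    enum rnd { RND_UNSPECIFIED, RND_RTNE, RND_RTZ };

    /* Pick the rounding mode for a float->half conversion.  Explicit *_rtne /
     * *_rtz opcodes win; otherwise defer to the execution-mode bits.  A result
     * of RND_UNSPECIFIED means no RND_MODE instruction needs to be emitted.
     */
    static enum rnd
    f2f16_rnd_mode(bool op_rtne, bool op_rtz, bool exec_rtne, bool exec_rtz)
    {
       if (op_rtne)
          return RND_RTNE;
       if (op_rtz)
          return RND_RTZ;
       if (exec_rtne)
          return RND_RTNE;
       if (exec_rtz)
          return RND_RTZ;
       return RND_UNSPECIFIED;
    }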
@@ -1061,8 +1080,8 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr, */ assert(type_sz(op[0].type) < 8); /* brw_nir_lower_conversions */ inst = bld.MOV(result, op[0]); - inst->saturate = instr->dest.saturate; break; + } case nir_op_b2i8: case nir_op_b2i16: @@ -1085,7 +1104,6 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr, case nir_op_f2u64: case nir_op_i2i32: case nir_op_u2u32: - case nir_op_f2f32: case nir_op_f2i32: case nir_op_f2u32: case nir_op_i2f16: @@ -1109,7 +1127,6 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr, assert(type_sz(result.type) < 8); /* brw_nir_lower_conversions */ inst = bld.MOV(result, op[0]); - inst->saturate = instr->dest.saturate; break; case nir_op_fsat: @@ -1121,8 +1138,6 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr, case nir_op_ineg: op[0].negate = true; inst = bld.MOV(result, op[0]); - if (instr->op == nir_op_fneg) - inst->saturate = instr->dest.saturate; break; case nir_op_fabs: @@ -1130,8 +1145,20 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr, op[0].negate = false; op[0].abs = true; inst = bld.MOV(result, op[0]); - if (instr->op == nir_op_fabs) - inst->saturate = instr->dest.saturate; + break; + + case nir_op_f2f32: + if (nir_has_any_rounding_mode_enabled(execution_mode)) { + brw_rnd_mode rnd = + brw_rnd_mode_from_execution_mode(execution_mode); + bld.emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(), + brw_imm_d(rnd)); + } + + if (op[0].type == BRW_REGISTER_TYPE_HF) + assert(type_sz(result.type) < 8); /* brw_nir_lower_conversions */ + + inst = bld.MOV(result, op[0]); break; case nir_op_fsign: @@ -1140,27 +1167,22 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr, case nir_op_frcp: inst = bld.emit(SHADER_OPCODE_RCP, result, op[0]); - inst->saturate = instr->dest.saturate; break; case nir_op_fexp2: inst = bld.emit(SHADER_OPCODE_EXP2, result, op[0]); - inst->saturate = instr->dest.saturate; break; case nir_op_flog2: inst = bld.emit(SHADER_OPCODE_LOG2, result, op[0]); - inst->saturate = instr->dest.saturate; break; case nir_op_fsin: inst = bld.emit(SHADER_OPCODE_SIN, result, op[0]); - inst->saturate = instr->dest.saturate; break; case nir_op_fcos: inst = bld.emit(SHADER_OPCODE_COS, result, op[0]); - inst->saturate = instr->dest.saturate; break; case nir_op_fddx: @@ -1169,15 +1191,12 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr, } else { inst = bld.emit(FS_OPCODE_DDX_COARSE, result, op[0]); } - inst->saturate = instr->dest.saturate; break; case nir_op_fddx_fine: inst = bld.emit(FS_OPCODE_DDX_FINE, result, op[0]); - inst->saturate = instr->dest.saturate; break; case nir_op_fddx_coarse: inst = bld.emit(FS_OPCODE_DDX_COARSE, result, op[0]); - inst->saturate = instr->dest.saturate; break; case nir_op_fddy: if (fs_key->high_quality_derivatives) { @@ -1185,28 +1204,65 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr, } else { inst = bld.emit(FS_OPCODE_DDY_COARSE, result, op[0]); } - inst->saturate = instr->dest.saturate; break; case nir_op_fddy_fine: inst = bld.emit(FS_OPCODE_DDY_FINE, result, op[0]); - inst->saturate = instr->dest.saturate; break; case nir_op_fddy_coarse: inst = bld.emit(FS_OPCODE_DDY_COARSE, result, op[0]); - inst->saturate = instr->dest.saturate; break; - case nir_op_iadd: case nir_op_fadd: + if (nir_has_any_rounding_mode_enabled(execution_mode)) { + brw_rnd_mode rnd = + brw_rnd_mode_from_execution_mode(execution_mode); + bld.emit(SHADER_OPCODE_RND_MODE, 
bld.null_reg_ud(), + brw_imm_d(rnd)); + } + /* fallthrough */ + case nir_op_iadd: inst = bld.ADD(result, op[0], op[1]); - inst->saturate = instr->dest.saturate; break; + case nir_op_iadd_sat: case nir_op_uadd_sat: inst = bld.ADD(result, op[0], op[1]); inst->saturate = true; break; + case nir_op_isub_sat: + bld.emit(SHADER_OPCODE_ISUB_SAT, result, op[0], op[1]); + break; + + case nir_op_usub_sat: + bld.emit(SHADER_OPCODE_USUB_SAT, result, op[0], op[1]); + break; + + case nir_op_irhadd: + case nir_op_urhadd: + assert(nir_dest_bit_size(instr->dest.dest) < 64); + inst = bld.AVG(result, op[0], op[1]); + break; + + case nir_op_ihadd: + case nir_op_uhadd: { + assert(nir_dest_bit_size(instr->dest.dest) < 64); + fs_reg tmp = bld.vgrf(result.type); + + if (devinfo->gen >= 8) { + op[0] = resolve_source_modifiers(op[0]); + op[1] = resolve_source_modifiers(op[1]); + } + + /* AVG(x, y) - ((x ^ y) & 1) */ + bld.XOR(tmp, op[0], op[1]); + bld.AND(tmp, tmp, retype(brw_imm_ud(1), result.type)); + bld.AVG(result, op[0], op[1]); + inst = bld.ADD(result, result, tmp); + inst->src[1].negate = true; + break; + } + case nir_op_fmul: for (unsigned i = 0; i < 2; i++) { if (can_fuse_fmul_fsign(instr, i)) { @@ -1215,8 +1271,18 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr, } } + /* We emit the rounding mode after the previous fsign optimization since + * it won't result in a MUL, but will try to negate the value by other + * means. + */ + if (nir_has_any_rounding_mode_enabled(execution_mode)) { + brw_rnd_mode rnd = + brw_rnd_mode_from_execution_mode(execution_mode); + bld.emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(), + brw_imm_d(rnd)); + } + inst = bld.MUL(result, op[0], op[1]); - inst->saturate = instr->dest.saturate; break; case nir_op_imul_2x32_64: @@ -1224,6 +1290,34 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr, bld.MUL(result, op[0], op[1]); break; + case nir_op_imul_32x16: + case nir_op_umul_32x16: { + const bool ud = instr->op == nir_op_umul_32x16; + + assert(nir_dest_bit_size(instr->dest.dest) == 32); + + /* Before Gen7, the order of the 32-bit source and the 16-bit source was + * swapped. The extension isn't enabled on those platforms, so don't + * pretend to support the differences. + */ + assert(devinfo->gen >= 7); + + if (op[1].file == IMM) + op[1] = ud ? brw_imm_uw(op[1].ud) : brw_imm_w(op[1].d); + else { + const enum brw_reg_type word_type = + ud ? BRW_REGISTER_TYPE_UW : BRW_REGISTER_TYPE_W; + + op[1] = subscript(op[1], word_type, 0); + } + + const enum brw_reg_type dword_type = + ud ? 
BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_D; + + bld.MUL(result, retype(op[0], dword_type), op[1]); + break; + } + case nir_op_imul: assert(nir_dest_bit_size(instr->dest.dest) < 64); bld.MUL(result, op[0], op[1]); @@ -1329,7 +1423,7 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr, temp_op[0] = bld.fix_byte_src(op[0]); temp_op[1] = bld.fix_byte_src(op[1]); - const uint32_t bit_size = nir_src_bit_size(instr->src[0].src); + const uint32_t bit_size = type_sz(temp_op[0].type) * 8; if (bit_size != 32) dest = bld.vgrf(temp_op[0].type, 1); @@ -1357,11 +1451,7 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr, if (inot_src_instr != NULL && (inot_src_instr->op == nir_op_ior || inot_src_instr->op == nir_op_ixor || - inot_src_instr->op == nir_op_iand) && - !inot_src_instr->src[0].abs && - !inot_src_instr->src[0].negate && - !inot_src_instr->src[1].abs && - !inot_src_instr->src[1].negate) { + inot_src_instr->op == nir_op_iand)) { /* The sources of the source logical instruction are now the * sources of the instruction that will be generated. */ @@ -1450,35 +1540,15 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr, case nir_op_b32any_inequal4: unreachable("Lowered by nir_lower_alu_reductions"); - case nir_op_fnoise1_1: - case nir_op_fnoise1_2: - case nir_op_fnoise1_3: - case nir_op_fnoise1_4: - case nir_op_fnoise2_1: - case nir_op_fnoise2_2: - case nir_op_fnoise2_3: - case nir_op_fnoise2_4: - case nir_op_fnoise3_1: - case nir_op_fnoise3_2: - case nir_op_fnoise3_3: - case nir_op_fnoise3_4: - case nir_op_fnoise4_1: - case nir_op_fnoise4_2: - case nir_op_fnoise4_3: - case nir_op_fnoise4_4: - unreachable("not reached: should be handled by lower_noise"); - case nir_op_ldexp: unreachable("not reached: should be handled by ldexp_to_arith()"); case nir_op_fsqrt: inst = bld.emit(SHADER_OPCODE_SQRT, result, op[0]); - inst->saturate = instr->dest.saturate; break; case nir_op_frsq: inst = bld.emit(SHADER_OPCODE_RSQ, result, op[0]); - inst->saturate = instr->dest.saturate; break; case nir_op_i2b32: @@ -1521,7 +1591,12 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr, case nir_op_ftrunc: inst = bld.RNDZ(result, op[0]); - inst->saturate = instr->dest.saturate; + if (devinfo->gen < 6) { + set_condmod(BRW_CONDITIONAL_R, inst); + set_predicate(BRW_PREDICATE_NORMAL, + bld.ADD(result, result, brw_imm_f(1.0f))); + inst = bld.MOV(result, result); /* for potential saturation */ + } break; case nir_op_fceil: { @@ -1530,20 +1605,22 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr, bld.RNDD(temp, op[0]); temp.negate = true; inst = bld.MOV(result, temp); - inst->saturate = instr->dest.saturate; break; } case nir_op_ffloor: inst = bld.RNDD(result, op[0]); - inst->saturate = instr->dest.saturate; break; case nir_op_ffract: inst = bld.FRC(result, op[0]); - inst->saturate = instr->dest.saturate; break; case nir_op_fround_even: inst = bld.RNDE(result, op[0]); - inst->saturate = instr->dest.saturate; + if (devinfo->gen < 6) { + set_condmod(BRW_CONDITIONAL_R, inst); + set_predicate(BRW_PREDICATE_NORMAL, + bld.ADD(result, result, brw_imm_f(1.0f))); + inst = bld.MOV(result, result); /* for potential saturation */ + } break; case nir_op_fquantize2f16: { @@ -1570,7 +1647,6 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr, /* Select that or zero based on normal status */ inst = bld.SEL(result, zero, tmp32); inst->predicate = BRW_PREDICATE_NORMAL; - inst->saturate = instr->dest.saturate; break; } @@ -1578,14 
+1654,12 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr, case nir_op_umin: case nir_op_fmin: inst = bld.emit_minmax(result, op[0], op[1], BRW_CONDITIONAL_L); - inst->saturate = instr->dest.saturate; break; case nir_op_imax: case nir_op_umax: case nir_op_fmax: inst = bld.emit_minmax(result, op[0], op[1], BRW_CONDITIONAL_GE); - inst->saturate = instr->dest.saturate; break; case nir_op_pack_snorm_2x16: @@ -1600,15 +1674,20 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr, case nir_op_pack_half_2x16: unreachable("not reached: should be handled by lower_packing_builtins"); + case nir_op_unpack_half_2x16_split_x_flush_to_zero: + assert(FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16 & execution_mode); + /* Fall-through */ case nir_op_unpack_half_2x16_split_x: inst = bld.emit(BRW_OPCODE_F16TO32, result, subscript(op[0], BRW_REGISTER_TYPE_UW, 0)); - inst->saturate = instr->dest.saturate; break; + + case nir_op_unpack_half_2x16_split_y_flush_to_zero: + assert(FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16 & execution_mode); + /* Fall-through */ case nir_op_unpack_half_2x16_split_y: inst = bld.emit(BRW_OPCODE_F16TO32, result, subscript(op[0], BRW_REGISTER_TYPE_UW, 1)); - inst->saturate = instr->dest.saturate; break; case nir_op_pack_64_2x32_split: @@ -1636,7 +1715,6 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr, case nir_op_fpow: inst = bld.emit(SHADER_OPCODE_POW, result, op[0], op[1]); - inst->saturate = instr->dest.saturate; break; case nir_op_bitfield_reverse: @@ -1655,6 +1733,11 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr, break; } + case nir_op_uclz: + assert(nir_dest_bit_size(instr->dest.dest) == 32); + bld.LZD(retype(result, BRW_REGISTER_TYPE_UD), op[0]); + break; + case nir_op_ifind_msb: { assert(nir_dest_bit_size(instr->dest.dest) < 64); @@ -1744,13 +1827,25 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr, break; case nir_op_ffma: + if (nir_has_any_rounding_mode_enabled(execution_mode)) { + brw_rnd_mode rnd = + brw_rnd_mode_from_execution_mode(execution_mode); + bld.emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(), + brw_imm_d(rnd)); + } + inst = bld.MAD(result, op[2], op[1], op[0]); - inst->saturate = instr->dest.saturate; break; case nir_op_flrp: + if (nir_has_any_rounding_mode_enabled(execution_mode)) { + brw_rnd_mode rnd = + brw_rnd_mode_from_execution_mode(execution_mode); + bld.emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(), + brw_imm_d(rnd)); + } + inst = bld.LRP(result, op[0], op[1], op[2]); - inst->saturate = instr->dest.saturate; break; case nir_op_b32csel: @@ -2658,7 +2753,7 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld, brw_imm_d(tcs_key->input_vertices)); break; - case nir_intrinsic_barrier: { + case nir_intrinsic_control_barrier: { if (tcs_prog_data->instances == 1) break; @@ -3070,7 +3165,15 @@ fs_visitor::nir_emit_gs_intrinsic(const fs_builder &bld, static fs_reg fetch_render_target_array_index(const fs_builder &bld) { - if (bld.shader->devinfo->gen >= 6) { + if (bld.shader->devinfo->gen >= 12) { + /* The render target array index is provided in the thread payload as + * bits 26:16 of r1.1. + */ + const fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_UD); + bld.AND(idx, brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, 1, 3), + brw_imm_uw(0x7ff)); + return idx; + } else if (bld.shader->devinfo->gen >= 6) { /* The render target array index is provided in the thread payload as * bits 26:16 of r0.0. 
*/ @@ -3220,44 +3323,6 @@ alloc_frag_output(fs_visitor *v, unsigned location) unreachable("Invalid location"); } -/* Annoyingly, we get the barycentrics into the shader in a layout that's - * optimized for PLN but it doesn't work nearly as well as one would like for - * manual interpolation. - */ -static void -shuffle_from_pln_layout(const fs_builder &bld, fs_reg dest, fs_reg pln_data) -{ - dest.type = BRW_REGISTER_TYPE_F; - pln_data.type = BRW_REGISTER_TYPE_F; - const fs_reg dest_u = offset(dest, bld, 0); - const fs_reg dest_v = offset(dest, bld, 1); - - for (unsigned g = 0; g < bld.dispatch_width() / 8; g++) { - const fs_builder gbld = bld.group(8, g); - gbld.MOV(horiz_offset(dest_u, g * 8), - byte_offset(pln_data, (g * 2 + 0) * REG_SIZE)); - gbld.MOV(horiz_offset(dest_v, g * 8), - byte_offset(pln_data, (g * 2 + 1) * REG_SIZE)); - } -} - -static void -shuffle_to_pln_layout(const fs_builder &bld, fs_reg pln_data, fs_reg src) -{ - pln_data.type = BRW_REGISTER_TYPE_F; - src.type = BRW_REGISTER_TYPE_F; - const fs_reg src_u = offset(src, bld, 0); - const fs_reg src_v = offset(src, bld, 1); - - for (unsigned g = 0; g < bld.dispatch_width() / 8; g++) { - const fs_builder gbld = bld.group(8, g); - gbld.MOV(byte_offset(pln_data, (g * 2 + 0) * REG_SIZE), - horiz_offset(src_u, g * 8)); - gbld.MOV(byte_offset(pln_data, (g * 2 + 1) * REG_SIZE), - horiz_offset(src_v, g * 8)); - } -} - void fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr) @@ -3301,7 +3366,7 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld, fs_inst *mov = bld.MOV(dest, brw_imm_ud(~0)); mov->predicate = BRW_PREDICATE_NORMAL; mov->predicate_inverse = true; - mov->flag_subreg = 1; + mov->flag_subreg = sample_mask_flag_subreg(this); break; } @@ -3356,9 +3421,9 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld, case nir_intrinsic_discard: case nir_intrinsic_demote_if: case nir_intrinsic_discard_if: { - /* We track our discarded pixels in f0.1. By predicating on it, we can - * update just the flag bits that aren't yet discarded. If there's no - * condition, we emit a CMP of g0 != g0, so all currently executing + /* We track our discarded pixels in f0.1/f1.0. By predicating on it, we + * can update just the flag bits that aren't yet discarded. If there's + * no condition, we emit a CMP of g0 != g0, so all currently executing * channels will get turned off. */ fs_inst *cmp = NULL; @@ -3368,7 +3433,13 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld, if (alu != NULL && alu->op != nir_op_bcsel && - alu->op != nir_op_inot) { + (devinfo->gen > 5 || + (alu->instr.pass_flags & BRW_NIR_BOOLEAN_MASK) != BRW_NIR_BOOLEAN_NEEDS_RESOLVE || + alu->op == nir_op_fne32 || alu->op == nir_op_feq32 || + alu->op == nir_op_flt32 || alu->op == nir_op_fge32 || + alu->op == nir_op_ine32 || alu->op == nir_op_ieq32 || + alu->op == nir_op_ilt32 || alu->op == nir_op_ige32 || + alu->op == nir_op_ult32 || alu->op == nir_op_uge32)) { /* Re-emit the instruction that generated the Boolean value, but * do not store it. Since this instruction will be conditional, * other instructions that want to use the real Boolean value may @@ -3411,17 +3482,13 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld, } cmp->predicate = BRW_PREDICATE_NORMAL; - cmp->flag_subreg = 1; + cmp->flag_subreg = sample_mask_flag_subreg(this); - if (devinfo->gen >= 6) { - /* Due to the way we implement discard, the jump will only happen - * when the whole quad is discarded. 
So we can do this even for - * demote as it won't break its uniformity promises. - */ - emit_discard_jump(); - } + emit_discard_jump(); - limit_dispatch_width(16, "Fragment discard/demote not implemented in SIMD32 mode."); + if (devinfo->gen < 7) + limit_dispatch_width( + 16, "Fragment discard/demote not implemented in SIMD32 mode.\n"); break; } @@ -3465,8 +3532,9 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld, (enum glsl_interp_mode) nir_intrinsic_interp_mode(instr); enum brw_barycentric_mode bary = brw_barycentric_mode(interp_mode, instr->intrinsic); - - shuffle_from_pln_layout(bld, dest, this->delta_xy[bary]); + const fs_reg srcs[] = { offset(this->delta_xy[bary], bld, 0), + offset(this->delta_xy[bary], bld, 1) }; + bld.LOAD_PAYLOAD(dest, srcs, ARRAY_SIZE(srcs), 0); break; } @@ -3474,13 +3542,12 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld, const glsl_interp_mode interpolation = (enum glsl_interp_mode) nir_intrinsic_interp_mode(instr); - fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, 2); if (nir_src_is_const(instr->src[0])) { unsigned msg_data = nir_src_as_uint(instr->src[0]) << 4; emit_pixel_interpolater_send(bld, FS_OPCODE_INTERPOLATE_AT_SAMPLE, - tmp, + dest, fs_reg(), /* src */ brw_imm_ud(msg_data), interpolation); @@ -3495,9 +3562,9 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld, .SHL(msg_data, sample_id, brw_imm_ud(4u)); emit_pixel_interpolater_send(bld, FS_OPCODE_INTERPOLATE_AT_SAMPLE, - tmp, + dest, fs_reg(), /* src */ - msg_data, + component(msg_data, 0), interpolation); } else { /* Make a loop that sends a message to the pixel interpolater @@ -3523,7 +3590,7 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld, fs_inst *inst = emit_pixel_interpolater_send(bld, FS_OPCODE_INTERPOLATE_AT_SAMPLE, - tmp, + dest, fs_reg(), /* src */ component(msg_data, 0), interpolation); @@ -3535,7 +3602,6 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld, bld.emit(BRW_OPCODE_WHILE)); } } - shuffle_from_pln_layout(bld, dest, tmp); break; } @@ -3545,7 +3611,6 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld, nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]); - fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, 2); if (const_offset) { assert(nir_src_bit_size(instr->src[0]) == 32); unsigned off_x = MIN2((int)(const_offset[0].f32 * 16), 7) & 0xf; @@ -3553,7 +3618,7 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld, emit_pixel_interpolater_send(bld, FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET, - tmp, + dest, fs_reg(), /* src */ brw_imm_ud(off_x | (off_y << 4)), interpolation); @@ -3590,12 +3655,11 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld, const enum opcode opcode = FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET; emit_pixel_interpolater_send(bld, opcode, - tmp, + dest, src, brw_imm_ud(0u), interpolation); } - shuffle_from_pln_layout(bld, dest, tmp); break; } @@ -3615,25 +3679,19 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld, if (bary_intrin == nir_intrinsic_load_barycentric_at_offset || bary_intrin == nir_intrinsic_load_barycentric_at_sample) { - /* Use the result of the PI message. Because the load_barycentric - * intrinsics return a regular vec2 and we need it in PLN layout, we - * have to do a translation. Fortunately, copy-prop cleans this up - * reliably. - */ - dst_xy = bld.vgrf(BRW_REGISTER_TYPE_F, 2); - shuffle_to_pln_layout(bld, dst_xy, get_nir_src(instr->src[0])); + /* Use the result of the PI message. 
*/ + dst_xy = retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_F); } else { /* Use the delta_xy values computed from the payload */ enum brw_barycentric_mode bary = brw_barycentric_mode(interp_mode, bary_intrin); - dst_xy = this->delta_xy[bary]; } for (unsigned int i = 0; i < instr->num_components; i++) { fs_reg interp = - interp_reg(nir_intrinsic_base(instr), - nir_intrinsic_component(instr) + i); + component(interp_reg(nir_intrinsic_base(instr), + nir_intrinsic_component(instr) + i), 0); interp.type = BRW_REGISTER_TYPE_F; dest.type = BRW_REGISTER_TYPE_F; @@ -3654,25 +3712,11 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld, } } -static int -get_op_for_atomic_add(nir_intrinsic_instr *instr, unsigned src) -{ - if (nir_src_is_const(instr->src[src])) { - int64_t add_val = nir_src_as_int(instr->src[src]); - if (add_val == 1) - return BRW_AOP_INC; - else if (add_val == -1) - return BRW_AOP_DEC; - } - - return BRW_AOP_ADD; -} - void fs_visitor::nir_emit_cs_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr) { - assert(stage == MESA_SHADER_COMPUTE); + assert(stage == MESA_SHADER_COMPUTE || stage == MESA_SHADER_KERNEL); struct brw_cs_prog_data *cs_prog_data = brw_cs_prog_data(prog_data); fs_reg dest; @@ -3680,7 +3724,17 @@ fs_visitor::nir_emit_cs_intrinsic(const fs_builder &bld, dest = get_nir_dest(instr->dest); switch (instr->intrinsic) { - case nir_intrinsic_barrier: + case nir_intrinsic_control_barrier: + /* The whole workgroup fits in a single HW thread, so all the + * invocations are already executed lock-step. Instead of an actual + * barrier just emit a scheduling fence, that will generate no code. + */ + if (!nir->info.cs.local_size_variable && + workgroup_size() <= dispatch_width) { + bld.exec_all().group(1, 0).emit(FS_OPCODE_SCHEDULING_FENCE); + break; + } + emit_barrier(); cs_prog_data->uses_barrier = true; break; @@ -3721,48 +3775,26 @@ fs_visitor::nir_emit_cs_intrinsic(const fs_builder &bld, } case nir_intrinsic_shared_atomic_add: - nir_emit_shared_atomic(bld, get_op_for_atomic_add(instr, 1), instr); - break; case nir_intrinsic_shared_atomic_imin: - nir_emit_shared_atomic(bld, BRW_AOP_IMIN, instr); - break; case nir_intrinsic_shared_atomic_umin: - nir_emit_shared_atomic(bld, BRW_AOP_UMIN, instr); - break; case nir_intrinsic_shared_atomic_imax: - nir_emit_shared_atomic(bld, BRW_AOP_IMAX, instr); - break; case nir_intrinsic_shared_atomic_umax: - nir_emit_shared_atomic(bld, BRW_AOP_UMAX, instr); - break; case nir_intrinsic_shared_atomic_and: - nir_emit_shared_atomic(bld, BRW_AOP_AND, instr); - break; case nir_intrinsic_shared_atomic_or: - nir_emit_shared_atomic(bld, BRW_AOP_OR, instr); - break; case nir_intrinsic_shared_atomic_xor: - nir_emit_shared_atomic(bld, BRW_AOP_XOR, instr); - break; case nir_intrinsic_shared_atomic_exchange: - nir_emit_shared_atomic(bld, BRW_AOP_MOV, instr); - break; case nir_intrinsic_shared_atomic_comp_swap: - nir_emit_shared_atomic(bld, BRW_AOP_CMPWR, instr); + nir_emit_shared_atomic(bld, brw_aop_for_nir_intrinsic(instr), instr); break; case nir_intrinsic_shared_atomic_fmin: - nir_emit_shared_atomic_float(bld, BRW_AOP_FMIN, instr); - break; case nir_intrinsic_shared_atomic_fmax: - nir_emit_shared_atomic_float(bld, BRW_AOP_FMAX, instr); - break; case nir_intrinsic_shared_atomic_fcomp_swap: - nir_emit_shared_atomic_float(bld, BRW_AOP_FCMPWR, instr); + nir_emit_shared_atomic_float(bld, brw_aop_for_nir_intrinsic(instr), instr); break; case nir_intrinsic_load_shared: { assert(devinfo->gen >= 7); - assert(stage == MESA_SHADER_COMPUTE); + 
assert(stage == MESA_SHADER_COMPUTE || stage == MESA_SHADER_KERNEL); const unsigned bit_size = nir_dest_bit_size(instr->dest); fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; @@ -3774,15 +3806,17 @@ fs_visitor::nir_emit_cs_intrinsic(const fs_builder &bld, dest.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD); /* Read the vector */ - if (nir_intrinsic_align(instr) >= 4) { - assert(nir_dest_bit_size(instr->dest) == 32); + assert(nir_dest_bit_size(instr->dest) <= 32); + assert(nir_intrinsic_align(instr) > 0); + if (nir_dest_bit_size(instr->dest) == 32 && + nir_intrinsic_align(instr) >= 4) { + assert(nir_dest_num_components(instr->dest) <= 4); srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components); fs_inst *inst = bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL, dest, srcs, SURFACE_LOGICAL_NUM_SRCS); inst->size_written = instr->num_components * dispatch_width * 4; } else { - assert(nir_dest_bit_size(instr->dest) <= 32); assert(nir_dest_num_components(instr->dest) == 1); srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size); @@ -3796,7 +3830,7 @@ fs_visitor::nir_emit_cs_intrinsic(const fs_builder &bld, case nir_intrinsic_store_shared: { assert(devinfo->gen >= 7); - assert(stage == MESA_SHADER_COMPUTE); + assert(stage == MESA_SHADER_COMPUTE || stage == MESA_SHADER_KERNEL); const unsigned bit_size = nir_src_bit_size(instr->src[0]); fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; @@ -3807,17 +3841,18 @@ fs_visitor::nir_emit_cs_intrinsic(const fs_builder &bld, fs_reg data = get_nir_src(instr->src[0]); data.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD); + assert(nir_src_bit_size(instr->src[0]) <= 32); assert(nir_intrinsic_write_mask(instr) == (1u << instr->num_components) - 1); - if (nir_intrinsic_align(instr) >= 4) { - assert(nir_src_bit_size(instr->src[0]) == 32); + assert(nir_intrinsic_align(instr) > 0); + if (nir_src_bit_size(instr->src[0]) == 32 && + nir_intrinsic_align(instr) >= 4) { assert(nir_src_num_components(instr->src[0]) <= 4); srcs[SURFACE_LOGICAL_SRC_DATA] = data; srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components); bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL, fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS); } else { - assert(nir_src_bit_size(instr->src[0]) <= 32); assert(nir_src_num_components(instr->src[0]) == 1); srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size); @@ -3830,6 +3865,16 @@ fs_visitor::nir_emit_cs_intrinsic(const fs_builder &bld, break; } + case nir_intrinsic_load_local_group_size: { + assert(compiler->lower_variable_group_size); + assert(nir->info.cs.local_size_variable); + for (unsigned i = 0; i < 3; i++) { + bld.MOV(retype(offset(dest, bld, i), BRW_REGISTER_TYPE_UD), + group_size[i]); + } + break; + } + default: nir_emit_intrinsic(bld, instr); break; @@ -3842,8 +3887,14 @@ brw_nir_reduction_op_identity(const fs_builder &bld, { nir_const_value value = nir_alu_binop_identity(op, type_sz(type) * 8); switch (type_sz(type)) { + case 1: + if (type == BRW_REGISTER_TYPE_UB) { + return brw_imm_uw(value.u8); + } else { + assert(type == BRW_REGISTER_TYPE_B); + return brw_imm_w(value.i8); + } case 2: - assert(type != BRW_REGISTER_TYPE_HF); return retype(brw_imm_uw(value.u16), type); case 4: return retype(brw_imm_ud(value.u32), type); @@ -3906,17 +3957,20 @@ fs_visitor::get_nir_image_intrinsic_image(const brw::fs_builder &bld, nir_intrinsic_instr *instr) { fs_reg image = retype(get_nir_src_imm(instr->src[0]), BRW_REGISTER_TYPE_UD); + fs_reg surf_index = image; if (stage_prog_data->binding_table.image_start > 0) 
{ if (image.file == BRW_IMMEDIATE_VALUE) { - image.d += stage_prog_data->binding_table.image_start; + surf_index = + brw_imm_ud(image.d + stage_prog_data->binding_table.image_start); } else { - bld.ADD(image, image, + surf_index = vgrf(glsl_type::uint_type); + bld.ADD(surf_index, image, brw_imm_d(stage_prog_data->binding_table.image_start)); } } - return bld.emit_uniformize(image); + return bld.emit_uniformize(surf_index); } fs_reg @@ -3940,25 +3994,59 @@ fs_visitor::get_nir_ssbo_intrinsic_index(const brw::fs_builder &bld, return bld.emit_uniformize(surf_index); } -static unsigned -image_intrinsic_coord_components(nir_intrinsic_instr *instr) +/** + * The offsets we get from NIR act as if each SIMD channel has it's own blob + * of contiguous space. However, if we actually place each SIMD channel in + * it's own space, we end up with terrible cache performance because each SIMD + * channel accesses a different cache line even when they're all accessing the + * same byte offset. To deal with this problem, we swizzle the address using + * a simple algorithm which ensures that any time a SIMD message reads or + * writes the same address, it's all in the same cache line. We have to keep + * the bottom two bits fixed so that we can read/write up to a dword at a time + * and the individual element is contiguous. We do this by splitting the + * address as follows: + * + * 31 4-6 2 0 + * +-------------------------------+------------+----------+ + * | Hi address bits | chan index | addr low | + * +-------------------------------+------------+----------+ + * + * In other words, the bottom two address bits stay, and the top 30 get + * shifted up so that we can stick the SIMD channel index in the middle. This + * way, we can access 8, 16, or 32-bit elements and, when accessing a 32-bit + * at the same logical offset, the scratch read/write instruction acts on + * continuous elements and we get good cache locality. + */ +fs_reg +fs_visitor::swizzle_nir_scratch_addr(const brw::fs_builder &bld, + const fs_reg &nir_addr, + bool in_dwords) { - switch (nir_intrinsic_image_dim(instr)) { - case GLSL_SAMPLER_DIM_1D: - return 1 + nir_intrinsic_image_array(instr); - case GLSL_SAMPLER_DIM_2D: - case GLSL_SAMPLER_DIM_RECT: - return 2 + nir_intrinsic_image_array(instr); - case GLSL_SAMPLER_DIM_3D: - case GLSL_SAMPLER_DIM_CUBE: - return 3; - case GLSL_SAMPLER_DIM_BUF: - return 1; - case GLSL_SAMPLER_DIM_MS: - return 2 + nir_intrinsic_image_array(instr); - default: - unreachable("Invalid image dimension"); - } + const fs_reg &chan_index = + nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION]; + const unsigned chan_index_bits = ffs(dispatch_width) - 1; + + fs_reg addr = bld.vgrf(BRW_REGISTER_TYPE_UD); + if (in_dwords) { + /* In this case, we know the address is aligned to a DWORD and we want + * the final address in DWORDs. + */ + bld.SHL(addr, nir_addr, brw_imm_ud(chan_index_bits - 2)); + bld.OR(addr, addr, chan_index); + } else { + /* This case substantially more annoying because we have to pay + * attention to those pesky two bottom bits. 
+ */ + fs_reg addr_hi = bld.vgrf(BRW_REGISTER_TYPE_UD); + bld.AND(addr_hi, nir_addr, brw_imm_ud(~0x3u)); + bld.SHL(addr_hi, addr_hi, brw_imm_ud(chan_index_bits)); + fs_reg chan_addr = bld.vgrf(BRW_REGISTER_TYPE_UD); + bld.SHL(chan_addr, chan_index, brw_imm_ud(2)); + bld.AND(addr, nir_addr, brw_imm_ud(0x3u)); + bld.OR(addr, addr, addr_hi); + bld.OR(addr, addr, chan_addr); + } + return addr; } void @@ -3993,13 +4081,8 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr case nir_intrinsic_bindless_image_atomic_xor: case nir_intrinsic_bindless_image_atomic_exchange: case nir_intrinsic_bindless_image_atomic_comp_swap: { - if (stage == MESA_SHADER_FRAGMENT && - instr->intrinsic != nir_intrinsic_image_load) - brw_wm_prog_data(prog_data)->has_side_effects = true; - /* Get some metadata from the image intrinsic. */ const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic]; - const GLenum format = nir_intrinsic_format(instr); fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; @@ -4029,7 +4112,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]); srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = - brw_imm_ud(image_intrinsic_coord_components(instr)); + brw_imm_ud(nir_image_intrinsic_coord_components(instr)); /* Emit an image load, store or atomic op. */ if (instr->intrinsic == nir_intrinsic_image_load || @@ -4046,61 +4129,11 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr bld.emit(SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL, fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS); } else { - int op; unsigned num_srcs = info->num_srcs; - - switch (instr->intrinsic) { - case nir_intrinsic_image_atomic_add: - case nir_intrinsic_bindless_image_atomic_add: + int op = brw_aop_for_nir_intrinsic(instr); + if (op == BRW_AOP_INC || op == BRW_AOP_DEC) { assert(num_srcs == 4); - - op = get_op_for_atomic_add(instr, 3); - - if (op != BRW_AOP_ADD) - num_srcs = 3; - break; - case nir_intrinsic_image_atomic_imin: - case nir_intrinsic_bindless_image_atomic_imin: - assert(format == GL_R32I); - op = BRW_AOP_IMIN; - break; - case nir_intrinsic_image_atomic_umin: - case nir_intrinsic_bindless_image_atomic_umin: - assert(format == GL_R32UI); - op = BRW_AOP_UMIN; - break; - case nir_intrinsic_image_atomic_imax: - case nir_intrinsic_bindless_image_atomic_imax: - assert(format == GL_R32I); - op = BRW_AOP_IMAX; - break; - case nir_intrinsic_image_atomic_umax: - case nir_intrinsic_bindless_image_atomic_umax: - assert(format == GL_R32UI); - op = BRW_AOP_UMAX; - break; - case nir_intrinsic_image_atomic_and: - case nir_intrinsic_bindless_image_atomic_and: - op = BRW_AOP_AND; - break; - case nir_intrinsic_image_atomic_or: - case nir_intrinsic_bindless_image_atomic_or: - op = BRW_AOP_OR; - break; - case nir_intrinsic_image_atomic_xor: - case nir_intrinsic_bindless_image_atomic_xor: - op = BRW_AOP_XOR; - break; - case nir_intrinsic_image_atomic_exchange: - case nir_intrinsic_bindless_image_atomic_exchange: - op = BRW_AOP_MOV; - break; - case nir_intrinsic_image_atomic_comp_swap: - case nir_intrinsic_bindless_image_atomic_comp_swap: - op = BRW_AOP_CMPWR; - break; - default: - unreachable("Not reachable."); + num_srcs = 3; } srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(op); @@ -4181,9 +4214,6 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr } case nir_intrinsic_image_store_raw_intel: { - if (stage == MESA_SHADER_FRAGMENT) - 
brw_wm_prog_data(prog_data)->has_side_effects = true; - fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; srcs[SURFACE_LOGICAL_SRC_SURFACE] = get_nir_image_intrinsic_image(bld, instr); @@ -4197,56 +4227,155 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr break; } + case nir_intrinsic_scoped_barrier: + assert(nir_intrinsic_execution_scope(instr) == NIR_SCOPE_NONE); + /* Fall through. */ case nir_intrinsic_group_memory_barrier: case nir_intrinsic_memory_barrier_shared: - case nir_intrinsic_memory_barrier_atomic_counter: case nir_intrinsic_memory_barrier_buffer: case nir_intrinsic_memory_barrier_image: - case nir_intrinsic_memory_barrier: { + case nir_intrinsic_memory_barrier: + case nir_intrinsic_begin_invocation_interlock: + case nir_intrinsic_end_invocation_interlock: { bool l3_fence, slm_fence; - if (devinfo->gen >= 11) { + const enum opcode opcode = + instr->intrinsic == nir_intrinsic_begin_invocation_interlock ? + SHADER_OPCODE_INTERLOCK : SHADER_OPCODE_MEMORY_FENCE; + + switch (instr->intrinsic) { + case nir_intrinsic_scoped_barrier: { + nir_variable_mode modes = nir_intrinsic_memory_modes(instr); + l3_fence = modes & (nir_var_shader_out | + nir_var_mem_ssbo | + nir_var_mem_global); + slm_fence = modes & nir_var_mem_shared; + break; + } + + case nir_intrinsic_begin_invocation_interlock: + case nir_intrinsic_end_invocation_interlock: + /* For beginInvocationInterlockARB(), we will generate a memory fence + * but with a different opcode so that generator can pick SENDC + * instead of SEND. + * + * For endInvocationInterlockARB(), we need to insert a memory fence which + * stalls in the shader until the memory transactions prior to that + * fence are complete. This ensures that the shader does not end before + * any writes from its critical section have landed. Otherwise, you can + * end up with a case where the next invocation on that pixel properly + * stalls for previous FS invocation on its pixel to complete but + * doesn't actually wait for the dataport memory transactions from that + * thread to land before submitting its own. + * + * Handling them here will allow the logic for IVB render cache (see + * below) to be reused. + */ + l3_fence = true; + slm_fence = false; + break; + + default: l3_fence = instr->intrinsic != nir_intrinsic_memory_barrier_shared; slm_fence = instr->intrinsic == nir_intrinsic_group_memory_barrier || instr->intrinsic == nir_intrinsic_memory_barrier || instr->intrinsic == nir_intrinsic_memory_barrier_shared; - } else { - /* Prior to gen11, we only have one kind of fence. */ - l3_fence = true; + break; + } + + if (stage != MESA_SHADER_COMPUTE && stage != MESA_SHADER_KERNEL) + slm_fence = false; + + /* If the workgroup fits in a single HW thread, the messages for SLM are + * processed in-order and the shader itself is already synchronized so + * the memory fence is not necessary. + * + * TODO: Check if applies for many HW threads sharing same Data Port. + */ + if (!nir->info.cs.local_size_variable && + slm_fence && workgroup_size() <= dispatch_width) slm_fence = false; + + /* Prior to Gen11, there's only L3 fence, so emit that instead. */ + if (slm_fence && devinfo->gen < 11) { + slm_fence = false; + l3_fence = true; } + /* IVB does typed surface access through the render cache, so we need + * to flush it too. + */ + const bool needs_render_fence = + devinfo->gen == 7 && !devinfo->is_haswell; + /* Be conservative in Gen11+ and always stall in a fence. 
Since there * are two different fences, and shader might want to synchronize * between them. * - * TODO: Improve NIR so that scope and visibility information for the - * barriers is available here to make a better decision. - * - * TODO: When emitting more than one fence, it might help emit all - * the fences first and then generate the stall moves. + * TODO: Use scope and visibility information for the barriers from NIR + * to make a better decision on whether we need to stall. */ - const bool stall = devinfo->gen >= 11; + const bool stall = devinfo->gen >= 11 || needs_render_fence || + instr->intrinsic == nir_intrinsic_end_invocation_interlock; + + const bool commit_enable = stall || + devinfo->gen >= 10; /* HSD ES # 1404612949 */ + + unsigned fence_regs_count = 0; + fs_reg fence_regs[2] = {}; const fs_builder ubld = bld.group(8, 0); - const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2); if (l3_fence) { - ubld.emit(SHADER_OPCODE_MEMORY_FENCE, tmp, - brw_vec8_grf(0, 0), brw_imm_ud(stall), - /* bti */ brw_imm_ud(0)) - ->size_written = 2 * REG_SIZE; + fs_inst *fence = + ubld.emit(opcode, + ubld.vgrf(BRW_REGISTER_TYPE_UD), + brw_vec8_grf(0, 0), + brw_imm_ud(commit_enable), + brw_imm_ud(/* bti */ 0)); + fence->sfid = GEN7_SFID_DATAPORT_DATA_CACHE; + + fence_regs[fence_regs_count++] = fence->dst; + + if (needs_render_fence) { + fs_inst *render_fence = + ubld.emit(opcode, + ubld.vgrf(BRW_REGISTER_TYPE_UD), + brw_vec8_grf(0, 0), + brw_imm_ud(commit_enable), + brw_imm_ud(/* bti */ 0)); + render_fence->sfid = GEN6_SFID_DATAPORT_RENDER_CACHE; + + fence_regs[fence_regs_count++] = render_fence->dst; + } } if (slm_fence) { - ubld.emit(SHADER_OPCODE_MEMORY_FENCE, tmp, - brw_vec8_grf(0, 0), brw_imm_ud(stall), - brw_imm_ud(GEN7_BTI_SLM)) - ->size_written = 2 * REG_SIZE; + assert(opcode == SHADER_OPCODE_MEMORY_FENCE); + fs_inst *fence = + ubld.emit(opcode, + ubld.vgrf(BRW_REGISTER_TYPE_UD), + brw_vec8_grf(0, 0), + brw_imm_ud(commit_enable), + brw_imm_ud(GEN7_BTI_SLM)); + fence->sfid = GEN7_SFID_DATAPORT_DATA_CACHE; + + fence_regs[fence_regs_count++] = fence->dst; + } + + assert(fence_regs_count <= 2); + + if (stall || fence_regs_count == 0) { + ubld.exec_all().group(1, 0).emit( + FS_OPCODE_SCHEDULING_FENCE, ubld.null_reg_ud(), + fence_regs, fence_regs_count); } break; } + case nir_intrinsic_memory_barrier_tcs_patch: + break; + case nir_intrinsic_shader_clock: { /* We cannot do anything if there is an event, so ignore it for now */ const fs_reg shader_clock = get_timestamp(bld); @@ -4349,6 +4478,8 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr for (int i = 0; i < instr->num_components; i++) VARYING_PULL_CONSTANT_LOAD(bld, offset(dest, bld, i), surf_index, base_offset, i * type_sz(dest.type)); + + prog_data->has_ubo_pull = true; } else { /* Even if we are loading doubles, a pull constant load will load * a 32-bit vec4, so should only reserve vgrf space for that. If we @@ -4388,6 +4519,8 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr } } + prog_data->has_ubo_pull = true; + const unsigned block_sz = 64; /* Fetch one cacheline at a time. 
*/ const fs_builder ubld = bld.exec_all().group(block_sz / 4, 0); const fs_reg packed_consts = ubld.vgrf(BRW_REGISTER_TYPE_UD); @@ -4418,8 +4551,11 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr case nir_intrinsic_load_global: { assert(devinfo->gen >= 8); - if (nir_intrinsic_align(instr) >= 4) { - assert(nir_dest_bit_size(instr->dest) == 32); + assert(nir_dest_bit_size(instr->dest) <= 32); + assert(nir_intrinsic_align(instr) > 0); + if (nir_dest_bit_size(instr->dest) == 32 && + nir_intrinsic_align(instr) >= 4) { + assert(nir_dest_num_components(instr->dest) <= 4); fs_inst *inst = bld.emit(SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL, dest, get_nir_src(instr->src[0]), /* Address */ @@ -4429,7 +4565,6 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr inst->dst.component_size(inst->exec_size); } else { const unsigned bit_size = nir_dest_bit_size(instr->dest); - assert(bit_size <= 32); assert(nir_dest_num_components(instr->dest) == 1); fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD); bld.emit(SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL, @@ -4445,20 +4580,21 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr case nir_intrinsic_store_global: assert(devinfo->gen >= 8); - if (stage == MESA_SHADER_FRAGMENT) - brw_wm_prog_data(prog_data)->has_side_effects = true; - - if (nir_intrinsic_align(instr) >= 4) { - assert(nir_src_bit_size(instr->src[0]) == 32); + assert(nir_src_bit_size(instr->src[0]) <= 32); + assert(nir_intrinsic_write_mask(instr) == + (1u << instr->num_components) - 1); + assert(nir_intrinsic_align(instr) > 0); + if (nir_src_bit_size(instr->src[0]) == 32 && + nir_intrinsic_align(instr) >= 4) { + assert(nir_src_num_components(instr->src[0]) <= 4); bld.emit(SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL, fs_reg(), get_nir_src(instr->src[1]), /* Address */ get_nir_src(instr->src[0]), /* Data */ brw_imm_ud(instr->num_components)); } else { - const unsigned bit_size = nir_src_bit_size(instr->src[0]); - assert(bit_size <= 32); assert(nir_src_num_components(instr->src[0]) == 1); + const unsigned bit_size = nir_src_bit_size(instr->src[0]); brw_reg_type data_type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD); fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD); @@ -4472,43 +4608,21 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr break; case nir_intrinsic_global_atomic_add: - nir_emit_global_atomic(bld, get_op_for_atomic_add(instr, 1), instr); - break; case nir_intrinsic_global_atomic_imin: - nir_emit_global_atomic(bld, BRW_AOP_IMIN, instr); - break; case nir_intrinsic_global_atomic_umin: - nir_emit_global_atomic(bld, BRW_AOP_UMIN, instr); - break; case nir_intrinsic_global_atomic_imax: - nir_emit_global_atomic(bld, BRW_AOP_IMAX, instr); - break; case nir_intrinsic_global_atomic_umax: - nir_emit_global_atomic(bld, BRW_AOP_UMAX, instr); - break; case nir_intrinsic_global_atomic_and: - nir_emit_global_atomic(bld, BRW_AOP_AND, instr); - break; case nir_intrinsic_global_atomic_or: - nir_emit_global_atomic(bld, BRW_AOP_OR, instr); - break; case nir_intrinsic_global_atomic_xor: - nir_emit_global_atomic(bld, BRW_AOP_XOR, instr); - break; case nir_intrinsic_global_atomic_exchange: - nir_emit_global_atomic(bld, BRW_AOP_MOV, instr); - break; case nir_intrinsic_global_atomic_comp_swap: - nir_emit_global_atomic(bld, BRW_AOP_CMPWR, instr); + nir_emit_global_atomic(bld, brw_aop_for_nir_intrinsic(instr), instr); break; case nir_intrinsic_global_atomic_fmin: - 
nir_emit_global_atomic_float(bld, BRW_AOP_FMIN, instr); - break; case nir_intrinsic_global_atomic_fmax: - nir_emit_global_atomic_float(bld, BRW_AOP_FMAX, instr); - break; case nir_intrinsic_global_atomic_fcomp_swap: - nir_emit_global_atomic_float(bld, BRW_AOP_FCMPWR, instr); + nir_emit_global_atomic_float(bld, brw_aop_for_nir_intrinsic(instr), instr); break; case nir_intrinsic_load_ssbo: { @@ -4525,15 +4639,17 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr dest.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD); /* Read the vector */ - if (nir_intrinsic_align(instr) >= 4) { - assert(nir_dest_bit_size(instr->dest) == 32); + assert(nir_dest_bit_size(instr->dest) <= 32); + assert(nir_intrinsic_align(instr) > 0); + if (nir_dest_bit_size(instr->dest) == 32 && + nir_intrinsic_align(instr) >= 4) { + assert(nir_dest_num_components(instr->dest) <= 4); srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components); fs_inst *inst = bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL, dest, srcs, SURFACE_LOGICAL_NUM_SRCS); inst->size_written = instr->num_components * dispatch_width * 4; } else { - assert(nir_dest_bit_size(instr->dest) <= 32); assert(nir_dest_num_components(instr->dest) == 1); srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size); @@ -4548,9 +4664,6 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr case nir_intrinsic_store_ssbo: { assert(devinfo->gen >= 7); - if (stage == MESA_SHADER_FRAGMENT) - brw_wm_prog_data(prog_data)->has_side_effects = true; - const unsigned bit_size = nir_src_bit_size(instr->src[0]); fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; srcs[SURFACE_LOGICAL_SRC_SURFACE] = @@ -4561,17 +4674,18 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr fs_reg data = get_nir_src(instr->src[0]); data.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD); + assert(nir_src_bit_size(instr->src[0]) <= 32); assert(nir_intrinsic_write_mask(instr) == (1u << instr->num_components) - 1); - if (nir_intrinsic_align(instr) >= 4) { - assert(nir_src_bit_size(instr->src[0]) == 32); + assert(nir_intrinsic_align(instr) > 0); + if (nir_src_bit_size(instr->src[0]) == 32 && + nir_intrinsic_align(instr) >= 4) { assert(nir_src_num_components(instr->src[0]) <= 4); srcs[SURFACE_LOGICAL_SRC_DATA] = data; srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components); bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL, fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS); } else { - assert(nir_src_bit_size(instr->src[0]) <= 32); assert(nir_src_num_components(instr->src[0]) == 1); srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size); @@ -4602,43 +4716,21 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr } case nir_intrinsic_ssbo_atomic_add: - nir_emit_ssbo_atomic(bld, get_op_for_atomic_add(instr, 2), instr); - break; case nir_intrinsic_ssbo_atomic_imin: - nir_emit_ssbo_atomic(bld, BRW_AOP_IMIN, instr); - break; case nir_intrinsic_ssbo_atomic_umin: - nir_emit_ssbo_atomic(bld, BRW_AOP_UMIN, instr); - break; case nir_intrinsic_ssbo_atomic_imax: - nir_emit_ssbo_atomic(bld, BRW_AOP_IMAX, instr); - break; case nir_intrinsic_ssbo_atomic_umax: - nir_emit_ssbo_atomic(bld, BRW_AOP_UMAX, instr); - break; case nir_intrinsic_ssbo_atomic_and: - nir_emit_ssbo_atomic(bld, BRW_AOP_AND, instr); - break; case nir_intrinsic_ssbo_atomic_or: - nir_emit_ssbo_atomic(bld, BRW_AOP_OR, instr); - break; case nir_intrinsic_ssbo_atomic_xor: - nir_emit_ssbo_atomic(bld, 
@@ -4602,43 +4716,21 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
    }

    case nir_intrinsic_ssbo_atomic_add:
-      nir_emit_ssbo_atomic(bld, get_op_for_atomic_add(instr, 2), instr);
-      break;
    case nir_intrinsic_ssbo_atomic_imin:
-      nir_emit_ssbo_atomic(bld, BRW_AOP_IMIN, instr);
-      break;
    case nir_intrinsic_ssbo_atomic_umin:
-      nir_emit_ssbo_atomic(bld, BRW_AOP_UMIN, instr);
-      break;
    case nir_intrinsic_ssbo_atomic_imax:
-      nir_emit_ssbo_atomic(bld, BRW_AOP_IMAX, instr);
-      break;
    case nir_intrinsic_ssbo_atomic_umax:
-      nir_emit_ssbo_atomic(bld, BRW_AOP_UMAX, instr);
-      break;
    case nir_intrinsic_ssbo_atomic_and:
-      nir_emit_ssbo_atomic(bld, BRW_AOP_AND, instr);
-      break;
    case nir_intrinsic_ssbo_atomic_or:
-      nir_emit_ssbo_atomic(bld, BRW_AOP_OR, instr);
-      break;
    case nir_intrinsic_ssbo_atomic_xor:
-      nir_emit_ssbo_atomic(bld, BRW_AOP_XOR, instr);
-      break;
    case nir_intrinsic_ssbo_atomic_exchange:
-      nir_emit_ssbo_atomic(bld, BRW_AOP_MOV, instr);
-      break;
    case nir_intrinsic_ssbo_atomic_comp_swap:
-      nir_emit_ssbo_atomic(bld, BRW_AOP_CMPWR, instr);
+      nir_emit_ssbo_atomic(bld, brw_aop_for_nir_intrinsic(instr), instr);
       break;
    case nir_intrinsic_ssbo_atomic_fmin:
-      nir_emit_ssbo_atomic_float(bld, BRW_AOP_FMIN, instr);
-      break;
    case nir_intrinsic_ssbo_atomic_fmax:
-      nir_emit_ssbo_atomic_float(bld, BRW_AOP_FMAX, instr);
-      break;
    case nir_intrinsic_ssbo_atomic_fcomp_swap:
-      nir_emit_ssbo_atomic_float(bld, BRW_AOP_FCMPWR, instr);
+      nir_emit_ssbo_atomic_float(bld, brw_aop_for_nir_intrinsic(instr), instr);
       break;

    case nir_intrinsic_get_buffer_size: {
@@ -4700,6 +4792,99 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
       break;
    }

+   case nir_intrinsic_load_scratch: {
+      assert(devinfo->gen >= 7);
+
+      assert(nir_dest_num_components(instr->dest) == 1);
+      const unsigned bit_size = nir_dest_bit_size(instr->dest);
+      fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
+
+      if (devinfo->gen >= 8) {
+         srcs[SURFACE_LOGICAL_SRC_SURFACE] =
+            brw_imm_ud(GEN8_BTI_STATELESS_NON_COHERENT);
+      } else {
+         srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(BRW_BTI_STATELESS);
+      }
+
+      srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
+      srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size);
+      const fs_reg nir_addr = get_nir_src(instr->src[0]);
+
+      /* Make dest unsigned because that's what the temporary will be */
+      dest.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD);
+
+      /* Read the vector */
+      assert(nir_dest_num_components(instr->dest) == 1);
+      assert(nir_dest_bit_size(instr->dest) <= 32);
+      assert(nir_intrinsic_align(instr) > 1);
+      if (nir_dest_bit_size(instr->dest) >= 4 &&
+          nir_intrinsic_align(instr) >= 4) {
+         /* The offset for a DWORD scattered message is in dwords. */
+         srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
+            swizzle_nir_scratch_addr(bld, nir_addr, true);
+
+         bld.emit(SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL,
+                  dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
+      } else {
+         srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
+            swizzle_nir_scratch_addr(bld, nir_addr, false);
+
+         fs_reg read_result = bld.vgrf(BRW_REGISTER_TYPE_UD);
+         bld.emit(SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL,
+                  read_result, srcs, SURFACE_LOGICAL_NUM_SRCS);
+         bld.MOV(dest, read_result);
+      }
+      break;
+   }
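/* Illustrative sketch, not part of this patch: swizzle_nir_scratch_addr()
 * is defined outside this diff; the boolean passed above (true for the
 * DWORD scattered messages) reflects that those messages take per-channel
 * offsets in dwords rather than bytes.  A hypothetical byte-to-dword
 * conversion of an already dword-aligned NIR offset is just a shift:
 */
static fs_reg
example_bytes_to_dwords(const fs_builder &bld, const fs_reg &byte_addr)
{
   const fs_reg dword_addr = bld.vgrf(BRW_REGISTER_TYPE_UD);
   /* byte offset / 4 == dword index, assuming 4-byte alignment. */
   bld.SHR(dword_addr, retype(byte_addr, BRW_REGISTER_TYPE_UD), brw_imm_ud(2));
   return dword_addr;
}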
+   case nir_intrinsic_store_scratch: {
+      assert(devinfo->gen >= 7);
+
+      assert(nir_src_num_components(instr->src[0]) == 1);
+      const unsigned bit_size = nir_src_bit_size(instr->src[0]);
+      fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
+
+      if (devinfo->gen >= 8) {
+         srcs[SURFACE_LOGICAL_SRC_SURFACE] =
+            brw_imm_ud(GEN8_BTI_STATELESS_NON_COHERENT);
+      } else {
+         srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(BRW_BTI_STATELESS);
+      }
+
+      srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
+      srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size);
+      const fs_reg nir_addr = get_nir_src(instr->src[1]);
+
+      fs_reg data = get_nir_src(instr->src[0]);
+      data.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD);
+
+      assert(nir_src_num_components(instr->src[0]) == 1);
+      assert(nir_src_bit_size(instr->src[0]) <= 32);
+      assert(nir_intrinsic_write_mask(instr) == 1);
+      assert(nir_intrinsic_align(instr) > 0);
+      if (nir_src_bit_size(instr->src[0]) == 32 &&
+          nir_intrinsic_align(instr) >= 4) {
+         srcs[SURFACE_LOGICAL_SRC_DATA] = data;
+
+         /* The offset for a DWORD scattered message is in dwords. */
+         srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
+            swizzle_nir_scratch_addr(bld, nir_addr, true);
+
+         bld.emit(SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL,
+                  fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
+      } else {
+         srcs[SURFACE_LOGICAL_SRC_DATA] = bld.vgrf(BRW_REGISTER_TYPE_UD);
+         bld.MOV(srcs[SURFACE_LOGICAL_SRC_DATA], data);
+
+         srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
+            swizzle_nir_scratch_addr(bld, nir_addr, false);
+
+         bld.emit(SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL,
+                  fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
+      }
+      break;
+   }
+
    case nir_intrinsic_load_subgroup_size:
       /* This should only happen for fragment shaders because every other case
        * is lowered in NIR so we can optimize on it.
        */
@@ -4982,10 +5167,28 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
       opcode brw_op = brw_op_for_nir_reduction_op(redop);
       brw_conditional_mod cond_mod = brw_cond_mod_for_nir_reduction_op(redop);

+      /* There are a couple of register region issues that make things
+       * complicated for 8-bit types:
+       *
+       * 1. Only raw moves are allowed to write to a packed 8-bit
+       *    destination.
+       * 2. If we use a strided destination, the efficient way to do scan
+       *    operations ends up using strides that are too big to encode in
+       *    an instruction.
+       *
+       * To get around these issues, we just do all 8-bit scan operations in
+       * 16 bits.  It's actually fewer instructions than what we'd have to do
+       * if we were trying to do it in native 8-bit types and the results are
+       * the same once we truncate to 8 bits at the end.
+       */
+      brw_reg_type scan_type = src.type;
+      if (type_sz(scan_type) == 1)
+         scan_type = brw_reg_type_from_bit_size(16, src.type);
+
       /* Set up a register for all of our scratching around and initialize it
        * to reduction operation's identity value.
        */
-      fs_reg scan = bld.vgrf(src.type);
+      fs_reg scan = bld.vgrf(scan_type);
       bld.exec_all().emit(SHADER_OPCODE_SEL_EXEC, scan, src, identity);
       bld.emit_scan(brw_op, scan, cluster_size, cond_mod);
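/* Illustrative sketch, not part of this patch: the scan_type handling above,
 * pulled out as a hypothetical helper to show the shape of the pattern --
 * promote 8-bit sources to a 16-bit temporary, run the scan there, and let a
 * final converting MOV truncate back to 8 bits.  The helper name and the
 * trailing MOV into dest are assumptions; the cluster broadcast and identity
 * setup of the real code are omitted.
 */
static void
example_emit_byte_scan(const fs_builder &bld, const fs_reg &dest,
                       const fs_reg &src, const fs_reg &identity,
                       opcode brw_op, brw_conditional_mod cond_mod,
                       unsigned cluster_size)
{
   /* Packed-byte destinations only accept raw MOVs, and the strided
    * alternative needs strides too large to encode, so scan in 16 bits.
    */
   brw_reg_type scan_type = src.type;
   if (type_sz(scan_type) == 1)
      scan_type = brw_reg_type_from_bit_size(16, src.type);

   fs_reg scan = bld.vgrf(scan_type);
   bld.exec_all().emit(SHADER_OPCODE_SEL_EXEC, scan, src, identity);
   bld.emit_scan(brw_op, scan, cluster_size, cond_mod);

   /* A converting MOV truncates back to the (possibly 8-bit) result type. */
   bld.MOV(dest, scan);
}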
@@ -5028,10 +5231,28 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
       opcode brw_op = brw_op_for_nir_reduction_op(redop);
       brw_conditional_mod cond_mod = brw_cond_mod_for_nir_reduction_op(redop);

+      /* There are a couple of register region issues that make things
+       * complicated for 8-bit types:
+       *
+       * 1. Only raw moves are allowed to write to a packed 8-bit
+       *    destination.
+       * 2. If we use a strided destination, the efficient way to do scan
+       *    operations ends up using strides that are too big to encode in
+       *    an instruction.
+       *
+       * To get around these issues, we just do all 8-bit scan operations in
+       * 16 bits.  It's actually fewer instructions than what we'd have to do
+       * if we were trying to do it in native 8-bit types and the results are
+       * the same once we truncate to 8 bits at the end.
+       */
+      brw_reg_type scan_type = src.type;
+      if (type_sz(scan_type) == 1)
+         scan_type = brw_reg_type_from_bit_size(16, src.type);
+
       /* Set up a register for all of our scratching around and initialize it
        * to reduction operation's identity value.
        */
-      fs_reg scan = bld.vgrf(src.type);
+      fs_reg scan = bld.vgrf(scan_type);
       const fs_builder allbld = bld.exec_all();
       allbld.emit(SHADER_OPCODE_SEL_EXEC, scan, src, identity);

@@ -5040,7 +5261,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
        * shift of the contents before we can begin.  To make things worse,
        * we can't do this with a normal stride; we have to use indirects.
        */
-      fs_reg shifted = bld.vgrf(src.type);
+      fs_reg shifted = bld.vgrf(scan_type);
       fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_W);
       allbld.ADD(idx, nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION],
                  brw_imm_w(-1));
@@ -5055,33 +5276,6 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
       break;
    }

-   case nir_intrinsic_begin_invocation_interlock: {
-      const fs_builder ubld = bld.group(8, 0);
-      const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
-
-      ubld.emit(SHADER_OPCODE_INTERLOCK, tmp, brw_vec8_grf(0, 0))
-         ->size_written = 2 * REG_SIZE;
-      break;
-   }
-
-   case nir_intrinsic_end_invocation_interlock: {
-      /* For endInvocationInterlock(), we need to insert a memory fence which
-       * stalls in the shader until the memory transactions prior to that
-       * fence are complete.  This ensures that the shader does not end before
-       * any writes from its critical section have landed.  Otherwise, you can
-       * end up with a case where the next invocation on that pixel properly
-       * stalls for previous FS invocation on its pixel to complete but
-       * doesn't actually wait for the dataport memory transactions from that
-       * thread to land before submitting its own.
-       */
-      const fs_builder ubld = bld.group(8, 0);
-      const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
-      ubld.emit(SHADER_OPCODE_MEMORY_FENCE, tmp,
-                brw_vec8_grf(0, 0), brw_imm_ud(1), brw_imm_ud(0))
-         ->size_written = 2 * REG_SIZE;
-      break;
-   }
-
    default:
       unreachable("unknown intrinsic");
    }
@@ -5091,9 +5285,6 @@ void
 fs_visitor::nir_emit_ssbo_atomic(const fs_builder &bld,
                                  int op, nir_intrinsic_instr *instr)
 {
-   if (stage == MESA_SHADER_FRAGMENT)
-      brw_wm_prog_data(prog_data)->has_side_effects = true;
-
    /* The BTI untyped atomic messages only support 32-bit atomics.  If you
    * just look at the big table of messages in the Vol 7 of the SKL PRM, they
    * appear to exist.  However, if you look at Vol 2a, there are no message
@@ -5133,9 +5324,6 @@ void
 fs_visitor::nir_emit_ssbo_atomic_float(const fs_builder &bld,
                                        int op, nir_intrinsic_instr *instr)
 {
-   if (stage == MESA_SHADER_FRAGMENT)
-      brw_wm_prog_data(prog_data)->has_side_effects = true;
-
    fs_reg dest;
    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
       dest = get_nir_dest(instr->dest);
@@ -5245,9 +5433,6 @@ void
 fs_visitor::nir_emit_global_atomic(const fs_builder &bld,
                                    int op, nir_intrinsic_instr *instr)
 {
-   if (stage == MESA_SHADER_FRAGMENT)
-      brw_wm_prog_data(prog_data)->has_side_effects = true;
-
    fs_reg dest;
    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
       dest = get_nir_dest(instr->dest);
@@ -5279,9 +5464,6 @@ void
 fs_visitor::nir_emit_global_atomic_float(const fs_builder &bld,
                                          int op, nir_intrinsic_instr *instr)
 {
-   if (stage == MESA_SHADER_FRAGMENT)
-      brw_wm_prog_data(prog_data)->has_side_effects = true;
-
    assert(nir_intrinsic_infos[instr->intrinsic].has_dest);
    fs_reg dest = get_nir_dest(instr->dest);

@@ -5297,7 +5479,7 @@ fs_visitor::nir_emit_global_atomic_float(const fs_builder &bld,
       data = tmp;
    }

-   bld.emit(SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL,
+   bld.emit(SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT_LOGICAL,
             dest, addr, data, brw_imm_ud(op));
 }
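/* Illustrative sketch, not part of this patch: the last hunk above switches
 * nir_emit_global_atomic_float() over to the dedicated FLOAT variant of the
 * A64 untyped atomic opcode, so the float ops (BRW_AOP_FMIN/FMAX/FCMPWR) no
 * longer share the integer message.  A hypothetical dispatch making the
 * distinction explicit:
 */
static opcode
example_a64_atomic_opcode(bool is_float_op)
{
   /* Float atomic ops travel on the FLOAT variant of the A64 message;
    * the integer BRW_AOP_* ops use the plain variant.
    */
   return is_float_op ? SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT_LOGICAL
                      : SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL;
}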