* allocating them. With ARB_enhanced_layouts, multiple output variables
* may occupy the same slot, but have different type sizes.
*/
- nir_foreach_variable(var, &nir->outputs) {
+ nir_foreach_shader_out_variable(var, nir) {
const int loc = var->data.driver_location;
const unsigned var_vec4s =
var->data.compact ? DIV_ROUND_UP(glsl_get_length(var->type), 4)
uniforms = nir->num_uniforms / 4;
if (stage == MESA_SHADER_COMPUTE) {
- /* Add a uniform for the thread local id. It must be the last uniform
- * on the list.
- */
+ /* Add uniforms for builtins after regular NIR uniforms. */
assert(uniforms == prog_data->nr_params);
- uint32_t *param = brw_stage_prog_data_add_params(prog_data, 1);
+
+ uint32_t *param;
+ if (nir->info.cs.local_size_variable &&
+ compiler->lower_variable_group_size) {
+ param = brw_stage_prog_data_add_params(prog_data, 3);
+ for (unsigned i = 0; i < 3; i++) {
+ param[i] = (BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_X + i);
+ group_size[i] = fs_reg(UNIFORM, uniforms++, BRW_REGISTER_TYPE_UD);
+ }
+ }
+
+ /* Subgroup ID must be the last uniform on the list. This will make
+ * easier later to split between cross thread and per thread
+ * uniforms.
+ */
+ param = brw_stage_prog_data_add_params(prog_data, 1);
*param = BRW_PARAM_BUILTIN_SUBGROUP_ID;
subgroup_id = fs_reg(UNIFORM, uniforms++, BRW_REGISTER_TYPE_UD);
}
*/
nir_alu_instr *cond = nir_src_as_alu_instr(if_stmt->condition);
if (cond != NULL && cond->op == nir_op_inot) {
- assert(!cond->src[0].negate);
- assert(!cond->src[0].abs);
-
invert = true;
cond_reg = get_nir_src(cond->src[0].src);
} else {
src0->op != nir_op_extract_i8 && src0->op != nir_op_extract_i16)
return false;
- /* If either opcode has source modifiers, bail.
- *
- * TODO: We can potentially handle source modifiers if both of the opcodes
- * we're combining are signed integers.
- */
- if (instr->src[0].abs || instr->src[0].negate ||
- src0->src[0].abs || src0->src[0].negate)
- return false;
-
unsigned element = nir_src_as_uint(src0->src[1].src);
/* Element type to extract.*/
nir_src_bit_size(src0->src[0].src)));
op0 = offset(op0, bld, src0->src[0].swizzle[0]);
- set_saturate(instr->dest.saturate,
- bld.MOV(result, subscript(op0, type, element)));
+ bld.MOV(result, subscript(op0, type, element));
return true;
}
(nir_alu_type)(nir_op_infos[instr->op].output_type |
nir_dest_bit_size(instr->dest.dest)));
+ assert(!instr->dest.saturate);
+
for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
+ /* We don't lower to source modifiers so they should not exist. */
+ assert(!instr->src[i].abs);
+ assert(!instr->src[i].negate);
+
op[i] = get_nir_src(instr->src[i].src);
op[i].type = brw_type_for_nir_type(devinfo,
(nir_alu_type)(nir_op_infos[instr->op].input_types[i] |
nir_src_bit_size(instr->src[i].src)));
- op[i].abs = instr->src[i].abs;
- op[i].negate = instr->src[i].negate;
}
/* Move and vecN instrutions may still be vectored. Return the raw,
for (unsigned i = 0; i < 2; i++) {
nir_alu_instr *inot_instr = nir_src_as_alu_instr(instr->src[i].src);
- if (inot_instr != NULL && inot_instr->op == nir_op_inot &&
- !inot_instr->src[0].abs && !inot_instr->src[0].negate) {
+ if (inot_instr != NULL && inot_instr->op == nir_op_inot) {
/* The source of the inot is now the source of instr. */
prepare_alu_destination_and_sources(bld, inot_instr, &op[i], false);
const nir_alu_instr *const fsign_instr =
nir_src_as_alu_instr(instr->src[fsign_src].src);
- assert(!fsign_instr->dest.saturate);
-
/* op[fsign_src] has the nominal result of the fsign, and op[1 -
* fsign_src] has the other multiply source. This must be rearranged so
* that op[0] is the source of the fsign op[1] is the other multiply
nir_src_bit_size(fsign_instr->src[0].src));
op[0].type = brw_type_for_nir_type(devinfo, t);
- op[0].abs = fsign_instr->src[0].abs;
- op[0].negate = fsign_instr->src[0].negate;
unsigned channel = 0;
if (nir_op_infos[instr->op].output_size == 0) {
}
op[0] = offset(op[0], bld, fsign_instr->src[0].swizzle[channel]);
-
- /* Resolve any source modifiers. We could do slightly better on Gen8+
- * if the only source modifier is negation, but *shrug*.
- */
- if (op[1].negate || op[1].abs) {
- fs_reg tmp = bld.vgrf(op[1].type);
-
- bld.MOV(tmp, op[1]);
- op[1] = tmp;
- }
- } else {
- assert(!instr->dest.saturate);
}
- if (op[0].abs) {
- /* Straightforward since the source can be assumed to be either strictly
- * >= 0 or strictly <= 0 depending on the setting of the negate flag.
- */
- set_condmod(BRW_CONDITIONAL_NZ, bld.MOV(result, op[0]));
-
- if (instr->op == nir_op_fsign) {
- inst = (op[0].negate)
- ? bld.MOV(result, brw_imm_f(-1.0f))
- : bld.MOV(result, brw_imm_f(1.0f));
- } else {
- op[1].negate = (op[0].negate != op[1].negate);
- inst = bld.MOV(result, op[1]);
- }
-
- set_predicate(BRW_PREDICATE_NORMAL, inst);
- } else if (type_sz(op[0].type) == 2) {
+ if (type_sz(op[0].type) == 2) {
/* AND(val, 0x8000) gives the sign bit.
*
* Predicated OR ORs 1.0 (0x3c00) with the sign bit if val is not zero.
* have already been taken (in nir_opt_algebraic) to ensure that.
*/
return fsign_instr != NULL && fsign_instr->op == nir_op_fsign &&
- is_used_once(fsign_instr) &&
- !instr->src[fsign_src].abs && !instr->src[fsign_src].negate;
+ is_used_once(fsign_instr);
}
void
inst = bld.MOV(offset(temp, bld, i),
offset(op[i], bld, instr->src[i].swizzle[0]));
}
- inst->saturate = instr->dest.saturate;
}
/* In this case the source and destination registers were the same,
if (optimize_extract_to_float(instr, result))
return;
inst = bld.MOV(result, op[0]);
- inst->saturate = instr->dest.saturate;
break;
case nir_op_f2f16_rtne:
*/
assert(type_sz(op[0].type) < 8); /* brw_nir_lower_conversions */
inst = bld.MOV(result, op[0]);
- inst->saturate = instr->dest.saturate;
break;
}
assert(type_sz(result.type) < 8); /* brw_nir_lower_conversions */
inst = bld.MOV(result, op[0]);
- inst->saturate = instr->dest.saturate;
break;
case nir_op_fsat:
case nir_op_ineg:
op[0].negate = true;
inst = bld.MOV(result, op[0]);
- if (instr->op == nir_op_fneg)
- inst->saturate = instr->dest.saturate;
break;
case nir_op_fabs:
op[0].negate = false;
op[0].abs = true;
inst = bld.MOV(result, op[0]);
- if (instr->op == nir_op_fabs)
- inst->saturate = instr->dest.saturate;
break;
case nir_op_f2f32:
assert(type_sz(result.type) < 8); /* brw_nir_lower_conversions */
inst = bld.MOV(result, op[0]);
- inst->saturate = instr->dest.saturate;
break;
case nir_op_fsign:
case nir_op_frcp:
inst = bld.emit(SHADER_OPCODE_RCP, result, op[0]);
- inst->saturate = instr->dest.saturate;
break;
case nir_op_fexp2:
inst = bld.emit(SHADER_OPCODE_EXP2, result, op[0]);
- inst->saturate = instr->dest.saturate;
break;
case nir_op_flog2:
inst = bld.emit(SHADER_OPCODE_LOG2, result, op[0]);
- inst->saturate = instr->dest.saturate;
break;
case nir_op_fsin:
inst = bld.emit(SHADER_OPCODE_SIN, result, op[0]);
- inst->saturate = instr->dest.saturate;
break;
case nir_op_fcos:
inst = bld.emit(SHADER_OPCODE_COS, result, op[0]);
- inst->saturate = instr->dest.saturate;
break;
case nir_op_fddx:
} else {
inst = bld.emit(FS_OPCODE_DDX_COARSE, result, op[0]);
}
- inst->saturate = instr->dest.saturate;
break;
case nir_op_fddx_fine:
inst = bld.emit(FS_OPCODE_DDX_FINE, result, op[0]);
- inst->saturate = instr->dest.saturate;
break;
case nir_op_fddx_coarse:
inst = bld.emit(FS_OPCODE_DDX_COARSE, result, op[0]);
- inst->saturate = instr->dest.saturate;
break;
case nir_op_fddy:
if (fs_key->high_quality_derivatives) {
} else {
inst = bld.emit(FS_OPCODE_DDY_COARSE, result, op[0]);
}
- inst->saturate = instr->dest.saturate;
break;
case nir_op_fddy_fine:
inst = bld.emit(FS_OPCODE_DDY_FINE, result, op[0]);
- inst->saturate = instr->dest.saturate;
break;
case nir_op_fddy_coarse:
inst = bld.emit(FS_OPCODE_DDY_COARSE, result, op[0]);
- inst->saturate = instr->dest.saturate;
break;
case nir_op_fadd:
/* fallthrough */
case nir_op_iadd:
inst = bld.ADD(result, op[0], op[1]);
- inst->saturate = instr->dest.saturate;
break;
case nir_op_iadd_sat:
}
inst = bld.MUL(result, op[0], op[1]);
- inst->saturate = instr->dest.saturate;
break;
case nir_op_imul_2x32_64:
if (inot_src_instr != NULL &&
(inot_src_instr->op == nir_op_ior ||
inot_src_instr->op == nir_op_ixor ||
- inot_src_instr->op == nir_op_iand) &&
- !inot_src_instr->src[0].abs &&
- !inot_src_instr->src[0].negate &&
- !inot_src_instr->src[1].abs &&
- !inot_src_instr->src[1].negate) {
+ inot_src_instr->op == nir_op_iand)) {
/* The sources of the source logical instruction are now the
* sources of the instruction that will be generated.
*/
case nir_op_b32any_inequal4:
unreachable("Lowered by nir_lower_alu_reductions");
- case nir_op_fnoise1_1:
- case nir_op_fnoise1_2:
- case nir_op_fnoise1_3:
- case nir_op_fnoise1_4:
- case nir_op_fnoise2_1:
- case nir_op_fnoise2_2:
- case nir_op_fnoise2_3:
- case nir_op_fnoise2_4:
- case nir_op_fnoise3_1:
- case nir_op_fnoise3_2:
- case nir_op_fnoise3_3:
- case nir_op_fnoise3_4:
- case nir_op_fnoise4_1:
- case nir_op_fnoise4_2:
- case nir_op_fnoise4_3:
- case nir_op_fnoise4_4:
- unreachable("not reached: should be handled by lower_noise");
-
case nir_op_ldexp:
unreachable("not reached: should be handled by ldexp_to_arith()");
case nir_op_fsqrt:
inst = bld.emit(SHADER_OPCODE_SQRT, result, op[0]);
- inst->saturate = instr->dest.saturate;
break;
case nir_op_frsq:
inst = bld.emit(SHADER_OPCODE_RSQ, result, op[0]);
- inst->saturate = instr->dest.saturate;
break;
case nir_op_i2b32:
bld.ADD(result, result, brw_imm_f(1.0f)));
inst = bld.MOV(result, result); /* for potential saturation */
}
- inst->saturate = instr->dest.saturate;
break;
case nir_op_fceil: {
bld.RNDD(temp, op[0]);
temp.negate = true;
inst = bld.MOV(result, temp);
- inst->saturate = instr->dest.saturate;
break;
}
case nir_op_ffloor:
inst = bld.RNDD(result, op[0]);
- inst->saturate = instr->dest.saturate;
break;
case nir_op_ffract:
inst = bld.FRC(result, op[0]);
- inst->saturate = instr->dest.saturate;
break;
case nir_op_fround_even:
inst = bld.RNDE(result, op[0]);
bld.ADD(result, result, brw_imm_f(1.0f)));
inst = bld.MOV(result, result); /* for potential saturation */
}
- inst->saturate = instr->dest.saturate;
break;
case nir_op_fquantize2f16: {
/* Select that or zero based on normal status */
inst = bld.SEL(result, zero, tmp32);
inst->predicate = BRW_PREDICATE_NORMAL;
- inst->saturate = instr->dest.saturate;
break;
}
case nir_op_umin:
case nir_op_fmin:
inst = bld.emit_minmax(result, op[0], op[1], BRW_CONDITIONAL_L);
- inst->saturate = instr->dest.saturate;
break;
case nir_op_imax:
case nir_op_umax:
case nir_op_fmax:
inst = bld.emit_minmax(result, op[0], op[1], BRW_CONDITIONAL_GE);
- inst->saturate = instr->dest.saturate;
break;
case nir_op_pack_snorm_2x16:
case nir_op_unpack_half_2x16_split_x:
inst = bld.emit(BRW_OPCODE_F16TO32, result,
subscript(op[0], BRW_REGISTER_TYPE_UW, 0));
- inst->saturate = instr->dest.saturate;
break;
case nir_op_unpack_half_2x16_split_y_flush_to_zero:
case nir_op_unpack_half_2x16_split_y:
inst = bld.emit(BRW_OPCODE_F16TO32, result,
subscript(op[0], BRW_REGISTER_TYPE_UW, 1));
- inst->saturate = instr->dest.saturate;
break;
case nir_op_pack_64_2x32_split:
case nir_op_fpow:
inst = bld.emit(SHADER_OPCODE_POW, result, op[0], op[1]);
- inst->saturate = instr->dest.saturate;
break;
case nir_op_bitfield_reverse:
}
inst = bld.MAD(result, op[2], op[1], op[0]);
- inst->saturate = instr->dest.saturate;
break;
case nir_op_flrp:
}
inst = bld.LRP(result, op[0], op[1], op[2]);
- inst->saturate = instr->dest.saturate;
break;
case nir_op_b32csel:
static fs_reg
fetch_render_target_array_index(const fs_builder &bld)
{
- if (bld.shader->devinfo->gen >= 6) {
+ if (bld.shader->devinfo->gen >= 12) {
+ /* The render target array index is provided in the thread payload as
+ * bits 26:16 of r1.1.
+ */
+ const fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_UD);
+ bld.AND(idx, brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, 1, 3),
+ brw_imm_uw(0x7ff));
+ return idx;
+ } else if (bld.shader->devinfo->gen >= 6) {
/* The render target array index is provided in the thread payload as
* bits 26:16 of r0.0.
*/
cmp->predicate = BRW_PREDICATE_NORMAL;
cmp->flag_subreg = sample_mask_flag_subreg(this);
- if (devinfo->gen >= 6) {
- /* Due to the way we implement discard, the jump will only happen
- * when the whole quad is discarded. So we can do this even for
- * demote as it won't break its uniformity promises.
- */
- emit_discard_jump();
- }
+ emit_discard_jump();
if (devinfo->gen < 7)
limit_dispatch_width(
* invocations are already executed lock-step. Instead of an actual
* barrier just emit a scheduling fence, that will generate no code.
*/
- if (workgroup_size() <= dispatch_width) {
+ if (!nir->info.cs.local_size_variable &&
+ workgroup_size() <= dispatch_width) {
bld.exec_all().group(1, 0).emit(FS_OPCODE_SCHEDULING_FENCE);
break;
}
dest.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD);
/* Read the vector */
- if (nir_intrinsic_align(instr) >= 4) {
- assert(nir_dest_bit_size(instr->dest) == 32);
+ assert(nir_dest_bit_size(instr->dest) <= 32);
+ assert(nir_intrinsic_align(instr) > 0);
+ if (nir_dest_bit_size(instr->dest) == 32 &&
+ nir_intrinsic_align(instr) >= 4) {
+ assert(nir_dest_num_components(instr->dest) <= 4);
srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components);
fs_inst *inst =
bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL,
dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
inst->size_written = instr->num_components * dispatch_width * 4;
} else {
- assert(nir_dest_bit_size(instr->dest) <= 32);
assert(nir_dest_num_components(instr->dest) == 1);
srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size);
fs_reg data = get_nir_src(instr->src[0]);
data.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD);
+ assert(nir_src_bit_size(instr->src[0]) <= 32);
assert(nir_intrinsic_write_mask(instr) ==
(1u << instr->num_components) - 1);
- if (nir_intrinsic_align(instr) >= 4) {
- assert(nir_src_bit_size(instr->src[0]) == 32);
+ assert(nir_intrinsic_align(instr) > 0);
+ if (nir_src_bit_size(instr->src[0]) == 32 &&
+ nir_intrinsic_align(instr) >= 4) {
assert(nir_src_num_components(instr->src[0]) <= 4);
srcs[SURFACE_LOGICAL_SRC_DATA] = data;
srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components);
bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL,
fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
} else {
- assert(nir_src_bit_size(instr->src[0]) <= 32);
assert(nir_src_num_components(instr->src[0]) == 1);
srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size);
break;
}
+ case nir_intrinsic_load_local_group_size: {
+ assert(compiler->lower_variable_group_size);
+ assert(nir->info.cs.local_size_variable);
+ for (unsigned i = 0; i < 3; i++) {
+ bld.MOV(retype(offset(dest, bld, i), BRW_REGISTER_TYPE_UD),
+ group_size[i]);
+ }
+ break;
+ }
+
default:
nir_emit_intrinsic(bld, instr);
break;
case nir_intrinsic_bindless_image_atomic_xor:
case nir_intrinsic_bindless_image_atomic_exchange:
case nir_intrinsic_bindless_image_atomic_comp_swap: {
- if (stage == MESA_SHADER_FRAGMENT &&
- instr->intrinsic != nir_intrinsic_image_load)
- brw_wm_prog_data(prog_data)->has_side_effects = true;
-
/* Get some metadata from the image intrinsic. */
const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic];
}
case nir_intrinsic_image_store_raw_intel: {
- if (stage == MESA_SHADER_FRAGMENT)
- brw_wm_prog_data(prog_data)->has_side_effects = true;
-
fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
srcs[SURFACE_LOGICAL_SRC_SURFACE] =
get_nir_image_intrinsic_image(bld, instr);
break;
}
- case nir_intrinsic_scoped_memory_barrier:
+ case nir_intrinsic_scoped_barrier:
+ assert(nir_intrinsic_execution_scope(instr) == NIR_SCOPE_NONE);
+ /* Fall through. */
case nir_intrinsic_group_memory_barrier:
case nir_intrinsic_memory_barrier_shared:
case nir_intrinsic_memory_barrier_buffer:
case nir_intrinsic_memory_barrier_image:
- case nir_intrinsic_memory_barrier: {
+ case nir_intrinsic_memory_barrier:
+ case nir_intrinsic_begin_invocation_interlock:
+ case nir_intrinsic_end_invocation_interlock: {
bool l3_fence, slm_fence;
- if (instr->intrinsic == nir_intrinsic_scoped_memory_barrier) {
+ const enum opcode opcode =
+ instr->intrinsic == nir_intrinsic_begin_invocation_interlock ?
+ SHADER_OPCODE_INTERLOCK : SHADER_OPCODE_MEMORY_FENCE;
+
+ switch (instr->intrinsic) {
+ case nir_intrinsic_scoped_barrier: {
nir_variable_mode modes = nir_intrinsic_memory_modes(instr);
l3_fence = modes & (nir_var_shader_out |
nir_var_mem_ssbo |
nir_var_mem_global);
slm_fence = modes & nir_var_mem_shared;
- } else {
+ break;
+ }
+
+ case nir_intrinsic_begin_invocation_interlock:
+ case nir_intrinsic_end_invocation_interlock:
+ /* For beginInvocationInterlockARB(), we will generate a memory fence
+ * but with a different opcode so that generator can pick SENDC
+ * instead of SEND.
+ *
+ * For endInvocationInterlockARB(), we need to insert a memory fence which
+ * stalls in the shader until the memory transactions prior to that
+ * fence are complete. This ensures that the shader does not end before
+ * any writes from its critical section have landed. Otherwise, you can
+ * end up with a case where the next invocation on that pixel properly
+ * stalls for previous FS invocation on its pixel to complete but
+ * doesn't actually wait for the dataport memory transactions from that
+ * thread to land before submitting its own.
+ *
+ * Handling them here will allow the logic for IVB render cache (see
+ * below) to be reused.
+ */
+ l3_fence = true;
+ slm_fence = false;
+ break;
+
+ default:
l3_fence = instr->intrinsic != nir_intrinsic_memory_barrier_shared;
slm_fence = instr->intrinsic == nir_intrinsic_group_memory_barrier ||
instr->intrinsic == nir_intrinsic_memory_barrier ||
instr->intrinsic == nir_intrinsic_memory_barrier_shared;
+ break;
}
if (stage != MESA_SHADER_COMPUTE)
*
* TODO: Check if applies for many HW threads sharing same Data Port.
*/
- if (slm_fence && workgroup_size() <= dispatch_width)
+ if (!nir->info.cs.local_size_variable &&
+ slm_fence && workgroup_size() <= dispatch_width)
slm_fence = false;
/* Prior to Gen11, there's only L3 fence, so emit that instead. */
l3_fence = true;
}
+ /* IVB does typed surface access through the render cache, so we need
+ * to flush it too.
+ */
+ const bool needs_render_fence =
+ devinfo->gen == 7 && !devinfo->is_haswell;
+
/* Be conservative in Gen11+ and always stall in a fence. Since there
* are two different fences, and shader might want to synchronize
* between them.
*
- * TODO: Improve NIR so that scope and visibility information for the
- * barriers is available here to make a better decision.
- *
- * TODO: When emitting more than one fence, it might help emit all
- * the fences first and then generate the stall moves.
+ * TODO: Use scope and visibility information for the barriers from NIR
+ * to make a better decision on whether we need to stall.
*/
- const bool stall = devinfo->gen >= 11;
+ const bool stall = devinfo->gen >= 11 || needs_render_fence ||
+ instr->intrinsic == nir_intrinsic_end_invocation_interlock;
+
+ const bool commit_enable = stall ||
+ devinfo->gen >= 10; /* HSD ES # 1404612949 */
+
+ unsigned fence_regs_count = 0;
+ fs_reg fence_regs[2] = {};
const fs_builder ubld = bld.group(8, 0);
- const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
if (l3_fence) {
- ubld.emit(SHADER_OPCODE_MEMORY_FENCE, tmp,
- brw_vec8_grf(0, 0), brw_imm_ud(stall),
- /* bti */ brw_imm_ud(0))
- ->size_written = 2 * REG_SIZE;
+ fs_inst *fence =
+ ubld.emit(opcode,
+ ubld.vgrf(BRW_REGISTER_TYPE_UD),
+ brw_vec8_grf(0, 0),
+ brw_imm_ud(commit_enable),
+ brw_imm_ud(/* bti */ 0));
+ fence->sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
+
+ fence_regs[fence_regs_count++] = fence->dst;
+
+ if (needs_render_fence) {
+ fs_inst *render_fence =
+ ubld.emit(opcode,
+ ubld.vgrf(BRW_REGISTER_TYPE_UD),
+ brw_vec8_grf(0, 0),
+ brw_imm_ud(commit_enable),
+ brw_imm_ud(/* bti */ 0));
+ render_fence->sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
+
+ fence_regs[fence_regs_count++] = render_fence->dst;
+ }
}
if (slm_fence) {
- ubld.emit(SHADER_OPCODE_MEMORY_FENCE, tmp,
- brw_vec8_grf(0, 0), brw_imm_ud(stall),
- brw_imm_ud(GEN7_BTI_SLM))
- ->size_written = 2 * REG_SIZE;
+ assert(opcode == SHADER_OPCODE_MEMORY_FENCE);
+ fs_inst *fence =
+ ubld.emit(opcode,
+ ubld.vgrf(BRW_REGISTER_TYPE_UD),
+ brw_vec8_grf(0, 0),
+ brw_imm_ud(commit_enable),
+ brw_imm_ud(GEN7_BTI_SLM));
+ fence->sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
+
+ fence_regs[fence_regs_count++] = fence->dst;
}
- if (!l3_fence && !slm_fence)
- ubld.emit(FS_OPCODE_SCHEDULING_FENCE);
+ assert(fence_regs_count <= 2);
+
+ if (stall || fence_regs_count == 0) {
+ ubld.exec_all().group(1, 0).emit(
+ FS_OPCODE_SCHEDULING_FENCE, ubld.null_reg_ud(),
+ fence_regs, fence_regs_count);
+ }
break;
}
case nir_intrinsic_load_global: {
assert(devinfo->gen >= 8);
- if (nir_intrinsic_align(instr) >= 4) {
- assert(nir_dest_bit_size(instr->dest) == 32);
+ assert(nir_dest_bit_size(instr->dest) <= 32);
+ assert(nir_intrinsic_align(instr) > 0);
+ if (nir_dest_bit_size(instr->dest) == 32 &&
+ nir_intrinsic_align(instr) >= 4) {
+ assert(nir_dest_num_components(instr->dest) <= 4);
fs_inst *inst = bld.emit(SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL,
dest,
get_nir_src(instr->src[0]), /* Address */
inst->dst.component_size(inst->exec_size);
} else {
const unsigned bit_size = nir_dest_bit_size(instr->dest);
- assert(bit_size <= 32);
assert(nir_dest_num_components(instr->dest) == 1);
fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD);
bld.emit(SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL,
case nir_intrinsic_store_global:
assert(devinfo->gen >= 8);
- if (stage == MESA_SHADER_FRAGMENT)
- brw_wm_prog_data(prog_data)->has_side_effects = true;
-
- if (nir_intrinsic_align(instr) >= 4) {
- assert(nir_src_bit_size(instr->src[0]) == 32);
+ assert(nir_src_bit_size(instr->src[0]) <= 32);
+ assert(nir_intrinsic_write_mask(instr) ==
+ (1u << instr->num_components) - 1);
+ assert(nir_intrinsic_align(instr) > 0);
+ if (nir_src_bit_size(instr->src[0]) == 32 &&
+ nir_intrinsic_align(instr) >= 4) {
+ assert(nir_src_num_components(instr->src[0]) <= 4);
bld.emit(SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL,
fs_reg(),
get_nir_src(instr->src[1]), /* Address */
get_nir_src(instr->src[0]), /* Data */
brw_imm_ud(instr->num_components));
} else {
- const unsigned bit_size = nir_src_bit_size(instr->src[0]);
- assert(bit_size <= 32);
assert(nir_src_num_components(instr->src[0]) == 1);
+ const unsigned bit_size = nir_src_bit_size(instr->src[0]);
brw_reg_type data_type =
brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD);
fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD);
dest.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD);
/* Read the vector */
- if (nir_intrinsic_align(instr) >= 4) {
- assert(nir_dest_bit_size(instr->dest) == 32);
+ assert(nir_dest_bit_size(instr->dest) <= 32);
+ assert(nir_intrinsic_align(instr) > 0);
+ if (nir_dest_bit_size(instr->dest) == 32 &&
+ nir_intrinsic_align(instr) >= 4) {
+ assert(nir_dest_num_components(instr->dest) <= 4);
srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components);
fs_inst *inst =
bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL,
dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
inst->size_written = instr->num_components * dispatch_width * 4;
} else {
- assert(nir_dest_bit_size(instr->dest) <= 32);
assert(nir_dest_num_components(instr->dest) == 1);
srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size);
case nir_intrinsic_store_ssbo: {
assert(devinfo->gen >= 7);
- if (stage == MESA_SHADER_FRAGMENT)
- brw_wm_prog_data(prog_data)->has_side_effects = true;
-
const unsigned bit_size = nir_src_bit_size(instr->src[0]);
fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
srcs[SURFACE_LOGICAL_SRC_SURFACE] =
fs_reg data = get_nir_src(instr->src[0]);
data.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD);
+ assert(nir_src_bit_size(instr->src[0]) <= 32);
assert(nir_intrinsic_write_mask(instr) ==
(1u << instr->num_components) - 1);
- if (nir_intrinsic_align(instr) >= 4) {
- assert(nir_src_bit_size(instr->src[0]) == 32);
+ assert(nir_intrinsic_align(instr) > 0);
+ if (nir_src_bit_size(instr->src[0]) == 32 &&
+ nir_intrinsic_align(instr) >= 4) {
assert(nir_src_num_components(instr->src[0]) <= 4);
srcs[SURFACE_LOGICAL_SRC_DATA] = data;
srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components);
bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL,
fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
} else {
- assert(nir_src_bit_size(instr->src[0]) <= 32);
assert(nir_src_num_components(instr->src[0]) == 1);
srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size);
dest.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD);
/* Read the vector */
- if (nir_intrinsic_align(instr) >= 4) {
- assert(nir_dest_bit_size(instr->dest) == 32);
-
+ assert(nir_dest_num_components(instr->dest) == 1);
+ assert(nir_dest_bit_size(instr->dest) <= 32);
+ assert(nir_intrinsic_align(instr) > 1);
+ if (nir_dest_bit_size(instr->dest) >= 4 &&
+ nir_intrinsic_align(instr) >= 4) {
/* The offset for a DWORD scattered message is in dwords. */
srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
swizzle_nir_scratch_addr(bld, nir_addr, true);
bld.emit(SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL,
dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
} else {
- assert(nir_dest_bit_size(instr->dest) <= 32);
-
srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
swizzle_nir_scratch_addr(bld, nir_addr, false);
fs_reg data = get_nir_src(instr->src[0]);
data.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD);
- assert(nir_intrinsic_write_mask(instr) ==
- (1u << instr->num_components) - 1);
- if (nir_intrinsic_align(instr) >= 4) {
- assert(nir_src_bit_size(instr->src[0]) == 32);
+ assert(nir_src_num_components(instr->src[0]) == 1);
+ assert(nir_src_bit_size(instr->src[0]) <= 32);
+ assert(nir_intrinsic_write_mask(instr) == 1);
+ assert(nir_intrinsic_align(instr) > 0);
+ if (nir_src_bit_size(instr->src[0]) == 32 &&
+ nir_intrinsic_align(instr) >= 4) {
srcs[SURFACE_LOGICAL_SRC_DATA] = data;
/* The offset for a DWORD scattered message is in dwords. */
bld.emit(SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL,
fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
} else {
- assert(nir_src_bit_size(instr->src[0]) <= 32);
-
srcs[SURFACE_LOGICAL_SRC_DATA] = bld.vgrf(BRW_REGISTER_TYPE_UD);
bld.MOV(srcs[SURFACE_LOGICAL_SRC_DATA], data);
break;
}
- case nir_intrinsic_begin_invocation_interlock: {
- const fs_builder ubld = bld.group(8, 0);
- const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
-
- ubld.emit(SHADER_OPCODE_INTERLOCK, tmp, brw_vec8_grf(0, 0))
- ->size_written = 2 * REG_SIZE;
- break;
- }
-
- case nir_intrinsic_end_invocation_interlock: {
- /* For endInvocationInterlock(), we need to insert a memory fence which
- * stalls in the shader until the memory transactions prior to that
- * fence are complete. This ensures that the shader does not end before
- * any writes from its critical section have landed. Otherwise, you can
- * end up with a case where the next invocation on that pixel properly
- * stalls for previous FS invocation on its pixel to complete but
- * doesn't actually wait for the dataport memory transactions from that
- * thread to land before submitting its own.
- */
- const fs_builder ubld = bld.group(8, 0);
- const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
- ubld.emit(SHADER_OPCODE_MEMORY_FENCE, tmp,
- brw_vec8_grf(0, 0), brw_imm_ud(1), brw_imm_ud(0))
- ->size_written = 2 * REG_SIZE;
- break;
- }
-
default:
unreachable("unknown intrinsic");
}
fs_visitor::nir_emit_ssbo_atomic(const fs_builder &bld,
int op, nir_intrinsic_instr *instr)
{
- if (stage == MESA_SHADER_FRAGMENT)
- brw_wm_prog_data(prog_data)->has_side_effects = true;
-
/* The BTI untyped atomic messages only support 32-bit atomics. If you
* just look at the big table of messages in the Vol 7 of the SKL PRM, they
* appear to exist. However, if you look at Vol 2a, there are no message
fs_visitor::nir_emit_ssbo_atomic_float(const fs_builder &bld,
int op, nir_intrinsic_instr *instr)
{
- if (stage == MESA_SHADER_FRAGMENT)
- brw_wm_prog_data(prog_data)->has_side_effects = true;
-
fs_reg dest;
if (nir_intrinsic_infos[instr->intrinsic].has_dest)
dest = get_nir_dest(instr->dest);
fs_visitor::nir_emit_global_atomic(const fs_builder &bld,
int op, nir_intrinsic_instr *instr)
{
- if (stage == MESA_SHADER_FRAGMENT)
- brw_wm_prog_data(prog_data)->has_side_effects = true;
-
fs_reg dest;
if (nir_intrinsic_infos[instr->intrinsic].has_dest)
dest = get_nir_dest(instr->dest);
fs_visitor::nir_emit_global_atomic_float(const fs_builder &bld,
int op, nir_intrinsic_instr *instr)
{
- if (stage == MESA_SHADER_FRAGMENT)
- brw_wm_prog_data(prog_data)->has_side_effects = true;
-
assert(nir_intrinsic_infos[instr->intrinsic].has_dest);
fs_reg dest = get_nir_dest(instr->dest);
data = tmp;
}
- bld.emit(SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL,
+ bld.emit(SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT_LOGICAL,
dest, addr, data, brw_imm_ud(op));
}