#include "compiler/glsl/ir.h"
#include "brw_fs.h"
#include "brw_nir.h"
+#include "nir_search_helpers.h"
#include "util/u_math.h"
#include "util/bitscan.h"
unsigned array_elems =
reg->num_array_elems == 0 ? 1 : reg->num_array_elems;
unsigned size = array_elems * reg->num_components;
- const brw_reg_type reg_type =
+ const brw_reg_type reg_type = reg->bit_size == 8 ? BRW_REGISTER_TYPE_B :
brw_reg_type_from_bit_size(reg->bit_size, BRW_REGISTER_TYPE_F);
nir_locals[reg->index] = bld.vgrf(reg_type, size);
}
/* If the condition has the form !other_condition, use other_condition as
* the source, but invert the predicate on the if instruction.
*/
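/* For example, if NIR gives "ssa_5 = inot ssa_4" and "if ssa_5 { ... }",
 * the IF can be predicated directly on ssa_4 with the predicate inverted,
 * saving the explicit NOT (SSA names here are illustrative).
 */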
- nir_alu_instr *const cond = nir_src_as_alu_instr(&if_stmt->condition);
+ nir_alu_instr *cond = nir_src_as_alu_instr(if_stmt->condition);
if (cond != NULL && cond->op == nir_op_inot) {
assert(!cond->src[0].negate);
assert(!cond->src[0].abs);
nir_emit_cf_list(&if_stmt->then_list);
- /* note: if the else is empty, dead CF elimination will remove it */
- bld.emit(BRW_OPCODE_ELSE);
-
- nir_emit_cf_list(&if_stmt->else_list);
+ if (!nir_cf_list_is_empty_block(&if_stmt->else_list)) {
+ bld.emit(BRW_OPCODE_ELSE);
+ nir_emit_cf_list(&if_stmt->else_list);
+ }
bld.emit(BRW_OPCODE_ENDIF);
break;
case nir_instr_type_deref:
- /* Derefs can exist for images but they do nothing */
+ unreachable("All derefs should've been lowered");
break;
case nir_instr_type_intrinsic:
fs_visitor::optimize_frontfacing_ternary(nir_alu_instr *instr,
const fs_reg &result)
{
- if (!instr->src[0].src.is_ssa ||
- instr->src[0].src.ssa->parent_instr->type != nir_instr_type_intrinsic)
- return false;
-
- nir_intrinsic_instr *src0 =
- nir_instr_as_intrinsic(instr->src[0].src.ssa->parent_instr);
-
- if (src0->intrinsic != nir_intrinsic_load_front_face)
+ nir_intrinsic_instr *src0 = nir_src_as_intrinsic(instr->src[0].src);
+ if (src0 == NULL || src0->intrinsic != nir_intrinsic_load_front_face)
return false;
if (!nir_src_is_const(instr->src[1].src) ||
fs_reg *op)
{
for (unsigned i = 0; i < 2; i++) {
- nir_alu_instr *const inot_instr =
- nir_src_as_alu_instr(&instr->src[i].src);
+ nir_alu_instr *inot_instr = nir_src_as_alu_instr(instr->src[i].src);
if (inot_instr != NULL && inot_instr->op == nir_op_inot &&
!inot_instr->src[0].abs && !inot_instr->src[0].negate) {
if (devinfo->gen < 6 || devinfo->gen >= 12)
return false;
- nir_alu_instr *const inot_instr = nir_src_as_alu_instr(&instr->src[0].src);
+ nir_alu_instr *inot_instr = nir_src_as_alu_instr(instr->src[0].src);
if (inot_instr == NULL || inot_instr->op != nir_op_inot)
return false;
return true;
}
+/**
+ * Emit code for nir_op_fsign possibly fused with a nir_op_fmul
+ *
+ * If \c instr is not the \c nir_op_fsign, then \c fsign_src is the index of
+ * the source of \c instr that is a \c nir_op_fsign.
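+ *
+ * For example (illustrative), both fsign(x) and fsign(x) * 4.0 can be
+ * emitted through this helper; in the latter case \c instr is the
+ * \c nir_op_fmul and \c fsign_src selects its \c nir_op_fsign source.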
+ */
+void
+fs_visitor::emit_fsign(const fs_builder &bld, const nir_alu_instr *instr,
+ fs_reg result, fs_reg *op, unsigned fsign_src)
+{
+ fs_inst *inst;
+
+ assert(instr->op == nir_op_fsign || instr->op == nir_op_fmul);
+ assert(fsign_src < nir_op_infos[instr->op].num_inputs);
+
+ if (instr->op != nir_op_fsign) {
+ const nir_alu_instr *const fsign_instr =
+ nir_src_as_alu_instr(instr->src[fsign_src].src);
+
+ assert(!fsign_instr->dest.saturate);
+
+ /* op[fsign_src] has the nominal result of the fsign, and op[1 -
+ * fsign_src] has the other multiply source. This must be rearranged so
+ * that op[0] is the source of the fsign and op[1] is the other multiply
+ * source.
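+ *
+ * For example, for fmul(a, fsign(b)) we have fsign_src == 1, so op[1]
+ * takes the old op[0] (i.e., a) and op[0] is then reloaded from b.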
+ */
+ if (fsign_src != 0)
+ op[1] = op[0];
+
+ op[0] = get_nir_src(fsign_instr->src[0].src);
+
+ const nir_alu_type t =
+ (nir_alu_type)(nir_op_infos[instr->op].input_types[0] |
+ nir_src_bit_size(fsign_instr->src[0].src));
+
+ op[0].type = brw_type_for_nir_type(devinfo, t);
+ op[0].abs = fsign_instr->src[0].abs;
+ op[0].negate = fsign_instr->src[0].negate;
+
+ unsigned channel = 0;
+ if (nir_op_infos[instr->op].output_size == 0) {
+ /* Since NIR is doing the scalarizing for us, we should only ever see
+ * vectorized operations with a single channel.
+ */
+ assert(util_bitcount(instr->dest.write_mask) == 1);
+ channel = ffs(instr->dest.write_mask) - 1;
+ }
+
+ op[0] = offset(op[0], bld, fsign_instr->src[0].swizzle[channel]);
+ } else {
+ assert(!instr->dest.saturate);
+ }
+
+ if (op[0].abs) {
+ /* Straightforward since the source can be assumed to be either strictly
+ * >= 0 or strictly <= 0 depending on the setting of the negate flag.
+ */
+ set_condmod(BRW_CONDITIONAL_NZ, bld.MOV(result, op[0]));
+
+ if (instr->op == nir_op_fsign) {
+ inst = (op[0].negate)
+ ? bld.MOV(result, brw_imm_f(-1.0f))
+ : bld.MOV(result, brw_imm_f(1.0f));
+ } else {
+ op[1].negate = (op[0].negate != op[1].negate);
+ inst = bld.MOV(result, op[1]);
+ }
+
+ set_predicate(BRW_PREDICATE_NORMAL, inst);
+ } else if (type_sz(op[0].type) == 2) {
+ /* AND(val, 0x8000) gives the sign bit.
+ *
+ * Predicated OR ORs 1.0 (0x3c00) with the sign bit if val is not zero.
+ */
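+ /* Worked example (illustrative): for val = -2.5hf (0xC100), AND 0x8000
+ * gives 0x8000, and the predicated OR with 0x3C00 gives 0xBC00 = -1.0hf.
+ */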
+ fs_reg zero = retype(brw_imm_uw(0), BRW_REGISTER_TYPE_HF);
+ bld.CMP(bld.null_reg_f(), op[0], zero, BRW_CONDITIONAL_NZ);
+
+ op[0].type = BRW_REGISTER_TYPE_UW;
+ result.type = BRW_REGISTER_TYPE_UW;
+ bld.AND(result, op[0], brw_imm_uw(0x8000u));
+
+ if (instr->op == nir_op_fsign)
+ inst = bld.OR(result, result, brw_imm_uw(0x3c00u));
+ else {
+ /* Use XOR here to get the result sign correct. */
+ inst = bld.XOR(result, result, retype(op[1], BRW_REGISTER_TYPE_UW));
+ }
+
+ inst->predicate = BRW_PREDICATE_NORMAL;
+ } else if (type_sz(op[0].type) == 4) {
+ /* AND(val, 0x80000000) gives the sign bit.
+ *
+ * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
+ * zero.
+ */
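+ /* Worked example (illustrative): for val = -3.5f (0xC0600000), AND
+ * 0x80000000 gives 0x80000000, and the predicated OR with 0x3F800000
+ * gives 0xBF800000 = -1.0f.
+ */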
+ bld.CMP(bld.null_reg_f(), op[0], brw_imm_f(0.0f), BRW_CONDITIONAL_NZ);
+
+ op[0].type = BRW_REGISTER_TYPE_UD;
+ result.type = BRW_REGISTER_TYPE_UD;
+ bld.AND(result, op[0], brw_imm_ud(0x80000000u));
+
+ if (instr->op == nir_op_fsign)
+ inst = bld.OR(result, result, brw_imm_ud(0x3f800000u));
+ else {
+ /* Use XOR here to get the result sign correct. */
+ inst = bld.XOR(result, result, retype(op[1], BRW_REGISTER_TYPE_UD));
+ }
+
+ inst->predicate = BRW_PREDICATE_NORMAL;
+ } else {
+ /* For doubles we do the same but we need to consider:
+ *
+ * - 2-src instructions can't operate with 64-bit immediates
+ * - The sign is encoded in the high 32 bits of each DF
+ * - We need to produce a DF result.
+ */
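+ /* Worked example (illustrative): for val = -2.0df the high DWord is
+ * 0xC0000000; AND 0x80000000 keeps the sign bit, and OR 0x3FF00000
+ * gives a high DWord of 0xBFF00000, which together with the zeroed low
+ * DWord is -1.0df.
+ */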
+
+ fs_reg zero = vgrf(glsl_type::double_type);
+ bld.MOV(zero, setup_imm_df(bld, 0.0));
+ bld.CMP(bld.null_reg_df(), op[0], zero, BRW_CONDITIONAL_NZ);
+
+ bld.MOV(result, zero);
+
+ fs_reg r = subscript(result, BRW_REGISTER_TYPE_UD, 1);
+ bld.AND(r, subscript(op[0], BRW_REGISTER_TYPE_UD, 1),
+ brw_imm_ud(0x80000000u));
+
+ if (instr->op == nir_op_fsign) {
+ set_predicate(BRW_PREDICATE_NORMAL,
+ bld.OR(r, r, brw_imm_ud(0x3ff00000u)));
+ } else {
+ /* This could be done better in some cases. If the scale is an
+ * immediate with the low 32 bits all 0, emitting a separate XOR and
+ * OR would allow an algebraic optimization to remove the OR. There
+ * are currently zero instances of fsign(double(x))*IMM in shader-db
+ * or any test suite, so it is hard to care at this time.
+ */
+ fs_reg result_int64 = retype(result, BRW_REGISTER_TYPE_UQ);
+ inst = bld.XOR(result_int64, result_int64,
+ retype(op[1], BRW_REGISTER_TYPE_UQ));
+ }
+ }
+}
+
+/**
+ * Determine whether sources of a nir_op_fmul can be fused with a nir_op_fsign
+ *
+ * Checks the operands of a \c nir_op_fmul to determine whether or not
+ * \c emit_fsign could fuse the multiplication with the \c sign() calculation.
+ *
+ * \param instr The multiplication instruction
+ *
+ * \param fsign_src The source of \c instr that may or may not be a
+ * \c nir_op_fsign
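+ *
+ * As an illustrative sketch (SSA names hypothetical), this matches NIR of
+ * the form
+ *
+ *    ssa_2 = fsign ssa_1
+ *    ssa_3 = fmul ssa_2, ssa_0
+ *
+ * where ssa_2 is used only by the fmul and has no source modifiers.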
+ */
+static bool
+can_fuse_fmul_fsign(nir_alu_instr *instr, unsigned fsign_src)
+{
+ assert(instr->op == nir_op_fmul);
+
+ nir_alu_instr *const fsign_instr =
+ nir_src_as_alu_instr(instr->src[fsign_src].src);
+
+ /* Rules:
+ *
+ * 1. instr->src[fsign_src] must be a nir_op_fsign.
+ * 2. The nir_op_fsign can only be used by this multiplication.
+ * 3. The source that is the nir_op_fsign does not have source modifiers.
+ * \c emit_fsign only examines the source modifiers of the source of the
+ * \c nir_op_fsign.
+ *
+ * The nir_op_fsign must also not have the saturate modifier, but steps
+ * have already been taken (in nir_opt_algebraic) to ensure that.
+ */
+ return fsign_instr != NULL && fsign_instr->op == nir_op_fsign &&
+ is_used_once(fsign_instr) &&
+ !instr->src[fsign_src].abs && !instr->src[fsign_src].negate;
+}
+
void
fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
{
bld.emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(),
brw_imm_d(brw_rnd_mode_from_nir_op(instr->op)));
/* fallthrough */
-
+ case nir_op_f2f16:
/* In theory, it would be better to use BRW_OPCODE_F32TO16. Depending
* on the HW gen, it is a special hw opcode or just a MOV, and
* brw_F32TO16 (at brw_eu_emit) would do the work to choose. But since
* HF support right now is only for gen8+, it is better to use the MOV
* directly, and use BRW_OPCODE_F32TO16 when/if we work on HF support
* for gen7.
*/
-
- case nir_op_f2f16:
+ assert(type_sz(op[0].type) < 8); /* brw_nir_lower_conversions */
inst = bld.MOV(result, op[0]);
inst->saturate = instr->dest.saturate;
break;
op[0].type = BRW_REGISTER_TYPE_D;
op[0].negate = !op[0].negate;
/* fallthrough */
- case nir_op_f2f64:
- case nir_op_f2i64:
- case nir_op_f2u64:
- assert(type_sz(op[0].type) > 2); /* brw_nir_lower_conversions */
- inst = bld.MOV(result, op[0]);
- inst->saturate = instr->dest.saturate;
- break;
-
case nir_op_i2f64:
case nir_op_i2i64:
case nir_op_u2f64:
case nir_op_u2u64:
- assert(type_sz(op[0].type) > 1); /* brw_nir_lower_conversions */
- /* fallthrough */
+ case nir_op_f2f64:
+ case nir_op_f2i64:
+ case nir_op_f2u64:
+ case nir_op_i2i32:
+ case nir_op_u2u32:
case nir_op_f2f32:
case nir_op_f2i32:
case nir_op_f2u32:
- case nir_op_f2i16:
- case nir_op_f2u16:
- case nir_op_i2i32:
- case nir_op_u2u32:
- case nir_op_i2i16:
- case nir_op_u2u16:
case nir_op_i2f16:
+ case nir_op_i2i16:
case nir_op_u2f16:
+ case nir_op_u2u16:
+ case nir_op_f2i16:
+ case nir_op_f2u16:
case nir_op_i2i8:
case nir_op_u2u8:
+ case nir_op_f2i8:
+ case nir_op_f2u8:
+ if (result.type == BRW_REGISTER_TYPE_B ||
+ result.type == BRW_REGISTER_TYPE_UB ||
+ result.type == BRW_REGISTER_TYPE_HF)
+ assert(type_sz(op[0].type) < 8); /* brw_nir_lower_conversions */
+
+ if (op[0].type == BRW_REGISTER_TYPE_B ||
+ op[0].type == BRW_REGISTER_TYPE_UB ||
+ op[0].type == BRW_REGISTER_TYPE_HF)
+ assert(type_sz(result.type) < 8); /* brw_nir_lower_conversions */
+
inst = bld.MOV(result, op[0]);
inst->saturate = instr->dest.saturate;
break;
- case nir_op_fsign: {
- assert(!instr->dest.saturate);
- if (op[0].abs) {
- /* Straightforward since the source can be assumed to be either
- * strictly >= 0 or strictly <= 0 depending on the setting of the
- * negate flag.
- */
- set_condmod(BRW_CONDITIONAL_NZ, bld.MOV(result, op[0]));
-
- inst = (op[0].negate)
- ? bld.MOV(result, brw_imm_f(-1.0f))
- : bld.MOV(result, brw_imm_f(1.0f));
-
- set_predicate(BRW_PREDICATE_NORMAL, inst);
- } else if (type_sz(op[0].type) < 8) {
- /* AND(val, 0x80000000) gives the sign bit.
- *
- * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
- * zero.
- */
- bld.CMP(bld.null_reg_f(), op[0], brw_imm_f(0.0f), BRW_CONDITIONAL_NZ);
-
- fs_reg result_int = retype(result, BRW_REGISTER_TYPE_UD);
- op[0].type = BRW_REGISTER_TYPE_UD;
- result.type = BRW_REGISTER_TYPE_UD;
- bld.AND(result_int, op[0], brw_imm_ud(0x80000000u));
-
- inst = bld.OR(result_int, result_int, brw_imm_ud(0x3f800000u));
- inst->predicate = BRW_PREDICATE_NORMAL;
- } else {
- /* For doubles we do the same but we need to consider:
- *
- * - 2-src instructions can't operate with 64-bit immediates
- * - The sign is encoded in the high 32-bit of each DF
- * - We need to produce a DF result.
- */
-
- fs_reg zero = vgrf(glsl_type::double_type);
- bld.MOV(zero, setup_imm_df(bld, 0.0));
- bld.CMP(bld.null_reg_df(), op[0], zero, BRW_CONDITIONAL_NZ);
-
- bld.MOV(result, zero);
-
- fs_reg r = subscript(result, BRW_REGISTER_TYPE_UD, 1);
- bld.AND(r, subscript(op[0], BRW_REGISTER_TYPE_UD, 1),
- brw_imm_ud(0x80000000u));
-
- set_predicate(BRW_PREDICATE_NORMAL,
- bld.OR(r, r, brw_imm_ud(0x3ff00000u)));
- }
+ case nir_op_fsign:
+ emit_fsign(bld, instr, result, op, 0);
break;
- }
case nir_op_frcp:
inst = bld.emit(SHADER_OPCODE_RCP, result, op[0]);
break;
case nir_op_fmul:
+ for (unsigned i = 0; i < 2; i++) {
+ if (can_fuse_fmul_fsign(instr, i)) {
+ emit_fsign(bld, instr, result, op, i);
+ return;
+ }
+ }
+
inst = bld.MUL(result, op[0], op[1]);
inst->saturate = instr->dest.saturate;
break;
case nir_op_inot:
if (devinfo->gen >= 8) {
- nir_alu_instr *const inot_src_instr =
- nir_src_as_alu_instr(&instr->src[0].src);
+ nir_alu_instr *inot_src_instr = nir_src_as_alu_instr(instr->src[0].src);
if (inot_src_instr != NULL &&
(inot_src_instr->op == nir_op_ior ||
}
}
+fs_reg
+fs_visitor::get_tcs_single_patch_icp_handle(const fs_builder &bld,
+ nir_intrinsic_instr *instr)
+{
+ struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(prog_data);
+ const nir_src &vertex_src = instr->src[0];
+ nir_intrinsic_instr *vertex_intrin = nir_src_as_intrinsic(vertex_src);
+ fs_reg icp_handle;
+
+ if (nir_src_is_const(vertex_src)) {
+ /* Emit a MOV to resolve <0,1,0> regioning. */
+ icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+ unsigned vertex = nir_src_as_uint(vertex_src);
+ bld.MOV(icp_handle,
+ retype(brw_vec1_grf(1 + (vertex >> 3), vertex & 7),
+ BRW_REGISTER_TYPE_UD));
+ } else if (tcs_prog_data->instances == 1 && vertex_intrin &&
+ vertex_intrin->intrinsic == nir_intrinsic_load_invocation_id) {
+ /* For the common case of only 1 instance, an array index of
+ * gl_InvocationID means reading g1. Skip all the indirect work.
+ */
+ icp_handle = retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD);
+ } else {
+ /* The vertex index is non-constant. We need to use indirect
+ * addressing to fetch the proper URB handle.
+ */
+ icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+
+ /* Each ICP handle is a single DWord (4 bytes) */
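+ /* e.g., an illustrative vertex index of 3 shifts to a byte offset of
+ * 12, selecting DWord 3 of g1.
+ */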
+ fs_reg vertex_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+ bld.SHL(vertex_offset_bytes,
+ retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD),
+ brw_imm_ud(2u));
+
+ /* Start at g1. We might read up to 4 registers. */
+ bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle,
+ retype(brw_vec8_grf(1, 0), icp_handle.type), vertex_offset_bytes,
+ brw_imm_ud(4 * REG_SIZE));
+ }
+
+ return icp_handle;
+}
+
+fs_reg
+fs_visitor::get_tcs_eight_patch_icp_handle(const fs_builder &bld,
+ nir_intrinsic_instr *instr)
+{
+ struct brw_tcs_prog_key *tcs_key = (struct brw_tcs_prog_key *) key;
+ struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(prog_data);
+ const nir_src &vertex_src = instr->src[0];
+
+ unsigned first_icp_handle = tcs_prog_data->include_primitive_id ? 3 : 2;
+
+ if (nir_src_is_const(vertex_src)) {
+ return fs_reg(retype(brw_vec8_grf(first_icp_handle +
+ nir_src_as_uint(vertex_src), 0),
+ BRW_REGISTER_TYPE_UD));
+ }
+
+ /* The vertex index is non-constant. We need to use indirect
+ * addressing to fetch the proper URB handle.
+ *
+ * First, we start with the sequence <7, 6, 5, 4, 3, 2, 1, 0>
+ * indicating that channel <n> should read the handle from
+ * DWord <n>. We convert that to bytes by multiplying by 4.
+ *
+ * Next, we convert the vertex index to bytes by multiplying
+ * by 32 (shifting by 5), and add the two together. This is
+ * the final indirect byte offset.
+ */
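+/* Worked example (illustrative): for vertex index 2, channel 5 gets
+ * channel_offsets[5] = 20 and vertex_offset_bytes = 64, so it reads its
+ * handle from byte offset 84, i.e. DWord 5 of GRF first_icp_handle + 2.
+ */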
+ fs_reg icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+ fs_reg sequence = bld.vgrf(BRW_REGISTER_TYPE_UW, 1);
+ fs_reg channel_offsets = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+ fs_reg vertex_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+ fs_reg icp_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+
+ /* sequence = <7, 6, 5, 4, 3, 2, 1, 0> */
+ bld.MOV(sequence, fs_reg(brw_imm_v(0x76543210)));
+ /* channel_offsets = 4 * sequence = <28, 24, 20, 16, 12, 8, 4, 0> */
+ bld.SHL(channel_offsets, sequence, brw_imm_ud(2u));
+ /* Convert vertex_index to bytes (multiply by 32) */
+ bld.SHL(vertex_offset_bytes,
+ retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD),
+ brw_imm_ud(5u));
+ bld.ADD(icp_offset_bytes, vertex_offset_bytes, channel_offsets);
+
+ /* Use first_icp_handle as the base offset. There is one register
+ * of URB handles per vertex, so inform the register allocator that
+ * we might read up to tcs_key->input_vertices registers.
+ */
+ bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle,
+ retype(brw_vec8_grf(first_icp_handle, 0), icp_handle.type),
+ icp_offset_bytes, brw_imm_ud(tcs_key->input_vertices * REG_SIZE));
+
+ return icp_handle;
+}
+
+struct brw_reg
+fs_visitor::get_tcs_output_urb_handle()
+{
+ struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(prog_data);
+
+ if (vue_prog_data->dispatch_mode == DISPATCH_MODE_TCS_SINGLE_PATCH) {
+ return retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD);
+ } else {
+ assert(vue_prog_data->dispatch_mode == DISPATCH_MODE_TCS_8_PATCH);
+ return retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD);
+ }
+}
+
void
fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld,
nir_intrinsic_instr *instr)
assert(stage == MESA_SHADER_TESS_CTRL);
struct brw_tcs_prog_key *tcs_key = (struct brw_tcs_prog_key *) key;
struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(prog_data);
+ struct brw_vue_prog_data *vue_prog_data = &tcs_prog_data->base;
+
+ bool eight_patch =
+ vue_prog_data->dispatch_mode == DISPATCH_MODE_TCS_8_PATCH;
fs_reg dst;
if (nir_intrinsic_infos[instr->intrinsic].has_dest)
switch (instr->intrinsic) {
case nir_intrinsic_load_primitive_id:
- bld.MOV(dst, fs_reg(brw_vec1_grf(0, 1)));
+ bld.MOV(dst, fs_reg(eight_patch ? brw_vec8_grf(2, 0)
+ : brw_vec1_grf(0, 1)));
break;
case nir_intrinsic_load_invocation_id:
bld.MOV(retype(dst, invocation_id.type), invocation_id);
case nir_intrinsic_load_per_vertex_input: {
fs_reg indirect_offset = get_indirect_offset(instr);
unsigned imm_offset = instr->const_index[0];
-
- const nir_src &vertex_src = instr->src[0];
-
fs_inst *inst;
- fs_reg icp_handle;
-
- if (nir_src_is_const(vertex_src)) {
- /* Emit a MOV to resolve <0,1,0> regioning. */
- icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
- unsigned vertex = nir_src_as_uint(vertex_src);
- bld.MOV(icp_handle,
- retype(brw_vec1_grf(1 + (vertex >> 3), vertex & 7),
- BRW_REGISTER_TYPE_UD));
- } else if (tcs_prog_data->instances == 1 &&
- vertex_src.is_ssa &&
- vertex_src.ssa->parent_instr->type == nir_instr_type_intrinsic &&
- nir_instr_as_intrinsic(vertex_src.ssa->parent_instr)->intrinsic == nir_intrinsic_load_invocation_id) {
- /* For the common case of only 1 instance, an array index of
- * gl_InvocationID means reading g1. Skip all the indirect work.
- */
- icp_handle = retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD);
- } else {
- /* The vertex index is non-constant. We need to use indirect
- * addressing to fetch the proper URB handle.
- */
- icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
-
- /* Each ICP handle is a single DWord (4 bytes) */
- fs_reg vertex_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
- bld.SHL(vertex_offset_bytes,
- retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD),
- brw_imm_ud(2u));
-
- /* Start at g1. We might read up to 4 registers. */
- bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle,
- retype(brw_vec8_grf(1, 0), icp_handle.type), vertex_offset_bytes,
- brw_imm_ud(4 * REG_SIZE));
- }
+ fs_reg icp_handle =
+ eight_patch ? get_tcs_eight_patch_icp_handle(bld, instr)
+ : get_tcs_single_patch_icp_handle(bld, instr);
/* We can only read two double components with each URB read, so
* we send two read messages in that case, each one loading up to
unsigned imm_offset = instr->const_index[0];
unsigned first_component = nir_intrinsic_component(instr);
+ struct brw_reg output_handles = get_tcs_output_urb_handle();
+
fs_inst *inst;
if (indirect_offset.file == BAD_FILE) {
- /* Replicate the patch handle to all enabled channels */
+ /* This MOV replicates the output handle to all enabled channels
+ * in SINGLE_PATCH mode.
+ */
fs_reg patch_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
- bld.MOV(patch_handle,
- retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD));
+ bld.MOV(patch_handle, output_handles);
{
if (first_component != 0) {
}
} else {
/* Indirect indexing - use per-slot offsets as well. */
- const fs_reg srcs[] = {
- retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD),
- indirect_offset
- };
+ const fs_reg srcs[] = { output_handles, indirect_offset };
fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0);
if (first_component != 0) {
unsigned imm_offset = instr->const_index[0];
unsigned mask = instr->const_index[1];
unsigned header_regs = 0;
+ struct brw_reg output_handles = get_tcs_output_urb_handle();
+
fs_reg srcs[7];
- srcs[header_regs++] = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD);
+ srcs[header_regs++] = output_handles;
if (indirect_offset.file != BAD_FILE) {
srcs[header_regs++] = indirect_offset;
const fs_reg sample = nir_system_values[SYSTEM_VALUE_SAMPLE_ID];
const fs_reg mcs = wm_key->multisample_fbo ?
- emit_mcs_fetch(coords, 3, brw_imm_ud(surface)) : fs_reg();
+ emit_mcs_fetch(coords, 3, brw_imm_ud(surface), fs_reg()) : fs_reg();
/* Use either a normal or a CMS texel fetch message depending on whether
* the framebuffer is single or multisample. On SKL+ use the wide CMS
for (unsigned int i = 0; i < instr->num_components; i++) {
fs_reg interp =
- component(interp_reg(nir_intrinsic_base(instr),
- nir_intrinsic_component(instr) + i), 0);
+ interp_reg(nir_intrinsic_base(instr),
+ nir_intrinsic_component(instr) + i);
interp.type = BRW_REGISTER_TYPE_F;
dest.type = BRW_REGISTER_TYPE_F;
case nir_intrinsic_image_atomic_or:
case nir_intrinsic_image_atomic_xor:
case nir_intrinsic_image_atomic_exchange:
- case nir_intrinsic_image_atomic_comp_swap: {
+ case nir_intrinsic_image_atomic_comp_swap:
+ case nir_intrinsic_bindless_image_load:
+ case nir_intrinsic_bindless_image_store:
+ case nir_intrinsic_bindless_image_atomic_add:
+ case nir_intrinsic_bindless_image_atomic_min:
+ case nir_intrinsic_bindless_image_atomic_max:
+ case nir_intrinsic_bindless_image_atomic_and:
+ case nir_intrinsic_bindless_image_atomic_or:
+ case nir_intrinsic_bindless_image_atomic_xor:
+ case nir_intrinsic_bindless_image_atomic_exchange:
+ case nir_intrinsic_bindless_image_atomic_comp_swap: {
if (stage == MESA_SHADER_FRAGMENT &&
-    instr->intrinsic != nir_intrinsic_image_load)
+    instr->intrinsic != nir_intrinsic_image_load &&
+    instr->intrinsic != nir_intrinsic_bindless_image_load)
brw_wm_prog_data(prog_data)->has_side_effects = true;
const GLenum format = nir_intrinsic_format(instr);
fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
- srcs[SURFACE_LOGICAL_SRC_SURFACE] =
- get_nir_image_intrinsic_image(bld, instr);
+
+ switch (instr->intrinsic) {
+ case nir_intrinsic_image_load:
+ case nir_intrinsic_image_store:
+ case nir_intrinsic_image_atomic_add:
+ case nir_intrinsic_image_atomic_min:
+ case nir_intrinsic_image_atomic_max:
+ case nir_intrinsic_image_atomic_and:
+ case nir_intrinsic_image_atomic_or:
+ case nir_intrinsic_image_atomic_xor:
+ case nir_intrinsic_image_atomic_exchange:
+ case nir_intrinsic_image_atomic_comp_swap:
+ srcs[SURFACE_LOGICAL_SRC_SURFACE] =
+ get_nir_image_intrinsic_image(bld, instr);
+ break;
+
+ default:
+ /* Bindless */
+ srcs[SURFACE_LOGICAL_SRC_SURFACE_HANDLE] =
+ bld.emit_uniformize(get_nir_src(instr->src[0]));
+ break;
+ }
+
srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]);
srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] =
brw_imm_ud(image_intrinsic_coord_components(instr));
/* Emit an image load, store or atomic op. */
- if (instr->intrinsic == nir_intrinsic_image_load) {
+ if (instr->intrinsic == nir_intrinsic_image_load ||
+ instr->intrinsic == nir_intrinsic_bindless_image_load) {
srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components);
fs_inst *inst =
bld.emit(SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL,
dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
inst->size_written = instr->num_components * dispatch_width * 4;
- } else if (instr->intrinsic == nir_intrinsic_image_store) {
+ } else if (instr->intrinsic == nir_intrinsic_image_store ||
+ instr->intrinsic == nir_intrinsic_bindless_image_store) {
srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components);
srcs[SURFACE_LOGICAL_SRC_DATA] = get_nir_src(instr->src[3]);
bld.emit(SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL,
switch (instr->intrinsic) {
case nir_intrinsic_image_atomic_add:
+ case nir_intrinsic_bindless_image_atomic_add:
assert(num_srcs == 4);
op = get_op_for_atomic_add(instr, 3);
num_srcs = 3;
break;
case nir_intrinsic_image_atomic_min:
+ case nir_intrinsic_bindless_image_atomic_min:
assert(format == GL_R32UI || format == GL_R32I);
op = (format == GL_R32I) ? BRW_AOP_IMIN : BRW_AOP_UMIN;
break;
case nir_intrinsic_image_atomic_max:
+ case nir_intrinsic_bindless_image_atomic_max:
assert(format == GL_R32UI || format == GL_R32I);
op = (format == GL_R32I) ? BRW_AOP_IMAX : BRW_AOP_UMAX;
break;
case nir_intrinsic_image_atomic_and:
+ case nir_intrinsic_bindless_image_atomic_and:
op = BRW_AOP_AND;
break;
case nir_intrinsic_image_atomic_or:
+ case nir_intrinsic_bindless_image_atomic_or:
op = BRW_AOP_OR;
break;
case nir_intrinsic_image_atomic_xor:
+ case nir_intrinsic_bindless_image_atomic_xor:
op = BRW_AOP_XOR;
break;
case nir_intrinsic_image_atomic_exchange:
+ case nir_intrinsic_bindless_image_atomic_exchange:
op = BRW_AOP_MOV;
break;
case nir_intrinsic_image_atomic_comp_swap:
+ case nir_intrinsic_bindless_image_atomic_comp_swap:
op = BRW_AOP_CMPWR;
break;
default:
break;
}
- case nir_intrinsic_image_size: {
+ case nir_intrinsic_image_size:
+ case nir_intrinsic_bindless_image_size: {
/* Unlike the [un]typed load and store opcodes, the TXS that this turns
* into will handle the binding table index for us in the generator.
+ * Incidentally, this means that we can handle bindless with exactly the
+ * same code.
*/
fs_reg image = retype(get_nir_src_imm(instr->src[0]),
BRW_REGISTER_TYPE_UD);
image = bld.emit_uniformize(image);
fs_reg srcs[TEX_LOGICAL_NUM_SRCS];
- srcs[TEX_LOGICAL_SRC_SURFACE] = image;
+ if (instr->intrinsic == nir_intrinsic_image_size)
+ srcs[TEX_LOGICAL_SRC_SURFACE] = image;
+ else
+ srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE] = image;
srcs[TEX_LOGICAL_SRC_SAMPLER] = brw_imm_d(0);
srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = brw_imm_d(0);
srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = brw_imm_d(0);
fs_reg value = get_nir_src(instr->src[0]);
if (instr->intrinsic == nir_intrinsic_vote_feq) {
const unsigned bit_size = nir_src_bit_size(instr->src[0]);
- value.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_F);
+ value.type = bit_size == 8 ? BRW_REGISTER_TYPE_B :
+ brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_F);
}
fs_reg uniformized = bld.emit_uniformize(value);
if (stage == MESA_SHADER_FRAGMENT)
brw_wm_prog_data(prog_data)->has_side_effects = true;
+ /* The BTI untyped atomic messages only support 32-bit atomics. If you
+ * just look at the big table of messages in Vol 7 of the SKL PRM, 64-bit
+ * messages appear to exist. However, if you look at Vol 2a, there are no
+ * message descriptors provided for Qword atomic ops except for A64
+ * messages.
+ */
+ assert(nir_dest_bit_size(instr->dest) == 32);
+
fs_reg dest;
if (nir_intrinsic_infos[instr->intrinsic].has_dest)
dest = get_nir_dest(instr->dest);
data = tmp;
}
- bld.emit(SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL,
- dest, addr, data, brw_imm_ud(op));
+ if (nir_dest_bit_size(instr->dest) == 64) {
+ bld.emit(SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT64_LOGICAL,
+ dest, addr, data, brw_imm_ud(op));
+ } else {
+ assert(nir_dest_bit_size(instr->dest) == 32);
+ bld.emit(SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL,
+ dest, addr, data, brw_imm_ud(op));
+ }
}
void
break;
}
+ case nir_tex_src_texture_handle:
+ assert(nir_tex_instr_src_index(instr, nir_tex_src_texture_offset) == -1);
+ srcs[TEX_LOGICAL_SRC_SURFACE] = fs_reg();
+ srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE] = bld.emit_uniformize(src);
+ break;
+
+ case nir_tex_src_sampler_handle:
+ assert(nir_tex_instr_src_index(instr, nir_tex_src_sampler_offset) == -1);
+ srcs[TEX_LOGICAL_SRC_SAMPLER] = fs_reg();
+ srcs[TEX_LOGICAL_SRC_SAMPLER_HANDLE] = bld.emit_uniformize(src);
+ break;
+
case nir_tex_src_ms_mcs:
assert(instr->op == nir_texop_txf_ms);
srcs[TEX_LOGICAL_SRC_MCS] = retype(src, BRW_REGISTER_TYPE_D);
srcs[TEX_LOGICAL_SRC_MCS] =
emit_mcs_fetch(srcs[TEX_LOGICAL_SRC_COORDINATE],
instr->coord_components,
- srcs[TEX_LOGICAL_SRC_SURFACE]);
+ srcs[TEX_LOGICAL_SRC_SURFACE],
+ srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE]);
} else {
srcs[TEX_LOGICAL_SRC_MCS] = brw_imm_ud(0u);
}
srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = brw_imm_d(instr->coord_components);
srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = brw_imm_d(lod_components);
- bool shader_supports_implicit_lod = stage == MESA_SHADER_FRAGMENT ||
- (stage == MESA_SHADER_COMPUTE &&
- nir->info.cs.derivative_group != DERIVATIVE_GROUP_NONE);
-
enum opcode opcode;
switch (instr->op) {
case nir_texop_tex:
- opcode = shader_supports_implicit_lod ?
- SHADER_OPCODE_TEX_LOGICAL : SHADER_OPCODE_TXL_LOGICAL;
+ opcode = SHADER_OPCODE_TEX_LOGICAL;
break;
case nir_texop_txb:
opcode = FS_OPCODE_TXB_LOGICAL;