switch (opcode) {
case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
case SHADER_OPCODE_SHADER_TIME_ADD:
- case FS_OPCODE_INTERPOLATE_AT_CENTROID:
case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
return 0;
}
+/**
+ * Returns the number of scalar components needed to store type, assuming
+ * that vectors are padded out to vec4.
+ *
+ * This has the packing rules of type_size_vec4(), but counts components
+ * similar to type_size_scalar().
+ */
+extern "C" int
+type_size_vec4_times_4(const struct glsl_type *type)
+{
+ return 4 * type_size_vec4(type);
+}
+
/* Attribute arrays are loaded as one vec4 per element (or matrix column),
* except for double-precision types, which are loaded as one dvec4.
*/
unsigned
fs_inst::components_read(unsigned i) const
{
+ /* Return zero if the source is not present. */
+ if (src[i].file == BAD_FILE)
+ return 0;
+
switch (opcode) {
case FS_OPCODE_LINTERP:
if (i == 0)
}
switch (src[arg].file) {
- case BAD_FILE:
- return 0;
case UNIFORM:
case IMM:
return 1;
+ case BAD_FILE:
case ARF:
case FIXED_GRF:
case VGRF:
bld.MOV(wpos, this->wpos_w);
}
-static enum brw_barycentric_mode
-barycentric_mode(enum glsl_interp_mode mode,
- bool is_centroid, bool is_sample)
+enum brw_barycentric_mode
+brw_barycentric_mode(enum glsl_interp_mode mode, nir_intrinsic_op op)
{
- unsigned bary;
-
/* Barycentric modes don't make sense for flat inputs. */
assert(mode != INTERP_MODE_FLAT);
- if (is_sample) {
- bary = BRW_BARYCENTRIC_PERSPECTIVE_SAMPLE;
- } else if (is_centroid) {
- bary = BRW_BARYCENTRIC_PERSPECTIVE_CENTROID;
- } else {
+ unsigned bary;
+ switch (op) {
+ case nir_intrinsic_load_barycentric_pixel:
+ case nir_intrinsic_load_barycentric_at_offset:
bary = BRW_BARYCENTRIC_PERSPECTIVE_PIXEL;
+ break;
+ case nir_intrinsic_load_barycentric_centroid:
+ bary = BRW_BARYCENTRIC_PERSPECTIVE_CENTROID;
+ break;
+ case nir_intrinsic_load_barycentric_sample:
+ case nir_intrinsic_load_barycentric_at_sample:
+ bary = BRW_BARYCENTRIC_PERSPECTIVE_SAMPLE;
+ break;
+ default:
+ unreachable("invalid intrinsic");
}
if (mode == INTERP_MODE_NOPERSPECTIVE)
return (enum brw_barycentric_mode) ((unsigned) bary - 1);
}
-void
-fs_visitor::emit_general_interpolation(fs_reg *attr, const char *name,
- const glsl_type *type,
- glsl_interp_mode interpolation_mode,
- int *location, bool mod_centroid,
- bool mod_sample)
-{
- assert(stage == MESA_SHADER_FRAGMENT);
- brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
-
- if (type->is_array() || type->is_matrix()) {
- const glsl_type *elem_type = glsl_get_array_element(type);
- const unsigned length = glsl_get_length(type);
-
- for (unsigned i = 0; i < length; i++) {
- emit_general_interpolation(attr, name, elem_type, interpolation_mode,
- location, mod_centroid, mod_sample);
- }
- } else if (type->is_record()) {
- for (unsigned i = 0; i < type->length; i++) {
- const glsl_type *field_type = type->fields.structure[i].type;
- emit_general_interpolation(attr, name, field_type, interpolation_mode,
- location, mod_centroid, mod_sample);
- }
- } else {
- assert(type->is_scalar() || type->is_vector());
-
- if (prog_data->urb_setup[*location] == -1) {
- /* If there's no incoming setup data for this slot, don't
- * emit interpolation for it.
- */
- *attr = offset(*attr, bld, type->vector_elements);
- (*location)++;
- return;
- }
-
- attr->type = brw_type_for_base_type(type->get_scalar_type());
-
- if (interpolation_mode == INTERP_MODE_FLAT) {
- /* Constant interpolation (flat shading) case. The SF has
- * handed us defined values in only the constant offset
- * field of the setup reg.
- */
- unsigned vector_elements = type->vector_elements;
-
- /* Data starts at suboffet 3 in 32-bit units (12 bytes), so it is not
- * 64-bit aligned and the current implementation fails to read the
- * data properly. Instead, when there is a double input varying,
- * read it as vector of floats with twice the number of components.
- */
- if (attr->type == BRW_REGISTER_TYPE_DF) {
- vector_elements *= 2;
- attr->type = BRW_REGISTER_TYPE_F;
- }
- for (unsigned int i = 0; i < vector_elements; i++) {
- struct brw_reg interp = interp_reg(*location, i);
- interp = suboffset(interp, 3);
- interp.type = attr->type;
- bld.emit(FS_OPCODE_CINTERP, *attr, fs_reg(interp));
- *attr = offset(*attr, bld, 1);
- }
- } else {
- /* Smooth/noperspective interpolation case. */
- enum brw_barycentric_mode bary =
- barycentric_mode(interpolation_mode, mod_centroid, mod_sample);
-
- for (unsigned int i = 0; i < type->vector_elements; i++) {
- fs_reg interp(interp_reg(*location, i));
- if (devinfo->needs_unlit_centroid_workaround && mod_centroid) {
- /* Get the pixel/sample mask into f0 so that we know
- * which pixels are lit. Then, for each channel that is
- * unlit, replace the centroid data with non-centroid
- * data.
- */
- bld.emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
-
- fs_inst *inst;
- inst = bld.emit(FS_OPCODE_LINTERP, *attr,
- delta_xy[centroid_to_pixel(bary)], interp);
- inst->predicate = BRW_PREDICATE_NORMAL;
- inst->predicate_inverse = true;
- inst->no_dd_clear = true;
-
- inst = bld.emit(FS_OPCODE_LINTERP, *attr,
- delta_xy[bary], interp);
- inst->predicate = BRW_PREDICATE_NORMAL;
- inst->predicate_inverse = false;
- inst->no_dd_check = true;
- } else {
- bld.emit(FS_OPCODE_LINTERP, *attr, delta_xy[bary], interp);
- }
- if (devinfo->gen < 6 && interpolation_mode == INTERP_MODE_SMOOTH) {
- bld.MUL(*attr, *attr, this->pixel_w);
- }
- *attr = offset(*attr, bld, 1);
- }
- }
- (*location)++;
- }
-}
-
fs_reg *
fs_visitor::emit_frontfacing_interpolation()
{
bool coordinate_done = false;
- /* The sampler can only meaningfully compute LOD for fragment shader
- * messages. For all other stages, we change the opcode to TXL and
- * hardcode the LOD to 0.
- */
- if (bld.shader->stage != MESA_SHADER_FRAGMENT &&
- op == SHADER_OPCODE_TEX) {
- op = SHADER_OPCODE_TXL;
- lod = brw_imm_f(0.0f);
- }
-
/* Set up the LOD info */
switch (op) {
case FS_OPCODE_TXB:
coordinate_done = true;
break;
case SHADER_OPCODE_TG4_OFFSET:
- /* gather4_po_c should have been lowered in SIMD16 mode. */
- assert(bld.dispatch_width() == 8 || shadow_c.file == BAD_FILE);
-
/* More crazy intermixing */
for (unsigned i = 0; i < 2; i++) /* u, v */
bld.MOV(sources[length++], offset(coordinate, bld, i));
return 1 << _mesa_logbase2(max_width);
}
+/**
+ * Get the maximum allowed SIMD width for instruction \p inst accounting for
+ * various payload size restrictions that apply to sampler message
+ * instructions.
+ *
+ * This is only intended to provide a maximum theoretical bound for the
+ * execution size of the message based on the number of argument components
+ * alone, which in most cases will determine whether the SIMD8 or SIMD16
+ * variant of the message can be used, though some messages may have
+ * additional restrictions not accounted for here (e.g. pre-ILK hardware uses
+ * the message length to determine the exact SIMD width and argument count,
+ * which makes a number of sampler message combinations impossible to
+ * represent).
+ */
+static unsigned
+get_sampler_lowered_simd_width(const struct brw_device_info *devinfo,
+ const fs_inst *inst)
+{
+ /* Calculate the number of coordinate components that have to be present
+ * assuming that additional arguments follow the texel coordinates in the
+ * message payload. On IVB+ there is no need for padding, on ILK-SNB we
+ * need to pad to four or three components depending on the message,
+ * pre-ILK we need to pad to at most three components.
+ */
+ const unsigned req_coord_components =
+ (devinfo->gen >= 7 ||
+ !inst->components_read(TEX_LOGICAL_SRC_COORDINATE)) ? 0 :
+ (devinfo->gen >= 5 && inst->opcode != SHADER_OPCODE_TXF_LOGICAL &&
+ inst->opcode != SHADER_OPCODE_TXF_CMS_LOGICAL) ? 4 :
+ 3;
+
+ /* On Gen9+ the LOD argument is for free if we're able to use the LZ
+ * variant of the TXL or TXF message.
+ */
+ const bool implicit_lod = devinfo->gen >= 9 &&
+ (inst->opcode == SHADER_OPCODE_TXL ||
+ inst->opcode == SHADER_OPCODE_TXF) &&
+ inst->src[TEX_LOGICAL_SRC_LOD].is_zero();
+
+ /* Calculate the total number of argument components that need to be passed
+ * to the sampler unit.
+ */
+ const unsigned num_payload_components =
+ MAX2(inst->components_read(TEX_LOGICAL_SRC_COORDINATE),
+ req_coord_components) +
+ inst->components_read(TEX_LOGICAL_SRC_SHADOW_C) +
+ (implicit_lod ? 0 : inst->components_read(TEX_LOGICAL_SRC_LOD)) +
+ inst->components_read(TEX_LOGICAL_SRC_LOD2) +
+ inst->components_read(TEX_LOGICAL_SRC_SAMPLE_INDEX) +
+ (inst->opcode == SHADER_OPCODE_TG4_OFFSET_LOGICAL ?
+ inst->components_read(TEX_LOGICAL_SRC_OFFSET_VALUE) : 0) +
+ inst->components_read(TEX_LOGICAL_SRC_MCS);
+
+ /* SIMD16 messages with more than five arguments exceed the maximum message
+ * size supported by the sampler, regardless of whether a header is
+ * provided or not.
+ */
+ return MIN2(inst->exec_size,
+ num_payload_components > MAX_SAMPLER_MESSAGE_SIZE / 2 ? 8 : 16);
+}
+
/**
* Get the closest native SIMD width supported by the hardware for instruction
* \p inst. The instruction will be left untouched by
case FS_OPCODE_PACK_HALF_2x16_SPLIT:
case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X:
case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y:
- case FS_OPCODE_INTERPOLATE_AT_CENTROID:
case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
case SHADER_OPCODE_LOD_LOGICAL:
case SHADER_OPCODE_TG4_LOGICAL:
case SHADER_OPCODE_SAMPLEINFO_LOGICAL:
- return MIN2(16, inst->exec_size);
+ case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
+ case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
+ return get_sampler_lowered_simd_width(devinfo, inst);
case SHADER_OPCODE_TXD_LOGICAL:
/* TXD is unsupported in SIMD16 mode. */
return 8;
- case SHADER_OPCODE_TG4_OFFSET_LOGICAL: {
- /* gather4_po_c is unsupported in SIMD16 mode. */
- const fs_reg &shadow_c = inst->src[TEX_LOGICAL_SRC_SHADOW_C];
- return (shadow_c.file != BAD_FILE ? 8 : MIN2(16, inst->exec_size));
- }
case SHADER_OPCODE_TXL_LOGICAL:
- case FS_OPCODE_TXB_LOGICAL: {
- /* Gen4 doesn't have SIMD8 non-shadow-compare bias/LOD instructions, and
- * Gen4-6 can't support TXL and TXB with shadow comparison in SIMD16
- * mode because the message exceeds the maximum length of 11.
+ case FS_OPCODE_TXB_LOGICAL:
+ /* Only one execution size is representable pre-ILK depending on whether
+ * the shadow reference argument is present.
*/
- const fs_reg &shadow_c = inst->src[TEX_LOGICAL_SRC_SHADOW_C];
- if (devinfo->gen == 4 && shadow_c.file == BAD_FILE)
- return 16;
- else if (devinfo->gen < 7 && shadow_c.file != BAD_FILE)
- return 8;
+ if (devinfo->gen == 4)
+ return inst->src[TEX_LOGICAL_SRC_SHADOW_C].file == BAD_FILE ? 16 : 8;
else
- return MIN2(16, inst->exec_size);
- }
+ return get_sampler_lowered_simd_width(devinfo, inst);
+
case SHADER_OPCODE_TXF_LOGICAL:
case SHADER_OPCODE_TXS_LOGICAL:
/* Gen4 doesn't have SIMD8 variants for the RESINFO and LD-with-LOD
if (devinfo->gen == 4)
return 16;
else
- return MIN2(16, inst->exec_size);
-
- case SHADER_OPCODE_TXF_CMS_W_LOGICAL: {
- /* This opcode can take up to 6 arguments which means that in some
- * circumstances it can end up with a message that is too long in SIMD16
- * mode.
- */
- const unsigned coord_components =
- inst->src[TEX_LOGICAL_SRC_COORD_COMPONENTS].ud;
- /* First three arguments are the sample index and the two arguments for
- * the MCS data.
- */
- if ((coord_components + 3) * 2 > MAX_SAMPLER_MESSAGE_SIZE)
- return 8;
- else
- return MIN2(16, inst->exec_size);
- }
+ return get_sampler_lowered_simd_width(devinfo, inst);
case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
* have to multiply by VerticesIn to obtain the total storage requirement.
*/
if (8 * vue_prog_data->urb_read_length * nir->info.gs.vertices_in >
- max_push_components) {
+ max_push_components || gs_prog_data->invocations > 1) {
gs_prog_data->base.include_vue_handles = true;
/* R3..RN: ICP Handles for each incoming vertex (when using pull model) */
/**
* Return a bitfield where bit n is set if barycentric interpolation mode n
* (see enum brw_barycentric_mode) is needed by the fragment shader.
+ *
+ * We examine the load_barycentric intrinsics rather than looking at input
+ * variables so that we catch interpolateAtCentroid() messages too, which
+ * also need the BRW_BARYCENTRIC_[NON]PERSPECTIVE_CENTROID mode set up.
*/
static unsigned
brw_compute_barycentric_interp_modes(const struct brw_device_info *devinfo,
{
unsigned barycentric_interp_modes = 0;
- nir_foreach_variable(var, &shader->inputs) {
- /* Ignore WPOS; it doesn't require interpolation. */
- if (var->data.location == VARYING_SLOT_POS)
+ nir_foreach_function(f, shader) {
+ if (!f->impl)
continue;
- /* Flat inputs don't need barycentric modes. */
- if (var->data.interpolation == INTERP_MODE_FLAT)
- continue;
+ nir_foreach_block(block, f->impl) {
+ nir_foreach_instr(instr, block) {
+ if (instr->type != nir_instr_type_intrinsic)
+ continue;
- /* Determine the set (or sets) of barycentric coordinates needed to
- * interpolate this variable. Note that when
- * brw->needs_unlit_centroid_workaround is set, centroid interpolation
- * uses PIXEL interpolation for unlit pixels and CENTROID interpolation
- * for lit pixels, so we need both sets of barycentric coordinates.
- */
- enum brw_barycentric_mode bary_mode =
- barycentric_mode((glsl_interp_mode) var->data.interpolation,
- var->data.centroid, var->data.sample);
+ nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+ if (intrin->intrinsic != nir_intrinsic_load_interpolated_input)
+ continue;
+
+ /* Ignore WPOS; it doesn't require interpolation. */
+ if (nir_intrinsic_base(intrin) == VARYING_SLOT_POS)
+ continue;
- barycentric_interp_modes |= 1 << bary_mode;
+ intrin = nir_instr_as_intrinsic(intrin->src[0].ssa->parent_instr);
+ enum glsl_interp_mode interp = (enum glsl_interp_mode)
+ nir_intrinsic_interp_mode(intrin);
+ nir_intrinsic_op bary_op = intrin->intrinsic;
+ enum brw_barycentric_mode bary =
+ brw_barycentric_mode(interp, bary_op);
- if (var->data.centroid && devinfo->needs_unlit_centroid_workaround)
- barycentric_interp_modes |= 1 << centroid_to_pixel(bary_mode);
+ barycentric_interp_modes |= 1 << bary;
+
+ if (devinfo->needs_unlit_centroid_workaround &&
+ bary_op == nir_intrinsic_load_barycentric_centroid)
+ barycentric_interp_modes |= 1 << centroid_to_pixel(bary);
+ }
+ }
}
return barycentric_interp_modes;
continue;
nir_block *top = nir_start_block(f->impl);
+ exec_node *cursor_node = NULL;
nir_foreach_block(block, f->impl) {
if (block == top)
continue;
- nir_foreach_instr_reverse_safe(instr, block) {
+ nir_foreach_instr_safe(instr, block) {
if (instr->type != nir_instr_type_intrinsic)
continue;
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
- switch (intrin->intrinsic) {
- case nir_intrinsic_load_barycentric_pixel:
- case nir_intrinsic_load_barycentric_centroid:
- case nir_intrinsic_load_barycentric_sample:
- break;
- case nir_intrinsic_load_interpolated_input: {
- nir_intrinsic_instr *bary_intrinsic =
- nir_instr_as_intrinsic(intrin->src[0].ssa->parent_instr);
- nir_intrinsic_op op = bary_intrinsic->intrinsic;
-
- /* Leave interpolateAtSample/Offset() where it is. */
- if (op == nir_intrinsic_load_barycentric_at_sample ||
- op == nir_intrinsic_load_barycentric_at_offset)
- continue;
- }
- default:
+ if (intrin->intrinsic != nir_intrinsic_load_interpolated_input)
+ continue;
+ nir_intrinsic_instr *bary_intrinsic =
+ nir_instr_as_intrinsic(intrin->src[0].ssa->parent_instr);
+ nir_intrinsic_op op = bary_intrinsic->intrinsic;
+
+ /* Leave interpolateAtSample/Offset() where they are. */
+ if (op == nir_intrinsic_load_barycentric_at_sample ||
+ op == nir_intrinsic_load_barycentric_at_offset)
continue;
- }
- exec_node_remove(&instr->node);
- exec_list_push_head(&top->instr_list, &instr->node);
- instr->block = top;
+ nir_instr *move[3] = {
+ &bary_intrinsic->instr,
+ intrin->src[1].ssa->parent_instr,
+ instr
+ };
+
+ for (unsigned i = 0; i < ARRAY_SIZE(move); i++) {
+ if (move[i]->block != top) {
+ move[i]->block = top;
+ exec_node_remove(&move[i]->node);
+ if (cursor_node) {
+ exec_node_insert_after(cursor_node, &move[i]->node);
+ } else {
+ exec_list_push_head(&top->instr_list, &move[i]->node);
+ }
+ cursor_node = &move[i]->node;
+ }
+ }
}
}
nir_metadata_preserve(f->impl, (nir_metadata)